From a728daa78f478bfc67f80317cb6cc5f8e700bea1 Mon Sep 17 00:00:00 2001 From: Thamme Gowda Date: Mon, 13 Nov 2023 11:49:31 +0000 Subject: [PATCH 01/26] Merged PR 31742: Fix docker url security: use microsoft cr Fix docker url security: use microsoft container registry instead of public dockerhub --- scripts/metrics/Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/metrics/Dockerfile b/scripts/metrics/Dockerfile index 4641e6571..25a3236a9 100644 --- a/scripts/metrics/Dockerfile +++ b/scripts/metrics/Dockerfile @@ -1,5 +1,6 @@ -FROM nvidia/cuda:11.1.1-devel-ubuntu20.04 - +FROM mcr.microsoft.com/azureml/minimal-ubuntu20.04-py38-cuda11.6.2-gpu-inference:20231102.v2 +# use this if microsoft image is not accessible; +#FROM nvidia/cuda:11.1.1-devel-ubuntu20.04 LABEL description="Marian image - Ubuntu 20.04" ARG DEBIAN_FRONTEND=noninteractive From 6fe9a8078446cb0fb711e5bbb9d638ee87d3e9cc Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Fri, 24 Nov 2023 15:58:48 +0000 Subject: [PATCH 02/26] Merged PR 31906: Updates to CI pipeline: new vcpkg and options to disable specific jobs --- azure-pipelines.yml | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 1ec1f8739..4e1744375 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -8,10 +8,26 @@ parameters: # Allow skipping the entire 'Build' stage -- name: runBuilds - displayName: Run builds? Uncheck to run regression tests only. +- name: disableAllBuilds + displayName: Disable all builds and run regression tests only type: boolean - default: true + default: false +- name: disableAllTests + displayName: Do not run regression tests + type: boolean + default: false +- name: disableLinux + displayName: Disable Linux builds + type: boolean + default: false +- name: disableMacOS + displayName: Disable MacOS builds + type: boolean + default: false +- name: disableWindows + displayName: Disable Windows builds + type: boolean + default: false # Warning: the current branch policies disable the automatic triggering to # minimize VM usage! 
@@ -54,7 +70,7 @@ variables: - name: MKL_URL value: "https://data.statmt.org/romang/marian-regression-tests/ci/mkl-2020.1-windows-static.zip" - name: VCPKG_COMMIT - value: 2022.03.10 + value: 2023.11.20 - name: VCPKG_DIR value: "$(Build.SourcesDirectory)/vcpkg" - name: VCPKG_PACKAGES @@ -73,7 +89,7 @@ stages: ###################################################################### - job: BuildWindows cancelTimeoutInMinutes: 1 - condition: eq(${{ parameters.runBuilds }}, true) + condition: and( eq(${{ parameters.disableAllBuilds }}, false), eq(${{ parameters.disableWindows }}, false) ) displayName: Windows strategy: @@ -210,7 +226,7 @@ stages: ###################################################################### - job: BuildUbuntu cancelTimeoutInMinutes: 1 - condition: eq(${{ parameters.runBuilds }}, true) + condition: and( eq(${{ parameters.disableAllBuilds }}, false), eq(${{ parameters.disableLinux }}, false) ) displayName: Ubuntu timeoutInMinutes: 120 @@ -348,7 +364,7 @@ stages: ###################################################################### - job: BuildMacOS cancelTimeoutInMinutes: 1 - condition: eq(${{ parameters.runBuilds }}, true) + condition: and( eq(${{ parameters.disableAllBuilds }}, false), eq(${{ parameters.disableMacOS }}, false) ) displayName: macOS CPU clang pool: @@ -398,7 +414,7 @@ stages: ###################################################################### - job: BuildInstall cancelTimeoutInMinutes: 1 - condition: eq(${{ parameters.runBuilds }}, true) + condition: and( eq(${{ parameters.disableAllBuilds }}, false), eq(${{ parameters.disableLinux }}, false) ) displayName: Linux CPU library install pool: @@ -462,6 +478,7 @@ stages: ###################################################################### - job: TestWindows cancelTimeoutInMinutes: 1 + condition: eq(${{ parameters.disableAllTests }}, false) displayName: Windows CPU+FBGEMM pool: @@ -588,6 +605,7 @@ stages: ###################################################################### - job: TestLinux cancelTimeoutInMinutes: 1 + condition: eq(${{ parameters.disableAllTests }}, false) displayName: Linux CPU+FBGEMM pool: From 72c8d60a77eacc73a572d0a0167e648e15e5def5 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Mon, 27 Nov 2023 10:18:41 +0000 Subject: [PATCH 03/26] Merged PR 31918: Update MKL in GPU regression tests --- azure-regression-tests.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/azure-regression-tests.yml b/azure-regression-tests.yml index 0448b172a..206c018a1 100644 --- a/azure-regression-tests.yml +++ b/azure-regression-tests.yml @@ -84,8 +84,9 @@ stages: # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html - bash: | - wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB" | sudo apt-key add - - sudo sh -c "echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list" + sudo mkdir -p /usr/share/keyrings + wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/intel.gpg > /dev/null + echo "deb [signed-by=/usr/share/keyrings/intel.gpg] https://apt.repos.intel.com/mkl all main" | sudo tee /etc/apt/sources.list.d/intel-mkl.list sudo apt-get update -o Dir::Etc::sourcelist="/etc/apt/sources.list.d/intel-mkl.list" sudo apt-get install -y --no-install-recommends intel-mkl-64bit-2020.0-088 displayName: Install MKL From a7cc324d50c02ae74257595e2284a543398f498b Mon Sep 17 
00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sat, 2 Dec 2023 03:47:54 +0000 Subject: [PATCH 04/26] Merged PR 31730: ALIBI with shifts This PR adds a first working version of ALIBI with algorithmic shifts for encoder-decoder models. Also adds trainable ALIBI slopes and biases and ALIBI in general to the **new** layer framework. This is still experimental. --- CHANGELOG.md | 1 + CMakeLists.txt | 4 +- VERSION | 2 +- src/CMakeLists.txt | 3 + src/common/config_parser.cpp | 26 ++ src/data/corpus_base.cpp | 2 +- src/data/sentencepiece_vocab.cpp | 3 +- src/data/vocab.cpp | 8 + src/data/vocab.h | 3 + src/functional/shape.h | 15 +- src/functional/tensor.h | 3 +- src/graph/expression_operators.cpp | 55 +++-- src/graph/expression_operators.h | 6 +- src/graph/node_operators_binary.h | 39 ++- src/layers/constructors.h | 2 +- src/layers/loss.h | 22 +- src/layers_new/alibi.cpp | 339 +++++++++++++++++++++++++++ src/layers_new/alibi.cu | 251 ++++++++++++++++++++ src/layers_new/alibi.h | 268 +++++++++++++++++++++ src/layers_new/attention.cpp | 105 +++++++++ src/layers_new/attention.h | 77 ++++-- src/layers_new/interface.h | 25 ++ src/layers_new/neuralnet.h | 17 -- src/layers_new/transformer.h | 99 +++++--- src/models/bleurt.h | 3 - src/models/comet_qe.h | 83 +++---- src/models/decoder.h | 1 + src/models/encoder_decoder.cpp | 29 ++- src/models/encoder_decoder.h | 2 +- src/models/states.h | 31 ++- src/models/transformer.h | 3 + src/models/transformer_new.h | 23 +- src/tensors/cpu/tensor_operators.cpp | 2 + src/tensors/gpu/add.inc | 3 + src/tensors/gpu/add_all.cu | 15 ++ src/tensors/gpu/add_all.h | 14 +- src/tensors/gpu/add_all.inc | 6 + src/tensors/gpu/element.inc | 7 +- src/tensors/gpu/tensor_operators.cu | 3 +- src/translator/beam_search.cpp | 6 +- src/translator/translator.h | 2 +- 41 files changed, 1391 insertions(+), 217 deletions(-) create mode 100644 src/layers_new/alibi.cpp create mode 100644 src/layers_new/alibi.cu create mode 100644 src/layers_new/alibi.h create mode 100644 src/layers_new/attention.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 4242e5c19..e4eb14230 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Fixed compilation with clang 16.0.6 ### Added +- Added ALIBI related options to new layer framework. - Added `--no-spm-encode` option, allowing the model to use vocabulary IDs directly to train/decode. - Added MSE and MAE costs to COMET-QE training. - Added augmentation of shuffled examples to COMET-QE training via `--comet-augment-bad`. 
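As a minimal sketch of what the per-head ALIBI term added by this patch computes (the fused implementation is the gAlibi kernel in src/layers_new/alibi.cu further down; the slope, bias and shift values here are illustrative placeholders, not the patch's learned parameters):

#include <cmath>
#include <cstdio>

int main() {
  const int dimQuery = 4, dimKeys = 6, start = 0; // start = decoding position offset
  const float slope = -0.05f;  // per-head slope (optionally trainable in this patch)
  const float bias  = 0.3f;    // per-head additive bias (patch-specific extension, not in the original ALIBI paper)
  const float shift = 2.0f;    // algorithmic shift derived from separator-symbol sync points

  for(int q = 0; q < dimQuery; ++q) {
    for(int k = 0; k < dimKeys; ++k) {
      float relPos = (float)k - (float)(q + start) - shift; // key position relative to the (shifted) query position
      float alibi  = slope * std::fabs(relPos + bias);      // term added to the attention logits before softmax
      std::printf("%7.3f ", alibi);
    }
    std::printf("\n");
  }
  return 0;
}

The actual kernel additionally combines this term with the padding mask via min(logMask, alibi), where logMask is +/-maskFactor depending on the binary mask value.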
diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c674e68d..2ea841254 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -240,7 +240,7 @@ else(MSVC) set(CMAKE_RDYNAMIC_FLAG "-rdynamic") endif(CMAKE_COMPILER_IS_GNUCC) - set(CMAKE_CXX_FLAGS "-std=c++11 -pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}") + set(CMAKE_CXX_FLAGS "-std=c++17 -pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}") set(CMAKE_CXX_FLAGS_RELEASE "-O3 -m64 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}") set(CMAKE_CXX_FLAGS_SLIM "-O3 -m64 -funroll-loops -DNDEBUG") @@ -472,7 +472,7 @@ else(CMAKE_BUILD_TYPE STREQUAL "Debug") endif(CMAKE_BUILD_TYPE STREQUAL "Debug") if(NOT MSVC) # @TODO: add warnings here too - list(APPEND CUDA_NVCC_FLAGS -ccbin ${CMAKE_C_COMPILER}; -std=c++11; -Xcompiler\ -fPIC; -Xcompiler\ -Wno-unused-result; -Xcompiler\ -Wno-deprecated; -Xcompiler\ -Wno-pragmas; -Xcompiler\ -Wno-unused-value; -Xcompiler\ -Werror;) + list(APPEND CUDA_NVCC_FLAGS -ccbin ${CMAKE_C_COMPILER}; -std=c++17; -Xcompiler\ -fPIC; -Xcompiler\ -Wno-unused-result; -Xcompiler\ -Wno-deprecated; -Xcompiler\ -Wno-pragmas; -Xcompiler\ -Wno-unused-value; -Xcompiler\ -Werror;) list(APPEND CUDA_NVCC_FLAGS ${INTRINSICS_NVCC}) else() list(APPEND CUDA_NVCC_FLAGS -Xcompiler\ /FS; -Xcompiler\ /MT$<$:d>; ) diff --git a/VERSION b/VERSION index 10ae91bd5..274b68518 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.14 +v1.12.15 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 77c455946..d4cb8cc14 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -82,6 +82,8 @@ set(MARIAN_SOURCES layers/lsh.cpp layers_new/neuralnet.cpp + layers_new/alibi.cpp + layers_new/attention.cpp rnn/cells.cpp rnn/attention.cpp @@ -179,6 +181,7 @@ set_target_properties(marian PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY if(CUDA_FOUND) cuda_add_library(marian_cuda + layers_new/alibi.cu tensors/gpu/device.cu tensors/gpu/hash.cu tensors/gpu/algorithm.cu diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 425a78143..d797b8e2d 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -320,6 +320,32 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { cli.add("--transformer-depth-scaling", "Scale down weight initialization in transformer layers by 1 / sqrt(depth)"); + cli.add("--transformer-attention-mask", + "Type of mask/bias in transformer attention: default, alibi", + "default"); + cli.add("--transformer-alibi-shift", + "Use alibi-shifting with sync-points with --transformer-attention-mask alibi"); + cli.add("--separator-symbol", + "Generic separator symbol for different applications, i.e. for transformer-alibi-shift syncpoints, default is [eos] (currently only supported with raw spm models)", + "[eos]"); + cli.add("--transformer-disable-position-embeddings", + "Do not add any position embeddings. Use e.g. 
with --transformer-attention-mask alibi"); + + cli.add("--transformer-alibi-trainable", + "Make alibi slopes trainable, default slopes are constant"); + + // handy shortcut for the current best setup + cli.add("--alibi", + "Use alibi settings for transformer, this is a shortcut for --transformer-attention-mask alibi --transformer-alibi-shift --transformer-disable-position-embeddings --separator-symbol [eos]"); + cli.alias("alibi", "true", [](YAML::Node& config) { + // define current-best alibi settings + config["transformer-attention-mask"] = "alibi"; + config["transformer-alibi-shift"] = true; + config["transformer-disable-position-embeddings"] = true; + config["separator-symbol"] = "[eos]"; + config["transformer-alibi-trainable"] = true; + }); + cli.add("--transformer-no-bias", "Don't use any bias vectors in linear layers"); cli.add("--transformer-no-affine", diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp index b168ecba1..0ef804b1c 100644 --- a/src/data/corpus_base.cpp +++ b/src/data/corpus_base.cpp @@ -355,7 +355,7 @@ CorpusBase::CorpusBase(Ptr options, bool translate, size_t seed) // when force-decoding we want the last vocab to be part of the batch, // hence we do not drop it from the input batch. bool forceDecoding = options_->get("force-decode", false); - size_t shift = !forceDecoding ? 1 : 0; + size_t shift = forceDecoding ? 0 : 1; for(size_t i = 0; i + shift < numVocs; ++i) { Ptr vocab = New(options_, i); diff --git a/src/data/sentencepiece_vocab.cpp b/src/data/sentencepiece_vocab.cpp index 548b95a46..84d406dd5 100644 --- a/src/data/sentencepiece_vocab.cpp +++ b/src/data/sentencepiece_vocab.cpp @@ -255,7 +255,8 @@ class SentencePieceVocab : public IVocab { for(const Word& id : sentence) if(!ignoreEOS || id != getEosId()) line += (*this)[id] + " "; - line.pop_back(); // trim the trailing whitespace + if(!line.empty()) + line.pop_back(); // trim the trailing whitespace } else { // convert vector of Word to vector of int std::vector spmSentence; diff --git a/src/data/vocab.cpp b/src/data/vocab.cpp index 82a4b8da1..a39c0eeae 100644 --- a/src/data/vocab.cpp +++ b/src/data/vocab.cpp @@ -142,6 +142,14 @@ Word Vocab::getEosId() const { return vImpl_->getEosId(); } // return UNK symbol id Word Vocab::getUnkId() const { return vImpl_->getUnkId(); } +// return generic separator symbol id +Word Vocab::getSepId() const { + std::string sepSym = options_->get("separator-symbol", "[eos]"); + Word id = (*vImpl_)[sepSym]; + ABORT_IF(id == getUnkId(), "Separator symbol '{}' not found in vocabulary", sepSym); + return id; +} + std::vector Vocab::suppressedIds(bool suppressUnk, bool suppressSpecial) const { std::vector ids; if(suppressUnk) { diff --git a/src/data/vocab.h b/src/data/vocab.h index 7eeca2902..2c60912f7 100644 --- a/src/data/vocab.h +++ b/src/data/vocab.h @@ -73,6 +73,9 @@ class Vocab { // return UNK symbol id Word getUnkId() const; + // return generic separator symbol id + Word getSepId() const; + // return a set of Word ids that should be suppressed based on the underlying vocabulary implementation. // Arguments mosty likely provided based on outside options like --allow-unk etc. 
std::vector suppressedIds(bool suppressUnk = true, bool suppressSpecial = true) const; diff --git a/src/functional/shape.h b/src/functional/shape.h index fd354e1e0..330fa9971 100644 --- a/src/functional/shape.h +++ b/src/functional/shape.h @@ -76,17 +76,24 @@ struct ConstantShape { offset_(shape.offset_) {} template - HOST_DEVICE ConstantShape(const Array& shape) { + ConstantShape(const Array& shape) { ABORT_IF(M > N, "Recompile with CONST_SHAPE_DIMS >= {}", M); - std::copy(shape.begin(), shape.end(), shape_.begin() + N - M); - if(N - M) - std::fill_n(shape_.begin(), N - M, 1); + for(int i = 0; i < shape.size(); ++i) + shape_[N - M + i] = shape[i]; + for(int i = 0; i < N - M; ++i) + shape_[i] = 1; updateStrides(); updateElements(); } + HOST_DEVICE ConstantShape(const Array& shape) + : shape_(shape) { + updateStrides(); + updateElements(); + } + HOST_DEVICE ConstantShape(const Array& shape, const Array& stride, size_t offset) diff --git a/src/functional/tensor.h b/src/functional/tensor.h index f5549c608..e631cd63a 100644 --- a/src/functional/tensor.h +++ b/src/functional/tensor.h @@ -70,7 +70,8 @@ struct View { HOST_DEVICE View(T* ptr, const ConstantShape& shape) : data_(ptr), shape_(shape) {} - HOST View(marian::Tensor t) : data_(t->data()), shape_(adapt(t->shape())) {} + HOST View(marian::Tensor t) + : data_(t ? t->data() : nullptr), shape_(t ? adapt(t->shape()) : adapt(marian::Shape({0, 0, 0, 0}))) {} HOST_DEVICE_INLINE T& operator[](size_t i) { return data_[shape_.index((int)i)]; diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index 1a81ce51f..60a86112f 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -183,7 +183,7 @@ Expr maximum(Expr a, Expr b) { // @TODO: implement version without constant Expr maximum(float a, Expr b) { - auto aExpr = b->graph()->constant({}, inits::fromValue(a)); + auto aExpr = b->graph()->constant({1}, inits::fromValue(a), b->value_type()); return Expression(aExpr, b); } @@ -197,7 +197,7 @@ Expr minimum(Expr a, Expr b) { // @TODO: implement version without constant Expr minimum(float a, Expr b) { - auto aExpr = b->graph()->constant({}, inits::fromValue(a)); + auto aExpr = b->graph()->constant({1}, inits::fromValue(a), b->value_type()); return Expression(aExpr, b); } @@ -216,19 +216,19 @@ Expr ge(Expr a, Expr b) { return Expression(a, b, -1, true); } Expr ne(Expr a, Expr b) { return Expression(a, b, 0, true); } Expr le(Expr a, Expr b) { return Expression(a, b, 1, true); } -Expr lt(float a, Expr b) { return Expression(b->graph()->constant({}, inits::fromValue(a), b->value_type()), b, -1, false); } -Expr eq(float a, Expr b) { return Expression(b->graph()->constant({}, inits::fromValue(a), b->value_type()), b, 0, false); } -Expr gt(float a, Expr b) { return Expression(b->graph()->constant({}, inits::fromValue(a), b->value_type()), b, 1, false); } -Expr ge(float a, Expr b) { return Expression(b->graph()->constant({}, inits::fromValue(a), b->value_type()), b, -1, true); } -Expr ne(float a, Expr b) { return Expression(b->graph()->constant({}, inits::fromValue(a), b->value_type()), b, 0, true); } -Expr le(float a, Expr b) { return Expression(b->graph()->constant({}, inits::fromValue(a), b->value_type()), b, 1, true); } +Expr lt(float a, Expr b) { return Expression(b->graph()->constant({1}, inits::fromValue(a), b->value_type()), b, -1, false); } +Expr eq(float a, Expr b) { return Expression(b->graph()->constant({1}, inits::fromValue(a), b->value_type()), b, 0, false); } +Expr gt(float a, Expr 
b) { return Expression(b->graph()->constant({1}, inits::fromValue(a), b->value_type()), b, 1, false); } +Expr ge(float a, Expr b) { return Expression(b->graph()->constant({1}, inits::fromValue(a), b->value_type()), b, -1, true); } +Expr ne(float a, Expr b) { return Expression(b->graph()->constant({1}, inits::fromValue(a), b->value_type()), b, 0, true); } +Expr le(float a, Expr b) { return Expression(b->graph()->constant({1}, inits::fromValue(a), b->value_type()), b, 1, true); } -Expr lt(Expr a, float b) { return Expression(a, a->graph()->constant({}, inits::fromValue(b), a->value_type()), -1, false); } -Expr eq(Expr a, float b) { return Expression(a, a->graph()->constant({}, inits::fromValue(b), a->value_type()), 0, false); } -Expr gt(Expr a, float b) { return Expression(a, a->graph()->constant({}, inits::fromValue(b), a->value_type()), 1, false); } -Expr ge(Expr a, float b) { return Expression(a, a->graph()->constant({}, inits::fromValue(b), a->value_type()), -1, true); } -Expr ne(Expr a, float b) { return Expression(a, a->graph()->constant({}, inits::fromValue(b), a->value_type()), 0, true); } -Expr le(Expr a, float b) { return Expression(a, a->graph()->constant({}, inits::fromValue(b), a->value_type()), 1, true); } +Expr lt(Expr a, float b) { return Expression(a, a->graph()->constant({1}, inits::fromValue(b), a->value_type()), -1, false); } +Expr eq(Expr a, float b) { return Expression(a, a->graph()->constant({1}, inits::fromValue(b), a->value_type()), 0, false); } +Expr gt(Expr a, float b) { return Expression(a, a->graph()->constant({1}, inits::fromValue(b), a->value_type()), 1, false); } +Expr ge(Expr a, float b) { return Expression(a, a->graph()->constant({1}, inits::fromValue(b), a->value_type()), -1, true); } +Expr ne(Expr a, float b) { return Expression(a, a->graph()->constant({1}, inits::fromValue(b), a->value_type()), 0, true); } +Expr le(Expr a, float b) { return Expression(a, a->graph()->constant({1}, inits::fromValue(b), a->value_type()), 1, true); } /*********************************************************/ @@ -280,23 +280,22 @@ Expr operator/(Expr a, float b) { // TODO: efficient version of this without constant() Expr operator/(float a, Expr b) { - auto aExpr = b->graph()->constant({}, inits::fromValue(a)); + auto aExpr = b->graph()->constant({1}, inits::fromValue(a), b->value_type()); return aExpr / b; } -// Expr pow(float a, Expr b) { -// return Expression(a, b); -// -//} -// -// Expr pow(Expr a, float b) { -// return Expression(a, b); -// -//} -// -// Expr pow(Expr a, Expr b) { -// return Expression(a, b); -//} +// @TODO: implement proper operators for all three: +Expr pow(float a, Expr b) { + return exp(std::log(a) * b); +} + +Expr pow(Expr a, float b) { + return exp(log(a) * b); +} + +Expr pow(Expr a, Expr b) { + return exp(log(a) * b); +} /*********************************************************/ diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index e96d8f7c9..c792096b1 100644 --- a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -303,9 +303,9 @@ Expr square(Expr a); */ Expr abs(Expr a); -// Expr pow(Expr a, Expr b); -// Expr pow(float a, Expr b); -// Expr pow(Expr a, float b); +Expr pow(Expr a, Expr b); +Expr pow(float a, Expr b); +Expr pow(Expr a, float b); /** * Computes @f$\log(e^a + e^b)@f$. 
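The pow() overloads above are implemented via the identity a^b = exp(b * log(a)), so they implicitly assume a strictly positive base. A quick standalone check of that identity (illustrative only, not part of the patch):

#include <cmath>
#include <cstdio>

int main() {
  double a = 2.5, b = 3.0;                                            // the log-based form requires a > 0
  std::printf("std::pow(a, b)  = %f\n", std::pow(a, b));              // 15.625000
  std::printf("exp(log(a) * b) = %f\n", std::exp(std::log(a) * b));   // 15.625000
  return 0;
}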
diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index 29259f983..8cf0af1a4 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -1692,23 +1692,40 @@ struct RMSNormalizationOp : public NaryNodeOp { float eps_; }; - +// @TODO: rewriting this fixes a bug for this one node. There should be exactly one +// NodeOp per gradient tensor many other nodes have that bug and need to be fixed. +// This will only manifest if the first op is not trainable, then gradients for the +// other nodes might get skipped despite being trainable. struct HighwayNodeOp : public NaryNodeOp { - HighwayNodeOp(const std::vector& nodes) : NaryNodeOp(nodes) {} + HighwayNodeOp(const std::vector& nodes) : NaryNodeOp(nodes, Shape::broadcast(nodes)) {} NodeOps forwardOps() override { - return {NodeOp(HighwayForward( - val_, child(0)->val(), child(1)->val(), child(2)->val()))}; + using namespace functional; + auto alpha = sigmoid(_4); + auto fwd = _1 = alpha * _2 + (1.f - alpha) * _3; + + return { + NodeOp(Element(fwd, val_, child(0)->val(), child(1)->val(), child(2)->val())) + }; } NodeOps backwardOps() override { - return {NodeOp(HighwayBackward(child(0)->grad(), - child(1)->grad(), - child(2)->grad(), - child(0)->val(), - child(1)->val(), - child(2)->val(), - adj_))}; + using namespace functional; + + auto alpha = sigmoid(_1); + auto bwd1 = alpha * _2; + auto bwd2 = (1.f - alpha) * _2; + auto bwd3 = alpha * (1.f - alpha) * _2 * (_3 - _4); + + auto& in1 = child(0)->val(); + auto& in2 = child(1)->val(); + auto& gate = child(2)->val(); + + return { + NodeOp(Add(bwd1, child(0)->grad(), gate, adj_)), + NodeOp(Add(bwd2, child(1)->grad(), gate, adj_)), + NodeOp(Add(bwd3, child(2)->grad(), gate, adj_, in1, in2)) + }; } const std::string type() override { return "highway"; } diff --git a/src/layers/constructors.h b/src/layers/constructors.h index 5597a6a4e..28be22e2d 100644 --- a/src/layers/constructors.h +++ b/src/layers/constructors.h @@ -246,7 +246,7 @@ class MLPFactory : public Factory { public: AsLayerFactory(const WrappedFactory& wrapped) : us(wrapped) {} Ptr construct(Ptr graph) override final { - auto p = std::static_pointer_cast(us.construct(graph)); + auto p = std::dynamic_pointer_cast(us.construct(graph)); ABORT_IF(!p, "Attempted to cast a Factory to LayerFactory that isn't one"); return p; } diff --git a/src/layers/loss.h b/src/layers/loss.h index 5dbb5e553..c27dd954d 100644 --- a/src/layers/loss.h +++ b/src/layers/loss.h @@ -198,11 +198,12 @@ class ScaledMultiRationalLoss : public MultiRationalLoss { virtual Expr accumulateLoss(const RationalLoss& current) override { if(loss_) { const auto& first = partialLosses_.front(); - return loss_ - + current.loss() * first.count() - / current.count(); // scale up/down to match scale of first loss + Type lossType = loss_->value_type(); + // scale up/down to match scale of first loss + return loss_ + cast(current.loss(), lossType) * first.count() / cast(current.count(), lossType); } else { - return current.loss(); // first reference loss, keeps to scale with this one + // first reference loss, keeps to scale with this one + return current.loss(); } } @@ -212,7 +213,7 @@ class ScaledMultiRationalLoss : public MultiRationalLoss { } else { return current.count(); // This is the first loss } - } + } public: ScaledMultiRationalLoss() : MultiRationalLoss() {} @@ -233,18 +234,19 @@ class ScaledMultiRationalLoss : public MultiRationalLoss { class MeanMultiRationalLoss : public MultiRationalLoss { private: virtual Expr 
accumulateLoss(const RationalLoss& current) override { - if(loss_) - return loss_ + current.loss() / current.count(); - else + if(loss_) { + Type lossType = loss_->value_type(); + return loss_ + cast(current.loss(), lossType) / cast(current.count(), lossType); + } else { return current.loss() / current.count(); + } } virtual Expr accumulateCount(const RationalLoss& current) override { if(count_) return count_; // keep the existing '1' else - return current.count()->graph()->ones( - {1}, current.loss()->value_type()); // just '1' as labels are factored into loss_ + return current.count()->graph()->ones({1}, current.loss()->value_type()); // just '1' as labels are factored into loss_ } public: diff --git a/src/layers_new/alibi.cpp b/src/layers_new/alibi.cpp new file mode 100644 index 000000000..07989ce6a --- /dev/null +++ b/src/layers_new/alibi.cpp @@ -0,0 +1,339 @@ +#include "graph/node_operators_unary.h" +#include "layers_new/alibi.h" + +namespace marian { + +AlibiDecoderState::AlibiDecoderState(const rnn::States& states, + Logits logProbs, + const std::vector>& encStates, + Ptr batch, + bool isBatchMajor) +: DecoderState(states, logProbs, encStates, batch, isBatchMajor) {} + +// override to create derived decoder states +Ptr AlibiDecoderState::Create(const rnn::States& states, + Logits logProbs, + const std::vector>& encStates, + Ptr batch, + bool isBatchMajor) const { + return New(states, logProbs, encStates, batch, isBatchMajor); +} + +// expand the decoder state +Ptr AlibiDecoderState::next(const rnn::States& states, + Logits logProbs) const { + // expand the previous decoder state via the base class expansion + auto state = std::dynamic_pointer_cast(DecoderState::next(states, logProbs)); + // this should always succeed, unless we somehow messed up inheritance + ABORT_IF(!state, "state is nullptr, i.e. the conversion to AlibiDecoderState failed??"); + + // carry over the sync points and last beam size from the previous state + state->syncPoints_ = syncPoints_; + state->lastBeam_ = lastBeam_; + return state; +} + +// select the hypotheses based on beam search indices +Ptr AlibiDecoderState::select( + const std::vector& hypIndices, // [beamIndex * activeBatchSize + batchIndex] + const Words& words, + const std::vector& batchIndices, // [batchIndex] + int beamSize) const { + // select the hypotheses via the base class selection + auto state = std::dynamic_pointer_cast(DecoderState::select(hypIndices, words, batchIndices, beamSize)); + // this should always succeed, unless we somehow messed up inheritance + ABORT_IF(!state, "state is nullptr, i.e. 
the conversion to AlibiDecoderState failed??"); + // compute the new sync points and carry over the current beam size + // this is the most important part of the algorithm while decoding + state->syncPoints_ = computeSyncPoints(hypIndices, words, batchIndices, beamSize); + state->lastBeam_ = beamSize; + return state; +} + +// get the alibi shift for the current state based on currently stored sync points computed while decoding +Expr AlibiDecoderState::getAlibiShift(Ptr graph, bool decoding) const { + if(decoding) { + std::vector shift; + for(const auto& [trgPos, srcPos, batchIdx] : syncPoints_) + shift.push_back((float)(srcPos - trgPos)); + + if(!shift.empty()) { + int dimBeam = lastBeam_; + ABORT_IF(dimBeam == 0, "dimBeam is 0??"); + int dimBatch = (int)shift.size() / dimBeam; + return graph->constant({dimBeam, dimBatch, 1, 1}, inits::fromVector(shift)); // [dimBeam, dimBatch, dimTrg=1, 1] + } else { + return nullptr; + } + } else { + ABORT_IF(getBatch()->sets() != 2, + "--transformer-alibi-shift=true currently only works with batch sets=2"); + return getAlibiShiftFromBatch(graph); + } +} + +// get the alibi shift based on the batch data - this is used during training or scoring where ground truth is available +Expr AlibiDecoderState::getAlibiShiftFromBatch(Ptr graph) const { + std::vector shift; + + auto targetBatch = getBatch()->back(); + Word trgSyncSym = targetBatch->vocab()->getSepId(); + + auto locateInTrg = [&targetBatch](int batchIdx, int j) { + return targetBatch->data()[targetBatch->locate(batchIdx, j)]; + }; + + auto sourceBatch = getBatch()->front(); + Word srcSyncSym = sourceBatch->vocab()->getSepId(); + + auto locateInSrc = [&sourceBatch](int batchIdx, int j) { + return sourceBatch->data()[sourceBatch->locate(batchIdx, j)]; + }; + + int dimBatch = (int)targetBatch->batchSize(); + int dimSrc = (int)sourceBatch->batchWidth(); + int dimTrg = (int)targetBatch->batchWidth(); + + for(int batchIdx = 0; batchIdx < dimBatch; ++batchIdx) { + int trgPos = -1, srcPos = -1; + for(int i = 0; i < dimTrg; ++i) { + if(i > 0) { // don't check until we are one word ahead to mimic generation order where we look back by one word (i - 1) + if(locateInTrg(batchIdx, i - 1) == trgSyncSym) { + trgPos = i - 1; // record that position + // now we are looking for the corresponding source position, no need to look backwards + for(int j = srcPos + 1; j < dimSrc; ++j) { + if(locateInSrc(batchIdx, j) == srcSyncSym) { + srcPos = j; + break; + } + } + } + } + + shift.push_back((float)(srcPos - trgPos)); + } + } + + if(!shift.empty()) { + return graph->constant({1, dimBatch, dimTrg, 1}, inits::fromVector(shift)); // [dimBeam=1, dimBatch, dimTrg, 1] + } else { + return nullptr; + } +} + +// compute the sync points for the current state based on the previous sync points and the last generated words. +// This happens one step at a time while decoding. 
+std::vector AlibiDecoderState::computeSyncPoints( + const std::vector& hypIndices, // [beamIndex * activeBatchSize + batchIndex] + const Words& words, // [beamIndex * activeBatchSize + batchIndex] + const std::vector& batchIndices, // [batchIndex] of activeBatchSize + int beamSize +) const { + size_t position = getPosition(); + + // get the sync symbols for source and target + auto sourceBatch = getBatch()->front(); + Word srcSyncSymbol = sourceBatch->vocab()->getSepId(); + Word trgSyncSymbol = srcSyncSymbol; // @TODO: this is actually wrong, we should make sure to use the correct target vocab + + auto locateInSrc = [&sourceBatch](int batchIdx, int j) { + return sourceBatch->data()[sourceBatch->locate(batchIdx, j)]; + }; + + int dimBatch = (int)batchIndices.size(); + std::vector nextSyncPoints; + + // For each hypothesis, create an updated sync point. + // If the current symbol is not a sync symbol, the sync point is the same as before and gets carried over. + // If the current symbol is a sync symbol, the sync point target coordinate is updated to the current position + // and the source coordinate is updated to the next sync symbol in the source sentence. + for(int i = 0; i < hypIndices.size(); ++i) { + SyncCoord pos = syncPoints_.empty() + ? SyncCoord({-1, -1, (int)batchIndices[i % dimBatch]}) // no sync points yet, initialize with -1 position and current batch index + : syncPoints_[hypIndices[i]]; // carry over the sync point from the previous state at first + auto& [trgPos, srcPos, batchIdx] = pos; + + // note, words were generated at the step before the current position, hence the pos - 1 + if(words[i] == trgSyncSymbol) { // the current word is a sync symbol, so update the sync point + trgPos = (int)position - 1; + // find the next sync symbol in the source sentence + for(int j = srcPos + 1; j < sourceBatch->batchWidth(); ++j) { + if(locateInSrc(batchIdx, j) == srcSyncSymbol) { // found the next sync symbol in the source + srcPos = j; // update the sync point source coordinate + break; // and stop looking + } + } + } + nextSyncPoints.push_back(pos); + } + + return nextSyncPoints; +} + + +Ptr NewDecoderState(Ptr options, + const rnn::States& states, + Logits logProbs, + const std::vector>& encStates, + Ptr batch, + bool isBatchMajor) { + if(options->get("transformer-alibi-shift", false)) { + ABORT_IF(options->get("transformer-attention-mask") != "alibi", "transformer-alibi-shift=true only works with transformer-attention-mask=\"alibi\""); + return New(states, logProbs, encStates, batch, isBatchMajor); + } else { + return New(states, logProbs, encStates, batch, isBatchMajor); + } +} + +Ptr convertDecoderState(Ptr state, + Ptr graph, + bool decoding) { + Expr shift; + auto alibiState = std::dynamic_pointer_cast(state); + if(alibiState) + shift = alibiState->getAlibiShift(graph, decoding); + + size_t position = state->getPosition(); + auto nnState = New(position); + for(auto& layerState : state->getStates()) { + if(alibiState) { + nnState->append(New(layerState.cell, shift, position)); + } else { + nnState->append(New(layerState.cell, position)); + } + } + return nnState; +} + +#ifdef CUDA_FOUND +namespace gpu { + template + void Alibi(int numHeads, int start, marian::Tensor out, Tensors... tensors); +} +#endif + +namespace cpu { + template + void Alibi(int numHeads, int start, marian::Tensor out, Tensors... tensors) { + ABORT("Not implemented"); + } +} + +template +void Alibi(int numHeads, int start, marian::Tensor out, Tensors... 
tensors) { +#ifdef CUDA_FOUND + if(out->getBackend()->getDeviceId().type == DeviceType::gpu) + gpu::Alibi(numHeads, start, out, tensors...); + else +#endif + cpu::Alibi(numHeads, start, out, tensors...); +} + + +#ifdef CUDA_FOUND +namespace gpu { + template + void AlibiGrad(int numHeads, int start, marian::Tensor slopesGrad, marian::Tensor biasesGrad, Tensors... tensors); +} +#endif + +namespace cpu { + template + void AlibiGrad(int numHeads, int start, marian::Tensor slopesGrad, marian::Tensor biasesGrad, Tensors... tensors) { + ABORT("Not implemented"); + } +} + +template +void AlibiGrad(int numHeads, int start, marian::Tensor slopesGrad, marian::Tensor biasesGrad, Tensors... inputs) { +#ifdef CUDA_FOUND + if(slopesGrad->getBackend()->getDeviceId().type == DeviceType::gpu) + gpu::AlibiGrad(numHeads, start, slopesGrad, biasesGrad, inputs...); + else +#endif + cpu::AlibiGrad(numHeads, start, slopesGrad, biasesGrad, inputs...); +} + +class AlibiLogMaskNode : public NaryNodeOp { +private: + int numHeads_{8}; + int start_{0}; + + Shape newShape(Expr mask, Expr query, int numHeads) { + int dimBeam = query->shape()[-4]; + int dimBatch = query->shape()[-3]; + int dimQuery = query->shape()[-2]; + int dimKeys = mask->shape()[-2]; + + return { dimBeam, dimBatch * numHeads, dimQuery, dimKeys }; + } + +public: + AlibiLogMaskNode(const std::vector& nodes, int numHeads, int start) + : NaryNodeOp(nodes, newShape(/*mask=*/nodes[0], /*query=*/nodes[1], numHeads), nodes[0]->value_type()), + numHeads_(numHeads), start_{start} + {} + + void forward() override { + Alibi( + numHeads_, + start_, + val_, + /*mask=*/ child(0)->val(), + /*slopes=*/child(2)->val(), + /*biases=*/child(3)->val(), + /*shift=*/ children().size() == 5 ? child(4)->val() : nullptr); + } + + void backward() override { + if(!trainable()) + return; + + AlibiGrad( + numHeads_, + start_, + // gradients + /*d_f/d_slopes=*/child(2)->grad(), + /*d_f/d_biases=*/child(3)->grad(), + // inputs + /*mask=*/ child(0)->val(), + /*slopes=*/ child(2)->val(), + /*biases=*/ child(3)->val(), + /*shift=*/ children().size() == 5 ? 
child(4)->val() : nullptr, + // adjoint + /*d_J/d_f=*/adj_); + } + + virtual size_t hash() override { + size_t seed = NaryNodeOp::hash(); + util::hash_combine(seed, numHeads_); + util::hash_combine(seed, start_); + return seed; + } + + virtual bool equal(Expr node) override { + if(!NaryNodeOp::equal(node)) + return false; + auto cnode = std::dynamic_pointer_cast(node); + if(!cnode) + return false; + if(numHeads_ != cnode->numHeads_) + return false; + if(start_ != cnode->start_) + return false; + return true; + } + + const std::string type() override { return "alibi-log-mask"; } +}; + +Expr alibiLogMask(Expr mask, Expr query, Expr slopes, Expr biases, Expr shift, int numHeads, int start) { + std::vector nodes = {mask, query, slopes, biases}; + if(shift) + nodes.push_back(shift); + + return Expression(nodes, numHeads, start); +} + + +} // namespace marian diff --git a/src/layers_new/alibi.cu b/src/layers_new/alibi.cu new file mode 100644 index 000000000..07042699b --- /dev/null +++ b/src/layers_new/alibi.cu @@ -0,0 +1,251 @@ +#include "common/types.h" +#include "functional/functional.h" +#include "functional/tensor.h" +#include "tensors/gpu/cuda_helpers.h" + +#include + +namespace marian { + +namespace gpu { + +template +__global__ void gAlibi( + functional::Tensor out, + functional::Array, 4> inputs, + int numHeads, + int start, + float maskFactor) { + + constexpr size_t N = functional::Shape::size(); + functional::Array oDims; + int length = out.shape().elements(); + + const auto& mask = inputs[0]; + const auto& slopes = inputs[1]; + const auto& biases = inputs[2]; + const auto& shift = inputs[3]; + + for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) { + int index = bid + blockDim.x * blockIdx.x + threadIdx.x; + if(index < length) { + out.shape().dims(index, oDims); + + int beamIdx = oDims[0]; + int batchHeadIdx = oDims[1]; + int queryIdx = oDims[2]; + int keyIdx = oDims[3]; + + // [[maybe_unused]] because NVCC seems to have a bug telling me the variable is not referenced when it appears in an intializer; this surpresses the warning. + [[maybe_unused]] int batchIdx = batchHeadIdx / numHeads; + [[maybe_unused]] int headIdx = batchHeadIdx % numHeads; + + int keyPos = keyIdx; + int queryPos = queryIdx + start; + + float relPos = (float)keyPos - (float)queryPos; + + if(shift.data() != nullptr) + relPos -= (float)shift[{beamIdx, batchIdx, queryIdx, 0}]; + + float slope = (float)slopes[{0, headIdx, 0, 0}]; + float bias = (float)biases[{0, headIdx, 0, 0}]; + float alibi = slope * abs(relPos + bias); + + float binMask = (float)mask[{0, batchIdx, keyIdx, 0}]; + float logMask = (2.f * binMask - 1.f) * maskFactor; // range (-maskFactor, maskFactor) + + out[index] = (T)min(logMask, alibi); + } + } +} + +template +void Alibi(int numHeads, int start, Tensor out, Tensors... 
tensors) { + cudaSetDevice(out->getDeviceId().no); + int length = out->size(); + + int threads = std::min(MAX_THREADS, length); + int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); + + float largest = NumericLimits(out->type()).max; + float maskFactor = std::min(largest / 2.f, 99999999.f); // to make sure we do not overflow for fp16 + + constexpr size_t K = sizeof...(tensors); + + if(out->type() == Type::float32) { + functional::Array, K> inputs = {tensors...}; + gAlibi<<>>(out, inputs, numHeads, start, maskFactor); +#if COMPILE_FP16 + } else if(out->type() == Type::float16) { + functional::Array, K> inputs = {tensors...}; + gAlibi<<>>(out, inputs, numHeads, start, maskFactor); +#endif + } else { + ABORT("Alibi for type {} not implemented", out->type()); + } +} + +// template specialization for h/cpp separation +template void Alibi(int, int, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); + +template +__global__ void gAlibiGrad( + functional::Tensor slopesGrad, + functional::Tensor biasesGrad, + functional::Array, 5> inputs, + int numHeads, + int start) { + + const auto& mask = inputs[0]; + const auto& slopes = inputs[1]; + const auto& biases = inputs[2]; + const auto& shift = inputs[3]; + const auto& adj = inputs[4]; + + int cols = adj.size() / numHeads; + + functional::Shape fullShape = adj.shape(); + int dimBeam = fullShape[0]; + int dimBatchHead = fullShape[1]; + [[maybe_unused]] // because NVCC seems to have a bug telling me the variable is not referenced + int dimBatch = dimBatchHead / numHeads; + int dimQuery = fullShape[2]; + int dimKeys = fullShape[3]; + + using A5 = functional::Array; + using S5 = functional::ConstantShape<5>; + S5 fullShape5(A5({dimBeam, dimBatch, numHeads, dimQuery, dimKeys})); + S5 headShape5(A5({dimBeam, dimBatch, 1, dimQuery, dimKeys})); + + A5 dims5; + const int HEAD_DIM = 2; + + // compute single element derivate for slopes and biases + auto dJ_dxy = [&](int headIdx, int colIdx) -> thrust::tuple { + // get the location for one head + headShape5.dims(colIdx, dims5); + + // set the location of the current head + dims5[HEAD_DIM] = headIdx; + // get the index into the full tensor + int index = fullShape5.index(dims5); + // get the value of the full adjoint + float vadj = (float)adj[index]; + + // handle the rest + int beamIdx = dims5[0]; + int batchIdx = dims5[1]; + int queryIdx = dims5[3]; + int keyIdx = dims5[4]; + + int keyPos = keyIdx; + int queryPos = queryIdx + start; + + float relPos = (float)keyPos - (float)queryPos; + + if(shift.data() != nullptr) + relPos -= (float)shift[{beamIdx, batchIdx, queryIdx, 0}]; + + float slope = (float)slopes[{0, headIdx, 0, 0}]; + float bias = (float)biases[{0, headIdx, 0, 0}]; + float binMask = (float)mask[{0, batchIdx, keyIdx, 0}]; + + float signedAlibi = relPos + bias; + + // compute derivative of slope + float dslope = binMask * abs(signedAlibi) * vadj; + + // compute derivative of bias + float db; + if(signedAlibi > 0) + db = 1.f; + else if(signedAlibi < 0) + db = -1.f; + else + db = 0.f; + float dbias = binMask * slope * db * vadj; + + return { dslope, dbias }; + }; + + for(int bid = 0; bid < numHeads; bid += gridDim.x) { + int headIdx = bid + blockIdx.x; + if(headIdx < numHeads) { + // get and assign shared memory + extern __shared__ uint8_t _sharedBytes[]; + float* _sum = (float*)(_sharedBytes); + auto sharedSlopes = [_sum](int idx) -> float& { return _sum[2 * idx + 0]; }; // use even indices for slopes + auto sharedBiases = [_sum](int idx) -> float& { 
return _sum[2 * idx + 1]; }; // use odd indices for biases + + sharedSlopes(threadIdx.x) = 0.0; + sharedBiases(threadIdx.x) = 0.0; + for(int tid = 0; tid < cols; tid += blockDim.x) { + int colIdx = tid + threadIdx.x; + if(colIdx < cols) { + float dslopes = 0, dbiases = 0; + // get the element-wise derivative + thrust::tie(dslopes, dbiases) = dJ_dxy(headIdx, colIdx); + // accumulate by thread id + sharedSlopes(threadIdx.x) += dslopes; + sharedBiases(threadIdx.x) += dbiases; + } + } + __syncthreads(); + + // accumulate here over matrix dimensions, tree reduction + int len = blockDim.x; + while(len != 1) { + __syncthreads(); + int skip = (len + 1) >> 1; + if(threadIdx.x < (len >> 1)) { + sharedSlopes(threadIdx.x) += sharedSlopes(threadIdx.x + skip); // float + sharedBiases(threadIdx.x) += sharedBiases(threadIdx.x + skip); // float + } + len = (len + 1) >> 1; + } + __syncthreads(); + + // assign accumulated gradients here (preserving existing gradients) + slopesGrad[headIdx] += (T)sharedSlopes(0); + biasesGrad[headIdx] += (T)sharedBiases(0); + } + __syncthreads(); + } +} + +template +void TypedAlibiGrad(int numHeads, int start, Tensor slopesGrad, Tensor biasesGrad, Tensors... tensors) { + cudaSetDevice(slopesGrad->getDeviceId().no); + + constexpr size_t K = sizeof...(tensors); + functional::Array, K> inputs = {tensors...}; + + const auto& adj = inputs[K - 1]; // last one is adjoint and full broadcast shape + int total = adj.size(); + + // we will reduce over each head + int blocks = std::min(MAX_BLOCKS, numHeads); + int threads = std::min(MAX_THREADS, total / numHeads); + int shared = sizeof(float) * threads * 2; // Use float32 as accumulation type, we accumulate slopes and biases + + gAlibiGrad<<>>(slopesGrad, biasesGrad, inputs, numHeads, start); +} + +template +void AlibiGrad(int numHeads, int start, Tensor slopesGrad, Tensor biasesGrad, Tensors... tensors) { + if(slopesGrad->type() == Type::float32) { + TypedAlibiGrad(numHeads, start, slopesGrad, biasesGrad, tensors...); +#if COMPILE_FP16 + } else if(slopesGrad->type() == Type::float16) { + TypedAlibiGrad(numHeads, start, slopesGrad, biasesGrad, tensors...); +#endif + } else { + ABORT("AlibiGrad for type {} not implemented", slopesGrad->type()); + } +} + +// template specialization for h/cpp separation +template void AlibiGrad(int, int, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +} +} diff --git a/src/layers_new/alibi.h b/src/layers_new/alibi.h new file mode 100644 index 000000000..bec2da55d --- /dev/null +++ b/src/layers_new/alibi.h @@ -0,0 +1,268 @@ +#pragma once + +#include "models/states.h" +#include "layers_new/attention.h" +#include "layers_new/decoder.h" +#include "layers_new/neuralnet.h" + +namespace marian { + +// @TODO: this whole set of functions is currently somewhat akward in general, since we need to implement +// old style and new style decoder state for this to work. We decoder with the old decoder framework, but +// use the new style transformer layers. This will eventually be cleaned up. 
+ +// Specialized version of DecoderState for model that knows about algorithmic ALIBI position shifts +class AlibiDecoderState : public DecoderState { +private: + typedef std::tuple SyncCoord; + mutable std::vector syncPoints_; + int lastBeam_{1}; + +public: + AlibiDecoderState(const rnn::States& states, + Logits logProbs, + const std::vector>& encStates, + Ptr batch, + bool isBatchMajor = false); + + // override to create derived decoder states + virtual Ptr Create(const rnn::States& states, + Logits logProbs, + const std::vector>& encStates, + Ptr batch, + bool isBatchMajor = false) const override; + + // expand the decoder state + virtual Ptr next(const rnn::States& states, + Logits logProbs) const override; + + // select the hypotheses based on beam search indices + virtual Ptr select( + const std::vector& hypIndices, // [beamIndex * activeBatchSize + batchIndex] + const Words& words, + const std::vector& batchIndices, // [batchIndex] + int beamSize) const override; + + // get the alibi shift for the current state based on currently stored sync points computed while decoding + Expr getAlibiShift(Ptr graph, bool decoding) const; + + // get the alibi shift based on the batch data - this is used during training or scoring where ground truth is available + Expr getAlibiShiftFromBatch(Ptr graph) const; + +private: + + // compute the sync points for the current state based on the previous sync points and the last generated words. + // This happens one step at a time while decoding. + std::vector computeSyncPoints( + const std::vector& hypIndices, // [beamIndex * activeBatchSize + batchIndex] + const Words& words, // [beamIndex * activeBatchSize + batchIndex] + const std::vector& batchIndices, // [batchIndex] of activeBatchSize + int beamSize + ) const; +}; + +// create a new (alibi) decoder state +Ptr NewDecoderState(Ptr options, + const rnn::States& states, + Logits logProbs, + const std::vector>& encStates, + Ptr batch, + bool isBatchMajor = false); + +// convert an old-style decoder state to an (alibi) decoder state +Ptr convertDecoderState(Ptr state, + Ptr graph, + bool decoding=false); + +// efficient operator for ALIBI log mask with shift and optionally learnable parameters +Expr alibiLogMask(Expr mask, Expr query, Expr shift, Expr slopes, Expr biases, int numHeads, int start); + +namespace nn { + +class AlibiDecoderStateItem : public DecoderStateItem { +private: + Expr shift_; + +public: + AlibiDecoderStateItem(Expr state, Expr shift, size_t position) : DecoderStateItem(state, position), shift_(shift) {} + virtual ~AlibiDecoderStateItem() = default; + + Expr getShift() const { + return shift_; + } +}; + +// Experimental implementation of the ALIBI attention mechanism (via masking) (https://arxiv.org/abs/2108.12409) +class AlibiAttentionMaskProcessor : public AttentionMaskProcessor { +public: + bool trainable{false}; // if true don't use learnable parameters + + Expr slopes; // learnable per head ALIBI slopes + Expr biases; // learnable per head additive biases + + using AttentionMaskProcessor::numHeads; + + AlibiAttentionMaskProcessor(Ptr graph, + Ptr options) + : AttentionMaskProcessor(graph, options), + trainable(options->get("transformer-alibi-trainable", false)) + {} + + virtual ~AlibiAttentionMaskProcessor() = default; + +private: +// @TODO: eventually to be removed. This computes ALIBI log masks with multiple operators, replaced with more efficient version below. +// For now we keep this for documentation and experimentation puprposes. 
+// The same functionality is implemented in `alibiLogMask` above via a special operator +#if 0 + const float ALIBI_REFERENCE_HEADS{8.f}; // number of reference heads that ALIBI slopes are computed for + + // Compute the alibi mask for a given query and keys + Expr alibiMask(Expr query, int dimQuery, int dimKeys, Ptr state) const { + int start = 0; + Expr shift = nullptr; + + int dimBatch = query->shape()[-3]; + int dimBeam = query->shape()[-4]; + + if(state) { + start = (int)state->getPosition(); + auto alibiState = std::dynamic_pointer_cast(state); + shift = alibiState ? alibiState->getShift() : nullptr; // [dimBeam, dimBatch, dimQuery, 1] + } + + // Create constant tensors of reflecting the query and key positions. + // When decoding, we start with the decoding state position for the query. The key positions are just the indices for the whole sequence. + Expr queryPositions = graph()->constant({1, 1, dimQuery, 1}, inits::range((float)start, (float)(start + dimQuery))); // [1, 1, dimQuery, 1] + Expr keyPositions = graph()->constant({1, 1, 1, dimKeys}, inits::range(0.f, (float)dimKeys)); // [1, 1, 1, dimKeys] + + // Create matrix of distances between positions, rows are distances of current query position vs all key positions. + // Layout is the same as the attention distance matrix where we compute rowwise softmaxes of similarities between + // each target word and all the source words + Expr alibiBiases = keyPositions - queryPositions; // [1, 1, dimQuery, dimKeys] + + // apply the corrective shift if any sync-points are present + if(shift) { + alibiBiases = alibiBiases - shift; // [dimBeam, dimBatch, dimQuery, dimKeys] + alibiBiases = reshape(alibiBiases, {dimBeam * dimBatch, 1, dimQuery, dimKeys}); // [dimBeam * dimBatch, 1, dimQuery, dimKeys] + } + + Expr alibi = slopes * abs(alibiBiases + biases); // [(dimBeam * dimBatch)|1, numHeads, dimQuery, dimKeys] + return alibi; + }; + + // Compute the log mask for a given query and combine with the alibi mask + Expr logMask(Expr query, Expr mask, Ptr state) const { + ABORT_IF(!mask, "mask is expected!!"); + + // query: [dimBeam, dimBatch, dimQuery, dimModel] -> dimQuery == dimTrgWords + int dimBatch = query->shape()[-3]; + int dimBeam = query->shape()[-4]; + + int dimQuery = query->shape()[-2]; + int dimKeys = mask->shape()[-2]; + + // all this is bascially a copy of the normal attention mask computation, however we need to do some extra reshaping + // to make the alibi mask and the log mask broadcastable and then combine them via minimum + + // Note, this is not a typical logMask with values 0 (don't mask) and -inf (mask). Rather we use +inf (or a large value) + // and -inf and then compbine with the ALIBI mask via minimum. This way, we keep the original ALIBI values where the mask has + // +inf and have -inf for masking. + // largest useful value and making sure we do not overflow for fp16 + float maskFactor = std::min(NumericLimits(mask->value_type()).max / 2.f, 99999999.f); + // convert binary 0/1 mask to -1/1 mask and then muliply with inf, results in -inf/+inf mask. 
+ auto logMask = (2.f * mask - 1.f) * maskFactor; // [1, dimBatch, dimKeys, 1] + logMask = reshape(logMask, {dimBatch, 1, 1, dimKeys}); // [dimBatch, 1, 1, dimKeys] + + + // make logMask broadcastable when decoding with beam search + logMask = repeat(logMask, /*repeats=*/dimBeam, /*axis=*/-4); // [dimBeam|1 * dimBatch, 1, 1, dimKeys] + + // make logMask and alibiBias broadcastable, then combine + auto alibiBias = alibiMask(query, dimQuery, dimKeys, state); // [(dimBeam * dimBatch)|1, numHeads, dimQuery, dimKeys] + logMask = minimum(logMask, alibiBias); // [dimBeam|1 * dimBatch, numHeads, dimQuery, dimKeys] + + // final reshape to match attention operation + logMask = reshape(logMask, {dimBeam, dimBatch * numHeads, dimQuery, dimKeys}); // [dimBeam|1, dimBatch * numHeads, dimQuery, dimKeys] + return logMask; + } +#endif + + // Initialized the head-wise scaling factors from ALIBI (they are constant in the original paper, + // we are making them optionally learnable here) + Ptr initSlopes(bool decoder = false) const { +// This is the original implementation of ALIBI slopes for LMs. We find our slopes and biases work better for Seq2seq models +// Keep for now until we find a use, e.g. in LMs +#if 0 + std::vector mVec(numHeads); + for(size_t i = 0; i < numHeads; ++i) { + // slopes in the paper go from 1/2^1 to 1/2^8 where 8 is the reference number of heads; + // if there are more or less heads we scale back to 8 heads and interpolate. + float exponent = (float)(i + 1) * (ALIBI_REFERENCE_HEADS / (float)numHeads); + + // We multiply slopes with 2 for the symmetric mask to keep total probability mass the + // same as in the causal mask (we have two symmetric halves instead of just one causal half) + mVec[i] = -2.f / std::pow(2.f, exponent); + if(decoder) + mVec[i] *= 0.5f; + } + + return inits::fromVector(mVec); +#else + // Magic numbers, for now don't ask. + std::vector init; + if(decoder) { + return inits::fromValue(-0.1f); + } else { + init = { -2.00f, -1.00f, -0.50f, -0.25f, -0.05f, -0.05f, -0.05f, -0.05f }; + init.resize(numHeads, -0.05f); + return inits::fromVector(init); + } +#endif + } + + // Head-wise biases for ALIBI, this does not occur in the paper, ignore the magic numbers + Ptr initBiases(bool decoder=false) const { + if(decoder) { + return inits::fromValue(0.3f); + } else { + std::vector init({ 1.00f, -2.00f, 3.00f, -4.00f, 5.00f, -6.00f, 7.00f, -8.00f }); + init.resize(numHeads, 0.f); + return inits::fromVector(init); + } + } + +public: + // Apply the alibi mask to the given query and mask + virtual Expr apply(Expr query, Expr mask) const override { + return apply(query, mask, /*state=*/nullptr); + } + + // Apply the alibi mask to the given query and mask for decoder cross-attention + virtual Expr apply(Expr query, Expr mask, Ptr state) const override { + bool decoder = state != nullptr; + + if(!trainable) { + const_cast(slopes) = graph()->constant({numHeads, 1, 1}, initSlopes(decoder)); + const_cast(biases) = graph()->constant({numHeads, 1, 1}, initBiases(decoder)); + } else { + registerParameterLazy(slopes, Shape({numHeads, 1, 1}), initSlopes(decoder)); + registerParameterLazy(biases, Shape({numHeads, 1, 1}), initBiases(decoder)); + } + + Expr shift = nullptr; + int start = 0; + + if(state) { + start = (int)state->getPosition(); + auto alibiState = std::dynamic_pointer_cast(state); + shift = alibiState ? 
alibiState->getShift() : nullptr; // [dimBeam, dimBatch, dimQuery, 1] + } + + auto alibiMask = alibiLogMask(mask, query, slopes, biases, shift, numHeads, start); + return alibiMask; + } +}; + +} // namespace nn +} // namespace marian \ No newline at end of file diff --git a/src/layers_new/attention.cpp b/src/layers_new/attention.cpp new file mode 100644 index 000000000..c3758296e --- /dev/null +++ b/src/layers_new/attention.cpp @@ -0,0 +1,105 @@ +#include "graph/node_operators_unary.h" +#include "layers_new/attention.h" +#include "layers_new/alibi.h" + +namespace marian { +namespace nn { + +// Factory function to create attention layers from options +Ptr attentionFromOptions(Ptr graph, Ptr options) { + // @TODO: currently this does nothing as it isn't set anywhere + std::string selfAttentionType = options->get("transformer-encoder-attention", "default"); // currently only default + + // in the future we might add SingleHead or Additive or LSH-based as in Reformer + if(selfAttentionType == "default") { + int numHeads = options->get("transformer-heads"); + int modelDim = options->get("transformer-dim-model", options->get("dim-emb")); + + float attentionDropoutProbability = options->get("transformer-dropout-attention", 0.f); + + return New>(graph, numHeads, modelDim, modelDim, attentionDropoutProbability); + } + else { + ABORT("Unknown transformer encoder attention type: {}", selfAttentionType); + } +} + +// Factory function to create attention mask processors from options +Ptr attentionMaskProcessorFromOptions(Ptr graph, Ptr options) { + // currently only default or alibi + std::string processorType = options->get("transformer-attention-mask", "default"); + if(processorType == "default") { + return New(graph, options); + } else if(processorType == "alibi") { + return New(graph, options); + } else { + ABORT("Unknown transformer attention mask processor type: {}", processorType); + } +} + +} // namespace nn + +// specialized faster operator for log-mask computation +class LogMaskNode : public UnaryNodeOp { +private: + int numHeads_{8}; + + Shape newShape(Expr mask, int numHeads) { + // incoming mask is expected to have shape [dimBatch, 1, 1, dimKeys] + // see the reshape below in the logMask function + int dimBatch = mask->shape()[-4]; + int dimKeys = mask->shape()[-1]; + return { dimBatch, numHeads, 1, dimKeys }; + } + +public: + LogMaskNode(Expr mask, int numHeads) + : UnaryNodeOp(mask, newShape(mask, numHeads)), numHeads_(numHeads) + {} + + NodeOps forwardOps() override { + float lowest = NumericLimits(value_type()).lowest; + float maskFactor = std::max(lowest / 2.f, -99999999.f); // to make sure we do not overflow for fp16 + + using namespace functional; + // compared to the multi-operation code this does conversion and broadcasting in one step + return { NodeOp(Element(_1 = (1.f - _2) * maskFactor, val_, child(0)->val())) }; + } + + NodeOps backwardOps() override { + float lowest = NumericLimits(value_type()).lowest; + float maskFactor = std::max(lowest / 2.f, -99999999.f); // to make sure we do not overflow for fp16 + using namespace functional; + return { NodeOp(Add(-maskFactor * _1, child(0)->grad(), adj_)) }; + } + + virtual size_t hash() override { + size_t seed = NaryNodeOp::hash(); + util::hash_combine(seed, numHeads_); + return seed; + } + + virtual bool equal(Expr node) override { + if(!NaryNodeOp::equal(node)) + return false; + auto cnode = std::dynamic_pointer_cast(node); + if(!cnode) + return false; + if(numHeads_ != cnode->numHeads_) + return false; + return true; + } + + 
const std::string type() override { return "log-mask"; } +}; + +Expr logMask(Expr mask, int numHeads) { + // incoming mask has shape [1, dimBatch, dimKeys, 1] + int dimBatch = mask->shape()[-3]; + int dimKeys = mask->shape()[-2]; + mask = reshape(mask, {dimBatch, 1, 1, dimKeys}); + auto logMask = Expression(mask, numHeads); // [dimBatch, numHeads, 1, dimKeys] + return reshape(logMask, {1, dimBatch * numHeads, 1, dimKeys}); +} + +} // namespace marian diff --git a/src/layers_new/attention.h b/src/layers_new/attention.h index 4f4838e48..6ddfaad2a 100644 --- a/src/layers_new/attention.h +++ b/src/layers_new/attention.h @@ -1,9 +1,14 @@ #pragma once #include "graph/cached_expression.h" +#include "layers_new/decoder.h" #include "layers_new/neuralnet.h" namespace marian { + +// specialized operator for faster logMask computation +Expr logMask(Expr mask, int numHeads); + namespace nn { // Abstract base class for attention mechanisms @@ -41,15 +46,15 @@ class MultiplicativeAttention : public AttentionLayer { // multiplicative attention with flattened softmax float scale = 1.0f / std::sqrt((float)dimKeys); // scaling to avoid extreme values due to matrix multiplication - // query, keys and values: [beam depth * batch size, num heads, length, head dim] - auto z = bdot(query, keys, false, true, scale); // [beam depth, batch size * num heads, max tgt length, max src length] + // query, keys and values: [dimBeam, dimBatch * numHeads, (dimQuery|dimKeys=dimValues), dimHead] + auto z = bdot(query, keys, false, true, scale); // [dimBeam, dimBatch * numHeads, dimQuery, dimKeys] // mask out garbage beyond end of sequences if(logMask) z = z + logMask; // take softmax along src sequence axis (-1) - auto weights = softmax(z); // [beam depth, batch size * num heads, max tgt length, max src length] + auto weights = softmax(z); // [dimBeam, dimBatch * numHeads, dimQuery, dimKeys] #if 0 // @TODO: make this work again if(saveAttentionWeights) @@ -60,13 +65,14 @@ class MultiplicativeAttention : public AttentionLayer { weights = attentionDropout->apply(weights); // apply attention weights to values - // weights: [beam depth, batch size * num heads, max tgt length, max src length] - // values: [beam depth, batch size * num heads, src length, head dim] - auto output = bdot(weights, values); // [beam depth, batch size * num heads, max tgt length, split vector dim] + // weights: [dimBeam, dimBatch * numHeads, dimQuery, dimKeys] + // values: [dimBeam, dimBatch * numHeads, dimKeys, dimHead] + auto output = bdot(weights, values); // [dimBeam, dimBatch * numHeads, dimQuery, dimHead] return output; } }; +// Base class for multi-head attention template // Currently only used for MultiplicativeAttention class MultiHeadAttention : public AttentionType { protected: @@ -110,7 +116,7 @@ class MultiHeadAttention : public AttentionType { virtual ~MultiHeadAttention() = default; -private: +protected: // join beam and batch dimension and split model dimension in to heads and head dimension. We also need to transpose to // be able to do an efficient batched matmul. 
Expr splitHeads(Expr input) const { @@ -141,6 +147,7 @@ class MultiHeadAttention : public AttentionType { } public: + // Apply the multi-head attention to the given query, keys and values virtual Expr apply(Expr query, Expr keys, Expr values, Expr mask) const override { auto qh = splitHeads(qProj->apply(query)); @@ -156,7 +163,7 @@ class MultiHeadAttention : public AttentionType { return splitHeads(vProj->apply(values)); }, equal); - auto output = AttentionType::apply(qh, kh, vh, mask); + auto output = AttentionType::apply(qh, kh, vh, mask); output = joinHeads(output); output = oProj->apply(output); @@ -171,23 +178,51 @@ class MultiHeadAttention : public AttentionType { } }; -static Ptr attentionFromOptions(Ptr graph, Ptr options) { - // @TODO: currently this does nothing as it isn't set anywhere - std::string selfAttentionType = options->get("transformer-encoder-attention", "default"); // currently only default - - // in the future we might add SingleHead or Additive or LSH-based as in Reformer - if(selfAttentionType == "default") { - int numHeads = options->get("transformer-heads"); - int modelDim = options->get("transformer-dim-model", options->get("dim-emb")); +// Base class for attention mask processors +// Attention mask processors are used to process a given attention mask before it is used in an attention computation. +struct AttentionMaskProcessor : public LayerWithOptions, public IBinaryLayer, public IBinaryDecoderLayer { + int numHeads{1}; - float attentionDropoutProbability = options->get("transformer-dropout-attention", 0.f); + AttentionMaskProcessor(Ptr graph, + Ptr options) + : LayerWithOptions(graph, options), + numHeads(opt("transformer-heads", 1)) {} - return New>(graph, numHeads, modelDim, modelDim, attentionDropoutProbability); + virtual ~AttentionMaskProcessor() = default; + + virtual Expr apply(Expr /*query*/, Expr mask) const override { + if(!mask) + return nullptr; + + // @TODO eventually remove this branch. 
For now we keep it for documentation purposes +#if 0 + // LayerAttention expects mask in a different layout + int dimBatch = mask->shape()[-3]; + int dimKeys = mask->shape()[-2]; + + mask = reshape(mask, {dimBatch, 1, 1, dimKeys}); // [batch size, num heads broadcast=1, max length broadcast=1, max length] + + float maskFactor = std::max(NumericLimits(mask->value_type()).lowest / 2.f, -99999999.f); // to make sure we do not overflow for fp16 + auto logMask = (1 - mask) * maskFactor; + logMask = reshape(repeat(logMask, numHeads, -3), {1, dimBatch * numHeads, 1, dimKeys}); + return logMask; +#else + // shape of mask should be [1, dimBatch, dimKeys, 1] + // this does all the above work in one step + return marian::logMask(mask, numHeads); // [1, dimBatch * numHeads, 1, dimKeys] +#endif } - else { - ABORT("Unknown transformer encoder attention type: {}", selfAttentionType); + + virtual Expr apply(Expr query, Expr mask, Ptr /*state*/) const override { + return apply(query, mask); } -} +}; + +// Factory function to create attention layers from options +Ptr attentionFromOptions(Ptr graph, Ptr options); + +// Factory function to create attention mask processors from options +Ptr attentionMaskProcessorFromOptions(Ptr graph, Ptr options); } // namespace nn } // namespace marian diff --git a/src/layers_new/interface.h b/src/layers_new/interface.h index d8317d610..a938803ee 100644 --- a/src/layers_new/interface.h +++ b/src/layers_new/interface.h @@ -4,6 +4,7 @@ #include "graph/expression_graph.h" #include "graph/expression_operators.h" #include "graph/node_initializers.h" +#include "layers/loss.h" #include @@ -118,6 +119,8 @@ class Layer : public IClassName, public IClearable, public std::enable_shared_fr std::vector namedParameters_; // vector of all named parameters belonging to this specific layer (not recurisve) std::vector> namedLayers_; // vector of all named sublayers for this specific layer (not recursive) + mutable std::vector auxiliaryLosses_; + // Create a layer parameter with a full name composed of the path to this layer and localName Expr param(const std::string& localName, const Shape& shape, const Ptr& init) { std::string fullName = fmt::format("{}->{}", path(), localName); @@ -255,6 +258,7 @@ class Layer : public IClassName, public IClearable, public std::enable_shared_fr return marian::utils::join(path, "->"); } + // Return a string with information about this layer and its sub-layers if includeChildren is true. std::string layerInfo(bool includeChildren=false) const { std::stringstream ss; std::function recurse; @@ -301,11 +305,32 @@ class Layer : public IClassName, public IClearable, public std::enable_shared_fr } virtual void clear() override { + auxiliaryLosses_.clear(); for(auto& lr : namedLayers()) lr.second->clear(); } + + void addAuxiliaryLoss(const RationalLoss& loss) const { + auxiliaryLosses_.push_back(loss); + } + + // Return all auxiliary losses for this layer and its sub-layers (descending recursively into sub-layers). + std::vector getAuxiliaryLosses(bool recurse = false) const { + if(recurse) { + std::vector losses; + for(auto layer : allLayers()) + for(auto loss : layer->getAuxiliaryLosses(/*recurse=*/false)) + losses.push_back(loss); + for(auto loss : auxiliaryLosses_) + losses.push_back(loss); + return losses; + } else { + return auxiliaryLosses_; + } + } }; +// Layer that holds a reference to a set of options. 
This is used to allow layers to access options class LayerWithOptions : public Layer { protected: Ptr options_; diff --git a/src/layers_new/neuralnet.h b/src/layers_new/neuralnet.h index b81728c77..923838aa0 100644 --- a/src/layers_new/neuralnet.h +++ b/src/layers_new/neuralnet.h @@ -8,23 +8,6 @@ namespace nn { static inline Expr swapTimeBatch(Expr input) { return swapAxes(atleast_4d(input), -2, -3); } - // @TODO: this is an odd function to be here, this should rather be handled somewhere globally? - // convert multiplicative 1/0 mask to additive 0/-inf log mask, and transpose to match result of bdot() op in Attention() -static inline Expr transposedLogMask(Expr mask, int dimHeads) { - if(!mask) - return nullptr; - - // LayerAttention expects mask in a different layout - int dimBatch = mask->shape()[-3]; - int dimSrcWords = mask->shape()[-2]; - mask = reshape(mask, {dimBatch, 1, 1, dimSrcWords}); // [batch size, num heads broadcast=1, max length broadcast=1, max length] - - float maskFactor = std::max(NumericLimits(mask->value_type()).lowest / 2.f, -99999999.f); // to make sure we do not overflow for fp16 - auto logMask = (1 - mask) * maskFactor; - logMask = reshape(repeat(logMask, dimHeads, -3), {1, dimBatch * dimHeads, 1, dimSrcWords}); - return logMask; -} - /** * A generic Activation function layer. Any unary Marian operator or function accepted by * `std::function` can be turned into an activation function like this: diff --git a/src/layers_new/transformer.h b/src/layers_new/transformer.h index ade61a78e..ccce35d13 100644 --- a/src/layers_new/transformer.h +++ b/src/layers_new/transformer.h @@ -43,6 +43,8 @@ struct TransformerPrePostProcessor final : public Layer, public IBinaryLayer { } } } + + virtual ~TransformerPrePostProcessor() = default; Expr apply(Expr input, Expr previous = nullptr) const override { Expr output = input; @@ -84,7 +86,6 @@ class TransformerSelfAttentionBlock final : public LayerWithOptions, public IBin opt("transformer-dropout", 0.f)); registerLayer(preprocessor); - // @TODO: factory to support different attention flavors? 
selfAttention = attentionFromOptions(graph, options); registerLayer(selfAttention); @@ -95,10 +96,10 @@ class TransformerSelfAttentionBlock final : public LayerWithOptions, public IBin registerLayer(postprocessor); } - Expr apply(Expr input, Expr mask = nullptr) const override { - auto output = preprocessor->apply(input); // optional preprocessing - output = selfAttention->apply(output, output, output, mask); // self attention, @TODO: make this a IBinaryLayer rather than IQuaternaryLayer - output = postprocessor->apply(output, input); // optional postprocessing, optional skip connection + Expr apply(Expr input, Expr logMask = nullptr) const override { + auto output = preprocessor->apply(input); // optional preprocessing + output = selfAttention->apply(output, output, output, logMask); // self attention, @TODO: make this a IBinaryLayer rather than IQuaternaryLayer + output = postprocessor->apply(output, input); // optional postprocessing, optional skip connection return output; } }; @@ -214,6 +215,7 @@ struct TransformerEncoderLayer final : public LayerWithOptions, public IBinaryLa */ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { Ptr positionEmbedding; + Ptr maskProcessor; Ptr preprocessor; Ptr layers; Ptr postprocessor; @@ -222,8 +224,13 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { Ptr options) : LayerWithOptions(graph, options) { - positionEmbedding = positionEmbeddingFromOptions(graph, options, /*positionAxis=*/-2); - registerLayer(positionEmbedding); + if(!opt("transformer-disable-position-embeddings", false)) { + positionEmbedding = positionEmbeddingFromOptions(graph, options, /*positionAxis=*/-2); + registerLayer(positionEmbedding); + } + + maskProcessor = attentionMaskProcessorFromOptions(graph, options); + registerLayer(maskProcessor); preprocessor = New( graph, @@ -271,24 +278,26 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { // decoder state, Frank added information about batchMajor/timeMajor orientation. If we // do that everywhere we can detect inconsistencies automatically. // reorganize batch and timestep - auto output = swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] - if(mask) { - mask = swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] - mask = transposedLogMask(mask, opt("transformer-heads")); - } + auto output = swapTimeBatch(input); // [1, dimBatch, dimSrcWords, dimModel] + if(mask) + mask = swapTimeBatch(mask); // [1, dimBatch, dimSrcWords, 1] // apply positional embeddings to contextual input - output = positionEmbedding->apply(output); + if(positionEmbedding) + output = positionEmbedding->apply(output); + else + output = std::sqrt((float)output->shape()[-1]) * output; // handle for skip connection at top auto prevOutput = output; // apply dropout or layer-norm to embeddings if required output = preprocessor->apply(output); + auto logMask = maskProcessor->apply(output, mask); // traverse the layers, use the same mask for each for(auto layer : *layers) - output = layer->apply(output, mask); + output = layer->apply(output, logMask); // apply final postprocessor if required, e.g. 
final layer-norm for pre-norm or final skip connection output = postprocessor->apply(output, prevOutput); @@ -327,7 +336,7 @@ class TransformerCrossAttentionBlock final : public LayerWithOptions, public ITe opt("transformer-preprocess", ""), opt("transformer-dropout", 0.f)); registerLayer(preprocessor); - + // @TODO: factory to support different attention flavors? crossAttention = attentionFromOptions(graph, options); registerLayer(crossAttention); @@ -339,16 +348,14 @@ class TransformerCrossAttentionBlock final : public LayerWithOptions, public ITe registerLayer(postprocessor); } - Expr apply(Expr input, Expr context, Expr contextMask = nullptr) const override { - auto output = preprocessor->apply(input); // optional preprocessing - output = crossAttention->apply(output, context, context, contextMask); // cross attention, @TODO: make this a ITernaryLayer rather than IQuaternaryLayer - output = postprocessor->apply(output, input); // optional postprocessing, optional skip connection + Expr apply(Expr input, Expr context, Expr logMask) const override { + auto output = preprocessor->apply(input); // optional preprocessing + output = crossAttention->apply(output, context, context, logMask); // cross attention, @TODO: make this a ITernaryLayer rather than IQuaternaryLayer + output = postprocessor->apply(output, input); // optional postprocessing, optional skip connection return output; } }; -#if 1 - class TransformerAutoRegressiveBlock : public LayerWithOptions, public IBinaryDecoderLayer { public: TransformerAutoRegressiveBlock(Ptr graph, @@ -435,9 +442,9 @@ struct TransformerDecoderLayer final : public LayerWithOptions, public IQuaterna registerLayer(filterBlock); } - Expr apply(Expr input, Expr inputMask, Expr context, Expr contextMask, Ptr state) const override { + Expr apply(Expr input, Expr inputMask, Expr context, Expr logMask, Ptr state) const override { Expr output = autoRegressiveBlock->apply(input, inputMask, state); - output = crossAttentionBlock->apply(output, context, contextMask); + output = crossAttentionBlock->apply(output, context, logMask); output = filterBlock->apply(output); checkpoint(output); // A full transformer block is a good point for gradient checkpointing (currently manual) @@ -453,6 +460,7 @@ struct TransformerDecoderLayer final : public LayerWithOptions, public IQuaterna */ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDecoderLayer { Ptr positionEmbedding; + Ptr maskProcessor; Ptr preprocessor; Ptr layers; Ptr postprocessor; @@ -461,8 +469,13 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec Ptr options) : LayerWithOptions(graph, options) { - positionEmbedding = positionEmbeddingFromOptions(graph, options, /*positionAxis=*/-2); - registerLayer(positionEmbedding); + if(!opt("transformer-disable-position-embeddings", false)) { + positionEmbedding = positionEmbeddingFromOptions(graph, options, /*positionAxis=*/-2); + registerLayer(positionEmbedding); + } + + maskProcessor = attentionMaskProcessorFromOptions(graph, options); + registerLayer(maskProcessor); preprocessor = New( graph, @@ -527,22 +540,28 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec // dimensions. This order is more natural for the transformer, but more difficult to handle // during beam search or when using RNNs. Hence the input/output transpositions here. 
Expr output = swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] - context = swapTimeBatch(context); + context = swapTimeBatch(context); // [dimBeam=1, dimBatch, dimSrcWords, dimModel] + + // set current target token position during decoding or training. At training + // this should be 0. During translation the current length of the translation. + // Used for position embeddings and creating new decoder states. + int startPos = (int)state->getPosition(); // @TODO: write function prepareMasks(); // @TODO: create triangle mask here and combine with inputMask LOG_ONCE(info, "Don't forget the triangle mask if required!"); - if(inputMask) { - inputMask = swapTimeBatch(inputMask); // [beam depth=1, batch size, max length, vector dim=1] - } - - if(contextMask) { - contextMask = swapTimeBatch(contextMask); // [beam depth=1, max length, batch size, vector dim=1] - contextMask = transposedLogMask(contextMask, opt("transformer-heads")); // [beam broadcast=1, batch size * num heads, max length broadcast=1, max length] - } - // apply positional embeddings to contextual input @TODO: remove need for conversion to int - output = positionEmbedding->apply(output, (int)state->getPosition()); + if(inputMask) + inputMask = swapTimeBatch(inputMask); // [dimBeam=1, dimBatch, dimTrgWords, dimModel=1] + + if(contextMask) + contextMask = swapTimeBatch(contextMask); // [dimBeam=1, dimBatch, dimSrcWords, dimModel=1] + + // apply positional embeddings to contextual input + if(positionEmbedding) + output = positionEmbedding->apply(output, startPos); + else + output = std::sqrt((float)output->shape()[-1]) * output; // handle for skip connection at top auto prevOutput = output; @@ -552,9 +571,12 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec // get an iterator to per-layer states auto layerStateIt = state->as()->begin(); + auto logMask = maskProcessor->apply(output, contextMask, *layerStateIt); + // traverse the layers, use the same mask for each - for(auto layer : *layers) - output = layer->as()->apply(output, inputMask, context, contextMask, /*in/out=*/*layerStateIt++); + for(auto layer : *layers) { + output = layer->as()->apply(output, inputMask, context, logMask, /*in/out=*/*layerStateIt++); + } // apply final postprocessor if requred, e.g. 
final layer-norm for pre-norm or final skip connection output = postprocessor->apply(output, prevOutput); @@ -570,7 +592,6 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec return output; } }; -#endif } // namespace nn } // namespace marian diff --git a/src/models/bleurt.h b/src/models/bleurt.h index 131b675a7..baeb704a5 100644 --- a/src/models/bleurt.h +++ b/src/models/bleurt.h @@ -68,10 +68,7 @@ struct BleurtEncoder final : public nn::TransformerEncoder { Expr apply(Expr input, Expr mask) const override { auto output = marian::nn::swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] - mask = marian::nn::swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] - auto binMask = mask; - mask = marian::nn::transposedLogMask(mask, opt("transformer-heads")); // apply positional embeddings to contextual input output = positionEmbedding->apply(output); diff --git a/src/models/comet_qe.h b/src/models/comet_qe.h index d0305002b..868f7d6e9 100644 --- a/src/models/comet_qe.h +++ b/src/models/comet_qe.h @@ -8,7 +8,41 @@ namespace marian { namespace models { -struct CometEncoder final : public nn::TransformerEncoder { +class CometEncoder final : public nn::TransformerEncoder { +private: + // This seems to be a mix of LayerNorm and BatchNorm and present in the original Unbabel code. + // It norms over time, not batch, also should be optimized. Seems safe to disable for custom + // models trained by us, but required when doing inference with Unbabel models. + Expr cometNorm(Expr x, Expr binaryMask) const { + Expr output; + if(opt("comet-mix-norm", false)) { + registerParameterLazy(gamma, Shape({ 1 }), inits::ones()); + int dimModel = x->shape()[-1]; + + // Convert type to fp32 for better accumulation. This is a no-op if things are already fp32. + Type origType = x->value_type(); + x = marian::cast(x, Type::float32); + binaryMask = marian::cast(binaryMask, Type::float32); + + x = x * binaryMask; + auto denom = (float)dimModel * sum(binaryMask, -2); + auto mu = sum(sum(x, -1), -2) / denom; // sum over model and time + auto sigma = sum(sum(square(x - mu), -1), -2) / denom; + + auto normed = (x - mu) / sqrt(sigma + 1e-12f); + output = marian::cast(gamma, Type::float32) * sum(normed * binaryMask, -2) / sum(binaryMask, -2); + + // Undo conversion to fp32 if not originally fp32 (most likely fp16 then) + output = marian::cast(output, origType); + } else { + // average over time dimension + output = sum(x * binaryMask, -2) / sum(binaryMask, -2); + } + + return output; + }; + +public: Expr weights; Expr gamma; @@ -19,57 +53,24 @@ struct CometEncoder final : public nn::TransformerEncoder { Expr apply(Expr input, Expr mask) const override { auto output = marian::nn::swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] - mask = marian::nn::swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] - auto binMask = mask; - mask = marian::nn::transposedLogMask(mask, opt("transformer-heads")); - + auto binaryMask = marian::nn::swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] + // apply positional embeddings to contextual input output = positionEmbedding->apply(output); // apply dropout or layer-norm to embeddings if required output = preprocessor->apply(output); - - // This seems to be a mix of LayerNorm and BatchNorm and present in the original Unbabel code. - // It norms over time, not batch, also should be optimized. 
Seems safe to disable for custom - // models trained by us, but required when doing inference with Unbabel models. - auto cometNorm = [&, this](Expr x, Expr binMask) { - Expr output; - if(opt("comet-mix-norm", false)) { - registerParameterLazy(gamma, Shape({ 1 }), inits::ones()); - int dimModel = x->shape()[-1]; - - // Convert type to fp32 for better accumulation. This is a no-op if things are already fp32. - Type origType = x->value_type(); - x = marian::cast(x, Type::float32); - binMask = marian::cast(binMask, Type::float32); - - x = x * binMask; - auto denom = (float)dimModel * sum(binMask, -2); - auto mu = sum(sum(x, -1), -2) / denom; // sum over model and time - auto sigma = sum(sum(square(x - mu), -1), -2) / denom; - - auto normed = (x - mu) / sqrt(sigma + 1e-12f); - output = marian::cast(gamma, Type::float32) * sum(normed * binMask, -2) / sum(binMask, -2); - - // Undo conversion to fp32 if not originally fp32 (most likely fp16 then) - output = marian::cast(output, origType); - } else { - // average over time dimension - output = sum(x * binMask, -2) / sum(binMask, -2); - } - - return output; - }; + auto logMask = maskProcessor->apply(output, binaryMask); // [beam depth=1, batch size * numHeads, max length, vector dim=1] std::vector pooler; if(opt("comet-mix", false)) - pooler.push_back(cometNorm(output, binMask)); + pooler.push_back(cometNorm(output, binaryMask)); // traverse the layers, use the same mask for each for(auto layer : *layers) { - output = layer->apply(output, mask); + output = layer->apply(output, logMask); if(opt("comet-mix", false)) - pooler.push_back(cometNorm(output, binMask)); // [ batch, time, modelDim ] + pooler.push_back(cometNorm(output, binaryMask)); // [ batch, time, modelDim ] } if(opt("comet-mix", false)) { @@ -78,7 +79,7 @@ struct CometEncoder final : public nn::TransformerEncoder { output = sum(weightsNorm * concatenate(pooler, /*axis=*/-2), -2); // [batch, 1, modelDim] } else { // just use last layer, average over time dim - output = cometNorm(output, binMask); // [batch, 1, modelDim] + output = cometNorm(output, binaryMask); // [batch, 1, modelDim] } return output; diff --git a/src/models/decoder.h b/src/models/decoder.h index 5ddaa9643..1646c44b2 100644 --- a/src/models/decoder.h +++ b/src/models/decoder.h @@ -70,6 +70,7 @@ class DecoderBase : public EncoderDecoderLayerBase { else selectedEmbs = embeddingLayer->apply(words, {dimBeam, 1, dimBatch, dimEmb}); state->setTargetHistoryEmbeddings(selectedEmbs); + state->setTargetWords(words); } virtual const std::vector getAlignments(int /*i*/ = 0) { return {}; }; // [tgt index][beam depth, max src length, batch size, 1] diff --git a/src/models/encoder_decoder.cpp b/src/models/encoder_decoder.cpp index f70353a64..971726271 100644 --- a/src/models/encoder_decoder.cpp +++ b/src/models/encoder_decoder.cpp @@ -3,6 +3,8 @@ #include "common/filesystem.h" #include "common/version.h" +#include "models/transformer_new.h" + namespace marian { EncoderDecoder::EncoderDecoder(Ptr graph, Ptr options) @@ -71,6 +73,12 @@ EncoderDecoder::EncoderDecoder(Ptr graph, Ptr options) modelFeatures_.insert("transformer-no-bias"); modelFeatures_.insert("transformer-no-affine"); + + modelFeatures_.insert("transformer-disable-position-embeddings"); + modelFeatures_.insert("transformer-attention-mask"); + modelFeatures_.insert("transformer-alibi-shift"); + modelFeatures_.insert("transformer-alibi-trainable"); + modelFeatures_.insert("separator-symbol"); } std::vector>& EncoderDecoder::getEncoders() { @@ -183,10 +191,22 @@ void 
EncoderDecoder::save(Ptr graph, void EncoderDecoder::clear(Ptr graph) { graph->clear(); - for(auto& enc : encoders_) + for(auto& enc : encoders_) { enc->clear(); - for(auto& dec : decoders_) + // this cast looks redundant, but TransformerBatchEncoder has two base clases with clear() + // so we need to cast here and call explicitly. Should be removed once we switch to the new + // layer framework everywhere. + auto encNew = std::dynamic_pointer_cast(enc); + if(encNew) + encNew->clear(); + } + for(auto& dec : decoders_) { dec->clear(); + // Same as above, but TransformerBatchDecoder + auto decNew = std::dynamic_pointer_cast(dec); + if(decNew) + decNew->clear(); + } } Ptr EncoderDecoder::startState(Ptr graph, @@ -210,11 +230,12 @@ Ptr EncoderDecoder::step(Ptr graph, const Words& words, // [beamIndex * activeBatchSize + batchIndex] const std::vector& batchIndices, // [batchIndex] int beamSize) { + // create updated state that reflects reordering and dropping of hypotheses - state = hypIndices.empty() ? state : state->select(hypIndices, batchIndices, beamSize); + state = hypIndices.empty() ? state : state->select(hypIndices, words, batchIndices, beamSize); // Fill state with embeddings based on last prediction - decoders_[0]->embeddingsFromPrediction(graph, state, words, (int) batchIndices.size(), beamSize); + decoders_[0]->embeddingsFromPrediction(graph, state, words, (int)batchIndices.size(), beamSize); auto nextState = decoders_[0]->step(graph, state); return nextState; diff --git a/src/models/encoder_decoder.h b/src/models/encoder_decoder.h index 4ccc6a93f..ef810ed8b 100644 --- a/src/models/encoder_decoder.h +++ b/src/models/encoder_decoder.h @@ -62,7 +62,7 @@ class IEncoderDecoder : public models::IModel { virtual Ptr getShortlist() = 0; - virtual data::SoftAlignment getAlignment() = 0; + virtual data::SoftAlignment getAlignment() = 0; }; class EncoderDecoder : public IEncoderDecoder, public LayerBase { diff --git a/src/models/states.h b/src/models/states.h index a4be3795e..ec5c3aed8 100644 --- a/src/models/states.h +++ b/src/models/states.h @@ -56,6 +56,24 @@ class DecoderState { Ptr batch, bool isBatchMajor = false) : states_(states), logProbs_(logProbs), encStates_(encStates), batch_(batch), isBatchMajor_(isBatchMajor) {} + + // override to create derived decoder states + virtual Ptr Create(const rnn::States& states, + Logits logProbs, + const std::vector>& encStates, + Ptr batch, + bool isBatchMajor = false) const { + return New(states, logProbs, encStates, batch, isBatchMajor); + } + + // override to create derived decoder states + virtual Ptr next(const rnn::States& states, + Logits logProbs) const { + auto state = Create(states, logProbs, encStates_, batch_, isBatchMajor_); + state->setPosition(getPosition() + 1); + return state; + } + virtual ~DecoderState() {} // @TODO: Do we need all these to be virtual? @@ -68,6 +86,7 @@ class DecoderState { // @TODO: should this be a constructor? Then derived classes can call this without the New<> in the loop virtual Ptr select( const std::vector& hypIndices, // [beamIndex * activeBatchSize + batchIndex] + const Words& /*words*/, const std::vector& batchIndices, // [batchIndex] int beamSize) const { std::vector> newEncStates; @@ -77,11 +96,11 @@ class DecoderState { newEncStates.push_back(es->getContext()->shape()[-2] == batchIndices.size() ? 
es : es->select(batchIndices)); // hypindices matches batchIndices in terms of batch dimension, so we only need hypIndices - auto selectedState = New(states_.select(hypIndices, beamSize, /*isBatchMajor=*/isBatchMajor_), - logProbs_, - newEncStates, - batch_, - isBatchMajor_); + auto selectedState = Create(states_.select(hypIndices, beamSize, /*isBatchMajor=*/isBatchMajor_), + logProbs_, + newEncStates, + batch_, + isBatchMajor_); // Set positon of new state based on the target token position of current state selectedState->setPosition(getPosition()); @@ -97,7 +116,7 @@ class DecoderState { virtual const Words& getTargetWords() const { return targetWords_; }; virtual void setTargetWords(const Words& targetWords) { targetWords_ = targetWords; } - + virtual Expr getTargetMask() const { return targetMask_; }; virtual void setTargetMask(Expr targetMask) { targetMask_ = targetMask; } diff --git a/src/models/transformer.h b/src/models/transformer.h index 0fa52ff82..1befc726f 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -118,6 +118,9 @@ class Transformer : public EncoderOrDecoderBase { } virtual Expr addSpecialEmbeddings(Expr input, int start = 0, Ptr /*batch*/ = nullptr) const { + if(opt("transformer-disable-position-embeddings", false)) + return input; + bool trainPosEmbeddings = opt("transformer-train-positions", false); return addPositionalEmbeddings(input, start, trainPosEmbeddings); } diff --git a/src/models/transformer_new.h b/src/models/transformer_new.h index cfc3a6b14..61de01db2 100644 --- a/src/models/transformer_new.h +++ b/src/models/transformer_new.h @@ -1,10 +1,12 @@ #pragma once #include "layers_new/transformer.h" +#include "layers_new/alibi.h" #include "models/encoder.h" #include "models/decoder.h" #include "models/states.h" +#include "models/model_base.h" #include "layers/constructors.h" namespace marian { @@ -129,11 +131,11 @@ class TransformerBatchDecoder : public nn::LayerWithOptions, rnn::States startStates(DecoderBase::opt("dec-depth"), {start, start}); // don't use TransformerState for RNN layers - return New(startStates, Logits(), encStates, batch, /*isBatchMajor=*/false); + return NewDecoderState(DecoderBase::options_, startStates, Logits(), encStates, batch, /*isBatchMajor=*/false); } else { rnn::States startStates; - return New(startStates, Logits(), encStates, batch, /*isBatchMajor=*/true); + return NewDecoderState(DecoderBase::options_, startStates, Logits(), encStates, batch, /*isBatchMajor=*/true); } } @@ -148,6 +150,8 @@ class TransformerBatchDecoder : public nn::LayerWithOptions, } Ptr step(Ptr state) { + using db = DecoderBase; + auto embeddings = state->getTargetHistoryEmbeddings(); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vector dim] auto decoderMask = state->getTargetMask(); // [max length, batch size, 1] --this is a hypothesis @@ -155,13 +159,11 @@ class TransformerBatchDecoder : public nn::LayerWithOptions, auto encoderContext = state->getEncoderStates()[0]->getContext(); // encoder output auto encoderMask = state->getEncoderStates()[0]->getMask(); // note: may differ from Encoder self-attention mask in that additional positions are banned for cross-attention - - // Convert old style decoder state to new decoder state - size_t position = state->getPosition(); - auto nnState = New(position); - for(auto& layerState : state->getStates()) - nnState->as()->append(New(layerState.cell, position)); + // Convert old style decoder state to new decoder state + using namespace models; + usage modelUsage = 
(usage)db::opt("usage", (int)usage::translation); + auto nnState = convertDecoderState(state, graph(), /*decoding=*/modelUsage == usage::translation); auto decoderContext = decoder->apply(embeddings, decoderMask, encoderContext, encoderMask, nnState); // final feed-forward layer (output) @@ -177,10 +179,7 @@ class TransformerBatchDecoder : public nn::LayerWithOptions, decoderStates.push_back(rnn::State({ cellState, cellState })); } // return unnormalized(!) probabilities - auto nextState = New(decoderStates, logits, state->getEncoderStates(), state->getBatch(), state->isBatchMajor()); - nextState->setPosition(state->getPosition() + 1); - - return nextState; + return state->next(decoderStates, logits); } // helper function for guided alignment diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index 6a075e9c5..9d5c8166d 100755 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -55,6 +55,8 @@ void CopyCastFrom(Tensor out, const T* in, int length) { CopyCastTo(out->data(), in, length); } else if(out->type() == Type::float16) { CopyCastTo(out->data(), in, length); + } else if(out->type() == Type::uint32) { + CopyCastTo(out->data(), in, length); } else { ABORT("CopyCastTo to type {} not implemented", out->type()); } diff --git a/src/tensors/gpu/add.inc b/src/tensors/gpu/add.inc index ed1e72553..c6953c144 100755 --- a/src/tensors/gpu/add.inc +++ b/src/tensors/gpu/add.inc @@ -41,3 +41,6 @@ template void marian::gpu::Add, marian::functional::UnaryFunctor > > >, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::Tensor, marian::Tensor, marian::Tensor); template void marian::gpu::Add >, marian::functional::Capture>, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> > > >, marian::functional::Assignee<2> >, IntrusivePtr, IntrusivePtr >(marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> > > >, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); template void marian::gpu::Add, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, IntrusivePtr, IntrusivePtr >(marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::gpu::Add >, marian::functional::Assignee<2> >, IntrusivePtr, IntrusivePtr >(marian::functional::BinaryFunctor >, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::gpu::Add > >, marian::functional::Assignee<2> >, IntrusivePtr, IntrusivePtr >(marian::functional::BinaryFunctor > >, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::gpu::Add >, marian::functional::BinaryFunctor > > >, marian::functional::Assignee<2> >, 
marian::functional::BinaryFunctor, marian::functional::Assignee<4> > >, IntrusivePtr, IntrusivePtr, IntrusivePtr, IntrusivePtr >(marian::functional::BinaryFunctor >, marian::functional::BinaryFunctor > > >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<4> > >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr, IntrusivePtr, IntrusivePtr); \ No newline at end of file diff --git a/src/tensors/gpu/add_all.cu b/src/tensors/gpu/add_all.cu index bc78709a3..3f14153df 100644 --- a/src/tensors/gpu/add_all.cu +++ b/src/tensors/gpu/add_all.cu @@ -111,6 +111,21 @@ void AggregateAll(Ptr allocator, AggregateAllVar(allocator, functor, aggInit, aggFunctor, scale, out, in1, in2, in3); } +template +void AggregateAll(Ptr allocator, + Functor functor, + AccType aggInit, + AggFunctor aggFunctor, + AccType scale, + marian::Tensor out, + const marian::Tensor in1, + const marian::Tensor in2, + const marian::Tensor in3, + const marian::Tensor in4) { + AggregateAllVar(allocator, functor, aggInit, aggFunctor, scale, out, in1, in2, in3, in4); +} + + #include "tensors/gpu/add_all.inc" } \ No newline at end of file diff --git a/src/tensors/gpu/add_all.h b/src/tensors/gpu/add_all.h index 2e37fd497..ce8d9df20 100644 --- a/src/tensors/gpu/add_all.h +++ b/src/tensors/gpu/add_all.h @@ -13,7 +13,7 @@ namespace marian { // These function declarations are repeated as template specialization with variadic template arguments does not seem to work. -// Here I am just creating version for 1, 2, and 3 arguments. To be extended if required. +// Here I am just creating version for 1, 2, 3 and 4 arguments. To be extended if required. template void AggregateAll(Ptr allocator, Functor functor, @@ -44,6 +44,18 @@ void AggregateAll(Ptr allocator, const Tensor in2, const Tensor in3); +template +void AggregateAll(Ptr allocator, + Functor functor, + AccType aggInit, + AggFunctor aggFunctor, + AccType scale, + Tensor out, + const Tensor in1, + const Tensor in2, + const Tensor in3, + const Tensor in4); + // Aggregates all values into a single tensor and returns the value of that tensor as a float // This does a GPU to CPU memory copy via TensorBase::scalar(). 
// Used currently only for L2Norm computation diff --git a/src/tensors/gpu/add_all.inc b/src/tensors/gpu/add_all.inc index 41da1351b..8987268ba 100644 --- a/src/tensors/gpu/add_all.inc +++ b/src/tensors/gpu/add_all.inc @@ -43,6 +43,9 @@ template void marian::AggregateAll, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr); template void marian::AggregateAll >, marian::functional::Capture>, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> > > >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> > > >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); template void marian::AggregateAll, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::AggregateAll > >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor > >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::AggregateAll >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::AggregateAll >, marian::functional::BinaryFunctor > > >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<4> > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor >, marian::functional::BinaryFunctor > > >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<4> > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr, IntrusivePtr, IntrusivePtr); #if COMPILE_FP16 template void AggregateAll<__half, float, BinaryFunctor>, Assignee<2>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor>, 
Assignee<2>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); @@ -87,4 +90,7 @@ template void marian::AggregateAll<__half, float, marian::functional::BinaryFunc template void marian::AggregateAll<__half, float, marian::functional::Assignee<1>, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr); template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> > > >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> > > >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor > >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor > >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor >, marian::functional::BinaryFunctor > > >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<4> > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor >, marian::functional::BinaryFunctor > > >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<4> > >, float, marian::functional::BinaryFunctor, 
marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr, IntrusivePtr, IntrusivePtr); #endif diff --git a/src/tensors/gpu/element.inc b/src/tensors/gpu/element.inc index 27cc641da..da957a9ce 100755 --- a/src/tensors/gpu/element.inc +++ b/src/tensors/gpu/element.inc @@ -76,6 +76,10 @@ template void marian::gpu::Element, marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::Capture> >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::Capture> >, IntrusivePtr, IntrusivePtr); template void marian::gpu::Element, marian::functional::BinaryFunctor >, marian::functional::Capture> >, marian::functional::BinaryFunctor >, marian::functional::Capture> >, marian::functional::Capture> > >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor >, marian::functional::Capture> >, marian::functional::BinaryFunctor >, marian::functional::Capture> >, marian::functional::Capture> > >, IntrusivePtr, IntrusivePtr); template void marian::gpu::Element, marian::functional::BinaryFunctor >, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > > > >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor >, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > > > >, IntrusivePtr, IntrusivePtr); +template void marian::gpu::Element, marian::functional::UnaryFunctor > >>(marian::functional::Assign, marian::functional::UnaryFunctor > >, IntrusivePtr); +template void marian::gpu::Element, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, IntrusivePtr, IntrusivePtr); +template void marian::gpu::Element, marian::functional::BinaryFunctor >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor > >, marian::functional::Assignee<3> > > >, IntrusivePtr, IntrusivePtr, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor > >, marian::functional::Assignee<3> > > >, IntrusivePtr, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::gpu::Element, marian::functional::BinaryFunctor >, marian::functional::Capture> >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor >, marian::functional::Capture> >, IntrusivePtr, IntrusivePtr); // How to add new specializations: // When you use a new specialization, it will cause a link error of this form (example): @@ -84,6 +88,3 @@ template void marian::gpu::Element' with 'marian::Tensor' - -template void marian::gpu::Element, marian::functional::UnaryFunctor > >>(marian::functional::Assign, marian::functional::UnaryFunctor > >, IntrusivePtr); -template void marian::gpu::Element, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, IntrusivePtr, IntrusivePtr); diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index 6dbded2a4..4662ab041 100644 --- a/src/tensors/gpu/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -3217,8 +3217,7 @@ __global__ void gHighwayBackward(T* out1, T sigma = functional::Ops::sigmoid(t[index]); out1[index] = sigma * adj[index]; out2[index] = ((T)1.f - sigma) * adj[index]; - 
outt[index] - = sigma * ((T)1.f - sigma) * (in1[index] - in2[index]) * adj[index]; + outt[index] = sigma * ((T)1.f - sigma) * (in1[index] - in2[index]) * adj[index]; } } } diff --git a/src/translator/beam_search.cpp b/src/translator/beam_search.cpp index 901eddc5c..1eeef913a 100644 --- a/src/translator/beam_search.cpp +++ b/src/translator/beam_search.cpp @@ -358,7 +358,7 @@ Histories BeamSearch::search(Ptr graph, Ptr bool anyCanExpand = false; // stays false if all hyps are invalid factor expansions if(t == 0 && factorGroup == 0) { // no scores yet - prevPathScores = graph->constant({1, 1, 1, 1}, inits::fromValue(0)); + prevPathScores = graph->constant({1, 1, 1, 1}, inits::fromValue(0), Type::float32); anyCanExpand = true; // at the beginning all batch entries are used @@ -407,7 +407,7 @@ Histories BeamSearch::search(Ptr graph, Ptr } if(factorGroup == 0) currentDimBatch = (IndexType) batchIndices.size(); // keep batch size constant for all factor groups in a time step - prevPathScores = graph->constant({(int)maxBeamSize, 1, (int)currentDimBatch, 1}, inits::fromVector(prevScores)); + prevPathScores = graph->constant({(int)maxBeamSize, 1, (int)currentDimBatch, 1}, inits::fromVector(prevScores), Type::float32); } if (!anyCanExpand) // all words cannot expand this factor: skip continue; @@ -462,7 +462,7 @@ Histories BeamSearch::search(Ptr graph, Ptr } // make beams continuous - auto expandedPathScores = prevPathScores + stepScores; // will become [maxBeamSize, 1, currDimBatch, dimVocab] + auto expandedPathScores = prevPathScores + cast(stepScores, Type::float32); // will become [maxBeamSize, 1, currDimBatch, dimVocab] expandedPathScores = swapAxes(expandedPathScores, 0, 2); // -> [currentDimBatch, 1, maxBeamSize, dimVocab] // perform NN computation diff --git a/src/translator/translator.h b/src/translator/translator.h index f0fc0b908..f1fd04d3f 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -49,7 +49,7 @@ class Translate : public ModelTask { options_->set("inference", true, "shuffle", "none"); - corpus_ = New(options_, true); + corpus_ = New(options_, /*translate=*/true); auto vocabs = options_->get>("vocabs"); trgVocab_ = New(options_, vocabs.size() - 1); From b61755b656899edd21ddbf9a5b416086c3a5a82a Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 4 Dec 2023 18:15:53 +0000 Subject: [PATCH 05/26] Merged PR 31919: Nucleus and epsilon sampling This adds nucleus and epsilon sampling to the output-sampling options. * This required the implementation of a sorting algorithm, tested thrust and CUB. * Implementation of cumsum and logcumsumexp (no gradient for now) operators. * Various minor improvements. 
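Illustrative sketch only (not code from this change): with the new sort and cumsum
operators, nucleus (top-p) filtering of an output distribution can be written roughly
as below. The comparison helper lt() and the exact call sites are assumptions about
the surrounding API; the actual logic lives in src/translator/sampling.h and may differ.

    #include "graph/expression_operators.h"  // sort(), cumsum(), softmax(), sum(), ...
    #include <tuple>

    using namespace marian;

    // Keep the smallest set of highest-probability tokens whose total mass reaches
    // topP and renormalize over them; everything else gets probability zero.
    Expr nucleusFilteredProbs(Expr logits /*[..., dimVocab]*/, float topP) {
      auto probs = softmax(logits);

      Expr sortedProbs, sortedIdx;  // sortedIdx maps ranks back to vocabulary ids
      std::tie(sortedProbs, sortedIdx) = sort(probs, /*axis=*/-1, /*descending=*/true);

      // Exclusive cumulative sum = mass of all higher-ranked tokens; a token stays
      // in the nucleus while that mass is still below topP.
      auto massBefore = cumsum(sortedProbs, /*axis=*/-1, /*reverse=*/false, /*exclusive=*/true);
      auto keep       = lt(massBefore, topP);   // assumed element-wise comparison helper

      auto kept = sortedProbs * keep;           // zero out the tail
      return kept / sum(kept, /*axis=*/-1);     // renormalized nucleus, still in sorted order
    }

Sampling then draws from this truncated distribution and uses sortedIdx to recover the
original vocabulary position; the variant is selected with `--output-sampling nucleus 0.9`.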
--- CHANGELOG.md | 1 + CMakeLists.txt | 6 +- VERSION | 2 +- src/CMakeLists.txt | 2 + src/common/types.h | 8 +- src/graph/expression_operators.cpp | 22 ++ src/graph/expression_operators.h | 30 +++ src/graph/node_initializers.cpp | 1 + src/graph/node_operators_tuple.h | 88 +++++++ src/graph/node_operators_unary.h | 123 +++++++++ src/layers_new/alibi.cpp | 2 +- src/layers_new/rnn.h | 4 +- src/models/transformer.h | 5 + src/tensors/cpu/cumsum.cpp | 90 +++++++ src/tensors/cpu/topk.cpp | 35 +++ src/tensors/gpu/cumsum.cu | 156 +++++++++++ src/tensors/gpu/tensor_operators.cu | 27 +- src/tensors/gpu/topk.cu | 164 +++++++++++- src/tensors/rand.cpp | 13 +- src/tensors/rand.h | 19 +- src/tensors/tensor_operators.h | 5 + src/tests/transformer_new.cpp | 11 - src/tests/units/CMakeLists.txt | 1 + src/tests/units/operator_tests.cpp | 100 +++++++ src/tests/units/transformer_tests.cpp | 147 +++++++++++ src/translator/beam_search.cpp | 19 +- src/translator/beam_search.h | 6 +- src/translator/sampling.h | 365 ++++++++++++++++---------- src/translator/translator.h | 8 +- 29 files changed, 1264 insertions(+), 196 deletions(-) create mode 100644 src/tensors/cpu/cumsum.cpp create mode 100644 src/tensors/gpu/cumsum.cu delete mode 100644 src/tests/transformer_new.cpp create mode 100644 src/tests/units/transformer_tests.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index e4eb14230..51df73b57 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Fixed compilation with clang 16.0.6 ### Added +- Added sampling variants nucleus and epsilon, e.g. `--output-sampling nucleus 0.9` and `--output-sampling epsilon 0.02`, respectively. - Added ALIBI related options to new layer framework. - Added `--no-spm-encode` option, allowing the model to use vocabulary IDs directly to train/decode. - Added MSE and MAE costs to COMET-QE training. 
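The epsilon variant listed above is the simpler counterpart to nucleus sampling: instead
of keeping a cumulative top-p mass, it keeps only tokens whose probability exceeds a fixed
threshold. A minimal sketch under the same assumptions as the nucleus example above
(ge() assumed; the actual implementation is in src/translator/sampling.h):

    // Keep tokens with p >= eps and renormalize; selected via --output-sampling epsilon 0.02
    Expr epsilonFilteredProbs(Expr logits, float eps) {
      auto probs = softmax(logits);
      auto kept  = probs * ge(probs, eps);   // assumed element-wise comparison helper
      return kept / sum(kept, /*axis=*/-1);  // renormalize over the surviving tokens
    }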
diff --git a/CMakeLists.txt b/CMakeLists.txt index 2ea841254..595f87cc1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -466,15 +466,17 @@ endif(COMPILE_CUDA) # TODO: make compatible with older CUDA versions if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -O0; -g; --use_fast_math; ${COMPUTE}) + list(APPEND CUDA_NVCC_FLAGS --extended-lambda; --default-stream per-thread; -O0; -g; --use_fast_math; ${COMPUTE}) else(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -O3; -g; --use_fast_math; ${COMPUTE}) + list(APPEND CUDA_NVCC_FLAGS --extended-lambda; --default-stream per-thread; -O3; -g; --use_fast_math; ${COMPUTE}) endif(CMAKE_BUILD_TYPE STREQUAL "Debug") if(NOT MSVC) # @TODO: add warnings here too list(APPEND CUDA_NVCC_FLAGS -ccbin ${CMAKE_C_COMPILER}; -std=c++17; -Xcompiler\ -fPIC; -Xcompiler\ -Wno-unused-result; -Xcompiler\ -Wno-deprecated; -Xcompiler\ -Wno-pragmas; -Xcompiler\ -Wno-unused-value; -Xcompiler\ -Werror;) list(APPEND CUDA_NVCC_FLAGS ${INTRINSICS_NVCC}) else() + # c++17 doesn't work with CUDA 10 + # list(APPEND CUDA_NVCC_FLAGS -std=c++17; -Xcompiler "/std:c++17"; -Xcompiler\ /FS; -Xcompiler\ /MT$<$:d>; ) list(APPEND CUDA_NVCC_FLAGS -Xcompiler\ /FS; -Xcompiler\ /MT$<$:d>; ) endif() diff --git a/VERSION b/VERSION index 274b68518..658123368 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.15 +v1.12.16 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d4cb8cc14..5bf321af5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -55,6 +55,7 @@ set(MARIAN_SOURCES tensors/backend.cpp tensors/rand.cpp tensors/tensor.cpp + tensors/cpu/cumsum.cpp tensors/cpu/device.cpp tensors/cpu/prod.cpp tensors/cpu/topk.cpp @@ -182,6 +183,7 @@ set_target_properties(marian PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY if(CUDA_FOUND) cuda_add_library(marian_cuda layers_new/alibi.cu + tensors/gpu/cumsum.cu tensors/gpu/device.cu tensors/gpu/hash.cu tensors/gpu/algorithm.cu diff --git a/src/common/types.h b/src/common/types.h index a0930a0f8..7b50bb691 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -607,9 +607,10 @@ class NumericLimits { private: template void setLimitsMax() { - max = (ReturnType)std::numeric_limits::max(); - min = (ReturnType)std::numeric_limits::min(); - lowest = (ReturnType)std::numeric_limits::lowest(); + max = (ReturnType)std::numeric_limits::max(); + min = (ReturnType)std::numeric_limits::min(); + lowest = (ReturnType)std::numeric_limits::lowest(); + infinity = (ReturnType)std::numeric_limits::infinity(); } template @@ -635,6 +636,7 @@ class NumericLimits { ReturnType max; ReturnType min; ReturnType lowest; + ReturnType infinity; NumericLimits(Type type) { setLimits(type); diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index 60a86112f..47da511cf 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -177,6 +177,28 @@ Expr2 argmin(Expr a, int axis) { return topk(a, 1, axis, /*descending=*/false); } +Expr2 sort(Expr a, int axis, bool descending) { + // only supports sort along last dimension, hence transpose if required + a = swapAxes(a, axis, -1); // non-op if axes are the same + auto sortedVal = Expression(a, /*axis=*/-1, descending); // axis=-1 is OK now as we swapped + auto sortedIdx = std::dynamic_pointer_cast(sortedVal)->tupleView(); // get a view on the sorted values + return std::make_tuple(swapAxes(sortedVal, axis, -1), swapAxes(sortedIdx, axis, -1)); // non-op if axes are 
the same +} + +Expr cumsum(Expr a, int axis, bool reverse, bool exclusive) { + // only supports sort along last dimension, hence transpose if required + a = swapAxes(a, axis, -1); // non-op if axes are the same + auto cumsums = Expression(a, axis, reverse, exclusive); + return swapAxes(cumsums, axis, -1); // non-op if axes are the same +} + +Expr logcumsumexp(Expr a, int axis, bool reverse, bool exclusive, bool fast) { + // only supports sort along last dimension, hence transpose if required + a = swapAxes(a, axis, -1); // non-op if axes are the same + auto logcumsums = Expression(a, axis, reverse, exclusive, fast); + return swapAxes(logcumsums, axis, -1); // non-op if axes are the same +} + Expr maximum(Expr a, Expr b) { return Expression(a, b); } diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index c792096b1..82d8726c5 100644 --- a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -405,6 +405,36 @@ Expr2 argmax(Expr a, int axis); */ Expr2 argmin(Expr a, int axis); +/** + * Sorts an expression along an axis. + * Sorts the elements of an expression along a specified @p axis. + * @param a Expression to sort + * @param axis Axis to sort along + * @param descending If true, sort in descending order. Otherwise, sort in ascending order. + * Default is true. + * @returns A sorted expression + */ +Expr2 sort(Expr a, int axis, bool descending = true); + +/** + * Cumulative sum of an expression along an axis. + * Computes the cumulative sum of an expression along a specified @p axis. + * @param a Expression to cumsum + * @param axis Axis to cumsum along + * @param exclusive If true, the first element is not included in the sum. Default is false. + * @returns Cumulative sums of @p a along @p axis + */ +Expr cumsum(Expr a, int axis, bool reverse=false, bool exclusive = false); + +/** + * Logarithmic cumulative sum of an expression along an axis. + * Computes the logarithmic cumulative sum of an expression along a specified @p axis. + * @param a Expression to cumsum + * @param axis Axis to cumsum along + * @param exclusive If true, the first element is not included in the sum. Default is false. + * @returns Logarithmic cumulative sums of @p a along @p axis +*/ +Expr logcumsumexp(Expr a, int axis, bool reverse = false, bool exclusive = false, bool fast = false); /** * @addtogroup graph_ops_cmp Comparison diff --git a/src/graph/node_initializers.cpp b/src/graph/node_initializers.cpp index e44b48287..3afb599a9 100644 --- a/src/graph/node_initializers.cpp +++ b/src/graph/node_initializers.cpp @@ -226,6 +226,7 @@ Ptr sinusoidalPositionEmbeddings(int start) { return fromLambda([start](Tensor t) { SinusoidalPositionEmbeddings(t, start); }); } +// @TODO: this is rather inefficient also needs axis argument or something // computes the equivalent of Python's range() template Ptr range(T begin, T end, T step) { diff --git a/src/graph/node_operators_tuple.h b/src/graph/node_operators_tuple.h index 4444e2ef8..79866681e 100644 --- a/src/graph/node_operators_tuple.h +++ b/src/graph/node_operators_tuple.h @@ -165,6 +165,94 @@ struct TopKNodeOp : public UnaryNodeOp, } }; +// This is an implementation of sort, similar to the PyTorch node. +// At the moment we only handle axis=-1 in here, but do transposes +// in the actual operator to handle other axes (inefficiently). +// The normal forward values here are the sorted values per axis, +// the additional value from the TupleNode contains the integer +// indices of the sorted values. 
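For reference, a minimal standalone sketch (not part of this patch, names are illustrative) of the semantics SortNodeOp provides: a per-row sort over the last axis that also returns the original element indices, mirroring the CPU Sort() kernel added in src/tensors/cpu/topk.cpp further below. The graph-level sort() wrapper above handles other axes by transposing them to the last axis first.

#include <algorithm>
#include <numeric>
#include <vector>

// Sorts each row of a rows x cols buffer; outInd receives the per-row argsort,
// outVal the values reordered accordingly (what val_ and tupleVal_ hold per row).
inline void referenceSort(const std::vector<float>& in,
                          std::vector<float>& outVal,
                          std::vector<unsigned>& outInd,
                          int rows, int cols, bool descending) {
  outVal.resize(rows * cols);
  outInd.resize(rows * cols);
  std::vector<unsigned> idx(cols);
  for(int r = 0; r < rows; ++r) {
    const float* row = in.data() + r * cols;
    std::iota(idx.begin(), idx.end(), 0u);
    std::sort(idx.begin(), idx.end(), [&](unsigned a, unsigned b) {
      return descending ? row[a] > row[b] : row[a] < row[b];
    });
    for(int c = 0; c < cols; ++c) {
      outInd[r * cols + c] = idx[c];          // original position of the c-th sorted value
      outVal[r * cols + c] = row[idx[c]];     // the sorted value itself
    }
  }
}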
+struct SortNodeOp : public UnaryNodeOp, + public TupleNode { +private: + int axis_; // on which axis + bool descending_; // sort-order, by default descending. PyTorch has a version without sorting, we always sort. + +public: + SortNodeOp(Expr a, int axis, bool descending = true) + : UnaryNodeOp(a, a->shape()), + axis_{a->shape().axis(axis)}, + descending_{descending} { + ABORT_IF(axis_ != shape().size() - 1, "Only implemented along last axis, you tried {}", axis_); + } + + // imlementation of TupleNode-specific pure-virtual functions for allocation + void allocateTuple() override final { + graph()->getTensorAllocator()->allocate(tupleVal_, shape(), Type::uint32); + } + + // we override the normal allocation to include the TupleNode allocation + void allocate() override { + UnaryNodeOp::allocate(); + allocateTuple(); + } + + // implementation of TupleNode-specific pure-virtual functions for de-allocation + void freeTuple() override final { + if(graph()) { + if(tupleVal_) { + graph()->free(tupleVal_); + tupleVal_ = nullptr; + } + } + } + + // we override the normal allocation to include the TupleNode de-allocation + void free() override { + UnaryNodeOp::free(); + freeTuple(); + } + + // Create and return a TupleView to the additional forward value + virtual Expr tupleView() override final { + return Expression(this, shape(), Type::uint32); + } + + void forward() override { + Sort(/*out*/val_, /*out: indices=*/tupleVal_, + graph()->allocator(), + child(0)->val(), axis_, descending_); + } + + void backward() override { + Insert(/*out*/child(0)->grad(), adj_, tupleVal_, axis_); + } + + const std::string type() override { return "sort"; } + + virtual size_t hash() override { + if(!hash_) { + hash_ = NaryNodeOp::hash(); + util::hash_combine(hash_, axis_); + util::hash_combine(hash_, descending_); + } + return hash_; + } + + virtual bool equal(Expr node) override { + if(!NaryNodeOp::equal(node)) + return false; + auto cnode = std::dynamic_pointer_cast(node); + if(!cnode) + return false; + if(axis_ != cnode->axis_) + return false; + if(descending_ != cnode->descending_) + return false; + return true; + } +}; + + // This node attaches multiple children to a parent node and allows // to select one of them via a given index. 
This is mostly used to avoid // unattached nodes that might nevertheless get created based on some diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h index 97ffedf61..aa3f5004c 100644 --- a/src/graph/node_operators_unary.h +++ b/src/graph/node_operators_unary.h @@ -613,6 +613,129 @@ struct ReduceNodeOp : public UnaryNodeOp { } }; +class CumSumNodeOp : public UnaryNodeOp { +private: + friend class SerializationHelpers; + int axis_; + bool reverse_; + bool exclusive_; + +public: + CumSumNodeOp(Expr a, int axis, bool reverse, bool exclusive) + : UnaryNodeOp(a), + axis_(a->shape().axis(axis)), + reverse_(reverse), + exclusive_(exclusive) + {} + + NodeOps forwardOps() override { + using namespace functional; + return {NodeOp(CumSum(val_, child(0)->val(), reverse_, exclusive_))}; + } + + NodeOps backwardOps() override { + using namespace functional; + return {NodeOp( + // if we are here then we are done with adding gradients to adj_ + // so we can canibalize it to compute the gradient of the input + // compute the cumsum of the adjoint + CumSum(adj_, adj_, !reverse_, exclusive_); + // add that cumsum to the gradient of the input + Add(_1, child(0)->grad(), adj_); + )}; + } + + const std::string type() override { return "cumsum"; } + + const std::string color() override { return "orange"; } + + virtual size_t hash() override { + if(!hash_) { + hash_ = NaryNodeOp::hash(); + util::hash_combine(hash_, axis_); + util::hash_combine(hash_, reverse_); + util::hash_combine(hash_, exclusive_); + } + return hash_; + } + + virtual bool equal(Expr node) override { + if(!NaryNodeOp::equal(node)) + return false; + auto cnode = std::dynamic_pointer_cast(node); + if(!cnode) + return false; + if(axis_ != cnode->axis_) + return false; + if(reverse_ != cnode->reverse_) + return false; + if(exclusive_ != cnode->exclusive_) + return false; + return true; + } +}; + +class LogCumSumExpNodeOp : public UnaryNodeOp { +private: + friend class SerializationHelpers; + int axis_; + bool reverse_; + bool exclusive_; + bool fast_; + +public: + LogCumSumExpNodeOp(Expr a, int axis, bool reverse, bool exclusive, bool fast=false) + : UnaryNodeOp(a), + axis_(a->shape().axis(axis)), + reverse_(reverse), + exclusive_(exclusive), + fast_(fast) + {} + + NodeOps forwardOps() override { + using namespace functional; + return {NodeOp(LogCumSumExp(val_, child(0)->val(), reverse_, exclusive_, fast_))}; + } + + NodeOps backwardOps() override { + using namespace functional; + ABORT("LogCumSumNodeOp::backwardOps() not implemented yet"); + // return {NodeOp(LogCumSumExpGrad(child(0)->grad(), adj_, val_, child(0)->val()))}; + } + + const std::string type() override { return "logcumsumexp"; } + + const std::string color() override { return "orange"; } + + virtual size_t hash() override { + if(!hash_) { + hash_ = NaryNodeOp::hash(); + util::hash_combine(hash_, axis_); + util::hash_combine(hash_, reverse_); + util::hash_combine(hash_, exclusive_); + util::hash_combine(hash_, fast_); + } + return hash_; + } + + virtual bool equal(Expr node) override { + if(!NaryNodeOp::equal(node)) + return false; + auto cnode = std::dynamic_pointer_cast(node); + if(!cnode) + return false; + if(axis_ != cnode->axis_) + return false; + if(reverse_ != cnode->reverse_) + return false; + if(exclusive_ != cnode->exclusive_) + return false; + if(fast_ != cnode->fast_) + return false; + return true; + } +}; + struct LogNodeOp : public UnaryNodeOp { LogNodeOp(Expr a) : UnaryNodeOp(a) {} diff --git a/src/layers_new/alibi.cpp 
b/src/layers_new/alibi.cpp index 07989ce6a..abffb6bae 100644 --- a/src/layers_new/alibi.cpp +++ b/src/layers_new/alibi.cpp @@ -67,7 +67,7 @@ Expr AlibiDecoderState::getAlibiShift(Ptr graph, bool decoding) } } else { ABORT_IF(getBatch()->sets() != 2, - "--transformer-alibi-shift=true currently only works with batch sets=2"); + "--transformer-alibi-shift=true currently only works with batch sets=2"); return getAlibiShiftFromBatch(graph); } } diff --git a/src/layers_new/rnn.h b/src/layers_new/rnn.h index 281d2dce9..720fa50f7 100644 --- a/src/layers_new/rnn.h +++ b/src/layers_new/rnn.h @@ -8,6 +8,7 @@ namespace nn { struct CellState { Expr recurrent; + size_t position = 0; }; struct ICell { @@ -43,7 +44,7 @@ class SSRU final : public Layer, public ICell { Expr output = iProj->apply(input); Expr forget = fProj->apply(input); - + return {output, forget}; } @@ -104,6 +105,7 @@ class RNN final : public Layer, public IBinaryLayer, public IBinaryDecoderLayer std::vector stepInputs(inputs.size()); std::transform(inputs.begin(), inputs.end(), stepInputs.begin(), [i, dimTimeAxis](Expr e) { return slice(e, dimTimeAxis, i); }); + cellState->position = state->getPosition() + i; auto stepMask = inputMask; if(stepMask) stepMask = slice(inputMask, dimTimeAxis, i); diff --git a/src/models/transformer.h b/src/models/transformer.h index 1befc726f..ad018b240 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -246,6 +246,11 @@ class Transformer : public EncoderOrDecoderBase { int dimBeam = 1) { int dk = k->shape()[-1]; + // to avoid mistakenly using the old transformer framework for new features + auto maskType = opt("transformer-attention-mask", "default"); + ABORT_IF(maskType != "default", + "You specified --transformer-attention-mask={} which is not implemented for legacy Transformer", maskType ); + // softmax over batched dot product of query and keys (applied over all // time steps and batch entries), also add mask for illegal connections diff --git a/src/tensors/cpu/cumsum.cpp b/src/tensors/cpu/cumsum.cpp new file mode 100644 index 000000000..12c7cb155 --- /dev/null +++ b/src/tensors/cpu/cumsum.cpp @@ -0,0 +1,90 @@ +#include "tensors/tensor_operators.h" + +namespace marian { +namespace cpu { + +// wrap Marian functor to work with thrust +template +class AccFunctorWrapper { +private: + Functor functor_; + +public: + AccFunctorWrapper(Functor functor) : functor_(functor) {} + T operator()(T x, T y) { return (T)functor_((float)x, (float)y); } +}; + +template +void BatchedScan(Tensor out, const Tensor in, bool reverse, bool exclusive, Functor accOp, float zero) { + ABORT_IF(!isFloat(in->type()), "Input should be float type and not {}", in->type()); + ABORT_IF(out->type() != in->type(), "Output should have type {}", in->type()); + + int cols = in->shape()[0]; + int rows = in->shape().elements() / cols; + + auto batchedScan = [=](auto inIt, auto outIt) { + AccFunctorWrapper accFunctor(accOp); + + for(int i = 0; i < rows; ++i) { + float sum; + int shift = exclusive ? 
1 : 0; + + // handle first element differently based on exclusive flag + if(exclusive) + sum = zero; + else + sum = inIt[0]; + outIt[0] = sum; + + for(int j = 1; j < cols; ++j) { + sum = accFunctor(sum, inIt[j - shift]); + outIt[j] = sum; + } + + inIt += cols; + outIt += cols; + } + }; + + if(reverse) { + auto revInIt = std::make_reverse_iterator(in->data() + in->size()); + auto revOutIt = std::make_reverse_iterator(out->data() + out->size()); + batchedScan(revInIt, revOutIt); + } else { + auto fwdInIt = in->data(); + auto fwdOutIt = out->data(); + batchedScan(fwdInIt, fwdOutIt); + } +} + +// CPU implementation of logcumsumexp operator for LogCumSumExpNodeOp +void LogCumSumExp(Tensor out, const Tensor in, bool reverse, bool exclusive, bool fast = false) { + float max = 0; + if(!fast) { + // compute max of entire tensor, this is just for stabilization + // note, if e.g. all values are logprobs, then the max is at most 0 and we can skip this step + // maybe it should be the default to turn this off? + max = *std::max_element(in->data(), in->data() + in->size()); + } + + using namespace functional; + auto functor = log(exp(_1 - max) + exp(_2 - max)) + max; + auto zero = -NumericLimits(in->type()).infinity; + BatchedScan(out, in, reverse, exclusive, functor, zero); +} + +// CPU implementation of cumsum operator for CumSumNodeOp +void CumSum(Tensor out, const Tensor in, bool reverse, bool exclusive) { + using namespace functional; + auto functor = _1 + _2; + BatchedScan(out, in, reverse, exclusive, functor, 0.f); +} + +void CumProd(Tensor out, const Tensor in, bool reverse, bool exclusive) { + using namespace functional; + auto functor = _1 * _2; + BatchedScan(out, in, reverse, exclusive, functor, 1.f); +} + +} // namespace gpu +} // namespace marian \ No newline at end of file diff --git a/src/tensors/cpu/topk.cpp b/src/tensors/cpu/topk.cpp index 92dcba591..73f0ce273 100644 --- a/src/tensors/cpu/topk.cpp +++ b/src/tensors/cpu/topk.cpp @@ -50,5 +50,40 @@ void TopK(Tensor outVal, Tensor outInd, Ptr /*allocator*/, const Tens } } +// CPU implementation of Marian sort operator for SortNodeOp +void Sort(Tensor outVal, Tensor outInd, Ptr /*allocator*/, const Tensor in, int axis, bool descending) { + ABORT_IF(axis != in->shape().size() - 1, "Currently only works for last axis"); + ABORT_IF(in->type() != Type::float32, "Input should have type {}", Type::float32); + ABORT_IF(outInd->type() != Type::uint32, "Output should be have type {}", Type::uint32); + + int cols = in->shape()[axis]; + int rows = in->shape().elements() / cols; + + std::vector idxs(cols); + std::iota(idxs.begin(), idxs.end(), 0); + + const float* inDataPtr = in->data(); + IndexType* outIndPtr = outInd->data(); + float* outValPtr = outVal->data(); + for(int i = 0; i < rows; ++i) { + std::sort( + idxs.begin(), + idxs.end(), + [&](int a, int b) { + return descending ? 
inDataPtr[a] > inDataPtr[b] : inDataPtr[a] < inDataPtr[b]; + } + ); + + for(int j = 0; j < cols; j++) { + outIndPtr[j] = idxs[j]; + outValPtr[j] = inDataPtr[idxs[j]]; + } + + outIndPtr += cols; + outValPtr += cols; + inDataPtr += cols; + } +} + } } diff --git a/src/tensors/gpu/cumsum.cu b/src/tensors/gpu/cumsum.cu new file mode 100644 index 000000000..c0f11c6fa --- /dev/null +++ b/src/tensors/gpu/cumsum.cu @@ -0,0 +1,156 @@ +#include "tensors/tensor_operators.h" +#include "tensors/gpu/cuda_helpers.h" +#include "tensors/allocator.h" + +#include "functional/operators.h" + +#include +#include +#include +#include +#include +#include + +namespace marian { +namespace gpu { + +// small operator to compute the row id of an element in a 2d tensor +class ProjectToRow : public thrust::unary_function { +private: + int cols_; + +public: + ProjectToRow(int cols) : cols_(cols) {} + HOST_DEVICE int operator()(int i) { return i / cols_; } +}; + +// create the iterators to group the elements of a 2d tensor by row +auto rowIterators(int rows, int cols) { + thrust::counting_iterator firstElement(0); + auto begin = thrust::make_transform_iterator(firstElement, ProjectToRow(cols)); + auto end = thrust::make_transform_iterator(firstElement + rows * cols, ProjectToRow(cols)); + return std::make_pair(begin, end); +}; + +// create the iterators to group the elements of a 2d tensor by row +auto rowIterators(const Shape& shape) { + // use last dimension as column size + int cols = shape[-1]; + // compute number of rows from total number of elements and column size + int rows = shape.elements() / cols; + return rowIterators(rows, cols); +} + +// wrap Marian functor to work with thrust +template +class AccFunctorWrapper { +private: + Functor functor_; + +public: + AccFunctorWrapper(Functor functor) : functor_(functor) {} + HOST_DEVICE T operator()(T x, T y) { return (T)functor_((float)x, (float)y); } +}; + +template +void TypedBatchedScan(Tensor out, const Tensor in, bool reverse, bool exclusive, Functor accOpFunctor, T zero) { + // use thrust device_ptr to wrap raw pointers + thrust::device_ptr inData(in->data()); + thrust::device_ptr outData(out->data()); + + // currently use default stream + auto exec = thrust::cuda::par; + auto equalOp = thrust::equal_to(); + auto accOp = AccFunctorWrapper(accOpFunctor); + + auto batchedScan = [=](auto inIt, auto outIt) { + // treat each row as as set of keys, only works for last dimension + const auto range = rowIterators(in->shape()); + auto begin = range.first; + auto end = range.second; + if(exclusive) + thrust::exclusive_scan_by_key(exec, begin, end, inIt, outIt, zero, equalOp, accOp); + else + thrust::inclusive_scan_by_key(exec, begin, end, inIt, outIt, equalOp, accOp); + }; + + if(reverse) { + auto revInIt = thrust::make_reverse_iterator(inData + in->size()); + auto revOutIt = thrust::make_reverse_iterator(outData + out->size()); + batchedScan(revInIt, revOutIt); + } else { + auto fwdInIt = inData; + auto fwdOutIt = outData; + batchedScan(fwdInIt, fwdOutIt); + } +} + +template +void BatchedScan(Tensor out, const Tensor in, bool reverse, bool exclusive, Functor functor, float zero) { + ABORT_IF(!isFloat(in->type()), "Input should be float type and not {}", in->type()); + ABORT_IF(out->type() != in->type(), "Output should have type {}", in->type()); + + if(in->type() == Type::float32) { + TypedBatchedScan(out, in, reverse, exclusive, functor, zero); +#if COMPILE_FP16 + } else if(in->type() == Type::float16) { + TypedBatchedScan<__half>(out, in, reverse, exclusive, functor, 
__float2half(zero)); +#endif + } else { + ABORT("BatchedScan not implemented for type {}", in->type()); + } +} + +template +T typedMaxElement(const Tensor in) { + // use thrust device_ptr to wrap raw pointers + thrust::device_ptr inData(in->data()); + + // currently use default stream + auto exec = thrust::cuda::par; + + return *thrust::max_element(exec, inData, inData + in->size()); +} + +float MaxElement(const Tensor in) { + ABORT_IF(!isFloat(in->type()), "Input should be float type and not {}", in->type()); + if(in->type() == Type::float32) { + return typedMaxElement(in); +#if COMPILE_FP16 + } else if(in->type() == Type::float16) { + return __half2float(typedMaxElement<__half>(in)); +#endif + } else { + ABORT("MaxElement not implemented for type {}", in->type()); + } +} + +void LogCumSumExp(Tensor out, const Tensor in, bool reverse, bool exclusive, bool fast) { + float max = 0; + if(!fast) { + // compute max of entire tensor, this is just for stabilization + // note, if e.g. all values are logprobs, then the max is at most 0 and we can skip this step + // maybe it should be the default to turn this off? + max = MaxElement(in); + } + + using namespace functional; + auto functor = log(exp(_1 - max) + exp(_2 - max)) + max; + auto zero = -NumericLimits(in->type()).infinity; + BatchedScan(out, in, reverse, exclusive, functor, zero); +} + +void CumSum(Tensor out, const Tensor in, bool reverse, bool exclusive) { + using namespace functional; + auto functor = _1 + _2; + BatchedScan(out, in, reverse, exclusive, functor, 0.f); +} + +void CumProd(Tensor out, const Tensor in, bool reverse, bool exclusive) { + using namespace functional; + auto functor = _1 * _2; + BatchedScan(out, in, reverse, exclusive, functor, 1.f); +} + +} // namespace gpu +} // namespace marian diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index 4662ab041..b7c80394b 100644 --- a/src/tensors/gpu/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -848,27 +848,30 @@ __global__ void gLogSoftmax(T* out, int rows = outShape.elements() / outShape.back(); int cols = outShape.back(); + // loop over blocks of rows for(int bid = 0; bid < rows; bid += gridDim.x) { - int j = bid + blockIdx.x; + int j = bid + blockIdx.x; // blockIdx.x - row index (within block of rows) if(j < rows) { - T* so = out + j * cols; - const T* sp = in + j * cols; + T* so = out + j * cols; // pointer to row output data + const T* sp = in + j * cols; // pointer to row input data // CUDA complains if type or size of shared memory changes, keep size constant. extern __shared__ uint8_t _sharedBytes[]; - T* _share = (T*)_sharedBytes; + T* _share = (T*)_sharedBytes; AccType* _shareAccType = (AccType*)_sharedBytes; T* _max = _share; // 16-bit is ok for max if applicable _max[threadIdx.x] = sp[threadIdx.x]; for(int tid = 0; tid < cols; tid += blockDim.x) { - int id = tid + threadIdx.x; + int id = tid + threadIdx.x; // threadIdx.x = column index within block of columns if(id < cols) { if(sp[id] > _max[threadIdx.x]) _max[threadIdx.x] = sp[id]; } } __syncthreads(); + + // max over columns within a column block via tree reduction int len = blockDim.x; while(len != 1) { __syncthreads(); @@ -889,14 +892,18 @@ __global__ void gLogSoftmax(T* out, _sum[threadIdx.x] = 0.0; for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; - if(id < cols) { - T sm = sp[id] - max; - AccType ex = Ops::exp(sm); // sum with AccType - so[id] = sm; + if(id < cols) { + // @TODO: would it be faster to recompute it below? 
Also better numeric stability with float? + AccType sm = (AccType)sp[id] - (AccType)max; // subtract max for numeric stability + so[id] = (T)sm; // assign numerator to output + + AccType ex = Ops::exp(sm); _sum[threadIdx.x] += ex; // sum with AccType } } __syncthreads(); + + // now reduce over all columns within the block len = blockDim.x; while(len != 1) { __syncthreads(); @@ -906,6 +913,8 @@ __global__ void gLogSoftmax(T* out, len = (len + 1) >> 1; } __syncthreads(); + + // produce final output data AccType sum = _sum[0]; for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; diff --git a/src/tensors/gpu/topk.cu b/src/tensors/gpu/topk.cu index 94256fb7a..3bb8582d4 100644 --- a/src/tensors/gpu/topk.cu +++ b/src/tensors/gpu/topk.cu @@ -3,6 +3,13 @@ #include "tensors/allocator.h" #include +#include +#include +#include + +#if CUDA_VERSION >= 11000 +#include +#endif // GPU implementation of proper Marian top-k operator for TopkNodeOp // This file contains a lot of code-duplicaton with src/translator/nth_element.cu @@ -14,7 +21,6 @@ namespace marian { namespace gpu { const int MAX_BINS = 500; -const int BLOCK_SIZE = 512; #define UNROLL_MAXARG_LOOP(n, max) \ if(tid < (n) && tid + (n) < (max)) { \ @@ -35,7 +41,7 @@ __global__ void gMaxElement(IndexType* binIndices, // out: top-k positions bool descending) // This will be the largest possible value if the order is reversed (i.e. we look for the minimum). { extern __shared__ float sharedValues[]; - __shared__ IndexType sharedIndices[BLOCK_SIZE]; + __shared__ IndexType sharedIndices[MAX_THREADS]; // id of current thread within block int tid = threadIdx.x; @@ -147,7 +153,7 @@ __global__ void gMaxElementUpdate(IndexType* binIndices, // memory for bin indic bool descending) { extern __shared__ float sharedValues[]; - __shared__ int sharedIndices[BLOCK_SIZE]; + __shared__ int sharedIndices[MAX_THREADS]; __shared__ float bestBinCost; __shared__ int bestBinCostIdx; @@ -332,7 +338,7 @@ void TopK(Tensor outVal, Tensor outInd, Ptr allocator, const Tensor i float minimal = NumericLimits(in->type()).lowest; // lowest if looking for max - const int numBlocks = std::min(MAX_BINS, int(cols / (2 * BLOCK_SIZE)) + int(cols % (2 * BLOCK_SIZE) != 0)); + const int numBlocks = std::min(MAX_BINS, int(cols / (2 * MAX_THREADS)) + int(cols % (2 * MAX_THREADS) != 0)); auto tempMemInd = allocator->alloc(rows * numBlocks); MemoryPiece::PtrType tempMemVal; @@ -340,14 +346,14 @@ void TopK(Tensor outVal, Tensor outInd, Ptr allocator, const Tensor i tempMemVal = allocator->alloc(rows * numBlocks); // first find the maximum value per row and block and save indices and values to temporary memory gMaxElement<<>>( tempMemInd->data(), tempMemVal->data(), in->data(), rows, cols, minimal, descending); gMaxElementUpdate<<>>( tempMemInd->data(), tempMemVal->data(), outInd->data(), outVal->data(), @@ -357,14 +363,14 @@ void TopK(Tensor outVal, Tensor outInd, Ptr allocator, const Tensor i tempMemVal = allocator->alloc<__half>(rows * numBlocks); // first find the maximum value per row and block and save indices and values to temporary memory gMaxElement<<>>( tempMemInd->data(), tempMemVal->data<__half>(), in->data<__half>(), rows, cols, minimal, descending); gMaxElementUpdate<<>>( tempMemInd->data(), tempMemVal->data<__half>(), outInd->data(), outVal->data<__half>(), @@ -378,5 +384,139 @@ void TopK(Tensor outVal, Tensor outInd, Ptr allocator, const Tensor i allocator->free(tempMemVal); } +// this function uses cub::DeviceSegmentedRadixSort::SortPairs to sort each row 
separately +template +void TypedSortCUB(Ptr allocator, Tensor outVal, Tensor outInd, const Tensor in, bool descending) { +#if CUDA_VERSION >= 11000 + int cols = in->shape()[-1]; + int rows = in->shape().elements() / cols; + + const T* inValData = in->data(); + T* outValData = outVal->data(); + IndexType* outIndData = outInd->data(); + + // create indices for the input tensor, i.e. [0, 1, 2, ..., cols] per row using single thrust transform + // CUB doesn't seem to have a transform operation, so let's use thrust. They seem to be compatible anyway. + thrust::transform(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(rows * cols), + outIndData, + [=] HOST_DEVICE (int i) { return i % cols; }); + + // create row iterator, this iterates through the indices of row start offsets, e.g. [0, cols, 2*cols, ...] + // this is used to partition the input tensor into rows when sorting with the segmented sort + auto rowEndOp = [cols] HOST_DEVICE (int i) { return i * cols; }; + using TransformOp = decltype(rowEndOp); + using CountingIt = cub::CountingInputIterator; + using RowPartitionIt = cub::TransformInputIterator; + RowPartitionIt rowPartitionIt(CountingIt(0), rowEndOp); + + auto cubSortbyKey = [=](void* storage, size_t& storageSize, bool descending) { + using cubSort = cub::DeviceSegmentedRadixSort; + if(descending) + cubSort::SortPairsDescending(storage, storageSize, + inValData, outValData, + outIndData, outIndData, + /*total=*/rows * cols, + /*segments=*/rows, + rowPartitionIt, rowPartitionIt + 1); + else + cubSort::SortPairs(storage, storageSize, + inValData, outValData, + outIndData, outIndData, + /*total=*/rows * cols, + /*segments=*/rows, + rowPartitionIt, rowPartitionIt + 1); + }; + + // Important lesson: before I used my own allocation and deallocation of temporary memory, this + // was actually slower than the thrust version. Again, mixing computation and cudaMalloc is a bad idea. + // @TODO: review other kernels to make sure I don't use cudaMalloc directly anywhere. 
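For clarity (not part of this patch): the transform iterator above stands in for explicit segment-offset arrays. For a rows x cols tensor, cub::DeviceSegmentedRadixSort takes begin offsets {0, cols, 2*cols, ...} and end offsets {cols, 2*cols, ...} so that segment r covers elements [r*cols, (r+1)*cols); passing rowPartitionIt and rowPartitionIt + 1 is equivalent to materializing the two arrays sketched below, just without the extra device memory.

#include <vector>

// host-side sketch of the offsets the iterator pair encodes
inline void makeRowOffsets(int rows, int cols,
                           std::vector<int>& beginOffsets,
                           std::vector<int>& endOffsets) {
  beginOffsets.resize(rows);
  endOffsets.resize(rows);
  for(int r = 0; r < rows; ++r) {
    beginOffsets[r] = r * cols;        // what rowPartitionIt yields at position r
    endOffsets[r]   = (r + 1) * cols;  // what rowPartitionIt + 1 yields at position r
  }
}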
+ + // Determine temporary device storage requirements, this doesn't sort anything + size_t tempStorageBytes = 0; + cubSortbyKey(nullptr, /*out=*/tempStorageBytes, descending); + // Allocate temporary storage + auto tempStorage = allocator->alloc(tempStorageBytes); + // Run sorting operation + cubSortbyKey(tempStorage->data(), tempStorageBytes, descending); + // free temporary storage + allocator->free(tempStorage); +#else + ABORT("CUB sort requires CUDA 11.0 or higher"); +#endif +} + +// the same as above but using thrust::sort_by_key instead of cub::DeviceSegmentedRadixSort::SortPairs; +// used for CUDA < 11.0, slower than cub::DeviceSegmentedRadixSort::SortPairs +template +void TypedSortThrust(Tensor outVal, Tensor outInd, const Tensor in, bool descending) { + int cols = in->shape()[-1]; + int rows = in->shape().elements() / cols; + + // use thrust device_ptr to wrap raw pointers + thrust::device_ptr inVal(in->data()); + thrust::device_ptr outValData(outVal->data()); + thrust::device_ptr outIndData(outInd->data()); + + // lambda that sorts a row + auto sortRow = [=] (int rowIdx) { + // currently use default stream + cudaStream_t stream = 0; + auto exec = thrust::cuda::par.on(stream); + + auto outValRow = outValData + rowIdx * cols; // pointer to row in output value tensor + auto outIndRow = outIndData + rowIdx * cols; // pointer to row in output index tensor + // sort the indices and values according to the values in the output tensor and using the stream + if(descending) + thrust::sort_by_key(exec, outValRow, outValRow + cols, outIndRow, thrust::greater()); + else + thrust::sort_by_key(exec, outValRow, outValRow + cols, outIndRow, thrust::less()); + }; + + // copy input tensor to output tensor + thrust::copy(thrust::device, inVal, inVal + rows * cols, outValData); + + // create indices for the input tensor, i.e. 
[0, 1, 2, ..., cols] per row using single thrust transform + thrust::transform(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(rows * cols), + outIndData, + [=] HOST_DEVICE (int i) { return i % cols; }); + + // sort each row of the input tensor separately + // couldn't find a way to do this with thrust::for_each that wasn't hilariously slow + for(int i = 0; i < rows; ++i) + sortRow(i); +} + +template +void TypedSort(Ptr allocator, Tensor outVal, Tensor outInd, const Tensor in, bool descending) { +#if CUDA_VERSION < 11000 + // CUDA_VERSION < 11000 doesn't include and hence cub::DeviceSegmentedRadixSort::SortPairs + // we use thrust::sort_by_key instead which is slower + TypedSortThrust(outVal, outInd, in, descending); +#else + TypedSortCUB(allocator, outVal, outInd, in, descending); +#endif } + +void Sort(Tensor outVal, Tensor outInd, Ptr allocator, const Tensor in, int axis, bool descending) { + ABORT_IF(axis != in->shape().size() - 1, "Currently only works for last axis"); + ABORT_IF(!isFloat(in->type()), "Input should be float type and not {}", in->type()); + ABORT_IF(outInd->type() != Type::uint32, "Output should have type {}", Type::uint32); + ABORT_IF(outVal->type() != in->type(), "Output should have type {}", in->type()); + + if(in->type() == Type::float32) { + TypedSort(allocator, outVal, outInd, in, descending); +#if COMPILE_FP16 + } else if(in->type() == Type::float16) { + TypedSort<__half>(allocator, outVal, outInd, in, descending); +#endif + } else { + ABORT("Sort not implemented for type {}", in->type()); + } } + +} // namespace gpu +} // namespace marian diff --git a/src/tensors/rand.cpp b/src/tensors/rand.cpp index e6dbc46ed..cfe768f59 100644 --- a/src/tensors/rand.cpp +++ b/src/tensors/rand.cpp @@ -14,8 +14,9 @@ class StdlibRandomGenerator : public RandomGenerator { std::mt19937 engine_; public: - StdlibRandomGenerator(size_t seed) - : RandomGenerator(seed), engine_((unsigned int)seed) {} + StdlibRandomGenerator(size_t seed, DeviceId deviceId) + : RandomGenerator(seed, deviceId), + engine_((unsigned int)RandomGenerator::seed()) {} virtual void uniform(Tensor tensor, float a, float b) override; virtual void normal(Tensor, float mean, float stddev) override; @@ -68,7 +69,7 @@ void StdlibRandomGenerator::normal(Tensor tensor, float mean, float stddev) { #ifdef CUDA_FOUND CurandRandomGenerator::CurandRandomGenerator(size_t seed, DeviceId deviceId) -: RandomGenerator(seed), deviceId_(deviceId) { +: RandomGenerator(seed, deviceId), deviceId_(deviceId) { if(deviceId_.type == DeviceType::gpu) { cudaSetDevice((int)deviceId_.no); CURAND_CHECK(curandCreateGenerator(&generator_, CURAND_RNG_PSEUDO_DEFAULT)); @@ -76,7 +77,7 @@ CurandRandomGenerator::CurandRandomGenerator(size_t seed, DeviceId deviceId) else { CURAND_CHECK(curandCreateGeneratorHost(&generator_, CURAND_RNG_PSEUDO_DEFAULT)); } - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(generator_, seed_)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(generator_, RandomGenerator::seed())); } CurandRandomGenerator::~CurandRandomGenerator() { @@ -112,9 +113,7 @@ Ptr createRandomGenerator(size_t seed, DeviceId deviceId) { #ifdef CUDA_FOUND return New(seed, deviceId); #else - ABORT_IF(deviceId.type != DeviceType::cpu, - "StdlibRandomGenerator can only be used for CPU tensors"); - return New(seed); + return New(seed, deviceId); #endif } diff --git a/src/tensors/rand.h b/src/tensors/rand.h index 94b44a97a..1042104f4 100644 --- a/src/tensors/rand.h +++ b/src/tensors/rand.h @@ -1,6 +1,8 @@ #pragma once #include 
"common/definitions.h" +#include "common/hash.h" +#include "common/logging.h" #include @@ -13,11 +15,26 @@ class RandomGenerator { protected: size_t seed_; + // hashing device type and id to get a unique seed for each device, e.g. for different samples on different devices + size_t hashSeed(size_t seed, DeviceId deviceId) { + // on the first device, use the seed as is. This keeps unit tests etc. working correctly + // on other devices, hash the seed with the device type and id, so that we get different seeds for different devices + // this is important for e.g. different samples on different devices + if(deviceId.no == 0) + return seed; + else + return util::hashArgs(seed, deviceId.type, deviceId.no); + } + public: - RandomGenerator(size_t seed) : seed_(seed) { } + RandomGenerator(size_t seed, DeviceId deviceId) + : seed_(hashSeed(seed, deviceId)) { + LOG(debug, "Setting random seed to {} (device {}{})", seed_, deviceId.typeAsString(), deviceId.no); + } virtual ~RandomGenerator() {} virtual void uniform(Tensor, float a, float b) = 0; virtual void normal(Tensor, float mean, float stddev) = 0; + virtual size_t seed() { return seed_; } }; Ptr createRandomGenerator(size_t /*seed*/, DeviceId); diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h index 2747a6d66..1940e9d95 100644 --- a/src/tensors/tensor_operators.h +++ b/src/tensors/tensor_operators.h @@ -346,6 +346,11 @@ static inline void Select(Tensor out, const Tensor in, const Tensor indices, int } DISPATCH7(TopK, marian::Tensor, marian::Tensor, Ptr, const marian::Tensor, int, int, bool); +DISPATCH6(Sort, marian::Tensor, marian::Tensor, Ptr, const marian::Tensor, int, bool); + +DISPATCH4(CumSum, marian::Tensor, const marian::Tensor, bool, bool); +DISPATCH4(CumProd, marian::Tensor, const marian::Tensor, bool, bool); +DISPATCH5(LogCumSumExp, marian::Tensor, const marian::Tensor, bool, bool, bool); DISPATCH2(LSTMCellForward, marian::Tensor, std::vector) DISPATCH2(LSTMOutputForward, marian::Tensor, std::vector); diff --git a/src/tests/transformer_new.cpp b/src/tests/transformer_new.cpp deleted file mode 100644 index 2d1e89281..000000000 --- a/src/tests/transformer_new.cpp +++ /dev/null @@ -1,11 +0,0 @@ -#include "marian.h" -#include "models/transformer_new.h" - - -int main(int argc, char** argv) { - using namespace marian; - - testme(); - - return 0; -} diff --git a/src/tests/units/CMakeLists.txt b/src/tests/units/CMakeLists.txt index 7745fee14..ee5920521 100644 --- a/src/tests/units/CMakeLists.txt +++ b/src/tests/units/CMakeLists.txt @@ -7,6 +7,7 @@ set(UNIT_TESTS fastopt_tests utils_tests binary_tests + transformer_tests # cosmos_tests # optional, uncomment to test with specific files. 
) diff --git a/src/tests/units/operator_tests.cpp b/src/tests/units/operator_tests.cpp index 5806e94de..565ffb1d7 100644 --- a/src/tests/units/operator_tests.cpp +++ b/src/tests/units/operator_tests.cpp @@ -1054,6 +1054,9 @@ void tests(DeviceType device, Type floatType = Type::float32) { auto ridx4 = get<1>(rtopk4); auto gval4 = gather(a, -2, ridx4); + const auto& [valDesc, indDesc] = sort(a, /*axis=*/-1, /*descending=*/true); + const auto& [valAsc, indAsc] = sort(a, /*axis=*/-1, /*descending=*/false); + graph->forward(); CHECK(rval1 != gval1); @@ -1095,6 +1098,36 @@ void tests(DeviceType device, Type floatType = Type::float32) { gval4->val()->get(values); CHECK( values == vval4 ); + + std::vector vvalDesc = { 0.3333, 0, -0.2, + 4.5, 0, -0.3, + 101.45, 5.2, -10.0, + 1.05e-5, 0, -100.05 }; + valDesc->val()->get(values); + CHECK( values == vvalDesc ); + + std::vector vindDesc = { 1, 0, 2, + 2, 1, 0, + 2, 0, 1, + 2, 1, 0 }; + std::vector testVindDesc; + indDesc->val()->get(testVindDesc); + CHECK( testVindDesc == vindDesc ); + + std::vector vvalAsc = { -0.2, 0, 0.3333, + -0.3, 0, 4.5, + -10.0, 5.2, 101.45, + -100.05, 0, 1.05e-5 }; + valAsc->val()->get(values); + CHECK( values == vvalAsc ); + + std::vector vindAsc = { 2, 0, 1, + 0, 1, 2, + 1, 0, 2, + 0, 1, 2 }; + std::vector testVindAsc; + indAsc->val()->get(testVindAsc); + CHECK( testVindAsc == vindAsc ); } SECTION("cross entropy with label smoothing vs logsoftmax with gather") { @@ -1141,6 +1174,73 @@ void tests(DeviceType device, Type floatType = Type::float32) { CHECK( std::equal(values.begin(), values.end(), values2.begin(), floatApprox2) ); } + + SECTION("Scan operations") { + std::vector input = { + -0.1, -1.2, -0.4, + 1.2, 2.3, -3.4, + -2.2, 1.0, -1.2 + }; + + auto x = graph->constant({3, 3}, inits::fromVector(input)); + auto a = logcumsumexp(x, /*axis=*/-1); + auto b = logcumsumexp(x, /*axis=*/-2, /*reverse=*/false, /*exclusive=*/true); + + auto c = cumsum(x, /*axis=*/-1, /*reverse=*/false, /*exclusive=*/true); + auto d = cumsum(x, /*axis=*/-2, /*reverse=*/true); + + graph->forward(); + + CHECK(a->shape() == Shape({3, 3})); + CHECK(b->shape() == Shape({3, 3})); + + std::vector aValues = { + -0.1000, 0.1875, 0.6294, + 1.1992, 2.5859, 2.5879, + -2.1992, 1.0400, 1.1416 + }; + + T negInf = -std::numeric_limits::infinity(); + std::vector bValues = { + negInf, negInf, negInf, + -0.1f, -1.2f, -0.4f, + 1.44101f, 2.32975f, -0.35141f, + }; + + a->val()->get(values); + b->val()->get(values2); + + CHECK( std::equal(values.begin(), values.end(), + aValues.begin(), floatApprox2) ); + + CHECK( std::equal(values2.begin(), values2.end(), + bValues.begin(), floatApprox2) ); + + CHECK(c->shape() == Shape({3, 3})); + CHECK(d->shape() == Shape({3, 3})); + + std::vector cValues = { + 0, -0.1000, -1.3000, + 0, 1.2000, 3.5000, + 0, -2.2000, -1.2000, + }; + + std::vector dValues = { + -1.1, 2.1, -5.0, + -1.0, 3.3, -4.6, + -2.2, 1.0, -1.2 + }; + + c->val()->get(values); + d->val()->get(values2); + + CHECK( std::equal(values.begin(), values.end(), + cValues.begin(), floatApprox2) ); + + CHECK( std::equal(values2.begin(), values2.end(), + dValues.begin(), floatApprox2) ); + } + } #ifdef CUDA_FOUND diff --git a/src/tests/units/transformer_tests.cpp b/src/tests/units/transformer_tests.cpp new file mode 100644 index 000000000..4f7cc4d29 --- /dev/null +++ b/src/tests/units/transformer_tests.cpp @@ -0,0 +1,147 @@ +/* All or part of this file was contributed by NVIDIA under license: + * Copyright (C) 2020 NVIDIA Corporation + * SPDX-License-Identifier: MIT + */ 
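As a cross-check for the "Scan operations" expectations above (not part of this patch): a minimal scalar reference of logcumsumexp over one row. The actual kernels additionally subtract the global max for numeric stability unless fast=true; the scan identity is -inf, which is why the exclusive variant starts each row with -inf.

#include <cmath>
#include <cstddef>
#include <limits>
#include <vector>

// running log-sum-exp over a single row; exclusive shifts the result by one position
inline std::vector<float> refLogCumSumExp(const std::vector<float>& row, bool exclusive = false) {
  std::vector<float> out(row.size());
  float acc = -std::numeric_limits<float>::infinity();  // log(0), the scan identity
  for(std::size_t j = 0; j < row.size(); ++j) {
    if(exclusive)
      out[j] = acc;                                      // value before adding row[j]
    acc = std::log(std::exp(acc) + std::exp(row[j]));    // unstabilized accumulation
    if(!exclusive)
      out[j] = acc;
  }
  return out;
}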
+#include "catch.hpp" +#include "graph/expression_graph.h" +#include "graph/expression_operators.h" +#include "layers_new/transformer.h" + +#ifdef CUDA_FOUND +#include "tensors/gpu/backend.h" +#endif + +#include + +using namespace marian; + +template +void tests(DeviceType device, Type floatType = Type::float32) { + +// Checking for FP16 support and skipping if not supported. +#ifdef CUDA_FOUND + if(device == DeviceType::gpu && floatType == Type::float16) { + auto gpuBackend = New(DeviceId({0, device}), /*seed=*/1234); + auto cudaCompute = gpuBackend->getCudaComputeCapability(); + if(cudaCompute.major < 6) return; + } +#endif + + auto floatApprox = [](T x, T y) -> bool { return x == Approx(y).margin(0.001f); }; + auto floatApprox2 = [](T x, T y) -> bool { return x == Approx(y).margin(0.01f); }; + auto floatEqual = [](T x, T y) -> bool { return x == y; }; + + Config::seed = 4321; + auto graph = New(); + + graph->setInference(true); + graph->setDefaultElementType(floatType); + graph->setDevice({0, device}); + graph->reserveWorkspaceMB(16); + + std::vector values; + + SECTION("Test equivalence of layers and specialized operators") { + graph->clear(); + values.clear(); + + std::vector vecState = { + 0.82858741, 0.97615969, 0.67942131, 0.17952891, + 0.65630823, 0.38350773, 0.74830967, 0.67770803, + 0.00955211, 0.02345274, 0.02023151, 0.97143453, + 0.89971799, 0.50413132, 0.62781775, 0.59496081, + 0.14006306, 0.46450409, 0.91360050, 0.10497642, + 0.25477138, 0.63996094, 0.53658444, 0.88240266, + 0.37349635, 0.38880551, 0.18208119, 0.62951839, + 0.04330675, 0.59304160, 0.20436798, 0.74339235, + 0.32903627, 0.81596214, 0.44163024, 0.92444748, + 0.80231488, 0.52994978, 0.13350771, 0.40195912, + 0.55303711, 0.55137914, 0.98701674, 0.54963994, + 0.45657760, 0.57295781, 0.58645976, 0.74960953, + 0.77174628, 0.06652048, 0.68104792, 0.84806365, + 0.75292617, 0.82063907, 0.96599948, 0.63845992, + 0.47047511, 0.48726216, 0.95756608, 0.01479877, + 0.75449765, 0.55964196, 0.66664016, 0.34928808 + }; + + auto state = graph->constant({2, 2, 4, 4}, inits::fromVector(vecState)); + + using namespace marian::nn; + + auto rnn = New>(graph, state->shape()[-1], /*transformer-rnn-projection*/true); + auto output = rnn->apply(state); + + auto iProj = rnn->cell->iProj->weight; + auto iBias = rnn->cell->iProj->bias; + + auto fProj = rnn->cell->fProj->weight; + auto fBias = rnn->cell->fProj->bias; + + auto oProj = rnn->oProj->weight; + auto oBias = rnn->oProj->bias; + +#if 0 + debug(output, "output"); + + auto x = affine(state, iProj, iBias); + auto f = affine(state, fProj, fBias); + + auto ssruFwd = [=](Expr out, const std::vector& inputs) { + auto x = inputs[0]; + auto f = inputs[1]; + + SSRUScanForward(out->val(), x->val(), f->val()); + }; + + auto output2 = lambda({x, f}, x->shape(), x->value_type(), ssruFwd); + + output2 = relu(output2); + output2 = affine(output, oProj, oBias); + debug(output2, "output2"); +#endif + + graph->forward(); + + std::vector expected = { + -0.23135981, 0.04476057, 0.16183880, -0.13936377, + -0.47255400, -0.00786887, 0.10853745, -0.06822529, + -0.51970947, -0.10289559, -0.06798580, 0.10712720, + -0.58211476, -0.10762983, -0.06099827, 0.10525966, + -0.33873928, 0.07430670, 0.24815071, -0.21479189, + -0.50458324, -0.01065392, 0.11723585, -0.07428676, + -0.47146145, -0.07140756, -0.01806587, 0.05478236, + -0.49719882, -0.10403568, -0.07004700, 0.10721481, + -0.31213918, -0.07793316, -0.06812444, 0.09076738, + -0.26403564, -0.08575443, -0.10109652, 0.11913717, + -0.57269764, -0.03178894, 
0.08730030, -0.03967147, + -0.63041478, -0.07102037, 0.02447471, 0.02596882, + -0.40184090, -0.07519485, -0.04389046, 0.07439522, + -0.62908661, -0.03906321, 0.08765715, -0.03556710, + -0.54157418, 0.06784889, 0.27720353, -0.22676750, + -0.50410551, 0.02381870, 0.17982434, -0.13504542 + }; + + output->val()->get(values); + + CHECK(values.size() == expected.size()); + // CHECK(std::equal(values.begin(), values.end(), expected.begin(), floatApprox)); + } +} + +#ifdef CUDA_FOUND +TEST_CASE("Expression graph supports basic math operations (gpu)", "[operator]") { + tests(DeviceType::gpu); +} + +#if COMPILE_FP16 +TEST_CASE("Expression graph supports basic math operations (gpu fp16)", "[operator]") { + tests(DeviceType::gpu, Type::float16); +} +#endif +#endif + +#ifdef BLAS_FOUND +TEST_CASE("Expression graph supports basic math operations (cpu)", "[operator]") { + tests(DeviceType::cpu); +} +#endif diff --git a/src/translator/beam_search.cpp b/src/translator/beam_search.cpp index 1eeef913a..63aa0ec8f 100644 --- a/src/translator/beam_search.cpp +++ b/src/translator/beam_search.cpp @@ -79,7 +79,6 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current if(pathScore == INVALID_PATH_SCORE) // (dummy slot or word that cannot be expanded by current factor) continue; - ABORT_IF(pathScore < INVALID_PATH_SCORE, "Actual pathScore ({}) is lower than INVALID_PATH_SCORE ({})??", pathScore, INVALID_PATH_SCORE); // This should not happen in valid situations. Currently the only smaller value would be -inf (effect of overflow in summation?) ABORT_IF(beamHypIdx >= beam.size(), "Out of bounds beamHypIdx??"); // effectively this is equivalent to ABORT_IF(beams[origBatchIdx].empty(), ...) // map wordIdx to word @@ -450,19 +449,27 @@ Histories BeamSearch::search(Ptr graph, Ptr logProbs = states[i]->getLogProbs().getFactoredLogits(factorGroup, /*shortlist=*/ nullptr, hypIndices, maxBeamSize); // [maxBeamSize, 1, currentDimBatch, dimVocab] } // expand all hypotheses, [maxBeamSize, 1, currentDimBatch, 1] -> [maxBeamSize, 1, currentDimBatch, dimVocab] - if(i == 0) - stepScores = scorers_[i]->getWeight() * logProbs; - else + if(i == 0) { + stepScores = scorers_[i]->getWeight() * logProbs; + } else { stepScores = stepScores + scorers_[i]->getWeight() * logProbs; + } } + // we cast (ensembled) scores to float32, as accumulated them into path scores; + // also beneficial for sampling etc. + // @TODO:: consider doing this before ensembling + stepScores = cast(stepScores, Type::float32); + if(factorGroup == 0) { stepScores = distMod->force(stepScores, (int)t, (int)maxBeamSize, batchIndices); - stepScores = distMod->sample(stepScores); + stepScores = distMod->sample(stepScores, /*normalize=*/true); } // make beams continuous - auto expandedPathScores = prevPathScores + cast(stepScores, Type::float32); // will become [maxBeamSize, 1, currDimBatch, dimVocab] + auto expandedPathScores = prevPathScores + stepScores; // will become [maxBeamSize, 1, currDimBatch, dimVocab] + + // this transpose is required for the combined top-k search below expandedPathScores = swapAxes(expandedPathScores, 0, 2); // -> [currentDimBatch, 1, maxBeamSize, dimVocab] // perform NN computation diff --git a/src/translator/beam_search.h b/src/translator/beam_search.h index 75a9caeb0..0810b3332 100644 --- a/src/translator/beam_search.h +++ b/src/translator/beam_search.h @@ -18,9 +18,9 @@ class BeamSearch { const bool PURGE_BATCH = true; // @TODO: diagnostic, to-be-removed once confirmed there are no issues. 
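A minimal sketch (not part of this patch) of the headroom argument behind the chooseInvalidPathScore change just below: with float32 path-score accumulation, using lowest itself as the invalid marker overflows to -inf as soon as two such markers are summed, whereas lowest / 2 keeps the sum finite.

#include <cstdio>
#include <limits>

int main() {
  float lowest  = std::numeric_limits<float>::lowest();
  float invalid = lowest / 2.f;             // marker for finished/invalid beams
  std::printf("%g\n", lowest + lowest);     // -inf: overflows past the float range
  std::printf("%g\n", invalid + invalid);   // finite: exactly lowest
  return 0;
}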
static float chooseInvalidPathScore(Ptr options) { - auto prec = options->get>("precision", {"float32"}); - auto computeType = typeFromString(prec[0]); - return NumericLimits(computeType).lowest; + // We are now using float32 for accumulation along path scores, so we can just use float32 for the invalid scores + // Division by 2 to stay away from -inf. Here lowest / 2.f is bascially a magic number that marks finished beams. + return NumericLimits(Type::float32).lowest / 2.f; } public: diff --git a/src/translator/sampling.h b/src/translator/sampling.h index 4ac2063e9..2b13791d9 100644 --- a/src/translator/sampling.h +++ b/src/translator/sampling.h @@ -1,141 +1,240 @@ - namespace marian { - - class DistModifier { - private: - Ptr options_; - bool forceDecode_{false}; - bool sampling_{false}; - std::string samplingMethod_; - int topk_{10}; - float temperature_{1.f}; - - Ptr batch_; - float invalidPathScore_; - - Expr forceBatch_; +namespace marian { + +namespace sampling { + +// Prunning functions for sampling from the output distribution +// All functions take a logits tensor and return a tensor of the same shape and pruned values removed. +// The logits tensor is assumed to be in log-space (i.e. logprobs) and the returned tensor is also in log-space. +// The pruned distribution can be renormalized via logsoftmax to ensure that the sum of the probabilities is 1. +// However this doesn't matter much for sampling since the gumbel max trick works for unnormalized distributions. + +// Prune logits via top-k pruning +Expr topkPruning(Expr scores, int k, bool normalize = false) { + Expr val, idx; + + // note, for around k>200 topk is slower on the GPU than sorting and then selecting the top-k values + std::tie(val, idx) = topk(scores, k, /*axis=*/-1, /*descending=*/true); + if(normalize) + val = logsoftmax(val); // renormalize via logsoftmax + + // Scatter gumbelled values back into logits to fill with usable values + auto invalid = constant_like(scores, inits::fromValue(std::log(0.f))); + return scatter(invalid, /*axis=*/-1, idx, val); +} + +// Prune logits via nucleus pruning +Expr nucleusPruning(Expr scores, float threshold, bool normalize = false) { + // normalization would make sense here since we compare against a meaningful threshold and + // we don't know what other manipulations have been done to the logits before, but + // leaving it to the user for now. We do set it to true in beam_search.cpp + if(normalize) + scores = logsoftmax(scores); // renormalize via logsoftmax + + // sort scores in descending order, this way we can use the cumulative sum to find the nucleus + Expr val, idx; + std::tie(val, idx) = sort(scores, /*axis=*/-1, /*descending=*/true); + + // logcumsumexp because we have logprobs, exclusive because we keep at least the first element + // we can skip the numerical stability trick here since we are in log-space + auto lcse = logcumsumexp(val, /*axis=*/-1, /*reverse=*/false, /*exclusive=*/true, /*fast=*/true); + + // mask out all values that for which the cumulative sum is larger than the threshold (i.e. 
they are outside the nucleus) + auto lcseMask = log(le(lcse, std::log(threshold))); + val = minimum(val, lcseMask); // mask out all values outside the nucleus + + if(normalize) + val = logsoftmax(val); // renormalize via logsoftmax + + // scatter the masked values back into the correct positions (undo sorting) + return scatter(scores, /*axis=*/-1, idx, val); +} + +// Prune logits via epsilon pruning +Expr epsilonPruning(Expr scores, float epsilon, bool normalize = false) { + // normalization would make sense here since we compare against a meaningful threshold and + // we don't know what other manipulations have been done to the logits before + if(normalize) + scores = logsoftmax(scores); // renormalize via logsoftmax + + // make sure the epsilon is not larger than the largest value in the scores + // otherwise we will mask out all values + // equivalent to union of top-1 and log(epsilon) + auto safeThreshold = minimum(max(scores, /*axis=*/-1), std::log(epsilon)); + + // create mask for all values that are smaller than the epsilon + auto logEpsMask = log(ge(scores, safeThreshold)); // -inf for all values smaller than epsilon + auto logEpsScores = minimum(scores, logEpsMask); // mask out all values smaller than epsilon + + if(normalize) + logEpsScores = logsoftmax(logEpsScores); // renormalize after masking via logsoftmax + return logEpsScores; +} + +Expr gumbelMaxTrick(Expr scores, float temperature) { + // scale scores by temperature + if(temperature != 1.f) + scores = scores / temperature; + // add Gumbel noise to all values and renormalize via logsoftmax + return logsoftmax(scores + constant_like(scores, inits::gumbel())); +} +} // namespace sampling + +class DistModifier { +private: + Ptr options_; + bool forceDecode_{false}; + + bool sampling_{false}; + std::function samplingFn_; + + Ptr batch_; + float invalidPathScore_; + + Expr forceBatch_; + +public: + DistModifier(Ptr options, Ptr batch, float invalidPathScore) : + options_(options), forceDecode_(options_->get("force-decode", false)), + batch_(batch), invalidPathScore_(invalidPathScore) { - public: - DistModifier(Ptr options, Ptr batch, float invalidPathScore) : - options_(options), forceDecode_(options_->get("force-decode", false)), - batch_(batch), invalidPathScore_(invalidPathScore) { - - if(options_->hasAndNotEmpty("output-sampling")) { + if(options_->hasAndNotEmpty("output-sampling")) { + sampling_ = true; + auto samplingOpts = options_->get>("output-sampling", {}); + std::string samplingMethod = samplingOpts.size() > 0 ? samplingOpts[0] : "full"; + + if(samplingMethod == "0") { // for backcompat with boolean values + sampling_ = false; + samplingMethod = ""; + } else if(samplingMethod == "1") { // for backcompat with boolean values sampling_ = true; - auto samplingOpts = options_->get>("output-sampling", {}); - samplingMethod_ = samplingOpts.size() > 0 ? 
samplingOpts[0] : "full"; - if(samplingMethod_ == "0") { // for backcompat with boolean values - sampling_ = false; - samplingMethod_ = ""; - } else if(samplingMethod_ == "1") { // for backcompat with boolean values - sampling_ = true; - samplingMethod_ = "full"; - } - - if(samplingMethod_ == "full") { - if(samplingOpts.size() > 1) - temperature_ = std::stof(samplingOpts[1]); - } - - if(samplingMethod_ == "topk") { - if(samplingOpts.size() > 1) - topk_ = std::stoi(samplingOpts[1]); - if(samplingOpts.size() > 2) - temperature_ = std::stof(samplingOpts[2]); - } + samplingMethod = "full"; + } + + if(samplingMethod == "full") { + float temperature = 1.f; + if(samplingOpts.size() > 1) + temperature = std::stof(samplingOpts[1]); + + LOG_ONCE(info, "Output sampling from the full softmax distribution with temperature {}", temperature); + + samplingFn_ = [temperature](Expr logits, bool normalize = false) { + // full softmax sampling is just gumbel trick with temperature 1 and optional prior renormalization + return sampling::gumbelMaxTrick(normalize ? logsoftmax(logits) : logits, temperature); + }; + } else if(samplingMethod == "topk") { + int topk = 10; // number of top-k values to sample from + float temperature = 1.f; + if(samplingOpts.size() > 1) + topk = std::stoi(samplingOpts[1]); + if(samplingOpts.size() > 2) + temperature = std::stof(samplingOpts[2]); + + LOG_ONCE(info, "Output sampling via top-{} sampling with temperature {}", topk, temperature); + + samplingFn_ = [topk, temperature](Expr logits, bool normalize = false) { + // top-k sampling is just gumbel trick with temperature 1 and top-k pruning + return sampling::gumbelMaxTrick(sampling::topkPruning(logits, topk, normalize), temperature); + }; + } else if(samplingMethod == "nucleus") { + float threshold = 0.9f; // probability mass threshold of nucleus + float temperature = 1.f; + if(samplingOpts.size() > 1) + threshold = std::stof(samplingOpts[1]); + if(samplingOpts.size() > 2) + temperature = std::stof(samplingOpts[2]); + + LOG_ONCE(info, "Output sampling via nucleus sampling with threshold {} temperature {}", threshold, temperature); + + samplingFn_ = [threshold, temperature](Expr logits, bool normalize = false) { + // nucleus sampling is just gumbel trick with temperature 1 and nucleus pruning + return sampling::gumbelMaxTrick(sampling::nucleusPruning(logits, threshold, normalize), temperature); + }; + } else if(samplingMethod == "epsilon") { + float eps = 0.02f; // mimimal probability of sampled token + float temperature = 1.f; + if(samplingOpts.size() > 1) + eps = std::stof(samplingOpts[1]); + if(samplingOpts.size() > 2) + temperature = std::stof(samplingOpts[2]); + + LOG_ONCE(info, "Output sampling via epsilon sampling with eps {} and temperature {}", eps, temperature); + + samplingFn_ = [eps, temperature](Expr logits, bool normalize = false) { + // epsilon sampling is just gumbel trick with temperature 1 and epsilon pruning + return sampling::gumbelMaxTrick(sampling::epsilonPruning(logits, eps, normalize), temperature); + }; + } else { + ABORT("Unknown sampling method: {}", samplingMethod); } } - - Expr force(Expr scores, int pos, int beamSize, std::vector& batchIndices) { - // we check the last field of the batch for force-decoding content - int dimTime = (int)batch_->back()->batchWidth(); - if(!forceDecode_ || pos >= dimTime) // nothing to force-decode, just return original scores - return scores; - - LOG_ONCE(info, "Force-decoding with given prefixes"); - // if we get here, then we have to do force-decoding. 
We do this by "softly" modifying the scores and passing the - // result to the normal top-k/beam search. "Softly" here means we add masking terms rather than making hard selections - // which preserves the original tensor layout. - // This allows for beam-search and batched force-decoding with different length prefixes in a batch - // (way harder to do with actual index manipulation). We then return modified (masked) probabilities to the beam-search - // which then continues as normal on the modified distribution. - - if(!forceBatch_) { - // turn the batch into a cached tensor that lives in the computation graph - std::vector forceWords; - for(auto& word : batch_->back()->data()) - forceWords.push_back(word.toWordIndex()); - - int dimBatch = (int)batch_->back()->batchSize(); - forceBatch_ = scores->graph()->constant({1, dimTime, dimBatch, 1}, inits::fromVector(forceWords), Type::uint32); // [1, dimTime, dimBatch, 1] - } - - // if we remove batch entries during decoding (finished decoding) then adjust here - if(forceBatch_->shape()[-2] != batchIndices.size()) - forceBatch_ = index_select(forceBatch_, -2, batchIndices); - - // get vocab index and probability for force-decoded tokens for the current time step - Expr forceIndices = slice(forceBatch_, /*axis=*/-3, pos); // [1, 1, dimBatch, 1] - Expr forceVals = gather(scores, /*axis=*/-1, forceIndices); // [1, 1, dimBatch, 1] - - // create dummy indices and values for beam entries other then the force-decoded value. This is required to ensure that the beam - // does not collapse for hyps outside the forced hyps and can still do full beam-search once we finish force-decoding for a batch - // entry. We initialize randomly (they are not going to be used anyway due to very low prob) and shift by 1 to have 0 at first postion. - int dimVocab = scores->shape()[-1]; - auto graph = scores->graph(); - // we start at 256 to skip over suppressed special words in SentencePiece @TODO: this should be somehow inferred. - Expr dummyIndices = shift(graph->constant({1, 1, 1, beamSize}, inits::uniform(256.f, (float)dimVocab)), {0, 0, 0, 1}, 0.f); - // we use a range of invalidPathScore_ to invalidPathScore_ / 2 to make sure that the probabilities stay low, but larger than invalidPathScore_ itself. - Expr dummyVals = shift(graph->constant({1, 1, 1, beamSize}, inits::uniform(invalidPathScore_, invalidPathScore_ / 2.f)), {0, 0, 0, 1}, 0.f); - - // here we add the force-decoded entries back into the zeroed positions - dummyIndices = cast(cast(dummyIndices, Type::float32) + cast(forceIndices, Type::float32), Type::uint32); - dummyVals = dummyVals + forceVals; - - // create a tensor of the same size as the original logits, initialize with invalidPathScore and then scatter the force-decoded and - // dummy values into the correct positions. - Expr forcedScores = constant_like(scores, inits::fromValue(invalidPathScore_)); - forcedScores = scatter(forcedScores, -1, dummyIndices, dummyVals); - - // for entries that have finished force-decoding (the batch has eosId as vocab id) use the original logits for the whole batch entry - // via interpolating by a selector. In marian eosId is used for padding, so this works everywhere and eos for unfinished hyps means - // free decoding or sampling. 
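For background on the sampling helpers introduced above (not part of this patch): gumbelMaxTrick relies on the Gumbel-max property, i.e. adding independent Gumbel(0,1) noise to possibly unnormalized log-scores and taking the argmax draws an index with probability softmax(score / temperature). A scalar sketch with illustrative names:

#include <cmath>
#include <limits>
#include <random>
#include <vector>

// draws one index proportionally to softmax(logScores / temperature)
inline int gumbelMaxSample(const std::vector<float>& logScores, float temperature, std::mt19937& rng) {
  std::uniform_real_distribution<float> uniform(1e-8f, 1.f);  // avoid log(0)
  int best = 0;
  float bestVal = -std::numeric_limits<float>::infinity();
  for(int i = 0; i < (int)logScores.size(); ++i) {
    float gumbel = -std::log(-std::log(uniform(rng)));        // Gumbel(0,1) sample
    float v = logScores[i] / temperature + gumbel;
    if(v > bestVal) { bestVal = v; best = i; }
  }
  return best;
}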
- WordIndex eosId = batch_->back()->vocab()->getEosId().toWordIndex(); - auto interpol = eq(cast(forceIndices, scores->value_type()), (float)eosId); - return interpol * scores + (1.f - interpol) * forcedScores; + } + + Expr force(Expr scores, int pos, int beamSize, std::vector& batchIndices) { + // we check the last field of the batch for force-decoding content + int dimTime = (int)batch_->back()->batchWidth(); + if(!forceDecode_ || pos >= dimTime) // nothing to force-decode, just return original scores + return scores; + + LOG_ONCE(info, "Force-decoding with given prefixes"); + // if we get here, then we have to do force-decoding. We do this by "softly" modifying the scores and passing the + // result to the normal top-k/beam search. "Softly" here means we add masking terms rather than making hard selections + // which preserves the original tensor layout. + // This allows for beam-search and batched force-decoding with different length prefixes in a batch + // (way harder to do with actual index manipulation). We then return modified (masked) probabilities to the beam-search + // which then continues as normal on the modified distribution. + + if(!forceBatch_) { + // turn the batch into a cached tensor that lives in the computation graph + std::vector forceWords; + for(auto& word : batch_->back()->data()) + forceWords.push_back(word.toWordIndex()); + + int dimBatch = (int)batch_->back()->batchSize(); + forceBatch_ = scores->graph()->constant({1, dimTime, dimBatch, 1}, inits::fromVector(forceWords), Type::uint32); // [1, dimTime, dimBatch, 1] } - Expr sample(Expr scores) { - if(sampling_) { - if(temperature_ != 1.f) - scores = scores / temperature_; - - if(samplingMethod_ == "full") { - LOG_ONCE(info, "Output sampling from the full softmax distribution with temperature {}", temperature_); - return logsoftmax(scores + constant_like(scores, inits::gumbel())); - } else if(samplingMethod_ == "topk") { - if(topk_ == 1) - LOG_ONCE(info, "Output sampling with k=1 is equivalent to beam search with beam size 1"); - LOG_ONCE(info, "Output sampling via top-{} sampling with temperature {}", topk_, temperature_); - - Expr invalidLogits = constant_like(scores, inits::fromValue(invalidPathScore_)); - - // select top-k values - Expr val, idx; - std::tie(val, idx) = topk(scores, topk_, /*axis=*/-1, /*descending=*/true); - - // Add Gumbel noise to top-k values only and compute logsoftmax, used for argmax sampling later in beam-search - Expr gumbelVal = logsoftmax(val + constant_like(val, inits::gumbel())); - - // Scatter gumbelled values back into logits to fill with usable values - return scatter(invalidLogits, -1, idx, gumbelVal); - } else { - ABORT("Unknown sampling method: {}", samplingMethod_); - } - } else { // no sampling - return scores; - } + // if we remove batch entries during decoding (finished decoding) then adjust here + if(forceBatch_->shape()[-2] != batchIndices.size()) + forceBatch_ = index_select(forceBatch_, -2, batchIndices); + + // get vocab index and probability for force-decoded tokens for the current time step + Expr forceIndices = slice(forceBatch_, /*axis=*/-3, pos); // [1, 1, dimBatch, 1] + Expr forceVals = gather(scores, /*axis=*/-1, forceIndices); // [1, 1, dimBatch, 1] + + // create dummy indices and values for beam entries other then the force-decoded value. This is required to ensure that the beam + // does not collapse for hyps outside the forced hyps and can still do full beam-search once we finish force-decoding for a batch + // entry. 
We initialize randomly (they are not going to be used anyway due to very low prob) and shift by 1 to have 0 at first postion. + int dimVocab = scores->shape()[-1]; + auto graph = scores->graph(); + // we start at 256 to skip over suppressed special words in SentencePiece @TODO: this should be somehow inferred. + Expr dummyIndices = shift(graph->constant({1, 1, 1, beamSize}, inits::uniform(256.f, (float)dimVocab)), {0, 0, 0, 1}, 0.f); + // we use a range of invalidPathScore_ to invalidPathScore_ / 2 to make sure that the probabilities stay low, but larger than invalidPathScore_ itself. + Expr dummyVals = shift(graph->constant({1, 1, 1, beamSize}, inits::uniform(invalidPathScore_, invalidPathScore_ / 2.f)), {0, 0, 0, 1}, 0.f); + + // here we add the force-decoded entries back into the zeroed positions + dummyIndices = cast(cast(dummyIndices, Type::float32) + cast(forceIndices, Type::float32), Type::uint32); + dummyVals = dummyVals + forceVals; + + // create a tensor of the same size as the original logits, initialize with invalidPathScore and then scatter the force-decoded and + // dummy values into the correct positions. + Expr forcedScores = constant_like(scores, inits::fromValue(invalidPathScore_)); + forcedScores = scatter(forcedScores, -1, dummyIndices, dummyVals); + + // for entries that have finished force-decoding (the batch has eosId as vocab id) use the original logits for the whole batch entry + // via interpolating by a selector. In marian eosId is used for padding, so this works everywhere and eos for unfinished hyps means + // free decoding or sampling. + WordIndex eosId = batch_->back()->vocab()->getEosId().toWordIndex(); + auto interpol = eq(cast(forceIndices, scores->value_type()), (float)eosId); + return interpol * scores + (1.f - interpol) * forcedScores; + } + + Expr sample(Expr scores, bool normalize = false) { + if(sampling_) { + return samplingFn_(scores, normalize); + } else { // no sampling + return scores; } + } +}; - }; - - } \ No newline at end of file +} \ No newline at end of file diff --git a/src/translator/translator.h b/src/translator/translator.h index f1fd04d3f..498ef65b3 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -125,12 +125,8 @@ class Translate : public ModelTask { if(options_->hasAndNotEmpty("output-sampling")) { if(options_->get("beam-size") > 1) LOG(warn, - "[warning] Output sampling and beam search (beam-size > 1) are contradictory methods " - "and using them together is not recommended. Set beam-size to 1"); - if(options_->get>("models").size() > 1) - LOG(warn, - "[warning] Output sampling and model ensembling are contradictory methods and using " - "them together is not recommended. Use a single model"); + "[warning] Enabling output sampling and beam search together (--output-sampling [...] && --beam-size > 1) results in so-called stochastic beam-search. " + "Are you sure this is desired? For normal sampling, use --beam-size 1."); } } From 5e47ab2ac4c916ab6f687598f81e19bdf326a509 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sat, 6 Jan 2024 01:54:17 +0000 Subject: [PATCH 06/26] Merged PR 32433: Fix Logmask in BLEURT model This adjusts the logmask computation to match the implementation in COMET-QE model after the ALIBI refactoring. 
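For reference, the fix routes the BLEURT encoder mask through the same additive log-mask path used by the COMET-QE encoder: instead of multiplying activations with a binary time/batch mask, the mask is converted into an additive term (0 for visible positions, a very large negative value for padding) that is added to the attention logits. A minimal standalone illustration of that transformation, not the actual Marian maskProcessor code (the helper name and the -1e9 constant are placeholders):

#include <vector>

// Turn a binary key mask (1 = real token, 0 = padding) into an additive
// log-mask: 0 leaves a logit unchanged, a large negative value drives the
// post-softmax attention weight of padded positions to (almost) zero.
std::vector<float> toLogMask(const std::vector<float>& binaryMask,
                             float maskValue = -1e9f) {
  std::vector<float> logMask(binaryMask.size());
  for(size_t i = 0; i < binaryMask.size(); ++i)
    logMask[i] = binaryMask[i] > 0.f ? 0.f : maskValue;
  return logMask;
}

In the diff below this corresponds roughly to what maskProcessor->apply(output, binaryMask) produces before the result is handed to the transformer layers.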
--- CHANGELOG.md | 1 + VERSION | 2 +- src/models/bleurt.h | 50 +++++++++++++++++++++++---------------------- 3 files changed, 28 insertions(+), 25 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 51df73b57..83c05ac4c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - New experimental layer framework for Transformer-like models. ### Fixed +- Fixed BLEURT logmask computation - Fixed wrong paramter name for norm in new layer framework - Fixed unit test for LayerNorm - Only collect batch statistics during mini-batch-fit up to actual max-length. diff --git a/VERSION b/VERSION index 658123368..5235dd6a9 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.16 +v1.12.17 diff --git a/src/models/bleurt.h b/src/models/bleurt.h index baeb704a5..74848b788 100644 --- a/src/models/bleurt.h +++ b/src/models/bleurt.h @@ -12,11 +12,11 @@ class BleurtTypeEmbeddingLayer : public nn::LayerWithOptions { public: Expr embeddings; - BleurtTypeEmbeddingLayer(Ptr graph, Ptr options) + BleurtTypeEmbeddingLayer(Ptr graph, Ptr options) : LayerWithOptions(graph, options) {} virtual ~BleurtTypeEmbeddingLayer() = default; - + Expr apply(Ptr subBatch) const { int dimEmb = opt("dim-emb"); int dimTypes = opt("bert-type-vocab-size", 2); @@ -27,7 +27,7 @@ class BleurtTypeEmbeddingLayer : public nn::LayerWithOptions { const auto& words = subBatch->data(); const auto vocab = subBatch->vocab(); - + // Get word id of special symbols Word sepId = vocab->getEosId(); @@ -55,10 +55,10 @@ class BleurtTypeEmbeddingLayer : public nn::LayerWithOptions { struct BleurtEncoder final : public nn::TransformerEncoder { Ptr eProj; - BleurtEncoder(Ptr graph, - Ptr options) + BleurtEncoder(Ptr graph, + Ptr options) : TransformerEncoder(graph, options) { - + eProj = New(graph, opt("transformer-dim-model")); registerLayer(eProj); @@ -68,33 +68,35 @@ struct BleurtEncoder final : public nn::TransformerEncoder { Expr apply(Expr input, Expr mask) const override { auto output = marian::nn::swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] - mask = marian::nn::swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] - + + auto binaryMask = marian::nn::swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] + auto logMask = maskProcessor->apply(output, binaryMask); // [beam depth=1, batch size * numHeads, max length, vector dim=1] + // apply positional embeddings to contextual input output = positionEmbedding->apply(output); // apply dropout or layer-norm to embeddings if required output = preprocessor->apply(output); - + // scale from 256 to 1152 output = eProj->apply(output); - + // traverse the layers, use the same mask for each for(auto layer : *layers) - output = layer->apply(output, mask); + output = layer->apply(output, logMask); return output; } }; // Wrapper for backwards compatibility that uses current encoder/decoder framework -struct BleurtBatchEncoder final : public nn::LayerWithOptions, +struct BleurtBatchEncoder final : public nn::LayerWithOptions, public nn::IEmbeddingLayer, // TransformerBatchEncoder is an IEmbeddingLayer that produces contextual embeddings public EncoderBase { // @TODO: should all encoders be IEmbeddingLayer? 
Ptr typeEmbedding; Ptr encoder; - - BleurtBatchEncoder(Ptr graph, + + BleurtBatchEncoder(Ptr graph, Ptr options) : LayerWithOptions(graph, options), EncoderBase(graph, options) @@ -110,7 +112,7 @@ struct BleurtBatchEncoder final : public nn::LayerWithOptions, virtual std::tuple apply(Ptr subBatch) const override { auto embeddingLayer = getEmbeddingLayer(EncoderBase::opt("ulr", false)); const auto& [batchEmbeddings, batchMask] = embeddingLayer->apply(subBatch); - + #if 1 auto typeEmbeddings = typeEmbedding->apply(subBatch); auto embeddings = batchEmbeddings + typeEmbeddings; @@ -142,12 +144,12 @@ struct BleurtBatchEncoder final : public nn::LayerWithOptions, EncoderBase::graph_ = graph; setGraph(graph); // This makes sure that the graph passed into the model during construction and now evaluation are identical. - // A good check to have for catching weird situations early. + // A good check to have for catching weird situations early. ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match"); #endif // @TODO: this needs to convert to a BERT-batch - + const auto& [batchEmbedding, batchMask] = apply((*batch)[batchIndex_]); return New(batchEmbedding, batchMask, batch); } @@ -157,7 +159,7 @@ struct BleurtBatchEncoder final : public nn::LayerWithOptions, } }; -class BleurtPooler final : public nn::LayerWithOptions, +class BleurtPooler final : public nn::LayerWithOptions, public PoolerBase { private: Ptr layers; @@ -167,7 +169,7 @@ class BleurtPooler final : public nn::LayerWithOptions, BleurtPooler(Ptr graph, Ptr options) : LayerWithOptions(graph, options), PoolerBase(graph, options) { - + float dropoutProb = 0.f; layers = New( graph, @@ -176,7 +178,7 @@ class BleurtPooler final : public nn::LayerWithOptions, New(graph, dropoutProb), New(graph, 1) ); - + registerLayer(layers); } @@ -186,15 +188,15 @@ class BleurtPooler final : public nn::LayerWithOptions, PoolerBase::graph_ = graph; setGraph(graph); // This makes sure that the graph passed into the model during construction and now evaluation are identical. - // A good check to have for catching weird situations early. + // A good check to have for catching weird situations early. ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match"); #endif auto modelType = LayerWithOptions::opt("type"); - + auto emb = slice(encoderStates[0]->getContext(), -2, 0); emb = marian::cast(emb, Type::float32); - + Expr output; if(LayerWithOptions::opt("usage") == (int)models::usage::evaluating) { output = layers->apply(emb); @@ -202,7 +204,7 @@ class BleurtPooler final : public nn::LayerWithOptions, output = reshape(output, {dimBatch, 1, 1}); return { output }; } else { - ABORT("Usage other than evaluating not implemented"); + ABORT("Usage other than evaluating not implemented"); } } From fa06754f0fff1b49e6668c88eab86ff350e1e6a9 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Wed, 17 Jan 2024 22:14:41 +0000 Subject: [PATCH 07/26] Merged PR 32547: Add support for sparsemax and comet-22 (not kiwi yet) This adds a sparsemax function and support for COMET-22 ref-based metric. Worth adding a regression test for Unbabel/wmt22-comet-da model later. Scores seem to be pretty much identical to PyTorch implementation when running as float32. 
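Sparsemax (Martins & Astudillo, 2016) is the Euclidean projection of a score vector onto the probability simplex, so unlike softmax it can assign exact zeros to low-scoring entries. A minimal standalone sketch of the computation over a plain std::vector, shown only as an illustration; the graph operator added below expresses the same steps with sort, cumsum and index_select:

#include <algorithm>
#include <functional>
#include <numeric>
#include <vector>

std::vector<float> sparsemaxRef(const std::vector<float>& z) {
  const int K = (int)z.size();

  // sort scores in descending order and build their running sum
  std::vector<float> zSorted(z);
  std::sort(zSorted.begin(), zSorted.end(), std::greater<float>());
  std::vector<float> cumSum(K);
  std::partial_sum(zSorted.begin(), zSorted.end(), cumSum.begin());

  // support size: largest k with 1 + k * z_(k) > sum_{j<=k} z_(j)
  int k = 0;
  for(int j = 1; j <= K; ++j)
    if(1.f + j * zSorted[j - 1] > cumSum[j - 1])
      k = j;

  // threshold tau, then project: p_i = max(z_i - tau, 0)
  float tau = (cumSum[k - 1] - 1.f) / k;
  std::vector<float> p(K);
  for(int i = 0; i < K; ++i)
    p[i] = std::max(z[i] - tau, 0.f);
  return p;
}

In the COMET-22 layer mixing this replaces the softmax over the per-layer weights when --comet-mix-transformation sparsemax is set; the rest of the mixing code stays unchanged.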
--- CHANGELOG.md | 2 + VERSION | 2 +- scripts/comet/comet2marian.py | 26 ++++++----- src/common/config_parser.cpp | 13 +++--- src/graph/expression_operators.cpp | 37 +++++++++++++-- src/graph/expression_operators.h | 15 ++++-- src/models/comet_qe.h | 75 ++++++++++++++++-------------- src/models/encoder_pooler.h | 5 +- 8 files changed, 110 insertions(+), 65 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 83c05ac4c..3e4a170a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Fixed compilation with clang 16.0.6 ### Added +- Added implementation of COMET-22 (reference-based) model and conversion +- Added sparsemax operator (slow version) - Added sampling variants nucleus and epsilon, e.g. `--output-sampling nucleus 0.9` and `--output-sampling epsilon 0.02`, respectively. - Added ALIBI related options to new layer framework. - Added `--no-spm-encode` option, allowing the model to use vocabulary IDs directly to train/decode. diff --git a/VERSION b/VERSION index 5235dd6a9..5c911e82d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.17 +v1.12.18 diff --git a/scripts/comet/comet2marian.py b/scripts/comet/comet2marian.py index d5f86a95f..6b4f557db 100755 --- a/scripts/comet/comet2marian.py +++ b/scripts/comet/comet2marian.py @@ -14,7 +14,7 @@ # supported_comets = [m for m in available_metrics if 'qe' in m.lower()] supported_comets = [ 'wmt20-comet-qe-da', 'wmt20-comet-qe-da-v2', 'wmt21-comet-qe-mqm', 'wmt21-comet-qe-da', - 'wmt20-comet-da', 'wmt21-comet-da' + 'wmt20-comet-da', 'wmt21-comet-da', 'Unbabel/wmt22-comet-da' ] log.basicConfig(level=log.INFO) @@ -32,7 +32,7 @@ def load_from_huggingface(model_id): log.info(f"Loading transformer model from huggingface {model_id}") from transformers import AutoModel, AutoTokenizer try: - model = AutoModel.from_pretrained(model_id, add_pooling_layer=False) + model = AutoModel.from_pretrained(model_id, add_pooling_layer=False) AutoTokenizer.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) return model.eval(), getattr(tokenizer, 'vocab_file', None) @@ -53,7 +53,7 @@ def load_comet_model(model_path): log.info(f"Loading COMET model from checkpoint {model_path}") comet_model = load_from_checkpoint(model_path) comet_model.eval() - + vocab_file = None try: pretrained_model = comet_model.hparams.get('pretrained_model') @@ -106,6 +106,11 @@ def load_comet_model(model_path): config["bert-train-type-embeddings"] = False config["bert-type-vocab-size"] = 0 config["comet-prepend-zero"] = True + +config["comet-mix"] = cometModel.hparams.get("layer") == "mix" +config["comet-mix-norm"] = cometModel.hparams.get('layer_norm', False) +config["comet-mix-transformation"] = cometModel.hparams.get("layer_transformation", "softmax"); + if not args.roberta: config["comet-final-sigmoid"] = args.add_sigmoid config["comet-pooler-ffn"] = [2048, 1024] @@ -155,15 +160,15 @@ def extract(layer, nth, level): blockPrefix = f"{prefix}->encoder->layers->at({nth})->as()->selfAttentionBlock" - # self-attention + # self-attention # query transformation convert(pd, ["attention.self.query.weight"], f"{blockPrefix}->selfAttention->qProj->weight") convert(pd, ["attention.self.query.bias"], f"{blockPrefix}->selfAttention->qProj->bias", bias=True) - + # key transformation convert(pd, ["attention.self.key.weight"], f"{blockPrefix}->selfAttention->kProj->weight") convert(pd, ["attention.self.key.bias"], f"{blockPrefix}->selfAttention->kProj->bias", bias=True) - + # values 
transformation convert(pd, ["attention.self.value.weight"], f"{blockPrefix}->selfAttention->vProj->weight") convert(pd, ["attention.self.value.bias"], f"{blockPrefix}->selfAttention->vProj->bias", bias=True) @@ -176,7 +181,7 @@ def extract(layer, nth, level): convert(pd, ["attention.output.LayerNorm.weight"], f"{blockPrefix}->postprocessor->norm->weight", bias=True) convert(pd, ["attention.output.LayerNorm.bias"], f"{blockPrefix}->postprocessor->norm->bias", bias=True) - # ffn + # ffn # first ffn layer blockPrefix = f"{prefix}->encoder->layers->at({nth})->as()->filterBlock" @@ -206,7 +211,7 @@ def extract(layer, nth, level): marianModel["Wemb"] = npWemb prefix = "CometEncoder" - + # shift position embeddings so that we are back at 512 items and start at 0 npPos = pd["position_embeddings.weight"].detach().numpy() npPos = npPos[2:, :].copy() @@ -234,9 +239,6 @@ def extract(layer, nth, level): # gamma for weird batch/layer-norm step in pooler/encoder of COMET # @TODO: make optional marianModel["CometEncoder->encoder->gamma"] = pd["gamma"].detach().numpy().copy() - config["comet-mix"] = True - config["comet-mix-norm"] = True - elif name == "FeedForward": for n, p in layer.named_parameters(): @@ -262,7 +264,7 @@ def extract(layer, nth, level): convert(pd, ["ff.3.bias"], f"{prefix}->layers->at(3)->as()->bias", bias=True) convert(pd, ["ff.6.weight"], f"{prefix}->layers->at(6)->as()->weight") - convert(pd, ["ff.6.bias"], f"{prefix}->layers->at(6)->as()->bias", bias=True) + convert(pd, ["ff.6.bias"], f"{prefix}->layers->at(6)->as()->bias", bias=True) else: recurse(layer, level + 1) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index d797b8e2d..ec85e40ad 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -331,11 +331,11 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { cli.add("--transformer-disable-position-embeddings", "Do not add any position embeddings. Use e.g. with --transformer-attention-mask alibi"); - cli.add("--transformer-alibi-trainable", + cli.add("--transformer-alibi-trainable", "Make alibi slopes trainable, default slopes are constant"); // handy shortcut for the current best setup - cli.add("--alibi", + cli.add("--alibi", "Use alibi settings for transformer, this is a shortcut for --transformer-attention-mask alibi --transformer-alibi-shift --transformer-disable-position-embeddings --separator-symbol [eos]"); cli.alias("alibi", "true", [](YAML::Node& config) { // define current-best alibi settings @@ -361,9 +361,10 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { // Options specific for the "comet-qe" model type cli.add("--comet-final-sigmoid", "Add final sigmoid to COMET model"); cli.add("--comet-stop-grad", "Do not propagate gradients through COMET model"); - + cli.add("--comet-mix", "Mix encoder layers to produce embedding"); cli.add("--comet-mix-norm", "Normalize layers prior to mixing"); + cli.add("--comet-mix-transformation", "Which transformation to apply to layer mixing (softmax [default] or sparsemax)", "softmax"); cli.add("--comet-dropout", "Dropout for pooler layers", 0.1f); cli.add("--comet-mixup", "Alpha parameter for Beta distribution for mixup", 0.0f); cli.add("--comet-mixup-reg", "Use original and mixed-up samples in training"); @@ -418,7 +419,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { "Do not create model checkpoints, only overwrite main model file with last checkpoint. 
" "Reduces disk usage"); cli.add("--overwrite-checkpoint", - "When --overwrite=false (default) only model files get written at saving intervals (with iterations numbers). " + "When --overwrite=false (default) only model files get written at saving intervals (with iterations numbers). " "Setting --overwrite-checkpoint=false also saves full checkpoints checkpoints with optimizer parameters, etc. " "Uses (a lot) more disk space.", true); @@ -604,7 +605,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { "Dynamic cost scaling for mixed precision training: " "scaling factor, frequency, multiplier, minimum factor") ->implicit_val("8.f 10000 1.f 8.f"); - + cli.add>("--throw-on-divergence", "Throw exception if training diverges. Divergence is detected if the running average loss over arg1 steps " "is exceeded by the running average loss over arg2 steps (arg1 >> arg2) by arg3 standard deviations") @@ -617,7 +618,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { "If fp16 training diverges and throws try to continue training with fp32 precision"); cli.alias("fp16-fallback-to-fp32", "true", [](YAML::Node& config) { // use default custom-fallbacks to handle DivergenceException for fp16 - config["custom-fallbacks"] = std::vector({ + config["custom-fallbacks"] = std::vector({ YAML::Load("{fp16 : false, precision: [float32, float32], cost-scaling: []}") }); }); diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index 47da511cf..c6245636c 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -102,8 +102,7 @@ Expr operator-(Expr a) { return Expression(a); }; -Expr softmax(Expr a, int axis /*=-1*/) -{ +Expr softmax(Expr a, int axis /*=-1*/) { // @TODO: move axis parameter down into the kernel if (axis != -1) { @@ -129,6 +128,34 @@ Expr logsoftmax(Expr a) { return Expression(a); } +// based on https://proceedings.mlr.press/v48/martins16.pdf for k equal to full dimension. 
+Expr sparsemax(Expr z, int axis/*=-1*/) { + // we currently assume that k == modelDim and that we apply the sparse max to the last dimension + + auto graph = z->graph(); + + int dimk = z->shape()[axis]; // assuming axis==-1 for dimension comments + Type fType = z->value_type(); + + // cast to float32 for better precision + auto z32 = cast(z, Type::float32); // [dimBatch, dimTime, dimk] + + const auto& [zSorted, zIndices] = sort(z32, /*axis=*/axis, /*descending=*/true); + auto zCumSum = cumsum(zSorted, /*axis=*/axis); // [dimBatch, dimTime, dimk] + + auto k = graph->constant({dimk}, inits::range(1.f, (float)(dimk + 1)), Type::float32); + auto kMask = gt(1.f + k * zSorted, zCumSum); // [dimBatch, dimTime, dimk] + auto kMax = max(kMask * k, /*axis=*/axis); // [dimBatch, dimTime, 1] + auto kMaxIdx = cast(kMax - 1.f, Type::uint32); // [dimBatch, dimTime, 1] + auto zNum = index_select(zCumSum, /*axis=*/axis, kMaxIdx); // [dimBatch, dimTime, 1] + auto tau = (zNum - 1.f) / kMax; // [dimBatch, dimTime, 1] + + auto zSparsemax = maximum(z32 - tau, 0.f); // [dimBatch, dimTime, dimk] + + // cast back to original type + return cast(zSparsemax, fType); +} + /*********************************************************/ Expr operator+(Expr a, Expr b) { @@ -308,15 +335,15 @@ Expr operator/(float a, Expr b) { // @TODO: implement proper operators for all three: Expr pow(float a, Expr b) { - return exp(std::log(a) * b); + return exp(std::log(a) * b); } Expr pow(Expr a, float b) { - return exp(log(a) * b); + return exp(log(a) * b); } Expr pow(Expr a, Expr b) { - return exp(log(a) * b); + return exp(log(a) * b); } /*********************************************************/ diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index 82d8726c5..685ef0ebf 100644 --- a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -503,7 +503,7 @@ Expr bdot(Expr a, float scalar = 1.f); /** - * bdot_legacy is an old implemetation of bdot without correct broadcasting on the batch dimensions, + * bdot_legacy is an old implemetation of bdot without correct broadcasting on the batch dimensions, * to be removed once the behavior can be correctly replicated with normal bdot on 5 dimensions. */ Expr bdot_legacy(Expr a, @@ -739,7 +739,7 @@ Expr gather(Expr a, int axis, Expr indices); * @param a The input expression * @param axis The axis along which to index * @param indices The indices to be scattered - * @param source Expression with values to scatter. + * @param source Expression with values to scatter. * @returns Scattered expression with the same shape as @p a now containing values from @p source in positions @p indices * @note @p source and @p indices must have the same rank * @note In this version @p source and @p indicies must have the same shape @@ -929,6 +929,11 @@ Expr softmax(Expr a, Expr zeroOneMask, int axis = -1); */ Expr logsoftmax(Expr a); +/** + * Compute a sparsemax along the last axis. Slow implementation but differentiable. +*/ +Expr sparsemax(Expr a, int axis = -1); + /** * Computes the cross-entropy loss. * @param labelSmoothingAlpha The amount of label smoothing @f$\alpha \in [0,1]@f$. @@ -966,9 +971,9 @@ Expr weighted_average(Expr in, Expr weights, int ax = 0); Expr layerNorm(Expr x, Expr gamma = nullptr, Expr beta = nullptr, float eps = 1e-9); /** - * Applies RMS normalization over the last dimension. - * - * See: Biao Zhang; Rico Sennrich (2019). Root Mean Square Layer Normalization. + * Applies RMS normalization over the last dimension. 
+ * + * See: Biao Zhang; Rico Sennrich (2019). Root Mean Square Layer Normalization. * In Advances in Neural Information Processing Systems 32. Vancouver, Canada. * @f[ \frac{x}{\sqrt{\frac{1}{N}\sum x^2 + \mathrm{eps}}} \times \gamma + \beta diff --git a/src/models/comet_qe.h b/src/models/comet_qe.h index 868f7d6e9..aa335696d 100644 --- a/src/models/comet_qe.h +++ b/src/models/comet_qe.h @@ -15,6 +15,7 @@ class CometEncoder final : public nn::TransformerEncoder { // models trained by us, but required when doing inference with Unbabel models. Expr cometNorm(Expr x, Expr binaryMask) const { Expr output; + if(opt("comet-mix-norm", false)) { registerParameterLazy(gamma, Shape({ 1 }), inits::ones()); int dimModel = x->shape()[-1]; @@ -23,7 +24,7 @@ class CometEncoder final : public nn::TransformerEncoder { Type origType = x->value_type(); x = marian::cast(x, Type::float32); binaryMask = marian::cast(binaryMask, Type::float32); - + x = x * binaryMask; auto denom = (float)dimModel * sum(binaryMask, -2); auto mu = sum(sum(x, -1), -2) / denom; // sum over model and time @@ -34,8 +35,11 @@ class CometEncoder final : public nn::TransformerEncoder { // Undo conversion to fp32 if not originally fp32 (most likely fp16 then) output = marian::cast(output, origType); - } else { + } else if(opt("comet-mix", false)) { // average over time dimension + registerParameterLazy(gamma, Shape({ 1 }), inits::ones()); + output = gamma * sum(x * binaryMask, -2) / sum(binaryMask, -2); + } else { output = sum(x * binaryMask, -2) / sum(binaryMask, -2); } @@ -46,15 +50,15 @@ class CometEncoder final : public nn::TransformerEncoder { Expr weights; Expr gamma; - CometEncoder(Ptr graph, - Ptr options) + CometEncoder(Ptr graph, + Ptr options) : TransformerEncoder(graph, options) {} Expr apply(Expr input, Expr mask) const override { auto output = marian::nn::swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] - + auto binaryMask = marian::nn::swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] - + // apply positional embeddings to contextual input output = positionEmbedding->apply(output); @@ -75,7 +79,10 @@ class CometEncoder final : public nn::TransformerEncoder { if(opt("comet-mix", false)) { registerParameterLazy(weights, Shape({ opt("enc-depth") + 1 }), inits::zeros()); - auto weightsNorm = reshape(softmax(weights), {weights->shape()[-1], 1}); + // comet22 has a sparsemax here + auto normFn = opt("comet-mix-transformation", "softmax"); + auto weightsNorm = (normFn == "sparsemax") ? sparsemax(weights) : softmax(weights); + weightsNorm = reshape(weightsNorm, {weights->shape()[-1], 1}); output = sum(weightsNorm * concatenate(pooler, /*axis=*/-2), -2); // [batch, 1, modelDim] } else { // just use last layer, average over time dim @@ -87,12 +94,12 @@ class CometEncoder final : public nn::TransformerEncoder { }; // Wrapper for backwards compatibility that uses current encoder/decoder framework -struct CometBatchEncoder final : public nn::LayerWithOptions, +struct CometBatchEncoder final : public nn::LayerWithOptions, public nn::IEmbeddingLayer, // TransformerBatchEncoder is an IEmbeddingLayer that produces contextual embeddings public EncoderBase { // @TODO: should all encoders be IEmbeddingLayer? 
Ptr encoder; - CometBatchEncoder(Ptr graph, + CometBatchEncoder(Ptr graph, Ptr options) : LayerWithOptions(graph, options), EncoderBase(graph, options) @@ -131,10 +138,10 @@ struct CometBatchEncoder final : public nn::LayerWithOptions, EncoderBase::graph_ = graph; setGraph(graph); // This makes sure that the graph passed into the model during construction and now evaluation are identical. - // A good check to have for catching weird situations early. + // A good check to have for catching weird situations early. ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match"); #endif - + const auto& [batchEmbedding, batchMask] = apply((*batch)[batchIndex_]); return New(batchEmbedding, batchMask, batch); } @@ -145,7 +152,7 @@ struct CometBatchEncoder final : public nn::LayerWithOptions, }; // Dummpy pooler that only returns the encoder context -class CometEmbeddingPooler final : public nn::LayerWithOptions, +class CometEmbeddingPooler final : public nn::LayerWithOptions, public PoolerBase { public: CometEmbeddingPooler(Ptr graph, Ptr options) @@ -159,12 +166,12 @@ class CometEmbeddingPooler final : public nn::LayerWithOptions, return { encoderStates[0]->getContext() }; } - + void clear() override {} }; // Actual COMET-like pooler, works for COMET-QE and COMET models (prior to WMT22) -class CometMetricPooler final : public nn::LayerWithOptions, +class CometMetricPooler final : public nn::LayerWithOptions, public PoolerBase { private: Ptr layers; @@ -174,7 +181,7 @@ class CometMetricPooler final : public nn::LayerWithOptions, CometMetricPooler(Ptr graph, Ptr options) : LayerWithOptions(graph, options), PoolerBase(graph, options) { - + float dropoutProb = LayerWithOptions::opt("comet-dropout", 0.1f); auto ffnHidden = LayerWithOptions::opt>("comet-pooler-ffn", {2048, 1024}); @@ -188,7 +195,7 @@ class CometMetricPooler final : public nn::LayerWithOptions, if(LayerWithOptions::opt("comet-final-sigmoid")) layers->append(New(graph)); - + registerLayer(layers); } @@ -198,7 +205,7 @@ class CometMetricPooler final : public nn::LayerWithOptions, PoolerBase::graph_ = graph; setGraph(graph); // This makes sure that the graph passed into the model during construction and now evaluation are identical. - // A good check to have for catching weird situations early. + // A good check to have for catching weird situations early. 
ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match"); #endif @@ -213,10 +220,10 @@ class CometMetricPooler final : public nn::LayerWithOptions, auto mixup = [&](Expr x, Expr y, float alpha, bool reg=true) -> Expr2 { if(alpha == 0.f) return {x, y}; - + int dimBatch = x->shape()[-3]; Type xType = x->value_type(); - + std::vector indices(dimBatch); std::iota(indices.begin(), indices.end(), 0); @@ -246,7 +253,7 @@ class CometMetricPooler final : public nn::LayerWithOptions, int dimBatch = src->shape()[-3]; float badRatio = LayerWithOptions::opt("comet-augment-bad", 0.f); dimBad = (int)std::ceil(dimBatch * badRatio); // use ceiling to make sure it's at least 1 - + if(dimBad > 0) { LOG_ONCE(info, "Adding {:.1f} percent of bad examples to batch with label 0.0f", badRatio * 100); @@ -259,7 +266,7 @@ class CometMetricPooler final : public nn::LayerWithOptions, indicesSrc.resize(dimBad); // shrink to size auto srcSub = index_select(src, -3, indicesSrc); src = concatenate({src, srcSub}, /*axis=*/-3); - + std::iota(indicesMt.begin(), indicesMt.end(), 0); // permute the indices and select batch entries accordingly std::shuffle(indicesMt.begin(), indicesMt.end(), rng); @@ -277,7 +284,7 @@ class CometMetricPooler final : public nn::LayerWithOptions, auto modelType = LayerWithOptions::opt("type"); ABORT_IF(modelType == "comet-qe" && encoderStates.size() != 2, "Pooler expects exactly two encoder states for comet-qe"); ABORT_IF(modelType == "comet" && encoderStates.size() != 3, "Pooler expects exactly three encoder states for comet"); - + if(modelType == "comet-qe") { auto src = encoderStates[0]->getContext(); auto mt = encoderStates[1]->getContext(); @@ -296,7 +303,7 @@ class CometMetricPooler final : public nn::LayerWithOptions, src = get<0>(srcMt); mt = get<1>(srcMt); } - + auto diff = abs(mt - src); auto prod = mt * src; @@ -313,10 +320,10 @@ class CometMetricPooler final : public nn::LayerWithOptions, return { output }; } else { auto emb = concatenate({mt, src, prod, diff}, /*axis=*/-1); // [batch, 1, model] - + auto softLabelsWords = batch->front()->data(); auto classVocab = batch->front()->vocab(); - + // we add bad examples to the batch, so we need to make sure the soft labels are padded accordingly with 0s int dimBatch = (int)softLabelsWords.size() + dimBad; std::vector softLabels; @@ -340,12 +347,12 @@ class CometMetricPooler final : public nn::LayerWithOptions, output = marian::cast(layers->apply(emb), Type::float32); return { output, labels }; - } + } } else if(modelType == "comet") { auto src = encoderStates[0]->getContext(); auto mt = encoderStates[1]->getContext(); auto ref = encoderStates[2]->getContext(); - + auto diffRef = abs(mt - ref); auto prodRef = mt * ref; @@ -361,7 +368,7 @@ class CometMetricPooler final : public nn::LayerWithOptions, return { output }; } else { // Currently no training for COMET with reference @TODO: add training - ABORT("Usage other than 'evaluating' not implemented"); + ABORT("Usage other than 'evaluating' not implemented"); } } else { ABORT("Unknown model type {}", modelType); @@ -380,7 +387,7 @@ class CometLoss final : public ICost { public: CometLoss(Ptr options) - : options_(options), inference_(options->get("inference", false)), + : options_(options), inference_(options->get("inference", false)), rescore_(options->get("cost-type", "ce-sum") == "ce-rescore") { } Ptr apply(Ptr model, @@ -391,7 +398,7 @@ class CometLoss final : public ICost { auto corpusBatch = std::static_pointer_cast(batch); auto inputTypes = 
options_->get>("input-types", {}); - ABORT_IF(inputTypes != std::vector({"class", "sequence", "sequence"}), + ABORT_IF(inputTypes != std::vector({"class", "sequence", "sequence"}), "Expected input-types to be have fields (class, sequence, sequence)"); ABORT_IF(corpusBatch->sets() != 3, "Expected 3 sub-batches, not {}", corpusBatch->sets()); @@ -416,9 +423,9 @@ class CometLoss final : public ICost { } else { ABORT("Unknown loss type {} for COMET training", lossType); } - + auto encoded = encpool->apply(graph, corpusBatch, clearGraph); - + Expr x = encoded[0]; Expr y = encoded[1]; auto loss = lossFn(x, y); @@ -428,9 +435,9 @@ class CometLoss final : public ICost { int dimBatch = loss->shape()[-3]; if(rescore_) loss = reshape(loss, {1, dimBatch, 1}); - else + else loss = sum(loss, /*axis=*/-3); // [1, 1, 1] - + Ptr multiLoss = New(); RationalLoss lossPiece(loss, (float)dimBatch); multiLoss->push_back(lossPiece); diff --git a/src/models/encoder_pooler.h b/src/models/encoder_pooler.h index 0a781c9d5..b89f85c9e 100644 --- a/src/models/encoder_pooler.h +++ b/src/models/encoder_pooler.h @@ -7,8 +7,8 @@ #include "models/model_base.h" #include "models/states.h" -// @TODO: this introduces functionality to use LASER in Marian for the filtering workflow or for use in MS-internal -// COSMOS server-farm. There is a lot of code duplication with Classifier and EncoderDecoder and this needs to be fixed. +// @TODO: this introduces functionality to use LASER in Marian for the filtering workflow or for use in MS-internal +// COSMOS server-farm. There is a lot of code duplication with Classifier and EncoderDecoder and this needs to be fixed. // This will be done after the new layer system has been finished. namespace marian { @@ -163,6 +163,7 @@ class EncoderPooler : public EncoderPoolerBase { modelFeatures_.insert("comet-final-sigmoid"); modelFeatures_.insert("comet-mix"); modelFeatures_.insert("comet-mix-norm"); + modelFeatures_.insert("comet-mix-transformation"); } virtual Ptr getOptions() override { return options_; } From 7dcebfb924375973f68f094443792706ae7b813a Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sat, 20 Jan 2024 23:30:53 +0000 Subject: [PATCH 08/26] Merged PR 32567: Refactoring of Graph loading and mmapping interface This is a rewrite of the graph loading and memory-mapping functionality. We now mmap and share oportunistically, i.e. whenever it is possible: * with cpu-decoding and *.bin files everything will be automatically mmapped * with *.npz files the model will be read only once. * on the GPU *.bin will be mmapped but still copied to GPU, ideally omitting CPU memory. This quite drastically reduces unnecessary CPU memory overhead and loading time for things like COMET scoring. 
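With this refactoring, call sites construct a single io::ModelWeights per model file and pass it to every graph; loading is lazy and happens at most once, *.bin files are memory-mapped when possible, and *.npz files fall back to a normal read. A rough usage sketch modeled on the call sites changed in this patch (names and signatures are illustrative, simplified from the diff):

#include <string>
#include <vector>

#include "common/io.h"
#include "graph/expression_graph.h"

namespace marian {

// Share one lazily loaded ModelWeights object across several graphs so the
// model file is read (or memory-mapped) only once.
void loadIntoGraphs(const std::string& modelPath,
                    std::vector<Ptr<ExpressionGraph>>& graphs) {
  // OpportunisticMmap is the default: *.bin files are memory-mapped,
  // *.npz files quietly fall back to a plain read from disk.
  auto weights = New<io::ModelWeights>(modelPath, io::MmapMode::OpportunisticMmap);
  for(auto& graph : graphs)
    graph->load(weights);  // items are parsed on first access
}

}  // namespace marian

A ModelWeights object can also be constructed from a raw memory buffer, and MmapMode::RequiredMmap makes loading abort instead of silently falling back when mapping is not possible.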
--- CHANGELOG.md | 1 + VERSION | 2 +- src/command/marian_conv.cpp | 14 +- src/common/binary.cpp | 14 +- src/common/config.cpp | 6 +- src/common/io.cpp | 194 +++++++++++++------- src/common/io.h | 80 +++++++- src/common/shape.h | 2 +- src/embedder/embedder.h | 16 +- src/evaluator/evaluator.h | 31 ++-- src/examples/mnist/model.h | 7 +- src/graph/expression_graph.cpp | 4 +- src/graph/expression_graph.h | 88 ++++----- src/graph/node_initializers.cpp | 39 ++-- src/microsoft/cosmos.cpp | 27 +-- src/microsoft/quicksand.cpp | 60 +++--- src/microsoft/quicksand.h | 9 +- src/models/amun.h | 80 ++++---- src/models/costs.h | 36 +--- src/models/encoder_classifier.h | 32 +--- src/models/encoder_decoder.cpp | 20 +- src/models/encoder_decoder.h | 32 +--- src/models/encoder_pooler.h | 36 +--- src/models/model_base.h | 22 +-- src/models/nematus.h | 78 ++++---- src/models/transformer_factory.h | 36 ++-- src/rescorer/rescorer.h | 8 +- src/tensors/cpu/expression_graph_packable.h | 6 +- src/tensors/tensor.cpp | 8 +- src/training/graph_group.cpp | 107 ++++++----- src/training/graph_group.h | 5 +- src/training/validator.cpp | 6 +- src/translator/scorers.cpp | 88 ++------- src/translator/scorers.h | 58 +----- src/translator/translator.h | 49 ++--- 35 files changed, 597 insertions(+), 704 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e4a170a3..772349e3d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,6 +40,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Correct defaults for factored embeddings such that shared library use works (move out of config.h/cpp). ### Changed +- Refactoring of model loading, mmapping happens now opportunistically, --mmap-models for decoding forces mmap and croaks if not possible. - Removed --num-devices N option that wasn't really used by anyone (I assume). diff --git a/VERSION b/VERSION index 5c911e82d..2f107c43d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.18 +v1.12.19 diff --git a/src/command/marian_conv.cpp b/src/command/marian_conv.cpp index 12412a238..31f47946e 100644 --- a/src/command/marian_conv.cpp +++ b/src/command/marian_conv.cpp @@ -24,9 +24,9 @@ int main(int argc, char** argv) { cli->add("--to,-t", "Output model", "model.bin"); cli->add("--export-as", "Kind of conversion: marian-bin or onnx-{encode,decoder-step,decoder-init,decoder-stop}", "marian-bin"); cli->add("--gemm-type,-g", "GEMM Type to be used: float32, packed16, packed8avx2, packed8avx512, " - "intgemm8, intgemm8ssse3, intgemm8avx2, intgemm8avx512, intgemm16, intgemm16sse2, intgemm16avx2, intgemm16avx512", + "intgemm8, intgemm8ssse3, intgemm8avx2, intgemm8avx512, intgemm16, intgemm16sse2, intgemm16avx2, intgemm16avx512", "float32"); - cli->add>("--add-lsh", + cli->add>("--add-lsh", "Encode output matrix and optional rotation matrix into model file. 
" "arg1: number of bits in LSH encoding, arg2: name of output weights matrix")->implicit_val("1024 Wemb"); cli->add>("--vocabs,-V", "Vocabulary file, required for ONNX export"); @@ -69,21 +69,23 @@ int main(int argc, char** argv) { if(lshParams.size() > 1) lshOutputWeights = lshParams[1]; } - + // We accept any type here and will later croak during packAndSave if the type cannot be used for conversion Type saveGemmType = typeFromString(options->get("gemm-type", "float32")); LOG(info, "Outputting {}, precision: {}", modelTo, saveGemmType); - YAML::Node config; + + auto modelFile = New(modelFrom, io::MmapMode::DontMmap); + YAML::Node config = modelFile->getYamlFromModel(); std::stringstream configStr; - marian::io::getYamlFromModel(config, "special:model.yml", modelFrom); + configStr << config; if (exportAs == "marian-bin") { auto graph = New(); graph->setDevice(CPU0); - graph->load(modelFrom); + graph->load(modelFile); std::vector toBeLSHed; if(addLsh) { diff --git a/src/common/binary.cpp b/src/common/binary.cpp index 6bb90c508..0041275c5 100644 --- a/src/common/binary.cpp +++ b/src/common/binary.cpp @@ -19,7 +19,7 @@ struct Header { uint64_t dataLength; }; -// cast current void pointer to T pointer and move forward by num elements +// cast current void pointer to T pointer and move forward by num elements template const T* get(const void*& current, uint64_t num = 1) { const T* ptr = (const T*)current; @@ -48,9 +48,9 @@ void loadItems(const void* current, std::vector& items, bool mapped) { // read in actual shape and data for(int i = 0; i < numHeaders; ++i) { uint64_t len = headers[i].shapeLength; - items[i].shape.resize(len); + items[i].shape.resize(len); const int* arr = get(current, len); // read shape - std::copy(arr, arr + len, items[i].shape.begin()); // copy to Item::shape + std::copy(arr, arr + len, items[i].shape.begin()); // copy to Item::shape } // move by offset bytes, aligned to 256-bytes boundary @@ -64,8 +64,8 @@ void loadItems(const void* current, std::vector& items, bool mapped) { items[i].type = cpu::integer::getIntgemmType(Type::intgemm8); } if(items[i].mapped) { // memory-mapped, hence only set pointer - // @TOOD: verify this actually works for the hardware-specific ones like intgemm8avx2 - ABORT_IF(items[i].type == Type::intgemm8 || items[i].type == Type::intgemm16, "mmap format not supported for hardware non-specific intgemm matrices"); + if(items[i].type == Type::intgemm8 || items[i].type == Type::intgemm16) + throw MarianRuntimeException("mmap format not supported for hardware non-specific intgemm matrices", getCallStack(/*skipLevels=*/0)); items[i].ptr = get(current, headers[i].dataLength); } else { // reading into item data uint64_t len = headers[i].dataLength; @@ -170,8 +170,8 @@ void saveItems(const std::string& fileName, // Write out all values for(const auto& item : items) - pos += out.write(item.data(), item.bytes.size()); // writes out data with padding, keeps 256-byte boundary. - // Amazingly this is binary-compatible with V1 and aligned and + pos += out.write(item.data(), item.bytes.size()); // writes out data with padding, keeps 256-byte boundary. + // Amazingly this is binary-compatible with V1 and aligned and // non-aligned models can be read with the same procedure. // No version-bump required. Gets 5-8% of speed back when mmapped. 
} diff --git a/src/common/config.cpp b/src/common/config.cpp index efdd29c12..20ef6e046 100644 --- a/src/common/config.cpp +++ b/src/common/config.cpp @@ -170,15 +170,13 @@ void Config::save(const std::string& name) { } bool Config::loadModelParameters(const std::string& name) { - YAML::Node config; - io::getYamlFromModel(config, "special:model.yml", name); + auto config = New(name)->getYamlFromModel(); override(config); return true; } bool Config::loadModelParameters(const void* ptr) { - YAML::Node config; - io::getYamlFromModel(config, "special:model.yml", ptr); + auto config = New(ptr)->getYamlFromModel(); override(config); return true; } diff --git a/src/common/io.cpp b/src/common/io.cpp index 6a7be6a36..109b3a1ed 100644 --- a/src/common/io.cpp +++ b/src/common/io.cpp @@ -1,12 +1,15 @@ #include "common/io.h" #include "3rd_party/cnpy/cnpy.h" +#include "common/definitions.h" #include "common/shape.h" #include "common/types.h" #include "common/binary.h" #include "common/io_item.h" +#include "training/communicator.h" + namespace marian { namespace io { @@ -20,78 +23,73 @@ bool isBin(const std::string& fileName) { && fileName.substr(fileName.length() - 4) == ".bin"; } -void getYamlFromNpz(YAML::Node& yaml, - const std::string& varName, - const std::string& fileName) { - auto item = cnpy::npz_load(fileName, varName); - if(item->size() > 0) - yaml = YAML::Load(item->data()); +ModelWeights::FileType ModelWeights::getFileType(const std::string& fileName) { + if(isNpz(fileName)) + return FileType::isNpz; + else if(isBin(fileName)) + return FileType::isBin; + else + ABORT("Unknown file format for file {}", fileName); } -void getYamlFromBin(YAML::Node& yaml, - const std::string& varName, - const std::string& fileName) { - auto item = binary::getItem(fileName, varName); - if(item.size() > 0) - yaml = YAML::Load(item.data()); -} - -void getYamlFromModel(YAML::Node& yaml, - const std::string& varName, - const std::string& fileName) { - if(io::isNpz(fileName)) { - io::getYamlFromNpz(yaml, varName, fileName); - } else if(io::isBin(fileName)) { - io::getYamlFromBin(yaml, varName, fileName); - } else { - ABORT("Unknown model file format for file {}", fileName); - } +std::vector& ModelWeights::items() { + load(); + return items_; } -void getYamlFromModel(YAML::Node& yaml, - const std::string& varName, - const void* ptr) { - auto item = binary::getItem(ptr, varName); - if(item.size() > 0) - yaml = YAML::Load(item.data()); +const std::vector& ModelWeights::items() const { + const_cast(*this).load(); + return items_; } -// Load YAML from item -void getYamlFromModel(YAML::Node& yaml, - const std::string& varName, - const std::vector& items) { - for(auto& item : items) { - if(item.name == varName) { - yaml = YAML::Load(item.data()); - return; - } - } +const void* ModelWeights::data() const { + const_cast(*this).load(); + switch (fileType_) { + case FileType::isNpz: + return nullptr; + case FileType::isBin: + return mmap_->data(); + case FileType::isBuf: + return ptr_; + case FileType::isDummy: + ABORT("Cannot get data from dummy model"); + default: + ABORT("Unknown file type"); + } } -void addMetaToItems(const std::string& meta, - const std::string& varName, - std::vector& items) { - Item item; - item.name = varName; - - // increase size by 1 to add \0 - item.shape = Shape({(int)meta.size() + 1}); - - item.bytes.resize(item.shape.elements()); - std::copy(meta.begin(), meta.end(), item.bytes.begin()); - // set string terminator - item.bytes.back() = '\0'; - - item.type = Type::int8; +size_t ModelWeights::size() 
const { + const_cast(*this).load(); + switch (fileType_) { + case FileType::isNpz: + return 0; + case FileType::isBin: + return mmap_->size(); + case FileType::isBuf: + ABORT("Cannot get size of buffer"); + case FileType::isDummy: + ABORT("Cannot get size from dummy model"); + default: + ABORT("Unknown file type"); + } +} - items.push_back(item); +// @TODO: bring back fast peeking into the file to get config +// Load YAML from item +YAML::Node ModelWeights::getYamlFromModel(const std::string& varName) const { + const_cast(*this).load(); + for(auto& item : items_) { + if(item.name == varName) { + return YAML::Load(item.data()); + } + } + return YAML::Node(); } void loadItemsFromNpz(const std::string& fileName, std::vector& items) { auto numpy = cnpy::npz_load(fileName); for(auto it : numpy) { - ABORT_IF( - it.second->fortran_order, "Numpy item '{}' is not stored in row-major order", it.first); + ABORT_IF(it.second->fortran_order, "Numpy item '{}' is not stored in row-major order", it.first); Shape shape; shape.resize(it.second->shape.size()); @@ -122,7 +120,7 @@ void loadItemsFromNpz(const std::string& fileName, std::vector& items) { } } -std::vector loadItems(const std::string& fileName) { +std::vector ModelWeights::loadItems(const std::string& fileName) { std::vector items; if(isNpz(fileName)) { loadItemsFromNpz(fileName, items); @@ -135,16 +133,61 @@ std::vector loadItems(const std::string& fileName) { return items; } -std::vector loadItems(const void* ptr) { +std::vector ModelWeights::mmapItems(const void* ptr) { std::vector items; - binary::loadItems(ptr, items, false); + binary::loadItems(ptr, items, true); return items; } -std::vector mmapItems(const void* ptr) { - std::vector items; - binary::loadItems(ptr, items, true); - return items; +void ModelWeights::load() { + std::lock_guard lock(mutex_); + if(loaded_) + return; + + switch (fileType_) { + case FileType::isNpz: + loadItemsFromNpz(fileName_, items_); + break; + case FileType::isBin: + if(mmapMode_ == MmapMode::DontMmap) { + binary::loadItems(fileName_, items_); + } else { + try { + mmap_.reset(new mio::mmap_source(fileName_)); + binary::loadItems(mmap_->data(), items_, /*mapped=*/true); + } catch(const MarianRuntimeException& e) { + if(mmapMode_ == MmapMode::RequiredMmap) + ABORT("Could not memory-map file '{}': {}", fileName_, e.what()); + else + LOG(warn, "[warning] Could not memory-map file '{}' ({}), falling back to reading from disk", fileName_, e.what()); + mmapMode_ = MmapMode::DontMmap; + binary::loadItems(fileName_, items_); + } + } + break; + case FileType::isBuf: + binary::loadItems(ptr_, items_, /*mapped=*/mmapMode_ != MmapMode::DontMmap); + break; + case FileType::isDummy: + ABORT("Cannot load from dummy model"); + default: + ABORT("Unknown file type"); + } + + loaded_ = true; +} + +void ModelWeights::loadAndSync(Ptr mpi) { + ABORT_IF(!mpi, "MPI wrapper is null"); + ABORT_IF(mmapMode_ != MmapMode::DontMmap, "Mmapping not allowed"); + + if(mpi->isMainProcess()) + load(); + + mpi->bCast(fileName_); + mpi->bCast(&fileType_, 1, mpi->getDataType((size_t*)&fileType_)); + mpi->bCast(&loaded_, 1, mpi->getDataType(&loaded_)); + mpi->bCast(items_); } // @TODO: make cnpy and our wrapper talk to each other in terms of types @@ -167,12 +210,31 @@ void saveItemsNpz(const std::string& fileName, const std::vector& items) { else if(item.type == Type::uint32) type = cnpy::map_type(typeid(uint32_t)); else if(item.type == Type::uint64) type = cnpy::map_type(typeid(uint64_t)); else ABORT("Other types ({}) not supported", item.type); - 
+ npzItems.emplace_back(item.name, item.bytes, shape, type, sizeOf(item.type)); } cnpy::npz_save(fileName, npzItems); } +void addMetaToItems(const std::string& meta, + const std::string& varName, + std::vector& items) { + Item item; + item.name = varName; + + // increase size by 1 to add \0 + item.shape = Shape({(int)meta.size() + 1}); + + item.bytes.resize(item.shape.elements()); + std::copy(meta.begin(), meta.end(), item.bytes.begin()); + // set string terminator + item.bytes.back() = '\0'; + + item.type = Type::int8; + + items.push_back(item); +} + void saveItems(const std::string& fileName, const std::vector& items) { if(isNpz(fileName)) { saveItemsNpz(fileName, items); diff --git a/src/common/io.h b/src/common/io.h index 3f340ed2f..1db0a83fe 100644 --- a/src/common/io.h +++ b/src/common/io.h @@ -1,6 +1,12 @@ #pragma once +#ifndef NOMINMAX +#define NOMINMAX +#endif + +#include "3rd_party/mio/mio.hpp" #include "3rd_party/yaml-cpp/yaml.h" +#include "common/definitions.h" #include "common/io_item.h" #include @@ -14,28 +20,84 @@ // CPU decoding. namespace marian { + +struct IMPIWrapper; + namespace io { +enum struct MmapMode { OpportunisticMmap, DontMmap, RequiredMmap }; + bool isNpz(const std::string& fileName); bool isBin(const std::string& fileName); -void getYamlFromModel(YAML::Node& yaml, const std::string& varName, const std::string& fileName); -void getYamlFromModel(YAML::Node& yaml, const std::string& varName, const void* ptr); -void getYamlFromModel(YAML::Node& yaml, const std::string& varName, const std::vector& items); +class ModelWeights { +private: + std::mutex mutex_; + + std::string fileName_; + const void* ptr_{nullptr}; + + enum struct FileType : size_t { isNpz, isBin, isBuf, isDummy }; + FileType fileType_{FileType::isNpz}; + FileType getFileType(const std::string& fileName); + + MmapMode mmapMode_{MmapMode::OpportunisticMmap}; + + bool loaded_{false}; + + std::vector items_; + std::unique_ptr mmap_; + + std::vector loadItems(const std::string& fileName); + std::vector mmapItems(const void* ptr); + + void load(); + +public: + ModelWeights(const std::string& fileName, MmapMode mmapMode = MmapMode::OpportunisticMmap) + : fileName_(fileName), fileType_(getFileType(fileName)), mmapMode_(mmapMode) { + // NPZ files cannot be memory-mapped, so we switch opportunistic mmap off, but keep any other mmap mode + if(fileType_ == FileType::isNpz && mmapMode_ == MmapMode::OpportunisticMmap) + mmapMode_ = MmapMode::DontMmap; + + // so we can croak here for NPZ files if the user sets mmap to required + ABORT_IF(fileType_ == FileType::isNpz && mmapMode_ != MmapMode::DontMmap, "NPZ files cannot be memory-mapped"); + } + + ModelWeights(const void* ptr, MmapMode mmapMode = MmapMode::RequiredMmap) + : ptr_(ptr), fileType_(FileType::isBuf), mmapMode_(mmapMode) {} + + ModelWeights() + : fileType_(FileType::isDummy), mmapMode_{MmapMode::DontMmap} {} + + ModelWeights(const ModelWeights&&) = delete; + ModelWeights(const ModelWeights&) = delete; + + std::vector& items(); + const std::vector& items() const; + + MmapMode mmapMode() const { + return mmapMode_; + } + const void* data() const; + size_t size() const; + + YAML::Node getYamlFromModel(const std::string& varName = "special:model.yml") const; + + void loadAndSync(Ptr mpi); +}; + +// for saving we keep the old interface since there is no intelligence going on here and it is useful +// to be able to assemble a set of items in different places. 
void addMetaToItems(const std::string& meta, const std::string& varName, std::vector& items); -std::vector loadItems(const std::string& fileName); -std::vector loadItems(const void* ptr); - -std::vector mmapItems(const void* ptr); - void saveItems(const std::string& fileName, const std::vector& items); /** - * Creates a flat io::Item from a given std::vector so that it can be saved in a npz file + * Creates a flat io::Item from a given std::vector so that it can be saved in a npz file * or Marian's native binary format with the given name. */ template diff --git a/src/common/shape.h b/src/common/shape.h index ad2be866f..bd9d98512 100644 --- a/src/common/shape.h +++ b/src/common/shape.h @@ -17,7 +17,7 @@ namespace marian { */ class ShapeSizeException : public std::runtime_error { public: - ShapeSizeException(size_t available, size_t asked) + ShapeSizeException(size_t available, size_t asked) : std::runtime_error(fmt::format("Expanded shape size {} exceeds numeric capcacity {}", asked, available)) {} }; diff --git a/src/embedder/embedder.h b/src/embedder/embedder.h index ebd9782e2..812bed57d 100644 --- a/src/embedder/embedder.h +++ b/src/embedder/embedder.h @@ -30,7 +30,7 @@ class Embedder { Embedder(Ptr options) : model_(createModelFromOptions(options, models::usage::embedding)) {} - void load(Ptr graph, const std::string& modelFile) { + void load(Ptr graph, Ptr modelFile) { model_->load(graph, modelFile); } @@ -51,11 +51,12 @@ class Embed : public ModelTask { Ptr corpus_; std::vector> graphs_; std::vector> models_; + Ptr modelFile_; public: Embed(Ptr options) : options_(options) { - - options_ = options_->with("inference", true, + + options_ = options_->with("inference", true, "shuffle", "none"); // if a similarity is computed then double the input types and vocabs for @@ -87,7 +88,8 @@ class Embed : public ModelTask { graphs_.push_back(graph); } - auto modelFile = options_->get("model"); + auto modelPath = options_->get("model"); + modelFile_ = New(modelPath); models_.resize(graphs_.size()); ThreadPool pool(graphs_.size(), graphs_.size()); @@ -95,7 +97,7 @@ class Embed : public ModelTask { pool.enqueue( [=](size_t j) { models_[j] = New(options_); - models_[j]->load(graphs_[j], modelFile); + models_[j]->load(graphs_[j], modelFile_); }, i); } @@ -104,7 +106,7 @@ class Embed : public ModelTask { void run() override { LOG(info, "Embedding"); timer::Timer timer; - + auto batchGenerator = New>(corpus_, options_); batchGenerator->prepare(); @@ -140,7 +142,7 @@ class Embed : public ModelTask { } else { ABORT("Unknown embedding type {}", embeddings->value_type()); } - + // collect embedding vector per sentence. // if we compute similarities this is only one similarity per sentence pair. 
for(size_t i = 0; i < batch->size(); ++i) { diff --git a/src/evaluator/evaluator.h b/src/evaluator/evaluator.h index 31fe00e87..bfed80a53 100644 --- a/src/evaluator/evaluator.h +++ b/src/evaluator/evaluator.h @@ -29,12 +29,8 @@ class Evaluator { Evaluator(Ptr options) : model_(createModelFromOptions(options, models::usage::evaluating)) {} - void load(Ptr graph, const std::vector& items) { - model_->load(graph, items); - } - - void load(Ptr graph, const std::string& fileName) { - model_->load(graph, fileName); + void load(Ptr graph, Ptr modelFile) { + model_->load(graph, modelFile); } Expr build(Ptr graph, Ptr batch) { @@ -54,11 +50,11 @@ class Evaluate : public ModelTask { Ptr corpus_; std::vector> graphs_; std::vector> models_; - std::vector ioItems_; + Ptr modelFile_; public: Evaluate(Ptr options) : options_(options) { - options_ = options_->with("inference", true, + options_ = options_->with("inference", true, "shuffle", "none"); corpus_ = New(options_); @@ -68,7 +64,8 @@ class Evaluate : public ModelTask { auto modelPath = options_->get("model"); LOG(info, "Loading model from {}", modelPath); - ioItems_ = io::loadItems(modelPath); + + modelFile_ = New(modelPath); graphs_.resize(devices.size()); models_.resize(devices.size()); @@ -79,15 +76,15 @@ class Evaluate : public ModelTask { [=](size_t j) { auto graph = New(true); auto precison = options_->get>("precision", {"float32"}); - graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph - graph->setDevice(devices[j]); + graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph + graph->setDevice(devices[j]); graph->reserveWorkspaceMB(options_->get("workspace")); - + auto model = New(options_); - model->load(graph, ioItems_); + model->load(graph, modelFile_); models_[j] = model; - graphs_[j] = graph; + graphs_[j] = graph; }, i); } @@ -96,7 +93,7 @@ class Evaluate : public ModelTask { void run() override { LOG(info, "Evaluating"); timer::Timer timer; - + auto batchGenerator = New>(corpus_, options_); batchGenerator->prepare(); @@ -105,7 +102,7 @@ class Evaluate : public ModelTask { size_t batchId = 0; { ThreadPool pool(graphs_.size(), graphs_.size()); - + for(auto batch : *batchGenerator) { auto task = [=](size_t id) { thread_local Ptr graph; @@ -132,7 +129,7 @@ class Evaluate : public ModelTask { } else { ABORT("Unknown value type {}", scores->value_type()); } - + // collect embedding vector per sentence. // if we compute similarities this is only one similarity per sentence pair. 
for(size_t i = 0; i < batch->size(); ++i) { diff --git a/src/examples/mnist/model.h b/src/examples/mnist/model.h index 5d50eae96..10c282c03 100755 --- a/src/examples/mnist/model.h +++ b/src/examples/mnist/model.h @@ -75,11 +75,8 @@ class MnistFeedForwardNet : public IModel { return Logits(apply(graph, batch, inference_)); } - void load(Ptr /*graph*/, const std::vector& /*items*/, bool) override { - LOG(critical, "Loading MNIST model is not supported"); - } - - void load(Ptr /*graph*/, const std::string& /*name*/, bool) override { + + void load(Ptr /*graph*/, Ptr /*name*/, bool) override { LOG(critical, "Loading MNIST model is not supported"); } diff --git a/src/graph/expression_graph.cpp b/src/graph/expression_graph.cpp index ce51b0f2b..f59edfca9 100644 --- a/src/graph/expression_graph.cpp +++ b/src/graph/expression_graph.cpp @@ -15,7 +15,7 @@ void ExpressionGraph::setDevice(DeviceId deviceId, Ptr device) { auto params = New(defaultElementType_); params->init(backend_); paramsByElementType_[defaultElementType_] = params; - + if(device) tensors_ = New(backend_, device); else @@ -285,7 +285,7 @@ void ExpressionGraph::checkNaN(Tensor t, bool& isNaN, bool& isInf) { IsNaN(t, allocator(), isNaN, isInf); } -void ExpressionGraph::save(std::vector& ioItems, Type saveElementType) { +void ExpressionGraph::getItems(std::vector& ioItems, Type saveElementType) { // sorted by type in std::map for(auto kvParams : paramsByElementType_) { // sorted by name in std::map diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h index da69af091..915c9df3f 100644 --- a/src/graph/expression_graph.h +++ b/src/graph/expression_graph.h @@ -676,7 +676,7 @@ class ExpressionGraph : public std::enable_shared_from_this { * @param node a pointer to a expression node */ Expr add(Expr node); - + /** * Removes the node from the set of roots (will not be initialized during back propagation) * @param node a pointer to a expression node @@ -742,50 +742,29 @@ class ExpressionGraph : public std::enable_shared_from_this { /** Get the flag value whether the graph throws a NaN exception (true) or not */ bool getThrowNaN() { return throwNaN_; } -public: - /** Load model (mainly parameter objects) from array of io::Items */ - void load(const std::vector& ioItems, bool markReloaded = true) { - setReloaded(false); - for(auto& item : ioItems) { - std::string pName = item.name; - // skip over special parameters starting with "special:" - if(pName.substr(0, 8) == "special:") - continue; - - // if during loading the loaded type is of the same type class as the default element type, allow conversion; - // otherwise keep the loaded type. This is used when e.g. loading a float32 model as a float16 model as both - // have type class TypeClass::float_type. - auto loadElementType = isSameTypeClass(item.type, defaultElementType_) ? 
defaultElementType_ : item.type; - param(pName, item.shape, inits::fromItem(item), loadElementType, /*fixed=*/false); - } - if(markReloaded) - setReloaded(true); - } - - /** Load model by filename */ - void load(const std::string& name, bool markReloaded = true) { - LOG(info, "Loading model from {}", name); - auto items = io::loadItems(name); - load(items, markReloaded); - } - - /** Load model from buffer (a file pointer) */ - void load(const void* ptr, bool markReloaded = true) { - LOG(info, "Loading model from buffer at {}", ptr); - auto items = io::loadItems(ptr); - load(items, markReloaded); - } /** * Turn the model (given a file pointer) into a memory-mapped type * by converting all the parameter object to memory-mapped version, i.e., MappedParameters. */ - void mmap(const void* ptr, bool markReloaded = true) { - ABORT_IF(backend_->getDeviceId().type != DeviceType::cpu || !inferenceOnly_, - "Memory mapping only supported for CPU inference mode"); + void prepareMmap(Ptr modelFile) { + bool graphAllowsMmapping = backend_->getDeviceId().type == DeviceType::cpu && inferenceOnly_; + auto mmapMode = modelFile->mmapMode(); + + // don't do anything if we don't want to mmap regardless if the graph allows it + if(mmapMode == io::MmapMode::DontMmap) + return; - LOG(info, "Memory mapping model at {}", ptr); - auto items = io::mmapItems(ptr); + // silently ignore if we can't mmap and it's not required + if(!graphAllowsMmapping && mmapMode != io::MmapMode::RequiredMmap) + return; + + // abort if we can't mmap and it's required + ABORT_IF(!graphAllowsMmapping && mmapMode == io::MmapMode::RequiredMmap, + "Memory mapping required but only supported for CPU inference graphs"); + + // if we got here, we mmap either opportunistically or by requirement + LOG_ONCE(info, "[memory] Memory mapping model parameters in graph"); // Deal with default parameter set object that might not be a mapped object. // This gets assigned during ExpressionGraph::setDevice(...) and by default @@ -803,7 +782,7 @@ class ExpressionGraph : public std::enable_shared_from_this { } // pre-populate parameters by type - for(auto& item : items) { + for(auto& item : modelFile->items()) { auto it1 = paramsByElementType_.find(item.type); if(it1 == paramsByElementType_.end()) { auto params = New(item.type); @@ -811,17 +790,39 @@ class ExpressionGraph : public std::enable_shared_from_this { paramsByElementType_.insert({item.type, params}); } } + } + +public: + /** Load model (mainly parameter objects) from a ModelWeights object */ + void load(Ptr modelWeights, bool markReloaded = true) { + prepareMmap(modelWeights); + + setReloaded(false); + for(auto& item : modelWeights->items()) { + std::string pName = item.name; + // skip over special parameters starting with "special:" + if(pName.substr(0, 8) == "special:") + continue; - load(items, markReloaded); + // if during loading the loaded type is of the same type class as the default element type, allow conversion; + // otherwise keep the loaded type. This is used when e.g. loading a float32 model as a float16 model as both + // have type class TypeClass::float_type. + auto loadElementType = isSameTypeClass(item.type, defaultElementType_) ? defaultElementType_ : item.type; + param(pName, item.shape, inits::fromItem(item), loadElementType, /*fixed=*/false); + } + if(markReloaded) + setReloaded(true); } + public: + /** * Convert all parameters into an array of io::Item elements, for saving. 
* @param ioItems an array of io::Item elements * @param saveElementType the element type for saving */ - void save(std::vector& ioItems, Type saveElementType = Type::float32); + void getItems(std::vector& ioItems, Type saveElementType = Type::float32); /** * Save all parameters into a file (.npz or .bin). @@ -831,7 +832,7 @@ class ExpressionGraph : public std::enable_shared_from_this { */ void save(const std::string& name, const std::string& meta = "", Type saveElementType = Type::float32) { std::vector ioItems; - save(ioItems, saveElementType); + getItems(ioItems, saveElementType); if(ioItems.empty()) { LOG(warn, "Item list is empty, skipping saving"); } else { @@ -840,6 +841,7 @@ class ExpressionGraph : public std::enable_shared_from_this { io::saveItems(name, ioItems); } } + }; template diff --git a/src/graph/node_initializers.cpp b/src/graph/node_initializers.cpp index 3afb599a9..69f8af92d 100644 --- a/src/graph/node_initializers.cpp +++ b/src/graph/node_initializers.cpp @@ -36,7 +36,7 @@ class LambdaInitConvert : public NodeInitializer { private: std::function lambda_; Type intermediateType_; // is used for the creation of a temporary intermediate tensor on which the lambda actually operates. - // This tensor is then automatically cast and copied to the type of the actual tensor. + // This tensor is then automatically cast and copied to the type of the actual tensor. public: LambdaInitConvert(std::function&& lambda, @@ -195,25 +195,24 @@ Ptr fromWord2vec(const std::string& file, Ptr fromItem(const io::Item& item) { if(item.mapped) { - return fromLambda([item](Tensor tensor) { - // @TODO: implement other types, for now croak loudly. - ABORT_IF(tensor->getBackend()->getDeviceId().type != DeviceType::cpu, - "Memory mapping only works for CPU tensors"); - ABORT_IF(tensor->type() != item.type, - "Tensor type ({}) and type for mapping ({}) do not match", - tensor->type(), - item.type); - ABORT_IF(tensor->shape() != item.shape, - "Tensor shape ({}) and shape of mapped item ({}) do not match", - tensor->shape(), - item.shape); - auto mp = MemoryPiece::New((uint8_t*)item.ptr, item.size()); // @TODO: this is not properly aligned now - tensor->reset(mp); - }); + return fromLambda([&item](Tensor tensor) { + if(tensor->getBackend()->getDeviceId().type != DeviceType::cpu) { + tensor->set(item); + } else { + ABORT_IF(tensor->type() != item.type, + "Tensor type ({}) and type for mapping ({}) do not match", + tensor->type(), + item.type); + ABORT_IF(tensor->shape() != item.shape, + "Tensor shape ({}) and shape of mapped item ({}) do not match", + tensor->shape(), + item.shape); + auto mp = MemoryPiece::New((uint8_t*)item.ptr, item.size()); // @TODO: this is not properly aligned now + tensor->reset(mp); + } + }, item.type); } else { - return fromLambda( - [item](Tensor tensor) { tensor->set(item); }, - item.type); + return fromLambda([&item](Tensor tensor) { tensor->set(item); }, item.type); } } @@ -223,7 +222,7 @@ Ptr fromTensor(Tensor externalTensor) { // Computes Google's sinusoidal position embeddings Ptr sinusoidalPositionEmbeddings(int start) { - return fromLambda([start](Tensor t) { SinusoidalPositionEmbeddings(t, start); }); + return fromLambda([start](Tensor t) { SinusoidalPositionEmbeddings(t, start); }); } // @TODO: this is rather inefficient also needs axis argument or something diff --git a/src/microsoft/cosmos.cpp b/src/microsoft/cosmos.cpp index 7493975eb..00ff9c90e 100644 --- a/src/microsoft/cosmos.cpp +++ b/src/microsoft/cosmos.cpp @@ -20,7 +20,7 @@ class EmbedderModel { 
EmbedderModel(Ptr options) : model_(createModelFromOptions(options, models::usage::embedding)) {} - void load(Ptr graph, const std::string& modelFile) { + void load(Ptr graph, Ptr modelFile) { model_->load(graph, modelFile); } @@ -36,21 +36,22 @@ namespace cosmos { const size_t MAX_BATCH_SIZE = 32; const size_t MAX_LENGTH = 256; -/** +/** * Single CPU-core implementation of an Embedder/Similiarity scorer. Turns sets of '\n' strings * into parallel batches and either outputs embedding vectors or similarity scores. */ class Embedder { -private: +private: Ptr options_; Ptr graph_; Ptr vocab_; Ptr model_; - + Ptr modelFile_; + public: Embedder(const std::string& modelPath, const std::string& vocabPath, bool computeSimilarity = false) { - options_ = New("inference", true, + options_ = New("inference", true, "shuffle", "none", "mini-batch", MAX_BATCH_SIZE, "maxi-batch", 100, @@ -59,7 +60,7 @@ class Embedder { "max-length-crop", true, "compute-similarity", computeSimilarity, "vocabs", std::vector(computeSimilarity ? 2 : 1, vocabPath)); - + vocab_ = New(options_, 0); vocab_->load(vocabPath, 0); @@ -67,20 +68,20 @@ class Embedder { graph_->setDevice(CPU0); graph_->reserveWorkspaceMB(512); - YAML::Node config; - io::getYamlFromModel(config, "special:model.yml", modelPath); - + modelFile_ = New(modelPath); + YAML::Node config = modelFile_->getYamlFromModel(); + Ptr modelOpts = New(); modelOpts->merge(options_); modelOpts->merge(config); model_ = New(modelOpts); - model_->load(graph_, modelPath); + model_->load(graph_, modelFile_); } // Compute embedding vectors for a batch of sentences std::vector> embed(const std::string& input) { - auto text = New(std::vector({input}), + auto text = New(std::vector({input}), std::vector>({vocab_}), options_); // we set runAsync=false as we are throwing exceptions instead of aborts. Exceptions and threading do not mix well. @@ -102,7 +103,7 @@ class Embedder { auto batchIdx = batch->getSentenceIds()[i]; if(output.size() <= batchIdx) output.resize(batchIdx + 1); - + int embSize = embeddings->shape()[-1]; size_t beg = i * embSize; size_t end = (i + 1) * embSize; @@ -116,7 +117,7 @@ class Embedder { // Compute cosine similarity scores for a two batches of corresponding sentences std::vector similarity(const std::string& input1, const std::string& input2) { - auto text = New(std::vector({input1, input2}), + auto text = New(std::vector({input1, input2}), std::vector>({vocab_, vocab_}), options_); // we set runAsync=false as we are throwing exceptions instead of aborts. Exceptions and threading do not mix well. 
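The same pattern recurs in embedder.h, evaluator.h, rescorer.h and the cosmos embedder above: construct one io::ModelWeights per model path and share it across all per-device graphs, with config extraction and parameter loading both going through that one handle. A condensed sketch of the fragment as it appears inside such a task class (options_, graphs_ and models_ are assumed members, as in Embed/Evaluate):

auto modelFile = New<io::ModelWeights>(options_->get<std::string>("model"));

// merge the config stored inside the model file into the runtime options
auto modelOpts = New<Options>();
modelOpts->merge(options_);
modelOpts->merge(modelFile->getYamlFromModel());

// the single ModelWeights handle is shared by all per-device graphs (it locks internally)
for(size_t j = 0; j < graphs_.size(); ++j)
  models_[j]->load(graphs_[j], modelFile);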
diff --git a/src/microsoft/quicksand.cpp b/src/microsoft/quicksand.cpp index 2302819eb..6a09469fd 100644 --- a/src/microsoft/quicksand.cpp +++ b/src/microsoft/quicksand.cpp @@ -7,6 +7,7 @@ #include "mkl.h" #endif +#include "common/io.h" #include "data/shortlist.h" #include "translator/beam_search.h" #include "translator/scorers.h" @@ -53,6 +54,13 @@ class VocabWrapper : public IVocabWrapper { Ptr getVocab() const { return pImpl_; } }; +IBeamSearchDecoder::IBeamSearchDecoder(Ptr options, + const std::vector& ptrs) + : options_(options) { + for(auto ptr : ptrs) + modelWeights_.push_back(New(ptr)); +} + class BeamSearchDecoder : public IBeamSearchDecoder { private: Ptr graph_; @@ -62,7 +70,7 @@ class BeamSearchDecoder : public IBeamSearchDecoder { std::vector> vocabs_; - static inline std::unordered_map configCache_; + static inline std::unordered_map configCache_; static inline std::mutex configCacheMutex_; public: BeamSearchDecoder(Ptr options, @@ -85,45 +93,31 @@ class BeamSearchDecoder : public IBeamSearchDecoder { mkl_set_num_threads(options_->get("mkl-threads", 1)); #endif - std::vector models - = options_->get>("model"); - - for(int i = 0; i < models.size(); ++i) { + for(int i = 0; i < modelWeights_.size(); ++i) { Ptr modelOpts = New(); // serializing this YAML can be costly, so read from cache YAML::Node config; - auto cachedConfig = getConfigFromCache(models[i]); + auto cachedConfig = getConfigFromCache((size_t)modelWeights_[i]->data()); if(cachedConfig != nullptr) { config = *cachedConfig; } else { - if(io::isBin(models[i]) && ptrs_[i] != nullptr) - io::getYamlFromModel(config, "special:model.yml", ptrs_[i]); - else - io::getYamlFromModel(config, "special:model.yml", models[i]); - writeConfigToCache(config, models[i]); + ABORT_IF(modelWeights_[i]->data() == nullptr, "Model pointer is null"); + config = modelWeights_[i]->getYamlFromModel("special:model.yml"); + writeConfigToCache(config, (size_t)modelWeights_[i]->data()); } modelOpts->merge(options_); modelOpts->merge(config); - // serializing this to YAML is expensive. we only want to do this once - // we can use whether we loaded the cache from config as a signal + // serializing this to YAML is expensive. 
we only want to do this once + // we can use whether we loaded the cache from config as a signal if(cachedConfig == nullptr){ std::cerr << modelOpts->asYamlString() << std::flush; } auto encdec = models::createModelFromOptions(modelOpts, models::usage::translation); - - if(io::isBin(models[i]) && ptrs_[i] != nullptr) { - // if file ends in *.bin and has been mapped by QuickSAND - scorers_.push_back(New( - encdec, "F" + std::to_string(scorers_.size()), /*weight=*/1.0f, ptrs[i])); - } else { - // it's a *.npz file or has not been mapped by QuickSAND - scorers_.push_back(New( - encdec, "F" + std::to_string(scorers_.size()), /*weight=*/1.0f, models[i])); - } + scorers_.push_back(New(encdec, "F" + std::to_string(scorers_.size()), /*weight=*/1.0f, modelWeights_[i])); } for(auto scorer : scorers_) { @@ -134,7 +128,7 @@ class BeamSearchDecoder : public IBeamSearchDecoder { graph_->forward(); } - YAML::Node* getConfigFromCache(std::string key){ + YAML::Node* getConfigFromCache(size_t key){ const std::lock_guard lock(configCacheMutex_); bool inCache = configCache_.find(key) != configCache_.end(); if (inCache) { @@ -144,7 +138,7 @@ class BeamSearchDecoder : public IBeamSearchDecoder { return nullptr; } } - void writeConfigToCache(YAML::Node config, std::string key) { + void writeConfigToCache(YAML::Node config, size_t key) { const std::lock_guard lock(configCacheMutex_); configCache_[key] = config; } @@ -154,7 +148,7 @@ class BeamSearchDecoder : public IBeamSearchDecoder { QSNBestBatch decode(const QSBatch& qsBatch, size_t maxLength, const std::unordered_set& shortlist) override { - + std::vector lshOpts = options_->get>("output-approx-knn", {}); ABORT_IF(lshOpts.size() != 0 && lshOpts.size() != 2, "--output-approx-knn takes 2 parameters"); ABORT_IF(lshOpts.size() == 2 && shortlist.size() > 0, "LSH and shortlist cannot be used at the same time"); @@ -168,7 +162,7 @@ class BeamSearchDecoder : public IBeamSearchDecoder { shortListGen = New(lshOpts[0], lshOpts[1], vocabs_[1]->lemmaSize(), /*abortIfDynamic=*/true); } else { shortListGen = New(shortlist); - } + } for(auto scorer : scorers_) scorer->setShortlistGenerator(shortListGen); } @@ -297,15 +291,17 @@ DecoderCpuAvxVersion parseCpuAvxVersion(std::string name) { bool convertModel(std::string inputFile, std::string outputFile, int32_t targetPrec, int32_t lshNBits) { std::cerr << "Converting from: " << inputFile << ", to: " << outputFile << ", precision: " << targetPrec << std::endl; - YAML::Node config; + auto modelFile = New(inputFile); + + YAML::Node config = modelFile->getYamlFromModel(); std::stringstream configStr; - marian::io::getYamlFromModel(config, "special:model.yml", inputFile); + configStr << config; auto graph = New(); graph->setDevice(CPU0); - graph->load(inputFile); + graph->load(modelFile); // MJD: Note, this is a default settings which we might want to change or expose. Use this only with Polonium students. // The LSH will not be used by default even if it exists in the model. That has to be enabled in the decoder config. 
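For in-memory models, the IBeamSearchDecoder constructor above wraps each raw pointer in a ModelWeights whose buffer constructor defaults to MmapMode::RequiredMmap. A hedged sketch of preparing such a pointer on the caller side (mio is the same mapping library used by the old scorer code; the function and path names are made up for illustration):

#include "3rd_party/mio/mio.hpp"
#include "common/io.h"

using namespace marian;

void loadFromBuffer(const std::string& path) {
  mio::mmap_source mapped(path);                        // map the packed *.bin image ourselves
  ABORT_IF(!mapped.is_mapped(), "Memory mapping did not succeed");

  // the buffer constructor defaults to MmapMode::RequiredMmap,
  // so config and parameters are served directly from the mapped region
  auto weights = New<io::ModelWeights>(mapped.data());
  YAML::Node config = weights->getYamlFromModel("special:model.yml");
  // note: the mapping must outlive any graph that later uses these weights
}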
@@ -329,8 +325,8 @@ bool convertModel(std::string inputFile, std::string outputFile, int32_t targetP } Type targetPrecType = (Type) targetPrec; - if (targetPrecType == Type::packed16 - || targetPrecType == Type::packed8avx2 + if (targetPrecType == Type::packed16 + || targetPrecType == Type::packed8avx2 || targetPrecType == Type::packed8avx512 || (targetPrecType == Type::float32 && addLsh)) { // only allow non-conversion to float32 if we also use the LSH graph->packAndSave(outputFile, configStr.str(), targetPrecType); diff --git a/src/microsoft/quicksand.h b/src/microsoft/quicksand.h index cddcfd22e..3ed866e83 100644 --- a/src/microsoft/quicksand.h +++ b/src/microsoft/quicksand.h @@ -13,6 +13,10 @@ using Ptr = std::shared_ptr; class Options; +namespace io { + class ModelWeights; +} + namespace quicksand { typedef uint32_t IndexType; @@ -47,12 +51,11 @@ class IVocabWrapper { class IBeamSearchDecoder { protected: Ptr options_; - std::vector ptrs_; + std::vector> modelWeights_; public: IBeamSearchDecoder(Ptr options, - const std::vector& ptrs) - : options_(options), ptrs_(ptrs) {} + const std::vector& ptrs); virtual ~IBeamSearchDecoder() {} diff --git a/src/models/amun.h b/src/models/amun.h index 135ce3597..d6b1209c6 100644 --- a/src/models/amun.h +++ b/src/models/amun.h @@ -36,7 +36,7 @@ class Amun : public EncoderDecoder { } void load(Ptr graph, - const std::vector& items, + Ptr modelFile, bool /*markedReloaded*/ = true) override { std::map nameMap = {{"decoder_U", "decoder_cell1_U"}, @@ -89,41 +89,51 @@ class Amun : public EncoderDecoder { if(opt("tied-embeddings-src") || opt("tied-embeddings-all")) nameMap["Wemb"] = "Wemb"; - auto ioItems = items; - // map names and remove a dummy matrices - for(auto it = ioItems.begin(); it != ioItems.end();) { - // for backwards compatibility, turn one-dimensional vector into two dimensional matrix with first dimension being 1 and second dimension of the original size - // @TODO: consider dropping support for Nematus models - if(it->shape.size() == 1) { - int dim = it->shape[-1]; - it->shape.resize(2); - it->shape.set(0, 1); - it->shape.set(1, dim); - } - - if(it->name == "decoder_c_tt") { - it = ioItems.erase(it); - } else if(it->name == "uidx") { - it = ioItems.erase(it); - } else if(it->name == "history_errs") { - it = ioItems.erase(it); - } else { - auto pair = nameMap.find(it->name); - if(pair != nameMap.end()) - it->name = pair->second; - it++; + // we will modify the items directly, so memory mapping etc. should just work + // This should never be done, but we need to be compatible with Amun/Nematus for now. + auto& ioItems = modelFile->items(); + + // @TODO: get rid of all this eventually + { // scope for lock_guard + // this is needed during loading since we modify the content of modelFile->items() directly + // This is quite ugly but this is legacy code anyway. + std::mutex mutex; + std::lock_guard lock(mutex); + + // only modify the first time. + bool modify = false; + for(auto& item : ioItems) + if(item.name == "decoder_c_tt") // still there, hence this is the first time. 
+ modify = true; + + if(modify) { + // map names and remove a dummy matrices + for(auto it = ioItems.begin(); it != ioItems.end();) { + // for backwards compatibility, turn one-dimensional vector into two dimensional matrix with first dimension being 1 and second dimension of the original size + // @TODO: consider dropping support for Nematus models + if(it->shape.size() == 1) { + int dim = it->shape[-1]; + it->shape.resize(2); + it->shape.set(0, 1); + it->shape.set(1, dim); + } + + if(it->name == "decoder_c_tt") { + it = ioItems.erase(it); + } else if(it->name == "uidx") { + it = ioItems.erase(it); + } else if(it->name == "history_errs") { + it = ioItems.erase(it); + } else { + auto pair = nameMap.find(it->name); + if(pair != nameMap.end()) + it->name = pair->second; + it++; + } + } } } - // load items into the graph - graph->load(ioItems); - } - - void load(Ptr graph, - const std::string& name, - bool /*markReloaded*/ = true) override { - LOG(info, "Loading model from {}", name); - auto ioItems = io::loadItems(name); - load(graph, ioItems); + graph->load(modelFile); } void save(Ptr graph, @@ -179,7 +189,7 @@ class Amun : public EncoderDecoder { // get parameters from the graph to items std::vector ioItems; - graph->save(ioItems); + graph->getItems(ioItems); // replace names to be compatible with Nematus for(auto& item : ioItems) { auto newItemName = nameMap.find(item.name); diff --git a/src/models/costs.h b/src/models/costs.h index 45527362f..fa67b5fb2 100644 --- a/src/models/costs.h +++ b/src/models/costs.h @@ -218,17 +218,11 @@ class Trainer : public ICriterionFunction { Ptr getModel() { return model_; } void load(Ptr graph, - const std::vector& items, + Ptr modelFile, bool markedReloaded) override { - model_->load(graph, items, markedReloaded); + model_->load(graph, modelFile, markedReloaded); } - virtual void load(Ptr graph, - const std::string& name, - bool markedReloaded = true) override { - model_->load(graph, name, markedReloaded); - }; - virtual void save(Ptr graph, const std::string& name, bool saveTranslatorConfig = false) override { @@ -270,17 +264,11 @@ class Scorer : public IModel { Ptr getModel() { return model_; } virtual void load(Ptr graph, - const std::vector& items, + Ptr modelFile, bool markReloaded = true) override { - model_->load(graph, items, markReloaded); + model_->load(graph, modelFile, markReloaded); } - virtual void load(Ptr graph, - const std::string& name, - bool markedReloaded = true) override { - model_->load(graph, name, markedReloaded); - }; - virtual void save(Ptr graph, const std::string& name, bool saveTranslatorConfig = false) override { @@ -322,23 +310,11 @@ class Stepwise : public IEncoderDecoder { Stepwise(Ptr encdec, Ptr cost) : encdec_(encdec), cost_(cost) {} virtual void load(Ptr graph, - const std::vector& items, + Ptr modelFile, bool markedReloaded = true) override { - encdec_->load(graph, items, markedReloaded); + encdec_->load(graph, modelFile, markedReloaded); } - virtual void load(Ptr graph, - const std::string& name, - bool markedReloaded = true) override { - encdec_->load(graph, name, markedReloaded); - } - - virtual void mmap(Ptr graph, - const void* ptr, - bool markedReloaded = true) override { - encdec_->mmap(graph, ptr, markedReloaded); - }; - virtual void save(Ptr graph, const std::string& name, bool saveTranslatorConfig = false) override { diff --git a/src/models/encoder_classifier.h b/src/models/encoder_classifier.h index 552e428f2..7e25f33ed 100644 --- a/src/models/encoder_classifier.h +++ b/src/models/encoder_classifier.h @@ 
-21,26 +21,10 @@ class EncoderClassifierBase : public models::IModel { public: virtual ~EncoderClassifierBase() {} - virtual void load(Ptr graph, - const std::string& name, - bool markedReloaded = true) override - = 0; - - virtual void mmap(Ptr graph, - const void* ptr, - bool markedReloaded = true) - = 0; - - virtual void save(Ptr graph, - const std::string& name, - bool saveTranslatorConfig = false) override - = 0; - virtual void clear(Ptr graph) override = 0; virtual std::vector> apply(Ptr, Ptr, bool) = 0; - virtual Logits build(Ptr graph, Ptr batch, bool clearGraph = true) override = 0; @@ -154,21 +138,9 @@ class EncoderClassifier : public EncoderClassifierBase { void push_back(Ptr classifier) { classifiers_.push_back(classifier); } void load(Ptr graph, - const std::vector& items, - bool markedReloaded) override { - graph->load(items, markedReloaded && !opt("ignore-model-config", false)); - } - - void load(Ptr graph, - const std::string& name, - bool markedReloaded) override { - graph->load(name, markedReloaded && !opt("ignore-model-config", false)); - } - - void mmap(Ptr graph, - const void* ptr, + Ptr modelFile, bool markedReloaded) override { - graph->mmap(ptr, markedReloaded && !opt("ignore-model-config", false)); + graph->load(modelFile, markedReloaded && !opt("ignore-model-config", false)); } void save(Ptr graph, diff --git a/src/models/encoder_decoder.cpp b/src/models/encoder_decoder.cpp index 971726271..0c27ab4c7 100644 --- a/src/models/encoder_decoder.cpp +++ b/src/models/encoder_decoder.cpp @@ -73,7 +73,7 @@ EncoderDecoder::EncoderDecoder(Ptr graph, Ptr options) modelFeatures_.insert("transformer-no-bias"); modelFeatures_.insert("transformer-no-affine"); - + modelFeatures_.insert("transformer-disable-position-embeddings"); modelFeatures_.insert("transformer-attention-mask"); modelFeatures_.insert("transformer-alibi-shift"); @@ -159,21 +159,9 @@ std::string EncoderDecoder::getModelParametersAsString() { } void EncoderDecoder::load(Ptr graph, - const std::vector& items, - bool markedReloaded) { - graph->load(items, markedReloaded && !opt("ignore-model-config", false)); -} - -void EncoderDecoder::load(Ptr graph, - const std::string& name, + Ptr modelFile, bool markedReloaded) { - graph->load(name, markedReloaded && !opt("ignore-model-config", false)); -} - -void EncoderDecoder::mmap(Ptr graph, - const void* ptr, - bool markedReloaded) { - graph->mmap(ptr, markedReloaded && !opt("ignore-model-config", false)); + graph->load(modelFile, markedReloaded && !opt("ignore-model-config", false)); } void EncoderDecoder::save(Ptr graph, @@ -237,7 +225,7 @@ Ptr EncoderDecoder::step(Ptr graph, // Fill state with embeddings based on last prediction decoders_[0]->embeddingsFromPrediction(graph, state, words, (int)batchIndices.size(), beamSize); auto nextState = decoders_[0]->step(graph, state); - + return nextState; } diff --git a/src/models/encoder_decoder.h b/src/models/encoder_decoder.h index ef810ed8b..9d717dbd2 100644 --- a/src/models/encoder_decoder.h +++ b/src/models/encoder_decoder.h @@ -13,26 +13,6 @@ class IEncoderDecoder : public models::IModel { public: virtual ~IEncoderDecoder() {} - virtual void load(Ptr graph, - const std::vector& items, - bool markedReloaded = true) override - = 0; - - virtual void load(Ptr graph, - const std::string& name, - bool markedReloaded = true) override - = 0; - - virtual void mmap(Ptr graph, - const void* ptr, - bool markedReloaded = true) - = 0; - - virtual void save(Ptr graph, - const std::string& name, - bool saveTranslatorConfig = false) override 
- = 0; - virtual void clear(Ptr graph) override = 0; virtual Logits build(Ptr graph, @@ -62,7 +42,7 @@ class IEncoderDecoder : public models::IModel { virtual Ptr getShortlist() = 0; - virtual data::SoftAlignment getAlignment() = 0; + virtual data::SoftAlignment getAlignment() = 0; }; class EncoderDecoder : public IEncoderDecoder, public LayerBase { @@ -98,15 +78,7 @@ class EncoderDecoder : public IEncoderDecoder, public LayerBase { void push_back(Ptr decoder); virtual void load(Ptr graph, - const std::vector& items, - bool markedReloaded = true) override; - - virtual void load(Ptr graph, - const std::string& name, - bool markedReloaded = true) override; - - virtual void mmap(Ptr graph, - const void* ptr, + Ptr, bool markedReloaded = true) override; virtual void save(Ptr graph, diff --git a/src/models/encoder_pooler.h b/src/models/encoder_pooler.h index b89f85c9e..b9041cd6c 100644 --- a/src/models/encoder_pooler.h +++ b/src/models/encoder_pooler.h @@ -25,26 +25,6 @@ class EncoderPoolerBase : public models::IModel { public: virtual ~EncoderPoolerBase() {} - virtual void load(Ptr graph, - const std::vector& items, - bool markedReloaded = true) override - = 0; - - virtual void load(Ptr graph, - const std::string& name, - bool markedReloaded = true) override - = 0; - - virtual void mmap(Ptr graph, - const void* ptr, - bool markedReloaded = true) - = 0; - - virtual void save(Ptr graph, - const std::string& name, - bool saveTranslatorConfig = false) override - = 0; - virtual void clear(Ptr graph) override = 0; virtual std::vector apply(Ptr, Ptr, bool) = 0; @@ -175,21 +155,9 @@ class EncoderPooler : public EncoderPoolerBase { void push_back(Ptr pooler) { poolers_.push_back(pooler); } void load(Ptr graph, - const std::vector& items, - bool markedReloaded) override { - graph->load(items, markedReloaded && !opt("ignore-model-config", false)); - } - - void load(Ptr graph, - const std::string& name, - bool markedReloaded) override { - graph->load(name, markedReloaded && !opt("ignore-model-config", false)); - } - - void mmap(Ptr graph, - const void* ptr, + Ptr modelFile, bool markedReloaded) override { - graph->mmap(ptr, markedReloaded && !opt("ignore-model-config", false)); + graph->load(modelFile, markedReloaded && !opt("ignore-model-config", false)); } void save(Ptr graph, diff --git a/src/models/model_base.h b/src/models/model_base.h index 32705bbe7..a159d4e81 100644 --- a/src/models/model_base.h +++ b/src/models/model_base.h @@ -10,10 +10,10 @@ namespace marian { namespace models { enum struct usage { - raw, - training, - scoring, - translation, + raw, + training, + scoring, + translation, embedding, // used for laser and other models to produce embedding vectors evaluating // evaluating is a special mode for neural metrics, different from (probabilistic) scoring }; @@ -30,12 +30,7 @@ namespace models { class IModel { public: virtual void load(Ptr, - const std::string&, - bool markReloaded = true) - = 0; - - virtual void load(Ptr, - const std::vector&, + Ptr, bool markReloaded = true) = 0; @@ -59,12 +54,7 @@ class ICriterionFunction { virtual ~ICriterionFunction() {} virtual void load(Ptr, - const std::string&, - bool markReloaded = true) - = 0; - - virtual void load(Ptr, - const std::vector&, + Ptr, bool markReloaded = true) = 0; diff --git a/src/models/nematus.h b/src/models/nematus.h index aee8e3b04..7d421ec5c 100644 --- a/src/models/nematus.h +++ b/src/models/nematus.h @@ -26,43 +26,55 @@ class Nematus : public EncoderDecoder { } void load(Ptr graph, - const std::vector& items, + Ptr 
modelFile, bool /*markReloaded*/ = true) override { - auto ioItems = items; - // map names and remove a dummy matrix 'decoder_c_tt' from items to avoid creating isolated node - for(auto it = ioItems.begin(); it != ioItems.end();) { - // for backwards compatibility, turn one-dimensional vector into two dimensional matrix with first dimension being 1 and second dimension of the original size - // @TODO: consider dropping support for Nematus models - if(it->shape.size() == 1) { - int dim = it->shape[-1]; - it->shape.resize(2); - it->shape.set(0, 1); - it->shape.set(1, dim); - } - if(it->name == "decoder_c_tt") { - it = ioItems.erase(it); - } else if(it->name == "uidx") { - it = ioItems.erase(it); - } else if(it->name == "history_errs") { - it = ioItems.erase(it); - } else { - auto pair = nameMap_.find(it->name); - if(pair != nameMap_.end()) - it->name = pair->second; - it++; + // we will modify the items directly, so memory mapping etc. should just work + // This should never be done, but we need to be compatible with Amun/Nematus for now. + auto& ioItems = modelFile->items(); + + // @TODO: get rid of all this eventually + { // scope for lock_guard + // this is needed during loading since we modify the content of modelFile->items() directly + // This is quite ugly but this is legacy code anyway. + std::mutex mutex; + std::lock_guard lock(mutex); + + // only modify the first time. + bool modify = false; + for(auto& item : ioItems) + if(item.name == "decoder_c_tt") // still there, hence this is the first time. + modify = true; + + if(modify) { + // map names and remove a dummy matrix 'decoder_c_tt' from items to avoid creating isolated node + for(auto it = ioItems.begin(); it != ioItems.end();) { + // for backwards compatibility, turn one-dimensional vector into two dimensional matrix with first dimension being 1 and second dimension of the original size + // @TODO: consider dropping support for Nematus models + if(it->shape.size() == 1) { + int dim = it->shape[-1]; + it->shape.resize(2); + it->shape.set(0, 1); + it->shape.set(1, dim); + } + + if(it->name == "decoder_c_tt") { + it = ioItems.erase(it); + } else if(it->name == "uidx") { + it = ioItems.erase(it); + } else if(it->name == "history_errs") { + it = ioItems.erase(it); + } else { + auto pair = nameMap_.find(it->name); + if(pair != nameMap_.end()) + it->name = pair->second; + it++; + } + } } } - // load items into the graph - graph->load(ioItems); - } - void load(Ptr graph, - const std::string& name, - bool /*markReloaded*/ = true) override { - LOG(info, "Loading model from {}", name); - auto ioItems = io::loadItems(name); - load(graph, ioItems); + graph->load(modelFile); } void save(Ptr graph, @@ -77,7 +89,7 @@ class Nematus : public EncoderDecoder { // get parameters from the graph to items std::vector ioItems; - graph->save(ioItems); + graph->getItems(ioItems); // replace names to be compatible with Nematus for(auto& item : ioItems) { auto newItemName = nameMapRev_.find(item.name); diff --git a/src/models/transformer_factory.h b/src/models/transformer_factory.h index 46df741b0..ac86e4dc7 100644 --- a/src/models/transformer_factory.h +++ b/src/models/transformer_factory.h @@ -14,25 +14,25 @@ Ptr NewDecoderTransformer(Ptr graph, Ptr class TransformerLegacy : public EncoderDecoder { public: - TransformerLegacy(Ptr graph, Ptr options) + TransformerLegacy(Ptr graph, Ptr options) : EncoderDecoder(graph, options), nameMap_(createNameMap()) { } void load(Ptr graph, - const std::vector& items, + Ptr modelFile, bool markedReloaded = true) 
override { - for(auto it = items.begin(); it != items.end(); it++) { - auto pair = nameMap_.find(it->name); + for(auto& item : modelFile->items()) { + auto pair = nameMap_.find(item.name); if(pair != nameMap_.end()) { - LOG(debug, "Mapping parameter {} to {}", it->name, pair->second); - const_cast(*it).name = pair->second; + LOG(debug, "Mapping parameter {} to {}", item.name, pair->second); + const_cast(item).name = pair->second; // reduce shape of bias vectors from {1, dimModel} to {dimModel} - int dimModel = it->shape[-1]; - if(it->shape == Shape({1, dimModel})) - const_cast(*it).shape = Shape({dimModel}); + int dimModel = item.shape[-1]; + if(item.shape == Shape({1, dimModel})) + const_cast(item).shape = Shape({dimModel}); } else { - LOG(debug, "Could not find parameter {}", it->name); + LOG(debug, "Could not find parameter {}", item.name); } } @@ -49,20 +49,12 @@ class TransformerLegacy : public EncoderDecoder { linear->transposed = false; // load items into the graph - graph->load(items); - } - - void load(Ptr graph, - const std::string& name, - bool markReloaded = true) override { - LOG(info, "Loading model from {}", name); - auto items = io::loadItems(name); - load(graph, items, markReloaded); + graph->load(modelFile); } private: std::map nameMap_; - + std::map createNameMap() { std::map nameMap = { {"Wemb", "Wemb"}, @@ -125,13 +117,13 @@ class TransformerLegacy : public EncoderDecoder { // name maps for decoder SSRU nameMap[fmt::format("decoder_l{}_rnn_W", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->iProj->weight", prefix, layerNo); - + nameMap[fmt::format("decoder_l{}_rnn_Wf", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->fProj->weight", prefix, layerNo); nameMap[fmt::format("decoder_l{}_rnn_bf", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->fProj->bias", prefix, layerNo); nameMap[fmt::format("decoder_l{}_rnn_Wo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->oProj->weight", prefix, layerNo); nameMap[fmt::format("decoder_l{}_rnn_bo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->oProj->bias", prefix, layerNo); - + nameMap[fmt::format("decoder_l{}_rnn_ffn_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->weight", prefix, layerNo); nameMap[fmt::format("decoder_l{}_rnn_ffn_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->bias", prefix, layerNo); diff --git a/src/rescorer/rescorer.h b/src/rescorer/rescorer.h index 062b91bca..051885c7b 100644 --- a/src/rescorer/rescorer.h +++ b/src/rescorer/rescorer.h @@ -25,7 +25,7 @@ class Rescorer { Rescorer(Ptr options) : builder_(models::createCriterionFunctionFromOptions(options, models::usage::scoring)) {} - void load(Ptr graph, const std::string& modelFile) { + void load(Ptr graph, Ptr modelFile) { builder_->load(graph, modelFile); } @@ -46,6 +46,7 @@ class Rescore : public ModelTask { Ptr corpus_; std::vector> graphs_; std::vector> models_; + Ptr modelFile_; public: Rescore(Ptr options) : options_(options) { @@ -77,7 +78,8 @@ class Rescore : public ModelTask { graphs_.push_back(graph); } - auto modelFile = options_->get("model"); + auto modelPath = options_->get("model"); + modelFile_ = New(modelPath); models_.resize(graphs_.size()); ThreadPool pool(graphs_.size(), 
graphs_.size()); @@ -85,7 +87,7 @@ class Rescore : public ModelTask { pool.enqueue( [=](size_t j) { models_[j] = New(options_); - models_[j]->load(graphs_[j], modelFile); + models_[j]->load(graphs_[j], modelFile_); }, i); } diff --git a/src/tensors/cpu/expression_graph_packable.h b/src/tensors/cpu/expression_graph_packable.h index 1a233372c..f1a68210e 100644 --- a/src/tensors/cpu/expression_graph_packable.h +++ b/src/tensors/cpu/expression_graph_packable.h @@ -18,7 +18,7 @@ namespace marian { // This requires some more changes, but we temporarily do this just by name ("_W") of the weights. // And, this introduces a low level packed_gemm.h apis interact with high level graph class. // So, we make a subclass of ExpressionGraph and put those immature codes in this class. -// We will improve this in the near future. +// We will improve this in the near future. class ExpressionGraphPackable : public ExpressionGraph { public: ExpressionGraphPackable() @@ -165,7 +165,7 @@ class ExpressionGraphPackable : public ExpressionGraph { Tensor tmp; allocator->allocate(tmp, val->shape(), val->type()); cpu::Transpose10(tmp, val); - + if(sizeOf(gemmElementType) == 1) { // is 8-bit Intgemm type float quantMult = cpu::integer::computeQuantMult(val); @@ -233,7 +233,7 @@ class ExpressionGraphPackable : public ExpressionGraph { } //Put the quantMult at the back of the tensor cpu::integer::getQuantMult(paramMat) = quantMult; - + } else { ABORT("Incorrect Intgemm type size: {}", sizeOf(gemmElementType)); } diff --git a/src/tensors/tensor.cpp b/src/tensors/tensor.cpp index e9a07ab46..d89e41964 100644 --- a/src/tensors/tensor.cpp +++ b/src/tensors/tensor.cpp @@ -114,7 +114,7 @@ template std::string TensorBase::debug(int, int); template std::string TensorBase::debug(int, int); template std::string TensorBase::debug(int, int); -// fill an io::item with data from a Tensor, used for saving +// fill an io::item with data from a Tensor, used for saving // and other IO operations. 
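On the saving side, the old ExpressionGraph::save(items, type) overload is renamed to getItems(), so the file-level save path now reads roughly as follows (a sketch assuming an existing graph; configYamlString stands in for an already-serialized YAML config):

std::vector<io::Item> ioItems;
graph->getItems(ioItems, Type::float32);          // collect graph parameters as io::Items
if(ioItems.empty()) {
  LOG(warn, "Item list is empty, skipping saving");
} else {
  io::addMetaToItems(configYamlString, "special:model.yml", ioItems);  // embed the model config
  io::saveItems("model.npz", ioItems);            // .npz vs .bin chosen by file extension
}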
void TensorBase::get(io::Item& item, const std::string& name) { item.name = name; @@ -131,10 +131,10 @@ void TensorBase::get(io::Item& item, const std::string& name) { void TensorBase::set(const io::Item& item) { ABORT_IF(item.type != type_, "Tensor type {} and item type {} do not match", type_, item.type); ABORT_IF(item.shape != shape_, "Tensor shape {} and item shape {} do not match", shape_, item.shape); - ABORT_IF(item.bytes.size() > memory_->size(), "Item data size {} too large for memory {}", item.bytes.size(), memory_->size()); + ABORT_IF(item.size() > memory_->size(), "Item data size {} too large for memory {}", item.size(), memory_->size()); copy(backend_, - item.bytes.data(), - item.bytes.data() + item.bytes.size(), + item.data(), + item.data() + item.size(), memory_->data()); } diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp index 054b0ae76..9b5f300d4 100644 --- a/src/training/graph_group.cpp +++ b/src/training/graph_group.cpp @@ -16,7 +16,7 @@ GraphGroup::GraphGroup(Ptr options, Ptr mpi) if(vcs.size() > 1) costScalingFreq_ = std::stoul(vcs[1]); if(vcs.size() > 2) costScalingMultiplier_ = std::stof( vcs[2]); if(vcs.size() > 3) costScalingFactorMinimum_ = std::stof( vcs[3]); - + LOG_ONCE(info, "Training with cost scaling - factor: {}, frequency: {}, multiplier: {}, minimum: {}", costScalingFactor_, @@ -69,7 +69,7 @@ GraphGroup::GraphGroup(Ptr options, Ptr mpi) void GraphGroup::initGraphsAndOpts() { for(auto device : devices_) { auto graph = New(); - + // @TODO: validate precisions in config auto precisions = options_->get>("precision"); Type parameterType = typeFromString(precisions[0]); @@ -81,7 +81,7 @@ void GraphGroup::initGraphsAndOpts() { graph->setThrowNaN(true); graph->setDevice(device); - + graph->reserveWorkspaceMB(options_->get("workspace")); graphs_.push_back(graph); @@ -156,7 +156,7 @@ void GraphGroup::decreaseCostScaleFactor() { return; nanSeen_++; - + size_t total = nanSeen_ + noNanSeen_; // do not reduce cost-scaling factor below minimum @@ -177,15 +177,15 @@ void GraphGroup::decreaseCostScaleFactor() { float GraphGroup::checkNanOrNorm(size_t i, size_t begin, size_t end) { auto curGrad = graphs_[i]->params()->grads()->subtensor(begin, end-begin); - + // If costScaling_ then check for NaN values if the costScalingFactor_ is larger than - // the minimum. If a NaN value is seen we exit here and will reduce the factor next and - // this skips an update. - // If costScalingFactor_ is already at the minimum, prune the NaN values away. This replaces + // the minimum. If a NaN value is seen we exit here and will reduce the factor next and + // this skips an update. + // If costScalingFactor_ is already at the minimum, prune the NaN values away. This replaces // NaNs with 0. Updates are not skipped any more. // Regardless of NaNs, we clip +/-inf to the largest corresponding values for the gradient value type. - // This changes the gradient but seems to be quite stable. In effect, for fp16 this is equivalent - // to gradient clipping at (65504.f / costScalingFactor_) which in most cases is still large. + // This changes the gradient but seems to be quite stable. In effect, for fp16 this is equivalent + // to gradient clipping at (65504.f / costScalingFactor_) which in most cases is still large. 
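To make the clipping comment above concrete (numbers are hypothetical): with fp16's largest finite value of 65504 and a cost-scaling factor of 128, clipping +/-inf corresponds to clipping the unscaled gradient at 65504 / 128 = 511.75, which is still far larger than typical per-element gradient magnitudes.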
if(costScaling_ || checkGradientNan_) { bool pruneNaN = !checkGradientNan_ && costScalingFactor_ == costScalingFactorMinimum_; bool clipInf = !checkGradientNan_; @@ -206,7 +206,7 @@ float GraphGroup::checkNanOrNorm(size_t i, size_t begin, size_t end) { auto gNorm = L2Norm(curGrad, graphs_[i]->allocator()); if(isFinite(gNorm) && gNorm > 0.0) return gNorm; - else + else return std::numeric_limits::quiet_NaN(); } @@ -218,10 +218,10 @@ float GraphGroup::executeAndCollectNorm(const std::functionallReduce(&gradNormSquared, &gradNormSquared, 1, MPI_FLOAT, MPI_SUM); // sum all - + if(shardingMode_ == ShardingMode::local) // we already have the correct norm on one device, but we also need to check for NaN gradNormSquared /= (float)mpi_->numMPIProcesses(); - + gradNorm = std::sqrt(gradNormSquared); // redo sqrt } return gradNorm; @@ -245,16 +245,16 @@ float GraphGroup::computeNormalizationFactor(float gNorm, size_t updateTrgWords) if(!isFinite(gNorm)) // we are checking the sanity of the gradient elsewhere return normalizationFactor; - + if(dynamicGradientScaling_) { // make gradient norm invariant to changes in costScalingFactor_, luckily norm(c * g) = c * norm(g) if(costScaling_) gNorm = gNorm / costScalingFactor_; - - // Normalize gradient norm w.r.t. number of labels in batch for statistics, + + // Normalize gradient norm w.r.t. number of labels in batch for statistics, // there should be no gradient normalization before this point, @TODO: check this - gNorm = gNorm / updateTrgWords; - + gNorm = gNorm / updateTrgWords; + size_t window; float gNormAvgTransform, gNormVarTransform, gNormTransform, gNormAvg; if(dynamicGradientScalingUseLogs_) { // tracking the log of the gradient norms rather than the gradient norms itself results in a larger standard deviation as the actual @@ -265,9 +265,9 @@ float GraphGroup::computeNormalizationFactor(float gNorm, size_t updateTrgWords) } else { std::tie(window, gNormAvgTransform, gNormVarTransform) = scheduler_->getGradientNormStats(); gNormTransform = gNorm; // we are not using logs, so we can just use the normal gradient norm - gNormAvg = gNormAvgTransform; // we are getting the actual running average of gradient norms, no transformation needed. + gNormAvg = gNormAvgTransform; // we are getting the actual running average of gradient norms, no transformation needed. } - + auto deltaTransform = gNormTransform - gNormAvgTransform; // compute the difference between the current transformer gradient norm and the running average. auto gNormStdTransform = std::sqrt(gNormVarTransform); // compute STD for the running average of (log) gradient norms. @@ -283,7 +283,7 @@ float GraphGroup::computeNormalizationFactor(float gNorm, size_t updateTrgWords) LOG(debug, "log gradient norms: {} :: {:.4f} - {:.4f} = {:.4f} > {:.4f} * {:.4f} - scaling gradient by {:.4f}", dynamicGradientScalingUseLogs_, gNormTransform, gNormAvgTransform, deltaTransform, dynamicGradientScalingFactorWithFadeout, gNormStdTransform, gNormAvg / gNorm); - normalizationFactor *= gNorm / gNormAvg; // since we later do gradient / normalizationFactor this divides by norm and multiplies by the average, rescaling to the average. + normalizationFactor *= gNorm / gNormAvg; // since we later do gradient / normalizationFactor this divides by norm and multiplies by the average, rescaling to the average. 
} } @@ -322,23 +322,27 @@ void GraphGroup::load(const OptimizerBase::ScatterStateFunc& scatterFn) { std::string modelFileName = options_->get("model"); bool foundModel = false; - // these are structures that get fill in the main process and then broadcasted to other MPI + // these are structures that get filled in the main process and then broadcasted to other MPI processes std::vector items; bool markReloaded = true; if(isMainProcess()) { if(filesystem::exists(modelFileName)) { LOG(info, "Loading model from {}", modelFileName); - foundModel = true; - items = io::loadItems(modelFileName); + foundModel = true; + modelWeights_ = New(modelFileName, io::MmapMode::DontMmap); markReloaded = true; } else if(options_->hasAndNotEmpty("pretrained-model")) { std::string pretrainedModelFileName = options_->get("pretrained-model"); LOG(info, "[training] Initializing model weights with pre-trained model {}", pretrainedModelFileName); foundModel = true; - items = io::loadItems(pretrainedModelFileName); + modelWeights_ = New(pretrainedModelFileName, io::MmapMode::DontMmap); markReloaded = false; } + } else { + // Initialize with dummy and set correct file name in main process. + // If we are running only one process this will always be correctly initialized above. + modelWeights_ = New(); } // if a model file exists, the main process will find it and propagate this information to other MPI nodes @@ -349,27 +353,31 @@ void GraphGroup::load(const OptimizerBase::ScatterStateFunc& scatterFn) { // continue with checkpoint loading if(mpi_) { // broadcast model information to other processes - mpi_->bCast(items); + modelWeights_->loadAndSync(mpi_); mpi_->bCast(&markReloaded, 1, mpi_->getDataType(&markReloaded)); } // handles MPI if(scheduler_) scheduler_->load(modelFileName); - - // we just load it N times from disk (it'll be in disk cache after the first) - // this also allocates memory correctly when calling forward() inside restoreOptimizerState + + // We just load it N times but it'll be in read into modelWeights after the first time. + // This also allocates memory correctly when calling forward() inside restoreOptimizerState size_t i = 0; - for(auto graph : graphs_) - models_[i++]->load(graph, items, markReloaded); + for(auto graph : graphs_) { + models_[i++]->load(graph, modelWeights_, markReloaded); + } // try to restore everything from checkpoint now loadOptimizerState(modelFileName, scatterFn); + + // @TODO: run another graph->forward() to allocate the weights from the checkpoint? + // then we might not need to keep modelWeights_ around. } } } -bool GraphGroup::loadOptimizerState(const std::string& modelFileName, +bool GraphGroup::loadOptimizerState(const std::string& modelFileName, const OptimizerBase::ScatterStateFunc& scatterFn) { /* if model checkpoint is available: @@ -383,22 +391,20 @@ bool GraphGroup::loadOptimizerState(const std::string& modelFileName, bool foundCheckpoint = filesystem::exists(checkpointName); if(mpi_) mpi_->bCast(&foundCheckpoint, 1, mpi_->getDataType(&foundCheckpoint)); - + // all nodes will either continue or exit if(!foundCheckpoint) { LOG(warn, "No checkpoint found, parameters reloaded from last inference model"); return false; // failed to restore } - std::vector items; + auto checkpoint = New(checkpointName, io::MmapMode::DontMmap); + // make sure all nodes receive the same checkpoint data from the main process. 
- if(mpi_) { // only the main process loads the checkpoint and the rest receives a copy - if(isMainProcess()) - items = io::loadItems(checkpointName); - mpi_->bCast(items); - } else { // not doing MPI, so just load the checkpoint from disk - items = io::loadItems(checkpointName); - } + if(mpi_) // only the main process loads the checkpoint and the rest receives a copy + checkpoint->loadAndSync(mpi_); + + auto& items = checkpoint->items(); // @TODO: probably we want to have the list of DeviceIds as an attribute std::vector> backends; @@ -438,7 +444,7 @@ bool GraphGroup::loadOptimizerState(const std::string& modelFileName, void GraphGroup::saveOptimizerState(const std::string& modelFileName, const OptimizerBase::GatherStateFunc& gatherFn) { - // @TODO: change to .checkpoint.npz, would break backwards compat + // @TODO: change to .checkpoint.npz, would break backwards compat std::string checkpointName = modelFileName + ".optimizer.npz"; std::vector items; @@ -446,7 +452,7 @@ void GraphGroup::saveOptimizerState(const std::string& modelFileName, optimizerShards_, gatherFn, isMainProcess()); - + if(isMainProcess()) { // only main process does the actual saving auto found = std::find_if(items.begin(), items.end(), [](const io::Item& item) { return item.name == "master_parameters"; }); @@ -461,7 +467,6 @@ void GraphGroup::saveOptimizerState(const std::string& modelFileName, items.push_back(masterParameters); } - LOG(info, "[training] Saving training checkpoint to {} and {}", modelFileName, checkpointName); io::saveItems(checkpointName, items); } @@ -469,7 +474,7 @@ void GraphGroup::saveOptimizerState(const std::string& modelFileName, void GraphGroup::saveCheckPoint(const std::string& modelFileName, bool isFinal, - bool doSaveOptimizerState, + bool doSaveOptimizerState, const OptimizerBase::GatherStateFunc& gatherOptimizerStateFn) { barrier(); // (for better grouping of log messages) // bring the smoothed model in @@ -525,10 +530,10 @@ void GraphGroup::swapWithSmoothed() { }; comm_->foreach(swap); comm_->allGatherParams(); - + if(shardingMode_ == ShardingMode::local) comm_->broadcastParams(); - + barrier(); } @@ -543,10 +548,10 @@ void GraphGroup::replaceWithSmoothed() { }; comm_->foreach(replace); comm_->allGatherParams(); - + if(shardingMode_ == ShardingMode::local) comm_->broadcastParams(); - + barrier(); } @@ -587,7 +592,7 @@ Ptr GraphGroup::collectStats(Ptr graph, size_t step = options_->get("mini-batch-fit-step"); size_t maxLength = options_->get("max-length"); - + // this should be only one class label per line on input, hence restricting length to 1 std::vector localMaxes(numFiles, maxLength); auto inputTypes = options_->get>("input-types", {}); @@ -623,7 +628,7 @@ Ptr GraphGroup::collectStats(Ptr graph, // Do a binary search for maxmimum batch size that fits into given workspace memory // for a tested sentence length. // We round the maxLength to the next larger step to avoid a situation where we do not - // collect batch statistics for maximum length between steps. However, we do not exceed + // collect batch statistics for maximum length between steps. However, we do not exceed // the actual maxLength even if the rounded value is larger. 
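A small worked example of the rounding described above (hypothetical values): with max-length 120 and mini-batch-fit-step 32, the rounded maximum becomes ceil(120/32) * 32 = 128, so batch statistics are probed at lengths 32, 64, 96 and 128, while the final probe is still capped at the true maximum of 120, as the comment states.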
size_t maxLengthRounded = (size_t)(std::ceil(maxLength / (float)step) * step); for(size_t i = step; i <= maxLengthRounded; i += step) { diff --git a/src/training/graph_group.h b/src/training/graph_group.h index b0c98e3ce..9f70ed81b 100644 --- a/src/training/graph_group.h +++ b/src/training/graph_group.h @@ -46,6 +46,7 @@ class GraphGroup { std::vector> models_; // [deviceIndex] std::vector> optimizerShards_; // [deviceIndex] + Ptr modelWeights_; // handle for model weights, we keep this around to make sure weights are not deallocated while we are still using them Ptr scheduler_; // scheduler that keeps track of how much has been processed bool finalized_{false}; // 'true' if training has completed (further updates are no longer allowed) @@ -105,7 +106,7 @@ class GraphGroup { void saveCheckPoint(const std::string& modelFileName, bool isFinal, - bool doSaveOptimizerState, + bool doSaveOptimizerState, const OptimizerBase::GatherStateFunc& gatherOptimizerStateFn); void saveOptimizerState(const std::string& modelFileName, @@ -117,7 +118,7 @@ class GraphGroup { void swapWithSmoothed(); // This function replaces the current optimizer parameters with the smoothed version (provided smoothing is enabled). - // This is different from swapping (swapping twice restores original state) as the original parameters get overwritten. + // This is different from swapping (swapping twice restores original state) as the original parameters get overwritten. void replaceWithSmoothed(); bool isMainProcess() const { return mpi_->isMainProcess(); } // (we need this test a few times) diff --git a/src/training/validator.cpp b/src/training/validator.cpp index b51f1de3f..44c61171b 100644 --- a/src/training/validator.cpp +++ b/src/training/validator.cpp @@ -354,11 +354,12 @@ float TranslationValidator::validate(const std::vector>& gr // Create scorer auto model = options_->get("model"); + auto modelFile = New(model); std::vector> scorers; for(auto graph : graphs) { auto builder = models::createModelFromOptions(options_, models::usage::translation); - Ptr scorer = New(builder, "", 1.0f, model); + Ptr scorer = New(builder, "", 1.0f, modelFile); scorers.push_back(scorer); // @TODO: should this be done in the contructor? 
} @@ -591,6 +592,7 @@ float SacreBleuValidator::validate(const std::vector>& grap // Create scorer auto model = options_->get("model"); + auto modelFile = New(model); // @TODO: check if required - Temporary options for translation auto mopts = New(); @@ -600,7 +602,7 @@ float SacreBleuValidator::validate(const std::vector>& grap std::vector> scorers; for(auto graph : graphs) { auto builder = models::createModelFromOptions(options_, models::usage::translation); - Ptr scorer = New(builder, "", 1.0f, model); + Ptr scorer = New(builder, "", 1.0f, modelFile); scorers.push_back(scorer); } diff --git a/src/translator/scorers.cpp b/src/translator/scorers.cpp index 7c9745c22..cee57bd85 100644 --- a/src/translator/scorers.cpp +++ b/src/translator/scorers.cpp @@ -5,7 +5,7 @@ namespace marian { Ptr scorerByType(const std::string& fname, float weight, - std::vector items, + Ptr modelFile, Ptr options) { options->set("inference", true); std::string type = options->get("type"); @@ -22,48 +22,25 @@ Ptr scorerByType(const std::string& fname, LOG(info, "Loading scorer of type {} as feature {}", type, fname); - return New(encdec, fname, weight, items); + return New(encdec, fname, weight, modelFile); } -Ptr scorerByType(const std::string& fname, - float weight, - const void* ptr, - Ptr options) { - options->set("inference", true); - std::string type = options->get("type"); - - // @TODO: solve this better - if(type == "lm" && options->has("input")) { - size_t index = options->get>("input").size(); - options->set("index", index); - } - - bool skipCost = options->get("skip-cost"); - auto encdec = models::createModelFromOptions( - options, skipCost ? models::usage::raw : models::usage::translation); - - LOG(info, "Loading scorer of type {} as feature {}", type, fname); - - return New(encdec, fname, weight, ptr); -} - -std::vector> createScorers(Ptr options, const std::vector> models) { +std::vector> createScorers(Ptr options, const std::vector>& modelFiles) { std::vector> scorers; - std::vector weights(models.size(), 1.f); + std::vector weights(modelFiles.size(), 1.f); if(options->hasAndNotEmpty("weights")) weights = options->get>("weights"); bool isPrevRightLeft = false; // if the previous model was a right-to-left model size_t i = 0; - for(auto items : models) { + for(auto modelFile : modelFiles) { std::string fname = "F" + std::to_string(i); // load options specific for the scorer auto modelOptions = options->clone(); if(!options->get("ignore-model-config")) { - YAML::Node modelYaml; - io::getYamlFromModel(modelYaml, "special:model.yml", items); + YAML::Node modelYaml = modelFile->getYamlFromModel("special:model.yml"); if(!modelYaml.IsNull()) { LOG(info, "Loaded model config"); modelOptions->merge(modelYaml, true); @@ -74,7 +51,7 @@ std::vector> createScorers(Ptr options, const std::vector 1 && modelOptions->has("right-left")) { + if(modelFiles.size() > 1 && modelOptions->has("right-left")) { if(i == 0) { isPrevRightLeft = modelOptions->get("right-left"); } else { @@ -85,7 +62,7 @@ std::vector> createScorers(Ptr options, const std::vector> createScorers(Ptr options, const std::vector> createScorers(Ptr options) { - std::vector> model_items; + std::vector> modelFiles; auto models = options->get>("models"); for(auto model : models) { - auto items = io::loadItems(model); - model_items.push_back(std::move(items)); + auto modelFile = New(model); + modelFiles.push_back(modelFile); } - return createScorers(options, model_items); -} - -std::vector> createScorers(Ptr options, const std::vector& ptrs) { - std::vector> 
scorers; - - std::vector weights(ptrs.size(), 1.f); - if(options->hasAndNotEmpty("weights")) - weights = options->get>("weights"); - - size_t i = 0; - for(auto ptr : ptrs) { - std::string fname = "F" + std::to_string(i); - - // load options specific for the scorer - auto modelOptions = options->clone(); - if(!options->get("ignore-model-config")) { - YAML::Node modelYaml; - io::getYamlFromModel(modelYaml, "special:model.yml", ptr); - if(!modelYaml.IsNull()) { - LOG(info, "Loaded model config"); - modelOptions->merge(modelYaml, true); - } - else { - LOG(warn, "No model settings found in model file"); - } - } - - scorers.push_back(scorerByType(fname, weights[i], ptr, modelOptions)); - i++; - } - - return scorers; -} - -std::vector> createScorers(Ptr options, const std::vector& mmaps) { - std::vector ptrs; - for(const auto& mmap : mmaps) { - ABORT_IF(!mmap.is_mapped(), "Memory mapping did not succeed"); - ptrs.push_back(mmap.data()); - } - return createScorers(options, ptrs); + return createScorers(options, modelFiles); } } // namespace marian diff --git a/src/translator/scorers.h b/src/translator/scorers.h index 72ebff5df..333e49108 100644 --- a/src/translator/scorers.h +++ b/src/translator/scorers.h @@ -4,7 +4,6 @@ #include "data/shortlist.h" #include "models/model_factory.h" -#include "3rd_party/mio/mio.hpp" namespace marian { @@ -72,47 +71,25 @@ class ScorerWrapperState : public ScorerState { class ScorerWrapper : public Scorer { private: Ptr encdec_; - std::string fname_; - std::vector items_; - const void* ptr_; + Ptr modelWeights_; public: ScorerWrapper(Ptr encdec, const std::string& name, float weight, - std::vector& items) + Ptr modelFile) : Scorer(name, weight), encdec_(std::static_pointer_cast(encdec)), - items_(items), - ptr_{0} {} - - ScorerWrapper(Ptr encdec, - const std::string& name, - float weight, - const std::string& fname) - : Scorer(name, weight), - encdec_(std::static_pointer_cast(encdec)), - fname_(fname), - ptr_{0} {} - - ScorerWrapper(Ptr encdec, - const std::string& name, - float weight, - const void* ptr) - : Scorer(name, weight), - encdec_(std::static_pointer_cast(encdec)), - ptr_{ptr} {} + modelWeights_(modelFile) + {} virtual ~ScorerWrapper() {} virtual void init(Ptr graph) override { graph->switchParams(getName()); - if(!items_.empty()) - encdec_->load(graph, items_); - else if(ptr_) - encdec_->mmap(graph, ptr_); - else - encdec_->load(graph, fname_); + // @TODO: unify to a single call, this logic should happen in modelFile_ + if(modelWeights_) + encdec_->load(graph, modelWeights_); } virtual void clear(Ptr graph) override { @@ -154,26 +131,7 @@ class ScorerWrapper : public Scorer { } }; -Ptr scorerByType(const std::string& fname, - float weight, - std::vector items, - Ptr options); - -Ptr scorerByType(const std::string& fname, - float weight, - const std::string& model, - Ptr config); - - std::vector> createScorers(Ptr options); -std::vector> createScorers(Ptr options, const std::vector> models); - -Ptr scorerByType(const std::string& fname, - float weight, - const void* ptr, - Ptr config); - -std::vector> createScorers(Ptr options, const std::vector& ptrs); -std::vector> createScorers(Ptr options, const std::vector& mmaps); +std::vector> createScorers(Ptr options, const std::vector>& models); } // namespace marian diff --git a/src/translator/translator.h b/src/translator/translator.h index 498ef65b3..081b06c42 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -19,9 +19,6 @@ #include "models/model_task.h" #include 
"translator/scorers.h" -// currently for diagnostics only, will try to mmap files ending in *.bin suffix when enabled. -#include "3rd_party/mio/mio.hpp" - namespace marian { template @@ -36,9 +33,7 @@ class Translate : public ModelTask { Ptr shortlistGenerator_; size_t numDevices_; - - std::vector model_mmaps_; // map - std::vector> model_items_; // non-mmap + std::vector> modelWeights_; public: Translate(Ptr options) @@ -70,20 +65,18 @@ class Translate : public ModelTask { scorers_.resize(numDevices_); graphs_.resize(numDevices_); - auto models = options->get>("models"); - if(options_->get("model-mmap", false)) { - for(auto model : models) { - ABORT_IF(!io::isBin(model), "Non-binarized models cannot be mmapped"); - LOG(info, "Loading model from {}", model); - model_mmaps_.push_back(mio::mmap_source(model)); - } - } - else { - for(auto model : models) { - LOG(info, "Loading model from {}", model); - auto items = io::loadItems(model); - model_items_.push_back(std::move(items)); - } + auto modelPaths = options->get>("models"); + + // We now opportunistically mmap the model files anyways, but to keep backward compatibility + // with the old --model-mmap option, we now croak if mmap is explicitly requested during decoding + // but not possible in the actual graph, e.g. if --model-mmap is specified but the model file is + // a npz-file or we decode on the GPU (will croak in different places). + bool mmap = options_->get("model-mmap", false); + auto mmapMode = mmap ? io::MmapMode::RequiredMmap : io::MmapMode::OpportunisticMmap; + + for(auto modelPath : modelPaths) { + LOG(info, "Loading model from {}", modelPath); + modelWeights_.push_back(New(modelPath, mmapMode)); } size_t id = 0; @@ -101,13 +94,7 @@ class Translate : public ModelTask { graph->reserveWorkspaceMB(options_->get("workspace")); graphs_[id] = graph; - std::vector> scorers; - if(options_->get("model-mmap", false)) { - scorers = createScorers(options_, model_mmaps_); - } - else { - scorers = createScorers(options_, model_items_); - } + std::vector> scorers = createScorers(options_, modelWeights_); for(auto scorer : scorers) { scorer->init(graph); @@ -242,6 +229,8 @@ class TranslateService : public ModelServiceTask { Ptr trgVocab_; Ptr shortlistGenerator_; + std::vector> modelFiles_; + size_t numDevices_; public: @@ -279,11 +268,9 @@ class TranslateService : public ModelServiceTask { numDevices_ = devices.size(); // preload models - std::vector> model_items_; auto models = options->get>("models"); for(auto model : models) { - auto items = io::loadItems(model); - model_items_.push_back(std::move(items)); + modelFiles_.push_back(New(model)); } // initialize scorers @@ -301,7 +288,7 @@ class TranslateService : public ModelServiceTask { graph->reserveWorkspaceMB(options_->get("workspace")); graphs_.push_back(graph); - auto scorers = createScorers(options_, model_items_); + auto scorers = createScorers(options_, modelFiles_); for(auto scorer : scorers) { scorer->init(graph); if(shortlistGenerator_) From 1656b9c0f6e238d3f39d5b43f874ee552f5eb49c Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Wed, 24 Jan 2024 01:21:51 +0000 Subject: [PATCH 09/26] Merged PR 32600: Full Comet-Kiwi implementation, partial xComet-XL/XXL This PR implements * Comet-Kiwi - fully functional * xComet-XL and xComet-XXL - scores for regressor part fully matching, MQM partial scores not implemented yet. 
--- CHANGELOG.md | 2 + VERSION | 2 +- scripts/comet/comet2marian.py | 111 ++++++++++++++++------ src/common/config_parser.cpp | 6 ++ src/data/corpus_base.cpp | 36 +++---- src/data/corpus_base.h | 24 ++--- src/layers_new/neuralnet.h | 80 ++++++++-------- src/layers_new/transformer.h | 174 +++++++++++++++++++--------------- src/models/comet_qe.h | 111 +++++++++++++--------- src/models/model_factory.cpp | 67 ++++++++----- src/training/graph_group.cpp | 4 +- src/translator/scorers.h | 5 +- 12 files changed, 370 insertions(+), 252 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 772349e3d..854162b6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Fixed compilation with clang 16.0.6 ### Added +- Added implementation of COMET-KIWI +- Added implementation of xCOMET-XL/XXL regressor parts (MQM interpolation missing for now) - Added implementation of COMET-22 (reference-based) model and conversion - Added sparsemax operator (slow version) - Added sampling variants nucleus and epsilon, e.g. `--output-sampling nucleus 0.9` and `--output-sampling epsilon 0.02`, respectively. diff --git a/VERSION b/VERSION index 2f107c43d..5a8f2d3ca 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.19 +v1.12.20 diff --git a/scripts/comet/comet2marian.py b/scripts/comet/comet2marian.py index 6b4f557db..09c369260 100755 --- a/scripts/comet/comet2marian.py +++ b/scripts/comet/comet2marian.py @@ -14,7 +14,8 @@ # supported_comets = [m for m in available_metrics if 'qe' in m.lower()] supported_comets = [ 'wmt20-comet-qe-da', 'wmt20-comet-qe-da-v2', 'wmt21-comet-qe-mqm', 'wmt21-comet-qe-da', - 'wmt20-comet-da', 'wmt21-comet-da', 'Unbabel/wmt22-comet-da' + 'wmt20-comet-da', 'wmt21-comet-da', 'Unbabel/wmt22-comet-da', 'Unbabel/wmt22-cometkiwi-da', + 'Unbabel/XCOMET-XL', 'Unbabel/XCOMET-XXL' ] log.basicConfig(level=log.INFO) @@ -92,6 +93,12 @@ def load_comet_model(model_path): config["type"] = "comet-qe" elif model_type == "XLMRobertaModel": config["type"] = "comet-qe" +elif model_type == "UnifiedMetric" or model_type == "XCOMETMetric": + config["type"] = "comet-unified" + config["input-join-fields"] = True + config["separator-symbol"] = "" + config["comet-use-separator"] = True + config["comet-pool"] = "cls" else: raise Exception(f'Unknown type of model {model_type}') @@ -100,17 +107,32 @@ def load_comet_model(model_path): config["transformer-ffn-depth"] = 2 config["transformer-ffn-activation"] = "gelu" # figure this out dynamically config["transformer-train-position-embeddings"] = True -config["transformer-preprocess"] = "" -config["transformer-postprocess"] = "dan" -config["transformer-postprocess-emb"] = "nd" + +# Roberta-XXL (hence XCOMET-XXL) has pre-norm +if model_type == "XCOMETMetric": # @TODO: make this depend on RobertaXL/XXL rather than model_type + config["transformer-preprocess"] = "n" + config["transformer-postprocess"] = "da" + config["transformer-postprocess-emb"] = "" + config["transformer-postprocess-top"] = "n" +else: + config["transformer-preprocess"] = "" + config["transformer-postprocess"] = "dan" + config["transformer-postprocess-emb"] = "nd" + config["bert-train-type-embeddings"] = False config["bert-type-vocab-size"] = 0 config["comet-prepend-zero"] = True +print(cometModel.hparams) + config["comet-mix"] = cometModel.hparams.get("layer") == "mix" config["comet-mix-norm"] = cometModel.hparams.get('layer_norm', False) config["comet-mix-transformation"] = 
cometModel.hparams.get("layer_transformation", "softmax"); +# they have a bug in their code that makes this always true +if model_type == "UnifiedMetric" or model_type == "XCOMETMetric": + config["comet-mix-transformation"] = "softmax" + if not args.roberta: config["comet-final-sigmoid"] = args.add_sigmoid config["comet-pooler-ffn"] = [2048, 1024] @@ -132,26 +154,32 @@ def yaml2np(config): return npDesc def convert(pd, srcs, trg, transpose=True, bias=False): - if len(srcs) == 1: - for src in srcs: - num = pd[src].detach().numpy() - if bias: - marianModel[trg] = num.copy() - else: - if transpose: - marianModel[trg] = np.transpose(num).copy() - else: - marianModel[trg] = num - else: # path that joins matrices together for fused self-attention - nums = [pd[src].detach().numpy() for src in srcs] + # make sure exactly one element of list srcs exists in dictionary pd + found = sum([src in pd for src in srcs]) + assert found == 1, f"Found {found} of {srcs} in {pd}" + + for src in srcs: + if src not in pd: + continue + num = pd[src].detach().numpy() if bias: - nums = [np.transpose(num) for num in nums] - marianModel[trg] = np.stack(nums, axis=0).copy() + marianModel[trg] = num.copy() + else: + if transpose: + marianModel[trg] = np.transpose(num).copy() + else: + marianModel[trg] = num + + +def match(regex, string): + import re + return re.search(regex, string) is not None def extract(layer, nth, level): name = type(layer).__name__ print(" " * level, nth, name) - if "RobertaLayer" in name: + + if match(r"Roberta(XL+)?Layer", name): pd = dict(layer.named_parameters()) for n in pd: print(" " * (level + 1), n, pd[n].shape) @@ -178,8 +206,12 @@ def extract(layer, nth, level): convert(pd, ["attention.output.dense.bias"], f"{blockPrefix}->selfAttention->oProj->bias", bias=True) # self-attention layer-norm - convert(pd, ["attention.output.LayerNorm.weight"], f"{blockPrefix}->postprocessor->norm->weight", bias=True) - convert(pd, ["attention.output.LayerNorm.bias"], f"{blockPrefix}->postprocessor->norm->bias", bias=True) + if model_type == "XCOMETMetric": # @TODO: make this depend on RobertaXL/XXL rather than model_type + convert(pd, ["attention.self_attn_layer_norm.weight"], f"{blockPrefix}->preprocessor->norm->weight", bias=True) + convert(pd, ["attention.self_attn_layer_norm.bias"], f"{blockPrefix}->preprocessor->norm->bias", bias=True) + else: + convert(pd, ["attention.output.LayerNorm.weight"], f"{blockPrefix}->postprocessor->norm->weight", bias=True) + convert(pd, ["attention.output.LayerNorm.bias"], f"{blockPrefix}->postprocessor->norm->bias", bias=True) # ffn # first ffn layer @@ -190,15 +222,20 @@ def extract(layer, nth, level): # second ffn layer convert(pd, ["output.dense.weight"], f"{blockPrefix}->layers->at(3)->as()->weight") convert(pd, ["output.dense.bias"], f"{blockPrefix}->layers->at(3)->as()->bias", bias=True) + # ffn layer-norm - convert(pd, ["output.LayerNorm.weight"], f"{blockPrefix}->postprocessor->norm->weight", bias=True) - convert(pd, ["output.LayerNorm.bias"], f"{blockPrefix}->postprocessor->norm->bias", bias=True) + if model_type == "XCOMETMetric": # @TODO: make this depend on RobertaXL/XXL rather than model_type + convert(pd, ["LayerNorm.weight"], f"{blockPrefix}->preprocessor->norm->weight", bias=True) + convert(pd, ["LayerNorm.bias"], f"{blockPrefix}->preprocessor->norm->bias", bias=True) + else: + convert(pd, ["output.LayerNorm.weight"], f"{blockPrefix}->postprocessor->norm->weight", bias=True) + convert(pd, ["output.LayerNorm.bias"], 
f"{blockPrefix}->postprocessor->norm->bias", bias=True) config["transformer-dim-ffn"] = pd["intermediate.dense.bias"].shape[-1] config["transformer-heads"] = layer.attention.self.num_attention_heads config["enc-depth"] += 1 - elif "RobertaEmbeddings" in name: + elif match(r"Roberta(XL+)?Embeddings", name): for n, p in layer.named_parameters(): print(" " * (level + 1), n, p.shape) pd = dict(layer.named_parameters()) @@ -208,6 +245,10 @@ def extract(layer, nth, level): npWemb = npWembTemp[1:-1, :].copy() npWemb[0, :] = npWembTemp[0, :] npWemb[2, :] = npWembTemp[2, :] + + # XCOMET-XXL has some additional tokens (why?), we truncate it back to normal size + npWemb = npWemb[0:250000, :].copy() + marianModel["Wemb"] = npWemb prefix = "CometEncoder" @@ -217,14 +258,26 @@ def extract(layer, nth, level): npPos = npPos[2:, :].copy() marianModel[f"{prefix}->encoder->positionEmbedding->embeddings"] = npPos - # post-embedding layer normalization - convert(pd, ["LayerNorm.weight"], f"{prefix}->encoder->preprocessor->norm->weight", bias=True) - convert(pd, ["LayerNorm.bias"], f"{prefix}->encoder->preprocessor->norm->bias", bias=True) - config["dim-emb"] = npWemb.shape[1] config["dim-vocabs"] = [ npWemb.shape[0] ] config["max-length"] = npPos.shape[0] + elif match(r"Roberta(XL+)?Model", name): + pd = dict(layer.named_parameters()) + prefix = "CometEncoder" + + # post-embedding layer normalization + if model_type == "XCOMETMetric": # @TODO: make this depend on RobertaXL/XXL rather than model_type + convert(pd, ["encoder.LayerNorm.weight"], f"{prefix}->encoder->postprocessor->norm->weight", bias=True) + convert(pd, ["encoder.LayerNorm.bias"], f"{prefix}->encoder->postprocessor->norm->bias", bias=True) + else: + convert(pd, ["embeddings.LayerNorm.weight"], f"{prefix}->encoder->preprocessor->norm->weight", bias=True) + convert(pd, ["embeddings.LayerNorm.bias"], f"{prefix}->encoder->preprocessor->norm->bias", bias=True) + + # on this level we actually keep recursing + recurse(layer, level + 1) + + elif name == "LayerwiseAttention": for n, p in layer.named_parameters(): print(" " * (level + 1), n, p.shape) @@ -232,7 +285,7 @@ def extract(layer, nth, level): # mix layers weights = [] - for i in range(25): + for i in range(config["enc-depth"] + 1): weights.append(pd[f"scalar_parameters.{i}"].detach().numpy()) marianModel["CometEncoder->encoder->weights"] = np.concatenate(weights).copy() diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index ec85e40ad..741a3915c 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -249,6 +249,10 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { "Possible values: sequence, class, alignment, weight. 
" "You need to provide one type per input file (if --train-sets) or per TSV field (if --tsv).", {}); + cli.add("--input-join-fields", + "Join input fields (from files or TSV) into a single sequence " + "(mostly used single-encoder models like BLEURT and COMET-KIWI)", + false); cli.add("--best-deep", "Use Edinburgh deep RNN configuration (s2s)"); cli.add("--tied-embeddings", @@ -364,6 +368,7 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { cli.add("--comet-mix", "Mix encoder layers to produce embedding"); cli.add("--comet-mix-norm", "Normalize layers prior to mixing"); + cli.add("--comet-pool", "Pooling operation over time dimension (avg, cls, max)", "avg"); cli.add("--comet-mix-transformation", "Which transformation to apply to layer mixing (softmax [default] or sparsemax)", "softmax"); cli.add("--comet-dropout", "Dropout for pooler layers", 0.1f); cli.add("--comet-mixup", "Alpha parameter for Beta distribution for mixup", 0.0f); @@ -371,6 +376,7 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { cli.add("--comet-augment-bad", "Fraction of bad examples added via shuffling for class/label 0.f", 0.0f); cli.add>("--comet-pooler-ffn", "Hidden sizes for comet pooler", {2048, 1024}); cli.add("--comet-prepend-zero", "Add a start symbol to batch entries"); + cli.add("--comet-use-separator", "Add a sentence separator to batch entries when joining source, target and mt", false); #ifdef CUDNN cli.add("--char-stride", diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp index 0ef804b1c..47381d9b9 100644 --- a/src/data/corpus_base.cpp +++ b/src/data/corpus_base.cpp @@ -60,9 +60,10 @@ CorpusBase::CorpusBase(const std::vector& paths, maxLengthCrop_(options_->get("max-length-crop")), rightLeft_(options_->get("right-left")), prependZero_(options_->get("comet-prepend-zero", false)), + joinFields_(options_->get("input-join-fields", false)), + insertSeparator_(options_->get("comet-use-separator", false)), tsv_(options_->get("tsv", false)), - tsvNumInputFields_(getNumberOfTSVInputFields(options)), - joinFields_(options_->get("input-join-fields", false)) { + tsvNumInputFields_(getNumberOfTSVInputFields(options)) { // TODO: support passing only one vocab file if we have fully-tied embeddings if(tsv_) { ABORT_IF(tsvNumInputFields_ != vocabs_.size(), @@ -87,9 +88,10 @@ CorpusBase::CorpusBase(Ptr options, bool translate, size_t seed) maxLengthCrop_(options_->get("max-length-crop")), rightLeft_(options_->get("right-left")), prependZero_(options_->get("comet-prepend-zero", false)), + joinFields_(options_->get("input-join-fields", false)), + insertSeparator_(options_->get("comet-use-separator", false)), tsv_(options_->get("tsv", false)), - tsvNumInputFields_(getNumberOfTSVInputFields(options)), - joinFields_(options_->get("input-join-fields", false)) { + tsvNumInputFields_(getNumberOfTSVInputFields(options)) { bool training = !translate; if(training) @@ -361,7 +363,7 @@ CorpusBase::CorpusBase(Ptr options, bool translate, size_t seed) Ptr vocab = New(options_, i); vocabDims[i] = (int) vocab->load(vocabPaths[i], maxVocabs[i]); vocabs_.emplace_back(vocab); - } + } // TODO: As above, this is not nice as it modifies the option object and needs to expose the changes // outside the corpus as models need to know about the vocabulary size; extract the vocab @@ -430,18 +432,20 @@ void CorpusBase::addWordsToSentenceTuple(const std::string& line, // This handles adding starts symbols for COMET () and BERT/BLEURT ([CLS]) bool prepend = prependZero_ && (!joinFields_ || (joinFields_ && batchIndex 
== 0)); - if(prepend && inputTypes[batchIndex] == "sequence") { - auto prependedWord = Word::fromWordIndex(0); - words.insert(words.begin(), prependedWord); - } - + if(prepend && inputTypes[batchIndex] == "sequence") + words.insert(words.begin(), Word::fromWordIndex(0)); + + bool prependSep = insertSeparator_ && joinFields_ && batchIndex > 0; + if(prependSep && inputTypes[batchIndex] == "sequence") + words.insert(words.begin(), vocabs_[batchIndex]->getSepId()); + // if fields are joined and the current sentence is not the first one, we need to make sure that // the current sentence is not longer than the maximum length minus the length of the previous sentence - // (minus 1 for the separator token) + // (minus 1 for the separator token or 2 if we also add a separator token) size_t localMaxLength = maxLength_; if(joinFields_ && !tup.empty()) - localMaxLength = std::max(1, (int)maxLength_ - (int)tup.back().size()); - + localMaxLength = std::max(1 + (int)prependSep, (int)maxLength_ - (int)tup.back().size()); + // if the current sentence is longer than the maximum length, we need to crop it if(maxLengthCrop_ && words.size() > localMaxLength) { words.resize(localMaxLength); @@ -472,7 +476,7 @@ void CorpusBase::addAlignmentToSentenceTuple(const std::string& line, size_t srcEosPos = tup[0].size() - 1; size_t tgtEosPos = tup[1].size() - 1; - auto align = WordAlignment(line, srcEosPos, tgtEosPos); + auto align = WordAlignment(line, srcEosPos, tgtEosPos); tup.setAlignment(align); } @@ -497,10 +501,10 @@ void CorpusBase::addWeightsToSentenceTuple(const std::string& line, SentenceTupl void CorpusBase::addAlignmentsToBatch(Ptr batch, const std::vector& batchVector) { std::vector aligns; - + int dimBatch = (int)batch->getSentenceIds().size(); aligns.reserve(dimBatch); - + for(int b = 0; b < dimBatch; ++b) { // If the batch vector is altered within marian by, for example, case augmentation, // the guided alignments we received for this tuple cease to be valid. diff --git a/src/data/corpus_base.h b/src/data/corpus_base.h index 1e28da7f4..074689804 100644 --- a/src/data/corpus_base.h +++ b/src/data/corpus_base.h @@ -56,15 +56,15 @@ class SentenceTupleImpl { * @brief Returns whether this Tuple was altered or augmented from what * was provided to Marian in input. */ - bool isAltered() const { - return altered_; + bool isAltered() const { + return altered_; } /** * @brief Mark that this Tuple was internally altered or augmented by Marian */ - void markAltered() { - altered_ = true; + void markAltered() { + altered_ = true; } /** @@ -79,7 +79,7 @@ class SentenceTupleImpl { * * @param words A vector of word indices. */ - void appendToBack(const Words& words) { + void appendToBack(const Words& words) { if(tuple_.empty()) { tuple_.push_back(words); } else { @@ -155,11 +155,11 @@ class SentenceTuple { * @brief Creates an empty tuple with no associated future. 
*/ SentenceTuple() {} - - SentenceTuple(const SentenceTupleImpl& tupImpl) + + SentenceTuple(const SentenceTupleImpl& tupImpl) : impl_(std::make_shared(tupImpl)) {} - SentenceTuple(std::future&& fImpl) + SentenceTuple(std::future&& fImpl) : fImpl_(new std::future(std::move(fImpl))) {} SentenceTupleImpl& get() const { @@ -466,7 +466,7 @@ class CorpusBatch : public Batch { if(options->get("guided-alignment", std::string("none")) != "none") { // @TODO: if > 1 encoder, verify that all encoders have the same sentence lengths - + std::vector alignment; for(size_t k = 0; k < batchSize; ++k) { data::WordAlignment perSentence; @@ -658,13 +658,15 @@ class CorpusBase : public DatasetBase separator will demark the fields (mostly used for BLEURT and COMET-KIWI) + bool insertSeparator_{false}; // when joining fields with joinFields_, additionally use this separator (mostly used for COMET-KIWI) + bool tsv_{false}; // true if the input is a single file with tab-separated values size_t tsvNumInputFields_{0}; // number of fields from the TSV input that are associated // with vocabs, i.e. excluding fields with alignment or // weights, only if --tsv - bool joinFields_{false}; // if true when given a TSV file or multiple inputs, join them together with a specified separator. - /** * @brief Determine the number of fields from the TSV input that are associated with * vocabs, i.e. excluding fields that contain alignment or weights diff --git a/src/layers_new/neuralnet.h b/src/layers_new/neuralnet.h index 923838aa0..c0912634f 100644 --- a/src/layers_new/neuralnet.h +++ b/src/layers_new/neuralnet.h @@ -9,15 +9,15 @@ namespace nn { static inline Expr swapTimeBatch(Expr input) { return swapAxes(atleast_4d(input), -2, -3); } /** - * A generic Activation function layer. Any unary Marian operator or function accepted by - * `std::function` can be turned into an activation function like this: + * A generic Activation function layer. Any unary Marian operator or function accepted by + * `std::function` can be turned into an activation function like this: ``` auto reluLayer = New(graph, (Expr(*)(Expr))relu) ``` - * The function pointer cast may be required to disambiguate the operator name if operators - * of the same name but with a different sets of parameters exist, otherwise it can be dropped + * The function pointer cast may be required to disambiguate the operator name if operators + * of the same name but with a different sets of parameters exist, otherwise it can be dropped * or replaced with a more readable lambda function. - * + * * `Activation` will also accept lambdas for more complex activations: ``` // a reasonably accurate approximation of GELU @@ -30,11 +30,11 @@ class Activation : public Layer, public IUnaryLayer { public: Activation(Ptr graph, - const std::function& actFn) + const std::function& actFn) : Layer(graph), actFn(actFn) {} virtual ~Activation() = default; - + Expr apply(Expr x) const override { return actFn(x); } @@ -68,7 +68,7 @@ struct Swish final : public Activation { // Factory for activation function layers from name as string. 
Ptr activationLayerByName(Ptr graph, const std::string& actName); -// Applies a linear transformation to the incoming data: y = xA^T + b +// Applies a linear transformation to the incoming data: y = xA^T + b struct Linear : public Layer, public IUnaryLayer { Expr weight; Expr bias; @@ -79,7 +79,7 @@ struct Linear : public Layer, public IUnaryLayer { Ptr init; // Typical constructor that can take an initializer function - Linear(Ptr graph, + Linear(Ptr graph, int dimOut, bool useBias = true, bool transposed = false, @@ -108,22 +108,22 @@ struct Linear : public Layer, public IUnaryLayer { } else { registerParameterLazy(weight, Shape({ dimIn, dimOut }), init); } - + if(useBias) { registerParameterLazy(bias, Shape({ dimOut }), inits::zeros()); } Type outputType = x->value_type(); if(useBias) - return marian::affine(x, - marian::cast(weight, outputType), - marian::cast(bias, outputType), - /*transA=*/false, + return marian::affine(x, + marian::cast(weight, outputType), + marian::cast(bias, outputType), + /*transA=*/false, /*transB=*/transposed); else - return marian::dot(x, - marian::cast(weight, outputType), - /*transA=*/false, + return marian::dot(x, + marian::cast(weight, outputType), + /*transA=*/false, /*transB=*/transposed); } }; @@ -131,15 +131,15 @@ struct Linear : public Layer, public IUnaryLayer { struct Dropout final : public Layer, public IUnaryLayer { float dropoutProbability; Shape::Axes dropoutAxes{{-2, -1}}; - - Dropout(Ptr graph, + + Dropout(Ptr graph, float dropoutProbability, - const Shape::Axes& dropoutAxes) + const Shape::Axes& dropoutAxes) : Layer(graph), dropoutProbability(dropoutProbability), dropoutAxes(dropoutAxes) {} - Dropout(Ptr graph, - float dropoutProbability) + Dropout(Ptr graph, + float dropoutProbability) : Layer(graph), dropoutProbability(dropoutProbability) {} @@ -170,24 +170,24 @@ struct LinearReluDropout final : public Linear { Shape::Axes dropoutAxes{{-2, -1}}; // Typical constructor that can take an initializer function - LinearReluDropout(Ptr graph, + LinearReluDropout(Ptr graph, int dimOut, float dropoutProbability, bool useBias = true, bool transposed = false, Ptr init = inits::glorotUniform()) - : Linear(graph, dimOut, useBias, transposed, init), + : Linear(graph, dimOut, useBias, transposed, init), dropoutProbability(dropoutProbability) {} // Typical constructor that can take an initializer function - LinearReluDropout(Ptr graph, + LinearReluDropout(Ptr graph, int dimOut, float dropoutProbability, const Shape::Axes& dropoutAxes, bool useBias = true, bool transposed = false, Ptr init = inits::glorotUniform()) - : Linear(graph, dimOut, useBias, transposed, init), + : Linear(graph, dimOut, useBias, transposed, init), dropoutProbability(dropoutProbability), dropoutAxes(dropoutAxes) {} Expr apply(Expr x) const override { @@ -199,7 +199,7 @@ struct LinearReluDropout final : public Linear { } else { registerParameterLazy(weight, Shape({ dimIn, dimOut }), init); } - + if(useBias) { registerParameterLazy(bias, Shape({ dimOut }), inits::zeros()); } @@ -223,21 +223,21 @@ struct LinearReluDropout final : public Linear { struct Norm : public Layer, public IUnaryLayer { Expr weight{nullptr}; // = scale Expr bias{nullptr}; - + bool useScale{true}; bool useBias{true}; bool elementwise{true}; float eps{1e-5f}; - Norm(Ptr graph, - bool useScale = true, - bool useBias = true, - bool elementwise = true, + Norm(Ptr graph, + bool useScale = true, + bool useBias = true, + bool elementwise = true, float eps = 1e-5f) - : Layer(graph), - useScale(useScale), - 
useBias(useBias), - elementwise(elementwise), + : Layer(graph), + useScale(useScale), + useBias(useBias), + elementwise(elementwise), eps(eps) {} virtual Expr getScale(int dimModel) const { @@ -264,7 +264,7 @@ struct Norm : public Layer, public IUnaryLayer { }; struct LayerNorm : public Norm { - LayerNorm(Ptr graph, + LayerNorm(Ptr graph, bool useScale = true, bool useBias = true, bool elementwise = true, @@ -281,9 +281,9 @@ struct LayerNorm : public Norm { }; struct RMSNorm : public Norm { - RMSNorm(Ptr graph, - bool useScale = true, - bool useBias = true, + RMSNorm(Ptr graph, + bool useScale = true, + bool useBias = true, bool elementwise = true, float eps = 1e-5f) : Norm(graph, useScale, useBias, elementwise, eps) diff --git a/src/layers_new/transformer.h b/src/layers_new/transformer.h index ccce35d13..d80fe102f 100644 --- a/src/layers_new/transformer.h +++ b/src/layers_new/transformer.h @@ -24,7 +24,7 @@ struct TransformerPrePostProcessor final : public Layer, public IBinaryLayer { TransformerPrePostProcessor(Ptr graph, const std::string& actionDesc, float dropoutProbablity) - : Layer(graph), + : Layer(graph), actionDesc(actionDesc) { for(char a : actionDesc) { @@ -45,11 +45,11 @@ struct TransformerPrePostProcessor final : public Layer, public IBinaryLayer { } virtual ~TransformerPrePostProcessor() = default; - + Expr apply(Expr input, Expr previous = nullptr) const override { Expr output = input; for(char action : actionDesc) { - if(action == 'd') + if(action == 'd') output = dropout->apply(output); else if(action == 'a' && previous) output = output + previous; @@ -64,7 +64,7 @@ struct TransformerPrePostProcessor final : public Layer, public IBinaryLayer { } }; -/** +/** * This is a typical transformer self-attention block. The default configuration will * use a multi-head multiplicative self-attention layer, followed by dropout, the skip * connection and layer normalization (dan) in the post-processor. The pre-processor does @@ -76,13 +76,13 @@ class TransformerSelfAttentionBlock final : public LayerWithOptions, public IBin Ptr selfAttention; Ptr postprocessor; - TransformerSelfAttentionBlock(Ptr graph, + TransformerSelfAttentionBlock(Ptr graph, Ptr options) : LayerWithOptions(graph, options) { preprocessor = New( - graph, - opt("transformer-preprocess", ""), + graph, + opt("transformer-preprocess", ""), opt("transformer-dropout", 0.f)); registerLayer(preprocessor); @@ -90,8 +90,8 @@ class TransformerSelfAttentionBlock final : public LayerWithOptions, public IBin registerLayer(selfAttention); postprocessor = New( - graph, - opt("transformer-postprocess", ""), + graph, + opt("transformer-postprocess", ""), opt("transformer-dropout", 0.f)); registerLayer(postprocessor); } @@ -104,9 +104,9 @@ class TransformerSelfAttentionBlock final : public LayerWithOptions, public IBin } }; -/** +/** * This is a typical transformer filter (1-dimensional convolution) block. The default configuration will - * use scale up to a larger dimension, apply a ReLU activation and scale down again, followed by dropout, + * use scale up to a larger dimension, apply a ReLU activation and scale down again, followed by dropout, * the skip connection and layer normalization (dan) in the post-processor. The pre-processor does * nothing in the default configuration. 
*/ @@ -115,18 +115,18 @@ struct TransformerFilterBlock final : public LayerWithOptions, public IUnaryLaye Ptr layers; Ptr postprocessor; bool isDecoder{false}; - - TransformerFilterBlock(Ptr graph, + + TransformerFilterBlock(Ptr graph, Ptr options, bool isDecoder = false) : LayerWithOptions(graph, options), isDecoder(isDecoder) { preprocessor = New( - graph, - opt("transformer-preprocess", ""), + graph, + opt("transformer-preprocess", ""), opt("transformer-dropout", 0.f)); registerLayer(preprocessor); - + int modelDim = opt("transformer-dim-model", opt("dim-emb")); int ffnDim = opt("transformer-dim-ffn"); if(isDecoder && opt("transformer-decoder-dim-ffn") != 0) @@ -144,7 +144,7 @@ struct TransformerFilterBlock final : public LayerWithOptions, public IUnaryLaye // assemble filter of given depth layers = New(graph); registerLayer(layers); - + if(actName == "relu") { layers->append(New(graph, ffnDim, ffnDropoutProbability)); } else { @@ -164,7 +164,7 @@ struct TransformerFilterBlock final : public LayerWithOptions, public IUnaryLaye layers->append(New(graph, modelDim)); postprocessor = New( - graph, + graph, opt("transformer-postprocess", ""), opt("transformer-dropout", 0.f)); registerLayer(postprocessor); @@ -178,7 +178,7 @@ struct TransformerFilterBlock final : public LayerWithOptions, public IUnaryLaye } }; -/** +/** * A full transformer encoder layer consists of a self-attention block followed by * a filter block. Skip connections etc. are handled inside the blocks, see above. */ @@ -186,13 +186,13 @@ struct TransformerEncoderLayer final : public LayerWithOptions, public IBinaryLa Ptr selfAttentionBlock; Ptr filterBlock; - TransformerEncoderLayer(Ptr graph, + TransformerEncoderLayer(Ptr graph, Ptr options) : LayerWithOptions(graph, options) { selfAttentionBlock = New(graph, options); registerLayer(selfAttentionBlock); - + filterBlock = New(graph, options); registerLayer(filterBlock); } @@ -200,30 +200,38 @@ struct TransformerEncoderLayer final : public LayerWithOptions, public IBinaryLa Expr apply(Expr input, Expr mask = nullptr) const override { Expr output = selfAttentionBlock->apply(input, mask); output = filterBlock->apply(output); - + checkpoint(output); // A full transformer block is a good point for gradient checkpointing (currently manual) - + return output; } }; /** - * A full transformer encoder stack. Before applying multiple transformer layers (depth of the encoder), we + * A full transformer encoder stack. Before applying multiple transformer layers (depth of the encoder), we * add positional embeddings and apply post-processing actions to the combined embeddings. Due to backward-compatiblity - * with RNN models and for easier beam-search we transpose batch and time dimensions on input and output. + * with RNN models and for easier beam-search we transpose batch and time dimensions on input and output. * @TODO: get rid of these transposes. */ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { +public: Ptr positionEmbedding; Ptr maskProcessor; Ptr preprocessor; Ptr layers; Ptr postprocessor; - TransformerEncoder(Ptr graph, +protected: // @TODO: should this be public? 
+ // collect hidden states as we step through the layers + mutable bool keepHiddenStates{false}; + mutable std::vector hiddenStates; + // apply this function to hidden states before collecting them + mutable std::function hiddenTransformFn = [](Expr x) { return x; }; + +public: + TransformerEncoder(Ptr graph, Ptr options) - : LayerWithOptions(graph, options) - { + : LayerWithOptions(graph, options) { if(!opt("transformer-disable-position-embeddings", false)) { positionEmbedding = positionEmbeddingFromOptions(graph, options, /*positionAxis=*/-2); registerLayer(positionEmbedding); @@ -233,8 +241,8 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { registerLayer(maskProcessor); preprocessor = New( - graph, - opt("transformer-postprocess-emb", ""), + graph, + opt("transformer-postprocess-emb", ""), opt("transformer-dropout", 0.f)); registerLayer(preprocessor); @@ -242,15 +250,15 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { registerLayer(layers); for(int i = 0; i < opt("enc-depth"); ++i) { auto transformerEncoderLayer = New(graph, options); - // example of changing linear layer init functions burried deep in the model + // example of changing linear layer init functions burried deep in the model if(opt("transformer-depth-scaling", false)) for(auto linear : transformerEncoderLayer->allLayers()) linear->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1)); - + if(opt("transformer-no-bias", false)) for(auto linear : transformerEncoderLayer->allLayers()) linear->useBias = false; - + if(opt("transformer-no-affine", false)) { for(auto norm : transformerEncoderLayer->allLayers()) { norm->useScale = false; @@ -261,8 +269,8 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { } postprocessor = New( - graph, - opt("transformer-postprocess-top", ""), + graph, + opt("transformer-postprocess-top", ""), opt("transformer-dropout", 0.f)); registerLayer(postprocessor); } @@ -274,9 +282,9 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { // dimensions. This order is more natural for the transformer, but more difficult to handle // during beam search or when using RNNs. Hence the input/output transpositions here. - // @TODO: still worth to review this whole transpose business across the tool. In the - // decoder state, Frank added information about batchMajor/timeMajor orientation. If we - // do that everywhere we can detect inconsistencies automatically. + // @TODO: still worth to review this whole transpose business across the tool. In the + // decoder state, Frank added information about batchMajor/timeMajor orientation. If we + // do that everywhere we can detect inconsistencies automatically. // reorganize batch and timestep auto output = swapTimeBatch(input); // [1, dimBatch, dimSrcWords, dimModel] if(mask) @@ -296,11 +304,16 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { auto logMask = maskProcessor->apply(output, mask); // traverse the layers, use the same mask for each - for(auto layer : *layers) + for(auto layer : *layers) { + if(keepHiddenStates) // note, with pre-norm, the hidden states will not be normed here. + hiddenStates.push_back(hiddenTransformFn(output)); output = layer->apply(output, logMask); + } // apply final postprocessor if required, e.g. 
final layer-norm for pre-norm or final skip connection output = postprocessor->apply(output, prevOutput); + if(keepHiddenStates) + hiddenStates.push_back(hiddenTransformFn(output)); // restore organization of batch and time steps. This is currently required // to make RNN-based decoders and beam search work with this. We are looking @@ -313,9 +326,14 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { output = swapTimeBatch(output); // [beam depth=1, max length, batch size, vector dim] return output; } + + virtual void clear() override { + LayerWithOptions::clear(); + hiddenStates.clear(); + } }; -/** +/** * This is a typical transformer cross-attention block. The default configuration will * use a multi-head multiplicative cross-attention layer, followed by dropout, the skip * connection and layer normalization (dan) in the post-processor. The pre-processor does @@ -327,23 +345,23 @@ class TransformerCrossAttentionBlock final : public LayerWithOptions, public ITe Ptr crossAttention; Ptr postprocessor; - TransformerCrossAttentionBlock(Ptr graph, + TransformerCrossAttentionBlock(Ptr graph, Ptr options) : LayerWithOptions(graph, options) { preprocessor = New( - graph, - opt("transformer-preprocess", ""), + graph, + opt("transformer-preprocess", ""), opt("transformer-dropout", 0.f)); registerLayer(preprocessor); - + // @TODO: factory to support different attention flavors? crossAttention = attentionFromOptions(graph, options); registerLayer(crossAttention); postprocessor = New( - graph, - opt("transformer-postprocess", ""), + graph, + opt("transformer-postprocess", ""), opt("transformer-dropout", 0.f)); registerLayer(postprocessor); } @@ -358,17 +376,17 @@ class TransformerCrossAttentionBlock final : public LayerWithOptions, public ITe class TransformerAutoRegressiveBlock : public LayerWithOptions, public IBinaryDecoderLayer { public: - TransformerAutoRegressiveBlock(Ptr graph, + TransformerAutoRegressiveBlock(Ptr graph, Ptr options) : LayerWithOptions(graph, options) {} - + virtual ~TransformerAutoRegressiveBlock() = default; using IBinaryDecoderLayer::apply; }; -/** - * This is a transformer RNN block. +/** + * This is a transformer RNN block. */ class TransformerRNNBlock final : public TransformerAutoRegressiveBlock { public: @@ -376,13 +394,13 @@ class TransformerRNNBlock final : public TransformerAutoRegressiveBlock { Ptr> rnn; Ptr postprocessor; - TransformerRNNBlock(Ptr graph, + TransformerRNNBlock(Ptr graph, Ptr options) : TransformerAutoRegressiveBlock(graph, options) { preprocessor = New( - graph, - opt("transformer-preprocess", ""), + graph, + opt("transformer-preprocess", ""), opt("transformer-dropout", 0.f)); registerLayer(preprocessor); @@ -392,8 +410,8 @@ class TransformerRNNBlock final : public TransformerAutoRegressiveBlock { registerLayer(rnn); postprocessor = New( - graph, - opt("transformer-postprocess", ""), + graph, + opt("transformer-postprocess", ""), opt("transformer-dropout", 0.f)); registerLayer(postprocessor); } @@ -406,22 +424,22 @@ class TransformerRNNBlock final : public TransformerAutoRegressiveBlock { } }; -/** +/** * A full transformer decoder layer consists of a self-attention block followed by - * cross-attention block and a filter block. Skip connections etc. are handled inside + * cross-attention block and a filter block. Skip connections etc. are handled inside * the blocks, see above. - * + * * For the self-attention block we need a special mask, usually a triangle mask that - * prohibits to look into the future. 
- * @TODO: should the triangle mask be constructed locally here? Would make sense, but expensive - * for many layers. + * prohibits to look into the future. + * @TODO: should the triangle mask be constructed locally here? Would make sense, but expensive + * for many layers. */ struct TransformerDecoderLayer final : public LayerWithOptions, public IQuaternaryDecoderLayer { Ptr autoRegressiveBlock; Ptr crossAttentionBlock; Ptr filterBlock; - TransformerDecoderLayer(Ptr graph, + TransformerDecoderLayer(Ptr graph, Ptr options) : LayerWithOptions(graph, options) { @@ -434,10 +452,10 @@ struct TransformerDecoderLayer final : public LayerWithOptions, public IQuaterna ABORT("Unknown auto-regression block type {}", autoRegressionType); } registerLayer(autoRegressiveBlock); - + crossAttentionBlock = New(graph, options); registerLayer(crossAttentionBlock); - + filterBlock = New(graph, options, /*isDecoder=*/true); registerLayer(filterBlock); } @@ -447,15 +465,15 @@ struct TransformerDecoderLayer final : public LayerWithOptions, public IQuaterna output = crossAttentionBlock->apply(output, context, logMask); output = filterBlock->apply(output); - checkpoint(output); // A full transformer block is a good point for gradient checkpointing (currently manual) + checkpoint(output); // A full transformer block is a good point for gradient checkpointing (currently manual) return output; } }; /** - * A full transformer decoder stack. Before applying multiple transformer layers (depth of the decoder), we + * A full transformer decoder stack. Before applying multiple transformer layers (depth of the decoder), we * add positional embeddings and apply post-processing actions to the combined embeddings. Due to backward-compatiblity - * with RNN models and for easier beam-search we transpose batch and time dimensions on input and output. + * with RNN models and for easier beam-search we transpose batch and time dimensions on input and output. * @TODO: get rid of these transposes. 
*/ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDecoderLayer { @@ -464,8 +482,8 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec Ptr preprocessor; Ptr layers; Ptr postprocessor; - - TransformerDecoder(Ptr graph, + + TransformerDecoder(Ptr graph, Ptr options) : LayerWithOptions(graph, options) { @@ -478,8 +496,8 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec registerLayer(maskProcessor); preprocessor = New( - graph, - opt("transformer-postprocess-emb", ""), + graph, + opt("transformer-postprocess-emb", ""), opt("transformer-dropout", 0.f)); registerLayer(preprocessor); @@ -505,7 +523,7 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec } auto currentLayer = layers->at(i)->as(); - // example of changing linear layer init functions burried deep in the model + // example of changing linear layer init functions burried deep in the model if(opt("transformer-depth-scaling", false)) { auto autoRegLayer = currentLayer->autoRegressiveBlock->as(); autoRegLayer->rnn->oProj->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1)); @@ -519,7 +537,7 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec if(opt("transformer-no-bias", false)) for(auto linear : currentLayer->allLayers()) linear->useBias = false; - + if(opt("transformer-no-affine", false)) { for(auto norm : currentLayer->allLayers()) { norm->useScale = false; @@ -529,8 +547,8 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec } postprocessor = New( - graph, - opt("transformer-postprocess-top", ""), + graph, + opt("transformer-postprocess-top", ""), opt("transformer-dropout", 0.f)); registerLayer(postprocessor); } @@ -550,19 +568,19 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec // @TODO: write function prepareMasks(); // @TODO: create triangle mask here and combine with inputMask LOG_ONCE(info, "Don't forget the triangle mask if required!"); - + if(inputMask) inputMask = swapTimeBatch(inputMask); // [dimBeam=1, dimBatch, dimTrgWords, dimModel=1] if(contextMask) contextMask = swapTimeBatch(contextMask); // [dimBeam=1, dimBatch, dimSrcWords, dimModel=1] - + // apply positional embeddings to contextual input if(positionEmbedding) output = positionEmbedding->apply(output, startPos); else output = std::sqrt((float)output->shape()[-1]) * output; - + // handle for skip connection at top auto prevOutput = output; diff --git a/src/models/comet_qe.h b/src/models/comet_qe.h index aa335696d..d818ae384 100644 --- a/src/models/comet_qe.h +++ b/src/models/comet_qe.h @@ -10,37 +10,46 @@ namespace models { class CometEncoder final : public nn::TransformerEncoder { private: + Expr cometPool(Expr x, Expr binaryMask) const { + auto poolType = opt("comet-pool", "avg"); + if(poolType == "avg") + return sum(x * binaryMask, /*axis=*/-2) / sum(binaryMask, /*axis=*/-2); + else if(poolType == "max") + return max(x + marian::log(binaryMask), /*axis=*/-2); + else if(poolType == "cls") + return slice(x, /*axis=*/-2, 0); + else + ABORT("Unknown pool type {}", poolType); + } + // This seems to be a mix of LayerNorm and BatchNorm and present in the original Unbabel code. // It norms over time, not batch, also should be optimized. Seems safe to disable for custom // models trained by us, but required when doing inference with Unbabel models. 
Expr cometNorm(Expr x, Expr binaryMask) const { - Expr output; + Expr output = x; if(opt("comet-mix-norm", false)) { - registerParameterLazy(gamma, Shape({ 1 }), inits::ones()); - int dimModel = x->shape()[-1]; + int dimModel = output->shape()[-1]; // Convert type to fp32 for better accumulation. This is a no-op if things are already fp32. - Type origType = x->value_type(); - x = marian::cast(x, Type::float32); - binaryMask = marian::cast(binaryMask, Type::float32); - - x = x * binaryMask; - auto denom = (float)dimModel * sum(binaryMask, -2); - auto mu = sum(sum(x, -1), -2) / denom; // sum over model and time - auto sigma = sum(sum(square(x - mu), -1), -2) / denom; - - auto normed = (x - mu) / sqrt(sigma + 1e-12f); - output = marian::cast(gamma, Type::float32) * sum(normed * binaryMask, -2) / sum(binaryMask, -2); - - // Undo conversion to fp32 if not originally fp32 (most likely fp16 then) - output = marian::cast(output, origType); - } else if(opt("comet-mix", false)) { - // average over time dimension + Type origType = output->value_type(); + auto output32 = marian::cast(output, Type::float32); + auto binaryMask32 = marian::cast(binaryMask, Type::float32); + + output32 = output32 * binaryMask32; + auto denom = (float)dimModel * sum(binaryMask32, -2); + auto mu = sum(sum(output32, -1), -2) / denom; // sum over model and time + auto sigma = sum(sum(square(output32 - mu), -1), -2) / denom; + + auto normed = (output32 - mu) / sqrt(sigma + 1e-12f); + output = marian::cast(normed, origType); + } + + output = cometPool(output, binaryMask); + + if(opt("comet-mix", false)) { registerParameterLazy(gamma, Shape({ 1 }), inits::ones()); - output = gamma * sum(x * binaryMask, -2) / sum(binaryMask, -2); - } else { - output = sum(x * binaryMask, -2) / sum(binaryMask, -2); + output = gamma * output; } return output; @@ -55,40 +64,36 @@ class CometEncoder final : public nn::TransformerEncoder { : TransformerEncoder(graph, options) {} Expr apply(Expr input, Expr mask) const override { - auto output = marian::nn::swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] - - auto binaryMask = marian::nn::swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] - - // apply positional embeddings to contextual input - output = positionEmbedding->apply(output); - - // apply dropout or layer-norm to embeddings if required - output = preprocessor->apply(output); - auto logMask = maskProcessor->apply(output, binaryMask); // [beam depth=1, batch size * numHeads, max length, vector dim=1] - std::vector pooler; - if(opt("comet-mix", false)) - pooler.push_back(cometNorm(output, binaryMask)); - - // traverse the layers, use the same mask for each - for(auto layer : *layers) { - output = layer->apply(output, logMask); - if(opt("comet-mix", false)) - pooler.push_back(cometNorm(output, binaryMask)); // [ batch, time, modelDim ] + auto binaryMask = marian::nn::swapTimeBatch(mask); + if(opt("comet-mix", false)) { + // we collect hidden states from the base class encoder + TransformerEncoder::keepHiddenStates = true; + // to save memory we can already pool/norm the hidden states before storing them + TransformerEncoder::hiddenTransformFn = [this, binaryMask](Expr x) { + return cometNorm(x, binaryMask); + }; } + // execute to populate hidden states + // the actual output is not used, because we use the collected hidden states instead + auto unused = TransformerEncoder::apply(input, mask); + + Expr output; if(opt("comet-mix", false)) { registerParameterLazy(weights, Shape({ opt("enc-depth") 
+ 1 }), inits::zeros()); - // comet22 has a sparsemax here + // comet22/comet-kiwi has a sparsemax here auto normFn = opt("comet-mix-transformation", "softmax"); auto weightsNorm = (normFn == "sparsemax") ? sparsemax(weights) : softmax(weights); weightsNorm = reshape(weightsNorm, {weights->shape()[-1], 1}); - output = sum(weightsNorm * concatenate(pooler, /*axis=*/-2), -2); // [batch, 1, modelDim] + output = sum(weightsNorm * concatenate(hiddenStates, /*axis=*/-2), -2); // [batch, 1, modelDim] } else { // just use last layer, average over time dim output = cometNorm(output, binaryMask); // [batch, 1, modelDim] } + // attach the unused output to the graph to avoid dangling nodes, this is a no-op. + output = choose({output, unused}, 0); return output; } }; @@ -147,7 +152,7 @@ struct CometBatchEncoder final : public nn::LayerWithOptions, } virtual void clear() override { - Layer::clear(); + LayerWithOptions::clear(); } }; @@ -282,8 +287,9 @@ class CometMetricPooler final : public nn::LayerWithOptions, ABORT_IF(usage == models::usage::embedding, "Wrong pooler for embedding??"); auto modelType = LayerWithOptions::opt("type"); - ABORT_IF(modelType == "comet-qe" && encoderStates.size() != 2, "Pooler expects exactly two encoder states for comet-qe"); - ABORT_IF(modelType == "comet" && encoderStates.size() != 3, "Pooler expects exactly three encoder states for comet"); + ABORT_IF(modelType == "comet-qe" && encoderStates.size() != 2, "Pooler expects exactly two encoder states for comet-qe"); + ABORT_IF(modelType == "comet" && encoderStates.size() != 3, "Pooler expects exactly three encoder states for comet"); + ABORT_IF(modelType == "comet-unified" && encoderStates.size() != 1, "Pooler expects exactly one encoder state for comet-unified"); if(modelType == "comet-qe") { auto src = encoderStates[0]->getContext(); @@ -370,6 +376,19 @@ class CometMetricPooler final : public nn::LayerWithOptions, // Currently no training for COMET with reference @TODO: add training ABORT("Usage other than 'evaluating' not implemented"); } + } else if(modelType == "comet-unified") { + auto emb = encoderStates[0]->getContext(); + Expr output; + if(usage == models::usage::evaluating) { + output = layers->apply(emb); + output = minimum(output, 1.f); // comet-kiwi/XL/XXL clamp at 1.f + int dimBatch = output->shape()[-3]; + output = reshape(output, {dimBatch, 1, 1}); + return { output }; + } else { + // Currently no training for COMET with reference @TODO: add training + ABORT("Usage other than 'evaluating' not implemented"); + } } else { ABORT("Unknown model type {}", modelType); } diff --git a/src/models/model_factory.cpp b/src/models/model_factory.cpp index 5b4cd34eb..1316dacd4 100644 --- a/src/models/model_factory.cpp +++ b/src/models/model_factory.cpp @@ -36,7 +36,7 @@ namespace models { Ptr EncoderFactory::construct(Ptr graph) { if(options_->get("type") == "s2s") return New(graph, options_); - + if(options_->get("type") == "laser" || options_->get("type") == "laser-sim") return New(graph, options_); @@ -134,12 +134,16 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti Ptr graph = nullptr; // graph unknown at this stage // clang-format off - if(type == "comet-qe" || type == "comet") { - if(type == "comet") { - ABORT_IF(use == usage::training, "Usage {} is not supported for model of type {}", (int)use, type); - ABORT_IF(use == usage::scoring, "Usage {} is not supported for model of type {}", (int)use, type); + if(type == "comet-qe" || type == "comet" || type == "comet-unified") { + if(type == "comet" || 
type == "comet-unified") { + ABORT_IF(use == usage::training, "Usage {} is not supported for model of type {}", (int)use, type); + ABORT_IF(use == usage::scoring, "Usage {} is not supported for model of type {}", (int)use, type); + } + + if(type == "comet-unified") { + LOG_ONCE(warn, "Warning: For xCOMET-XL/XXL - this is currently only an implementation of the regressor part and does not include the interpolation with MQM scores"); } - + auto inputTypes = options->get>("input-types"); ABORT_IF(inputTypes.empty(), "Required option --input-types for COMET-QE not set. " @@ -149,7 +153,7 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti int shift = 0; if(inputTypes[0] == "class") shift = 1; - + auto newOptions = options->with("usage", use); auto res = New(newOptions); @@ -160,24 +164,35 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti switch(use) { case usage::embedding: numEncoders = 1; addEmbeddingPooler = true; break; case usage::raw: - case usage::evaluating: + case usage::evaluating: case usage::scoring: - case usage::training: numEncoders = (type == "comet-qe") ? 2 : 3; addMetricPooler = true; break; - default: ABORT("Usage {} is not supported for model of type {}", (int)use, type); + case usage::training: + if(type == "comet-qe") + numEncoders = 2; + else if(type == "comet") + numEncoders = 3; + else if(type == "comet-unified") + numEncoders = 1; + else + ABORT("Unknown model type {}", type); + + addMetricPooler = true; + break; + default: ABORT("Usage {} is not supported for model of type {}", (int)use, type); } - + for(size_t i = 0; i < numEncoders; i++) { auto enc = New(graph, newOptions->with("type", "transformer", "index", i + shift)); enc->setName("CometEncoder"); // parameters will be shared res->push_back(enc); } - + if(addEmbeddingPooler) { auto pooler = New(graph, newOptions); - pooler->setName("CometEmbeddingPooler"); + pooler->setName("CometEmbeddingPooler"); res->push_back(pooler); } - + if(addMetricPooler) { auto pooler = New(graph, newOptions); pooler->setName("CometQEPooler"); // @TODO: change name for different models @@ -188,8 +203,8 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti } if(type == "bleurt") { - ABORT_IF(use != usage::evaluating, "Usage other than 'evaluating' is not supported for model of type {}", type); - + ABORT_IF(use != usage::evaluating, "Usage other than 'evaluating' is not supported for model of type {}", type); + auto newOptions = options->with("usage", use); auto res = New(newOptions); @@ -202,11 +217,11 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti int shift = 0; if(inputTypes[0] == "class") shift = 1; - + auto enc = New(graph, newOptions->with("type", "transformer", "index", 0 + shift)); enc->setName("BleurtEncoder"); res->push_back(enc); - + auto pooler = New(graph, newOptions); pooler->setName("BleurtPooler"); res->push_back(pooler); @@ -236,8 +251,8 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti "input-types", std::vector({"sequence"}), "dim-vocabs", std::vector(1, dimVocab)); } - - auto res = New(newOptions); + + auto res = New(newOptions); if(options->get("compute-similarity", false)) { res->push_back(models::encoder(newOptions->with("index", 0)).construct(graph)); res->push_back(models::encoder(newOptions->with("index", 1)).construct(graph)); @@ -270,15 +285,15 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti else if(type == "transformer-new") { auto newOptions = options->with("usage", use); auto res = New(graph, newOptions); - 
+ auto enc = New(graph, newOptions->with("type", "transformer")); enc->setName("TransformerBatchEncoder"); res->push_back(enc); - + auto dec = New(graph, newOptions->with("type", "transformer")); dec->setName("TransformerBatchDecoder"); res->push_back(dec); - + return res; } @@ -287,15 +302,15 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti if(tflavor && std::strcmp(tflavor, "experimental") == 0) { auto newOptions = options->with("usage", use); auto res = New(graph, newOptions); - + auto enc = New(graph, newOptions->with("type", "transformer")); enc->setName("TransformerBatchEncoder"); res->push_back(enc); - + auto dec = New(graph, newOptions->with("type", "transformer")); dec->setName("TransformerBatchDecoder"); res->push_back(dec); - + return res; } else { auto newOptions = options->with("usage", use); diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp index 9b5f300d4..efada03ae 100644 --- a/src/training/graph_group.cpp +++ b/src/training/graph_group.cpp @@ -329,8 +329,8 @@ void GraphGroup::load(const OptimizerBase::ScatterStateFunc& scatterFn) { if(isMainProcess()) { if(filesystem::exists(modelFileName)) { LOG(info, "Loading model from {}", modelFileName); - foundModel = true; - modelWeights_ = New(modelFileName, io::MmapMode::DontMmap); + foundModel = true; + modelWeights_ = New(modelFileName, io::MmapMode::DontMmap); markReloaded = true; } else if(options_->hasAndNotEmpty("pretrained-model")) { std::string pretrainedModelFileName = options_->get("pretrained-model"); diff --git a/src/translator/scorers.h b/src/translator/scorers.h index 333e49108..21ab77c66 100644 --- a/src/translator/scorers.h +++ b/src/translator/scorers.h @@ -77,17 +77,16 @@ class ScorerWrapper : public Scorer { ScorerWrapper(Ptr encdec, const std::string& name, float weight, - Ptr modelFile) + Ptr modelWeights) : Scorer(name, weight), encdec_(std::static_pointer_cast(encdec)), - modelWeights_(modelFile) + modelWeights_(modelWeights) {} virtual ~ScorerWrapper() {} virtual void init(Ptr graph) override { graph->switchParams(getName()); - // @TODO: unify to a single call, this logic should happen in modelFile_ if(modelWeights_) encdec_->load(graph, modelWeights_); } From b5c892e8eee189d3bc81cbf31e3274d692821641 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Thu, 1 Feb 2024 16:45:01 +0000 Subject: [PATCH 10/26] Merged PR 32781: Attach missing node for mt-detect models Fixes small bug for mt-detect models --- CHANGELOG.md | 1 + VERSION | 2 +- src/models/comet_qe.h | 18 ++++++++++++------ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 854162b6d..382aedb8c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - New experimental layer framework for Transformer-like models. 
### Fixed +- Fixed lost node in mt-detect metrics - Fixed BLEURT logmask computation - Fixed wrong paramter name for norm in new layer framework - Fixed unit test for LayerNorm diff --git a/VERSION b/VERSION index 5a8f2d3ca..cddff7b16 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.20 +v1.12.21 diff --git a/src/models/comet_qe.h b/src/models/comet_qe.h index d818ae384..df351ecf2 100644 --- a/src/models/comet_qe.h +++ b/src/models/comet_qe.h @@ -66,6 +66,7 @@ class CometEncoder final : public nn::TransformerEncoder { Expr apply(Expr input, Expr mask) const override { auto binaryMask = marian::nn::swapTimeBatch(mask); + if(opt("comet-mix", false)) { // we collect hidden states from the base class encoder TransformerEncoder::keepHiddenStates = true; @@ -75,9 +76,8 @@ class CometEncoder final : public nn::TransformerEncoder { }; } - // execute to populate hidden states - // the actual output is not used, because we use the collected hidden states instead - auto unused = TransformerEncoder::apply(input, mask); + // execute to populate hidden states and compute top output layer + auto hiddenTop = TransformerEncoder::apply(input, mask); // [time, batch, modelDim] (because the last state is being transposed again) Expr output; if(opt("comet-mix", false)) { @@ -86,14 +86,20 @@ class CometEncoder final : public nn::TransformerEncoder { auto normFn = opt("comet-mix-transformation", "softmax"); auto weightsNorm = (normFn == "sparsemax") ? sparsemax(weights) : softmax(weights); weightsNorm = reshape(weightsNorm, {weights->shape()[-1], 1}); + output = sum(weightsNorm * concatenate(hiddenStates, /*axis=*/-2), -2); // [batch, 1, modelDim] + + // since we use the hidden states from the encoder and not the top layer, we need to + // attach the unused output to the graph to avoid dangling nodes, this is a no-op. + output = choose({output, hiddenTop}, 0); } else { + // @TODO: get rid of this + // undo the time-batch swap + hiddenTop = marian::nn::swapTimeBatch(hiddenTop); // [batch, time, modelDim] // just use last layer, average over time dim - output = cometNorm(output, binaryMask); // [batch, 1, modelDim] + output = cometNorm(hiddenTop, binaryMask); // [batch, 1, modelDim] } - // attach the unused output to the graph to avoid dangling nodes, this is a no-op. - output = choose({output, unused}, 0); return output; } }; From 1c63c1ecc0487747906387df6a2050f295b4cf5d Mon Sep 17 00:00:00 2001 From: Thamme Gowda Date: Sat, 3 Feb 2024 00:23:02 +0000 Subject: [PATCH 11/26] Merged PR 31744: Pymarian: python bindings to marian * This code is same as [public github repo tg/pybind-new branch](https://github.com/marian-nmt/marian-dev/pull/1013). 
Git histories seems slightly different between public and private repo so we are seeing a lot of commits * This builds on top of work by Elijah https://github.com/marian-nmt/marian-dev/pull/948 --- .github/workflows/macos.yml | 8 +- .github/workflows/ubuntu.yml | 11 + .github/workflows/windows.yml | 12 + .gitignore | 11 +- .gitmodules | 3 + CHANGELOG.md | 1 + CMakeLists.txt | 7 +- azure-pipelines.yml | 27 ++- src/3rd_party/CMakeLists.txt | 16 +- src/3rd_party/pybind11 | 1 + src/CMakeLists.txt | 37 ++- src/common/config.cpp | 14 +- src/common/config.h | 15 ++ src/common/logging.cpp | 18 +- src/data/text_input.cpp | 34 ++- src/data/text_input.h | 20 +- src/embedder/vector_collector.cpp | 4 + src/embedder/vector_collector.h | 25 ++ src/evaluator/evaluator.h | 25 +- src/models/model_task.h | 3 +- src/python/README.md | 185 +++++++++++++++ src/python/binding/bind.cpp | 47 ++++ src/python/binding/embedder.hpp | 29 +++ src/python/binding/evaluator.hpp | 119 ++++++++++ src/python/binding/trainer.hpp | 51 +++++ src/python/binding/translator.hpp | 69 ++++++ src/python/pymarian/__init__.py | 48 ++++ src/python/pymarian/__main__.py | 18 ++ src/python/pymarian/constants.py | 28 +++ src/python/pymarian/evaluate.py | 344 ++++++++++++++++++++++++++++ src/python/pymarian/mtapi_server.py | 84 +++++++ src/python/pymarian/qtdemo.py | 125 ++++++++++ src/python/pymarian/utils.py | 101 ++++++++ src/python/pyproject.toml | 63 +++++ src/python/setup.py | 102 +++++++++ src/python/tests/__init__.py | 15 ++ src/python/tests/test_evaluate.py | 148 ++++++++++++ src/python/tests/test_train.py | 142 ++++++++++++ src/python/tests/test_translate.py | 16 ++ src/translator/translator.h | 101 +++++--- 40 files changed, 2035 insertions(+), 92 deletions(-) create mode 160000 src/3rd_party/pybind11 create mode 100644 src/python/README.md create mode 100644 src/python/binding/bind.cpp create mode 100644 src/python/binding/embedder.hpp create mode 100644 src/python/binding/evaluator.hpp create mode 100644 src/python/binding/trainer.hpp create mode 100644 src/python/binding/translator.hpp create mode 100644 src/python/pymarian/__init__.py create mode 100644 src/python/pymarian/__main__.py create mode 100644 src/python/pymarian/constants.py create mode 100755 src/python/pymarian/evaluate.py create mode 100755 src/python/pymarian/mtapi_server.py create mode 100644 src/python/pymarian/qtdemo.py create mode 100644 src/python/pymarian/utils.py create mode 100644 src/python/pyproject.toml create mode 100644 src/python/setup.py create mode 100644 src/python/tests/__init__.py create mode 100644 src/python/tests/test_evaluate.py create mode 100644 src/python/tests/test_train.py create mode 100644 src/python/tests/test_translate.py diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index 8b992e404..abff1d712 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -49,5 +49,11 @@ jobs: ./marian --version ./marian-decoder --version ./marian-scorer --version - ./spm_encode --version ls -hlv $(find . -maxdepth 1 -type f -perm +ugo+x \( -name "marian*" -o -name "spm*" \)) + + - name: Install PyMarian + run: | + python3 -m pip install --upgrade pip setuptools wheel pytest + CMAKE_ARGS="" python3 -m pip install -v . 
+ python3 -m pymarian -v + MARIAN_QUIET=YES python3 -m pytest -vs src/python/tests \ No newline at end of file diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index bc01b74a8..f2baae82d 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -115,6 +115,7 @@ jobs: -DCOMPILE_CPU=${{ matrix.cpu }} \ -DCOMPILE_CUDA=${{ matrix.gpu }} \ -DCOMPILE_EXAMPLES=${{ matrix.examples }} \ + -DUSE_TCMALLOC=OFF \ -DCOMPILE_SERVER=on \ -DCOMPILE_TESTS=${{ matrix.unit_tests }} \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-${{ matrix.cuda }} \ @@ -143,3 +144,13 @@ jobs: ./marian-server --version ./spm_encode --version ls -hlv $(find . -maxdepth 1 -type f -executable \( -name "marian*" -o -name "spm*" \)) + + - name: Install PyMarian + working-directory: build + env: + CUDA_VERSION: ${{ matrix.cuda }} + run: | + python3 -m pip install --upgrade pip setuptools wheel pytest + CMAKE_ARGS="" python3 -m pip install -v . + python3 -m pymarian -v + MARIAN_QUIET=YES python3 -m pytest -vs src/python/tests diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index b1d6b1bd1..55ff0d688 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -134,4 +134,16 @@ jobs: .\marian-decoder.exe --version .\marian-scorer.exe --version dir *.exe + cd .. + shell: cmd + + - name: Install PyMarian + working-directory: src/python + run: | + python3 -m pip install --upgrade pip setuptools wheel pytest + python3 -m pip install -v . + python3 -m pymarian -v + python3 -m pytest -vs src/python/tests + env: + CUDA_VERSION: ${{ matrix.cuda }} shell: cmd diff --git a/.gitignore b/.gitignore index d7f2f4df3..a55d45a39 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -# Config files from CMake +.history* src/common/project_version.h src/common/git_revision.h src/common/build_info.cpp @@ -48,6 +48,8 @@ pingme.txt # CMake files build build-* +# pymarian wheels +dist/ # Examples examples/*/*.gz @@ -61,4 +63,9 @@ examples/mnist/*ubyte /vs/MarianDll.VC.VC.opendb .vs -.vscode +.vscode + +# Python : pymarian +*.whl +*.egg-info +src/python/pymarian/_version.py diff --git a/.gitmodules b/.gitmodules index a1a876d8b..7a94dab1d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -20,3 +20,6 @@ [submodule "src/3rd_party/simple-websocket-server"] path = src/3rd_party/simple-websocket-server url = https://github.com/marian-nmt/Simple-WebSocket-Server +[submodule "src/3rd_party/pybind11"] + path = src/3rd_party/pybind11 + url = https://github.com/pybind/pybind11.git diff --git a/CHANGELOG.md b/CHANGELOG.md index 382aedb8c..13dd5e301 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
- Fixed compilation with clang 16.0.6 ### Added +- Added `pymarian`: python bindings based on pybind11 - Added implementation of COMET-KIWI - Added implementation of xCOMET-XL/XXL regressor parts (MQM interpolation missing for now) - Added implementation of COMET-22 (reference-based) model and conversion diff --git a/CMakeLists.txt b/CMakeLists.txt index 595f87cc1..0ebe2b819 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,9 +29,11 @@ option(USE_MKL "Compile with MKL support" ON) option(USE_MPI "Use MPI library" OFF) option(USE_NCCL "Use NCCL library" ON) option(USE_SENTENCEPIECE "Download and compile SentencePiece" ON) +option(USE_TCMALLOC "Use TCMALLOC if available" ON) option(USE_STATIC_LIBS "Link statically against non-system libs" OFF) option(GENERATE_MARIAN_INSTALL_TARGETS "Generate Marian install targets (requires CMake 3.12+)" OFF) option(DETERMINISTIC "Try to make training results as deterministic as possible (e.g. for testing)" OFF) +option(PYMARIAN "Build Pymarian package which is based on pybind11" OFF) # fbgemm and sentencepiece are both defined with "non-local" installation targets (the source projects don't define them, # so we define them in src\3rd_party\CMakeLists.txt), but that isn't supported until CMake 3.12. Prior to CMake 3.12, @@ -105,7 +107,7 @@ if(MSVC) set(INTRINSICS "/arch:AVX2") # set(INTRINSICS "/arch:AVX512") # /bigobj is necessary for expression_operators.cpp. See https://stackoverflow.com/questions/15110580/penalty-of-the-msvs-compiler-flag-bigobj - set(CMAKE_CXX_FLAGS "/EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /bigobj ${DISABLE_GLOBALLY}") + set(CMAKE_CXX_FLAGS "/permissive- /EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /bigobj ${DISABLE_GLOBALLY}") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /MT /O2 ${INTRINSICS} /Zi /MP /GL /DNDEBUG") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} /MTd /Od /Ob0 ${INTRINSICS} /RTC1 /Zi /D_DEBUG") @@ -347,6 +349,7 @@ if(CUDA_FOUND) LIST(APPEND COMPUTE -Wno-deprecated-gpu-targets) endif() + message(STATUS "CUDA_VERSION=${CUDA_VERSION}; CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}") if(COMPILE_KEPLER) message(STATUS "Compiling code for Kepler GPUs") LIST(APPEND COMPUTE -gencode=arch=compute_35,code=sm_35;) # Tesla K40 and above @@ -412,7 +415,7 @@ if(CUDA_FOUND) if ((CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0")) find_library(CUDA_cublasLt_LIBRARY NAMES cublasLt PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 NO_DEFAULT_PATH) if(NOT CUDA_cublasLt_LIBRARY) - message(FATAL_ERROR "cuBLASLt library not found") + message(FATAL_ERROR "cuBLASLt library not found. 
-DCUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}") endif() set(EXT_LIBS ${EXT_LIBS} ${CUDA_cublasLt_LIBRARY}) set(CUDA_LIBS ${CUDA_LIBS} ${CUDA_cublasLt_LIBRARY}) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4e1744375..a1e9ea94f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -193,6 +193,7 @@ stages: -DUSE_NCCL="FALSE" ^ -DUSE_SENTENCEPIECE="TRUE" ^ -DUSE_STATIC_LIBS="TRUE" + displayName: Configure CMake env: # Set envvars so that CMake can find the installed packages @@ -322,6 +323,10 @@ stages: displayName: Install CUDA condition: eq(variables.gpu, true) + # Some preinstalled versions of pip are bad for pymarian; see https://github.com/pypa/setuptools/issues/3269 + - bash: python3 -m pip install pip -U + displayName: Upgrade pip + - bash: | mkdir -p build cd build @@ -336,7 +341,11 @@ stages: -DUSE_SENTENCEPIECE=on \ -DUSE_STATIC_LIBS=$(static) \ -DBoost_ARCHITECTURE=-x64 \ - -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-$(cuda) + -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-$(cuda) \ + -DUSE_TCMALLOC=off \ + -DPYMARIAN=ON \ + -DPYTHON_EXECUTABLE=python3 + displayName: Configure CMake # Clean build/src/ to safe disk space on Azure-hosted VMs and stay below the 10GB limit @@ -361,6 +370,11 @@ stages: displayName: Print versions workingDirectory: build + - bash: | + python3 -m pip install build/pymarian-*.whl + python3 -m pymarian -v + displayName: Build Pymarian + ###################################################################### - job: BuildMacOS cancelTimeoutInMinutes: 1 @@ -393,6 +407,7 @@ stages: -DUSE_FBGEMM=on \ -DUSE_SENTENCEPIECE=on \ -DUSE_STATIC_LIBS=off + displayName: Configure CMake - bash: make -j2 @@ -453,7 +468,10 @@ stages: -DCOMPILE_CUDA=off \ -DGENERATE_MARIAN_INSTALL_TARGETS=on \ -DUSE_FBGEMM=on \ - -DUSE_SENTENCEPIECE=on + -DUSE_SENTENCEPIECE=on \ + -DPYMARIAN=on \ + -DPYTHON_EXECUTABLE=python3 + displayName: Configure CMake - bash: make -j3 install @@ -468,6 +486,11 @@ stages: displayName: Check targets workingDirectory: install + - bash: | + python3 -m pip install build/pymarian-*.whl + python3 -m pymarian -v + displayName: Build Pymarian + # Marian is built in the same job where the regression tests are run to make sure that executables # are compiled and run on a machine with the same CPU architecture, which is required for diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt index 838951c50..6cf46533f 100644 --- a/src/3rd_party/CMakeLists.txt +++ b/src/3rd_party/CMakeLists.txt @@ -72,13 +72,17 @@ if(USE_SENTENCEPIECE) # regardless of -DUSE_STATIC_LIBS setting always build sentencepiece statically set(SPM_ENABLE_SHARED OFF CACHE BOOL "Builds shared libaries in addition to static libraries." FORCE) - set(SPM_ENABLE_TCMALLOC ON CACHE BOOL "Enable TCMalloc if available.") - if(USE_STATIC_LIBS) - set(SPM_TCMALLOC_STATIC ON CACHE BOOL "Link static library of TCMALLOC." FORCE) - else(USE_STATIC_LIBS) - set(SPM_TCMALLOC_STATIC OFF CACHE BOOL "Link static library of TCMALLOC.") - endif(USE_STATIC_LIBS) + if(USE_TCMALLOC) + set(SPM_ENABLE_TCMALLOC ON CACHE BOOL "Enable TCMalloc if available.") + if(USE_STATIC_LIBS) + set(SPM_TCMALLOC_STATIC ON CACHE BOOL "Link static library of TCMALLOC." 
FORCE) + else(USE_STATIC_LIBS) + set(SPM_TCMALLOC_STATIC OFF CACHE BOOL "Link static library of TCMALLOC.") + endif(USE_STATIC_LIBS) + else(USE_TCMALLOC) + set(SPM_ENABLE_TCMALLOC OFF CACHE BOOL "Enable TCMalloc if available.") + endif(USE_TCMALLOC) add_subdirectory(./sentencepiece) include_directories(./sentencepiece) diff --git a/src/3rd_party/pybind11 b/src/3rd_party/pybind11 new file mode 160000 index 000000000..869cc1ff0 --- /dev/null +++ b/src/3rd_party/pybind11 @@ -0,0 +1 @@ +Subproject commit 869cc1ff085dd405635b00eb46e5c84f50f26099 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5bf321af5..c40eabc76 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -4,6 +4,7 @@ include_directories(.) include_directories(3rd_party) include_directories(3rd_party/SQLiteCpp/include) include_directories(3rd_party/sentencepiece) + if(USE_SENTENCEPIECE) include_directories(3rd_party/sentencepiece/third_party/protobuf-lite) endif(USE_SENTENCEPIECE) @@ -260,11 +261,11 @@ if (NOT COMPILE_LIBRARY_ONLY) endif(COMPILE_SERVER) foreach(exec ${EXECUTABLES}) - target_link_libraries(${exec} marian) - if(CUDA_FOUND) - target_link_libraries(${exec} marian_cuda) - endif(CUDA_FOUND) - set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}") + target_link_libraries(${exec} marian) + if(CUDA_FOUND) + target_link_libraries(${exec} marian_cuda) + endif(CUDA_FOUND) + set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}") endforeach(exec) endif(NOT COMPILE_LIBRARY_ONLY) @@ -282,9 +283,33 @@ endif(COMPILE_EXAMPLES) if(GENERATE_MARIAN_INSTALL_TARGETS) # Install the marian library if given a "make install" target - include(GNUInstallDirs) # This defines default values for installation directories (all platforms even if named GNU) + include(GNUInstallDirs) # This defines default values for installation directories (all platforms even if named GNU) install(TARGETS marian EXPORT marian-targets ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ) endif(GENERATE_MARIAN_INSTALL_TARGETS) + + +if(PYMARIAN) + if(NOT PYTHON_EXECUTABLE) + set(PYTHON_EXECUTABLE python) # default to python in the environment + endif() + + include_directories(3rd_party/pybind11/include) + add_subdirectory(3rd_party/pybind11) + + pybind11_add_module(_pymarian MODULE python/binding/bind.cpp) + target_link_libraries(_pymarian PUBLIC marian) + if(CUDA_FOUND) + target_link_libraries(_pymarian PUBLIC marian_cuda) + endif(CUDA_FOUND) + install(TARGETS _pymarian DESTINATION .) 
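+
+  # Illustrative usage (assuming an out-of-source build directory named "build"):
+  # configure with -DPYMARIAN=on and, if needed, -DPYTHON_EXECUTABLE=..., then build as usual
+  # or run `cmake --build build --target pymarian`; the wheel produced in the build directory
+  # installs with `python -m pip install build/pymarian-*.whl`.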
+ + # build pymarian wheel + add_custom_target(pymarian ALL + ${CMAKE_COMMAND} -E env "CMAKE_BINARY_DIR=${PROJECT_BINARY_DIR}" "CMAKE_SOURCE_DIR=${PROJECT_SOURCE_DIR}" + "${PYTHON_EXECUTABLE}" -m pip wheel -v --no-input ${PROJECT_SOURCE_DIR}/src/python -w "${PROJECT_BINARY_DIR}" + DEPENDS _pymarian + VERBATIM COMMENT "Building pymarian wheel") +endif(PYMARIAN) diff --git a/src/common/config.cpp b/src/common/config.cpp index 20ef6e046..b6296a8b2 100644 --- a/src/common/config.cpp +++ b/src/common/config.cpp @@ -274,12 +274,22 @@ std::vector Config::getDevices(Ptr options, return devices; } -Ptr -parseOptions(int argc, char** argv, cli::mode mode, bool validate){ +Ptr parseOptions(int argc, char** argv, cli::mode mode, bool validate) { ConfigParser cp(mode); return cp.parseOptions(argc, argv, validate); } +Ptr parseOptions(const std::string& args, cli::mode mode, bool validate) { + std::vector vArgs = utils::split(args, " "); + + std::string dummy("marian"); + std::vector cArgs = { &dummy[0] }; + for(auto& arg : vArgs) + cArgs.push_back(&arg[0]); + + return parseOptions((int)cArgs.size(), cArgs.data(), mode, validate); +} + std::ostream& operator<<(std::ostream& out, const Config& config) { YAML::Emitter outYaml; cli::OutputYaml(config.get(), outYaml); diff --git a/src/common/config.h b/src/common/config.h index c22d7415e..06110e17e 100644 --- a/src/common/config.h +++ b/src/common/config.h @@ -119,4 +119,19 @@ Ptr parseOptions(int argc, cli::mode mode, bool validate = true); +/** + * Parse the command line options. + * Same as above, but args provided as C++ string object, space-delimited. This is used for instance + * in the python bindings as a simple string-based interface. + * + * @param args space delimited command line options + * @param mode change the set of available command-line options, e.g. training, translation, etc. 
+ * @param validate validate parsed options and abort on failure + * + * @return parsed options + */ +Ptr parseOptions(const std::string& args, + cli::mode mode, + bool validate = true); + } // namespace marian diff --git a/src/common/logging.cpp b/src/common/logging.cpp index 69efeb482..53bb6ba81 100644 --- a/src/common/logging.cpp +++ b/src/common/logging.cpp @@ -26,11 +26,13 @@ std::shared_ptr createStderrLogger(const std::string& name, const std::string& pattern, const std::vector& files, bool quiet) { - std::vector sinks; + auto logger = spdlog::get(name); + if(!logger) { + std::vector sinks; - auto stderr_sink = spdlog::sinks::stderr_sink_mt::instance(); - if(!quiet) - sinks.push_back(stderr_sink); + auto stderr_sink = spdlog::sinks::stderr_sink_mt::instance(); + if(!quiet) + sinks.push_back(stderr_sink); // @TODO: think how to solve this better than using OMPI_COMM_WORLD_RANK env variable // only create output files if we are the main process or if MPI rank is not defined @@ -42,10 +44,11 @@ std::shared_ptr createStderrLogger(const std::string& name, } } - auto logger = std::make_shared(name, begin(sinks), end(sinks)); + logger = std::make_shared(name, begin(sinks), end(sinks)); - spdlog::register_logger(logger); - logger->set_pattern(pattern); + spdlog::register_logger(logger); + logger->set_pattern(pattern); + } return logger; } @@ -72,6 +75,7 @@ bool setLoggingLevel(spdlog::logger& logger, std::string const level) { } static void setErrorHandlers(); + void createLoggers(const marian::Config* config) { std::vector generalLogs; std::vector validLogs; diff --git a/src/data/text_input.cpp b/src/data/text_input.cpp index 3485a223f..0ccaedf14 100644 --- a/src/data/text_input.cpp +++ b/src/data/text_input.cpp @@ -13,7 +13,13 @@ void TextIterator::increment() { } bool TextIterator::equal(TextIterator const& other) const { - return this->pos_ == other.pos_ || (!this->tup_.valid() && !other.tup_.valid()); + // two iterators are equal if any of the following is true: + // 1. both are invalid (null ptrs) + // 2. both at the end of the stream (empty tuples as record, regardless of pos_) + // 3. both are at the same position + return (!this->tup_.valid() && !other.tup_.valid()) || + (this->tup_.valid() && other.tup_.valid() && this->tup_.empty() && other.tup_.empty()) || + this->pos_ == other.pos_; } const SentenceTuple& TextIterator::dereference() const { @@ -38,30 +44,18 @@ TextInput::TextInput(std::vector inputs, SentenceTuple TextInput::next() { // get index of the current sentence size_t curId = pos_++; - - // fill up the sentence tuple with source and/or target sentences - SentenceTupleImpl tup(curId); + // read next row, i.e. 
vector from files + // if any file is empty, we are done + std::vector row; for(size_t i = 0; i < files_.size(); ++i) { std::string line; if(io::getline(*files_[i], line)) { - Words words = vocabs_[i]->encode(line, /*addEOS=*/true, /*inference=*/inference_); - if(this->maxLengthCrop_ && words.size() > this->maxLength_) { - words.resize(maxLength_); - words.back() = vocabs_.back()->getEosId(); // note: this will not work with class-labels - } - - ABORT_IF(words.empty(), "No words (not even EOS) found in string??"); - ABORT_IF(tup.size() != i, "Previous tuple elements are missing."); - tup.pushBack(words); + row.push_back(line); + } else { + return SentenceTupleImpl(); // return an empty tuple if above test does not pass(); } } - - if(tup.size() == files_.size()) // check if each input file provided an example - return SentenceTuple(tup); - else if(tup.size() == 0) // if no file provided examples we are done - return SentenceTupleImpl(); // return an empty tuple if above test does not pass(); - else // neither all nor none => we have at least on missing entry - ABORT("There are missing entries in the text tuples."); + return encode(row, curId); } } // namespace data diff --git a/src/data/text_input.h b/src/data/text_input.h index 98d991bcb..3a399b6d2 100644 --- a/src/data/text_input.h +++ b/src/data/text_input.h @@ -27,7 +27,7 @@ class TextIterator : public IteratorFacade { }; class TextInput : public DatasetBase { -private: +protected: std::vector> files_; std::vector> vocabs_; @@ -92,6 +92,24 @@ class TextInput : public DatasetBase { } void prepare() override {} + + SentenceTuple encode(std::vector& row, size_t id) { + ABORT_IF(row.size() != vocabs_.size(), "Number of fields does not match number of vocabs"); + // fill up the sentence tuple with source and/or target sentences + SentenceTupleImpl tup(id); + for(size_t i = 0; i < row.size(); ++i) { + std::string field = row[i]; + Words words = vocabs_[i]->encode(field, /*addEOS=*/true, /*inference=*/inference_); + if(this->maxLengthCrop_ && words.size() > this->maxLength_) { + words.resize(maxLength_); + words.back() = vocabs_.back()->getEosId(); // note: this will not work with class-labels + } + ABORT_IF(words.empty(), "No words (not even EOS) found in the input text. 
ID: " + std::to_string(id)); + tup.pushBack(words); + } + return SentenceTuple(tup); + } + }; } // namespace data } // namespace marian
diff --git a/src/embedder/vector_collector.cpp b/src/embedder/vector_collector.cpp index 1268de530..fcfbb02e7 100644 --- a/src/embedder/vector_collector.cpp +++ b/src/embedder/vector_collector.cpp @@ -109,6 +109,10 @@ Ptr VectorCollector::Create(Ptr options) { return collector; } +void BufferedVectorCollector::WriteVector(const std::vector& vec) { + buffer.push_back(vec); +} + const size_t VectorCollector::DEFAULT_WIDTH = 4; } // namespace marian
diff --git a/src/embedder/vector_collector.h b/src/embedder/vector_collector.h index 6c727203c..ff4c4dd9c 100644 --- a/src/embedder/vector_collector.h +++ b/src/embedder/vector_collector.h @@ -63,4 +63,29 @@ class AveragingVectorCollector : public VectorCollector { virtual void WriteAverage(); }; + +// collects vectors and holds them in memory +class BufferedVectorCollector : public VectorCollector { + +private: + std::vector> buffer; + +protected: + virtual void WriteVector(const std::vector& vec) override; + +public: + BufferedVectorCollector(bool binary=false, size_t width=DEFAULT_WIDTH) + : VectorCollector(binary, width) {} + + BufferedVectorCollector(std::string outFile, bool binary=false, size_t width=DEFAULT_WIDTH) + : VectorCollector(outFile, binary, width) {} + + auto getBuffer() -> decltype(buffer) { + return buffer; + } + + virtual ~BufferedVectorCollector() {} + +}; + } // namespace marian
diff --git a/src/evaluator/evaluator.h b/src/evaluator/evaluator.h index bfed80a53..022a8204c 100644 --- a/src/evaluator/evaluator.h +++ b/src/evaluator/evaluator.h @@ -47,7 +47,7 @@ template class Evaluate : public ModelTask { private: Ptr options_; - Ptr corpus_; + std::vector> graphs_; std::vector> models_; Ptr modelFile_; @@ -57,8 +57,12 @@ class Evaluate : public ModelTask { options_ = options_->with("inference", true, "shuffle", "none"); - corpus_ = New(options_); - corpus_->prepare(); + /* Number of embeddings parameter is determined at runtime based on the given vocabulary file. In addition, this parameter has to be set before initializing the model object. The Corpus initializer is the one that sets the number of embeddings in the options_ object. However, we do not need to use the corpus object here, so we just create a dummy corpus object. 
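+       (The corpus that is actually scored is constructed later, in run(); the dummy instance
+       here is only needed for that side effect on the options_ object.)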
+ */ + Ptr corpus = New(options_); auto devices = Config::getDevices(options_); @@ -94,11 +98,19 @@ class Evaluate : public ModelTask { LOG(info, "Evaluating"); timer::Timer timer; - auto batchGenerator = New>(corpus_, options_); + Ptr corpus = New(options_); + corpus->prepare(); + auto batchGenerator = New>(corpus, options_); batchGenerator->prepare(); Ptr output = VectorCollector::Create(options_); - + run(batchGenerator, output); + LOG(info, "Total time: {:.5f}s wall", timer.elapsed()); + } + + template + void run(Ptr> batchGenerator, Ptr collector) { + size_t batchId = 0; { ThreadPool pool(graphs_.size(), graphs_.size()); @@ -137,14 +149,13 @@ class Evaluate : public ModelTask { auto beg = i * numScores; auto end = (i + 1) * numScores; std::vector sentVector(sentVectors.begin() + beg, sentVectors.begin() + end); - output->Write((long)batch->getSentenceIds()[i], sentVector); + collector->Write((long)batch->getSentenceIds()[i], sentVector); } }; pool.enqueue(task, batchId++); } } - LOG(info, "Total time: {:.5f}s wall", timer.elapsed()); } }; diff --git a/src/models/model_task.h b/src/models/model_task.h index 96dfadd0c..798dd5546 100644 --- a/src/models/model_task.h +++ b/src/models/model_task.h @@ -11,6 +11,7 @@ struct ModelTask { struct ModelServiceTask { virtual ~ModelServiceTask() {} - virtual std::string run(const std::string&) = 0; + virtual std::string run(const std::string& /*input*/, const std::string& /*yaml*/) = 0; + virtual std::vector run(const std::vector& /*input*/, const std::string& /*yaml*/) = 0; }; } // namespace marian diff --git a/src/python/README.md b/src/python/README.md new file mode 100644 index 000000000..f8f00bdc5 --- /dev/null +++ b/src/python/README.md @@ -0,0 +1,185 @@ +# PyMarian + +* Python bindings to Marian (C++) is using [PyBind11] +* The python package is built using [scikit-build-core](https://github.com/scikit-build/scikit-build-core) + + +## Install + +```bash +# get source code +git clone https://github.com/marian-nmt/marian-dev +cd marian-dev + +# build marian with -DPYMARIAN=on option to create a pymarian wheel +cmake . -Bbuild -DCOMPILE_CUDA=off -DPYMARIAN=on -DCMAKE_BUILD_TYPE=Release +cmake --build build -j # -j option parallelizes build on all cpu cores +python -m pip install build/pymarian-*.whl +``` + +Since the above commands uses `python` executable in the PATH to determine Python version to compile marian native extension, make sure to have the desired `python` executable in your environment _before_ invoking these commands. + +## Python API + +Python API is designed to take same argument as marian CLI string. +> NOTE: these APIs are experimental only and not finalized. see `mtapi_server.py` for an example use of Translator API + +**Translator** +```python + +# Translator +from pymarian import Translator +cli_string = "..." +translator = Translator(cli_string) + +sources = ["sent1" , "sent2" ] +result = translator.translate(sources) +print(result) +``` + +**Evaluator** +```python +# Evaluator +from pymarian import Evaluator +cli_string = '-m path/to/model.npz -v path/to.vocab.spm path/to.vocab.spm --like comet-qe' +evaluator = Evaluator(cli_str) + +data = [ + ["Source1", "Hyp1"], + ["Source2", "Hyp2"] +] +scores = evaluator.run(data) +for score in scores: + print(score) +``` + +## CLI Usage +. `pymarian-evaluate` : CLI to download and use pretrained metrics such as COMETs, COMETOIDs, ChrFoid, and BLEURT +. `pymarian-mtapi` : REST API demo powered by Flask +. 
`pymarian-qtdemo` : GUI App demo powered by QT + + +### `pymarian-evaluate` + +```bash +$ pymarian-evaluate -h +usage: pymarian-evaluate [-h] [-m MODEL] [--stdin] [-t MT_FILE] [-s SRC_FILE] [-r REF_FILE] [-o OUT] [-a {skip,append,only}] [-w WIDTH] [--debug] [--mini-batch MINI_BATCH] [-d [DEVICES ...] | -c + CPU_THREADS] [-ws WORKSPACE] [--backend {subprocess,pymarian}] + +options: + -h, --help show this help message and exit + -m MODEL, --model MODEL + Model name, or path. Known models=['cometoid22-wmt21', 'cometoid22-wmt22', 'cometoid22-wmt23', 'chrfoid-wmt23', 'comet20-da-qe', 'bleurt20', 'comet20-da'] (default: + cometoid22-wmt22) + --stdin Read input from stdin. TSV file with following format: QE metrics: "srcmt", Comet with ref: "srcref; or BLEURT: "refmt" (default: False) + -t MT_FILE, --mt MT_FILE + MT output file. Ignored when --stdin. (default: None) + -s SRC_FILE, --src SRC_FILE + Source file. Ignored when --stdin (default: None) + -r REF_FILE, --ref REF_FILE + Ref file. Ignored when --stdin (default: None) + -o OUT, --out OUT output file. Default stdout (default: <_io.TextIOWrapper name='' mode='w' encoding='utf-8'>) + -a {skip,append,only}, --average {skip,append,only} + Average segment scores to produce system score. skip=do not output average (default; segment scores only); append=append average at the end; only=output the average only + (i.e system score only) (default: skip) + -w WIDTH, --width WIDTH + Output score width (default: 4) + --debug Verbose output (default: False) + --mini-batch MINI_BATCH + Mini-batch size (default: 16) + -d [DEVICES ...], --devices [DEVICES ...] + GPU device IDs (default: None) + -c CPU_THREADS, --cpu-threads CPU_THREADS + Use CPU threads. 0=use gpu device 0 (default: None) + -ws WORKSPACE, --workspace WORKSPACE + Workspace memory (default: 8000) + --backend {subprocess,pymarian} + Marian backend interface. subprocess looks for marian binary in PATH. pymarian is a pybind wrapper (default: pymarian) +``` + +**Performance Tuning Tips**: +* For CPU parallelization, `--cpu-threads ` +* For GPU parallelization, assuming pymarian was compiled with cuda support, e.g., `--devices 0 1 2 3` to use the specified 4 gpu devices. 
+* When OOM error: adjust `--mini-batch` argument +* To see full logs from marian, set `--debug` + + +*Example Usage* +```bash +# download sample dataset +langs=en-ru +prefix=tmp.$langs +teset=wmt21/systems +sysname=Online-B +sacrebleu -t $teset -l $langs --echo src > $prefix.src +sacrebleu -t $teset -l $langs --echo ref > $prefix.ref +sacrebleu -t $teset -l $langs --echo $sysname > $prefix.mt + +# chrfoid +paste $prefix.{src,mt} | head | pymarian-evaluate --stdin -m chrfoid-wmt23 + +# cometoid22-wmt{21,22,23} +paste $prefix.{src,mt} | head | pymarian-evaluate --stdin -m cometoid22-wmt22 + +# bleurt20 +paste $prefix.{ref,mt} | head | pymarian-evaluate --stdin -m bleurt20 --debug + +# FIXME: comet20-da-qe and comet20-da appear to be broken +# comet20-da-qe +paste $prefix.{src,mt} | head | pymarian-evaluate --stdin -m comet20-da-qe +# comet20-da +paste $prefix.{src,mt,ref} | pymarian-evaluate -m comet20-da + +``` + +### `pymarian-mtapi` + +Launch server +```bash +# example model: download and extract +wget http://data.statmt.org/romang/marian-regression-tests/models/wngt19.tar.gz +tar xvf wngt19.tar.gz + +# launch server +pymarian-mtapi -s en -t de "-m wngt19/model.base.npz -v wngt19/en-de.spm wngt19/en-de.spm" +``` + +Example request from client + +```bash +URL="http://127.0.0.1:5000/translate" +curl $URL --header "Content-Type: application/json" --request POST --data '[{"text":["Good Morning."]}]' +``` + +### `pymarian-qtdemo` +``` +pymarian-qtdemo +``` + +## Run Tests + +```bash +# install pytest if necessary +python -m pip install pytest + +# run tests in quiet mode +python -m pytest src/python/tests/ + +# or, add -s to see STDOUT/STDERR from tests +python -m pytest -s src/python/tests/ + +``` + + +## Known issues + +1. In conda or mamba environment, if you see `.../miniconda3/envs//bin/../lib/libstdc++.so.6: version 'GLIBCXX_3.4.30' not found` error, + install libstdcxx-ng + + ```bash + conda install -c conda-forge libstdcxx-ng + ``` + + + + diff --git a/src/python/binding/bind.cpp b/src/python/binding/bind.cpp new file mode 100644 index 000000000..9e8cc4464 --- /dev/null +++ b/src/python/binding/bind.cpp @@ -0,0 +1,47 @@ +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" +// if your IDE/vscode complains about missing paths +// pybind11 can be found by "python -m pybind11 --includes"; you may need to add both pybind11 and Python.h +#include "embedder.hpp" +#include "evaluator.hpp" +#include "trainer.hpp" +#include "translator.hpp" + + +#define PYBIND11_DETAILED_ERROR_MESSAGES + +namespace py = pybind11; +using namespace pymarian; + + +PYBIND11_MODULE(_pymarian, m) { + m.doc() = "Marian C++ API bindings via pybind11"; + + /** TODOS + * 1. API to check if gpu available: cuda_is_available() -> bool + * 2. 
API to check number of gpus:: cuda_device_count() -> int + */ + + py::class_(m, "Translator") + .def(py::init()) + .def("translate", py::overload_cast(&TranslateServicePyWrapper::run)) + .def("translate", py::overload_cast&, const py::kwargs&>(&TranslateServicePyWrapper::run)) + ; + + py::class_(m, "Evaluator") + .def(py::init()) + .def("evaluate", py::overload_cast(&EvaluatorPyWrapper::run)) + ; + + py::class_(m, "Trainer") + .def(py::init()) + .def("train", py::overload_cast<>(&PyTrainer::train)) + ; + + py::class_(m, "Embedder") + .def(py::init()) + .def("embed", py::overload_cast<>(&PyEmbedder::embed)) + ; + +} + diff --git a/src/python/binding/embedder.hpp b/src/python/binding/embedder.hpp new file mode 100644 index 000000000..12ae43c9d --- /dev/null +++ b/src/python/binding/embedder.hpp @@ -0,0 +1,29 @@ +#include "marian.h" + +#include "common/timer.h" +#include "embedder/embedder.h" +#include "models/model_task.h" + + +using namespace marian; + +namespace pymarian { + class PyEmbedder { + private: + Ptr options_; + Ptr> embedder_; + public: + PyEmbedder(const std::string& cliString) { + options_ = parseOptions(cliString, cli::mode::embedding, true); + embedder_ = New>(options_); + } + + int embed() { + //TODO: add options_ override from args to embed() + //TODO: read input from args instead of STDIN + embedder_->run(); + return 0; + } + }; + +} // namespace pymarian \ No newline at end of file diff --git a/src/python/binding/evaluator.hpp b/src/python/binding/evaluator.hpp new file mode 100644 index 000000000..f72ccd08a --- /dev/null +++ b/src/python/binding/evaluator.hpp @@ -0,0 +1,119 @@ +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +#include "marian.h" + +#include "common/logging.h" +#include "common/timer.h" +#include "data/batch_generator.h" +#include "data/corpus.h" +#include "data/text_input.h" +#include "evaluator/evaluator.h" +#include "models/model_task.h" + + +using namespace marian; + +namespace pymarian { + + //type aliases for convenience + using StrVector = std::vector; + using StrVectors = std::vector; + using FloatVector = std::vector; + using FloatVectors = std::vector; + using Evaluator = marian::Evaluate; + namespace py = pybind11; + + /** + * Wrapper for Marian Evaluator. + * + * This class is a wrapper for the Marian Evaluator class. + * It is used to run the evaluator on a given input. + * + **/ + class EvaluatorPyWrapper { + + private: + Ptr options_; + Ptr evaluator_; + std::vector> vocabs_; + + public: + /** + * Constructor for the EvaluatorPyWrapper class. + * @param cliString - the command line string to parse as Marian options + */ + EvaluatorPyWrapper(const std::string& cliString){ + options_ = parseOptions(cliString, cli::mode::evaluating, true) + ->with("inference", true, "shuffle", "none"); + evaluator_= New(options_); + vocabs_ = loadVocabs(options_); + } + + /** + * @brief Load the vocabularies from the given paths + * @param options - the options object + * @return vector of vocabularies + */ + static auto loadVocabs(Ptr options) -> std::vector> { + std::vector> vocabs; + auto vocabPaths = options->get>("vocabs"); + LOG(info, "Loading vocabularies from {}", utils::join(vocabPaths, ", ")); + for (size_t i = 0; i < vocabPaths.size(); ++i) { + Ptr vocab = New(options, i); + vocab->load(vocabPaths[i]); + vocabs.emplace_back(vocab); + } + return vocabs; + } + + /** + * Given a table of strings (i.e., rows x columns), concatenate each column into a single string. 
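+     * For example (illustrative values):
+     *   {{"src1", "mt1"}, {"src2", "mt2"}}  ->  {"src1\nsrc2", "mt1\nmt2"}
+     * i.e. each output string is one column joined by newlines, which run() below then feeds
+     * to the corpus as an in-memory "file" per input field.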
+ * + * @param data - table of strings : rows x columns + * @return List of strings, one string for each column, concatenated across rows. + */ + static auto concatColumns(const StrVectors& data) -> StrVector { + // Get the number of rows and columns in the data + int rows = data.size(); + int cols = data[0].size(); + StrVector result(cols); + + for (int j = 0; j < cols; j++) { + std::string column = ""; + for (int i = 0; i < rows; i++) { + column += data[i][j]; + // If it is not the last row, add a newline character + if (i != rows - 1) { column += "\n";} + } + result[j] = column; + } + return result; + } + + /** + * Run the evaluator on the given input. + * Input is transformed as (in memory) files by concatenating columns. + * + * @param inputs - table of strings : rows x columns + * @return table of floats : rows x columns + * + */ + auto run(const StrVectors& inputs) -> FloatVectors { + StrVector columnFiles = concatColumns(inputs); + auto corpus = New(columnFiles, vocabs_, options_); + corpus->prepare(); + + auto batchGenerator = New>(corpus, options_, nullptr, /*runAsync=*/false); + batchGenerator->prepare(); + + std::string output = options_->get("output"); + Ptr collector = New(output, /*binary=*/false); + evaluator_->run(batchGenerator, collector); + FloatVectors outputs = collector->getBuffer(); + return outputs; + } + + }; + +} diff --git a/src/python/binding/trainer.hpp b/src/python/binding/trainer.hpp new file mode 100644 index 000000000..35cb34113 --- /dev/null +++ b/src/python/binding/trainer.hpp @@ -0,0 +1,51 @@ +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" +#include +#include "marian.h" + +#include "common/signal_handling.h" +#include "training/graph_group_async.h" +#include "training/graph_group_singleton.h" +#include "training/graph_group_sync.h" +#include "training/training.h" + +#include "3rd_party/ExceptionWithCallStack.h" + + +namespace py = pybind11; +using namespace marian; + + +namespace pymarian { + + + class PyTrainer { + + private: + Ptr options_; + Ptr> trainer_; + + public: + PyTrainer(const std::string& cliString){ + options_ = parseOptions(cliString, cli::mode::training, true); + LOG(info, "Using synchronous SGD"); + trainer_ = New>(options_); + } + + int train() { + //TODO: add options_ override from args to train() + //TODO: read input from args instead of STDIN + + trainer_->run(); + // If we exit due to a graceful exit request via SIGTERM, exit with 128 + SIGTERM, + // as suggested for bash in http://tldp.org/LDP/abs/html/exitcodes.html. This allows parent + // scripts to determine if training terminated naturally or via SIGTERM. + // An alternative would be to exit with code 124, which is what the timeout command + // returns for timeout -s SIGTERM ...., because exiting after SIGTERM + // is not technically a fatal error (which is what the 128+x convention usually + // stands for). + return getSignalFlag(SIGTERM) ? 
128 + SIGTERM : EXIT_SUCCESS; + } + }; + +} \ No newline at end of file diff --git a/src/python/binding/translator.hpp b/src/python/binding/translator.hpp new file mode 100644 index 000000000..97864c3cc --- /dev/null +++ b/src/python/binding/translator.hpp @@ -0,0 +1,69 @@ +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +#include "marian.h" + +#include "common/logging.h" +#include "common/timer.h" +#include "evaluator/evaluator.h" +#include "models/model_task.h" +#include "translator/beam_search.h" +#include "translator/translator.h" + + +namespace py = pybind11; +using namespace marian; + +namespace pymarian { + + class TranslateServicePyWrapper { + private: + Ptr> pImpl_; + + /** + * @brief Convert a pybind11::kwargs object to a YAML string + * + * @param kwargs - the kwargs object from pybind11 + * @return std::string - the YAML string + */ + std::string convertKwargsToYamlString(const py::kwargs& kwargs) { + std::stringstream ss; + if (kwargs) { + for (auto& [key, value] : kwargs) { + // Depythonize the keys + std::string yamlKey = utils::findReplace(key.cast(), "_", "-"); + ss << yamlKey << ": " << value << std::endl; + } + } + return ss.str(); + } + + public: + TranslateServicePyWrapper(const std::string& cliString) + : pImpl_(New>(cliString)) {} + + /** + * @brief Translate a vector of strings + * + * @param inputs - the vector of strings to translate + * @param kwargs - the kwargs object from pybind11 + * @return std::vector - the vector of translated strings + */ + std::vector run(const std::vector& inputs, const py::kwargs& kwargs) { + return this->pImpl_->run(inputs, convertKwargsToYamlString(kwargs)); + } + + /** + * @brief Translate a single string + * + * @param input - the string to translate + * @param kwargs - the kwargs object from pybind11 + * @return std::string - the translated string + */ + std::string run(const std::string& input, const py::kwargs& kwargs) { + return this->pImpl_->run(input, convertKwargsToYamlString(kwargs)); + } + }; + +} + diff --git a/src/python/pymarian/__init__.py b/src/python/pymarian/__init__.py new file mode 100644 index 000000000..f08d00944 --- /dev/null +++ b/src/python/pymarian/__init__.py @@ -0,0 +1,48 @@ +import _pymarian + +from ._version import __version__ +from .utils import kwargs_to_cli + + +class Translator(_pymarian.Translator): + """Python wrapper for Marian Translator""" + + def __init__(self, cli_string='', **kwargs): + """Initializes the translator + :param kwargs: kwargs + """ + cli_string += ' ' + kwargs_to_cli(**kwargs) + super().__init__(cli_string.strip()) + + +class Evaluator(_pymarian.Evaluator): + """Python wrapper for Marian Evaluator""" + + def __init__(self, cli_string='', **kwargs): + """Initializes the evaluator + :param kwargs: kwargs + """ + cli_string += ' ' + kwargs_to_cli(**kwargs) + super().__init__(cli_string.strip()) + + +class Trainer(_pymarian.Trainer): + """Python wrapper for Marian Trainer""" + + def __init__(self, cli_string='', **kwargs): + """Initializes the trainer + :param kwargs: kwargs + """ + cli_string += ' ' + kwargs_to_cli(**kwargs) + super().__init__(cli_string.strip()) + + +class Embedder(_pymarian.Embedder): + """Python wrapper for Marian Embedder""" + + def __init__(self, cli_string='', **kwargs): + """Initializes the embedder + :param kwargs: kwargs + """ + cli_string += ' ' + kwargs_to_cli(**kwargs) + super().__init__(cli_string.stip()) diff --git a/src/python/pymarian/__main__.py b/src/python/pymarian/__main__.py new file mode 100644 index 000000000..e0b68cd65 --- 
/dev/null +++ b/src/python/pymarian/__main__.py @@ -0,0 +1,18 @@ + +import argparse + +from pymarian import __version__ + +def parse_args(): + parser = argparse.ArgumentParser(prog='pymarian', description="Python wrapper for Marian NMT", + epilog='URL: https://github.com/marian-nmt/marian-dev') + parser.add_argument('--version', '-v', action='version', version=__version__) + return parser.parse_args() + +def main(): + args = parse_args() + # prints version for -v/-version option. + # no other options are currently supported. Space left/intended for future use. + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/src/python/pymarian/constants.py b/src/python/pymarian/constants.py new file mode 100644 index 000000000..3d04abbba --- /dev/null +++ b/src/python/pymarian/constants.py @@ -0,0 +1,28 @@ +from pathlib import Path + + +class Defaults: + BASE_URL = "https://textmt.blob.core.windows.net/www/models/mt-metric" + CACHE_PATH = Path.home() / '.cache' / 'marian' / 'metrics' + MINI_BATCH = 16 + MAXI_BATCH = 256 + WORKSPACE = 8000 + AVERAGE = 'skip' + MAX_LENGTH = 512 + FLOAT_PRECISION = 4 + + # NOTE: model names must be lower case for caseless matching + KNOWN_METRICS = { + 'cometoid22-wmt21': "comet-qe", + 'cometoid22-wmt22': "comet-qe", + 'cometoid22-wmt23': "comet-qe", + 'chrfoid-wmt23': "comet-qe", + 'comet20-da-qe': "comet-qe", + 'bleurt20': "bleurt", + 'comet20-da': "comet", + } + + KNOWN_SCHEMA = {'comet-qe': 'src+mt', 'bleurt': 'ref+mt', 'comet': 'src+mt+ref'} + + DEF_MODEL = 'cometoid22-wmt22' + DEF_SCHEMA = KNOWN_METRICS[DEF_MODEL] diff --git a/src/python/pymarian/evaluate.py b/src/python/pymarian/evaluate.py new file mode 100755 index 000000000..be13f3f00 --- /dev/null +++ b/src/python/pymarian/evaluate.py @@ -0,0 +1,344 @@ +#!/usr/bin/env python +# +# This is a python wrapper for marian evaluate command +# +import argparse +import itertools +import logging as log +import shutil +import subprocess +import sys +import threading +from pathlib import Path +from typing import Iterator, List, Optional, Tuple, Union + +from .constants import Defaults +from .utils import get_known_model + +log.basicConfig(level=log.INFO) +DEBUG_MODE = False + + +def copy_lines_to_stdin(proc, lines: Iterator[str]): + """Write data to subproc stdin. Note: run this on another thread to avoid deadlock + This function reads streams, and write them as TSV record to the stdin of the sub process. + :param proc: subprocess object to write to + """ + + for line in lines: + # line = line.rstrip('\n') + '\n' + proc.stdin.write(line) + proc.stdin.flush() + proc.stdin.close() # close stdin to signal end of input + + +def marian_evaluate( + model: Path, + input_lines: Iterator[str], + vocab_file: Path = None, + devices: Optional[List[int]] = None, + width=Defaults.FLOAT_PRECISION, + mini_batch=Defaults.MINI_BATCH, + like=Defaults.DEF_SCHEMA, + maxi_batch=Defaults.MAXI_BATCH, + workspace=Defaults.WORKSPACE, + max_length=Defaults.MAX_LENGTH, + cpu_threads=0, + average: str = Defaults.AVERAGE, + backend='subprocess', +) -> Iterator[Union[float, Tuple[float, float]]]: + """Run 'marian evaluate' as a subprocess or using pymarian, read input and write scores + Depending on the `model` argument, either a single score or a tuple of scores is returned per input line. 
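+    Illustrative sketch of the expected data flow (formats as documented for --stdin in the
+    README): a QE-style metric such as comet-qe consumes tab-separated "src<TAB>mt" records,
+    reference-based metrics add a ref column (see Defaults.KNOWN_SCHEMA), and one score (or
+    tuple of scores) is yielded per record.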
+ :param model: path to model file, or directory containing model.npz.best-embed.npz + :param vocab: path to vocabulary file (optional; if not given, assumed to be in the same directory as the model) + :param devices: list of GPU devices to use (optional; if not given, decision is let to marian process) + :param width: float precision + :param mini_batch: mini-batch size (default: 16) + :param like: marian embedding model like (default: comet-qe) + :param cpu_threads: number of CPU threads to use (default: 0) + :param: average: average segment scores to produce system score. + skip=do not output average (default; segment scores only); + append=append average at the end; + only=output the average only (i.e. system score only) + :param backend: subprocess or pymarian + :return: iterator over scores. + """ + + assert model.exists() + if model.is_dir(): + model_dir = model + _model_files = list(model.glob("*.npz")) + assert len(_model_files) == 1, f'Expected exactly one model file in {model_dir}' + model_file = _model_files[0] + else: + assert model.is_file() + model_dir = model.parent + model_file = model + if not vocab_file: + _vocab_files = list(model_dir.glob('*.spm')) + assert len(_vocab_files) == 1, f'Expected exactly one vocab file in {model_dir}' + vocab_file = _vocab_files[0] + + assert model_file.exists(), f'Model file {model_file} does not exist' + assert vocab_file.exists(), f'Vocab file {vocab_file} does not exist' + + n_inputs = len(Defaults.KNOWN_SCHEMA[like].split('+')) + vocabs = [vocab_file] * n_inputs + kwargs = dict( + model=model_file, + vocabs=vocabs, + devices=devices, + width=width, + like=like, + mini_batch=mini_batch, + maxi_batch=maxi_batch, + max_length=max_length, + max_length_crop=True, + workspace=workspace, # negative memory => relative to total memory + cpu_threads=cpu_threads, + average=average, + ) + if backend == 'pymarian': + # handled separately for pymarian due to minibatching and iterator input + # TODO: remove this when iterator is supported in evaluator C++ API + kwargs['average'] = 'skip' + + cmd_line = [] + for key, val in kwargs.items(): + if val is None: # ignore this key / flag + continue + cmd_line.append(f"--{key.replace('_', '-')}") + if val is True: # boolean flag + cmd_line.append('true') + elif val is False: + cmd_line.append('false') + + elif isinstance(val, (list, tuple)): + cmd_line.extend(str(v) for v in val) + else: + cmd_line.append(str(val)) + if not DEBUG_MODE: + cmd_line.append('--quiet') + if backend == 'subprocess': + return subprocess_evaluate(cmd_line, input_lines) + elif backend == 'pymarian': + cmd_line = ' '.join(cmd_line) + batch_size = mini_batch * maxi_batch + return pymarian_evaluate(cmd_line, input_lines, batch_size=batch_size, average=average) + else: + raise ValueError(f'Unknown backend {backend}') + + +def pymarian_evaluate( + cmd_line: str, input_lines: Iterator[str], average=Defaults.AVERAGE, batch_size=int(Defaults.MINI_BATCH * Defaults.MAXI_BATCH) +): + try: + from pymarian import Evaluator + except: + raise ImportError('pymarian is not installed. 
Please install it and rerun') + + log.info(f'Marian CLI::\n\t{cmd_line}') + evaluator = Evaluator(cmd_line) + assert average in ('skip', 'append', 'only') + lines = (line.rstrip('\n').split('\t') for line in input_lines) + + # NOTE: pymarian doesn't support iterator input yet; so mini batching here + def make_mini_batches(lines, batch_size=batch_size): + assert batch_size > 0 + while True: + chunk = list(itertools.islice(lines, batch_size)) + if not chunk: + return + yield chunk + + total, count = 0.0, 0 + for batch in make_mini_batches(lines): + scores = evaluator.evaluate(batch) + assert len(scores) == len(batch) + for score in scores: + if isinstance(score, (tuple, list)): + score = score[0] + total += score + count += 1 + if average != 'only': # skip or append + yield score + + if average != 'skip': + yield total / count + + +def subprocess_evaluate(cmd_line: List[str], input_lines: Iterator[str]): + assert isinstance(cmd_line, list) + marian_bin_path = shutil.which('marian') + if marian_bin_path is None: + raise FileNotFoundError('marian binary not found in PATH. Please add it and rerun') + cmd_line = [marian_bin_path, 'evaluate'] + cmd_line + + proc = None + try: + proc = subprocess.Popen( + cmd_line, + shell=False, + stdout=subprocess.PIPE, + stdin=subprocess.PIPE, + stderr=sys.stderr, + text=True, + encoding='utf8', + errors='replace', + ) + log.info(f'Running command: {" ".join(cmd_line)}') + copy_thread = threading.Thread(target=copy_lines_to_stdin, args=(proc, input_lines)) + + copy_thread.start() + # read output and yield scores + for line in proc.stdout: + line = line.rstrip() + if ' ' in line: + yield tuple(float(x) for x in line.split(' ')) + else: + yield float(line) + + # wait for copy thread to finish + copy_thread.join() + # proc.stdin.close() + returncode = proc.wait() + if returncode != 0: + raise RuntimeError(f'Process exited with code {returncode}') + finally: + if proc is not None and proc.returncode is None: + log.warning(f'Killing process {proc.pid}') + proc.kill() + + +def parse_args(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument( + '-m', + '--model', + help=f'Model name, or path. Known models={list(Defaults.KNOWN_METRICS.keys())}', + default=Defaults.DEF_MODEL, + type=str, + ) + + parser.add_argument( + '--stdin', + action='store_true', + help='Read input from stdin. TSV file with following format: \ + QE metrics: "srcmt", Comet with ref: "srcref; or BLEURT: "refmt"', + ) + parser.add_argument('-t', '--mt', dest='mt_file', help='MT output file. Ignored when --stdin.', type=Path) + parser.add_argument('-s', '--src', dest='src_file', help='Source file. Ignored when --stdin', type=Path) + parser.add_argument('-r', '--ref', dest='ref_file', help='Ref file. Ignored when --stdin', type=Path) + parser.add_argument( + '-o', '--out', default=sys.stdout, help='output file. Default: stdout', type=argparse.FileType('w') + ) + parser.add_argument( + '-a', + '--average', + choices=('skip', 'append', 'only'), + default='skip', + help='Average segment scores to produce system score.' + ' skip=do not output average (default; segment scores only);' + ' append=append average at the end; ' + ' only=output the average only (i.e. 
system score only)', + ) + + parser.add_argument('-w', '--width', default=4, help='Output score width', type=int) + parser.add_argument('--debug', help='Verbose output', action='store_true') + parser.add_argument('--mini-batch', default=16, help='Mini-batch size', type=int) + group = parser.add_mutually_exclusive_group() + group.add_argument('-d', '--devices', nargs='*', type=int, help='GPU device IDs') + group.add_argument( + '-c', '--cpu-threads', default=None, type=int, help='Use CPU threads. 0=use GPU device 0' + ) + parser.add_argument('-ws', '--workspace', default=8000, help='Workspace memory', type=int) + parser.add_argument( + '--backend', + default='pymarian', + choices=['subprocess', 'pymarian'], + help='Marian backend interface. subprocess=look for marian binary in PATH. pymarian=pybind wrapper', + ) + + args = parser.parse_args() + return vars(args) + + +def read_input(args, model_id, schema=None): + model_schema = Defaults.KNOWN_METRICS.get(model_id, schema or Defaults.DEF_SCHEMA) + input_schema = Defaults.KNOWN_SCHEMA[model_schema] + n_inputs = len(input_schema.split('+')) + if args.pop('stdin'): + del args['mt_file'] + del args['src_file'] + del args['ref_file'] + return sys.stdin + + n_inputs = len(input_schema.split('+')) + mt_file = args.pop('mt_file') + src_file = args.pop('src_file') + ref_file = args.pop('ref_file') + assert mt_file.exists(), f'{mt_file} does not exist' + if 'src' in input_schema: + assert src_file, f'Source file is required for metric {model_id}' + assert src_file.exists(), f'{src_file} does not exist' + if 'ref' in input_schema: + assert ref_file, f'Reference file is required for metric {model_id}' + assert ref_file.exists(), f'{ref_file} does not exist' + if input_schema == 'src+mt': + input_lines = itertools.zip_longest(open(src_file), open(mt_file)) + elif input_schema == 'src+ref+mt': + input_lines = itertools.zip_longest(open(src_file), open(ref_file), open(mt_file)) + elif input_schema == 'src+mt+ref': + input_lines = itertools.zip_longest(open(src_file), open(mt_file), open(ref_file)) + elif input_schema == 'ref+mt': + input_lines = itertools.zip_longest(open(ref_file), open(mt_file)) + else: + raise ValueError(f'Unknown schema {input_schema}') + + def _validate_and_join(): + for row in input_lines: + assert len(row) == n_inputs, f'Expected {n_inputs} columns, but got {len(row)}' + for col in row: + assert col is not None, f'Expected {n_inputs} columns, but got {len(row)}' + yield '\t'.join(row) + + return _validate_and_join() + + +def main(**args): + args = args or parse_args() + if args.pop('debug'): + log.getLogger().setLevel(log.DEBUG) + global DEBUG_MODE + DEBUG_MODE = True + log.debug(args) + + model_id = args.pop('model') + if model_id.lower() in Defaults.KNOWN_METRICS: + model_path, vocab = get_known_model(model_id.lower()) + log.info(f'{model_id} --> {model_path}') + else: + model_path, vocab = Path(model_id), None + assert ( + model_path.exists() + ), f'{model_path} does not exist. 
Known models are {list(Defaults.KNOWN_METRICS.keys())}' + args['model'] = model_path + args['vocab_file'] = vocab + + args['input_lines'] = read_input(args, model_id=model_id) + args['like'] = Defaults.KNOWN_METRICS.get(model_id, Defaults.DEF_SCHEMA) + out = args.pop('out') + width = args.pop('width', Defaults.FLOAT_PRECISION) + scores = marian_evaluate(**args) + for i, score in enumerate(scores, start=1): + if isinstance(score, (tuple, list)): + score = score[0] # the first score + out.write(f'{score:.{width}f}\n') + out.close() + + log.info(f'Wrote {i} lines to {out.name}') + + +if '__main__' == __name__: + main() diff --git a/src/python/pymarian/mtapi_server.py b/src/python/pymarian/mtapi_server.py new file mode 100755 index 000000000..4391a3101 --- /dev/null +++ b/src/python/pymarian/mtapi_server.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +""" +Implements Microsoft's MTAPI (https://docs.microsoft.com/en-us/azure/cognitive-services/translator/quickstart-translator?tabs=python). +""" + +import argparse +import json +import logging as log +from typing import List + +from flask import Flask, request +from sacremoses import MosesPunctNormalizer +from sentence_splitter import SentenceSplitter + +import pymarian + +log.basicConfig(level=log.INFO) + + +class MarianService: + def __init__(self, source_lang: str, target_lang: str, cli_string: str = None): + self.source_lang = source_lang + self.target_lang = target_lang + self.cli_string = cli_string + self._translator = None # lazy init + + self.norm = MosesPunctNormalizer(lang="en") + self.splitter = SentenceSplitter(source_lang) + + @property + def translator(self): + if self._translator is None: + # lazy init + self._translator = pymarian.Translator(self.cli_string) + return self._translator + + def translate(self, text: List[str]) -> List[str]: + """Translates a list of sentences from source to target language.""" + text = self.norm.normalize(text) + input_lines = self.splitter.split(text) + output_lines = self.translator.translate(input_lines) + return " ".join(output_lines) + + +def attach_routes(app: Flask, service: MarianService): + @app.route('/translate', methods=["GET", "POST"]) + def translate(): + request_data = request.get_json() + outputs = [] + for source in request_data: + text = source["text"] + translation = service.translate(text) + outputs.append(translation) + response = [ + {"translations": [{"text": output, "to": service.target_lang} for output in outputs]}, + ] + return json.dumps(response), 200 + + +def parse_args(): + SOURCE_LANG = "en" + TARGET_LANG = "de" + + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--source-lang", "-s", type=str, default=SOURCE_LANG) + parser.add_argument("--target-lang", "-t", type=str, default=TARGET_LANG) + parser.add_argument('args', type=str, help="CLI string for loading marian model") + parser.add_argument("--port", "-p", type=int, default=5000) + return vars(parser.parse_args()) + + +def main(): + app = Flask(__name__) + args = parse_args() + service = MarianService( + source_lang=args["source_lang"], target_lang=args["target_lang"], cli_string=args["args"] + ) + attach_routes(app, service) + app.run(port=args["port"]) + + +if __name__ == '__main__': + main() diff --git a/src/python/pymarian/qtdemo.py b/src/python/pymarian/qtdemo.py new file mode 100644 index 000000000..e95d0bf12 --- /dev/null +++ b/src/python/pymarian/qtdemo.py @@ -0,0 +1,125 @@ +import sys +import time + +from PyQt5.QtGui import * +from 
PyQt5.QtWidgets import * +from sacremoses import MosesPunctNormalizer, MosesTokenizer +from sentence_splitter import SentenceSplitter + +import pymarian + + +class Example(QWidget): + def __init__(self): + super().__init__() + + self.cache = dict() + self.norm = MosesPunctNormalizer(lang="en") + self.tok = MosesTokenizer(lang="en") + self.splitter = SentenceSplitter("en") + + self.setWindowTitle("Live Translator") + self.setFont(QFont(self.font().family(), 11)) + # setting the geometry of window + self.setGeometry(300, 300, 1200, 800) + + # centering + qtRectangle = self.frameGeometry() + centerPoint = QDesktopWidget().availableGeometry().center() + qtRectangle.moveCenter(centerPoint) + self.move(qtRectangle.topLeft()) + + self.marian = None + + self.input = QPlainTextEdit(self) + self.input.textChanged.connect(self.onChanged) + self.output = QPlainTextEdit(self) + + hbox = QHBoxLayout() + self.cli = QLineEdit(self) + self.cli.setText( + "-c models/enu.deu.yml --cpu-threads 8 -b1 --mini-batch-words 256 --maxi-batch 100 --maxi-batch-sort src" + ) + + self.reload = QPushButton("Reload") + self.reload.clicked.connect(self.onClicked) + self.run = QPushButton("Translate") + self.run.clicked.connect(self.onChanged) + + hbox.addWidget(self.cli) + hbox.addWidget(self.reload) + hbox.addWidget(self.run) + + layout = QVBoxLayout() + layout.addLayout(hbox) + hbox2 = QHBoxLayout() + hbox2.addWidget(self.input) + hbox2.addWidget(self.output) + layout.addLayout(hbox2) + + self.statusBar = QStatusBar() + layout.addWidget(self.statusBar) + + self.setLayout(layout) + + self.reloadMarian() + self.show() + + def onChanged(self): + inputText = self.input.toPlainText() + if not self.current: + self.reloadMarian() + if self.current: + outputText = self.translate(inputText) + self.output.setPlainText(outputText) + + def onClicked(self): + self.reloadMarian() + + def reloadMarian(self): + command = self.cli.text() + print(command) + self.cache = dict() # clean instead of caching + if command not in self.cache: + self.cache[command] = dict() + self.cache[command]["#MODEL#"] = pymarian.Translator(command) + self.current = self.cache[command] + + def translate(self, inputText): + t0 = time.perf_counter() + + inputLines = [self.splitter.split(p) for p in inputText.split("\n")] + + unseenLines = [] + for paragraph in inputLines: + for line in paragraph: + if line not in self.current: + unseenLines.append(line) + + normLines = [self.norm.normalize(c) for c in unseenLines] + + t1 = time.perf_counter() + outputLines = self.current["#MODEL#"].translate(normLines) + t2 = time.perf_counter() + + totalStat = sum([len(self.tok.tokenize(line)) for line in unseenLines]) + + if totalStat: + self.statusBar.showMessage( + f"Translated {totalStat} tokens ({len(unseenLines)} lines) in {t2 - t1:.2f} second ({totalStat / (t2 - t1):.2f} tokens per second). Preprocessing took {t1 - t0:.2f} seconds. 
Total: {t2 - t0:.2f} seconds" + ) + + for src, trg in zip(unseenLines, outputLines): + self.current[src] = trg + + return "\n".join([" ".join([self.current[src] for src in paragraph]) for paragraph in inputLines]) + + +def main(): + app = QApplication(sys.argv) + ex = Example() + sys.exit(app.exec_()) + + +if __name__ == '__main__': + main() diff --git a/src/python/pymarian/utils.py b/src/python/pymarian/utils.py new file mode 100644 index 000000000..16e2e3c22 --- /dev/null +++ b/src/python/pymarian/utils.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python +# +# This is a python wrapper for marian evaluate command +# created by Thamme Gowda on 2023-09-07 +# + +import logging as log +import shutil +from pathlib import Path + +import requests +from tqdm.auto import tqdm + +from .constants import Defaults + +log.basicConfig(level=log.INFO) +DEBUG_MODE = False + + +def get_known_model(model_name): + """Given a known model name, this functin gets the checkpoint and vocabulary paths. + This function downloads and extracts model files to a local cache directory if necessary. + + Specifically, checkpoint file must have model*.npz and vocab*.spm files in the resolved model directory. + :param model_name: model name + :return: checkpoint path, vocabulary path + """ + assert model_name in Defaults.KNOWN_METRICS, f'Unknown model {model_name}' + + model_url = f'{Defaults.BASE_URL}/{model_name}.tgz' + local_file = Defaults.CACHE_PATH / f'{model_name}.tgz' + local_dir = Defaults.CACHE_PATH / model_name + maybe_download_file(model_url, local_file) + maybe_extract(local_file, local_dir) + checkpt_file = list(local_dir.glob('model*.npz')) + vocab_file = list(local_dir.glob('vocab*.spm')) + assert len(checkpt_file) == 1, f'Expected exactly one model file in {local_dir}' + assert len(vocab_file) == 1, f'Expected exactly one vocab file in {local_dir}' + checkpt_file = checkpt_file[0] + vocab_file = vocab_file[0] + return checkpt_file, vocab_file + + +def maybe_download_file(url, local_file: Path): + """Downloads the file if not already downloaded + :param url: url to download + :param local_file: local file path + """ + flag_file = local_file.with_name(local_file.name + '._OK') + if local_file.exists() and flag_file.exists(): + log.info(f'Using cached file {local_file}') + return + log.info(f'Downloading {url} to {local_file}') + local_file.parent.mkdir(parents=True, exist_ok=True) + with requests.get(url, stream=True) as r: + r.raise_for_status() + file_size = int(r.headers.get('Content-Length', 0)) + with tqdm.wrapattr(r.raw, "read", total=file_size, desc='Downloading', dynamic_ncols=True) as r_raw: + with open(local_file, "wb") as f: + shutil.copyfileobj(r_raw, f) + flag_file.touch() + + +def maybe_extract(archive: Path, outdir: Path) -> Path: + """Extracts the archive to outdir if not already extracted + :param archive: path to archive file + :param outdir: output directory + :return: output directory path + """ + assert archive.exists(), f'{archive} does not exist' + flag_file = outdir / '._EXTRACT_OK' + if not outdir.exists() or not flag_file.exists(): + shutil.rmtree(outdir, ignore_errors=True) + log.info(f'Extracting {archive} to {outdir}') + # assumption: root dir in tar matches model name + shutil.unpack_archive(archive, outdir.parent) + flag_file.touch() + return outdir + + +def kwargs_to_cli(**kwargs) -> str: + """Converts kwargs to cli args + :param kwargs: kwargs + :return: cli args + """ + args = [] + for k, v in kwargs.items(): + if v is None: + continue # ignore keys if values are None + k = 
k.replace('_', '-') + args.append(f'--{k}') + if v is '': + continue # only add keys for empty values + elif isinstance(v, bool): + args.append("true" if v else "false") + elif isinstance(v, (list, tuple)): + args.extend(str(x) for x in v) + else: + args.append(f'{v}') + + return ' '.join(args) diff --git a/src/python/pyproject.toml b/src/python/pyproject.toml new file mode 100644 index 000000000..a9cf413a7 --- /dev/null +++ b/src/python/pyproject.toml @@ -0,0 +1,63 @@ +[build-system] +requires = ["setuptools >= 61.0", "pip >= 23.0"] #NOTE: we had troubles with pip v22; it set name as UNKNOWN +build-backend = "setuptools.build_meta" + +[tool.setuptools.packages.find] +# where = ["."] # ["."] by default +include = ["pymarian*"] # ["*"] by default +# exclude = ["pymarian.tests*"] # empty by default +namespaces = true # true by default + + +[project] +name = "pymarian" +dynamic = ["version"] # see [tool.setuptools.dynamic] below +description = "Pymarian" +readme = "README.md" +authors = [ + { name = "Marian Developers", email = "noreply@email.com" }, +] +requires-python = ">=3.7" +classifiers = [ + "Development Status :: 4 - Beta", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", +] + +dependencies = [ + "tqdm", + "requests" +] + +[project.scripts] +pymarian-evaluate = "pymarian.evaluate:main" +pymarian-qtdemo = "pymarian.qtdemo:main" +pymarian-mtapi = "pymarian.mtapi_server:main" + +[project.optional-dependencies] +test = ["pytest"] +demos = [ + "flask", + "sacremoses", + "pyqt5", + "sentence-splitter@git+https://github.com/mediacloud/sentence-splitter", +] + +[tool.setuptools] +include-package-data = true + +[tool.black] +line-length = 110 +target-version = ['py37', 'py38', 'py39', 'py310', 'py311'] +include = 'src/python/.*\.pyi?$' +skip-string-normalization = true + +# black doesn't sort imports. So we use isort for that. See discussion https://github.com/psf/black/issues/333 +[tool.isort] +profile = "black" +src_paths = ["src/python"] +# isort --check --diff src/python/ \ No newline at end of file diff --git a/src/python/setup.py b/src/python/setup.py new file mode 100644 index 000000000..01d3a0f5f --- /dev/null +++ b/src/python/setup.py @@ -0,0 +1,102 @@ +import os +import platform +import shutil +import sys + +from pathlib import Path +from setuptools import setup, find_namespace_packages, Distribution + +""" +This script expects _pymarian.*.so to be present in $CMAKE_BINARY_DIR + +NOTE: Most of (static) metadata is set in pyproject.toml. +# This setup.py is for specifying dynamic aspect of build. 
All static metadata is in pyproject.toml +""" + +DEF_CMAKE_BINARY_DIR = (Path(__file__).parent / '../../build').resolve() +CMAKE_BINARY_DIR = os.getenv("CMAKE_BINARY_DIR", DEF_CMAKE_BINARY_DIR) +print("\t>>>CMAKE_BINARY_DIR is ", CMAKE_BINARY_DIR) + +if platform.system() == 'Windows': + NATIVE_EXT_GLOB = '_pymarian.*.pyd' +elif platform.system() == 'Darwin': + NATIVE_EXT_GLOB = '_pymarian.*.dylib' +else: + NATIVE_EXT_GLOB = '_pymarian.*.so' + + +def get_version(cuda_version=None) -> str: + vfile = Path(__file__).parent / '../../VERSION' + if not vfile.exists() and "CMAKE_SOURCE_DIR" in os.environ: + # some build tools may copy src/python into a temporary directory, which disconnects it from the source tree + # using CMAKE_SOURCE_DIR to find the source tree + vfile = Path(os.environ["CMAKE_SOURCE_DIR"]) / 'VERSION' + try: + assert vfile.exists(), f'Version file {vfile.resolve()} does not exist' + version = vfile.read_text().strip().lstrip("v") # gets rid of 'v' prefix in v1.17.5 etc. + except: + # FIXME: This is a hack. We need to read version from VERSION file + version = '0.0.0' + print( + f'WARNING: Could not read version from {vfile.resolve()}. Setting version to {version}', + file=sys.stderr, + ) + version = version if not cuda_version else f"{version}+cu{cuda_version.replace('.', '')}" + + print("\t>>>Marian version is ", version) + # we need to write version to _version.py file, so that it can be imported from python + vfile_lines = [ + "# File generated by setuptools; it will be overwritten on every build", + "# Hence, do not edit or track this file in version control", + f"__version__ = '{version}'", + "__cuda_version__ = %s" % ("None" if not cuda_version else f"'{cuda_version}'"), + ] + + vfile_py = Path(__file__).parent / 'pymarian' / '_version.py' + vfile_py.write_text("\n".join(vfile_lines)) + return version + + +def get_native_ext() -> Path: + + native_exts = list(Path(CMAKE_BINARY_DIR).glob(f'src/{NATIVE_EXT_GLOB}')) + if not native_exts: + raise Exception( + f'No native extension found; Looked at {CMAKE_BINARY_DIR}/src/{NATIVE_EXT_GLOB}. \ + Please run cmake build first with -DPYMARIAN=ON or set CMAKE_BINARY_DIR to the build dir' + ) + elif len(native_exts) >= 2: + raise Exception(f'Only one native extension expected, but found: {native_exts}') + + native_ext = native_exts[0] + # Pip does not allow inclusion of files from parent dir our outside of package context (for security reasons). 
+ # So, we copy the native extension to the package directory + native_ext_local = Path(__file__).parent / native_ext.name + print(f"\t>>>Found native extension at: {native_ext}") + print(f"\t >>>Making it available under scope at: {native_ext_local}") + if native_ext_local.exists(): + native_ext_local.unlink() + shutil.copy(native_ext, native_ext_local) + return native_ext_local + + +version = get_version(os.getenv("CUDA_VERSION", default="")) +native_ext = get_native_ext() + + +# Thanks to https://stackoverflow.com/a/62668026/1506477 +class BinaryDistribution(Distribution): + """Distribution which always forces a binary package with platform name""" + + def has_ext_modules(foo): + return True + + +setup( + version=version, + package_dir={"pymarian": "pymarian"}, + packages=find_namespace_packages(where=".", exclude=["tests", "binding"]), + include_package_data=True, + package_data={"": [str(native_ext)]}, + distclass=BinaryDistribution, +) diff --git a/src/python/tests/__init__.py b/src/python/tests/__init__.py new file mode 100644 index 000000000..4fa4672c7 --- /dev/null +++ b/src/python/tests/__init__.py @@ -0,0 +1,15 @@ +import os + +QUIET = os.getenv('MARIAN_QUIET', "").lower() in ("1", "yes", "y", "true", "on") +CPU_THREADS = int(os.getenv('MARIAN_CPU_THREADS', "4")) +WORKSPACE_MEMORY = int(os.getenv('MARIAN_WORKSPACE_MEMORY', "6000")) + +EPSILON = 0.0001 # the precision error we afford in float comparison + +BASE_ARGS = dict( + mini_batch=8, + maxi_batch=64, + cpu_threads=CPU_THREADS, + workspace=WORKSPACE_MEMORY, + quiet=QUIET, +) diff --git a/src/python/tests/test_evaluate.py b/src/python/tests/test_evaluate.py new file mode 100644 index 000000000..d79462901 --- /dev/null +++ b/src/python/tests/test_evaluate.py @@ -0,0 +1,148 @@ +""" +# silense marian log +export MARIAN_QUIET=yes + +# run all tests in this file + pytest -v src/python/tests/test_evaluate.py + pytest -vx src/python/tests/test_evaluate.py #stop on first failure + +# run a single test: + pytest -v src/python/tests/test_evaluate.py -k test_evaluator_chrfoid + pytest -vs src/python/tests/test_evaluate.py -k test_evaluator_chrfoid # see stdout and stderr +""" +import os + +from pymarian import Evaluator +from pymarian.utils import get_known_model + +from . 
import BASE_ARGS + +EPSILON = 0.0001 # the precision error we afford in float comparison + + +# dummy sentences for testing +SAMPLE_SRC_HYP = [ + ["This is a test", "This is a test A"], + ["This is a test B", "This is a test C"], + ["This is a test D", "This is a test E"], +] +SAMPLE_REF_HYP = SAMPLE_SRC_HYP # same for now +SAMPLE_SRC_HYP_REF = [ + ["This is a test", "This is a test A", "This is a test AA"], + ["This is a test B", "This is a test C", "This is a test CC"], + ["This is a test D", "This is a test E", "This is a test EE"], +] + + +def test_evaluator_chrfoid(): + model_path, vocab_path = get_known_model("chrfoid-wmt23") + args = BASE_ARGS | dict( + like="comet-qe", + model=model_path, + vocabs=[vocab_path, vocab_path], + ) + # args = dict(help='') # to get help message with all args + eval = Evaluator(**args) + data = SAMPLE_SRC_HYP + expected_scores = [0.0548, 0.0797, 0.0988] + + scores = eval.evaluate(data) + assert len(scores) == len(data) + for score, expected_score in zip(scores, expected_scores): + if isinstance(score, list): + score = score[0] + assert abs(score - expected_score) < EPSILON + + +def test_evaluator_cometoid22_wmt22(): + model_path, vocab_path = get_known_model("cometoid22-wmt22") + args = BASE_ARGS | dict( + like="comet-qe", + model=model_path, + vocabs=[vocab_path, vocab_path], + ) + # args = dict(help='') # to get help message with all args + eval = Evaluator(**args) + data = SAMPLE_SRC_HYP + expected_scores = [0.71845, 0.7906, 0.81549] + + scores = eval.evaluate(data) + assert len(scores) == len(data) + + for score, expected_score in zip(scores, expected_scores): + if isinstance(score, list): + score = score[0] + assert abs(score - expected_score) < EPSILON + + +def test_evaluator_cometoid22_wmt23(): + model_path, vocab_path = get_known_model("cometoid22-wmt23") + args = BASE_ARGS | dict( + like="comet-qe", + model=model_path, + vocabs=[vocab_path, vocab_path], + ) + eval = Evaluator(**args) + data = SAMPLE_SRC_HYP + expected_scores = [0.75715, 0.81395, 0.8361] + + scores = eval.evaluate(data) + assert len(scores) == len(data) + for score, expected_score in zip(scores, expected_scores): + if isinstance(score, list): + score = score[0] + assert abs(score - expected_score) < EPSILON + + +def test_evaluator_bleurt(): + model_path, vocab_path = get_known_model("bleurt20") + args = BASE_ARGS | dict( + like="bleurt", + model=model_path, + vocabs=[vocab_path, vocab_path], + ) + + eval = Evaluator(**args) + data = SAMPLE_REF_HYP + scores = eval.evaluate(data) + expected_scores = [0.30929, 0.3027, 0.3113] + assert len(scores) == len(data) + for score, expected_score in zip(scores, expected_scores): + if isinstance(score, list): + score = score[0] + assert abs(score - expected_score) < EPSILON + + +# TODO: These below tests are failing + + +def test_evaluator_comet20qe(): + model_path, vocab_path = get_known_model("comet20-da-qe") + args = BASE_ARGS | dict( + like="comet-qe", + model=model_path, + vocabs=[vocab_path, vocab_path], + ) + + eval = Evaluator(**args) + data = SAMPLE_SRC_HYP + scores = eval.evaluate(data) + assert len(scores) == len(data) + # TODO: add expected scores and asserts + + +def test_evaluator_comet20ref(): + model_path, vocab_path = get_known_model("comet20-da") + args = BASE_ARGS | dict( + like="comet", + model=model_path, + vocabs=[vocab_path, vocab_path], + ) + + eval = Evaluator(**args) + data = SAMPLE_SRC_HYP_REF + scores = eval.evaluate(data) + len(scores) == len(data) + + +# TODO: add expected scores and asserts diff --git 
a/src/python/tests/test_train.py b/src/python/tests/test_train.py new file mode 100644 index 000000000..543e45db5 --- /dev/null +++ b/src/python/tests/test_train.py @@ -0,0 +1,142 @@ +import tarfile +import tempfile +import urllib.request +from pathlib import Path + +from pymarian import Trainer +from pymarian.utils import get_known_model + +QUIET = False + +TMP_DATA_DIR = Path.home() / 'tmp' / 'marian-tests' +DATA_URL = "https://textmt.blob.core.windows.net/www/data/marian-tests-data.tgz" + + +def setup(): + ok_file = TMP_DATA_DIR / '_OK' + if not TMP_DATA_DIR.exists() or not ok_file.exists(): + TMP_DATA_DIR.mkdir(parents=True, exist_ok=True) + + print("Downloading data package...") + with urllib.request.urlopen(DATA_URL) as response: + with tarfile.open(fileobj=response, mode="r|gz") as tar: + tar.extractall(path=TMP_DATA_DIR) + ok_file.touch() + print("Done.") + + +setup() + + +def test_train_comet_qe(): + data_dir = TMP_DATA_DIR / 'marian-tests-data/deu-eng' + vocab_file = data_dir / 'vocab.8k.spm' + classe_file = data_dir / 'classes4f.txt' + train_file = data_dir / 'sample.5k.chrfoid-deu-eng.tsv' + # pretrained_model, vocab_file = get_known_model("chrfoid-wmt23") + assert classe_file.exists() + assert vocab_file.exists() + assert train_file.exists() + + args = { + 'dim_emb': 128, + 'enc_depth': 3, + 'dec_depth': 1, + 'tied_embeddings_all': True, + 'transformer_heads': 2, + 'transformer_dim_ffn': 256, + 'transformer_ffn_activation': 'relu', + 'transformer_dropout': 0.1, + 'cost_type': 'ce-mean', + 'max_length': 80, + 'mini_batch_fit': False, + 'maxi_batch': 256, + 'optimizer_params': [0.9, 0.98, '1e-09'], + 'sync_sgd': True, + 'learn_rate': 0.0003, + 'lr_decay_inv_sqrt': [16000], + 'lr_warmup': 16000, + 'label_smoothing': 0.1, + 'clip_norm': 0, + 'exponential_smoothing': 0.0001, + 'early_stopping': 2, + 'keep_best': True, + 'beam_size': 2, + 'normalize': 1, + 'valid_metrics': ['perplexity'], + 'valid_mini_batch': 16, + 'mini_batch': 8, + 'after': '400u', + 'valid_freq': '200u', + 'disp_freq': 100, + 'disp_first': 4, + 'save_freq': '200u', + 'quiet': QUIET, + #'like': 'comet-qe', # only supported at inference; for training, see task and input_types + 'task': 'comet-qe', + 'input_types': ['class', 'sequence', 'sequence'], # required for training + #'pretrained_model': pretrained_model, # for finetuning; not using it because its too big for tests + 'train_sets': [train_file], # TSV file having 3 columns: class sequence sequence + 'tsv': True, + 'tsv-fields': 3, # or it will complain that vocabs and train_sets should be one to one map + 'vocabs': [classe_file, vocab_file, vocab_file], # class sequence sequence + } + with tempfile.TemporaryDirectory() as tmpdir: + save_at = tmpdir + '/model.npz' + trainer = Trainer(model=save_at, **args) + trainer.train() + + +def test_train_transformer_nmt(): + data_dir = TMP_DATA_DIR / 'marian-tests-data/deu-eng' + vocab_file = data_dir / 'vocab.8k.spm' + train_prefix = str(data_dir / 'sample.5k') + src_lang = "deu" + tgt_lang = "eng" + train_src = train_prefix + "." + src_lang + train_tgt = train_prefix + "." 
+ tgt_lang + + # these are taken from regression-tests repo and simplified + args = { + 'type': 'transformer', + 'dim_emb': 128, + 'enc_depth': 3, + 'dec_depth': 1, + 'tied_embeddings_all': True, + 'transformer_heads': 2, + 'transformer_dim_ffn': 256, + 'transformer_ffn_activation': 'relu', + 'transformer_dropout': 0.1, + 'cost_type': 'ce-mean-words', + 'max_length': 80, + 'mini_batch_fit': False, + 'maxi_batch': 256, + 'optimizer_params': [0.9, 0.98, '1e-09'], + 'sync_sgd': True, + 'learn_rate': 0.0003, + 'lr_decay_inv_sqrt': [16000], + 'lr_warmup': 16000, + 'label_smoothing': 0.1, + 'clip_norm': 0, + 'exponential_smoothing': 0.0001, + 'early_stopping': 2, + 'keep_best': True, + 'beam_size': 2, + 'normalize': 1, + 'valid_metrics': ['ce-mean-words', 'bleu', 'perplexity'], + 'valid_mini_batch': 16, + 'mini_batch': 8, + 'after': '400u', # stop after 500 updates + 'valid_freq': '200u', # validate every 250 updates + 'disp_freq': 100, + 'disp_first': 4, + 'save_freq': '200u', + 'vocabs': [vocab_file, vocab_file], + 'train_sets': [train_src, train_tgt], + 'quiet': QUIET, + } + + with tempfile.TemporaryDirectory() as tmpdir: + save_at = tmpdir + '/model.npz' + trainer = Trainer(model=save_at, **args) + trainer.train() diff --git a/src/python/tests/test_translate.py b/src/python/tests/test_translate.py new file mode 100644 index 000000000..0ad5adc60 --- /dev/null +++ b/src/python/tests/test_translate.py @@ -0,0 +1,16 @@ +from pathlib import Path + +from pymarian import Translator + +from . import BASE_ARGS + + +def test_ende(): + # TODO: download model from blob storage + model_dir = Path.home() / 'tmp/marian-eng-deu' + model_file = str(model_dir / 'model.bin') + vocab_file = str(model_dir / 'vocab.spm') + args = BASE_ARGS | dict(models=model_file, vocabs=[vocab_file, vocab_file]) + translator = Translator(**args) + hyp = translator.translate("Hello. Good morning.") + assert hyp == "Hallo. Guten Morgen." diff --git a/src/translator/translator.h b/src/translator/translator.h index 081b06c42..b15683867 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -229,13 +229,17 @@ class TranslateService : public ModelServiceTask { Ptr trgVocab_; Ptr shortlistGenerator_; - std::vector> modelFiles_; + std::vector> modelWeights_; size_t numDevices_; + std::vector> model_items_; // non-mmap public: virtual ~TranslateService() {} + TranslateService(const std::string& cliString) + : TranslateService(parseOptions(cliString, cli::mode::translation, /*validate=*/true)) {} + TranslateService(Ptr options) : options_(options->clone()) { // initialize vocabs @@ -255,7 +259,7 @@ class TranslateService : public ModelServiceTask { trgVocab_->load(vocabPaths.back()); auto srcVocab = srcVocabs_.front(); - std::vector lshOpts = options_->get>("output-approx-knn"); + std::vector lshOpts = options_->get>("output-approx-knn", {}); ABORT_IF(lshOpts.size() != 0 && lshOpts.size() != 2, "--output-approx-knn takes 2 parameters"); // load lexical shortlist @@ -267,47 +271,74 @@ class TranslateService : public ModelServiceTask { auto devices = Config::getDevices(options_); numDevices_ = devices.size(); + ThreadPool threadPool(numDevices_, numDevices_); + scorers_.resize(numDevices_); + graphs_.resize(numDevices_); + + bool mmap = options_->get("model-mmap", false); + auto mmapMode = mmap ? 
io::MmapMode::RequiredMmap : io::MmapMode::OpportunisticMmap; + // preload models - auto models = options->get>("models"); - for(auto model : models) { - modelFiles_.push_back(New(model)); - } + auto modelPaths = options->get>("models"); + for(auto modelPath : modelPaths) + modelWeights_.push_back(New(modelPath, mmapMode)); // initialize scorers + size_t id = 0; for(auto device : devices) { - auto graph = New(true); - - auto precison = options_->get>("precision", {"float32"}); - graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph - graph->setDevice(device); - if (device.type == DeviceType::cpu) { - graph->getBackend()->setOptimized(options_->get("optimize")); - graph->getBackend()->setGemmType(options_->get("gemm-type")); - graph->getBackend()->setQuantizeRange(options_->get("quantize-range")); - } - graph->reserveWorkspaceMB(options_->get("workspace")); - graphs_.push_back(graph); - - auto scorers = createScorers(options_, modelFiles_); - for(auto scorer : scorers) { - scorer->init(graph); - if(shortlistGenerator_) - scorer->setShortlistGenerator(shortlistGenerator_); - } - scorers_.push_back(scorers); + auto task = [&](DeviceId device, size_t id) { + auto graph = New(true); + + auto precison = options_->get>("precision", {"float32"}); + graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph + graph->setDevice(device); + if (device.type == DeviceType::cpu) { + graph->getBackend()->setOptimized(options_->get("optimize")); + graph->getBackend()->setGemmType(options_->get("gemm-type")); + graph->getBackend()->setQuantizeRange(options_->get("quantize-range")); + } + graph->reserveWorkspaceMB(options_->get("workspace")); + graphs_[id] = graph; + + auto scorers = createScorers(options_, modelWeights_); + for(auto scorer : scorers) { + scorer->init(graph); + if(shortlistGenerator_) + scorer->setShortlistGenerator(shortlistGenerator_); + } + + scorers_[id] = scorers; + graph->forward(); + }; + + threadPool.enqueue(task, device, id++); } } - std::string run(const std::string& input) override { + std::vector run(const std::vector& inputs, const std::string& yamlOverridesStr="") override { + auto input = utils::join(inputs, "\n"); + auto translations = run(input, yamlOverridesStr); + return utils::split(translations, "\n", /*keepEmpty=*/true); + } + + std::string run(const std::string& input, const std::string& yamlOverridesStr="") override { + YAML::Node configOverrides = YAML::Load(yamlOverridesStr); + + auto currentOptions = New(options_->clone()); + if (!configOverrides.IsNull()) { + LOG(info, "Overriding options:\n {}", configOverrides); + currentOptions->merge(configOverrides, /*overwrite=*/true); + } + // split tab-separated input into fields if necessary - auto inputs = options_->get("tsv", false) - ? convertTsvToLists(input, options_->get("tsv-fields", 1)) + auto inputs = currentOptions->get("tsv", false) + ? 
convertTsvToLists(input, currentOptions->get("tsv-fields", 1)) : std::vector({input}); - auto corpus_ = New(inputs, srcVocabs_, options_); - data::BatchGenerator batchGenerator(corpus_, options_, nullptr, /*runAsync=*/false); + auto corpus_ = New(inputs, srcVocabs_, currentOptions); + data::BatchGenerator batchGenerator(corpus_, currentOptions, nullptr, /*runAsync=*/false); - auto collector = New(options_->get("quiet-translation", false)); - auto printer = New(options_, trgVocab_); + auto collector = New(currentOptions->get("quiet-translation", false)); + auto printer = New(currentOptions, trgVocab_); size_t batchId = 0; batchGenerator.prepare(); @@ -325,7 +356,7 @@ class TranslateService : public ModelServiceTask { scorers = scorers_[id % numDevices_]; } - auto search = New(options_, scorers, trgVocab_); + auto search = New(currentOptions, scorers, trgVocab_); auto histories = search->search(graph, batch); for(auto history : histories) { @@ -341,7 +372,7 @@ class TranslateService : public ModelServiceTask { } } - auto translations = collector->collect(options_->get("n-best")); + auto translations = collector->collect(currentOptions->get("n-best")); return utils::join(translations, "\n"); } From 5e6e1a04c25e18852d35932bf96f4d68d6a0ec0b Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 5 Feb 2024 22:47:19 +0000 Subject: [PATCH 12/26] Merged PR 32806: Various small changes and fixes to pybindings and pymarian-evaluate This PR add minor fixes to pybindings and pymarian-evaluate: * comet2marian.py script correctly handles the wmt23-cometkiwi-da-xl/xxl models. * pymarian-evaluate now correctly computes scores * evaluator now exposes an interface function to read the model config --- CMakeLists.txt | 28 ++++++++-------- VERSION | 2 +- scripts/comet/comet2marian.py | 34 +++++++++++-------- src/common/config.cpp | 2 +- src/data/text_input.cpp | 11 +++++-- src/data/text_input.h | 56 +++++++++++++++++++++++++++----- src/evaluator/evaluator.h | 17 +++++++--- src/python/binding/bind.cpp | 5 +-- src/python/binding/evaluator.hpp | 22 ++++++++----- src/python/pymarian/evaluate.py | 6 ++++ src/python/pyproject.toml | 5 +-- 11 files changed, 130 insertions(+), 58 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0ebe2b819..7c9ccc424 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -496,20 +496,22 @@ if(USE_STATIC_LIBS) endif() ############################################################################### -# Find Tcmalloc_minimal +# Find Tcmalloc_minimal # re-used from sentencepiece -if(NOT WIN32) - if(USE_STATIC_LIBS) - find_library(TCMALLOC_LIB NAMES libtcmalloc_minimal.a) - else() - find_library(TCMALLOC_LIB NAMES tcmalloc_minimal) - endif() - if (TCMALLOC_LIB) - message(STATUS "Found TCMalloc: ${TCMALLOC_LIB}") - set(EXT_LIBS ${EXT_LIBS} ${Tcmalloc_LIBRARIES}) - add_definitions(-fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free) - else() - message(STATUS "Not Found TCMalloc: ${TCMALLOC_LIB}") +if(USE_TCMALLOC) + if(NOT WIN32) + if(USE_STATIC_LIBS) + find_library(TCMALLOC_LIB NAMES libtcmalloc_minimal.a) + else() + find_library(TCMALLOC_LIB NAMES tcmalloc_minimal) + endif() + if (TCMALLOC_LIB) + message(STATUS "Found TCMalloc: ${TCMALLOC_LIB}") + set(EXT_LIBS ${EXT_LIBS} ${Tcmalloc_LIBRARIES}) + add_definitions(-fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free) + else() + message(STATUS "Not Found TCMalloc: ${TCMALLOC_LIB}") + endif() endif() endif() diff --git a/VERSION b/VERSION index cddff7b16..8b8e7fdd6 100644 
--- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.21 +v1.12.22 diff --git a/scripts/comet/comet2marian.py b/scripts/comet/comet2marian.py index 09c369260..68912befd 100755 --- a/scripts/comet/comet2marian.py +++ b/scripts/comet/comet2marian.py @@ -13,9 +13,11 @@ # from comet.models import available_metrics # supported_comets = [m for m in available_metrics if 'qe' in m.lower()] supported_comets = [ - 'wmt20-comet-qe-da', 'wmt20-comet-qe-da-v2', 'wmt21-comet-qe-mqm', 'wmt21-comet-qe-da', - 'wmt20-comet-da', 'wmt21-comet-da', 'Unbabel/wmt22-comet-da', 'Unbabel/wmt22-cometkiwi-da', - 'Unbabel/XCOMET-XL', 'Unbabel/XCOMET-XXL' + 'wmt20-comet-qe-da', 'wmt20-comet-qe-da-v2', 'wmt20-comet-da', + 'wmt21-comet-qe-mqm', 'wmt21-comet-qe-da', 'wmt21-comet-da', + 'Unbabel/wmt22-comet-da', 'Unbabel/wmt22-cometkiwi-da', + 'Unbabel/XCOMET-XL', 'Unbabel/XCOMET-XXL', + 'Unbabel/wmt23-cometkiwi-da-xl', 'Unbabel/wmt23-cometkiwi-da-xxl' ] log.basicConfig(level=log.INFO) @@ -87,18 +89,22 @@ def load_comet_model(model_path): config = dict() model_type = type(cometModel).__name__ +print("COMET model params:", cometModel.hparams, file=sys.stderr) + +# are we using the xml-roberta-xl or xml-roberta-xxl model? +isXlmXL = any(pre in cometModel.hparams.get("pretrained_model") for pre in ["xlm-roberta-xl", "xlm-roberta-xxl"]) + if model_type == "RegressionMetric": config["type"] = "comet" elif model_type == "ReferencelessRegression": config["type"] = "comet-qe" elif model_type == "XLMRobertaModel": config["type"] = "comet-qe" -elif model_type == "UnifiedMetric" or model_type == "XCOMETMetric": +elif model_type == "UnifiedMetric" or isXlmXL: config["type"] = "comet-unified" config["input-join-fields"] = True config["separator-symbol"] = "" config["comet-use-separator"] = True - config["comet-pool"] = "cls" else: raise Exception(f'Unknown type of model {model_type}') @@ -109,7 +115,7 @@ def load_comet_model(model_path): config["transformer-train-position-embeddings"] = True # Roberta-XXL (hence XCOMET-XXL) has pre-norm -if model_type == "XCOMETMetric": # @TODO: make this depend on RobertaXL/XXL rather than model_type +if isXlmXL: config["transformer-preprocess"] = "n" config["transformer-postprocess"] = "da" config["transformer-postprocess-emb"] = "" @@ -123,15 +129,16 @@ def load_comet_model(model_path): config["bert-type-vocab-size"] = 0 config["comet-prepend-zero"] = True -print(cometModel.hparams) - +config["comet-pool"] = cometModel.hparams.get("pool", "cls") config["comet-mix"] = cometModel.hparams.get("layer") == "mix" config["comet-mix-norm"] = cometModel.hparams.get('layer_norm', False) config["comet-mix-transformation"] = cometModel.hparams.get("layer_transformation", "softmax"); -# they have a bug in their code that makes this always true -if model_type == "UnifiedMetric" or model_type == "XCOMETMetric": +# there are several issues in their code that make the following always true regardless of values in hparams +# that was hard to find out +if model_type == "UnifiedMetric" or isXlmXL: config["comet-mix-transformation"] = "softmax" + config["comet-pool"] = "cls" if not args.roberta: config["comet-final-sigmoid"] = args.add_sigmoid @@ -206,7 +213,7 @@ def extract(layer, nth, level): convert(pd, ["attention.output.dense.bias"], f"{blockPrefix}->selfAttention->oProj->bias", bias=True) # self-attention layer-norm - if model_type == "XCOMETMetric": # @TODO: make this depend on RobertaXL/XXL rather than model_type + if isXlmXL: convert(pd, ["attention.self_attn_layer_norm.weight"], 
f"{blockPrefix}->preprocessor->norm->weight", bias=True) convert(pd, ["attention.self_attn_layer_norm.bias"], f"{blockPrefix}->preprocessor->norm->bias", bias=True) else: @@ -224,7 +231,7 @@ def extract(layer, nth, level): convert(pd, ["output.dense.bias"], f"{blockPrefix}->layers->at(3)->as()->bias", bias=True) # ffn layer-norm - if model_type == "XCOMETMetric": # @TODO: make this depend on RobertaXL/XXL rather than model_type + if isXlmXL: convert(pd, ["LayerNorm.weight"], f"{blockPrefix}->preprocessor->norm->weight", bias=True) convert(pd, ["LayerNorm.bias"], f"{blockPrefix}->preprocessor->norm->bias", bias=True) else: @@ -267,7 +274,7 @@ def extract(layer, nth, level): prefix = "CometEncoder" # post-embedding layer normalization - if model_type == "XCOMETMetric": # @TODO: make this depend on RobertaXL/XXL rather than model_type + if isXlmXL: convert(pd, ["encoder.LayerNorm.weight"], f"{prefix}->encoder->postprocessor->norm->weight", bias=True) convert(pd, ["encoder.LayerNorm.bias"], f"{prefix}->encoder->postprocessor->norm->bias", bias=True) else: @@ -309,7 +316,6 @@ def extract(layer, nth, level): # 3-layer FFN network that computes COMET regression prefix = "CometQEPooler" - # @TODO: make final sigmoid optional convert(pd, ["ff.0.weight"], f"{prefix}->layers->at(0)->as()->weight") convert(pd, ["ff.0.bias"], f"{prefix}->layers->at(0)->as()->bias", bias=True) diff --git a/src/common/config.cpp b/src/common/config.cpp index b6296a8b2..78c2aac1b 100644 --- a/src/common/config.cpp +++ b/src/common/config.cpp @@ -281,7 +281,7 @@ Ptr parseOptions(int argc, char** argv, cli::mode mode, bool validate) Ptr parseOptions(const std::string& args, cli::mode mode, bool validate) { std::vector vArgs = utils::split(args, " "); - + std::string dummy("marian"); std::vector cArgs = { &dummy[0] }; for(auto& arg : vArgs) diff --git a/src/data/text_input.cpp b/src/data/text_input.cpp index 0ccaedf14..e8801afb8 100644 --- a/src/data/text_input.cpp +++ b/src/data/text_input.cpp @@ -13,9 +13,9 @@ void TextIterator::increment() { } bool TextIterator::equal(TextIterator const& other) const { - // two iterators are equal if any of the following is true: + // two iterators are equal if any of the following is true: // 1. both are invalid (null ptrs) - // 2. both at the end of the stream (empty tuples as record, regardless of pos_) + // 2. both at the end of the stream (empty tuples as record, regardless of pos_) // 3. both are at the same position return (!this->tup_.valid() && !other.tup_.valid()) || (this->tup_.valid() && other.tup_.valid() && this->tup_.empty() && other.tup_.empty()) || @@ -32,7 +32,12 @@ TextInput::TextInput(std::vector inputs, : DatasetBase(inputs, options), vocabs_(vocabs), maxLength_(options_->get("max-length")), - maxLengthCrop_(options_->get("max-length-crop")) { + maxLengthCrop_(options_->get("max-length-crop")), + rightLeft_(options_->get("right-left")), + prependZero_(options_->get("comet-prepend-zero", false)), + joinFields_(options_->get("input-join-fields", false)), + insertSeparator_(options_->get("comet-use-separator", false)) + { // Note: inputs are automatically stored in the inherited variable named paths_, but these are // texts not paths! 
for(const auto& text : paths_) diff --git a/src/data/text_input.h b/src/data/text_input.h index 3a399b6d2..0e6d86e23 100644 --- a/src/data/text_input.h +++ b/src/data/text_input.h @@ -35,6 +35,13 @@ class TextInput : public DatasetBase { size_t maxLength_{0}; bool maxLengthCrop_{false}; + bool rightLeft_{false}; + + // copied from corpus.h - TODO: refactor or unify code between Corpus and TextInput + bool prependZero_{false}; + bool joinFields_{false}; // if true when given a TSV file or multiple inputs, join them together into a single sentence tuple, + // the already present separator will demark the fields (mostly used for BLEURT and COMET-KIWI) + bool insertSeparator_{false}; // when joining fields with joinFields_, additionally use this separator (mostly used for COMET-KIWI) public: TextInput(std::vector inputs, std::vector> vocabs, Ptr options); @@ -97,15 +104,48 @@ class TextInput : public DatasetBase { ABORT_IF(row.size() != vocabs_.size(), "Number of fields does not match number of vocabs"); // fill up the sentence tuple with source and/or target sentences SentenceTupleImpl tup(id); - for(size_t i = 0; i < row.size(); ++i) { - std::string field = row[i]; - Words words = vocabs_[i]->encode(field, /*addEOS=*/true, /*inference=*/inference_); - if(this->maxLengthCrop_ && words.size() > this->maxLength_) { - words.resize(maxLength_); - words.back() = vocabs_.back()->getEosId(); // note: this will not work with class-labels + + // copied and adapted from corpus.cpp - @TODO: refactor or unify code between Corpus and TextInput + for(size_t batchIndex = 0; batchIndex < row.size(); ++batchIndex) { + std::string& field = row[batchIndex]; + Words words = vocabs_[batchIndex]->encode(field, /*addEOS =*/true, inference_); + ABORT_IF(words.empty(), "Empty input sequences are presently untested"); + + // This handles adding starts symbols for COMET () and BERT/BLEURT ([CLS]) + bool prepend = prependZero_ && (!joinFields_ || (joinFields_ && batchIndex == 0)); + if(prepend) + words.insert(words.begin(), Word::fromWordIndex(0)); + + bool prependSep = insertSeparator_ && joinFields_ && batchIndex > 0; + if(prependSep) + words.insert(words.begin(), vocabs_[batchIndex]->getSepId()); + + // if fields are joined and the current sentence is not the first one, we need to make sure that + // the current sentence is not longer than the maximum length minus the length of the previous sentence + // (minus 1 for the separator token or 2 if we also add a separator token) + size_t localMaxLength = maxLength_; + if(joinFields_ && !tup.empty()) + localMaxLength = std::max(1 + (int)prependSep, (int)maxLength_ - (int)tup.back().size()); + + // if the current sentence is longer than the maximum length, we need to crop it + if(maxLengthCrop_ && words.size() > localMaxLength) { + words.resize(localMaxLength); + words.back() = vocabs_[batchIndex]->getEosId(); + } + + // if true, the words are reversed + if(rightLeft_) + std::reverse(words.begin(), words.end() - 1); + + // if true, the numeric indices get joined with the previous sentence, acts as a separator here + if(joinFields_) { + size_t currLength = tup.empty() ? 0 : tup.back().size(); + // if the current sentence would exceed the maximum length we don't add any more fields + if(currLength + words.size() < maxLength_) + tup.appendToBack(words); + } else { + tup.pushBack(words); } - ABORT_IF(words.empty(), "No words (not even EOS) found in the input text. 
ID: " + std::to_string(id)); - tup.pushBack(words); } return SentenceTuple(tup); } diff --git a/src/evaluator/evaluator.h b/src/evaluator/evaluator.h index 022a8204c..257804e41 100644 --- a/src/evaluator/evaluator.h +++ b/src/evaluator/evaluator.h @@ -50,7 +50,7 @@ class Evaluate : public ModelTask { std::vector> graphs_; std::vector> models_; - Ptr modelFile_; + Ptr modelWeights_; public: Evaluate(Ptr options) : options_(options) { @@ -69,7 +69,7 @@ class Evaluate : public ModelTask { auto modelPath = options_->get("model"); LOG(info, "Loading model from {}", modelPath); - modelFile_ = New(modelPath); + modelWeights_ = New(modelPath); graphs_.resize(devices.size()); models_.resize(devices.size()); @@ -85,7 +85,7 @@ class Evaluate : public ModelTask { graph->reserveWorkspaceMB(options_->get("workspace")); auto model = New(options_); - model->load(graph, modelFile_); + model->load(graph, modelWeights_); models_[j] = model; graphs_[j] = graph; @@ -107,10 +107,10 @@ class Evaluate : public ModelTask { run(batchGenerator, output); LOG(info, "Total time: {:.5f}s wall", timer.elapsed()); } - + template void run(Ptr> batchGenerator, Ptr collector) { - + size_t batchId = 0; { ThreadPool pool(graphs_.size(), graphs_.size()); @@ -158,6 +158,13 @@ class Evaluate : public ModelTask { } } + std::string getModelConfig() { + ABORT_IF(!modelWeights_, "Model weights are not loaded"); + YAML::Emitter outYaml; + cli::OutputYaml(modelWeights_->getYamlFromModel(), outYaml); + return outYaml.c_str(); + } + }; } // namespace marian diff --git a/src/python/binding/bind.cpp b/src/python/binding/bind.cpp index 9e8cc4464..38a1e3429 100644 --- a/src/python/binding/bind.cpp +++ b/src/python/binding/bind.cpp @@ -1,6 +1,6 @@ #include "pybind11/pybind11.h" #include "pybind11/stl.h" -// if your IDE/vscode complains about missing paths +// if your IDE/vscode complains about missing paths // pybind11 can be found by "python -m pybind11 --includes"; you may need to add both pybind11 and Python.h #include "embedder.hpp" #include "evaluator.hpp" @@ -17,7 +17,7 @@ using namespace pymarian; PYBIND11_MODULE(_pymarian, m) { m.doc() = "Marian C++ API bindings via pybind11"; - /** TODOS + /** TODOS * 1. API to check if gpu available: cuda_is_available() -> bool * 2. API to check number of gpus:: cuda_device_count() -> int */ @@ -31,6 +31,7 @@ PYBIND11_MODULE(_pymarian, m) { py::class_(m, "Evaluator") .def(py::init()) .def("evaluate", py::overload_cast(&EvaluatorPyWrapper::run)) + .def("get_model_config", py::overload_cast<>(&EvaluatorPyWrapper::getModelConfig)) ; py::class_(m, "Trainer") diff --git a/src/python/binding/evaluator.hpp b/src/python/binding/evaluator.hpp index f72ccd08a..37b687d21 100644 --- a/src/python/binding/evaluator.hpp +++ b/src/python/binding/evaluator.hpp @@ -26,13 +26,13 @@ namespace pymarian { /** * Wrapper for Marian Evaluator. - * + * * This class is a wrapper for the Marian Evaluator class. * It is used to run the evaluator on a given input. 
- * + * **/ class EvaluatorPyWrapper { - + private: Ptr options_; Ptr evaluator_; @@ -46,10 +46,10 @@ namespace pymarian { EvaluatorPyWrapper(const std::string& cliString){ options_ = parseOptions(cliString, cli::mode::evaluating, true) ->with("inference", true, "shuffle", "none"); - evaluator_= New(options_); + evaluator_ = New(options_); vocabs_ = loadVocabs(options_); } - + /** * @brief Load the vocabularies from the given paths * @param options - the options object @@ -69,7 +69,7 @@ namespace pymarian { /** * Given a table of strings (i.e., rows x columns), concatenate each column into a single string. - * + * * @param data - table of strings : rows x columns * @return List of strings, one string for each column, concatenated across rows. */ @@ -92,12 +92,12 @@ namespace pymarian { } /** - * Run the evaluator on the given input. + * Run the evaluator on the given input. * Input is transformed as (in memory) files by concatenating columns. - * + * * @param inputs - table of strings : rows x columns * @return table of floats : rows x columns - * + * */ auto run(const StrVectors& inputs) -> FloatVectors { StrVector columnFiles = concatColumns(inputs); @@ -114,6 +114,10 @@ namespace pymarian { return outputs; } + auto getModelConfig() -> std::string { + return evaluator_->getModelConfig(); + } + }; } diff --git a/src/python/pymarian/evaluate.py b/src/python/pymarian/evaluate.py index be13f3f00..371a37006 100755 --- a/src/python/pymarian/evaluate.py +++ b/src/python/pymarian/evaluate.py @@ -9,6 +9,8 @@ import subprocess import sys import threading +import yaml + from pathlib import Path from typing import Iterator, List, Optional, Tuple, Union @@ -138,7 +140,11 @@ def pymarian_evaluate( raise ImportError('pymarian is not installed. Please install it and rerun') log.info(f'Marian CLI::\n\t{cmd_line}') + evaluator = Evaluator(cmd_line) + config = yaml.safe_load(evaluator.get_model_config()) + log.info(f'Model config: {config}') + assert average in ('skip', 'append', 'only') lines = (line.rstrip('\n').split('\t') for line in input_lines) diff --git a/src/python/pyproject.toml b/src/python/pyproject.toml index a9cf413a7..84d1b0e8f 100644 --- a/src/python/pyproject.toml +++ b/src/python/pyproject.toml @@ -30,7 +30,8 @@ classifiers = [ dependencies = [ "tqdm", - "requests" + "requests", + "pyyaml" ] [project.scripts] @@ -56,7 +57,7 @@ target-version = ['py37', 'py38', 'py39', 'py310', 'py311'] include = 'src/python/.*\.pyi?$' skip-string-normalization = true -# black doesn't sort imports. So we use isort for that. See discussion https://github.com/psf/black/issues/333 +# black doesn't sort imports. So we use isort for that. 
See discussion https://github.com/psf/black/issues/333 [tool.isort] profile = "black" src_paths = ["src/python"] From 4cdf93a2c8b1d02f38faade4de73635a8d474f1b Mon Sep 17 00:00:00 2001 From: Thamme Gowda Date: Tue, 6 Feb 2024 05:36:43 +0000 Subject: [PATCH 13/26] Merged PR 32860: Azure CI: save disk space by disabling compilation for Ampere and Turing Ubuntu CI: ON to Maxwell, Pascal and Volta; OFF to Ampere and Turing * to fix space issue on CI vms --- azure-pipelines.yml | 27 ++++++++++++++++++++------- src/data/corpus_base.cpp | 4 +++- src/data/text_input.h | 4 +++- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index a1e9ea94f..4c7cd0bfd 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -247,6 +247,7 @@ stages: gcc: 9 unit_tests: true examples: false + pymarian: true static: true # Ubuntu GPU-only build "GPU-only": @@ -258,6 +259,7 @@ stages: gcc: 9 unit_tests: false examples: false + pymarian: true static: false ################################################################ # Ubuntu 22.04 supports CUDA 11+ @@ -271,20 +273,23 @@ stages: gpu: true cuda: 11.7 gcc: 11 - unit_tests: false # disable unit tests to minimize compilation time - examples: false # disable examples to minimize compilation time + unit_tests: true + examples: true + pymarian: true static: false ################################################################ # Ubuntu 20.04 supports CUDA 11+ "20.04 CUDA 11.1 gcc-9": image: ubuntu-20.04 - boost: true + boost: false cpu: true gpu: true cuda: 11.1 gcc: 9 - unit_tests: true - examples: true + # static cause large binaries so we turn off tests and examples + unit_tests: false + examples: false + pymarian: false static: true ################################################################ # Ubuntu 16.04 is no longer available on Azure-hosted machines @@ -324,8 +329,9 @@ stages: condition: eq(variables.gpu, true) # Some preinstalled versions of pip are bad for pymarian; see https://github.com/pypa/setuptools/issues/3269 - - bash: python3 -m pip install pip -U + - bash: python3 -m pip install pip -U displayName: Upgrade pip + condition: eq(variables.pymarian, true) - bash: | mkdir -p build @@ -334,6 +340,12 @@ stages: cmake .. \ -DCOMPILE_CPU=$(cpu) \ -DCOMPILE_CUDA=$(gpu) \ + -DCOMPILE_MAXWELL=$(gpu) \ + -DCOMPILE_PASCAL=$(gpu) \ + -DCOMPILE_VOLTA=$(gpu) \ + -DCOMPILE_AMPERE=OFF \ + -DCOMPILE_AMPERE_RTX=OFF \ + -DCOMPILE_TURING=OFF \ -DCOMPILE_EXAMPLES=$(examples) \ -DCOMPILE_SERVER=$(boost) \ -DCOMPILE_TESTS=$(unit_tests) \ @@ -343,7 +355,7 @@ stages: -DBoost_ARCHITECTURE=-x64 \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-$(cuda) \ -DUSE_TCMALLOC=off \ - -DPYMARIAN=ON \ + -DPYMARIAN=$(pymarian) \ -DPYTHON_EXECUTABLE=python3 displayName: Configure CMake @@ -374,6 +386,7 @@ stages: python3 -m pip install build/pymarian-*.whl python3 -m pymarian -v displayName: Build Pymarian + condition: eq(variables.pymarian, true) ###################################################################### - job: BuildMacOS diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp index 47381d9b9..e1b0aad62 100644 --- a/src/data/corpus_base.cpp +++ b/src/data/corpus_base.cpp @@ -461,8 +461,10 @@ void CorpusBase::addWordsToSentenceTuple(const std::string& line, if(joinFields_) { size_t currLength = tup.empty() ? 
0 : tup.back().size(); // if the current sentence would exceed the maximum length we don't add any more fields - if(currLength + words.size() < maxLength_) + if(currLength + words.size() <= maxLength_) tup.appendToBack(words); + + ABORT_IF(tup.empty(), "This should have content if we got here??"); } else { tup.pushBack(words); } diff --git a/src/data/text_input.h b/src/data/text_input.h index 0e6d86e23..f2e9831de 100644 --- a/src/data/text_input.h +++ b/src/data/text_input.h @@ -141,8 +141,10 @@ class TextInput : public DatasetBase { if(joinFields_) { size_t currLength = tup.empty() ? 0 : tup.back().size(); // if the current sentence would exceed the maximum length we don't add any more fields - if(currLength + words.size() < maxLength_) + if(currLength + words.size() <= maxLength_) tup.appendToBack(words); + + ABORT_IF(tup.empty(), "This should have content if we got here??"); } else { tup.pushBack(words); } From bd9a679396c304609ff7fce14bb7fc2e8535d840 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 6 Feb 2024 15:39:39 +0000 Subject: [PATCH 14/26] Merged PR 32636: Extending new layer framework to match production models This PR implements a bunch of missing functionality in the new layer framework. Among others: * Autoregressive self-attention * Guided alignment training * Decode-time alignment Minor refactoring of previous code to accommodate above changes. When setting `export TRANSFORMER_FLAVOR=experimental` all legacy transformer models are internally mapped to the new layer framework. With that enabled: Production regression tests all pass. Passes all public regression tests with the exception of: - tests/factors/test_factors_concat.sh - tests/factors/test_factors_decoder_concat.sh - tests/models/wnmt18/test_student_small_aan.sh - tests/models/wnmt18/test_student_small_aan_intgemm16.sh - tests/models/wnmt18/test_student_small_aan_intgemm8.sh and - tests/interface/input-tsv/test_tsv_train_with_align_and_weights.sh - tests/interface/input-tsv/test_tsv_train_with_align_and_weights_inputtypes.sh I could get these to work, but it doesn't seem to be worth it. I plan to remove both code paths in the future. The last two are -- I think -- just divergences due to mild model differences and probably don't need fixing, rather future adaptation. 
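For reference, a minimal standalone sketch of the optional scoped-lock idiom this patch introduces in ModelWeights::scopedLockGuard() (see the src/common/io.* hunks below). The class name Weights and the load() body are illustrative only, and the template arguments std::lock_guard<std::mutex> are assumed from the surrounding declarations; the real code avoids std::optional because of nvcc problems with CUDA 10.x.

#include <memory>
#include <mutex>

// Sketch of the optional-locking idiom: when locking is enabled the caller
// holds a real lock_guard for the current scope; when disabled the returned
// pointer is empty and no lock is taken. The lock (if any) is released
// automatically when the unique_ptr goes out of scope.
class Weights {                        // illustrative stand-in for ModelWeights
private:
  mutable std::mutex mutex_;
  bool locking_{true};
  bool loaded_{false};

public:
  explicit Weights(bool locking = true) : locking_(locking) {}

  std::unique_ptr<std::lock_guard<std::mutex>> scopedLockGuard() const {
    if(locking_)
      return std::unique_ptr<std::lock_guard<std::mutex>>(
          new std::lock_guard<std::mutex>(mutex_));
    return nullptr;
  }

  void load() {
    auto optionalLock = scopedLockGuard();  // empty if locking_ == false
    if(loaded_)
      return;
    // ... read or mmap the model items here ...
    loaded_ = true;
  }
};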
--- CMakeLists.txt | 7 +- VERSION | 2 +- src/common/io.cpp | 12 +- src/common/io.h | 23 +- src/data/corpus_base.h | 3 +- src/graph/cached_expression.h | 16 +- src/graph/expression_graph.h | 2 + src/graph/node_operators_unary.h | 28 +-- src/layers/output.cpp | 4 +- src/layers_new/alibi.cpp | 90 +++++--- src/layers_new/alibi.cu | 60 +++-- src/layers_new/alibi.h | 234 ++++++++++--------- src/layers_new/attention.cpp | 74 ++++-- src/layers_new/attention.h | 287 +++++++++++++++++------ src/layers_new/decoder.h | 42 +++- src/layers_new/interface.h | 77 ++++--- src/layers_new/neuralnet.h | 12 +- src/layers_new/rnn.h | 29 ++- src/layers_new/transformer.h | 378 +++++++++++++++++++++++-------- src/microsoft/quicksand.cpp | 2 +- src/models/amun.h | 5 +- src/models/bleurt.h | 9 +- src/models/nematus.h | 5 +- src/models/transformer.h | 54 ++--- src/models/transformer_factory.h | 34 +-- src/models/transformer_new.h | 45 ++-- 26 files changed, 1006 insertions(+), 528 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c9ccc424..e16876f78 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -479,8 +479,11 @@ if(NOT MSVC) list(APPEND CUDA_NVCC_FLAGS ${INTRINSICS_NVCC}) else() # c++17 doesn't work with CUDA 10 - # list(APPEND CUDA_NVCC_FLAGS -std=c++17; -Xcompiler "/std:c++17"; -Xcompiler\ /FS; -Xcompiler\ /MT$<$:d>; ) - list(APPEND CUDA_NVCC_FLAGS -Xcompiler\ /FS; -Xcompiler\ /MT$<$:d>; ) + if ((CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0")) + list(APPEND CUDA_NVCC_FLAGS -Xcompiler\ /std:c++17; -Xcompiler\ /FS; -Xcompiler\ /MT$<$:d>; ) + else() + list(APPEND CUDA_NVCC_FLAGS -Xcompiler\ /std:c++14; -Xcompiler\ /FS; -Xcompiler\ /MT$<$:d>; ) + endif() endif() list(REMOVE_DUPLICATES CUDA_NVCC_FLAGS) diff --git a/VERSION b/VERSION index 8b8e7fdd6..9db15f195 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.22 +v1.12.23 diff --git a/src/common/io.cpp b/src/common/io.cpp index 109b3a1ed..61a9054cd 100644 --- a/src/common/io.cpp +++ b/src/common/io.cpp @@ -139,8 +139,18 @@ std::vector ModelWeights::mmapItems(const void* ptr) { return items; } +std::unique_ptr> ModelWeights::scopedLockGuard() const { + // @TODO: this should use std::optional, but as long as we use CUDA 10.x there may be + // random problems with std::optional and nvcc compilation + if(locking_) + return std::unique_ptr>(new std::lock_guard(mutex_)); + else + return nullptr; +} + void ModelWeights::load() { - std::lock_guard lock(mutex_); + auto optionalLock = scopedLockGuard(); + if(loaded_) return; diff --git a/src/common/io.h b/src/common/io.h index 1db0a83fe..8eaf47665 100644 --- a/src/common/io.h +++ b/src/common/io.h @@ -9,6 +9,7 @@ #include "common/definitions.h" #include "common/io_item.h" +#include #include #include @@ -32,8 +33,6 @@ bool isBin(const std::string& fileName); class ModelWeights { private: - std::mutex mutex_; - std::string fileName_; const void* ptr_{nullptr}; @@ -48,14 +47,17 @@ class ModelWeights { std::vector items_; std::unique_ptr mmap_; + mutable std::mutex mutex_; + bool locking_{true}; // if true, the mutex will be locked when accessing the data, see scopedLockGuard() + std::vector loadItems(const std::string& fileName); std::vector mmapItems(const void* ptr); void load(); public: - ModelWeights(const std::string& fileName, MmapMode mmapMode = MmapMode::OpportunisticMmap) - : fileName_(fileName), fileType_(getFileType(fileName)), mmapMode_(mmapMode) { + ModelWeights(const std::string& fileName, MmapMode mmapMode = MmapMode::OpportunisticMmap, bool locking = true) + : 
fileName_(fileName), fileType_(getFileType(fileName)), mmapMode_(mmapMode), locking_(locking) { // NPZ files cannot be memory-mapped, so we switch opportunistic mmap off, but keep any other mmap mode if(fileType_ == FileType::isNpz && mmapMode_ == MmapMode::OpportunisticMmap) @@ -65,11 +67,11 @@ class ModelWeights { ABORT_IF(fileType_ == FileType::isNpz && mmapMode_ != MmapMode::DontMmap, "NPZ files cannot be memory-mapped"); } - ModelWeights(const void* ptr, MmapMode mmapMode = MmapMode::RequiredMmap) - : ptr_(ptr), fileType_(FileType::isBuf), mmapMode_(mmapMode) {} + ModelWeights(const void* ptr, MmapMode mmapMode = MmapMode::RequiredMmap, bool locking = true) + : ptr_(ptr), fileType_(FileType::isBuf), mmapMode_(mmapMode), locking_(locking) {} - ModelWeights() - : fileType_(FileType::isDummy), mmapMode_{MmapMode::DontMmap} {} + ModelWeights(bool locking = true) + : fileType_(FileType::isDummy), mmapMode_{MmapMode::DontMmap}, locking_(locking) {} ModelWeights(const ModelWeights&&) = delete; ModelWeights(const ModelWeights&) = delete; @@ -85,6 +87,11 @@ class ModelWeights { YAML::Node getYamlFromModel(const std::string& varName = "special:model.yml") const; + // If locking is set to false, the returned unique_ptr will be empty and no lock will be acquired. + // Otherwise the returned unique_ptr will contain a lock guard that will be released when the unique_ptr + // goes out of scope. So we have an optional scoped lock guard. + std::unique_ptr> scopedLockGuard() const; + void loadAndSync(Ptr mpi); }; diff --git a/src/data/corpus_base.h b/src/data/corpus_base.h index 074689804..b21da01c6 100644 --- a/src/data/corpus_base.h +++ b/src/data/corpus_base.h @@ -657,8 +657,7 @@ class CorpusBase : public DatasetBase separator will demark the fields (mostly used for BLEURT and COMET-KIWI) bool insertSeparator_{false}; // when joining fields with joinFields_, additionally use this separator (mostly used for COMET-KIWI) diff --git a/src/graph/cached_expression.h b/src/graph/cached_expression.h index f7adff8bc..376a9f14f 100644 --- a/src/graph/cached_expression.h +++ b/src/graph/cached_expression.h @@ -6,8 +6,8 @@ namespace marian { -// This class allows for simpler caching of Expr objects and automatic checking if the -// cached Expr needs to be updated/recreated. +// This class allows for simpler caching of Expr objects and automatic checking if the +// cached Expr needs to be updated/recreated. class CachedExpr { private: ENABLE_INTRUSIVE_PTR(CachedExpr); @@ -21,27 +21,27 @@ class CachedExpr { UPtr applyFun_; // function that creates the cached result UPtr equalFun_; // function that checks if the input changed. If yes, // the `apply_` functions gets reapplied and the new result - // is cached. - + // is cached. + public: // No functors are given; they will have to supplied when calling `apply`. CachedExpr() {}; // No apply functor is given; it will have to supplied when calling `apply`. - CachedExpr(EqualFunT equalFun) + CachedExpr(EqualFunT equalFun) : equalFun_(new EqualFunT(equalFun)) {}; // Both functors are given, and will be used by default. They can however be overriden // if supplied directly in `apply`. - CachedExpr(ApplyFunT applyFun, EqualFunT equalFun) + CachedExpr(ApplyFunT applyFun, EqualFunT equalFun) : applyFun_(new ApplyFunT(applyFun)), equalFun_(new EqualFunT(equalFun)) {}; - // lazily executes the factory `applyFun` if `equalFun` indicates that the input has changed. 
+ // lazily executes the factory `applyFun` if no value is cached or `equalFun` indicates that the input has changed. Expr apply(Expr key, ApplyFunT applyFun, EqualFunT equalFun) { if(!cachedKey_ || !equalFun(cachedKey_, key)) { cachedKey_ = key; cachedValue_ = applyFun(key); - } + } return cachedValue_; } diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h index 915c9df3f..239ecaeaf 100644 --- a/src/graph/expression_graph.h +++ b/src/graph/expression_graph.h @@ -799,6 +799,8 @@ class ExpressionGraph : public std::enable_shared_from_this { setReloaded(false); for(auto& item : modelWeights->items()) { + auto lockGuard = modelWeights->scopedLockGuard(); + std::string pName = item.name; // skip over special parameters starting with "special:" if(pName.substr(0, 8) == "special:") diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h index aa3f5004c..ba11eca0e 100644 --- a/src/graph/node_operators_unary.h +++ b/src/graph/node_operators_unary.h @@ -503,7 +503,7 @@ struct ReduceNodeOp : public UnaryNodeOp { : UnaryNodeOp(a, newShape(a, axis)), opCode_(opCode) { reducedDim_ = a->shape()[axis]; // e.g. used in mean() - ABORT_IF(reducedDim_ != a->shape().elements() / shape().elements(), + ABORT_IF(reducedDim_ != a->shape().elements() / shape().elements(), "Bug in determining reducedDim {} != {}", reducedDim_, a->shape().elements() / shape().elements()); @@ -619,13 +619,13 @@ class CumSumNodeOp : public UnaryNodeOp { int axis_; bool reverse_; bool exclusive_; - + public: - CumSumNodeOp(Expr a, int axis, bool reverse, bool exclusive) - : UnaryNodeOp(a), - axis_(a->shape().axis(axis)), + CumSumNodeOp(Expr a, int axis, bool reverse, bool exclusive) + : UnaryNodeOp(a), + axis_(a->shape().axis(axis)), reverse_(reverse), - exclusive_(exclusive) + exclusive_(exclusive) {} NodeOps forwardOps() override { @@ -685,10 +685,10 @@ class LogCumSumExpNodeOp : public UnaryNodeOp { public: LogCumSumExpNodeOp(Expr a, int axis, bool reverse, bool exclusive, bool fast=false) - : UnaryNodeOp(a), - axis_(a->shape().axis(axis)), + : UnaryNodeOp(a), + axis_(a->shape().axis(axis)), reverse_(reverse), - exclusive_(exclusive), + exclusive_(exclusive), fast_(fast) {} @@ -1019,10 +1019,10 @@ class CallbackNodeOp : public ReshapeNodeOp { private: typedef std::function LambdaNodeCallback; std::unique_ptr callback_; - + public: CallbackNodeOp(Expr node, LambdaNodeCallback callback) - : ReshapeNodeOp(node, node->shape()), + : ReshapeNodeOp(node, node->shape()), callback_(new LambdaNodeCallback(callback)) { } @@ -1053,10 +1053,10 @@ class CallbackNodeOp : public ReshapeNodeOp { class DropoutReluInplaceNodeOp : public ReshapeNodeOp { private: Expr mask_; - + public: DropoutReluInplaceNodeOp(Expr node, Expr mask = nullptr) - : ReshapeNodeOp(node, node->shape()), + : ReshapeNodeOp(node, node->shape()), mask_(mask) {} void forward() override { @@ -1312,7 +1312,7 @@ struct ShiftNodeOp : public UnaryNodeOp { if(!cnode) return false; if(shift_ != cnode->shift_) - return false; + return false; if(padValue_ != cnode->padValue_) return false; return true; diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 8977464b1..05b70645b 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -92,7 +92,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { }; auto affineShortlist = [this](Expr x, Expr W, Expr b, bool transA, bool transB) { - /* + /* std::cerr << "affineShortlist.x=" << x->shape() << std::endl; std::cerr << "affineShortlist.W=" << W->shape() << std::endl; 
if (b) std::cerr << "affineShortlist.b=" << b->shape() << std::endl; @@ -114,7 +114,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { else { // original shortlist. W always has 1 for beam & batch ret = dot(x, W, transA, transB); - } + } //std::cerr << "ret.x=" << ret->shape() << std::endl; return ret; diff --git a/src/layers_new/alibi.cpp b/src/layers_new/alibi.cpp index abffb6bae..44f0eb60b 100644 --- a/src/layers_new/alibi.cpp +++ b/src/layers_new/alibi.cpp @@ -56,7 +56,7 @@ Expr AlibiDecoderState::getAlibiShift(Ptr graph, bool decoding) std::vector shift; for(const auto& [trgPos, srcPos, batchIdx] : syncPoints_) shift.push_back((float)(srcPos - trgPos)); - + if(!shift.empty()) { int dimBeam = lastBeam_; ABORT_IF(dimBeam == 0, "dimBeam is 0??"); @@ -66,7 +66,7 @@ Expr AlibiDecoderState::getAlibiShift(Ptr graph, bool decoding) return nullptr; } } else { - ABORT_IF(getBatch()->sets() != 2, + ABORT_IF(getBatch()->sets() != 2, "--transformer-alibi-shift=true currently only works with batch sets=2"); return getAlibiShiftFromBatch(graph); } @@ -93,7 +93,7 @@ Expr AlibiDecoderState::getAlibiShiftFromBatch(Ptr graph) const int dimBatch = (int)targetBatch->batchSize(); int dimSrc = (int)sourceBatch->batchWidth(); int dimTrg = (int)targetBatch->batchWidth(); - + for(int batchIdx = 0; batchIdx < dimBatch; ++batchIdx) { int trgPos = -1, srcPos = -1; for(int i = 0; i < dimTrg; ++i) { @@ -148,7 +148,7 @@ std::vector AlibiDecoderState::computeSyncPoints( // If the current symbol is a sync symbol, the sync point target coordinate is updated to the current position // and the source coordinate is updated to the next sync symbol in the source sentence. for(int i = 0; i < hypIndices.size(); ++i) { - SyncCoord pos = syncPoints_.empty() + SyncCoord pos = syncPoints_.empty() ? SyncCoord({-1, -1, (int)batchIndices[i % dimBatch]}) // no sync points yet, initialize with -1 position and current batch index : syncPoints_[hypIndices[i]]; // carry over the sync point from the previous state at first auto& [trgPos, srcPos, batchIdx] = pos; @@ -168,7 +168,7 @@ std::vector AlibiDecoderState::computeSyncPoints( } return nextSyncPoints; -} +} Ptr NewDecoderState(Ptr options, @@ -185,16 +185,22 @@ Ptr NewDecoderState(Ptr options, } } -Ptr convertDecoderState(Ptr state, - Ptr graph, +Ptr convertDecoderState(Ptr state, + Ptr graph, bool decoding) { Expr shift; auto alibiState = std::dynamic_pointer_cast(state); if(alibiState) shift = alibiState->getAlibiShift(graph, decoding); - size_t position = state->getPosition(); - auto nnState = New(position); + // @TODO: allow for 0 encoder states, i.e. a decoder-only model + ABORT_IF(state->getEncoderStates().size() != 1, "Only supports exactly one encoder state"); + + size_t position = state->getPosition(); + auto encoderContext = state->getEncoderStates()[0]->getContext(); + auto encoderMask = state->getEncoderStates()[0]->getMask(); + + auto nnState = New(position, encoderContext, encoderMask); for(auto& layerState : state->getStates()) { if(alibiState) { nnState->append(New(layerState.cell, shift, position)); @@ -208,97 +214,108 @@ Ptr convertDecoderState(Ptr state, #ifdef CUDA_FOUND namespace gpu { template - void Alibi(int numHeads, int start, marian::Tensor out, Tensors... tensors); + void Alibi(int numHeads, int start, bool addCausalMask, marian::Tensor out, Tensors... tensors); } #endif namespace cpu { template - void Alibi(int numHeads, int start, marian::Tensor out, Tensors... 
tensors) { + void Alibi(int numHeads, int start, bool addCausalMask, marian::Tensor out, Tensors... tensors) { ABORT("Not implemented"); } } template -void Alibi(int numHeads, int start, marian::Tensor out, Tensors... tensors) { +void Alibi(int numHeads, int start, bool addCausalMask, marian::Tensor out, Tensors... tensors) { #ifdef CUDA_FOUND if(out->getBackend()->getDeviceId().type == DeviceType::gpu) - gpu::Alibi(numHeads, start, out, tensors...); + gpu::Alibi(numHeads, start, addCausalMask, out, tensors...); else #endif - cpu::Alibi(numHeads, start, out, tensors...); + cpu::Alibi(numHeads, start, addCausalMask, out, tensors...); } #ifdef CUDA_FOUND namespace gpu { template - void AlibiGrad(int numHeads, int start, marian::Tensor slopesGrad, marian::Tensor biasesGrad, Tensors... tensors); + void AlibiGrad(int numHeads, int start, bool addCausalMask, marian::Tensor slopesGrad, marian::Tensor biasesGrad, Tensors... tensors); } #endif namespace cpu { template - void AlibiGrad(int numHeads, int start, marian::Tensor slopesGrad, marian::Tensor biasesGrad, Tensors... tensors) { + void AlibiGrad(int numHeads, int start, bool addCausalMask, marian::Tensor slopesGrad, marian::Tensor biasesGrad, Tensors... tensors) { ABORT("Not implemented"); } } template -void AlibiGrad(int numHeads, int start, marian::Tensor slopesGrad, marian::Tensor biasesGrad, Tensors... inputs) { +void AlibiGrad(int numHeads, int start, bool addCausalMask, marian::Tensor slopesGrad, marian::Tensor biasesGrad, Tensors... inputs) { #ifdef CUDA_FOUND if(slopesGrad->getBackend()->getDeviceId().type == DeviceType::gpu) - gpu::AlibiGrad(numHeads, start, slopesGrad, biasesGrad, inputs...); + gpu::AlibiGrad(numHeads, start, addCausalMask, slopesGrad, biasesGrad, inputs...); else #endif - cpu::AlibiGrad(numHeads, start, slopesGrad, biasesGrad, inputs...); + cpu::AlibiGrad(numHeads, start, addCausalMask, slopesGrad, biasesGrad, inputs...); } class AlibiLogMaskNode : public NaryNodeOp { private: int numHeads_{8}; int start_{0}; + bool addCausalMask_{false}; - Shape newShape(Expr mask, Expr query, int numHeads) { + Shape newShape(Expr mask, Expr query, int numHeads, bool addCausalMask) { int dimBeam = query->shape()[-4]; int dimBatch = query->shape()[-3]; int dimQuery = query->shape()[-2]; int dimKeys = mask->shape()[-2]; + ABORT_IF(addCausalMask && dimQuery != dimKeys, "Causal mask only works for square attention matrices"); + return { dimBeam, dimBatch * numHeads, dimQuery, dimKeys }; } public: - AlibiLogMaskNode(const std::vector& nodes, int numHeads, int start) - : NaryNodeOp(nodes, newShape(/*mask=*/nodes[0], /*query=*/nodes[1], numHeads), nodes[0]->value_type()), - numHeads_(numHeads), start_{start} + AlibiLogMaskNode(const std::vector& nodes, int numHeads, int start, bool addCausalMask) + : NaryNodeOp(nodes, newShape(/*mask=*/nodes[0], /*query=*/nodes[1], numHeads, addCausalMask), nodes[0]->value_type()), + numHeads_(numHeads), start_{start}, addCausalMask_{addCausalMask} {} void forward() override { Alibi( - numHeads_, + numHeads_, start_, - val_, + addCausalMask_, + val_, /*mask=*/ child(0)->val(), - /*slopes=*/child(2)->val(), - /*biases=*/child(3)->val(), + /*slopes=*/child(2)->val(), + /*biases=*/child(3)->val(), /*shift=*/ children().size() == 5 ? 
child(4)->val() : nullptr); } void backward() override { if(!trainable()) return; - + + if(!child(2)->trainable()) + return; + + if(!child(3)->trainable()) + return; + AlibiGrad( - numHeads_, + numHeads_, start_, + addCausalMask_, // gradients - /*d_f/d_slopes=*/child(2)->grad(), - /*d_f/d_biases=*/child(3)->grad(), + /*d_f/d_slopes=*/child(2)->grad(), + /*d_f/d_biases=*/child(3)->grad(), // inputs /*mask=*/ child(0)->val(), - /*slopes=*/ child(2)->val(), - /*biases=*/ child(3)->val(), + /*slopes=*/ child(2)->val(), + /*biases=*/ child(3)->val(), /*shift=*/ children().size() == 5 ? child(4)->val() : nullptr, // adjoint /*d_J/d_f=*/adj_); @@ -308,6 +325,7 @@ class AlibiLogMaskNode : public NaryNodeOp { size_t seed = NaryNodeOp::hash(); util::hash_combine(seed, numHeads_); util::hash_combine(seed, start_); + util::hash_combine(seed, addCausalMask_); return seed; } @@ -321,18 +339,20 @@ class AlibiLogMaskNode : public NaryNodeOp { return false; if(start_ != cnode->start_) return false; + if(addCausalMask_ != cnode->addCausalMask_) + return false; return true; } const std::string type() override { return "alibi-log-mask"; } }; -Expr alibiLogMask(Expr mask, Expr query, Expr slopes, Expr biases, Expr shift, int numHeads, int start) { +Expr alibiLogMask(Expr mask, Expr query, Expr slopes, Expr biases, Expr shift, int numHeads, int start, bool addCausalMask) { std::vector nodes = {mask, query, slopes, biases}; if(shift) nodes.push_back(shift); - return Expression(nodes, numHeads, start); + return Expression(nodes, numHeads, start, addCausalMask); } diff --git a/src/layers_new/alibi.cu b/src/layers_new/alibi.cu index 07042699b..be4b30dea 100644 --- a/src/layers_new/alibi.cu +++ b/src/layers_new/alibi.cu @@ -15,7 +15,8 @@ __global__ void gAlibi( functional::Array, 4> inputs, int numHeads, int start, - float maskFactor) { + float maskFactor, + bool addCausalMask) { constexpr size_t N = functional::Shape::size(); functional::Array oDims; @@ -42,9 +43,9 @@ __global__ void gAlibi( int keyPos = keyIdx; int queryPos = queryIdx + start; - + float relPos = (float)keyPos - (float)queryPos; - + if(shift.data() != nullptr) relPos -= (float)shift[{beamIdx, batchIdx, queryIdx, 0}]; @@ -53,7 +54,12 @@ __global__ void gAlibi( float alibi = slope * abs(relPos + bias); float binMask = (float)mask[{0, batchIdx, keyIdx, 0}]; - float logMask = (2.f * binMask - 1.f) * maskFactor; // range (-maskFactor, maskFactor) + float logMask = binMask == 0 ? -maskFactor : maskFactor; // range (-maskFactor, maskFactor) + + if(addCausalMask) { + float causalMask = keyPos > queryPos ? -maskFactor : maskFactor; // range (-maskFactor, maskFactor) + logMask = min(logMask, causalMask); // range (-maskFactor, maskFactor) if any mask is set to -maskFactor then the result is -maskFactor + } out[index] = (T)min(logMask, alibi); } @@ -61,25 +67,23 @@ __global__ void gAlibi( } template -void Alibi(int numHeads, int start, Tensor out, Tensors... tensors) { +void Alibi(int numHeads, int start, bool addCausalMask, Tensor out, Tensors... 
tensors) { cudaSetDevice(out->getDeviceId().no); int length = out->size(); int threads = std::min(MAX_THREADS, length); int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); - float largest = NumericLimits(out->type()).max; - float maskFactor = std::min(largest / 2.f, 99999999.f); // to make sure we do not overflow for fp16 + float maskFactor = std::numeric_limits::infinity(); constexpr size_t K = sizeof...(tensors); - if(out->type() == Type::float32) { functional::Array, K> inputs = {tensors...}; - gAlibi<<>>(out, inputs, numHeads, start, maskFactor); + gAlibi<<>>(out, inputs, numHeads, start, maskFactor, addCausalMask); #if COMPILE_FP16 } else if(out->type() == Type::float16) { functional::Array, K> inputs = {tensors...}; - gAlibi<<>>(out, inputs, numHeads, start, maskFactor); + gAlibi<<>>(out, inputs, numHeads, start, maskFactor, addCausalMask); #endif } else { ABORT("Alibi for type {} not implemented", out->type()); @@ -87,7 +91,7 @@ void Alibi(int numHeads, int start, Tensor out, Tensors... tensors) { } // template specialization for h/cpp separation -template void Alibi(int, int, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void Alibi(int, int, bool, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); template __global__ void gAlibiGrad( @@ -95,7 +99,8 @@ __global__ void gAlibiGrad( functional::Tensor biasesGrad, functional::Array, 5> inputs, int numHeads, - int start) { + int start, + bool addCausalMask) { const auto& mask = inputs[0]; const auto& slopes = inputs[1]; @@ -120,7 +125,7 @@ __global__ void gAlibiGrad( A5 dims5; const int HEAD_DIM = 2; - + // compute single element derivate for slopes and biases auto dJ_dxy = [&](int headIdx, int colIdx) -> thrust::tuple { // get the location for one head @@ -130,7 +135,7 @@ __global__ void gAlibiGrad( dims5[HEAD_DIM] = headIdx; // get the index into the full tensor int index = fullShape5.index(dims5); - // get the value of the full adjoint + // get the value of the full adjoint float vadj = (float)adj[index]; // handle the rest @@ -141,9 +146,9 @@ __global__ void gAlibiGrad( int keyPos = keyIdx; int queryPos = queryIdx + start; - + float relPos = (float)keyPos - (float)queryPos; - + if(shift.data() != nullptr) relPos -= (float)shift[{beamIdx, batchIdx, queryIdx, 0}]; @@ -152,7 +157,12 @@ __global__ void gAlibiGrad( float binMask = (float)mask[{0, batchIdx, keyIdx, 0}]; float signedAlibi = relPos + bias; - + + if(addCausalMask) { + float causalMask = keyPos > queryPos ? 0.f : 1.f; + binMask = binMask * causalMask; + } + // compute derivative of slope float dslope = binMask * abs(signedAlibi) * vadj; @@ -168,7 +178,7 @@ __global__ void gAlibiGrad( return { dslope, dbias }; }; - + for(int bid = 0; bid < numHeads; bid += gridDim.x) { int headIdx = bid + blockIdx.x; if(headIdx < numHeads) { @@ -215,7 +225,7 @@ __global__ void gAlibiGrad( } template -void TypedAlibiGrad(int numHeads, int start, Tensor slopesGrad, Tensor biasesGrad, Tensors... tensors) { +void TypedAlibiGrad(int numHeads, int start, bool addCausalMask, Tensor slopesGrad, Tensor biasesGrad, Tensors... 
tensors) { cudaSetDevice(slopesGrad->getDeviceId().no); constexpr size_t K = sizeof...(tensors); @@ -223,22 +233,22 @@ void TypedAlibiGrad(int numHeads, int start, Tensor slopesGrad, Tensor biasesGra const auto& adj = inputs[K - 1]; // last one is adjoint and full broadcast shape int total = adj.size(); - + // we will reduce over each head int blocks = std::min(MAX_BLOCKS, numHeads); int threads = std::min(MAX_THREADS, total / numHeads); int shared = sizeof(float) * threads * 2; // Use float32 as accumulation type, we accumulate slopes and biases - gAlibiGrad<<>>(slopesGrad, biasesGrad, inputs, numHeads, start); + gAlibiGrad<<>>(slopesGrad, biasesGrad, inputs, numHeads, start, addCausalMask); } template -void AlibiGrad(int numHeads, int start, Tensor slopesGrad, Tensor biasesGrad, Tensors... tensors) { +void AlibiGrad(int numHeads, int start, bool addCausalMask, Tensor slopesGrad, Tensor biasesGrad, Tensors... tensors) { if(slopesGrad->type() == Type::float32) { - TypedAlibiGrad(numHeads, start, slopesGrad, biasesGrad, tensors...); + TypedAlibiGrad(numHeads, start, addCausalMask, slopesGrad, biasesGrad, tensors...); #if COMPILE_FP16 } else if(slopesGrad->type() == Type::float16) { - TypedAlibiGrad(numHeads, start, slopesGrad, biasesGrad, tensors...); + TypedAlibiGrad(numHeads, start, addCausalMask, slopesGrad, biasesGrad, tensors...); #endif } else { ABORT("AlibiGrad for type {} not implemented", slopesGrad->type()); @@ -246,6 +256,6 @@ void AlibiGrad(int numHeads, int start, Tensor slopesGrad, Tensor biasesGrad, Te } // template specialization for h/cpp separation -template void AlibiGrad(int, int, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AlibiGrad(int, int, bool, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); } } diff --git a/src/layers_new/alibi.h b/src/layers_new/alibi.h index bec2da55d..66c102235 100644 --- a/src/layers_new/alibi.h +++ b/src/layers_new/alibi.h @@ -7,6 +7,8 @@ namespace marian { +const int ALIBI_REFERENCE_HEADS = 8; // number of heads in the reference model + // @TODO: this whole set of functions is currently somewhat akward in general, since we need to implement // old style and new style decoder state for this to work. We decoder with the old decoder framework, but // use the new style transformer layers. This will eventually be cleaned up. 
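To make the mask arithmetic easier to follow, here is a scalar sketch of what the gAlibi kernel in the alibi.cu hunks above computes for a single output element, assuming the stripped template argument of std::numeric_limits is float; the function name and flattened parameter list are illustrative, not part of the patch.

#include <algorithm>
#include <cmath>
#include <limits>

// One element of the ALIBI log mask: a slope-scaled absolute distance between
// key and query positions (optionally corrected by a decode-time shift),
// clamped by the padding mask and, if requested, by a causal mask.
inline float alibiLogMaskElement(float slope, float bias, float shift,
                                 int keyIdx, int queryIdx, int start,
                                 float binMask, bool addCausalMask) {
  const float inf = std::numeric_limits<float>::infinity();
  int keyPos    = keyIdx;
  int queryPos  = queryIdx + start;                  // offset by decoding position
  float relPos  = (float)keyPos - (float)queryPos - shift;
  float alibi   = slope * std::abs(relPos + bias);   // slopes are negative
  float logMask = (binMask == 0.f) ? -inf : inf;     // padding: -inf masks out
  if(addCausalMask) {
    float causal = (keyPos > queryPos) ? -inf : inf; // future key positions masked
    logMask = std::min(logMask, causal);
  }
  return std::min(logMask, alibi);                   // keep ALIBI value where unmasked
}

Applied over the [dimBeam, dimBatch * numHeads, dimQuery, dimKeys] output shape, this corresponds to the out[index] assignment in the kernel.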
@@ -70,12 +72,12 @@ Ptr NewDecoderState(Ptr options, bool isBatchMajor = false); // convert an old-style decoder state to an (alibi) decoder state -Ptr convertDecoderState(Ptr state, - Ptr graph, +Ptr convertDecoderState(Ptr state, + Ptr graph, bool decoding=false); // efficient operator for ALIBI log mask with shift and optionally learnable parameters -Expr alibiLogMask(Expr mask, Expr query, Expr shift, Expr slopes, Expr biases, int numHeads, int start); +Expr alibiLogMask(Expr mask, Expr query, Expr shift, Expr slopes, Expr biases, int numHeads, int start, bool addCausalMask = false); namespace nn { @@ -92,14 +94,16 @@ class AlibiDecoderStateItem : public DecoderStateItem { } }; -// Experimental implementation of the ALIBI attention mechanism (via masking) (https://arxiv.org/abs/2108.12409) +/** + * Experimental implementation of the ALIBI attention mechanism (via masking) (https://arxiv.org/abs/2108.12409) + */ class AlibiAttentionMaskProcessor : public AttentionMaskProcessor { public: - bool trainable{false}; // if true don't use learnable parameters + bool trainable{false}; // if true don't use learnable parameters Expr slopes; // learnable per head ALIBI slopes Expr biases; // learnable per head additive biases - + using AttentionMaskProcessor::numHeads; AlibiAttentionMaskProcessor(Ptr graph, @@ -110,87 +114,11 @@ class AlibiAttentionMaskProcessor : public AttentionMaskProcessor { virtual ~AlibiAttentionMaskProcessor() = default; -private: -// @TODO: eventually to be removed. This computes ALIBI log masks with multiple operators, replaced with more efficient version below. -// For now we keep this for documentation and experimentation puprposes. -// The same functionality is implemented in `alibiLogMask` above via a special operator -#if 0 - const float ALIBI_REFERENCE_HEADS{8.f}; // number of reference heads that ALIBI slopes are computed for - - // Compute the alibi mask for a given query and keys - Expr alibiMask(Expr query, int dimQuery, int dimKeys, Ptr state) const { - int start = 0; - Expr shift = nullptr; - - int dimBatch = query->shape()[-3]; - int dimBeam = query->shape()[-4]; - - if(state) { - start = (int)state->getPosition(); - auto alibiState = std::dynamic_pointer_cast(state); - shift = alibiState ? alibiState->getShift() : nullptr; // [dimBeam, dimBatch, dimQuery, 1] - } - - // Create constant tensors of reflecting the query and key positions. - // When decoding, we start with the decoding state position for the query. The key positions are just the indices for the whole sequence. - Expr queryPositions = graph()->constant({1, 1, dimQuery, 1}, inits::range((float)start, (float)(start + dimQuery))); // [1, 1, dimQuery, 1] - Expr keyPositions = graph()->constant({1, 1, 1, dimKeys}, inits::range(0.f, (float)dimKeys)); // [1, 1, 1, dimKeys] - - // Create matrix of distances between positions, rows are distances of current query position vs all key positions. 
- // Layout is the same as the attention distance matrix where we compute rowwise softmaxes of similarities between - // each target word and all the source words - Expr alibiBiases = keyPositions - queryPositions; // [1, 1, dimQuery, dimKeys] - - // apply the corrective shift if any sync-points are present - if(shift) { - alibiBiases = alibiBiases - shift; // [dimBeam, dimBatch, dimQuery, dimKeys] - alibiBiases = reshape(alibiBiases, {dimBeam * dimBatch, 1, dimQuery, dimKeys}); // [dimBeam * dimBatch, 1, dimQuery, dimKeys] - } - - Expr alibi = slopes * abs(alibiBiases + biases); // [(dimBeam * dimBatch)|1, numHeads, dimQuery, dimKeys] - return alibi; - }; - - // Compute the log mask for a given query and combine with the alibi mask - Expr logMask(Expr query, Expr mask, Ptr state) const { - ABORT_IF(!mask, "mask is expected!!"); - - // query: [dimBeam, dimBatch, dimQuery, dimModel] -> dimQuery == dimTrgWords - int dimBatch = query->shape()[-3]; - int dimBeam = query->shape()[-4]; - - int dimQuery = query->shape()[-2]; - int dimKeys = mask->shape()[-2]; - - // all this is bascially a copy of the normal attention mask computation, however we need to do some extra reshaping - // to make the alibi mask and the log mask broadcastable and then combine them via minimum - - // Note, this is not a typical logMask with values 0 (don't mask) and -inf (mask). Rather we use +inf (or a large value) - // and -inf and then compbine with the ALIBI mask via minimum. This way, we keep the original ALIBI values where the mask has - // +inf and have -inf for masking. - // largest useful value and making sure we do not overflow for fp16 - float maskFactor = std::min(NumericLimits(mask->value_type()).max / 2.f, 99999999.f); - // convert binary 0/1 mask to -1/1 mask and then muliply with inf, results in -inf/+inf mask. - auto logMask = (2.f * mask - 1.f) * maskFactor; // [1, dimBatch, dimKeys, 1] - logMask = reshape(logMask, {dimBatch, 1, 1, dimKeys}); // [dimBatch, 1, 1, dimKeys] - - - // make logMask broadcastable when decoding with beam search - logMask = repeat(logMask, /*repeats=*/dimBeam, /*axis=*/-4); // [dimBeam|1 * dimBatch, 1, 1, dimKeys] - - // make logMask and alibiBias broadcastable, then combine - auto alibiBias = alibiMask(query, dimQuery, dimKeys, state); // [(dimBeam * dimBatch)|1, numHeads, dimQuery, dimKeys] - logMask = minimum(logMask, alibiBias); // [dimBeam|1 * dimBatch, numHeads, dimQuery, dimKeys] - - // final reshape to match attention operation - logMask = reshape(logMask, {dimBeam, dimBatch * numHeads, dimQuery, dimKeys}); // [dimBeam|1, dimBatch * numHeads, dimQuery, dimKeys] - return logMask; - } -#endif +private: // Initialized the head-wise scaling factors from ALIBI (they are constant in the original paper, // we are making them optionally learnable here) - Ptr initSlopes(bool decoder = false) const { + Ptr initSlopes() const { // This is the original implementation of ALIBI slopes for LMs. We find our slopes and biases work better for Seq2seq models // Keep for now until we find a use, e.g. in LMs #if 0 @@ -200,69 +128,137 @@ class AlibiAttentionMaskProcessor : public AttentionMaskProcessor { // if there are more or less heads we scale back to 8 heads and interpolate. 
float exponent = (float)(i + 1) * (ALIBI_REFERENCE_HEADS / (float)numHeads); - // We multiply slopes with 2 for the symmetric mask to keep total probability mass the + // We multiply slopes with 2 for the symmetric mask to keep total probability mass the // same as in the causal mask (we have two symmetric halves instead of just one causal half) mVec[i] = -2.f / std::pow(2.f, exponent); if(decoder) mVec[i] *= 0.5f; } - + return inits::fromVector(mVec); #else // Magic numbers, for now don't ask. - std::vector init; - if(decoder) { - return inits::fromValue(-0.1f); - } else { - init = { -2.00f, -1.00f, -0.50f, -0.25f, -0.05f, -0.05f, -0.05f, -0.05f }; - init.resize(numHeads, -0.05f); - return inits::fromVector(init); - } + std::vector init = { -2.00f, -1.00f, -0.50f, -0.25f, -0.05f, -0.05f, -0.05f, -0.05f }; + init.resize(numHeads, -0.05f); + return inits::fromVector(init); #endif } // Head-wise biases for ALIBI, this does not occur in the paper, ignore the magic numbers - Ptr initBiases(bool decoder=false) const { - if(decoder) { - return inits::fromValue(0.3f); - } else { - std::vector init({ 1.00f, -2.00f, 3.00f, -4.00f, 5.00f, -6.00f, 7.00f, -8.00f }); - init.resize(numHeads, 0.f); - return inits::fromVector(init); - } + Ptr initBiases() const { + std::vector init({ 1.00f, -2.00f, 3.00f, -4.00f, 5.00f, -6.00f, 7.00f, -8.00f }); + init.resize(numHeads, 0.f); + return inits::fromVector(init); } public: + // Apply the alibi mask to the given query and mask virtual Expr apply(Expr query, Expr mask) const override { - return apply(query, mask, /*state=*/nullptr); - } - - // Apply the alibi mask to the given query and mask for decoder cross-attention - virtual Expr apply(Expr query, Expr mask, Ptr state) const override { - bool decoder = state != nullptr; - if(!trainable) { - const_cast(slopes) = graph()->constant({numHeads, 1, 1}, initSlopes(decoder)); - const_cast(biases) = graph()->constant({numHeads, 1, 1}, initBiases(decoder)); + const_cast(slopes) = graph()->constant({numHeads, 1, 1}, initSlopes()); + const_cast(biases) = graph()->constant({numHeads, 1, 1}, initBiases()); } else { - registerParameterLazy(slopes, Shape({numHeads, 1, 1}), initSlopes(decoder)); - registerParameterLazy(biases, Shape({numHeads, 1, 1}), initBiases(decoder)); + registerParameterLazy(slopes, Shape({numHeads, 1, 1}), initSlopes()); + registerParameterLazy(biases, Shape({numHeads, 1, 1}), initBiases()); } Expr shift = nullptr; int start = 0; - - if(state) { - start = (int)state->getPosition(); - auto alibiState = std::dynamic_pointer_cast(state); - shift = alibiState ? 
alibiState->getShift() : nullptr; // [dimBeam, dimBatch, dimQuery, 1] - } auto alibiMask = alibiLogMask(mask, query, slopes, biases, shift, numHeads, start); return alibiMask; } }; +/** + * Experimental implementation of the ALIBI attention mechanism for decoder layers + */ +class AlibiDecoderAttentionMaskProcessor : public DecoderAttentionMaskProcessor { +public: + bool trainable{false}; // if true don't use learnable parameters + + Expr slopes; // learnable per head ALIBI slopes + Expr biases; // learnable per head additive biases + + using DecoderAttentionMaskProcessor::numHeads; + + AlibiDecoderAttentionMaskProcessor(Ptr graph, + Ptr options, + bool addCausalMask = false) + : DecoderAttentionMaskProcessor(graph, options, addCausalMask), + trainable(options->get("transformer-alibi-trainable", false)) {} + + virtual ~AlibiDecoderAttentionMaskProcessor() = default; + +private: + // Initialized the head-wise scaling factors from ALIBI (they are constant in the original paper, + // we are making them optionally learnable here) + Ptr initSlopes() const { + if(addCausalMask) { + std::vector mVec(numHeads); + for(size_t i = 0; i < numHeads; ++i) { + // slopes in the paper go from 1/2^1 to 1/2^8 where 8 is the reference number of heads; + // if there are more or less heads we scale back to 8 heads and interpolate. + float exponent = (float)(i + 1) * (ALIBI_REFERENCE_HEADS / (float)numHeads); + mVec[i] = -1.f / std::pow(2.f, exponent); + } + return inits::fromVector(mVec); + } else { + return inits::fromValue(-0.1f); // Magic numbers, for now don't ask. + } + } + + // Head-wise biases for ALIBI, this does not occur in the paper, ignore the magic numbers + Ptr initBiases() const { + if(addCausalMask) { + return inits::fromValue(0.0f); + } else { + return inits::fromValue(0.3f); + } + } + +public: + // Apply the alibi mask to the given query and mask for decoder cross-attention + virtual Expr apply(Expr query, Expr mask, Ptr state) const override { + auto processMask = [this, query, state](Expr mask) { + if(!trainable) { + const_cast(slopes) = graph()->constant({numHeads, 1, 1}, initSlopes()); + const_cast(biases) = graph()->constant({numHeads, 1, 1}, initBiases()); + } else { + registerParameterLazy(slopes, Shape({numHeads, 1, 1}), initSlopes()); + registerParameterLazy(biases, Shape({numHeads, 1, 1}), initBiases()); + } + + Expr shift = nullptr; + int start = 0; + + if(state) { + start = (int)state->getPosition(); + auto alibiState = std::dynamic_pointer_cast(state); + shift = alibiState ? 
alibiState->getShift() : nullptr; // [dimBeam, dimBatch, dimQuery, 1] + } + + // @TODO: make sure that we never want to have a causal mask here if start > 0 (this should indicate decoding) + return alibiLogMask(mask, query, slopes, biases, shift, numHeads, start, addCausalMask && start == 0); + }; + + if(mask) { + // recompute the mask if input mask changes (different memory address), otherwise return cached version + auto equal = [](Expr a, Expr b) { return a == b; }; + return cachedMask_->apply(mask, processMask, equal); + } else { + // @TODO: avoid this mask recreation for every layer + int dimBatch = query->shape()[-3]; + int dimKeys = (int)state->getPosition() + 1; + mask = graph()->constant({1, dimBatch, dimKeys, 1}, inits::ones()); + + // recompute the ALIBI mask if shape changes, but still has to create the above temporary mask first + auto equal = [](Expr a, Expr b) { return a->shape() == b->shape(); }; + return cachedMask_->apply(mask, processMask, equal); + } + } +}; + } // namespace nn } // namespace marian \ No newline at end of file diff --git a/src/layers_new/attention.cpp b/src/layers_new/attention.cpp index c3758296e..2ec081a30 100644 --- a/src/layers_new/attention.cpp +++ b/src/layers_new/attention.cpp @@ -3,10 +3,10 @@ #include "layers_new/alibi.h" namespace marian { -namespace nn { +namespace nn { // Factory function to create attention layers from options -Ptr attentionFromOptions(Ptr graph, Ptr options) { +Ptr attentionFromOptions(Ptr graph, Ptr options, bool enableCache) { // @TODO: currently this does nothing as it isn't set anywhere std::string selfAttentionType = options->get("transformer-encoder-attention", "default"); // currently only default @@ -17,7 +17,7 @@ Ptr attentionFromOptions(Ptr graph, Ptrget("transformer-dropout-attention", 0.f); - return New>(graph, numHeads, modelDim, modelDim, attentionDropoutProbability); + return New(graph, numHeads, modelDim, modelDim, attentionDropoutProbability, enableCache); } else { ABORT("Unknown transformer encoder attention type: {}", selfAttentionType); @@ -25,9 +25,9 @@ Ptr attentionFromOptions(Ptr graph, Ptr attentionMaskProcessorFromOptions(Ptr graph, Ptr options) { +Ptr maskProcessorFromOptions(Ptr graph, Ptr options) { // currently only default or alibi - std::string processorType = options->get("transformer-attention-mask", "default"); + std::string processorType = options->get("transformer-attention-mask", "default"); if(processorType == "default") { return New(graph, options); } else if(processorType == "alibi") { @@ -37,6 +37,33 @@ Ptr attentionMaskProcessorFromOptions(Ptr selfMaskProcessorFromOptions(Ptr graph, Ptr options) { + auto autoRegType = options->get("transformer-decoder-autoreg", "self-attention"); + if(autoRegType == "rnn") { + // creates a dummy processor that returns an unprocessed mask + return New(graph, options); + } else if(autoRegType == "self-attention") { + // here we will return modified log masks for self-attention + std::string processorType = options->get("transformer-attention-mask", "default"); + if(processorType == "alibi") { + return New(graph, options, /*addCausalMask=*/true); + } else { + return New(graph, options, /*addCausalMask=*/true); + } + } else { + ABORT("Unknown transformer decoder autoregressive type: {}", autoRegType); + } +} + +Ptr contextDecoderMaskProcessorFromOptions(Ptr graph, Ptr options) { + std::string processorType = options->get("transformer-attention-mask", "default"); + if(processorType == "alibi") { + return New(graph, options, /*addCausalMask=*/false); + } 
else { + return New(graph, options, /*addCausalMask=*/false); + } +} + } // namespace nn // specialized faster operator for log-mask computation @@ -49,28 +76,27 @@ class LogMaskNode : public UnaryNodeOp { // see the reshape below in the logMask function int dimBatch = mask->shape()[-4]; int dimKeys = mask->shape()[-1]; - return { dimBatch, numHeads, 1, dimKeys }; + return { dimBatch, numHeads, 1, dimKeys }; } public: LogMaskNode(Expr mask, int numHeads) - : UnaryNodeOp(mask, newShape(mask, numHeads)), numHeads_(numHeads) + : UnaryNodeOp(mask, newShape(mask, numHeads)), + numHeads_(numHeads) {} NodeOps forwardOps() override { - float lowest = NumericLimits(value_type()).lowest; - float maskFactor = std::max(lowest / 2.f, -99999999.f); // to make sure we do not overflow for fp16 - - using namespace functional; // compared to the multi-operation code this does conversion and broadcasting in one step - return { NodeOp(Element(_1 = (1.f - _2) * maskFactor, val_, child(0)->val())) }; + using namespace functional; + return { NodeOp(Element(_1 = log(_2), val_, child(0)->val())) }; } NodeOps backwardOps() override { - float lowest = NumericLimits(value_type()).lowest; - float maskFactor = std::max(lowest / 2.f, -99999999.f); // to make sure we do not overflow for fp16 + if(!trainable()) + return { }; + using namespace functional; - return { NodeOp(Add(-maskFactor * _1, child(0)->grad(), adj_)) }; + return { NodeOp(Add(_1 / _2, child(0)->grad(), adj_, child(0)->val())) }; } virtual size_t hash() override { @@ -93,13 +119,27 @@ class LogMaskNode : public UnaryNodeOp { const std::string type() override { return "log-mask"; } }; -Expr logMask(Expr mask, int numHeads) { +Expr logMask(Expr mask, int numHeads, bool addCausalMask) { // incoming mask has shape [1, dimBatch, dimKeys, 1] int dimBatch = mask->shape()[-3]; int dimKeys = mask->shape()[-2]; mask = reshape(mask, {dimBatch, 1, 1, dimKeys}); auto logMask = Expression(mask, numHeads); // [dimBatch, numHeads, 1, dimKeys] - return reshape(logMask, {1, dimBatch * numHeads, 1, dimKeys}); + logMask = reshape(logMask, {1, dimBatch * numHeads, 1, dimKeys}); + + // @TODO: this is needlessly slow, integrate with the above in special kernel + if(addCausalMask) { + // add causal mask to logMask + std::vector vMask(dimKeys * dimKeys, 0.f); + for(int i = 0; i < dimKeys; ++i) + for(int j = i + 1; j < dimKeys; ++j) + vMask[i * dimKeys + j] = -std::numeric_limits::infinity(); + + auto triangle = mask->graph()->constant({1, 1, dimKeys, dimKeys}, inits::fromVector(vMask)); + logMask = minimum(logMask, triangle); // [1, dimBatch * numHeads, dimKeys, dimKeys] + } + + return logMask; } } // namespace marian diff --git a/src/layers_new/attention.h b/src/layers_new/attention.h index 6ddfaad2a..9bd31baa0 100644 --- a/src/layers_new/attention.h +++ b/src/layers_new/attention.h @@ -6,31 +6,88 @@ namespace marian { -// specialized operator for faster logMask computation -Expr logMask(Expr mask, int numHeads); +/** + * Specialized operator for faster logMask computation + */ +Expr logMask(Expr mask, int numHeads, bool addCausalMask); namespace nn { -// Abstract base class for attention mechanisms -class AttentionLayer : public Layer, +/** + * Abstract base class for attention mechanisms + */ +class AttentionLayer : public Layer, public IQuaternaryLayer { protected: using Layer::namedLayers_; - + public: AttentionLayer(Ptr graph) : Layer(graph) {} virtual ~AttentionLayer() = default; }; -class MultiplicativeAttention : public AttentionLayer { +/** + * Base class for attention 
layers that collect attention weights + */ +class AttentionCollector { +private: + mutable std::vector alignments_; // @TODO: rename to something more accurate + +public: + bool saveAttentionWeights{false}; + int numHeads{8}; + + AttentionCollector(bool saveAttentionWeights, int numHeads = 8) + : saveAttentionWeights(saveAttentionWeights), numHeads(numHeads) {} + + void collectOneHead(Expr weights) const { + // weights: [dimBeam, dimBatch * numHeads, dimQuery|1, dimKeys] + + int dimBeam = weights->shape()[-4]; + int dimBatchHeads = weights->shape()[-3]; + int dimQuery = weights->shape()[-2]; // (max) length of trg sequence, or 1 in decoding + int dimKeys = weights->shape()[-1]; // (max) length of src sequence + + int dimBatch = dimBatchHeads / numHeads; + + weights = reshape(weights, {dimBeam * dimBatch, numHeads, dimQuery, dimKeys}); + auto head0 = slice(weights, -3, 0); // [dimBeam * dimBatch, 1, dimQuery, dimKeys] + + // reshape and transpose to match the format guided_alignment expects + head0 = reshape(head0, {dimBeam, dimBatch, dimQuery, dimKeys}); + head0 = transpose(head0, {0, 3, 1, 2}); // [beam depth, dimKeys, dimBatch, dimQuery|1] + + // save only last alignment set. For training this will be all alignments, + // for translation only the last one. Also split alignments by target words. + // @TODO: make splitting obsolete + // @TODO: why is this even here? + alignments_.clear(); + for(int i = 0; i < dimQuery; ++i) { // loop over all trg positions. In decoding, there is only one. + alignments_.push_back(slice(head0, -1, i)); // [tgt index][beam depth, max src length, batch size, 1] P(src pos|trg pos, beam index, batch index) + } + } + + const std::vector& getAlignments() const { + return alignments_; + } + + void clear() { + alignments_.clear(); + } +}; + +/** + * Base class for multiplicative attention layers (can collect attention weights) + */ +class MultiplicativeAttention : public AttentionLayer, public AttentionCollector { protected: using AttentionLayer::namedLayers_; public: Ptr attentionDropout; - MultiplicativeAttention(Ptr graph, float dropoutProbability) - : AttentionLayer(graph) { + MultiplicativeAttention(Ptr graph, float dropoutProbability, bool saveAttentionWeights = false) + : AttentionLayer(graph), AttentionCollector(saveAttentionWeights) { attentionDropout = New(graph, dropoutProbability); registerLayer(attentionDropout); } @@ -45,7 +102,7 @@ class MultiplicativeAttention : public AttentionLayer { // multiplicative attention with flattened softmax float scale = 1.0f / std::sqrt((float)dimKeys); // scaling to avoid extreme values due to matrix multiplication - + // query, keys and values: [dimBeam, dimBatch * numHeads, (dimQuery|dimKeys=dimValues), dimHead] auto z = bdot(query, keys, false, true, scale); // [dimBeam, dimBatch * numHeads, dimQuery, dimKeys] @@ -55,11 +112,10 @@ class MultiplicativeAttention : public AttentionLayer { // take softmax along src sequence axis (-1) auto weights = softmax(z); // [dimBeam, dimBatch * numHeads, dimQuery, dimKeys] - -#if 0 // @TODO: make this work again - if(saveAttentionWeights) - collectOneHead(weights, dimBeam); -#endif + + if(saveAttentionWeights) { + collectOneHead(weights); + } // optional dropout for attention weights weights = attentionDropout->apply(weights); @@ -70,15 +126,25 @@ class MultiplicativeAttention : public AttentionLayer { auto output = bdot(weights, values); // [dimBeam, dimBatch * numHeads, dimQuery, dimHead] return output; } + + virtual void clear() override { + AttentionLayer::clear(); + 
AttentionCollector::clear(); + } }; -// Base class for multi-head attention -template // Currently only used for MultiplicativeAttention -class MultiHeadAttention : public AttentionType { +/** + * Extended multiplicative attention layer with multiple heads + * and separate query, key and value projections, as well as + * an output projection. + */ +class MultiHeadAttention : public MultiplicativeAttention { protected: - using AttentionType::namedLayers_; + using MultiplicativeAttention::namedLayers_; + using AttentionCollector::saveAttentionWeights; private: + bool enableCache_{false}; IPtr cachedKh_; // cached result of key projection IPtr cachedVh_; // cached result of value projection @@ -93,15 +159,17 @@ class MultiHeadAttention : public AttentionType { int modelDim; MultiHeadAttention(Ptr graph, - int numHeads, - int attDim, + int numHeads, + int attDim, int modelDim, - float dropoutProbability) - : AttentionType(graph, dropoutProbability), - cachedKh_(new CachedExpr()), + float dropoutProbability, + bool enableCache = false) + : MultiplicativeAttention(graph, dropoutProbability), + enableCache_(enableCache), + cachedKh_(new CachedExpr()), cachedVh_(new CachedExpr()), - numHeads(numHeads), - attDim(attDim), + numHeads(numHeads), + attDim(attDim), modelDim(modelDim) { qProj = New(graph, attDim); registerLayer(qProj); @@ -117,7 +185,7 @@ class MultiHeadAttention : public AttentionType { virtual ~MultiHeadAttention() = default; protected: - // join beam and batch dimension and split model dimension in to heads and head dimension. We also need to transpose to + // join beam and batch dimension and split model dimension in to heads and head dimension. We also need to transpose to // be able to do an efficient batched matmul. Expr splitHeads(Expr input) const { int dimSteps = input->shape()[-2]; @@ -149,22 +217,27 @@ class MultiHeadAttention : public AttentionType { public: // Apply the multi-head attention to the given query, keys and values virtual Expr apply(Expr query, Expr keys, Expr values, Expr mask) const override { + // @TODO: implement custom bdot to avoid splitHeads/joinHeads + // @TODO: explore FlashAttention-like cpu implementation auto qh = splitHeads(qProj->apply(query)); - // @TODO: in original implementation we use shape()->elements(), dunno why - auto equal = [](Expr a, Expr b) { return a->shape() == b->shape(); }; - - // these two get conditionally recomputed if their size changes according to criterion above - auto kh = cachedKh_->apply(keys, [this](Expr keys) { - return splitHeads(kProj->apply(keys)); - }, equal); - - auto vh = cachedVh_->apply(values, [this](Expr values) { - return splitHeads(vProj->apply(values)); - }, equal); - - auto output = AttentionType::apply(qh, kh, vh, mask); - + // if enabledCache_ is true, we cache the results of the key and value projections + // otherwise equal is always false and the key and value projections are recomputed + Expr kh, vh; + if(enableCache_) { + // @TODO: in original implementation we use shape()->elements(), dunno why + auto equal = [](Expr a, Expr b) { return a->shape() == b->shape(); }; + // these two get conditionally recomputed if their size changes according to criterion above + kh = cachedKh_->apply(keys, [this](Expr keys) { return splitHeads(kProj->apply(keys)); }, equal); + vh = cachedVh_->apply(values, [this](Expr values) { return splitHeads(vProj->apply(values)); }, equal); + } else { + kh = splitHeads(kProj->apply(keys)); + vh = splitHeads(vProj->apply(values)); + } + + auto output = 
MultiplicativeAttention::apply(qh, kh, vh, mask); + + // @TODO: combine joinHeads and apply in one matrix multiplication via striding output = joinHeads(output); output = oProj->apply(output); @@ -178,51 +251,133 @@ class MultiHeadAttention : public AttentionType { } }; -// Base class for attention mask processors -// Attention mask processors are used to process a given attention mask before it is used in an attention computation. -struct AttentionMaskProcessor : public LayerWithOptions, public IBinaryLayer, public IBinaryDecoderLayer { +/** + * Base class for mask processors. + */ +struct MaskProcessor : public LayerWithOptions, public IBinaryLayer { + IPtr cachedMask_; + + MaskProcessor(Ptr graph, + Ptr options) + : LayerWithOptions(graph, options), + cachedMask_(new CachedExpr()) {} + + virtual ~MaskProcessor() = default; + + void clear() override { + LayerWithOptions::clear(); + cachedMask_->clear(); + } +}; + +/** + * Base class for decoder mask processors. + */ +struct DecoderMaskProcessor : public LayerWithOptions, public IBinaryDecoderLayer { + bool addCausalMask{false}; + IPtr cachedMask_; + + DecoderMaskProcessor(Ptr graph, + Ptr options, + bool addCausalMask = false) + : LayerWithOptions(graph, options), + addCausalMask(addCausalMask), + cachedMask_(new CachedExpr()) {} + + virtual ~DecoderMaskProcessor() = default; + + void clear() override { + LayerWithOptions::clear(); + cachedMask_->clear(); + } +}; + +/** + * Attention mask processors are used to process a given attention mask + * before it is used in an attention computation. + */ +struct AttentionMaskProcessor : public MaskProcessor { int numHeads{1}; AttentionMaskProcessor(Ptr graph, Ptr options) - : LayerWithOptions(graph, options), + : MaskProcessor(graph, options), numHeads(opt("transformer-heads", 1)) {} virtual ~AttentionMaskProcessor() = default; - + virtual Expr apply(Expr /*query*/, Expr mask) const override { if(!mask) return nullptr; - // @TODO eventually remove this branch. For now we keep it for documentation purposes -#if 0 - // LayerAttention expects mask in a different layout - int dimBatch = mask->shape()[-3]; - int dimKeys = mask->shape()[-2]; - - mask = reshape(mask, {dimBatch, 1, 1, dimKeys}); // [batch size, num heads broadcast=1, max length broadcast=1, max length] - - float maskFactor = std::max(NumericLimits(mask->value_type()).lowest / 2.f, -99999999.f); // to make sure we do not overflow for fp16 - auto logMask = (1 - mask) * maskFactor; - logMask = reshape(repeat(logMask, numHeads, -3), {1, dimBatch * numHeads, 1, dimKeys}); - return logMask; -#else // shape of mask should be [1, dimBatch, dimKeys, 1] - // this does all the above work in one step - return marian::logMask(mask, numHeads); // [1, dimBatch * numHeads, 1, dimKeys] -#endif + return marian::logMask(mask, numHeads, /*addCausalMask=*/false); // [1, dimBatch * numHeads, 1, dimKeys] } +}; + +/** + * Base class for decoder attention mask processors. Attention mask processors are used to + * process a given attention mask before it is used in an attention computation. + * Decoder attention mask processors can take advantage of information from the decoder state. 
+ */ +struct DecoderAttentionMaskProcessor : public DecoderMaskProcessor { + int numHeads{1}; + + DecoderAttentionMaskProcessor(Ptr graph, + Ptr options, + bool addCausalMask = false) + : DecoderMaskProcessor(graph, options, addCausalMask), + numHeads(opt("transformer-heads", 1)) {} + + virtual ~DecoderAttentionMaskProcessor() = default; + + virtual void initState(Ptr /*state*/) const override {} virtual Expr apply(Expr query, Expr mask, Ptr /*state*/) const override { - return apply(query, mask); + if(!mask) + return nullptr; + + // shape of input `mask` should be [1, dimBatch, dimKeys, 1] + // output shape will be // [1, dimBatch * numHeads, 1, dimKeys] if addCausalMask is false + // or [1, dimBatch * numHeads, dimKeys, dimKeys] if addCausalMask is true + auto processMask = [this](Expr mask) { return marian::logMask(mask, numHeads, addCausalMask); }; + + // recompute the mask if input mask changes (different memory address), otherwise return cached version + auto equal = [](Expr a, Expr b) { return a == b; }; + + // recompute the mask if the shape changes, otherwise return cached version + return cachedMask_->apply(mask, processMask, equal); } }; -// Factory function to create attention layers from options -Ptr attentionFromOptions(Ptr graph, Ptr options); +/** + * Dummy decoder mask processor that returns the unprocessed mask, used for RNN autoregressive decoding + */ +struct DummyDecoderMaskProcessor : public DecoderMaskProcessor { + DummyDecoderMaskProcessor(Ptr graph, + Ptr options) + : DecoderMaskProcessor(graph, options, /*addCausalMask=*/false) {} + + virtual ~DummyDecoderMaskProcessor() = default; + + virtual void initState(Ptr /*state*/) const override {} + + virtual Expr apply(Expr /*query*/, Expr mask, Ptr /*state*/) const override { + return mask; + } +}; -// Factory function to create attention mask processors from options -Ptr attentionMaskProcessorFromOptions(Ptr graph, Ptr options); +/** + * Factory function to create attention layers from options + */ +Ptr attentionFromOptions(Ptr graph, Ptr options, bool enableCache = false); + +/** + * Factory function to create mask processors from options + */ +Ptr maskProcessorFromOptions(Ptr graph, Ptr options); +Ptr selfMaskProcessorFromOptions(Ptr graph, Ptr options); +Ptr contextDecoderMaskProcessorFromOptions(Ptr graph, Ptr options); } // namespace nn } // namespace marian diff --git a/src/layers_new/decoder.h b/src/layers_new/decoder.h index 406017d64..9ead145f9 100644 --- a/src/layers_new/decoder.h +++ b/src/layers_new/decoder.h @@ -11,7 +11,7 @@ namespace marian { namespace nn { // Interface: decoder state -struct DecoderState : public IClassName, public std::enable_shared_from_this { +class DecoderState : public IClassName, public std::enable_shared_from_this { protected: size_t position{0}; @@ -27,6 +27,10 @@ struct DecoderState : public IClassName, public std::enable_shared_from_thisposition = pos; + } + // Dynamic cast to requested layer type. 
Will return nullptr if not possible template Ptr as() { @@ -43,7 +47,7 @@ struct DecoderState : public IClassName, public std::enable_shared_from_this Ptr cast() { auto stateCast = as(); - ABORT_IF(!stateCast, "State {} cannot be cast to requested type {}", + ABORT_IF(!stateCast, "State {} cannot be cast to requested type {}", className(), utils::cxxTypeName()); return stateCast; @@ -57,10 +61,11 @@ struct DecoderState : public IClassName, public std::enable_shared_from_thissetPosition(pos); + } + void append(Ptr item) { ABORT_IF(position != item->getPosition(), "DecoderStateList.position ({}) != DecoderStateItem.position ({}) ?", position, item->getPosition()); items_.push_back(item); } - /** + /** * Retrieve DecoderStateItem at index i */ Ptr at(size_t i) const { @@ -106,29 +117,52 @@ class DecoderStateList : public DecoderState { size_t size() const { return items_.size(); } }; +class EncoderContext { +private: + Expr context_; + Expr contextMask_; + +public: + EncoderContext(Expr context, Expr contextMask) + : context_(context), contextMask_(contextMask) {} + + virtual Expr getContext() const { return context_; } + virtual Expr getContextMask() const { return contextMask_; } +}; + +class DecoderSeq2SeqState : public DecoderStateList, public EncoderContext { +public: + DecoderSeq2SeqState(size_t position, Expr context, Expr contextMask) + : DecoderStateList(position), EncoderContext(context, contextMask) {} +}; // Interface: Unary function struct IUnaryDecoderLayer { + virtual void initState(Ptr /*state*/) const = 0; virtual Expr apply(Expr /*input*/, Ptr /*state*/) const = 0; }; // Interface: Binary function struct IBinaryDecoderLayer { + virtual void initState(Ptr /*state*/) const = 0; virtual Expr apply(Expr, Expr, Ptr /*state*/) const = 0; }; // Interface: Ternary function struct ITernaryDecoderLayer { + virtual void initState(Ptr /*state*/) const = 0; virtual Expr apply(Expr, Expr, Expr, Ptr /*state*/) const = 0; }; // Interface: 4ary function struct IQuaternaryDecoderLayer { + virtual void initState(Ptr /*state*/) const = 0; virtual Expr apply(Expr, Expr, Expr, Expr, Ptr /*state*/) const = 0; }; // Interface: N-Ary function struct INaryLayerDecoderLayer { + virtual void initState(Ptr /*state*/) const = 0; virtual Expr apply(const std::vector& /*inputs*/, Ptr /*state*/) const = 0; }; diff --git a/src/layers_new/interface.h b/src/layers_new/interface.h index a938803ee..590348548 100644 --- a/src/layers_new/interface.h +++ b/src/layers_new/interface.h @@ -48,7 +48,6 @@ struct IClearable { virtual void clear() = 0; }; - // Helper macro to turn parameter C++ variable name into a string. #define registerParameter(paramArg, shape, init) \ do { \ @@ -58,7 +57,7 @@ do { \ } while(0); // Helper macro to turn parameter C++ variable name into a string. -// This version is meant to be used in apply(...) functions for lazy parameter inits +// This version is meant to be used in apply(...) functions for lazy parameter inits // hence has to cast away constness. #define registerParameterLazy(paramArg, shape, init) \ do { \ @@ -80,8 +79,8 @@ do { \ } \ } while(0); -// Helper macro that adds the layer as a named sublayer to the parent layer and uses the given name. Different from above as -// the C++ variable name itself is not used a name string. +// Helper macro that adds the layer as a named sublayer to the parent layer and uses the given name. Different from above as +// the C++ variable name itself is not used a name string. 
#define registerLayerWithName(layerArg, name) \ do { \ ABORT_IF(!layerArg, "Layer {} of type {} with name {} is not initialized", #layerArg, utils::cxxTypeName(layerArg), name); \ @@ -107,8 +106,8 @@ class Layer : public IClassName, public IClearable, public std::enable_shared_fr private: Weak graph_; - // Using naked pointer as a weak reference. Cannot use shared_ptr or weak_ptr - // as registration happens in constructor of parent layer and shared_from_this() + // Using naked pointer as a weak reference. Cannot use shared_ptr or weak_ptr + // as registration happens in constructor of parent layer and shared_from_this() // cannot be used before parent layer constructor exits. Layer* firstParent_{nullptr}; std::string name_; @@ -135,13 +134,13 @@ class Layer : public IClassName, public IClearable, public std::enable_shared_fr virtual ~Layer() = default; - Ptr graph() { + Ptr graph() { auto graph = graph_.lock(); ABORT_IF(!graph, "graph in layer {} expired?", path()); return graph; } - const Ptr graph() const { + const Ptr graph() const { auto graph = graph_.lock(); ABORT_IF(!graph, "graph in layer {} expired?", path()); return graph; @@ -172,7 +171,7 @@ class Layer : public IClassName, public IClearable, public std::enable_shared_fr template Ptr cast() { auto layerCast = as(); - ABORT_IF(!layerCast, "Layer {} cannot be cast to requested type {}", + ABORT_IF(!layerCast, "Layer {} cannot be cast to requested type {}", className(), utils::cxxTypeName()); return layerCast; @@ -182,7 +181,7 @@ class Layer : public IClassName, public IClearable, public std::enable_shared_fr Ptr cast() const { return const_cast(this)->cast(); } - + // Return all named parameters for this specific layer (not descending into sub-layers) std::vector& namedParameters() { return namedParameters_; } const std::vector& namedParameters() const { return namedParameters_; } @@ -192,7 +191,7 @@ class Layer : public IClassName, public IClearable, public std::enable_shared_fr const std::vector>& namedLayers() const { return namedLayers_; } // Return all named sub-layers for this layer and its sub-layers (descending recursively into sub-layers). - // Can be used with layer type e.g. allNamedLayers() to return only sub-layers of this type. + // Can be used with layer type e.g. allNamedLayers() to return only sub-layers of this type. // Returned layers will then have the given type and do not need to be cast anymore. template std::vector> allNamedLayers() { @@ -201,7 +200,7 @@ class Layer : public IClassName, public IClearable, public std::enable_shared_fr auto castLayer = namedLayer.second->as(); if(castLayer) layers.emplace_back(namedLayer.first, castLayer); - + auto subLayers = namedLayer.second->allNamedLayers(); layers.insert(layers.end(), subLayers.begin(), subLayers.end()); } @@ -213,8 +212,8 @@ class Layer : public IClassName, public IClearable, public std::enable_shared_fr return const_cast(this)->allNamedLayers(); } - // Returns all sub-layers (only the layers, not the names) for this layer and its sub-layers (descending - // recursively into sub-layers). Can be used with layer type e.g. allLayers() to return only + // Returns all sub-layers (only the layers, not the names) for this layer and its sub-layers (descending + // recursively into sub-layers). Can be used with layer type e.g. allLayers() to return only // sub-layers of this type. Returned layers will then have the given type and do not need to be cast anymore. 
template std::vector> allLayers() { @@ -230,18 +229,18 @@ class Layer : public IClassName, public IClearable, public std::enable_shared_fr } // Used by parent layers to set the name of a sub-layer. - // @TODO: make this private and only allow friend access from layers before merging with master. - // Currently misused for top layer that has no parent layer that can set its name. + // @TODO: make this private and only allow friend access from layers before merging with master. + // Currently misused for top layer that has no parent layer that can set its name. void setName(const std::string& name) { name_ = name; } const std::string& name() const { return name_; } // This sets the first parent of a sublayer (the layer a sublayer was first registered with). - // This is required to generate the correct path/name for layer parameters at saving time. - void setFirstParent(Layer* parent) { + // This is required to generate the correct path/name for layer parameters at saving time. + void setFirstParent(Layer* parent) { ABORT_IF(firstParent_ != nullptr, "Parent layer has already been set"); ABORT_IF(parent == this, "Parent layer has to be different from child"); - firstParent_ = parent; + firstParent_ = parent; } // The parent layer of a sublayer is the first layer the sublayer has been registered with. @@ -275,9 +274,9 @@ class Layer : public IClassName, public IClearable, public std::enable_shared_fr return ss.str(); } - // Return Mode::eval or Mode::train. This is used to determine if training only layer-internal actions + // Return Mode::eval or Mode::train. This is used to determine if training only layer-internal actions // like dropout should be run. This will not affect graph-internal gradient propagation unless somehow - // specified in a layer. + // specified in a layer. Mode getMode() const { #if 1 if(graph()->isInference()) { @@ -355,10 +354,10 @@ class LayerWithOptions : public Layer { /** * Wrapper to be used exclusively inside LayerList or other similar containers. This is allows to use the apply(...) functions * of a layer without having to cast to specific type (this is done internally based on the number of arguments). Inspired by - * boost::any_type which allows to construct containers that hold various types. + * boost::any_type which allows to construct containers that hold various types. * This should allow to use any layer and iterfaces will be added here as required. */ -class AnyLayer final : public IUnaryLayer, +class AnyLayer final : public IUnaryLayer, public IBinaryLayer, public ITernaryLayer, public IQuaternaryLayer, @@ -371,7 +370,7 @@ class AnyLayer final : public IUnaryLayer, // private/protected constructor, should only be created within listed classes with friendship AnyLayer(const Ptr& layer) : layer_(layer) {} - + friend class LayerList; public: @@ -385,7 +384,7 @@ class AnyLayer final : public IUnaryLayer, template Ptr cast() const { auto layerCast = as(); - ABORT_IF(!layerCast, "Layer {} cannot be cast to requested type {}", + ABORT_IF(!layerCast, "Layer {} cannot be cast to requested type {}", layer_->className(), utils::cxxTypeName()); return layerCast; @@ -416,12 +415,12 @@ class AnyLayer final : public IUnaryLayer, } }; -/** +/** * Holds sublayers in a list and performs correct registration of sublayers. Sublayers are indexed * and can be accessed like array elements, including iteration. - * `LayerList` -- in contrast to `Sequential` -- does not provide `apply` functions. 
+ * `LayerList` -- in contrast to `Sequential` -- does not provide `apply` functions. * You have to define the execution order and information flow in code. - * + * * See TransformerEncoder for an example where we hold the transformer layer stack in a LayerList, * but define a custom apply function (due to masks being external information and shared between layers). */ @@ -433,7 +432,7 @@ class LayerList : public Layer { void recursiveAppend(Last last) { append(last); } - + template void recursiveAppend(First first, Rest ...rest) { append(first); @@ -452,8 +451,8 @@ class LayerList : public Layer { virtual ~LayerList() = default; - /** - * This inserts an already existing sublayer from this or a different container which will result in + /** + * This inserts an already existing sublayer from this or a different container which will result in * parameter sharing if there are parameters. ``` auto layers = New(graph); @@ -471,7 +470,7 @@ class LayerList : public Layer { layers_.emplace_back(new AnyLayer(layer)); // not using New<...> because of missing friendship } - /** + /** * Retrieve sublayer at index i */ Ptr at(size_t i) const { @@ -494,19 +493,19 @@ class LayerList : public Layer { } }; -/** +/** * `Sequential` is a list of layers similar to `LayerList`, but does provide a set of `apply` functions. * These function assume that the first element in the container can be a unary, binary, ternary * or n-ary layer, but all subsequent layers have to be unary layers as they will consume the single - * output of their preceding layer. Non-unary layers will fail to execute during runtime if they are + * output of their preceding layer. Non-unary layers will fail to execute during runtime if they are * not the very first layer. - * + * * `Sequential` can be used to implement typical feed forward networks: - * + * ``` using namespace marian::nn; - auto seq = New(graph, + auto seq = New(graph, New(graph, 100), New(graph), New(graph, 0.1f), @@ -519,7 +518,7 @@ class LayerList : public Layer { ``` * For other application patterns use `LayerList` and implement them yourself by traversing the layers. 
*/ -class Sequential : public LayerList, +class Sequential : public LayerList, public IUnaryLayer, public IBinaryLayer, public ITernaryLayer, @@ -567,7 +566,7 @@ class Sequential : public LayerList, for(int i = 1; i < layers_.size(); ++i) output = layers_[i]->apply(output); return output; - } + } }; diff --git a/src/layers_new/neuralnet.h b/src/layers_new/neuralnet.h index c0912634f..04552501e 100644 --- a/src/layers_new/neuralnet.h +++ b/src/layers_new/neuralnet.h @@ -113,18 +113,10 @@ struct Linear : public Layer, public IUnaryLayer { registerParameterLazy(bias, Shape({ dimOut }), inits::zeros()); } - Type outputType = x->value_type(); if(useBias) - return marian::affine(x, - marian::cast(weight, outputType), - marian::cast(bias, outputType), - /*transA=*/false, - /*transB=*/transposed); + return marian::affine(x, weight, bias, /*transA=*/false, /*transB=*/transposed); else - return marian::dot(x, - marian::cast(weight, outputType), - /*transA=*/false, - /*transB=*/transposed); + return marian::dot(x, weight, /*transA=*/false, /*transB=*/transposed); } }; diff --git a/src/layers_new/rnn.h b/src/layers_new/rnn.h index 720fa50f7..9a9cd067f 100644 --- a/src/layers_new/rnn.h +++ b/src/layers_new/rnn.h @@ -12,6 +12,7 @@ struct CellState { }; struct ICell { + virtual void initState(Ptr state) const = 0; virtual std::vector applyToInput(Expr input) const = 0; virtual Expr applyToState(const std::vector& inputs, Expr mask, Ptr state) const = 0; }; @@ -36,12 +37,17 @@ class SSRU final : public Layer, public ICell { registerLayer(dropout); } + virtual void initState(Ptr state) const override { + state->recurrent = graph()->constant({1, 1, 1, dimState}, inits::zeros()); + state->position = 0; + } + std::vector applyToInput(Expr input) const override { int dimModel = input->shape()[-1]; ABORT_IF(dimModel != dimState, "Model dimension {} has to match state dimension {}", dimModel, dimState); input = dropout->apply(input); - + Expr output = iProj->apply(input); Expr forget = fProj->apply(input); @@ -73,7 +79,7 @@ class RNN final : public Layer, public IBinaryLayer, public IBinaryDecoderLayer Ptr cell; Ptr oProj; - RNN(Ptr graph, int dimState, bool outputProjection = false) + RNN(Ptr graph, int dimState, bool outputProjection = false) : Layer(graph) { cell = New(graph, dimState); registerLayer(cell); @@ -84,6 +90,14 @@ class RNN final : public Layer, public IBinaryLayer, public IBinaryDecoderLayer } } + virtual void initState(Ptr state) const override { + ABORT("Remove this abort once this is actually used in the decoder"); + auto cellState = New(); + cell->initState(/*in/out=*/cellState); + state->as()->set(cellState->recurrent); + state->setPosition(cellState->position); + } + virtual Expr apply(Expr input, Expr inputMask = nullptr) const override { auto state = New(graph()->constant({1, 1, 1, cell->dimState}, inits::zeros()), /*position=*/0); return apply(input, inputMask, state); @@ -93,13 +107,16 @@ class RNN final : public Layer, public IBinaryLayer, public IBinaryDecoderLayer auto cellState = New(); cellState->recurrent = state->as()->get(); + // during decoding time is of dimension 1, so this is a no-op (reshape in fact) input = swapTimeBatch(input); // [beam, time, batch, dim] if(inputMask) + // same here inputMask = swapTimeBatch(inputMask); int dimTimeAxis = -3; - + std::vector inputs = cell->applyToInput(input); + // @TODO: this could be implemented as a special kernel/operator std::vector outputs; for(int i = 0; i < input->shape()[dimTimeAxis]; ++i) { std::vector stepInputs(inputs.size()); 
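The RNN::apply loop in this hunk follows a precompute-then-step pattern: the input-dependent projections are computed once for the whole sequence, and only the cheap recurrent update runs per time step. A minimal sketch of that pattern on per-step float vectors, with Cell standing in for ICell and a placeholder recurrence (not the SSRU formula):
```
#include <cstddef>
#include <vector>

struct Cell {
  // precompute input-dependent projections for all steps at once (identity here)
  std::vector<std::vector<float>> applyToInput(const std::vector<std::vector<float>>& input) const {
    return input;
  }
  // one recurrence step: update the state from the projected step input
  std::vector<float> applyToState(const std::vector<float>& stepInput, std::vector<float>& state) const {
    for(std::size_t i = 0; i < state.size(); ++i)
      state[i] = 0.5f * state[i] + 0.5f * stepInput[i]; // placeholder recurrence
    return state;
  }
};

std::vector<std::vector<float>> applyRnn(const Cell& cell,
                                         const std::vector<std::vector<float>>& input, // [time][dim]
                                         std::size_t dimState) {
  auto projected = cell.applyToInput(input);   // done once for the whole sequence
  std::vector<float> state(dimState, 0.f);     // zero-initialized recurrent state
  std::vector<std::vector<float>> outputs;
  for(const auto& step : projected)            // slice along the time axis, one cell step each
    outputs.push_back(cell.applyToState(step, state));
  return outputs;                              // caller concatenates along time
}
```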
@@ -109,13 +126,15 @@ class RNN final : public Layer, public IBinaryLayer, public IBinaryDecoderLayer auto stepMask = inputMask; if(stepMask) stepMask = slice(inputMask, dimTimeAxis, i); - + Expr output = cell->applyToState(stepInputs, stepMask, /*in/out=*/cellState); outputs.push_back(output); } state->as()->set(cellState->recurrent); - + state->setPosition(cellState->position); + + // during decoding again, this is a no-op Expr output = swapTimeBatch(concatenate(outputs, dimTimeAxis)); if(oProj) output = oProj->apply(output); diff --git a/src/layers_new/transformer.h b/src/layers_new/transformer.h index d80fe102f..c358cd5c3 100644 --- a/src/layers_new/transformer.h +++ b/src/layers_new/transformer.h @@ -16,7 +16,8 @@ namespace nn { * Currently these are usually dropout, layer normalization and skip connections. * A transformer block will usually apply one of them. */ -struct TransformerPrePostProcessor final : public Layer, public IBinaryLayer { +class TransformerPrePostProcessor final : public Layer, public IBinaryLayer { +public: Ptr dropout; Ptr norm; std::string actionDesc; @@ -65,20 +66,24 @@ struct TransformerPrePostProcessor final : public Layer, public IBinaryLayer { }; /** - * This is a typical transformer self-attention block. The default configuration will + * This is a transformer self-attention block without state. The default configuration will * use a multi-head multiplicative self-attention layer, followed by dropout, the skip * connection and layer normalization (dan) in the post-processor. The pre-processor does - * nothing in the default configuration. + * nothing in the default configuration. See TransformerDecoderSelfAttentionBlock for a + * version that can be used in the decoder with state. */ class TransformerSelfAttentionBlock final : public LayerWithOptions, public IBinaryLayer { public: Ptr preprocessor; + Ptr selfMaskProcessor; Ptr selfAttention; Ptr postprocessor; TransformerSelfAttentionBlock(Ptr graph, - Ptr options) - : LayerWithOptions(graph, options) + Ptr options, + Ptr selfMaskProcessorInit = nullptr) + : LayerWithOptions(graph, options), + selfMaskProcessor(selfMaskProcessorInit) { preprocessor = New( graph, @@ -86,6 +91,11 @@ class TransformerSelfAttentionBlock final : public LayerWithOptions, public IBin opt("transformer-dropout", 0.f)); registerLayer(preprocessor); + if(!selfMaskProcessor) { + selfMaskProcessor = maskProcessorFromOptions(graph, options); + registerLayer(selfMaskProcessor); + } + selfAttention = attentionFromOptions(graph, options); registerLayer(selfAttention); @@ -96,8 +106,9 @@ class TransformerSelfAttentionBlock final : public LayerWithOptions, public IBin registerLayer(postprocessor); } - Expr apply(Expr input, Expr logMask = nullptr) const override { + Expr apply(Expr input, Expr inputMask = nullptr) const override { auto output = preprocessor->apply(input); // optional preprocessing + auto logMask = selfMaskProcessor->apply(output, inputMask); // mask out attention to padding symbols output = selfAttention->apply(output, output, output, logMask); // self attention, @TODO: make this a IBinaryLayer rather than IQuaternaryLayer output = postprocessor->apply(output, input); // optional postprocessing, optional skip connection return output; @@ -110,7 +121,8 @@ class TransformerSelfAttentionBlock final : public LayerWithOptions, public IBin * the skip connection and layer normalization (dan) in the post-processor. The pre-processor does * nothing in the default configuration. 
*/ -struct TransformerFilterBlock final : public LayerWithOptions, public IUnaryLayer { +class TransformerFilterBlock final : public LayerWithOptions, public IUnaryLayer { +public: Ptr preprocessor; Ptr layers; Ptr postprocessor; @@ -182,15 +194,17 @@ struct TransformerFilterBlock final : public LayerWithOptions, public IUnaryLaye * A full transformer encoder layer consists of a self-attention block followed by * a filter block. Skip connections etc. are handled inside the blocks, see above. */ -struct TransformerEncoderLayer final : public LayerWithOptions, public IBinaryLayer { +class TransformerEncoderLayer final : public LayerWithOptions, public IBinaryLayer { +public: Ptr selfAttentionBlock; Ptr filterBlock; TransformerEncoderLayer(Ptr graph, - Ptr options) + Ptr options, + Ptr selfMaskProcessorInit = nullptr) : LayerWithOptions(graph, options) { - selfAttentionBlock = New(graph, options); + selfAttentionBlock = New(graph, options, selfMaskProcessorInit); registerLayer(selfAttentionBlock); filterBlock = New(graph, options); @@ -213,10 +227,9 @@ struct TransformerEncoderLayer final : public LayerWithOptions, public IBinaryLa * with RNN models and for easier beam-search we transpose batch and time dimensions on input and output. * @TODO: get rid of these transposes. */ -struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { +class TransformerEncoder : public LayerWithOptions, public IBinaryLayer { public: Ptr positionEmbedding; - Ptr maskProcessor; Ptr preprocessor; Ptr layers; Ptr postprocessor; @@ -237,9 +250,6 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { registerLayer(positionEmbedding); } - maskProcessor = attentionMaskProcessorFromOptions(graph, options); - registerLayer(maskProcessor); - preprocessor = New( graph, opt("transformer-postprocess-emb", ""), @@ -248,8 +258,15 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { layers = New(graph); registerLayer(layers); + + Ptr selfMaskProcessor; // this will be initialized in the first encoder layer for(int i = 0; i < opt("enc-depth"); ++i) { - auto transformerEncoderLayer = New(graph, options); + auto transformerEncoderLayer = New(graph, options, selfMaskProcessor); + layers->append(transformerEncoderLayer); + + if(!selfMaskProcessor) + selfMaskProcessor = transformerEncoderLayer->selfAttentionBlock->selfMaskProcessor; + // example of changing linear layer init functions burried deep in the model if(opt("transformer-depth-scaling", false)) for(auto linear : transformerEncoderLayer->allLayers()) @@ -265,7 +282,6 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { norm->useBias = false; } } - layers->append(transformerEncoderLayer); } postprocessor = New( @@ -277,7 +293,7 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { virtual ~TransformerEncoder() = default; - Expr apply(Expr input, Expr mask = nullptr) const override { + Expr apply(Expr input, Expr inputMask = nullptr) const override { // first and last operations (see at the bottom of this function) switch the time and batch // dimensions. This order is more natural for the transformer, but more difficult to handle // during beam search or when using RNNs. Hence the input/output transpositions here. @@ -287,8 +303,8 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { // do that everywhere we can detect inconsistencies automatically. 
// reorganize batch and timestep auto output = swapTimeBatch(input); // [1, dimBatch, dimSrcWords, dimModel] - if(mask) - mask = swapTimeBatch(mask); // [1, dimBatch, dimSrcWords, 1] + if(inputMask) + inputMask = swapTimeBatch(inputMask); // [1, dimBatch, dimSrcWords, 1] // apply positional embeddings to contextual input if(positionEmbedding) @@ -301,13 +317,12 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { // apply dropout or layer-norm to embeddings if required output = preprocessor->apply(output); - auto logMask = maskProcessor->apply(output, mask); // traverse the layers, use the same mask for each for(auto layer : *layers) { if(keepHiddenStates) // note, with pre-norm, the hidden states will not be normed here. hiddenStates.push_back(hiddenTransformFn(output)); - output = layer->apply(output, logMask); + output = layer->apply(output, inputMask); } // apply final postprocessor if required, e.g. final layer-norm for pre-norm or final skip connection @@ -339,15 +354,18 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { * connection and layer normalization (dan) in the post-processor. The pre-processor does * nothing in the default configuration. */ -class TransformerCrossAttentionBlock final : public LayerWithOptions, public ITernaryLayer { +class TransformerDecoderCrossAttentionBlock final : public LayerWithOptions, public ITernaryDecoderLayer { public: Ptr preprocessor; + Ptr contextMaskProcessor; Ptr crossAttention; Ptr postprocessor; - TransformerCrossAttentionBlock(Ptr graph, - Ptr options) - : LayerWithOptions(graph, options) + TransformerDecoderCrossAttentionBlock(Ptr graph, + Ptr options, + Ptr contextMaskProcessorInit = nullptr) + : LayerWithOptions(graph, options), + contextMaskProcessor(contextMaskProcessorInit) { preprocessor = New( graph, @@ -355,8 +373,15 @@ class TransformerCrossAttentionBlock final : public LayerWithOptions, public ITe opt("transformer-dropout", 0.f)); registerLayer(preprocessor); + if(!contextMaskProcessor) { + contextMaskProcessor = contextDecoderMaskProcessorFromOptions(graph, options); + registerLayer(contextMaskProcessor); + } + // @TODO: factory to support different attention flavors? - crossAttention = attentionFromOptions(graph, options); + // for cross-attention, we cache the projected keys and values since they come from + // the encoder and are static during decoding unless the batch size changes. 
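Conceptually, the caching just described amounts to memoizing the projection and recomputing it only when a criterion (here, a change in input shape or batch size) reports that the cached result is stale. A self-contained sketch of that recompute-on-change pattern with hypothetical names, loosely mirroring how CachedExpr is used but not the Expr API itself:
```
#include <functional>
#include <optional>
#include <vector>

template <class T>
class CachedApply {
  std::optional<T> cachedInput_;
  std::optional<T> cachedOutput_;
public:
  T apply(const T& input,
          const std::function<T(const T&)>& fn,
          const std::function<bool(const T&, const T&)>& equal) {
    if(!cachedInput_ || !equal(*cachedInput_, input)) { // recompute only when the criterion fails
      cachedInput_  = input;
      cachedOutput_ = fn(input);
    }
    return *cachedOutput_;
  }
  void clear() { cachedInput_.reset(); cachedOutput_.reset(); }
};

// usage sketch: re-run the key projection only when the size (e.g. batch size) changes
//   CachedApply<std::vector<float>> cachedKh;
//   auto kh = cachedKh.apply(keys, project,
//                            [](const auto& a, const auto& b) { return a.size() == b.size(); });
```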
+ crossAttention = attentionFromOptions(graph, options, /*enableCache=*/true); registerLayer(crossAttention); postprocessor = New( @@ -366,37 +391,33 @@ class TransformerCrossAttentionBlock final : public LayerWithOptions, public ITe registerLayer(postprocessor); } - Expr apply(Expr input, Expr context, Expr logMask) const override { + void initState(Ptr state) const override {} + + Expr apply(Expr input, Expr context, Expr contextMask, Ptr state) const override { auto output = preprocessor->apply(input); // optional preprocessing + auto logMask = contextMaskProcessor->apply(output, contextMask, state); output = crossAttention->apply(output, context, context, logMask); // cross attention, @TODO: make this a ITernaryLayer rather than IQuaternaryLayer output = postprocessor->apply(output, input); // optional postprocessing, optional skip connection return output; } }; -class TransformerAutoRegressiveBlock : public LayerWithOptions, public IBinaryDecoderLayer { -public: - TransformerAutoRegressiveBlock(Ptr graph, - Ptr options) - : LayerWithOptions(graph, options) {} - - virtual ~TransformerAutoRegressiveBlock() = default; - - using IBinaryDecoderLayer::apply; -}; - /** - * This is a transformer RNN block. + * Base class for transformer auto-regressive blocks. These are blocks that can be used in the decoder + * and that take the previous step's output as input. Currently this is either a self-attention block + * or an RNN block. */ -class TransformerRNNBlock final : public TransformerAutoRegressiveBlock { +class TransformerDecoderAutoRegressiveBlock : public LayerWithOptions, public IBinaryDecoderLayer { public: Ptr preprocessor; - Ptr> rnn; + Ptr selfMaskProcessor; Ptr postprocessor; - TransformerRNNBlock(Ptr graph, - Ptr options) - : TransformerAutoRegressiveBlock(graph, options) + TransformerDecoderAutoRegressiveBlock(Ptr graph, + Ptr options, + Ptr selfMaskProcessorInit = nullptr) + : LayerWithOptions(graph, options), + selfMaskProcessor(selfMaskProcessorInit) { preprocessor = New( graph, @@ -404,10 +425,10 @@ class TransformerRNNBlock final : public TransformerAutoRegressiveBlock { opt("transformer-dropout", 0.f)); registerLayer(preprocessor); - // @TODO: factory to support different attention flavors? - int modelDim = opt("transformer-dim-model", opt("dim-emb")); - rnn = New>(graph, modelDim, opt("transformer-rnn-projection", false)); - registerLayer(rnn); + if(!selfMaskProcessor) { + selfMaskProcessor = selfMaskProcessorFromOptions(graph, options); + registerLayer(selfMaskProcessor); + } postprocessor = New( graph, @@ -416,6 +437,85 @@ class TransformerRNNBlock final : public TransformerAutoRegressiveBlock { registerLayer(postprocessor); } + virtual ~TransformerDecoderAutoRegressiveBlock() = default; + + using IBinaryDecoderLayer::initState; + using IBinaryDecoderLayer::apply; +}; + +/** + * This is a typical transformer self-attention block. The default configuration will + * use a multi-head multiplicative self-attention layer, followed by dropout, the skip + * connection and layer normalization (dan) in the post-processor. The pre-processor does + * nothing in the default configuration. 
+ */ +class TransformerDecoderSelfAttentionBlock final : public TransformerDecoderAutoRegressiveBlock { +public: + Ptr selfAttention; + + using TransformerDecoderAutoRegressiveBlock::preprocessor; + using TransformerDecoderAutoRegressiveBlock::selfMaskProcessor; + using TransformerDecoderAutoRegressiveBlock::postprocessor; + + TransformerDecoderSelfAttentionBlock(Ptr graph, + Ptr options, + Ptr selfMaskProcessorInit = nullptr) + : TransformerDecoderAutoRegressiveBlock(graph, options, selfMaskProcessorInit) + { + // no caching of keys and values for self-attention since they change at each step + selfAttention = attentionFromOptions(graph, options, /*enableCache=*/false); + registerLayer(selfAttention); + } + + void initState(Ptr state) const override { + state->setPosition(0); + } + + Expr apply(Expr input, Expr inputMask, Ptr state) const override { + auto output = preprocessor->apply(input); // optional preprocessing + + // Here we extend the state with the keys and values from the previous step. + auto query = output; + auto keysValues = output; + if(state->getPosition() > 0) { + auto kvHistory = state->as()->get(); // [dimBeam, dimBatch, dimHistory, dimModel] + keysValues = concatenate({kvHistory, keysValues}, /*axis=*/-2); // [dimBeam, dimBatch, dimHistory + 1, dimModel] + } + state->as()->set(keysValues); + + auto logMask = selfMaskProcessor->apply(query, inputMask, state); + output = selfAttention->apply(query, keysValues, keysValues, logMask); + output = postprocessor->apply(output, input); // optional postprocessing, optional skip connection + return output; + } +}; + +/** + * This is a transformer RNN block that can be used as a replacement for the self-attention + * block in the decoder. + */ +class TransformerDecoderRNNBlock final : public TransformerDecoderAutoRegressiveBlock { +public: + Ptr> rnn; // @TODO: support other RNN types like LSTM or GRU + + using TransformerDecoderAutoRegressiveBlock::preprocessor; + using TransformerDecoderAutoRegressiveBlock::postprocessor; + + TransformerDecoderRNNBlock(Ptr graph, + Ptr options, + Ptr selfMaskProcessorInit = nullptr) + : TransformerDecoderAutoRegressiveBlock(graph, options, selfMaskProcessorInit) + { + // @TODO: factory to support different attention flavors? + int modelDim = opt("transformer-dim-model", opt("dim-emb")); + rnn = New>(graph, modelDim, opt("transformer-rnn-projection", false)); + registerLayer(rnn); + } + + void initState(Ptr state) const override { + rnn->as()->initState(state); + } + Expr apply(Expr input, Expr inputMask, Ptr state) const override { auto output = preprocessor->apply(input); // optional preprocessing output = rnn->apply(output, inputMask, state); // rnn application with state extension @@ -425,44 +525,39 @@ class TransformerRNNBlock final : public TransformerAutoRegressiveBlock { }; /** - * A full transformer decoder layer consists of a self-attention block followed by - * cross-attention block and a filter block. Skip connections etc. are handled inside - * the blocks, see above. - * - * For the self-attention block we need a special mask, usually a triangle mask that - * prohibits to look into the future. - * @TODO: should the triangle mask be constructed locally here? Would make sense, but expensive - * for many layers. + * A full transformer (LM) decoder layer consists of a self-attention block followed by + * a filter block. Skip connections etc. are handled inside the blocks, see above. 
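The decoder self-attention block above grows its key/value state one step at a time: the current step is concatenated onto the cached history and the query attends over everything seen so far. A small sketch of that append-and-attend pattern on plain vectors (all names hypothetical, single head, key doubling as value):
```
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// one decoding step: append this step's key/value to the history, then let the
// current query attend over all steps seen so far (scaled dot-product attention).
std::vector<float> stepAttend(std::vector<std::vector<float>>& kvHistory, // [steps][dim]
                              const std::vector<float>& stepKeyValue,     // [dim]
                              const std::vector<float>& query) {          // [dim]
  kvHistory.push_back(stepKeyValue);                       // concatenate along the time axis
  std::vector<float> scores;
  for(const auto& k : kvHistory) {
    float s = 0.f;
    for(std::size_t i = 0; i < k.size(); ++i) s += query[i] * k[i];
    scores.push_back(s / std::sqrt((float)query.size()));  // scaled dot product
  }
  float maxS = scores[0];                                   // softmax over the history
  for(float s : scores) maxS = std::max(maxS, s);
  float sum = 0.f;
  for(float& s : scores) { s = std::exp(s - maxS); sum += s; }
  for(float& s : scores) s /= sum;
  std::vector<float> out(query.size(), 0.f);                // weighted sum of values (= keys here)
  for(std::size_t t = 0; t < kvHistory.size(); ++t)
    for(std::size_t i = 0; i < out.size(); ++i)
      out[i] += scores[t] * kvHistory[t][i];
  return out;
}
```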
*/ -struct TransformerDecoderLayer final : public LayerWithOptions, public IQuaternaryDecoderLayer { - Ptr autoRegressiveBlock; - Ptr crossAttentionBlock; +class TransformerDecoderLayer : public LayerWithOptions, public IBinaryDecoderLayer { +public: + Ptr autoRegressiveBlock; Ptr filterBlock; TransformerDecoderLayer(Ptr graph, - Ptr options) + Ptr options, + Ptr selfMaskProcessorInit = nullptr) : LayerWithOptions(graph, options) { auto autoRegressionType = opt("transformer-decoder-autoreg", "self-attention"); if(autoRegressionType == "self-attention") { - ABORT("Auto-regression block type {} not yet implemented", autoRegressionType); + autoRegressiveBlock = New(graph, options, selfMaskProcessorInit); } else if(autoRegressionType == "rnn") { - autoRegressiveBlock = New(graph, options); + autoRegressiveBlock = New(graph, options, selfMaskProcessorInit); } else { ABORT("Unknown auto-regression block type {}", autoRegressionType); } registerLayer(autoRegressiveBlock); - crossAttentionBlock = New(graph, options); - registerLayer(crossAttentionBlock); - filterBlock = New(graph, options, /*isDecoder=*/true); registerLayer(filterBlock); } - Expr apply(Expr input, Expr inputMask, Expr context, Expr logMask, Ptr state) const override { + void initState(Ptr state) const override { + autoRegressiveBlock->as()->initState(state); + } + + Expr apply(Expr input, Expr inputMask, Ptr state) const override { Expr output = autoRegressiveBlock->apply(input, inputMask, state); - output = crossAttentionBlock->apply(output, context, logMask); output = filterBlock->apply(output); checkpoint(output); // A full transformer block is a good point for gradient checkpointing (currently manual) @@ -470,18 +565,64 @@ struct TransformerDecoderLayer final : public LayerWithOptions, public IQuaterna } }; +/** + * A transformer (S2S) decoder layer consists of a self-attention block followed by + * cross-attention block and a filter block. Skip connections etc. are handled inside + * the blocks. We inherit from TransformerDecoderLayer and add the cross-attention block. 
+ * * @TODO: get rid of IQuaternaryDecoderLayer and use IBinaryDecoderLayer instead + */ +class TransformerDecoderLayerWithCrossAttention : public TransformerDecoderLayer, public IQuaternaryDecoderLayer { +public: + Ptr crossAttentionBlock; + using TransformerDecoderLayer::autoRegressiveBlock; + using TransformerDecoderLayer::filterBlock; + + TransformerDecoderLayerWithCrossAttention(Ptr graph, + Ptr options, + Ptr selfMaskProcessorInit = nullptr, + Ptr contextMaskProcessorInit = nullptr) + : TransformerDecoderLayer(graph, options, selfMaskProcessorInit) + { + crossAttentionBlock = New(graph, options, contextMaskProcessorInit); + registerLayer(crossAttentionBlock); + } + + void initState(Ptr state) const override { + TransformerDecoderLayer::initState(state); + } + + Expr apply(Expr input, Expr inputMask, Expr context, Expr contextMask, Ptr state) const override { + Expr output = autoRegressiveBlock->apply(input, inputMask, state); + output = crossAttentionBlock->apply(output, context, contextMask, state); + output = filterBlock->apply(output); + + checkpoint(output); // A full transformer block is a good point for gradient checkpointing (currently manual) + return output; + } + +private: + // @TODO: once we have correct decoder states we can change the interface to IBinaryDecoderLayer and remove this + // this is a dummy implementation to satisfy the interface, it should never be called + Expr apply(Expr input, Expr inputMask, Ptr state) const override { + ABORT("This should never be called"); + } +}; + /** * A full transformer decoder stack. Before applying multiple transformer layers (depth of the decoder), we * add positional embeddings and apply post-processing actions to the combined embeddings. Due to backward-compatiblity * with RNN models and for easier beam-search we transpose batch and time dimensions on input and output. * @TODO: get rid of these transposes. 
*/ -struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDecoderLayer { +class TransformerDecoder final : public LayerWithOptions, public IBinaryDecoderLayer { +private: + Ptr attentionCollector_; + +public: Ptr positionEmbedding; - Ptr maskProcessor; Ptr preprocessor; - Ptr layers; Ptr postprocessor; + Ptr layers; TransformerDecoder(Ptr graph, Ptr options) @@ -492,15 +633,18 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec registerLayer(positionEmbedding); } - maskProcessor = attentionMaskProcessorFromOptions(graph, options); - registerLayer(maskProcessor); - preprocessor = New( graph, opt("transformer-postprocess-emb", ""), opt("transformer-dropout", 0.f)); registerLayer(preprocessor); + postprocessor = New( + graph, + opt("transformer-postprocess-top", ""), + opt("transformer-dropout", 0.f)); + registerLayer(postprocessor); + size_t decDepth = opt("dec-depth"); std::vector tiedLayers = opt>("transformer-tied-layers", std::vector()); ABORT_IF(!tiedLayers.empty() && tiedLayers.size() != decDepth, @@ -513,23 +657,40 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec layers = New(graph); registerLayer(layers); + + Ptr selfMaskProcessor; // this will be initialized in the first decoder layer + Ptr contextMaskProcessor; // this will be initialized in the first decoder layer for(size_t i = 0; i < decDepth; ++i) { if(tiedLayers.empty() || tiedLayers[i] == i) { // not tied or tied to itself, so needs to be created first - auto transformerDecoderLayer = New(graph, options); + auto transformerDecoderLayer = New(graph, options, selfMaskProcessor, contextMaskProcessor); layers->append(transformerDecoderLayer); + + if(!selfMaskProcessor) + selfMaskProcessor = transformerDecoderLayer->autoRegressiveBlock->selfMaskProcessor; + if(!contextMaskProcessor) + contextMaskProcessor = transformerDecoderLayer->crossAttentionBlock->contextMaskProcessor; + } else { ABORT_IF(tiedLayers[i] > i, "Cannot tie to layer above this layer??"); layers->append(layers->at(tiedLayers[i])); // repeat layer to tie weights } - auto currentLayer = layers->at(i)->as(); + auto currentLayer = layers->at(i)->as(); + // example of changing linear layer init functions burried deep in the model if(opt("transformer-depth-scaling", false)) { - auto autoRegLayer = currentLayer->autoRegressiveBlock->as(); - autoRegLayer->rnn->oProj->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1)); + auto autoRegLayerRNN = currentLayer->autoRegressiveBlock->as(); + if(autoRegLayerRNN) + autoRegLayerRNN->rnn->oProj->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1)); + + auto autoRegLayerSA = currentLayer->autoRegressiveBlock->as(); + if(autoRegLayerSA) + for(auto linear : autoRegLayerSA->allLayers()) + linear->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1)); for(auto linear : currentLayer->crossAttentionBlock->allLayers()) linear->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1)); + for(auto linear : currentLayer->filterBlock->allLayers()) linear->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1)); } @@ -544,34 +705,55 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec norm->useBias = false; } } + + if(opt("guided-alignment", "none") != "none" || options_->hasAndNotEmpty("alignment")) { + std::string gaStr = opt("transformer-guided-alignment-layer", "last"); + + size_t 
attLayer = decDepth - 1; + if(gaStr != "last") + attLayer = std::stoull(gaStr) - 1; + + ABORT_IF(attLayer >= decDepth, "Chosen layer for guided attention ({}) larger than number of layers ({})", attLayer + 1, decDepth); + + if(i == attLayer) { + attentionCollector_ = currentLayer->crossAttentionBlock->crossAttention->as(); + attentionCollector_->saveAttentionWeights = true; // @TODO: ugly + attentionCollector_->numHeads = opt("transformer-heads"); // @TODO: ugly + } + } } + } - postprocessor = New( - graph, - opt("transformer-postprocess-top", ""), - opt("transformer-dropout", 0.f)); - registerLayer(postprocessor); + void initState(Ptr state) const override { + ABORT("Remove this abort once this is actually used in the decoder"); + size_t positiion = 0; + state->setPosition(positiion); + for(auto layer : *layers) { + Ptr layerState = New(positiion); + layer->as()->initState(layerState); + state->as()->append(layerState); + } } - Expr apply(Expr input, Expr inputMask, Expr context, Expr contextMask, Ptr state) const override { + Expr apply(Expr input, Expr inputMask, Ptr state) const override { // first and last operations (see at the bottom of this function) switch the time and batch // dimensions. This order is more natural for the transformer, but more difficult to handle // during beam search or when using RNNs. Hence the input/output transpositions here. Expr output = swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] - context = swapTimeBatch(context); // [dimBeam=1, dimBatch, dimSrcWords, dimModel] // set current target token position during decoding or training. At training // this should be 0. During translation the current length of the translation. // Used for position embeddings and creating new decoder states. int startPos = (int)state->getPosition(); - // @TODO: write function prepareMasks(); - // @TODO: create triangle mask here and combine with inputMask - LOG_ONCE(info, "Don't forget the triangle mask if required!"); - if(inputMask) inputMask = swapTimeBatch(inputMask); // [dimBeam=1, dimBatch, dimTrgWords, dimModel=1] + Expr context = state->as()->getContext(); + Expr contextMask = state->as()->getContextMask(); + + // @TODO: get rid of this + context = swapTimeBatch(context); // [dimBeam=1, dimBatch, dimSrcWords, dimModel] if(contextMask) contextMask = swapTimeBatch(contextMask); // [dimBeam=1, dimBatch, dimSrcWords, dimModel=1] @@ -589,11 +771,11 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec // get an iterator to per-layer states auto layerStateIt = state->as()->begin(); - auto logMask = maskProcessor->apply(output, contextMask, *layerStateIt); - // traverse the layers, use the same mask for each for(auto layer : *layers) { - output = layer->as()->apply(output, inputMask, context, logMask, /*in/out=*/*layerStateIt++); + // @TODO: can we put logmask computation inside this layer? Then we can reduce the number of arguments here + // and use only the decoder state to provide context and mask. + output = layer->as()->apply(output, inputMask, context, contextMask, /*in/out=*/*layerStateIt++); } // apply final postprocessor if requred, e.g. 
final layer-norm for pre-norm or final skip connection @@ -609,6 +791,18 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec output = swapTimeBatch(output); // [beam depth=1, max length, batch size, vector dim] return output; } + + std::vector getAlignments() { + if(attentionCollector_) + return attentionCollector_->getAlignments(); + else + return {}; + } + + virtual void clear() override { + LayerWithOptions::clear(); + } + }; } // namespace nn diff --git a/src/microsoft/quicksand.cpp b/src/microsoft/quicksand.cpp index 6a09469fd..513639dd6 100644 --- a/src/microsoft/quicksand.cpp +++ b/src/microsoft/quicksand.cpp @@ -58,7 +58,7 @@ IBeamSearchDecoder::IBeamSearchDecoder(Ptr options, const std::vector& ptrs) : options_(options) { for(auto ptr : ptrs) - modelWeights_.push_back(New(ptr)); + modelWeights_.push_back(New(ptr, io::MmapMode::RequiredMmap, /*locking=*/false)); } class BeamSearchDecoder : public IBeamSearchDecoder { diff --git a/src/models/amun.h b/src/models/amun.h index d6b1209c6..65f5c6516 100644 --- a/src/models/amun.h +++ b/src/models/amun.h @@ -95,10 +95,7 @@ class Amun : public EncoderDecoder { // @TODO: get rid of all this eventually { // scope for lock_guard - // this is needed during loading since we modify the content of modelFile->items() directly - // This is quite ugly but this is legacy code anyway. - std::mutex mutex; - std::lock_guard lock(mutex); + auto lockGuard = modelFile->scopedLockGuard(); // only modify the first time. bool modify = false; diff --git a/src/models/bleurt.h b/src/models/bleurt.h index 74848b788..844f94609 100644 --- a/src/models/bleurt.h +++ b/src/models/bleurt.h @@ -70,8 +70,7 @@ struct BleurtEncoder final : public nn::TransformerEncoder { auto output = marian::nn::swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] auto binaryMask = marian::nn::swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] - auto logMask = maskProcessor->apply(output, binaryMask); // [beam depth=1, batch size * numHeads, max length, vector dim=1] - + // apply positional embeddings to contextual input output = positionEmbedding->apply(output); @@ -83,7 +82,7 @@ struct BleurtEncoder final : public nn::TransformerEncoder { // traverse the layers, use the same mask for each for(auto layer : *layers) - output = layer->apply(output, logMask); + output = layer->apply(output, binaryMask); return output; } @@ -97,7 +96,7 @@ struct BleurtBatchEncoder final : public nn::LayerWithOptions, Ptr encoder; BleurtBatchEncoder(Ptr graph, - Ptr options) + Ptr options) : LayerWithOptions(graph, options), EncoderBase(graph, options) { @@ -155,7 +154,7 @@ struct BleurtBatchEncoder final : public nn::LayerWithOptions, } virtual void clear() override { - Layer::clear(); + LayerWithOptions::clear(); } }; diff --git a/src/models/nematus.h b/src/models/nematus.h index 7d421ec5c..d0132bc9e 100644 --- a/src/models/nematus.h +++ b/src/models/nematus.h @@ -35,10 +35,7 @@ class Nematus : public EncoderDecoder { // @TODO: get rid of all this eventually { // scope for lock_guard - // this is needed during loading since we modify the content of modelFile->items() directly - // This is quite ugly but this is legacy code anyway. - std::mutex mutex; - std::lock_guard lock(mutex); + auto lockGuard = modelFile->scopedLockGuard(); // only modify the first time. 
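The amun.h and nematus.h hunks replace a function-local mutex, which never synchronized anything across callers, with a lock guard handed out by the model file itself, so every caller serializes on the same lock while mutating items(). A minimal sketch of that ownership pattern, assuming a std::mutex inside the file object (the real ModelWeights API may differ):
```
#include <mutex>
#include <vector>

class ModelFile {
  std::mutex mutex_;
  std::vector<int> items_;
public:
  // the returned guard holds the lock until it goes out of scope at the caller
  std::unique_lock<std::mutex> scopedLockGuard() {
    return std::unique_lock<std::mutex>(mutex_);
  }
  std::vector<int>& items() { return items_; }
};

// usage sketch at load time:
//   auto lockGuard = modelFile->scopedLockGuard();
//   for(auto& item : modelFile->items()) { /* modify in place under the lock */ }
```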
bool modify = false; diff --git a/src/models/transformer.h b/src/models/transformer.h index ad018b240..6feda24fe 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -40,16 +40,16 @@ class Transformer : public EncoderOrDecoderBase { std::vector alignments_; // [max tgt len or 1][beam depth, max src length, batch size, 1] // @TODO: make this go away - template - T opt(const char* const key) const { Ptr options = options_; return options->get(key); } + template + T opt(const char* const key) const { Ptr options = options_; return options->get(key); } - template - T opt(const std::string& key) const { return opt(key.c_str()); } + template + T opt(const std::string& key) const { return opt(key.c_str()); } - template + template T opt(const char* const key, const T& def) const { Ptr options = options_; return options->get(key, def); } - template + template T opt(const std::string& key, const T& def) const { opt(key.c_str(), def); } public: @@ -120,7 +120,7 @@ class Transformer : public EncoderOrDecoderBase { virtual Expr addSpecialEmbeddings(Expr input, int start = 0, Ptr /*batch*/ = nullptr) const { if(opt("transformer-disable-position-embeddings", false)) return input; - + bool trainPosEmbeddings = opt("transformer-train-positions", false); return addPositionalEmbeddings(input, start, trainPosEmbeddings); } @@ -248,7 +248,7 @@ class Transformer : public EncoderOrDecoderBase { // to avoid mistakenly using the old transformer framework for new features auto maskType = opt("transformer-attention-mask", "default"); - ABORT_IF(maskType != "default", + ABORT_IF(maskType != "default", "You specified --transformer-attention-mask={} which is not implemented for legacy Transformer", maskType ); // softmax over batched dot product of query and keys (applied over all @@ -263,7 +263,7 @@ class Transformer : public EncoderOrDecoderBase { // take softmax along src sequence axis (-1) auto weights = softmax(z); // [-4: beam depth * batch size, -3: num heads, -2: max tgt length, -1: max src length] - + if(saveAttentionWeights) collectOneHead(weights, dimBeam); @@ -290,7 +290,7 @@ class Transformer : public EncoderOrDecoderBase { auto Wq = graph_->param(prefix + "_Wq", {dimModel, dimModel}, inits::glorotUniform(true, true, depthScaling_ ? 
1.f / sqrtf((float)depth_) : 1.f)); auto bq = graph_->param(prefix + "_bq", { 1, dimModel}, inits::zeros()); auto qh = affine(q, Wq, bq); - + qh = SplitHeads(qh, dimHeads); // [-4: beam depth * batch size, -3: num heads, -2: max length, -1: split vector dim] Expr kh; @@ -313,8 +313,8 @@ class Transformer : public EncoderOrDecoderBase { } Expr vh; - if (cache - && cache_.count(prefix + "_values") > 0 + if (cache + && cache_.count(prefix + "_values") > 0 && cache_[prefix + "_values"]->shape().elements() == values->shape().elements()) { vh = cache_[prefix + "_values"]; } else { @@ -391,7 +391,7 @@ class Transformer : public EncoderOrDecoderBase { // multi-head self-attention over previous input output = MultiHead(prefix, dimModel, dimHeads, output, keys, values, mask, cache, saveAttentionWeights); - + auto opsPost = opt("transformer-postprocess"); output = postProcess(prefix + "_Wo", opsPost, output, input, dropProb); @@ -431,14 +431,14 @@ class Transformer : public EncoderOrDecoderBase { int decDimFfn = opt("transformer-decoder-dim-ffn", 0); if(decDimFfn != 0) dimFfn = decDimFfn; - + int decDepthFfn = opt("transformer-decoder-ffn-depth", 0); if(decDepthFfn != 0) - depthFfn = decDepthFfn; + depthFfn = decDepthFfn; } - + ABORT_IF(depthFfn < 1, "Filter depth {} is smaller than 1", depthFfn); - + float ffnDropProb = inference_ ? 0 : opt("transformer-dropout-ffn"); auto initFn = inits::glorotUniform(true, true, depthScaling_ ? 1.f / sqrtf((float)depth_) : 1.f); @@ -588,7 +588,7 @@ class EncoderTransformer : public Transformer { auto embeddingLayer = getEmbeddingLayer(opt("ulr", false)); std::tie(batchEmbeddings, batchMask) = embeddingLayer->apply((*batch)[batchIndex_]); batchEmbeddings = addSpecialEmbeddings(batchEmbeddings, /*start=*/0, batch); - + // reorganize batch and timestep batchEmbeddings = atleast_nd(batchEmbeddings, 4); // [beam depth=1, max length, batch size, vector dim] batchMask = atleast_nd(batchMask, 4); // [beam depth=1, max length, batch size, vector dim=1] @@ -623,7 +623,7 @@ class EncoderTransformer : public Transformer { } // this allows to run a final layernorm operation after going through the transformer layer stack. - // By default the operations are empty, but with prenorm (--transformer-preprocess n --transformer-postprocess da) + // By default the operations are empty, but with prenorm (--transformer-preprocess n --transformer-postprocess da) // it is recommended to normalize here. Can also be used to add a skip connection from the very bottom if requested. auto opsTop = opt("transformer-postprocess-top", ""); layer = postProcess(prefix_ + "_top", opsTop, layer, prevLayer, dropProb); @@ -763,8 +763,8 @@ class DecoderTransformer : public Transformer { // This would happen if something goes wrong during batch pruning. ABORT_IF(encoderContext->shape()[-3] != dimBatch, - "Context and query batch dimension do not match {} != {}", - encoderContext->shape()[-3], + "Context and query batch dimension do not match {} != {}", + encoderContext->shape()[-3], dimBatch); // LayerAttention expects mask in a different layout @@ -801,7 +801,7 @@ class DecoderTransformer : public Transformer { rnn::State prevDecoderState; if(prevDecoderStates.size() > 0) prevDecoderState = prevDecoderStates[i]; - + // self-attention std::string layerType = opt("transformer-decoder-autoreg", "self-attention"); rnn::State decoderState; @@ -871,7 +871,7 @@ class DecoderTransformer : public Transformer { } // This allows to run a final layernorm operation after going through the transformer layer stack. 
- // By default the operations are empty, but with prenorm (--transformer-preprocess n --transformer-postprocess da) + // By default the operations are empty, but with prenorm (--transformer-preprocess n --transformer-postprocess da) // it is recommended to normalize here. Can also be used to add a skip connection from the very bottom if requested. auto opsTop = opt("transformer-postprocess-top", ""); query = postProcess(prefix_ + "_top", opsTop, query, prevQuery, dropProb); @@ -883,7 +883,7 @@ class DecoderTransformer : public Transformer { if(shortlist_) output_->setShortlist(shortlist_); auto logits = output_->applyAsLogits(decoderContext); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vocab or shortlist dim] - + // return unormalized(!) probabilities Ptr nextState; if (opt("transformer-decoder-autoreg", "self-attention") == "rnn") { @@ -906,9 +906,9 @@ class DecoderTransformer : public Transformer { output_->clear(); cache_.clear(); alignments_.clear(); - perLayerRnn_.clear(); // this needs to be cleared between batches. - // @TODO: figure out how to detect stale nodes i.e. nodes that are referenced, - // but where underlying memory has been deallocated by dropping all tensors + perLayerRnn_.clear(); // this needs to be cleared between batches. + // @TODO: figure out how to detect stale nodes i.e. nodes that are referenced, + // but where underlying memory has been deallocated by dropping all tensors // from a TensorAllocator object. This can happen during ExpressionGraph::clear() } }; diff --git a/src/models/transformer_factory.h b/src/models/transformer_factory.h index ac86e4dc7..fcd90ad63 100644 --- a/src/models/transformer_factory.h +++ b/src/models/transformer_factory.h @@ -22,6 +22,8 @@ class TransformerLegacy : public EncoderDecoder { bool markedReloaded = true) override { for(auto& item : modelFile->items()) { + auto lockGuard = modelFile->scopedLockGuard(); + auto pair = nameMap_.find(item.name); if(pair != nameMap_.end()) { LOG(debug, "Mapping parameter {} to {}", item.name, pair->second); @@ -42,22 +44,28 @@ class TransformerLegacy : public EncoderDecoder { ABORT_IF(!encoder, "Could not cast to new type of encoder??"); for(auto& linear : encoder->allLayers()) linear->transposed = false; + for(auto& norm : encoder->allLayers()) + norm->eps = 1e-6f; // used in old code by default, so we need to set it here explicitly auto decoder = std::dynamic_pointer_cast(decoders_[0]); ABORT_IF(!decoder, "Could not cast to new type of decoder??"); for(auto& linear : decoder->allLayers()) linear->transposed = false; + for(auto& norm : decoder->allLayers()) + norm->eps = 1e-6f; // used in old code by default, so we need to set it here explicitly // load items into the graph graph->load(modelFile); } private: - std::map nameMap_; + const std::unordered_map nameMap_; - std::map createNameMap() { - std::map nameMap = { + std::unordered_map createNameMap() { + std::unordered_map nameMap = { {"Wemb", "Wemb"}, + // {"decoder_ff_logit_out_b", "decoder_ff_logit_out_b"}, for now no shape conversion + // {"special:model.yml", "special:model.yml"} }; // @TODO: This is going to change @@ -100,20 +108,20 @@ class TransformerLegacy : public EncoderDecoder { prefix = "TransformerBatchDecoder"; for(int layerNo = 0; layerNo < opt("dec-depth"); ++layerNo) { // name maps for decoder self-attention blocks - nameMap[fmt::format("decoder_l{}_self_Wq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->qProj->weight", prefix, layerNo); - 
nameMap[fmt::format("decoder_l{}_self_bq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->qProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->qProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_bq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->qProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_Wk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->kProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_bk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->kProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->kProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_bk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->kProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_Wv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->vProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_bv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->vProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->vProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_bv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->vProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_Wo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->oProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_bo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->oProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->oProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_bo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->oProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_Wo_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->postprocessor->norm->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_Wo_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->postprocessor->norm->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wo_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wo_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->bias", prefix, layerNo); // name maps for decoder SSRU nameMap[fmt::format("decoder_l{}_rnn_W", layerNo + 1)] = 
fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->iProj->weight", prefix, layerNo); diff --git a/src/models/transformer_new.h b/src/models/transformer_new.h index 61de01db2..1c7807727 100644 --- a/src/models/transformer_new.h +++ b/src/models/transformer_new.h @@ -12,12 +12,12 @@ namespace marian { // Wrapper for backwards compatibility that uses current encoder/decoder framework -struct TransformerBatchEncoder : public nn::LayerWithOptions, +struct TransformerBatchEncoder : public nn::LayerWithOptions, public nn::IEmbeddingLayer, // TransformerBatchEncoder is an IEmbeddingLayer that produces contextual embeddings public EncoderBase { // @TODO: should all encoders be IEmbeddingLayer? Ptr encoder; - TransformerBatchEncoder(Ptr graph, + TransformerBatchEncoder(Ptr graph, Ptr options) : LayerWithOptions(graph, options), EncoderBase(graph, options) @@ -55,10 +55,10 @@ struct TransformerBatchEncoder : public nn::LayerWithOptions, EncoderBase::graph_ = graph; setGraph(graph); // This makes sure that the graph passed into the model during construction and now evaluation are identical. - // A good check to have for catching weird situations early. + // A good check to have for catching weird situations early. ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match"); #endif - + const auto& [batchEmbedding, batchMask] = apply((*batch)[batchIndex_]); return New(batchEmbedding, batchMask, batch); } @@ -69,11 +69,11 @@ struct TransformerBatchEncoder : public nn::LayerWithOptions, }; // Wrapper for backwards compatibility that uses current encoder/decoder framework -class TransformerBatchDecoder : public nn::LayerWithOptions, +class TransformerBatchDecoder : public nn::LayerWithOptions, public DecoderBase { Ptr decoder; - Ptr output_; + Ptr output_; void lazyCreateOutputLayer() { @@ -101,9 +101,9 @@ class TransformerBatchDecoder : public nn::LayerWithOptions, } public: - TransformerBatchDecoder(Ptr graph, Ptr options) + TransformerBatchDecoder(Ptr graph, Ptr options) : LayerWithOptions(graph, options), DecoderBase(graph, options) { - + decoder = New(graph, options); registerLayer(decoder); @@ -118,7 +118,7 @@ class TransformerBatchDecoder : public nn::LayerWithOptions, DecoderBase::graph_ = graph; setGraph(graph); // This makes sure that the graph passed into the model during construction and now evaluation are identical. - // A good check to have for catching weird situations early. + // A good check to have for catching weird situations early. 
ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match"); #endif @@ -127,6 +127,7 @@ class TransformerBatchDecoder : public nn::LayerWithOptions, int dimBatch = (int)batch->size(); int dim = DecoderBase::opt("dim-emb"); + // @TODO: use the actual initState function of the new state auto start = graph->constant({1, 1, dimBatch, dim}, inits::zeros()); rnn::States startStates(DecoderBase::opt("dec-depth"), {start, start}); @@ -134,7 +135,7 @@ class TransformerBatchDecoder : public nn::LayerWithOptions, return NewDecoderState(DecoderBase::options_, startStates, Logits(), encStates, batch, /*isBatchMajor=*/false); } else { - rnn::States startStates; + rnn::States startStates(DecoderBase::opt("dec-depth"), {nullptr, nullptr}); return NewDecoderState(DecoderBase::options_, startStates, Logits(), encStates, batch, /*isBatchMajor=*/true); } } @@ -157,20 +158,17 @@ class TransformerBatchDecoder : public nn::LayerWithOptions, //************************************************************************// - auto encoderContext = state->getEncoderStates()[0]->getContext(); // encoder output - auto encoderMask = state->getEncoderStates()[0]->getMask(); // note: may differ from Encoder self-attention mask in that additional positions are banned for cross-attention - // Convert old style decoder state to new decoder state using namespace models; usage modelUsage = (usage)db::opt("usage", (int)usage::translation); auto nnState = convertDecoderState(state, graph(), /*decoding=*/modelUsage == usage::translation); - auto decoderContext = decoder->apply(embeddings, decoderMask, encoderContext, encoderMask, nnState); + auto decoderContext = decoder->apply(embeddings, decoderMask, nnState); // final feed-forward layer (output) if(shortlist_) output_->setShortlist(shortlist_); auto logits = output_->applyAsLogits(decoderContext); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vocab or shortlist dim] - + // Convert new style decoder state to old decoder state // @TODO: This is such a mess! rnn::States decoderStates; @@ -185,8 +183,7 @@ class TransformerBatchDecoder : public nn::LayerWithOptions, // helper function for guided alignment // @TODO: const vector<> seems wrong. 
Either make it non-const or a const& (more efficient but dangerous) virtual const std::vector getAlignments(int /*i*/ = 0) override { - ABORT("Not implemented"); - return {}; + return decoder->getAlignments(); } virtual void clear() override { @@ -203,13 +200,13 @@ class TransformerBatchDecoder : public nn::LayerWithOptions, static void testme() { using namespace marian; using namespace nn; - + auto options = New( - "enc-depth", 12, - "transformer-heads", 8, - "dim-emb", 512, + "enc-depth", 12, + "transformer-heads", 8, + "dim-emb", 512, "transformer-ffn-depth", 2, - "transformer-dim-ffn", 2048, + "transformer-dim-ffn", 2048, "transformer-dropout", 0.1, "transformer-dropout-attention", 0.0, "transformer-postprocess", "dan", @@ -230,13 +227,13 @@ static void testme() { auto encoder = New(graph, options); encoder->setName("TransformerEncoder"); encoder->setEvalMode(); - + auto context = encoder->apply(input, mask); std::cerr << encoder->layerInfo(/*includeChildren=*/true) << std::endl; debug(context); - + graph->forward(); graph->save("test.npz"); } From b683f4b16561fb91fb11cb3182885f46fb54d344 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Thu, 8 Feb 2024 07:00:41 +0000 Subject: [PATCH 15/26] Merged PR 32882: Reorder inputs for kiwi-style metrics This PR adds `--input-reorder` which allows to swap the indices of batch subfields. Currently, this is used for comet-kiwi-style models to accomodate that the mt output comes first and not the source. --- CHANGELOG.md | 1 + VERSION | 2 +- scripts/comet/comet2marian.py | 2 ++ src/common/config_parser.cpp | 5 +++++ src/data/corpus.cpp | 13 ++++++++++--- src/data/corpus_base.cpp | 6 ++++-- src/data/corpus_nbest.cpp | 2 ++ src/data/corpus_sqlite.cpp | 2 ++ src/data/dataset.h | 6 ++++-- src/data/text_input.h | 23 ++++++++++++++++------- src/examples/mnist/dataset.h | 4 ++-- src/examples/mnist/training.h | 2 +- 12 files changed, 50 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 13dd5e301..5f297383d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Fixed compilation with clang 16.0.6 ### Added +- Added `--input-reorder pos1 pos2` option to re-ordering inputs internally when reading in batches. This is mostly a model property. - Added `pymarian`: python bindings based on pybind11 - Added implementation of COMET-KIWI - Added implementation of xCOMET-XL/XXL regressor parts (MQM interpolation missing for now) diff --git a/VERSION b/VERSION index 9db15f195..bb7e6dd0a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.23 +v1.12.24 diff --git a/scripts/comet/comet2marian.py b/scripts/comet/comet2marian.py index 68912befd..94098711f 100755 --- a/scripts/comet/comet2marian.py +++ b/scripts/comet/comet2marian.py @@ -3,6 +3,7 @@ This script converts Unbabel COMET-QE models to Marian weight file. """ +import sys import argparse import logging as log import numpy as np @@ -105,6 +106,7 @@ def load_comet_model(model_path): config["input-join-fields"] = True config["separator-symbol"] = "" config["comet-use-separator"] = True + config["input-reorder"] = [1, 0, 2] # reorder input fields from [src, mt, ref] to [mt, src, ref] for comet-kiwi etc. 
else: raise Exception(f'Unknown type of model {model_type}') diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 741a3915c..9c8b0776f 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -249,6 +249,11 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { "Possible values: sequence, class, alignment, weight. " "You need to provide one type per input file (if --train-sets) or per TSV field (if --tsv).", {}); + cli.add>("--input-reorder", + "Reorder input data to this order according to this permutation. If empty no reordering is done. " + "If non-empty, you need to provide one type per input file (if --train-sets) or per TSV field (if --tsv). " + "Usually, there should be no need to provide these on the command line, the model should have them saved.", + {}); cli.add("--input-join-fields", "Join input fields (from files or TSV) into a single sequence " "(mostly used single-encoder models like BLEURT and COMET-KIWI)", diff --git a/src/data/corpus.cpp b/src/data/corpus.cpp index 902f0c9f6..8f5b0035e 100644 --- a/src/data/corpus.cpp +++ b/src/data/corpus.cpp @@ -116,6 +116,9 @@ SentenceTuple Corpus::next() { fields.swap(tmpFields); } + ABORT_IF(inputPermutation_.size() != 0 && inputPermutation_.size() < fields.size(), + "Input permutation given, but not for every input field??"); + // fill up the sentence tuple with sentences from all input files SentenceTupleImpl tup(curId); size_t shift = 0; @@ -125,12 +128,16 @@ SentenceTuple Corpus::next() { if(i == alignFileIdx_ || i == weightFileIdx_) { ++shift; } else { - size_t vocabId = i - shift; + size_t permutedIndex = i; + if(!inputPermutation_.empty()) + permutedIndex = inputPermutation_[i]; + + size_t vocabId = permutedIndex - shift; bool altered; - preprocessLine(fields[i], vocabId, curId, /*out=*/altered); + preprocessLine(fields[permutedIndex], vocabId, curId, /*out=*/altered); if(altered) tup.markAltered(); - addWordsToSentenceTuple(fields[i], vocabId, tup); + addWordsToSentenceTuple(fields[permutedIndex], vocabId, tup); } } // weights are added last to the sentence tuple, because this runs a validation that needs diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp index e1b0aad62..d11b5f763 100644 --- a/src/data/corpus_base.cpp +++ b/src/data/corpus_base.cpp @@ -430,12 +430,14 @@ void CorpusBase::addWordsToSentenceTuple(const std::string& line, auto inputTypes = options_->get>("input-types", {}); // empty list by default + bool isFirst = tup.empty(); + // This handles adding starts symbols for COMET () and BERT/BLEURT ([CLS]) - bool prepend = prependZero_ && (!joinFields_ || (joinFields_ && batchIndex == 0)); + bool prepend = prependZero_ && (!joinFields_ || (joinFields_ && isFirst)); if(prepend && inputTypes[batchIndex] == "sequence") words.insert(words.begin(), Word::fromWordIndex(0)); - bool prependSep = insertSeparator_ && joinFields_ && batchIndex > 0; + bool prependSep = insertSeparator_ && joinFields_ && !isFirst; if(prependSep && inputTypes[batchIndex] == "sequence") words.insert(words.begin(), vocabs_[batchIndex]->getSepId()); diff --git a/src/data/corpus_nbest.cpp b/src/data/corpus_nbest.cpp index 8029d3516..3c795e19b 100644 --- a/src/data/corpus_nbest.cpp +++ b/src/data/corpus_nbest.cpp @@ -33,6 +33,8 @@ std::string lineFromNbest(const std::string& line) { } SentenceTuple CorpusNBest::next() { + ABORT_IF(!inputPermutation_.empty(), "Input permutation not supported for n-best lists"); + bool cont = true; while(cont) { // get index of the current sentence 
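For readers following the `--input-reorder` change above: the C++ in `corpus.cpp` and `text_input.h` reads field `inputPermutation_[i]` into slot `i`, and `comet2marian.py` stores the permutation `[1, 0, 2]` in the model config so that comet-kiwi-style models receive the MT output before the source. The following is an illustrative Python sketch of that permutation semantics only (the function name is made up here; it is not part of the patch or of the Marian/pymarian API):

```python
def reorder_fields(fields, permutation):
    """Illustrative sketch: output slot i receives input field permutation[i],
    mirroring the inputPermutation_ logic added in corpus.cpp / text_input.h."""
    if not permutation:  # an empty permutation means "no reordering"
        return list(fields)
    # the patch aborts if a non-empty permutation does not cover every field
    assert len(permutation) >= len(fields), "permutation shorter than field list"
    return [fields[permutation[i]] for i in range(len(fields))]

# [src, mt, ref] with permutation [1, 0, 2] becomes [mt, src, ref],
# i.e. MT output first, as comet-kiwi-style models expect.
print(reorder_fields(["src", "mt", "ref"], [1, 0, 2]))
```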
diff --git a/src/data/corpus_sqlite.cpp b/src/data/corpus_sqlite.cpp index f7c577f29..07cff6947 100644 --- a/src/data/corpus_sqlite.cpp +++ b/src/data/corpus_sqlite.cpp @@ -106,6 +106,8 @@ void CorpusSQLite::fillSQLite() { } SentenceTuple CorpusSQLite::next() { + ABORT_IF(!inputPermutation_.empty(), "Input permutation not supported for sqlite corpus"); + while(select_->executeStep()) { // fill up the sentence tuple with sentences from all input files size_t curId = select_->getColumn(0).getInt(); diff --git a/src/data/dataset.h b/src/data/dataset.h index 3cdccec99..d5098f3fa 100644 --- a/src/data/dataset.h +++ b/src/data/dataset.h @@ -15,7 +15,8 @@ class DatasetBase { protected: std::vector paths_; Ptr options_; - + std::vector inputPermutation_; // if not empty, this is used to reorder input fields/batches i.e. [1,0] swaps the first two fields + // currently this is used for comet-kiwi-style metrics where the mt output is the first field // Data processing may differ in training/inference settings bool inference_{false}; @@ -28,7 +29,8 @@ class DatasetBase { DatasetBase(std::vector paths, Ptr options) : paths_(paths), options_(options), - inference_(options != nullptr ? options->get("inference", false) : false) {} + inputPermutation_(options->get>("input-reorder", {})), + inference_(options->get("inference", false)) {} DatasetBase(Ptr options) : DatasetBase({}, options) {} diff --git a/src/data/text_input.h b/src/data/text_input.h index f2e9831de..4c4ea9f1b 100644 --- a/src/data/text_input.h +++ b/src/data/text_input.h @@ -100,15 +100,24 @@ class TextInput : public DatasetBase { void prepare() override {} - SentenceTuple encode(std::vector& row, size_t id) { - ABORT_IF(row.size() != vocabs_.size(), "Number of fields does not match number of vocabs"); + SentenceTuple encode(std::vector& fields, size_t id) { + ABORT_IF(fields.size() != vocabs_.size(), "Number of fields does not match number of vocabs"); + // fill up the sentence tuple with source and/or target sentences SentenceTupleImpl tup(id); + ABORT_IF(inputPermutation_.size() != 0 && inputPermutation_.size() < fields.size(), + "Input permutation given, but not for every input field??"); + // copied and adapted from corpus.cpp - @TODO: refactor or unify code between Corpus and TextInput - for(size_t batchIndex = 0; batchIndex < row.size(); ++batchIndex) { - std::string& field = row[batchIndex]; - Words words = vocabs_[batchIndex]->encode(field, /*addEOS =*/true, inference_); + for(size_t batchIndex = 0; batchIndex < fields.size(); ++batchIndex) { + size_t permutedBatchIndex = batchIndex; + if(inputPermutation_.size() > 0) + permutedBatchIndex = inputPermutation_[batchIndex]; + + std::string& field = fields[permutedBatchIndex]; + + Words words = vocabs_[permutedBatchIndex]->encode(field, /*addEOS =*/true, inference_); ABORT_IF(words.empty(), "Empty input sequences are presently untested"); // This handles adding starts symbols for COMET () and BERT/BLEURT ([CLS]) @@ -118,7 +127,7 @@ class TextInput : public DatasetBase { bool prependSep = insertSeparator_ && joinFields_ && batchIndex > 0; if(prependSep) - words.insert(words.begin(), vocabs_[batchIndex]->getSepId()); + words.insert(words.begin(), vocabs_[permutedBatchIndex]->getSepId()); // if fields are joined and the current sentence is not the first one, we need to make sure that // the current sentence is not longer than the maximum length minus the length of the previous sentence @@ -130,7 +139,7 @@ class TextInput : public DatasetBase { // if the current sentence is longer than the 
maximum length, we need to crop it if(maxLengthCrop_ && words.size() > localMaxLength) { words.resize(localMaxLength); - words.back() = vocabs_[batchIndex]->getEosId(); + words.back() = vocabs_[permutedBatchIndex]->getEosId(); } // if true, the words are reversed diff --git a/src/examples/mnist/dataset.h b/src/examples/mnist/dataset.h index c665fa655..8c02c3b4b 100644 --- a/src/examples/mnist/dataset.h +++ b/src/examples/mnist/dataset.h @@ -139,8 +139,8 @@ class MNISTData : public Dataset { public: MNISTData(std::vector paths, - std::vector> /*vocabs*/ = {}, - Ptr options = nullptr) + std::vector> /*vocabs*/, + Ptr options) : Dataset(paths, options), IMAGE_MAGIC_NUMBER(2051), LABEL_MAGIC_NUMBER(2049) { loadData(); } diff --git a/src/examples/mnist/training.h b/src/examples/mnist/training.h index eebcbf822..791e769be 100644 --- a/src/examples/mnist/training.h +++ b/src/examples/mnist/training.h @@ -22,7 +22,7 @@ class TrainMNIST : public ModelTask { // Prepare data set auto paths = options_->get>("train-sets"); - auto dataset = New(paths); + auto dataset = New(paths, std::vector>{}, options_); auto batchGenerator = New>(dataset, options_, nullptr); // Prepare scheduler with validators From 22ed792f867a63e3b6a6b534e7a12c38412cec2e Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Thu, 8 Feb 2024 23:14:55 +0000 Subject: [PATCH 16/26] Merged PR 32937: Fixes force-decoding for beam-size larger 1 It seems there was a shape mismatch for force-decoding with beams larger than 1. This PR fixes the problem. --- CHANGELOG.md | 1 + VERSION | 2 +- src/translator/sampling.h | 19 +++++++++++-------- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5f297383d..918deb710 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - New experimental layer framework for Transformer-like models. ### Fixed +- Fixed force-decoding for beam-size > 1 - Fixed lost node in mt-detect metrics - Fixed BLEURT logmask computation - Fixed wrong paramter name for norm in new layer framework diff --git a/VERSION b/VERSION index bb7e6dd0a..53dbb431e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.24 +v1.12.25 diff --git a/src/translator/sampling.h b/src/translator/sampling.h index 2b13791d9..184202229 100644 --- a/src/translator/sampling.h +++ b/src/translator/sampling.h @@ -199,9 +199,12 @@ class DistModifier { // get vocab index and probability for force-decoded tokens for the current time step Expr forceIndices = slice(forceBatch_, /*axis=*/-3, pos); // [1, 1, dimBatch, 1] - Expr forceVals = gather(scores, /*axis=*/-1, forceIndices); // [1, 1, dimBatch, 1] - // create dummy indices and values for beam entries other then the force-decoded value. This is required to ensure that the beam + // select scores from first beam entry for force-decoding + Expr b1stScores = slice(scores, /*axis=*/-4, 0); // [1, 1, dimBatch, dimVocab] + Expr forceVals = gather(b1stScores, /*axis=*/-1, forceIndices); // [1, 1, dimBatch, 1] + + // create dummy indices and values for beam entries other than the force-decoded value. This is required to ensure that the beam // does not collapse for hyps outside the forced hyps and can still do full beam-search once we finish force-decoding for a batch // entry. We initialize randomly (they are not going to be used anyway due to very low prob) and shift by 1 to have 0 at first postion. 
int dimVocab = scores->shape()[-1]; @@ -212,13 +215,13 @@ class DistModifier { Expr dummyVals = shift(graph->constant({1, 1, 1, beamSize}, inits::uniform(invalidPathScore_, invalidPathScore_ / 2.f)), {0, 0, 0, 1}, 0.f); // here we add the force-decoded entries back into the zeroed positions - dummyIndices = cast(cast(dummyIndices, Type::float32) + cast(forceIndices, Type::float32), Type::uint32); - dummyVals = dummyVals + forceVals; + dummyIndices = cast(cast(dummyIndices, Type::float32) + cast(forceIndices, Type::float32), Type::uint32); // [1, 1, dimBatch, dimBeam] + dummyVals = dummyVals + forceVals; // [1, 1, dimBatch, dimBeam] - // create a tensor of the same size as the original logits, initialize with invalidPathScore and then scatter the force-decoded and - // dummy values into the correct positions. - Expr forcedScores = constant_like(scores, inits::fromValue(invalidPathScore_)); - forcedScores = scatter(forcedScores, -1, dummyIndices, dummyVals); + // create a tensor of the same size as the original logits from the first beam entry, initialize with invalidPathScore and then scatter + // the force-decoded and dummy values into the correct positions. + Expr forcedScores = constant_like(b1stScores, inits::fromValue(invalidPathScore_)); // [1, 1, dimBatch, dimVocab] + forcedScores = scatter(forcedScores, -1, dummyIndices, dummyVals); // [1, 1, dimBatch, dimVocab] // for entries that have finished force-decoding (the batch has eosId as vocab id) use the original logits for the whole batch entry // via interpolating by a selector. In marian eosId is used for padding, so this works everywhere and eos for unfinished hyps means From 9e40ac3df46514740bcbdc559a06f1e0077828b8 Mon Sep 17 00:00:00 2001 From: Thamme Gowda Date: Thu, 15 Feb 2024 21:21:44 +0000 Subject: [PATCH 17/26] Merged PR 32883: Pymarian improvements List of changes/updates/fixes to pymarian * Rename model IDs to match with hugging face (e.g., comet22-da -> wmt22-comet-da) * Rename CLI to make it short pymarian-evaluate -> pymarian-eval. * Rename pymarian.evaluate.py -> pymarian.eval.py to reflect CLI * The functional code from pymarian.eval.py is moved to Evaluator class (goal: allow reuse of Evaluator object for scoring many small files like WMT metric task) * Use mmap *.bins instead of *.npz * Downloads *.bin and *.spm individually instead of .tgz. Future plan to support quantized / gemm models. Downloading .tgz is okay but it will get too expensive since we dont need all variants of model (.npz, .bin, fp32, fp16, avx512 ...) * Uses file locking mechanism (based on `portalocker`) to avoid race condition between parallel download processes * Added optional `-v/--vocab` argument to pymarian-eval. * Added `--fields|-f` argument: supports `src mt ref` or a subsequence of this. Raises an error when missing fields are detected, ignores that extra fields * pymarian build improvements: strict on python version match between package and native extension. 
Also removes custom logic for extension detection, instead uses EXT_SUFFIX from sysconfig * add `--like` argument for local models * Ran black and isort to fix code formatting issues * pypdl -- parallel download * Regression tests to pymarian -- Other scripts * Added `convert-all-models.sh` : convert pytorch to marian .npz, convert .npz to .bin and creates directory structure compatible with pymarian-eval * Added `compare.sh` to compare metrics between original implementation and pymarian --- .dockerignore | 7 + .gitignore | 4 + CHANGELOG.md | 1 + azure-regression-tests.yml | 13 +- scripts/bleurt/bleurt2marian.py | 1 + scripts/metrics/.gitignore | 6 +- scripts/metrics/Dockerfile | 26 +- scripts/metrics/README.md | 53 +-- scripts/metrics/compare.sh | 121 +++--- scripts/metrics/convert-all-models.sh | 94 +++++ scripts/metrics/docker-run.sh | 20 - scripts/metrics/known-models.txt | 13 + scripts/metrics/marian-score.sh | 126 ------- scripts/metrics/requirements.txt | 8 + scripts/metrics/run.sh | 33 ++ scripts/metrics/setup.sh | 15 - src/CMakeLists.txt | 4 +- src/models/bleurt.h | 3 +- src/python/README.md | 88 ++--- src/python/pymarian/__init__.py | 129 ++++++- src/python/pymarian/__main__.py | 15 +- src/python/pymarian/constants.py | 28 -- src/python/pymarian/defaults.py | 40 ++ src/python/pymarian/eval.py | 264 +++++++++++++ src/python/pymarian/evaluate.py | 350 ------------------ src/python/pymarian/mtapi_server.py | 3 +- src/python/pymarian/pypdl/__init__.py | 1 + src/python/pymarian/pypdl/downloader.py | 97 +++++ src/python/pymarian/pypdl/main.py | 234 ++++++++++++ src/python/pymarian/pypdl/utils.py | 127 +++++++ src/python/pymarian/qtdemo.py | 3 +- src/python/pymarian/utils.py | 151 ++++++-- src/python/pyproject.toml | 7 +- src/python/setup.py | 46 +-- src/python/tests/{ => regression}/__init__.py | 0 .../tests/regression/test_pymarian_eval.py | 91 +++++ .../tests/{ => regression}/test_train.py | 16 +- src/python/tests/regression/test_translate.py | 35 ++ src/python/tests/test_evaluate.py | 148 -------- src/python/tests/test_translate.py | 16 - 40 files changed, 1514 insertions(+), 923 deletions(-) create mode 100644 .dockerignore mode change 100644 => 100755 scripts/bleurt/bleurt2marian.py create mode 100755 scripts/metrics/convert-all-models.sh delete mode 100755 scripts/metrics/docker-run.sh create mode 100644 scripts/metrics/known-models.txt delete mode 100755 scripts/metrics/marian-score.sh create mode 100644 scripts/metrics/requirements.txt create mode 100644 scripts/metrics/run.sh delete mode 100755 scripts/metrics/setup.sh delete mode 100644 src/python/pymarian/constants.py create mode 100644 src/python/pymarian/defaults.py create mode 100755 src/python/pymarian/eval.py delete mode 100755 src/python/pymarian/evaluate.py create mode 100644 src/python/pymarian/pypdl/__init__.py create mode 100644 src/python/pymarian/pypdl/downloader.py create mode 100644 src/python/pymarian/pypdl/main.py create mode 100644 src/python/pymarian/pypdl/utils.py rename src/python/tests/{ => regression}/__init__.py (100%) create mode 100644 src/python/tests/regression/test_pymarian_eval.py rename src/python/tests/{ => regression}/test_train.py (89%) create mode 100644 src/python/tests/regression/test_translate.py delete mode 100644 src/python/tests/test_evaluate.py delete mode 100644 src/python/tests/test_translate.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..57d59853c --- /dev/null +++ b/.dockerignore @@ -0,0 +1,7 @@ +/regression-tests +/build* +/.pytest_cache 
+/.vscode +/dist +/doc +.history* diff --git a/.gitignore b/.gitignore index a55d45a39..736424f85 100644 --- a/.gitignore +++ b/.gitignore @@ -69,3 +69,7 @@ examples/mnist/*ubyte *.whl *.egg-info src/python/pymarian/_version.py +src/python/tests/data +__pycache__ +.pytest_cache + diff --git a/CHANGELOG.md b/CHANGELOG.md index 918deb710..9412de3a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Fixed compilation with clang 16.0.6 ### Added +- Added `pymarian-eval`, CLI for scoring metrics - Added `--input-reorder pos1 pos2` option to re-ordering inputs internally when reading in batches. This is mostly a model property. - Added `pymarian`: python bindings based on pybind11 - Added implementation of COMET-KIWI diff --git a/azure-regression-tests.yml b/azure-regression-tests.yml index 206c018a1..fb8a06f4e 100644 --- a/azure-regression-tests.yml +++ b/azure-regression-tests.yml @@ -42,7 +42,7 @@ stages: sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 sudo update-alternatives --set python3 /usr/bin/python3.8 sudo apt-get install -y python3-pip - python3 -m pip install --upgrade Cython + python3 -m pip install --upgrade Cython pip displayName: Clean and install packages # Collect details about CPU and GPU. @@ -105,7 +105,8 @@ stages: -DCOMPILE_SERVER=on \ -DCOMPILE_TESTS=on \ -DCOMPILE_MAXWELL=on -DCOMPILE_PASCAL=off -DCOMPILE_VOLTA=off -DCOMPILE_TURING=off -DCOMPILE_AMPERE=off -DCOMPILE_AMPERE_RTX=off \ - -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-11.1 + -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-11.1 \ + -DPYMARIAN=on -DUSE_TCMALLOC=off -DPYTHON_EXECUTABLE=python3 displayName: Configure CMake - bash: make -j5 @@ -141,6 +142,14 @@ stages: displayName: Collect outputs workingDirectory: regression-tests + - bash: | + python3 -m pip install build/pymarian-*.whl + python3 -m pymarian -v + python3 -m pip install pytest + python3 -m pytest src/python/tests/regression + displayName: Pymarian Install and Test + + - publish: regression-tests-ci-public_linux-x64-static_cuda_m60.zip artifact: regression-tests-ci-public_linux-x64-static_cuda_m60 displayName: Publish outputs diff --git a/scripts/bleurt/bleurt2marian.py b/scripts/bleurt/bleurt2marian.py old mode 100644 new mode 100755 index 25aa8206f..f02d3a833 --- a/scripts/bleurt/bleurt2marian.py +++ b/scripts/bleurt/bleurt2marian.py @@ -57,6 +57,7 @@ def load_bleurt_model(): config["bert-type-vocab-size"] = 2 config["comet-prepend-zero"] = True config["input-join-fields"] = True +config["input-reorder"] = [1, 0] # bleurt expects ref < hyp order while embedding, we are providing hyp < ref, hence the reordering config["version"] = "bleurt2marian.py conversion" config["enc-depth"] = 0 diff --git a/scripts/metrics/.gitignore b/scripts/metrics/.gitignore index 5d66dfcd9..0ab29db58 100644 --- a/scripts/metrics/.gitignore +++ b/scripts/metrics/.gitignore @@ -1,2 +1,4 @@ -bins/ -tmp.* \ No newline at end of file +/bins +tmp.* +/workspace +/marian-metric \ No newline at end of file diff --git a/scripts/metrics/Dockerfile b/scripts/metrics/Dockerfile index 25a3236a9..995586219 100644 --- a/scripts/metrics/Dockerfile +++ b/scripts/metrics/Dockerfile @@ -1,10 +1,13 @@ -FROM mcr.microsoft.com/azureml/minimal-ubuntu20.04-py38-cuda11.6.2-gpu-inference:20231102.v2 +# syntax = docker/dockerfile:experimental +FROM mcr.microsoft.com/azureml/minimal-ubuntu22.04-py39-cuda11.8-gpu-inference:20240205.v2 # use this if microsoft image is not accessible; #FROM 
nvidia/cuda:11.1.1-devel-ubuntu20.04 -LABEL description="Marian image - Ubuntu 20.04" +LABEL description="Marian image - Ubuntu 22.04" + +# required for microsoft cr image +USER root ARG DEBIAN_FRONTEND=noninteractive -ARG NCPU=24 ARG MARIAN_REPO="https://github.com/marian-nmt/marian-dev" ARG MARIAN_BRANCH=master @@ -18,9 +21,10 @@ RUN ln -sf /usr/bin/python3 /usr/bin/python && \ # install unbabel-comet (requires pytorch) and bleurt (requires tensorflow and cudnn) # note: unabel-comet 2.x is broken use 1.x. requires numpy < 1.24 + #&& pip install torch==1.13.1+cu117 -f https://download.pytorch.org/whl/torch_stable.html \ RUN pip install --upgrade pip \ - && pip install torch==1.13.1+cu117 -f https://download.pytorch.org/whl/torch_stable.html \ - && pip install sacrebleu unbabel-comet==1.1.3 numpy==1.23.5 nvidia-cudnn-cu11==8.6.0.163 git+https://github.com/google-research/bleurt.git \ + && pip install torch==2.1.2 --index-url https://download.pytorch.org/whl/cu118 \ + && pip install sacrebleu unbabel-comet==2.2.1 git+https://github.com/google-research/bleurt.git \ && rm -rf ~/.cache/pip/ # Install sentencepiece @@ -38,7 +42,11 @@ RUN pip3 uninstall -y sentencepiece && \ cd ../../.. && \ rm -rf src -RUN git clone -b ${MARIAN_BRANCH} ${MARIAN_REPO} /marian \ - && mkdir /marian/build && cd /marian/build \ - && cmake .. -DUSE_MPI=on -DUSE_STATIC_LIBS=off -DCOMPILE_PASCAL=on -DCOMPILE_VOLTA=on -DCOMPILE_AMPERE=off -DBUILD_ARCH=x86-64 -DCOMPILE_AVX512=off \ - && make -j $NCPU && cp -v marian spm_encode spm_decode /usr/bin/ \ +# add source repo (sans the .dockerignore files) +ADD . /marian-dev +# RUN git clone -b ${MARIAN_BRANCH} ${MARIAN_REPO} /marian \ + +RUN --mount=type=cache,target=/marian-dev/build mkdir -p /marian-dev/build && cd /marian-dev/build \ + && cmake .. -DUSE_MPI=on -DUSE_STATIC_LIBS=on -DCOMPILE_PASCAL=on -DCOMPILE_VOLTA=on -DCOMPILE_AMPERE=off -DBUILD_ARCH=x86-64 -DCOMPILE_AVX512=off -DPYMARIAN=on \ + && make -j && cp -v marian spm_encode spm_decode /usr/bin/ \ + && pip install -v pymarian-*.whl diff --git a/scripts/metrics/README.md b/scripts/metrics/README.md index 4d04c20b7..3148d3180 100644 --- a/scripts/metrics/README.md +++ b/scripts/metrics/README.md @@ -1,36 +1,41 @@ -# Marian Evaluate +# Marian Metrics + The main script is `compare.sh`, however it needs to be run in an environment where all three -- marian, unbabel-comet(pytorch), and bleurt(tensorflow) are available. -Hence, 1) we create a docker container with all the necessary libs. - and 2) run compare.sh inside the docker environment +Hence we create a new python environment using conda to run comparisons. -## Setup: build docker image +## Setup ```bash -./setup.sh +./run.sh +``` +This setups a conda environment named `metrics` which will have all the necessary requirements, except pymarian-eval, which you will have to install based on your CMAKE settings +```bash +# from the root dir of this repository +conda activate metrics +mkdir build; cd build +cmake .. -DPYMARIAN=on #.. other flags +pip install pymarian-*.whl ``` -## Run compare.sh in docker container +## Run Compare.sh ```bash -./docker-run.sh + +# option 1: +./run.sh + +# option 2 +conda activate metrics +bash compare.sh ``` -The `docker-run.sh` script mounts cache directory from the host to container. -The necessary files (weights and vocabularies) will be automatically downloaded and cached for unbabel-comet and Bleurt metrics. -However, for `marian-score.sh` expects the cache to be prepared under `$HOME/.cache/marian/metrics`. 
-The structure/format of the cache directory for marian-score.sh looks as follows: + +This script produces reports at `workspace/*.report.txt`, which shows average difference segment level scores between original implementation and `pymarian-eval` + +## Convert Metrics Weights to Marian format + ```bash -/home/$USER/.cache/marian/metrics/ -├── bleurt20-ref -│ ├── bleurt-20.model.npz -│ ├── bleurt.vocab.spm -├── comet20-da-src -│ ├── comet20-qe-da.model.npz -│ └── roberta.vocab.spm -└── comet20-da-src+ref - ├── comet20-da.model.npz - └── roberta.vocab.spm +conda activate metrics +MARIAN=../build/marian ./convert-all-models.sh ``` -Each metric subdir should have a `*model.npz` and a `*vocab.spm` files, and the name of metric directory should end with `-src|-qe|-ref|-src+ref` suffix to indicate the category of metric. - -> TODO: Upload Marian compatible comet and bleurt models to public blob storage and modify script to automatically download +To add a new model ID, edit `known-models.txt` file in the same directory as this README diff --git a/scripts/metrics/compare.sh b/scripts/metrics/compare.sh index 902258863..3d3799f5c 100755 --- a/scripts/metrics/compare.sh +++ b/scripts/metrics/compare.sh @@ -1,12 +1,39 @@ #!/usr/bin/env bash + +# This script compares the scores produced by +# original implementation (unbabel-score or BLEURT) and Marian NMT (pymarian-eval). + + MYDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -export PATH=$MYDIR:$PATH +OUT_DIR=$MYDIR/workspace +REGEN_ORIG=0 # 1 : clear and regenerate original scores. 0: reuse previous scores +REGEN_MARIAN=0 # 1 : to clear and regenerate marian scores (recommended). 0: reuse / resume from previous scores + +DEVICES=0 +cd $MYDIR +export CUDA_VISIBLE_DEVICES=0 + +# add source to python path to test changes before installing +# export PYTHONPATH=$(cd $MYDIR/../../src/python && pwd) log() { echo -e "\e[1;32m[$(date '+%Y-%m-%d %H:%M:%S')]\e[0m $@" >&2 } +for tool in comet-score pymarian-eval; do + which $tool > /dev/null || { + log "ERROR: $tool not found in PATH" + exit 1 + } +done + + +METRIC_NAMES=$(cat $MYDIR/known-models.txt | grep -v '^#' | awk '{print $1}') +# exclude xxl, they require more memory +METRIC_NAMES=$(grep -v -i '\-xxl\|xcomet' <<< $METRIC_NAMES) + get_sacrebleu_names(){ + set -eu # using sacrebleu to get the list of systems testset=wmt21/systems while read line; do @@ -14,7 +41,7 @@ get_sacrebleu_names(){ refs=() mts=() while read name; do - # skip if name starts with $pair or src or docid + # skip if name starts with $pair or src or docidq if [[ $name == $pair* || $name == src || $name == docid || $name == origlang ]]; then continue fi @@ -29,12 +56,15 @@ get_sacrebleu_names(){ for ref in ${refs[@]}; do for mt in ${mts[@]}; do echo -e "$testset\t$pair\t$ref\t$mt" + break # limit to one per lang pair done + break # limit to one per lang pair done done < <(sacrebleu -t $testset --list) } unbabel_score(){ + set -eu local metric=$1 local prefix=$2 log "Running $metric" @@ -45,6 +75,7 @@ unbabel_score(){ bleurt_score() { + set -eu local metric_name=$1 local prefix=$2 [[ $metric_name == "BLEURT-20" ]] || { @@ -63,54 +94,60 @@ bleurt_score() { # to check if cuda libs are configured and GPU is available # python -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))" - export LD_LIBRARY_PATH=/usr/local/lib/python3.8/dist-packages/nvidia/cudnn/lib/:$LD_LIBRARY_PATH + #export LD_LIBRARY_PATH=/usr/local/lib/python3.8/dist-packages/nvidia/cudnn/lib/:$LD_LIBRARY_PATH python -m 
bleurt.score_files --bleurt_checkpoint=$metric_path \ --candidate_file=$prefix.mt --reference_file=$prefix.ref \ --bleurt_batch_size 64 2> /dev/null } -marian_score() { - local metric=$1 - local prefix=$2 - case $metric in - wmt20-comet-qe-da) metric="comet20-da-src" ;; - wmt20-comet-da) metric="comet20-da-src+ref" ;; - BLEURT-20) metric="bleurt20-ref" ;; - *) echo "Unknown metric $metric"; exit 1;; - esac - marian-score.sh -d '0' -n $metric --src $prefix.src --ref $prefix.ref --mt $prefix.mt --seg -} +MAX_TESTS=10 +MAX_LINES=100 # in each testset +mkdir -p $OUT_DIR + +while IFS=$'\t' read tset pair ref mt; do + data=$(sacrebleu -t $tset -l $pair --echo src ref $mt) + prefix=$OUT_DIR/${tset//\//-}.$pair.$MAX_LINES + + [[ -s $prefix.src ]] || cut -f1 <<< "$data" | head -n $MAX_LINES > $prefix.src + [[ -s $prefix.ref ]] || cut -f2 <<< "$data" | head -n $MAX_LINES > $prefix.ref + [[ -s $prefix.mt ]] || cut -f3 <<< "$data" | head -n $MAX_LINES > $prefix.mt + + report_file=$prefix.report.txt + echo "####$(date '+%Y-%m-%d %H:%M:%S') :: $(pymarian-eval -V) :: Avg diffs" | tee -a $report_file -main() { - cd $MYDIR - local metric_names=(BLEURT-20 wmt20-comet-da wmt20-comet-qe-da) - export CUDA_VISIBLE_DEVICES=0 - local max_tests=10 - local max_lines=100 # in each testset - while IFS=$'\t' read tset pair ref mt; do - for mn in ${metric_names[@]}; do - log "Comparing >> $mn << on $tset $pair $ref $mt" - local data=$(sacrebleu -t $tset -l $pair --echo src ref $mt) - local tmp_pref=tmp.testset - rm -rf $tmp_pref.{src,ref,mt} - cut -f1 <<< "$data" | head -n $max_lines > $tmp_pref.src - cut -f2 <<< "$data" | head -n $max_lines > $tmp_pref.ref - cut -f3 <<< "$data" | head -n $max_lines > $tmp_pref.mt + for mn in ${METRIC_NAMES[@]}; do + log "Comparing >> $mn << on $tset $pair $ref $mt" + metric_id=$(basename $mn | tr '[:upper:]' '[:lower:]') + score_pref=$prefix.$metric_id + orig_file=$score_pref.orig + if [[ ! -s $orig_file || $REGEN_ORIG -eq 1 ]]; then + rm -f $score_pref # cleanup + log "Generating original scores for $mn :: $prefix" if [[ $mn =~ BLEURT* ]]; then - local orig_out=$(bleurt_score $mn $tmp_pref) + bleurt_score $mn $prefix > $orig_file else - local orig_out=$(unbabel_score $mn $tmp_pref 2> /dev/null) + unbabel_score $mn $prefix 2> /dev/null > $orig_file fi - local marian_out=$(marian_score $mn $tmp_pref) - paste <(echo "$marian_out") <(echo "$orig_out") \ - | awk -F '\t' -v OFS='\t' -v mn=$mn \ - 'BEGIN {tot=0.0} {diff=sqrt(($1-$2)^2); tot+=diff; print diff,$0} - END {printf "\n===Avg diff in %s: %f===\n\n", mn, tot/NR}' - #TODO1: extract averages and write to a report file - #TODO2: benchmark speeds - done - done < <(get_sacrebleu_names | head -n $max_tests) -} + fi + + out_file=$score_pref.pymarian + if [[ ! 
-s $out_file || $REGEN_MARIAN -eq 1 ]]; then + rm -f $out_file $out_file.log # cleanup + log "Generating Marian scores for $mn :: $prefix" + pymarian-eval -d $DEVICES -m $(basename $mn) -s $prefix.src -r $prefix.ref -t $prefix.mt -a skip --fp16 --debug > $out_file 2> $out_file.log || { + log "ERROR: Failed to generate scores for $mn" + cat $out_file.log + continue + } + fi + + # compute diffs + paste $out_file $orig_file \ + | awk -F '\t' -v OFS='\t' -v mn=$mn -v of=$out_file.diff 'BEGIN {tot=0.0} + {$2 = +sprintf("%.4f", $2); diff=sqrt(($1-$2)^2); tot+=diff; print diff, $0 > of} + END {printf "%s\t%f\n", mn, tot/NR}' | tee -a $report_file + done +done < <(get_sacrebleu_names | head -n $MAX_TESTS) -main "$@" \ No newline at end of file +cat $OUT_DIR/*.report.txt #| column -t diff --git a/scripts/metrics/convert-all-models.sh b/scripts/metrics/convert-all-models.sh new file mode 100755 index 000000000..29fe72ff5 --- /dev/null +++ b/scripts/metrics/convert-all-models.sh @@ -0,0 +1,94 @@ +#!/usr/bin/env bash +set -eu +MYDIR=$(cd $(dirname ${BASH_SOURCE[0]}) && pwd) +SCRIPTS=$(cd $MYDIR/.. && pwd) + +: " +This script converts all metrics models to Marian format (.npz) and converts them to memory maps (.bin). +This script expects comet2marian.py and bleurt2marian.py +The script also expects marian binary to be in PATH or set as MARIAN environment variable. + +Pre-requisites: + pip install unbabel-comet +Optionally, you may need to configure huggingface transformers, + specifically, hf-login for models that reqire login (e.g., wmt22-cometkiwi-da). + +To run bleurt2marian, install bleurt-pytorch package: + pip install git+https://github.com/lucadiliello/bleurt-pytorch.git +" + +OUT_DIR=${1:-$MYDIR/marian-metric} # NOTE: manually copy this to /mnt/textmt/www/marian/metric +COMET2MARIAN=$SCRIPTS/comet/comet2marian.py +BLEURT2MARIAN=$SCRIPTS/bleurt/bleurt2marian.py +MARIAN=${MARIAN:-} + +# locate marian binary +if [[ -z "$MARIAN" ]]; then + if [[ -f $SCRIPTS/../build/marian ]]; then + MARIAN=$SCRIPTS/../build/marian + elif which marian > /dev/null; then + MARIAN=$(which marian) + fi +fi +if [[ -z "$MARIAN" || ! -e $MARIAN ]]; then + echo -e "Error: marian binary not found." \ + "\n Option 1) export MARIAN=path/to/marian" \ + "\n Option 2) make sudo /build/marian exists" \ + "\n Option 2) add marian binary to PATH" >&2 + exit 1 +fi + +if [[ ! -f $COMET2MARIAN ]]; then + echo "comet2marian.py not found at $COMET2MARIAN"; exit 2 +fi +if [[ ! -f $BLEURT2MARIAN ]]; then + echo "bleurt2marian.py not found at $BLEURT2MARIAN"; exit 2 +fi + +MODEL_IDS=$(cat $MYDIR/known-models.txt | grep -v '^#' | awk '{print $1}') + + +######## convert to marian ######### +for model_id in ${MODEL_IDS[@]}; do + # lowercase model name + model_name=$(basename $model_id | tr '[:upper:]' '[:lower:]') + model_dir=$OUT_DIR/$model_name + ok_flag=$model_dir/._OK + if [[ -f $ok_flag ]]; then + echo "$model_id already exists at $model_dir, skipping." >&2 + continue + fi + echo "Creating $model_dir" + mkdir -p $model_dir + npz_file=$model_dir/model.$model_name.npz + bin_file=${npz_file%.npz}.bin + + # step 1 create .npz file + if [[ ! -f $npz_file || ! 
-f $npz_file.md5 ]]; then + CONVERT="" + if [[ $model_id =~ BLEURT ]]; then + # only one BLEURT model supported, so it does not take model ID + CONVERT="$BLEURT2MARIAN" + else + CONVERT="$COMET2MARIAN -c $model_id" + fi + rm -f $npz_file $npz_file.md5 # remove incomplete files + ${CONVERT} -m $npz_file --spm $model_dir/vocab.spm \ + || { echo "Error: failed to convert $model_id to Marian format" >&2; exit 3; } + md5sum $npz_file | awk '{print $1}' > $npz_file.md5 + fi + + # Step 2: convert to memory map + if [[ ! -f $bin_file || ! -f $bin_file.md5 ]]; then + echo "Convert $npz_file --> $bin_file" + rm -f $bin_file $bin_file.md5 # remove incomplete files + $MARIAN convert -f $npz_file -t $bin_file || { + echo "Error: failed to convert $npz_file to memory map" >&2; exit 4; + } + md5sum $bin_file | awk '{print $1}' > $bin_file.md5 + fi + touch $ok_flag +done + +# NOTE: only update the new/changed models +#cp -r $OUT_DIR/* /mnt/textmt/www/marian/metric \ No newline at end of file diff --git a/scripts/metrics/docker-run.sh b/scripts/metrics/docker-run.sh deleted file mode 100755 index c379c4415..000000000 --- a/scripts/metrics/docker-run.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env bash -MYDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -cd $MYDIR - -IMAGE="marian-dev" - -VISIBLE_GPUS="1" # exlcude 0 for now; run on single GPU - -MOUNTS="-v $PWD:$PWD" -for cache in .sacrebleu .cache/{marian,torch,huggingface,bleurt}; do - MOUNTS+=" -v $HOME/$cache:/root/$cache" -done - - -cmd="docker run --rm -i $MOUNTS --gpus "\"device=$VISIBLE_GPUS\"" -t $IMAGE" - -# uncomment for an interactive shell -# $cmd bash - -$cmd $PWD/compare.sh $@ diff --git a/scripts/metrics/known-models.txt b/scripts/metrics/known-models.txt new file mode 100644 index 000000000..7b7307cef --- /dev/null +++ b/scripts/metrics/known-models.txt @@ -0,0 +1,13 @@ +BLEURT-20 +wmt20-comet-qe-da +wmt20-comet-qe-da-v2 +wmt20-comet-da +wmt21-comet-qe-mqm +wmt21-comet-qe-da +wmt21-comet-da +Unbabel/wmt22-comet-da +Unbabel/wmt22-cometkiwi-da +Unbabel/wmt23-cometkiwi-da-xl +Unbabel/wmt23-cometkiwi-da-xxl +Unbabel/XCOMET-XL +Unbabel/XCOMET-XXL \ No newline at end of file diff --git a/scripts/metrics/marian-score.sh b/scripts/metrics/marian-score.sh deleted file mode 100755 index 873ef5921..000000000 --- a/scripts/metrics/marian-score.sh +++ /dev/null @@ -1,126 +0,0 @@ -#!/usr/bin/env bash -set -eu - -MYDIR=$(realpath $(dirname ${BASH_SOURCE[0]})) - - -METRICS_CACHE=$HOME/.cache/marian/metrics - -log() { - echo -e "[$(date '+%Y-%m-%d %H:%M:%S')] $@" >&2 -} - -which marian > /dev/null || { - log "marian not found in PATH. Please add marian binary to \$PATH and rerun" - exit 2 -} - -metric_name= -src_file= -ref_file= -hyp_file= -is_seg= -debug_mode= -batch_size=32 -pool_size=10 -max_length=256 -devices=0 -workspace=-4000 - -usage() { - log " ${BASH_SOURCE##*/} -n METRIC -m HYP [-s SRC] [-r REF] [-d DEVICES] [--seg] [--debug] [-h|--help] - -Args: - -n|--name|--metric NAME Metric name; required. See below for details. - -m|--mt|--hyp FILE MT hypothesis, required for all metrics. - -s|--src FILE Source file, required for source based metrics. - -r|--ref FILE Reference file, required for reference based metrics. - -d|--devices DEV IDs of GPU devices to use. Use quoted string to pass multiple values. Default: '$devices' - --seg Output segment-level scores. 
Default: print only the corpus-level score (mean of segment scores) - --debug Enable verbose mode (default is quiet) - -h|--help Print this help message - -Metric name (-n|--name) shuld be a subdir name under $METRICS_CACHE. -The metric name should have a suffix (-src|-qe|-ref|-src+ref) indicating the type of metric: - *-src|*-qe Source-based metric and requires --src arg, e.g., comet20-src or comet20-da-qe - *-ref Reference-based metric and requires --ref arg, e.g., bleurt20-ref - *-src+ref Both source and reference based and requires --src and --ref args e.g., comet20-src+ref -" -} - -while [[ $# -gt 0 ]]; do - case $1 in - -s|--src) src_file=$2; shift 2;; - -r|--ref) ref_file=$2; shift 2;; - -m|--mt|--hyp) hyp_file=$2; shift 2;; - -n|--name|--metric) metric_name=$2; shift 2;; - -d|--devices) devices=$2; shift 2;; - --seg) is_seg=1; shift 1;; - --debug) debug_mode=1; shift 1;; - -h|--help) usage; exit 0;; - *) log "ERROR: unknown option $1"; usage; exit 1;; - esac -done - -[[ -n $metric_name ]] || { log "ERROR: metric_name=$metric_name name not provided"; usage; exit 1; } -[[ -e $hyp_file ]] || { log "ERROR: hyp file not provided"; usage; exit 1; } - -metric_dir=$METRICS_CACHE/$metric_name -checkpoint=$(echo $metric_dir/*model.npz) # file model.npz or .model.npz -vocab=$(echo $metric_dir/*vocab.spm) -[[ -f $checkpoint && -f $vocab ]] || { - log "ERROR: metric $metric_name is not valid. See ls $METRICS_CACHE/$metric_name/{*model.npz,*vocab.spm}" - exit 1 -} - -# args common to all models -cmd="marian evaluate -w -4000" -[[ -n $devices ]] && cmd+=" -d $devices" -[[ -n $debug_mode ]] || cmd+=" --quiet" -cmd+=" -m $checkpoint --max-length $max_length --max-length-crop --mini-batch $batch_size --maxi-batch $pool_size -t stdin --tsv" -input= # to be filled later - - -check_file(){ - local name=$1 - local file=$2 - [[ -e $file ]] || { log "ERROR: $name file $file does not exist"; exit 1; } - [[ -s $file ]] || { log "ERROR: $name file $file is empty"; exit 1; } -} - -metric_type=${metric_name##*-} # suffix expected: src, ref, src+ref -case $metric_type in - src|qe) - # two sequences: src, hyp - check_file src $src_file - cmd+=" --like comet-qe -v $vocab $vocab" - input="paste $src_file $hyp_file" - ;; - ref) - check_file ref $ref_file - # two sequences: ref, hyp - cmd+=" --like bleurt -v $vocab $vocab" - input="paste $ref_file $hyp_file" - ;; - src+ref) - # three sequences: src, hyp, ref; three vocabularies - check_file src $src_file - check_file ref $ref_file - cmd+=" --like comet -v $vocab $vocab $vocab" - input="paste $src_file $hyp_file $ref_file" - ;; - *) - log "ERROR: $metric_name is not valid. 
Valid metrics have suffix '-{src|qe|ref|src+ref}'" - exit 3 - ;; -esac - -if [[ -z $is_seg ]]; then - cmd+=" --average only"; -fi -pipeline="$input | $cmd | cut -f1 -d' '" - -# mean (default) or segment-level scores - -log "Running: $pipeline" -eval $pipeline diff --git a/scripts/metrics/requirements.txt b/scripts/metrics/requirements.txt new file mode 100644 index 000000000..e9128e631 --- /dev/null +++ b/scripts/metrics/requirements.txt @@ -0,0 +1,8 @@ +# assume it downloads the correct pytorch +unbabel-comet==2.2.1 +sacrebleu +# this is the original bleurt; used for comparing scores +git+https://github.com/google-research/bleurt.git +# this is the pytorch version of bleurt; used in blert2marian +git+https://github.com/lucadiliello/bleurt-pytorch.git +huggingface_hub[cli] # required for login to hf to authenticate private models \ No newline at end of file diff --git a/scripts/metrics/run.sh b/scripts/metrics/run.sh new file mode 100644 index 000000000..7f067e013 --- /dev/null +++ b/scripts/metrics/run.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash +set -eu +MYDIR=$(dirname ${BASH_SOURCE[0]}) +cd $MYDIR + +ENV_NAME=metrics +which conda > /dev/null || (echo "conda not found" && exit 1) +# conda functions are not exported in non-interactive shell, so we source conda.sh +CONDA_BASE=$(conda info --base) +source $CONDA_BASE/etc/profile.d/conda.sh +FOUND="$(conda env list | awk -v name=$ENV_NAME '$1==name { print $1 }')" + +log() { + echo -e "\e[32m$@\e[0m" >&2 +} +#### SETUP ######### +if [[ -z "$FOUND" ]]; then + log "Creating conda environment $ENV_NAME" + # create conda environment and install requirements + conda create -n $ENV_NAME python=3.10 + conda activate $ENV_NAME + log "Installing requirements" + pip install -r $MYDIR/requirements.txt +else + log "Activating conda environment $ENV_NAME" + conda activate $ENV_NAME +fi + +which pymarian-eval > /dev/null || ( + echo "pymarian-eval not found. Please install and return" && exit 1 ) + +##################### +bash ./compare.sh \ No newline at end of file diff --git a/scripts/metrics/setup.sh b/scripts/metrics/setup.sh deleted file mode 100755 index df16563a6..000000000 --- a/scripts/metrics/setup.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env bash -MYDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -cd $MYDIR - -#SSH_KEY=$HOME/.ssh/id_rsa # for git clone inside docker build -IMAGE=marian-dev -echo "Building docker image $IMAGE" -#DOCKER_BUILDKIT=1 docker build --ssh default=$SSH_KEY . -f Dockerfile -t $IMAGE -DOCKER_BUILDKIT=1 docker build . -f Dockerfile -t $IMAGE - - -# Optional build args: -# --build-arg MARIAN_COMMIT=master \ -# --build-arg MARIAN_REPO=https://github.com/marian-nmt/marian-dev.git \ -# --build-arg NCPUS=16 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c40eabc76..c0c4f74b9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -307,9 +307,9 @@ if(PYMARIAN) install(TARGETS _pymarian DESTINATION .) 
# build pymarian wheel - add_custom_target(pymarian ALL + add_custom_target(pymarian ALL ${CMAKE_COMMAND} -E env "CMAKE_BINARY_DIR=${PROJECT_BINARY_DIR}" "CMAKE_SOURCE_DIR=${PROJECT_SOURCE_DIR}" "${PYTHON_EXECUTABLE}" -m pip wheel -v --no-input ${PROJECT_SOURCE_DIR}/src/python -w "${PROJECT_BINARY_DIR}" - DEPENDS _pymarian + DEPENDS _pymarian VERBATIM COMMENT "Building pymarian wheel") endif(PYMARIAN) diff --git a/src/models/bleurt.h b/src/models/bleurt.h index 844f94609..99cf7f7a1 100644 --- a/src/models/bleurt.h +++ b/src/models/bleurt.h @@ -70,7 +70,7 @@ struct BleurtEncoder final : public nn::TransformerEncoder { auto output = marian::nn::swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] auto binaryMask = marian::nn::swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] - + // apply positional embeddings to contextual input output = positionEmbedding->apply(output); @@ -194,7 +194,6 @@ class BleurtPooler final : public nn::LayerWithOptions, auto modelType = LayerWithOptions::opt("type"); auto emb = slice(encoderStates[0]->getContext(), -2, 0); - emb = marian::cast(emb, Type::float32); Expr output; if(LayerWithOptions::opt("usage") == (int)models::usage::evaluating) { diff --git a/src/python/README.md b/src/python/README.md index f8f00bdc5..d3fc34e25 100644 --- a/src/python/README.md +++ b/src/python/README.md @@ -7,10 +7,6 @@ ## Install ```bash -# get source code -git clone https://github.com/marian-nmt/marian-dev -cd marian-dev - # build marian with -DPYMARIAN=on option to create a pymarian wheel cmake . -Bbuild -DCOMPILE_CUDA=off -DPYMARIAN=on -DCMAKE_BUILD_TYPE=Release cmake --build build -j # -j option parallelizes build on all cpu cores @@ -59,42 +55,50 @@ for score in scores: . `pymarian-qtdemo` : GUI App demo powered by QT -### `pymarian-evaluate` +### `pymarian-eval` ```bash -$ pymarian-evaluate -h -usage: pymarian-evaluate [-h] [-m MODEL] [--stdin] [-t MT_FILE] [-s SRC_FILE] [-r REF_FILE] [-o OUT] [-a {skip,append,only}] [-w WIDTH] [--debug] [--mini-batch MINI_BATCH] [-d [DEVICES ...] | -c - CPU_THREADS] [-ws WORKSPACE] [--backend {subprocess,pymarian}] +$ pymarian-eval -h +usage: pymarian-eval [-h] [-m MODEL] [-v VOCAB] [-l {comet-qe,bleurt,comet}] [-V] [-] [-t MT_FILE] [-s SRC_FILE] [-r REF_FILE] [-f FIELD [FIELD ...]] [-o OUT] [-a {skip,append,only}] [-w WIDTH] [--debug] [--fp16] [--mini-batch MINI_BATCH] [-d [DEVICES ...] | -c + CPU_THREADS] [-ws WORKSPACE] [-pc] options: -h, --help show this help message and exit -m MODEL, --model MODEL - Model name, or path. Known models=['cometoid22-wmt21', 'cometoid22-wmt22', 'cometoid22-wmt23', 'chrfoid-wmt23', 'comet20-da-qe', 'bleurt20', 'comet20-da'] (default: - cometoid22-wmt22) - --stdin Read input from stdin. TSV file with following format: QE metrics: "srcmt", Comet with ref: "srcref; or BLEURT: "refmt" (default: False) + Model name, or path. Known models: bleurt-20, wmt20-comet-da, wmt20-comet-qe-da, wmt20-comet-qe-da-v2, wmt21-comet-da, wmt21-comet-qe-da, wmt21-comet-qe-mqm, wmt22-comet-da, wmt22-cometkiwi-da, xcomet-xl, xcomet-xxL (default: wmt22-cometkiwi-da) + -v VOCAB, --vocab VOCAB + Vocabulary file (default: None) + -l {comet-qe,bleurt,comet}, --like {comet-qe,bleurt,comet} + Model type. Required if --model is a local file (auto inferred for known models) (default: None) + -V, --version show program's version number and exit + -, --stdin Read input from stdin. 
TSV file with following format: QE metrics: "srcmt", Ref based metrics ref: "srcmtref" or "mtref" (default: False) -t MT_FILE, --mt MT_FILE - MT output file. Ignored when --stdin. (default: None) + MT output file. Ignored when --stdin (default: None) -s SRC_FILE, --src SRC_FILE Source file. Ignored when --stdin (default: None) -r REF_FILE, --ref REF_FILE Ref file. Ignored when --stdin (default: None) - -o OUT, --out OUT output file. Default stdout (default: <_io.TextIOWrapper name='' mode='w' encoding='utf-8'>) + -f FIELD [FIELD ...], --fields FIELD [FIELD ...] + Input fields, an ordered sequence of {src, mt, ref} (default: ['src', 'mt', 'ref']) + -o OUT, --out OUT output file (default: <_io.TextIOWrapper name='' mode='w' encoding='utf-8'>) -a {skip,append,only}, --average {skip,append,only} - Average segment scores to produce system score. skip=do not output average (default; segment scores only); append=append average at the end; only=output the average only - (i.e system score only) (default: skip) + Average segment scores to produce system score. skip=do not output average (default; segment scores only); append=append average at the end; only=output the average only (i.e. system score only) (default: skip) -w WIDTH, --width WIDTH Output score width (default: 4) - --debug Verbose output (default: False) + --debug Debug or verbose mode (default: False) + --fp16 Enable FP16 mode (default: False) --mini-batch MINI_BATCH Mini-batch size (default: 16) -d [DEVICES ...], --devices [DEVICES ...] GPU device IDs (default: None) -c CPU_THREADS, --cpu-threads CPU_THREADS - Use CPU threads. 0=use gpu device 0 (default: None) + Use CPU threads. 0=use GPU device 0 (default: None) -ws WORKSPACE, --workspace WORKSPACE Workspace memory (default: 8000) - --backend {subprocess,pymarian} - Marian backend interface. subprocess looks for marian binary in PATH. pymarian is a pybind wrapper (default: pymarian) + -pc, --print-cmd Print marian evaluate command and exit (default: False) + +More info at https://github.com/marian-nmt/marian-dev. This CLI is loaded from .../python3.10/site-packages/pymarian/eval.py (version: 1.12.25) + ``` **Performance Tuning Tips**: @@ -104,33 +108,6 @@ options: * To see full logs from marian, set `--debug` -*Example Usage* -```bash -# download sample dataset -langs=en-ru -prefix=tmp.$langs -teset=wmt21/systems -sysname=Online-B -sacrebleu -t $teset -l $langs --echo src > $prefix.src -sacrebleu -t $teset -l $langs --echo ref > $prefix.ref -sacrebleu -t $teset -l $langs --echo $sysname > $prefix.mt - -# chrfoid -paste $prefix.{src,mt} | head | pymarian-evaluate --stdin -m chrfoid-wmt23 - -# cometoid22-wmt{21,22,23} -paste $prefix.{src,mt} | head | pymarian-evaluate --stdin -m cometoid22-wmt22 - -# bleurt20 -paste $prefix.{ref,mt} | head | pymarian-evaluate --stdin -m bleurt20 --debug - -# FIXME: comet20-da-qe and comet20-da appear to be broken -# comet20-da-qe -paste $prefix.{src,mt} | head | pymarian-evaluate --stdin -m comet20-da-qe -# comet20-da -paste $prefix.{src,mt,ref} | pymarian-evaluate -m comet20-da - -``` ### `pymarian-mtapi` @@ -156,23 +133,32 @@ curl $URL --header "Content-Type: application/json" --request POST --data '[{"te pymarian-qtdemo ``` +## Code Formatting + +```bash + +pip install black isort +isort . +black . 
+cd src/python +``` + ## Run Tests ```bash # install pytest if necessary -python -m pip install pytest +python -m pip install pytest # run tests in quiet mode -python -m pytest src/python/tests/ +python -m pytest src/python/tests/regression # or, add -s to see STDOUT/STDERR from tests -python -m pytest -s src/python/tests/ +python -m pytest -s src/python/tests/regression ``` - ## Known issues - + 1. In conda or mamba environment, if you see `.../miniconda3/envs//bin/../lib/libstdc++.so.6: version 'GLIBCXX_3.4.30' not found` error, install libstdcxx-ng diff --git a/src/python/pymarian/__init__.py b/src/python/pymarian/__init__.py index f08d00944..36011c203 100644 --- a/src/python/pymarian/__init__.py +++ b/src/python/pymarian/__init__.py @@ -1,6 +1,17 @@ +import logging +from itertools import islice +from pathlib import Path +from typing import Iterator, List, Optional, Tuple, Union + import _pymarian +import yaml + +# this log may be used by submodules, so we declare it here before submodule imports +log = logging.getLogger(__name__) +log.setLevel(logging.INFO) from ._version import __version__ +from .defaults import Defaults from .utils import kwargs_to_cli @@ -22,8 +33,122 @@ def __init__(self, cli_string='', **kwargs): """Initializes the evaluator :param kwargs: kwargs """ - cli_string += ' ' + kwargs_to_cli(**kwargs) - super().__init__(cli_string.strip()) + self._kwargs = kwargs + self._cli_string = (cli_string + ' ' + kwargs_to_cli(**kwargs)).strip() + super().__init__(self._cli_string) + self._config = yaml.safe_load(self.get_model_config()) + log.debug(f'Model config: {self._config}') + + @property + def model_type(self) -> str: + return self._config.get('type', None) + + @classmethod + def new( + cls, + model_file: Path, + vocab_file: Path = None, + devices: Optional[List[int]] = None, + width=Defaults.FLOAT_PRECISION, + mini_batch=Defaults.MINI_BATCH, + maxi_batch=Defaults.MAXI_BATCH, + like=Defaults.DEF_MODEL_TYPE, + workspace=Defaults.WORKSPACE, + max_length=Defaults.MAX_LENGTH, + cpu_threads=0, + average: str = Defaults.AVERAGE, + **kwargs, + ) -> Iterator[Union[float, Tuple[float, float]]]: + """A factory function to create an Evaluator with default values. + + :param model_file: path to model file + :param vocab_file: path to vocabulary file + :param devices: list of GPU devices to use (optional) + :param width: number of decimal places to have in output scores + :param mini_batch: mini-batch size + :param maxi_batch: maxi-batch size + :param like: marian metric model like + :param cpu_threads: number of CPU threads to use + :param: average: average segment scores to produce system score. + skip=do not output average (default; segment scores only); + append=append average at the end; + only=output the average only (i.e. 
system score only) + :return: iterator of scores + """ + + assert model_file.exists(), f'Model file {model_file} does not exist' + assert vocab_file.exists(), f'Vocab file {vocab_file} does not exist' + assert like in Defaults.MODEL_TYPES, f'Unknown model type: {like}' + n_inputs = len(Defaults.MODEL_TYPES[like]) + vocabs = [vocab_file] * n_inputs + if not kwargs: + kwargs = {} + kwargs.update( + model=model_file, + vocabs=vocabs, + devices=devices, + width=width, + like=like, + mini_batch=mini_batch, + maxi_batch=maxi_batch, + max_length=max_length, + max_length_crop=True, + workspace=workspace, # negative memory => relative to total memory + cpu_threads=cpu_threads, + average=average, + ) + if kwargs.pop('fp16'): + kwargs['fp16'] = '' # empty string for flag; i.e, "--fp16" and not "--fp16=true" + + # TODO: remove this when c++ bindings supports iterator + kwargs['average'] = 'skip' + return cls(**kwargs) + + def evaluate(self, input_lines: Iterator[str], average: str = 'skip', batch_size: Optional[int] = None): + """Evaluates the input lines and returns the scores + + This function creates mini batches in python and calls the C++ bindings to evaluate the input lines. + This is a workaround until the C++ bindings support iterator API. + + :param input_lines: iterator of input lines + :param average: average segment scores to produce system score. Options: + skip=do not output average (default; segment scores only); + append=append average at the end; + only=output the average only (i.e. system score only) + :param batch_size: batch size (optional; default=2*mini_batch*maxi_batch) + :return: iterator of scores + """ + assert average in ('skip', 'append', 'only') + lines = (line.rstrip('\r\n').split('\t') for line in input_lines) + if not batch_size: + mini_batch = self._kwargs.get('mini_batch', Defaults.MINI_BATCH) + maxi_batch = self._kwargs.get('maxi_batch', Defaults.MAXI_BATCH) + batch_size = 2 * mini_batch * maxi_batch + # Sending twice the batch size to avoid starving GPU backend + # This is a workaround until the C++ bindings support iterator API + # pymarian bindings does not yet support iterator input, so this function is mini batching here + def make_maxi_batches(lines, batch_size=batch_size): + assert batch_size > 0 + while True: + chunk = list(islice(lines, batch_size)) + if not chunk: + return + yield chunk + + total, count = 0.0, 0 + for batch in make_maxi_batches(lines): + scores = super().evaluate(batch) + assert len(scores) == len(batch) + for score in scores: + if isinstance(score, (tuple, list)): + score = score[0] + total += score + count += 1 + if average != 'only': # skip or append + yield score + + if average != 'skip': # append or only + yield total / count class Trainer(_pymarian.Trainer): diff --git a/src/python/pymarian/__main__.py b/src/python/pymarian/__main__.py index e0b68cd65..08d4e6fcb 100644 --- a/src/python/pymarian/__main__.py +++ b/src/python/pymarian/__main__.py @@ -1,18 +1,23 @@ - import argparse from pymarian import __version__ + def parse_args(): - parser = argparse.ArgumentParser(prog='pymarian', description="Python wrapper for Marian NMT", - epilog='URL: https://github.com/marian-nmt/marian-dev') + parser = argparse.ArgumentParser( + prog='pymarian', + description="Python wrapper for Marian NMT", + epilog='URL: https://github.com/marian-nmt/marian-dev', + ) parser.add_argument('--version', '-v', action='version', version=__version__) return parser.parse_args() + def main(): args = parse_args() # prints version for -v/-version 
option. - # no other options are currently supported. Space left/intended for future use. + # no other options are currently supported. Space left/intended for future use. + if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/src/python/pymarian/constants.py b/src/python/pymarian/constants.py deleted file mode 100644 index 3d04abbba..000000000 --- a/src/python/pymarian/constants.py +++ /dev/null @@ -1,28 +0,0 @@ -from pathlib import Path - - -class Defaults: - BASE_URL = "https://textmt.blob.core.windows.net/www/models/mt-metric" - CACHE_PATH = Path.home() / '.cache' / 'marian' / 'metrics' - MINI_BATCH = 16 - MAXI_BATCH = 256 - WORKSPACE = 8000 - AVERAGE = 'skip' - MAX_LENGTH = 512 - FLOAT_PRECISION = 4 - - # NOTE: model names must be lower case for caseless matching - KNOWN_METRICS = { - 'cometoid22-wmt21': "comet-qe", - 'cometoid22-wmt22': "comet-qe", - 'cometoid22-wmt23': "comet-qe", - 'chrfoid-wmt23': "comet-qe", - 'comet20-da-qe': "comet-qe", - 'bleurt20': "bleurt", - 'comet20-da': "comet", - } - - KNOWN_SCHEMA = {'comet-qe': 'src+mt', 'bleurt': 'ref+mt', 'comet': 'src+mt+ref'} - - DEF_MODEL = 'cometoid22-wmt22' - DEF_SCHEMA = KNOWN_METRICS[DEF_MODEL] diff --git a/src/python/pymarian/defaults.py b/src/python/pymarian/defaults.py new file mode 100644 index 000000000..2fdeff278 --- /dev/null +++ b/src/python/pymarian/defaults.py @@ -0,0 +1,40 @@ +from pathlib import Path + + +class Defaults: + BASE_URL = "https://textmt.blob.core.windows.net/www/marian/metric" + CACHE_PATH = Path.home() / '.cache' / 'marian' / 'metric' + MINI_BATCH = 16 + MAXI_BATCH = 256 + WORKSPACE = 8000 + AVERAGE = 'skip' + MAX_LENGTH = 512 + FLOAT_PRECISION = 4 + FILE_LOCK_TIMEOUT = 1 * 60 * 60 # seconds => 1 hour + PROGRESS_BAR = True + + # metric name to model type; lowercase all IDs + KNOWN_METRICS = { + "bleurt-20": "bleurt", + "wmt20-comet-da": "comet", + "wmt20-comet-qe-da": "comet-qe", + "wmt20-comet-qe-da-v2": "comet-qe", + "wmt21-comet-da": "comet", + "wmt21-comet-qe-da": "comet-qe", + "wmt21-comet-qe-mqm": "comet-qe", + "wmt22-comet-da": "comet", + "wmt22-cometkiwi-da": "comet-qe", + "xcomet-xl": "comet", + "xcomet-xxL": "comet", + } + + # model type to field order + MODEL_TYPES = { + 'comet-qe': ('src', 'mt'), + 'bleurt': ('mt', 'ref'), + 'comet': ('src', 'mt', 'ref'), + } + + DEF_MODEL = 'wmt22-cometkiwi-da' + DEF_MODEL_TYPE = 'comet-qe' + DEF_FIELD_ORDER = 'src mt ref'.split() diff --git a/src/python/pymarian/eval.py b/src/python/pymarian/eval.py new file mode 100755 index 000000000..4b5e5f02c --- /dev/null +++ b/src/python/pymarian/eval.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python +# +# This is a python wrapper for marian evaluate command +# +import argparse +import logging as log +import sys +from itertools import zip_longest +from pathlib import Path +from typing import Iterator, List + +from . import Evaluator, __version__ +from .defaults import Defaults +from .utils import get_model_path, get_vocab_path + +log.basicConfig(level=log.INFO) +DEBUG_MODE = False + + +def parse_args(): + parser = argparse.ArgumentParser( + "pymarian-eval", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + epilog='More info at https://github.com/marian-nmt/marian-dev. ' + f'This CLI is loaded from {__file__} (version: {__version__})', + ) + + known_metrics = ', '.join(Defaults.KNOWN_METRICS) + parser.add_argument( + '-m', + '--model', + help=f'Model name, or path. 
Known models: {known_metrics}', + default=Defaults.DEF_MODEL, + type=str, + ) + parser.add_argument('-v', '--vocab', help=f'Vocabulary file', type=Path) + parser.add_argument( + '-l', + '--like', + help='Model type. Required if --model is a local file (auto inferred for known models)', + type=str, + choices=list(Defaults.MODEL_TYPES.keys()), + ) + parser.add_argument('-V', '--version', action="version", version=f"%(prog)s {__version__}") + + parser.add_argument( + '-', + '--stdin', + action='store_true', + help='Read input from stdin. TSV file with following format: \ + QE metrics: "srcmt", Ref based metrics ref: "srcmtref" or "mtref"', + ) + parser.add_argument('-t', '--mt', dest='mt_file', help='MT output file. Ignored when --stdin', type=Path) + parser.add_argument('-s', '--src', dest='src_file', help='Source file. Ignored when --stdin', type=Path) + parser.add_argument('-r', '--ref', dest='ref_file', help='Ref file. Ignored when --stdin', type=Path) + parser.add_argument( + '-f', + '--fields', + dest='user_fields', + metavar='FIELD', + nargs='+', + choices=['src', 'mt', 'ref'], + help='Input fields, an ordered sequence of {src, mt, ref}', + default=Defaults.DEF_FIELD_ORDER, + type=str, + ) + parser.add_argument('-o', '--out', default=sys.stdout, help='output file', type=argparse.FileType('w')) + parser.add_argument( + '-a', + '--average', + choices=('skip', 'append', 'only'), + default='skip', + help='Average segment scores to produce system score.' + ' skip=do not output average (default; segment scores only);' + ' append=append average at the end; ' + ' only=output the average only (i.e. system score only)', + ) + + parser.add_argument('-w', '--width', default=4, help='Output score width', type=int) + parser.add_argument('--debug', help='Debug or verbose mode', action='store_true') + parser.add_argument('--fp16', help='Enable FP16 mode', action='store_true') + parser.add_argument('--mini-batch', default=16, help='Mini-batch size', type=int) + group = parser.add_mutually_exclusive_group() + group.add_argument('-d', '--devices', nargs='*', type=int, help='GPU device IDs') + group.add_argument( + '-c', '--cpu-threads', default=None, type=int, help='Use CPU threads. 0=use GPU device 0' + ) + parser.add_argument('-ws', '--workspace', default=8000, help='Workspace memory', type=int) + parser.add_argument( + '-pc', '--print-cmd', action="store_true", help="Print marian evaluate command and exit" + ) + + args = parser.parse_args() + return vars(args) + + +def find_field_ordering(expected_fields: List[str], given_fields: List[str]) -> List[int]: + """Find the order of fields in given_fields to match expected_fields + + :param expected_fields: list of expected fields + :param given_fields: list of given fields + :return: list of indices to select from given_fields to match expected_fields + :throws ValueError: if any expected field is missing in given_fields + """ + + missing_fields = set(expected_fields) - set(given_fields) + if missing_fields: + raise ValueError( + f'Required fields are missing: {missing_fields} [expected: {expected_fields}, given: {given_fields}]' + ) + field_order = [] + for name in expected_fields: + idx = given_fields.index(name) + assert idx >= 0, f'Field {name} not found in {given_fields}. 
Please check --fields argument' # this should never happen + field_order.append(idx) + return field_order + + +def reorder_fields(lines: Iterator[str], field_order: List[int]) -> Iterator[str]: + """Reorder fields in each line according to field_order + + :param lines: input lines + :param field_order: list of indices to reorder fields + :return: lines with fields reordered + :throws ValueError: if any line has missing fields + """ + max_column = max(field_order) + for line_num, line in enumerate(lines, start=1): + fields = line.rstrip('\r\n').split('\t') + if len(fields) <= max_column: + raise ValueError( + f'Expected at least {max_column} columns, but got {len(fields)} in line {line_num}' + ) + yield '\t'.join(fields[i] for i in field_order) + + +def read_input( + stdin=False, + src_file=None, + mt_file=None, + ref_file=None, + expected_fields=Defaults.DEF_FIELD_ORDER, + user_fields=Defaults.DEF_FIELD_ORDER, +): + """Read input files and reorder fields if necessary. + + This function modifies args dictionary in place. + :param args: command line arguments + :param model_id: model ID + :param schema: schema to use for the model + """ + + n_inputs = len(expected_fields) + assert 1 <= n_inputs <= 3, f'Invalid : {expected_fields}' + + if stdin: + assert 1 <= len(user_fields) <= 3 + reorder_idx = find_field_ordering(expected_fields, user_fields) + log.info(f'Input field mappings: {reorder_idx}; expected: {expected_fields}, given: {user_fields}') + return reorder_fields(sys.stdin, reorder_idx) + + n_inputs = len(expected_fields) + assert mt_file.exists(), 'File with hypotheses {mt_file} does not exist' + if 'src' in expected_fields: + assert src_file, f'Source file is required' + assert src_file.exists(), f'{src_file} does not exist' + if 'ref' in expected_fields: + assert ref_file, f'Reference file is required' + assert ref_file.exists(), f'{ref_file} does not exist' + + if expected_fields == ('src', 'mt'): + input_lines = zip_longest(open(src_file), open(mt_file)) + elif expected_fields == ('mt', 'ref'): + input_lines = zip_longest(open(mt_file), open(ref_file)) + elif expected_fields == ('src', 'mt', 'ref'): + input_lines = zip_longest(open(src_file), open(mt_file), open(ref_file)) + else: + raise ValueError(f'Unknown schema {expected_fields}') + + def _validate_and_join(): + for row in input_lines: + assert len(row) == n_inputs, f'Expected {n_inputs} columns, but got {len(row)}' + for col in row: + assert col is not None, f'Expected {n_inputs} columns, but got {len(row)}' + line = '\t'.join(col.strip() for col in row) + yield line + + return _validate_and_join() + + +def main(**args): + args = args or parse_args() + if args.pop('debug'): + log.getLogger().setLevel(log.DEBUG) + global DEBUG_MODE + DEBUG_MODE = True + log.debug(args) + else: + args['quiet'] = '' + + model_id = args.pop('model') + model_path = Path(model_id) + vocab_path = args.pop('vocab') + if vocab_path: # if user gave this arg, it must be a valid arg + assert vocab_path.exists(), f'Vocabulary file {vocab_path} does not exist' + + # if model arg is local path + if model_path.suffix.lower() in ('.npz', '.bin'): + assert model_path.exists() and model_path.is_file(), f'Model file {model_path} does not exist' + model_id = model_path.stem + assert args.get('like'), f'--like is required when --model is a local file' + if not vocab_path: # if vocab is not given, resolve it from model directory + vocab_path = model_path.parent / 'vocab.spm' + if not vocab_path.exists(): + raise Exception( + f'Vocabulary file {vocab_path} does not 
exist. Plese sepcify it with --vocab option.' + ) + else: # assume it is ID and resolve path from cache + model_id = model_id.lower() + try: + model_path = get_model_path(model_id) + if not vocab_path: # if vocab is not given, resolve it from cache + vocab_path = get_vocab_path(model_id) + args['like'] = Defaults.KNOWN_METRICS.get(model_id, Defaults.DEF_MODEL_TYPE) + except ValueError as e: + raise ValueError(f'Invalid model ID: {model_id}') from e + + args['model_file'] = model_path + args['vocab_file'] = vocab_path + + out = args.pop('out') + width = args.pop('width', Defaults.FLOAT_PRECISION) + average = args.pop('average', Defaults.AVERAGE) + print_cmd = args.pop('print_cmd', False) + + input_args = ('stdin', 'src_file', 'mt_file', 'ref_file', 'user_fields') + input_args = {k: args.pop(k) for k in input_args} + input_args['expected_fields'] = Defaults.MODEL_TYPES[args['like']] + model_args = args + + evaluator = Evaluator.new(**model_args) + if evaluator.model_type != args['like']: + log.warning(f'Config model type is {evaluator.model_type}, but given: {args["like"]}') + + input_lines = read_input(**input_args) + cmd_line = "marian evaluate " + evaluator._cli_string + if print_cmd: # print the command and exit + print(cmd_line) + return + else: + log.info("CLI:\t" + cmd_line) + + scores = evaluator.evaluate(input_lines, average=average) + + for i, score in enumerate(scores, start=1): + if isinstance(score, (tuple, list)): + score = score[0] # the first score + out.write(f'{score:.{width}f}\n') + out.close() + log.info(f'Wrote {i} lines to {out.name}') + + +if '__main__' == __name__: + main() diff --git a/src/python/pymarian/evaluate.py b/src/python/pymarian/evaluate.py deleted file mode 100755 index 371a37006..000000000 --- a/src/python/pymarian/evaluate.py +++ /dev/null @@ -1,350 +0,0 @@ -#!/usr/bin/env python -# -# This is a python wrapper for marian evaluate command -# -import argparse -import itertools -import logging as log -import shutil -import subprocess -import sys -import threading -import yaml - -from pathlib import Path -from typing import Iterator, List, Optional, Tuple, Union - -from .constants import Defaults -from .utils import get_known_model - -log.basicConfig(level=log.INFO) -DEBUG_MODE = False - - -def copy_lines_to_stdin(proc, lines: Iterator[str]): - """Write data to subproc stdin. Note: run this on another thread to avoid deadlock - This function reads streams, and write them as TSV record to the stdin of the sub process. - :param proc: subprocess object to write to - """ - - for line in lines: - # line = line.rstrip('\n') + '\n' - proc.stdin.write(line) - proc.stdin.flush() - proc.stdin.close() # close stdin to signal end of input - - -def marian_evaluate( - model: Path, - input_lines: Iterator[str], - vocab_file: Path = None, - devices: Optional[List[int]] = None, - width=Defaults.FLOAT_PRECISION, - mini_batch=Defaults.MINI_BATCH, - like=Defaults.DEF_SCHEMA, - maxi_batch=Defaults.MAXI_BATCH, - workspace=Defaults.WORKSPACE, - max_length=Defaults.MAX_LENGTH, - cpu_threads=0, - average: str = Defaults.AVERAGE, - backend='subprocess', -) -> Iterator[Union[float, Tuple[float, float]]]: - """Run 'marian evaluate' as a subprocess or using pymarian, read input and write scores - Depending on the `model` argument, either a single score or a tuple of scores is returned per input line. 
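For readers following the new `main()` flow above, here is a condensed sketch of the same path taken programmatically through `Evaluator.new`. The model and vocabulary locations below are only examples that follow the cache layout used by `get_model_path`/`get_vocab_path`; they are not guaranteed paths.

```python
# Sketch only: scoring src<TAB>mt pairs with a comet-qe style metric.
from pathlib import Path
from pymarian import Evaluator

cache = Path.home() / ".cache" / "marian" / "metric" / "wmt22-cometkiwi-da"
evaluator = Evaluator.new(
    model_file=cache / "model.wmt22-cometkiwi-da.bin",
    vocab_file=cache / "vocab.spm",
    like="comet-qe",   # field order for comet-qe is (src, mt)
    cpu_threads=4,
    fp16=False,        # must be passed explicitly: new() pops 'fp16' unconditionally
)
lines = ["Das ist ein Test.\tThis is a test."]
for score in evaluator.evaluate(lines, average="skip"):
    print(f"{score:.4f}")
```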
- :param model: path to model file, or directory containing model.npz.best-embed.npz - :param vocab: path to vocabulary file (optional; if not given, assumed to be in the same directory as the model) - :param devices: list of GPU devices to use (optional; if not given, decision is let to marian process) - :param width: float precision - :param mini_batch: mini-batch size (default: 16) - :param like: marian embedding model like (default: comet-qe) - :param cpu_threads: number of CPU threads to use (default: 0) - :param: average: average segment scores to produce system score. - skip=do not output average (default; segment scores only); - append=append average at the end; - only=output the average only (i.e. system score only) - :param backend: subprocess or pymarian - :return: iterator over scores. - """ - - assert model.exists() - if model.is_dir(): - model_dir = model - _model_files = list(model.glob("*.npz")) - assert len(_model_files) == 1, f'Expected exactly one model file in {model_dir}' - model_file = _model_files[0] - else: - assert model.is_file() - model_dir = model.parent - model_file = model - if not vocab_file: - _vocab_files = list(model_dir.glob('*.spm')) - assert len(_vocab_files) == 1, f'Expected exactly one vocab file in {model_dir}' - vocab_file = _vocab_files[0] - - assert model_file.exists(), f'Model file {model_file} does not exist' - assert vocab_file.exists(), f'Vocab file {vocab_file} does not exist' - - n_inputs = len(Defaults.KNOWN_SCHEMA[like].split('+')) - vocabs = [vocab_file] * n_inputs - kwargs = dict( - model=model_file, - vocabs=vocabs, - devices=devices, - width=width, - like=like, - mini_batch=mini_batch, - maxi_batch=maxi_batch, - max_length=max_length, - max_length_crop=True, - workspace=workspace, # negative memory => relative to total memory - cpu_threads=cpu_threads, - average=average, - ) - if backend == 'pymarian': - # handled separately for pymarian due to minibatching and iterator input - # TODO: remove this when iterator is supported in evaluator C++ API - kwargs['average'] = 'skip' - - cmd_line = [] - for key, val in kwargs.items(): - if val is None: # ignore this key / flag - continue - cmd_line.append(f"--{key.replace('_', '-')}") - if val is True: # boolean flag - cmd_line.append('true') - elif val is False: - cmd_line.append('false') - - elif isinstance(val, (list, tuple)): - cmd_line.extend(str(v) for v in val) - else: - cmd_line.append(str(val)) - if not DEBUG_MODE: - cmd_line.append('--quiet') - if backend == 'subprocess': - return subprocess_evaluate(cmd_line, input_lines) - elif backend == 'pymarian': - cmd_line = ' '.join(cmd_line) - batch_size = mini_batch * maxi_batch - return pymarian_evaluate(cmd_line, input_lines, batch_size=batch_size, average=average) - else: - raise ValueError(f'Unknown backend {backend}') - - -def pymarian_evaluate( - cmd_line: str, input_lines: Iterator[str], average=Defaults.AVERAGE, batch_size=int(Defaults.MINI_BATCH * Defaults.MAXI_BATCH) -): - try: - from pymarian import Evaluator - except: - raise ImportError('pymarian is not installed. 
Please install it and rerun') - - log.info(f'Marian CLI::\n\t{cmd_line}') - - evaluator = Evaluator(cmd_line) - config = yaml.safe_load(evaluator.get_model_config()) - log.info(f'Model config: {config}') - - assert average in ('skip', 'append', 'only') - lines = (line.rstrip('\n').split('\t') for line in input_lines) - - # NOTE: pymarian doesn't support iterator input yet; so mini batching here - def make_mini_batches(lines, batch_size=batch_size): - assert batch_size > 0 - while True: - chunk = list(itertools.islice(lines, batch_size)) - if not chunk: - return - yield chunk - - total, count = 0.0, 0 - for batch in make_mini_batches(lines): - scores = evaluator.evaluate(batch) - assert len(scores) == len(batch) - for score in scores: - if isinstance(score, (tuple, list)): - score = score[0] - total += score - count += 1 - if average != 'only': # skip or append - yield score - - if average != 'skip': - yield total / count - - -def subprocess_evaluate(cmd_line: List[str], input_lines: Iterator[str]): - assert isinstance(cmd_line, list) - marian_bin_path = shutil.which('marian') - if marian_bin_path is None: - raise FileNotFoundError('marian binary not found in PATH. Please add it and rerun') - cmd_line = [marian_bin_path, 'evaluate'] + cmd_line - - proc = None - try: - proc = subprocess.Popen( - cmd_line, - shell=False, - stdout=subprocess.PIPE, - stdin=subprocess.PIPE, - stderr=sys.stderr, - text=True, - encoding='utf8', - errors='replace', - ) - log.info(f'Running command: {" ".join(cmd_line)}') - copy_thread = threading.Thread(target=copy_lines_to_stdin, args=(proc, input_lines)) - - copy_thread.start() - # read output and yield scores - for line in proc.stdout: - line = line.rstrip() - if ' ' in line: - yield tuple(float(x) for x in line.split(' ')) - else: - yield float(line) - - # wait for copy thread to finish - copy_thread.join() - # proc.stdin.close() - returncode = proc.wait() - if returncode != 0: - raise RuntimeError(f'Process exited with code {returncode}') - finally: - if proc is not None and proc.returncode is None: - log.warning(f'Killing process {proc.pid}') - proc.kill() - - -def parse_args(): - parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - parser.add_argument( - '-m', - '--model', - help=f'Model name, or path. Known models={list(Defaults.KNOWN_METRICS.keys())}', - default=Defaults.DEF_MODEL, - type=str, - ) - - parser.add_argument( - '--stdin', - action='store_true', - help='Read input from stdin. TSV file with following format: \ - QE metrics: "srcmt", Comet with ref: "srcref; or BLEURT: "refmt"', - ) - parser.add_argument('-t', '--mt', dest='mt_file', help='MT output file. Ignored when --stdin.', type=Path) - parser.add_argument('-s', '--src', dest='src_file', help='Source file. Ignored when --stdin', type=Path) - parser.add_argument('-r', '--ref', dest='ref_file', help='Ref file. Ignored when --stdin', type=Path) - parser.add_argument( - '-o', '--out', default=sys.stdout, help='output file. Default: stdout', type=argparse.FileType('w') - ) - parser.add_argument( - '-a', - '--average', - choices=('skip', 'append', 'only'), - default='skip', - help='Average segment scores to produce system score.' - ' skip=do not output average (default; segment scores only);' - ' append=append average at the end; ' - ' only=output the average only (i.e. 
system score only)', - ) - - parser.add_argument('-w', '--width', default=4, help='Output score width', type=int) - parser.add_argument('--debug', help='Verbose output', action='store_true') - parser.add_argument('--mini-batch', default=16, help='Mini-batch size', type=int) - group = parser.add_mutually_exclusive_group() - group.add_argument('-d', '--devices', nargs='*', type=int, help='GPU device IDs') - group.add_argument( - '-c', '--cpu-threads', default=None, type=int, help='Use CPU threads. 0=use GPU device 0' - ) - parser.add_argument('-ws', '--workspace', default=8000, help='Workspace memory', type=int) - parser.add_argument( - '--backend', - default='pymarian', - choices=['subprocess', 'pymarian'], - help='Marian backend interface. subprocess=look for marian binary in PATH. pymarian=pybind wrapper', - ) - - args = parser.parse_args() - return vars(args) - - -def read_input(args, model_id, schema=None): - model_schema = Defaults.KNOWN_METRICS.get(model_id, schema or Defaults.DEF_SCHEMA) - input_schema = Defaults.KNOWN_SCHEMA[model_schema] - n_inputs = len(input_schema.split('+')) - if args.pop('stdin'): - del args['mt_file'] - del args['src_file'] - del args['ref_file'] - return sys.stdin - - n_inputs = len(input_schema.split('+')) - mt_file = args.pop('mt_file') - src_file = args.pop('src_file') - ref_file = args.pop('ref_file') - assert mt_file.exists(), f'{mt_file} does not exist' - if 'src' in input_schema: - assert src_file, f'Source file is required for metric {model_id}' - assert src_file.exists(), f'{src_file} does not exist' - if 'ref' in input_schema: - assert ref_file, f'Reference file is required for metric {model_id}' - assert ref_file.exists(), f'{ref_file} does not exist' - if input_schema == 'src+mt': - input_lines = itertools.zip_longest(open(src_file), open(mt_file)) - elif input_schema == 'src+ref+mt': - input_lines = itertools.zip_longest(open(src_file), open(ref_file), open(mt_file)) - elif input_schema == 'src+mt+ref': - input_lines = itertools.zip_longest(open(src_file), open(mt_file), open(ref_file)) - elif input_schema == 'ref+mt': - input_lines = itertools.zip_longest(open(ref_file), open(mt_file)) - else: - raise ValueError(f'Unknown schema {input_schema}') - - def _validate_and_join(): - for row in input_lines: - assert len(row) == n_inputs, f'Expected {n_inputs} columns, but got {len(row)}' - for col in row: - assert col is not None, f'Expected {n_inputs} columns, but got {len(row)}' - yield '\t'.join(row) - - return _validate_and_join() - - -def main(**args): - args = args or parse_args() - if args.pop('debug'): - log.getLogger().setLevel(log.DEBUG) - global DEBUG_MODE - DEBUG_MODE = True - log.debug(args) - - model_id = args.pop('model') - if model_id.lower() in Defaults.KNOWN_METRICS: - model_path, vocab = get_known_model(model_id.lower()) - log.info(f'{model_id} --> {model_path}') - else: - model_path, vocab = Path(model_id), None - assert ( - model_path.exists() - ), f'{model_path} does not exist. 
Known models are {list(Defaults.KNOWN_METRICS.keys())}' - args['model'] = model_path - args['vocab_file'] = vocab - - args['input_lines'] = read_input(args, model_id=model_id) - args['like'] = Defaults.KNOWN_METRICS.get(model_id, Defaults.DEF_SCHEMA) - out = args.pop('out') - width = args.pop('width', Defaults.FLOAT_PRECISION) - scores = marian_evaluate(**args) - for i, score in enumerate(scores, start=1): - if isinstance(score, (tuple, list)): - score = score[0] # the first score - out.write(f'{score:.{width}f}\n') - out.close() - - log.info(f'Wrote {i} lines to {out.name}') - - -if '__main__' == __name__: - main() diff --git a/src/python/pymarian/mtapi_server.py b/src/python/pymarian/mtapi_server.py index 4391a3101..fd11ba2bb 100755 --- a/src/python/pymarian/mtapi_server.py +++ b/src/python/pymarian/mtapi_server.py @@ -9,12 +9,11 @@ import logging as log from typing import List +import pymarian from flask import Flask, request from sacremoses import MosesPunctNormalizer from sentence_splitter import SentenceSplitter -import pymarian - log.basicConfig(level=log.INFO) diff --git a/src/python/pymarian/pypdl/__init__.py b/src/python/pymarian/pypdl/__init__.py new file mode 100644 index 000000000..6d670df6e --- /dev/null +++ b/src/python/pymarian/pypdl/__init__.py @@ -0,0 +1 @@ +from .main import Downloader diff --git a/src/python/pymarian/pypdl/downloader.py b/src/python/pymarian/pypdl/downloader.py new file mode 100644 index 000000000..ac4dd2f98 --- /dev/null +++ b/src/python/pymarian/pypdl/downloader.py @@ -0,0 +1,97 @@ +import copy +import logging +import time +from pathlib import Path +from threading import Event +from typing import Dict + +import requests + +MEGABYTE = 1048576 + + +class BasicDownloader: + """Base downloader class.""" + + def __init__(self, interrupt: Event): + self.curr = 0 # Downloaded size in bytes (current size) + self.completed = False + self.id = 0 + self.interrupt = interrupt + self.downloaded = 0 + + def download(self, url: str, path: str, mode: str, **kwargs) -> None: + """Download data in chunks.""" + try: + with open(path, mode) as file, requests.get(url, stream=True, **kwargs) as response: + for chunk in response.iter_content(MEGABYTE): + file.write(chunk) + self.curr += len(chunk) + self.downloaded += len(chunk) + + if self.interrupt.is_set(): + break + + except Exception as e: + self.interrupt.set() + time.sleep(1) + logging.error("(Thread: %d) [%s: %s]", self.id, type(e).__name__, e) + + +class Simpledown(BasicDownloader): + """Class for downloading the whole file in a single segment.""" + + def __init__( + self, + url: str, + file_path: str, + interrupt: Event, + **kwargs, + ): + super().__init__(interrupt) + self.url = url + self.file_path = file_path + self.kwargs = kwargs + + def worker(self) -> None: + self.download(self.url, self.file_path, mode="wb", **self.kwargs) + self.completed = True + + +class Multidown(BasicDownloader): + """Class for downloading a specific segment of the file.""" + + def __init__( + self, + segement_table: Dict, + segment_id: int, + interrupt: Event, + **kwargs, + ): + super().__init__(interrupt) + self.id = segment_id + self.segement_table = segement_table + self.kwargs = kwargs + + def worker(self) -> None: + url = self.segement_table["url"] + segment_path = Path(self.segement_table[self.id]["segment_path"]) + start = self.segement_table[self.id]["start"] + end = self.segement_table[self.id]["end"] + size = self.segement_table[self.id]["segment_size"] + + if segment_path.exists(): + downloaded_size = 
segment_path.stat().st_size + if downloaded_size > size: + segment_path.unlink() + else: + self.curr = downloaded_size + + if self.curr < size: + start = start + self.curr + kwargs = copy.deepcopy(self.kwargs) # since used by others + kwargs.setdefault("headers", {}).update({"range": f"bytes={start}-{end}"}) + self.download(url, segment_path, "ab", **kwargs) + + if self.curr == size: + self.completed = True diff --git a/src/python/pymarian/pypdl/main.py b/src/python/pymarian/pypdl/main.py new file mode 100644 index 000000000..1f85575d7 --- /dev/null +++ b/src/python/pymarian/pypdl/main.py @@ -0,0 +1,234 @@ +import logging +import sys +import time +from collections import deque +from concurrent.futures import ThreadPoolExecutor +from threading import Event +from typing import Callable, Optional, Union + +import requests +from tqdm import tqdm + +from .downloader import Multidown, Simpledown +from .utils import ( + AutoShutdownFuture, + FileValidator, + combine_files, + create_segment_table, + get_filepath, + seconds_to_hms, + to_mb, +) + +logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO) + + +class Downloader: + """ + A multi-threaded file downloader that supports progress tracking, retries, pause/resume functionality etc. + + Keyword Arguments: + params (dict, optional): A dictionary, list of tuples or bytes to send as a query string. Default is None. + allow_redirects (bool, optional): A Boolean to enable/disable redirection. Default is True. + auth (tuple, optional): A tuple to enable a certain HTTP authentication. Default is None. + cert (str or tuple, optional): A String or Tuple specifying a cert file or key. Default is None. + cookies (dict, optional): A dictionary of cookies to send to the specified url. Default is None. + headers (dict, optional): A dictionary of HTTP headers to send to the specified url. Default is None. + proxies (dict, optional): A dictionary of the protocol to the proxy url. Default is None. + timeout (number or tuple, optional): A number, or a tuple, indicating how many seconds to wait for the client to make a connection and/or send a response. Default is 10 seconds. + verify (bool or str, optional): A Boolean or a String indication to verify the servers TLS certificate or not. Default is True. 
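To make the role of the vendored downloader concrete, a small usage sketch follows; the URL and output path are placeholders, and the keyword arguments mirror how `maybe_download_file` in `pymarian/utils.py` calls it further below.

```python
# Sketch: multi-threaded download split into byte-range segments.
from pymarian.pypdl import Downloader

dl = Downloader(timeout=30)                   # extra kwargs are forwarded to `requests`
result = dl.start(
    url="https://example.com/big-model.bin",  # placeholder URL
    file_path="big-model.bin",
    segments=20,      # number of byte-range workers
    display=True,     # tqdm progress bar
    block=True,       # wait for completion
    retries=3,
    etag=False,
)
if result is not None:                        # FileValidator on success, None on failure
    print("sha256:", result.calculate_hash("sha256"))
```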
+ """ + + def __init__(self, **kwargs): + self._pool = None # ThreadPoolExecutor, initialized in _downloader + self._workers = [] + self._interrupt = Event() + self._stop = False + self._kwargs = {"timeout": 10, "allow_redirects": True} # request module kwargs + self._kwargs.update(kwargs) + + # public attributes + self.size = None + self.progress = 0 + self.speed = 0 + self.time_spent = 0 + self.current_size = 0 + self.eta = "99:59:59" + self.remaining = None + self.failed = False + self.completed = False + + def _display(self, dynamic_print): + dynamic_print.update(self.current_size - dynamic_print.n) + + def _calc_values(self, recent_queue, interval): + self.current_size = sum(worker.curr for worker in self._workers) + + # Speed calculation + recent_queue.append(sum(worker.downloaded for worker in self._workers)) + non_zero_list = [to_mb(value) for value in recent_queue if value] + if len(non_zero_list) < 1: + self.speed = 0 + elif len(non_zero_list) == 1: + self.speed = non_zero_list[0] / interval + else: + diff = [b - a for a, b in zip(non_zero_list, non_zero_list[1:])] + self.speed = (sum(diff) / len(diff)) / interval + + if self.size: + self.progress = int(100 * self.current_size / self.size) + self.remaining = to_mb(self.size - self.current_size) + + if self.speed: + self.eta = seconds_to_hms(self.remaining / self.speed) + else: + self.eta = "99:59:59" + + def _single_thread(self, url, file_path): + sd = Simpledown(url, file_path, self._interrupt, **self._kwargs) + self._workers.append(sd) + self._pool.submit(sd.worker) + + def _multi_thread(self, segments, segement_table): + for segment in range(segments): + md = Multidown( + segement_table, + segment, + self._interrupt, + **self._kwargs, + ) + self._workers.append(md) + self._pool.submit(md.worker) + + def _get_header(self, url): + kwargs = self._kwargs.copy() + kwargs.pop("params", None) + response = requests.head(url, **kwargs) + + if response.status_code != 200: + self._interrupt.set() + raise ConnectionError(f"Server Returned: {response.reason}({response.status_code}), Invalid URL") + + return response.headers + + def _get_info(self, url, file_path, multithread, etag): + header = self._get_header(url) + file_path = get_filepath(url, header, file_path) + + if size := int(header.get("Content-Length", 0)): + self.size = size + + return file_path, multithread, etag + + def _downloader(self, url, file_path, segments, display, multithread, etag): + start_time = time.time() + + file_path, multithread, etag = self._get_info(url, file_path, multithread, etag) + + if multithread: + segment_table = create_segment_table(url, file_path, segments, self.size, etag) + segments = segment_table["segments"] + self._pool = ThreadPoolExecutor(max_workers=segments) + self._multi_thread(segments, segment_table) + else: + self._pool = ThreadPoolExecutor(max_workers=1) + self._single_thread(url, file_path) + + interval = 0.15 + recent_queue = deque([0] * 12, maxlen=12) + download_mode = "Multi-Threaded" if multithread else "Single-Threaded" + + with tqdm(total=self.size, desc=f"Downloading ({download_mode})", dynamic_ncols=True, unit='B', unit_scale=True, miniters=1) as dynamic_print: + while True: + status = sum(worker.completed for worker in self._workers) + self._calc_values(recent_queue, interval) + + if display: + self._display(dynamic_print) + + if self._interrupt.is_set(): + self.time_spent = time.time() - start_time + return None + + if status == len(self._workers): + if multithread: + combine_files(file_path, segments) + self.completed = 
True + self.time_spent = time.time() - start_time + return FileValidator(file_path) + + time.sleep(interval) + + def stop(self) -> None: + """Stop the download process.""" + self._interrupt.set() + self._stop = True + time.sleep(1) # wait for threads + + def start( + self, + url: str, + file_path: Optional[str] = None, + segments: int = 10, + display: bool = True, + multithread: bool = True, + block: bool = True, + retries: int = 0, + mirror_func: Optional[Callable[[], str]] = None, + etag: bool = True, + ) -> Union[AutoShutdownFuture, FileValidator, None]: + """ + Start the download process. + + Parameters: + url (str): The URL to download from. + file_path (str, Optional): The path to save the downloaded file. If not provided, the file is saved in the current working directory. + If `file_path` is a directory, the file is saved in that directory. If `file_path` is a file name, the file is saved with that name. + segments (int, Optional): The number of segments to divide the file into for multi-threaded download. Default is 10. + display (bool, Optional): Whether to display download progress and other messages. Default is True. + multithread (bool, Optional): Whether to use multi-threaded download. Default is True. + block (bool, Optional): Whether to block the function until the download is complete. Default is True. + retries (int, Optional): The number of times to retry the download if it fails. Default is 0. + mirror_func (Callable[[], str], Optional): A function that returns a new download URL if the download fails. Default is None. + etag (bool, Optional): Whether to validate the ETag before resuming downloads. Default is True. + + Returns: + AutoShutdownFuture: If `block` is False. + FileValidator: If `block` is True and the download successful. + None: If `block` is True and the download fails. + """ + + def download(): + for i in range(retries + 1): + try: + _url = mirror_func() if i > 0 and callable(mirror_func) else url + if i > 0 and display: + logging.info("Retrying... 
(%d/%d)", i, retries) + + self.__init__(**self._kwargs) + result = self._downloader(_url, file_path, segments, display, multithread, etag) + + if self._stop or self.completed: + if display: + print(f"Time elapsed: {seconds_to_hms(self.time_spent)}", file=sys.stderr) + return result + + time.sleep(3) + + except Exception as e: + logging.error("(%s) [%s]", e.__class__.__name__, e) + + finally: + self._pool.shutdown() + + self.failed = True + return None + + ex = ThreadPoolExecutor(max_workers=1) + future = AutoShutdownFuture(ex.submit(download), ex) + + if block: + result = future.result() + return result + + return future diff --git a/src/python/pymarian/pypdl/utils.py b/src/python/pymarian/pypdl/utils.py new file mode 100644 index 000000000..0c48642d2 --- /dev/null +++ b/src/python/pymarian/pypdl/utils.py @@ -0,0 +1,127 @@ +import hashlib +import json +import time +from concurrent.futures import Executor, Future +from pathlib import Path +from typing import Dict, Union +from urllib.parse import unquote, urlparse + +MEGABYTE = 1048576 +BLOCKSIZE = 4096 +BLOCKS = 1024 +CHUNKSIZE = BLOCKSIZE * BLOCKS + + +def to_mb(size_in_bytes: int) -> float: + return size_in_bytes / MEGABYTE + + +def seconds_to_hms(sec: float) -> str: + time_struct = time.gmtime(sec) + return time.strftime("%H:%M:%S", time_struct) + + +def get_filepath(url: str, headers: Dict, file_path: str) -> str: + content_disposition = headers.get("Content-Disposition", None) + + if content_disposition and "filename=" in content_disposition: + filename_start = content_disposition.index("filename=") + len("filename=") + filename = content_disposition[filename_start:] # Get name from headers + filename = unquote(filename.strip('"')) # Decode URL encodings + else: + filename = unquote(urlparse(url).path.split("/")[-1]) # Generate name from url + + if file_path: + file_path = Path(file_path) + if file_path.is_dir(): + return str(file_path / filename) + return str(file_path) + return filename + + +def create_segment_table(url: str, file_path: str, segments: str, size: int, etag: Union[str, bool]) -> Dict: + """Create a segment table for multi-threaded download.""" + segments = 5 if (segments > 5) and (to_mb(size) < 50) else segments + progress_file = Path(file_path + ".json") + + try: + progress = json.loads(progress_file.read_text()) + if etag and progress["url"] == url and progress["etag"] == etag: + segments = progress["segments"] + except Exception: + pass + + progress_file.write_text( + json.dumps( + {"url": url, "etag": etag, "segments": segments}, + indent=4, + ) + ) + + dic = {"url": url, "segments": segments} + partition_size = size / segments + for segment in range(segments): + start = int(partition_size * segment) + end = int(partition_size * (segment + 1)) + segment_size = end - start + if segment != (segments - 1): + end -= 1 # [0-100, 100-200] -> [0-99, 100-200] + # No segment_size+=1 for last setgment since final byte is end byte + + dic[segment] = { + "start": start, + "end": end, + "segment_size": segment_size, + "segment_path": f"{file_path }.{segment}", + } + + return dic + + +def combine_files(file_path: str, segments: int) -> None: + """Combine the downloaded file segments into a single file.""" + with open(file_path, "wb") as dest: + for segment in range(segments): + segment_file = f"{file_path}.{segment}" + with open(segment_file, "rb") as src: + while True: + chunk = src.read(CHUNKSIZE) + if chunk: + dest.write(chunk) + else: + break + Path(segment_file).unlink() + + progress_file = Path(f"{file_path}.json") + 
progress_file.unlink() + + +class FileValidator: + """A class used to validate the integrity of the file.""" + + def __init__(self, path: str): + self.path = path + + def calculate_hash(self, algorithm: str, **kwargs) -> str: + hash_obj = hashlib.new(algorithm, **kwargs) + with open(self.path, "rb") as file: + for chunk in iter(lambda: file.read(4096), b""): + hash_obj.update(chunk) + return hash_obj.hexdigest() + + def validate_hash(self, correct_hash: str, algorithm: str, **kwargs) -> bool: + file_hash = self.calculate_hash(algorithm, **kwargs) + return file_hash == correct_hash + + +class AutoShutdownFuture: + """A Future object wrapper that shuts down the executor when the result is retrieved.""" + + def __init__(self, future: Future, executor: Executor): + self.future = future + self.executor = executor + + def result(self, timeout: float = None) -> Union[FileValidator, None]: + result = self.future.result(timeout) + self.executor.shutdown() + return result diff --git a/src/python/pymarian/qtdemo.py b/src/python/pymarian/qtdemo.py index e95d0bf12..fe99784c3 100644 --- a/src/python/pymarian/qtdemo.py +++ b/src/python/pymarian/qtdemo.py @@ -1,13 +1,12 @@ import sys import time +import pymarian from PyQt5.QtGui import * from PyQt5.QtWidgets import * from sacremoses import MosesPunctNormalizer, MosesTokenizer from sentence_splitter import SentenceSplitter -import pymarian - class Example(QWidget): def __init__(self): diff --git a/src/python/pymarian/utils.py b/src/python/pymarian/utils.py index 16e2e3c22..c3a4efab0 100644 --- a/src/python/pymarian/utils.py +++ b/src/python/pymarian/utils.py @@ -1,64 +1,135 @@ #!/usr/bin/env python # # This is a python wrapper for marian evaluate command -# created by Thamme Gowda on 2023-09-07 -# + import logging as log import shutil from pathlib import Path +from typing import List, Tuple +import portalocker import requests -from tqdm.auto import tqdm -from .constants import Defaults +from .defaults import Defaults +from .pypdl import Downloader log.basicConfig(level=log.INFO) + DEBUG_MODE = False +PROGRESS_BAR = Defaults.PROGRESS_BAR + + +class InvalidIDException(ValueError): + """Invalid model ID exception""" + + pass + + +def validate_id(id: str) -> bool: + invalid_chars = ['/', '\\', ':', '*', '?', '"', '<', '>', '|'] + for c in invalid_chars: + if c in id: + raise InvalidIDException( + f'Invalid model id {id}. It must not contain characters: {invalid_chars}' + ) -def get_known_model(model_name): - """Given a known model name, this functin gets the checkpoint and vocabulary paths. - This function downloads and extracts model files to a local cache directory if necessary. - - Specifically, checkpoint file must have model*.npz and vocab*.spm files in the resolved model directory. +def get_model_path(model_name, progress_bar: bool = PROGRESS_BAR) -> Path: + """Given the name of a (known) model, this function gets its checkpoint path. + If necessary, this function downloads checkpoint to a local cache directory. 
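The cache helpers defined here resolve a known metric name to local files; a short sketch of typical use is given below, with the model ID being one of the names listed in `Defaults.KNOWN_METRICS`. (Note in passing that `get_vocab_path` below is annotated as returning `Tuple[Path, Path]` although it returns a single vocabulary path.)

```python
# Sketch: resolving a known metric ID to cached checkpoint and vocabulary files.
# The cache layout follows Defaults.BASE_URL and Defaults.CACHE_PATH from defaults.py.
from pymarian.utils import get_model_path, get_vocab_path

model_id = "wmt22-cometkiwi-da"            # any key of Defaults.KNOWN_METRICS
model_path = get_model_path(model_id)      # ~/.cache/marian/metric/<id>/model.<id>.bin
vocab_path = get_vocab_path(model_id)      # ~/.cache/marian/metric/<id>/vocab.spm
print(model_path, vocab_path)
```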
+ :param model_name: model name - :return: checkpoint path, vocabulary path + :return: checkpoint path """ - assert model_name in Defaults.KNOWN_METRICS, f'Unknown model {model_name}' + validate_id(model_name) + chkpt_url = f'{Defaults.BASE_URL}/{model_name}/model.{model_name}.bin' - model_url = f'{Defaults.BASE_URL}/{model_name}.tgz' - local_file = Defaults.CACHE_PATH / f'{model_name}.tgz' local_dir = Defaults.CACHE_PATH / model_name - maybe_download_file(model_url, local_file) - maybe_extract(local_file, local_dir) - checkpt_file = list(local_dir.glob('model*.npz')) - vocab_file = list(local_dir.glob('vocab*.spm')) - assert len(checkpt_file) == 1, f'Expected exactly one model file in {local_dir}' - assert len(vocab_file) == 1, f'Expected exactly one vocab file in {local_dir}' - checkpt_file = checkpt_file[0] - vocab_file = vocab_file[0] - return checkpt_file, vocab_file - - -def maybe_download_file(url, local_file: Path): + chkpt_local = local_dir / f'model.{model_name}.bin' + + maybe_download_file(chkpt_url, chkpt_local) + assert chkpt_local.exists(), f'Checkpoint file {chkpt_local} does not exist' + return chkpt_local + + +def get_vocab_path(model_name, progress_bar: bool = PROGRESS_BAR) -> Tuple[Path, Path]: + """Given the name of a (known) model, this function gets its vocabulary path. + This function downloads vocabulary to a local cache directory, if necessary. + + :param model_name: model name + :param progress_bar: show progress bar while downloading + :return: checkpoint path, vocabulary path + """ + validate_id(model_name) + local_dir = Defaults.CACHE_PATH / model_name + vocab_local = local_dir / 'vocab.spm' + + vocab_url = f'{Defaults.BASE_URL}/{model_name}/vocab.spm' + maybe_download_file(vocab_url, vocab_local, progress_bar=progress_bar) + assert vocab_local.exists(), f'Vocabulary file {vocab_local} does not exist' + return vocab_local + + +def maybe_download_file(url: str, local_file: Path, progress_bar: bool = PROGRESS_BAR): """Downloads the file if not already downloaded :param url: url to download :param local_file: local file path + :param progress_bar: show progress bar while downloading + :return: None + :raises: ValueError if the url is invalid """ - flag_file = local_file.with_name(local_file.name + '._OK') - if local_file.exists() and flag_file.exists(): - log.info(f'Using cached file {local_file}') + lock_file = local_file.with_name('._LOCK_' + local_file.name) + if local_file.exists() and local_file.stat().st_size > 0: + log.debug(f'Using cached file {local_file}') return - log.info(f'Downloading {url} to {local_file}') + + # check if the url has OK status; avoid creating cache directories when url is invalid due to bad model ID + if not is_ok_url(url): + raise ValueError(f'Invalid URL: {url}') + local_file.parent.mkdir(parents=True, exist_ok=True) - with requests.get(url, stream=True) as r: - r.raise_for_status() - file_size = int(r.headers.get('Content-Length', 0)) - with tqdm.wrapattr(r.raw, "read", total=file_size, desc='Downloading', dynamic_ncols=True) as r_raw: - with open(local_file, "wb") as f: - shutil.copyfileobj(r_raw, f) - flag_file.touch() + with portalocker.Lock(lock_file, 'w', timeout=Defaults.FILE_LOCK_TIMEOUT) as fh: + # check again if it is downloaded by another process while we were waiting for the lock + if local_file.exists() and local_file.stat().st_size > 0: + log.debug(f'Using cached file {local_file}') + return + + # use file lock to avoid race of parallel downloads + local_file.parent.mkdir(parents=True, exist_ok=True) + + tmp_file = 
local_file.with_name(local_file.name + '.downloading') + log.info(f'Downloading {url} to {tmp_file}') + dl = Downloader() + dl.start( + url=url, + file_path=tmp_file, + segments=20, + display=progress_bar, + multithread=True, + block=True, + retries=3, + mirror_func=None, + etag=False, + ) + + if dl.completed: + # move the file to the final location + if local_file.exists(): + local_file.unlink() + shutil.move(tmp_file, local_file) + + +def is_ok_url(url: str) -> bool: + """Checks if the given url has OK status code by making a HEAD request + :param url: url + :return: True if status is OK, False otherwise + """ + try: + return requests.head(url).status_code == requests.codes.ok + except requests.exceptions.RequestException as e: + log.error(f'Invalid URL: {url}') + return False def maybe_extract(archive: Path, outdir: Path) -> Path: @@ -79,9 +150,9 @@ def maybe_extract(archive: Path, outdir: Path) -> Path: def kwargs_to_cli(**kwargs) -> str: - """Converts kwargs to cli args + """Converts kwargs to command line arguments string :param kwargs: kwargs - :return: cli args + :return: CLI string """ args = [] for k, v in kwargs.items(): @@ -89,7 +160,7 @@ def kwargs_to_cli(**kwargs) -> str: continue # ignore keys if values are None k = k.replace('_', '-') args.append(f'--{k}') - if v is '': + if v == '': continue # only add keys for empty values elif isinstance(v, bool): args.append("true" if v else "false") @@ -97,5 +168,5 @@ def kwargs_to_cli(**kwargs) -> str: args.extend(str(x) for x in v) else: args.append(f'{v}') - return ' '.join(args) + diff --git a/src/python/pyproject.toml b/src/python/pyproject.toml index 84d1b0e8f..f2008a924 100644 --- a/src/python/pyproject.toml +++ b/src/python/pyproject.toml @@ -29,13 +29,14 @@ classifiers = [ ] dependencies = [ + "portalocker", + "pyyaml", "tqdm", "requests", - "pyyaml" ] [project.scripts] -pymarian-evaluate = "pymarian.evaluate:main" +pymarian-eval = "pymarian.eval:main" pymarian-qtdemo = "pymarian.qtdemo:main" pymarian-mtapi = "pymarian.mtapi_server:main" @@ -54,7 +55,7 @@ include-package-data = true [tool.black] line-length = 110 target-version = ['py37', 'py38', 'py39', 'py310', 'py311'] -include = 'src/python/.*\.pyi?$' +include = '.*\.pyi?$' skip-string-normalization = true # black doesn't sort imports. So we use isort for that. 
See discussion https://github.com/psf/black/issues/333 diff --git a/src/python/setup.py b/src/python/setup.py index 01d3a0f5f..bcbca2c63 100644 --- a/src/python/setup.py +++ b/src/python/setup.py @@ -1,10 +1,10 @@ import os -import platform import shutil import sys - +import sysconfig from pathlib import Path -from setuptools import setup, find_namespace_packages, Distribution + +from setuptools import Distribution, find_namespace_packages, setup """ This script expects _pymarian.*.so to be present in $CMAKE_BINARY_DIR @@ -17,12 +17,9 @@ CMAKE_BINARY_DIR = os.getenv("CMAKE_BINARY_DIR", DEF_CMAKE_BINARY_DIR) print("\t>>>CMAKE_BINARY_DIR is ", CMAKE_BINARY_DIR) -if platform.system() == 'Windows': - NATIVE_EXT_GLOB = '_pymarian.*.pyd' -elif platform.system() == 'Darwin': - NATIVE_EXT_GLOB = '_pymarian.*.dylib' -else: - NATIVE_EXT_GLOB = '_pymarian.*.so' +EXT_SUFFIX = sysconfig.get_config_var('EXT_SUFFIX') # See also: python -m sysconfig | grep -i EXT_SUFFIX +assert EXT_SUFFIX, "EXT_SUFFIX not found in sysconfig" +NATIVE_EXT_NAME = '_pymarian' + EXT_SUFFIX def get_version(cuda_version=None) -> str: @@ -59,24 +56,27 @@ def get_version(cuda_version=None) -> str: def get_native_ext() -> Path: - native_exts = list(Path(CMAKE_BINARY_DIR).glob(f'src/{NATIVE_EXT_GLOB}')) - if not native_exts: - raise Exception( - f'No native extension found; Looked at {CMAKE_BINARY_DIR}/src/{NATIVE_EXT_GLOB}. \ - Please run cmake build first with -DPYMARIAN=ON or set CMAKE_BINARY_DIR to the build dir' - ) - elif len(native_exts) >= 2: - raise Exception(f'Only one native extension expected, but found: {native_exts}') - - native_ext = native_exts[0] + native_ext = Path(CMAKE_BINARY_DIR) / 'src' / NATIVE_EXT_NAME + if not native_ext.exists(): + msg = f"No native extension found at {native_ext}.\n \ + Please run cmake build with -DPYMARIAN=ON or set CMAKE_BINARY_DIR to the existing build dir." + other_exts = list(Path(CMAKE_BINARY_DIR).glob("src/_pymarian.*." + NATIVE_EXT_NAME.split(".")[-1])) + if other_exts: + msg += f"\nOther extension(s) found: {other_exts} but they are not compatible with this platform ({EXT_SUFFIX})." + raise RuntimeError(msg) # Pip does not allow inclusion of files from parent dir our outside of package context (for security reasons). 
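For context on the `sysconfig` change above: the compiled module name must carry the interpreter's ABI tag, which is exactly what `EXT_SUFFIX` encodes, so globbing per platform is no longer needed. A quick way to inspect the expected filename on the current interpreter (illustrative output only):

```python
# Print the native extension filename that setup.py will look for.
import sysconfig

ext_suffix = sysconfig.get_config_var("EXT_SUFFIX")
print("_pymarian" + ext_suffix)
# e.g. "_pymarian.cpython-310-x86_64-linux-gnu.so" on CPython 3.10 / Linux,
# or "_pymarian.cp310-win_amd64.pyd" on Windows.
```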
# So, we copy the native extension to the package directory native_ext_local = Path(__file__).parent / native_ext.name - print(f"\t>>>Found native extension at: {native_ext}") - print(f"\t >>>Making it available under scope at: {native_ext_local}") - if native_ext_local.exists(): - native_ext_local.unlink() + print(f"\t>>>Found the fresh native extension at: {native_ext}") + print(f"\t>>>Making it available under the package scope at: {native_ext_local}") shutil.copy(native_ext, native_ext_local) + + # remove incomaptible .so files from prior builds (if any) + for old_file in Path(__file__).parent.glob("_pymarian.*"): + if old_file.resolve() == native_ext_local.resolve(): + continue + print(f"\t>>>Removing old file: {old_file}") + old_file.unlink() return native_ext_local diff --git a/src/python/tests/__init__.py b/src/python/tests/regression/__init__.py similarity index 100% rename from src/python/tests/__init__.py rename to src/python/tests/regression/__init__.py diff --git a/src/python/tests/regression/test_pymarian_eval.py b/src/python/tests/regression/test_pymarian_eval.py new file mode 100644 index 000000000..8f207b149 --- /dev/null +++ b/src/python/tests/regression/test_pymarian_eval.py @@ -0,0 +1,91 @@ +import logging as log +import os +import shutil +import subprocess +import tarfile +import urllib.request +from pathlib import Path +from typing import List + +import pytest + +log.basicConfig(level=log.INFO) + +DATA_URL = "https://textmt.blob.core.windows.net/www/data/marian-regression-tests/metrics-regression.tgz" +DATA_DIR = Path(__file__).parent.parent / "data" / "metrics-regression" +SELECT_PREFIX = "wmt21-systems.en-de.100" +SYS_DIFF_OK = 0.01 +SEG_DIFF_OK = 0.05 + +N_CPUS = max(os.cpu_count() - 2, 2) + +USE_GPU = False +GPU_ARGS = "-d 0 --mini-batch 16" # --fp16 error margin is too high for bleurt-20 +CPU_ARGS = f"--cpu-threads {N_CPUS} --mini-batch 1" +# NOTE: --mini-batch > 1 on CPU deviates scores https://machinetranslation.visualstudio.com/DefaultCollection/Marian/_git/marian-dev/pullRequest/32883#1707853099 +BACKEND_ARGS = GPU_ARGS if USE_GPU else CPU_ARGS + +src_file = DATA_DIR / f"{SELECT_PREFIX}.src" +ref_file = DATA_DIR / f"{SELECT_PREFIX}.ref" +mt_file = DATA_DIR / f"{SELECT_PREFIX}.mt" + + +def setup(): + try: + flag_file = DATA_DIR / ".downloaded" + if flag_file.exists(): + log.info("Data already downloaded. 
Setup skipped...") + return + + DATA_DIR.mkdir(exist_ok=True, parents=True) + log.info(f"Downloading {DATA_URL} to {DATA_DIR}") + print("Downloading data package...") + with urllib.request.urlopen(DATA_URL) as response: + with tarfile.open(fileobj=response, mode="r|gz") as tar: + tar.extractall(path=DATA_DIR.parent) + + flag_file.touch() + log.info("Setup Done.") + finally: + if not shutil.which("pymarian-eval"): + raise FileNotFoundError("pymarian-eval not found in PATH") + for f in [src_file, ref_file, mt_file]: + if not f.exists(): + raise FileNotFoundError(f"File {f} not found.") + + +def compare_scores(tag: str, lhs: List[float], rhs: List[float]): + assert len(lhs) == len(rhs), f"{tag} :: length mismatch: {len(lhs)} != {len(rhs)}" + total_diff = sum(abs(l - r) for l, r in zip(lhs, rhs)) + avg_diff = total_diff / len(lhs) + + seg_err_count = 0 + for i, (l, r) in enumerate(zip(lhs, rhs)): + if abs(l - r) >= SEG_DIFF_OK: + log.warning(f"{tag} :: line {i}: {l:.4f} != {r:.4f} ({abs(l - r):.4f} > {SEG_DIFF_OK})") + seg_err_count += 1 + + assert avg_diff <= SYS_DIFF_OK, f"{tag} :: avg_diff: {avg_diff:.4f} > {SYS_DIFF_OK:.4f}" + assert seg_err_count == 0, f"{tag} :: seg_err_count: {seg_err_count:.4f} > 0" + + +setup() +# auto detect metric names +# metric_names = list(set(f.name.split(".")[-2] for f in DATA_DIR.glob(f"{select_prefix}*.orig"))) +# update: No need to run all metric names, select a few +metric_names = ["bleurt-20", "wmt20-comet-qe-da", "wmt22-comet-da", "wmt22-cometkiwi-da"] + + +@pytest.mark.parametrize("metric_name", metric_names) +def test_pymarian_cli(metric_name): + orig_file = DATA_DIR / f"{SELECT_PREFIX}.{metric_name}.orig" + assert orig_file.exists() + orig_scores = [float(x) for x in orig_file.read_text().splitlines() if x.strip()] + + pymarian_args = f"-a skip -s {src_file} -r {ref_file} -t {mt_file} {BACKEND_ARGS}" + cmd = f"pymarian-eval -m {metric_name} {pymarian_args} " + log.info(f"Running: {cmd}") + output = subprocess.check_output(cmd, shell=True) + out_lines = output.decode("utf-8").splitlines() + out_scores = [float(x) for x in out_lines if x.strip()] + compare_scores(metric_name, orig_scores, out_scores) diff --git a/src/python/tests/test_train.py b/src/python/tests/regression/test_train.py similarity index 89% rename from src/python/tests/test_train.py rename to src/python/tests/regression/test_train.py index 543e45db5..c538e716a 100644 --- a/src/python/tests/test_train.py +++ b/src/python/tests/regression/test_train.py @@ -4,23 +4,22 @@ from pathlib import Path from pymarian import Trainer -from pymarian.utils import get_known_model QUIET = False -TMP_DATA_DIR = Path.home() / 'tmp' / 'marian-tests' DATA_URL = "https://textmt.blob.core.windows.net/www/data/marian-tests-data.tgz" +DATA_DIR = Path(__file__).parent.parent / 'data' / 'marian-tests-data' def setup(): - ok_file = TMP_DATA_DIR / '_OK' - if not TMP_DATA_DIR.exists() or not ok_file.exists(): - TMP_DATA_DIR.mkdir(parents=True, exist_ok=True) + ok_file = DATA_DIR / '.downloaded' + if not ok_file.exists(): + DATA_DIR.mkdir(parents=True, exist_ok=True) print("Downloading data package...") with urllib.request.urlopen(DATA_URL) as response: with tarfile.open(fileobj=response, mode="r|gz") as tar: - tar.extractall(path=TMP_DATA_DIR) + tar.extractall(path=DATA_DIR.parent) ok_file.touch() print("Done.") @@ -29,11 +28,10 @@ def setup(): def test_train_comet_qe(): - data_dir = TMP_DATA_DIR / 'marian-tests-data/deu-eng' + data_dir = DATA_DIR / 'deu-eng' vocab_file = data_dir / 'vocab.8k.spm' classe_file = 
data_dir / 'classes4f.txt' train_file = data_dir / 'sample.5k.chrfoid-deu-eng.tsv' - # pretrained_model, vocab_file = get_known_model("chrfoid-wmt23") assert classe_file.exists() assert vocab_file.exists() assert train_file.exists() @@ -88,7 +86,7 @@ def test_train_comet_qe(): def test_train_transformer_nmt(): - data_dir = TMP_DATA_DIR / 'marian-tests-data/deu-eng' + data_dir = DATA_DIR / 'deu-eng' vocab_file = data_dir / 'vocab.8k.spm' train_prefix = str(data_dir / 'sample.5k') src_lang = "deu" diff --git a/src/python/tests/regression/test_translate.py b/src/python/tests/regression/test_translate.py new file mode 100644 index 000000000..b65ec49dc --- /dev/null +++ b/src/python/tests/regression/test_translate.py @@ -0,0 +1,35 @@ +import tarfile +import urllib.request +from pathlib import Path + +from pymarian import Translator + +from . import BASE_ARGS + +DATA_URL = "http://data.statmt.org/romang/marian-regression-tests/models/wngt19.tar.gz" +DATA_DIR = Path(__file__).parent.parent / "data" / "wngt19" + + +def setup(): + flag_file = DATA_DIR / ".downloaded" + if flag_file.exists(): + print("Data already downloaded. Setup skipped...") + return + print(f"Downloading {DATA_URL} to {DATA_DIR}") + request = urllib.request.urlopen(DATA_URL) + with tarfile.open(fileobj=request, mode="r|gz") as tar: + tar.extractall(path=DATA_DIR.parent) + flag_file.touch() + + +setup() + + +def test_ende(): + + model_file = str(DATA_DIR / 'model.base.npz') + vocab_file = str(DATA_DIR / 'en-de.spm') + args = BASE_ARGS | dict(models=model_file, vocabs=[vocab_file, vocab_file]) + translator = Translator(**args) + hyp = translator.translate("Hello. Good morning.") + assert hyp == "Hallo , Guten Morgen ." diff --git a/src/python/tests/test_evaluate.py b/src/python/tests/test_evaluate.py deleted file mode 100644 index d79462901..000000000 --- a/src/python/tests/test_evaluate.py +++ /dev/null @@ -1,148 +0,0 @@ -""" -# silense marian log -export MARIAN_QUIET=yes - -# run all tests in this file - pytest -v src/python/tests/test_evaluate.py - pytest -vx src/python/tests/test_evaluate.py #stop on first failure - -# run a single test: - pytest -v src/python/tests/test_evaluate.py -k test_evaluator_chrfoid - pytest -vs src/python/tests/test_evaluate.py -k test_evaluator_chrfoid # see stdout and stderr -""" -import os - -from pymarian import Evaluator -from pymarian.utils import get_known_model - -from . 
import BASE_ARGS - -EPSILON = 0.0001 # the precision error we afford in float comparison - - -# dummy sentences for testing -SAMPLE_SRC_HYP = [ - ["This is a test", "This is a test A"], - ["This is a test B", "This is a test C"], - ["This is a test D", "This is a test E"], -] -SAMPLE_REF_HYP = SAMPLE_SRC_HYP # same for now -SAMPLE_SRC_HYP_REF = [ - ["This is a test", "This is a test A", "This is a test AA"], - ["This is a test B", "This is a test C", "This is a test CC"], - ["This is a test D", "This is a test E", "This is a test EE"], -] - - -def test_evaluator_chrfoid(): - model_path, vocab_path = get_known_model("chrfoid-wmt23") - args = BASE_ARGS | dict( - like="comet-qe", - model=model_path, - vocabs=[vocab_path, vocab_path], - ) - # args = dict(help='') # to get help message with all args - eval = Evaluator(**args) - data = SAMPLE_SRC_HYP - expected_scores = [0.0548, 0.0797, 0.0988] - - scores = eval.evaluate(data) - assert len(scores) == len(data) - for score, expected_score in zip(scores, expected_scores): - if isinstance(score, list): - score = score[0] - assert abs(score - expected_score) < EPSILON - - -def test_evaluator_cometoid22_wmt22(): - model_path, vocab_path = get_known_model("cometoid22-wmt22") - args = BASE_ARGS | dict( - like="comet-qe", - model=model_path, - vocabs=[vocab_path, vocab_path], - ) - # args = dict(help='') # to get help message with all args - eval = Evaluator(**args) - data = SAMPLE_SRC_HYP - expected_scores = [0.71845, 0.7906, 0.81549] - - scores = eval.evaluate(data) - assert len(scores) == len(data) - - for score, expected_score in zip(scores, expected_scores): - if isinstance(score, list): - score = score[0] - assert abs(score - expected_score) < EPSILON - - -def test_evaluator_cometoid22_wmt23(): - model_path, vocab_path = get_known_model("cometoid22-wmt23") - args = BASE_ARGS | dict( - like="comet-qe", - model=model_path, - vocabs=[vocab_path, vocab_path], - ) - eval = Evaluator(**args) - data = SAMPLE_SRC_HYP - expected_scores = [0.75715, 0.81395, 0.8361] - - scores = eval.evaluate(data) - assert len(scores) == len(data) - for score, expected_score in zip(scores, expected_scores): - if isinstance(score, list): - score = score[0] - assert abs(score - expected_score) < EPSILON - - -def test_evaluator_bleurt(): - model_path, vocab_path = get_known_model("bleurt20") - args = BASE_ARGS | dict( - like="bleurt", - model=model_path, - vocabs=[vocab_path, vocab_path], - ) - - eval = Evaluator(**args) - data = SAMPLE_REF_HYP - scores = eval.evaluate(data) - expected_scores = [0.30929, 0.3027, 0.3113] - assert len(scores) == len(data) - for score, expected_score in zip(scores, expected_scores): - if isinstance(score, list): - score = score[0] - assert abs(score - expected_score) < EPSILON - - -# TODO: These below tests are failing - - -def test_evaluator_comet20qe(): - model_path, vocab_path = get_known_model("comet20-da-qe") - args = BASE_ARGS | dict( - like="comet-qe", - model=model_path, - vocabs=[vocab_path, vocab_path], - ) - - eval = Evaluator(**args) - data = SAMPLE_SRC_HYP - scores = eval.evaluate(data) - assert len(scores) == len(data) - # TODO: add expected scores and asserts - - -def test_evaluator_comet20ref(): - model_path, vocab_path = get_known_model("comet20-da") - args = BASE_ARGS | dict( - like="comet", - model=model_path, - vocabs=[vocab_path, vocab_path], - ) - - eval = Evaluator(**args) - data = SAMPLE_SRC_HYP_REF - scores = eval.evaluate(data) - len(scores) == len(data) - - -# TODO: add expected scores and asserts diff --git 
a/src/python/tests/test_translate.py b/src/python/tests/test_translate.py deleted file mode 100644 index 0ad5adc60..000000000 --- a/src/python/tests/test_translate.py +++ /dev/null @@ -1,16 +0,0 @@ -from pathlib import Path - -from pymarian import Translator - -from . import BASE_ARGS - - -def test_ende(): - # TODO: download model from blob storage - model_dir = Path.home() / 'tmp/marian-eng-deu' - model_file = str(model_dir / 'model.bin') - vocab_file = str(model_dir / 'vocab.spm') - args = BASE_ARGS | dict(models=model_file, vocabs=[vocab_file, vocab_file]) - translator = Translator(**args) - hyp = translator.translate("Hello. Good morning.") - assert hyp == "Hallo. Guten Morgen." From 39ade681112458957a1c4016d22622f0bdcdb489 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Thu, 22 Feb 2024 12:32:58 +0000 Subject: [PATCH 18/26] Merged PR 33078: Merge public master with internal master This mostly adds @'s changes from public master to internal. I did an automatic merge and need to go through those changes myself. I think there is an issue in translator.h which I will fix. @ can you check if things work for you here? --- .github/workflows/ios.yml | 43 + .github/workflows/release.yml | 2 +- .github/workflows/ubuntu.yml | 2 +- .gitmodules | 6 + CHANGELOG.md | 2 + CMakeLists.txt | 74 +- VERSION | 2 +- azure-pipelines.yml | 2 +- cmake/ios.toolchain.cmake | 1099 +++++++++++++++++++ scripts/ci/install_mkl.sh | 2 +- src/3rd_party/CMakeLists.txt | 15 +- src/3rd_party/faiss/VectorTransform.cpp | 6 + src/3rd_party/faiss/VectorTransform.h | 2 + src/3rd_party/ruy | 1 + src/3rd_party/sentencepiece | 2 +- src/3rd_party/simd_utils | 1 + src/common/binary.cpp | 2 +- src/common/types.h | 6 +- src/functional/operators.h | 5 +- src/tensors/cpu/expression_graph_packable.h | 2 +- src/tensors/cpu/fbgemm/packed_gemm.cpp | 8 +- src/tensors/cpu/integer_common.h | 14 +- src/tensors/cpu/intgemm_interface.h | 4 +- src/tensors/cpu/prod.cpp | 8 - src/tensors/cpu/prod_blas.h | 130 ++- src/translator/translator.h | 1 - 26 files changed, 1393 insertions(+), 48 deletions(-) create mode 100644 .github/workflows/ios.yml create mode 100644 cmake/ios.toolchain.cmake create mode 160000 src/3rd_party/ruy create mode 160000 src/3rd_party/simd_utils diff --git a/.github/workflows/ios.yml b/.github/workflows/ios.yml new file mode 100644 index 000000000..4dfa8905d --- /dev/null +++ b/.github/workflows/ios.yml @@ -0,0 +1,43 @@ +name: iOS + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + build-macos: + name: iOS CPU-only + runs-on: macos-12 + + steps: + - name: Checkout + uses: actions/checkout@v2 + with: + submodules: recursive + + - name: Install dependencies + run: brew install boost openblas openssl protobuf + + - name: Configure CMake + run: | + export LDFLAGS="-L/usr/local/opt/openblas/lib" + export CPPFLAGS="-I/usr/local/opt/openblas/include" + mkdir -p build + cd build + cmake .. \ + -DCOMPILE_CPU=on \ + -DCOMPILE_CUDA=off \ + -DCOMPILE_EXAMPLES=on \ + -DCOMPILE_SERVER=off \ + -DCOMPILE_TESTS=on \ + -DUSE_SENTENCEPIECE=on \ + -DCMAKE_TOOLCHAIN_FILE=../cmake/ios.toolchain.cmake \ + -DUSE_SENTENCEPIECE=on \ + -DPLATFORM=OS64 \ + -DDEPLOYMENT_TARGET=13.0 + + - name: Compile + working-directory: build + run: cmake --build . 
--config Release \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 5beab28f0..59bb2dc76 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -77,7 +77,7 @@ jobs: # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html - name: Install MKL run: | - wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB" | sudo apt-key add - + wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB" | sudo apt-key add - sudo sh -c "echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list" sudo apt-get update -o Dir::Etc::sourcelist="/etc/apt/sources.list.d/intel-mkl.list" sudo apt-get install -y --no-install-recommends intel-mkl-64bit-2020.0-088 diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index f2baae82d..3a4c65b31 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -91,7 +91,7 @@ jobs: # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html - name: Install MKL run: | - wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB" | sudo apt-key add - + wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB" | sudo apt-key add - sudo sh -c "echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list" sudo apt-get update -o Dir::Etc::sourcelist="/etc/apt/sources.list.d/intel-mkl.list" sudo apt-get install -y --no-install-recommends intel-mkl-64bit-2020.0-088 diff --git a/.gitmodules b/.gitmodules index 7a94dab1d..083aabe85 100644 --- a/.gitmodules +++ b/.gitmodules @@ -20,6 +20,12 @@ [submodule "src/3rd_party/simple-websocket-server"] path = src/3rd_party/simple-websocket-server url = https://github.com/marian-nmt/Simple-WebSocket-Server +[submodule "src/3rd_party/ruy"] + path = src/3rd_party/ruy + url = https://github.com/marian-nmt/ruy.git +[submodule "src/3rd_party/simd_utils"] + path = src/3rd_party/simd_utils + url = https://github.com/marian-nmt/simd_utils.git [submodule "src/3rd_party/pybind11"] path = src/3rd_party/pybind11 url = https://github.com/pybind/pybind11.git diff --git a/CHANGELOG.md b/CHANGELOG.md index 9412de3a2..3049e622e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] - Fixed compilation with clang 16.0.6 +- Added Threads::Threads to EXT_LIBS + ### Added - Added `pymarian-eval`, CLI for scoring metrics diff --git a/CMakeLists.txt b/CMakeLists.txt index e16876f78..b6aa74297 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,7 @@ if (POLICY CMP0074) endif () project(marian CXX C) + set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.") @@ -82,6 +83,48 @@ if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "Release") endif() +# iOS support +if(CMAKE_SYSTEM_NAME STREQUAL "iOS" ) + set(ARM ON) + # need to ignore this warning for Xcode to be happy + list(APPEND ALL_WARNINGS -Wno-shorten-64-to-32;) +endif() + +# ARM support: currently ONLY armv8. 
armv8 includes NEON by default +# we do not currently have good support for automatic architecture detection, including for cross-compilation +# this is planned for future PRs +if(ARM) + + # Apple by default has Apple Accelerate. Otherwise fallback to RUY for GEMM + if(APPLE) + message(STATUS "Using Apple Accelerate SGEMM") + option(USE_RUY_SGEMM "Compile with Ruy SGEMM" OFF) + else(APPLE) + message(STATUS "Using Ruy SGEMM") + set(EXT_LIBS ${EXT_LIBS} ruy) + option(USE_RUY_SGEMM "Compile with Ruy SGEMM" ON) + endif(APPLE) + + # Define that we are using ARM as required by simd_utils. See their README for info + add_compile_definitions(ARM FMA SSE) + # Some warnings as errors. I don't feel comfortable about the strict aliasing. + set(ARM_WARNINGS "-fno-strict-aliasing -Wno-comment") + + if(MSVC) + add_compile_options(/flax-vector-conversions) + else(MSVC) + add_compile_options(-flax-vector-conversions) + endif(MSVC) +endif(ARM) + +######## +# pThreads: consider it as EXT_LIBS for a more portable binary +set(CMAKE_THREAD_PREFER_PTHREAD TRUE) +set(THREADS_PREFER_PTHREAD_FLAG TRUE) +find_package(Threads REQUIRED) +set(EXT_LIBS ${EXT_LIBS} Threads::Threads) +######## + ############################################################################### # Set compilation flags if(MSVC) @@ -141,13 +184,16 @@ else(MSVC) set(INTRINSICS "") list(APPEND INTRINSICS_NVCC) - option(COMPILE_SSE2 "Compile CPU code with SSE2 support" ON) - option(COMPILE_SSE3 "Compile CPU code with SSE3 support" ON) - option(COMPILE_SSE4_1 "Compile CPU code with SSE4.1 support" ON) - option(COMPILE_SSE4_2 "Compile CPU code with SSE4.2 support" ON) - option(COMPILE_AVX "Compile CPU code with AVX support" ON) - option(COMPILE_AVX2 "Compile CPU code with AVX2 support" ON) - option(COMPILE_AVX512 "Compile CPU code with AVX512 support" ON) + if(NOT ARM) + # none of these options are available on ARM + option(COMPILE_SSE2 "Compile CPU code with SSE2 support" ON) + option(COMPILE_SSE3 "Compile CPU code with SSE3 support" ON) + option(COMPILE_SSE4_1 "Compile CPU code with SSE4.1 support" ON) + option(COMPILE_SSE4_2 "Compile CPU code with SSE4.2 support" ON) + option(COMPILE_AVX "Compile CPU code with AVX support" ON) + option(COMPILE_AVX2 "Compile CPU code with AVX2 support" ON) + option(COMPILE_AVX512 "Compile CPU code with AVX512 support" ON) + endif(NOT ARM) if(BUILD_ARCH STREQUAL "native") message(STATUS "Building with -march=native and intrinsics will be chosen automatically by the compiler to match the current machine.") @@ -223,7 +269,7 @@ else(MSVC) # Clang-10.0.0 complains when CUDA is newer than 10.1 set(CLANG_IGNORE_UNKNOWN_CUDA "-Wno-unknown-warning-option -Wno-unknown-cuda-version") endif() - set(DISABLE_GLOBALLY "-Wno-unused-result ${CLANG_IGNORE_UNKNOWN_CUDA}") + set(DISABLE_GLOBALLY "-Wno-unused-result ${CLANG_IGNORE_UNKNOWN_CUDA} ${ARM_WARNINGS}") # These are used in src/CMakeLists.txt on a per-target basis list(APPEND ALL_WARNINGS -Wall; -Werror; -Wextra; -Wno-unused-result; -Wno-deprecated; @@ -242,7 +288,7 @@ else(MSVC) set(CMAKE_RDYNAMIC_FLAG "-rdynamic") endif(CMAKE_COMPILER_IS_GNUCC) - set(CMAKE_CXX_FLAGS "-std=c++17 -pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}") + set(CMAKE_CXX_FLAGS "-std=c++17 -pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} ${INTRINSICS}") set(CMAKE_CXX_FLAGS_RELEASE "-O3 -m64 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}") set(CMAKE_CXX_FLAGS_SLIM "-O3 -m64 -funroll-loops -DNDEBUG") @@ 
-252,7 +298,7 @@ else(MSVC) set(CMAKE_CXX_FLAGS_PROFUSE "${CMAKE_CXX_FLAGS_RELEASE} -fprofile-use -fprofile-correction") # these need to be set separately - set(CMAKE_C_FLAGS "-pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}") + set(CMAKE_C_FLAGS "-pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} ${INTRINSICS}") set(CMAKE_C_FLAGS_RELEASE "-O3 -m64 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}") set(CMAKE_C_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}") set(CMAKE_C_FLAGS_SLIM "-O3 -m64 -funroll-loops -DNDEBUG") @@ -260,6 +306,12 @@ else(MSVC) set(CMAKE_C_FLAGS_PROFILE "${CMAKE_C_FLAGS_RELEASE} -pg") set(CMAKE_C_FLAGS_PROFGEN "${CMAKE_C_FLAGS_RELEASE} -fprofile-generate -fprofile-correction") set(CMAKE_C_FLAGS_PROFUSE "${CMAKE_C_FLAGS_RELEASE} -fprofile-use -fprofile-correction") + + # set -march for all builds except iOS cross compilation + if(NOT CMAKE_SYSTEM_NAME STREQUAL "iOS" ) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${BUILD_ARCH}") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=${BUILD_ARCH}") + endif() endif(MSVC) # with gcc 7.0 and above we need to mark fallthrough in switch case statements @@ -521,7 +573,7 @@ endif() ############################################################################### # Find BLAS library if(COMPILE_CPU) - if(NOT GENERATE_MARIAN_INSTALL_TARGETS) + if(NOT GENERATE_MARIAN_INSTALL_TARGETS AND NOT ARM) set(EXT_LIBS ${EXT_LIBS} intgemm) # Enable intgemm when compiling CPU add_definitions(-DCOMPILE_CPU=1) endif() diff --git a/VERSION b/VERSION index 53dbb431e..329143f69 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.25 +v1.12.26 diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4c7cd0bfd..d9c816928 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -68,7 +68,7 @@ variables: - name: MKL_DIR value: "$(Build.SourcesDirectory)/mkl" - name: MKL_URL - value: "https://data.statmt.org/romang/marian-regression-tests/ci/mkl-2020.1-windows-static.zip" + value: "https://mariandev.blob.core.windows.net/public/ci/mkl-2020.1-windows-static.zip" - name: VCPKG_COMMIT value: 2023.11.20 - name: VCPKG_DIR diff --git a/cmake/ios.toolchain.cmake b/cmake/ios.toolchain.cmake new file mode 100644 index 000000000..2131172fd --- /dev/null +++ b/cmake/ios.toolchain.cmake @@ -0,0 +1,1099 @@ +# This file is part of the ios-cmake project. It was retrieved from +# https://github.com/leetal/ios-cmake.git, which is a fork of +# https://github.com/gerstrong/ios-cmake.git, which is a fork of +# https://github.com/cristeab/ios-cmake.git, which is a fork of +# https://code.google.com/p/ios-cmake/. Which in turn is based off of +# the Platform/Darwin.cmake and Platform/UnixPaths.cmake files which +# are included with CMake 2.8.4 +# +# The ios-cmake project is licensed under the new BSD license. +# +# Copyright (c) 2014, Bogdan Cristea and LTE Engineering Software, +# Kitware, Inc., Insight Software Consortium. All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. 
Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# This file is based on the Platform/Darwin.cmake and +# Platform/UnixPaths.cmake files which are included with CMake 2.8.4 +# It has been altered for iOS development. +# +# Updated by Alex Stewart (alexs.mac@gmail.com) +# +# ***************************************************************************** +# Now maintained by Alexander Widerberg (widerbergaren [at] gmail.com) +# under the BSD-3-Clause license +# https://github.com/leetal/ios-cmake +# ***************************************************************************** +# +# INFORMATION / HELP +# +############################################################################### +# OPTIONS # +############################################################################### +# +# PLATFORM: (default "OS64") +# OS = Build for iPhoneOS. +# OS64 = Build for arm64 iphoneOS. +# OS64COMBINED = Build for arm64 x86_64 iphoneOS + iphoneOS Simulator. Combined into FAT STATIC lib (only supported on 3.14+ of CMake with "-G Xcode" argument in combination with the "cmake --install" CMake build step) +# SIMULATOR = Build for x86 i386 iphoneOS Simulator. +# SIMULATOR64 = Build for x86_64 iphoneOS Simulator. +# SIMULATORARM64 = Build for arm64 iphoneOS Simulator. +# SIMULATOR64COMBINED = Build for arm64 x86_64 iphoneOS Simulator. Combined into FAT STATIC lib (supported on 3.14+ of CMakewith "-G Xcode" argument ONLY) +# TVOS = Build for arm64 tvOS. +# TVOSCOMBINED = Build for arm64 x86_64 tvOS + tvOS Simulator. Combined into FAT STATIC lib (only supported on 3.14+ of CMake with "-G Xcode" argument in combination with the "cmake --install" CMake build step) +# SIMULATOR_TVOS = Build for x86_64 tvOS Simulator. +# SIMULATORARM64_TVOS = Build for arm64 tvOS Simulator. +# WATCHOS = Build for armv7k arm64_32 for watchOS. +# WATCHOSCOMBINED = Build for armv7k arm64_32 x86_64 watchOS + watchOS Simulator. Combined into FAT STATIC lib (only supported on 3.14+ of CMake with "-G Xcode" argument in combination with the "cmake --install" CMake build step) +# SIMULATOR_WATCHOS = Build for x86_64 for watchOS Simulator. +# MAC = Build for x86_64 macOS. +# MAC_ARM64 = Build for Apple Silicon macOS. +# MAC_UNIVERSAL = Combined build for x86_64 and Apple Silicon on macOS. +# MAC_CATALYST = Build for x86_64 macOS with Catalyst support (iOS toolchain on macOS). +# Note: The build argument "MACOSX_DEPLOYMENT_TARGET" can be used to control min-version of macOS +# MAC_CATALYST_ARM64 = Build for Apple Silicon macOS with Catalyst support (iOS toolchain on macOS). 
+# Note: The build argument "MACOSX_DEPLOYMENT_TARGET" can be used to control min-version of macOS +# +# CMAKE_OSX_SYSROOT: Path to the SDK to use. By default this is +# automatically determined from PLATFORM and xcodebuild, but +# can also be manually specified (although this should not be required). +# +# CMAKE_DEVELOPER_ROOT: Path to the Developer directory for the platform +# being compiled for. By default, this is automatically determined from +# CMAKE_OSX_SYSROOT, but can also be manually specified (although this should +# not be required). +# +# DEPLOYMENT_TARGET: Minimum SDK version to target. Default 2.0 on watchOS and 9.0 on tvOS+iOS +# +# NAMED_LANGUAGE_SUPPORT: +# ON (default) = Will require "enable_language(OBJC) and/or enable_language(OBJCXX)" for full OBJC|OBJCXX support +# OFF = Will embed the OBJC and OBJCXX flags into the CMAKE_C_FLAGS and CMAKE_CXX_FLAGS (legacy behavior, CMake version < 3.16) +# +# ENABLE_BITCODE: (ON|OFF) Enables or disables bitcode support. Default OFF +# +# ENABLE_ARC: (ON|OFF) Enables or disables ARC support. Default ON (ARC enabled by default) +# +# ENABLE_VISIBILITY: (ON|OFF) Enables or disables symbol visibility support. Default OFF (visibility hidden by default) +# +# ENABLE_STRICT_TRY_COMPILE: (ON|OFF) Enables or disables strict try_compile() on all Check* directives (will run linker +# to actually check if linking is possible). Default OFF (will set CMAKE_TRY_COMPILE_TARGET_TYPE to STATIC_LIBRARY) +# +# ARCHS: (armv7 armv7s armv7k arm64 arm64_32 i386 x86_64) If specified, will override the default architectures for the given PLATFORM +# OS = armv7 armv7s arm64 (if applicable) +# OS64 = arm64 (if applicable) +# SIMULATOR = i386 +# SIMULATOR64 = x86_64 +# SIMULATORARM64 = arm64 +# TVOS = arm64 +# SIMULATOR_TVOS = x86_64 (i386 has since long been deprecated) +# SIMULATORARM64_TVOS = arm64 +# WATCHOS = armv7k arm64_32 (if applicable) +# SIMULATOR_WATCHOS = x86_64 (i386 has since long been deprecated) +# MAC = x86_64 +# MAC_ARM64 = arm64 +# MAC_UNIVERSAL = x86_64 arm64 +# MAC_CATALYST = x86_64 +# MAC_CATALYST_ARM64 = arm64 +# +# NOTE: When manually specifying ARCHS, put a semi-colon between the entries. E.g., -DARCHS="armv7;arm64" +# +############################################################################### +# END OPTIONS # +############################################################################### +# +# This toolchain defines the following properties (available via get_property()) for use externally: +# +# PLATFORM: The currently targeted platform. +# XCODE_VERSION: Version number (not including Build version) of Xcode detected. +# SDK_VERSION: Version of SDK being used. +# OSX_ARCHITECTURES: Architectures being compiled for (generated from PLATFORM). +# APPLE_TARGET_TRIPLE: Used by autoconf build systems. NOTE: If "ARCHS" is overridden, this will *NOT* be set! +# +# This toolchain defines the following macros for use externally: +# +# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE XCODE_VARIANT) +# A convenience macro for setting xcode specific properties on targets. +# Available variants are: All, Release, RelWithDebInfo, Debug, MinSizeRel +# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1" "all"). +# +# find_host_package (PROGRAM ARGS) +# A macro used to find executable programs on the host system, not within the +# environment. Thanks to the android-cmake project for providing the +# command. 
+# + +cmake_minimum_required(VERSION 3.8.0) + +# CMake invokes the toolchain file twice during the first build, but only once during subsequent rebuilds. +if(DEFINED ENV{_IOS_TOOLCHAIN_HAS_RUN}) + return() +endif() +set(ENV{_IOS_TOOLCHAIN_HAS_RUN} true) + +# List of supported platform values +list(APPEND _supported_platforms + "OS" "OS64" "OS64COMBINED" "SIMULATOR" "SIMULATOR64" "SIMULATORARM64" "SIMULATOR64COMBINED" + "TVOS" "TVOSCOMBINED" "SIMULATOR_TVOS" "SIMULATORARM64_TVOS" + "WATCHOS" "WATCHOSCOMBINED" "SIMULATOR_WATCHOS" + "MAC" "MAC_ARM64" "MAC_UNIVERSAL" + "VISIONOS" "SIMULATOR_VISIONOS" "SIMULATOR64_VISIONOS" + "MAC_CATALYST" "MAC_CATALYST_ARM64") + +# Cache what generator is used +set(USED_CMAKE_GENERATOR "${CMAKE_GENERATOR}") + +# Check if using a CMake version capable of building combined FAT builds (simulator and target slices combined in one static lib) +if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.14") + set(MODERN_CMAKE YES) +endif() + +# Get the Xcode version being used. +# Problem: CMake runs toolchain files multiple times, but can't read cache variables on some runs. +# Workaround: On the first run (in which cache variables are always accessible), set an intermediary environment variable. +# +# NOTE: This pattern is used in many places in this toolchain to speed up checks of all sorts +if(DEFINED XCODE_VERSION_INT) + # Environment variables are always preserved. + set(ENV{_XCODE_VERSION_INT} "${XCODE_VERSION_INT}") +elseif(DEFINED ENV{_XCODE_VERSION_INT}) + set(XCODE_VERSION_INT "$ENV{_XCODE_VERSION_INT}") +elseif(NOT DEFINED XCODE_VERSION_INT) + find_program(XCODEBUILD_EXECUTABLE xcodebuild) + if(NOT XCODEBUILD_EXECUTABLE) + message(FATAL_ERROR "xcodebuild not found. Please install either the standalone commandline tools or Xcode.") + endif() + execute_process(COMMAND ${XCODEBUILD_EXECUTABLE} -version + OUTPUT_VARIABLE XCODE_VERSION_INT + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION_INT "${XCODE_VERSION_INT}") + string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION_INT "${XCODE_VERSION_INT}") + set(XCODE_VERSION_INT "${XCODE_VERSION_INT}" CACHE INTERNAL "") +endif() + +# Assuming that xcode 12.0 is installed you most probably have ios sdk 14.0 or later installed (tested on Big Sur) +# if you don't set a deployment target it will be set the way you only get 64-bit builds +#if(NOT DEFINED DEPLOYMENT_TARGET AND XCODE_VERSION_INT VERSION_GREATER 12.0) +# Temporarily fix the arm64 issues in CMake install-combined by excluding arm64 for simulator builds (needed for Apple Silicon...) +# set(CMAKE_XCODE_ATTRIBUTE_EXCLUDED_ARCHS[sdk=iphonesimulator*] "arm64") +#endif() + +# Check if the platform variable is set +if(DEFINED PLATFORM) + # Environment variables are always preserved. + set(ENV{_PLATFORM} "${PLATFORM}") +elseif(DEFINED ENV{_PLATFORM}) + set(PLATFORM "$ENV{_PLATFORM}") +elseif(NOT DEFINED PLATFORM) + message(FATAL_ERROR "PLATFORM argument not set. 
Bailing configure since I don't know what target you want to build for!") +endif () + +if(PLATFORM MATCHES ".*COMBINED" AND NOT CMAKE_GENERATOR MATCHES "Xcode") + message(FATAL_ERROR "The combined builds support requires Xcode to be used as a generator via '-G Xcode' command-line argument in CMake") +endif() + +# Safeguard that the platform value is set and is one of the supported values +list(FIND _supported_platforms ${PLATFORM} contains_PLATFORM) +if("${contains_PLATFORM}" EQUAL "-1") + string(REPLACE ";" "\n * " _supported_platforms_formatted "${_supported_platforms}") + message(FATAL_ERROR " Invalid PLATFORM specified! Current value: ${PLATFORM}.\n" + " Supported PLATFORM values: \n * ${_supported_platforms_formatted}") +endif() + +# Check if Apple Silicon is supported +if(PLATFORM MATCHES "^(MAC_ARM64)$|^(MAC_CATALYST_ARM64)$|^(MAC_UNIVERSAL)$" AND ${CMAKE_VERSION} VERSION_LESS "3.19.5") + message(FATAL_ERROR "Apple Silicon builds requires a minimum of CMake 3.19.5") +endif() + +# Touch the toolchain variable to suppress the "unused variable" warning. +# This happens if CMake is invoked with the same command line the second time. +if(CMAKE_TOOLCHAIN_FILE) +endif() + +# Fix for PThread library not in path +set(CMAKE_THREAD_LIBS_INIT "-lpthread") +set(CMAKE_HAVE_THREADS_LIBRARY 1) +set(CMAKE_USE_WIN32_THREADS_INIT 0) +set(CMAKE_USE_PTHREADS_INIT 1) + +# Specify named language support defaults. +if(NOT DEFINED NAMED_LANGUAGE_SUPPORT AND ${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.16") + set(NAMED_LANGUAGE_SUPPORT ON) + message(STATUS "[DEFAULTS] Using explicit named language support! E.g., enable_language(CXX) is needed in the project files.") +elseif(NOT DEFINED NAMED_LANGUAGE_SUPPORT AND ${CMAKE_VERSION} VERSION_LESS "3.16") + set(NAMED_LANGUAGE_SUPPORT OFF) + message(STATUS "[DEFAULTS] Disabling explicit named language support. Falling back to legacy behavior.") +elseif(DEFINED NAMED_LANGUAGE_SUPPORT AND ${CMAKE_VERSION} VERSION_LESS "3.16") + message(FATAL_ERROR "CMake named language support for OBJC and OBJCXX was added in CMake 3.16.") +endif() +set(NAMED_LANGUAGE_SUPPORT_INT ${NAMED_LANGUAGE_SUPPORT} CACHE BOOL + "Whether or not to enable explicit named language support" FORCE) + +# Specify the minimum version of the deployment target. +if(NOT DEFINED DEPLOYMENT_TARGET) + if (PLATFORM MATCHES "WATCHOS") + # Unless specified, SDK version 4.0 is used by default as minimum target version (watchOS). + set(DEPLOYMENT_TARGET "4.0") + elseif(PLATFORM STREQUAL "MAC") + # Unless specified, SDK version 10.13 (High Sierra) is used by default as the minimum target version (macos). + set(DEPLOYMENT_TARGET "10.13") + elseif(PLATFORM STREQUAL "VISIONOS" OR PLATFORM STREQUAL "SIMULATOR_VISIONOS" OR PLATFORM STREQUAL "SIMULATOR64_VISIONOS") + # Unless specified, SDK version 1.0 is used by default as minimum target version (visionOS). + set(DEPLOYMENT_TARGET "1.0") + elseif(PLATFORM STREQUAL "MAC_ARM64") + # Unless specified, SDK version 11.0 (Big Sur) is used by default as the minimum target version (macOS on arm). + set(DEPLOYMENT_TARGET "11.0") + elseif(PLATFORM STREQUAL "MAC_UNIVERSAL") + # Unless specified, SDK version 11.0 (Big Sur) is used by default as minimum target version for universal builds. + set(DEPLOYMENT_TARGET "11.0") + elseif(PLATFORM STREQUAL "MAC_CATALYST" OR PLATFORM STREQUAL "MAC_CATALYST_ARM64") + # Unless specified, SDK version 13.0 is used by default as the minimum target version (mac catalyst minimum requirement). 
+ set(DEPLOYMENT_TARGET "13.1") + else() + # Unless specified, SDK version 11.0 is used by default as the minimum target version (iOS, tvOS). + set(DEPLOYMENT_TARGET "11.0") + endif() + message(STATUS "[DEFAULTS] Using the default min-version since DEPLOYMENT_TARGET not provided!") +elseif(DEFINED DEPLOYMENT_TARGET AND PLATFORM MATCHES "^MAC_CATALYST" AND ${DEPLOYMENT_TARGET} VERSION_LESS "13.1") + message(FATAL_ERROR "Mac Catalyst builds requires a minimum deployment target of 13.1!") +endif() + +# Store the DEPLOYMENT_TARGET in the cache +set(DEPLOYMENT_TARGET "${DEPLOYMENT_TARGET}" CACHE INTERNAL "") + +# Handle the case where we are targeting iOS and a version above 10.3.4 (32-bit support dropped officially) +if(PLATFORM STREQUAL "OS" AND DEPLOYMENT_TARGET VERSION_GREATER_EQUAL 10.3.4) + set(PLATFORM "OS64") + message(STATUS "Targeting minimum SDK version ${DEPLOYMENT_TARGET}. Dropping 32-bit support.") +elseif(PLATFORM STREQUAL "SIMULATOR" AND DEPLOYMENT_TARGET VERSION_GREATER_EQUAL 10.3.4) + set(PLATFORM "SIMULATOR64") + message(STATUS "Targeting minimum SDK version ${DEPLOYMENT_TARGET}. Dropping 32-bit support.") +endif() + +set(PLATFORM_INT "${PLATFORM}") + +if(DEFINED ARCHS) + string(REPLACE ";" "-" ARCHS_SPLIT "${ARCHS}") +endif() + +# Determine the platform name and architectures for use in xcodebuild commands +# from the specified PLATFORM_INT name. +if(PLATFORM_INT STREQUAL "OS") + set(SDK_NAME iphoneos) + if(NOT ARCHS) + set(ARCHS armv7 armv7s arm64) + set(APPLE_TARGET_TRIPLE_INT arm-apple-ios${DEPLOYMENT_TARGET}) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}) + endif() +elseif(PLATFORM_INT STREQUAL "OS64") + set(SDK_NAME iphoneos) + if(NOT ARCHS) + if (XCODE_VERSION_INT VERSION_GREATER 10.0) + set(ARCHS arm64) # FIXME: Add arm64e when Apple has fixed the integration issues with it, libarclite_iphoneos.a is currently missing bitcode markers for example + else() + set(ARCHS arm64) + endif() + set(APPLE_TARGET_TRIPLE_INT arm64-apple-ios${DEPLOYMENT_TARGET}) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}) + endif() +elseif(PLATFORM_INT STREQUAL "OS64COMBINED") + set(SDK_NAME iphoneos) + if(MODERN_CMAKE) + if(NOT ARCHS) + if (XCODE_VERSION_INT VERSION_GREATER 12.0) + set(ARCHS arm64 x86_64) + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=iphoneos*] "arm64") + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=iphonesimulator*] "x86_64 arm64") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=iphoneos*] "arm64") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=iphonesimulator*] "x86_64 arm64") + else() + set(ARCHS arm64 x86_64) + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=iphoneos*] "arm64") + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=iphonesimulator*] "x86_64") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=iphoneos*] "arm64") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=iphonesimulator*] "x86_64") + endif() + set(APPLE_TARGET_TRIPLE_INT arm64-x86_64-apple-ios${DEPLOYMENT_TARGET}) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}) + endif() + else() + message(FATAL_ERROR "Please make sure that you are running CMake 3.14+ to make the OS64COMBINED setting work") + endif() +elseif(PLATFORM_INT STREQUAL "SIMULATOR64COMBINED") + set(SDK_NAME iphonesimulator) + if(MODERN_CMAKE) + if(NOT ARCHS) + if (XCODE_VERSION_INT VERSION_GREATER 12.0) + set(ARCHS arm64 x86_64) # FIXME: Add arm64e when Apple have fixed the integration issues with it, libarclite_iphoneos.a is currently missing bitcode markers for example + 
set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=iphoneos*] "") + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=iphonesimulator*] "x86_64 arm64") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=iphoneos*] "") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=iphonesimulator*] "x86_64 arm64") + else() + set(ARCHS arm64 x86_64) + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=iphoneos*] "") + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=iphonesimulator*] "x86_64") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=iphoneos*] "") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=iphonesimulator*] "x86_64") + endif() + set(APPLE_TARGET_TRIPLE_INT aarch64-x86_64-apple-ios${DEPLOYMENT_TARGET}-simulator) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}-simulator) + endif() + else() + message(FATAL_ERROR "Please make sure that you are running CMake 3.14+ to make the SIMULATOR64COMBINED setting work") + endif() +elseif(PLATFORM_INT STREQUAL "SIMULATOR") + set(SDK_NAME iphonesimulator) + if(NOT ARCHS) + set(ARCHS i386) + set(APPLE_TARGET_TRIPLE_INT i386-apple-ios${DEPLOYMENT_TARGET}-simulator) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}-simulator) + endif() + message(DEPRECATION "SIMULATOR IS DEPRECATED. Consider using SIMULATOR64 instead.") +elseif(PLATFORM_INT STREQUAL "SIMULATOR64") + set(SDK_NAME iphonesimulator) + if(NOT ARCHS) + set(ARCHS x86_64) + set(APPLE_TARGET_TRIPLE_INT x86_64-apple-ios${DEPLOYMENT_TARGET}-simulator) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}-simulator) + endif() +elseif(PLATFORM_INT STREQUAL "SIMULATORARM64") + set(SDK_NAME iphonesimulator) + if(NOT ARCHS) + set(ARCHS arm64) + set(APPLE_TARGET_TRIPLE_INT arm64-apple-ios${DEPLOYMENT_TARGET}-simulator) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}-simulator) + endif() +elseif(PLATFORM_INT STREQUAL "TVOS") + set(SDK_NAME appletvos) + if(NOT ARCHS) + set(ARCHS arm64) + set(APPLE_TARGET_TRIPLE_INT arm64-apple-tvos${DEPLOYMENT_TARGET}) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-tvos${DEPLOYMENT_TARGET}) + endif() +elseif (PLATFORM_INT STREQUAL "TVOSCOMBINED") + set(SDK_NAME appletvos) + if(MODERN_CMAKE) + if(NOT ARCHS) + set(ARCHS arm64 x86_64) + set(APPLE_TARGET_TRIPLE_INT arm64-x86_64-apple-tvos${DEPLOYMENT_TARGET}) + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=appletvos*] "arm64") + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=appletvsimulator*] "x86_64") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=appletvos*] "arm64") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=appletvsimulator*] "x86_64") + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-tvos${DEPLOYMENT_TARGET}) + endif() + else() + message(FATAL_ERROR "Please make sure that you are running CMake 3.14+ to make the TVOSCOMBINED setting work") + endif() +elseif(PLATFORM_INT STREQUAL "SIMULATOR_TVOS") + set(SDK_NAME appletvsimulator) + if(NOT ARCHS) + set(ARCHS x86_64) + set(APPLE_TARGET_TRIPLE_INT x86_64-apple-tvos${DEPLOYMENT_TARGET}-simulator) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-tvos${DEPLOYMENT_TARGET}-simulator) + endif() +elseif(PLATFORM_INT STREQUAL "SIMULATORARM64_TVOS") + set(SDK_NAME appletvsimulator) + if(NOT ARCHS) + set(ARCHS arm64) + set(APPLE_TARGET_TRIPLE_INT arm64-apple-tvos${DEPLOYMENT_TARGET}-simulator) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-tvos${DEPLOYMENT_TARGET}-simulator) + endif() +elseif(PLATFORM_INT STREQUAL "WATCHOS") + set(SDK_NAME watchos) + if(NOT ARCHS) + if (XCODE_VERSION_INT VERSION_GREATER 10.0) + 
set(ARCHS armv7k arm64_32) + set(APPLE_TARGET_TRIPLE_INT arm64_32-apple-watchos${DEPLOYMENT_TARGET}) + else() + set(ARCHS armv7k) + set(APPLE_TARGET_TRIPLE_INT arm-apple-watchos${DEPLOYMENT_TARGET}) + endif() + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-watchos${DEPLOYMENT_TARGET}) + endif() +elseif(PLATFORM_INT STREQUAL "WATCHOSCOMBINED") + set(SDK_NAME watchos) + if(MODERN_CMAKE) + if(NOT ARCHS) + if (XCODE_VERSION_INT VERSION_GREATER 10.0) + set(ARCHS armv7k arm64_32 i386) + set(APPLE_TARGET_TRIPLE_INT arm64_32-i386-apple-watchos${DEPLOYMENT_TARGET}) + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=watchos*] "armv7k arm64_32") + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=watchsimulator*] "i386") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=watchos*] "armv7k arm64_32") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=watchsimulator*] "i386") + else() + set(ARCHS armv7k i386) + set(APPLE_TARGET_TRIPLE_INT arm-i386-apple-watchos${DEPLOYMENT_TARGET}) + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=watchos*] "armv7k") + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=watchsimulator*] "i386") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=watchos*] "armv7k") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=watchsimulator*] "i386") + endif() + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-watchos${DEPLOYMENT_TARGET}) + endif() + else() + message(FATAL_ERROR "Please make sure that you are running CMake 3.14+ to make the WATCHOSCOMBINED setting work") + endif() +elseif(PLATFORM_INT STREQUAL "SIMULATOR_WATCHOS") + set(SDK_NAME watchsimulator) + if(NOT ARCHS) + set(ARCHS i386) + set(APPLE_TARGET_TRIPLE_INT i386-apple-watchos${DEPLOYMENT_TARGET}-simulator) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-watchos${DEPLOYMENT_TARGET}-simulator) + endif() +elseif(PLATFORM_INT STREQUAL "SIMULATOR64_VISIONOS") + set(SDK_NAME xrsimulator) + if(NOT ARCHS) + set(ARCHS x86_64) + set(APPLE_TARGET_TRIPLE_INT x86_64-apple-xros${DEPLOYMENT_TARGET}-simulator) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-xros${DEPLOYMENT_TARGET}-simulator) + endif() +elseif(PLATFORM_INT STREQUAL "SIMULATOR_VISIONOS") + set(SDK_NAME xrsimulator) + if(NOT ARCHS) + set(ARCHS arm64) + set(APPLE_TARGET_TRIPLE_INT arm64-apple-xros${DEPLOYMENT_TARGET}-simulator) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-xros${DEPLOYMENT_TARGET}-simulator) + endif() +elseif(PLATFORM_INT STREQUAL "VISIONOS") + set(SDK_NAME xros) + if(NOT ARCHS) + set(ARCHS arm64) + set(APPLE_TARGET_TRIPLE_INT arm64-apple-xros${DEPLOYMENT_TARGET}) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-xros${DEPLOYMENT_TARGET}) + endif() +elseif(PLATFORM_INT STREQUAL "MAC" OR PLATFORM_INT STREQUAL "MAC_CATALYST") + set(SDK_NAME macosx) + if(NOT ARCHS) + set(ARCHS x86_64) + endif() + string(REPLACE ";" "-" ARCHS_SPLIT "${ARCHS}") + if(PLATFORM_INT STREQUAL "MAC") + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-macosx${DEPLOYMENT_TARGET}) + elseif(PLATFORM_INT STREQUAL "MAC_CATALYST") + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}-macabi) + endif() +elseif(PLATFORM_INT MATCHES "^(MAC_ARM64)$|^(MAC_CATALYST_ARM64)$") + set(SDK_NAME macosx) + if(NOT ARCHS) + set(ARCHS arm64) + endif() + string(REPLACE ";" "-" ARCHS_SPLIT "${ARCHS}") + if(PLATFORM_INT STREQUAL "MAC_ARM64") + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-macosx${DEPLOYMENT_TARGET}) + elseif(PLATFORM_INT STREQUAL "MAC_CATALYST_ARM64") + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}-macabi) + endif() +elseif(PLATFORM_INT STREQUAL 
"MAC_UNIVERSAL") + set(SDK_NAME macosx) + if(NOT ARCHS) + set(ARCHS "x86_64;arm64") + endif() + string(REPLACE ";" "-" ARCHS_SPLIT "${ARCHS}") + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-macosx${DEPLOYMENT_TARGET}) +else() + message(FATAL_ERROR "Invalid PLATFORM: ${PLATFORM_INT}") +endif() + +string(REPLACE ";" " " ARCHS_SPACED "${ARCHS}") + +if(MODERN_CMAKE AND PLATFORM_INT MATCHES ".*COMBINED" AND NOT CMAKE_GENERATOR MATCHES "Xcode") + message(FATAL_ERROR "The COMBINED options only work with Xcode generator, -G Xcode") +endif() + +if(CMAKE_GENERATOR MATCHES "Xcode" AND PLATFORM_INT MATCHES "^MAC_CATALYST") + set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++") + set(CMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS "macosx") + set(CMAKE_XCODE_ATTRIBUTE_SUPPORTS_MACCATALYST "YES") + if(NOT DEFINED MACOSX_DEPLOYMENT_TARGET) + set(CMAKE_XCODE_ATTRIBUTE_MACOSX_DEPLOYMENT_TARGET "10.15") + else() + set(CMAKE_XCODE_ATTRIBUTE_MACOSX_DEPLOYMENT_TARGET "${MACOSX_DEPLOYMENT_TARGET}") + endif() +elseif(CMAKE_GENERATOR MATCHES "Xcode") + set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++") + set(CMAKE_XCODE_ATTRIBUTE_IPHONEOS_DEPLOYMENT_TARGET "${DEPLOYMENT_TARGET}") + if(NOT PLATFORM_INT MATCHES ".*COMBINED") + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=${SDK_NAME}*] "${ARCHS_SPACED}") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=${SDK_NAME}*] "${ARCHS_SPACED}") + endif() +endif() + +# If the user did not specify the SDK root to use, then query xcodebuild for it. +if(DEFINED CMAKE_OSX_SYSROOT_INT) + # Environment variables are always preserved. + set(ENV{_CMAKE_OSX_SYSROOT_INT} "${CMAKE_OSX_SYSROOT_INT}") +elseif(DEFINED ENV{_CMAKE_OSX_SYSROOT_INT}) + set(CMAKE_OSX_SYSROOT_INT "$ENV{_CMAKE_OSX_SYSROOT_INT}") +elseif(NOT DEFINED CMAKE_OSX_SYSROOT_INT) + execute_process(COMMAND ${XCODEBUILD_EXECUTABLE} -version -sdk ${SDK_NAME} Path + OUTPUT_VARIABLE CMAKE_OSX_SYSROOT_INT + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) +endif() + +if (NOT DEFINED CMAKE_OSX_SYSROOT_INT AND NOT DEFINED CMAKE_OSX_SYSROOT) + message(SEND_ERROR "Please make sure that Xcode is installed and that the toolchain" + "is pointing to the correct path. Please run:" + "sudo xcode-select -s /Applications/Xcode.app/Contents/Developer" + "and see if that fixes the problem for you.") + message(FATAL_ERROR "Invalid CMAKE_OSX_SYSROOT: ${CMAKE_OSX_SYSROOT} " + "does not exist.") +elseif(DEFINED CMAKE_OSX_SYSROOT_INT) + set(CMAKE_OSX_SYSROOT_INT "${CMAKE_OSX_SYSROOT_INT}" CACHE INTERNAL "") + # Specify the location or name of the platform SDK to be used in CMAKE_OSX_SYSROOT. + set(CMAKE_OSX_SYSROOT "${CMAKE_OSX_SYSROOT_INT}" CACHE INTERNAL "") +endif() + +# Use bitcode or not +if(NOT DEFINED ENABLE_BITCODE) + message(STATUS "[DEFAULTS] Disabling bitcode support by default. ENABLE_BITCODE not provided for override!") + set(ENABLE_BITCODE OFF) +endif() +set(ENABLE_BITCODE_INT ${ENABLE_BITCODE} CACHE BOOL + "Whether or not to enable bitcode" FORCE) +# Use ARC or not +if(NOT DEFINED ENABLE_ARC) + # Unless specified, enable ARC support by default + set(ENABLE_ARC ON) + message(STATUS "[DEFAULTS] Enabling ARC support by default. ENABLE_ARC not provided!") +endif() +set(ENABLE_ARC_INT ${ENABLE_ARC} CACHE BOOL "Whether or not to enable ARC" FORCE) +# Use hidden visibility or not +if(NOT DEFINED ENABLE_VISIBILITY) + # Unless specified, disable symbols visibility by default + set(ENABLE_VISIBILITY OFF) + message(STATUS "[DEFAULTS] Hiding symbols visibility by default. 
ENABLE_VISIBILITY not provided!") +endif() +set(ENABLE_VISIBILITY_INT ${ENABLE_VISIBILITY} CACHE BOOL "Whether or not to hide symbols from the dynamic linker (-fvisibility=hidden)" FORCE) +# Set strict compiler checks or not +if(NOT DEFINED ENABLE_STRICT_TRY_COMPILE) + # Unless specified, disable strict try_compile() + set(ENABLE_STRICT_TRY_COMPILE OFF) + message(STATUS "[DEFAULTS] Using NON-strict compiler checks by default. ENABLE_STRICT_TRY_COMPILE not provided!") +endif() +set(ENABLE_STRICT_TRY_COMPILE_INT ${ENABLE_STRICT_TRY_COMPILE} CACHE BOOL + "Whether or not to use strict compiler checks" FORCE) + +# Get the SDK version information. +if(DEFINED SDK_VERSION) + # Environment variables are always preserved. + set(ENV{_SDK_VERSION} "${SDK_VERSION}") +elseif(DEFINED ENV{_SDK_VERSION}) + set(SDK_VERSION "$ENV{_SDK_VERSION}") +elseif(NOT DEFINED SDK_VERSION) + execute_process(COMMAND ${XCODEBUILD_EXECUTABLE} -sdk ${CMAKE_OSX_SYSROOT_INT} -version SDKVersion + OUTPUT_VARIABLE SDK_VERSION + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) +endif() + +# Find the Developer root for the specific iOS platform being compiled for +# from CMAKE_OSX_SYSROOT. Should be ../../ from SDK specified in +# CMAKE_OSX_SYSROOT. There does not appear to be a direct way to obtain +# this information from xcrun or xcodebuild. +if (NOT DEFINED CMAKE_DEVELOPER_ROOT AND NOT CMAKE_GENERATOR MATCHES "Xcode") + get_filename_component(PLATFORM_SDK_DIR ${CMAKE_OSX_SYSROOT_INT} PATH) + get_filename_component(CMAKE_DEVELOPER_ROOT ${PLATFORM_SDK_DIR} PATH) + if (NOT EXISTS "${CMAKE_DEVELOPER_ROOT}") + message(FATAL_ERROR "Invalid CMAKE_DEVELOPER_ROOT: ${CMAKE_DEVELOPER_ROOT} does not exist.") + endif() +endif() + +# Find the C & C++ compilers for the specified SDK. +if(DEFINED CMAKE_C_COMPILER) + # Environment variables are always preserved. + set(ENV{_CMAKE_C_COMPILER} "${CMAKE_C_COMPILER}") +elseif(DEFINED ENV{_CMAKE_C_COMPILER}) + set(CMAKE_C_COMPILER "$ENV{_CMAKE_C_COMPILER}") + set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER}) +elseif(NOT DEFINED CMAKE_C_COMPILER) + execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT_INT} -find clang + OUTPUT_VARIABLE CMAKE_C_COMPILER + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) + set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER}) +endif() +if(DEFINED CMAKE_CXX_COMPILER) + # Environment variables are always preserved. + set(ENV{_CMAKE_CXX_COMPILER} "${CMAKE_CXX_COMPILER}") +elseif(DEFINED ENV{_CMAKE_CXX_COMPILER}) + set(CMAKE_CXX_COMPILER "$ENV{_CMAKE_CXX_COMPILER}") +elseif(NOT DEFINED CMAKE_CXX_COMPILER) + execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT_INT} -find clang++ + OUTPUT_VARIABLE CMAKE_CXX_COMPILER + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) +endif() +# Find (Apple's) libtool. +if(DEFINED BUILD_LIBTOOL) + # Environment variables are always preserved. + set(ENV{_BUILD_LIBTOOL} "${BUILD_LIBTOOL}") +elseif(DEFINED ENV{_BUILD_LIBTOOL}) + set(BUILD_LIBTOOL "$ENV{_BUILD_LIBTOOL}") +elseif(NOT DEFINED BUILD_LIBTOOL) + execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT_INT} -find libtool + OUTPUT_VARIABLE BUILD_LIBTOOL + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) +endif() +# Find the toolchain's provided install_name_tool if none is found on the host +if(DEFINED CMAKE_INSTALL_NAME_TOOL) + # Environment variables are always preserved. 
+ set(ENV{_CMAKE_INSTALL_NAME_TOOL} "${CMAKE_INSTALL_NAME_TOOL}") +elseif(DEFINED ENV{_CMAKE_INSTALL_NAME_TOOL}) + set(CMAKE_INSTALL_NAME_TOOL "$ENV{_CMAKE_INSTALL_NAME_TOOL}") +elseif(NOT DEFINED CMAKE_INSTALL_NAME_TOOL) + execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT_INT} -find install_name_tool + OUTPUT_VARIABLE CMAKE_INSTALL_NAME_TOOL_INT + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) + set(CMAKE_INSTALL_NAME_TOOL ${CMAKE_INSTALL_NAME_TOOL_INT} CACHE INTERNAL "") +endif() + +# Configure libtool to be used instead of ar + ranlib to build static libraries. +# This is required on Xcode 7+, but should also work on previous versions of +# Xcode. +get_property(languages GLOBAL PROPERTY ENABLED_LANGUAGES) +foreach(lang ${languages}) + set(CMAKE_${lang}_CREATE_STATIC_LIBRARY "${BUILD_LIBTOOL} -static -o " CACHE INTERNAL "") +endforeach() + +# CMake 3.14+ support building for iOS, watchOS, and tvOS out of the box. +if(MODERN_CMAKE) + if(SDK_NAME MATCHES "iphone") + set(CMAKE_SYSTEM_NAME iOS) + elseif(SDK_NAME MATCHES "xros") + set(CMAKE_SYSTEM_NAME visionOS) + elseif(SDK_NAME MATCHES "xrsimulator") + set(CMAKE_SYSTEM_NAME visionOS) + elseif(SDK_NAME MATCHES "macosx") + set(CMAKE_SYSTEM_NAME Darwin) + elseif(SDK_NAME MATCHES "appletv") + set(CMAKE_SYSTEM_NAME tvOS) + elseif(SDK_NAME MATCHES "watch") + set(CMAKE_SYSTEM_NAME watchOS) + endif() + # Provide flags for a combined FAT library build on newer CMake versions + if(PLATFORM_INT MATCHES ".*COMBINED") + set(CMAKE_IOS_INSTALL_COMBINED YES) + if(CMAKE_GENERATOR MATCHES "Xcode") + # Set the SDKROOT Xcode properties to a Xcode-friendly value (the SDK_NAME, E.g, iphoneos) + # This way, Xcode will automatically switch between the simulator and device SDK when building. + set(CMAKE_XCODE_ATTRIBUTE_SDKROOT "${SDK_NAME}") + # Force to not build just one ARCH, but all! + set(CMAKE_XCODE_ATTRIBUTE_ONLY_ACTIVE_ARCH "NO") + endif() + endif() +elseif(NOT DEFINED CMAKE_SYSTEM_NAME AND ${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.10") + # Legacy code path prior to CMake 3.14 or fallback if no CMAKE_SYSTEM_NAME specified + set(CMAKE_SYSTEM_NAME iOS) +elseif(NOT DEFINED CMAKE_SYSTEM_NAME) + # Legacy code path before CMake 3.14 or fallback if no CMAKE_SYSTEM_NAME specified + set(CMAKE_SYSTEM_NAME Darwin) +endif() +# Standard settings. +set(CMAKE_SYSTEM_VERSION ${SDK_VERSION} CACHE INTERNAL "") +set(UNIX ON CACHE BOOL "") +set(APPLE ON CACHE BOOL "") +if(PLATFORM STREQUAL "MAC" OR PLATFORM STREQUAL "MAC_ARM64" OR PLATFORM STREQUAL "MAC_UNIVERSAL") + set(IOS OFF CACHE BOOL "") + set(MACOS ON CACHE BOOL "") +elseif(PLATFORM STREQUAL "MAC_CATALYST" OR PLATFORM STREQUAL "MAC_CATALYST_ARM64") + set(IOS ON CACHE BOOL "") + set(MACOS ON CACHE BOOL "") +else() + set(IOS ON CACHE BOOL "") +endif() +# Set the architectures for which to build. +set(CMAKE_OSX_ARCHITECTURES ${ARCHS} CACHE INTERNAL "") +# Change the type of target generated for try_compile() so it'll work when cross-compiling, weak compiler checks +if(NOT ENABLE_STRICT_TRY_COMPILE_INT) + set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) +endif() +# All iOS/Darwin specific settings - some may be redundant. 
+if (NOT DEFINED CMAKE_MACOSX_BUNDLE) + set(CMAKE_MACOSX_BUNDLE YES) +endif() +set(CMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_REQUIRED "NO") +set(CMAKE_SHARED_LIBRARY_PREFIX "lib") +set(CMAKE_SHARED_LIBRARY_SUFFIX ".dylib") +set(CMAKE_SHARED_MODULE_PREFIX "lib") +set(CMAKE_SHARED_MODULE_SUFFIX ".so") +set(CMAKE_C_COMPILER_ABI ELF) +set(CMAKE_CXX_COMPILER_ABI ELF) +set(CMAKE_C_HAS_ISYSROOT 1) +set(CMAKE_CXX_HAS_ISYSROOT 1) +set(CMAKE_MODULE_EXISTS 1) +set(CMAKE_DL_LIBS "") +set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ") +set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ") +set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}") +set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}") + +if(ARCHS MATCHES "((^|;|, )(arm64|arm64e|x86_64))+") + set(CMAKE_C_SIZEOF_DATA_PTR 8) + set(CMAKE_CXX_SIZEOF_DATA_PTR 8) + if(ARCHS MATCHES "((^|;|, )(arm64|arm64e))+") + set(CMAKE_SYSTEM_PROCESSOR "aarch64") + else() + set(CMAKE_SYSTEM_PROCESSOR "x86_64") + endif() +else() + set(CMAKE_C_SIZEOF_DATA_PTR 4) + set(CMAKE_CXX_SIZEOF_DATA_PTR 4) + set(CMAKE_SYSTEM_PROCESSOR "arm") +endif() + +# Note that only Xcode 7+ supports the newer more specific: +# -m${SDK_NAME}-version-min flags, older versions of Xcode use: +# -m(ios/ios-simulator)-version-min instead. +if(${CMAKE_VERSION} VERSION_LESS "3.11") + if(PLATFORM_INT STREQUAL "OS" OR PLATFORM_INT STREQUAL "OS64") + if(XCODE_VERSION_INT VERSION_LESS 7.0) + set(SDK_NAME_VERSION_FLAGS + "-mios-version-min=${DEPLOYMENT_TARGET}") + else() + # Xcode 7.0+ uses flags we can build directly from SDK_NAME. + set(SDK_NAME_VERSION_FLAGS + "-m${SDK_NAME}-version-min=${DEPLOYMENT_TARGET}") + endif() + elseif(PLATFORM_INT STREQUAL "TVOS") + set(SDK_NAME_VERSION_FLAGS + "-mtvos-version-min=${DEPLOYMENT_TARGET}") + elseif(PLATFORM_INT STREQUAL "SIMULATOR_TVOS") + set(SDK_NAME_VERSION_FLAGS + "-mtvos-simulator-version-min=${DEPLOYMENT_TARGET}") +elseif(PLATFORM_INT STREQUAL "SIMULATORARM64_TVOS") + set(SDK_NAME_VERSION_FLAGS + "-mtvos-simulator-version-min=${DEPLOYMENT_TARGET}") + elseif(PLATFORM_INT STREQUAL "WATCHOS") + set(SDK_NAME_VERSION_FLAGS + "-mwatchos-version-min=${DEPLOYMENT_TARGET}") + elseif(PLATFORM_INT STREQUAL "SIMULATOR_WATCHOS") + set(SDK_NAME_VERSION_FLAGS + "-mwatchos-simulator-version-min=${DEPLOYMENT_TARGET}") + elseif(PLATFORM_INT STREQUAL "MAC") + set(SDK_NAME_VERSION_FLAGS + "-mmacosx-version-min=${DEPLOYMENT_TARGET}") + else() + # SIMULATOR or SIMULATOR64 both use -mios-simulator-version-min. 
+ set(SDK_NAME_VERSION_FLAGS + "-mios-simulator-version-min=${DEPLOYMENT_TARGET}") + endif() +elseif(NOT PLATFORM_INT MATCHES "^MAC_CATALYST") + # Newer versions of CMake sets the version min flags correctly, skip this for Mac Catalyst targets + set(CMAKE_OSX_DEPLOYMENT_TARGET ${DEPLOYMENT_TARGET}) +endif() + +if(DEFINED APPLE_TARGET_TRIPLE_INT) + set(APPLE_TARGET_TRIPLE ${APPLE_TARGET_TRIPLE_INT} CACHE INTERNAL "") + set(CMAKE_C_COMPILER_TARGET ${APPLE_TARGET_TRIPLE}) + set(CMAKE_CXX_COMPILER_TARGET ${APPLE_TARGET_TRIPLE}) + set(CMAKE_ASM_COMPILER_TARGET ${APPLE_TARGET_TRIPLE}) +endif() + +if(PLATFORM_INT MATCHES "^MAC_CATALYST") + set(C_TARGET_FLAGS "-isystem ${CMAKE_OSX_SYSROOT_INT}/System/iOSSupport/usr/include -iframework ${CMAKE_OSX_SYSROOT_INT}/System/iOSSupport/System/Library/Frameworks") +endif() + +if(ENABLE_BITCODE_INT) + set(BITCODE "-fembed-bitcode") + set(CMAKE_XCODE_ATTRIBUTE_BITCODE_GENERATION_MODE "bitcode") + set(CMAKE_XCODE_ATTRIBUTE_ENABLE_BITCODE "YES") +else() + set(BITCODE "") + set(CMAKE_XCODE_ATTRIBUTE_ENABLE_BITCODE "NO") +endif() + +if(ENABLE_ARC_INT) + set(FOBJC_ARC "-fobjc-arc") + set(CMAKE_XCODE_ATTRIBUTE_CLANG_ENABLE_OBJC_ARC "YES") +else() + set(FOBJC_ARC "-fno-objc-arc") + set(CMAKE_XCODE_ATTRIBUTE_CLANG_ENABLE_OBJC_ARC "NO") +endif() + +if(NAMED_LANGUAGE_SUPPORT_INT) + set(OBJC_VARS "-fobjc-abi-version=2 -DOBJC_OLD_DISPATCH_PROTOTYPES=0") + set(OBJC_LEGACY_VARS "") +else() + set(OBJC_VARS "") + set(OBJC_LEGACY_VARS "-fobjc-abi-version=2 -DOBJC_OLD_DISPATCH_PROTOTYPES=0") +endif() + +if(NOT ENABLE_VISIBILITY_INT) + foreach(lang ${languages}) + set(CMAKE_${lang}_VISIBILITY_PRESET "hidden" CACHE INTERNAL "") + endforeach() + set(CMAKE_XCODE_ATTRIBUTE_GCC_SYMBOLS_PRIVATE_EXTERN "YES") + set(VISIBILITY "-fvisibility=hidden -fvisibility-inlines-hidden") +else() + foreach(lang ${languages}) + set(CMAKE_${lang}_VISIBILITY_PRESET "default" CACHE INTERNAL "") + endforeach() + set(CMAKE_XCODE_ATTRIBUTE_GCC_SYMBOLS_PRIVATE_EXTERN "NO") + set(VISIBILITY "-fvisibility=default") +endif() + +if(DEFINED APPLE_TARGET_TRIPLE) + set(APPLE_TARGET_TRIPLE_FLAG "-target ${APPLE_TARGET_TRIPLE}") +endif() + +#Check if Xcode generator is used since that will handle these flags automagically +if(CMAKE_GENERATOR MATCHES "Xcode") + message(STATUS "Not setting any manual command-line buildflags, since Xcode is selected as the generator. 
Modifying the Xcode build-settings directly instead.") +else() + set(CMAKE_C_FLAGS "${C_TARGET_FLAGS} ${APPLE_TARGET_TRIPLE_FLAG} ${SDK_NAME_VERSION_FLAGS} ${OBJC_LEGACY_VARS} ${BITCODE} ${VISIBILITY} ${CMAKE_C_FLAGS}") + set(CMAKE_C_FLAGS_DEBUG "-O0 -g ${CMAKE_C_FLAGS_DEBUG}") + set(CMAKE_C_FLAGS_MINSIZEREL "-DNDEBUG -Os ${CMAKE_C_FLAGS_MINSIZEREL}") + set(CMAKE_C_FLAGS_RELWITHDEBINFO "-DNDEBUG -O2 -g ${CMAKE_C_FLAGS_RELWITHDEBINFO}") + set(CMAKE_C_FLAGS_RELEASE "-DNDEBUG -O3 ${CMAKE_C_FLAGS_RELEASE}") + set(CMAKE_CXX_FLAGS "${C_TARGET_FLAGS} ${APPLE_TARGET_TRIPLE_FLAG} ${SDK_NAME_VERSION_FLAGS} ${OBJC_LEGACY_VARS} ${BITCODE} ${VISIBILITY} ${CMAKE_CXX_FLAGS}") + set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g ${CMAKE_CXX_FLAGS_DEBUG}") + set(CMAKE_CXX_FLAGS_MINSIZEREL "-DNDEBUG -Os ${CMAKE_CXX_FLAGS_MINSIZEREL}") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-DNDEBUG -O2 -g ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}") + set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG -O3 ${CMAKE_CXX_FLAGS_RELEASE}") + if(NAMED_LANGUAGE_SUPPORT_INT) + set(CMAKE_OBJC_FLAGS "${C_TARGET_FLAGS} ${APPLE_TARGET_TRIPLE_FLAG} ${SDK_NAME_VERSION_FLAGS} ${BITCODE} ${VISIBILITY} ${FOBJC_ARC} ${OBJC_VARS} ${CMAKE_OBJC_FLAGS}") + set(CMAKE_OBJC_FLAGS_DEBUG "-O0 -g ${CMAKE_OBJC_FLAGS_DEBUG}") + set(CMAKE_OBJC_FLAGS_MINSIZEREL "-DNDEBUG -Os ${CMAKE_OBJC_FLAGS_MINSIZEREL}") + set(CMAKE_OBJC_FLAGS_RELWITHDEBINFO "-DNDEBUG -O2 -g ${CMAKE_OBJC_FLAGS_RELWITHDEBINFO}") + set(CMAKE_OBJC_FLAGS_RELEASE "-DNDEBUG -O3 ${CMAKE_OBJC_FLAGS_RELEASE}") + set(CMAKE_OBJCXX_FLAGS "${C_TARGET_FLAGS} ${APPLE_TARGET_TRIPLE_FLAG} ${SDK_NAME_VERSION_FLAGS} ${BITCODE} ${VISIBILITY} ${FOBJC_ARC} ${OBJC_VARS} ${CMAKE_OBJCXX_FLAGS}") + set(CMAKE_OBJCXX_FLAGS_DEBUG "-O0 -g ${CMAKE_OBJCXX_FLAGS_DEBUG}") + set(CMAKE_OBJCXX_FLAGS_MINSIZEREL "-DNDEBUG -Os ${CMAKE_OBJCXX_FLAGS_MINSIZEREL}") + set(CMAKE_OBJCXX_FLAGS_RELWITHDEBINFO "-DNDEBUG -O2 -g ${CMAKE_OBJCXX_FLAGS_RELWITHDEBINFO}") + set(CMAKE_OBJCXX_FLAGS_RELEASE "-DNDEBUG -O3 ${CMAKE_OBJCXX_FLAGS_RELEASE}") + endif() + set(CMAKE_C_LINK_FLAGS "${C_TARGET_FLAGS} ${SDK_NAME_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_C_LINK_FLAGS}") + set(CMAKE_CXX_LINK_FLAGS "${C_TARGET_FLAGS} ${SDK_NAME_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_CXX_LINK_FLAGS}") + if(NAMED_LANGUAGE_SUPPORT_INT) + set(CMAKE_OBJC_LINK_FLAGS "${C_TARGET_FLAGS} ${SDK_NAME_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_OBJC_LINK_FLAGS}") + set(CMAKE_OBJCXX_LINK_FLAGS "${C_TARGET_FLAGS} ${SDK_NAME_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_OBJCXX_LINK_FLAGS}") + endif() + set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -x assembler-with-cpp -arch ${CMAKE_OSX_ARCHITECTURES} ${APPLE_TARGET_TRIPLE_FLAG}") +endif() + +## Print status messages to inform of the current state +message(STATUS "Configuring ${SDK_NAME} build for platform: ${PLATFORM_INT}, architecture(s): ${ARCHS}") +message(STATUS "Using SDK: ${CMAKE_OSX_SYSROOT_INT}") +message(STATUS "Using C compiler: ${CMAKE_C_COMPILER}") +message(STATUS "Using CXX compiler: ${CMAKE_CXX_COMPILER}") +message(STATUS "Using libtool: ${BUILD_LIBTOOL}") +message(STATUS "Using install name tool: ${CMAKE_INSTALL_NAME_TOOL}") +if(DEFINED APPLE_TARGET_TRIPLE) + message(STATUS "Autoconf target triple: ${APPLE_TARGET_TRIPLE}") +endif() +message(STATUS "Using minimum deployment version: ${DEPLOYMENT_TARGET}" + " (SDK version: ${SDK_VERSION})") +if(MODERN_CMAKE) + message(STATUS "Merging integrated CMake 3.14+ iOS,tvOS,watchOS,macOS toolchain(s) with this toolchain!") + if(PLATFORM_INT MATCHES ".*COMBINED") + message(STATUS "Will combine built 
(static) artifacts into FAT lib...") + endif() +endif() +if(CMAKE_GENERATOR MATCHES "Xcode") + message(STATUS "Using Xcode version: ${XCODE_VERSION_INT}") +endif() +message(STATUS "CMake version: ${CMAKE_VERSION}") +if(DEFINED SDK_NAME_VERSION_FLAGS) + message(STATUS "Using version flags: ${SDK_NAME_VERSION_FLAGS}") +endif() +message(STATUS "Using a data_ptr size of: ${CMAKE_CXX_SIZEOF_DATA_PTR}") +if(ENABLE_BITCODE_INT) + message(STATUS "Bitcode: Enabled") +else() + message(STATUS "Bitcode: Disabled") +endif() + +if(ENABLE_ARC_INT) + message(STATUS "ARC: Enabled") +else() + message(STATUS "ARC: Disabled") +endif() + +if(ENABLE_VISIBILITY_INT) + message(STATUS "Hiding symbols: Disabled") +else() + message(STATUS "Hiding symbols: Enabled") +endif() + +# Set global properties +set_property(GLOBAL PROPERTY PLATFORM "${PLATFORM}") +set_property(GLOBAL PROPERTY APPLE_TARGET_TRIPLE "${APPLE_TARGET_TRIPLE_INT}") +set_property(GLOBAL PROPERTY SDK_VERSION "${SDK_VERSION}") +set_property(GLOBAL PROPERTY XCODE_VERSION "${XCODE_VERSION_INT}") +set_property(GLOBAL PROPERTY OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}") + +# Export configurable variables for the try_compile() command. +set(CMAKE_TRY_COMPILE_PLATFORM_VARIABLES + PLATFORM + XCODE_VERSION_INT + SDK_VERSION + NAMED_LANGUAGE_SUPPORT + DEPLOYMENT_TARGET + CMAKE_DEVELOPER_ROOT + CMAKE_OSX_SYSROOT_INT + ENABLE_BITCODE + ENABLE_ARC + CMAKE_ASM_COMPILER + CMAKE_C_COMPILER + CMAKE_C_COMPILER_TARGET + CMAKE_CXX_COMPILER + CMAKE_CXX_COMPILER_TARGET + BUILD_LIBTOOL + CMAKE_INSTALL_NAME_TOOL + CMAKE_C_FLAGS + CMAKE_C_DEBUG + CMAKE_C_MINSIZEREL + CMAKE_C_RELWITHDEBINFO + CMAKE_C_RELEASE + CMAKE_CXX_FLAGS + CMAKE_CXX_FLAGS_DEBUG + CMAKE_CXX_FLAGS_MINSIZEREL + CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_CXX_FLAGS_RELEASE + CMAKE_C_LINK_FLAGS + CMAKE_CXX_LINK_FLAGS + CMAKE_ASM_FLAGS +) + +if(NAMED_LANGUAGE_SUPPORT_INT) + list(APPEND CMAKE_TRY_COMPILE_PLATFORM_VARIABLES + CMAKE_OBJC_FLAGS + CMAKE_OBJC_DEBUG + CMAKE_OBJC_MINSIZEREL + CMAKE_OBJC_RELWITHDEBINFO + CMAKE_OBJC_RELEASE + CMAKE_OBJCXX_FLAGS + CMAKE_OBJCXX_DEBUG + CMAKE_OBJCXX_MINSIZEREL + CMAKE_OBJCXX_RELWITHDEBINFO + CMAKE_OBJCXX_RELEASE + CMAKE_OBJC_LINK_FLAGS + CMAKE_OBJCXX_LINK_FLAGS + ) +endif() + +set(CMAKE_PLATFORM_HAS_INSTALLNAME 1) +set(CMAKE_SHARED_LINKER_FLAGS "-rpath @executable_path/Frameworks -rpath @loader_path/Frameworks") +set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -Wl,-headerpad_max_install_names") +set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -Wl,-headerpad_max_install_names") +set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,") +set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,") +set(CMAKE_FIND_LIBRARY_SUFFIXES ".tbd" ".dylib" ".so" ".a") +set(CMAKE_SHARED_LIBRARY_SONAME_C_FLAG "-install_name") + +# Set the find root to the SDK developer roots. +# Note: CMAKE_FIND_ROOT_PATH is only useful when cross-compiling. Thus, do not set on macOS builds. +if(NOT PLATFORM_INT MATCHES "^MAC.*$") + list(APPEND CMAKE_FIND_ROOT_PATH "${CMAKE_OSX_SYSROOT_INT}" CACHE INTERNAL "") + set(CMAKE_IGNORE_PATH "/System/Library/Frameworks;/usr/local/lib;/opt/homebrew" CACHE INTERNAL "") +endif() + +# Default to searching for frameworks first. +IF(NOT DEFINED CMAKE_FIND_FRAMEWORK) + set(CMAKE_FIND_FRAMEWORK FIRST) +ENDIF(NOT DEFINED CMAKE_FIND_FRAMEWORK) + +# Set up the default search directories for frameworks. 
+if(PLATFORM_INT MATCHES "^MAC_CATALYST") + set(CMAKE_FRAMEWORK_PATH + ${CMAKE_DEVELOPER_ROOT}/Library/PrivateFrameworks + ${CMAKE_OSX_SYSROOT_INT}/System/Library/Frameworks + ${CMAKE_OSX_SYSROOT_INT}/System/iOSSupport/System/Library/Frameworks + ${CMAKE_FRAMEWORK_PATH} CACHE INTERNAL "") +else() + set(CMAKE_FRAMEWORK_PATH + ${CMAKE_DEVELOPER_ROOT}/Library/PrivateFrameworks + ${CMAKE_OSX_SYSROOT_INT}/System/Library/Frameworks + ${CMAKE_FRAMEWORK_PATH} CACHE INTERNAL "") +endif() + +# By default, search both the specified iOS SDK and the remainder of the host filesystem. +if(NOT CMAKE_FIND_ROOT_PATH_MODE_PROGRAM) + set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM BOTH CACHE INTERNAL "") +endif() +if(NOT CMAKE_FIND_ROOT_PATH_MODE_LIBRARY) + set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH CACHE INTERNAL "") +endif() +if(NOT CMAKE_FIND_ROOT_PATH_MODE_INCLUDE) + set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH CACHE INTERNAL "") +endif() +if(NOT CMAKE_FIND_ROOT_PATH_MODE_PACKAGE) + set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE BOTH CACHE INTERNAL "") +endif() + +# +# Some helper-macros below to simplify and beautify the CMakeFile +# + +# This little macro lets you set any Xcode specific property. +macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE XCODE_RELVERSION) + set(XCODE_RELVERSION_I "${XCODE_RELVERSION}") + if(XCODE_RELVERSION_I STREQUAL "All") + set_property(TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} "${XCODE_VALUE}") + else() + set_property(TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY}[variant=${XCODE_RELVERSION_I}] "${XCODE_VALUE}") + endif() +endmacro(set_xcode_property) + +# This macro lets you find executable programs on the host system. +macro(find_host_package) + set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) + set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER) + set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER) + set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE NEVER) + set(_TOOLCHAIN_IOS ${IOS}) + set(IOS OFF) + find_package(${ARGN}) + set(IOS ${_TOOLCHAIN_IOS}) + set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM BOTH) + set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) + set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH) + set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE BOTH) +endmacro(find_host_package) \ No newline at end of file diff --git a/scripts/ci/install_mkl.sh b/scripts/ci/install_mkl.sh index 4037396e7..2333a75a7 100755 --- a/scripts/ci/install_mkl.sh +++ b/scripts/ci/install_mkl.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html -wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB" | sudo apt-key add - +wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB" | sudo apt-key add - sudo sh -c "echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list" sudo apt-get update -o Dir::Etc::sourcelist="/etc/apt/sources.list.d/intel-mkl.list" sudo apt-get install --no-install-recommends intel-mkl-64bit-2020.0-088 diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt index 6cf46533f..f57e746e1 100644 --- a/src/3rd_party/CMakeLists.txt +++ b/src/3rd_party/CMakeLists.txt @@ -9,10 +9,23 @@ add_subdirectory(./faiss) include_directories(./faiss) if(COMPILE_CPU) - if(NOT GENERATE_MARIAN_INSTALL_TARGETS) + # intgemm is not ARM-compatible. 
do not build it if we are on ARM + if(NOT GENERATE_MARIAN_INSTALL_TARGETS AND NOT ARM) set(INTGEMM_DONT_BUILD_TESTS ON CACHE BOOL "Disable intgemm tests") add_subdirectory(./intgemm) endif() + + # the default codepath does not use ruy so there is no need to add these directories + # to the build unless it is explicitly enabled. RUY is intended mostly for ARM support + if(USE_RUY_SGEMM) + set(CPUINFO_BUILD_MOCK_TESTS OFF CACHE BOOL " " FORCE) + set(CPUINFO_BUILD_BENCHMARKS OFF CACHE BOOL " " FORCE) + set(CPUINFO_BUILD_PKG_CONFIG OFF CACHE BOOL " " FORCE) + set(CPUINFO_BUILD_UNIT_TESTS OFF CACHE BOOL " " FORCE) + set(CPUINFO_BUILD_TOOLS OFF CACHE BOOL " " FORCE) + add_subdirectory(ruy/third_party/cpuinfo EXCLUDE_FROM_ALL) + add_subdirectory(ruy EXCLUDE_FROM_ALL) + endif(USE_RUY_SGEMM) endif(COMPILE_CPU) if(USE_FBGEMM) diff --git a/src/3rd_party/faiss/VectorTransform.cpp b/src/3rd_party/faiss/VectorTransform.cpp index 103b0910e..22fecbf78 100644 --- a/src/3rd_party/faiss/VectorTransform.cpp +++ b/src/3rd_party/faiss/VectorTransform.cpp @@ -19,6 +19,12 @@ using namespace faiss; +#ifdef ARM +// we use various AVX/SSE instructions in this file +// simd_utils translates these into ARM/NEON compatible instructions +#include "3rd_party/simd_utils/simd_utils.h" +#endif + extern "C" { diff --git a/src/3rd_party/faiss/VectorTransform.h b/src/3rd_party/faiss/VectorTransform.h index 5fc96bc46..e8689bc15 100644 --- a/src/3rd_party/faiss/VectorTransform.h +++ b/src/3rd_party/faiss/VectorTransform.h @@ -19,8 +19,10 @@ #include #ifdef __APPLE__ +#ifndef ARM #include #endif +#endif namespace faiss { diff --git a/src/3rd_party/ruy b/src/3rd_party/ruy new file mode 160000 index 000000000..c04e5e52a --- /dev/null +++ b/src/3rd_party/ruy @@ -0,0 +1 @@ +Subproject commit c04e5e52ae6b144f74ac032652e3c538bda15c9b diff --git a/src/3rd_party/sentencepiece b/src/3rd_party/sentencepiece index fb6f8e408..b09054fdd 160000 --- a/src/3rd_party/sentencepiece +++ b/src/3rd_party/sentencepiece @@ -1 +1 @@ -Subproject commit fb6f8e408d2078ebfedc8ccc33985fef03c50b0e +Subproject commit b09054fdd0ac18f1377b5f7c68807a86faada6c8 diff --git a/src/3rd_party/simd_utils b/src/3rd_party/simd_utils new file mode 160000 index 000000000..fe9fa82c9 --- /dev/null +++ b/src/3rd_party/simd_utils @@ -0,0 +1 @@ +Subproject commit fe9fa82c9d7e6297913bc6c98fe079acc6e157e9 diff --git a/src/common/binary.cpp b/src/common/binary.cpp index 0041275c5..fa98ef8bc 100644 --- a/src/common/binary.cpp +++ b/src/common/binary.cpp @@ -109,7 +109,7 @@ void loadItems(const std::string& fileName, std::vector& items) { io::Item getItem(const void* current, const std::string& varName) { std::vector items; - loadItems(current, items); + loadItems(current, items, /*mapped=*/true); for(auto& item : items) if(item.name == varName) diff --git a/src/common/types.h b/src/common/types.h index 7b50bb691..bd67fae71 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -17,7 +17,11 @@ #include #ifndef __CUDACC__ // NVCC is very unreliable when it comes to CPU intrinsics, we hide them completely from NVCC-compiled code -#include + #ifndef ARM + #include + #else + #include "3rd_party/simd_utils/simd_utils.h" + #endif #endif #ifdef __CUDACC__ // nvcc is compiling this code diff --git a/src/functional/operators.h b/src/functional/operators.h index 3628fdcb9..e7dcea3c6 100644 --- a/src/functional/operators.h +++ b/src/functional/operators.h @@ -217,8 +217,11 @@ struct Ops { // __CUDACC__ is defined when compiling with NVCC regardless of device type // __CUDA_ARCH__ is 
defined when compiling device (GPU) code #ifndef __CUDACC__ - +#ifndef ARM #include "3rd_party/sse_mathfun.h" +#else +#include "3rd_party/simd_utils/simd_utils.h" +#endif namespace marian { namespace functional { diff --git a/src/tensors/cpu/expression_graph_packable.h b/src/tensors/cpu/expression_graph_packable.h index f1a68210e..db526b626 100644 --- a/src/tensors/cpu/expression_graph_packable.h +++ b/src/tensors/cpu/expression_graph_packable.h @@ -152,7 +152,7 @@ class ExpressionGraphPackable : public ExpressionGraph { #endif } else if (isIntgemm(gemmElementType) && (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2 /* || pName.find("Wemb") != std::string::npos*/)) { -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) using cpu::integer::cols; using cpu::integer::rows; auto allocator = New(getBackend()); diff --git a/src/tensors/cpu/fbgemm/packed_gemm.cpp b/src/tensors/cpu/fbgemm/packed_gemm.cpp index dd81d0f7f..23ed559f1 100644 --- a/src/tensors/cpu/fbgemm/packed_gemm.cpp +++ b/src/tensors/cpu/fbgemm/packed_gemm.cpp @@ -2,16 +2,16 @@ #include "tensors/tensor_allocator.h" #include "tensors/tensor_operators.h" -#include -#include -#include -#include #include #include #include //#include #if USE_FBGEMM +#include +#include +#include +#include #ifdef _MSC_VER #pragma warning(disable: 4505) // 'fbgemmAlignedAlloc' in fbgemm.h: unreferenced local function has been removed (missing 'static inline') #pragma warning(disable: 4251) // 'fbgemm::CompressedSparseColumn::colptr_': class 'std::vector>' needs to have dll-interface to be used by clients of class 'fbgemm::CompressedSparseColumn' diff --git a/src/tensors/cpu/integer_common.h b/src/tensors/cpu/integer_common.h index f4e632b5c..8a00a7870 100644 --- a/src/tensors/cpu/integer_common.h +++ b/src/tensors/cpu/integer_common.h @@ -5,7 +5,7 @@ #include "tensors/cpu/aligned.h" #include "common/io_item.h" -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) #include "3rd_party/intgemm/intgemm/intgemm.h" #else namespace intgemm { @@ -31,10 +31,12 @@ namespace intgemm { } #endif +#ifndef ARM #include #include #include #include +#endif #include #include @@ -98,7 +100,7 @@ template <> struct intgemm_ { template static inline float& getQuantMult(marian::Tensor val) { -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) ABORT_IF(!isIntgemm(val->type()), "getQuantMult does not work for type {}", val->type()); typedef typename intgemm_::type Integer; return *(reinterpret_cast(val->data() + val->shape().elements())); @@ -109,7 +111,7 @@ static inline float& getQuantMult(marian::Tensor val) { } static inline Type getIntgemmType(Type vtype) { -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) if (vtype == Type::intgemm8) { if (intgemm::kCPU == intgemm::CPUType::AVX512VNNI) { return Type::intgemm8avx512vnni; @@ -142,7 +144,7 @@ static inline Type getIntgemmType(Type vtype) { } static inline bool passOrAbort(Type vtype) { -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) if (vtype == Type::intgemm8 || vtype == Type::intgemm16) { return true; } else if (vtype == Type::intgemm16sse2) { @@ -166,7 +168,7 @@ static inline bool passOrAbort(Type vtype) { template static inline float computeQuantMult(marian::Tensor val) { -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) if(sizeOf(vtype) == 1) return 127.0f / intgemm::MaxAbsolute(val->data(), val->data() + val->shape().elements()); else if(sizeOf(vtype) == 2) @@ -186,7 +188,7 @@ void AddBias(marian::Tensor C, const marian::Tensor Bias); // in our binary format. 
Then we copy the quantizationMultiplier information at the end template void prepareAndTransposeB(io::Item& item, const char * input) { -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) typedef typename intgemm_::type Integer; Integer * output_tensor = reinterpret_cast(&(*item.bytes.begin())); // Sometimes we will end up with misaligned intput (and output) so we can't use them directly. diff --git a/src/tensors/cpu/intgemm_interface.h b/src/tensors/cpu/intgemm_interface.h index 88408aa18..80784e0f6 100644 --- a/src/tensors/cpu/intgemm_interface.h +++ b/src/tensors/cpu/intgemm_interface.h @@ -9,7 +9,7 @@ namespace marian { namespace cpu { namespace integer { -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) /* * Prepare an activation matrix into intgemm8/16 format. For now the activation matrix is just quantized. * Expr input: The input tensor @@ -45,7 +45,7 @@ static inline Expr prepareA(Expr a) { */ template static inline Expr affineOrDotTyped(Expr a, Expr bQuant, Expr bias, bool transA, bool /*transB*/, float scale) { -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) ABORT_IF(!isFloat(a->value_type()), "Intgemm expects type of A to be float32 not {}", a->value_type()); ABORT_IF(!isIntgemm(bQuant->value_type()), "Intgemm expects type of B to be a variant of intgemm not {}", bQuant->value_type()); diff --git a/src/tensors/cpu/prod.cpp b/src/tensors/cpu/prod.cpp index 8fcca924b..639027d05 100755 --- a/src/tensors/cpu/prod.cpp +++ b/src/tensors/cpu/prod.cpp @@ -7,14 +7,6 @@ #include "tensors/tensor.h" #include "tensors/tensor_allocator.h" -#if MKL_FOUND -#include -#else -#if BLAS_FOUND -#include -#endif -#endif - #include "integer_common.h" #include "prod_blas.h" diff --git a/src/tensors/cpu/prod_blas.h b/src/tensors/cpu/prod_blas.h index a591fdd26..a281aa7bf 100644 --- a/src/tensors/cpu/prod_blas.h +++ b/src/tensors/cpu/prod_blas.h @@ -1,11 +1,117 @@ +#pragma once #if MKL_FOUND -#include -#else -#if BLAS_FOUND -#include -#endif + #include +#elif BLAS_FOUND + #include +#elif USE_RUY_SGEMM +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcomment" + #include "ruy/ruy.h" + #include "ruy/system_aligned_alloc.h" +#pragma GCC pop #endif +#if USE_RUY_SGEMM +// AlignedVector allocates aligned memory and cleans up after itself. RAII +// wrapper similar to intgemm's AlignedVector. +template +class AlignedVector { +public: + AlignedVector(size_t num_elem) + : size_(num_elem), + storage_(reinterpret_cast(ruy::detail::SystemAlignedAlloc(sizeof(T) * num_elem))) {} + + T *begin() { return storage_; } + T *data() { return storage_; } + size_t size() const { return size_; } + size_t memSize() const { return sizeof(T) * size_; } + + // Forbid copy + AlignedVector(const AlignedVector &) = delete; + AlignedVector &operator=(const AlignedVector &) = delete; + + ~AlignedVector() { ruy::detail::SystemAlignedFree(reinterpret_cast(storage_)); } + +private: + size_t size_; + T *storage_; +}; + + +inline void GemmRuy(const bool transA, + const bool transB, + const int M, + const int N, + const int K, + const float alpha, + const float *A, + const int lda, + const float *B, + const int ldb, + const float beta, + float *C, + const int ldc) { + ruy::Context context; + + // If we need to transpose, we can swap dimensions in layout claim the matrix + // is just column-major. Set ordering so transpose. + const auto orderA = (transA ? ruy::Order::kColMajor : ruy::Order::kRowMajor); + const auto orderB = (transB ? 
ruy::Order::kColMajor : ruy::Order::kRowMajor); + + ruy::Matrix lhs; + ruy::MakeSimpleLayout(M, K, orderA, lhs.mutable_layout()); + lhs.set_data(A); + + ruy::Matrix rhs; + ruy::MakeSimpleLayout(K, N, orderB, rhs.mutable_layout()); + rhs.set_data(B); + + ruy::Matrix dst; + ruy::MakeSimpleLayout(M, N, ruy::Order::kRowMajor, dst.mutable_layout()); + + if(beta == 0) { + // For beta = 0, we want to avoid the additional allocation. This is a + // large amount of our inference use-cases. sgemm is called with `beta` for + // accumulating gradients in backpropogation, which is 0.0 during + // inference. + + dst.set_data(C); + ruy::MulParams mul_params; + ruy::Mul(lhs, rhs, mul_params, &context, &dst); + + if(alpha != 1.0) { + // Write out C as C = alpha * [op(A) * op(B)] + beta * C + // Can we expect the compiler to autovectorize this? + // TODO: Come back and explicitly use SIMD. + const size_t size = M * N; + const float *opA_opB = C; // Alias. + for(size_t i = 0; i < size; i++) { + C[i] = alpha * opA_opB[i]; + } + } + + } else { + // No multiply-add in Ruy + // See also: https://github.com/google/ruy/issues/307 + + AlignedVector intermediate(M * N); + dst.set_data(intermediate.data()); + ruy::MulParams mul_params; + ruy::Mul(lhs, rhs, mul_params, &context, &dst); + + // Write out C as C = alpha * [op(A) * op(B)] + beta * C + // Can we expect the compiler to autovectorize this? + // TODO: Come back and explicitly use SIMD. + const size_t size = M * N; + const float *opA_opB = intermediate.data(); + for(size_t i = 0; i < size; i++) { + C[i] = alpha * opA_opB[i] + beta * C[i]; + } + } +} + +#endif // RUY_SGEMM + inline void sgemm(bool transA, bool transB, int rows_a, @@ -34,6 +140,20 @@ inline void sgemm(bool transA, beta, c, ldc); +#elif USE_RUY_SGEMM + GemmRuy(transA, + transB, + rows_a, + rows_b, + width, + alpha, + a, + lda, + b, + ldb, + beta, + c, + ldc); #else transA; transB; rows_a; rows_b; width; alpha; a; lda; b; ldb; beta; c; ldc; // make compiler happy ABORT("Marian must be compiled with a BLAS library"); diff --git a/src/translator/translator.h b/src/translator/translator.h index b15683867..28ff7b0c2 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -232,7 +232,6 @@ class TranslateService : public ModelServiceTask { std::vector> modelWeights_; size_t numDevices_; - std::vector> model_items_; // non-mmap public: virtual ~TranslateService() {} From 01bc6b04435fbe3c523243c12f3000403c30496c Mon Sep 17 00:00:00 2001 From: Thamme Gowda Date: Thu, 22 Feb 2024 19:31:30 +0000 Subject: [PATCH 19/26] Merged PR 33010: support force-decoding for pymarian Translator API support force-decoding for pymarian Translator API --- src/python/tests/regression/test_translate.py | 14 ++++++++++++++ src/translator/translator.h | 9 ++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/python/tests/regression/test_translate.py b/src/python/tests/regression/test_translate.py index b65ec49dc..49ddc14a2 100644 --- a/src/python/tests/regression/test_translate.py +++ b/src/python/tests/regression/test_translate.py @@ -33,3 +33,17 @@ def test_ende(): translator = Translator(**args) hyp = translator.translate("Hello. Good morning.") assert hyp == "Hallo , Guten Morgen ." + + +def test_ende_force_decode(): + + model_file = str(DATA_DIR / 'model.base.npz') + vocab_file = str(DATA_DIR / 'en-de.spm') + args = BASE_ARGS | dict(models=model_file, vocabs=[vocab_file, vocab_file], quiet=True) + translator = Translator(**args) + hyp = translator.translate("Hello. 
Good morning.") + assert hyp == "Hallo , Guten Morgen ." + + force_decode_config = dict(force_decode=True, tsv=True, tsv_fields=2) + hyp = translator.translate("Hello. Good morning.\tIsch", **force_decode_config) + assert hyp == "Isch am Guten Morgen ." diff --git a/src/translator/translator.h b/src/translator/translator.h index 28ff7b0c2..f3c1ac549 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -227,6 +227,7 @@ class TranslateService : public ModelServiceTask { std::vector> srcVocabs_; Ptr trgVocab_; + std::vector> allVocabs_; Ptr shortlistGenerator_; std::vector> modelWeights_; @@ -257,6 +258,8 @@ class TranslateService : public ModelServiceTask { trgVocab_ = New(options_, vocabPaths.size() - 1); trgVocab_->load(vocabPaths.back()); auto srcVocab = srcVocabs_.front(); + allVocabs_.insert(allVocabs_.end(), srcVocabs_.begin(), srcVocabs_.end()); + allVocabs_.emplace_back(trgVocab_); std::vector lshOpts = options_->get>("output-approx-knn", {}); ABORT_IF(lshOpts.size() != 0 && lshOpts.size() != 2, "--output-approx-knn takes 2 parameters"); @@ -333,7 +336,11 @@ class TranslateService : public ModelServiceTask { auto inputs = currentOptions->get("tsv", false) ? convertTsvToLists(input, currentOptions->get("tsv-fields", 1)) : std::vector({input}); - auto corpus_ = New(inputs, srcVocabs_, currentOptions); + // when force-decode is set, include trgVocab_ , otherwise use srcVocabs_ only + // for CLI, force-decode is implemented in data/corpus_base.cpp + auto forceDecoding = currentOptions->get("force-decode", false); + + auto corpus_ = New(inputs, forceDecoding ? allVocabs_ : srcVocabs_, currentOptions); data::BatchGenerator batchGenerator(corpus_, currentOptions, nullptr, /*runAsync=*/false); auto collector = New(currentOptions->get("quiet-translation", false)); From 4d184bbd5fad5356fdeac00aca339d5723de9e43 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 5 Mar 2024 18:05:01 +0000 Subject: [PATCH 20/26] Merged PR 33382: handle cusparse deprecation warnings with cuda 12.3 Cuda seems to have deprecated a whole bunch of its interface and it seems to interact weirdly with some gcc versions. Disabling warnings for this header via dummy include. 
--- src/tensors/gpu/backend.h | 3 ++- src/tensors/gpu/cusparse_include.h | 12 +++++++++ src/tensors/gpu/prod_sparse.cpp | 6 ++--- src/tensors/gpu/prod_sparse_cu10.h | 3 ++- src/tensors/gpu/prod_sparse_cu11.h | 41 +++++++++++++++--------------- 5 files changed, 40 insertions(+), 25 deletions(-) create mode 100644 src/tensors/gpu/cusparse_include.h diff --git a/src/tensors/gpu/backend.h b/src/tensors/gpu/backend.h index 022e4f3fe..1199055b6 100644 --- a/src/tensors/gpu/backend.h +++ b/src/tensors/gpu/backend.h @@ -3,12 +3,13 @@ #include "common/config.h" #include "tensors/backend.h" // note: this is one folder up #include "tensors/gpu/cuda_helpers.h" +#include "tensors/gpu/cusparse_include.h" #include "common/logging.h" #include #include #include -#include + namespace marian { namespace gpu { diff --git a/src/tensors/gpu/cusparse_include.h b/src/tensors/gpu/cusparse_include.h new file mode 100644 index 000000000..b3e68387d --- /dev/null +++ b/src/tensors/gpu/cusparse_include.h @@ -0,0 +1,12 @@ +// header file to include cusparse.h while ignoring deprecated warnings locally + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + +#include + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif \ No newline at end of file diff --git a/src/tensors/gpu/prod_sparse.cpp b/src/tensors/gpu/prod_sparse.cpp index dd66866c7..448a383b9 100644 --- a/src/tensors/gpu/prod_sparse.cpp +++ b/src/tensors/gpu/prod_sparse.cpp @@ -2,15 +2,15 @@ #pragma warning(disable: 4505) // warning C4505: '__float2half_rz': unreferenced local function has been removed (missing 'static inline') #endif -#include -#include - // clang-format off #include "tensors/gpu/prod.h" #include "tensors/gpu/backend.h" #include "tensors/gpu/cuda_helpers.h" +#include "tensors/gpu/cusparse_include.h" // clang-format on +#include + // what a nightmare #if CUDA_VERSION >= 11000 #include "tensors/gpu/prod_sparse_cu11.h" diff --git a/src/tensors/gpu/prod_sparse_cu10.h b/src/tensors/gpu/prod_sparse_cu10.h index d03097e34..0cfdcfed4 100644 --- a/src/tensors/gpu/prod_sparse_cu10.h +++ b/src/tensors/gpu/prod_sparse_cu10.h @@ -1,10 +1,11 @@ #include -#include // clang-format off #include "tensors/gpu/prod.h" #include "tensors/gpu/backend.h" #include "tensors/gpu/cuda_helpers.h" +#include "tensors/gpu/cusparse_include.h" + // clang-format on namespace marian { diff --git a/src/tensors/gpu/prod_sparse_cu11.h b/src/tensors/gpu/prod_sparse_cu11.h index d8659e862..ed3e6e4f5 100644 --- a/src/tensors/gpu/prod_sparse_cu11.h +++ b/src/tensors/gpu/prod_sparse_cu11.h @@ -3,12 +3,13 @@ #endif #include -#include // clang-format off #include "tensors/gpu/prod.h" #include "tensors/gpu/backend.h" #include "tensors/gpu/cuda_helpers.h" +#include "tensors/gpu/cusparse_include.h" + // clang-format on namespace marian { @@ -16,7 +17,7 @@ namespace gpu { // primary template for specialization with different element and compute types template -struct TypedSparseGemm { +struct TypedSparseGemm { static cudaDataType getCudaDataType(const float*) { return CUDA_R_32F; }; static cudaDataType getCudaDataType(const half*) { return CUDA_R_16F; }; @@ -36,7 +37,7 @@ static void CSRProdSwapped(marian::Tensor C, // interpret tensor dimensions as matrix dimensions const auto& shapeC = C->shape(); const auto& shapeD = D->shape(); - + auto colsC = shapeC[-1]; auto rowsC = shapeC.elements() / colsC; @@ -47,7 +48,7 @@ static void CSRProdSwapped(marian::Tensor C, auto colsS = rowsD; auto denseOrder = CUSPARSE_ORDER_COL; - auto 
algorithm = CUSPARSE_SPMM_ALG_DEFAULT; + auto algorithm = CUSPARSE_SPMM_ALG_DEFAULT; std::cerr << shapeC << std::endl; std::cerr << shapeD << std::endl; @@ -60,7 +61,7 @@ static void CSRProdSwapped(marian::Tensor C, auto numOffsets = S_offsets->shape().elements() - 1; // -1 since last value is length ABORT_IF(numOffsets != rowsS, "Unexpected number of rows in CSR argument"); ABORT_IF(S_values->shape() != S_indices->shape(), "CSR values and indices must have the same size"); - + ElementType alpha = 1.0; cusparseSpMatDescr_t descS; @@ -71,19 +72,19 @@ static void CSRProdSwapped(marian::Tensor C, S_offsets->data(), S_indices->data(), S_values ->data(), - CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, getCudaDataType(S_values->data()))); CUSPARSE_CHECK(cusparseCreateDnMat(&descD, - rowsD, colsD, /*ld=*/colsD, - D->data(), - getCudaDataType(D->data()), + rowsD, colsD, /*ld=*/colsD, + D->data(), + getCudaDataType(D->data()), denseOrder)); CUSPARSE_CHECK(cusparseCreateDnMat(&descC, rowsC, colsC, /*ld=*/colsC, - C->data(), - getCudaDataType(C->data()), + C->data(), + getCudaDataType(C->data()), denseOrder)); size_t bufferSize = 0; @@ -136,7 +137,7 @@ static void CSRProd(marian::Tensor C, // interpret tensor dimensions as matrix dimensions const auto& shapeC = C->shape(); const auto& shapeD = D->shape(); - + auto colsC = shapeC[-1]; auto rowsC = shapeC.elements() / colsC; @@ -147,7 +148,7 @@ static void CSRProd(marian::Tensor C, auto colsS = rowsD; auto denseOrder = CUSPARSE_ORDER_ROW; - auto algorithm = CUSPARSE_SPMM_CSR_ALG2; + auto algorithm = CUSPARSE_SPMM_CSR_ALG2; if(transS) std::swap(rowsS, colsS); @@ -157,7 +158,7 @@ static void CSRProd(marian::Tensor C, auto numOffsets = S_offsets->shape().elements() - 1; // -1 since last value is length ABORT_IF(numOffsets != rowsS, "Unexpected number of rows in CSR argument"); ABORT_IF(S_values->shape() != S_indices->shape(), "CSR values and indices must have the same size"); - + ElementType alpha = 1.0; cusparseSpMatDescr_t descS; @@ -168,19 +169,19 @@ static void CSRProd(marian::Tensor C, S_offsets->data(), S_indices->data(), S_values ->data(), - CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, getCudaDataType(S_values->data()))); CUSPARSE_CHECK(cusparseCreateDnMat(&descD, - rowsD, colsD, /*ld=*/colsD, - D->data(), - getCudaDataType(D->data()), + rowsD, colsD, /*ld=*/colsD, + D->data(), + getCudaDataType(D->data()), denseOrder)); CUSPARSE_CHECK(cusparseCreateDnMat(&descC, rowsC, colsC, /*ld=*/colsC, - C->data(), - getCudaDataType(C->data()), + C->data(), + getCudaDataType(C->data()), denseOrder)); size_t bufferSize = 0; From 00ff08680ea7961b0ebbb2a2f15a80bb9c72d1dc Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 2 Apr 2024 13:20:25 +0000 Subject: [PATCH 21/26] Merged PR 33692: Add --no-optimizer-reload option This PR adds a simple `--no-optimizer-reload` that allows to skip restoring optimizer state during continued training or divergence fallback. 
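
Besides the new flag, the same patch adds --normalize-gradient-by-ratio (see the CHANGELOG and graph_group changes below): the gradient is additionally scaled by the ratio of the current effective batch size to its running average, so a sudden growth in batch size temporarily damps the gradient until the average catches up. A rough, illustrative sketch of that scaling (the class name, smoothing constant, and moving-average update are assumptions for illustration; only the multiplicative structure mirrors computeNormalizationFactor in the diff below):

    #include <cstdio>

    struct GradNormalizer {
      bool normalizeGradient = false;        // --normalize-gradient
      bool normalizeByAverageRatio = false;  // --normalize-gradient-by-ratio
      double avgBatchSize = 0.0;             // running average of effective batch size
      double alpha = 0.1;                    // smoothing constant (assumed)

      // Returns the factor the raw gradient is divided by for this update.
      double factor(double effectiveBatchSize) {
        double f = 1.0;
        if(normalizeGradient)
          f *= effectiveBatchSize;           // plain normalization by effective batch size
        if(normalizeByAverageRatio) {
          if(avgBatchSize == 0.0)
            avgBatchSize = effectiveBatchSize;
          avgBatchSize += alpha * (effectiveBatchSize - avgBatchSize);  // assumed EMA update
          f *= effectiveBatchSize / avgBatchSize;  // >1 right after the batch grows
        }
        return f;
      }
    };

    int main() {
      GradNormalizer n;
      n.normalizeByAverageRatio = true;
      // A 4x jump in batch size raises the factor (i.e. shrinks the gradient),
      // then the factor decays back towards 1 as the running average catches up.
      for(double bs : {512.0, 512.0, 2048.0, 2048.0, 2048.0})
        std::printf("batch=%.0f  factor=%.3f\n", bs, n.factor(bs));
      return 0;
    }
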
--- CHANGELOG.md | 2 + VERSION | 2 +- src/common/config_parser.cpp | 10 +- src/graph/node_operators_binary.h | 60 ++++++------ src/tensors/gpu/tensor_operators.cu | 146 +++++++++++++++------------- src/training/graph_group.cpp | 48 +++++++-- src/training/graph_group.h | 15 ++- 7 files changed, 171 insertions(+), 112 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3049e622e..40ba6e0b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### Added +- Added `--normalize-gradient-by-ratio` to mildly adapt gradient magnitude if effective batch size diverges from running average effective batch size. +- Added `--no-optimizer-reload` to skip optimizer state loading during continued training or fallback. - Added `pymarian-eval`, CLI for scoring metrics - Added `--input-reorder pos1 pos2` option to re-ordering inputs internally when reading in batches. This is mostly a model property. - Added `pymarian`: python bindings based on pybind11 diff --git a/VERSION b/VERSION index 329143f69..8d44afc76 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.26 +v1.12.27 diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 9c8b0776f..e3172b47d 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -254,9 +254,9 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { "If non-empty, you need to provide one type per input file (if --train-sets) or per TSV field (if --tsv). " "Usually, there should be no need to provide these on the command line, the model should have them saved.", {}); - cli.add("--input-join-fields", + cli.add("--input-join-fields", "Join input fields (from files or TSV) into a single sequence " - "(mostly used single-encoder models like BLEURT and COMET-KIWI)", + "(mostly used single-encoder models like BLEURT and COMET-KIWI)", false); cli.add("--best-deep", "Use Edinburgh deep RNN configuration (s2s)"); @@ -436,6 +436,8 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { true); cli.add("--no-reload", "Do not load existing model specified in --model arg"); + cli.add("--no-optimizer-reload", + "Do not load existing optimizer state from checkpoint specified in --model arg"); cli.add>("--train-sets,-t", "Paths to training corpora: source target"); cli.add>("--vocabs,-v", @@ -650,7 +652,9 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { cli.add("--check-gradient-nan", "Skip parameter update in case of NaNs in gradient"); cli.add("--normalize-gradient", - "Normalize gradient by multiplying with no. 
devices / total labels (not recommended and to be removed in the future)"); + "Normalize gradient by dividing with efficient batch size"); + cli.add("--normalize-gradient-by-ratio", + "Normalize gradient by scaling with efficient batch size divided by running average batch size"); cli.add>("--train-embedder-rank", "Override model configuration and train a embedding similarity ranker with the model encoder, " diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index 8cf0af1a4..eb3381bb4 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -17,27 +17,27 @@ class LambdaNodeOp : public NaryNodeOp { private: typedef const std::vector& Inputs; typedef std::function LambdaNodeFunctor; - + std::unique_ptr forward_; std::unique_ptr backward_; - + size_t externalHash_; public: - LambdaNodeOp(Inputs inputs, Shape shape, Type type, - LambdaNodeFunctor forward, + LambdaNodeOp(Inputs inputs, Shape shape, Type type, + LambdaNodeFunctor forward, size_t externalHash = 0) - : NaryNodeOp(inputs, shape, type), + : NaryNodeOp(inputs, shape, type), forward_(new LambdaNodeFunctor(forward)), externalHash_(externalHash) { Node::trainable_ = !!backward_; } - LambdaNodeOp(Inputs inputs, Shape shape, Type type, + LambdaNodeOp(Inputs inputs, Shape shape, Type type, LambdaNodeFunctor forward, - LambdaNodeFunctor backward, - size_t externalHash = 0) - : NaryNodeOp(inputs, shape, type), + LambdaNodeFunctor backward, + size_t externalHash = 0) + : NaryNodeOp(inputs, shape, type), forward_(new LambdaNodeFunctor(forward)), backward_(new LambdaNodeFunctor(backward)), externalHash_(externalHash) { @@ -130,7 +130,7 @@ class DotNodeOp : public NaryNodeOp { // df/dB += alpha * dot(op(A).T, D) // beta set to 1.0 in gemm, C = alpha * dot(op(A), op(B)) + beta * C // to sum gradients from different graph parts - + auto isParameter = [](Expr p) { return std::dynamic_pointer_cast(p) != nullptr; }; @@ -276,7 +276,7 @@ class AffineNodeOp : public NaryNodeOp { NodeOps forwardOps() override { using namespace functional; - + return { NodeOp(Affine(val_, graph()->allocator(), @@ -431,8 +431,8 @@ class AffineWithReluNodeOp : public NaryNodeOp { float scalar_; public: - AffineWithReluNodeOp(Expr a, - Expr b, + AffineWithReluNodeOp(Expr a, + Expr b, Expr bias) : NaryNodeOp({a, b, bias}, newShape(a, b, false, false)), transA_(false), @@ -465,7 +465,7 @@ class AffineWithReluNodeOp : public NaryNodeOp { NodeOps forwardOps() override { ABORT_IF(!graph()->isInference(), "AffineWithReluNodeOp currently only supported for inference"); - + return { NodeOp(Affine(val_, graph()->allocator(), @@ -541,12 +541,12 @@ class DotBatchedNodeOp : public NaryNodeOp { ABORT_IF(shapeA[-1] != shapeB[-2], "Batched matrix product requires inner dimensions to match in {}{} * {}{}", std::string(shapeA), transA, std::string(shapeB), transB); - + // create shapes for batch dimensions only auto shapeBatchA = shapeA; shapeBatchA.set(-1, 1); shapeBatchA.set(-2, 1); - + auto shapeBatchB = shapeB; shapeBatchB.set(-1, 1); shapeBatchB.set(-2, 1); @@ -557,7 +557,7 @@ class DotBatchedNodeOp : public NaryNodeOp { // set non-batch dimensions in output shapeOut.set(-2, shapeA[-2]); shapeOut.set(-1, shapeB[-1]); - + return shapeOut; } @@ -579,7 +579,7 @@ class DotBatchedNodeOp : public NaryNodeOp { // df/dB += alpha * dot(op(A).T, D) // beta set to 1.0 in gemm, C = alpha * dot(op(A), op(B)) + beta * C // to sum gradients from different graph parts - + if(!transA_ && transB_) { return 
{NodeOp(ProdBatched(child(0)->grad(), graph()->allocator(), @@ -705,7 +705,7 @@ class DotBatchedLegacyNodeOp : public NaryNodeOp { shapeB.set(-2, b->shape()[-1]); shapeB.set(-1, b->shape()[-2]); } - + Shape outShape = shapeA; outShape.set(-1, shapeB[-1]); ABORT_IF(shapeA[-1] != shapeB[-2], @@ -1101,7 +1101,7 @@ struct ScatterNodeOp : public NaryNodeOp { auto backwardForVal = [this]() { auto allocator = graph()->allocator(); - // create temporary tensor of child(0)->grad().shape() == adj_.shape() + // create temporary tensor of child(0)->grad().shape() == adj_.shape() // copy adj_ to temporary auto grad = child(0)->grad(); auto tempGradMem = allocator->alloc(grad->memory()->size()); @@ -1116,7 +1116,7 @@ struct ScatterNodeOp : public NaryNodeOp { // insert tensor of zeros into temporary Insert(tempGrad, /*source=*/tempZero, /*indices*/child(1)->val(), axis_); - + // add temporary do child(0)->grad() Add(functional::_1, grad, tempGrad); @@ -1127,8 +1127,8 @@ struct ScatterNodeOp : public NaryNodeOp { return { // val - add gradients every where else to gradient of "a" - NodeOp(backwardForVal()), - + NodeOp(backwardForVal()), + NodeOp(/*no gradient*/[](){}), // indices // add gradients on indices to gradient of "source" @@ -1647,7 +1647,7 @@ struct RMSNormalizationOp : public NaryNodeOp { return {NodeOp( RMSNormalization(val_, child(0)->val(), - child(1)->val(), + (children_.size() >= 2) ? child(1)->val() : nullptr, (children_.size() == 3) ? child(2)->val() : nullptr, eps_))}; } @@ -1658,12 +1658,12 @@ struct RMSNormalizationOp : public NaryNodeOp { RMSNormalizationGrad( graph()->allocator(), child(0)->grad(), - child(1)->grad(), + (children_.size() >= 2) ? child(1)->grad() : nullptr, (children_.size() == 3) ? child(2)->grad() : nullptr, adj_, val_, child(0)->val(), - child(1)->val(), + (children_.size() >= 2) ? child(1)->val() : nullptr, (children_.size() == 3) ? child(2)->val() : nullptr, eps_))}; } @@ -1692,9 +1692,9 @@ struct RMSNormalizationOp : public NaryNodeOp { float eps_; }; -// @TODO: rewriting this fixes a bug for this one node. There should be exactly one -// NodeOp per gradient tensor many other nodes have that bug and need to be fixed. -// This will only manifest if the first op is not trainable, then gradients for the +// @TODO: rewriting this fixes a bug for this one node. There should be exactly one +// NodeOp per gradient tensor many other nodes have that bug and need to be fixed. +// This will only manifest if the first op is not trainable, then gradients for the // other nodes might get skipped despite being trainable. struct HighwayNodeOp : public NaryNodeOp { HighwayNodeOp(const std::vector& nodes) : NaryNodeOp(nodes, Shape::broadcast(nodes)) {} @@ -1703,7 +1703,7 @@ struct HighwayNodeOp : public NaryNodeOp { using namespace functional; auto alpha = sigmoid(_4); auto fwd = _1 = alpha * _2 + (1.f - alpha) * _3; - + return { NodeOp(Element(fwd, val_, child(0)->val(), child(1)->val(), child(2)->val())) }; diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index b7c80394b..c21f4d35f 100644 --- a/src/tensors/gpu/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -127,14 +127,14 @@ __global__ void gSanitizeGradient(T* in, int length, } } -// This function is meant to clean gradients, i.e. clip infinities and prune NaNs if required. -// If all NaNs and Infs have been removed we return `true` for indicating a sane gradient. -// If `clipInf` is set, infinities are replaced with the maximum/minimum non-inf value for the tensor. 
+// This function is meant to clean gradients, i.e. clip infinities and prune NaNs if required. +// If all NaNs and Infs have been removed we return `true` for indicating a sane gradient. +// If `clipInf` is set, infinities are replaced with the maximum/minimum non-inf value for the tensor. // In that case infinities do not result in a bad gradient, since they get clipped. -// If `pruneNaN` is set, NaNs are replaced with 0. Since NaNs get removed now they do not result +// If `pruneNaN` is set, NaNs are replaced with 0. Since NaNs get removed now they do not result // in a bad gradient. -// If NaNs or infinities are detected but not removed (either because of `pruneNaN=false` or `clipInf=false`), -// we return `false` indicating a bad gradient. +// If NaNs or infinities are detected but not removed (either because of `pruneNaN=false` or `clipInf=false`), +// we return `false` indicating a bad gradient. bool SanitizeGradient(marian::Tensor in, Ptr allocator, bool pruneNaN, bool clipInf) { cudaSetDevice(in->getDeviceId().no); @@ -180,7 +180,7 @@ __global__ void gCopyCastTo(To* out, const From* in, int length) { if(index < length) { if(add) out[index] += (To)in[index]; - else + else out[index] = (To)in[index]; } } @@ -702,7 +702,7 @@ __global__ void gSoftmax(T* out, // determine max (used below to improve numeric stability) T* _max = _share; - + // @TODO: what's going on here with fp16? _max[threadIdx.x] = -CUDA_FLT_MAX; // mask // find max over column indices that have the same relative column index (=threadIdx.x) across all blocks of columns @@ -857,7 +857,7 @@ __global__ void gLogSoftmax(T* out, // CUDA complains if type or size of shared memory changes, keep size constant. extern __shared__ uint8_t _sharedBytes[]; - T* _share = (T*)_sharedBytes; + T* _share = (T*)_sharedBytes; AccType* _shareAccType = (AccType*)_sharedBytes; T* _max = _share; // 16-bit is ok for max if applicable @@ -892,7 +892,7 @@ __global__ void gLogSoftmax(T* out, _sum[threadIdx.x] = 0.0; for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; - if(id < cols) { + if(id < cols) { // @TODO: would it be faster to recompute it below? Also better numeric stability with float? AccType sm = (AccType)sp[id] - (AccType)max; // subtract max for numeric stability so[id] = (T)sm; // assign numerator to output @@ -1327,7 +1327,7 @@ __global__ void gSelect(T* out, if(index < length) { outShape.dims(index, dims); int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor - dims[axis] = (int)d_indices[idxIndex]; + dims[axis] = (int)d_indices[idxIndex]; int inIndex = inShape.index(dims); if(add) out[index] += in[inIndex]; @@ -1353,12 +1353,12 @@ __global__ void gInsert(T* out, if(index < length) { inShape.dims(index, dims); int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor - dims[axis] = (int)d_indices[idxIndex]; + dims[axis] = (int)d_indices[idxIndex]; int outIndex = outShape.index(dims); if(add) out[outIndex] += in[index]; // this is probably wrong, atomicAdd? 
else - out[outIndex] = in[index]; + out[outIndex] = in[index]; } } } @@ -1385,7 +1385,7 @@ void Select(Tensor out, in->data(), in->shape(), axisGPU, - indices->data(), + indices->data(), indices->shape()); #if COMPILE_FP16 } else if (out->type() == Type::float16) { @@ -1403,7 +1403,7 @@ void Select(Tensor out, in->data(), in->shape(), axisGPU, - indices->data(), + indices->data(), indices->shape()); } else { ABORT("Select not implemented for type {}", out->type()); @@ -1640,7 +1640,7 @@ void GRUFastBackward(Ptr allocator, int blocks = std::min(MAX_BLOCKS, rows); int threads = std::min(MAX_THREADS, cols); - Tensor tempGradBias, tempOnes; + Tensor tempGradBias, tempOnes; MemoryPiece::PtrType tempGradBiasMemory, tempOnesMemory; if(outputs[3]) { Shape memShape = {rows, outputs[3]->shape()[-1]}; @@ -1692,7 +1692,7 @@ void GRUFastBackward(Ptr allocator, // We use this go get rid of the atomicAdd and perform a reduce of the gradients afterwards. // This is much faster for fp16 which seems to have a broken atomicAdd implementation. - // We reduce bias gradients with a matrix multiply, but use a 32-bit compute type. + // We reduce bias gradients with a matrix multiply, but use a 32-bit compute type. // This preserves precision with larger batches where all batch entries reduce into a single vector. // See also AffineNodeOp where we do the same for biases if(outputs[3]) { @@ -1917,7 +1917,7 @@ void CrossEntropyPickBackward(Tensor out, Tensor adj, Tensor a, Tensor indices, } } -// computes the L2Norm of tensor and returns value as flaot on the CPU, +// computes the L2Norm of tensor and returns value as flaot on the CPU, // this is mostly used for diagnostic purposes and gradient clipping float L2Norm(Tensor in, Ptr allocator) { // @TODO: reverse order of arguments cudaSetDevice(in->getDeviceId().no); @@ -1996,7 +1996,7 @@ void Att(Tensor out, Tensor va, Tensor context, Tensor state) { size_t batchDim = context->shape()[-2]; size_t contextWordsDim = context->shape()[-3]; - int blocks = std::min(MAX_BLOCKS, (int)totalRows); + int blocks = std::min(MAX_BLOCKS, (int)totalRows); int threads = std::min(MAX_THREADS, (int)modelDim); int shared = sizeof(float) * threads; @@ -2316,11 +2316,11 @@ __global__ void gLayerNormalizationGrad(T* gradX, AccType lv = (xv - mean) / sigma; AccType gradLv = N * adjv - lv * sum_adj_l[0] - sum_adj[0]; - gradLv /= N * sigma; + gradLv /= N * sigma; AccType gradXv = gammav * gradLv; - // Keep LN gradient between [-1000, 1000] for TensorOps, this currently used for making values fit into fp16. This wil also clip inf. + // Keep LN gradient between [-1000, 1000] for TensorOps, this currently used for making values fit into fp16. This wil also clip inf. // @TODO: to be fixed and removed. AccType sign = functional::Ops::sgn(gradXv); AccType cutoff = (AccType)1000.f; // @TODO: expose this somehow as an option? or better: make obsolete. @@ -2405,7 +2405,7 @@ void LayerNormalizationGrad(Ptr allocator, // We use this go get rid of the atomicAdd and perform a reduce of the gradients afterwards. // This is much faster for fp16 which seems to have a broken atomicAdd implementation. - // We reduce bias gradients with a matrix multiply, but use a 32-bit compute type. + // We reduce bias gradients with a matrix multiply, but use a 32-bit compute type. // This preserves precision with larger batches where all batch entries reduce into a single vector. 
// See also AffineNodeOp where we do the same for biases if(gradGamma) @@ -2462,7 +2462,7 @@ __global__ void gRMSNormalization(T* out, for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id < cols) { - AccType gammav = (AccType)gamma[id]; + AccType gammav = gamma ? (AccType)gamma[id] : (AccType)1.f; AccType xv = (AccType)xRow[id]; AccType betav = beta ? (AccType)beta[id] : (AccType)0.f; AccType rmsNorm = xv / rms; @@ -2492,7 +2492,7 @@ void RMSNormalization(Tensor out, if(out->type() == Type::float32) { gRMSNormalization<<>>(out->data(), in->data(), - gamma->data(), + gamma ? gamma->data() : nullptr, beta ? beta->data() : nullptr, rows, cols, @@ -2501,7 +2501,7 @@ void RMSNormalization(Tensor out, } else if (out->type() == Type::float16) { gRMSNormalization<<>>(out->data(), in->data(), - gamma->data(), + gamma ? gamma->data() : nullptr, beta ? beta->data() : nullptr, rows, cols, @@ -2547,7 +2547,7 @@ __global__ void gRMSNormalizationGrad(T* gradX, AccType xv = xRow[id]; AccType yv = yRow[id]; AccType betav = beta ? (AccType)beta[id] : (AccType)0.f; - AccType gammav = (AccType)gamma[id]; + AccType gammav = gamma ? (AccType)gamma[id] : (AccType)1.f; AccType adjv = adjRow[id]; AccType rv = (yv - betav) / gammav; // go back to RMSNorm(x) from scaled and shifted version for accumulation @@ -2580,16 +2580,16 @@ __global__ void gRMSNormalizationGrad(T* gradX, if(id < cols) { AccType xv = xRow[id]; - AccType gammav = (AccType)gamma[id]; + AccType gammav = gamma ? (AccType)gamma[id] : (AccType)1.f; AccType adjv = adjRow[id]; AccType rmsNorm = xv / rms; AccType gradNorm = N * adjv - rmsNorm * sum_adj_r[0]; - gradNorm /= N * rms; + gradNorm /= N * rms; AccType gradXv = gammav * gradNorm; - // Keep RMSN gradient between [-1000, 1000] for TensorOps, this currently used for making values fit into fp16. This wil also clip inf. + // Keep RMSN gradient between [-1000, 1000] for TensorOps, this currently used for making values fit into fp16. This wil also clip inf. // @TODO: to be fixed and removed. AccType sign = functional::Ops::sgn(gradXv); AccType cutoff = (AccType)1000.f; // @TODO: expose this somehow as an option? or better: make obsolete. 
@@ -2601,10 +2601,12 @@ __global__ void gRMSNormalizationGrad(T* gradX, T* gradXRow = gradX + j * cols; gradXRow[id] += (T)(gradXv); - T* gradGammaRow = gradGamma + j * cols; - // assignment is correct here as this gets summed up - // in the next kernel via matrix product - gradGammaRow[id] = (T)(adjv * rmsNorm); + if(gamma) { + T* gradGammaRow = gradGamma + j * cols; + // assignment is correct here as this gets summed up + // in the next kernel via matrix product + gradGammaRow[id] = (T)(adjv * rmsNorm); + } } } } @@ -2629,24 +2631,32 @@ void RMSNormalizationGrad(Ptr allocator, int threads = std::min(MAX_THREADS, cols); int blocks = std::min(MAX_BLOCKS, rows); - auto tempGradGammaMemory = allocator->alloc(adj->memory()->size()); - Tensor tempGradGamma = TensorBase::New(tempGradGammaMemory, adj->shape(), adj->type(), adj->getBackend()); - tempGradGamma->set(0.f); + MemoryPiece::PtrType tempGradGammaMemory; + Tensor tempGradGamma; + if(gamma) { + tempGradGammaMemory = allocator->alloc(adj->memory()->size()); + tempGradGamma = TensorBase::New(tempGradGammaMemory, adj->shape(), adj->type(), adj->getBackend()); + tempGradGamma->set(0.f); + } - auto tempOnesMemory = allocator->alloc(rows * sizeOf(adj->type())); - Tensor tempOnes = TensorBase::New(tempOnesMemory, Shape({1, rows}), adj->type(), adj->getBackend()); - tempOnes->set(1.f); + MemoryPiece::PtrType tempOnesMemory; + Tensor tempOnes; + if(gamma || beta) { + tempOnesMemory = allocator->alloc(rows * sizeOf(adj->type())); + tempOnes = TensorBase::New(tempOnesMemory, Shape({1, rows}), adj->type(), adj->getBackend()); + tempOnes->set(1.f); + } if(gradX->type() == Type::float32) { int shared = sizeof(float) * threads * 2; gRMSNormalizationGrad<<>>( gradX->data(), - tempGradGamma->data(), + gamma ? tempGradGamma->data() : nullptr, adj->data(), y->data(), x->data(), - gamma->data(), - (beta) ? beta->data() : nullptr, + gamma ? gamma->data() : nullptr, + beta ? beta->data() : nullptr, rows, cols, eps); @@ -2656,12 +2666,12 @@ void RMSNormalizationGrad(Ptr allocator, int shared = sizeof(float) * threads * 2; gRMSNormalizationGrad<<>>( gradX->data(), - tempGradGamma->data(), + gamma ? tempGradGamma->data() : nullptr, adj->data(), y->data(), x->data(), - gamma->data(), - (beta) ? beta->data() : nullptr, + gamma ? gamma->data() : nullptr, + beta ? beta->data() : nullptr, rows, cols, eps); @@ -2672,16 +2682,20 @@ void RMSNormalizationGrad(Ptr allocator, // We use this go get rid of the atomicAdd and perform a reduce of the gradients afterwards. // This is much faster for fp16 which seems to have a broken atomicAdd implementation. - // We reduce bias gradients with a matrix multiply, but use a 32-bit compute type. + // We reduce bias gradients with a matrix multiply, but use a 32-bit compute type. // This preserves precision with larger batches where all batch entries reduce into a single vector. 
// See also AffineNodeOp where we do the same for biases - gpu::Prod(gradGamma, tempOnes, tempGradGamma, false, false, 1, 1, Type::float32); // beta set to one to add + if(gamma) { + gpu::Prod(gradGamma, tempOnes, tempGradGamma, false, false, 1, 1, Type::float32); // beta set to one to add + allocator->free(tempGradGammaMemory); + } - if(gradBeta) // dC/dbeta = adj - inverse broadcasting (reduction) + if(beta) { // dC/dbeta = adj - inverse broadcasting (reduction) gpu::Prod(gradBeta, tempOnes, adj, false, false, 1, 1, Type::float32); // beta set to one to add + } - allocator->free(tempGradGammaMemory); - allocator->free(tempOnesMemory); + if(tempOnes) + allocator->free(tempOnesMemory); } @@ -3421,16 +3435,16 @@ __global__ void Float2Bit(const float *in, uint32_t *out, int batch, int dim, in int batchIdx = blockIdx.x; const float *inBatchOffset = in + batchIdx * dim; uint32_t *outBatchOffset = out + batchIdx * outDim; - + int outDimIdx = threadIdx.x; while (outDimIdx < outDim) { const float *inDimOffset = inBatchOffset + outDimIdx * 32; uint32_t &outDimOffset = outBatchOffset[outDimIdx]; uint32_t outVal = 0; uint32_t mask = 1; - + for (int bitIdx = 0; bitIdx < 32; ++bitIdx) { - if (inDimOffset[bitIdx] >= 0) + if (inDimOffset[bitIdx] >= 0) outVal |= mask; mask <<= 1; @@ -3458,12 +3472,12 @@ void Float2Bit(marian::Tensor output, const marian::Tensor input) ////////////////////////////////////////////////////////////////////////////////////////// // Calc hamming distance between input and weight hash. Return sorted indices and counts accoding to counting sort algo // https://www.geeksforgeeks.org/counting-sort/ -__global__ void HammmingAndSort(const uint32_t *weightHash, +__global__ void HammmingAndSort(const uint32_t *weightHash, const uint32_t *inputHash, uint16_t *hamming, - uint32_t *outCounts, - uint32_t *outIdx, - uint32_t kBest, uint16_t minVal, uint16_t maxVal, uint16_t range, + uint32_t *outCounts, + uint32_t *outIdx, + uint32_t kBest, uint16_t minVal, uint16_t maxVal, uint16_t range, int hashDim, int dim, int batch) { extern __shared__ uint32_t sharedCounts[]; @@ -3471,8 +3485,8 @@ __global__ void HammmingAndSort(const uint32_t *weightHash, int batchIdx = blockIdx.x; uint32_t *stopVal = sharedCounts + range; - uint16_t *hammingBatchOffset = hamming - ? hamming + batchIdx * dim + uint16_t *hammingBatchOffset = hamming + ? hamming + batchIdx * dim : (uint16_t*) (sharedCounts + range); uint32_t *outCountsBatchOffset = outCounts ? outCounts + batchIdx * kBest : nullptr; @@ -3550,7 +3564,7 @@ __global__ void HammmingAndSort(const uint32_t *weightHash, uint32_t countIdx = val - minVal; assert(countIdx < range); uint32_t &outIdx = sharedCounts[countIdx]; - + if (outIdx != NPP_MAX_32U) { uint32_t prevOutIdx; // Not supported in Maxwells or older @@ -3576,10 +3590,10 @@ __global__ void HammmingAndSort(const uint32_t *weightHash, // Calc hamming distance between input and weight hash. 
Return sorted indices and counts accoding to counting sort algo // https://www.geeksforgeeks.org/counting-sort/ void HammmingAndSort(marian::Tensor outIdx, marian::Tensor outCounts, - const marian::Tensor weightHash, + const marian::Tensor weightHash, const marian::Tensor inputHash, - uint32_t kBest, uint16_t minVal, uint16_t maxVal, - marian::Ptr &alloc, + uint32_t kBest, uint16_t minVal, uint16_t maxVal, + marian::Ptr &alloc, marian::Ptr &backend) { size_t SHARED_MEM_SIZE = 48000; @@ -3599,7 +3613,7 @@ void HammmingAndSort(marian::Tensor outIdx, marian::Tensor outCounts, size_t mem = range * sizeof(uint32_t) // counts + sizeof(uint32_t) // stopval + dim * sizeof(uint16_t); // hamming; - + marian::Tensor hamming; if (mem > SHARED_MEM_SIZE) { // shared memory too small. Write haming distance to global mem instead @@ -3613,12 +3627,12 @@ void HammmingAndSort(marian::Tensor outIdx, marian::Tensor outCounts, } HammmingAndSort<<>> - (weightHash->data(), + (weightHash->data(), inputHash->data(), hamming ? hamming->data() : nullptr, outCounts ? outCounts->data() : nullptr, outIdx ? outIdx->data() : nullptr, - kBest, minVal, maxVal, range, + kBest, minVal, maxVal, range, hashDim, dim, inputBatch); CUDA_CHECK(cudaGetLastError()); diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp index efada03ae..dbe6a7782 100644 --- a/src/training/graph_group.cpp +++ b/src/training/graph_group.cpp @@ -8,6 +8,10 @@ GraphGroup::GraphGroup(Ptr options, Ptr mpi) devices_(Config::getDevices(options, mpi->myMPIRank(), mpi->numMPIProcesses())), shardingMode_(getShardingMode(options_, mpi)), mbRoundUp_(options_->get("mini-batch-round-up", true)) { + + normalizeGradient_ = options_->get("normalize-gradient", false); + normalizeGradientByAverageRatio_ = options_->get("normalize-gradient-by-ratio", false); + if(options_->hasAndNotEmpty("cost-scaling")) { auto vcs = options_->get>("cost-scaling"); @@ -234,14 +238,23 @@ float GraphGroup::executeAndCollectNorm(const std::functionget("normalize-gradient")) - normalizationFactor *= updateTrgWords; + if(normalizeGradient_) + normalizationFactor *= effectiveBatchSize; + + if(normalizeGradientByAverageRatio_) { + // keep track of average effective batch size + updateAverageEffectiveBatchSize(effectiveBatchSize); + // this slightly adapts the gradient magnitude if the batch size changes drastically, + // in practice this will only matter if we grow the batch in larger steps. In that case + // the gradient magnitude is reduced until after a couple of updates that goes back to ~1. + normalizationFactor *= effectiveBatchSize / getAverageEffectiveBatchSize(); + } if(!isFinite(gNorm)) // we are checking the sanity of the gradient elsewhere return normalizationFactor; @@ -253,7 +266,7 @@ float GraphGroup::computeNormalizationFactor(float gNorm, size_t updateTrgWords) // Normalize gradient norm w.r.t. 
number of labels in batch for statistics, // there should be no gradient normalization before this point, @TODO: check this - gNorm = gNorm / updateTrgWords; + gNorm = gNorm / effectiveBatchSize; size_t window; float gNormAvgTransform, gNormVarTransform, gNormTransform, gNormAvg; if(dynamicGradientScalingUseLogs_) { @@ -368,8 +381,13 @@ void GraphGroup::load(const OptimizerBase::ScatterStateFunc& scatterFn) { models_[i++]->load(graph, modelWeights_, markReloaded); } - // try to restore everything from checkpoint now - loadOptimizerState(modelFileName, scatterFn); + bool noOptimizerReload = options_->get("no-optimizer-reload", false); + if(noOptimizerReload) { + LOG(info, "--no-optimizer-reload is specified, we are skipping optimizer state restoration"); + } else { + // try to restore everything from checkpoint now + loadOptimizerState(modelFileName, scatterFn); + } // @TODO: run another graph->forward() to allocate the weights from the checkpoint? // then we might not need to keep modelWeights_ around. @@ -673,15 +691,27 @@ Ptr GraphGroup::collectStats(Ptr graph, } void GraphGroup::setTypicalTrgBatchWords(size_t typicalTrgBatchWords) { // needed for dynamic MB scaling - typicalTrgBatchWords_ = (double)typicalTrgBatchWords; + typicalTrgBatchWords_ = (float)typicalTrgBatchWords; } -double GraphGroup::getTypicalTrgBatchWords() { +float GraphGroup::getTypicalTrgBatchWords() { return typicalTrgBatchWords_; } void GraphGroup::updateAverageTrgBatchWords(size_t trgBatchWords) { - typicalTrgBatchWords_ = 0.99 * typicalTrgBatchWords_ + 0.01 * (double)trgBatchWords; // record a running average of the batch size, factors are chosen empirically. + typicalTrgBatchWords_ = 0.99f * typicalTrgBatchWords_ + 0.01f * (float)trgBatchWords; // record a running average of the batch size, factors are chosen empirically. 
+} + +float GraphGroup::getAverageEffectiveBatchSize() { + return averageEffectiveBatchSize_; +} + +void GraphGroup::updateAverageEffectiveBatchSize(size_t effectiveBatchSize) { + if(averageEffectiveBatchSize_ == 0) + averageEffectiveBatchSize_ = (float)effectiveBatchSize; + + // record a running average of the effective batch size + averageEffectiveBatchSize_ = 0.9f * averageEffectiveBatchSize_ + 0.1f * (float)effectiveBatchSize; } size_t GraphGroup::numberOfInputFiles() { diff --git a/src/training/graph_group.h b/src/training/graph_group.h index 9f70ed81b..1a2794571 100644 --- a/src/training/graph_group.h +++ b/src/training/graph_group.h @@ -50,7 +50,8 @@ class GraphGroup { Ptr scheduler_; // scheduler that keeps track of how much has been processed bool finalized_{false}; // 'true' if training has completed (further updates are no longer allowed) - double typicalTrgBatchWords_{0}; // for dynamic batch sizing: typical batch size in words + float typicalTrgBatchWords_{0}; // for dynamic batch sizing: typical batch size in words + float averageEffectiveBatchSize_{0}; // record average effective batch size bool mbRoundUp_{true}; // round up batches for more efficient training but can make batch size less stable, disable with --mini-batch-round-up=false bool costScaling_{false}; @@ -64,6 +65,9 @@ class GraphGroup { bool checkGradientNan_{false}; + bool normalizeGradient_{false}; + bool normalizeGradientByAverageRatio_{true}; + bool dynamicGradientScaling_{false}; float dynamicGradientScalingFactor_{2.f}; bool dynamicGradientScalingUseLogs_{false}; @@ -133,7 +137,7 @@ class GraphGroup { float checkNanOrNorm(size_t i, size_t begin, size_t end); float executeAndCollectNorm(const std::function& task); - float computeNormalizationFactor(float gNorm, size_t updateTrgWords); + float computeNormalizationFactor(float gNorm, size_t effectiveBatchSize); /** * Determine maximal batch size that can fit into the given workspace @@ -152,9 +156,14 @@ class GraphGroup { virtual Ptr collectStats(const std::vector>& vocabs) = 0; + // used to estimate the number of words in a batch and figure out statistics for batch growing etc. void setTypicalTrgBatchWords(size_t typicalTrgBatchWords); - double getTypicalTrgBatchWords(); + float getTypicalTrgBatchWords(); void updateAverageTrgBatchWords(size_t trgBatchWords); + + // similar to above but counts the number of labels including delayed updates. This is used for gradient normalization. + float getAverageEffectiveBatchSize(); + void updateAverageEffectiveBatchSize(size_t effectiveBatchSize); }; } // namespace marian From 58a9150281a6cc2fef4324cc76124361f7d012ea Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 16 Apr 2024 14:59:45 +0000 Subject: [PATCH 22/26] Merged PR 33803: Fixes to force-decoding to enable LSH This PR includes various fixes to the force decoding code to make the LSH and beam search work. --- CHANGELOG.md | 1 + src/data/shortlist.cpp | 124 +++++++++++++++++++++++++-------- src/data/shortlist.h | 24 ++++--- src/layers_new/alibi.cpp | 4 +- src/microsoft/quicksand.cpp | 40 ++++++----- src/microsoft/quicksand.h | 4 +- src/tensors/cpu/topk.cpp | 53 ++++++++------ src/translator/beam_search.cpp | 37 +++++----- src/translator/sampling.h | 104 ++++++++++++++++++--------- 9 files changed, 259 insertions(+), 132 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 40ba6e0b6..caa3b8aa8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,6 +40,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
- New experimental layer framework for Transformer-like models. ### Fixed +- Fixed force-decoding with LSH - Fixed force-decoding for beam-size > 1 - Fixed lost node in mt-detect metrics - Fixed BLEURT logmask computation diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 909734ea6..ac588a279 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -8,7 +8,7 @@ namespace marian { namespace data { -// cast current void pointer to T pointer and move forward by num elements +// cast current void pointer to T pointer and move forward by num elements template const T* get(const void*& current, size_t num = 1) { const T* ptr = (const T*)current; @@ -18,19 +18,22 @@ const T* get(const void*& current, size_t num = 1) { ////////////////////////////////////////////////////////////////////////////////////// Shortlist::Shortlist(const std::vector& indices) - : indices_(indices), + : indices_(indices), initialized_(false) {} Shortlist::~Shortlist() {} -WordIndex Shortlist::reverseMap(int /*beamIdx*/, int /*batchIdx*/, int idx) const { return indices_[idx]; } +WordIndex Shortlist::reverseMap(int /*beamIdx*/, int /*batchIdx*/, int idx) const { + return indices_[idx]; +} -WordIndex Shortlist::tryForwardMap(WordIndex wIdx) const { +WordIndex Shortlist::tryForwardMap(WordIndex wIdx, int /*batchIdx*/) const { auto first = std::lower_bound(indices_.begin(), indices_.end(), wIdx); - if(first != indices_.end() && *first == wIdx) // check if element not less than wIdx has been found and if equal to wIdx + if(first != indices_.end() && *first == wIdx) { // check if element not less than wIdx has been found and if equal to wIdx return (int)std::distance(indices_.begin(), first); // return coordinate if found - else + } else { return npos; // return npos if not found, @TODO: replace with std::optional once we switch to C++17? + } } void Shortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) { @@ -46,7 +49,7 @@ void Shortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Exp Shape kShape({k}); indicesExpr_ = lambda({input, weights}, kShape, Type::uint32, forward); - createCachedTensors(weights, isLegacyUntransposedW, b, lemmaEt, k); + createCachedTensors(weights, isLegacyUntransposedW, b, lemmaEt); initialized_ = true; } @@ -59,8 +62,7 @@ Expr Shortlist::getIndicesExpr() const { void Shortlist::createCachedTensors(Expr weights, bool isLegacyUntransposedW, Expr b, - Expr lemmaEt, - int k) { + Expr lemmaEt) { ABORT_IF(isLegacyUntransposedW, "Legacy untranspose W not yet tested"); cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? 
-1 : 0, indicesExpr_); cachedShortWt_ = reshape(cachedShortWt_, {1, 1, cachedShortWt_->shape()[0], cachedShortWt_->shape()[1]}); @@ -70,6 +72,7 @@ void Shortlist::createCachedTensors(Expr weights, } if (lemmaEt) { + int k = indicesExpr_->shape()[-1]; cachedShortLemmaEt_ = index_select(lemmaEt, -1, indicesExpr_); cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {1, 1, cachedShortLemmaEt_->shape()[0], k}); } @@ -78,60 +81,121 @@ void Shortlist::createCachedTensors(Expr weights, /////////////////////////////////////////////////////////////////////////////////// LSHShortlist::LSHShortlist(int k, int nbits, size_t lemmaSize, bool abortIfDynamic) -: Shortlist(std::vector()), +: Shortlist(std::vector()), k_(k), nbits_(nbits), lemmaSize_(lemmaSize), abortIfDynamic_(abortIfDynamic) { } WordIndex LSHShortlist::reverseMap(int beamIdx, int batchIdx, int idx) const { - //int currBeamSize = indicesExpr_->shape()[0]; int currBatchSize = indicesExpr_->shape()[1]; idx = (k_ * currBatchSize * beamIdx) + (k_ * batchIdx) + idx; assert(idx < indices_.size()); - return indices_[idx]; + return indices_[idx]; } Expr LSHShortlist::getIndicesExpr() const { return indicesExpr_; } +void LSHShortlist::setForcedIndices(Expr forcedIndices) { + if(forcedIndices) { + int dimBatch = forcedIndices->shape()[-2]; + forcedIndicesExpr_ = reshape(forcedIndices, {1, dimBatch, 1}); + } else { + forcedIndicesExpr_ = nullptr; + } +} + void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) { - indicesExpr_ = callback(lsh::search(input, weights, k_, nbits_, (int)lemmaSize_, abortIfDynamic_), - [this](Expr node) { + auto topk = lsh::search(input, weights, k_, nbits_, (int)lemmaSize_, abortIfDynamic_); // [beam, batch, k] + + bool addForced = forcedIndicesExpr_ != nullptr; + if(addForced) { + topk = callback(topk, + [this](Expr node) { + int dimBeam = node->shape()[-3]; + int dimBatch = node->shape()[-2]; + for(int batchIdx = 0; batchIdx < dimBatch; batchIdx++) { + for(int beamIdx = 0; beamIdx < dimBeam; beamIdx++) { + IndexType* begin = node->val()->data() + beamIdx * dimBatch * k_ + batchIdx * k_; + IndexType* end = begin + k_; + IndexType val = forcedIndicesExpr_->val()->data()[batchIdx]; + auto pos = std::lower_bound(begin, end, val); + if(pos != end) + *pos = val; + else + *(end-1) = val; + } + } + // we will correctly overwrite the indices used for reverse mapping in the next call back + + setForcedIndices(nullptr); // mark as done for this step + }); + } + + indicesExpr_ = callback(topk, + [this](Expr node) { node->val()->get(indices_); // set the value of the field indices_ whenever the graph traverses this node }); - createCachedTensors(weights, isLegacyUntransposedW, b, lemmaEt, k_); + createCachedTensors(weights, isLegacyUntransposedW, b, lemmaEt); +} + +WordIndex LSHShortlist::tryForwardMap(WordIndex wIdx, int batchIdx) const { + if(!indicesExpr_ || indices_.empty()) + return npos; + + int dimBatch = indicesExpr_->shape()[-2]; + int beamIdx = 0; + + IndexType* begin = indicesExpr_->val()->data() + beamIdx * dimBatch * k_ + batchIdx * k_; + IndexType* end = begin + k_; + + auto pos = std::lower_bound(begin, end, wIdx); + if(pos != end) + return (int)std::distance(begin, pos); + else + return npos; +} + +Expr LSHShortlist::tryForwardMap(Expr indices) const { + auto forward = [this](Expr out, const std::vector& inputs) { + ABORT_IF(out->val()->getDeviceId().type != DeviceType::cpu, "LSHShortlist::tryForwardMap(Expr) is only implemented for CPU"); + for(int batchIdx = 0; batchIdx 
< out->shape().elements(); batchIdx++) + out->val()->data()[batchIdx] = LSHShortlist::tryForwardMap(inputs[0]->val()->data()[batchIdx], batchIdx); + }; + + return lambda({indices}, indices->shape(), Type::uint32, forward); } void LSHShortlist::createCachedTensors(Expr weights, bool isLegacyUntransposedW, Expr b, - Expr lemmaEt, - int k) { + Expr lemmaEt) { int currBeamSize = indicesExpr_->shape()[0]; int batchSize = indicesExpr_->shape()[1]; ABORT_IF(isLegacyUntransposedW, "Legacy untranspose W not yet tested"); + int kPrime = indicesExpr_->shape()[-1]; Expr indicesExprFlatten = reshape(indicesExpr_, {indicesExpr_->shape().elements()}); cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? -1 : 0, indicesExprFlatten); - cachedShortWt_ = reshape(cachedShortWt_, {currBeamSize, batchSize, k, cachedShortWt_->shape()[1]}); + cachedShortWt_ = reshape(cachedShortWt_, {currBeamSize, batchSize, kPrime, cachedShortWt_->shape()[1]}); if (b) { ABORT("Bias not supported with LSH"); cachedShortb_ = index_select(b, -1, indicesExprFlatten); - cachedShortb_ = reshape(cachedShortb_, {currBeamSize, batchSize, k, cachedShortb_->shape()[0]}); // not tested + cachedShortb_ = reshape(cachedShortb_, {currBeamSize, batchSize, kPrime, cachedShortb_->shape()[0]}); // not tested } if (lemmaEt) { int dim = lemmaEt->shape()[0]; cachedShortLemmaEt_ = index_select(lemmaEt, -1, indicesExprFlatten); - cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {dim, currBeamSize, batchSize, k}); + cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {dim, currBeamSize, batchSize, kPrime}); cachedShortLemmaEt_ = transpose(cachedShortLemmaEt_, {1, 2, 0, 3}); } } -LSHShortlistGenerator::LSHShortlistGenerator(int k, int nbits, size_t lemmaSize, bool abortIfDynamic) +LSHShortlistGenerator::LSHShortlistGenerator(int k, int nbits, size_t lemmaSize, bool abortIfDynamic) : k_(k), nbits_(nbits), lemmaSize_(lemmaSize), abortIfDynamic_(abortIfDynamic) { } @@ -165,7 +229,7 @@ QuicksandShortlistGenerator::QuicksandShortlistGenerator(Ptr options, mmap_ = mio::mmap_source(fname); // memory-map the binary file once const void* current = mmap_.data(); // pointer iterator over binary file - + // compare magic number in binary file to make sure we are reading the right thing const int32_t MAGIC_NUMBER = 1234567890; int32_t header_magic_number = *get(current); @@ -173,7 +237,7 @@ QuicksandShortlistGenerator::QuicksandShortlistGenerator(Ptr options, auto config = marian::quicksand::ParameterTree::FromBinaryReader(current); use16bit_ = config->GetBoolReq("use_16_bit"); - + LOG(info, "[data] Mapping Quicksand shortlist from {}", fname); idSize_ = sizeof(int32_t); @@ -189,12 +253,12 @@ QuicksandShortlistGenerator::QuicksandShortlistGenerator(Ptr options, sourceOffsets_ = get(current, numSourceIds_); numShortlistIds_ = *get(current); sourceToShortlistIds_ = get(current, idSize_ * numShortlistIds_); - + // display parameters - LOG(info, + LOG(info, "[data] Quicksand shortlist has {} source ids, {} default ids and {} shortlist ids", - numSourceIds_, - numDefaultIds_, + numSourceIds_, + numDefaultIds_, numShortlistIds_); } @@ -225,12 +289,12 @@ Ptr QuicksandShortlistGenerator::generate(Ptr batc curShortlistIt->first = curShortlistIds; curShortlistIt->second = length; curShortlistIt++; - + if (length > maxLength) maxLength = length; } } - + // collect the actual shortlist mappings for (int32_t i = 0; i < maxLength && indexSet.size() < maxShortlistSize; i++) { for (int32_t j = 0; j < curShortlists.size() && indexSet.size() < maxShortlistSize; j++) { @@ 
-273,7 +337,7 @@ Ptr createShortlistGenerator(Ptr options, size_t lemmaSize = trgVocab->lemmaSize(); return New(lshOpts[0], lshOpts[1], lemmaSize, /*abortIfDynamic=*/false); } - else { + else { std::vector vals = options->get>("shortlist"); ABORT_IF(vals.empty(), "No path to shortlist given"); std::string fname = vals[0]; diff --git a/src/data/shortlist.h b/src/data/shortlist.h index bf185d570..484a3403e 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -26,26 +26,26 @@ class Shortlist { protected: std::vector indices_; // // [packed shortlist index] -> word index, used to select columns from output embeddings Expr indicesExpr_; // cache an expression that contains the short list indices + Expr forcedIndicesExpr_; Expr cachedShortWt_; // short-listed version, cached (cleared by clear()) Expr cachedShortb_; // these match the current value of shortlist_ Expr cachedShortLemmaEt_; bool initialized_; // used by batch-level shortlist. Only initialize with 1st call then skip all subsequent calls for same batch - + void createCachedTensors(Expr weights, bool isLegacyUntransposedW, Expr b, - Expr lemmaEt, - int k); + Expr lemmaEt); public: static constexpr WordIndex npos{std::numeric_limits::max()}; // used to identify invalid shortlist entries similar to std::string::npos Shortlist(const std::vector& indices); virtual ~Shortlist(); - + virtual bool isDynamic() const { return false; } virtual WordIndex reverseMap(int beamIdx, int batchIdx, int idx) const; - virtual WordIndex tryForwardMap(WordIndex wIdx) const; + virtual WordIndex tryForwardMap(WordIndex wIdx, int batchIdx=0) const; virtual void filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt); virtual Expr getIndicesExpr() const; @@ -72,7 +72,7 @@ class ShortlistGenerator { // https://arxiv.org/pdf/1903.03129.pdf https://arxiv.org/pdf/1806.00588.pdf class LSHShortlist: public Shortlist { private: - int k_; // number of candidates returned from each input + int k_; // number of candidates returned from each input int nbits_; // length of hash size_t lemmaSize_; // vocab size bool abortIfDynamic_; // if true disallow dynamic allocation for encoded weights and rotation matrix (only allow use of pre-allocated parameters) @@ -83,8 +83,8 @@ class LSHShortlist: public Shortlist { void createCachedTensors(Expr weights, bool isLegacyUntransposedW, Expr b, - Expr lemmaEt, - int k); + Expr lemmaEt); + virtual WordIndex tryForwardMap(WordIndex wIdx, int batchIdx=0) const override; public: LSHShortlist(int k, int nbits, size_t lemmaSize, bool abortIfDynamic = false); @@ -94,7 +94,9 @@ class LSHShortlist: public Shortlist { virtual void filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) override; virtual Expr getIndicesExpr() const override; + virtual void setForcedIndices(Expr forcedIndices); + virtual Expr tryForwardMap(Expr indices) const; }; class LSHShortlistGenerator : public ShortlistGenerator { @@ -349,7 +351,7 @@ class FakeShortlistGenerator : public ShortlistGenerator { }; /* -Legacy binary shortlist for Microsoft-internal use. +Legacy binary shortlist for Microsoft-internal use. 
*/ class QuicksandShortlistGenerator : public ShortlistGenerator { private: @@ -371,7 +373,7 @@ class QuicksandShortlistGenerator : public ShortlistGenerator { const int32_t* sourceOffsets_{nullptr}; int32_t numShortlistIds_{0}; const uint8_t* sourceToShortlistIds_{nullptr}; - + public: QuicksandShortlistGenerator(Ptr options, Ptr srcVocab, @@ -384,7 +386,7 @@ class QuicksandShortlistGenerator : public ShortlistGenerator { }; /* -Shortlist factory to create correct type of shortlist. Currently assumes everything is a text shortlist +Shortlist factory to create correct type of shortlist. Currently assumes everything is a text shortlist unless the extension is *.bin for which the Microsoft legacy binary shortlist is used. */ Ptr createShortlistGenerator(Ptr options, diff --git a/src/layers_new/alibi.cpp b/src/layers_new/alibi.cpp index 44f0eb60b..1eb3c861c 100644 --- a/src/layers_new/alibi.cpp +++ b/src/layers_new/alibi.cpp @@ -54,8 +54,10 @@ Ptr AlibiDecoderState::select( Expr AlibiDecoderState::getAlibiShift(Ptr graph, bool decoding) const { if(decoding) { std::vector shift; - for(const auto& [trgPos, srcPos, batchIdx] : syncPoints_) + for(const auto& [trgPos, srcPos, batchIdx] : syncPoints_) { + (void)batchIdx; // unused shift.push_back((float)(srcPos - trgPos)); + } if(!shift.empty()) { int dimBeam = lastBeam_; diff --git a/src/microsoft/quicksand.cpp b/src/microsoft/quicksand.cpp index 513639dd6..75f224d9a 100644 --- a/src/microsoft/quicksand.cpp +++ b/src/microsoft/quicksand.cpp @@ -48,6 +48,9 @@ class VocabWrapper : public IVocabWrapper { VocabWrapper(Ptr vocab) : pImpl_(vocab) {} virtual ~VocabWrapper() {} WordIndex encode(const std::string& word) const override { return (*pImpl_)[word].toWordIndex(); } + WordIndex getEosId() const override { return pImpl_->getEosId().toWordIndex(); }; + WordIndex getUnkId() const override { return pImpl_->getUnkId().toWordIndex(); }; + std::string decode(WordIndex id) const override { return (*pImpl_)[Word::fromWordIndex(id)]; } size_t size() const override { return pImpl_->size(); } void transcodeToShortlistInPlace(WordIndex* ptr, size_t num) const override { pImpl_->transcodeToShortlistInPlace(ptr, num); } @@ -145,10 +148,9 @@ class BeamSearchDecoder : public IBeamSearchDecoder { void setWorkspace(uint8_t* data, size_t size) override { device_->set(data, size); } - QSNBestBatch decode(const QSBatch& qsBatch, + QSNBestBatch decode(const std::vector& qsBatches, size_t maxLength, const std::unordered_set& shortlist) override { - std::vector lshOpts = options_->get>("output-approx-knn", {}); ABORT_IF(lshOpts.size() != 0 && lshOpts.size() != 2, "--output-approx-knn takes 2 parameters"); ABORT_IF(lshOpts.size() == 2 && shortlist.size() > 0, "LSH and shortlist cannot be used at the same time"); @@ -167,24 +169,30 @@ class BeamSearchDecoder : public IBeamSearchDecoder { scorer->setShortlistGenerator(shortListGen); } - // form source batch, by interleaving the words over sentences in the batch, and setting the mask - size_t batchSize = qsBatch.size(); - auto subBatch = New(batchSize, maxLength, vocabs_[0]); - for(size_t i = 0; i < maxLength; ++i) { - for(size_t j = 0; j < batchSize; ++j) { - const auto& sent = qsBatch[j]; - if(i < sent.size()) { - size_t idx = i * batchSize + j; - subBatch->data()[idx] = marian::Word::fromWordIndex(sent[i]); - subBatch->mask()[idx] = 1; + ABORT_IF(qsBatches.empty(), "No input batch provided"); + + auto createSubBatch = [maxLength](const QSBatch& qsBatch, Ptr vocab) { + size_t batchSize = qsBatch.size(); + auto subBatch = 
New(batchSize, qsBatch.front().size(), vocab); + for(size_t i = 0; i < maxLength; ++i) { + for(size_t j = 0; j < batchSize; ++j) { + const auto& sent = qsBatch[j]; + if(i < sent.size()) { + size_t idx = i * batchSize + j; + subBatch->data()[idx] = marian::Word::fromWordIndex(sent[i]); + subBatch->mask()[idx] = 1; + } } } - } - auto tgtSubBatch = New(batchSize, 0, vocabs_[1]); // only holds a vocab, but data is dummy - std::vector> subBatches{ subBatch, tgtSubBatch }; - std::vector sentIds(batchSize, 0); + return subBatch; + }; + + auto srcSubBatch = createSubBatch(qsBatches[0], vocabs_[0]); + auto tgtSubBatch = createSubBatch(qsBatches[1], vocabs_[1]); + std::vector> subBatches{ srcSubBatch, tgtSubBatch }; auto batch = New(subBatches); + std::vector sentIds(batch->size(), 0); batch->setSentenceIds(sentIds); // decode diff --git a/src/microsoft/quicksand.h b/src/microsoft/quicksand.h index 3ed866e83..cf452fb81 100644 --- a/src/microsoft/quicksand.h +++ b/src/microsoft/quicksand.h @@ -44,6 +44,8 @@ class IVocabWrapper { public: virtual WordIndex encode(const std::string& word) const = 0; virtual std::string decode(WordIndex id) const = 0; + virtual WordIndex getEosId() const = 0; + virtual WordIndex getUnkId() const = 0; virtual size_t size() const = 0; virtual void transcodeToShortlistInPlace(WordIndex* ptr, size_t num) const = 0; }; @@ -59,7 +61,7 @@ class IBeamSearchDecoder { virtual ~IBeamSearchDecoder() {} - virtual QSNBestBatch decode(const QSBatch& qsBatch, + virtual QSNBestBatch decode(const std::vector& qsBatches, size_t maxLength, const std::unordered_set& shortlist) = 0; diff --git a/src/tensors/cpu/topk.cpp b/src/tensors/cpu/topk.cpp index 73f0ce273..5cb1119bb 100644 --- a/src/tensors/cpu/topk.cpp +++ b/src/tensors/cpu/topk.cpp @@ -4,15 +4,15 @@ // CPU implementation of proper Marian top-k operator for TopkNodeOp // This file contains a lot of code-duplicaton with src/translator/nth_element.cpp -// the goal is to replace the beam-search specific topk search with this code. -// Currently this is only used in the unit tests, but we will move forward and +// the goal is to replace the beam-search specific topk search with this code. +// Currently this is only used in the unit tests, but we will move forward and // make the beam-search more graph and operator-based. namespace marian { -namespace cpu { +namespace cpu { void TopK(Tensor outVal, Tensor outInd, Ptr /*allocator*/, const Tensor in, int k, int axis, bool descending) { - + ABORT_IF(axis != in->shape().size() - 1, "Currently only works for last axis"); ABORT_IF(in->type() != Type::float32, "Input should have type {}", Type::float32); ABORT_IF(outInd->type() != Type::uint32, "Output should be have type {}", Type::uint32); @@ -29,61 +29,70 @@ void TopK(Tensor outVal, Tensor outInd, Ptr /*allocator*/, const Tens IndexType* outIndPtr = outInd->data(); float* outValPtr = outVal->data(); for(int i = 0; i < rows; ++i) { - std::partial_sort( + std::partial_sort( // sorts the top N (beam size) idxs by score to the front idxs.begin(), idxs.begin() + k, idxs.end(), - [&](int a, int b) { - return descending ? inDataPtr[a] > inDataPtr[b] : inDataPtr[a] < inDataPtr[b]; + [&](int a, int b) { + return descending ? 
inDataPtr[a] > inDataPtr[b] : inDataPtr[a] < inDataPtr[b]; } ); - + for(int j = 0; j < k; j++) { outIndPtr[j] = idxs[j]; outValPtr[j] = inDataPtr[idxs[j]]; } - + outIndPtr += k; outValPtr += k; inDataPtr += cols; } } -// CPU implementation of Marian sort operator for SortNodeOp -void Sort(Tensor outVal, Tensor outInd, Ptr /*allocator*/, const Tensor in, int axis, bool descending) { - ABORT_IF(axis != in->shape().size() - 1, "Currently only works for last axis"); - ABORT_IF(in->type() != Type::float32, "Input should have type {}", Type::float32); - ABORT_IF(outInd->type() != Type::uint32, "Output should be have type {}", Type::uint32); - +template +void SortTyped(Tensor outVal, Tensor outInd, Ptr /*allocator*/, const Tensor in, int axis, bool descending) { int cols = in->shape()[axis]; int rows = in->shape().elements() / cols; std::vector idxs(cols); std::iota(idxs.begin(), idxs.end(), 0); - const float* inDataPtr = in->data(); + const T* inDataPtr = in->data(); IndexType* outIndPtr = outInd->data(); - float* outValPtr = outVal->data(); + T* outValPtr = outVal->data(); for(int i = 0; i < rows; ++i) { - std::sort( + std::sort( idxs.begin(), idxs.end(), - [&](int a, int b) { - return descending ? inDataPtr[a] > inDataPtr[b] : inDataPtr[a] < inDataPtr[b]; + [&](int a, int b) { + return descending ? inDataPtr[a] > inDataPtr[b] : inDataPtr[a] < inDataPtr[b]; } ); - + for(int j = 0; j < cols; j++) { outIndPtr[j] = idxs[j]; outValPtr[j] = inDataPtr[idxs[j]]; } - + outIndPtr += cols; outValPtr += cols; inDataPtr += cols; } } +// CPU implementation of Marian sort operator for SortNodeOp +void Sort(Tensor outVal, Tensor outInd, Ptr /*allocator*/, const Tensor in, int axis, bool descending) { + ABORT_IF(axis != in->shape().size() - 1, "Currently only works for last axis"); + ABORT_IF(outInd->type() != Type::uint32, "Output indices should be have type {}", Type::uint32); + + if(in->type() == Type::float32) + SortTyped(outVal, outInd, nullptr, in, axis, descending); + else if(in->type() == Type::uint32) + SortTyped(outVal, outInd, nullptr, in, axis, descending); + else + ABORT("Unsupported type {}", in->type()); +} + } } diff --git a/src/translator/beam_search.cpp b/src/translator/beam_search.cpp index 63aa0ec8f..8b1312c2f 100644 --- a/src/translator/beam_search.cpp +++ b/src/translator/beam_search.cpp @@ -45,20 +45,20 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current // They can be between 0 and (vocabSize * nBestBeamSize * batchSize)-1. // (beamHypIdx refers to the GPU tensors, *not* the beams[] array; they are not the same in case of purging) const auto key = nBestKeys[i]; - + // decompose key into individual indices (batchIdx, beamHypIdx, wordIdx) const auto beamHypIdx = (key / vocabSize) % nBestBeamSize; const auto currentBatchIdx = (key / vocabSize) / nBestBeamSize; const auto origBatchIdx = reverseBatchIdxMap.empty() ? currentBatchIdx : reverseBatchIdxMap[currentBatchIdx]; // map currentBatchIdx back into original position within starting maximal batch size, required to find correct beam bool dropHyp = !dropBatchEntries.empty() && dropBatchEntries[origBatchIdx] && factorGroup == 0; - + WordIndex wordIdx; if(dropHyp) { // if we force=drop the hypothesis, assign EOS, otherwise the expected word id. 
if(factoredVocab) { // when using factoredVocab, extract the EOS lemma index from the word id, we predicting factors one by one here, hence lemma only std::vector eosFactors; factoredVocab->word2factors(factoredVocab->getEosId(), eosFactors); wordIdx = (WordIndex)eosFactors[0]; - } else { // without factoredVocab lemma index and word index are the same. Safe cruising. + } else { // without factoredVocab lemma index and word index are the same. Safe cruising. wordIdx = trgVocab_->getEosId().toWordIndex(); } } else { // we are not dropping anything, just assign the normal index @@ -66,9 +66,9 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current } // @TODO: We currently assign a log probability of 0 to all beam entries of the dropped batch entry, instead it might be a good idea to use - // the per Hyp pathScore without the current expansion (a bit hard to obtain). - // For the case where we drop empty inputs, 0 is fine. For other use cases like a forced stop, the penultimate pathScore might be better. - // For the empty hyp this would naturally result in 0, too. + // the per Hyp pathScore without the current expansion (a bit hard to obtain). + // For the case where we drop empty inputs, 0 is fine. For other use cases like a forced stop, the penultimate pathScore might be better. + // For the empty hyp this would naturally result in 0, too. const float pathScore = dropHyp ? 0.f : nBestPathScores[i]; // 0 (Prob = 1, maximum score) if dropped or expanded path score for (batchIdx, beamHypIdx, word) const auto& beam = beams[origBatchIdx]; @@ -78,7 +78,7 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current continue; if(pathScore == INVALID_PATH_SCORE) // (dummy slot or word that cannot be expanded by current factor) continue; - + ABORT_IF(beamHypIdx >= beam.size(), "Out of bounds beamHypIdx??"); // effectively this is equivalent to ABORT_IF(beams[origBatchIdx].empty(), ...) // map wordIdx to word @@ -298,23 +298,23 @@ Histories BeamSearch::search(Ptr graph, Ptr Expr suppressedWordIndices; bool suppressUnk = !options_->get("allow-unk", false); bool suppressSpecial = !options_->get("allow-special", false); + + auto shortlist = scorers_[0]->getShortlist(); // first shortlist is generally ok, @TODO: make sure they are the same across scorers? if (suppressUnk || suppressSpecial) { // do we need to suppress unk or special? std::vector suppressed = trgVocab_->suppressedIndices(suppressUnk, suppressSpecial); - - auto shortlist = scorers_[0]->getShortlist(); // first shortlist is generally ok, @TODO: make sure they are the same across scorers? 
if(shortlist) // check if suppressed words are allowed by the shortlist, if not, remove - suppressed.erase(std::remove_if(suppressed.begin(), - suppressed.end(), - [&](WordIndex i) { + suppressed.erase(std::remove_if(suppressed.begin(), + suppressed.end(), + [&](WordIndex i) { return shortlist->tryForwardMap(i) == data::Shortlist::npos; }), suppressed.end()); - + if(!suppressed.empty()) suppressedWordIndices = graph->indices(suppressed); } - auto distMod = New(options_, batch, INVALID_PATH_SCORE); + auto distMod = New(graph, options_, batch, INVALID_PATH_SCORE, shortlist); // the decoding process updates the following state information in each output time step: // - beams: array [origDimBatch] of array [maxBeamSize] of Hypothesis @@ -432,8 +432,7 @@ Histories BeamSearch::search(Ptr graph, Ptr if (numFactorGroups == 1) { // @TODO: this branch can go away logProbs = states[i]->getLogProbs().getLogits(); // [maxBeamSize, 1, currentDimBatch, dimVocab] } else { - auto shortlist = scorers_[i]->getShortlist(); - logProbs = states[i]->getLogProbs().getFactoredLogits(factorGroup, shortlist); // [maxBeamSize, 1, currentDimBatch, dimVocab] + logProbs = states[i]->getLogProbs().getFactoredLogits(factorGroup, scorers_[i]->getShortlist()); // [maxBeamSize, 1, currentDimBatch, dimVocab] } } else { @@ -456,10 +455,10 @@ Histories BeamSearch::search(Ptr graph, Ptr } } - // we cast (ensembled) scores to float32, as accumulated them into path scores; + // we cast (ensembled) scores to float32, as accumulated them into path scores; // also beneficial for sampling etc. // @TODO:: consider doing this before ensembling - stepScores = cast(stepScores, Type::float32); + stepScores = cast(stepScores, Type::float32); if(factorGroup == 0) { stepScores = distMod->force(stepScores, (int)t, (int)maxBeamSize, batchIndices); @@ -482,7 +481,7 @@ Histories BeamSearch::search(Ptr graph, Ptr // suppress specific symbols if not at right positions // @TODO: move this to DistributionModifier if(suppressedWordIndices && factorGroup == 0) - suppressWords(expandedPathScores, suppressedWordIndices); + suppressWords(expandedPathScores, suppressedWordIndices); // @TODO: this is probably not working correctly for LSH short list //********************************************************************** // perform beam search diff --git a/src/translator/sampling.h b/src/translator/sampling.h index 184202229..043f79bd6 100644 --- a/src/translator/sampling.h +++ b/src/translator/sampling.h @@ -10,7 +10,7 @@ namespace sampling { // Prune logits via top-k pruning Expr topkPruning(Expr scores, int k, bool normalize = false) { - Expr val, idx; + Expr val, idx; // note, for around k>200 topk is slower on the GPU than sorting and then selecting the top-k values std::tie(val, idx) = topk(scores, k, /*axis=*/-1, /*descending=*/true); @@ -24,14 +24,14 @@ Expr topkPruning(Expr scores, int k, bool normalize = false) { // Prune logits via nucleus pruning Expr nucleusPruning(Expr scores, float threshold, bool normalize = false) { - // normalization would make sense here since we compare against a meaningful threshold and + // normalization would make sense here since we compare against a meaningful threshold and // we don't know what other manipulations have been done to the logits before, but // leaving it to the user for now. 
We do set it to true in beam_search.cpp if(normalize) scores = logsoftmax(scores); // renormalize via logsoftmax // sort scores in descending order, this way we can use the cumulative sum to find the nucleus - Expr val, idx; + Expr val, idx; std::tie(val, idx) = sort(scores, /*axis=*/-1, /*descending=*/true); // logcumsumexp because we have logprobs, exclusive because we keep at least the first element @@ -51,11 +51,11 @@ Expr nucleusPruning(Expr scores, float threshold, bool normalize = false) { // Prune logits via epsilon pruning Expr epsilonPruning(Expr scores, float epsilon, bool normalize = false) { - // normalization would make sense here since we compare against a meaningful threshold and + // normalization would make sense here since we compare against a meaningful threshold and // we don't know what other manipulations have been done to the logits before if(normalize) scores = logsoftmax(scores); // renormalize via logsoftmax - + // make sure the epsilon is not larger than the largest value in the scores // otherwise we will mask out all values // equivalent to union of top-1 and log(epsilon) @@ -81,7 +81,10 @@ Expr gumbelMaxTrick(Expr scores, float temperature) { class DistModifier { private: + Ptr graph_; Ptr options_; + Ptr shortlist_; + bool forceDecode_{false}; bool sampling_{false}; @@ -91,12 +94,38 @@ class DistModifier { float invalidPathScore_; Expr forceBatch_; - + + void lazyCreateForceBatch() { + if(!forceBatch_) { + // turn the batch into a cached tensor that lives in the computation graph + std::vector forceWords; + for(auto& word : batch_->back()->data()) + forceWords.push_back(word.toWordIndex()); + int dimTime = (int)batch_->back()->batchWidth(); + int dimBatch = (int)batch_->back()->batchSize(); + forceBatch_ = graph_->constant({1, dimTime, dimBatch, 1}, inits::fromVector(forceWords), Type::uint32); // [1, dimTime, dimBatch, 1] + } + } + public: - DistModifier(Ptr options, Ptr batch, float invalidPathScore) : - options_(options), forceDecode_(options_->get("force-decode", false)), - batch_(batch), invalidPathScore_(invalidPathScore) { - + DistModifier(Ptr graph, Ptr options, Ptr batch, float invalidPathScore, Ptr shortlist = nullptr) : + graph_(graph), + options_(options), + shortlist_(shortlist), + forceDecode_(options_->get("force-decode", false)), + batch_(batch), + invalidPathScore_(invalidPathScore) { + + // if we are force-decoding with a short list we need to set the forced token ids early + if(shortlist_ && forceDecode_) { + lazyCreateForceBatch(); + auto lsh = std::dynamic_pointer_cast(shortlist_); + ABORT_IF(!lsh, "Force-decoding not supported with shortlists other than LSH"); + ABORT_IF(!forceBatch_, "forceBatch_ is undefined??"); + Expr forceIndices = slice(forceBatch_, /*axis=*/-3, 0); // [1, 1, dimBatch, 1] + lsh->setForcedIndices(forceIndices); + } + if(options_->hasAndNotEmpty("output-sampling")) { sampling_ = true; auto samplingOpts = options_->get>("output-sampling", {}); @@ -108,8 +137,8 @@ class DistModifier { } else if(samplingMethod == "1") { // for backcompat with boolean values sampling_ = true; samplingMethod = "full"; - } - + } + if(samplingMethod == "full") { float temperature = 1.f; if(samplingOpts.size() > 1) @@ -171,28 +200,23 @@ class DistModifier { Expr force(Expr scores, int pos, int beamSize, std::vector& batchIndices) { // we check the last field of the batch for force-decoding content + int dimTime = (int)batch_->back()->batchWidth(); - if(!forceDecode_ || pos >= dimTime) // nothing to force-decode, just return original scores 
+ if(!forceDecode_ || pos >= dimTime) { // nothing to force-decode, just return original scores return scores; + } LOG_ONCE(info, "Force-decoding with given prefixes"); - // if we get here, then we have to do force-decoding. We do this by "softly" modifying the scores and passing the + // if we get here, then we have to do force-decoding. We do this by "softly" modifying the scores and passing the // result to the normal top-k/beam search. "Softly" here means we add masking terms rather than making hard selections // which preserves the original tensor layout. - // This allows for beam-search and batched force-decoding with different length prefixes in a batch + // This allows for beam-search and batched force-decoding with different length prefixes in a batch // (way harder to do with actual index manipulation). We then return modified (masked) probabilities to the beam-search // which then continues as normal on the modified distribution. - if(!forceBatch_) { - // turn the batch into a cached tensor that lives in the computation graph - std::vector forceWords; - for(auto& word : batch_->back()->data()) - forceWords.push_back(word.toWordIndex()); - - int dimBatch = (int)batch_->back()->batchSize(); - forceBatch_ = scores->graph()->constant({1, dimTime, dimBatch, 1}, inits::fromVector(forceWords), Type::uint32); // [1, dimTime, dimBatch, 1] - } + lazyCreateForceBatch(); + ABORT_IF(!forceBatch_, "forceBatch_ is undefined??"); // if we remove batch entries during decoding (finished decoding) then adjust here if(forceBatch_->shape()[-2] != batchIndices.size()) forceBatch_ = index_select(forceBatch_, -2, batchIndices); @@ -200,6 +224,14 @@ class DistModifier { // get vocab index and probability for force-decoded tokens for the current time step Expr forceIndices = slice(forceBatch_, /*axis=*/-3, pos); // [1, 1, dimBatch, 1] + if(shortlist_) { + auto lsh = std::dynamic_pointer_cast(shortlist_); + ABORT_IF(!lsh, "Force-decoding not supported with shortlists other than LSH"); + // only get location for first beam slot, the other slots don't matter since we overwrite them later. + lsh->setForcedIndices(forceIndices); + forceIndices = lsh->tryForwardMap(forceIndices); // [1, 1, dimBatch, 1] + } + // select scores from first beam entry for force-decoding Expr b1stScores = slice(scores, /*axis=*/-4, 0); // [1, 1, dimBatch, dimVocab] Expr forceVals = gather(b1stScores, /*axis=*/-1, forceIndices); // [1, 1, dimBatch, 1] @@ -207,18 +239,26 @@ class DistModifier { // create dummy indices and values for beam entries other than the force-decoded value. This is required to ensure that the beam // does not collapse for hyps outside the forced hyps and can still do full beam-search once we finish force-decoding for a batch // entry. We initialize randomly (they are not going to be used anyway due to very low prob) and shift by 1 to have 0 at first postion. - int dimVocab = scores->shape()[-1]; + int dimVocab = scores->shape()[-1]; auto graph = scores->graph(); - // we start at 256 to skip over suppressed special words in SentencePiece @TODO: this should be somehow inferred. - Expr dummyIndices = shift(graph->constant({1, 1, 1, beamSize}, inits::uniform(256.f, (float)dimVocab)), {0, 0, 0, 1}, 0.f); - // we use a range of invalidPathScore_ to invalidPathScore_ / 2 to make sure that the probabilities stay low, but larger than invalidPathScore_ itself. 
- Expr dummyVals = shift(graph->constant({1, 1, 1, beamSize}, inits::uniform(invalidPathScore_, invalidPathScore_ / 2.f)), {0, 0, 0, 1}, 0.f); + + std::vector dummyIndicesVec(beamSize, 0); + std::vector dummyValsVec(beamSize, 0.f); + for(int i = 1; i < beamSize; ++i) { + // we use dimVocab - i - 1 to make sure that the dummy indices are different from the force-decoded index (last vocab indices) + dummyIndicesVec[i] = dimVocab - i - 1; + // we use invalidPathScore_ / (2.f + i) to make sure that the dummy values are very low and decrease with beam position + dummyValsVec[i] = invalidPathScore_ / (2.f + i); + } + + Expr dummyIndices = graph->constant({1, 1, 1, beamSize}, inits::fromVector(dummyIndicesVec), Type::uint32); + Expr dummyVals = graph->constant({1, 1, 1, beamSize}, inits::fromVector(dummyValsVec)); // here we add the force-decoded entries back into the zeroed positions - dummyIndices = cast(cast(dummyIndices, Type::float32) + cast(forceIndices, Type::float32), Type::uint32); // [1, 1, dimBatch, dimBeam] - dummyVals = dummyVals + forceVals; // [1, 1, dimBatch, dimBeam] + dummyIndices = cast(maximum(cast(dummyIndices, Type::float32), cast(forceIndices, Type::float32)), Type::uint32); // [1, 1, dimBatch, dimBeam] + dummyVals = dummyVals + forceVals; // [1, 1, dimBatch, dimBeam] - // create a tensor of the same size as the original logits from the first beam entry, initialize with invalidPathScore and then scatter + // create a tensor of the same size as the original logits from the first beam entry, initialize with invalidPathScore and then scatter // the force-decoded and dummy values into the correct positions. Expr forcedScores = constant_like(b1stScores, inits::fromValue(invalidPathScore_)); // [1, 1, dimBatch, dimVocab] forcedScores = scatter(forcedScores, -1, dummyIndices, dummyVals); // [1, 1, dimBatch, dimVocab] From b4ed6304e86bf4f10f9d121ae2df008506ff73b2 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Wed, 17 Apr 2024 13:23:37 +0000 Subject: [PATCH 23/26] Merged PR 34062: Add exception if force-decoding is used for FSM vocab Abort or throw an exception if we try force-decoding with a factored Vocab. --- CHANGELOG.md | 1 + VERSION | 2 +- src/data/shortlist.cpp | 44 ++++++++++++++++--------------------- src/microsoft/quicksand.cpp | 10 ++++++--- src/translator/sampling.h | 11 ++++++++-- 5 files changed, 37 insertions(+), 31 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index caa3b8aa8..0b52200ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,6 +40,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - New experimental layer framework for Transformer-like models. 
### Fixed +- Throw exception when forcing with FS vocabs - Fixed force-decoding with LSH - Fixed force-decoding for beam-size > 1 - Fixed lost node in mt-detect metrics diff --git a/VERSION b/VERSION index 8d44afc76..7542664ec 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.27 +v1.12.28 diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index ac588a279..83ae435f4 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -107,33 +107,27 @@ void LSHShortlist::setForcedIndices(Expr forcedIndices) { void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) { auto topk = lsh::search(input, weights, k_, nbits_, (int)lemmaSize_, abortIfDynamic_); // [beam, batch, k] - - bool addForced = forcedIndicesExpr_ != nullptr; - if(addForced) { - topk = callback(topk, - [this](Expr node) { - int dimBeam = node->shape()[-3]; - int dimBatch = node->shape()[-2]; - for(int batchIdx = 0; batchIdx < dimBatch; batchIdx++) { - for(int beamIdx = 0; beamIdx < dimBeam; beamIdx++) { - IndexType* begin = node->val()->data() + beamIdx * dimBatch * k_ + batchIdx * k_; - IndexType* end = begin + k_; - IndexType val = forcedIndicesExpr_->val()->data()[batchIdx]; - auto pos = std::lower_bound(begin, end, val); - if(pos != end) - *pos = val; - else - *(end-1) = val; - } - } - // we will correctly overwrite the indices used for reverse mapping in the next call back - - setForcedIndices(nullptr); // mark as done for this step - }); - } - indicesExpr_ = callback(topk, [this](Expr node) { + if(forcedIndicesExpr_) { + // if a forced index is set, we need to overwrite the relevant topk index with the forced index + int dimBeam = node->shape()[-3]; + int dimBatch = node->shape()[-2]; + for(int batchIdx = 0; batchIdx < dimBatch; batchIdx++) { + for(int beamIdx = 0; beamIdx < dimBeam; beamIdx++) { + IndexType* begin = node->val()->data() + beamIdx * dimBatch * k_ + batchIdx * k_; + IndexType* end = begin + k_; + IndexType val = forcedIndicesExpr_->val()->data()[batchIdx]; + auto pos = std::lower_bound(begin, end, val); + if(pos != end) + *pos = val; + else + *(end-1) = val; + } + } + // we will correctly overwrite the indices used for reverse mapping in the next call back + setForcedIndices(nullptr); + } node->val()->get(indices_); // set the value of the field indices_ whenever the graph traverses this node }); diff --git a/src/microsoft/quicksand.cpp b/src/microsoft/quicksand.cpp index 75f224d9a..3dce71d57 100644 --- a/src/microsoft/quicksand.cpp +++ b/src/microsoft/quicksand.cpp @@ -169,7 +169,8 @@ class BeamSearchDecoder : public IBeamSearchDecoder { scorer->setShortlistGenerator(shortListGen); } - ABORT_IF(qsBatches.empty(), "No input batch provided"); + ABORT_IF(qsBatches.empty(), "No input batch provided"); + ABORT_IF(qsBatches.size() > 2, "More than two sub-batches provided"); auto createSubBatch = [maxLength](const QSBatch& qsBatch, Ptr vocab) { size_t batchSize = qsBatch.size(); @@ -188,9 +189,12 @@ class BeamSearchDecoder : public IBeamSearchDecoder { }; auto srcSubBatch = createSubBatch(qsBatches[0], vocabs_[0]); - auto tgtSubBatch = createSubBatch(qsBatches[1], vocabs_[1]); + std::vector> subBatches{ srcSubBatch }; + if(qsBatches.size() == 2) { + auto tgtSubBatch = createSubBatch(qsBatches[1], vocabs_[1]); + subBatches.push_back(tgtSubBatch); + } - std::vector> subBatches{ srcSubBatch, tgtSubBatch }; auto batch = New(subBatches); std::vector sentIds(batch->size(), 0); batch->setSentenceIds(sentIds); diff --git a/src/translator/sampling.h 
b/src/translator/sampling.h index 043f79bd6..764ef1799 100644 --- a/src/translator/sampling.h +++ b/src/translator/sampling.h @@ -116,6 +116,8 @@ class DistModifier { batch_(batch), invalidPathScore_(invalidPathScore) { + forceDecode_ = forceDecode_ && batch->sets() > 1; // force-decoding if we have multiple sets in the batch + // if we are force-decoding with a short list we need to set the forced token ids early if(shortlist_ && forceDecode_) { lazyCreateForceBatch(); @@ -217,13 +219,18 @@ class DistModifier { lazyCreateForceBatch(); ABORT_IF(!forceBatch_, "forceBatch_ is undefined??"); + + auto factoredVocab = batch_->front()->vocab()->tryAs(); + ABORT_IF(factoredVocab, "Factored vocabularies are not supported for force-decoding"); + // if we remove batch entries during decoding (finished decoding) then adjust here if(forceBatch_->shape()[-2] != batchIndices.size()) forceBatch_ = index_select(forceBatch_, -2, batchIndices); // get vocab index and probability for force-decoded tokens for the current time step - Expr forceIndices = slice(forceBatch_, /*axis=*/-3, pos); // [1, 1, dimBatch, 1] + Expr posIndices = slice(forceBatch_, /*axis=*/-3, pos); // [1, 1, dimBatch, 1] + Expr forceIndices = posIndices; if(shortlist_) { auto lsh = std::dynamic_pointer_cast(shortlist_); ABORT_IF(!lsh, "Force-decoding not supported with shortlists other than LSH"); @@ -267,7 +274,7 @@ class DistModifier { // via interpolating by a selector. In marian eosId is used for padding, so this works everywhere and eos for unfinished hyps means // free decoding or sampling. WordIndex eosId = batch_->back()->vocab()->getEosId().toWordIndex(); - auto interpol = eq(cast(forceIndices, scores->value_type()), (float)eosId); + auto interpol = eq(cast(posIndices, scores->value_type()), (float)eosId); return interpol * scores + (1.f - interpol) * forcedScores; } From 2745b773bc8b8402ad536737b33d4948bbac8542 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sat, 20 Apr 2024 15:24:52 +0000 Subject: [PATCH 24/26] Merged PR 34029: Fix regressions in new layer framework for ALIBI-based decoding * Fixes regressions in new layer framework for ALIBI-based decoding --- CHANGELOG.md | 1 + VERSION | 2 +- src/layers_new/alibi.h | 28 +++++++++------ src/layers_new/attention.h | 12 +++++-- src/layers_new/rnn.h | 1 - src/models/transformer_factory.h | 62 ++++++++++++++++---------------- 6 files changed, 60 insertions(+), 46 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0b52200ed..dbf50d5bd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,6 +40,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - New experimental layer framework for Transformer-like models. 
### Fixed +- Fixed ALiBI states and caching in new layer framework - Throw exception when forcing with FS vocabs - Fixed force-decoding with LSH - Fixed force-decoding for beam-size > 1 diff --git a/VERSION b/VERSION index 7542664ec..1e696b303 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.28 +v1.12.29 diff --git a/src/layers_new/alibi.h b/src/layers_new/alibi.h index 66c102235..e4d58e8b5 100644 --- a/src/layers_new/alibi.h +++ b/src/layers_new/alibi.h @@ -155,19 +155,25 @@ class AlibiAttentionMaskProcessor : public AttentionMaskProcessor { // Apply the alibi mask to the given query and mask virtual Expr apply(Expr query, Expr mask) const override { - if(!trainable) { - const_cast(slopes) = graph()->constant({numHeads, 1, 1}, initSlopes()); - const_cast(biases) = graph()->constant({numHeads, 1, 1}, initBiases()); - } else { - registerParameterLazy(slopes, Shape({numHeads, 1, 1}), initSlopes()); - registerParameterLazy(biases, Shape({numHeads, 1, 1}), initBiases()); - } - Expr shift = nullptr; - int start = 0; + auto processMask = [this, query](Expr mask) { + if(!trainable) { + const_cast(slopes) = graph()->constant({numHeads, 1, 1}, initSlopes()); + const_cast(biases) = graph()->constant({numHeads, 1, 1}, initBiases()); + } else { + registerParameterLazy(slopes, Shape({numHeads, 1, 1}), initSlopes()); + registerParameterLazy(biases, Shape({numHeads, 1, 1}), initBiases()); + } + + Expr shift = nullptr; + int start = 0; + auto alibiMask = alibiLogMask(mask, query, slopes, biases, shift, numHeads, start); + return alibiMask; + }; - auto alibiMask = alibiLogMask(mask, query, slopes, biases, shift, numHeads, start); - return alibiMask; + // recompute the mask if input mask changes (different memory address), otherwise return cached version + auto equal = [](Expr a, Expr b) { return a == b; }; + return cachedMask_->apply(mask, processMask, equal); } }; diff --git a/src/layers_new/attention.h b/src/layers_new/attention.h index 9bd31baa0..cc65c9fa7 100644 --- a/src/layers_new/attention.h +++ b/src/layers_new/attention.h @@ -310,8 +310,16 @@ struct AttentionMaskProcessor : public MaskProcessor { if(!mask) return nullptr; - // shape of mask should be [1, dimBatch, dimKeys, 1] - return marian::logMask(mask, numHeads, /*addCausalMask=*/false); // [1, dimBatch * numHeads, 1, dimKeys] + // shape of input `mask` should be [1, dimBatch, dimKeys, 1] + // output shape will be // [1, dimBatch * numHeads, 1, dimKeys] if addCausalMask is false + // or [1, dimBatch * numHeads, dimKeys, dimKeys] if addCausalMask is true + auto processMask = [this](Expr mask) { return marian::logMask(mask, numHeads, /*addCausalMask=*/false); }; + + // recompute the mask if input mask changes (different memory address), otherwise return cached version + auto equal = [](Expr a, Expr b) { return a == b; }; + + // recompute the mask if the shape changes, otherwise return cached version + return cachedMask_->apply(mask, processMask, equal); } }; diff --git a/src/layers_new/rnn.h b/src/layers_new/rnn.h index 9a9cd067f..3e3307476 100644 --- a/src/layers_new/rnn.h +++ b/src/layers_new/rnn.h @@ -132,7 +132,6 @@ class RNN final : public Layer, public IBinaryLayer, public IBinaryDecoderLayer } state->as()->set(cellState->recurrent); - state->setPosition(cellState->position); // during decoding again, this is a no-op Expr output = swapTimeBatch(concatenate(outputs, dimTimeAxis)); diff --git a/src/models/transformer_factory.h b/src/models/transformer_factory.h index fcd90ad63..06cf5a995 100644 --- a/src/models/transformer_factory.h +++ 
b/src/models/transformer_factory.h @@ -108,48 +108,48 @@ class TransformerLegacy : public EncoderDecoder { prefix = "TransformerBatchDecoder"; for(int layerNo = 0; layerNo < opt("dec-depth"); ++layerNo) { // name maps for decoder self-attention blocks - nameMap[fmt::format("decoder_l{}_self_Wq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->qProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_bq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->qProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->qProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_bq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->qProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_Wk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->kProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_bk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->kProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->kProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_bk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->kProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_Wv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->vProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_bv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->vProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->vProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_bv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->vProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_Wo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->oProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_bo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->oProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->oProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_bo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->oProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_Wo_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_Wo_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wo_ln_scale", layerNo + 1)] = 
fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wo_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->bias", prefix, layerNo); // name maps for decoder SSRU - nameMap[fmt::format("decoder_l{}_rnn_W", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->iProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_rnn_W", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->iProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_rnn_Wf", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->fProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_rnn_bf", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->fProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_rnn_Wf", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->fProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_rnn_bf", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->fProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_rnn_Wo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->oProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_rnn_bo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->oProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_rnn_Wo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->oProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_rnn_bo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->oProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_rnn_ffn_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_rnn_ffn_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_rnn_ffn_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_rnn_ffn_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->bias", prefix, layerNo); // name maps for decoder cross-attention blocks - nameMap[fmt::format("decoder_l{}_context_Wq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->qProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_context_bq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->qProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_Wq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->qProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_bq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->qProj->bias", prefix, layerNo); 
- nameMap[fmt::format("decoder_l{}_context_Wk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->kProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_context_bk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->kProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_Wk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->kProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_bk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->kProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_context_Wv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->vProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_context_bv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->vProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_Wv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->vProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_bv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->vProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_context_Wo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->oProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_context_bo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->oProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_Wo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->oProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_bo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->oProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_context_Wo_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->postprocessor->norm->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_context_Wo_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->postprocessor->norm->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_Wo_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->postprocessor->norm->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_Wo_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->postprocessor->norm->bias", prefix, layerNo); // name maps for decoder FFN blocks int mult = 3; @@ -160,11 +160,11 @@ class TransformerLegacy : public EncoderDecoder { mult = 1; layerType = "LinearReluDropout"; } - nameMap[fmt::format("decoder_l{}_ffn_W{}", layerNo + 1, ffnLayerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->filterBlock->layers->at({})->as()->weight", prefix, layerNo, mult * ffnLayerNo, layerType); - nameMap[fmt::format("decoder_l{}_ffn_b{}", layerNo + 1, ffnLayerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->filterBlock->layers->at({})->as()->bias", prefix, layerNo, mult * ffnLayerNo, layerType); + 
nameMap[fmt::format("decoder_l{}_ffn_W{}", layerNo + 1, ffnLayerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->filterBlock->layers->at({})->as()->weight", prefix, layerNo, mult * ffnLayerNo, layerType); + nameMap[fmt::format("decoder_l{}_ffn_b{}", layerNo + 1, ffnLayerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->filterBlock->layers->at({})->as()->bias", prefix, layerNo, mult * ffnLayerNo, layerType); } - nameMap[fmt::format("decoder_l{}_ffn_ffn_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->filterBlock->postprocessor->norm->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_ffn_ffn_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->filterBlock->postprocessor->norm->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_ffn_ffn_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->filterBlock->postprocessor->norm->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_ffn_ffn_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->filterBlock->postprocessor->norm->bias", prefix, layerNo); } return nameMap; From 07042cf2cea3d2b745a69bd14b76acaa1df0a913 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 23 Apr 2024 12:31:12 +0000 Subject: [PATCH 25/26] Merged PR 34167: Do not mmap files for conversion in Quicksand API * Do not mmap files for conversion --- CHANGELOG.md | 1 + VERSION | 2 +- src/microsoft/quicksand.cpp | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dbf50d5bd..1b908d9cc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,6 +40,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - New experimental layer framework for Transformer-like models. ### Fixed +- Do not mmap files for conversion via Quicksand API - Fixed ALiBI states and caching in new layer framework - Throw exception when forcing with FS vocabs - Fixed force-decoding with LSH diff --git a/VERSION b/VERSION index 1e696b303..1501aad44 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.29 +v1.12.30 diff --git a/src/microsoft/quicksand.cpp b/src/microsoft/quicksand.cpp index 3dce71d57..67fef8592 100644 --- a/src/microsoft/quicksand.cpp +++ b/src/microsoft/quicksand.cpp @@ -303,7 +303,7 @@ DecoderCpuAvxVersion parseCpuAvxVersion(std::string name) { bool convertModel(std::string inputFile, std::string outputFile, int32_t targetPrec, int32_t lshNBits) { std::cerr << "Converting from: " << inputFile << ", to: " << outputFile << ", precision: " << targetPrec << std::endl; - auto modelFile = New(inputFile); + auto modelFile = New(inputFile, marian::io::MmapMode::DontMmap); YAML::Node config = modelFile->getYamlFromModel(); std::stringstream configStr; From a6ab8af8fc8f02c130819bfe7e07318ec958e323 Mon Sep 17 00:00:00 2001 From: Thamme Gowda Date: Thu, 27 Jun 2024 15:21:08 +0000 Subject: [PATCH 26/26] Merged PR 34540: pymarian: build for multiple python versions; disable tcmalloc; huggingface backed for gated COMETs pymarian upgrades * Support for build for multiple python versions at once; borrowed a cmake script from AMD * use "build" instead of "pip wheel"; build is more stable and leaves less junk on file system * Disable tcmalloc for pymarian * Added support for [huggingface backend](https://huggingface.co/collections/Unbabel/marian-comet-metrics-and-qe-664e28c82743db6709d022fc). Currently enabled for gated comet models only. 
* Added `--cache` argument to pymarian-eval CLI; Useful for accessing cache from blobstorage mount path for gated models --- CHANGELOG.md | 4 +- CMakeLists.txt | 6 ++ VERSION | 2 +- cmake/PythonModules.cmake | 119 ++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 45 +++++++----- src/python/README.md | 68 +++++++++++++++++- src/python/pymarian/defaults.py | 37 ++++++---- src/python/pymarian/eval.py | 6 +- src/python/pymarian/utils.py | 49 +++++++++---- src/python/pyproject.toml | 3 +- src/python/setup.py | 17 +++-- 11 files changed, 296 insertions(+), 60 deletions(-) create mode 100644 cmake/PythonModules.cmake diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b908d9cc..afa4465ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] - Fixed compilation with clang 16.0.6 -- Added Threads::Threads to EXT_LIBS - +- Added Threads::Threads to `EXT_LIBS` +- Updates to pymarian: building for multiple python versions; disabling tcmalloc; hosting gated COMETs on HuggingFace ### Added - Added `--normalize-gradient-by-ratio` to mildly adapt gradient magnitude if effective batch size diverges from running average effective batch size. diff --git a/CMakeLists.txt b/CMakeLists.txt index b6aa74297..ee05cf99b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -123,6 +123,12 @@ set(CMAKE_THREAD_PREFER_PTHREAD TRUE) set(THREADS_PREFER_PTHREAD_FLAG TRUE) find_package(Threads REQUIRED) set(EXT_LIBS ${EXT_LIBS} Threads::Threads) + +# disable tcmalloc if pymarian=on +if(USE_TCMALLOC AND PYMARIAN) + message(WARNING "TCMalloc can cause segfaults with some python libraries. Hence disabling TCMalloc for a robust pymarian build.") + set(USE_TCMALLOC off) +endif() ######## ############################################################################### diff --git a/VERSION b/VERSION index 1501aad44..7b4d55e09 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.30 +v1.12.31 diff --git a/cmake/PythonModules.cmake b/cmake/PythonModules.cmake new file mode 100644 index 000000000..062155647 --- /dev/null +++ b/cmake/PythonModules.cmake @@ -0,0 +1,119 @@ +# Retrieved from ROCm/AMDMIGraphX repo @ https://github.com/ROCm/AMDMIGraphX/blob/develop/cmake/PythonModules.cmake +##################################################################################### +# The MIT License (MIT) +# +# Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +##################################################################################### +if(COMMAND find_python) + return() +endif() + + +macro(py_exec) + execute_process(${ARGN} RESULT_VARIABLE RESULT) + if(NOT RESULT EQUAL 0) + message(FATAL_ERROR "Process failed: ${ARGN}") + endif() +endmacro() + +set(PYBIND11_NOPYTHON On) +# this wont work if pybind11 is git submodule +#find_package(pybind11 REQUIRED) + +## ===================== +set(PYTHON_SEARCH_VERSIONS 3.7 3.8 3.9 3.10 3.11 3.12 3.13) +set(PYTHON_DISABLE_VERSIONS "" CACHE STRING "") +foreach(PYTHON_DISABLE_VERSION ${PYTHON_DISABLE_VERSIONS}) + list(REMOVE_ITEM PYTHON_SEARCH_VERSIONS ${PYTHON_DISABLE_VERSION}) +endforeach() + +## ===================== + +macro(find_python version) + find_program(PYTHON_CONFIG_${version} python${version}-config) + if(EXISTS ${PYTHON_CONFIG_${version}}) + py_exec(COMMAND ${PYTHON_CONFIG_${version}} --includes OUTPUT_VARIABLE _python_include_args) + execute_process(COMMAND ${PYTHON_CONFIG_${version}} --ldflags --embed OUTPUT_VARIABLE _python_ldflags_args RESULT_VARIABLE _python_ldflags_result) + if(NOT _python_ldflags_result EQUAL 0) + py_exec(COMMAND ${PYTHON_CONFIG_${version}} --ldflags OUTPUT_VARIABLE _python_ldflags_args) + endif() + separate_arguments(_python_includes UNIX_COMMAND "${_python_include_args}") + separate_arguments(_python_ldflags UNIX_COMMAND "${_python_ldflags_args}") + string(REPLACE "-I" "" _python_includes "${_python_includes}") + add_library(python${version}::headers INTERFACE IMPORTED GLOBAL) + set_target_properties(python${version}::headers PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${_python_includes}" + ) + add_library(python${version}::runtime INTERFACE IMPORTED GLOBAL) + set_target_properties(python${version}::runtime PROPERTIES + INTERFACE_LINK_OPTIONS "${_python_ldflags}" + INTERFACE_LINK_LIBRARIES python${version}::headers + ) + py_exec(COMMAND ${PYTHON_CONFIG_${version}} --prefix OUTPUT_VARIABLE _python_prefix) + string(STRIP "${_python_prefix}" _python_prefix) + set(PYTHON_${version}_EXECUTABLE "${_python_prefix}/bin/python${version}" CACHE PATH "") + endif() +endmacro() + +####### +function(py_extension name version) + set(_python_module_extension ".so") + if(version VERSION_GREATER_EQUAL 3.0) + py_exec(COMMAND ${PYTHON_CONFIG_${version}} --extension-suffix OUTPUT_VARIABLE _python_module_extension) + string(STRIP "${_python_module_extension}" _python_module_extension) + endif() + set_target_properties(${name} PROPERTIES PREFIX "" SUFFIX "${_python_module_extension}") +endfunction() + +function(py_add_module NAME) + set(options) + set(oneValueArgs PYTHON_VERSION PYTHON_MODULE) + set(multiValueArgs) + + cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(PYTHON_VERSION ${PARSE_PYTHON_VERSION}) + + add_library(${NAME} MODULE ${PARSE_UNPARSED_ARGUMENTS}) + pybind11_strip(${NAME}) + py_extension(${NAME} ${PYTHON_VERSION}) + target_link_libraries(${NAME} PRIVATE pybind11::module pybind11::lto python${PYTHON_VERSION}::headers) + set_target_properties(${NAME} PROPERTIES + OUTPUT_NAME ${PARSE_PYTHON_MODULE} + C_VISIBILITY_PRESET hidden + CXX_VISIBILITY_PRESET hidden + ) + +endfunction() + +### +set(_PYTHON_VERSIONS) +foreach(PYTHON_VERSION ${PYTHON_SEARCH_VERSIONS}) + 
find_python(${PYTHON_VERSION}) + if(TARGET python${PYTHON_VERSION}::headers) + message(STATUS "Python ${PYTHON_VERSION} found.") + list(APPEND _PYTHON_VERSIONS ${PYTHON_VERSION}) + else() + message(STATUS "Python ${PYTHON_VERSION} not found.") + endif() +endforeach() +# Make the variable global +set(PYTHON_VERSIONS "${_PYTHON_VERSIONS}" CACHE INTERNAL "" FORCE) + diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c0c4f74b9..fb5bdca98 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -292,24 +292,35 @@ endif(GENERATE_MARIAN_INSTALL_TARGETS) if(PYMARIAN) - if(NOT PYTHON_EXECUTABLE) - set(PYTHON_EXECUTABLE python) # default to python in the environment - endif() - + # python libs which use different version of tcmalloc (e.g. pandas) can cause segfaults, so we disable it include_directories(3rd_party/pybind11/include) add_subdirectory(3rd_party/pybind11) + include(PythonModules) + # print all python versions + message(STATUS "Going to look for these Python versions: ${PYTHON_VERSIONS}") + add_custom_target(_pymarian) + foreach(PYTHON_VERSION ${PYTHON_VERSIONS}) # defined in PythonModules.cmake + py_add_module(_pymarian_${PYTHON_VERSION} python/binding/bind.cpp PYTHON_VERSION ${PYTHON_VERSION} PYTHON_MODULE _pymarian) + target_link_libraries(_pymarian_${PYTHON_VERSION} PUBLIC marian ${EXT_LIBS}) + if(CUDA_FOUND) + target_link_libraries(_pymarian_${PYTHON_VERSION} PUBLIC marian_cuda) + endif(CUDA_FOUND) + add_dependencies(_pymarian _pymarian_${PYTHON_VERSION}) + endforeach() + + # ==== make .whl files ==== + # IMPORTANT: do not parallelize the wheel builds; they conflict on a few directories (e.g. *.egg-info) + set(LAST_PYMARIAN_TGT "") + foreach(PYTHON_VERSION ${PYTHON_VERSIONS}) + add_custom_target( + pymarian_${PYTHON_VERSION} ALL + COMMAND ${PYTHON_${PYTHON_VERSION}_EXECUTABLE} -m pip install --upgrade pip build + COMMAND ${CMAKE_COMMAND} -E env "CMAKE_BINARY_DIR=${PROJECT_BINARY_DIR}" "CMAKE_SOURCE_DIR=${PROJECT_SOURCE_DIR}" + ${PYTHON_${PYTHON_VERSION}_EXECUTABLE} -m build --wheel ${PROJECT_SOURCE_DIR}/src/python -o "${PROJECT_BINARY_DIR}" + DEPENDS _pymarian_${PYTHON_VERSION} ${LAST_PYMARIAN_TGT} + VERBATIM COMMENT "===Building pymarian wheel for python${PYTHON_VERSION}===" + ) + set(LAST_PYMARIAN_TGT pymarian_${PYTHON_VERSION}) + endforeach(PYTHON_VERSION) - pybind11_add_module(_pymarian MODULE python/binding/bind.cpp) - target_link_libraries(_pymarian PUBLIC marian) - if(CUDA_FOUND) - target_link_libraries(_pymarian PUBLIC marian_cuda) - endif(CUDA_FOUND) - install(TARGETS _pymarian DESTINATION .) - - # build pymarian wheel - add_custom_target(pymarian ALL - ${CMAKE_COMMAND} -E env "CMAKE_BINARY_DIR=${PROJECT_BINARY_DIR}" "CMAKE_SOURCE_DIR=${PROJECT_SOURCE_DIR}" - "${PYTHON_EXECUTABLE}" -m pip wheel -v --no-input ${PROJECT_SOURCE_DIR}/src/python -w "${PROJECT_BINARY_DIR}" - DEPENDS _pymarian - VERBATIM COMMENT "Building pymarian wheel") endif(PYMARIAN) diff --git a/src/python/README.md b/src/python/README.md index d3fc34e25..82d300675 100644 --- a/src/python/README.md +++ b/src/python/README.md @@ -13,7 +13,7 @@ cmake --build build -j # -j option parallelizes build on all cpu cores python -m pip install build/pymarian-*.whl ``` -Since the above commands uses `python` executable in the PATH to determine Python version to compile marian native extension, make sure to have the desired `python` executable in your environment _before_ invoking these commands. 
+The above commands use `python` executable in the PATH to determine Python version for compiling marian native extension. Make sure to have the desired `python` executable in your environment _before_ invoking these cmake commands. ## Python API @@ -96,6 +96,7 @@ options: -ws WORKSPACE, --workspace WORKSPACE Workspace memory (default: 8000) -pc, --print-cmd Print marian evaluate command and exit (default: False) + --cache CACHE Cache directory for storing models (default: $HOME/.cache/marian/metric) More info at https://github.com/marian-nmt/marian-dev. This CLI is loaded from .../python3.10/site-packages/pymarian/eval.py (version: 1.12.25) @@ -157,6 +158,71 @@ python -m pytest -s src/python/tests/regression ``` +## Release Instructions + +### Building Pymarian for Multiple Python Versions + +Our CMake scripts detects `python3.*` available in PATH and builds pymarian for each. +To support a specific version of python, make the `python3.x` executable available in PATH prior to running cmake. +This can be achieved by (without conflicts) using `conda` or `mamba`. + + +```bash +# setup mamba if not already; Note: you may use conda as well +which mamba || { + name=Miniforge3-$(uname)-$(uname -m).sh + wget "https://github.com/conda-forge/miniforge/releases/latest/download/$name" \ + && bash $name -b -p ~/mambaforge && ~/mambaforge/bin/mamba init bash && rm $name +} + +# create environment for each version +versions="$(echo 3.{12,11,10,9,8,7})" +for version in $versions; do + echo "python $version" + mamba env list | grep -q "^py${version}" || mamba create -q -y -n py${version} python=${version} +done + +# stack all environments +for version in $versions; do mamba activate py${version} --stack; done +# check if all python versions are available +for version in $versions; do which python$version; done + + +# Build as usual +cmake . -B build -DCOMPILE_CUDA=off -DPYMARIAN=on +cmake --build build -j +ls build/pymarian*.whl +``` + +### Upload to PyPI +```bash +twine upload -r testpypi build/*.whl + +twine upload -r pypi build/*.whl +``` + +__Initial Setup:__ create `~/.pypirc` with following: + +```ini +[distutils] +index-servers = + pypi + testpypi + +[pypi] +repository: https://upload.pypi.org/legacy/ +username:__token__ +password: + +[testpypi] +repository: https://test.pypi.org/legacy/ +username:__token__ +password: +``` +Obtain token from https://pypi.org/manage/account/ + + + ## Known issues 1. 
In conda or mamba environment, if you see `.../miniconda3/envs//bin/../lib/libstdc++.so.6: version 'GLIBCXX_3.4.30' not found` error, diff --git a/src/python/pymarian/defaults.py b/src/python/pymarian/defaults.py index 2fdeff278..51c29c69d 100644 --- a/src/python/pymarian/defaults.py +++ b/src/python/pymarian/defaults.py @@ -1,9 +1,12 @@ from pathlib import Path - +import os class Defaults: BASE_URL = "https://textmt.blob.core.windows.net/www/marian/metric" - CACHE_PATH = Path.home() / '.cache' / 'marian' / 'metric' + + DEF_CACHE_PATH = Path.home() / '.cache' / 'marian' / 'metric' + # user might also change this from CLI at runtime + CACHE_PATH = Path(os.environ['MARIAN_CACHE']) if os.environ.get('MARIAN_CACHE', '').strip() else DEF_CACHE_PATH MINI_BATCH = 16 MAXI_BATCH = 256 WORKSPACE = 8000 @@ -12,20 +15,24 @@ class Defaults: FLOAT_PRECISION = 4 FILE_LOCK_TIMEOUT = 1 * 60 * 60 # seconds => 1 hour PROGRESS_BAR = True - - # metric name to model type; lowercase all IDs + HUGGINGFACE = "huggingface" + AZURE = "azure" + COMET_VOCAB_REPO = "microsoft/infoxlm-large" + # metric id -> (model_type, huggingface_org/model_id) + # unbabel agreed to host models within their org and added the same gating/licensing mechanism + # we hosted bleurt ourself (Apache2.0) on https://huggingface.co/marian-nmt KNOWN_METRICS = { - "bleurt-20": "bleurt", - "wmt20-comet-da": "comet", - "wmt20-comet-qe-da": "comet-qe", - "wmt20-comet-qe-da-v2": "comet-qe", - "wmt21-comet-da": "comet", - "wmt21-comet-qe-da": "comet-qe", - "wmt21-comet-qe-mqm": "comet-qe", - "wmt22-comet-da": "comet", - "wmt22-cometkiwi-da": "comet-qe", - "xcomet-xl": "comet", - "xcomet-xxL": "comet", + "bleurt-20": ["bleurt", "marian-nmt/bleurt-20"], + "wmt20-comet-da": ["comet", "unbabel/wmt20-comet-da-marian"], + "wmt20-comet-qe-da": ["comet-qe", "unbabel/wmt20-comet-qe-da-marian"], + "wmt20-comet-qe-da-v2": ["comet-qe", "unbabel/wmt20-comet-qe-da-v2-marian"], + "wmt21-comet-da": ["comet", "unbabel/wmt21-comet-da-marian"], + "wmt21-comet-qe-da": ["comet-qe", "unbabel/wmt21-comet-qe-da-marian"], + "wmt21-comet-qe-mqm": ["comet-qe", "unbabel/wmt21-comet-qe-mqm-marian"], + "wmt22-comet-da": ["comet", "unbabel/wmt22-comet-da-marian"], + "wmt22-cometkiwi-da": ["comet-qe", "unbabel/wmt22-cometkiwi-da-marian"], + "wmt23-cometkiwi-da-xl": ["comet-qe", "unbabel/wmt23-cometkiwi-da-xl-marian"], + "wmt23-cometkiwi-da-xxl": ["comet-qe", "unbabel/wmt23-cometkiwi-da-xxl-marian"], } # model type to field order diff --git a/src/python/pymarian/eval.py b/src/python/pymarian/eval.py index 4b5e5f02c..d355e100f 100755 --- a/src/python/pymarian/eval.py +++ b/src/python/pymarian/eval.py @@ -25,7 +25,7 @@ def parse_args(): f'This CLI is loaded from {__file__} (version: {__version__})', ) - known_metrics = ', '.join(Defaults.KNOWN_METRICS) + known_metrics = ', '.join(Defaults.KNOWN_METRICS.keys()) parser.add_argument( '-m', '--model', @@ -89,6 +89,7 @@ def parse_args(): parser.add_argument( '-pc', '--print-cmd', action="store_true", help="Print marian evaluate command and exit" ) + parser.add_argument('--cache', help='Cache directory for storing models', type=Path, default=Defaults.CACHE_PATH) args = parser.parse_args() return vars(args) @@ -197,6 +198,7 @@ def main(**args): log.debug(args) else: args['quiet'] = '' + Defaults.CACHE_PATH = args.pop('cache') model_id = args.pop('model') model_path = Path(model_id) @@ -221,7 +223,7 @@ def main(**args): model_path = get_model_path(model_id) if not vocab_path: # if vocab is not given, resolve it from cache vocab_path = 
get_vocab_path(model_id) - args['like'] = Defaults.KNOWN_METRICS.get(model_id, Defaults.DEF_MODEL_TYPE) + args['like'] = Defaults.KNOWN_METRICS.get(model_id, [Defaults.DEF_MODEL_TYPE])[0] except ValueError as e: raise ValueError(f'Invalid model ID: {model_id}') from e diff --git a/src/python/pymarian/utils.py b/src/python/pymarian/utils.py index c3a4efab0..0d33ae455 100644 --- a/src/python/pymarian/utils.py +++ b/src/python/pymarian/utils.py @@ -6,13 +6,14 @@ import logging as log import shutil from pathlib import Path -from typing import List, Tuple +from typing import Tuple import portalocker import requests from .defaults import Defaults from .pypdl import Downloader +from huggingface_hub import hf_hub_download log.basicConfig(level=log.INFO) @@ -40,15 +41,22 @@ def get_model_path(model_name, progress_bar: bool = PROGRESS_BAR) -> Path: If necessary, this function downloads checkpoint to a local cache directory. :param model_name: model name + :param progress_bar: show progress bar while downloading :return: checkpoint path """ validate_id(model_name) - chkpt_url = f'{Defaults.BASE_URL}/{model_name}/model.{model_name}.bin' - - local_dir = Defaults.CACHE_PATH / model_name - chkpt_local = local_dir / f'model.{model_name}.bin' - - maybe_download_file(chkpt_url, chkpt_local) + hf_repo_id = Defaults.KNOWN_METRICS.get(model_name, [None, None])[1] + if hf_repo_id: + # TODO: support progress bar switch + chkpt_local = hf_hub_download(repo_id=hf_repo_id, filename="checkpoints/marian.model.bin", + cache_dir=Defaults.CACHE_PATH) + chkpt_local = Path(chkpt_local) + else: + chkpt_url = f'{Defaults.BASE_URL}/{model_name}/model.{model_name}.bin' + local_dir = Defaults.CACHE_PATH / model_name + chkpt_local = local_dir / f'model.{model_name}.bin' + + maybe_download_file(chkpt_url, chkpt_local, progress_bar=progress_bar) assert chkpt_local.exists(), f'Checkpoint file {chkpt_local} does not exist' return chkpt_local @@ -61,12 +69,27 @@ def get_vocab_path(model_name, progress_bar: bool = PROGRESS_BAR) -> Tuple[Path, :param progress_bar: show progress bar while downloading :return: checkpoint path, vocabulary path """ - validate_id(model_name) - local_dir = Defaults.CACHE_PATH / model_name - vocab_local = local_dir / 'vocab.spm' - - vocab_url = f'{Defaults.BASE_URL}/{model_name}/vocab.spm' - maybe_download_file(vocab_url, vocab_local, progress_bar=progress_bar) + hf_repo_id = Defaults.KNOWN_METRICS.get(model_name, [None, None])[1] + if hf_repo_id: + filename = "vocab.spm" + if 'comet' in hf_repo_id.lower(): + hf_repo_id = Defaults.COMET_VOCAB_REPO + filename = "sentencepiece.bpe.model" + # TODO: support progress bar switch + vocab_local = hf_hub_download(repo_id=hf_repo_id, filename=filename, cache_dir=Defaults.CACHE_PATH) + vocab_local = Path(vocab_local) + if vocab_local.suffix != ".spm": # marian requires .spm extension + vocab_spm = vocab_local.with_suffix(".spm") + if not vocab_spm.exists(): + vocab_spm.symlink_to(Path(vocab_local.name), target_is_directory=False) + vocab_local = vocab_spm + else: + validate_id(model_name) + local_dir = Defaults.CACHE_PATH / model_name + vocab_local = local_dir / 'vocab.spm' + + vocab_url = f'{Defaults.BASE_URL}/{model_name}/vocab.spm' + maybe_download_file(vocab_url, vocab_local, progress_bar=progress_bar) assert vocab_local.exists(), f'Vocabulary file {vocab_local} does not exist' return vocab_local diff --git a/src/python/pyproject.toml b/src/python/pyproject.toml index f2008a924..30eb16f36 100644 --- a/src/python/pyproject.toml +++ b/src/python/pyproject.toml @@ 
-33,6 +33,7 @@ dependencies = [
     "pyyaml",
     "tqdm",
     "requests",
+    "huggingface-hub==0.23.1",
 ]
 
 [project.scripts]
@@ -49,8 +50,6 @@ demos = [
     "sentence-splitter@git+https://github.com/mediacloud/sentence-splitter",
 ]
 
-[tool.setuptools]
-include-package-data = true
 
 [tool.black]
 line-length = 110
diff --git a/src/python/setup.py b/src/python/setup.py
index bcbca2c63..0e34efd30 100644
--- a/src/python/setup.py
+++ b/src/python/setup.py
@@ -71,12 +71,15 @@ def get_native_ext() -> Path:
         print(f"\t>>>Making it available under the package scope at: {native_ext_local}")
         shutil.copy(native_ext, native_ext_local)
 
-    # remove incompatible .so files from prior builds (if any)
-    for old_file in Path(__file__).parent.glob("_pymarian.*"):
-        if old_file.resolve() == native_ext_local.resolve():
-            continue
-        print(f"\t>>>Removing old file: {old_file}")
-        old_file.unlink()
+    # wheel builder adds all *.so files into *.whl making the wheel bloated; so we remove them
+    remove_old_files = True
+    if remove_old_files:
+        # remove incompatible .so files from prior builds (if any)
+        for old_file in Path(__file__).parent.glob("_pymarian.*"):
+            if old_file.resolve() == native_ext_local.resolve():
+                continue
+            print(f"\t>>>INFO:: Removing incompatible extension: {old_file}")
+            old_file.unlink()
     return native_ext_local
 
@@ -97,6 +100,6 @@ def has_ext_modules(foo):
     package_dir={"pymarian": "pymarian"},
     packages=find_namespace_packages(where=".", exclude=["tests", "binding"]),
     include_package_data=True,
-    package_data={"": [str(native_ext)]},
+    package_data={"pymarian": [str(native_ext)]},
    distclass=BinaryDistribution,
 )
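
To illustrate the model-resolution flow introduced in the last patch (PR 34540), here is a minimal, self-contained Python sketch of how a pymarian metric checkpoint is expected to be located after these changes. It is an approximation, not the shipped code: `resolve_checkpoint` is a hypothetical stand-in for `pymarian.utils.get_model_path`, only two entries of `KNOWN_METRICS` are shown, and file locking, vocabulary handling, and error handling are omitted.

```python
# Hypothetical sketch of the checkpoint-resolution flow from PR 34540; names
# mirror src/python/pymarian/{defaults,utils}.py but this is not the actual module.
import os
from pathlib import Path

from huggingface_hub import hf_hub_download

BASE_URL = "https://textmt.blob.core.windows.net/www/marian/metric"
# overridable via the MARIAN_CACHE environment variable or the new `--cache` CLI flag
CACHE_PATH = Path(os.environ.get("MARIAN_CACHE") or Path.home() / ".cache" / "marian" / "metric")

# metric id -> [model type, Hugging Face repo id]; subset of Defaults.KNOWN_METRICS
KNOWN_METRICS = {
    "bleurt-20": ["bleurt", "marian-nmt/bleurt-20"],
    "wmt22-cometkiwi-da": ["comet-qe", "unbabel/wmt22-cometkiwi-da-marian"],
}


def resolve_checkpoint(metric_id: str) -> Path:
    """Return a local path to the model checkpoint, downloading it if needed."""
    hf_repo_id = KNOWN_METRICS.get(metric_id, [None, None])[1]
    if hf_repo_id:
        # known (possibly gated) models: huggingface_hub handles auth and caching
        local = hf_hub_download(repo_id=hf_repo_id,
                                filename="checkpoints/marian.model.bin",
                                cache_dir=CACHE_PATH)
        return Path(local)
    # other models: plain HTTPS download from the Azure blob container, as before
    chkpt_url = f"{BASE_URL}/{metric_id}/model.{metric_id}.bin"
    chkpt_local = CACHE_PATH / metric_id / f"model.{metric_id}.bin"
    # maybe_download_file(chkpt_url, chkpt_local)  # helper provided by pymarian.utils
    return chkpt_local


if __name__ == "__main__":
    print(resolve_checkpoint("bleurt-20"))
```

With the accompanying CLI change, the cache location is resolved in this order: the `--cache` argument to `pymarian-eval` if given, otherwise the `MARIAN_CACHE` environment variable, otherwise `$HOME/.cache/marian/metric`.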