From a728daa78f478bfc67f80317cb6cc5f8e700bea1 Mon Sep 17 00:00:00 2001 From: Thamme Gowda Date: Mon, 13 Nov 2023 11:49:31 +0000 Subject: [PATCH 01/26] Merged PR 31742: Fix docker url security: use microsoft cr Fix docker url security: use microsoft container registry instead of public dockerhub --- scripts/metrics/Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/metrics/Dockerfile b/scripts/metrics/Dockerfile index 4641e6571..25a3236a9 100644 --- a/scripts/metrics/Dockerfile +++ b/scripts/metrics/Dockerfile @@ -1,5 +1,6 @@ -FROM nvidia/cuda:11.1.1-devel-ubuntu20.04 - +FROM mcr.microsoft.com/azureml/minimal-ubuntu20.04-py38-cuda11.6.2-gpu-inference:20231102.v2 +# use this if microsoft image is not accessible; +#FROM nvidia/cuda:11.1.1-devel-ubuntu20.04 LABEL description="Marian image - Ubuntu 20.04" ARG DEBIAN_FRONTEND=noninteractive From 6fe9a8078446cb0fb711e5bbb9d638ee87d3e9cc Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Fri, 24 Nov 2023 15:58:48 +0000 Subject: [PATCH 02/26] Merged PR 31906: Updates to CI pipeline: new vcpkg and options to disable specific jobs --- azure-pipelines.yml | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 1ec1f8739..4e1744375 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -8,10 +8,26 @@ parameters: # Allow skipping the entire 'Build' stage -- name: runBuilds - displayName: Run builds? Uncheck to run regression tests only. +- name: disableAllBuilds + displayName: Disable all builds and run regression tests only type: boolean - default: true + default: false +- name: disableAllTests + displayName: Do not run regression tests + type: boolean + default: false +- name: disableLinux + displayName: Disable Linux builds + type: boolean + default: false +- name: disableMacOS + displayName: Disable MacOS builds + type: boolean + default: false +- name: disableWindows + displayName: Disable Windows builds + type: boolean + default: false # Warning: the current branch policies disable the automatic triggering to # minimize VM usage! 
@@ -54,7 +70,7 @@ variables: - name: MKL_URL value: "https://data.statmt.org/romang/marian-regression-tests/ci/mkl-2020.1-windows-static.zip" - name: VCPKG_COMMIT - value: 2022.03.10 + value: 2023.11.20 - name: VCPKG_DIR value: "$(Build.SourcesDirectory)/vcpkg" - name: VCPKG_PACKAGES @@ -73,7 +89,7 @@ stages: ###################################################################### - job: BuildWindows cancelTimeoutInMinutes: 1 - condition: eq(${{ parameters.runBuilds }}, true) + condition: and( eq(${{ parameters.disableAllBuilds }}, false), eq(${{ parameters.disableWindows }}, false) ) displayName: Windows strategy: @@ -210,7 +226,7 @@ stages: ###################################################################### - job: BuildUbuntu cancelTimeoutInMinutes: 1 - condition: eq(${{ parameters.runBuilds }}, true) + condition: and( eq(${{ parameters.disableAllBuilds }}, false), eq(${{ parameters.disableLinux }}, false) ) displayName: Ubuntu timeoutInMinutes: 120 @@ -348,7 +364,7 @@ stages: ###################################################################### - job: BuildMacOS cancelTimeoutInMinutes: 1 - condition: eq(${{ parameters.runBuilds }}, true) + condition: and( eq(${{ parameters.disableAllBuilds }}, false), eq(${{ parameters.disableMacOS }}, false) ) displayName: macOS CPU clang pool: @@ -398,7 +414,7 @@ stages: ###################################################################### - job: BuildInstall cancelTimeoutInMinutes: 1 - condition: eq(${{ parameters.runBuilds }}, true) + condition: and( eq(${{ parameters.disableAllBuilds }}, false), eq(${{ parameters.disableLinux }}, false) ) displayName: Linux CPU library install pool: @@ -462,6 +478,7 @@ stages: ###################################################################### - job: TestWindows cancelTimeoutInMinutes: 1 + condition: eq(${{ parameters.disableAllTests }}, false) displayName: Windows CPU+FBGEMM pool: @@ -588,6 +605,7 @@ stages: ###################################################################### - job: TestLinux cancelTimeoutInMinutes: 1 + condition: eq(${{ parameters.disableAllTests }}, false) displayName: Linux CPU+FBGEMM pool: From 72c8d60a77eacc73a572d0a0167e648e15e5def5 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Mon, 27 Nov 2023 10:18:41 +0000 Subject: [PATCH 03/26] Merged PR 31918: Update MKL in GPU regression tests --- azure-regression-tests.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/azure-regression-tests.yml b/azure-regression-tests.yml index 0448b172a..206c018a1 100644 --- a/azure-regression-tests.yml +++ b/azure-regression-tests.yml @@ -84,8 +84,9 @@ stages: # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html - bash: | - wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB" | sudo apt-key add - - sudo sh -c "echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list" + sudo mkdir -p /usr/share/keyrings + wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/intel.gpg > /dev/null + echo "deb [signed-by=/usr/share/keyrings/intel.gpg] https://apt.repos.intel.com/mkl all main" | sudo tee /etc/apt/sources.list.d/intel-mkl.list sudo apt-get update -o Dir::Etc::sourcelist="/etc/apt/sources.list.d/intel-mkl.list" sudo apt-get install -y --no-install-recommends intel-mkl-64bit-2020.0-088 displayName: Install MKL From a7cc324d50c02ae74257595e2284a543398f498b Mon Sep 17 
00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sat, 2 Dec 2023 03:47:54 +0000 Subject: [PATCH 04/26] Merged PR 31730: ALIBI with shifts This PR adds a first working version of ALIBI with algorithmic shifts for encoder-decoder models. Also adds trainable ALIBI slopes and biases and ALIBI in general to the **new** layer framework. This is still experimental. --- CHANGELOG.md | 1 + CMakeLists.txt | 4 +- VERSION | 2 +- src/CMakeLists.txt | 3 + src/common/config_parser.cpp | 26 ++ src/data/corpus_base.cpp | 2 +- src/data/sentencepiece_vocab.cpp | 3 +- src/data/vocab.cpp | 8 + src/data/vocab.h | 3 + src/functional/shape.h | 15 +- src/functional/tensor.h | 3 +- src/graph/expression_operators.cpp | 55 +++-- src/graph/expression_operators.h | 6 +- src/graph/node_operators_binary.h | 39 ++- src/layers/constructors.h | 2 +- src/layers/loss.h | 22 +- src/layers_new/alibi.cpp | 339 +++++++++++++++++++++++++++ src/layers_new/alibi.cu | 251 ++++++++++++++++++++ src/layers_new/alibi.h | 268 +++++++++++++++++++++ src/layers_new/attention.cpp | 105 +++++++++ src/layers_new/attention.h | 77 ++++-- src/layers_new/interface.h | 25 ++ src/layers_new/neuralnet.h | 17 -- src/layers_new/transformer.h | 99 +++++--- src/models/bleurt.h | 3 - src/models/comet_qe.h | 83 +++---- src/models/decoder.h | 1 + src/models/encoder_decoder.cpp | 29 ++- src/models/encoder_decoder.h | 2 +- src/models/states.h | 31 ++- src/models/transformer.h | 3 + src/models/transformer_new.h | 23 +- src/tensors/cpu/tensor_operators.cpp | 2 + src/tensors/gpu/add.inc | 3 + src/tensors/gpu/add_all.cu | 15 ++ src/tensors/gpu/add_all.h | 14 +- src/tensors/gpu/add_all.inc | 6 + src/tensors/gpu/element.inc | 7 +- src/tensors/gpu/tensor_operators.cu | 3 +- src/translator/beam_search.cpp | 6 +- src/translator/translator.h | 2 +- 41 files changed, 1391 insertions(+), 217 deletions(-) create mode 100644 src/layers_new/alibi.cpp create mode 100644 src/layers_new/alibi.cu create mode 100644 src/layers_new/alibi.h create mode 100644 src/layers_new/attention.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 4242e5c19..e4eb14230 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Fixed compilation with clang 16.0.6 ### Added +- Added ALIBI related options to new layer framework. - Added `--no-spm-encode` option, allowing the model to use vocabulary IDs directly to train/decode. - Added MSE and MAE costs to COMET-QE training. - Added augmentation of shuffled examples to COMET-QE training via `--comet-augment-bad`. 
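As a minimal sketch of what the per-head ALIBI term added by this patch computes (the fused implementation is the gAlibi kernel in src/layers_new/alibi.cu further down; the slope, bias and shift values here are illustrative placeholders, not the patch's learned parameters):

#include <cmath>
#include <cstdio>

int main() {
  const int dimQuery = 4, dimKeys = 6, start = 0; // start = decoding position offset
  const float slope = -0.05f;  // per-head slope (optionally trainable in this patch)
  const float bias  = 0.3f;    // per-head additive bias (patch-specific extension, not in the original ALIBI paper)
  const float shift = 2.0f;    // algorithmic shift derived from separator-symbol sync points

  for(int q = 0; q < dimQuery; ++q) {
    for(int k = 0; k < dimKeys; ++k) {
      float relPos = (float)k - (float)(q + start) - shift; // key position relative to the (shifted) query position
      float alibi  = slope * std::fabs(relPos + bias);      // term added to the attention logits before softmax
      std::printf("%7.3f ", alibi);
    }
    std::printf("\n");
  }
  return 0;
}

The actual kernel additionally combines this term with the padding mask via min(logMask, alibi), where logMask is +/-maskFactor depending on the binary mask value.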
diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c674e68d..2ea841254 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -240,7 +240,7 @@ else(MSVC) set(CMAKE_RDYNAMIC_FLAG "-rdynamic") endif(CMAKE_COMPILER_IS_GNUCC) - set(CMAKE_CXX_FLAGS "-std=c++11 -pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}") + set(CMAKE_CXX_FLAGS "-std=c++17 -pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}") set(CMAKE_CXX_FLAGS_RELEASE "-O3 -m64 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}") set(CMAKE_CXX_FLAGS_SLIM "-O3 -m64 -funroll-loops -DNDEBUG") @@ -472,7 +472,7 @@ else(CMAKE_BUILD_TYPE STREQUAL "Debug") endif(CMAKE_BUILD_TYPE STREQUAL "Debug") if(NOT MSVC) # @TODO: add warnings here too - list(APPEND CUDA_NVCC_FLAGS -ccbin ${CMAKE_C_COMPILER}; -std=c++11; -Xcompiler\ -fPIC; -Xcompiler\ -Wno-unused-result; -Xcompiler\ -Wno-deprecated; -Xcompiler\ -Wno-pragmas; -Xcompiler\ -Wno-unused-value; -Xcompiler\ -Werror;) + list(APPEND CUDA_NVCC_FLAGS -ccbin ${CMAKE_C_COMPILER}; -std=c++17; -Xcompiler\ -fPIC; -Xcompiler\ -Wno-unused-result; -Xcompiler\ -Wno-deprecated; -Xcompiler\ -Wno-pragmas; -Xcompiler\ -Wno-unused-value; -Xcompiler\ -Werror;) list(APPEND CUDA_NVCC_FLAGS ${INTRINSICS_NVCC}) else() list(APPEND CUDA_NVCC_FLAGS -Xcompiler\ /FS; -Xcompiler\ /MT$<$:d>; ) diff --git a/VERSION b/VERSION index 10ae91bd5..274b68518 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.14 +v1.12.15 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 77c455946..d4cb8cc14 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -82,6 +82,8 @@ set(MARIAN_SOURCES layers/lsh.cpp layers_new/neuralnet.cpp + layers_new/alibi.cpp + layers_new/attention.cpp rnn/cells.cpp rnn/attention.cpp @@ -179,6 +181,7 @@ set_target_properties(marian PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY if(CUDA_FOUND) cuda_add_library(marian_cuda + layers_new/alibi.cu tensors/gpu/device.cu tensors/gpu/hash.cu tensors/gpu/algorithm.cu diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 425a78143..d797b8e2d 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -320,6 +320,32 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { cli.add("--transformer-depth-scaling", "Scale down weight initialization in transformer layers by 1 / sqrt(depth)"); + cli.add("--transformer-attention-mask", + "Type of mask/bias in transformer attention: default, alibi", + "default"); + cli.add("--transformer-alibi-shift", + "Use alibi-shifting with sync-points with --transformer-attention-mask alibi"); + cli.add("--separator-symbol", + "Generic separator symbol for different applications, i.e. for transformer-alibi-shift syncpoints, default is [eos] (currently only supported with raw spm models)", + "[eos]"); + cli.add("--transformer-disable-position-embeddings", + "Do not add any position embeddings. Use e.g. 
with --transformer-attention-mask alibi"); + + cli.add("--transformer-alibi-trainable", + "Make alibi slopes trainable, default slopes are constant"); + + // handy shortcut for the current best setup + cli.add("--alibi", + "Use alibi settings for transformer, this is a shortcut for --transformer-attention-mask alibi --transformer-alibi-shift --transformer-disable-position-embeddings --separator-symbol [eos]"); + cli.alias("alibi", "true", [](YAML::Node& config) { + // define current-best alibi settings + config["transformer-attention-mask"] = "alibi"; + config["transformer-alibi-shift"] = true; + config["transformer-disable-position-embeddings"] = true; + config["separator-symbol"] = "[eos]"; + config["transformer-alibi-trainable"] = true; + }); + cli.add("--transformer-no-bias", "Don't use any bias vectors in linear layers"); cli.add("--transformer-no-affine", diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp index b168ecba1..0ef804b1c 100644 --- a/src/data/corpus_base.cpp +++ b/src/data/corpus_base.cpp @@ -355,7 +355,7 @@ CorpusBase::CorpusBase(Ptr options, bool translate, size_t seed) // when force-decoding we want the last vocab to be part of the batch, // hence we do not drop it from the input batch. bool forceDecoding = options_->get("force-decode", false); - size_t shift = !forceDecoding ? 1 : 0; + size_t shift = forceDecoding ? 0 : 1; for(size_t i = 0; i + shift < numVocs; ++i) { Ptr vocab = New(options_, i); diff --git a/src/data/sentencepiece_vocab.cpp b/src/data/sentencepiece_vocab.cpp index 548b95a46..84d406dd5 100644 --- a/src/data/sentencepiece_vocab.cpp +++ b/src/data/sentencepiece_vocab.cpp @@ -255,7 +255,8 @@ class SentencePieceVocab : public IVocab { for(const Word& id : sentence) if(!ignoreEOS || id != getEosId()) line += (*this)[id] + " "; - line.pop_back(); // trim the trailing whitespace + if(!line.empty()) + line.pop_back(); // trim the trailing whitespace } else { // convert vector of Word to vector of int std::vector spmSentence; diff --git a/src/data/vocab.cpp b/src/data/vocab.cpp index 82a4b8da1..a39c0eeae 100644 --- a/src/data/vocab.cpp +++ b/src/data/vocab.cpp @@ -142,6 +142,14 @@ Word Vocab::getEosId() const { return vImpl_->getEosId(); } // return UNK symbol id Word Vocab::getUnkId() const { return vImpl_->getUnkId(); } +// return generic separator symbol id +Word Vocab::getSepId() const { + std::string sepSym = options_->get("separator-symbol", "[eos]"); + Word id = (*vImpl_)[sepSym]; + ABORT_IF(id == getUnkId(), "Separator symbol '{}' not found in vocabulary", sepSym); + return id; +} + std::vector Vocab::suppressedIds(bool suppressUnk, bool suppressSpecial) const { std::vector ids; if(suppressUnk) { diff --git a/src/data/vocab.h b/src/data/vocab.h index 7eeca2902..2c60912f7 100644 --- a/src/data/vocab.h +++ b/src/data/vocab.h @@ -73,6 +73,9 @@ class Vocab { // return UNK symbol id Word getUnkId() const; + // return generic separator symbol id + Word getSepId() const; + // return a set of Word ids that should be suppressed based on the underlying vocabulary implementation. // Arguments mosty likely provided based on outside options like --allow-unk etc. 
std::vector suppressedIds(bool suppressUnk = true, bool suppressSpecial = true) const; diff --git a/src/functional/shape.h b/src/functional/shape.h index fd354e1e0..330fa9971 100644 --- a/src/functional/shape.h +++ b/src/functional/shape.h @@ -76,17 +76,24 @@ struct ConstantShape { offset_(shape.offset_) {} template - HOST_DEVICE ConstantShape(const Array& shape) { + ConstantShape(const Array& shape) { ABORT_IF(M > N, "Recompile with CONST_SHAPE_DIMS >= {}", M); - std::copy(shape.begin(), shape.end(), shape_.begin() + N - M); - if(N - M) - std::fill_n(shape_.begin(), N - M, 1); + for(int i = 0; i < shape.size(); ++i) + shape_[N - M + i] = shape[i]; + for(int i = 0; i < N - M; ++i) + shape_[i] = 1; updateStrides(); updateElements(); } + HOST_DEVICE ConstantShape(const Array& shape) + : shape_(shape) { + updateStrides(); + updateElements(); + } + HOST_DEVICE ConstantShape(const Array& shape, const Array& stride, size_t offset) diff --git a/src/functional/tensor.h b/src/functional/tensor.h index f5549c608..e631cd63a 100644 --- a/src/functional/tensor.h +++ b/src/functional/tensor.h @@ -70,7 +70,8 @@ struct View { HOST_DEVICE View(T* ptr, const ConstantShape& shape) : data_(ptr), shape_(shape) {} - HOST View(marian::Tensor t) : data_(t->data()), shape_(adapt(t->shape())) {} + HOST View(marian::Tensor t) + : data_(t ? t->data() : nullptr), shape_(t ? adapt(t->shape()) : adapt(marian::Shape({0, 0, 0, 0}))) {} HOST_DEVICE_INLINE T& operator[](size_t i) { return data_[shape_.index((int)i)]; diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index 1a81ce51f..60a86112f 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -183,7 +183,7 @@ Expr maximum(Expr a, Expr b) { // @TODO: implement version without constant Expr maximum(float a, Expr b) { - auto aExpr = b->graph()->constant({}, inits::fromValue(a)); + auto aExpr = b->graph()->constant({1}, inits::fromValue(a), b->value_type()); return Expression(aExpr, b); } @@ -197,7 +197,7 @@ Expr minimum(Expr a, Expr b) { // @TODO: implement version without constant Expr minimum(float a, Expr b) { - auto aExpr = b->graph()->constant({}, inits::fromValue(a)); + auto aExpr = b->graph()->constant({1}, inits::fromValue(a), b->value_type()); return Expression(aExpr, b); } @@ -216,19 +216,19 @@ Expr ge(Expr a, Expr b) { return Expression(a, b, -1, true); } Expr ne(Expr a, Expr b) { return Expression(a, b, 0, true); } Expr le(Expr a, Expr b) { return Expression(a, b, 1, true); } -Expr lt(float a, Expr b) { return Expression(b->graph()->constant({}, inits::fromValue(a), b->value_type()), b, -1, false); } -Expr eq(float a, Expr b) { return Expression(b->graph()->constant({}, inits::fromValue(a), b->value_type()), b, 0, false); } -Expr gt(float a, Expr b) { return Expression(b->graph()->constant({}, inits::fromValue(a), b->value_type()), b, 1, false); } -Expr ge(float a, Expr b) { return Expression(b->graph()->constant({}, inits::fromValue(a), b->value_type()), b, -1, true); } -Expr ne(float a, Expr b) { return Expression(b->graph()->constant({}, inits::fromValue(a), b->value_type()), b, 0, true); } -Expr le(float a, Expr b) { return Expression(b->graph()->constant({}, inits::fromValue(a), b->value_type()), b, 1, true); } +Expr lt(float a, Expr b) { return Expression(b->graph()->constant({1}, inits::fromValue(a), b->value_type()), b, -1, false); } +Expr eq(float a, Expr b) { return Expression(b->graph()->constant({1}, inits::fromValue(a), b->value_type()), b, 0, false); } +Expr gt(float a, Expr 
b) { return Expression(b->graph()->constant({1}, inits::fromValue(a), b->value_type()), b, 1, false); } +Expr ge(float a, Expr b) { return Expression(b->graph()->constant({1}, inits::fromValue(a), b->value_type()), b, -1, true); } +Expr ne(float a, Expr b) { return Expression(b->graph()->constant({1}, inits::fromValue(a), b->value_type()), b, 0, true); } +Expr le(float a, Expr b) { return Expression(b->graph()->constant({1}, inits::fromValue(a), b->value_type()), b, 1, true); } -Expr lt(Expr a, float b) { return Expression(a, a->graph()->constant({}, inits::fromValue(b), a->value_type()), -1, false); } -Expr eq(Expr a, float b) { return Expression(a, a->graph()->constant({}, inits::fromValue(b), a->value_type()), 0, false); } -Expr gt(Expr a, float b) { return Expression(a, a->graph()->constant({}, inits::fromValue(b), a->value_type()), 1, false); } -Expr ge(Expr a, float b) { return Expression(a, a->graph()->constant({}, inits::fromValue(b), a->value_type()), -1, true); } -Expr ne(Expr a, float b) { return Expression(a, a->graph()->constant({}, inits::fromValue(b), a->value_type()), 0, true); } -Expr le(Expr a, float b) { return Expression(a, a->graph()->constant({}, inits::fromValue(b), a->value_type()), 1, true); } +Expr lt(Expr a, float b) { return Expression(a, a->graph()->constant({1}, inits::fromValue(b), a->value_type()), -1, false); } +Expr eq(Expr a, float b) { return Expression(a, a->graph()->constant({1}, inits::fromValue(b), a->value_type()), 0, false); } +Expr gt(Expr a, float b) { return Expression(a, a->graph()->constant({1}, inits::fromValue(b), a->value_type()), 1, false); } +Expr ge(Expr a, float b) { return Expression(a, a->graph()->constant({1}, inits::fromValue(b), a->value_type()), -1, true); } +Expr ne(Expr a, float b) { return Expression(a, a->graph()->constant({1}, inits::fromValue(b), a->value_type()), 0, true); } +Expr le(Expr a, float b) { return Expression(a, a->graph()->constant({1}, inits::fromValue(b), a->value_type()), 1, true); } /*********************************************************/ @@ -280,23 +280,22 @@ Expr operator/(Expr a, float b) { // TODO: efficient version of this without constant() Expr operator/(float a, Expr b) { - auto aExpr = b->graph()->constant({}, inits::fromValue(a)); + auto aExpr = b->graph()->constant({1}, inits::fromValue(a), b->value_type()); return aExpr / b; } -// Expr pow(float a, Expr b) { -// return Expression(a, b); -// -//} -// -// Expr pow(Expr a, float b) { -// return Expression(a, b); -// -//} -// -// Expr pow(Expr a, Expr b) { -// return Expression(a, b); -//} +// @TODO: implement proper operators for all three: +Expr pow(float a, Expr b) { + return exp(std::log(a) * b); +} + +Expr pow(Expr a, float b) { + return exp(log(a) * b); +} + +Expr pow(Expr a, Expr b) { + return exp(log(a) * b); +} /*********************************************************/ diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index e96d8f7c9..c792096b1 100644 --- a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -303,9 +303,9 @@ Expr square(Expr a); */ Expr abs(Expr a); -// Expr pow(Expr a, Expr b); -// Expr pow(float a, Expr b); -// Expr pow(Expr a, float b); +Expr pow(Expr a, Expr b); +Expr pow(float a, Expr b); +Expr pow(Expr a, float b); /** * Computes @f$\log(e^a + e^b)@f$. 
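The pow() overloads above are implemented via the identity a^b = exp(b * log(a)), so they implicitly assume a strictly positive base. A quick standalone check of that identity (illustrative only, not part of the patch):

#include <cmath>
#include <cstdio>

int main() {
  double a = 2.5, b = 3.0;                                            // the log-based form requires a > 0
  std::printf("std::pow(a, b)  = %f\n", std::pow(a, b));              // 15.625000
  std::printf("exp(log(a) * b) = %f\n", std::exp(std::log(a) * b));   // 15.625000
  return 0;
}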
diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index 29259f983..8cf0af1a4 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -1692,23 +1692,40 @@ struct RMSNormalizationOp : public NaryNodeOp { float eps_; }; - +// @TODO: rewriting this fixes a bug for this one node. There should be exactly one +// NodeOp per gradient tensor many other nodes have that bug and need to be fixed. +// This will only manifest if the first op is not trainable, then gradients for the +// other nodes might get skipped despite being trainable. struct HighwayNodeOp : public NaryNodeOp { - HighwayNodeOp(const std::vector& nodes) : NaryNodeOp(nodes) {} + HighwayNodeOp(const std::vector& nodes) : NaryNodeOp(nodes, Shape::broadcast(nodes)) {} NodeOps forwardOps() override { - return {NodeOp(HighwayForward( - val_, child(0)->val(), child(1)->val(), child(2)->val()))}; + using namespace functional; + auto alpha = sigmoid(_4); + auto fwd = _1 = alpha * _2 + (1.f - alpha) * _3; + + return { + NodeOp(Element(fwd, val_, child(0)->val(), child(1)->val(), child(2)->val())) + }; } NodeOps backwardOps() override { - return {NodeOp(HighwayBackward(child(0)->grad(), - child(1)->grad(), - child(2)->grad(), - child(0)->val(), - child(1)->val(), - child(2)->val(), - adj_))}; + using namespace functional; + + auto alpha = sigmoid(_1); + auto bwd1 = alpha * _2; + auto bwd2 = (1.f - alpha) * _2; + auto bwd3 = alpha * (1.f - alpha) * _2 * (_3 - _4); + + auto& in1 = child(0)->val(); + auto& in2 = child(1)->val(); + auto& gate = child(2)->val(); + + return { + NodeOp(Add(bwd1, child(0)->grad(), gate, adj_)), + NodeOp(Add(bwd2, child(1)->grad(), gate, adj_)), + NodeOp(Add(bwd3, child(2)->grad(), gate, adj_, in1, in2)) + }; } const std::string type() override { return "highway"; } diff --git a/src/layers/constructors.h b/src/layers/constructors.h index 5597a6a4e..28be22e2d 100644 --- a/src/layers/constructors.h +++ b/src/layers/constructors.h @@ -246,7 +246,7 @@ class MLPFactory : public Factory { public: AsLayerFactory(const WrappedFactory& wrapped) : us(wrapped) {} Ptr construct(Ptr graph) override final { - auto p = std::static_pointer_cast(us.construct(graph)); + auto p = std::dynamic_pointer_cast(us.construct(graph)); ABORT_IF(!p, "Attempted to cast a Factory to LayerFactory that isn't one"); return p; } diff --git a/src/layers/loss.h b/src/layers/loss.h index 5dbb5e553..c27dd954d 100644 --- a/src/layers/loss.h +++ b/src/layers/loss.h @@ -198,11 +198,12 @@ class ScaledMultiRationalLoss : public MultiRationalLoss { virtual Expr accumulateLoss(const RationalLoss& current) override { if(loss_) { const auto& first = partialLosses_.front(); - return loss_ - + current.loss() * first.count() - / current.count(); // scale up/down to match scale of first loss + Type lossType = loss_->value_type(); + // scale up/down to match scale of first loss + return loss_ + cast(current.loss(), lossType) * first.count() / cast(current.count(), lossType); } else { - return current.loss(); // first reference loss, keeps to scale with this one + // first reference loss, keeps to scale with this one + return current.loss(); } } @@ -212,7 +213,7 @@ class ScaledMultiRationalLoss : public MultiRationalLoss { } else { return current.count(); // This is the first loss } - } + } public: ScaledMultiRationalLoss() : MultiRationalLoss() {} @@ -233,18 +234,19 @@ class ScaledMultiRationalLoss : public MultiRationalLoss { class MeanMultiRationalLoss : public MultiRationalLoss { private: virtual Expr 
accumulateLoss(const RationalLoss& current) override { - if(loss_) - return loss_ + current.loss() / current.count(); - else + if(loss_) { + Type lossType = loss_->value_type(); + return loss_ + cast(current.loss(), lossType) / cast(current.count(), lossType); + } else { return current.loss() / current.count(); + } } virtual Expr accumulateCount(const RationalLoss& current) override { if(count_) return count_; // keep the existing '1' else - return current.count()->graph()->ones( - {1}, current.loss()->value_type()); // just '1' as labels are factored into loss_ + return current.count()->graph()->ones({1}, current.loss()->value_type()); // just '1' as labels are factored into loss_ } public: diff --git a/src/layers_new/alibi.cpp b/src/layers_new/alibi.cpp new file mode 100644 index 000000000..07989ce6a --- /dev/null +++ b/src/layers_new/alibi.cpp @@ -0,0 +1,339 @@ +#include "graph/node_operators_unary.h" +#include "layers_new/alibi.h" + +namespace marian { + +AlibiDecoderState::AlibiDecoderState(const rnn::States& states, + Logits logProbs, + const std::vector>& encStates, + Ptr batch, + bool isBatchMajor) +: DecoderState(states, logProbs, encStates, batch, isBatchMajor) {} + +// override to create derived decoder states +Ptr AlibiDecoderState::Create(const rnn::States& states, + Logits logProbs, + const std::vector>& encStates, + Ptr batch, + bool isBatchMajor) const { + return New(states, logProbs, encStates, batch, isBatchMajor); +} + +// expand the decoder state +Ptr AlibiDecoderState::next(const rnn::States& states, + Logits logProbs) const { + // expand the previous decoder state via the base class expansion + auto state = std::dynamic_pointer_cast(DecoderState::next(states, logProbs)); + // this should always succeed, unless we somehow messed up inheritance + ABORT_IF(!state, "state is nullptr, i.e. the conversion to AlibiDecoderState failed??"); + + // carry over the sync points and last beam size from the previous state + state->syncPoints_ = syncPoints_; + state->lastBeam_ = lastBeam_; + return state; +} + +// select the hypotheses based on beam search indices +Ptr AlibiDecoderState::select( + const std::vector& hypIndices, // [beamIndex * activeBatchSize + batchIndex] + const Words& words, + const std::vector& batchIndices, // [batchIndex] + int beamSize) const { + // select the hypotheses via the base class selection + auto state = std::dynamic_pointer_cast(DecoderState::select(hypIndices, words, batchIndices, beamSize)); + // this should always succeed, unless we somehow messed up inheritance + ABORT_IF(!state, "state is nullptr, i.e. 
the conversion to AlibiDecoderState failed??"); + // compute the new sync points and carry over the current beam size + // this is the most important part of the algorithm while decoding + state->syncPoints_ = computeSyncPoints(hypIndices, words, batchIndices, beamSize); + state->lastBeam_ = beamSize; + return state; +} + +// get the alibi shift for the current state based on currently stored sync points computed while decoding +Expr AlibiDecoderState::getAlibiShift(Ptr graph, bool decoding) const { + if(decoding) { + std::vector shift; + for(const auto& [trgPos, srcPos, batchIdx] : syncPoints_) + shift.push_back((float)(srcPos - trgPos)); + + if(!shift.empty()) { + int dimBeam = lastBeam_; + ABORT_IF(dimBeam == 0, "dimBeam is 0??"); + int dimBatch = (int)shift.size() / dimBeam; + return graph->constant({dimBeam, dimBatch, 1, 1}, inits::fromVector(shift)); // [dimBeam, dimBatch, dimTrg=1, 1] + } else { + return nullptr; + } + } else { + ABORT_IF(getBatch()->sets() != 2, + "--transformer-alibi-shift=true currently only works with batch sets=2"); + return getAlibiShiftFromBatch(graph); + } +} + +// get the alibi shift based on the batch data - this is used during training or scoring where ground truth is available +Expr AlibiDecoderState::getAlibiShiftFromBatch(Ptr graph) const { + std::vector shift; + + auto targetBatch = getBatch()->back(); + Word trgSyncSym = targetBatch->vocab()->getSepId(); + + auto locateInTrg = [&targetBatch](int batchIdx, int j) { + return targetBatch->data()[targetBatch->locate(batchIdx, j)]; + }; + + auto sourceBatch = getBatch()->front(); + Word srcSyncSym = sourceBatch->vocab()->getSepId(); + + auto locateInSrc = [&sourceBatch](int batchIdx, int j) { + return sourceBatch->data()[sourceBatch->locate(batchIdx, j)]; + }; + + int dimBatch = (int)targetBatch->batchSize(); + int dimSrc = (int)sourceBatch->batchWidth(); + int dimTrg = (int)targetBatch->batchWidth(); + + for(int batchIdx = 0; batchIdx < dimBatch; ++batchIdx) { + int trgPos = -1, srcPos = -1; + for(int i = 0; i < dimTrg; ++i) { + if(i > 0) { // don't check until we are one word ahead to mimic generation order where we look back by one word (i - 1) + if(locateInTrg(batchIdx, i - 1) == trgSyncSym) { + trgPos = i - 1; // record that position + // now we are looking for the corresponding source position, no need to look backwards + for(int j = srcPos + 1; j < dimSrc; ++j) { + if(locateInSrc(batchIdx, j) == srcSyncSym) { + srcPos = j; + break; + } + } + } + } + + shift.push_back((float)(srcPos - trgPos)); + } + } + + if(!shift.empty()) { + return graph->constant({1, dimBatch, dimTrg, 1}, inits::fromVector(shift)); // [dimBeam=1, dimBatch, dimTrg, 1] + } else { + return nullptr; + } +} + +// compute the sync points for the current state based on the previous sync points and the last generated words. +// This happens one step at a time while decoding. 
+std::vector AlibiDecoderState::computeSyncPoints( + const std::vector& hypIndices, // [beamIndex * activeBatchSize + batchIndex] + const Words& words, // [beamIndex * activeBatchSize + batchIndex] + const std::vector& batchIndices, // [batchIndex] of activeBatchSize + int beamSize +) const { + size_t position = getPosition(); + + // get the sync symbols for source and target + auto sourceBatch = getBatch()->front(); + Word srcSyncSymbol = sourceBatch->vocab()->getSepId(); + Word trgSyncSymbol = srcSyncSymbol; // @TODO: this is actually wrong, we should make sure to use the correct target vocab + + auto locateInSrc = [&sourceBatch](int batchIdx, int j) { + return sourceBatch->data()[sourceBatch->locate(batchIdx, j)]; + }; + + int dimBatch = (int)batchIndices.size(); + std::vector nextSyncPoints; + + // For each hypothesis, create an updated sync point. + // If the current symbol is not a sync symbol, the sync point is the same as before and gets carried over. + // If the current symbol is a sync symbol, the sync point target coordinate is updated to the current position + // and the source coordinate is updated to the next sync symbol in the source sentence. + for(int i = 0; i < hypIndices.size(); ++i) { + SyncCoord pos = syncPoints_.empty() + ? SyncCoord({-1, -1, (int)batchIndices[i % dimBatch]}) // no sync points yet, initialize with -1 position and current batch index + : syncPoints_[hypIndices[i]]; // carry over the sync point from the previous state at first + auto& [trgPos, srcPos, batchIdx] = pos; + + // note, words were generated at the step before the current position, hence the pos - 1 + if(words[i] == trgSyncSymbol) { // the current word is a sync symbol, so update the sync point + trgPos = (int)position - 1; + // find the next sync symbol in the source sentence + for(int j = srcPos + 1; j < sourceBatch->batchWidth(); ++j) { + if(locateInSrc(batchIdx, j) == srcSyncSymbol) { // found the next sync symbol in the source + srcPos = j; // update the sync point source coordinate + break; // and stop looking + } + } + } + nextSyncPoints.push_back(pos); + } + + return nextSyncPoints; +} + + +Ptr NewDecoderState(Ptr options, + const rnn::States& states, + Logits logProbs, + const std::vector>& encStates, + Ptr batch, + bool isBatchMajor) { + if(options->get("transformer-alibi-shift", false)) { + ABORT_IF(options->get("transformer-attention-mask") != "alibi", "transformer-alibi-shift=true only works with transformer-attention-mask=\"alibi\""); + return New(states, logProbs, encStates, batch, isBatchMajor); + } else { + return New(states, logProbs, encStates, batch, isBatchMajor); + } +} + +Ptr convertDecoderState(Ptr state, + Ptr graph, + bool decoding) { + Expr shift; + auto alibiState = std::dynamic_pointer_cast(state); + if(alibiState) + shift = alibiState->getAlibiShift(graph, decoding); + + size_t position = state->getPosition(); + auto nnState = New(position); + for(auto& layerState : state->getStates()) { + if(alibiState) { + nnState->append(New(layerState.cell, shift, position)); + } else { + nnState->append(New(layerState.cell, position)); + } + } + return nnState; +} + +#ifdef CUDA_FOUND +namespace gpu { + template + void Alibi(int numHeads, int start, marian::Tensor out, Tensors... tensors); +} +#endif + +namespace cpu { + template + void Alibi(int numHeads, int start, marian::Tensor out, Tensors... tensors) { + ABORT("Not implemented"); + } +} + +template +void Alibi(int numHeads, int start, marian::Tensor out, Tensors... 
tensors) { +#ifdef CUDA_FOUND + if(out->getBackend()->getDeviceId().type == DeviceType::gpu) + gpu::Alibi(numHeads, start, out, tensors...); + else +#endif + cpu::Alibi(numHeads, start, out, tensors...); +} + + +#ifdef CUDA_FOUND +namespace gpu { + template + void AlibiGrad(int numHeads, int start, marian::Tensor slopesGrad, marian::Tensor biasesGrad, Tensors... tensors); +} +#endif + +namespace cpu { + template + void AlibiGrad(int numHeads, int start, marian::Tensor slopesGrad, marian::Tensor biasesGrad, Tensors... tensors) { + ABORT("Not implemented"); + } +} + +template +void AlibiGrad(int numHeads, int start, marian::Tensor slopesGrad, marian::Tensor biasesGrad, Tensors... inputs) { +#ifdef CUDA_FOUND + if(slopesGrad->getBackend()->getDeviceId().type == DeviceType::gpu) + gpu::AlibiGrad(numHeads, start, slopesGrad, biasesGrad, inputs...); + else +#endif + cpu::AlibiGrad(numHeads, start, slopesGrad, biasesGrad, inputs...); +} + +class AlibiLogMaskNode : public NaryNodeOp { +private: + int numHeads_{8}; + int start_{0}; + + Shape newShape(Expr mask, Expr query, int numHeads) { + int dimBeam = query->shape()[-4]; + int dimBatch = query->shape()[-3]; + int dimQuery = query->shape()[-2]; + int dimKeys = mask->shape()[-2]; + + return { dimBeam, dimBatch * numHeads, dimQuery, dimKeys }; + } + +public: + AlibiLogMaskNode(const std::vector& nodes, int numHeads, int start) + : NaryNodeOp(nodes, newShape(/*mask=*/nodes[0], /*query=*/nodes[1], numHeads), nodes[0]->value_type()), + numHeads_(numHeads), start_{start} + {} + + void forward() override { + Alibi( + numHeads_, + start_, + val_, + /*mask=*/ child(0)->val(), + /*slopes=*/child(2)->val(), + /*biases=*/child(3)->val(), + /*shift=*/ children().size() == 5 ? child(4)->val() : nullptr); + } + + void backward() override { + if(!trainable()) + return; + + AlibiGrad( + numHeads_, + start_, + // gradients + /*d_f/d_slopes=*/child(2)->grad(), + /*d_f/d_biases=*/child(3)->grad(), + // inputs + /*mask=*/ child(0)->val(), + /*slopes=*/ child(2)->val(), + /*biases=*/ child(3)->val(), + /*shift=*/ children().size() == 5 ? 
child(4)->val() : nullptr, + // adjoint + /*d_J/d_f=*/adj_); + } + + virtual size_t hash() override { + size_t seed = NaryNodeOp::hash(); + util::hash_combine(seed, numHeads_); + util::hash_combine(seed, start_); + return seed; + } + + virtual bool equal(Expr node) override { + if(!NaryNodeOp::equal(node)) + return false; + auto cnode = std::dynamic_pointer_cast(node); + if(!cnode) + return false; + if(numHeads_ != cnode->numHeads_) + return false; + if(start_ != cnode->start_) + return false; + return true; + } + + const std::string type() override { return "alibi-log-mask"; } +}; + +Expr alibiLogMask(Expr mask, Expr query, Expr slopes, Expr biases, Expr shift, int numHeads, int start) { + std::vector nodes = {mask, query, slopes, biases}; + if(shift) + nodes.push_back(shift); + + return Expression(nodes, numHeads, start); +} + + +} // namespace marian diff --git a/src/layers_new/alibi.cu b/src/layers_new/alibi.cu new file mode 100644 index 000000000..07042699b --- /dev/null +++ b/src/layers_new/alibi.cu @@ -0,0 +1,251 @@ +#include "common/types.h" +#include "functional/functional.h" +#include "functional/tensor.h" +#include "tensors/gpu/cuda_helpers.h" + +#include + +namespace marian { + +namespace gpu { + +template +__global__ void gAlibi( + functional::Tensor out, + functional::Array, 4> inputs, + int numHeads, + int start, + float maskFactor) { + + constexpr size_t N = functional::Shape::size(); + functional::Array oDims; + int length = out.shape().elements(); + + const auto& mask = inputs[0]; + const auto& slopes = inputs[1]; + const auto& biases = inputs[2]; + const auto& shift = inputs[3]; + + for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) { + int index = bid + blockDim.x * blockIdx.x + threadIdx.x; + if(index < length) { + out.shape().dims(index, oDims); + + int beamIdx = oDims[0]; + int batchHeadIdx = oDims[1]; + int queryIdx = oDims[2]; + int keyIdx = oDims[3]; + + // [[maybe_unused]] because NVCC seems to have a bug telling me the variable is not referenced when it appears in an intializer; this surpresses the warning. + [[maybe_unused]] int batchIdx = batchHeadIdx / numHeads; + [[maybe_unused]] int headIdx = batchHeadIdx % numHeads; + + int keyPos = keyIdx; + int queryPos = queryIdx + start; + + float relPos = (float)keyPos - (float)queryPos; + + if(shift.data() != nullptr) + relPos -= (float)shift[{beamIdx, batchIdx, queryIdx, 0}]; + + float slope = (float)slopes[{0, headIdx, 0, 0}]; + float bias = (float)biases[{0, headIdx, 0, 0}]; + float alibi = slope * abs(relPos + bias); + + float binMask = (float)mask[{0, batchIdx, keyIdx, 0}]; + float logMask = (2.f * binMask - 1.f) * maskFactor; // range (-maskFactor, maskFactor) + + out[index] = (T)min(logMask, alibi); + } + } +} + +template +void Alibi(int numHeads, int start, Tensor out, Tensors... 
tensors) { + cudaSetDevice(out->getDeviceId().no); + int length = out->size(); + + int threads = std::min(MAX_THREADS, length); + int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); + + float largest = NumericLimits(out->type()).max; + float maskFactor = std::min(largest / 2.f, 99999999.f); // to make sure we do not overflow for fp16 + + constexpr size_t K = sizeof...(tensors); + + if(out->type() == Type::float32) { + functional::Array, K> inputs = {tensors...}; + gAlibi<<>>(out, inputs, numHeads, start, maskFactor); +#if COMPILE_FP16 + } else if(out->type() == Type::float16) { + functional::Array, K> inputs = {tensors...}; + gAlibi<<>>(out, inputs, numHeads, start, maskFactor); +#endif + } else { + ABORT("Alibi for type {} not implemented", out->type()); + } +} + +// template specialization for h/cpp separation +template void Alibi(int, int, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); + +template +__global__ void gAlibiGrad( + functional::Tensor slopesGrad, + functional::Tensor biasesGrad, + functional::Array, 5> inputs, + int numHeads, + int start) { + + const auto& mask = inputs[0]; + const auto& slopes = inputs[1]; + const auto& biases = inputs[2]; + const auto& shift = inputs[3]; + const auto& adj = inputs[4]; + + int cols = adj.size() / numHeads; + + functional::Shape fullShape = adj.shape(); + int dimBeam = fullShape[0]; + int dimBatchHead = fullShape[1]; + [[maybe_unused]] // because NVCC seems to have a bug telling me the variable is not referenced + int dimBatch = dimBatchHead / numHeads; + int dimQuery = fullShape[2]; + int dimKeys = fullShape[3]; + + using A5 = functional::Array; + using S5 = functional::ConstantShape<5>; + S5 fullShape5(A5({dimBeam, dimBatch, numHeads, dimQuery, dimKeys})); + S5 headShape5(A5({dimBeam, dimBatch, 1, dimQuery, dimKeys})); + + A5 dims5; + const int HEAD_DIM = 2; + + // compute single element derivate for slopes and biases + auto dJ_dxy = [&](int headIdx, int colIdx) -> thrust::tuple { + // get the location for one head + headShape5.dims(colIdx, dims5); + + // set the location of the current head + dims5[HEAD_DIM] = headIdx; + // get the index into the full tensor + int index = fullShape5.index(dims5); + // get the value of the full adjoint + float vadj = (float)adj[index]; + + // handle the rest + int beamIdx = dims5[0]; + int batchIdx = dims5[1]; + int queryIdx = dims5[3]; + int keyIdx = dims5[4]; + + int keyPos = keyIdx; + int queryPos = queryIdx + start; + + float relPos = (float)keyPos - (float)queryPos; + + if(shift.data() != nullptr) + relPos -= (float)shift[{beamIdx, batchIdx, queryIdx, 0}]; + + float slope = (float)slopes[{0, headIdx, 0, 0}]; + float bias = (float)biases[{0, headIdx, 0, 0}]; + float binMask = (float)mask[{0, batchIdx, keyIdx, 0}]; + + float signedAlibi = relPos + bias; + + // compute derivative of slope + float dslope = binMask * abs(signedAlibi) * vadj; + + // compute derivative of bias + float db; + if(signedAlibi > 0) + db = 1.f; + else if(signedAlibi < 0) + db = -1.f; + else + db = 0.f; + float dbias = binMask * slope * db * vadj; + + return { dslope, dbias }; + }; + + for(int bid = 0; bid < numHeads; bid += gridDim.x) { + int headIdx = bid + blockIdx.x; + if(headIdx < numHeads) { + // get and assign shared memory + extern __shared__ uint8_t _sharedBytes[]; + float* _sum = (float*)(_sharedBytes); + auto sharedSlopes = [_sum](int idx) -> float& { return _sum[2 * idx + 0]; }; // use even indices for slopes + auto sharedBiases = [_sum](int idx) -> float& { 
return _sum[2 * idx + 1]; }; // use odd indices for biases + + sharedSlopes(threadIdx.x) = 0.0; + sharedBiases(threadIdx.x) = 0.0; + for(int tid = 0; tid < cols; tid += blockDim.x) { + int colIdx = tid + threadIdx.x; + if(colIdx < cols) { + float dslopes = 0, dbiases = 0; + // get the element-wise derivative + thrust::tie(dslopes, dbiases) = dJ_dxy(headIdx, colIdx); + // accumulate by thread id + sharedSlopes(threadIdx.x) += dslopes; + sharedBiases(threadIdx.x) += dbiases; + } + } + __syncthreads(); + + // accumulate here over matrix dimensions, tree reduction + int len = blockDim.x; + while(len != 1) { + __syncthreads(); + int skip = (len + 1) >> 1; + if(threadIdx.x < (len >> 1)) { + sharedSlopes(threadIdx.x) += sharedSlopes(threadIdx.x + skip); // float + sharedBiases(threadIdx.x) += sharedBiases(threadIdx.x + skip); // float + } + len = (len + 1) >> 1; + } + __syncthreads(); + + // assign accumulated gradients here (preserving existing gradients) + slopesGrad[headIdx] += (T)sharedSlopes(0); + biasesGrad[headIdx] += (T)sharedBiases(0); + } + __syncthreads(); + } +} + +template +void TypedAlibiGrad(int numHeads, int start, Tensor slopesGrad, Tensor biasesGrad, Tensors... tensors) { + cudaSetDevice(slopesGrad->getDeviceId().no); + + constexpr size_t K = sizeof...(tensors); + functional::Array, K> inputs = {tensors...}; + + const auto& adj = inputs[K - 1]; // last one is adjoint and full broadcast shape + int total = adj.size(); + + // we will reduce over each head + int blocks = std::min(MAX_BLOCKS, numHeads); + int threads = std::min(MAX_THREADS, total / numHeads); + int shared = sizeof(float) * threads * 2; // Use float32 as accumulation type, we accumulate slopes and biases + + gAlibiGrad<<>>(slopesGrad, biasesGrad, inputs, numHeads, start); +} + +template +void AlibiGrad(int numHeads, int start, Tensor slopesGrad, Tensor biasesGrad, Tensors... tensors) { + if(slopesGrad->type() == Type::float32) { + TypedAlibiGrad(numHeads, start, slopesGrad, biasesGrad, tensors...); +#if COMPILE_FP16 + } else if(slopesGrad->type() == Type::float16) { + TypedAlibiGrad(numHeads, start, slopesGrad, biasesGrad, tensors...); +#endif + } else { + ABORT("AlibiGrad for type {} not implemented", slopesGrad->type()); + } +} + +// template specialization for h/cpp separation +template void AlibiGrad(int, int, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +} +} diff --git a/src/layers_new/alibi.h b/src/layers_new/alibi.h new file mode 100644 index 000000000..bec2da55d --- /dev/null +++ b/src/layers_new/alibi.h @@ -0,0 +1,268 @@ +#pragma once + +#include "models/states.h" +#include "layers_new/attention.h" +#include "layers_new/decoder.h" +#include "layers_new/neuralnet.h" + +namespace marian { + +// @TODO: this whole set of functions is currently somewhat akward in general, since we need to implement +// old style and new style decoder state for this to work. We decoder with the old decoder framework, but +// use the new style transformer layers. This will eventually be cleaned up. 
+ +// Specialized version of DecoderState for model that knows about algorithmic ALIBI position shifts +class AlibiDecoderState : public DecoderState { +private: + typedef std::tuple SyncCoord; + mutable std::vector syncPoints_; + int lastBeam_{1}; + +public: + AlibiDecoderState(const rnn::States& states, + Logits logProbs, + const std::vector>& encStates, + Ptr batch, + bool isBatchMajor = false); + + // override to create derived decoder states + virtual Ptr Create(const rnn::States& states, + Logits logProbs, + const std::vector>& encStates, + Ptr batch, + bool isBatchMajor = false) const override; + + // expand the decoder state + virtual Ptr next(const rnn::States& states, + Logits logProbs) const override; + + // select the hypotheses based on beam search indices + virtual Ptr select( + const std::vector& hypIndices, // [beamIndex * activeBatchSize + batchIndex] + const Words& words, + const std::vector& batchIndices, // [batchIndex] + int beamSize) const override; + + // get the alibi shift for the current state based on currently stored sync points computed while decoding + Expr getAlibiShift(Ptr graph, bool decoding) const; + + // get the alibi shift based on the batch data - this is used during training or scoring where ground truth is available + Expr getAlibiShiftFromBatch(Ptr graph) const; + +private: + + // compute the sync points for the current state based on the previous sync points and the last generated words. + // This happens one step at a time while decoding. + std::vector computeSyncPoints( + const std::vector& hypIndices, // [beamIndex * activeBatchSize + batchIndex] + const Words& words, // [beamIndex * activeBatchSize + batchIndex] + const std::vector& batchIndices, // [batchIndex] of activeBatchSize + int beamSize + ) const; +}; + +// create a new (alibi) decoder state +Ptr NewDecoderState(Ptr options, + const rnn::States& states, + Logits logProbs, + const std::vector>& encStates, + Ptr batch, + bool isBatchMajor = false); + +// convert an old-style decoder state to an (alibi) decoder state +Ptr convertDecoderState(Ptr state, + Ptr graph, + bool decoding=false); + +// efficient operator for ALIBI log mask with shift and optionally learnable parameters +Expr alibiLogMask(Expr mask, Expr query, Expr shift, Expr slopes, Expr biases, int numHeads, int start); + +namespace nn { + +class AlibiDecoderStateItem : public DecoderStateItem { +private: + Expr shift_; + +public: + AlibiDecoderStateItem(Expr state, Expr shift, size_t position) : DecoderStateItem(state, position), shift_(shift) {} + virtual ~AlibiDecoderStateItem() = default; + + Expr getShift() const { + return shift_; + } +}; + +// Experimental implementation of the ALIBI attention mechanism (via masking) (https://arxiv.org/abs/2108.12409) +class AlibiAttentionMaskProcessor : public AttentionMaskProcessor { +public: + bool trainable{false}; // if true don't use learnable parameters + + Expr slopes; // learnable per head ALIBI slopes + Expr biases; // learnable per head additive biases + + using AttentionMaskProcessor::numHeads; + + AlibiAttentionMaskProcessor(Ptr graph, + Ptr options) + : AttentionMaskProcessor(graph, options), + trainable(options->get("transformer-alibi-trainable", false)) + {} + + virtual ~AlibiAttentionMaskProcessor() = default; + +private: +// @TODO: eventually to be removed. This computes ALIBI log masks with multiple operators, replaced with more efficient version below. +// For now we keep this for documentation and experimentation puprposes. 
+// The same functionality is implemented in `alibiLogMask` above via a special operator +#if 0 + const float ALIBI_REFERENCE_HEADS{8.f}; // number of reference heads that ALIBI slopes are computed for + + // Compute the alibi mask for a given query and keys + Expr alibiMask(Expr query, int dimQuery, int dimKeys, Ptr state) const { + int start = 0; + Expr shift = nullptr; + + int dimBatch = query->shape()[-3]; + int dimBeam = query->shape()[-4]; + + if(state) { + start = (int)state->getPosition(); + auto alibiState = std::dynamic_pointer_cast(state); + shift = alibiState ? alibiState->getShift() : nullptr; // [dimBeam, dimBatch, dimQuery, 1] + } + + // Create constant tensors of reflecting the query and key positions. + // When decoding, we start with the decoding state position for the query. The key positions are just the indices for the whole sequence. + Expr queryPositions = graph()->constant({1, 1, dimQuery, 1}, inits::range((float)start, (float)(start + dimQuery))); // [1, 1, dimQuery, 1] + Expr keyPositions = graph()->constant({1, 1, 1, dimKeys}, inits::range(0.f, (float)dimKeys)); // [1, 1, 1, dimKeys] + + // Create matrix of distances between positions, rows are distances of current query position vs all key positions. + // Layout is the same as the attention distance matrix where we compute rowwise softmaxes of similarities between + // each target word and all the source words + Expr alibiBiases = keyPositions - queryPositions; // [1, 1, dimQuery, dimKeys] + + // apply the corrective shift if any sync-points are present + if(shift) { + alibiBiases = alibiBiases - shift; // [dimBeam, dimBatch, dimQuery, dimKeys] + alibiBiases = reshape(alibiBiases, {dimBeam * dimBatch, 1, dimQuery, dimKeys}); // [dimBeam * dimBatch, 1, dimQuery, dimKeys] + } + + Expr alibi = slopes * abs(alibiBiases + biases); // [(dimBeam * dimBatch)|1, numHeads, dimQuery, dimKeys] + return alibi; + }; + + // Compute the log mask for a given query and combine with the alibi mask + Expr logMask(Expr query, Expr mask, Ptr state) const { + ABORT_IF(!mask, "mask is expected!!"); + + // query: [dimBeam, dimBatch, dimQuery, dimModel] -> dimQuery == dimTrgWords + int dimBatch = query->shape()[-3]; + int dimBeam = query->shape()[-4]; + + int dimQuery = query->shape()[-2]; + int dimKeys = mask->shape()[-2]; + + // all this is bascially a copy of the normal attention mask computation, however we need to do some extra reshaping + // to make the alibi mask and the log mask broadcastable and then combine them via minimum + + // Note, this is not a typical logMask with values 0 (don't mask) and -inf (mask). Rather we use +inf (or a large value) + // and -inf and then compbine with the ALIBI mask via minimum. This way, we keep the original ALIBI values where the mask has + // +inf and have -inf for masking. + // largest useful value and making sure we do not overflow for fp16 + float maskFactor = std::min(NumericLimits(mask->value_type()).max / 2.f, 99999999.f); + // convert binary 0/1 mask to -1/1 mask and then muliply with inf, results in -inf/+inf mask. 
+ auto logMask = (2.f * mask - 1.f) * maskFactor; // [1, dimBatch, dimKeys, 1] + logMask = reshape(logMask, {dimBatch, 1, 1, dimKeys}); // [dimBatch, 1, 1, dimKeys] + + + // make logMask broadcastable when decoding with beam search + logMask = repeat(logMask, /*repeats=*/dimBeam, /*axis=*/-4); // [dimBeam|1 * dimBatch, 1, 1, dimKeys] + + // make logMask and alibiBias broadcastable, then combine + auto alibiBias = alibiMask(query, dimQuery, dimKeys, state); // [(dimBeam * dimBatch)|1, numHeads, dimQuery, dimKeys] + logMask = minimum(logMask, alibiBias); // [dimBeam|1 * dimBatch, numHeads, dimQuery, dimKeys] + + // final reshape to match attention operation + logMask = reshape(logMask, {dimBeam, dimBatch * numHeads, dimQuery, dimKeys}); // [dimBeam|1, dimBatch * numHeads, dimQuery, dimKeys] + return logMask; + } +#endif + + // Initialized the head-wise scaling factors from ALIBI (they are constant in the original paper, + // we are making them optionally learnable here) + Ptr initSlopes(bool decoder = false) const { +// This is the original implementation of ALIBI slopes for LMs. We find our slopes and biases work better for Seq2seq models +// Keep for now until we find a use, e.g. in LMs +#if 0 + std::vector mVec(numHeads); + for(size_t i = 0; i < numHeads; ++i) { + // slopes in the paper go from 1/2^1 to 1/2^8 where 8 is the reference number of heads; + // if there are more or less heads we scale back to 8 heads and interpolate. + float exponent = (float)(i + 1) * (ALIBI_REFERENCE_HEADS / (float)numHeads); + + // We multiply slopes with 2 for the symmetric mask to keep total probability mass the + // same as in the causal mask (we have two symmetric halves instead of just one causal half) + mVec[i] = -2.f / std::pow(2.f, exponent); + if(decoder) + mVec[i] *= 0.5f; + } + + return inits::fromVector(mVec); +#else + // Magic numbers, for now don't ask. + std::vector init; + if(decoder) { + return inits::fromValue(-0.1f); + } else { + init = { -2.00f, -1.00f, -0.50f, -0.25f, -0.05f, -0.05f, -0.05f, -0.05f }; + init.resize(numHeads, -0.05f); + return inits::fromVector(init); + } +#endif + } + + // Head-wise biases for ALIBI, this does not occur in the paper, ignore the magic numbers + Ptr initBiases(bool decoder=false) const { + if(decoder) { + return inits::fromValue(0.3f); + } else { + std::vector init({ 1.00f, -2.00f, 3.00f, -4.00f, 5.00f, -6.00f, 7.00f, -8.00f }); + init.resize(numHeads, 0.f); + return inits::fromVector(init); + } + } + +public: + // Apply the alibi mask to the given query and mask + virtual Expr apply(Expr query, Expr mask) const override { + return apply(query, mask, /*state=*/nullptr); + } + + // Apply the alibi mask to the given query and mask for decoder cross-attention + virtual Expr apply(Expr query, Expr mask, Ptr state) const override { + bool decoder = state != nullptr; + + if(!trainable) { + const_cast(slopes) = graph()->constant({numHeads, 1, 1}, initSlopes(decoder)); + const_cast(biases) = graph()->constant({numHeads, 1, 1}, initBiases(decoder)); + } else { + registerParameterLazy(slopes, Shape({numHeads, 1, 1}), initSlopes(decoder)); + registerParameterLazy(biases, Shape({numHeads, 1, 1}), initBiases(decoder)); + } + + Expr shift = nullptr; + int start = 0; + + if(state) { + start = (int)state->getPosition(); + auto alibiState = std::dynamic_pointer_cast(state); + shift = alibiState ? 
alibiState->getShift() : nullptr; // [dimBeam, dimBatch, dimQuery, 1] + } + + auto alibiMask = alibiLogMask(mask, query, slopes, biases, shift, numHeads, start); + return alibiMask; + } +}; + +} // namespace nn +} // namespace marian \ No newline at end of file diff --git a/src/layers_new/attention.cpp b/src/layers_new/attention.cpp new file mode 100644 index 000000000..c3758296e --- /dev/null +++ b/src/layers_new/attention.cpp @@ -0,0 +1,105 @@ +#include "graph/node_operators_unary.h" +#include "layers_new/attention.h" +#include "layers_new/alibi.h" + +namespace marian { +namespace nn { + +// Factory function to create attention layers from options +Ptr attentionFromOptions(Ptr graph, Ptr options) { + // @TODO: currently this does nothing as it isn't set anywhere + std::string selfAttentionType = options->get("transformer-encoder-attention", "default"); // currently only default + + // in the future we might add SingleHead or Additive or LSH-based as in Reformer + if(selfAttentionType == "default") { + int numHeads = options->get("transformer-heads"); + int modelDim = options->get("transformer-dim-model", options->get("dim-emb")); + + float attentionDropoutProbability = options->get("transformer-dropout-attention", 0.f); + + return New>(graph, numHeads, modelDim, modelDim, attentionDropoutProbability); + } + else { + ABORT("Unknown transformer encoder attention type: {}", selfAttentionType); + } +} + +// Factory function to create attention mask processors from options +Ptr attentionMaskProcessorFromOptions(Ptr graph, Ptr options) { + // currently only default or alibi + std::string processorType = options->get("transformer-attention-mask", "default"); + if(processorType == "default") { + return New(graph, options); + } else if(processorType == "alibi") { + return New(graph, options); + } else { + ABORT("Unknown transformer attention mask processor type: {}", processorType); + } +} + +} // namespace nn + +// specialized faster operator for log-mask computation +class LogMaskNode : public UnaryNodeOp { +private: + int numHeads_{8}; + + Shape newShape(Expr mask, int numHeads) { + // incoming mask is expected to have shape [dimBatch, 1, 1, dimKeys] + // see the reshape below in the logMask function + int dimBatch = mask->shape()[-4]; + int dimKeys = mask->shape()[-1]; + return { dimBatch, numHeads, 1, dimKeys }; + } + +public: + LogMaskNode(Expr mask, int numHeads) + : UnaryNodeOp(mask, newShape(mask, numHeads)), numHeads_(numHeads) + {} + + NodeOps forwardOps() override { + float lowest = NumericLimits(value_type()).lowest; + float maskFactor = std::max(lowest / 2.f, -99999999.f); // to make sure we do not overflow for fp16 + + using namespace functional; + // compared to the multi-operation code this does conversion and broadcasting in one step + return { NodeOp(Element(_1 = (1.f - _2) * maskFactor, val_, child(0)->val())) }; + } + + NodeOps backwardOps() override { + float lowest = NumericLimits(value_type()).lowest; + float maskFactor = std::max(lowest / 2.f, -99999999.f); // to make sure we do not overflow for fp16 + using namespace functional; + return { NodeOp(Add(-maskFactor * _1, child(0)->grad(), adj_)) }; + } + + virtual size_t hash() override { + size_t seed = NaryNodeOp::hash(); + util::hash_combine(seed, numHeads_); + return seed; + } + + virtual bool equal(Expr node) override { + if(!NaryNodeOp::equal(node)) + return false; + auto cnode = std::dynamic_pointer_cast(node); + if(!cnode) + return false; + if(numHeads_ != cnode->numHeads_) + return false; + return true; + } + + 
const std::string type() override { return "log-mask"; } +}; + +Expr logMask(Expr mask, int numHeads) { + // incoming mask has shape [1, dimBatch, dimKeys, 1] + int dimBatch = mask->shape()[-3]; + int dimKeys = mask->shape()[-2]; + mask = reshape(mask, {dimBatch, 1, 1, dimKeys}); + auto logMask = Expression(mask, numHeads); // [dimBatch, numHeads, 1, dimKeys] + return reshape(logMask, {1, dimBatch * numHeads, 1, dimKeys}); +} + +} // namespace marian diff --git a/src/layers_new/attention.h b/src/layers_new/attention.h index 4f4838e48..6ddfaad2a 100644 --- a/src/layers_new/attention.h +++ b/src/layers_new/attention.h @@ -1,9 +1,14 @@ #pragma once #include "graph/cached_expression.h" +#include "layers_new/decoder.h" #include "layers_new/neuralnet.h" namespace marian { + +// specialized operator for faster logMask computation +Expr logMask(Expr mask, int numHeads); + namespace nn { // Abstract base class for attention mechanisms @@ -41,15 +46,15 @@ class MultiplicativeAttention : public AttentionLayer { // multiplicative attention with flattened softmax float scale = 1.0f / std::sqrt((float)dimKeys); // scaling to avoid extreme values due to matrix multiplication - // query, keys and values: [beam depth * batch size, num heads, length, head dim] - auto z = bdot(query, keys, false, true, scale); // [beam depth, batch size * num heads, max tgt length, max src length] + // query, keys and values: [dimBeam, dimBatch * numHeads, (dimQuery|dimKeys=dimValues), dimHead] + auto z = bdot(query, keys, false, true, scale); // [dimBeam, dimBatch * numHeads, dimQuery, dimKeys] // mask out garbage beyond end of sequences if(logMask) z = z + logMask; // take softmax along src sequence axis (-1) - auto weights = softmax(z); // [beam depth, batch size * num heads, max tgt length, max src length] + auto weights = softmax(z); // [dimBeam, dimBatch * numHeads, dimQuery, dimKeys] #if 0 // @TODO: make this work again if(saveAttentionWeights) @@ -60,13 +65,14 @@ class MultiplicativeAttention : public AttentionLayer { weights = attentionDropout->apply(weights); // apply attention weights to values - // weights: [beam depth, batch size * num heads, max tgt length, max src length] - // values: [beam depth, batch size * num heads, src length, head dim] - auto output = bdot(weights, values); // [beam depth, batch size * num heads, max tgt length, split vector dim] + // weights: [dimBeam, dimBatch * numHeads, dimQuery, dimKeys] + // values: [dimBeam, dimBatch * numHeads, dimKeys, dimHead] + auto output = bdot(weights, values); // [dimBeam, dimBatch * numHeads, dimQuery, dimHead] return output; } }; +// Base class for multi-head attention template // Currently only used for MultiplicativeAttention class MultiHeadAttention : public AttentionType { protected: @@ -110,7 +116,7 @@ class MultiHeadAttention : public AttentionType { virtual ~MultiHeadAttention() = default; -private: +protected: // join beam and batch dimension and split model dimension in to heads and head dimension. We also need to transpose to // be able to do an efficient batched matmul. 
Expr splitHeads(Expr input) const { @@ -141,6 +147,7 @@ class MultiHeadAttention : public AttentionType { } public: + // Apply the multi-head attention to the given query, keys and values virtual Expr apply(Expr query, Expr keys, Expr values, Expr mask) const override { auto qh = splitHeads(qProj->apply(query)); @@ -156,7 +163,7 @@ class MultiHeadAttention : public AttentionType { return splitHeads(vProj->apply(values)); }, equal); - auto output = AttentionType::apply(qh, kh, vh, mask); + auto output = AttentionType::apply(qh, kh, vh, mask); output = joinHeads(output); output = oProj->apply(output); @@ -171,23 +178,51 @@ class MultiHeadAttention : public AttentionType { } }; -static Ptr attentionFromOptions(Ptr graph, Ptr options) { - // @TODO: currently this does nothing as it isn't set anywhere - std::string selfAttentionType = options->get("transformer-encoder-attention", "default"); // currently only default - - // in the future we might add SingleHead or Additive or LSH-based as in Reformer - if(selfAttentionType == "default") { - int numHeads = options->get("transformer-heads"); - int modelDim = options->get("transformer-dim-model", options->get("dim-emb")); +// Base class for attention mask processors +// Attention mask processors are used to process a given attention mask before it is used in an attention computation. +struct AttentionMaskProcessor : public LayerWithOptions, public IBinaryLayer, public IBinaryDecoderLayer { + int numHeads{1}; - float attentionDropoutProbability = options->get("transformer-dropout-attention", 0.f); + AttentionMaskProcessor(Ptr graph, + Ptr options) + : LayerWithOptions(graph, options), + numHeads(opt("transformer-heads", 1)) {} - return New>(graph, numHeads, modelDim, modelDim, attentionDropoutProbability); + virtual ~AttentionMaskProcessor() = default; + + virtual Expr apply(Expr /*query*/, Expr mask) const override { + if(!mask) + return nullptr; + + // @TODO eventually remove this branch. 
For now we keep it for documentation purposes +#if 0 + // LayerAttention expects mask in a different layout + int dimBatch = mask->shape()[-3]; + int dimKeys = mask->shape()[-2]; + + mask = reshape(mask, {dimBatch, 1, 1, dimKeys}); // [batch size, num heads broadcast=1, max length broadcast=1, max length] + + float maskFactor = std::max(NumericLimits(mask->value_type()).lowest / 2.f, -99999999.f); // to make sure we do not overflow for fp16 + auto logMask = (1 - mask) * maskFactor; + logMask = reshape(repeat(logMask, numHeads, -3), {1, dimBatch * numHeads, 1, dimKeys}); + return logMask; +#else + // shape of mask should be [1, dimBatch, dimKeys, 1] + // this does all the above work in one step + return marian::logMask(mask, numHeads); // [1, dimBatch * numHeads, 1, dimKeys] +#endif } - else { - ABORT("Unknown transformer encoder attention type: {}", selfAttentionType); + + virtual Expr apply(Expr query, Expr mask, Ptr /*state*/) const override { + return apply(query, mask); } -} +}; + +// Factory function to create attention layers from options +Ptr attentionFromOptions(Ptr graph, Ptr options); + +// Factory function to create attention mask processors from options +Ptr attentionMaskProcessorFromOptions(Ptr graph, Ptr options); } // namespace nn } // namespace marian diff --git a/src/layers_new/interface.h b/src/layers_new/interface.h index d8317d610..a938803ee 100644 --- a/src/layers_new/interface.h +++ b/src/layers_new/interface.h @@ -4,6 +4,7 @@ #include "graph/expression_graph.h" #include "graph/expression_operators.h" #include "graph/node_initializers.h" +#include "layers/loss.h" #include @@ -118,6 +119,8 @@ class Layer : public IClassName, public IClearable, public std::enable_shared_fr std::vector namedParameters_; // vector of all named parameters belonging to this specific layer (not recurisve) std::vector> namedLayers_; // vector of all named sublayers for this specific layer (not recursive) + mutable std::vector auxiliaryLosses_; + // Create a layer parameter with a full name composed of the path to this layer and localName Expr param(const std::string& localName, const Shape& shape, const Ptr& init) { std::string fullName = fmt::format("{}->{}", path(), localName); @@ -255,6 +258,7 @@ class Layer : public IClassName, public IClearable, public std::enable_shared_fr return marian::utils::join(path, "->"); } + // Return a string with information about this layer and its sub-layers if includeChildren is true. std::string layerInfo(bool includeChildren=false) const { std::stringstream ss; std::function recurse; @@ -301,11 +305,32 @@ class Layer : public IClassName, public IClearable, public std::enable_shared_fr } virtual void clear() override { + auxiliaryLosses_.clear(); for(auto& lr : namedLayers()) lr.second->clear(); } + + void addAuxiliaryLoss(const RationalLoss& loss) const { + auxiliaryLosses_.push_back(loss); + } + + // Return all auxiliary losses for this layer and its sub-layers (descending recursively into sub-layers). + std::vector getAuxiliaryLosses(bool recurse = false) const { + if(recurse) { + std::vector losses; + for(auto layer : allLayers()) + for(auto loss : layer->getAuxiliaryLosses(/*recurse=*/false)) + losses.push_back(loss); + for(auto loss : auxiliaryLosses_) + losses.push_back(loss); + return losses; + } else { + return auxiliaryLosses_; + } + } }; +// Layer that holds a reference to a set of options. 
This is used to allow layers to access options class LayerWithOptions : public Layer { protected: Ptr options_; diff --git a/src/layers_new/neuralnet.h b/src/layers_new/neuralnet.h index b81728c77..923838aa0 100644 --- a/src/layers_new/neuralnet.h +++ b/src/layers_new/neuralnet.h @@ -8,23 +8,6 @@ namespace nn { static inline Expr swapTimeBatch(Expr input) { return swapAxes(atleast_4d(input), -2, -3); } - // @TODO: this is an odd function to be here, this should rather be handled somewhere globally? - // convert multiplicative 1/0 mask to additive 0/-inf log mask, and transpose to match result of bdot() op in Attention() -static inline Expr transposedLogMask(Expr mask, int dimHeads) { - if(!mask) - return nullptr; - - // LayerAttention expects mask in a different layout - int dimBatch = mask->shape()[-3]; - int dimSrcWords = mask->shape()[-2]; - mask = reshape(mask, {dimBatch, 1, 1, dimSrcWords}); // [batch size, num heads broadcast=1, max length broadcast=1, max length] - - float maskFactor = std::max(NumericLimits(mask->value_type()).lowest / 2.f, -99999999.f); // to make sure we do not overflow for fp16 - auto logMask = (1 - mask) * maskFactor; - logMask = reshape(repeat(logMask, dimHeads, -3), {1, dimBatch * dimHeads, 1, dimSrcWords}); - return logMask; -} - /** * A generic Activation function layer. Any unary Marian operator or function accepted by * `std::function` can be turned into an activation function like this: diff --git a/src/layers_new/transformer.h b/src/layers_new/transformer.h index ade61a78e..ccce35d13 100644 --- a/src/layers_new/transformer.h +++ b/src/layers_new/transformer.h @@ -43,6 +43,8 @@ struct TransformerPrePostProcessor final : public Layer, public IBinaryLayer { } } } + + virtual ~TransformerPrePostProcessor() = default; Expr apply(Expr input, Expr previous = nullptr) const override { Expr output = input; @@ -84,7 +86,6 @@ class TransformerSelfAttentionBlock final : public LayerWithOptions, public IBin opt("transformer-dropout", 0.f)); registerLayer(preprocessor); - // @TODO: factory to support different attention flavors? 
selfAttention = attentionFromOptions(graph, options); registerLayer(selfAttention); @@ -95,10 +96,10 @@ class TransformerSelfAttentionBlock final : public LayerWithOptions, public IBin registerLayer(postprocessor); } - Expr apply(Expr input, Expr mask = nullptr) const override { - auto output = preprocessor->apply(input); // optional preprocessing - output = selfAttention->apply(output, output, output, mask); // self attention, @TODO: make this a IBinaryLayer rather than IQuaternaryLayer - output = postprocessor->apply(output, input); // optional postprocessing, optional skip connection + Expr apply(Expr input, Expr logMask = nullptr) const override { + auto output = preprocessor->apply(input); // optional preprocessing + output = selfAttention->apply(output, output, output, logMask); // self attention, @TODO: make this a IBinaryLayer rather than IQuaternaryLayer + output = postprocessor->apply(output, input); // optional postprocessing, optional skip connection return output; } }; @@ -214,6 +215,7 @@ struct TransformerEncoderLayer final : public LayerWithOptions, public IBinaryLa */ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { Ptr positionEmbedding; + Ptr maskProcessor; Ptr preprocessor; Ptr layers; Ptr postprocessor; @@ -222,8 +224,13 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { Ptr options) : LayerWithOptions(graph, options) { - positionEmbedding = positionEmbeddingFromOptions(graph, options, /*positionAxis=*/-2); - registerLayer(positionEmbedding); + if(!opt("transformer-disable-position-embeddings", false)) { + positionEmbedding = positionEmbeddingFromOptions(graph, options, /*positionAxis=*/-2); + registerLayer(positionEmbedding); + } + + maskProcessor = attentionMaskProcessorFromOptions(graph, options); + registerLayer(maskProcessor); preprocessor = New( graph, @@ -271,24 +278,26 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { // decoder state, Frank added information about batchMajor/timeMajor orientation. If we // do that everywhere we can detect inconsistencies automatically. // reorganize batch and timestep - auto output = swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] - if(mask) { - mask = swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] - mask = transposedLogMask(mask, opt("transformer-heads")); - } + auto output = swapTimeBatch(input); // [1, dimBatch, dimSrcWords, dimModel] + if(mask) + mask = swapTimeBatch(mask); // [1, dimBatch, dimSrcWords, 1] // apply positional embeddings to contextual input - output = positionEmbedding->apply(output); + if(positionEmbedding) + output = positionEmbedding->apply(output); + else + output = std::sqrt((float)output->shape()[-1]) * output; // handle for skip connection at top auto prevOutput = output; // apply dropout or layer-norm to embeddings if required output = preprocessor->apply(output); + auto logMask = maskProcessor->apply(output, mask); // traverse the layers, use the same mask for each for(auto layer : *layers) - output = layer->apply(output, mask); + output = layer->apply(output, logMask); // apply final postprocessor if required, e.g. 
final layer-norm for pre-norm or final skip connection output = postprocessor->apply(output, prevOutput); @@ -327,7 +336,7 @@ class TransformerCrossAttentionBlock final : public LayerWithOptions, public ITe opt("transformer-preprocess", ""), opt("transformer-dropout", 0.f)); registerLayer(preprocessor); - + // @TODO: factory to support different attention flavors? crossAttention = attentionFromOptions(graph, options); registerLayer(crossAttention); @@ -339,16 +348,14 @@ class TransformerCrossAttentionBlock final : public LayerWithOptions, public ITe registerLayer(postprocessor); } - Expr apply(Expr input, Expr context, Expr contextMask = nullptr) const override { - auto output = preprocessor->apply(input); // optional preprocessing - output = crossAttention->apply(output, context, context, contextMask); // cross attention, @TODO: make this a ITernaryLayer rather than IQuaternaryLayer - output = postprocessor->apply(output, input); // optional postprocessing, optional skip connection + Expr apply(Expr input, Expr context, Expr logMask) const override { + auto output = preprocessor->apply(input); // optional preprocessing + output = crossAttention->apply(output, context, context, logMask); // cross attention, @TODO: make this a ITernaryLayer rather than IQuaternaryLayer + output = postprocessor->apply(output, input); // optional postprocessing, optional skip connection return output; } }; -#if 1 - class TransformerAutoRegressiveBlock : public LayerWithOptions, public IBinaryDecoderLayer { public: TransformerAutoRegressiveBlock(Ptr graph, @@ -435,9 +442,9 @@ struct TransformerDecoderLayer final : public LayerWithOptions, public IQuaterna registerLayer(filterBlock); } - Expr apply(Expr input, Expr inputMask, Expr context, Expr contextMask, Ptr state) const override { + Expr apply(Expr input, Expr inputMask, Expr context, Expr logMask, Ptr state) const override { Expr output = autoRegressiveBlock->apply(input, inputMask, state); - output = crossAttentionBlock->apply(output, context, contextMask); + output = crossAttentionBlock->apply(output, context, logMask); output = filterBlock->apply(output); checkpoint(output); // A full transformer block is a good point for gradient checkpointing (currently manual) @@ -453,6 +460,7 @@ struct TransformerDecoderLayer final : public LayerWithOptions, public IQuaterna */ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDecoderLayer { Ptr positionEmbedding; + Ptr maskProcessor; Ptr preprocessor; Ptr layers; Ptr postprocessor; @@ -461,8 +469,13 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec Ptr options) : LayerWithOptions(graph, options) { - positionEmbedding = positionEmbeddingFromOptions(graph, options, /*positionAxis=*/-2); - registerLayer(positionEmbedding); + if(!opt("transformer-disable-position-embeddings", false)) { + positionEmbedding = positionEmbeddingFromOptions(graph, options, /*positionAxis=*/-2); + registerLayer(positionEmbedding); + } + + maskProcessor = attentionMaskProcessorFromOptions(graph, options); + registerLayer(maskProcessor); preprocessor = New( graph, @@ -527,22 +540,28 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec // dimensions. This order is more natural for the transformer, but more difficult to handle // during beam search or when using RNNs. Hence the input/output transpositions here. 
Expr output = swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] - context = swapTimeBatch(context); + context = swapTimeBatch(context); // [dimBeam=1, dimBatch, dimSrcWords, dimModel] + + // set current target token position during decoding or training. At training + // this should be 0. During translation the current length of the translation. + // Used for position embeddings and creating new decoder states. + int startPos = (int)state->getPosition(); // @TODO: write function prepareMasks(); // @TODO: create triangle mask here and combine with inputMask LOG_ONCE(info, "Don't forget the triangle mask if required!"); - if(inputMask) { - inputMask = swapTimeBatch(inputMask); // [beam depth=1, batch size, max length, vector dim=1] - } - - if(contextMask) { - contextMask = swapTimeBatch(contextMask); // [beam depth=1, max length, batch size, vector dim=1] - contextMask = transposedLogMask(contextMask, opt("transformer-heads")); // [beam broadcast=1, batch size * num heads, max length broadcast=1, max length] - } - // apply positional embeddings to contextual input @TODO: remove need for conversion to int - output = positionEmbedding->apply(output, (int)state->getPosition()); + if(inputMask) + inputMask = swapTimeBatch(inputMask); // [dimBeam=1, dimBatch, dimTrgWords, dimModel=1] + + if(contextMask) + contextMask = swapTimeBatch(contextMask); // [dimBeam=1, dimBatch, dimSrcWords, dimModel=1] + + // apply positional embeddings to contextual input + if(positionEmbedding) + output = positionEmbedding->apply(output, startPos); + else + output = std::sqrt((float)output->shape()[-1]) * output; // handle for skip connection at top auto prevOutput = output; @@ -552,9 +571,12 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec // get an iterator to per-layer states auto layerStateIt = state->as()->begin(); + auto logMask = maskProcessor->apply(output, contextMask, *layerStateIt); + // traverse the layers, use the same mask for each - for(auto layer : *layers) - output = layer->as()->apply(output, inputMask, context, contextMask, /*in/out=*/*layerStateIt++); + for(auto layer : *layers) { + output = layer->as()->apply(output, inputMask, context, logMask, /*in/out=*/*layerStateIt++); + } // apply final postprocessor if requred, e.g. 
final layer-norm for pre-norm or final skip connection output = postprocessor->apply(output, prevOutput); @@ -570,7 +592,6 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec return output; } }; -#endif } // namespace nn } // namespace marian diff --git a/src/models/bleurt.h b/src/models/bleurt.h index 131b675a7..baeb704a5 100644 --- a/src/models/bleurt.h +++ b/src/models/bleurt.h @@ -68,10 +68,7 @@ struct BleurtEncoder final : public nn::TransformerEncoder { Expr apply(Expr input, Expr mask) const override { auto output = marian::nn::swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] - mask = marian::nn::swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] - auto binMask = mask; - mask = marian::nn::transposedLogMask(mask, opt("transformer-heads")); // apply positional embeddings to contextual input output = positionEmbedding->apply(output); diff --git a/src/models/comet_qe.h b/src/models/comet_qe.h index d0305002b..868f7d6e9 100644 --- a/src/models/comet_qe.h +++ b/src/models/comet_qe.h @@ -8,7 +8,41 @@ namespace marian { namespace models { -struct CometEncoder final : public nn::TransformerEncoder { +class CometEncoder final : public nn::TransformerEncoder { +private: + // This seems to be a mix of LayerNorm and BatchNorm and present in the original Unbabel code. + // It norms over time, not batch, also should be optimized. Seems safe to disable for custom + // models trained by us, but required when doing inference with Unbabel models. + Expr cometNorm(Expr x, Expr binaryMask) const { + Expr output; + if(opt("comet-mix-norm", false)) { + registerParameterLazy(gamma, Shape({ 1 }), inits::ones()); + int dimModel = x->shape()[-1]; + + // Convert type to fp32 for better accumulation. This is a no-op if things are already fp32. + Type origType = x->value_type(); + x = marian::cast(x, Type::float32); + binaryMask = marian::cast(binaryMask, Type::float32); + + x = x * binaryMask; + auto denom = (float)dimModel * sum(binaryMask, -2); + auto mu = sum(sum(x, -1), -2) / denom; // sum over model and time + auto sigma = sum(sum(square(x - mu), -1), -2) / denom; + + auto normed = (x - mu) / sqrt(sigma + 1e-12f); + output = marian::cast(gamma, Type::float32) * sum(normed * binaryMask, -2) / sum(binaryMask, -2); + + // Undo conversion to fp32 if not originally fp32 (most likely fp16 then) + output = marian::cast(output, origType); + } else { + // average over time dimension + output = sum(x * binaryMask, -2) / sum(binaryMask, -2); + } + + return output; + }; + +public: Expr weights; Expr gamma; @@ -19,57 +53,24 @@ struct CometEncoder final : public nn::TransformerEncoder { Expr apply(Expr input, Expr mask) const override { auto output = marian::nn::swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] - mask = marian::nn::swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] - auto binMask = mask; - mask = marian::nn::transposedLogMask(mask, opt("transformer-heads")); - + auto binaryMask = marian::nn::swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] + // apply positional embeddings to contextual input output = positionEmbedding->apply(output); // apply dropout or layer-norm to embeddings if required output = preprocessor->apply(output); - - // This seems to be a mix of LayerNorm and BatchNorm and present in the original Unbabel code. - // It norms over time, not batch, also should be optimized. 
Seems safe to disable for custom - // models trained by us, but required when doing inference with Unbabel models. - auto cometNorm = [&, this](Expr x, Expr binMask) { - Expr output; - if(opt("comet-mix-norm", false)) { - registerParameterLazy(gamma, Shape({ 1 }), inits::ones()); - int dimModel = x->shape()[-1]; - - // Convert type to fp32 for better accumulation. This is a no-op if things are already fp32. - Type origType = x->value_type(); - x = marian::cast(x, Type::float32); - binMask = marian::cast(binMask, Type::float32); - - x = x * binMask; - auto denom = (float)dimModel * sum(binMask, -2); - auto mu = sum(sum(x, -1), -2) / denom; // sum over model and time - auto sigma = sum(sum(square(x - mu), -1), -2) / denom; - - auto normed = (x - mu) / sqrt(sigma + 1e-12f); - output = marian::cast(gamma, Type::float32) * sum(normed * binMask, -2) / sum(binMask, -2); - - // Undo conversion to fp32 if not originally fp32 (most likely fp16 then) - output = marian::cast(output, origType); - } else { - // average over time dimension - output = sum(x * binMask, -2) / sum(binMask, -2); - } - - return output; - }; + auto logMask = maskProcessor->apply(output, binaryMask); // [beam depth=1, batch size * numHeads, max length, vector dim=1] std::vector pooler; if(opt("comet-mix", false)) - pooler.push_back(cometNorm(output, binMask)); + pooler.push_back(cometNorm(output, binaryMask)); // traverse the layers, use the same mask for each for(auto layer : *layers) { - output = layer->apply(output, mask); + output = layer->apply(output, logMask); if(opt("comet-mix", false)) - pooler.push_back(cometNorm(output, binMask)); // [ batch, time, modelDim ] + pooler.push_back(cometNorm(output, binaryMask)); // [ batch, time, modelDim ] } if(opt("comet-mix", false)) { @@ -78,7 +79,7 @@ struct CometEncoder final : public nn::TransformerEncoder { output = sum(weightsNorm * concatenate(pooler, /*axis=*/-2), -2); // [batch, 1, modelDim] } else { // just use last layer, average over time dim - output = cometNorm(output, binMask); // [batch, 1, modelDim] + output = cometNorm(output, binaryMask); // [batch, 1, modelDim] } return output; diff --git a/src/models/decoder.h b/src/models/decoder.h index 5ddaa9643..1646c44b2 100644 --- a/src/models/decoder.h +++ b/src/models/decoder.h @@ -70,6 +70,7 @@ class DecoderBase : public EncoderDecoderLayerBase { else selectedEmbs = embeddingLayer->apply(words, {dimBeam, 1, dimBatch, dimEmb}); state->setTargetHistoryEmbeddings(selectedEmbs); + state->setTargetWords(words); } virtual const std::vector getAlignments(int /*i*/ = 0) { return {}; }; // [tgt index][beam depth, max src length, batch size, 1] diff --git a/src/models/encoder_decoder.cpp b/src/models/encoder_decoder.cpp index f70353a64..971726271 100644 --- a/src/models/encoder_decoder.cpp +++ b/src/models/encoder_decoder.cpp @@ -3,6 +3,8 @@ #include "common/filesystem.h" #include "common/version.h" +#include "models/transformer_new.h" + namespace marian { EncoderDecoder::EncoderDecoder(Ptr graph, Ptr options) @@ -71,6 +73,12 @@ EncoderDecoder::EncoderDecoder(Ptr graph, Ptr options) modelFeatures_.insert("transformer-no-bias"); modelFeatures_.insert("transformer-no-affine"); + + modelFeatures_.insert("transformer-disable-position-embeddings"); + modelFeatures_.insert("transformer-attention-mask"); + modelFeatures_.insert("transformer-alibi-shift"); + modelFeatures_.insert("transformer-alibi-trainable"); + modelFeatures_.insert("separator-symbol"); } std::vector>& EncoderDecoder::getEncoders() { @@ -183,10 +191,22 @@ void 
EncoderDecoder::save(Ptr graph, void EncoderDecoder::clear(Ptr graph) { graph->clear(); - for(auto& enc : encoders_) + for(auto& enc : encoders_) { enc->clear(); - for(auto& dec : decoders_) + // this cast looks redundant, but TransformerBatchEncoder has two base clases with clear() + // so we need to cast here and call explicitly. Should be removed once we switch to the new + // layer framework everywhere. + auto encNew = std::dynamic_pointer_cast(enc); + if(encNew) + encNew->clear(); + } + for(auto& dec : decoders_) { dec->clear(); + // Same as above, but TransformerBatchDecoder + auto decNew = std::dynamic_pointer_cast(dec); + if(decNew) + decNew->clear(); + } } Ptr EncoderDecoder::startState(Ptr graph, @@ -210,11 +230,12 @@ Ptr EncoderDecoder::step(Ptr graph, const Words& words, // [beamIndex * activeBatchSize + batchIndex] const std::vector& batchIndices, // [batchIndex] int beamSize) { + // create updated state that reflects reordering and dropping of hypotheses - state = hypIndices.empty() ? state : state->select(hypIndices, batchIndices, beamSize); + state = hypIndices.empty() ? state : state->select(hypIndices, words, batchIndices, beamSize); // Fill state with embeddings based on last prediction - decoders_[0]->embeddingsFromPrediction(graph, state, words, (int) batchIndices.size(), beamSize); + decoders_[0]->embeddingsFromPrediction(graph, state, words, (int)batchIndices.size(), beamSize); auto nextState = decoders_[0]->step(graph, state); return nextState; diff --git a/src/models/encoder_decoder.h b/src/models/encoder_decoder.h index 4ccc6a93f..ef810ed8b 100644 --- a/src/models/encoder_decoder.h +++ b/src/models/encoder_decoder.h @@ -62,7 +62,7 @@ class IEncoderDecoder : public models::IModel { virtual Ptr getShortlist() = 0; - virtual data::SoftAlignment getAlignment() = 0; + virtual data::SoftAlignment getAlignment() = 0; }; class EncoderDecoder : public IEncoderDecoder, public LayerBase { diff --git a/src/models/states.h b/src/models/states.h index a4be3795e..ec5c3aed8 100644 --- a/src/models/states.h +++ b/src/models/states.h @@ -56,6 +56,24 @@ class DecoderState { Ptr batch, bool isBatchMajor = false) : states_(states), logProbs_(logProbs), encStates_(encStates), batch_(batch), isBatchMajor_(isBatchMajor) {} + + // override to create derived decoder states + virtual Ptr Create(const rnn::States& states, + Logits logProbs, + const std::vector>& encStates, + Ptr batch, + bool isBatchMajor = false) const { + return New(states, logProbs, encStates, batch, isBatchMajor); + } + + // override to create derived decoder states + virtual Ptr next(const rnn::States& states, + Logits logProbs) const { + auto state = Create(states, logProbs, encStates_, batch_, isBatchMajor_); + state->setPosition(getPosition() + 1); + return state; + } + virtual ~DecoderState() {} // @TODO: Do we need all these to be virtual? @@ -68,6 +86,7 @@ class DecoderState { // @TODO: should this be a constructor? Then derived classes can call this without the New<> in the loop virtual Ptr select( const std::vector& hypIndices, // [beamIndex * activeBatchSize + batchIndex] + const Words& /*words*/, const std::vector& batchIndices, // [batchIndex] int beamSize) const { std::vector> newEncStates; @@ -77,11 +96,11 @@ class DecoderState { newEncStates.push_back(es->getContext()->shape()[-2] == batchIndices.size() ? 
es : es->select(batchIndices)); // hypindices matches batchIndices in terms of batch dimension, so we only need hypIndices - auto selectedState = New(states_.select(hypIndices, beamSize, /*isBatchMajor=*/isBatchMajor_), - logProbs_, - newEncStates, - batch_, - isBatchMajor_); + auto selectedState = Create(states_.select(hypIndices, beamSize, /*isBatchMajor=*/isBatchMajor_), + logProbs_, + newEncStates, + batch_, + isBatchMajor_); // Set positon of new state based on the target token position of current state selectedState->setPosition(getPosition()); @@ -97,7 +116,7 @@ class DecoderState { virtual const Words& getTargetWords() const { return targetWords_; }; virtual void setTargetWords(const Words& targetWords) { targetWords_ = targetWords; } - + virtual Expr getTargetMask() const { return targetMask_; }; virtual void setTargetMask(Expr targetMask) { targetMask_ = targetMask; } diff --git a/src/models/transformer.h b/src/models/transformer.h index 0fa52ff82..1befc726f 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -118,6 +118,9 @@ class Transformer : public EncoderOrDecoderBase { } virtual Expr addSpecialEmbeddings(Expr input, int start = 0, Ptr /*batch*/ = nullptr) const { + if(opt("transformer-disable-position-embeddings", false)) + return input; + bool trainPosEmbeddings = opt("transformer-train-positions", false); return addPositionalEmbeddings(input, start, trainPosEmbeddings); } diff --git a/src/models/transformer_new.h b/src/models/transformer_new.h index cfc3a6b14..61de01db2 100644 --- a/src/models/transformer_new.h +++ b/src/models/transformer_new.h @@ -1,10 +1,12 @@ #pragma once #include "layers_new/transformer.h" +#include "layers_new/alibi.h" #include "models/encoder.h" #include "models/decoder.h" #include "models/states.h" +#include "models/model_base.h" #include "layers/constructors.h" namespace marian { @@ -129,11 +131,11 @@ class TransformerBatchDecoder : public nn::LayerWithOptions, rnn::States startStates(DecoderBase::opt("dec-depth"), {start, start}); // don't use TransformerState for RNN layers - return New(startStates, Logits(), encStates, batch, /*isBatchMajor=*/false); + return NewDecoderState(DecoderBase::options_, startStates, Logits(), encStates, batch, /*isBatchMajor=*/false); } else { rnn::States startStates; - return New(startStates, Logits(), encStates, batch, /*isBatchMajor=*/true); + return NewDecoderState(DecoderBase::options_, startStates, Logits(), encStates, batch, /*isBatchMajor=*/true); } } @@ -148,6 +150,8 @@ class TransformerBatchDecoder : public nn::LayerWithOptions, } Ptr step(Ptr state) { + using db = DecoderBase; + auto embeddings = state->getTargetHistoryEmbeddings(); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vector dim] auto decoderMask = state->getTargetMask(); // [max length, batch size, 1] --this is a hypothesis @@ -155,13 +159,11 @@ class TransformerBatchDecoder : public nn::LayerWithOptions, auto encoderContext = state->getEncoderStates()[0]->getContext(); // encoder output auto encoderMask = state->getEncoderStates()[0]->getMask(); // note: may differ from Encoder self-attention mask in that additional positions are banned for cross-attention - - // Convert old style decoder state to new decoder state - size_t position = state->getPosition(); - auto nnState = New(position); - for(auto& layerState : state->getStates()) - nnState->as()->append(New(layerState.cell, position)); + // Convert old style decoder state to new decoder state + using namespace models; + usage modelUsage = 
(usage)db::opt("usage", (int)usage::translation); + auto nnState = convertDecoderState(state, graph(), /*decoding=*/modelUsage == usage::translation); auto decoderContext = decoder->apply(embeddings, decoderMask, encoderContext, encoderMask, nnState); // final feed-forward layer (output) @@ -177,10 +179,7 @@ class TransformerBatchDecoder : public nn::LayerWithOptions, decoderStates.push_back(rnn::State({ cellState, cellState })); } // return unnormalized(!) probabilities - auto nextState = New(decoderStates, logits, state->getEncoderStates(), state->getBatch(), state->isBatchMajor()); - nextState->setPosition(state->getPosition() + 1); - - return nextState; + return state->next(decoderStates, logits); } // helper function for guided alignment diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index 6a075e9c5..9d5c8166d 100755 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -55,6 +55,8 @@ void CopyCastFrom(Tensor out, const T* in, int length) { CopyCastTo(out->data(), in, length); } else if(out->type() == Type::float16) { CopyCastTo(out->data(), in, length); + } else if(out->type() == Type::uint32) { + CopyCastTo(out->data(), in, length); } else { ABORT("CopyCastTo to type {} not implemented", out->type()); } diff --git a/src/tensors/gpu/add.inc b/src/tensors/gpu/add.inc index ed1e72553..c6953c144 100755 --- a/src/tensors/gpu/add.inc +++ b/src/tensors/gpu/add.inc @@ -41,3 +41,6 @@ template void marian::gpu::Add, marian::functional::UnaryFunctor > > >, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, float, marian::Tensor, marian::Tensor, marian::Tensor); template void marian::gpu::Add >, marian::functional::Capture>, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> > > >, marian::functional::Assignee<2> >, IntrusivePtr, IntrusivePtr >(marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> > > >, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); template void marian::gpu::Add, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, IntrusivePtr, IntrusivePtr >(marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::gpu::Add >, marian::functional::Assignee<2> >, IntrusivePtr, IntrusivePtr >(marian::functional::BinaryFunctor >, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::gpu::Add > >, marian::functional::Assignee<2> >, IntrusivePtr, IntrusivePtr >(marian::functional::BinaryFunctor > >, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::gpu::Add >, marian::functional::BinaryFunctor > > >, marian::functional::Assignee<2> >, 
marian::functional::BinaryFunctor, marian::functional::Assignee<4> > >, IntrusivePtr, IntrusivePtr, IntrusivePtr, IntrusivePtr >(marian::functional::BinaryFunctor >, marian::functional::BinaryFunctor > > >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<4> > >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr, IntrusivePtr, IntrusivePtr); \ No newline at end of file diff --git a/src/tensors/gpu/add_all.cu b/src/tensors/gpu/add_all.cu index bc78709a3..3f14153df 100644 --- a/src/tensors/gpu/add_all.cu +++ b/src/tensors/gpu/add_all.cu @@ -111,6 +111,21 @@ void AggregateAll(Ptr allocator, AggregateAllVar(allocator, functor, aggInit, aggFunctor, scale, out, in1, in2, in3); } +template +void AggregateAll(Ptr allocator, + Functor functor, + AccType aggInit, + AggFunctor aggFunctor, + AccType scale, + marian::Tensor out, + const marian::Tensor in1, + const marian::Tensor in2, + const marian::Tensor in3, + const marian::Tensor in4) { + AggregateAllVar(allocator, functor, aggInit, aggFunctor, scale, out, in1, in2, in3, in4); +} + + #include "tensors/gpu/add_all.inc" } \ No newline at end of file diff --git a/src/tensors/gpu/add_all.h b/src/tensors/gpu/add_all.h index 2e37fd497..ce8d9df20 100644 --- a/src/tensors/gpu/add_all.h +++ b/src/tensors/gpu/add_all.h @@ -13,7 +13,7 @@ namespace marian { // These function declarations are repeated as template specialization with variadic template arguments does not seem to work. -// Here I am just creating version for 1, 2, and 3 arguments. To be extended if required. +// Here I am just creating version for 1, 2, 3 and 4 arguments. To be extended if required. template void AggregateAll(Ptr allocator, Functor functor, @@ -44,6 +44,18 @@ void AggregateAll(Ptr allocator, const Tensor in2, const Tensor in3); +template +void AggregateAll(Ptr allocator, + Functor functor, + AccType aggInit, + AggFunctor aggFunctor, + AccType scale, + Tensor out, + const Tensor in1, + const Tensor in2, + const Tensor in3, + const Tensor in4); + // Aggregates all values into a single tensor and returns the value of that tensor as a float // This does a GPU to CPU memory copy via TensorBase::scalar(). 
// Used currently only for L2Norm computation diff --git a/src/tensors/gpu/add_all.inc b/src/tensors/gpu/add_all.inc index 41da1351b..8987268ba 100644 --- a/src/tensors/gpu/add_all.inc +++ b/src/tensors/gpu/add_all.inc @@ -43,6 +43,9 @@ template void marian::AggregateAll, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr); template void marian::AggregateAll >, marian::functional::Capture>, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> > > >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> > > >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); template void marian::AggregateAll, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::AggregateAll > >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor > >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::AggregateAll >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::AggregateAll >, marian::functional::BinaryFunctor > > >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<4> > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor >, marian::functional::BinaryFunctor > > >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<4> > >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr, IntrusivePtr, IntrusivePtr); #if COMPILE_FP16 template void AggregateAll<__half, float, BinaryFunctor>, Assignee<2>>, BinaryFunctor, Assignee<2>>>(std::shared_ptr, BinaryFunctor>, 
Assignee<2>>, float, BinaryFunctor, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); @@ -87,4 +90,7 @@ template void marian::AggregateAll<__half, float, marian::functional::BinaryFunc template void marian::AggregateAll<__half, float, marian::functional::Assignee<1>, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr); template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> > > >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::Capture>, marian::functional::BinaryFunctor, marian::functional::Capture> > > >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > >, marian::functional::BinaryFunctor, marian::functional::BinaryFunctor >, marian::functional::UnaryFunctor, marian::functional::Assignee<1> > >, marian::functional::Capture> > > >, marian::functional::UnaryFunctor > >, marian::functional::Capture> >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor > >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor > >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor >, marian::functional::Assignee<2> >, float, marian::functional::BinaryFunctor, marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor >, marian::functional::BinaryFunctor > > >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<4> > >, marian::functional::BinaryFunctor, marian::functional::Assignee<2> > >(std::shared_ptr, marian::functional::BinaryFunctor >, marian::functional::BinaryFunctor > > >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor, marian::functional::Assignee<4> > >, float, marian::functional::BinaryFunctor, 
marian::functional::Assignee<2> >, float, IntrusivePtr, IntrusivePtr, IntrusivePtr, IntrusivePtr, IntrusivePtr); #endif diff --git a/src/tensors/gpu/element.inc b/src/tensors/gpu/element.inc index 27cc641da..da957a9ce 100755 --- a/src/tensors/gpu/element.inc +++ b/src/tensors/gpu/element.inc @@ -76,6 +76,10 @@ template void marian::gpu::Element, marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::Capture> >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor >, marian::functional::Capture>, marian::functional::Capture> >, IntrusivePtr, IntrusivePtr); template void marian::gpu::Element, marian::functional::BinaryFunctor >, marian::functional::Capture> >, marian::functional::BinaryFunctor >, marian::functional::Capture> >, marian::functional::Capture> > >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor >, marian::functional::Capture> >, marian::functional::BinaryFunctor >, marian::functional::Capture> >, marian::functional::Capture> > >, IntrusivePtr, IntrusivePtr); template void marian::gpu::Element, marian::functional::BinaryFunctor >, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > > > >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor >, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > > > >, IntrusivePtr, IntrusivePtr); +template void marian::gpu::Element, marian::functional::UnaryFunctor > >>(marian::functional::Assign, marian::functional::UnaryFunctor > >, IntrusivePtr); +template void marian::gpu::Element, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, IntrusivePtr, IntrusivePtr); +template void marian::gpu::Element, marian::functional::BinaryFunctor >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor > >, marian::functional::Assignee<3> > > >, IntrusivePtr, IntrusivePtr, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor >, marian::functional::Assignee<2> >, marian::functional::BinaryFunctor > >, marian::functional::Assignee<3> > > >, IntrusivePtr, IntrusivePtr, IntrusivePtr, IntrusivePtr); +template void marian::gpu::Element, marian::functional::BinaryFunctor >, marian::functional::Capture> >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor >, marian::functional::Capture> >, IntrusivePtr, IntrusivePtr); // How to add new specializations: // When you use a new specialization, it will cause a link error of this form (example): @@ -84,6 +88,3 @@ template void marian::gpu::Element' with 'marian::Tensor' - -template void marian::gpu::Element, marian::functional::UnaryFunctor > >>(marian::functional::Assign, marian::functional::UnaryFunctor > >, IntrusivePtr); -template void marian::gpu::Element, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, IntrusivePtr >(marian::functional::Assign, marian::functional::BinaryFunctor, marian::functional::UnaryFunctor > > >, IntrusivePtr, IntrusivePtr); diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index 6dbded2a4..4662ab041 100644 --- a/src/tensors/gpu/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -3217,8 +3217,7 @@ __global__ void gHighwayBackward(T* out1, T sigma = functional::Ops::sigmoid(t[index]); out1[index] = sigma * adj[index]; out2[index] = ((T)1.f - sigma) * adj[index]; - 
outt[index] - = sigma * ((T)1.f - sigma) * (in1[index] - in2[index]) * adj[index]; + outt[index] = sigma * ((T)1.f - sigma) * (in1[index] - in2[index]) * adj[index]; } } } diff --git a/src/translator/beam_search.cpp b/src/translator/beam_search.cpp index 901eddc5c..1eeef913a 100644 --- a/src/translator/beam_search.cpp +++ b/src/translator/beam_search.cpp @@ -358,7 +358,7 @@ Histories BeamSearch::search(Ptr graph, Ptr bool anyCanExpand = false; // stays false if all hyps are invalid factor expansions if(t == 0 && factorGroup == 0) { // no scores yet - prevPathScores = graph->constant({1, 1, 1, 1}, inits::fromValue(0)); + prevPathScores = graph->constant({1, 1, 1, 1}, inits::fromValue(0), Type::float32); anyCanExpand = true; // at the beginning all batch entries are used @@ -407,7 +407,7 @@ Histories BeamSearch::search(Ptr graph, Ptr } if(factorGroup == 0) currentDimBatch = (IndexType) batchIndices.size(); // keep batch size constant for all factor groups in a time step - prevPathScores = graph->constant({(int)maxBeamSize, 1, (int)currentDimBatch, 1}, inits::fromVector(prevScores)); + prevPathScores = graph->constant({(int)maxBeamSize, 1, (int)currentDimBatch, 1}, inits::fromVector(prevScores), Type::float32); } if (!anyCanExpand) // all words cannot expand this factor: skip continue; @@ -462,7 +462,7 @@ Histories BeamSearch::search(Ptr graph, Ptr } // make beams continuous - auto expandedPathScores = prevPathScores + stepScores; // will become [maxBeamSize, 1, currDimBatch, dimVocab] + auto expandedPathScores = prevPathScores + cast(stepScores, Type::float32); // will become [maxBeamSize, 1, currDimBatch, dimVocab] expandedPathScores = swapAxes(expandedPathScores, 0, 2); // -> [currentDimBatch, 1, maxBeamSize, dimVocab] // perform NN computation diff --git a/src/translator/translator.h b/src/translator/translator.h index f0fc0b908..f1fd04d3f 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -49,7 +49,7 @@ class Translate : public ModelTask { options_->set("inference", true, "shuffle", "none"); - corpus_ = New(options_, true); + corpus_ = New(options_, /*translate=*/true); auto vocabs = options_->get>("vocabs"); trgVocab_ = New(options_, vocabs.size() - 1); From b61755b656899edd21ddbf9a5b416086c3a5a82a Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 4 Dec 2023 18:15:53 +0000 Subject: [PATCH 05/26] Merged PR 31919: Nucleus and epsilon sampling This adds nucleus and epsilon sampling to the output-sampling options. * This required the implementation of a sorting algorithm, tested thrust and CUB. * Implementation of cumsum and logcumsumexp (no gradient for now) operators. * Various minor improvements. 
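Illustrative sketch only (not code from this change): with the new sort and cumsum
operators, nucleus (top-p) filtering of an output distribution can be written roughly
as below. The comparison helper lt() and the exact call sites are assumptions about
the surrounding API; the actual logic lives in src/translator/sampling.h and may differ.

    #include "graph/expression_operators.h"  // sort(), cumsum(), softmax(), sum(), ...
    #include <tuple>

    using namespace marian;

    // Keep the smallest set of highest-probability tokens whose total mass reaches
    // topP and renormalize over them; everything else gets probability zero.
    Expr nucleusFilteredProbs(Expr logits /*[..., dimVocab]*/, float topP) {
      auto probs = softmax(logits);

      Expr sortedProbs, sortedIdx;  // sortedIdx maps ranks back to vocabulary ids
      std::tie(sortedProbs, sortedIdx) = sort(probs, /*axis=*/-1, /*descending=*/true);

      // Exclusive cumulative sum = mass of all higher-ranked tokens; a token stays
      // in the nucleus while that mass is still below topP.
      auto massBefore = cumsum(sortedProbs, /*axis=*/-1, /*reverse=*/false, /*exclusive=*/true);
      auto keep       = lt(massBefore, topP);   // assumed element-wise comparison helper

      auto kept = sortedProbs * keep;           // zero out the tail
      return kept / sum(kept, /*axis=*/-1);     // renormalized nucleus, still in sorted order
    }

Sampling then draws from this truncated distribution and uses sortedIdx to recover the
original vocabulary position; the variant is selected with `--output-sampling nucleus 0.9`.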
--- CHANGELOG.md | 1 + CMakeLists.txt | 6 +- VERSION | 2 +- src/CMakeLists.txt | 2 + src/common/types.h | 8 +- src/graph/expression_operators.cpp | 22 ++ src/graph/expression_operators.h | 30 +++ src/graph/node_initializers.cpp | 1 + src/graph/node_operators_tuple.h | 88 +++++++ src/graph/node_operators_unary.h | 123 +++++++++ src/layers_new/alibi.cpp | 2 +- src/layers_new/rnn.h | 4 +- src/models/transformer.h | 5 + src/tensors/cpu/cumsum.cpp | 90 +++++++ src/tensors/cpu/topk.cpp | 35 +++ src/tensors/gpu/cumsum.cu | 156 +++++++++++ src/tensors/gpu/tensor_operators.cu | 27 +- src/tensors/gpu/topk.cu | 164 +++++++++++- src/tensors/rand.cpp | 13 +- src/tensors/rand.h | 19 +- src/tensors/tensor_operators.h | 5 + src/tests/transformer_new.cpp | 11 - src/tests/units/CMakeLists.txt | 1 + src/tests/units/operator_tests.cpp | 100 +++++++ src/tests/units/transformer_tests.cpp | 147 +++++++++++ src/translator/beam_search.cpp | 19 +- src/translator/beam_search.h | 6 +- src/translator/sampling.h | 365 ++++++++++++++++---------- src/translator/translator.h | 8 +- 29 files changed, 1264 insertions(+), 196 deletions(-) create mode 100644 src/tensors/cpu/cumsum.cpp create mode 100644 src/tensors/gpu/cumsum.cu delete mode 100644 src/tests/transformer_new.cpp create mode 100644 src/tests/units/transformer_tests.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index e4eb14230..51df73b57 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Fixed compilation with clang 16.0.6 ### Added +- Added sampling variants nucleus and epsilon, e.g. `--output-sampling nucleus 0.9` and `--output-sampling epsilon 0.02`, respectively. - Added ALIBI related options to new layer framework. - Added `--no-spm-encode` option, allowing the model to use vocabulary IDs directly to train/decode. - Added MSE and MAE costs to COMET-QE training. 
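The epsilon variant listed above is the simpler counterpart to nucleus sampling: instead
of keeping a cumulative top-p mass, it keeps only tokens whose probability exceeds a fixed
threshold. A minimal sketch under the same assumptions as the nucleus example above
(ge() assumed; the actual implementation is in src/translator/sampling.h):

    // Keep tokens with p >= eps and renormalize; selected via --output-sampling epsilon 0.02
    Expr epsilonFilteredProbs(Expr logits, float eps) {
      auto probs = softmax(logits);
      auto kept  = probs * ge(probs, eps);   // assumed element-wise comparison helper
      return kept / sum(kept, /*axis=*/-1);  // renormalize over the surviving tokens
    }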
diff --git a/CMakeLists.txt b/CMakeLists.txt index 2ea841254..595f87cc1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -466,15 +466,17 @@ endif(COMPILE_CUDA) # TODO: make compatible with older CUDA versions if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -O0; -g; --use_fast_math; ${COMPUTE}) + list(APPEND CUDA_NVCC_FLAGS --extended-lambda; --default-stream per-thread; -O0; -g; --use_fast_math; ${COMPUTE}) else(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -O3; -g; --use_fast_math; ${COMPUTE}) + list(APPEND CUDA_NVCC_FLAGS --extended-lambda; --default-stream per-thread; -O3; -g; --use_fast_math; ${COMPUTE}) endif(CMAKE_BUILD_TYPE STREQUAL "Debug") if(NOT MSVC) # @TODO: add warnings here too list(APPEND CUDA_NVCC_FLAGS -ccbin ${CMAKE_C_COMPILER}; -std=c++17; -Xcompiler\ -fPIC; -Xcompiler\ -Wno-unused-result; -Xcompiler\ -Wno-deprecated; -Xcompiler\ -Wno-pragmas; -Xcompiler\ -Wno-unused-value; -Xcompiler\ -Werror;) list(APPEND CUDA_NVCC_FLAGS ${INTRINSICS_NVCC}) else() + # c++17 doesn't work with CUDA 10 + # list(APPEND CUDA_NVCC_FLAGS -std=c++17; -Xcompiler "/std:c++17"; -Xcompiler\ /FS; -Xcompiler\ /MT$<$:d>; ) list(APPEND CUDA_NVCC_FLAGS -Xcompiler\ /FS; -Xcompiler\ /MT$<$:d>; ) endif() diff --git a/VERSION b/VERSION index 274b68518..658123368 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.15 +v1.12.16 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d4cb8cc14..5bf321af5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -55,6 +55,7 @@ set(MARIAN_SOURCES tensors/backend.cpp tensors/rand.cpp tensors/tensor.cpp + tensors/cpu/cumsum.cpp tensors/cpu/device.cpp tensors/cpu/prod.cpp tensors/cpu/topk.cpp @@ -182,6 +183,7 @@ set_target_properties(marian PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY if(CUDA_FOUND) cuda_add_library(marian_cuda layers_new/alibi.cu + tensors/gpu/cumsum.cu tensors/gpu/device.cu tensors/gpu/hash.cu tensors/gpu/algorithm.cu diff --git a/src/common/types.h b/src/common/types.h index a0930a0f8..7b50bb691 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -607,9 +607,10 @@ class NumericLimits { private: template void setLimitsMax() { - max = (ReturnType)std::numeric_limits::max(); - min = (ReturnType)std::numeric_limits::min(); - lowest = (ReturnType)std::numeric_limits::lowest(); + max = (ReturnType)std::numeric_limits::max(); + min = (ReturnType)std::numeric_limits::min(); + lowest = (ReturnType)std::numeric_limits::lowest(); + infinity = (ReturnType)std::numeric_limits::infinity(); } template @@ -635,6 +636,7 @@ class NumericLimits { ReturnType max; ReturnType min; ReturnType lowest; + ReturnType infinity; NumericLimits(Type type) { setLimits(type); diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index 60a86112f..47da511cf 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -177,6 +177,28 @@ Expr2 argmin(Expr a, int axis) { return topk(a, 1, axis, /*descending=*/false); } +Expr2 sort(Expr a, int axis, bool descending) { + // only supports sort along last dimension, hence transpose if required + a = swapAxes(a, axis, -1); // non-op if axes are the same + auto sortedVal = Expression(a, /*axis=*/-1, descending); // axis=-1 is OK now as we swapped + auto sortedIdx = std::dynamic_pointer_cast(sortedVal)->tupleView(); // get a view on the sorted values + return std::make_tuple(swapAxes(sortedVal, axis, -1), swapAxes(sortedIdx, axis, -1)); // non-op if axes are 
the same +} + +Expr cumsum(Expr a, int axis, bool reverse, bool exclusive) { + // only supports sort along last dimension, hence transpose if required + a = swapAxes(a, axis, -1); // non-op if axes are the same + auto cumsums = Expression(a, axis, reverse, exclusive); + return swapAxes(cumsums, axis, -1); // non-op if axes are the same +} + +Expr logcumsumexp(Expr a, int axis, bool reverse, bool exclusive, bool fast) { + // only supports sort along last dimension, hence transpose if required + a = swapAxes(a, axis, -1); // non-op if axes are the same + auto logcumsums = Expression(a, axis, reverse, exclusive, fast); + return swapAxes(logcumsums, axis, -1); // non-op if axes are the same +} + Expr maximum(Expr a, Expr b) { return Expression(a, b); } diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index c792096b1..82d8726c5 100644 --- a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -405,6 +405,36 @@ Expr2 argmax(Expr a, int axis); */ Expr2 argmin(Expr a, int axis); +/** + * Sorts an expression along an axis. + * Sorts the elements of an expression along a specified @p axis. + * @param a Expression to sort + * @param axis Axis to sort along + * @param descending If true, sort in descending order. Otherwise, sort in ascending order. + * Default is true. + * @returns A sorted expression + */ +Expr2 sort(Expr a, int axis, bool descending = true); + +/** + * Cumulative sum of an expression along an axis. + * Computes the cumulative sum of an expression along a specified @p axis. + * @param a Expression to cumsum + * @param axis Axis to cumsum along + * @param exclusive If true, the first element is not included in the sum. Default is false. + * @returns Cumulative sums of @p a along @p axis + */ +Expr cumsum(Expr a, int axis, bool reverse=false, bool exclusive = false); + +/** + * Logarithmic cumulative sum of an expression along an axis. + * Computes the logarithmic cumulative sum of an expression along a specified @p axis. + * @param a Expression to cumsum + * @param axis Axis to cumsum along + * @param exclusive If true, the first element is not included in the sum. Default is false. + * @returns Logarithmic cumulative sums of @p a along @p axis +*/ +Expr logcumsumexp(Expr a, int axis, bool reverse = false, bool exclusive = false, bool fast = false); /** * @addtogroup graph_ops_cmp Comparison diff --git a/src/graph/node_initializers.cpp b/src/graph/node_initializers.cpp index e44b48287..3afb599a9 100644 --- a/src/graph/node_initializers.cpp +++ b/src/graph/node_initializers.cpp @@ -226,6 +226,7 @@ Ptr sinusoidalPositionEmbeddings(int start) { return fromLambda([start](Tensor t) { SinusoidalPositionEmbeddings(t, start); }); } +// @TODO: this is rather inefficient also needs axis argument or something // computes the equivalent of Python's range() template Ptr range(T begin, T end, T step) { diff --git a/src/graph/node_operators_tuple.h b/src/graph/node_operators_tuple.h index 4444e2ef8..79866681e 100644 --- a/src/graph/node_operators_tuple.h +++ b/src/graph/node_operators_tuple.h @@ -165,6 +165,94 @@ struct TopKNodeOp : public UnaryNodeOp, } }; +// This is an implementation of sort, similar to the PyTorch node. +// At the moment we only handle axis=-1 in here, but do transposes +// in the actual operator to handle other axes (inefficiently). +// The normal forward values here are the sorted values per axis, +// the additional value from the TupleNode contains the integer +// indices of the sorted values. 
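For reference, a minimal standalone sketch (not part of this patch, names are illustrative) of the semantics SortNodeOp provides: a per-row sort over the last axis that also returns the original element indices, mirroring the CPU Sort() kernel added in src/tensors/cpu/topk.cpp further below. The graph-level sort() wrapper above handles other axes by transposing them to the last axis first.

#include <algorithm>
#include <numeric>
#include <vector>

// Sorts each row of a rows x cols buffer; outInd receives the per-row argsort,
// outVal the values reordered accordingly (what val_ and tupleVal_ hold per row).
inline void referenceSort(const std::vector<float>& in,
                          std::vector<float>& outVal,
                          std::vector<unsigned>& outInd,
                          int rows, int cols, bool descending) {
  outVal.resize(rows * cols);
  outInd.resize(rows * cols);
  std::vector<unsigned> idx(cols);
  for(int r = 0; r < rows; ++r) {
    const float* row = in.data() + r * cols;
    std::iota(idx.begin(), idx.end(), 0u);
    std::sort(idx.begin(), idx.end(), [&](unsigned a, unsigned b) {
      return descending ? row[a] > row[b] : row[a] < row[b];
    });
    for(int c = 0; c < cols; ++c) {
      outInd[r * cols + c] = idx[c];          // original position of the c-th sorted value
      outVal[r * cols + c] = row[idx[c]];     // the sorted value itself
    }
  }
}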
+struct SortNodeOp : public UnaryNodeOp, + public TupleNode { +private: + int axis_; // on which axis + bool descending_; // sort-order, by default descending. PyTorch has a version without sorting, we always sort. + +public: + SortNodeOp(Expr a, int axis, bool descending = true) + : UnaryNodeOp(a, a->shape()), + axis_{a->shape().axis(axis)}, + descending_{descending} { + ABORT_IF(axis_ != shape().size() - 1, "Only implemented along last axis, you tried {}", axis_); + } + + // imlementation of TupleNode-specific pure-virtual functions for allocation + void allocateTuple() override final { + graph()->getTensorAllocator()->allocate(tupleVal_, shape(), Type::uint32); + } + + // we override the normal allocation to include the TupleNode allocation + void allocate() override { + UnaryNodeOp::allocate(); + allocateTuple(); + } + + // implementation of TupleNode-specific pure-virtual functions for de-allocation + void freeTuple() override final { + if(graph()) { + if(tupleVal_) { + graph()->free(tupleVal_); + tupleVal_ = nullptr; + } + } + } + + // we override the normal allocation to include the TupleNode de-allocation + void free() override { + UnaryNodeOp::free(); + freeTuple(); + } + + // Create and return a TupleView to the additional forward value + virtual Expr tupleView() override final { + return Expression(this, shape(), Type::uint32); + } + + void forward() override { + Sort(/*out*/val_, /*out: indices=*/tupleVal_, + graph()->allocator(), + child(0)->val(), axis_, descending_); + } + + void backward() override { + Insert(/*out*/child(0)->grad(), adj_, tupleVal_, axis_); + } + + const std::string type() override { return "sort"; } + + virtual size_t hash() override { + if(!hash_) { + hash_ = NaryNodeOp::hash(); + util::hash_combine(hash_, axis_); + util::hash_combine(hash_, descending_); + } + return hash_; + } + + virtual bool equal(Expr node) override { + if(!NaryNodeOp::equal(node)) + return false; + auto cnode = std::dynamic_pointer_cast(node); + if(!cnode) + return false; + if(axis_ != cnode->axis_) + return false; + if(descending_ != cnode->descending_) + return false; + return true; + } +}; + + // This node attaches multiple children to a parent node and allows // to select one of them via a given index. 
This is mostly used to avoid // unattached nodes that might nevertheless get created based on some diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h index 97ffedf61..aa3f5004c 100644 --- a/src/graph/node_operators_unary.h +++ b/src/graph/node_operators_unary.h @@ -613,6 +613,129 @@ struct ReduceNodeOp : public UnaryNodeOp { } }; +class CumSumNodeOp : public UnaryNodeOp { +private: + friend class SerializationHelpers; + int axis_; + bool reverse_; + bool exclusive_; + +public: + CumSumNodeOp(Expr a, int axis, bool reverse, bool exclusive) + : UnaryNodeOp(a), + axis_(a->shape().axis(axis)), + reverse_(reverse), + exclusive_(exclusive) + {} + + NodeOps forwardOps() override { + using namespace functional; + return {NodeOp(CumSum(val_, child(0)->val(), reverse_, exclusive_))}; + } + + NodeOps backwardOps() override { + using namespace functional; + return {NodeOp( + // if we are here then we are done with adding gradients to adj_ + // so we can canibalize it to compute the gradient of the input + // compute the cumsum of the adjoint + CumSum(adj_, adj_, !reverse_, exclusive_); + // add that cumsum to the gradient of the input + Add(_1, child(0)->grad(), adj_); + )}; + } + + const std::string type() override { return "cumsum"; } + + const std::string color() override { return "orange"; } + + virtual size_t hash() override { + if(!hash_) { + hash_ = NaryNodeOp::hash(); + util::hash_combine(hash_, axis_); + util::hash_combine(hash_, reverse_); + util::hash_combine(hash_, exclusive_); + } + return hash_; + } + + virtual bool equal(Expr node) override { + if(!NaryNodeOp::equal(node)) + return false; + auto cnode = std::dynamic_pointer_cast(node); + if(!cnode) + return false; + if(axis_ != cnode->axis_) + return false; + if(reverse_ != cnode->reverse_) + return false; + if(exclusive_ != cnode->exclusive_) + return false; + return true; + } +}; + +class LogCumSumExpNodeOp : public UnaryNodeOp { +private: + friend class SerializationHelpers; + int axis_; + bool reverse_; + bool exclusive_; + bool fast_; + +public: + LogCumSumExpNodeOp(Expr a, int axis, bool reverse, bool exclusive, bool fast=false) + : UnaryNodeOp(a), + axis_(a->shape().axis(axis)), + reverse_(reverse), + exclusive_(exclusive), + fast_(fast) + {} + + NodeOps forwardOps() override { + using namespace functional; + return {NodeOp(LogCumSumExp(val_, child(0)->val(), reverse_, exclusive_, fast_))}; + } + + NodeOps backwardOps() override { + using namespace functional; + ABORT("LogCumSumNodeOp::backwardOps() not implemented yet"); + // return {NodeOp(LogCumSumExpGrad(child(0)->grad(), adj_, val_, child(0)->val()))}; + } + + const std::string type() override { return "logcumsumexp"; } + + const std::string color() override { return "orange"; } + + virtual size_t hash() override { + if(!hash_) { + hash_ = NaryNodeOp::hash(); + util::hash_combine(hash_, axis_); + util::hash_combine(hash_, reverse_); + util::hash_combine(hash_, exclusive_); + util::hash_combine(hash_, fast_); + } + return hash_; + } + + virtual bool equal(Expr node) override { + if(!NaryNodeOp::equal(node)) + return false; + auto cnode = std::dynamic_pointer_cast(node); + if(!cnode) + return false; + if(axis_ != cnode->axis_) + return false; + if(reverse_ != cnode->reverse_) + return false; + if(exclusive_ != cnode->exclusive_) + return false; + if(fast_ != cnode->fast_) + return false; + return true; + } +}; + struct LogNodeOp : public UnaryNodeOp { LogNodeOp(Expr a) : UnaryNodeOp(a) {} diff --git a/src/layers_new/alibi.cpp 
b/src/layers_new/alibi.cpp index 07989ce6a..abffb6bae 100644 --- a/src/layers_new/alibi.cpp +++ b/src/layers_new/alibi.cpp @@ -67,7 +67,7 @@ Expr AlibiDecoderState::getAlibiShift(Ptr graph, bool decoding) } } else { ABORT_IF(getBatch()->sets() != 2, - "--transformer-alibi-shift=true currently only works with batch sets=2"); + "--transformer-alibi-shift=true currently only works with batch sets=2"); return getAlibiShiftFromBatch(graph); } } diff --git a/src/layers_new/rnn.h b/src/layers_new/rnn.h index 281d2dce9..720fa50f7 100644 --- a/src/layers_new/rnn.h +++ b/src/layers_new/rnn.h @@ -8,6 +8,7 @@ namespace nn { struct CellState { Expr recurrent; + size_t position = 0; }; struct ICell { @@ -43,7 +44,7 @@ class SSRU final : public Layer, public ICell { Expr output = iProj->apply(input); Expr forget = fProj->apply(input); - + return {output, forget}; } @@ -104,6 +105,7 @@ class RNN final : public Layer, public IBinaryLayer, public IBinaryDecoderLayer std::vector stepInputs(inputs.size()); std::transform(inputs.begin(), inputs.end(), stepInputs.begin(), [i, dimTimeAxis](Expr e) { return slice(e, dimTimeAxis, i); }); + cellState->position = state->getPosition() + i; auto stepMask = inputMask; if(stepMask) stepMask = slice(inputMask, dimTimeAxis, i); diff --git a/src/models/transformer.h b/src/models/transformer.h index 1befc726f..ad018b240 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -246,6 +246,11 @@ class Transformer : public EncoderOrDecoderBase { int dimBeam = 1) { int dk = k->shape()[-1]; + // to avoid mistakenly using the old transformer framework for new features + auto maskType = opt("transformer-attention-mask", "default"); + ABORT_IF(maskType != "default", + "You specified --transformer-attention-mask={} which is not implemented for legacy Transformer", maskType ); + // softmax over batched dot product of query and keys (applied over all // time steps and batch entries), also add mask for illegal connections diff --git a/src/tensors/cpu/cumsum.cpp b/src/tensors/cpu/cumsum.cpp new file mode 100644 index 000000000..12c7cb155 --- /dev/null +++ b/src/tensors/cpu/cumsum.cpp @@ -0,0 +1,90 @@ +#include "tensors/tensor_operators.h" + +namespace marian { +namespace cpu { + +// wrap Marian functor to work with thrust +template +class AccFunctorWrapper { +private: + Functor functor_; + +public: + AccFunctorWrapper(Functor functor) : functor_(functor) {} + T operator()(T x, T y) { return (T)functor_((float)x, (float)y); } +}; + +template +void BatchedScan(Tensor out, const Tensor in, bool reverse, bool exclusive, Functor accOp, float zero) { + ABORT_IF(!isFloat(in->type()), "Input should be float type and not {}", in->type()); + ABORT_IF(out->type() != in->type(), "Output should have type {}", in->type()); + + int cols = in->shape()[0]; + int rows = in->shape().elements() / cols; + + auto batchedScan = [=](auto inIt, auto outIt) { + AccFunctorWrapper accFunctor(accOp); + + for(int i = 0; i < rows; ++i) { + float sum; + int shift = exclusive ? 
1 : 0; + + // handle first element differently based on exclusive flag + if(exclusive) + sum = zero; + else + sum = inIt[0]; + outIt[0] = sum; + + for(int j = 1; j < cols; ++j) { + sum = accFunctor(sum, inIt[j - shift]); + outIt[j] = sum; + } + + inIt += cols; + outIt += cols; + } + }; + + if(reverse) { + auto revInIt = std::make_reverse_iterator(in->data() + in->size()); + auto revOutIt = std::make_reverse_iterator(out->data() + out->size()); + batchedScan(revInIt, revOutIt); + } else { + auto fwdInIt = in->data(); + auto fwdOutIt = out->data(); + batchedScan(fwdInIt, fwdOutIt); + } +} + +// CPU implementation of logcumsumexp operator for LogCumSumExpNodeOp +void LogCumSumExp(Tensor out, const Tensor in, bool reverse, bool exclusive, bool fast = false) { + float max = 0; + if(!fast) { + // compute max of entire tensor, this is just for stabilization + // note, if e.g. all values are logprobs, then the max is at most 0 and we can skip this step + // maybe it should be the default to turn this off? + max = *std::max_element(in->data(), in->data() + in->size()); + } + + using namespace functional; + auto functor = log(exp(_1 - max) + exp(_2 - max)) + max; + auto zero = -NumericLimits(in->type()).infinity; + BatchedScan(out, in, reverse, exclusive, functor, zero); +} + +// CPU implementation of cumsum operator for CumSumNodeOp +void CumSum(Tensor out, const Tensor in, bool reverse, bool exclusive) { + using namespace functional; + auto functor = _1 + _2; + BatchedScan(out, in, reverse, exclusive, functor, 0.f); +} + +void CumProd(Tensor out, const Tensor in, bool reverse, bool exclusive) { + using namespace functional; + auto functor = _1 * _2; + BatchedScan(out, in, reverse, exclusive, functor, 1.f); +} + +} // namespace gpu +} // namespace marian \ No newline at end of file diff --git a/src/tensors/cpu/topk.cpp b/src/tensors/cpu/topk.cpp index 92dcba591..73f0ce273 100644 --- a/src/tensors/cpu/topk.cpp +++ b/src/tensors/cpu/topk.cpp @@ -50,5 +50,40 @@ void TopK(Tensor outVal, Tensor outInd, Ptr /*allocator*/, const Tens } } +// CPU implementation of Marian sort operator for SortNodeOp +void Sort(Tensor outVal, Tensor outInd, Ptr /*allocator*/, const Tensor in, int axis, bool descending) { + ABORT_IF(axis != in->shape().size() - 1, "Currently only works for last axis"); + ABORT_IF(in->type() != Type::float32, "Input should have type {}", Type::float32); + ABORT_IF(outInd->type() != Type::uint32, "Output should be have type {}", Type::uint32); + + int cols = in->shape()[axis]; + int rows = in->shape().elements() / cols; + + std::vector idxs(cols); + std::iota(idxs.begin(), idxs.end(), 0); + + const float* inDataPtr = in->data(); + IndexType* outIndPtr = outInd->data(); + float* outValPtr = outVal->data(); + for(int i = 0; i < rows; ++i) { + std::sort( + idxs.begin(), + idxs.end(), + [&](int a, int b) { + return descending ? 
inDataPtr[a] > inDataPtr[b] : inDataPtr[a] < inDataPtr[b]; + } + ); + + for(int j = 0; j < cols; j++) { + outIndPtr[j] = idxs[j]; + outValPtr[j] = inDataPtr[idxs[j]]; + } + + outIndPtr += cols; + outValPtr += cols; + inDataPtr += cols; + } +} + } } diff --git a/src/tensors/gpu/cumsum.cu b/src/tensors/gpu/cumsum.cu new file mode 100644 index 000000000..c0f11c6fa --- /dev/null +++ b/src/tensors/gpu/cumsum.cu @@ -0,0 +1,156 @@ +#include "tensors/tensor_operators.h" +#include "tensors/gpu/cuda_helpers.h" +#include "tensors/allocator.h" + +#include "functional/operators.h" + +#include +#include +#include +#include +#include +#include + +namespace marian { +namespace gpu { + +// small operator to compute the row id of an element in a 2d tensor +class ProjectToRow : public thrust::unary_function { +private: + int cols_; + +public: + ProjectToRow(int cols) : cols_(cols) {} + HOST_DEVICE int operator()(int i) { return i / cols_; } +}; + +// create the iterators to group the elements of a 2d tensor by row +auto rowIterators(int rows, int cols) { + thrust::counting_iterator firstElement(0); + auto begin = thrust::make_transform_iterator(firstElement, ProjectToRow(cols)); + auto end = thrust::make_transform_iterator(firstElement + rows * cols, ProjectToRow(cols)); + return std::make_pair(begin, end); +}; + +// create the iterators to group the elements of a 2d tensor by row +auto rowIterators(const Shape& shape) { + // use last dimension as column size + int cols = shape[-1]; + // compute number of rows from total number of elements and column size + int rows = shape.elements() / cols; + return rowIterators(rows, cols); +} + +// wrap Marian functor to work with thrust +template +class AccFunctorWrapper { +private: + Functor functor_; + +public: + AccFunctorWrapper(Functor functor) : functor_(functor) {} + HOST_DEVICE T operator()(T x, T y) { return (T)functor_((float)x, (float)y); } +}; + +template +void TypedBatchedScan(Tensor out, const Tensor in, bool reverse, bool exclusive, Functor accOpFunctor, T zero) { + // use thrust device_ptr to wrap raw pointers + thrust::device_ptr inData(in->data()); + thrust::device_ptr outData(out->data()); + + // currently use default stream + auto exec = thrust::cuda::par; + auto equalOp = thrust::equal_to(); + auto accOp = AccFunctorWrapper(accOpFunctor); + + auto batchedScan = [=](auto inIt, auto outIt) { + // treat each row as as set of keys, only works for last dimension + const auto range = rowIterators(in->shape()); + auto begin = range.first; + auto end = range.second; + if(exclusive) + thrust::exclusive_scan_by_key(exec, begin, end, inIt, outIt, zero, equalOp, accOp); + else + thrust::inclusive_scan_by_key(exec, begin, end, inIt, outIt, equalOp, accOp); + }; + + if(reverse) { + auto revInIt = thrust::make_reverse_iterator(inData + in->size()); + auto revOutIt = thrust::make_reverse_iterator(outData + out->size()); + batchedScan(revInIt, revOutIt); + } else { + auto fwdInIt = inData; + auto fwdOutIt = outData; + batchedScan(fwdInIt, fwdOutIt); + } +} + +template +void BatchedScan(Tensor out, const Tensor in, bool reverse, bool exclusive, Functor functor, float zero) { + ABORT_IF(!isFloat(in->type()), "Input should be float type and not {}", in->type()); + ABORT_IF(out->type() != in->type(), "Output should have type {}", in->type()); + + if(in->type() == Type::float32) { + TypedBatchedScan(out, in, reverse, exclusive, functor, zero); +#if COMPILE_FP16 + } else if(in->type() == Type::float16) { + TypedBatchedScan<__half>(out, in, reverse, exclusive, functor, 
__float2half(zero)); +#endif + } else { + ABORT("BatchedScan not implemented for type {}", in->type()); + } +} + +template +T typedMaxElement(const Tensor in) { + // use thrust device_ptr to wrap raw pointers + thrust::device_ptr inData(in->data()); + + // currently use default stream + auto exec = thrust::cuda::par; + + return *thrust::max_element(exec, inData, inData + in->size()); +} + +float MaxElement(const Tensor in) { + ABORT_IF(!isFloat(in->type()), "Input should be float type and not {}", in->type()); + if(in->type() == Type::float32) { + return typedMaxElement(in); +#if COMPILE_FP16 + } else if(in->type() == Type::float16) { + return __half2float(typedMaxElement<__half>(in)); +#endif + } else { + ABORT("MaxElement not implemented for type {}", in->type()); + } +} + +void LogCumSumExp(Tensor out, const Tensor in, bool reverse, bool exclusive, bool fast) { + float max = 0; + if(!fast) { + // compute max of entire tensor, this is just for stabilization + // note, if e.g. all values are logprobs, then the max is at most 0 and we can skip this step + // maybe it should be the default to turn this off? + max = MaxElement(in); + } + + using namespace functional; + auto functor = log(exp(_1 - max) + exp(_2 - max)) + max; + auto zero = -NumericLimits(in->type()).infinity; + BatchedScan(out, in, reverse, exclusive, functor, zero); +} + +void CumSum(Tensor out, const Tensor in, bool reverse, bool exclusive) { + using namespace functional; + auto functor = _1 + _2; + BatchedScan(out, in, reverse, exclusive, functor, 0.f); +} + +void CumProd(Tensor out, const Tensor in, bool reverse, bool exclusive) { + using namespace functional; + auto functor = _1 * _2; + BatchedScan(out, in, reverse, exclusive, functor, 1.f); +} + +} // namespace gpu +} // namespace marian diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index 4662ab041..b7c80394b 100644 --- a/src/tensors/gpu/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -848,27 +848,30 @@ __global__ void gLogSoftmax(T* out, int rows = outShape.elements() / outShape.back(); int cols = outShape.back(); + // loop over blocks of rows for(int bid = 0; bid < rows; bid += gridDim.x) { - int j = bid + blockIdx.x; + int j = bid + blockIdx.x; // blockIdx.x - row index (within block of rows) if(j < rows) { - T* so = out + j * cols; - const T* sp = in + j * cols; + T* so = out + j * cols; // pointer to row output data + const T* sp = in + j * cols; // pointer to row input data // CUDA complains if type or size of shared memory changes, keep size constant. extern __shared__ uint8_t _sharedBytes[]; - T* _share = (T*)_sharedBytes; + T* _share = (T*)_sharedBytes; AccType* _shareAccType = (AccType*)_sharedBytes; T* _max = _share; // 16-bit is ok for max if applicable _max[threadIdx.x] = sp[threadIdx.x]; for(int tid = 0; tid < cols; tid += blockDim.x) { - int id = tid + threadIdx.x; + int id = tid + threadIdx.x; // threadIdx.x = column index within block of columns if(id < cols) { if(sp[id] > _max[threadIdx.x]) _max[threadIdx.x] = sp[id]; } } __syncthreads(); + + // max over columns within a column block via tree reduction int len = blockDim.x; while(len != 1) { __syncthreads(); @@ -889,14 +892,18 @@ __global__ void gLogSoftmax(T* out, _sum[threadIdx.x] = 0.0; for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; - if(id < cols) { - T sm = sp[id] - max; - AccType ex = Ops::exp(sm); // sum with AccType - so[id] = sm; + if(id < cols) { + // @TODO: would it be faster to recompute it below? 
Also better numeric stability with float? + AccType sm = (AccType)sp[id] - (AccType)max; // subtract max for numeric stability + so[id] = (T)sm; // assign numerator to output + + AccType ex = Ops::exp(sm); _sum[threadIdx.x] += ex; // sum with AccType } } __syncthreads(); + + // now reduce over all columns within the block len = blockDim.x; while(len != 1) { __syncthreads(); @@ -906,6 +913,8 @@ __global__ void gLogSoftmax(T* out, len = (len + 1) >> 1; } __syncthreads(); + + // produce final output data AccType sum = _sum[0]; for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; diff --git a/src/tensors/gpu/topk.cu b/src/tensors/gpu/topk.cu index 94256fb7a..3bb8582d4 100644 --- a/src/tensors/gpu/topk.cu +++ b/src/tensors/gpu/topk.cu @@ -3,6 +3,13 @@ #include "tensors/allocator.h" #include +#include +#include +#include + +#if CUDA_VERSION >= 11000 +#include +#endif // GPU implementation of proper Marian top-k operator for TopkNodeOp // This file contains a lot of code-duplicaton with src/translator/nth_element.cu @@ -14,7 +21,6 @@ namespace marian { namespace gpu { const int MAX_BINS = 500; -const int BLOCK_SIZE = 512; #define UNROLL_MAXARG_LOOP(n, max) \ if(tid < (n) && tid + (n) < (max)) { \ @@ -35,7 +41,7 @@ __global__ void gMaxElement(IndexType* binIndices, // out: top-k positions bool descending) // This will be the largest possible value if the order is reversed (i.e. we look for the minimum). { extern __shared__ float sharedValues[]; - __shared__ IndexType sharedIndices[BLOCK_SIZE]; + __shared__ IndexType sharedIndices[MAX_THREADS]; // id of current thread within block int tid = threadIdx.x; @@ -147,7 +153,7 @@ __global__ void gMaxElementUpdate(IndexType* binIndices, // memory for bin indic bool descending) { extern __shared__ float sharedValues[]; - __shared__ int sharedIndices[BLOCK_SIZE]; + __shared__ int sharedIndices[MAX_THREADS]; __shared__ float bestBinCost; __shared__ int bestBinCostIdx; @@ -332,7 +338,7 @@ void TopK(Tensor outVal, Tensor outInd, Ptr allocator, const Tensor i float minimal = NumericLimits(in->type()).lowest; // lowest if looking for max - const int numBlocks = std::min(MAX_BINS, int(cols / (2 * BLOCK_SIZE)) + int(cols % (2 * BLOCK_SIZE) != 0)); + const int numBlocks = std::min(MAX_BINS, int(cols / (2 * MAX_THREADS)) + int(cols % (2 * MAX_THREADS) != 0)); auto tempMemInd = allocator->alloc(rows * numBlocks); MemoryPiece::PtrType tempMemVal; @@ -340,14 +346,14 @@ void TopK(Tensor outVal, Tensor outInd, Ptr allocator, const Tensor i tempMemVal = allocator->alloc(rows * numBlocks); // first find the maximum value per row and block and save indices and values to temporary memory gMaxElement<<>>( tempMemInd->data(), tempMemVal->data(), in->data(), rows, cols, minimal, descending); gMaxElementUpdate<<>>( tempMemInd->data(), tempMemVal->data(), outInd->data(), outVal->data(), @@ -357,14 +363,14 @@ void TopK(Tensor outVal, Tensor outInd, Ptr allocator, const Tensor i tempMemVal = allocator->alloc<__half>(rows * numBlocks); // first find the maximum value per row and block and save indices and values to temporary memory gMaxElement<<>>( tempMemInd->data(), tempMemVal->data<__half>(), in->data<__half>(), rows, cols, minimal, descending); gMaxElementUpdate<<>>( tempMemInd->data(), tempMemVal->data<__half>(), outInd->data(), outVal->data<__half>(), @@ -378,5 +384,139 @@ void TopK(Tensor outVal, Tensor outInd, Ptr allocator, const Tensor i allocator->free(tempMemVal); } +// this function uses cub::DeviceSegmentedRadixSort::SortPairs to sort each row 
separately +template +void TypedSortCUB(Ptr allocator, Tensor outVal, Tensor outInd, const Tensor in, bool descending) { +#if CUDA_VERSION >= 11000 + int cols = in->shape()[-1]; + int rows = in->shape().elements() / cols; + + const T* inValData = in->data(); + T* outValData = outVal->data(); + IndexType* outIndData = outInd->data(); + + // create indices for the input tensor, i.e. [0, 1, 2, ..., cols] per row using single thrust transform + // CUB doesn't seem to have a transform operation, so let's use thrust. They seem to be compatible anyway. + thrust::transform(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(rows * cols), + outIndData, + [=] HOST_DEVICE (int i) { return i % cols; }); + + // create row iterator, this iterates through the indices of row start offsets, e.g. [0, cols, 2*cols, ...] + // this is used to partition the input tensor into rows when sorting with the segmented sort + auto rowEndOp = [cols] HOST_DEVICE (int i) { return i * cols; }; + using TransformOp = decltype(rowEndOp); + using CountingIt = cub::CountingInputIterator; + using RowPartitionIt = cub::TransformInputIterator; + RowPartitionIt rowPartitionIt(CountingIt(0), rowEndOp); + + auto cubSortbyKey = [=](void* storage, size_t& storageSize, bool descending) { + using cubSort = cub::DeviceSegmentedRadixSort; + if(descending) + cubSort::SortPairsDescending(storage, storageSize, + inValData, outValData, + outIndData, outIndData, + /*total=*/rows * cols, + /*segments=*/rows, + rowPartitionIt, rowPartitionIt + 1); + else + cubSort::SortPairs(storage, storageSize, + inValData, outValData, + outIndData, outIndData, + /*total=*/rows * cols, + /*segments=*/rows, + rowPartitionIt, rowPartitionIt + 1); + }; + + // Important lesson: before I used my own allocation and deallocation of temporary memory, this + // was actually slower than the thrust version. Again, mixing computation and cudaMalloc is a bad idea. + // @TODO: review other kernels to make sure I don't use cudaMalloc directly anywhere. 
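For clarity (not part of this patch): the transform iterator above stands in for explicit segment-offset arrays. For a rows x cols tensor, cub::DeviceSegmentedRadixSort takes begin offsets {0, cols, 2*cols, ...} and end offsets {cols, 2*cols, ...} so that segment r covers elements [r*cols, (r+1)*cols); passing rowPartitionIt and rowPartitionIt + 1 is equivalent to materializing the two arrays sketched below, just without the extra device memory.

#include <vector>

// host-side sketch of the offsets the iterator pair encodes
inline void makeRowOffsets(int rows, int cols,
                           std::vector<int>& beginOffsets,
                           std::vector<int>& endOffsets) {
  beginOffsets.resize(rows);
  endOffsets.resize(rows);
  for(int r = 0; r < rows; ++r) {
    beginOffsets[r] = r * cols;        // what rowPartitionIt yields at position r
    endOffsets[r]   = (r + 1) * cols;  // what rowPartitionIt + 1 yields at position r
  }
}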
+ + // Determine temporary device storage requirements, this doesn't sort anything + size_t tempStorageBytes = 0; + cubSortbyKey(nullptr, /*out=*/tempStorageBytes, descending); + // Allocate temporary storage + auto tempStorage = allocator->alloc(tempStorageBytes); + // Run sorting operation + cubSortbyKey(tempStorage->data(), tempStorageBytes, descending); + // free temporary storage + allocator->free(tempStorage); +#else + ABORT("CUB sort requires CUDA 11.0 or higher"); +#endif +} + +// the same as above but using thrust::sort_by_key instead of cub::DeviceSegmentedRadixSort::SortPairs; +// used for CUDA < 11.0, slower than cub::DeviceSegmentedRadixSort::SortPairs +template +void TypedSortThrust(Tensor outVal, Tensor outInd, const Tensor in, bool descending) { + int cols = in->shape()[-1]; + int rows = in->shape().elements() / cols; + + // use thrust device_ptr to wrap raw pointers + thrust::device_ptr inVal(in->data()); + thrust::device_ptr outValData(outVal->data()); + thrust::device_ptr outIndData(outInd->data()); + + // lambda that sorts a row + auto sortRow = [=] (int rowIdx) { + // currently use default stream + cudaStream_t stream = 0; + auto exec = thrust::cuda::par.on(stream); + + auto outValRow = outValData + rowIdx * cols; // pointer to row in output value tensor + auto outIndRow = outIndData + rowIdx * cols; // pointer to row in output index tensor + // sort the indices and values according to the values in the output tensor and using the stream + if(descending) + thrust::sort_by_key(exec, outValRow, outValRow + cols, outIndRow, thrust::greater()); + else + thrust::sort_by_key(exec, outValRow, outValRow + cols, outIndRow, thrust::less()); + }; + + // copy input tensor to output tensor + thrust::copy(thrust::device, inVal, inVal + rows * cols, outValData); + + // create indices for the input tensor, i.e. 
[0, 1, 2, ..., cols] per row using single thrust transform + thrust::transform(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(rows * cols), + outIndData, + [=] HOST_DEVICE (int i) { return i % cols; }); + + // sort each row of the input tensor separately + // couldn't find a way to do this with thrust::for_each that wasn't hilariously slow + for(int i = 0; i < rows; ++i) + sortRow(i); +} + +template +void TypedSort(Ptr allocator, Tensor outVal, Tensor outInd, const Tensor in, bool descending) { +#if CUDA_VERSION < 11000 + // CUDA_VERSION < 11000 doesn't include and hence cub::DeviceSegmentedRadixSort::SortPairs + // we use thrust::sort_by_key instead which is slower + TypedSortThrust(outVal, outInd, in, descending); +#else + TypedSortCUB(allocator, outVal, outInd, in, descending); +#endif } + +void Sort(Tensor outVal, Tensor outInd, Ptr allocator, const Tensor in, int axis, bool descending) { + ABORT_IF(axis != in->shape().size() - 1, "Currently only works for last axis"); + ABORT_IF(!isFloat(in->type()), "Input should be float type and not {}", in->type()); + ABORT_IF(outInd->type() != Type::uint32, "Output should have type {}", Type::uint32); + ABORT_IF(outVal->type() != in->type(), "Output should have type {}", in->type()); + + if(in->type() == Type::float32) { + TypedSort(allocator, outVal, outInd, in, descending); +#if COMPILE_FP16 + } else if(in->type() == Type::float16) { + TypedSort<__half>(allocator, outVal, outInd, in, descending); +#endif + } else { + ABORT("Sort not implemented for type {}", in->type()); + } } + +} // namespace gpu +} // namespace marian diff --git a/src/tensors/rand.cpp b/src/tensors/rand.cpp index e6dbc46ed..cfe768f59 100644 --- a/src/tensors/rand.cpp +++ b/src/tensors/rand.cpp @@ -14,8 +14,9 @@ class StdlibRandomGenerator : public RandomGenerator { std::mt19937 engine_; public: - StdlibRandomGenerator(size_t seed) - : RandomGenerator(seed), engine_((unsigned int)seed) {} + StdlibRandomGenerator(size_t seed, DeviceId deviceId) + : RandomGenerator(seed, deviceId), + engine_((unsigned int)RandomGenerator::seed()) {} virtual void uniform(Tensor tensor, float a, float b) override; virtual void normal(Tensor, float mean, float stddev) override; @@ -68,7 +69,7 @@ void StdlibRandomGenerator::normal(Tensor tensor, float mean, float stddev) { #ifdef CUDA_FOUND CurandRandomGenerator::CurandRandomGenerator(size_t seed, DeviceId deviceId) -: RandomGenerator(seed), deviceId_(deviceId) { +: RandomGenerator(seed, deviceId), deviceId_(deviceId) { if(deviceId_.type == DeviceType::gpu) { cudaSetDevice((int)deviceId_.no); CURAND_CHECK(curandCreateGenerator(&generator_, CURAND_RNG_PSEUDO_DEFAULT)); @@ -76,7 +77,7 @@ CurandRandomGenerator::CurandRandomGenerator(size_t seed, DeviceId deviceId) else { CURAND_CHECK(curandCreateGeneratorHost(&generator_, CURAND_RNG_PSEUDO_DEFAULT)); } - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(generator_, seed_)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(generator_, RandomGenerator::seed())); } CurandRandomGenerator::~CurandRandomGenerator() { @@ -112,9 +113,7 @@ Ptr createRandomGenerator(size_t seed, DeviceId deviceId) { #ifdef CUDA_FOUND return New(seed, deviceId); #else - ABORT_IF(deviceId.type != DeviceType::cpu, - "StdlibRandomGenerator can only be used for CPU tensors"); - return New(seed); + return New(seed, deviceId); #endif } diff --git a/src/tensors/rand.h b/src/tensors/rand.h index 94b44a97a..1042104f4 100644 --- a/src/tensors/rand.h +++ b/src/tensors/rand.h @@ -1,6 +1,8 @@ #pragma once #include 
"common/definitions.h" +#include "common/hash.h" +#include "common/logging.h" #include @@ -13,11 +15,26 @@ class RandomGenerator { protected: size_t seed_; + // hashing device type and id to get a unique seed for each device, e.g. for different samples on different devices + size_t hashSeed(size_t seed, DeviceId deviceId) { + // on the first device, use the seed as is. This keeps unit tests etc. working correctly + // on other devices, hash the seed with the device type and id, so that we get different seeds for different devices + // this is important for e.g. different samples on different devices + if(deviceId.no == 0) + return seed; + else + return util::hashArgs(seed, deviceId.type, deviceId.no); + } + public: - RandomGenerator(size_t seed) : seed_(seed) { } + RandomGenerator(size_t seed, DeviceId deviceId) + : seed_(hashSeed(seed, deviceId)) { + LOG(debug, "Setting random seed to {} (device {}{})", seed_, deviceId.typeAsString(), deviceId.no); + } virtual ~RandomGenerator() {} virtual void uniform(Tensor, float a, float b) = 0; virtual void normal(Tensor, float mean, float stddev) = 0; + virtual size_t seed() { return seed_; } }; Ptr createRandomGenerator(size_t /*seed*/, DeviceId); diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h index 2747a6d66..1940e9d95 100644 --- a/src/tensors/tensor_operators.h +++ b/src/tensors/tensor_operators.h @@ -346,6 +346,11 @@ static inline void Select(Tensor out, const Tensor in, const Tensor indices, int } DISPATCH7(TopK, marian::Tensor, marian::Tensor, Ptr, const marian::Tensor, int, int, bool); +DISPATCH6(Sort, marian::Tensor, marian::Tensor, Ptr, const marian::Tensor, int, bool); + +DISPATCH4(CumSum, marian::Tensor, const marian::Tensor, bool, bool); +DISPATCH4(CumProd, marian::Tensor, const marian::Tensor, bool, bool); +DISPATCH5(LogCumSumExp, marian::Tensor, const marian::Tensor, bool, bool, bool); DISPATCH2(LSTMCellForward, marian::Tensor, std::vector) DISPATCH2(LSTMOutputForward, marian::Tensor, std::vector); diff --git a/src/tests/transformer_new.cpp b/src/tests/transformer_new.cpp deleted file mode 100644 index 2d1e89281..000000000 --- a/src/tests/transformer_new.cpp +++ /dev/null @@ -1,11 +0,0 @@ -#include "marian.h" -#include "models/transformer_new.h" - - -int main(int argc, char** argv) { - using namespace marian; - - testme(); - - return 0; -} diff --git a/src/tests/units/CMakeLists.txt b/src/tests/units/CMakeLists.txt index 7745fee14..ee5920521 100644 --- a/src/tests/units/CMakeLists.txt +++ b/src/tests/units/CMakeLists.txt @@ -7,6 +7,7 @@ set(UNIT_TESTS fastopt_tests utils_tests binary_tests + transformer_tests # cosmos_tests # optional, uncomment to test with specific files. 
) diff --git a/src/tests/units/operator_tests.cpp b/src/tests/units/operator_tests.cpp index 5806e94de..565ffb1d7 100644 --- a/src/tests/units/operator_tests.cpp +++ b/src/tests/units/operator_tests.cpp @@ -1054,6 +1054,9 @@ void tests(DeviceType device, Type floatType = Type::float32) { auto ridx4 = get<1>(rtopk4); auto gval4 = gather(a, -2, ridx4); + const auto& [valDesc, indDesc] = sort(a, /*axis=*/-1, /*descending=*/true); + const auto& [valAsc, indAsc] = sort(a, /*axis=*/-1, /*descending=*/false); + graph->forward(); CHECK(rval1 != gval1); @@ -1095,6 +1098,36 @@ void tests(DeviceType device, Type floatType = Type::float32) { gval4->val()->get(values); CHECK( values == vval4 ); + + std::vector vvalDesc = { 0.3333, 0, -0.2, + 4.5, 0, -0.3, + 101.45, 5.2, -10.0, + 1.05e-5, 0, -100.05 }; + valDesc->val()->get(values); + CHECK( values == vvalDesc ); + + std::vector vindDesc = { 1, 0, 2, + 2, 1, 0, + 2, 0, 1, + 2, 1, 0 }; + std::vector testVindDesc; + indDesc->val()->get(testVindDesc); + CHECK( testVindDesc == vindDesc ); + + std::vector vvalAsc = { -0.2, 0, 0.3333, + -0.3, 0, 4.5, + -10.0, 5.2, 101.45, + -100.05, 0, 1.05e-5 }; + valAsc->val()->get(values); + CHECK( values == vvalAsc ); + + std::vector vindAsc = { 2, 0, 1, + 0, 1, 2, + 1, 0, 2, + 0, 1, 2 }; + std::vector testVindAsc; + indAsc->val()->get(testVindAsc); + CHECK( testVindAsc == vindAsc ); } SECTION("cross entropy with label smoothing vs logsoftmax with gather") { @@ -1141,6 +1174,73 @@ void tests(DeviceType device, Type floatType = Type::float32) { CHECK( std::equal(values.begin(), values.end(), values2.begin(), floatApprox2) ); } + + SECTION("Scan operations") { + std::vector input = { + -0.1, -1.2, -0.4, + 1.2, 2.3, -3.4, + -2.2, 1.0, -1.2 + }; + + auto x = graph->constant({3, 3}, inits::fromVector(input)); + auto a = logcumsumexp(x, /*axis=*/-1); + auto b = logcumsumexp(x, /*axis=*/-2, /*reverse=*/false, /*exclusive=*/true); + + auto c = cumsum(x, /*axis=*/-1, /*reverse=*/false, /*exclusive=*/true); + auto d = cumsum(x, /*axis=*/-2, /*reverse=*/true); + + graph->forward(); + + CHECK(a->shape() == Shape({3, 3})); + CHECK(b->shape() == Shape({3, 3})); + + std::vector aValues = { + -0.1000, 0.1875, 0.6294, + 1.1992, 2.5859, 2.5879, + -2.1992, 1.0400, 1.1416 + }; + + T negInf = -std::numeric_limits::infinity(); + std::vector bValues = { + negInf, negInf, negInf, + -0.1f, -1.2f, -0.4f, + 1.44101f, 2.32975f, -0.35141f, + }; + + a->val()->get(values); + b->val()->get(values2); + + CHECK( std::equal(values.begin(), values.end(), + aValues.begin(), floatApprox2) ); + + CHECK( std::equal(values2.begin(), values2.end(), + bValues.begin(), floatApprox2) ); + + CHECK(c->shape() == Shape({3, 3})); + CHECK(d->shape() == Shape({3, 3})); + + std::vector cValues = { + 0, -0.1000, -1.3000, + 0, 1.2000, 3.5000, + 0, -2.2000, -1.2000, + }; + + std::vector dValues = { + -1.1, 2.1, -5.0, + -1.0, 3.3, -4.6, + -2.2, 1.0, -1.2 + }; + + c->val()->get(values); + d->val()->get(values2); + + CHECK( std::equal(values.begin(), values.end(), + cValues.begin(), floatApprox2) ); + + CHECK( std::equal(values2.begin(), values2.end(), + dValues.begin(), floatApprox2) ); + } + } #ifdef CUDA_FOUND diff --git a/src/tests/units/transformer_tests.cpp b/src/tests/units/transformer_tests.cpp new file mode 100644 index 000000000..4f7cc4d29 --- /dev/null +++ b/src/tests/units/transformer_tests.cpp @@ -0,0 +1,147 @@ +/* All or part of this file was contributed by NVIDIA under license: + * Copyright (C) 2020 NVIDIA Corporation + * SPDX-License-Identifier: MIT + */ 
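As a cross-check for the "Scan operations" expectations above (not part of this patch): a minimal scalar reference of logcumsumexp over one row. The actual kernels additionally subtract the global max for numeric stability unless fast=true; the scan identity is -inf, which is why the exclusive variant starts each row with -inf.

#include <cmath>
#include <cstddef>
#include <limits>
#include <vector>

// running log-sum-exp over a single row; exclusive shifts the result by one position
inline std::vector<float> refLogCumSumExp(const std::vector<float>& row, bool exclusive = false) {
  std::vector<float> out(row.size());
  float acc = -std::numeric_limits<float>::infinity();  // log(0), the scan identity
  for(std::size_t j = 0; j < row.size(); ++j) {
    if(exclusive)
      out[j] = acc;                                      // value before adding row[j]
    acc = std::log(std::exp(acc) + std::exp(row[j]));    // unstabilized accumulation
    if(!exclusive)
      out[j] = acc;
  }
  return out;
}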
+#include "catch.hpp" +#include "graph/expression_graph.h" +#include "graph/expression_operators.h" +#include "layers_new/transformer.h" + +#ifdef CUDA_FOUND +#include "tensors/gpu/backend.h" +#endif + +#include + +using namespace marian; + +template +void tests(DeviceType device, Type floatType = Type::float32) { + +// Checking for FP16 support and skipping if not supported. +#ifdef CUDA_FOUND + if(device == DeviceType::gpu && floatType == Type::float16) { + auto gpuBackend = New(DeviceId({0, device}), /*seed=*/1234); + auto cudaCompute = gpuBackend->getCudaComputeCapability(); + if(cudaCompute.major < 6) return; + } +#endif + + auto floatApprox = [](T x, T y) -> bool { return x == Approx(y).margin(0.001f); }; + auto floatApprox2 = [](T x, T y) -> bool { return x == Approx(y).margin(0.01f); }; + auto floatEqual = [](T x, T y) -> bool { return x == y; }; + + Config::seed = 4321; + auto graph = New(); + + graph->setInference(true); + graph->setDefaultElementType(floatType); + graph->setDevice({0, device}); + graph->reserveWorkspaceMB(16); + + std::vector values; + + SECTION("Test equivalence of layers and specialized operators") { + graph->clear(); + values.clear(); + + std::vector vecState = { + 0.82858741, 0.97615969, 0.67942131, 0.17952891, + 0.65630823, 0.38350773, 0.74830967, 0.67770803, + 0.00955211, 0.02345274, 0.02023151, 0.97143453, + 0.89971799, 0.50413132, 0.62781775, 0.59496081, + 0.14006306, 0.46450409, 0.91360050, 0.10497642, + 0.25477138, 0.63996094, 0.53658444, 0.88240266, + 0.37349635, 0.38880551, 0.18208119, 0.62951839, + 0.04330675, 0.59304160, 0.20436798, 0.74339235, + 0.32903627, 0.81596214, 0.44163024, 0.92444748, + 0.80231488, 0.52994978, 0.13350771, 0.40195912, + 0.55303711, 0.55137914, 0.98701674, 0.54963994, + 0.45657760, 0.57295781, 0.58645976, 0.74960953, + 0.77174628, 0.06652048, 0.68104792, 0.84806365, + 0.75292617, 0.82063907, 0.96599948, 0.63845992, + 0.47047511, 0.48726216, 0.95756608, 0.01479877, + 0.75449765, 0.55964196, 0.66664016, 0.34928808 + }; + + auto state = graph->constant({2, 2, 4, 4}, inits::fromVector(vecState)); + + using namespace marian::nn; + + auto rnn = New>(graph, state->shape()[-1], /*transformer-rnn-projection*/true); + auto output = rnn->apply(state); + + auto iProj = rnn->cell->iProj->weight; + auto iBias = rnn->cell->iProj->bias; + + auto fProj = rnn->cell->fProj->weight; + auto fBias = rnn->cell->fProj->bias; + + auto oProj = rnn->oProj->weight; + auto oBias = rnn->oProj->bias; + +#if 0 + debug(output, "output"); + + auto x = affine(state, iProj, iBias); + auto f = affine(state, fProj, fBias); + + auto ssruFwd = [=](Expr out, const std::vector& inputs) { + auto x = inputs[0]; + auto f = inputs[1]; + + SSRUScanForward(out->val(), x->val(), f->val()); + }; + + auto output2 = lambda({x, f}, x->shape(), x->value_type(), ssruFwd); + + output2 = relu(output2); + output2 = affine(output, oProj, oBias); + debug(output2, "output2"); +#endif + + graph->forward(); + + std::vector expected = { + -0.23135981, 0.04476057, 0.16183880, -0.13936377, + -0.47255400, -0.00786887, 0.10853745, -0.06822529, + -0.51970947, -0.10289559, -0.06798580, 0.10712720, + -0.58211476, -0.10762983, -0.06099827, 0.10525966, + -0.33873928, 0.07430670, 0.24815071, -0.21479189, + -0.50458324, -0.01065392, 0.11723585, -0.07428676, + -0.47146145, -0.07140756, -0.01806587, 0.05478236, + -0.49719882, -0.10403568, -0.07004700, 0.10721481, + -0.31213918, -0.07793316, -0.06812444, 0.09076738, + -0.26403564, -0.08575443, -0.10109652, 0.11913717, + -0.57269764, -0.03178894, 
0.08730030, -0.03967147, + -0.63041478, -0.07102037, 0.02447471, 0.02596882, + -0.40184090, -0.07519485, -0.04389046, 0.07439522, + -0.62908661, -0.03906321, 0.08765715, -0.03556710, + -0.54157418, 0.06784889, 0.27720353, -0.22676750, + -0.50410551, 0.02381870, 0.17982434, -0.13504542 + }; + + output->val()->get(values); + + CHECK(values.size() == expected.size()); + // CHECK(std::equal(values.begin(), values.end(), expected.begin(), floatApprox)); + } +} + +#ifdef CUDA_FOUND +TEST_CASE("Expression graph supports basic math operations (gpu)", "[operator]") { + tests(DeviceType::gpu); +} + +#if COMPILE_FP16 +TEST_CASE("Expression graph supports basic math operations (gpu fp16)", "[operator]") { + tests(DeviceType::gpu, Type::float16); +} +#endif +#endif + +#ifdef BLAS_FOUND +TEST_CASE("Expression graph supports basic math operations (cpu)", "[operator]") { + tests(DeviceType::cpu); +} +#endif diff --git a/src/translator/beam_search.cpp b/src/translator/beam_search.cpp index 1eeef913a..63aa0ec8f 100644 --- a/src/translator/beam_search.cpp +++ b/src/translator/beam_search.cpp @@ -79,7 +79,6 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current if(pathScore == INVALID_PATH_SCORE) // (dummy slot or word that cannot be expanded by current factor) continue; - ABORT_IF(pathScore < INVALID_PATH_SCORE, "Actual pathScore ({}) is lower than INVALID_PATH_SCORE ({})??", pathScore, INVALID_PATH_SCORE); // This should not happen in valid situations. Currently the only smaller value would be -inf (effect of overflow in summation?) ABORT_IF(beamHypIdx >= beam.size(), "Out of bounds beamHypIdx??"); // effectively this is equivalent to ABORT_IF(beams[origBatchIdx].empty(), ...) // map wordIdx to word @@ -450,19 +449,27 @@ Histories BeamSearch::search(Ptr graph, Ptr logProbs = states[i]->getLogProbs().getFactoredLogits(factorGroup, /*shortlist=*/ nullptr, hypIndices, maxBeamSize); // [maxBeamSize, 1, currentDimBatch, dimVocab] } // expand all hypotheses, [maxBeamSize, 1, currentDimBatch, 1] -> [maxBeamSize, 1, currentDimBatch, dimVocab] - if(i == 0) - stepScores = scorers_[i]->getWeight() * logProbs; - else + if(i == 0) { + stepScores = scorers_[i]->getWeight() * logProbs; + } else { stepScores = stepScores + scorers_[i]->getWeight() * logProbs; + } } + // we cast (ensembled) scores to float32, as accumulated them into path scores; + // also beneficial for sampling etc. + // @TODO:: consider doing this before ensembling + stepScores = cast(stepScores, Type::float32); + if(factorGroup == 0) { stepScores = distMod->force(stepScores, (int)t, (int)maxBeamSize, batchIndices); - stepScores = distMod->sample(stepScores); + stepScores = distMod->sample(stepScores, /*normalize=*/true); } // make beams continuous - auto expandedPathScores = prevPathScores + cast(stepScores, Type::float32); // will become [maxBeamSize, 1, currDimBatch, dimVocab] + auto expandedPathScores = prevPathScores + stepScores; // will become [maxBeamSize, 1, currDimBatch, dimVocab] + + // this transpose is required for the combined top-k search below expandedPathScores = swapAxes(expandedPathScores, 0, 2); // -> [currentDimBatch, 1, maxBeamSize, dimVocab] // perform NN computation diff --git a/src/translator/beam_search.h b/src/translator/beam_search.h index 75a9caeb0..0810b3332 100644 --- a/src/translator/beam_search.h +++ b/src/translator/beam_search.h @@ -18,9 +18,9 @@ class BeamSearch { const bool PURGE_BATCH = true; // @TODO: diagnostic, to-be-removed once confirmed there are no issues. 
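A minimal sketch (not part of this patch) of the headroom argument behind the chooseInvalidPathScore change just below: with float32 path-score accumulation, using lowest itself as the invalid marker overflows to -inf as soon as two such markers are summed, whereas lowest / 2 keeps the sum finite.

#include <cstdio>
#include <limits>

int main() {
  float lowest  = std::numeric_limits<float>::lowest();
  float invalid = lowest / 2.f;             // marker for finished/invalid beams
  std::printf("%g\n", lowest + lowest);     // -inf: overflows past the float range
  std::printf("%g\n", invalid + invalid);   // finite: exactly lowest
  return 0;
}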
static float chooseInvalidPathScore(Ptr options) { - auto prec = options->get>("precision", {"float32"}); - auto computeType = typeFromString(prec[0]); - return NumericLimits(computeType).lowest; + // We are now using float32 for accumulation along path scores, so we can just use float32 for the invalid scores + // Division by 2 to stay away from -inf. Here lowest / 2.f is bascially a magic number that marks finished beams. + return NumericLimits(Type::float32).lowest / 2.f; } public: diff --git a/src/translator/sampling.h b/src/translator/sampling.h index 4ac2063e9..2b13791d9 100644 --- a/src/translator/sampling.h +++ b/src/translator/sampling.h @@ -1,141 +1,240 @@ - namespace marian { - - class DistModifier { - private: - Ptr options_; - bool forceDecode_{false}; - bool sampling_{false}; - std::string samplingMethod_; - int topk_{10}; - float temperature_{1.f}; - - Ptr batch_; - float invalidPathScore_; - - Expr forceBatch_; +namespace marian { + +namespace sampling { + +// Prunning functions for sampling from the output distribution +// All functions take a logits tensor and return a tensor of the same shape and pruned values removed. +// The logits tensor is assumed to be in log-space (i.e. logprobs) and the returned tensor is also in log-space. +// The pruned distribution can be renormalized via logsoftmax to ensure that the sum of the probabilities is 1. +// However this doesn't matter much for sampling since the gumbel max trick works for unnormalized distributions. + +// Prune logits via top-k pruning +Expr topkPruning(Expr scores, int k, bool normalize = false) { + Expr val, idx; + + // note, for around k>200 topk is slower on the GPU than sorting and then selecting the top-k values + std::tie(val, idx) = topk(scores, k, /*axis=*/-1, /*descending=*/true); + if(normalize) + val = logsoftmax(val); // renormalize via logsoftmax + + // Scatter gumbelled values back into logits to fill with usable values + auto invalid = constant_like(scores, inits::fromValue(std::log(0.f))); + return scatter(invalid, /*axis=*/-1, idx, val); +} + +// Prune logits via nucleus pruning +Expr nucleusPruning(Expr scores, float threshold, bool normalize = false) { + // normalization would make sense here since we compare against a meaningful threshold and + // we don't know what other manipulations have been done to the logits before, but + // leaving it to the user for now. We do set it to true in beam_search.cpp + if(normalize) + scores = logsoftmax(scores); // renormalize via logsoftmax + + // sort scores in descending order, this way we can use the cumulative sum to find the nucleus + Expr val, idx; + std::tie(val, idx) = sort(scores, /*axis=*/-1, /*descending=*/true); + + // logcumsumexp because we have logprobs, exclusive because we keep at least the first element + // we can skip the numerical stability trick here since we are in log-space + auto lcse = logcumsumexp(val, /*axis=*/-1, /*reverse=*/false, /*exclusive=*/true, /*fast=*/true); + + // mask out all values that for which the cumulative sum is larger than the threshold (i.e. 
they are outside the nucleus) + auto lcseMask = log(le(lcse, std::log(threshold))); + val = minimum(val, lcseMask); // mask out all values outside the nucleus + + if(normalize) + val = logsoftmax(val); // renormalize via logsoftmax + + // scatter the masked values back into the correct positions (undo sorting) + return scatter(scores, /*axis=*/-1, idx, val); +} + +// Prune logits via epsilon pruning +Expr epsilonPruning(Expr scores, float epsilon, bool normalize = false) { + // normalization would make sense here since we compare against a meaningful threshold and + // we don't know what other manipulations have been done to the logits before + if(normalize) + scores = logsoftmax(scores); // renormalize via logsoftmax + + // make sure the epsilon is not larger than the largest value in the scores + // otherwise we will mask out all values + // equivalent to union of top-1 and log(epsilon) + auto safeThreshold = minimum(max(scores, /*axis=*/-1), std::log(epsilon)); + + // create mask for all values that are smaller than the epsilon + auto logEpsMask = log(ge(scores, safeThreshold)); // -inf for all values smaller than epsilon + auto logEpsScores = minimum(scores, logEpsMask); // mask out all values smaller than epsilon + + if(normalize) + logEpsScores = logsoftmax(logEpsScores); // renormalize after masking via logsoftmax + return logEpsScores; +} + +Expr gumbelMaxTrick(Expr scores, float temperature) { + // scale scores by temperature + if(temperature != 1.f) + scores = scores / temperature; + // add Gumbel noise to all values and renormalize via logsoftmax + return logsoftmax(scores + constant_like(scores, inits::gumbel())); +} +} // namespace sampling + +class DistModifier { +private: + Ptr options_; + bool forceDecode_{false}; + + bool sampling_{false}; + std::function samplingFn_; + + Ptr batch_; + float invalidPathScore_; + + Expr forceBatch_; + +public: + DistModifier(Ptr options, Ptr batch, float invalidPathScore) : + options_(options), forceDecode_(options_->get("force-decode", false)), + batch_(batch), invalidPathScore_(invalidPathScore) { - public: - DistModifier(Ptr options, Ptr batch, float invalidPathScore) : - options_(options), forceDecode_(options_->get("force-decode", false)), - batch_(batch), invalidPathScore_(invalidPathScore) { - - if(options_->hasAndNotEmpty("output-sampling")) { + if(options_->hasAndNotEmpty("output-sampling")) { + sampling_ = true; + auto samplingOpts = options_->get>("output-sampling", {}); + std::string samplingMethod = samplingOpts.size() > 0 ? samplingOpts[0] : "full"; + + if(samplingMethod == "0") { // for backcompat with boolean values + sampling_ = false; + samplingMethod = ""; + } else if(samplingMethod == "1") { // for backcompat with boolean values sampling_ = true; - auto samplingOpts = options_->get>("output-sampling", {}); - samplingMethod_ = samplingOpts.size() > 0 ? 
samplingOpts[0] : "full"; - if(samplingMethod_ == "0") { // for backcompat with boolean values - sampling_ = false; - samplingMethod_ = ""; - } else if(samplingMethod_ == "1") { // for backcompat with boolean values - sampling_ = true; - samplingMethod_ = "full"; - } - - if(samplingMethod_ == "full") { - if(samplingOpts.size() > 1) - temperature_ = std::stof(samplingOpts[1]); - } - - if(samplingMethod_ == "topk") { - if(samplingOpts.size() > 1) - topk_ = std::stoi(samplingOpts[1]); - if(samplingOpts.size() > 2) - temperature_ = std::stof(samplingOpts[2]); - } + samplingMethod = "full"; + } + + if(samplingMethod == "full") { + float temperature = 1.f; + if(samplingOpts.size() > 1) + temperature = std::stof(samplingOpts[1]); + + LOG_ONCE(info, "Output sampling from the full softmax distribution with temperature {}", temperature); + + samplingFn_ = [temperature](Expr logits, bool normalize = false) { + // full softmax sampling is just gumbel trick with temperature 1 and optional prior renormalization + return sampling::gumbelMaxTrick(normalize ? logsoftmax(logits) : logits, temperature); + }; + } else if(samplingMethod == "topk") { + int topk = 10; // number of top-k values to sample from + float temperature = 1.f; + if(samplingOpts.size() > 1) + topk = std::stoi(samplingOpts[1]); + if(samplingOpts.size() > 2) + temperature = std::stof(samplingOpts[2]); + + LOG_ONCE(info, "Output sampling via top-{} sampling with temperature {}", topk, temperature); + + samplingFn_ = [topk, temperature](Expr logits, bool normalize = false) { + // top-k sampling is just gumbel trick with temperature 1 and top-k pruning + return sampling::gumbelMaxTrick(sampling::topkPruning(logits, topk, normalize), temperature); + }; + } else if(samplingMethod == "nucleus") { + float threshold = 0.9f; // probability mass threshold of nucleus + float temperature = 1.f; + if(samplingOpts.size() > 1) + threshold = std::stof(samplingOpts[1]); + if(samplingOpts.size() > 2) + temperature = std::stof(samplingOpts[2]); + + LOG_ONCE(info, "Output sampling via nucleus sampling with threshold {} temperature {}", threshold, temperature); + + samplingFn_ = [threshold, temperature](Expr logits, bool normalize = false) { + // nucleus sampling is just gumbel trick with temperature 1 and nucleus pruning + return sampling::gumbelMaxTrick(sampling::nucleusPruning(logits, threshold, normalize), temperature); + }; + } else if(samplingMethod == "epsilon") { + float eps = 0.02f; // mimimal probability of sampled token + float temperature = 1.f; + if(samplingOpts.size() > 1) + eps = std::stof(samplingOpts[1]); + if(samplingOpts.size() > 2) + temperature = std::stof(samplingOpts[2]); + + LOG_ONCE(info, "Output sampling via epsilon sampling with eps {} and temperature {}", eps, temperature); + + samplingFn_ = [eps, temperature](Expr logits, bool normalize = false) { + // epsilon sampling is just gumbel trick with temperature 1 and epsilon pruning + return sampling::gumbelMaxTrick(sampling::epsilonPruning(logits, eps, normalize), temperature); + }; + } else { + ABORT("Unknown sampling method: {}", samplingMethod); } } - - Expr force(Expr scores, int pos, int beamSize, std::vector& batchIndices) { - // we check the last field of the batch for force-decoding content - int dimTime = (int)batch_->back()->batchWidth(); - if(!forceDecode_ || pos >= dimTime) // nothing to force-decode, just return original scores - return scores; - - LOG_ONCE(info, "Force-decoding with given prefixes"); - // if we get here, then we have to do force-decoding. 
We do this by "softly" modifying the scores and passing the - // result to the normal top-k/beam search. "Softly" here means we add masking terms rather than making hard selections - // which preserves the original tensor layout. - // This allows for beam-search and batched force-decoding with different length prefixes in a batch - // (way harder to do with actual index manipulation). We then return modified (masked) probabilities to the beam-search - // which then continues as normal on the modified distribution. - - if(!forceBatch_) { - // turn the batch into a cached tensor that lives in the computation graph - std::vector forceWords; - for(auto& word : batch_->back()->data()) - forceWords.push_back(word.toWordIndex()); - - int dimBatch = (int)batch_->back()->batchSize(); - forceBatch_ = scores->graph()->constant({1, dimTime, dimBatch, 1}, inits::fromVector(forceWords), Type::uint32); // [1, dimTime, dimBatch, 1] - } - - // if we remove batch entries during decoding (finished decoding) then adjust here - if(forceBatch_->shape()[-2] != batchIndices.size()) - forceBatch_ = index_select(forceBatch_, -2, batchIndices); - - // get vocab index and probability for force-decoded tokens for the current time step - Expr forceIndices = slice(forceBatch_, /*axis=*/-3, pos); // [1, 1, dimBatch, 1] - Expr forceVals = gather(scores, /*axis=*/-1, forceIndices); // [1, 1, dimBatch, 1] - - // create dummy indices and values for beam entries other then the force-decoded value. This is required to ensure that the beam - // does not collapse for hyps outside the forced hyps and can still do full beam-search once we finish force-decoding for a batch - // entry. We initialize randomly (they are not going to be used anyway due to very low prob) and shift by 1 to have 0 at first postion. - int dimVocab = scores->shape()[-1]; - auto graph = scores->graph(); - // we start at 256 to skip over suppressed special words in SentencePiece @TODO: this should be somehow inferred. - Expr dummyIndices = shift(graph->constant({1, 1, 1, beamSize}, inits::uniform(256.f, (float)dimVocab)), {0, 0, 0, 1}, 0.f); - // we use a range of invalidPathScore_ to invalidPathScore_ / 2 to make sure that the probabilities stay low, but larger than invalidPathScore_ itself. - Expr dummyVals = shift(graph->constant({1, 1, 1, beamSize}, inits::uniform(invalidPathScore_, invalidPathScore_ / 2.f)), {0, 0, 0, 1}, 0.f); - - // here we add the force-decoded entries back into the zeroed positions - dummyIndices = cast(cast(dummyIndices, Type::float32) + cast(forceIndices, Type::float32), Type::uint32); - dummyVals = dummyVals + forceVals; - - // create a tensor of the same size as the original logits, initialize with invalidPathScore and then scatter the force-decoded and - // dummy values into the correct positions. - Expr forcedScores = constant_like(scores, inits::fromValue(invalidPathScore_)); - forcedScores = scatter(forcedScores, -1, dummyIndices, dummyVals); - - // for entries that have finished force-decoding (the batch has eosId as vocab id) use the original logits for the whole batch entry - // via interpolating by a selector. In marian eosId is used for padding, so this works everywhere and eos for unfinished hyps means - // free decoding or sampling. 
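For background on the sampling helpers introduced above (not part of this patch): gumbelMaxTrick relies on the Gumbel-max property, i.e. adding independent Gumbel(0,1) noise to possibly unnormalized log-scores and taking the argmax draws an index with probability softmax(score / temperature). A scalar sketch with illustrative names:

#include <cmath>
#include <limits>
#include <random>
#include <vector>

// draws one index proportionally to softmax(logScores / temperature)
inline int gumbelMaxSample(const std::vector<float>& logScores, float temperature, std::mt19937& rng) {
  std::uniform_real_distribution<float> uniform(1e-8f, 1.f);  // avoid log(0)
  int best = 0;
  float bestVal = -std::numeric_limits<float>::infinity();
  for(int i = 0; i < (int)logScores.size(); ++i) {
    float gumbel = -std::log(-std::log(uniform(rng)));        // Gumbel(0,1) sample
    float v = logScores[i] / temperature + gumbel;
    if(v > bestVal) { bestVal = v; best = i; }
  }
  return best;
}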
- WordIndex eosId = batch_->back()->vocab()->getEosId().toWordIndex(); - auto interpol = eq(cast(forceIndices, scores->value_type()), (float)eosId); - return interpol * scores + (1.f - interpol) * forcedScores; + } + + Expr force(Expr scores, int pos, int beamSize, std::vector& batchIndices) { + // we check the last field of the batch for force-decoding content + int dimTime = (int)batch_->back()->batchWidth(); + if(!forceDecode_ || pos >= dimTime) // nothing to force-decode, just return original scores + return scores; + + LOG_ONCE(info, "Force-decoding with given prefixes"); + // if we get here, then we have to do force-decoding. We do this by "softly" modifying the scores and passing the + // result to the normal top-k/beam search. "Softly" here means we add masking terms rather than making hard selections + // which preserves the original tensor layout. + // This allows for beam-search and batched force-decoding with different length prefixes in a batch + // (way harder to do with actual index manipulation). We then return modified (masked) probabilities to the beam-search + // which then continues as normal on the modified distribution. + + if(!forceBatch_) { + // turn the batch into a cached tensor that lives in the computation graph + std::vector forceWords; + for(auto& word : batch_->back()->data()) + forceWords.push_back(word.toWordIndex()); + + int dimBatch = (int)batch_->back()->batchSize(); + forceBatch_ = scores->graph()->constant({1, dimTime, dimBatch, 1}, inits::fromVector(forceWords), Type::uint32); // [1, dimTime, dimBatch, 1] } - Expr sample(Expr scores) { - if(sampling_) { - if(temperature_ != 1.f) - scores = scores / temperature_; - - if(samplingMethod_ == "full") { - LOG_ONCE(info, "Output sampling from the full softmax distribution with temperature {}", temperature_); - return logsoftmax(scores + constant_like(scores, inits::gumbel())); - } else if(samplingMethod_ == "topk") { - if(topk_ == 1) - LOG_ONCE(info, "Output sampling with k=1 is equivalent to beam search with beam size 1"); - LOG_ONCE(info, "Output sampling via top-{} sampling with temperature {}", topk_, temperature_); - - Expr invalidLogits = constant_like(scores, inits::fromValue(invalidPathScore_)); - - // select top-k values - Expr val, idx; - std::tie(val, idx) = topk(scores, topk_, /*axis=*/-1, /*descending=*/true); - - // Add Gumbel noise to top-k values only and compute logsoftmax, used for argmax sampling later in beam-search - Expr gumbelVal = logsoftmax(val + constant_like(val, inits::gumbel())); - - // Scatter gumbelled values back into logits to fill with usable values - return scatter(invalidLogits, -1, idx, gumbelVal); - } else { - ABORT("Unknown sampling method: {}", samplingMethod_); - } - } else { // no sampling - return scores; - } + // if we remove batch entries during decoding (finished decoding) then adjust here + if(forceBatch_->shape()[-2] != batchIndices.size()) + forceBatch_ = index_select(forceBatch_, -2, batchIndices); + + // get vocab index and probability for force-decoded tokens for the current time step + Expr forceIndices = slice(forceBatch_, /*axis=*/-3, pos); // [1, 1, dimBatch, 1] + Expr forceVals = gather(scores, /*axis=*/-1, forceIndices); // [1, 1, dimBatch, 1] + + // create dummy indices and values for beam entries other then the force-decoded value. This is required to ensure that the beam + // does not collapse for hyps outside the forced hyps and can still do full beam-search once we finish force-decoding for a batch + // entry. 
We initialize randomly (they are not going to be used anyway due to very low prob) and shift by 1 to have 0 at first postion. + int dimVocab = scores->shape()[-1]; + auto graph = scores->graph(); + // we start at 256 to skip over suppressed special words in SentencePiece @TODO: this should be somehow inferred. + Expr dummyIndices = shift(graph->constant({1, 1, 1, beamSize}, inits::uniform(256.f, (float)dimVocab)), {0, 0, 0, 1}, 0.f); + // we use a range of invalidPathScore_ to invalidPathScore_ / 2 to make sure that the probabilities stay low, but larger than invalidPathScore_ itself. + Expr dummyVals = shift(graph->constant({1, 1, 1, beamSize}, inits::uniform(invalidPathScore_, invalidPathScore_ / 2.f)), {0, 0, 0, 1}, 0.f); + + // here we add the force-decoded entries back into the zeroed positions + dummyIndices = cast(cast(dummyIndices, Type::float32) + cast(forceIndices, Type::float32), Type::uint32); + dummyVals = dummyVals + forceVals; + + // create a tensor of the same size as the original logits, initialize with invalidPathScore and then scatter the force-decoded and + // dummy values into the correct positions. + Expr forcedScores = constant_like(scores, inits::fromValue(invalidPathScore_)); + forcedScores = scatter(forcedScores, -1, dummyIndices, dummyVals); + + // for entries that have finished force-decoding (the batch has eosId as vocab id) use the original logits for the whole batch entry + // via interpolating by a selector. In marian eosId is used for padding, so this works everywhere and eos for unfinished hyps means + // free decoding or sampling. + WordIndex eosId = batch_->back()->vocab()->getEosId().toWordIndex(); + auto interpol = eq(cast(forceIndices, scores->value_type()), (float)eosId); + return interpol * scores + (1.f - interpol) * forcedScores; + } + + Expr sample(Expr scores, bool normalize = false) { + if(sampling_) { + return samplingFn_(scores, normalize); + } else { // no sampling + return scores; } + } +}; - }; - - } \ No newline at end of file +} \ No newline at end of file diff --git a/src/translator/translator.h b/src/translator/translator.h index f1fd04d3f..498ef65b3 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -125,12 +125,8 @@ class Translate : public ModelTask { if(options_->hasAndNotEmpty("output-sampling")) { if(options_->get("beam-size") > 1) LOG(warn, - "[warning] Output sampling and beam search (beam-size > 1) are contradictory methods " - "and using them together is not recommended. Set beam-size to 1"); - if(options_->get>("models").size() > 1) - LOG(warn, - "[warning] Output sampling and model ensembling are contradictory methods and using " - "them together is not recommended. Use a single model"); + "[warning] Enabling output sampling and beam search together (--output-sampling [...] && --beam-size > 1) results in so-called stochastic beam-search. " + "Are you sure this is desired? For normal sampling, use --beam-size 1."); } } From 5e47ab2ac4c916ab6f687598f81e19bdf326a509 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sat, 6 Jan 2024 01:54:17 +0000 Subject: [PATCH 06/26] Merged PR 32433: Fix Logmask in BLEURT model This adjusts the logmask computation to match the implementation in COMET-QE model after the ALIBI refactoring. 
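For reference, the fix routes the BLEURT encoder mask through the same additive log-mask path used by the COMET-QE encoder: instead of multiplying activations with a binary time/batch mask, the mask is converted into an additive term (0 for visible positions, a very large negative value for padding) that is added to the attention logits. A minimal standalone illustration of that transformation, not the actual Marian maskProcessor code (the helper name and the -1e9 constant are placeholders):

#include <vector>

// Turn a binary key mask (1 = real token, 0 = padding) into an additive
// log-mask: 0 leaves a logit unchanged, a large negative value drives the
// post-softmax attention weight of padded positions to (almost) zero.
std::vector<float> toLogMask(const std::vector<float>& binaryMask,
                             float maskValue = -1e9f) {
  std::vector<float> logMask(binaryMask.size());
  for(size_t i = 0; i < binaryMask.size(); ++i)
    logMask[i] = binaryMask[i] > 0.f ? 0.f : maskValue;
  return logMask;
}

In the diff below this corresponds roughly to what maskProcessor->apply(output, binaryMask) produces before the result is handed to the transformer layers.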
--- CHANGELOG.md | 1 + VERSION | 2 +- src/models/bleurt.h | 50 +++++++++++++++++++++++---------------------- 3 files changed, 28 insertions(+), 25 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 51df73b57..83c05ac4c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - New experimental layer framework for Transformer-like models. ### Fixed +- Fixed BLEURT logmask computation - Fixed wrong paramter name for norm in new layer framework - Fixed unit test for LayerNorm - Only collect batch statistics during mini-batch-fit up to actual max-length. diff --git a/VERSION b/VERSION index 658123368..5235dd6a9 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.16 +v1.12.17 diff --git a/src/models/bleurt.h b/src/models/bleurt.h index baeb704a5..74848b788 100644 --- a/src/models/bleurt.h +++ b/src/models/bleurt.h @@ -12,11 +12,11 @@ class BleurtTypeEmbeddingLayer : public nn::LayerWithOptions { public: Expr embeddings; - BleurtTypeEmbeddingLayer(Ptr graph, Ptr options) + BleurtTypeEmbeddingLayer(Ptr graph, Ptr options) : LayerWithOptions(graph, options) {} virtual ~BleurtTypeEmbeddingLayer() = default; - + Expr apply(Ptr subBatch) const { int dimEmb = opt("dim-emb"); int dimTypes = opt("bert-type-vocab-size", 2); @@ -27,7 +27,7 @@ class BleurtTypeEmbeddingLayer : public nn::LayerWithOptions { const auto& words = subBatch->data(); const auto vocab = subBatch->vocab(); - + // Get word id of special symbols Word sepId = vocab->getEosId(); @@ -55,10 +55,10 @@ class BleurtTypeEmbeddingLayer : public nn::LayerWithOptions { struct BleurtEncoder final : public nn::TransformerEncoder { Ptr eProj; - BleurtEncoder(Ptr graph, - Ptr options) + BleurtEncoder(Ptr graph, + Ptr options) : TransformerEncoder(graph, options) { - + eProj = New(graph, opt("transformer-dim-model")); registerLayer(eProj); @@ -68,33 +68,35 @@ struct BleurtEncoder final : public nn::TransformerEncoder { Expr apply(Expr input, Expr mask) const override { auto output = marian::nn::swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] - mask = marian::nn::swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] - + + auto binaryMask = marian::nn::swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] + auto logMask = maskProcessor->apply(output, binaryMask); // [beam depth=1, batch size * numHeads, max length, vector dim=1] + // apply positional embeddings to contextual input output = positionEmbedding->apply(output); // apply dropout or layer-norm to embeddings if required output = preprocessor->apply(output); - + // scale from 256 to 1152 output = eProj->apply(output); - + // traverse the layers, use the same mask for each for(auto layer : *layers) - output = layer->apply(output, mask); + output = layer->apply(output, logMask); return output; } }; // Wrapper for backwards compatibility that uses current encoder/decoder framework -struct BleurtBatchEncoder final : public nn::LayerWithOptions, +struct BleurtBatchEncoder final : public nn::LayerWithOptions, public nn::IEmbeddingLayer, // TransformerBatchEncoder is an IEmbeddingLayer that produces contextual embeddings public EncoderBase { // @TODO: should all encoders be IEmbeddingLayer? 
Ptr typeEmbedding; Ptr encoder; - - BleurtBatchEncoder(Ptr graph, + + BleurtBatchEncoder(Ptr graph, Ptr options) : LayerWithOptions(graph, options), EncoderBase(graph, options) @@ -110,7 +112,7 @@ struct BleurtBatchEncoder final : public nn::LayerWithOptions, virtual std::tuple apply(Ptr subBatch) const override { auto embeddingLayer = getEmbeddingLayer(EncoderBase::opt("ulr", false)); const auto& [batchEmbeddings, batchMask] = embeddingLayer->apply(subBatch); - + #if 1 auto typeEmbeddings = typeEmbedding->apply(subBatch); auto embeddings = batchEmbeddings + typeEmbeddings; @@ -142,12 +144,12 @@ struct BleurtBatchEncoder final : public nn::LayerWithOptions, EncoderBase::graph_ = graph; setGraph(graph); // This makes sure that the graph passed into the model during construction and now evaluation are identical. - // A good check to have for catching weird situations early. + // A good check to have for catching weird situations early. ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match"); #endif // @TODO: this needs to convert to a BERT-batch - + const auto& [batchEmbedding, batchMask] = apply((*batch)[batchIndex_]); return New(batchEmbedding, batchMask, batch); } @@ -157,7 +159,7 @@ struct BleurtBatchEncoder final : public nn::LayerWithOptions, } }; -class BleurtPooler final : public nn::LayerWithOptions, +class BleurtPooler final : public nn::LayerWithOptions, public PoolerBase { private: Ptr layers; @@ -167,7 +169,7 @@ class BleurtPooler final : public nn::LayerWithOptions, BleurtPooler(Ptr graph, Ptr options) : LayerWithOptions(graph, options), PoolerBase(graph, options) { - + float dropoutProb = 0.f; layers = New( graph, @@ -176,7 +178,7 @@ class BleurtPooler final : public nn::LayerWithOptions, New(graph, dropoutProb), New(graph, 1) ); - + registerLayer(layers); } @@ -186,15 +188,15 @@ class BleurtPooler final : public nn::LayerWithOptions, PoolerBase::graph_ = graph; setGraph(graph); // This makes sure that the graph passed into the model during construction and now evaluation are identical. - // A good check to have for catching weird situations early. + // A good check to have for catching weird situations early. ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match"); #endif auto modelType = LayerWithOptions::opt("type"); - + auto emb = slice(encoderStates[0]->getContext(), -2, 0); emb = marian::cast(emb, Type::float32); - + Expr output; if(LayerWithOptions::opt("usage") == (int)models::usage::evaluating) { output = layers->apply(emb); @@ -202,7 +204,7 @@ class BleurtPooler final : public nn::LayerWithOptions, output = reshape(output, {dimBatch, 1, 1}); return { output }; } else { - ABORT("Usage other than evaluating not implemented"); + ABORT("Usage other than evaluating not implemented"); } } From fa06754f0fff1b49e6668c88eab86ff350e1e6a9 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Wed, 17 Jan 2024 22:14:41 +0000 Subject: [PATCH 07/26] Merged PR 32547: Add support for sparsemax and comet-22 (not kiwi yet) This adds a sparsemax function and support for COMET-22 ref-based metric. Worth adding a regression test for Unbabel/wmt22-comet-da model later. Scores seem to be pretty much identical to PyTorch implementation when running as float32. 
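Sparsemax (Martins & Astudillo, 2016) is the Euclidean projection of a score vector onto the probability simplex, so unlike softmax it can assign exact zeros to low-scoring entries. A minimal standalone sketch of the computation over a plain std::vector, shown only as an illustration; the graph operator added below expresses the same steps with sort, cumsum and index_select:

#include <algorithm>
#include <functional>
#include <numeric>
#include <vector>

std::vector<float> sparsemaxRef(const std::vector<float>& z) {
  const int K = (int)z.size();

  // sort scores in descending order and build their running sum
  std::vector<float> zSorted(z);
  std::sort(zSorted.begin(), zSorted.end(), std::greater<float>());
  std::vector<float> cumSum(K);
  std::partial_sum(zSorted.begin(), zSorted.end(), cumSum.begin());

  // support size: largest k with 1 + k * z_(k) > sum_{j<=k} z_(j)
  int k = 0;
  for(int j = 1; j <= K; ++j)
    if(1.f + j * zSorted[j - 1] > cumSum[j - 1])
      k = j;

  // threshold tau, then project: p_i = max(z_i - tau, 0)
  float tau = (cumSum[k - 1] - 1.f) / k;
  std::vector<float> p(K);
  for(int i = 0; i < K; ++i)
    p[i] = std::max(z[i] - tau, 0.f);
  return p;
}

In the COMET-22 layer mixing this replaces the softmax over the per-layer weights when --comet-mix-transformation sparsemax is set; the rest of the mixing code stays unchanged.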
--- CHANGELOG.md | 2 + VERSION | 2 +- scripts/comet/comet2marian.py | 26 ++++++----- src/common/config_parser.cpp | 13 +++--- src/graph/expression_operators.cpp | 37 +++++++++++++-- src/graph/expression_operators.h | 15 ++++-- src/models/comet_qe.h | 75 ++++++++++++++++-------------- src/models/encoder_pooler.h | 5 +- 8 files changed, 110 insertions(+), 65 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 83c05ac4c..3e4a170a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Fixed compilation with clang 16.0.6 ### Added +- Added implementation of COMET-22 (reference-based) model and conversion +- Added sparsemax operator (slow version) - Added sampling variants nucleus and epsilon, e.g. `--output-sampling nucleus 0.9` and `--output-sampling epsilon 0.02`, respectively. - Added ALIBI related options to new layer framework. - Added `--no-spm-encode` option, allowing the model to use vocabulary IDs directly to train/decode. diff --git a/VERSION b/VERSION index 5235dd6a9..5c911e82d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.17 +v1.12.18 diff --git a/scripts/comet/comet2marian.py b/scripts/comet/comet2marian.py index d5f86a95f..6b4f557db 100755 --- a/scripts/comet/comet2marian.py +++ b/scripts/comet/comet2marian.py @@ -14,7 +14,7 @@ # supported_comets = [m for m in available_metrics if 'qe' in m.lower()] supported_comets = [ 'wmt20-comet-qe-da', 'wmt20-comet-qe-da-v2', 'wmt21-comet-qe-mqm', 'wmt21-comet-qe-da', - 'wmt20-comet-da', 'wmt21-comet-da' + 'wmt20-comet-da', 'wmt21-comet-da', 'Unbabel/wmt22-comet-da' ] log.basicConfig(level=log.INFO) @@ -32,7 +32,7 @@ def load_from_huggingface(model_id): log.info(f"Loading transformer model from huggingface {model_id}") from transformers import AutoModel, AutoTokenizer try: - model = AutoModel.from_pretrained(model_id, add_pooling_layer=False) + model = AutoModel.from_pretrained(model_id, add_pooling_layer=False) AutoTokenizer.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) return model.eval(), getattr(tokenizer, 'vocab_file', None) @@ -53,7 +53,7 @@ def load_comet_model(model_path): log.info(f"Loading COMET model from checkpoint {model_path}") comet_model = load_from_checkpoint(model_path) comet_model.eval() - + vocab_file = None try: pretrained_model = comet_model.hparams.get('pretrained_model') @@ -106,6 +106,11 @@ def load_comet_model(model_path): config["bert-train-type-embeddings"] = False config["bert-type-vocab-size"] = 0 config["comet-prepend-zero"] = True + +config["comet-mix"] = cometModel.hparams.get("layer") == "mix" +config["comet-mix-norm"] = cometModel.hparams.get('layer_norm', False) +config["comet-mix-transformation"] = cometModel.hparams.get("layer_transformation", "softmax"); + if not args.roberta: config["comet-final-sigmoid"] = args.add_sigmoid config["comet-pooler-ffn"] = [2048, 1024] @@ -155,15 +160,15 @@ def extract(layer, nth, level): blockPrefix = f"{prefix}->encoder->layers->at({nth})->as()->selfAttentionBlock" - # self-attention + # self-attention # query transformation convert(pd, ["attention.self.query.weight"], f"{blockPrefix}->selfAttention->qProj->weight") convert(pd, ["attention.self.query.bias"], f"{blockPrefix}->selfAttention->qProj->bias", bias=True) - + # key transformation convert(pd, ["attention.self.key.weight"], f"{blockPrefix}->selfAttention->kProj->weight") convert(pd, ["attention.self.key.bias"], f"{blockPrefix}->selfAttention->kProj->bias", bias=True) - + # values 
transformation convert(pd, ["attention.self.value.weight"], f"{blockPrefix}->selfAttention->vProj->weight") convert(pd, ["attention.self.value.bias"], f"{blockPrefix}->selfAttention->vProj->bias", bias=True) @@ -176,7 +181,7 @@ def extract(layer, nth, level): convert(pd, ["attention.output.LayerNorm.weight"], f"{blockPrefix}->postprocessor->norm->weight", bias=True) convert(pd, ["attention.output.LayerNorm.bias"], f"{blockPrefix}->postprocessor->norm->bias", bias=True) - # ffn + # ffn # first ffn layer blockPrefix = f"{prefix}->encoder->layers->at({nth})->as()->filterBlock" @@ -206,7 +211,7 @@ def extract(layer, nth, level): marianModel["Wemb"] = npWemb prefix = "CometEncoder" - + # shift position embeddings so that we are back at 512 items and start at 0 npPos = pd["position_embeddings.weight"].detach().numpy() npPos = npPos[2:, :].copy() @@ -234,9 +239,6 @@ def extract(layer, nth, level): # gamma for weird batch/layer-norm step in pooler/encoder of COMET # @TODO: make optional marianModel["CometEncoder->encoder->gamma"] = pd["gamma"].detach().numpy().copy() - config["comet-mix"] = True - config["comet-mix-norm"] = True - elif name == "FeedForward": for n, p in layer.named_parameters(): @@ -262,7 +264,7 @@ def extract(layer, nth, level): convert(pd, ["ff.3.bias"], f"{prefix}->layers->at(3)->as()->bias", bias=True) convert(pd, ["ff.6.weight"], f"{prefix}->layers->at(6)->as()->weight") - convert(pd, ["ff.6.bias"], f"{prefix}->layers->at(6)->as()->bias", bias=True) + convert(pd, ["ff.6.bias"], f"{prefix}->layers->at(6)->as()->bias", bias=True) else: recurse(layer, level + 1) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index d797b8e2d..ec85e40ad 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -331,11 +331,11 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { cli.add("--transformer-disable-position-embeddings", "Do not add any position embeddings. Use e.g. with --transformer-attention-mask alibi"); - cli.add("--transformer-alibi-trainable", + cli.add("--transformer-alibi-trainable", "Make alibi slopes trainable, default slopes are constant"); // handy shortcut for the current best setup - cli.add("--alibi", + cli.add("--alibi", "Use alibi settings for transformer, this is a shortcut for --transformer-attention-mask alibi --transformer-alibi-shift --transformer-disable-position-embeddings --separator-symbol [eos]"); cli.alias("alibi", "true", [](YAML::Node& config) { // define current-best alibi settings @@ -361,9 +361,10 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { // Options specific for the "comet-qe" model type cli.add("--comet-final-sigmoid", "Add final sigmoid to COMET model"); cli.add("--comet-stop-grad", "Do not propagate gradients through COMET model"); - + cli.add("--comet-mix", "Mix encoder layers to produce embedding"); cli.add("--comet-mix-norm", "Normalize layers prior to mixing"); + cli.add("--comet-mix-transformation", "Which transformation to apply to layer mixing (softmax [default] or sparsemax)", "softmax"); cli.add("--comet-dropout", "Dropout for pooler layers", 0.1f); cli.add("--comet-mixup", "Alpha parameter for Beta distribution for mixup", 0.0f); cli.add("--comet-mixup-reg", "Use original and mixed-up samples in training"); @@ -418,7 +419,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { "Do not create model checkpoints, only overwrite main model file with last checkpoint. 
" "Reduces disk usage"); cli.add("--overwrite-checkpoint", - "When --overwrite=false (default) only model files get written at saving intervals (with iterations numbers). " + "When --overwrite=false (default) only model files get written at saving intervals (with iterations numbers). " "Setting --overwrite-checkpoint=false also saves full checkpoints checkpoints with optimizer parameters, etc. " "Uses (a lot) more disk space.", true); @@ -604,7 +605,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { "Dynamic cost scaling for mixed precision training: " "scaling factor, frequency, multiplier, minimum factor") ->implicit_val("8.f 10000 1.f 8.f"); - + cli.add>("--throw-on-divergence", "Throw exception if training diverges. Divergence is detected if the running average loss over arg1 steps " "is exceeded by the running average loss over arg2 steps (arg1 >> arg2) by arg3 standard deviations") @@ -617,7 +618,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { "If fp16 training diverges and throws try to continue training with fp32 precision"); cli.alias("fp16-fallback-to-fp32", "true", [](YAML::Node& config) { // use default custom-fallbacks to handle DivergenceException for fp16 - config["custom-fallbacks"] = std::vector({ + config["custom-fallbacks"] = std::vector({ YAML::Load("{fp16 : false, precision: [float32, float32], cost-scaling: []}") }); }); diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index 47da511cf..c6245636c 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -102,8 +102,7 @@ Expr operator-(Expr a) { return Expression(a); }; -Expr softmax(Expr a, int axis /*=-1*/) -{ +Expr softmax(Expr a, int axis /*=-1*/) { // @TODO: move axis parameter down into the kernel if (axis != -1) { @@ -129,6 +128,34 @@ Expr logsoftmax(Expr a) { return Expression(a); } +// based on https://proceedings.mlr.press/v48/martins16.pdf for k equal to full dimension. 
+Expr sparsemax(Expr z, int axis/*=-1*/) { + // we currently assume that k == modelDim and that we apply the sparse max to the last dimension + + auto graph = z->graph(); + + int dimk = z->shape()[axis]; // assuming axis==-1 for dimension comments + Type fType = z->value_type(); + + // cast to float32 for better precision + auto z32 = cast(z, Type::float32); // [dimBatch, dimTime, dimk] + + const auto& [zSorted, zIndices] = sort(z32, /*axis=*/axis, /*descending=*/true); + auto zCumSum = cumsum(zSorted, /*axis=*/axis); // [dimBatch, dimTime, dimk] + + auto k = graph->constant({dimk}, inits::range(1.f, (float)(dimk + 1)), Type::float32); + auto kMask = gt(1.f + k * zSorted, zCumSum); // [dimBatch, dimTime, dimk] + auto kMax = max(kMask * k, /*axis=*/axis); // [dimBatch, dimTime, 1] + auto kMaxIdx = cast(kMax - 1.f, Type::uint32); // [dimBatch, dimTime, 1] + auto zNum = index_select(zCumSum, /*axis=*/axis, kMaxIdx); // [dimBatch, dimTime, 1] + auto tau = (zNum - 1.f) / kMax; // [dimBatch, dimTime, 1] + + auto zSparsemax = maximum(z32 - tau, 0.f); // [dimBatch, dimTime, dimk] + + // cast back to original type + return cast(zSparsemax, fType); +} + /*********************************************************/ Expr operator+(Expr a, Expr b) { @@ -308,15 +335,15 @@ Expr operator/(float a, Expr b) { // @TODO: implement proper operators for all three: Expr pow(float a, Expr b) { - return exp(std::log(a) * b); + return exp(std::log(a) * b); } Expr pow(Expr a, float b) { - return exp(log(a) * b); + return exp(log(a) * b); } Expr pow(Expr a, Expr b) { - return exp(log(a) * b); + return exp(log(a) * b); } /*********************************************************/ diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index 82d8726c5..685ef0ebf 100644 --- a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -503,7 +503,7 @@ Expr bdot(Expr a, float scalar = 1.f); /** - * bdot_legacy is an old implemetation of bdot without correct broadcasting on the batch dimensions, + * bdot_legacy is an old implemetation of bdot without correct broadcasting on the batch dimensions, * to be removed once the behavior can be correctly replicated with normal bdot on 5 dimensions. */ Expr bdot_legacy(Expr a, @@ -739,7 +739,7 @@ Expr gather(Expr a, int axis, Expr indices); * @param a The input expression * @param axis The axis along which to index * @param indices The indices to be scattered - * @param source Expression with values to scatter. + * @param source Expression with values to scatter. * @returns Scattered expression with the same shape as @p a now containing values from @p source in positions @p indices * @note @p source and @p indices must have the same rank * @note In this version @p source and @p indicies must have the same shape @@ -929,6 +929,11 @@ Expr softmax(Expr a, Expr zeroOneMask, int axis = -1); */ Expr logsoftmax(Expr a); +/** + * Compute a sparsemax along the last axis. Slow implementation but differentiable. +*/ +Expr sparsemax(Expr a, int axis = -1); + /** * Computes the cross-entropy loss. * @param labelSmoothingAlpha The amount of label smoothing @f$\alpha \in [0,1]@f$. @@ -966,9 +971,9 @@ Expr weighted_average(Expr in, Expr weights, int ax = 0); Expr layerNorm(Expr x, Expr gamma = nullptr, Expr beta = nullptr, float eps = 1e-9); /** - * Applies RMS normalization over the last dimension. - * - * See: Biao Zhang; Rico Sennrich (2019). Root Mean Square Layer Normalization. + * Applies RMS normalization over the last dimension. 
+ * + * See: Biao Zhang; Rico Sennrich (2019). Root Mean Square Layer Normalization. * In Advances in Neural Information Processing Systems 32. Vancouver, Canada. * @f[ \frac{x}{\sqrt{\frac{1}{N}\sum x^2 + \mathrm{eps}}} \times \gamma + \beta diff --git a/src/models/comet_qe.h b/src/models/comet_qe.h index 868f7d6e9..aa335696d 100644 --- a/src/models/comet_qe.h +++ b/src/models/comet_qe.h @@ -15,6 +15,7 @@ class CometEncoder final : public nn::TransformerEncoder { // models trained by us, but required when doing inference with Unbabel models. Expr cometNorm(Expr x, Expr binaryMask) const { Expr output; + if(opt("comet-mix-norm", false)) { registerParameterLazy(gamma, Shape({ 1 }), inits::ones()); int dimModel = x->shape()[-1]; @@ -23,7 +24,7 @@ class CometEncoder final : public nn::TransformerEncoder { Type origType = x->value_type(); x = marian::cast(x, Type::float32); binaryMask = marian::cast(binaryMask, Type::float32); - + x = x * binaryMask; auto denom = (float)dimModel * sum(binaryMask, -2); auto mu = sum(sum(x, -1), -2) / denom; // sum over model and time @@ -34,8 +35,11 @@ class CometEncoder final : public nn::TransformerEncoder { // Undo conversion to fp32 if not originally fp32 (most likely fp16 then) output = marian::cast(output, origType); - } else { + } else if(opt("comet-mix", false)) { // average over time dimension + registerParameterLazy(gamma, Shape({ 1 }), inits::ones()); + output = gamma * sum(x * binaryMask, -2) / sum(binaryMask, -2); + } else { output = sum(x * binaryMask, -2) / sum(binaryMask, -2); } @@ -46,15 +50,15 @@ class CometEncoder final : public nn::TransformerEncoder { Expr weights; Expr gamma; - CometEncoder(Ptr graph, - Ptr options) + CometEncoder(Ptr graph, + Ptr options) : TransformerEncoder(graph, options) {} Expr apply(Expr input, Expr mask) const override { auto output = marian::nn::swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] - + auto binaryMask = marian::nn::swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] - + // apply positional embeddings to contextual input output = positionEmbedding->apply(output); @@ -75,7 +79,10 @@ class CometEncoder final : public nn::TransformerEncoder { if(opt("comet-mix", false)) { registerParameterLazy(weights, Shape({ opt("enc-depth") + 1 }), inits::zeros()); - auto weightsNorm = reshape(softmax(weights), {weights->shape()[-1], 1}); + // comet22 has a sparsemax here + auto normFn = opt("comet-mix-transformation", "softmax"); + auto weightsNorm = (normFn == "sparsemax") ? sparsemax(weights) : softmax(weights); + weightsNorm = reshape(weightsNorm, {weights->shape()[-1], 1}); output = sum(weightsNorm * concatenate(pooler, /*axis=*/-2), -2); // [batch, 1, modelDim] } else { // just use last layer, average over time dim @@ -87,12 +94,12 @@ class CometEncoder final : public nn::TransformerEncoder { }; // Wrapper for backwards compatibility that uses current encoder/decoder framework -struct CometBatchEncoder final : public nn::LayerWithOptions, +struct CometBatchEncoder final : public nn::LayerWithOptions, public nn::IEmbeddingLayer, // TransformerBatchEncoder is an IEmbeddingLayer that produces contextual embeddings public EncoderBase { // @TODO: should all encoders be IEmbeddingLayer? 
Ptr encoder; - CometBatchEncoder(Ptr graph, + CometBatchEncoder(Ptr graph, Ptr options) : LayerWithOptions(graph, options), EncoderBase(graph, options) @@ -131,10 +138,10 @@ struct CometBatchEncoder final : public nn::LayerWithOptions, EncoderBase::graph_ = graph; setGraph(graph); // This makes sure that the graph passed into the model during construction and now evaluation are identical. - // A good check to have for catching weird situations early. + // A good check to have for catching weird situations early. ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match"); #endif - + const auto& [batchEmbedding, batchMask] = apply((*batch)[batchIndex_]); return New(batchEmbedding, batchMask, batch); } @@ -145,7 +152,7 @@ struct CometBatchEncoder final : public nn::LayerWithOptions, }; // Dummpy pooler that only returns the encoder context -class CometEmbeddingPooler final : public nn::LayerWithOptions, +class CometEmbeddingPooler final : public nn::LayerWithOptions, public PoolerBase { public: CometEmbeddingPooler(Ptr graph, Ptr options) @@ -159,12 +166,12 @@ class CometEmbeddingPooler final : public nn::LayerWithOptions, return { encoderStates[0]->getContext() }; } - + void clear() override {} }; // Actual COMET-like pooler, works for COMET-QE and COMET models (prior to WMT22) -class CometMetricPooler final : public nn::LayerWithOptions, +class CometMetricPooler final : public nn::LayerWithOptions, public PoolerBase { private: Ptr layers; @@ -174,7 +181,7 @@ class CometMetricPooler final : public nn::LayerWithOptions, CometMetricPooler(Ptr graph, Ptr options) : LayerWithOptions(graph, options), PoolerBase(graph, options) { - + float dropoutProb = LayerWithOptions::opt("comet-dropout", 0.1f); auto ffnHidden = LayerWithOptions::opt>("comet-pooler-ffn", {2048, 1024}); @@ -188,7 +195,7 @@ class CometMetricPooler final : public nn::LayerWithOptions, if(LayerWithOptions::opt("comet-final-sigmoid")) layers->append(New(graph)); - + registerLayer(layers); } @@ -198,7 +205,7 @@ class CometMetricPooler final : public nn::LayerWithOptions, PoolerBase::graph_ = graph; setGraph(graph); // This makes sure that the graph passed into the model during construction and now evaluation are identical. - // A good check to have for catching weird situations early. + // A good check to have for catching weird situations early. 
ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match"); #endif @@ -213,10 +220,10 @@ class CometMetricPooler final : public nn::LayerWithOptions, auto mixup = [&](Expr x, Expr y, float alpha, bool reg=true) -> Expr2 { if(alpha == 0.f) return {x, y}; - + int dimBatch = x->shape()[-3]; Type xType = x->value_type(); - + std::vector indices(dimBatch); std::iota(indices.begin(), indices.end(), 0); @@ -246,7 +253,7 @@ class CometMetricPooler final : public nn::LayerWithOptions, int dimBatch = src->shape()[-3]; float badRatio = LayerWithOptions::opt("comet-augment-bad", 0.f); dimBad = (int)std::ceil(dimBatch * badRatio); // use ceiling to make sure it's at least 1 - + if(dimBad > 0) { LOG_ONCE(info, "Adding {:.1f} percent of bad examples to batch with label 0.0f", badRatio * 100); @@ -259,7 +266,7 @@ class CometMetricPooler final : public nn::LayerWithOptions, indicesSrc.resize(dimBad); // shrink to size auto srcSub = index_select(src, -3, indicesSrc); src = concatenate({src, srcSub}, /*axis=*/-3); - + std::iota(indicesMt.begin(), indicesMt.end(), 0); // permute the indices and select batch entries accordingly std::shuffle(indicesMt.begin(), indicesMt.end(), rng); @@ -277,7 +284,7 @@ class CometMetricPooler final : public nn::LayerWithOptions, auto modelType = LayerWithOptions::opt("type"); ABORT_IF(modelType == "comet-qe" && encoderStates.size() != 2, "Pooler expects exactly two encoder states for comet-qe"); ABORT_IF(modelType == "comet" && encoderStates.size() != 3, "Pooler expects exactly three encoder states for comet"); - + if(modelType == "comet-qe") { auto src = encoderStates[0]->getContext(); auto mt = encoderStates[1]->getContext(); @@ -296,7 +303,7 @@ class CometMetricPooler final : public nn::LayerWithOptions, src = get<0>(srcMt); mt = get<1>(srcMt); } - + auto diff = abs(mt - src); auto prod = mt * src; @@ -313,10 +320,10 @@ class CometMetricPooler final : public nn::LayerWithOptions, return { output }; } else { auto emb = concatenate({mt, src, prod, diff}, /*axis=*/-1); // [batch, 1, model] - + auto softLabelsWords = batch->front()->data(); auto classVocab = batch->front()->vocab(); - + // we add bad examples to the batch, so we need to make sure the soft labels are padded accordingly with 0s int dimBatch = (int)softLabelsWords.size() + dimBad; std::vector softLabels; @@ -340,12 +347,12 @@ class CometMetricPooler final : public nn::LayerWithOptions, output = marian::cast(layers->apply(emb), Type::float32); return { output, labels }; - } + } } else if(modelType == "comet") { auto src = encoderStates[0]->getContext(); auto mt = encoderStates[1]->getContext(); auto ref = encoderStates[2]->getContext(); - + auto diffRef = abs(mt - ref); auto prodRef = mt * ref; @@ -361,7 +368,7 @@ class CometMetricPooler final : public nn::LayerWithOptions, return { output }; } else { // Currently no training for COMET with reference @TODO: add training - ABORT("Usage other than 'evaluating' not implemented"); + ABORT("Usage other than 'evaluating' not implemented"); } } else { ABORT("Unknown model type {}", modelType); @@ -380,7 +387,7 @@ class CometLoss final : public ICost { public: CometLoss(Ptr options) - : options_(options), inference_(options->get("inference", false)), + : options_(options), inference_(options->get("inference", false)), rescore_(options->get("cost-type", "ce-sum") == "ce-rescore") { } Ptr apply(Ptr model, @@ -391,7 +398,7 @@ class CometLoss final : public ICost { auto corpusBatch = std::static_pointer_cast(batch); auto inputTypes = 
options_->get>("input-types", {}); - ABORT_IF(inputTypes != std::vector({"class", "sequence", "sequence"}), + ABORT_IF(inputTypes != std::vector({"class", "sequence", "sequence"}), "Expected input-types to be have fields (class, sequence, sequence)"); ABORT_IF(corpusBatch->sets() != 3, "Expected 3 sub-batches, not {}", corpusBatch->sets()); @@ -416,9 +423,9 @@ class CometLoss final : public ICost { } else { ABORT("Unknown loss type {} for COMET training", lossType); } - + auto encoded = encpool->apply(graph, corpusBatch, clearGraph); - + Expr x = encoded[0]; Expr y = encoded[1]; auto loss = lossFn(x, y); @@ -428,9 +435,9 @@ class CometLoss final : public ICost { int dimBatch = loss->shape()[-3]; if(rescore_) loss = reshape(loss, {1, dimBatch, 1}); - else + else loss = sum(loss, /*axis=*/-3); // [1, 1, 1] - + Ptr multiLoss = New(); RationalLoss lossPiece(loss, (float)dimBatch); multiLoss->push_back(lossPiece); diff --git a/src/models/encoder_pooler.h b/src/models/encoder_pooler.h index 0a781c9d5..b89f85c9e 100644 --- a/src/models/encoder_pooler.h +++ b/src/models/encoder_pooler.h @@ -7,8 +7,8 @@ #include "models/model_base.h" #include "models/states.h" -// @TODO: this introduces functionality to use LASER in Marian for the filtering workflow or for use in MS-internal -// COSMOS server-farm. There is a lot of code duplication with Classifier and EncoderDecoder and this needs to be fixed. +// @TODO: this introduces functionality to use LASER in Marian for the filtering workflow or for use in MS-internal +// COSMOS server-farm. There is a lot of code duplication with Classifier and EncoderDecoder and this needs to be fixed. // This will be done after the new layer system has been finished. namespace marian { @@ -163,6 +163,7 @@ class EncoderPooler : public EncoderPoolerBase { modelFeatures_.insert("comet-final-sigmoid"); modelFeatures_.insert("comet-mix"); modelFeatures_.insert("comet-mix-norm"); + modelFeatures_.insert("comet-mix-transformation"); } virtual Ptr getOptions() override { return options_; } From 7dcebfb924375973f68f094443792706ae7b813a Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sat, 20 Jan 2024 23:30:53 +0000 Subject: [PATCH 08/26] Merged PR 32567: Refactoring of Graph loading and mmapping interface This is a rewrite of the graph loading and memory-mapping functionality. We now mmap and share oportunistically, i.e. whenever it is possible: * with cpu-decoding and *.bin files everything will be automatically mmapped * with *.npz files the model will be read only once. * on the GPU *.bin will be mmapped but still copied to GPU, ideally omitting CPU memory. This quite drastically reduces unnecessary CPU memory overhead and loading time for things like COMET scoring. 
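With this refactoring, call sites construct a single io::ModelWeights per model file and pass it to every graph; loading is lazy and happens at most once, *.bin files are memory-mapped when possible, and *.npz files fall back to a normal read. A rough usage sketch modeled on the call sites changed in this patch (names and signatures are illustrative, simplified from the diff):

#include <string>
#include <vector>

#include "common/io.h"
#include "graph/expression_graph.h"

namespace marian {

// Share one lazily loaded ModelWeights object across several graphs so the
// model file is read (or memory-mapped) only once.
void loadIntoGraphs(const std::string& modelPath,
                    std::vector<Ptr<ExpressionGraph>>& graphs) {
  // OpportunisticMmap is the default: *.bin files are memory-mapped,
  // *.npz files quietly fall back to a plain read from disk.
  auto weights = New<io::ModelWeights>(modelPath, io::MmapMode::OpportunisticMmap);
  for(auto& graph : graphs)
    graph->load(weights);  // items are parsed on first access
}

}  // namespace marian

A ModelWeights object can also be constructed from a raw memory buffer, and MmapMode::RequiredMmap makes loading abort instead of silently falling back when mapping is not possible.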
--- CHANGELOG.md | 1 + VERSION | 2 +- src/command/marian_conv.cpp | 14 +- src/common/binary.cpp | 14 +- src/common/config.cpp | 6 +- src/common/io.cpp | 194 +++++++++++++------- src/common/io.h | 80 +++++++- src/common/shape.h | 2 +- src/embedder/embedder.h | 16 +- src/evaluator/evaluator.h | 31 ++-- src/examples/mnist/model.h | 7 +- src/graph/expression_graph.cpp | 4 +- src/graph/expression_graph.h | 88 ++++----- src/graph/node_initializers.cpp | 39 ++-- src/microsoft/cosmos.cpp | 27 +-- src/microsoft/quicksand.cpp | 60 +++--- src/microsoft/quicksand.h | 9 +- src/models/amun.h | 80 ++++---- src/models/costs.h | 36 +--- src/models/encoder_classifier.h | 32 +--- src/models/encoder_decoder.cpp | 20 +- src/models/encoder_decoder.h | 32 +--- src/models/encoder_pooler.h | 36 +--- src/models/model_base.h | 22 +-- src/models/nematus.h | 78 ++++---- src/models/transformer_factory.h | 36 ++-- src/rescorer/rescorer.h | 8 +- src/tensors/cpu/expression_graph_packable.h | 6 +- src/tensors/tensor.cpp | 8 +- src/training/graph_group.cpp | 107 ++++++----- src/training/graph_group.h | 5 +- src/training/validator.cpp | 6 +- src/translator/scorers.cpp | 88 ++------- src/translator/scorers.h | 58 +----- src/translator/translator.h | 49 ++--- 35 files changed, 597 insertions(+), 704 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e4a170a3..772349e3d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,6 +40,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Correct defaults for factored embeddings such that shared library use works (move out of config.h/cpp). ### Changed +- Refactoring of model loading, mmapping happens now opportunistically, --mmap-models for decoding forces mmap and croaks if not possible. - Removed --num-devices N option that wasn't really used by anyone (I assume). diff --git a/VERSION b/VERSION index 5c911e82d..2f107c43d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.18 +v1.12.19 diff --git a/src/command/marian_conv.cpp b/src/command/marian_conv.cpp index 12412a238..31f47946e 100644 --- a/src/command/marian_conv.cpp +++ b/src/command/marian_conv.cpp @@ -24,9 +24,9 @@ int main(int argc, char** argv) { cli->add("--to,-t", "Output model", "model.bin"); cli->add("--export-as", "Kind of conversion: marian-bin or onnx-{encode,decoder-step,decoder-init,decoder-stop}", "marian-bin"); cli->add("--gemm-type,-g", "GEMM Type to be used: float32, packed16, packed8avx2, packed8avx512, " - "intgemm8, intgemm8ssse3, intgemm8avx2, intgemm8avx512, intgemm16, intgemm16sse2, intgemm16avx2, intgemm16avx512", + "intgemm8, intgemm8ssse3, intgemm8avx2, intgemm8avx512, intgemm16, intgemm16sse2, intgemm16avx2, intgemm16avx512", "float32"); - cli->add>("--add-lsh", + cli->add>("--add-lsh", "Encode output matrix and optional rotation matrix into model file. 
" "arg1: number of bits in LSH encoding, arg2: name of output weights matrix")->implicit_val("1024 Wemb"); cli->add>("--vocabs,-V", "Vocabulary file, required for ONNX export"); @@ -69,21 +69,23 @@ int main(int argc, char** argv) { if(lshParams.size() > 1) lshOutputWeights = lshParams[1]; } - + // We accept any type here and will later croak during packAndSave if the type cannot be used for conversion Type saveGemmType = typeFromString(options->get("gemm-type", "float32")); LOG(info, "Outputting {}, precision: {}", modelTo, saveGemmType); - YAML::Node config; + + auto modelFile = New(modelFrom, io::MmapMode::DontMmap); + YAML::Node config = modelFile->getYamlFromModel(); std::stringstream configStr; - marian::io::getYamlFromModel(config, "special:model.yml", modelFrom); + configStr << config; if (exportAs == "marian-bin") { auto graph = New(); graph->setDevice(CPU0); - graph->load(modelFrom); + graph->load(modelFile); std::vector toBeLSHed; if(addLsh) { diff --git a/src/common/binary.cpp b/src/common/binary.cpp index 6bb90c508..0041275c5 100644 --- a/src/common/binary.cpp +++ b/src/common/binary.cpp @@ -19,7 +19,7 @@ struct Header { uint64_t dataLength; }; -// cast current void pointer to T pointer and move forward by num elements +// cast current void pointer to T pointer and move forward by num elements template const T* get(const void*& current, uint64_t num = 1) { const T* ptr = (const T*)current; @@ -48,9 +48,9 @@ void loadItems(const void* current, std::vector& items, bool mapped) { // read in actual shape and data for(int i = 0; i < numHeaders; ++i) { uint64_t len = headers[i].shapeLength; - items[i].shape.resize(len); + items[i].shape.resize(len); const int* arr = get(current, len); // read shape - std::copy(arr, arr + len, items[i].shape.begin()); // copy to Item::shape + std::copy(arr, arr + len, items[i].shape.begin()); // copy to Item::shape } // move by offset bytes, aligned to 256-bytes boundary @@ -64,8 +64,8 @@ void loadItems(const void* current, std::vector& items, bool mapped) { items[i].type = cpu::integer::getIntgemmType(Type::intgemm8); } if(items[i].mapped) { // memory-mapped, hence only set pointer - // @TOOD: verify this actually works for the hardware-specific ones like intgemm8avx2 - ABORT_IF(items[i].type == Type::intgemm8 || items[i].type == Type::intgemm16, "mmap format not supported for hardware non-specific intgemm matrices"); + if(items[i].type == Type::intgemm8 || items[i].type == Type::intgemm16) + throw MarianRuntimeException("mmap format not supported for hardware non-specific intgemm matrices", getCallStack(/*skipLevels=*/0)); items[i].ptr = get(current, headers[i].dataLength); } else { // reading into item data uint64_t len = headers[i].dataLength; @@ -170,8 +170,8 @@ void saveItems(const std::string& fileName, // Write out all values for(const auto& item : items) - pos += out.write(item.data(), item.bytes.size()); // writes out data with padding, keeps 256-byte boundary. - // Amazingly this is binary-compatible with V1 and aligned and + pos += out.write(item.data(), item.bytes.size()); // writes out data with padding, keeps 256-byte boundary. + // Amazingly this is binary-compatible with V1 and aligned and // non-aligned models can be read with the same procedure. // No version-bump required. Gets 5-8% of speed back when mmapped. 
} diff --git a/src/common/config.cpp b/src/common/config.cpp index efdd29c12..20ef6e046 100644 --- a/src/common/config.cpp +++ b/src/common/config.cpp @@ -170,15 +170,13 @@ void Config::save(const std::string& name) { } bool Config::loadModelParameters(const std::string& name) { - YAML::Node config; - io::getYamlFromModel(config, "special:model.yml", name); + auto config = New(name)->getYamlFromModel(); override(config); return true; } bool Config::loadModelParameters(const void* ptr) { - YAML::Node config; - io::getYamlFromModel(config, "special:model.yml", ptr); + auto config = New(ptr)->getYamlFromModel(); override(config); return true; } diff --git a/src/common/io.cpp b/src/common/io.cpp index 6a7be6a36..109b3a1ed 100644 --- a/src/common/io.cpp +++ b/src/common/io.cpp @@ -1,12 +1,15 @@ #include "common/io.h" #include "3rd_party/cnpy/cnpy.h" +#include "common/definitions.h" #include "common/shape.h" #include "common/types.h" #include "common/binary.h" #include "common/io_item.h" +#include "training/communicator.h" + namespace marian { namespace io { @@ -20,78 +23,73 @@ bool isBin(const std::string& fileName) { && fileName.substr(fileName.length() - 4) == ".bin"; } -void getYamlFromNpz(YAML::Node& yaml, - const std::string& varName, - const std::string& fileName) { - auto item = cnpy::npz_load(fileName, varName); - if(item->size() > 0) - yaml = YAML::Load(item->data()); +ModelWeights::FileType ModelWeights::getFileType(const std::string& fileName) { + if(isNpz(fileName)) + return FileType::isNpz; + else if(isBin(fileName)) + return FileType::isBin; + else + ABORT("Unknown file format for file {}", fileName); } -void getYamlFromBin(YAML::Node& yaml, - const std::string& varName, - const std::string& fileName) { - auto item = binary::getItem(fileName, varName); - if(item.size() > 0) - yaml = YAML::Load(item.data()); -} - -void getYamlFromModel(YAML::Node& yaml, - const std::string& varName, - const std::string& fileName) { - if(io::isNpz(fileName)) { - io::getYamlFromNpz(yaml, varName, fileName); - } else if(io::isBin(fileName)) { - io::getYamlFromBin(yaml, varName, fileName); - } else { - ABORT("Unknown model file format for file {}", fileName); - } +std::vector& ModelWeights::items() { + load(); + return items_; } -void getYamlFromModel(YAML::Node& yaml, - const std::string& varName, - const void* ptr) { - auto item = binary::getItem(ptr, varName); - if(item.size() > 0) - yaml = YAML::Load(item.data()); +const std::vector& ModelWeights::items() const { + const_cast(*this).load(); + return items_; } -// Load YAML from item -void getYamlFromModel(YAML::Node& yaml, - const std::string& varName, - const std::vector& items) { - for(auto& item : items) { - if(item.name == varName) { - yaml = YAML::Load(item.data()); - return; - } - } +const void* ModelWeights::data() const { + const_cast(*this).load(); + switch (fileType_) { + case FileType::isNpz: + return nullptr; + case FileType::isBin: + return mmap_->data(); + case FileType::isBuf: + return ptr_; + case FileType::isDummy: + ABORT("Cannot get data from dummy model"); + default: + ABORT("Unknown file type"); + } } -void addMetaToItems(const std::string& meta, - const std::string& varName, - std::vector& items) { - Item item; - item.name = varName; - - // increase size by 1 to add \0 - item.shape = Shape({(int)meta.size() + 1}); - - item.bytes.resize(item.shape.elements()); - std::copy(meta.begin(), meta.end(), item.bytes.begin()); - // set string terminator - item.bytes.back() = '\0'; - - item.type = Type::int8; +size_t ModelWeights::size() 
const { + const_cast(*this).load(); + switch (fileType_) { + case FileType::isNpz: + return 0; + case FileType::isBin: + return mmap_->size(); + case FileType::isBuf: + ABORT("Cannot get size of buffer"); + case FileType::isDummy: + ABORT("Cannot get size from dummy model"); + default: + ABORT("Unknown file type"); + } +} - items.push_back(item); +// @TODO: bring back fast peeking into the file to get config +// Load YAML from item +YAML::Node ModelWeights::getYamlFromModel(const std::string& varName) const { + const_cast(*this).load(); + for(auto& item : items_) { + if(item.name == varName) { + return YAML::Load(item.data()); + } + } + return YAML::Node(); } void loadItemsFromNpz(const std::string& fileName, std::vector& items) { auto numpy = cnpy::npz_load(fileName); for(auto it : numpy) { - ABORT_IF( - it.second->fortran_order, "Numpy item '{}' is not stored in row-major order", it.first); + ABORT_IF(it.second->fortran_order, "Numpy item '{}' is not stored in row-major order", it.first); Shape shape; shape.resize(it.second->shape.size()); @@ -122,7 +120,7 @@ void loadItemsFromNpz(const std::string& fileName, std::vector& items) { } } -std::vector loadItems(const std::string& fileName) { +std::vector ModelWeights::loadItems(const std::string& fileName) { std::vector items; if(isNpz(fileName)) { loadItemsFromNpz(fileName, items); @@ -135,16 +133,61 @@ std::vector loadItems(const std::string& fileName) { return items; } -std::vector loadItems(const void* ptr) { +std::vector ModelWeights::mmapItems(const void* ptr) { std::vector items; - binary::loadItems(ptr, items, false); + binary::loadItems(ptr, items, true); return items; } -std::vector mmapItems(const void* ptr) { - std::vector items; - binary::loadItems(ptr, items, true); - return items; +void ModelWeights::load() { + std::lock_guard lock(mutex_); + if(loaded_) + return; + + switch (fileType_) { + case FileType::isNpz: + loadItemsFromNpz(fileName_, items_); + break; + case FileType::isBin: + if(mmapMode_ == MmapMode::DontMmap) { + binary::loadItems(fileName_, items_); + } else { + try { + mmap_.reset(new mio::mmap_source(fileName_)); + binary::loadItems(mmap_->data(), items_, /*mapped=*/true); + } catch(const MarianRuntimeException& e) { + if(mmapMode_ == MmapMode::RequiredMmap) + ABORT("Could not memory-map file '{}': {}", fileName_, e.what()); + else + LOG(warn, "[warning] Could not memory-map file '{}' ({}), falling back to reading from disk", fileName_, e.what()); + mmapMode_ = MmapMode::DontMmap; + binary::loadItems(fileName_, items_); + } + } + break; + case FileType::isBuf: + binary::loadItems(ptr_, items_, /*mapped=*/mmapMode_ != MmapMode::DontMmap); + break; + case FileType::isDummy: + ABORT("Cannot load from dummy model"); + default: + ABORT("Unknown file type"); + } + + loaded_ = true; +} + +void ModelWeights::loadAndSync(Ptr mpi) { + ABORT_IF(!mpi, "MPI wrapper is null"); + ABORT_IF(mmapMode_ != MmapMode::DontMmap, "Mmapping not allowed"); + + if(mpi->isMainProcess()) + load(); + + mpi->bCast(fileName_); + mpi->bCast(&fileType_, 1, mpi->getDataType((size_t*)&fileType_)); + mpi->bCast(&loaded_, 1, mpi->getDataType(&loaded_)); + mpi->bCast(items_); } // @TODO: make cnpy and our wrapper talk to each other in terms of types @@ -167,12 +210,31 @@ void saveItemsNpz(const std::string& fileName, const std::vector& items) { else if(item.type == Type::uint32) type = cnpy::map_type(typeid(uint32_t)); else if(item.type == Type::uint64) type = cnpy::map_type(typeid(uint64_t)); else ABORT("Other types ({}) not supported", item.type); - 
+ npzItems.emplace_back(item.name, item.bytes, shape, type, sizeOf(item.type)); } cnpy::npz_save(fileName, npzItems); } +void addMetaToItems(const std::string& meta, + const std::string& varName, + std::vector& items) { + Item item; + item.name = varName; + + // increase size by 1 to add \0 + item.shape = Shape({(int)meta.size() + 1}); + + item.bytes.resize(item.shape.elements()); + std::copy(meta.begin(), meta.end(), item.bytes.begin()); + // set string terminator + item.bytes.back() = '\0'; + + item.type = Type::int8; + + items.push_back(item); +} + void saveItems(const std::string& fileName, const std::vector& items) { if(isNpz(fileName)) { saveItemsNpz(fileName, items); diff --git a/src/common/io.h b/src/common/io.h index 3f340ed2f..1db0a83fe 100644 --- a/src/common/io.h +++ b/src/common/io.h @@ -1,6 +1,12 @@ #pragma once +#ifndef NOMINMAX +#define NOMINMAX +#endif + +#include "3rd_party/mio/mio.hpp" #include "3rd_party/yaml-cpp/yaml.h" +#include "common/definitions.h" #include "common/io_item.h" #include @@ -14,28 +20,84 @@ // CPU decoding. namespace marian { + +struct IMPIWrapper; + namespace io { +enum struct MmapMode { OpportunisticMmap, DontMmap, RequiredMmap }; + bool isNpz(const std::string& fileName); bool isBin(const std::string& fileName); -void getYamlFromModel(YAML::Node& yaml, const std::string& varName, const std::string& fileName); -void getYamlFromModel(YAML::Node& yaml, const std::string& varName, const void* ptr); -void getYamlFromModel(YAML::Node& yaml, const std::string& varName, const std::vector& items); +class ModelWeights { +private: + std::mutex mutex_; + + std::string fileName_; + const void* ptr_{nullptr}; + + enum struct FileType : size_t { isNpz, isBin, isBuf, isDummy }; + FileType fileType_{FileType::isNpz}; + FileType getFileType(const std::string& fileName); + + MmapMode mmapMode_{MmapMode::OpportunisticMmap}; + + bool loaded_{false}; + + std::vector items_; + std::unique_ptr mmap_; + + std::vector loadItems(const std::string& fileName); + std::vector mmapItems(const void* ptr); + + void load(); + +public: + ModelWeights(const std::string& fileName, MmapMode mmapMode = MmapMode::OpportunisticMmap) + : fileName_(fileName), fileType_(getFileType(fileName)), mmapMode_(mmapMode) { + // NPZ files cannot be memory-mapped, so we switch opportunistic mmap off, but keep any other mmap mode + if(fileType_ == FileType::isNpz && mmapMode_ == MmapMode::OpportunisticMmap) + mmapMode_ = MmapMode::DontMmap; + + // so we can croak here for NPZ files if the user sets mmap to required + ABORT_IF(fileType_ == FileType::isNpz && mmapMode_ != MmapMode::DontMmap, "NPZ files cannot be memory-mapped"); + } + + ModelWeights(const void* ptr, MmapMode mmapMode = MmapMode::RequiredMmap) + : ptr_(ptr), fileType_(FileType::isBuf), mmapMode_(mmapMode) {} + + ModelWeights() + : fileType_(FileType::isDummy), mmapMode_{MmapMode::DontMmap} {} + + ModelWeights(const ModelWeights&&) = delete; + ModelWeights(const ModelWeights&) = delete; + + std::vector& items(); + const std::vector& items() const; + + MmapMode mmapMode() const { + return mmapMode_; + } + const void* data() const; + size_t size() const; + + YAML::Node getYamlFromModel(const std::string& varName = "special:model.yml") const; + + void loadAndSync(Ptr mpi); +}; + +// for saving we keep the old interface since there is no intelligence going on here and it is useful +// to be able to assemble a set of items in different places. 
void addMetaToItems(const std::string& meta, const std::string& varName, std::vector& items); -std::vector loadItems(const std::string& fileName); -std::vector loadItems(const void* ptr); - -std::vector mmapItems(const void* ptr); - void saveItems(const std::string& fileName, const std::vector& items); /** - * Creates a flat io::Item from a given std::vector so that it can be saved in a npz file + * Creates a flat io::Item from a given std::vector so that it can be saved in a npz file * or Marian's native binary format with the given name. */ template diff --git a/src/common/shape.h b/src/common/shape.h index ad2be866f..bd9d98512 100644 --- a/src/common/shape.h +++ b/src/common/shape.h @@ -17,7 +17,7 @@ namespace marian { */ class ShapeSizeException : public std::runtime_error { public: - ShapeSizeException(size_t available, size_t asked) + ShapeSizeException(size_t available, size_t asked) : std::runtime_error(fmt::format("Expanded shape size {} exceeds numeric capcacity {}", asked, available)) {} }; diff --git a/src/embedder/embedder.h b/src/embedder/embedder.h index ebd9782e2..812bed57d 100644 --- a/src/embedder/embedder.h +++ b/src/embedder/embedder.h @@ -30,7 +30,7 @@ class Embedder { Embedder(Ptr options) : model_(createModelFromOptions(options, models::usage::embedding)) {} - void load(Ptr graph, const std::string& modelFile) { + void load(Ptr graph, Ptr modelFile) { model_->load(graph, modelFile); } @@ -51,11 +51,12 @@ class Embed : public ModelTask { Ptr corpus_; std::vector> graphs_; std::vector> models_; + Ptr modelFile_; public: Embed(Ptr options) : options_(options) { - - options_ = options_->with("inference", true, + + options_ = options_->with("inference", true, "shuffle", "none"); // if a similarity is computed then double the input types and vocabs for @@ -87,7 +88,8 @@ class Embed : public ModelTask { graphs_.push_back(graph); } - auto modelFile = options_->get("model"); + auto modelPath = options_->get("model"); + modelFile_ = New(modelPath); models_.resize(graphs_.size()); ThreadPool pool(graphs_.size(), graphs_.size()); @@ -95,7 +97,7 @@ class Embed : public ModelTask { pool.enqueue( [=](size_t j) { models_[j] = New(options_); - models_[j]->load(graphs_[j], modelFile); + models_[j]->load(graphs_[j], modelFile_); }, i); } @@ -104,7 +106,7 @@ class Embed : public ModelTask { void run() override { LOG(info, "Embedding"); timer::Timer timer; - + auto batchGenerator = New>(corpus_, options_); batchGenerator->prepare(); @@ -140,7 +142,7 @@ class Embed : public ModelTask { } else { ABORT("Unknown embedding type {}", embeddings->value_type()); } - + // collect embedding vector per sentence. // if we compute similarities this is only one similarity per sentence pair. 
for(size_t i = 0; i < batch->size(); ++i) { diff --git a/src/evaluator/evaluator.h b/src/evaluator/evaluator.h index 31fe00e87..bfed80a53 100644 --- a/src/evaluator/evaluator.h +++ b/src/evaluator/evaluator.h @@ -29,12 +29,8 @@ class Evaluator { Evaluator(Ptr options) : model_(createModelFromOptions(options, models::usage::evaluating)) {} - void load(Ptr graph, const std::vector& items) { - model_->load(graph, items); - } - - void load(Ptr graph, const std::string& fileName) { - model_->load(graph, fileName); + void load(Ptr graph, Ptr modelFile) { + model_->load(graph, modelFile); } Expr build(Ptr graph, Ptr batch) { @@ -54,11 +50,11 @@ class Evaluate : public ModelTask { Ptr corpus_; std::vector> graphs_; std::vector> models_; - std::vector ioItems_; + Ptr modelFile_; public: Evaluate(Ptr options) : options_(options) { - options_ = options_->with("inference", true, + options_ = options_->with("inference", true, "shuffle", "none"); corpus_ = New(options_); @@ -68,7 +64,8 @@ class Evaluate : public ModelTask { auto modelPath = options_->get("model"); LOG(info, "Loading model from {}", modelPath); - ioItems_ = io::loadItems(modelPath); + + modelFile_ = New(modelPath); graphs_.resize(devices.size()); models_.resize(devices.size()); @@ -79,15 +76,15 @@ class Evaluate : public ModelTask { [=](size_t j) { auto graph = New(true); auto precison = options_->get>("precision", {"float32"}); - graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph - graph->setDevice(devices[j]); + graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph + graph->setDevice(devices[j]); graph->reserveWorkspaceMB(options_->get("workspace")); - + auto model = New(options_); - model->load(graph, ioItems_); + model->load(graph, modelFile_); models_[j] = model; - graphs_[j] = graph; + graphs_[j] = graph; }, i); } @@ -96,7 +93,7 @@ class Evaluate : public ModelTask { void run() override { LOG(info, "Evaluating"); timer::Timer timer; - + auto batchGenerator = New>(corpus_, options_); batchGenerator->prepare(); @@ -105,7 +102,7 @@ class Evaluate : public ModelTask { size_t batchId = 0; { ThreadPool pool(graphs_.size(), graphs_.size()); - + for(auto batch : *batchGenerator) { auto task = [=](size_t id) { thread_local Ptr graph; @@ -132,7 +129,7 @@ class Evaluate : public ModelTask { } else { ABORT("Unknown value type {}", scores->value_type()); } - + // collect embedding vector per sentence. // if we compute similarities this is only one similarity per sentence pair. 
for(size_t i = 0; i < batch->size(); ++i) { diff --git a/src/examples/mnist/model.h b/src/examples/mnist/model.h index 5d50eae96..10c282c03 100755 --- a/src/examples/mnist/model.h +++ b/src/examples/mnist/model.h @@ -75,11 +75,8 @@ class MnistFeedForwardNet : public IModel { return Logits(apply(graph, batch, inference_)); } - void load(Ptr /*graph*/, const std::vector& /*items*/, bool) override { - LOG(critical, "Loading MNIST model is not supported"); - } - - void load(Ptr /*graph*/, const std::string& /*name*/, bool) override { + + void load(Ptr /*graph*/, Ptr /*name*/, bool) override { LOG(critical, "Loading MNIST model is not supported"); } diff --git a/src/graph/expression_graph.cpp b/src/graph/expression_graph.cpp index ce51b0f2b..f59edfca9 100644 --- a/src/graph/expression_graph.cpp +++ b/src/graph/expression_graph.cpp @@ -15,7 +15,7 @@ void ExpressionGraph::setDevice(DeviceId deviceId, Ptr device) { auto params = New(defaultElementType_); params->init(backend_); paramsByElementType_[defaultElementType_] = params; - + if(device) tensors_ = New(backend_, device); else @@ -285,7 +285,7 @@ void ExpressionGraph::checkNaN(Tensor t, bool& isNaN, bool& isInf) { IsNaN(t, allocator(), isNaN, isInf); } -void ExpressionGraph::save(std::vector& ioItems, Type saveElementType) { +void ExpressionGraph::getItems(std::vector& ioItems, Type saveElementType) { // sorted by type in std::map for(auto kvParams : paramsByElementType_) { // sorted by name in std::map diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h index da69af091..915c9df3f 100644 --- a/src/graph/expression_graph.h +++ b/src/graph/expression_graph.h @@ -676,7 +676,7 @@ class ExpressionGraph : public std::enable_shared_from_this { * @param node a pointer to a expression node */ Expr add(Expr node); - + /** * Removes the node from the set of roots (will not be initialized during back propagation) * @param node a pointer to a expression node @@ -742,50 +742,29 @@ class ExpressionGraph : public std::enable_shared_from_this { /** Get the flag value whether the graph throws a NaN exception (true) or not */ bool getThrowNaN() { return throwNaN_; } -public: - /** Load model (mainly parameter objects) from array of io::Items */ - void load(const std::vector& ioItems, bool markReloaded = true) { - setReloaded(false); - for(auto& item : ioItems) { - std::string pName = item.name; - // skip over special parameters starting with "special:" - if(pName.substr(0, 8) == "special:") - continue; - - // if during loading the loaded type is of the same type class as the default element type, allow conversion; - // otherwise keep the loaded type. This is used when e.g. loading a float32 model as a float16 model as both - // have type class TypeClass::float_type. - auto loadElementType = isSameTypeClass(item.type, defaultElementType_) ? 
defaultElementType_ : item.type; - param(pName, item.shape, inits::fromItem(item), loadElementType, /*fixed=*/false); - } - if(markReloaded) - setReloaded(true); - } - - /** Load model by filename */ - void load(const std::string& name, bool markReloaded = true) { - LOG(info, "Loading model from {}", name); - auto items = io::loadItems(name); - load(items, markReloaded); - } - - /** Load model from buffer (a file pointer) */ - void load(const void* ptr, bool markReloaded = true) { - LOG(info, "Loading model from buffer at {}", ptr); - auto items = io::loadItems(ptr); - load(items, markReloaded); - } /** * Turn the model (given a file pointer) into a memory-mapped type * by converting all the parameter object to memory-mapped version, i.e., MappedParameters. */ - void mmap(const void* ptr, bool markReloaded = true) { - ABORT_IF(backend_->getDeviceId().type != DeviceType::cpu || !inferenceOnly_, - "Memory mapping only supported for CPU inference mode"); + void prepareMmap(Ptr modelFile) { + bool graphAllowsMmapping = backend_->getDeviceId().type == DeviceType::cpu && inferenceOnly_; + auto mmapMode = modelFile->mmapMode(); + + // don't do anything if we don't want to mmap regardless if the graph allows it + if(mmapMode == io::MmapMode::DontMmap) + return; - LOG(info, "Memory mapping model at {}", ptr); - auto items = io::mmapItems(ptr); + // silently ignore if we can't mmap and it's not required + if(!graphAllowsMmapping && mmapMode != io::MmapMode::RequiredMmap) + return; + + // abort if we can't mmap and it's required + ABORT_IF(!graphAllowsMmapping && mmapMode == io::MmapMode::RequiredMmap, + "Memory mapping required but only supported for CPU inference graphs"); + + // if we got here, we mmap either opportunistically or by requirement + LOG_ONCE(info, "[memory] Memory mapping model parameters in graph"); // Deal with default parameter set object that might not be a mapped object. // This gets assigned during ExpressionGraph::setDevice(...) and by default @@ -803,7 +782,7 @@ class ExpressionGraph : public std::enable_shared_from_this { } // pre-populate parameters by type - for(auto& item : items) { + for(auto& item : modelFile->items()) { auto it1 = paramsByElementType_.find(item.type); if(it1 == paramsByElementType_.end()) { auto params = New(item.type); @@ -811,17 +790,39 @@ class ExpressionGraph : public std::enable_shared_from_this { paramsByElementType_.insert({item.type, params}); } } + } + +public: + /** Load model (mainly parameter objects) from a ModelWeights object */ + void load(Ptr modelWeights, bool markReloaded = true) { + prepareMmap(modelWeights); + + setReloaded(false); + for(auto& item : modelWeights->items()) { + std::string pName = item.name; + // skip over special parameters starting with "special:" + if(pName.substr(0, 8) == "special:") + continue; - load(items, markReloaded); + // if during loading the loaded type is of the same type class as the default element type, allow conversion; + // otherwise keep the loaded type. This is used when e.g. loading a float32 model as a float16 model as both + // have type class TypeClass::float_type. + auto loadElementType = isSameTypeClass(item.type, defaultElementType_) ? defaultElementType_ : item.type; + param(pName, item.shape, inits::fromItem(item), loadElementType, /*fixed=*/false); + } + if(markReloaded) + setReloaded(true); } + public: + /** * Convert all parameters into an array of io::Item elements, for saving. 
* @param ioItems an array of io::Item elements * @param saveElementType the element type for saving */ - void save(std::vector& ioItems, Type saveElementType = Type::float32); + void getItems(std::vector& ioItems, Type saveElementType = Type::float32); /** * Save all parameters into a file (.npz or .bin). @@ -831,7 +832,7 @@ class ExpressionGraph : public std::enable_shared_from_this { */ void save(const std::string& name, const std::string& meta = "", Type saveElementType = Type::float32) { std::vector ioItems; - save(ioItems, saveElementType); + getItems(ioItems, saveElementType); if(ioItems.empty()) { LOG(warn, "Item list is empty, skipping saving"); } else { @@ -840,6 +841,7 @@ class ExpressionGraph : public std::enable_shared_from_this { io::saveItems(name, ioItems); } } + }; template diff --git a/src/graph/node_initializers.cpp b/src/graph/node_initializers.cpp index 3afb599a9..69f8af92d 100644 --- a/src/graph/node_initializers.cpp +++ b/src/graph/node_initializers.cpp @@ -36,7 +36,7 @@ class LambdaInitConvert : public NodeInitializer { private: std::function lambda_; Type intermediateType_; // is used for the creation of a temporary intermediate tensor on which the lambda actually operates. - // This tensor is then automatically cast and copied to the type of the actual tensor. + // This tensor is then automatically cast and copied to the type of the actual tensor. public: LambdaInitConvert(std::function&& lambda, @@ -195,25 +195,24 @@ Ptr fromWord2vec(const std::string& file, Ptr fromItem(const io::Item& item) { if(item.mapped) { - return fromLambda([item](Tensor tensor) { - // @TODO: implement other types, for now croak loudly. - ABORT_IF(tensor->getBackend()->getDeviceId().type != DeviceType::cpu, - "Memory mapping only works for CPU tensors"); - ABORT_IF(tensor->type() != item.type, - "Tensor type ({}) and type for mapping ({}) do not match", - tensor->type(), - item.type); - ABORT_IF(tensor->shape() != item.shape, - "Tensor shape ({}) and shape of mapped item ({}) do not match", - tensor->shape(), - item.shape); - auto mp = MemoryPiece::New((uint8_t*)item.ptr, item.size()); // @TODO: this is not properly aligned now - tensor->reset(mp); - }); + return fromLambda([&item](Tensor tensor) { + if(tensor->getBackend()->getDeviceId().type != DeviceType::cpu) { + tensor->set(item); + } else { + ABORT_IF(tensor->type() != item.type, + "Tensor type ({}) and type for mapping ({}) do not match", + tensor->type(), + item.type); + ABORT_IF(tensor->shape() != item.shape, + "Tensor shape ({}) and shape of mapped item ({}) do not match", + tensor->shape(), + item.shape); + auto mp = MemoryPiece::New((uint8_t*)item.ptr, item.size()); // @TODO: this is not properly aligned now + tensor->reset(mp); + } + }, item.type); } else { - return fromLambda( - [item](Tensor tensor) { tensor->set(item); }, - item.type); + return fromLambda([&item](Tensor tensor) { tensor->set(item); }, item.type); } } @@ -223,7 +222,7 @@ Ptr fromTensor(Tensor externalTensor) { // Computes Google's sinusoidal position embeddings Ptr sinusoidalPositionEmbeddings(int start) { - return fromLambda([start](Tensor t) { SinusoidalPositionEmbeddings(t, start); }); + return fromLambda([start](Tensor t) { SinusoidalPositionEmbeddings(t, start); }); } // @TODO: this is rather inefficient also needs axis argument or something diff --git a/src/microsoft/cosmos.cpp b/src/microsoft/cosmos.cpp index 7493975eb..00ff9c90e 100644 --- a/src/microsoft/cosmos.cpp +++ b/src/microsoft/cosmos.cpp @@ -20,7 +20,7 @@ class EmbedderModel { 
EmbedderModel(Ptr options) : model_(createModelFromOptions(options, models::usage::embedding)) {} - void load(Ptr graph, const std::string& modelFile) { + void load(Ptr graph, Ptr modelFile) { model_->load(graph, modelFile); } @@ -36,21 +36,22 @@ namespace cosmos { const size_t MAX_BATCH_SIZE = 32; const size_t MAX_LENGTH = 256; -/** +/** * Single CPU-core implementation of an Embedder/Similiarity scorer. Turns sets of '\n' strings * into parallel batches and either outputs embedding vectors or similarity scores. */ class Embedder { -private: +private: Ptr options_; Ptr graph_; Ptr vocab_; Ptr model_; - + Ptr modelFile_; + public: Embedder(const std::string& modelPath, const std::string& vocabPath, bool computeSimilarity = false) { - options_ = New("inference", true, + options_ = New("inference", true, "shuffle", "none", "mini-batch", MAX_BATCH_SIZE, "maxi-batch", 100, @@ -59,7 +60,7 @@ class Embedder { "max-length-crop", true, "compute-similarity", computeSimilarity, "vocabs", std::vector(computeSimilarity ? 2 : 1, vocabPath)); - + vocab_ = New(options_, 0); vocab_->load(vocabPath, 0); @@ -67,20 +68,20 @@ class Embedder { graph_->setDevice(CPU0); graph_->reserveWorkspaceMB(512); - YAML::Node config; - io::getYamlFromModel(config, "special:model.yml", modelPath); - + modelFile_ = New(modelPath); + YAML::Node config = modelFile_->getYamlFromModel(); + Ptr modelOpts = New(); modelOpts->merge(options_); modelOpts->merge(config); model_ = New(modelOpts); - model_->load(graph_, modelPath); + model_->load(graph_, modelFile_); } // Compute embedding vectors for a batch of sentences std::vector> embed(const std::string& input) { - auto text = New(std::vector({input}), + auto text = New(std::vector({input}), std::vector>({vocab_}), options_); // we set runAsync=false as we are throwing exceptions instead of aborts. Exceptions and threading do not mix well. @@ -102,7 +103,7 @@ class Embedder { auto batchIdx = batch->getSentenceIds()[i]; if(output.size() <= batchIdx) output.resize(batchIdx + 1); - + int embSize = embeddings->shape()[-1]; size_t beg = i * embSize; size_t end = (i + 1) * embSize; @@ -116,7 +117,7 @@ class Embedder { // Compute cosine similarity scores for a two batches of corresponding sentences std::vector similarity(const std::string& input1, const std::string& input2) { - auto text = New(std::vector({input1, input2}), + auto text = New(std::vector({input1, input2}), std::vector>({vocab_, vocab_}), options_); // we set runAsync=false as we are throwing exceptions instead of aborts. Exceptions and threading do not mix well. 
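The same pattern recurs in embedder.h, evaluator.h, rescorer.h and the cosmos embedder above: construct one io::ModelWeights per model path and share it across all per-device graphs, with config extraction and parameter loading both going through that one handle. A condensed sketch of the fragment as it appears inside such a task class (options_, graphs_ and models_ are assumed members, as in Embed/Evaluate):

auto modelFile = New<io::ModelWeights>(options_->get<std::string>("model"));

// merge the config stored inside the model file into the runtime options
auto modelOpts = New<Options>();
modelOpts->merge(options_);
modelOpts->merge(modelFile->getYamlFromModel());

// the single ModelWeights handle is shared by all per-device graphs (it locks internally)
for(size_t j = 0; j < graphs_.size(); ++j)
  models_[j]->load(graphs_[j], modelFile);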
diff --git a/src/microsoft/quicksand.cpp b/src/microsoft/quicksand.cpp index 2302819eb..6a09469fd 100644 --- a/src/microsoft/quicksand.cpp +++ b/src/microsoft/quicksand.cpp @@ -7,6 +7,7 @@ #include "mkl.h" #endif +#include "common/io.h" #include "data/shortlist.h" #include "translator/beam_search.h" #include "translator/scorers.h" @@ -53,6 +54,13 @@ class VocabWrapper : public IVocabWrapper { Ptr getVocab() const { return pImpl_; } }; +IBeamSearchDecoder::IBeamSearchDecoder(Ptr options, + const std::vector& ptrs) + : options_(options) { + for(auto ptr : ptrs) + modelWeights_.push_back(New(ptr)); +} + class BeamSearchDecoder : public IBeamSearchDecoder { private: Ptr graph_; @@ -62,7 +70,7 @@ class BeamSearchDecoder : public IBeamSearchDecoder { std::vector> vocabs_; - static inline std::unordered_map configCache_; + static inline std::unordered_map configCache_; static inline std::mutex configCacheMutex_; public: BeamSearchDecoder(Ptr options, @@ -85,45 +93,31 @@ class BeamSearchDecoder : public IBeamSearchDecoder { mkl_set_num_threads(options_->get("mkl-threads", 1)); #endif - std::vector models - = options_->get>("model"); - - for(int i = 0; i < models.size(); ++i) { + for(int i = 0; i < modelWeights_.size(); ++i) { Ptr modelOpts = New(); // serializing this YAML can be costly, so read from cache YAML::Node config; - auto cachedConfig = getConfigFromCache(models[i]); + auto cachedConfig = getConfigFromCache((size_t)modelWeights_[i]->data()); if(cachedConfig != nullptr) { config = *cachedConfig; } else { - if(io::isBin(models[i]) && ptrs_[i] != nullptr) - io::getYamlFromModel(config, "special:model.yml", ptrs_[i]); - else - io::getYamlFromModel(config, "special:model.yml", models[i]); - writeConfigToCache(config, models[i]); + ABORT_IF(modelWeights_[i]->data() == nullptr, "Model pointer is null"); + config = modelWeights_[i]->getYamlFromModel("special:model.yml"); + writeConfigToCache(config, (size_t)modelWeights_[i]->data()); } modelOpts->merge(options_); modelOpts->merge(config); - // serializing this to YAML is expensive. we only want to do this once - // we can use whether we loaded the cache from config as a signal + // serializing this to YAML is expensive. 
we only want to do this once + // we can use whether we loaded the cache from config as a signal if(cachedConfig == nullptr){ std::cerr << modelOpts->asYamlString() << std::flush; } auto encdec = models::createModelFromOptions(modelOpts, models::usage::translation); - - if(io::isBin(models[i]) && ptrs_[i] != nullptr) { - // if file ends in *.bin and has been mapped by QuickSAND - scorers_.push_back(New( - encdec, "F" + std::to_string(scorers_.size()), /*weight=*/1.0f, ptrs[i])); - } else { - // it's a *.npz file or has not been mapped by QuickSAND - scorers_.push_back(New( - encdec, "F" + std::to_string(scorers_.size()), /*weight=*/1.0f, models[i])); - } + scorers_.push_back(New(encdec, "F" + std::to_string(scorers_.size()), /*weight=*/1.0f, modelWeights_[i])); } for(auto scorer : scorers_) { @@ -134,7 +128,7 @@ class BeamSearchDecoder : public IBeamSearchDecoder { graph_->forward(); } - YAML::Node* getConfigFromCache(std::string key){ + YAML::Node* getConfigFromCache(size_t key){ const std::lock_guard lock(configCacheMutex_); bool inCache = configCache_.find(key) != configCache_.end(); if (inCache) { @@ -144,7 +138,7 @@ class BeamSearchDecoder : public IBeamSearchDecoder { return nullptr; } } - void writeConfigToCache(YAML::Node config, std::string key) { + void writeConfigToCache(YAML::Node config, size_t key) { const std::lock_guard lock(configCacheMutex_); configCache_[key] = config; } @@ -154,7 +148,7 @@ class BeamSearchDecoder : public IBeamSearchDecoder { QSNBestBatch decode(const QSBatch& qsBatch, size_t maxLength, const std::unordered_set& shortlist) override { - + std::vector lshOpts = options_->get>("output-approx-knn", {}); ABORT_IF(lshOpts.size() != 0 && lshOpts.size() != 2, "--output-approx-knn takes 2 parameters"); ABORT_IF(lshOpts.size() == 2 && shortlist.size() > 0, "LSH and shortlist cannot be used at the same time"); @@ -168,7 +162,7 @@ class BeamSearchDecoder : public IBeamSearchDecoder { shortListGen = New(lshOpts[0], lshOpts[1], vocabs_[1]->lemmaSize(), /*abortIfDynamic=*/true); } else { shortListGen = New(shortlist); - } + } for(auto scorer : scorers_) scorer->setShortlistGenerator(shortListGen); } @@ -297,15 +291,17 @@ DecoderCpuAvxVersion parseCpuAvxVersion(std::string name) { bool convertModel(std::string inputFile, std::string outputFile, int32_t targetPrec, int32_t lshNBits) { std::cerr << "Converting from: " << inputFile << ", to: " << outputFile << ", precision: " << targetPrec << std::endl; - YAML::Node config; + auto modelFile = New(inputFile); + + YAML::Node config = modelFile->getYamlFromModel(); std::stringstream configStr; - marian::io::getYamlFromModel(config, "special:model.yml", inputFile); + configStr << config; auto graph = New(); graph->setDevice(CPU0); - graph->load(inputFile); + graph->load(modelFile); // MJD: Note, this is a default settings which we might want to change or expose. Use this only with Polonium students. // The LSH will not be used by default even if it exists in the model. That has to be enabled in the decoder config. 
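For in-memory models, the IBeamSearchDecoder constructor above wraps each raw pointer in a ModelWeights whose buffer constructor defaults to MmapMode::RequiredMmap. A hedged sketch of preparing such a pointer on the caller side (mio is the same mapping library used by the old scorer code; the function and path names are made up for illustration):

#include "3rd_party/mio/mio.hpp"
#include "common/io.h"

using namespace marian;

void loadFromBuffer(const std::string& path) {
  mio::mmap_source mapped(path);                        // map the packed *.bin image ourselves
  ABORT_IF(!mapped.is_mapped(), "Memory mapping did not succeed");

  // the buffer constructor defaults to MmapMode::RequiredMmap,
  // so config and parameters are served directly from the mapped region
  auto weights = New<io::ModelWeights>(mapped.data());
  YAML::Node config = weights->getYamlFromModel("special:model.yml");
  // note: the mapping must outlive any graph that later uses these weights
}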
@@ -329,8 +325,8 @@ bool convertModel(std::string inputFile, std::string outputFile, int32_t targetP } Type targetPrecType = (Type) targetPrec; - if (targetPrecType == Type::packed16 - || targetPrecType == Type::packed8avx2 + if (targetPrecType == Type::packed16 + || targetPrecType == Type::packed8avx2 || targetPrecType == Type::packed8avx512 || (targetPrecType == Type::float32 && addLsh)) { // only allow non-conversion to float32 if we also use the LSH graph->packAndSave(outputFile, configStr.str(), targetPrecType); diff --git a/src/microsoft/quicksand.h b/src/microsoft/quicksand.h index cddcfd22e..3ed866e83 100644 --- a/src/microsoft/quicksand.h +++ b/src/microsoft/quicksand.h @@ -13,6 +13,10 @@ using Ptr = std::shared_ptr; class Options; +namespace io { + class ModelWeights; +} + namespace quicksand { typedef uint32_t IndexType; @@ -47,12 +51,11 @@ class IVocabWrapper { class IBeamSearchDecoder { protected: Ptr options_; - std::vector ptrs_; + std::vector> modelWeights_; public: IBeamSearchDecoder(Ptr options, - const std::vector& ptrs) - : options_(options), ptrs_(ptrs) {} + const std::vector& ptrs); virtual ~IBeamSearchDecoder() {} diff --git a/src/models/amun.h b/src/models/amun.h index 135ce3597..d6b1209c6 100644 --- a/src/models/amun.h +++ b/src/models/amun.h @@ -36,7 +36,7 @@ class Amun : public EncoderDecoder { } void load(Ptr graph, - const std::vector& items, + Ptr modelFile, bool /*markedReloaded*/ = true) override { std::map nameMap = {{"decoder_U", "decoder_cell1_U"}, @@ -89,41 +89,51 @@ class Amun : public EncoderDecoder { if(opt("tied-embeddings-src") || opt("tied-embeddings-all")) nameMap["Wemb"] = "Wemb"; - auto ioItems = items; - // map names and remove a dummy matrices - for(auto it = ioItems.begin(); it != ioItems.end();) { - // for backwards compatibility, turn one-dimensional vector into two dimensional matrix with first dimension being 1 and second dimension of the original size - // @TODO: consider dropping support for Nematus models - if(it->shape.size() == 1) { - int dim = it->shape[-1]; - it->shape.resize(2); - it->shape.set(0, 1); - it->shape.set(1, dim); - } - - if(it->name == "decoder_c_tt") { - it = ioItems.erase(it); - } else if(it->name == "uidx") { - it = ioItems.erase(it); - } else if(it->name == "history_errs") { - it = ioItems.erase(it); - } else { - auto pair = nameMap.find(it->name); - if(pair != nameMap.end()) - it->name = pair->second; - it++; + // we will modify the items directly, so memory mapping etc. should just work + // This should never be done, but we need to be compatible with Amun/Nematus for now. + auto& ioItems = modelFile->items(); + + // @TODO: get rid of all this eventually + { // scope for lock_guard + // this is needed during loading since we modify the content of modelFile->items() directly + // This is quite ugly but this is legacy code anyway. + std::mutex mutex; + std::lock_guard lock(mutex); + + // only modify the first time. + bool modify = false; + for(auto& item : ioItems) + if(item.name == "decoder_c_tt") // still there, hence this is the first time. 
+ modify = true; + + if(modify) { + // map names and remove a dummy matrices + for(auto it = ioItems.begin(); it != ioItems.end();) { + // for backwards compatibility, turn one-dimensional vector into two dimensional matrix with first dimension being 1 and second dimension of the original size + // @TODO: consider dropping support for Nematus models + if(it->shape.size() == 1) { + int dim = it->shape[-1]; + it->shape.resize(2); + it->shape.set(0, 1); + it->shape.set(1, dim); + } + + if(it->name == "decoder_c_tt") { + it = ioItems.erase(it); + } else if(it->name == "uidx") { + it = ioItems.erase(it); + } else if(it->name == "history_errs") { + it = ioItems.erase(it); + } else { + auto pair = nameMap.find(it->name); + if(pair != nameMap.end()) + it->name = pair->second; + it++; + } + } } } - // load items into the graph - graph->load(ioItems); - } - - void load(Ptr graph, - const std::string& name, - bool /*markReloaded*/ = true) override { - LOG(info, "Loading model from {}", name); - auto ioItems = io::loadItems(name); - load(graph, ioItems); + graph->load(modelFile); } void save(Ptr graph, @@ -179,7 +189,7 @@ class Amun : public EncoderDecoder { // get parameters from the graph to items std::vector ioItems; - graph->save(ioItems); + graph->getItems(ioItems); // replace names to be compatible with Nematus for(auto& item : ioItems) { auto newItemName = nameMap.find(item.name); diff --git a/src/models/costs.h b/src/models/costs.h index 45527362f..fa67b5fb2 100644 --- a/src/models/costs.h +++ b/src/models/costs.h @@ -218,17 +218,11 @@ class Trainer : public ICriterionFunction { Ptr getModel() { return model_; } void load(Ptr graph, - const std::vector& items, + Ptr modelFile, bool markedReloaded) override { - model_->load(graph, items, markedReloaded); + model_->load(graph, modelFile, markedReloaded); } - virtual void load(Ptr graph, - const std::string& name, - bool markedReloaded = true) override { - model_->load(graph, name, markedReloaded); - }; - virtual void save(Ptr graph, const std::string& name, bool saveTranslatorConfig = false) override { @@ -270,17 +264,11 @@ class Scorer : public IModel { Ptr getModel() { return model_; } virtual void load(Ptr graph, - const std::vector& items, + Ptr modelFile, bool markReloaded = true) override { - model_->load(graph, items, markReloaded); + model_->load(graph, modelFile, markReloaded); } - virtual void load(Ptr graph, - const std::string& name, - bool markedReloaded = true) override { - model_->load(graph, name, markedReloaded); - }; - virtual void save(Ptr graph, const std::string& name, bool saveTranslatorConfig = false) override { @@ -322,23 +310,11 @@ class Stepwise : public IEncoderDecoder { Stepwise(Ptr encdec, Ptr cost) : encdec_(encdec), cost_(cost) {} virtual void load(Ptr graph, - const std::vector& items, + Ptr modelFile, bool markedReloaded = true) override { - encdec_->load(graph, items, markedReloaded); + encdec_->load(graph, modelFile, markedReloaded); } - virtual void load(Ptr graph, - const std::string& name, - bool markedReloaded = true) override { - encdec_->load(graph, name, markedReloaded); - } - - virtual void mmap(Ptr graph, - const void* ptr, - bool markedReloaded = true) override { - encdec_->mmap(graph, ptr, markedReloaded); - }; - virtual void save(Ptr graph, const std::string& name, bool saveTranslatorConfig = false) override { diff --git a/src/models/encoder_classifier.h b/src/models/encoder_classifier.h index 552e428f2..7e25f33ed 100644 --- a/src/models/encoder_classifier.h +++ b/src/models/encoder_classifier.h @@ 
-21,26 +21,10 @@ class EncoderClassifierBase : public models::IModel { public: virtual ~EncoderClassifierBase() {} - virtual void load(Ptr graph, - const std::string& name, - bool markedReloaded = true) override - = 0; - - virtual void mmap(Ptr graph, - const void* ptr, - bool markedReloaded = true) - = 0; - - virtual void save(Ptr graph, - const std::string& name, - bool saveTranslatorConfig = false) override - = 0; - virtual void clear(Ptr graph) override = 0; virtual std::vector> apply(Ptr, Ptr, bool) = 0; - virtual Logits build(Ptr graph, Ptr batch, bool clearGraph = true) override = 0; @@ -154,21 +138,9 @@ class EncoderClassifier : public EncoderClassifierBase { void push_back(Ptr classifier) { classifiers_.push_back(classifier); } void load(Ptr graph, - const std::vector& items, - bool markedReloaded) override { - graph->load(items, markedReloaded && !opt("ignore-model-config", false)); - } - - void load(Ptr graph, - const std::string& name, - bool markedReloaded) override { - graph->load(name, markedReloaded && !opt("ignore-model-config", false)); - } - - void mmap(Ptr graph, - const void* ptr, + Ptr modelFile, bool markedReloaded) override { - graph->mmap(ptr, markedReloaded && !opt("ignore-model-config", false)); + graph->load(modelFile, markedReloaded && !opt("ignore-model-config", false)); } void save(Ptr graph, diff --git a/src/models/encoder_decoder.cpp b/src/models/encoder_decoder.cpp index 971726271..0c27ab4c7 100644 --- a/src/models/encoder_decoder.cpp +++ b/src/models/encoder_decoder.cpp @@ -73,7 +73,7 @@ EncoderDecoder::EncoderDecoder(Ptr graph, Ptr options) modelFeatures_.insert("transformer-no-bias"); modelFeatures_.insert("transformer-no-affine"); - + modelFeatures_.insert("transformer-disable-position-embeddings"); modelFeatures_.insert("transformer-attention-mask"); modelFeatures_.insert("transformer-alibi-shift"); @@ -159,21 +159,9 @@ std::string EncoderDecoder::getModelParametersAsString() { } void EncoderDecoder::load(Ptr graph, - const std::vector& items, - bool markedReloaded) { - graph->load(items, markedReloaded && !opt("ignore-model-config", false)); -} - -void EncoderDecoder::load(Ptr graph, - const std::string& name, + Ptr modelFile, bool markedReloaded) { - graph->load(name, markedReloaded && !opt("ignore-model-config", false)); -} - -void EncoderDecoder::mmap(Ptr graph, - const void* ptr, - bool markedReloaded) { - graph->mmap(ptr, markedReloaded && !opt("ignore-model-config", false)); + graph->load(modelFile, markedReloaded && !opt("ignore-model-config", false)); } void EncoderDecoder::save(Ptr graph, @@ -237,7 +225,7 @@ Ptr EncoderDecoder::step(Ptr graph, // Fill state with embeddings based on last prediction decoders_[0]->embeddingsFromPrediction(graph, state, words, (int)batchIndices.size(), beamSize); auto nextState = decoders_[0]->step(graph, state); - + return nextState; } diff --git a/src/models/encoder_decoder.h b/src/models/encoder_decoder.h index ef810ed8b..9d717dbd2 100644 --- a/src/models/encoder_decoder.h +++ b/src/models/encoder_decoder.h @@ -13,26 +13,6 @@ class IEncoderDecoder : public models::IModel { public: virtual ~IEncoderDecoder() {} - virtual void load(Ptr graph, - const std::vector& items, - bool markedReloaded = true) override - = 0; - - virtual void load(Ptr graph, - const std::string& name, - bool markedReloaded = true) override - = 0; - - virtual void mmap(Ptr graph, - const void* ptr, - bool markedReloaded = true) - = 0; - - virtual void save(Ptr graph, - const std::string& name, - bool saveTranslatorConfig = false) override 
- = 0; - virtual void clear(Ptr graph) override = 0; virtual Logits build(Ptr graph, @@ -62,7 +42,7 @@ class IEncoderDecoder : public models::IModel { virtual Ptr getShortlist() = 0; - virtual data::SoftAlignment getAlignment() = 0; + virtual data::SoftAlignment getAlignment() = 0; }; class EncoderDecoder : public IEncoderDecoder, public LayerBase { @@ -98,15 +78,7 @@ class EncoderDecoder : public IEncoderDecoder, public LayerBase { void push_back(Ptr decoder); virtual void load(Ptr graph, - const std::vector& items, - bool markedReloaded = true) override; - - virtual void load(Ptr graph, - const std::string& name, - bool markedReloaded = true) override; - - virtual void mmap(Ptr graph, - const void* ptr, + Ptr, bool markedReloaded = true) override; virtual void save(Ptr graph, diff --git a/src/models/encoder_pooler.h b/src/models/encoder_pooler.h index b89f85c9e..b9041cd6c 100644 --- a/src/models/encoder_pooler.h +++ b/src/models/encoder_pooler.h @@ -25,26 +25,6 @@ class EncoderPoolerBase : public models::IModel { public: virtual ~EncoderPoolerBase() {} - virtual void load(Ptr graph, - const std::vector& items, - bool markedReloaded = true) override - = 0; - - virtual void load(Ptr graph, - const std::string& name, - bool markedReloaded = true) override - = 0; - - virtual void mmap(Ptr graph, - const void* ptr, - bool markedReloaded = true) - = 0; - - virtual void save(Ptr graph, - const std::string& name, - bool saveTranslatorConfig = false) override - = 0; - virtual void clear(Ptr graph) override = 0; virtual std::vector apply(Ptr, Ptr, bool) = 0; @@ -175,21 +155,9 @@ class EncoderPooler : public EncoderPoolerBase { void push_back(Ptr pooler) { poolers_.push_back(pooler); } void load(Ptr graph, - const std::vector& items, - bool markedReloaded) override { - graph->load(items, markedReloaded && !opt("ignore-model-config", false)); - } - - void load(Ptr graph, - const std::string& name, - bool markedReloaded) override { - graph->load(name, markedReloaded && !opt("ignore-model-config", false)); - } - - void mmap(Ptr graph, - const void* ptr, + Ptr modelFile, bool markedReloaded) override { - graph->mmap(ptr, markedReloaded && !opt("ignore-model-config", false)); + graph->load(modelFile, markedReloaded && !opt("ignore-model-config", false)); } void save(Ptr graph, diff --git a/src/models/model_base.h b/src/models/model_base.h index 32705bbe7..a159d4e81 100644 --- a/src/models/model_base.h +++ b/src/models/model_base.h @@ -10,10 +10,10 @@ namespace marian { namespace models { enum struct usage { - raw, - training, - scoring, - translation, + raw, + training, + scoring, + translation, embedding, // used for laser and other models to produce embedding vectors evaluating // evaluating is a special mode for neural metrics, different from (probabilistic) scoring }; @@ -30,12 +30,7 @@ namespace models { class IModel { public: virtual void load(Ptr, - const std::string&, - bool markReloaded = true) - = 0; - - virtual void load(Ptr, - const std::vector&, + Ptr, bool markReloaded = true) = 0; @@ -59,12 +54,7 @@ class ICriterionFunction { virtual ~ICriterionFunction() {} virtual void load(Ptr, - const std::string&, - bool markReloaded = true) - = 0; - - virtual void load(Ptr, - const std::vector&, + Ptr, bool markReloaded = true) = 0; diff --git a/src/models/nematus.h b/src/models/nematus.h index aee8e3b04..7d421ec5c 100644 --- a/src/models/nematus.h +++ b/src/models/nematus.h @@ -26,43 +26,55 @@ class Nematus : public EncoderDecoder { } void load(Ptr graph, - const std::vector& items, + Ptr 
modelFile, bool /*markReloaded*/ = true) override { - auto ioItems = items; - // map names and remove a dummy matrix 'decoder_c_tt' from items to avoid creating isolated node - for(auto it = ioItems.begin(); it != ioItems.end();) { - // for backwards compatibility, turn one-dimensional vector into two dimensional matrix with first dimension being 1 and second dimension of the original size - // @TODO: consider dropping support for Nematus models - if(it->shape.size() == 1) { - int dim = it->shape[-1]; - it->shape.resize(2); - it->shape.set(0, 1); - it->shape.set(1, dim); - } - if(it->name == "decoder_c_tt") { - it = ioItems.erase(it); - } else if(it->name == "uidx") { - it = ioItems.erase(it); - } else if(it->name == "history_errs") { - it = ioItems.erase(it); - } else { - auto pair = nameMap_.find(it->name); - if(pair != nameMap_.end()) - it->name = pair->second; - it++; + // we will modify the items directly, so memory mapping etc. should just work + // This should never be done, but we need to be compatible with Amun/Nematus for now. + auto& ioItems = modelFile->items(); + + // @TODO: get rid of all this eventually + { // scope for lock_guard + // this is needed during loading since we modify the content of modelFile->items() directly + // This is quite ugly but this is legacy code anyway. + std::mutex mutex; + std::lock_guard lock(mutex); + + // only modify the first time. + bool modify = false; + for(auto& item : ioItems) + if(item.name == "decoder_c_tt") // still there, hence this is the first time. + modify = true; + + if(modify) { + // map names and remove a dummy matrix 'decoder_c_tt' from items to avoid creating isolated node + for(auto it = ioItems.begin(); it != ioItems.end();) { + // for backwards compatibility, turn one-dimensional vector into two dimensional matrix with first dimension being 1 and second dimension of the original size + // @TODO: consider dropping support for Nematus models + if(it->shape.size() == 1) { + int dim = it->shape[-1]; + it->shape.resize(2); + it->shape.set(0, 1); + it->shape.set(1, dim); + } + + if(it->name == "decoder_c_tt") { + it = ioItems.erase(it); + } else if(it->name == "uidx") { + it = ioItems.erase(it); + } else if(it->name == "history_errs") { + it = ioItems.erase(it); + } else { + auto pair = nameMap_.find(it->name); + if(pair != nameMap_.end()) + it->name = pair->second; + it++; + } + } } } - // load items into the graph - graph->load(ioItems); - } - void load(Ptr graph, - const std::string& name, - bool /*markReloaded*/ = true) override { - LOG(info, "Loading model from {}", name); - auto ioItems = io::loadItems(name); - load(graph, ioItems); + graph->load(modelFile); } void save(Ptr graph, @@ -77,7 +89,7 @@ class Nematus : public EncoderDecoder { // get parameters from the graph to items std::vector ioItems; - graph->save(ioItems); + graph->getItems(ioItems); // replace names to be compatible with Nematus for(auto& item : ioItems) { auto newItemName = nameMapRev_.find(item.name); diff --git a/src/models/transformer_factory.h b/src/models/transformer_factory.h index 46df741b0..ac86e4dc7 100644 --- a/src/models/transformer_factory.h +++ b/src/models/transformer_factory.h @@ -14,25 +14,25 @@ Ptr NewDecoderTransformer(Ptr graph, Ptr class TransformerLegacy : public EncoderDecoder { public: - TransformerLegacy(Ptr graph, Ptr options) + TransformerLegacy(Ptr graph, Ptr options) : EncoderDecoder(graph, options), nameMap_(createNameMap()) { } void load(Ptr graph, - const std::vector& items, + Ptr modelFile, bool markedReloaded = true) 
override { - for(auto it = items.begin(); it != items.end(); it++) { - auto pair = nameMap_.find(it->name); + for(auto& item : modelFile->items()) { + auto pair = nameMap_.find(item.name); if(pair != nameMap_.end()) { - LOG(debug, "Mapping parameter {} to {}", it->name, pair->second); - const_cast(*it).name = pair->second; + LOG(debug, "Mapping parameter {} to {}", item.name, pair->second); + const_cast(item).name = pair->second; // reduce shape of bias vectors from {1, dimModel} to {dimModel} - int dimModel = it->shape[-1]; - if(it->shape == Shape({1, dimModel})) - const_cast(*it).shape = Shape({dimModel}); + int dimModel = item.shape[-1]; + if(item.shape == Shape({1, dimModel})) + const_cast(item).shape = Shape({dimModel}); } else { - LOG(debug, "Could not find parameter {}", it->name); + LOG(debug, "Could not find parameter {}", item.name); } } @@ -49,20 +49,12 @@ class TransformerLegacy : public EncoderDecoder { linear->transposed = false; // load items into the graph - graph->load(items); - } - - void load(Ptr graph, - const std::string& name, - bool markReloaded = true) override { - LOG(info, "Loading model from {}", name); - auto items = io::loadItems(name); - load(graph, items, markReloaded); + graph->load(modelFile); } private: std::map nameMap_; - + std::map createNameMap() { std::map nameMap = { {"Wemb", "Wemb"}, @@ -125,13 +117,13 @@ class TransformerLegacy : public EncoderDecoder { // name maps for decoder SSRU nameMap[fmt::format("decoder_l{}_rnn_W", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->iProj->weight", prefix, layerNo); - + nameMap[fmt::format("decoder_l{}_rnn_Wf", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->fProj->weight", prefix, layerNo); nameMap[fmt::format("decoder_l{}_rnn_bf", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->fProj->bias", prefix, layerNo); nameMap[fmt::format("decoder_l{}_rnn_Wo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->oProj->weight", prefix, layerNo); nameMap[fmt::format("decoder_l{}_rnn_bo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->oProj->bias", prefix, layerNo); - + nameMap[fmt::format("decoder_l{}_rnn_ffn_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->weight", prefix, layerNo); nameMap[fmt::format("decoder_l{}_rnn_ffn_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->bias", prefix, layerNo); diff --git a/src/rescorer/rescorer.h b/src/rescorer/rescorer.h index 062b91bca..051885c7b 100644 --- a/src/rescorer/rescorer.h +++ b/src/rescorer/rescorer.h @@ -25,7 +25,7 @@ class Rescorer { Rescorer(Ptr options) : builder_(models::createCriterionFunctionFromOptions(options, models::usage::scoring)) {} - void load(Ptr graph, const std::string& modelFile) { + void load(Ptr graph, Ptr modelFile) { builder_->load(graph, modelFile); } @@ -46,6 +46,7 @@ class Rescore : public ModelTask { Ptr corpus_; std::vector> graphs_; std::vector> models_; + Ptr modelFile_; public: Rescore(Ptr options) : options_(options) { @@ -77,7 +78,8 @@ class Rescore : public ModelTask { graphs_.push_back(graph); } - auto modelFile = options_->get("model"); + auto modelPath = options_->get("model"); + modelFile_ = New(modelPath); models_.resize(graphs_.size()); ThreadPool pool(graphs_.size(), 
graphs_.size()); @@ -85,7 +87,7 @@ class Rescore : public ModelTask { pool.enqueue( [=](size_t j) { models_[j] = New(options_); - models_[j]->load(graphs_[j], modelFile); + models_[j]->load(graphs_[j], modelFile_); }, i); } diff --git a/src/tensors/cpu/expression_graph_packable.h b/src/tensors/cpu/expression_graph_packable.h index 1a233372c..f1a68210e 100644 --- a/src/tensors/cpu/expression_graph_packable.h +++ b/src/tensors/cpu/expression_graph_packable.h @@ -18,7 +18,7 @@ namespace marian { // This requires some more changes, but we temporarily do this just by name ("_W") of the weights. // And, this introduces a low level packed_gemm.h apis interact with high level graph class. // So, we make a subclass of ExpressionGraph and put those immature codes in this class. -// We will improve this in the near future. +// We will improve this in the near future. class ExpressionGraphPackable : public ExpressionGraph { public: ExpressionGraphPackable() @@ -165,7 +165,7 @@ class ExpressionGraphPackable : public ExpressionGraph { Tensor tmp; allocator->allocate(tmp, val->shape(), val->type()); cpu::Transpose10(tmp, val); - + if(sizeOf(gemmElementType) == 1) { // is 8-bit Intgemm type float quantMult = cpu::integer::computeQuantMult(val); @@ -233,7 +233,7 @@ class ExpressionGraphPackable : public ExpressionGraph { } //Put the quantMult at the back of the tensor cpu::integer::getQuantMult(paramMat) = quantMult; - + } else { ABORT("Incorrect Intgemm type size: {}", sizeOf(gemmElementType)); } diff --git a/src/tensors/tensor.cpp b/src/tensors/tensor.cpp index e9a07ab46..d89e41964 100644 --- a/src/tensors/tensor.cpp +++ b/src/tensors/tensor.cpp @@ -114,7 +114,7 @@ template std::string TensorBase::debug(int, int); template std::string TensorBase::debug(int, int); template std::string TensorBase::debug(int, int); -// fill an io::item with data from a Tensor, used for saving +// fill an io::item with data from a Tensor, used for saving // and other IO operations. 
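On the saving side, the old ExpressionGraph::save(items, type) overload is renamed to getItems(), so the file-level save path now reads roughly as follows (a sketch assuming an existing graph; configYamlString stands in for an already-serialized YAML config):

std::vector<io::Item> ioItems;
graph->getItems(ioItems, Type::float32);          // collect graph parameters as io::Items
if(ioItems.empty()) {
  LOG(warn, "Item list is empty, skipping saving");
} else {
  io::addMetaToItems(configYamlString, "special:model.yml", ioItems);  // embed the model config
  io::saveItems("model.npz", ioItems);            // .npz vs .bin chosen by file extension
}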
void TensorBase::get(io::Item& item, const std::string& name) { item.name = name; @@ -131,10 +131,10 @@ void TensorBase::get(io::Item& item, const std::string& name) { void TensorBase::set(const io::Item& item) { ABORT_IF(item.type != type_, "Tensor type {} and item type {} do not match", type_, item.type); ABORT_IF(item.shape != shape_, "Tensor shape {} and item shape {} do not match", shape_, item.shape); - ABORT_IF(item.bytes.size() > memory_->size(), "Item data size {} too large for memory {}", item.bytes.size(), memory_->size()); + ABORT_IF(item.size() > memory_->size(), "Item data size {} too large for memory {}", item.size(), memory_->size()); copy(backend_, - item.bytes.data(), - item.bytes.data() + item.bytes.size(), + item.data(), + item.data() + item.size(), memory_->data()); } diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp index 054b0ae76..9b5f300d4 100644 --- a/src/training/graph_group.cpp +++ b/src/training/graph_group.cpp @@ -16,7 +16,7 @@ GraphGroup::GraphGroup(Ptr options, Ptr mpi) if(vcs.size() > 1) costScalingFreq_ = std::stoul(vcs[1]); if(vcs.size() > 2) costScalingMultiplier_ = std::stof( vcs[2]); if(vcs.size() > 3) costScalingFactorMinimum_ = std::stof( vcs[3]); - + LOG_ONCE(info, "Training with cost scaling - factor: {}, frequency: {}, multiplier: {}, minimum: {}", costScalingFactor_, @@ -69,7 +69,7 @@ GraphGroup::GraphGroup(Ptr options, Ptr mpi) void GraphGroup::initGraphsAndOpts() { for(auto device : devices_) { auto graph = New(); - + // @TODO: validate precisions in config auto precisions = options_->get>("precision"); Type parameterType = typeFromString(precisions[0]); @@ -81,7 +81,7 @@ void GraphGroup::initGraphsAndOpts() { graph->setThrowNaN(true); graph->setDevice(device); - + graph->reserveWorkspaceMB(options_->get("workspace")); graphs_.push_back(graph); @@ -156,7 +156,7 @@ void GraphGroup::decreaseCostScaleFactor() { return; nanSeen_++; - + size_t total = nanSeen_ + noNanSeen_; // do not reduce cost-scaling factor below minimum @@ -177,15 +177,15 @@ void GraphGroup::decreaseCostScaleFactor() { float GraphGroup::checkNanOrNorm(size_t i, size_t begin, size_t end) { auto curGrad = graphs_[i]->params()->grads()->subtensor(begin, end-begin); - + // If costScaling_ then check for NaN values if the costScalingFactor_ is larger than - // the minimum. If a NaN value is seen we exit here and will reduce the factor next and - // this skips an update. - // If costScalingFactor_ is already at the minimum, prune the NaN values away. This replaces + // the minimum. If a NaN value is seen we exit here and will reduce the factor next and + // this skips an update. + // If costScalingFactor_ is already at the minimum, prune the NaN values away. This replaces // NaNs with 0. Updates are not skipped any more. // Regardless of NaNs, we clip +/-inf to the largest corresponding values for the gradient value type. - // This changes the gradient but seems to be quite stable. In effect, for fp16 this is equivalent - // to gradient clipping at (65504.f / costScalingFactor_) which in most cases is still large. + // This changes the gradient but seems to be quite stable. In effect, for fp16 this is equivalent + // to gradient clipping at (65504.f / costScalingFactor_) which in most cases is still large. 
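To make the clipping comment above concrete (numbers are hypothetical): with fp16's largest finite value of 65504 and a cost-scaling factor of 128, clipping +/-inf corresponds to clipping the unscaled gradient at 65504 / 128 = 511.75, which is still far larger than typical per-element gradient magnitudes.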
if(costScaling_ || checkGradientNan_) { bool pruneNaN = !checkGradientNan_ && costScalingFactor_ == costScalingFactorMinimum_; bool clipInf = !checkGradientNan_; @@ -206,7 +206,7 @@ float GraphGroup::checkNanOrNorm(size_t i, size_t begin, size_t end) { auto gNorm = L2Norm(curGrad, graphs_[i]->allocator()); if(isFinite(gNorm) && gNorm > 0.0) return gNorm; - else + else return std::numeric_limits::quiet_NaN(); } @@ -218,10 +218,10 @@ float GraphGroup::executeAndCollectNorm(const std::functionallReduce(&gradNormSquared, &gradNormSquared, 1, MPI_FLOAT, MPI_SUM); // sum all - + if(shardingMode_ == ShardingMode::local) // we already have the correct norm on one device, but we also need to check for NaN gradNormSquared /= (float)mpi_->numMPIProcesses(); - + gradNorm = std::sqrt(gradNormSquared); // redo sqrt } return gradNorm; @@ -245,16 +245,16 @@ float GraphGroup::computeNormalizationFactor(float gNorm, size_t updateTrgWords) if(!isFinite(gNorm)) // we are checking the sanity of the gradient elsewhere return normalizationFactor; - + if(dynamicGradientScaling_) { // make gradient norm invariant to changes in costScalingFactor_, luckily norm(c * g) = c * norm(g) if(costScaling_) gNorm = gNorm / costScalingFactor_; - - // Normalize gradient norm w.r.t. number of labels in batch for statistics, + + // Normalize gradient norm w.r.t. number of labels in batch for statistics, // there should be no gradient normalization before this point, @TODO: check this - gNorm = gNorm / updateTrgWords; - + gNorm = gNorm / updateTrgWords; + size_t window; float gNormAvgTransform, gNormVarTransform, gNormTransform, gNormAvg; if(dynamicGradientScalingUseLogs_) { // tracking the log of the gradient norms rather than the gradient norms itself results in a larger standard deviation as the actual @@ -265,9 +265,9 @@ float GraphGroup::computeNormalizationFactor(float gNorm, size_t updateTrgWords) } else { std::tie(window, gNormAvgTransform, gNormVarTransform) = scheduler_->getGradientNormStats(); gNormTransform = gNorm; // we are not using logs, so we can just use the normal gradient norm - gNormAvg = gNormAvgTransform; // we are getting the actual running average of gradient norms, no transformation needed. + gNormAvg = gNormAvgTransform; // we are getting the actual running average of gradient norms, no transformation needed. } - + auto deltaTransform = gNormTransform - gNormAvgTransform; // compute the difference between the current transformer gradient norm and the running average. auto gNormStdTransform = std::sqrt(gNormVarTransform); // compute STD for the running average of (log) gradient norms. @@ -283,7 +283,7 @@ float GraphGroup::computeNormalizationFactor(float gNorm, size_t updateTrgWords) LOG(debug, "log gradient norms: {} :: {:.4f} - {:.4f} = {:.4f} > {:.4f} * {:.4f} - scaling gradient by {:.4f}", dynamicGradientScalingUseLogs_, gNormTransform, gNormAvgTransform, deltaTransform, dynamicGradientScalingFactorWithFadeout, gNormStdTransform, gNormAvg / gNorm); - normalizationFactor *= gNorm / gNormAvg; // since we later do gradient / normalizationFactor this divides by norm and multiplies by the average, rescaling to the average. + normalizationFactor *= gNorm / gNormAvg; // since we later do gradient / normalizationFactor this divides by norm and multiplies by the average, rescaling to the average. 
} } @@ -322,23 +322,27 @@ void GraphGroup::load(const OptimizerBase::ScatterStateFunc& scatterFn) { std::string modelFileName = options_->get("model"); bool foundModel = false; - // these are structures that get fill in the main process and then broadcasted to other MPI + // these are structures that get filled in the main process and then broadcasted to other MPI processes std::vector items; bool markReloaded = true; if(isMainProcess()) { if(filesystem::exists(modelFileName)) { LOG(info, "Loading model from {}", modelFileName); - foundModel = true; - items = io::loadItems(modelFileName); + foundModel = true; + modelWeights_ = New(modelFileName, io::MmapMode::DontMmap); markReloaded = true; } else if(options_->hasAndNotEmpty("pretrained-model")) { std::string pretrainedModelFileName = options_->get("pretrained-model"); LOG(info, "[training] Initializing model weights with pre-trained model {}", pretrainedModelFileName); foundModel = true; - items = io::loadItems(pretrainedModelFileName); + modelWeights_ = New(pretrainedModelFileName, io::MmapMode::DontMmap); markReloaded = false; } + } else { + // Initialize with dummy and set correct file name in main process. + // If we are running only one process this will always be correctly initialized above. + modelWeights_ = New(); } // if a model file exists, the main process will find it and propagate this information to other MPI nodes @@ -349,27 +353,31 @@ void GraphGroup::load(const OptimizerBase::ScatterStateFunc& scatterFn) { // continue with checkpoint loading if(mpi_) { // broadcast model information to other processes - mpi_->bCast(items); + modelWeights_->loadAndSync(mpi_); mpi_->bCast(&markReloaded, 1, mpi_->getDataType(&markReloaded)); } // handles MPI if(scheduler_) scheduler_->load(modelFileName); - - // we just load it N times from disk (it'll be in disk cache after the first) - // this also allocates memory correctly when calling forward() inside restoreOptimizerState + + // We just load it N times but it'll be in read into modelWeights after the first time. + // This also allocates memory correctly when calling forward() inside restoreOptimizerState size_t i = 0; - for(auto graph : graphs_) - models_[i++]->load(graph, items, markReloaded); + for(auto graph : graphs_) { + models_[i++]->load(graph, modelWeights_, markReloaded); + } // try to restore everything from checkpoint now loadOptimizerState(modelFileName, scatterFn); + + // @TODO: run another graph->forward() to allocate the weights from the checkpoint? + // then we might not need to keep modelWeights_ around. } } } -bool GraphGroup::loadOptimizerState(const std::string& modelFileName, +bool GraphGroup::loadOptimizerState(const std::string& modelFileName, const OptimizerBase::ScatterStateFunc& scatterFn) { /* if model checkpoint is available: @@ -383,22 +391,20 @@ bool GraphGroup::loadOptimizerState(const std::string& modelFileName, bool foundCheckpoint = filesystem::exists(checkpointName); if(mpi_) mpi_->bCast(&foundCheckpoint, 1, mpi_->getDataType(&foundCheckpoint)); - + // all nodes will either continue or exit if(!foundCheckpoint) { LOG(warn, "No checkpoint found, parameters reloaded from last inference model"); return false; // failed to restore } - std::vector items; + auto checkpoint = New(checkpointName, io::MmapMode::DontMmap); + // make sure all nodes receive the same checkpoint data from the main process. 
- if(mpi_) { // only the main process loads the checkpoint and the rest receives a copy - if(isMainProcess()) - items = io::loadItems(checkpointName); - mpi_->bCast(items); - } else { // not doing MPI, so just load the checkpoint from disk - items = io::loadItems(checkpointName); - } + if(mpi_) // only the main process loads the checkpoint and the rest receives a copy + checkpoint->loadAndSync(mpi_); + + auto& items = checkpoint->items(); // @TODO: probably we want to have the list of DeviceIds as an attribute std::vector> backends; @@ -438,7 +444,7 @@ bool GraphGroup::loadOptimizerState(const std::string& modelFileName, void GraphGroup::saveOptimizerState(const std::string& modelFileName, const OptimizerBase::GatherStateFunc& gatherFn) { - // @TODO: change to .checkpoint.npz, would break backwards compat + // @TODO: change to .checkpoint.npz, would break backwards compat std::string checkpointName = modelFileName + ".optimizer.npz"; std::vector items; @@ -446,7 +452,7 @@ void GraphGroup::saveOptimizerState(const std::string& modelFileName, optimizerShards_, gatherFn, isMainProcess()); - + if(isMainProcess()) { // only main process does the actual saving auto found = std::find_if(items.begin(), items.end(), [](const io::Item& item) { return item.name == "master_parameters"; }); @@ -461,7 +467,6 @@ void GraphGroup::saveOptimizerState(const std::string& modelFileName, items.push_back(masterParameters); } - LOG(info, "[training] Saving training checkpoint to {} and {}", modelFileName, checkpointName); io::saveItems(checkpointName, items); } @@ -469,7 +474,7 @@ void GraphGroup::saveOptimizerState(const std::string& modelFileName, void GraphGroup::saveCheckPoint(const std::string& modelFileName, bool isFinal, - bool doSaveOptimizerState, + bool doSaveOptimizerState, const OptimizerBase::GatherStateFunc& gatherOptimizerStateFn) { barrier(); // (for better grouping of log messages) // bring the smoothed model in @@ -525,10 +530,10 @@ void GraphGroup::swapWithSmoothed() { }; comm_->foreach(swap); comm_->allGatherParams(); - + if(shardingMode_ == ShardingMode::local) comm_->broadcastParams(); - + barrier(); } @@ -543,10 +548,10 @@ void GraphGroup::replaceWithSmoothed() { }; comm_->foreach(replace); comm_->allGatherParams(); - + if(shardingMode_ == ShardingMode::local) comm_->broadcastParams(); - + barrier(); } @@ -587,7 +592,7 @@ Ptr GraphGroup::collectStats(Ptr graph, size_t step = options_->get("mini-batch-fit-step"); size_t maxLength = options_->get("max-length"); - + // this should be only one class label per line on input, hence restricting length to 1 std::vector localMaxes(numFiles, maxLength); auto inputTypes = options_->get>("input-types", {}); @@ -623,7 +628,7 @@ Ptr GraphGroup::collectStats(Ptr graph, // Do a binary search for maxmimum batch size that fits into given workspace memory // for a tested sentence length. // We round the maxLength to the next larger step to avoid a situation where we do not - // collect batch statistics for maximum length between steps. However, we do not exceed + // collect batch statistics for maximum length between steps. However, we do not exceed // the actual maxLength even if the rounded value is larger. 
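A small worked example of the rounding described above (hypothetical values): with max-length 120 and mini-batch-fit-step 32, the rounded maximum becomes ceil(120/32) * 32 = 128, so batch statistics are probed at lengths 32, 64, 96 and 128, while the final probe is still capped at the true maximum of 120, as the comment states.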
size_t maxLengthRounded = (size_t)(std::ceil(maxLength / (float)step) * step); for(size_t i = step; i <= maxLengthRounded; i += step) { diff --git a/src/training/graph_group.h b/src/training/graph_group.h index b0c98e3ce..9f70ed81b 100644 --- a/src/training/graph_group.h +++ b/src/training/graph_group.h @@ -46,6 +46,7 @@ class GraphGroup { std::vector> models_; // [deviceIndex] std::vector> optimizerShards_; // [deviceIndex] + Ptr modelWeights_; // handle for model weights, we keep this around to make sure weights are not deallocated while we are still using them Ptr scheduler_; // scheduler that keeps track of how much has been processed bool finalized_{false}; // 'true' if training has completed (further updates are no longer allowed) @@ -105,7 +106,7 @@ class GraphGroup { void saveCheckPoint(const std::string& modelFileName, bool isFinal, - bool doSaveOptimizerState, + bool doSaveOptimizerState, const OptimizerBase::GatherStateFunc& gatherOptimizerStateFn); void saveOptimizerState(const std::string& modelFileName, @@ -117,7 +118,7 @@ class GraphGroup { void swapWithSmoothed(); // This function replaces the current optimizer parameters with the smoothed version (provided smoothing is enabled). - // This is different from swapping (swapping twice restores original state) as the original parameters get overwritten. + // This is different from swapping (swapping twice restores original state) as the original parameters get overwritten. void replaceWithSmoothed(); bool isMainProcess() const { return mpi_->isMainProcess(); } // (we need this test a few times) diff --git a/src/training/validator.cpp b/src/training/validator.cpp index b51f1de3f..44c61171b 100644 --- a/src/training/validator.cpp +++ b/src/training/validator.cpp @@ -354,11 +354,12 @@ float TranslationValidator::validate(const std::vector>& gr // Create scorer auto model = options_->get("model"); + auto modelFile = New(model); std::vector> scorers; for(auto graph : graphs) { auto builder = models::createModelFromOptions(options_, models::usage::translation); - Ptr scorer = New(builder, "", 1.0f, model); + Ptr scorer = New(builder, "", 1.0f, modelFile); scorers.push_back(scorer); // @TODO: should this be done in the contructor? 
} @@ -591,6 +592,7 @@ float SacreBleuValidator::validate(const std::vector>& grap // Create scorer auto model = options_->get("model"); + auto modelFile = New(model); // @TODO: check if required - Temporary options for translation auto mopts = New(); @@ -600,7 +602,7 @@ float SacreBleuValidator::validate(const std::vector>& grap std::vector> scorers; for(auto graph : graphs) { auto builder = models::createModelFromOptions(options_, models::usage::translation); - Ptr scorer = New(builder, "", 1.0f, model); + Ptr scorer = New(builder, "", 1.0f, modelFile); scorers.push_back(scorer); } diff --git a/src/translator/scorers.cpp b/src/translator/scorers.cpp index 7c9745c22..cee57bd85 100644 --- a/src/translator/scorers.cpp +++ b/src/translator/scorers.cpp @@ -5,7 +5,7 @@ namespace marian { Ptr scorerByType(const std::string& fname, float weight, - std::vector items, + Ptr modelFile, Ptr options) { options->set("inference", true); std::string type = options->get("type"); @@ -22,48 +22,25 @@ Ptr scorerByType(const std::string& fname, LOG(info, "Loading scorer of type {} as feature {}", type, fname); - return New(encdec, fname, weight, items); + return New(encdec, fname, weight, modelFile); } -Ptr scorerByType(const std::string& fname, - float weight, - const void* ptr, - Ptr options) { - options->set("inference", true); - std::string type = options->get("type"); - - // @TODO: solve this better - if(type == "lm" && options->has("input")) { - size_t index = options->get>("input").size(); - options->set("index", index); - } - - bool skipCost = options->get("skip-cost"); - auto encdec = models::createModelFromOptions( - options, skipCost ? models::usage::raw : models::usage::translation); - - LOG(info, "Loading scorer of type {} as feature {}", type, fname); - - return New(encdec, fname, weight, ptr); -} - -std::vector> createScorers(Ptr options, const std::vector> models) { +std::vector> createScorers(Ptr options, const std::vector>& modelFiles) { std::vector> scorers; - std::vector weights(models.size(), 1.f); + std::vector weights(modelFiles.size(), 1.f); if(options->hasAndNotEmpty("weights")) weights = options->get>("weights"); bool isPrevRightLeft = false; // if the previous model was a right-to-left model size_t i = 0; - for(auto items : models) { + for(auto modelFile : modelFiles) { std::string fname = "F" + std::to_string(i); // load options specific for the scorer auto modelOptions = options->clone(); if(!options->get("ignore-model-config")) { - YAML::Node modelYaml; - io::getYamlFromModel(modelYaml, "special:model.yml", items); + YAML::Node modelYaml = modelFile->getYamlFromModel("special:model.yml"); if(!modelYaml.IsNull()) { LOG(info, "Loaded model config"); modelOptions->merge(modelYaml, true); @@ -74,7 +51,7 @@ std::vector> createScorers(Ptr options, const std::vector 1 && modelOptions->has("right-left")) { + if(modelFiles.size() > 1 && modelOptions->has("right-left")) { if(i == 0) { isPrevRightLeft = modelOptions->get("right-left"); } else { @@ -85,7 +62,7 @@ std::vector> createScorers(Ptr options, const std::vector> createScorers(Ptr options, const std::vector> createScorers(Ptr options) { - std::vector> model_items; + std::vector> modelFiles; auto models = options->get>("models"); for(auto model : models) { - auto items = io::loadItems(model); - model_items.push_back(std::move(items)); + auto modelFile = New(model); + modelFiles.push_back(modelFile); } - return createScorers(options, model_items); -} - -std::vector> createScorers(Ptr options, const std::vector& ptrs) { - std::vector> 
scorers; - - std::vector weights(ptrs.size(), 1.f); - if(options->hasAndNotEmpty("weights")) - weights = options->get>("weights"); - - size_t i = 0; - for(auto ptr : ptrs) { - std::string fname = "F" + std::to_string(i); - - // load options specific for the scorer - auto modelOptions = options->clone(); - if(!options->get("ignore-model-config")) { - YAML::Node modelYaml; - io::getYamlFromModel(modelYaml, "special:model.yml", ptr); - if(!modelYaml.IsNull()) { - LOG(info, "Loaded model config"); - modelOptions->merge(modelYaml, true); - } - else { - LOG(warn, "No model settings found in model file"); - } - } - - scorers.push_back(scorerByType(fname, weights[i], ptr, modelOptions)); - i++; - } - - return scorers; -} - -std::vector> createScorers(Ptr options, const std::vector& mmaps) { - std::vector ptrs; - for(const auto& mmap : mmaps) { - ABORT_IF(!mmap.is_mapped(), "Memory mapping did not succeed"); - ptrs.push_back(mmap.data()); - } - return createScorers(options, ptrs); + return createScorers(options, modelFiles); } } // namespace marian diff --git a/src/translator/scorers.h b/src/translator/scorers.h index 72ebff5df..333e49108 100644 --- a/src/translator/scorers.h +++ b/src/translator/scorers.h @@ -4,7 +4,6 @@ #include "data/shortlist.h" #include "models/model_factory.h" -#include "3rd_party/mio/mio.hpp" namespace marian { @@ -72,47 +71,25 @@ class ScorerWrapperState : public ScorerState { class ScorerWrapper : public Scorer { private: Ptr encdec_; - std::string fname_; - std::vector items_; - const void* ptr_; + Ptr modelWeights_; public: ScorerWrapper(Ptr encdec, const std::string& name, float weight, - std::vector& items) + Ptr modelFile) : Scorer(name, weight), encdec_(std::static_pointer_cast(encdec)), - items_(items), - ptr_{0} {} - - ScorerWrapper(Ptr encdec, - const std::string& name, - float weight, - const std::string& fname) - : Scorer(name, weight), - encdec_(std::static_pointer_cast(encdec)), - fname_(fname), - ptr_{0} {} - - ScorerWrapper(Ptr encdec, - const std::string& name, - float weight, - const void* ptr) - : Scorer(name, weight), - encdec_(std::static_pointer_cast(encdec)), - ptr_{ptr} {} + modelWeights_(modelFile) + {} virtual ~ScorerWrapper() {} virtual void init(Ptr graph) override { graph->switchParams(getName()); - if(!items_.empty()) - encdec_->load(graph, items_); - else if(ptr_) - encdec_->mmap(graph, ptr_); - else - encdec_->load(graph, fname_); + // @TODO: unify to a single call, this logic should happen in modelFile_ + if(modelWeights_) + encdec_->load(graph, modelWeights_); } virtual void clear(Ptr graph) override { @@ -154,26 +131,7 @@ class ScorerWrapper : public Scorer { } }; -Ptr scorerByType(const std::string& fname, - float weight, - std::vector items, - Ptr options); - -Ptr scorerByType(const std::string& fname, - float weight, - const std::string& model, - Ptr config); - - std::vector> createScorers(Ptr options); -std::vector> createScorers(Ptr options, const std::vector> models); - -Ptr scorerByType(const std::string& fname, - float weight, - const void* ptr, - Ptr config); - -std::vector> createScorers(Ptr options, const std::vector& ptrs); -std::vector> createScorers(Ptr options, const std::vector& mmaps); +std::vector> createScorers(Ptr options, const std::vector>& models); } // namespace marian diff --git a/src/translator/translator.h b/src/translator/translator.h index 498ef65b3..081b06c42 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -19,9 +19,6 @@ #include "models/model_task.h" #include 
"translator/scorers.h" -// currently for diagnostics only, will try to mmap files ending in *.bin suffix when enabled. -#include "3rd_party/mio/mio.hpp" - namespace marian { template @@ -36,9 +33,7 @@ class Translate : public ModelTask { Ptr shortlistGenerator_; size_t numDevices_; - - std::vector model_mmaps_; // map - std::vector> model_items_; // non-mmap + std::vector> modelWeights_; public: Translate(Ptr options) @@ -70,20 +65,18 @@ class Translate : public ModelTask { scorers_.resize(numDevices_); graphs_.resize(numDevices_); - auto models = options->get>("models"); - if(options_->get("model-mmap", false)) { - for(auto model : models) { - ABORT_IF(!io::isBin(model), "Non-binarized models cannot be mmapped"); - LOG(info, "Loading model from {}", model); - model_mmaps_.push_back(mio::mmap_source(model)); - } - } - else { - for(auto model : models) { - LOG(info, "Loading model from {}", model); - auto items = io::loadItems(model); - model_items_.push_back(std::move(items)); - } + auto modelPaths = options->get>("models"); + + // We now opportunistically mmap the model files anyways, but to keep backward compatibility + // with the old --model-mmap option, we now croak if mmap is explicitly requested during decoding + // but not possible in the actual graph, e.g. if --model-mmap is specified but the model file is + // a npz-file or we decode on the GPU (will croak in different places). + bool mmap = options_->get("model-mmap", false); + auto mmapMode = mmap ? io::MmapMode::RequiredMmap : io::MmapMode::OpportunisticMmap; + + for(auto modelPath : modelPaths) { + LOG(info, "Loading model from {}", modelPath); + modelWeights_.push_back(New(modelPath, mmapMode)); } size_t id = 0; @@ -101,13 +94,7 @@ class Translate : public ModelTask { graph->reserveWorkspaceMB(options_->get("workspace")); graphs_[id] = graph; - std::vector> scorers; - if(options_->get("model-mmap", false)) { - scorers = createScorers(options_, model_mmaps_); - } - else { - scorers = createScorers(options_, model_items_); - } + std::vector> scorers = createScorers(options_, modelWeights_); for(auto scorer : scorers) { scorer->init(graph); @@ -242,6 +229,8 @@ class TranslateService : public ModelServiceTask { Ptr trgVocab_; Ptr shortlistGenerator_; + std::vector> modelFiles_; + size_t numDevices_; public: @@ -279,11 +268,9 @@ class TranslateService : public ModelServiceTask { numDevices_ = devices.size(); // preload models - std::vector> model_items_; auto models = options->get>("models"); for(auto model : models) { - auto items = io::loadItems(model); - model_items_.push_back(std::move(items)); + modelFiles_.push_back(New(model)); } // initialize scorers @@ -301,7 +288,7 @@ class TranslateService : public ModelServiceTask { graph->reserveWorkspaceMB(options_->get("workspace")); graphs_.push_back(graph); - auto scorers = createScorers(options_, model_items_); + auto scorers = createScorers(options_, modelFiles_); for(auto scorer : scorers) { scorer->init(graph); if(shortlistGenerator_) From 1656b9c0f6e238d3f39d5b43f874ee552f5eb49c Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Wed, 24 Jan 2024 01:21:51 +0000 Subject: [PATCH 09/26] Merged PR 32600: Full Comet-Kiwi implementation, partial xComet-XL/XXL This PR implements * Comet-Kiwi - fully functional * xComet-XL and xComet-XXL - scores for regressor part fully matching, MQM partial scores not implemented yet. 
--- CHANGELOG.md | 2 + VERSION | 2 +- scripts/comet/comet2marian.py | 111 ++++++++++++++++------ src/common/config_parser.cpp | 6 ++ src/data/corpus_base.cpp | 36 +++---- src/data/corpus_base.h | 24 ++--- src/layers_new/neuralnet.h | 80 ++++++++-------- src/layers_new/transformer.h | 174 +++++++++++++++++++--------------- src/models/comet_qe.h | 111 +++++++++++++--------- src/models/model_factory.cpp | 67 ++++++++----- src/training/graph_group.cpp | 4 +- src/translator/scorers.h | 5 +- 12 files changed, 370 insertions(+), 252 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 772349e3d..854162b6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Fixed compilation with clang 16.0.6 ### Added +- Added implementation of COMET-KIWI +- Added implementation of xCOMET-XL/XXL regressor parts (MQM interpolation missing for now) - Added implementation of COMET-22 (reference-based) model and conversion - Added sparsemax operator (slow version) - Added sampling variants nucleus and epsilon, e.g. `--output-sampling nucleus 0.9` and `--output-sampling epsilon 0.02`, respectively. diff --git a/VERSION b/VERSION index 2f107c43d..5a8f2d3ca 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.19 +v1.12.20 diff --git a/scripts/comet/comet2marian.py b/scripts/comet/comet2marian.py index 6b4f557db..09c369260 100755 --- a/scripts/comet/comet2marian.py +++ b/scripts/comet/comet2marian.py @@ -14,7 +14,8 @@ # supported_comets = [m for m in available_metrics if 'qe' in m.lower()] supported_comets = [ 'wmt20-comet-qe-da', 'wmt20-comet-qe-da-v2', 'wmt21-comet-qe-mqm', 'wmt21-comet-qe-da', - 'wmt20-comet-da', 'wmt21-comet-da', 'Unbabel/wmt22-comet-da' + 'wmt20-comet-da', 'wmt21-comet-da', 'Unbabel/wmt22-comet-da', 'Unbabel/wmt22-cometkiwi-da', + 'Unbabel/XCOMET-XL', 'Unbabel/XCOMET-XXL' ] log.basicConfig(level=log.INFO) @@ -92,6 +93,12 @@ def load_comet_model(model_path): config["type"] = "comet-qe" elif model_type == "XLMRobertaModel": config["type"] = "comet-qe" +elif model_type == "UnifiedMetric" or model_type == "XCOMETMetric": + config["type"] = "comet-unified" + config["input-join-fields"] = True + config["separator-symbol"] = "" + config["comet-use-separator"] = True + config["comet-pool"] = "cls" else: raise Exception(f'Unknown type of model {model_type}') @@ -100,17 +107,32 @@ def load_comet_model(model_path): config["transformer-ffn-depth"] = 2 config["transformer-ffn-activation"] = "gelu" # figure this out dynamically config["transformer-train-position-embeddings"] = True -config["transformer-preprocess"] = "" -config["transformer-postprocess"] = "dan" -config["transformer-postprocess-emb"] = "nd" + +# Roberta-XXL (hence XCOMET-XXL) has pre-norm +if model_type == "XCOMETMetric": # @TODO: make this depend on RobertaXL/XXL rather than model_type + config["transformer-preprocess"] = "n" + config["transformer-postprocess"] = "da" + config["transformer-postprocess-emb"] = "" + config["transformer-postprocess-top"] = "n" +else: + config["transformer-preprocess"] = "" + config["transformer-postprocess"] = "dan" + config["transformer-postprocess-emb"] = "nd" + config["bert-train-type-embeddings"] = False config["bert-type-vocab-size"] = 0 config["comet-prepend-zero"] = True +print(cometModel.hparams) + config["comet-mix"] = cometModel.hparams.get("layer") == "mix" config["comet-mix-norm"] = cometModel.hparams.get('layer_norm', False) config["comet-mix-transformation"] = 
cometModel.hparams.get("layer_transformation", "softmax"); +# they have a bug in their code that makes this always true +if model_type == "UnifiedMetric" or model_type == "XCOMETMetric": + config["comet-mix-transformation"] = "softmax" + if not args.roberta: config["comet-final-sigmoid"] = args.add_sigmoid config["comet-pooler-ffn"] = [2048, 1024] @@ -132,26 +154,32 @@ def yaml2np(config): return npDesc def convert(pd, srcs, trg, transpose=True, bias=False): - if len(srcs) == 1: - for src in srcs: - num = pd[src].detach().numpy() - if bias: - marianModel[trg] = num.copy() - else: - if transpose: - marianModel[trg] = np.transpose(num).copy() - else: - marianModel[trg] = num - else: # path that joins matrices together for fused self-attention - nums = [pd[src].detach().numpy() for src in srcs] + # make sure exactly one element of list srcs exists in dictionary pd + found = sum([src in pd for src in srcs]) + assert found == 1, f"Found {found} of {srcs} in {pd}" + + for src in srcs: + if src not in pd: + continue + num = pd[src].detach().numpy() if bias: - nums = [np.transpose(num) for num in nums] - marianModel[trg] = np.stack(nums, axis=0).copy() + marianModel[trg] = num.copy() + else: + if transpose: + marianModel[trg] = np.transpose(num).copy() + else: + marianModel[trg] = num + + +def match(regex, string): + import re + return re.search(regex, string) is not None def extract(layer, nth, level): name = type(layer).__name__ print(" " * level, nth, name) - if "RobertaLayer" in name: + + if match(r"Roberta(XL+)?Layer", name): pd = dict(layer.named_parameters()) for n in pd: print(" " * (level + 1), n, pd[n].shape) @@ -178,8 +206,12 @@ def extract(layer, nth, level): convert(pd, ["attention.output.dense.bias"], f"{blockPrefix}->selfAttention->oProj->bias", bias=True) # self-attention layer-norm - convert(pd, ["attention.output.LayerNorm.weight"], f"{blockPrefix}->postprocessor->norm->weight", bias=True) - convert(pd, ["attention.output.LayerNorm.bias"], f"{blockPrefix}->postprocessor->norm->bias", bias=True) + if model_type == "XCOMETMetric": # @TODO: make this depend on RobertaXL/XXL rather than model_type + convert(pd, ["attention.self_attn_layer_norm.weight"], f"{blockPrefix}->preprocessor->norm->weight", bias=True) + convert(pd, ["attention.self_attn_layer_norm.bias"], f"{blockPrefix}->preprocessor->norm->bias", bias=True) + else: + convert(pd, ["attention.output.LayerNorm.weight"], f"{blockPrefix}->postprocessor->norm->weight", bias=True) + convert(pd, ["attention.output.LayerNorm.bias"], f"{blockPrefix}->postprocessor->norm->bias", bias=True) # ffn # first ffn layer @@ -190,15 +222,20 @@ def extract(layer, nth, level): # second ffn layer convert(pd, ["output.dense.weight"], f"{blockPrefix}->layers->at(3)->as()->weight") convert(pd, ["output.dense.bias"], f"{blockPrefix}->layers->at(3)->as()->bias", bias=True) + # ffn layer-norm - convert(pd, ["output.LayerNorm.weight"], f"{blockPrefix}->postprocessor->norm->weight", bias=True) - convert(pd, ["output.LayerNorm.bias"], f"{blockPrefix}->postprocessor->norm->bias", bias=True) + if model_type == "XCOMETMetric": # @TODO: make this depend on RobertaXL/XXL rather than model_type + convert(pd, ["LayerNorm.weight"], f"{blockPrefix}->preprocessor->norm->weight", bias=True) + convert(pd, ["LayerNorm.bias"], f"{blockPrefix}->preprocessor->norm->bias", bias=True) + else: + convert(pd, ["output.LayerNorm.weight"], f"{blockPrefix}->postprocessor->norm->weight", bias=True) + convert(pd, ["output.LayerNorm.bias"], 
f"{blockPrefix}->postprocessor->norm->bias", bias=True) config["transformer-dim-ffn"] = pd["intermediate.dense.bias"].shape[-1] config["transformer-heads"] = layer.attention.self.num_attention_heads config["enc-depth"] += 1 - elif "RobertaEmbeddings" in name: + elif match(r"Roberta(XL+)?Embeddings", name): for n, p in layer.named_parameters(): print(" " * (level + 1), n, p.shape) pd = dict(layer.named_parameters()) @@ -208,6 +245,10 @@ def extract(layer, nth, level): npWemb = npWembTemp[1:-1, :].copy() npWemb[0, :] = npWembTemp[0, :] npWemb[2, :] = npWembTemp[2, :] + + # XCOMET-XXL has some additional tokens (why?), we truncate it back to normal size + npWemb = npWemb[0:250000, :].copy() + marianModel["Wemb"] = npWemb prefix = "CometEncoder" @@ -217,14 +258,26 @@ def extract(layer, nth, level): npPos = npPos[2:, :].copy() marianModel[f"{prefix}->encoder->positionEmbedding->embeddings"] = npPos - # post-embedding layer normalization - convert(pd, ["LayerNorm.weight"], f"{prefix}->encoder->preprocessor->norm->weight", bias=True) - convert(pd, ["LayerNorm.bias"], f"{prefix}->encoder->preprocessor->norm->bias", bias=True) - config["dim-emb"] = npWemb.shape[1] config["dim-vocabs"] = [ npWemb.shape[0] ] config["max-length"] = npPos.shape[0] + elif match(r"Roberta(XL+)?Model", name): + pd = dict(layer.named_parameters()) + prefix = "CometEncoder" + + # post-embedding layer normalization + if model_type == "XCOMETMetric": # @TODO: make this depend on RobertaXL/XXL rather than model_type + convert(pd, ["encoder.LayerNorm.weight"], f"{prefix}->encoder->postprocessor->norm->weight", bias=True) + convert(pd, ["encoder.LayerNorm.bias"], f"{prefix}->encoder->postprocessor->norm->bias", bias=True) + else: + convert(pd, ["embeddings.LayerNorm.weight"], f"{prefix}->encoder->preprocessor->norm->weight", bias=True) + convert(pd, ["embeddings.LayerNorm.bias"], f"{prefix}->encoder->preprocessor->norm->bias", bias=True) + + # on this level we actually keep recursing + recurse(layer, level + 1) + + elif name == "LayerwiseAttention": for n, p in layer.named_parameters(): print(" " * (level + 1), n, p.shape) @@ -232,7 +285,7 @@ def extract(layer, nth, level): # mix layers weights = [] - for i in range(25): + for i in range(config["enc-depth"] + 1): weights.append(pd[f"scalar_parameters.{i}"].detach().numpy()) marianModel["CometEncoder->encoder->weights"] = np.concatenate(weights).copy() diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index ec85e40ad..741a3915c 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -249,6 +249,10 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { "Possible values: sequence, class, alignment, weight. 
" "You need to provide one type per input file (if --train-sets) or per TSV field (if --tsv).", {}); + cli.add("--input-join-fields", + "Join input fields (from files or TSV) into a single sequence " + "(mostly used single-encoder models like BLEURT and COMET-KIWI)", + false); cli.add("--best-deep", "Use Edinburgh deep RNN configuration (s2s)"); cli.add("--tied-embeddings", @@ -364,6 +368,7 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { cli.add("--comet-mix", "Mix encoder layers to produce embedding"); cli.add("--comet-mix-norm", "Normalize layers prior to mixing"); + cli.add("--comet-pool", "Pooling operation over time dimension (avg, cls, max)", "avg"); cli.add("--comet-mix-transformation", "Which transformation to apply to layer mixing (softmax [default] or sparsemax)", "softmax"); cli.add("--comet-dropout", "Dropout for pooler layers", 0.1f); cli.add("--comet-mixup", "Alpha parameter for Beta distribution for mixup", 0.0f); @@ -371,6 +376,7 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { cli.add("--comet-augment-bad", "Fraction of bad examples added via shuffling for class/label 0.f", 0.0f); cli.add>("--comet-pooler-ffn", "Hidden sizes for comet pooler", {2048, 1024}); cli.add("--comet-prepend-zero", "Add a start symbol to batch entries"); + cli.add("--comet-use-separator", "Add a sentence separator to batch entries when joining source, target and mt", false); #ifdef CUDNN cli.add("--char-stride", diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp index 0ef804b1c..47381d9b9 100644 --- a/src/data/corpus_base.cpp +++ b/src/data/corpus_base.cpp @@ -60,9 +60,10 @@ CorpusBase::CorpusBase(const std::vector& paths, maxLengthCrop_(options_->get("max-length-crop")), rightLeft_(options_->get("right-left")), prependZero_(options_->get("comet-prepend-zero", false)), + joinFields_(options_->get("input-join-fields", false)), + insertSeparator_(options_->get("comet-use-separator", false)), tsv_(options_->get("tsv", false)), - tsvNumInputFields_(getNumberOfTSVInputFields(options)), - joinFields_(options_->get("input-join-fields", false)) { + tsvNumInputFields_(getNumberOfTSVInputFields(options)) { // TODO: support passing only one vocab file if we have fully-tied embeddings if(tsv_) { ABORT_IF(tsvNumInputFields_ != vocabs_.size(), @@ -87,9 +88,10 @@ CorpusBase::CorpusBase(Ptr options, bool translate, size_t seed) maxLengthCrop_(options_->get("max-length-crop")), rightLeft_(options_->get("right-left")), prependZero_(options_->get("comet-prepend-zero", false)), + joinFields_(options_->get("input-join-fields", false)), + insertSeparator_(options_->get("comet-use-separator", false)), tsv_(options_->get("tsv", false)), - tsvNumInputFields_(getNumberOfTSVInputFields(options)), - joinFields_(options_->get("input-join-fields", false)) { + tsvNumInputFields_(getNumberOfTSVInputFields(options)) { bool training = !translate; if(training) @@ -361,7 +363,7 @@ CorpusBase::CorpusBase(Ptr options, bool translate, size_t seed) Ptr vocab = New(options_, i); vocabDims[i] = (int) vocab->load(vocabPaths[i], maxVocabs[i]); vocabs_.emplace_back(vocab); - } + } // TODO: As above, this is not nice as it modifies the option object and needs to expose the changes // outside the corpus as models need to know about the vocabulary size; extract the vocab @@ -430,18 +432,20 @@ void CorpusBase::addWordsToSentenceTuple(const std::string& line, // This handles adding starts symbols for COMET () and BERT/BLEURT ([CLS]) bool prepend = prependZero_ && (!joinFields_ || (joinFields_ && batchIndex 
== 0)); - if(prepend && inputTypes[batchIndex] == "sequence") { - auto prependedWord = Word::fromWordIndex(0); - words.insert(words.begin(), prependedWord); - } - + if(prepend && inputTypes[batchIndex] == "sequence") + words.insert(words.begin(), Word::fromWordIndex(0)); + + bool prependSep = insertSeparator_ && joinFields_ && batchIndex > 0; + if(prependSep && inputTypes[batchIndex] == "sequence") + words.insert(words.begin(), vocabs_[batchIndex]->getSepId()); + // if fields are joined and the current sentence is not the first one, we need to make sure that // the current sentence is not longer than the maximum length minus the length of the previous sentence - // (minus 1 for the separator token) + // (minus 1 for the separator token or 2 if we also add a separator token) size_t localMaxLength = maxLength_; if(joinFields_ && !tup.empty()) - localMaxLength = std::max(1, (int)maxLength_ - (int)tup.back().size()); - + localMaxLength = std::max(1 + (int)prependSep, (int)maxLength_ - (int)tup.back().size()); + // if the current sentence is longer than the maximum length, we need to crop it if(maxLengthCrop_ && words.size() > localMaxLength) { words.resize(localMaxLength); @@ -472,7 +476,7 @@ void CorpusBase::addAlignmentToSentenceTuple(const std::string& line, size_t srcEosPos = tup[0].size() - 1; size_t tgtEosPos = tup[1].size() - 1; - auto align = WordAlignment(line, srcEosPos, tgtEosPos); + auto align = WordAlignment(line, srcEosPos, tgtEosPos); tup.setAlignment(align); } @@ -497,10 +501,10 @@ void CorpusBase::addWeightsToSentenceTuple(const std::string& line, SentenceTupl void CorpusBase::addAlignmentsToBatch(Ptr batch, const std::vector& batchVector) { std::vector aligns; - + int dimBatch = (int)batch->getSentenceIds().size(); aligns.reserve(dimBatch); - + for(int b = 0; b < dimBatch; ++b) { // If the batch vector is altered within marian by, for example, case augmentation, // the guided alignments we received for this tuple cease to be valid. diff --git a/src/data/corpus_base.h b/src/data/corpus_base.h index 1e28da7f4..074689804 100644 --- a/src/data/corpus_base.h +++ b/src/data/corpus_base.h @@ -56,15 +56,15 @@ class SentenceTupleImpl { * @brief Returns whether this Tuple was altered or augmented from what * was provided to Marian in input. */ - bool isAltered() const { - return altered_; + bool isAltered() const { + return altered_; } /** * @brief Mark that this Tuple was internally altered or augmented by Marian */ - void markAltered() { - altered_ = true; + void markAltered() { + altered_ = true; } /** @@ -79,7 +79,7 @@ class SentenceTupleImpl { * * @param words A vector of word indices. */ - void appendToBack(const Words& words) { + void appendToBack(const Words& words) { if(tuple_.empty()) { tuple_.push_back(words); } else { @@ -155,11 +155,11 @@ class SentenceTuple { * @brief Creates an empty tuple with no associated future. 
*/ SentenceTuple() {} - - SentenceTuple(const SentenceTupleImpl& tupImpl) + + SentenceTuple(const SentenceTupleImpl& tupImpl) : impl_(std::make_shared(tupImpl)) {} - SentenceTuple(std::future&& fImpl) + SentenceTuple(std::future&& fImpl) : fImpl_(new std::future(std::move(fImpl))) {} SentenceTupleImpl& get() const { @@ -466,7 +466,7 @@ class CorpusBatch : public Batch { if(options->get("guided-alignment", std::string("none")) != "none") { // @TODO: if > 1 encoder, verify that all encoders have the same sentence lengths - + std::vector alignment; for(size_t k = 0; k < batchSize; ++k) { data::WordAlignment perSentence; @@ -658,13 +658,15 @@ class CorpusBase : public DatasetBase separator will demark the fields (mostly used for BLEURT and COMET-KIWI) + bool insertSeparator_{false}; // when joining fields with joinFields_, additionally use this separator (mostly used for COMET-KIWI) + bool tsv_{false}; // true if the input is a single file with tab-separated values size_t tsvNumInputFields_{0}; // number of fields from the TSV input that are associated // with vocabs, i.e. excluding fields with alignment or // weights, only if --tsv - bool joinFields_{false}; // if true when given a TSV file or multiple inputs, join them together with a specified separator. - /** * @brief Determine the number of fields from the TSV input that are associated with * vocabs, i.e. excluding fields that contain alignment or weights diff --git a/src/layers_new/neuralnet.h b/src/layers_new/neuralnet.h index 923838aa0..c0912634f 100644 --- a/src/layers_new/neuralnet.h +++ b/src/layers_new/neuralnet.h @@ -9,15 +9,15 @@ namespace nn { static inline Expr swapTimeBatch(Expr input) { return swapAxes(atleast_4d(input), -2, -3); } /** - * A generic Activation function layer. Any unary Marian operator or function accepted by - * `std::function` can be turned into an activation function like this: + * A generic Activation function layer. Any unary Marian operator or function accepted by + * `std::function` can be turned into an activation function like this: ``` auto reluLayer = New(graph, (Expr(*)(Expr))relu) ``` - * The function pointer cast may be required to disambiguate the operator name if operators - * of the same name but with a different sets of parameters exist, otherwise it can be dropped + * The function pointer cast may be required to disambiguate the operator name if operators + * of the same name but with a different sets of parameters exist, otherwise it can be dropped * or replaced with a more readable lambda function. - * + * * `Activation` will also accept lambdas for more complex activations: ``` // a reasonably accurate approximation of GELU @@ -30,11 +30,11 @@ class Activation : public Layer, public IUnaryLayer { public: Activation(Ptr graph, - const std::function& actFn) + const std::function& actFn) : Layer(graph), actFn(actFn) {} virtual ~Activation() = default; - + Expr apply(Expr x) const override { return actFn(x); } @@ -68,7 +68,7 @@ struct Swish final : public Activation { // Factory for activation function layers from name as string. 
Ptr activationLayerByName(Ptr graph, const std::string& actName); -// Applies a linear transformation to the incoming data: y = xA^T + b +// Applies a linear transformation to the incoming data: y = xA^T + b struct Linear : public Layer, public IUnaryLayer { Expr weight; Expr bias; @@ -79,7 +79,7 @@ struct Linear : public Layer, public IUnaryLayer { Ptr init; // Typical constructor that can take an initializer function - Linear(Ptr graph, + Linear(Ptr graph, int dimOut, bool useBias = true, bool transposed = false, @@ -108,22 +108,22 @@ struct Linear : public Layer, public IUnaryLayer { } else { registerParameterLazy(weight, Shape({ dimIn, dimOut }), init); } - + if(useBias) { registerParameterLazy(bias, Shape({ dimOut }), inits::zeros()); } Type outputType = x->value_type(); if(useBias) - return marian::affine(x, - marian::cast(weight, outputType), - marian::cast(bias, outputType), - /*transA=*/false, + return marian::affine(x, + marian::cast(weight, outputType), + marian::cast(bias, outputType), + /*transA=*/false, /*transB=*/transposed); else - return marian::dot(x, - marian::cast(weight, outputType), - /*transA=*/false, + return marian::dot(x, + marian::cast(weight, outputType), + /*transA=*/false, /*transB=*/transposed); } }; @@ -131,15 +131,15 @@ struct Linear : public Layer, public IUnaryLayer { struct Dropout final : public Layer, public IUnaryLayer { float dropoutProbability; Shape::Axes dropoutAxes{{-2, -1}}; - - Dropout(Ptr graph, + + Dropout(Ptr graph, float dropoutProbability, - const Shape::Axes& dropoutAxes) + const Shape::Axes& dropoutAxes) : Layer(graph), dropoutProbability(dropoutProbability), dropoutAxes(dropoutAxes) {} - Dropout(Ptr graph, - float dropoutProbability) + Dropout(Ptr graph, + float dropoutProbability) : Layer(graph), dropoutProbability(dropoutProbability) {} @@ -170,24 +170,24 @@ struct LinearReluDropout final : public Linear { Shape::Axes dropoutAxes{{-2, -1}}; // Typical constructor that can take an initializer function - LinearReluDropout(Ptr graph, + LinearReluDropout(Ptr graph, int dimOut, float dropoutProbability, bool useBias = true, bool transposed = false, Ptr init = inits::glorotUniform()) - : Linear(graph, dimOut, useBias, transposed, init), + : Linear(graph, dimOut, useBias, transposed, init), dropoutProbability(dropoutProbability) {} // Typical constructor that can take an initializer function - LinearReluDropout(Ptr graph, + LinearReluDropout(Ptr graph, int dimOut, float dropoutProbability, const Shape::Axes& dropoutAxes, bool useBias = true, bool transposed = false, Ptr init = inits::glorotUniform()) - : Linear(graph, dimOut, useBias, transposed, init), + : Linear(graph, dimOut, useBias, transposed, init), dropoutProbability(dropoutProbability), dropoutAxes(dropoutAxes) {} Expr apply(Expr x) const override { @@ -199,7 +199,7 @@ struct LinearReluDropout final : public Linear { } else { registerParameterLazy(weight, Shape({ dimIn, dimOut }), init); } - + if(useBias) { registerParameterLazy(bias, Shape({ dimOut }), inits::zeros()); } @@ -223,21 +223,21 @@ struct LinearReluDropout final : public Linear { struct Norm : public Layer, public IUnaryLayer { Expr weight{nullptr}; // = scale Expr bias{nullptr}; - + bool useScale{true}; bool useBias{true}; bool elementwise{true}; float eps{1e-5f}; - Norm(Ptr graph, - bool useScale = true, - bool useBias = true, - bool elementwise = true, + Norm(Ptr graph, + bool useScale = true, + bool useBias = true, + bool elementwise = true, float eps = 1e-5f) - : Layer(graph), - useScale(useScale), - 
useBias(useBias), - elementwise(elementwise), + : Layer(graph), + useScale(useScale), + useBias(useBias), + elementwise(elementwise), eps(eps) {} virtual Expr getScale(int dimModel) const { @@ -264,7 +264,7 @@ struct Norm : public Layer, public IUnaryLayer { }; struct LayerNorm : public Norm { - LayerNorm(Ptr graph, + LayerNorm(Ptr graph, bool useScale = true, bool useBias = true, bool elementwise = true, @@ -281,9 +281,9 @@ struct LayerNorm : public Norm { }; struct RMSNorm : public Norm { - RMSNorm(Ptr graph, - bool useScale = true, - bool useBias = true, + RMSNorm(Ptr graph, + bool useScale = true, + bool useBias = true, bool elementwise = true, float eps = 1e-5f) : Norm(graph, useScale, useBias, elementwise, eps) diff --git a/src/layers_new/transformer.h b/src/layers_new/transformer.h index ccce35d13..d80fe102f 100644 --- a/src/layers_new/transformer.h +++ b/src/layers_new/transformer.h @@ -24,7 +24,7 @@ struct TransformerPrePostProcessor final : public Layer, public IBinaryLayer { TransformerPrePostProcessor(Ptr graph, const std::string& actionDesc, float dropoutProbablity) - : Layer(graph), + : Layer(graph), actionDesc(actionDesc) { for(char a : actionDesc) { @@ -45,11 +45,11 @@ struct TransformerPrePostProcessor final : public Layer, public IBinaryLayer { } virtual ~TransformerPrePostProcessor() = default; - + Expr apply(Expr input, Expr previous = nullptr) const override { Expr output = input; for(char action : actionDesc) { - if(action == 'd') + if(action == 'd') output = dropout->apply(output); else if(action == 'a' && previous) output = output + previous; @@ -64,7 +64,7 @@ struct TransformerPrePostProcessor final : public Layer, public IBinaryLayer { } }; -/** +/** * This is a typical transformer self-attention block. The default configuration will * use a multi-head multiplicative self-attention layer, followed by dropout, the skip * connection and layer normalization (dan) in the post-processor. The pre-processor does @@ -76,13 +76,13 @@ class TransformerSelfAttentionBlock final : public LayerWithOptions, public IBin Ptr selfAttention; Ptr postprocessor; - TransformerSelfAttentionBlock(Ptr graph, + TransformerSelfAttentionBlock(Ptr graph, Ptr options) : LayerWithOptions(graph, options) { preprocessor = New( - graph, - opt("transformer-preprocess", ""), + graph, + opt("transformer-preprocess", ""), opt("transformer-dropout", 0.f)); registerLayer(preprocessor); @@ -90,8 +90,8 @@ class TransformerSelfAttentionBlock final : public LayerWithOptions, public IBin registerLayer(selfAttention); postprocessor = New( - graph, - opt("transformer-postprocess", ""), + graph, + opt("transformer-postprocess", ""), opt("transformer-dropout", 0.f)); registerLayer(postprocessor); } @@ -104,9 +104,9 @@ class TransformerSelfAttentionBlock final : public LayerWithOptions, public IBin } }; -/** +/** * This is a typical transformer filter (1-dimensional convolution) block. The default configuration will - * use scale up to a larger dimension, apply a ReLU activation and scale down again, followed by dropout, + * use scale up to a larger dimension, apply a ReLU activation and scale down again, followed by dropout, * the skip connection and layer normalization (dan) in the post-processor. The pre-processor does * nothing in the default configuration. 
*/ @@ -115,18 +115,18 @@ struct TransformerFilterBlock final : public LayerWithOptions, public IUnaryLaye Ptr layers; Ptr postprocessor; bool isDecoder{false}; - - TransformerFilterBlock(Ptr graph, + + TransformerFilterBlock(Ptr graph, Ptr options, bool isDecoder = false) : LayerWithOptions(graph, options), isDecoder(isDecoder) { preprocessor = New( - graph, - opt("transformer-preprocess", ""), + graph, + opt("transformer-preprocess", ""), opt("transformer-dropout", 0.f)); registerLayer(preprocessor); - + int modelDim = opt("transformer-dim-model", opt("dim-emb")); int ffnDim = opt("transformer-dim-ffn"); if(isDecoder && opt("transformer-decoder-dim-ffn") != 0) @@ -144,7 +144,7 @@ struct TransformerFilterBlock final : public LayerWithOptions, public IUnaryLaye // assemble filter of given depth layers = New(graph); registerLayer(layers); - + if(actName == "relu") { layers->append(New(graph, ffnDim, ffnDropoutProbability)); } else { @@ -164,7 +164,7 @@ struct TransformerFilterBlock final : public LayerWithOptions, public IUnaryLaye layers->append(New(graph, modelDim)); postprocessor = New( - graph, + graph, opt("transformer-postprocess", ""), opt("transformer-dropout", 0.f)); registerLayer(postprocessor); @@ -178,7 +178,7 @@ struct TransformerFilterBlock final : public LayerWithOptions, public IUnaryLaye } }; -/** +/** * A full transformer encoder layer consists of a self-attention block followed by * a filter block. Skip connections etc. are handled inside the blocks, see above. */ @@ -186,13 +186,13 @@ struct TransformerEncoderLayer final : public LayerWithOptions, public IBinaryLa Ptr selfAttentionBlock; Ptr filterBlock; - TransformerEncoderLayer(Ptr graph, + TransformerEncoderLayer(Ptr graph, Ptr options) : LayerWithOptions(graph, options) { selfAttentionBlock = New(graph, options); registerLayer(selfAttentionBlock); - + filterBlock = New(graph, options); registerLayer(filterBlock); } @@ -200,30 +200,38 @@ struct TransformerEncoderLayer final : public LayerWithOptions, public IBinaryLa Expr apply(Expr input, Expr mask = nullptr) const override { Expr output = selfAttentionBlock->apply(input, mask); output = filterBlock->apply(output); - + checkpoint(output); // A full transformer block is a good point for gradient checkpointing (currently manual) - + return output; } }; /** - * A full transformer encoder stack. Before applying multiple transformer layers (depth of the encoder), we + * A full transformer encoder stack. Before applying multiple transformer layers (depth of the encoder), we * add positional embeddings and apply post-processing actions to the combined embeddings. Due to backward-compatiblity - * with RNN models and for easier beam-search we transpose batch and time dimensions on input and output. + * with RNN models and for easier beam-search we transpose batch and time dimensions on input and output. * @TODO: get rid of these transposes. */ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { +public: Ptr positionEmbedding; Ptr maskProcessor; Ptr preprocessor; Ptr layers; Ptr postprocessor; - TransformerEncoder(Ptr graph, +protected: // @TODO: should this be public? 
+ // collect hidden states as we step through the layers + mutable bool keepHiddenStates{false}; + mutable std::vector hiddenStates; + // apply this function to hidden states before collecting them + mutable std::function hiddenTransformFn = [](Expr x) { return x; }; + +public: + TransformerEncoder(Ptr graph, Ptr options) - : LayerWithOptions(graph, options) - { + : LayerWithOptions(graph, options) { if(!opt("transformer-disable-position-embeddings", false)) { positionEmbedding = positionEmbeddingFromOptions(graph, options, /*positionAxis=*/-2); registerLayer(positionEmbedding); @@ -233,8 +241,8 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { registerLayer(maskProcessor); preprocessor = New( - graph, - opt("transformer-postprocess-emb", ""), + graph, + opt("transformer-postprocess-emb", ""), opt("transformer-dropout", 0.f)); registerLayer(preprocessor); @@ -242,15 +250,15 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { registerLayer(layers); for(int i = 0; i < opt("enc-depth"); ++i) { auto transformerEncoderLayer = New(graph, options); - // example of changing linear layer init functions burried deep in the model + // example of changing linear layer init functions burried deep in the model if(opt("transformer-depth-scaling", false)) for(auto linear : transformerEncoderLayer->allLayers()) linear->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1)); - + if(opt("transformer-no-bias", false)) for(auto linear : transformerEncoderLayer->allLayers()) linear->useBias = false; - + if(opt("transformer-no-affine", false)) { for(auto norm : transformerEncoderLayer->allLayers()) { norm->useScale = false; @@ -261,8 +269,8 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { } postprocessor = New( - graph, - opt("transformer-postprocess-top", ""), + graph, + opt("transformer-postprocess-top", ""), opt("transformer-dropout", 0.f)); registerLayer(postprocessor); } @@ -274,9 +282,9 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { // dimensions. This order is more natural for the transformer, but more difficult to handle // during beam search or when using RNNs. Hence the input/output transpositions here. - // @TODO: still worth to review this whole transpose business across the tool. In the - // decoder state, Frank added information about batchMajor/timeMajor orientation. If we - // do that everywhere we can detect inconsistencies automatically. + // @TODO: still worth to review this whole transpose business across the tool. In the + // decoder state, Frank added information about batchMajor/timeMajor orientation. If we + // do that everywhere we can detect inconsistencies automatically. // reorganize batch and timestep auto output = swapTimeBatch(input); // [1, dimBatch, dimSrcWords, dimModel] if(mask) @@ -296,11 +304,16 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { auto logMask = maskProcessor->apply(output, mask); // traverse the layers, use the same mask for each - for(auto layer : *layers) + for(auto layer : *layers) { + if(keepHiddenStates) // note, with pre-norm, the hidden states will not be normed here. + hiddenStates.push_back(hiddenTransformFn(output)); output = layer->apply(output, logMask); + } // apply final postprocessor if required, e.g. 
final layer-norm for pre-norm or final skip connection output = postprocessor->apply(output, prevOutput); + if(keepHiddenStates) + hiddenStates.push_back(hiddenTransformFn(output)); // restore organization of batch and time steps. This is currently required // to make RNN-based decoders and beam search work with this. We are looking @@ -313,9 +326,14 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { output = swapTimeBatch(output); // [beam depth=1, max length, batch size, vector dim] return output; } + + virtual void clear() override { + LayerWithOptions::clear(); + hiddenStates.clear(); + } }; -/** +/** * This is a typical transformer cross-attention block. The default configuration will * use a multi-head multiplicative cross-attention layer, followed by dropout, the skip * connection and layer normalization (dan) in the post-processor. The pre-processor does @@ -327,23 +345,23 @@ class TransformerCrossAttentionBlock final : public LayerWithOptions, public ITe Ptr crossAttention; Ptr postprocessor; - TransformerCrossAttentionBlock(Ptr graph, + TransformerCrossAttentionBlock(Ptr graph, Ptr options) : LayerWithOptions(graph, options) { preprocessor = New( - graph, - opt("transformer-preprocess", ""), + graph, + opt("transformer-preprocess", ""), opt("transformer-dropout", 0.f)); registerLayer(preprocessor); - + // @TODO: factory to support different attention flavors? crossAttention = attentionFromOptions(graph, options); registerLayer(crossAttention); postprocessor = New( - graph, - opt("transformer-postprocess", ""), + graph, + opt("transformer-postprocess", ""), opt("transformer-dropout", 0.f)); registerLayer(postprocessor); } @@ -358,17 +376,17 @@ class TransformerCrossAttentionBlock final : public LayerWithOptions, public ITe class TransformerAutoRegressiveBlock : public LayerWithOptions, public IBinaryDecoderLayer { public: - TransformerAutoRegressiveBlock(Ptr graph, + TransformerAutoRegressiveBlock(Ptr graph, Ptr options) : LayerWithOptions(graph, options) {} - + virtual ~TransformerAutoRegressiveBlock() = default; using IBinaryDecoderLayer::apply; }; -/** - * This is a transformer RNN block. +/** + * This is a transformer RNN block. */ class TransformerRNNBlock final : public TransformerAutoRegressiveBlock { public: @@ -376,13 +394,13 @@ class TransformerRNNBlock final : public TransformerAutoRegressiveBlock { Ptr> rnn; Ptr postprocessor; - TransformerRNNBlock(Ptr graph, + TransformerRNNBlock(Ptr graph, Ptr options) : TransformerAutoRegressiveBlock(graph, options) { preprocessor = New( - graph, - opt("transformer-preprocess", ""), + graph, + opt("transformer-preprocess", ""), opt("transformer-dropout", 0.f)); registerLayer(preprocessor); @@ -392,8 +410,8 @@ class TransformerRNNBlock final : public TransformerAutoRegressiveBlock { registerLayer(rnn); postprocessor = New( - graph, - opt("transformer-postprocess", ""), + graph, + opt("transformer-postprocess", ""), opt("transformer-dropout", 0.f)); registerLayer(postprocessor); } @@ -406,22 +424,22 @@ class TransformerRNNBlock final : public TransformerAutoRegressiveBlock { } }; -/** +/** * A full transformer decoder layer consists of a self-attention block followed by - * cross-attention block and a filter block. Skip connections etc. are handled inside + * cross-attention block and a filter block. Skip connections etc. are handled inside * the blocks, see above. - * + * * For the self-attention block we need a special mask, usually a triangle mask that - * prohibits to look into the future. 
- * @TODO: should the triangle mask be constructed locally here? Would make sense, but expensive - * for many layers. + * prohibits to look into the future. + * @TODO: should the triangle mask be constructed locally here? Would make sense, but expensive + * for many layers. */ struct TransformerDecoderLayer final : public LayerWithOptions, public IQuaternaryDecoderLayer { Ptr autoRegressiveBlock; Ptr crossAttentionBlock; Ptr filterBlock; - TransformerDecoderLayer(Ptr graph, + TransformerDecoderLayer(Ptr graph, Ptr options) : LayerWithOptions(graph, options) { @@ -434,10 +452,10 @@ struct TransformerDecoderLayer final : public LayerWithOptions, public IQuaterna ABORT("Unknown auto-regression block type {}", autoRegressionType); } registerLayer(autoRegressiveBlock); - + crossAttentionBlock = New(graph, options); registerLayer(crossAttentionBlock); - + filterBlock = New(graph, options, /*isDecoder=*/true); registerLayer(filterBlock); } @@ -447,15 +465,15 @@ struct TransformerDecoderLayer final : public LayerWithOptions, public IQuaterna output = crossAttentionBlock->apply(output, context, logMask); output = filterBlock->apply(output); - checkpoint(output); // A full transformer block is a good point for gradient checkpointing (currently manual) + checkpoint(output); // A full transformer block is a good point for gradient checkpointing (currently manual) return output; } }; /** - * A full transformer decoder stack. Before applying multiple transformer layers (depth of the decoder), we + * A full transformer decoder stack. Before applying multiple transformer layers (depth of the decoder), we * add positional embeddings and apply post-processing actions to the combined embeddings. Due to backward-compatiblity - * with RNN models and for easier beam-search we transpose batch and time dimensions on input and output. + * with RNN models and for easier beam-search we transpose batch and time dimensions on input and output. * @TODO: get rid of these transposes. 
*/ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDecoderLayer { @@ -464,8 +482,8 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec Ptr preprocessor; Ptr layers; Ptr postprocessor; - - TransformerDecoder(Ptr graph, + + TransformerDecoder(Ptr graph, Ptr options) : LayerWithOptions(graph, options) { @@ -478,8 +496,8 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec registerLayer(maskProcessor); preprocessor = New( - graph, - opt("transformer-postprocess-emb", ""), + graph, + opt("transformer-postprocess-emb", ""), opt("transformer-dropout", 0.f)); registerLayer(preprocessor); @@ -505,7 +523,7 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec } auto currentLayer = layers->at(i)->as(); - // example of changing linear layer init functions burried deep in the model + // example of changing linear layer init functions burried deep in the model if(opt("transformer-depth-scaling", false)) { auto autoRegLayer = currentLayer->autoRegressiveBlock->as(); autoRegLayer->rnn->oProj->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1)); @@ -519,7 +537,7 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec if(opt("transformer-no-bias", false)) for(auto linear : currentLayer->allLayers()) linear->useBias = false; - + if(opt("transformer-no-affine", false)) { for(auto norm : currentLayer->allLayers()) { norm->useScale = false; @@ -529,8 +547,8 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec } postprocessor = New( - graph, - opt("transformer-postprocess-top", ""), + graph, + opt("transformer-postprocess-top", ""), opt("transformer-dropout", 0.f)); registerLayer(postprocessor); } @@ -550,19 +568,19 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec // @TODO: write function prepareMasks(); // @TODO: create triangle mask here and combine with inputMask LOG_ONCE(info, "Don't forget the triangle mask if required!"); - + if(inputMask) inputMask = swapTimeBatch(inputMask); // [dimBeam=1, dimBatch, dimTrgWords, dimModel=1] if(contextMask) contextMask = swapTimeBatch(contextMask); // [dimBeam=1, dimBatch, dimSrcWords, dimModel=1] - + // apply positional embeddings to contextual input if(positionEmbedding) output = positionEmbedding->apply(output, startPos); else output = std::sqrt((float)output->shape()[-1]) * output; - + // handle for skip connection at top auto prevOutput = output; diff --git a/src/models/comet_qe.h b/src/models/comet_qe.h index aa335696d..d818ae384 100644 --- a/src/models/comet_qe.h +++ b/src/models/comet_qe.h @@ -10,37 +10,46 @@ namespace models { class CometEncoder final : public nn::TransformerEncoder { private: + Expr cometPool(Expr x, Expr binaryMask) const { + auto poolType = opt("comet-pool", "avg"); + if(poolType == "avg") + return sum(x * binaryMask, /*axis=*/-2) / sum(binaryMask, /*axis=*/-2); + else if(poolType == "max") + return max(x + marian::log(binaryMask), /*axis=*/-2); + else if(poolType == "cls") + return slice(x, /*axis=*/-2, 0); + else + ABORT("Unknown pool type {}", poolType); + } + // This seems to be a mix of LayerNorm and BatchNorm and present in the original Unbabel code. // It norms over time, not batch, also should be optimized. Seems safe to disable for custom // models trained by us, but required when doing inference with Unbabel models. 
Expr cometNorm(Expr x, Expr binaryMask) const { - Expr output; + Expr output = x; if(opt("comet-mix-norm", false)) { - registerParameterLazy(gamma, Shape({ 1 }), inits::ones()); - int dimModel = x->shape()[-1]; + int dimModel = output->shape()[-1]; // Convert type to fp32 for better accumulation. This is a no-op if things are already fp32. - Type origType = x->value_type(); - x = marian::cast(x, Type::float32); - binaryMask = marian::cast(binaryMask, Type::float32); - - x = x * binaryMask; - auto denom = (float)dimModel * sum(binaryMask, -2); - auto mu = sum(sum(x, -1), -2) / denom; // sum over model and time - auto sigma = sum(sum(square(x - mu), -1), -2) / denom; - - auto normed = (x - mu) / sqrt(sigma + 1e-12f); - output = marian::cast(gamma, Type::float32) * sum(normed * binaryMask, -2) / sum(binaryMask, -2); - - // Undo conversion to fp32 if not originally fp32 (most likely fp16 then) - output = marian::cast(output, origType); - } else if(opt("comet-mix", false)) { - // average over time dimension + Type origType = output->value_type(); + auto output32 = marian::cast(output, Type::float32); + auto binaryMask32 = marian::cast(binaryMask, Type::float32); + + output32 = output32 * binaryMask32; + auto denom = (float)dimModel * sum(binaryMask32, -2); + auto mu = sum(sum(output32, -1), -2) / denom; // sum over model and time + auto sigma = sum(sum(square(output32 - mu), -1), -2) / denom; + + auto normed = (output32 - mu) / sqrt(sigma + 1e-12f); + output = marian::cast(normed, origType); + } + + output = cometPool(output, binaryMask); + + if(opt("comet-mix", false)) { registerParameterLazy(gamma, Shape({ 1 }), inits::ones()); - output = gamma * sum(x * binaryMask, -2) / sum(binaryMask, -2); - } else { - output = sum(x * binaryMask, -2) / sum(binaryMask, -2); + output = gamma * output; } return output; @@ -55,40 +64,36 @@ class CometEncoder final : public nn::TransformerEncoder { : TransformerEncoder(graph, options) {} Expr apply(Expr input, Expr mask) const override { - auto output = marian::nn::swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] - - auto binaryMask = marian::nn::swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] - - // apply positional embeddings to contextual input - output = positionEmbedding->apply(output); - - // apply dropout or layer-norm to embeddings if required - output = preprocessor->apply(output); - auto logMask = maskProcessor->apply(output, binaryMask); // [beam depth=1, batch size * numHeads, max length, vector dim=1] - std::vector pooler; - if(opt("comet-mix", false)) - pooler.push_back(cometNorm(output, binaryMask)); - - // traverse the layers, use the same mask for each - for(auto layer : *layers) { - output = layer->apply(output, logMask); - if(opt("comet-mix", false)) - pooler.push_back(cometNorm(output, binaryMask)); // [ batch, time, modelDim ] + auto binaryMask = marian::nn::swapTimeBatch(mask); + if(opt("comet-mix", false)) { + // we collect hidden states from the base class encoder + TransformerEncoder::keepHiddenStates = true; + // to save memory we can already pool/norm the hidden states before storing them + TransformerEncoder::hiddenTransformFn = [this, binaryMask](Expr x) { + return cometNorm(x, binaryMask); + }; } + // execute to populate hidden states + // the actual output is not used, because we use the collected hidden states instead + auto unused = TransformerEncoder::apply(input, mask); + + Expr output; if(opt("comet-mix", false)) { registerParameterLazy(weights, Shape({ opt("enc-depth") 
+ 1 }), inits::zeros()); - // comet22 has a sparsemax here + // comet22/comet-kiwi has a sparsemax here auto normFn = opt("comet-mix-transformation", "softmax"); auto weightsNorm = (normFn == "sparsemax") ? sparsemax(weights) : softmax(weights); weightsNorm = reshape(weightsNorm, {weights->shape()[-1], 1}); - output = sum(weightsNorm * concatenate(pooler, /*axis=*/-2), -2); // [batch, 1, modelDim] + output = sum(weightsNorm * concatenate(hiddenStates, /*axis=*/-2), -2); // [batch, 1, modelDim] } else { // just use last layer, average over time dim output = cometNorm(output, binaryMask); // [batch, 1, modelDim] } + // attach the unused output to the graph to avoid dangling nodes, this is a no-op. + output = choose({output, unused}, 0); return output; } }; @@ -147,7 +152,7 @@ struct CometBatchEncoder final : public nn::LayerWithOptions, } virtual void clear() override { - Layer::clear(); + LayerWithOptions::clear(); } }; @@ -282,8 +287,9 @@ class CometMetricPooler final : public nn::LayerWithOptions, ABORT_IF(usage == models::usage::embedding, "Wrong pooler for embedding??"); auto modelType = LayerWithOptions::opt("type"); - ABORT_IF(modelType == "comet-qe" && encoderStates.size() != 2, "Pooler expects exactly two encoder states for comet-qe"); - ABORT_IF(modelType == "comet" && encoderStates.size() != 3, "Pooler expects exactly three encoder states for comet"); + ABORT_IF(modelType == "comet-qe" && encoderStates.size() != 2, "Pooler expects exactly two encoder states for comet-qe"); + ABORT_IF(modelType == "comet" && encoderStates.size() != 3, "Pooler expects exactly three encoder states for comet"); + ABORT_IF(modelType == "comet-unified" && encoderStates.size() != 1, "Pooler expects exactly one encoder state for comet-unified"); if(modelType == "comet-qe") { auto src = encoderStates[0]->getContext(); @@ -370,6 +376,19 @@ class CometMetricPooler final : public nn::LayerWithOptions, // Currently no training for COMET with reference @TODO: add training ABORT("Usage other than 'evaluating' not implemented"); } + } else if(modelType == "comet-unified") { + auto emb = encoderStates[0]->getContext(); + Expr output; + if(usage == models::usage::evaluating) { + output = layers->apply(emb); + output = minimum(output, 1.f); // comet-kiwi/XL/XXL clamp at 1.f + int dimBatch = output->shape()[-3]; + output = reshape(output, {dimBatch, 1, 1}); + return { output }; + } else { + // Currently no training for COMET with reference @TODO: add training + ABORT("Usage other than 'evaluating' not implemented"); + } } else { ABORT("Unknown model type {}", modelType); } diff --git a/src/models/model_factory.cpp b/src/models/model_factory.cpp index 5b4cd34eb..1316dacd4 100644 --- a/src/models/model_factory.cpp +++ b/src/models/model_factory.cpp @@ -36,7 +36,7 @@ namespace models { Ptr EncoderFactory::construct(Ptr graph) { if(options_->get("type") == "s2s") return New(graph, options_); - + if(options_->get("type") == "laser" || options_->get("type") == "laser-sim") return New(graph, options_); @@ -134,12 +134,16 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti Ptr graph = nullptr; // graph unknown at this stage // clang-format off - if(type == "comet-qe" || type == "comet") { - if(type == "comet") { - ABORT_IF(use == usage::training, "Usage {} is not supported for model of type {}", (int)use, type); - ABORT_IF(use == usage::scoring, "Usage {} is not supported for model of type {}", (int)use, type); + if(type == "comet-qe" || type == "comet" || type == "comet-unified") { + if(type == "comet" || 
type == "comet-unified") { + ABORT_IF(use == usage::training, "Usage {} is not supported for model of type {}", (int)use, type); + ABORT_IF(use == usage::scoring, "Usage {} is not supported for model of type {}", (int)use, type); + } + + if(type == "comet-unified") { + LOG_ONCE(warn, "Warning: For xCOMET-XL/XXL - this is currently only an implementation of the regressor part and does not include the interpolation with MQM scores"); } - + auto inputTypes = options->get>("input-types"); ABORT_IF(inputTypes.empty(), "Required option --input-types for COMET-QE not set. " @@ -149,7 +153,7 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti int shift = 0; if(inputTypes[0] == "class") shift = 1; - + auto newOptions = options->with("usage", use); auto res = New(newOptions); @@ -160,24 +164,35 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti switch(use) { case usage::embedding: numEncoders = 1; addEmbeddingPooler = true; break; case usage::raw: - case usage::evaluating: + case usage::evaluating: case usage::scoring: - case usage::training: numEncoders = (type == "comet-qe") ? 2 : 3; addMetricPooler = true; break; - default: ABORT("Usage {} is not supported for model of type {}", (int)use, type); + case usage::training: + if(type == "comet-qe") + numEncoders = 2; + else if(type == "comet") + numEncoders = 3; + else if(type == "comet-unified") + numEncoders = 1; + else + ABORT("Unknown model type {}", type); + + addMetricPooler = true; + break; + default: ABORT("Usage {} is not supported for model of type {}", (int)use, type); } - + for(size_t i = 0; i < numEncoders; i++) { auto enc = New(graph, newOptions->with("type", "transformer", "index", i + shift)); enc->setName("CometEncoder"); // parameters will be shared res->push_back(enc); } - + if(addEmbeddingPooler) { auto pooler = New(graph, newOptions); - pooler->setName("CometEmbeddingPooler"); + pooler->setName("CometEmbeddingPooler"); res->push_back(pooler); } - + if(addMetricPooler) { auto pooler = New(graph, newOptions); pooler->setName("CometQEPooler"); // @TODO: change name for different models @@ -188,8 +203,8 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti } if(type == "bleurt") { - ABORT_IF(use != usage::evaluating, "Usage other than 'evaluating' is not supported for model of type {}", type); - + ABORT_IF(use != usage::evaluating, "Usage other than 'evaluating' is not supported for model of type {}", type); + auto newOptions = options->with("usage", use); auto res = New(newOptions); @@ -202,11 +217,11 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti int shift = 0; if(inputTypes[0] == "class") shift = 1; - + auto enc = New(graph, newOptions->with("type", "transformer", "index", 0 + shift)); enc->setName("BleurtEncoder"); res->push_back(enc); - + auto pooler = New(graph, newOptions); pooler->setName("BleurtPooler"); res->push_back(pooler); @@ -236,8 +251,8 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti "input-types", std::vector({"sequence"}), "dim-vocabs", std::vector(1, dimVocab)); } - - auto res = New(newOptions); + + auto res = New(newOptions); if(options->get("compute-similarity", false)) { res->push_back(models::encoder(newOptions->with("index", 0)).construct(graph)); res->push_back(models::encoder(newOptions->with("index", 1)).construct(graph)); @@ -270,15 +285,15 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti else if(type == "transformer-new") { auto newOptions = options->with("usage", use); auto res = New(graph, newOptions); - 
+ auto enc = New(graph, newOptions->with("type", "transformer")); enc->setName("TransformerBatchEncoder"); res->push_back(enc); - + auto dec = New(graph, newOptions->with("type", "transformer")); dec->setName("TransformerBatchDecoder"); res->push_back(dec); - + return res; } @@ -287,15 +302,15 @@ Ptr createBaseModelByType(std::string type, usage use, Ptr opti if(tflavor && std::strcmp(tflavor, "experimental") == 0) { auto newOptions = options->with("usage", use); auto res = New(graph, newOptions); - + auto enc = New(graph, newOptions->with("type", "transformer")); enc->setName("TransformerBatchEncoder"); res->push_back(enc); - + auto dec = New(graph, newOptions->with("type", "transformer")); dec->setName("TransformerBatchDecoder"); res->push_back(dec); - + return res; } else { auto newOptions = options->with("usage", use); diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp index 9b5f300d4..efada03ae 100644 --- a/src/training/graph_group.cpp +++ b/src/training/graph_group.cpp @@ -329,8 +329,8 @@ void GraphGroup::load(const OptimizerBase::ScatterStateFunc& scatterFn) { if(isMainProcess()) { if(filesystem::exists(modelFileName)) { LOG(info, "Loading model from {}", modelFileName); - foundModel = true; - modelWeights_ = New(modelFileName, io::MmapMode::DontMmap); + foundModel = true; + modelWeights_ = New(modelFileName, io::MmapMode::DontMmap); markReloaded = true; } else if(options_->hasAndNotEmpty("pretrained-model")) { std::string pretrainedModelFileName = options_->get("pretrained-model"); diff --git a/src/translator/scorers.h b/src/translator/scorers.h index 333e49108..21ab77c66 100644 --- a/src/translator/scorers.h +++ b/src/translator/scorers.h @@ -77,17 +77,16 @@ class ScorerWrapper : public Scorer { ScorerWrapper(Ptr encdec, const std::string& name, float weight, - Ptr modelFile) + Ptr modelWeights) : Scorer(name, weight), encdec_(std::static_pointer_cast(encdec)), - modelWeights_(modelFile) + modelWeights_(modelWeights) {} virtual ~ScorerWrapper() {} virtual void init(Ptr graph) override { graph->switchParams(getName()); - // @TODO: unify to a single call, this logic should happen in modelFile_ if(modelWeights_) encdec_->load(graph, modelWeights_); } From b5c892e8eee189d3bc81cbf31e3274d692821641 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Thu, 1 Feb 2024 16:45:01 +0000 Subject: [PATCH 10/26] Merged PR 32781: Attach missing node for mt-detect models Fixes small bug for mt-detect models --- CHANGELOG.md | 1 + VERSION | 2 +- src/models/comet_qe.h | 18 ++++++++++++------ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 854162b6d..382aedb8c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - New experimental layer framework for Transformer-like models. 
### Fixed +- Fixed lost node in mt-detect metrics - Fixed BLEURT logmask computation - Fixed wrong paramter name for norm in new layer framework - Fixed unit test for LayerNorm diff --git a/VERSION b/VERSION index 5a8f2d3ca..cddff7b16 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.20 +v1.12.21 diff --git a/src/models/comet_qe.h b/src/models/comet_qe.h index d818ae384..df351ecf2 100644 --- a/src/models/comet_qe.h +++ b/src/models/comet_qe.h @@ -66,6 +66,7 @@ class CometEncoder final : public nn::TransformerEncoder { Expr apply(Expr input, Expr mask) const override { auto binaryMask = marian::nn::swapTimeBatch(mask); + if(opt("comet-mix", false)) { // we collect hidden states from the base class encoder TransformerEncoder::keepHiddenStates = true; @@ -75,9 +76,8 @@ class CometEncoder final : public nn::TransformerEncoder { }; } - // execute to populate hidden states - // the actual output is not used, because we use the collected hidden states instead - auto unused = TransformerEncoder::apply(input, mask); + // execute to populate hidden states and compute top output layer + auto hiddenTop = TransformerEncoder::apply(input, mask); // [time, batch, modelDim] (because the last state is being transposed again) Expr output; if(opt("comet-mix", false)) { @@ -86,14 +86,20 @@ class CometEncoder final : public nn::TransformerEncoder { auto normFn = opt("comet-mix-transformation", "softmax"); auto weightsNorm = (normFn == "sparsemax") ? sparsemax(weights) : softmax(weights); weightsNorm = reshape(weightsNorm, {weights->shape()[-1], 1}); + output = sum(weightsNorm * concatenate(hiddenStates, /*axis=*/-2), -2); // [batch, 1, modelDim] + + // since we use the hidden states from the encoder and not the top layer, we need to + // attach the unused output to the graph to avoid dangling nodes, this is a no-op. + output = choose({output, hiddenTop}, 0); } else { + // @TODO: get rid of this + // undo the time-batch swap + hiddenTop = marian::nn::swapTimeBatch(hiddenTop); // [batch, time, modelDim] // just use last layer, average over time dim - output = cometNorm(output, binaryMask); // [batch, 1, modelDim] + output = cometNorm(hiddenTop, binaryMask); // [batch, 1, modelDim] } - // attach the unused output to the graph to avoid dangling nodes, this is a no-op. - output = choose({output, unused}, 0); return output; } }; From 1c63c1ecc0487747906387df6a2050f295b4cf5d Mon Sep 17 00:00:00 2001 From: Thamme Gowda Date: Sat, 3 Feb 2024 00:23:02 +0000 Subject: [PATCH 11/26] Merged PR 31744: Pymarian: python bindings to marian * This code is same as [public github repo tg/pybind-new branch](https://github.com/marian-nmt/marian-dev/pull/1013). 
Git histories seems slightly different between public and private repo so we are seeing a lot of commits * This builds on top of work by Elijah https://github.com/marian-nmt/marian-dev/pull/948 --- .github/workflows/macos.yml | 8 +- .github/workflows/ubuntu.yml | 11 + .github/workflows/windows.yml | 12 + .gitignore | 11 +- .gitmodules | 3 + CHANGELOG.md | 1 + CMakeLists.txt | 7 +- azure-pipelines.yml | 27 ++- src/3rd_party/CMakeLists.txt | 16 +- src/3rd_party/pybind11 | 1 + src/CMakeLists.txt | 37 ++- src/common/config.cpp | 14 +- src/common/config.h | 15 ++ src/common/logging.cpp | 18 +- src/data/text_input.cpp | 34 ++- src/data/text_input.h | 20 +- src/embedder/vector_collector.cpp | 4 + src/embedder/vector_collector.h | 25 ++ src/evaluator/evaluator.h | 25 +- src/models/model_task.h | 3 +- src/python/README.md | 185 +++++++++++++++ src/python/binding/bind.cpp | 47 ++++ src/python/binding/embedder.hpp | 29 +++ src/python/binding/evaluator.hpp | 119 ++++++++++ src/python/binding/trainer.hpp | 51 +++++ src/python/binding/translator.hpp | 69 ++++++ src/python/pymarian/__init__.py | 48 ++++ src/python/pymarian/__main__.py | 18 ++ src/python/pymarian/constants.py | 28 +++ src/python/pymarian/evaluate.py | 344 ++++++++++++++++++++++++++++ src/python/pymarian/mtapi_server.py | 84 +++++++ src/python/pymarian/qtdemo.py | 125 ++++++++++ src/python/pymarian/utils.py | 101 ++++++++ src/python/pyproject.toml | 63 +++++ src/python/setup.py | 102 +++++++++ src/python/tests/__init__.py | 15 ++ src/python/tests/test_evaluate.py | 148 ++++++++++++ src/python/tests/test_train.py | 142 ++++++++++++ src/python/tests/test_translate.py | 16 ++ src/translator/translator.h | 101 +++++--- 40 files changed, 2035 insertions(+), 92 deletions(-) create mode 160000 src/3rd_party/pybind11 create mode 100644 src/python/README.md create mode 100644 src/python/binding/bind.cpp create mode 100644 src/python/binding/embedder.hpp create mode 100644 src/python/binding/evaluator.hpp create mode 100644 src/python/binding/trainer.hpp create mode 100644 src/python/binding/translator.hpp create mode 100644 src/python/pymarian/__init__.py create mode 100644 src/python/pymarian/__main__.py create mode 100644 src/python/pymarian/constants.py create mode 100755 src/python/pymarian/evaluate.py create mode 100755 src/python/pymarian/mtapi_server.py create mode 100644 src/python/pymarian/qtdemo.py create mode 100644 src/python/pymarian/utils.py create mode 100644 src/python/pyproject.toml create mode 100644 src/python/setup.py create mode 100644 src/python/tests/__init__.py create mode 100644 src/python/tests/test_evaluate.py create mode 100644 src/python/tests/test_train.py create mode 100644 src/python/tests/test_translate.py diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index 8b992e404..abff1d712 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -49,5 +49,11 @@ jobs: ./marian --version ./marian-decoder --version ./marian-scorer --version - ./spm_encode --version ls -hlv $(find . -maxdepth 1 -type f -perm +ugo+x \( -name "marian*" -o -name "spm*" \)) + + - name: Install PyMarian + run: | + python3 -m pip install --upgrade pip setuptools wheel pytest + CMAKE_ARGS="" python3 -m pip install -v . 
+ python3 -m pymarian -v + MARIAN_QUIET=YES python3 -m pytest -vs src/python/tests \ No newline at end of file diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index bc01b74a8..f2baae82d 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -115,6 +115,7 @@ jobs: -DCOMPILE_CPU=${{ matrix.cpu }} \ -DCOMPILE_CUDA=${{ matrix.gpu }} \ -DCOMPILE_EXAMPLES=${{ matrix.examples }} \ + -DUSE_TCMALLOC=OFF \ -DCOMPILE_SERVER=on \ -DCOMPILE_TESTS=${{ matrix.unit_tests }} \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-${{ matrix.cuda }} \ @@ -143,3 +144,13 @@ jobs: ./marian-server --version ./spm_encode --version ls -hlv $(find . -maxdepth 1 -type f -executable \( -name "marian*" -o -name "spm*" \)) + + - name: Install PyMarian + working-directory: build + env: + CUDA_VERSION: ${{ matrix.cuda }} + run: | + python3 -m pip install --upgrade pip setuptools wheel pytest + CMAKE_ARGS="" python3 -m pip install -v . + python3 -m pymarian -v + MARIAN_QUIET=YES python3 -m pytest -vs src/python/tests diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index b1d6b1bd1..55ff0d688 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -134,4 +134,16 @@ jobs: .\marian-decoder.exe --version .\marian-scorer.exe --version dir *.exe + cd .. + shell: cmd + + - name: Install PyMarian + working-directory: src/python + run: | + python3 -m pip install --upgrade pip setuptools wheel pytest + python3 -m pip install -v . + python3 -m pymarian -v + python3 -m pytest -vs src/python/tests + env: + CUDA_VERSION: ${{ matrix.cuda }} shell: cmd diff --git a/.gitignore b/.gitignore index d7f2f4df3..a55d45a39 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -# Config files from CMake +.history* src/common/project_version.h src/common/git_revision.h src/common/build_info.cpp @@ -48,6 +48,8 @@ pingme.txt # CMake files build build-* +# pymarian wheels +dist/ # Examples examples/*/*.gz @@ -61,4 +63,9 @@ examples/mnist/*ubyte /vs/MarianDll.VC.VC.opendb .vs -.vscode +.vscode + +# Python : pymarian +*.whl +*.egg-info +src/python/pymarian/_version.py diff --git a/.gitmodules b/.gitmodules index a1a876d8b..7a94dab1d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -20,3 +20,6 @@ [submodule "src/3rd_party/simple-websocket-server"] path = src/3rd_party/simple-websocket-server url = https://github.com/marian-nmt/Simple-WebSocket-Server +[submodule "src/3rd_party/pybind11"] + path = src/3rd_party/pybind11 + url = https://github.com/pybind/pybind11.git diff --git a/CHANGELOG.md b/CHANGELOG.md index 382aedb8c..13dd5e301 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
- Fixed compilation with clang 16.0.6 ### Added +- Added `pymarian`: python bindings based on pybind11 - Added implementation of COMET-KIWI - Added implementation of xCOMET-XL/XXL regressor parts (MQM interpolation missing for now) - Added implementation of COMET-22 (reference-based) model and conversion diff --git a/CMakeLists.txt b/CMakeLists.txt index 595f87cc1..0ebe2b819 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,9 +29,11 @@ option(USE_MKL "Compile with MKL support" ON) option(USE_MPI "Use MPI library" OFF) option(USE_NCCL "Use NCCL library" ON) option(USE_SENTENCEPIECE "Download and compile SentencePiece" ON) +option(USE_TCMALLOC "Use TCMALLOC if available" ON) option(USE_STATIC_LIBS "Link statically against non-system libs" OFF) option(GENERATE_MARIAN_INSTALL_TARGETS "Generate Marian install targets (requires CMake 3.12+)" OFF) option(DETERMINISTIC "Try to make training results as deterministic as possible (e.g. for testing)" OFF) +option(PYMARIAN "Build Pymarian package which is based on pybind11" OFF) # fbgemm and sentencepiece are both defined with "non-local" installation targets (the source projects don't define them, # so we define them in src\3rd_party\CMakeLists.txt), but that isn't supported until CMake 3.12. Prior to CMake 3.12, @@ -105,7 +107,7 @@ if(MSVC) set(INTRINSICS "/arch:AVX2") # set(INTRINSICS "/arch:AVX512") # /bigobj is necessary for expression_operators.cpp. See https://stackoverflow.com/questions/15110580/penalty-of-the-msvs-compiler-flag-bigobj - set(CMAKE_CXX_FLAGS "/EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /bigobj ${DISABLE_GLOBALLY}") + set(CMAKE_CXX_FLAGS "/permissive- /EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /bigobj ${DISABLE_GLOBALLY}") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /MT /O2 ${INTRINSICS} /Zi /MP /GL /DNDEBUG") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} /MTd /Od /Ob0 ${INTRINSICS} /RTC1 /Zi /D_DEBUG") @@ -347,6 +349,7 @@ if(CUDA_FOUND) LIST(APPEND COMPUTE -Wno-deprecated-gpu-targets) endif() + message(STATUS "CUDA_VERSION=${CUDA_VERSION}; CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}") if(COMPILE_KEPLER) message(STATUS "Compiling code for Kepler GPUs") LIST(APPEND COMPUTE -gencode=arch=compute_35,code=sm_35;) # Tesla K40 and above @@ -412,7 +415,7 @@ if(CUDA_FOUND) if ((CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0")) find_library(CUDA_cublasLt_LIBRARY NAMES cublasLt PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 NO_DEFAULT_PATH) if(NOT CUDA_cublasLt_LIBRARY) - message(FATAL_ERROR "cuBLASLt library not found") + message(FATAL_ERROR "cuBLASLt library not found. 
-DCUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}") endif() set(EXT_LIBS ${EXT_LIBS} ${CUDA_cublasLt_LIBRARY}) set(CUDA_LIBS ${CUDA_LIBS} ${CUDA_cublasLt_LIBRARY}) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4e1744375..a1e9ea94f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -193,6 +193,7 @@ stages: -DUSE_NCCL="FALSE" ^ -DUSE_SENTENCEPIECE="TRUE" ^ -DUSE_STATIC_LIBS="TRUE" + displayName: Configure CMake env: # Set envvars so that CMake can find the installed packages @@ -322,6 +323,10 @@ stages: displayName: Install CUDA condition: eq(variables.gpu, true) + # Some preinstalled versions of pip are bad for pymarian; see https://github.com/pypa/setuptools/issues/3269 + - bash: python3 -m pip install pip -U + displayName: Upgrade pip + - bash: | mkdir -p build cd build @@ -336,7 +341,11 @@ stages: -DUSE_SENTENCEPIECE=on \ -DUSE_STATIC_LIBS=$(static) \ -DBoost_ARCHITECTURE=-x64 \ - -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-$(cuda) + -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-$(cuda) \ + -DUSE_TCMALLOC=off \ + -DPYMARIAN=ON \ + -DPYTHON_EXECUTABLE=python3 + displayName: Configure CMake # Clean build/src/ to safe disk space on Azure-hosted VMs and stay below the 10GB limit @@ -361,6 +370,11 @@ stages: displayName: Print versions workingDirectory: build + - bash: | + python3 -m pip install build/pymarian-*.whl + python3 -m pymarian -v + displayName: Build Pymarian + ###################################################################### - job: BuildMacOS cancelTimeoutInMinutes: 1 @@ -393,6 +407,7 @@ stages: -DUSE_FBGEMM=on \ -DUSE_SENTENCEPIECE=on \ -DUSE_STATIC_LIBS=off + displayName: Configure CMake - bash: make -j2 @@ -453,7 +468,10 @@ stages: -DCOMPILE_CUDA=off \ -DGENERATE_MARIAN_INSTALL_TARGETS=on \ -DUSE_FBGEMM=on \ - -DUSE_SENTENCEPIECE=on + -DUSE_SENTENCEPIECE=on \ + -DPYMARIAN=on \ + -DPYTHON_EXECUTABLE=python3 + displayName: Configure CMake - bash: make -j3 install @@ -468,6 +486,11 @@ stages: displayName: Check targets workingDirectory: install + - bash: | + python3 -m pip install build/pymarian-*.whl + python3 -m pymarian -v + displayName: Build Pymarian + # Marian is built in the same job where the regression tests are run to make sure that executables # are compiled and run on a machine with the same CPU architecture, which is required for diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt index 838951c50..6cf46533f 100644 --- a/src/3rd_party/CMakeLists.txt +++ b/src/3rd_party/CMakeLists.txt @@ -72,13 +72,17 @@ if(USE_SENTENCEPIECE) # regardless of -DUSE_STATIC_LIBS setting always build sentencepiece statically set(SPM_ENABLE_SHARED OFF CACHE BOOL "Builds shared libaries in addition to static libraries." FORCE) - set(SPM_ENABLE_TCMALLOC ON CACHE BOOL "Enable TCMalloc if available.") - if(USE_STATIC_LIBS) - set(SPM_TCMALLOC_STATIC ON CACHE BOOL "Link static library of TCMALLOC." FORCE) - else(USE_STATIC_LIBS) - set(SPM_TCMALLOC_STATIC OFF CACHE BOOL "Link static library of TCMALLOC.") - endif(USE_STATIC_LIBS) + if(USE_TCMALLOC) + set(SPM_ENABLE_TCMALLOC ON CACHE BOOL "Enable TCMalloc if available.") + if(USE_STATIC_LIBS) + set(SPM_TCMALLOC_STATIC ON CACHE BOOL "Link static library of TCMALLOC." 
FORCE) + else(USE_STATIC_LIBS) + set(SPM_TCMALLOC_STATIC OFF CACHE BOOL "Link static library of TCMALLOC.") + endif(USE_STATIC_LIBS) + else(USE_TCMALLOC) + set(SPM_ENABLE_TCMALLOC OFF CACHE BOOL "Enable TCMalloc if available.") + endif(USE_TCMALLOC) add_subdirectory(./sentencepiece) include_directories(./sentencepiece) diff --git a/src/3rd_party/pybind11 b/src/3rd_party/pybind11 new file mode 160000 index 000000000..869cc1ff0 --- /dev/null +++ b/src/3rd_party/pybind11 @@ -0,0 +1 @@ +Subproject commit 869cc1ff085dd405635b00eb46e5c84f50f26099 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5bf321af5..c40eabc76 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -4,6 +4,7 @@ include_directories(.) include_directories(3rd_party) include_directories(3rd_party/SQLiteCpp/include) include_directories(3rd_party/sentencepiece) + if(USE_SENTENCEPIECE) include_directories(3rd_party/sentencepiece/third_party/protobuf-lite) endif(USE_SENTENCEPIECE) @@ -260,11 +261,11 @@ if (NOT COMPILE_LIBRARY_ONLY) endif(COMPILE_SERVER) foreach(exec ${EXECUTABLES}) - target_link_libraries(${exec} marian) - if(CUDA_FOUND) - target_link_libraries(${exec} marian_cuda) - endif(CUDA_FOUND) - set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}") + target_link_libraries(${exec} marian) + if(CUDA_FOUND) + target_link_libraries(${exec} marian_cuda) + endif(CUDA_FOUND) + set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}") endforeach(exec) endif(NOT COMPILE_LIBRARY_ONLY) @@ -282,9 +283,33 @@ endif(COMPILE_EXAMPLES) if(GENERATE_MARIAN_INSTALL_TARGETS) # Install the marian library if given a "make install" target - include(GNUInstallDirs) # This defines default values for installation directories (all platforms even if named GNU) + include(GNUInstallDirs) # This defines default values for installation directories (all platforms even if named GNU) install(TARGETS marian EXPORT marian-targets ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ) endif(GENERATE_MARIAN_INSTALL_TARGETS) + + +if(PYMARIAN) + if(NOT PYTHON_EXECUTABLE) + set(PYTHON_EXECUTABLE python) # default to python in the environment + endif() + + include_directories(3rd_party/pybind11/include) + add_subdirectory(3rd_party/pybind11) + + pybind11_add_module(_pymarian MODULE python/binding/bind.cpp) + target_link_libraries(_pymarian PUBLIC marian) + if(CUDA_FOUND) + target_link_libraries(_pymarian PUBLIC marian_cuda) + endif(CUDA_FOUND) + install(TARGETS _pymarian DESTINATION .) 
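+
+  # Illustrative usage (assuming an out-of-source build directory named "build"):
+  # configure with -DPYMARIAN=on and, if needed, -DPYTHON_EXECUTABLE=..., then build as usual
+  # or run `cmake --build build --target pymarian`; the wheel produced in the build directory
+  # installs with `python -m pip install build/pymarian-*.whl`.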
+ + # build pymarian wheel + add_custom_target(pymarian ALL + ${CMAKE_COMMAND} -E env "CMAKE_BINARY_DIR=${PROJECT_BINARY_DIR}" "CMAKE_SOURCE_DIR=${PROJECT_SOURCE_DIR}" + "${PYTHON_EXECUTABLE}" -m pip wheel -v --no-input ${PROJECT_SOURCE_DIR}/src/python -w "${PROJECT_BINARY_DIR}" + DEPENDS _pymarian + VERBATIM COMMENT "Building pymarian wheel") +endif(PYMARIAN) diff --git a/src/common/config.cpp b/src/common/config.cpp index 20ef6e046..b6296a8b2 100644 --- a/src/common/config.cpp +++ b/src/common/config.cpp @@ -274,12 +274,22 @@ std::vector Config::getDevices(Ptr options, return devices; } -Ptr -parseOptions(int argc, char** argv, cli::mode mode, bool validate){ +Ptr parseOptions(int argc, char** argv, cli::mode mode, bool validate) { ConfigParser cp(mode); return cp.parseOptions(argc, argv, validate); } +Ptr parseOptions(const std::string& args, cli::mode mode, bool validate) { + std::vector vArgs = utils::split(args, " "); + + std::string dummy("marian"); + std::vector cArgs = { &dummy[0] }; + for(auto& arg : vArgs) + cArgs.push_back(&arg[0]); + + return parseOptions((int)cArgs.size(), cArgs.data(), mode, validate); +} + std::ostream& operator<<(std::ostream& out, const Config& config) { YAML::Emitter outYaml; cli::OutputYaml(config.get(), outYaml); diff --git a/src/common/config.h b/src/common/config.h index c22d7415e..06110e17e 100644 --- a/src/common/config.h +++ b/src/common/config.h @@ -119,4 +119,19 @@ Ptr parseOptions(int argc, cli::mode mode, bool validate = true); +/** + * Parse the command line options. + * Same as above, but args provided as C++ string object, space-delimited. This is used for instance + * in the python bindings as a simple string-based interface. + * + * @param args space delimited command line options + * @param mode change the set of available command-line options, e.g. training, translation, etc. 
+ * @param validate validate parsed options and abort on failure + * + * @return parsed options + */ +Ptr parseOptions(const std::string& args, + cli::mode mode, + bool validate = true); + } // namespace marian diff --git a/src/common/logging.cpp b/src/common/logging.cpp index 69efeb482..53bb6ba81 100644 --- a/src/common/logging.cpp +++ b/src/common/logging.cpp @@ -26,11 +26,13 @@ std::shared_ptr createStderrLogger(const std::string& name, const std::string& pattern, const std::vector& files, bool quiet) { - std::vector sinks; + auto logger = spdlog::get(name); + if(!logger) { + std::vector sinks; - auto stderr_sink = spdlog::sinks::stderr_sink_mt::instance(); - if(!quiet) - sinks.push_back(stderr_sink); + auto stderr_sink = spdlog::sinks::stderr_sink_mt::instance(); + if(!quiet) + sinks.push_back(stderr_sink); // @TODO: think how to solve this better than using OMPI_COMM_WORLD_RANK env variable // only create output files if we are the main process or if MPI rank is not defined @@ -42,10 +44,11 @@ std::shared_ptr createStderrLogger(const std::string& name, } } - auto logger = std::make_shared(name, begin(sinks), end(sinks)); + logger = std::make_shared(name, begin(sinks), end(sinks)); - spdlog::register_logger(logger); - logger->set_pattern(pattern); + spdlog::register_logger(logger); + logger->set_pattern(pattern); + } return logger; } @@ -72,6 +75,7 @@ bool setLoggingLevel(spdlog::logger& logger, std::string const level) { } static void setErrorHandlers(); + void createLoggers(const marian::Config* config) { std::vector generalLogs; std::vector validLogs; diff --git a/src/data/text_input.cpp b/src/data/text_input.cpp index 3485a223f..0ccaedf14 100644 --- a/src/data/text_input.cpp +++ b/src/data/text_input.cpp @@ -13,7 +13,13 @@ void TextIterator::increment() { } bool TextIterator::equal(TextIterator const& other) const { - return this->pos_ == other.pos_ || (!this->tup_.valid() && !other.tup_.valid()); + // two iterators are equal if any of the following is true: + // 1. both are invalid (null ptrs) + // 2. both at the end of the stream (empty tuples as record, regardless of pos_) + // 3. both are at the same position + return (!this->tup_.valid() && !other.tup_.valid()) || + (this->tup_.valid() && other.tup_.valid() && this->tup_.empty() && other.tup_.empty()) || + this->pos_ == other.pos_; } const SentenceTuple& TextIterator::dereference() const { @@ -38,30 +44,18 @@ TextInput::TextInput(std::vector inputs, SentenceTuple TextInput::next() { // get index of the current sentence size_t curId = pos_++; - - // fill up the sentence tuple with source and/or target sentences - SentenceTupleImpl tup(curId); + // read next row, i.e. 
vector from files + // if any file is empty, we are done + std::vector row; for(size_t i = 0; i < files_.size(); ++i) { std::string line; if(io::getline(*files_[i], line)) { - Words words = vocabs_[i]->encode(line, /*addEOS=*/true, /*inference=*/inference_); - if(this->maxLengthCrop_ && words.size() > this->maxLength_) { - words.resize(maxLength_); - words.back() = vocabs_.back()->getEosId(); // note: this will not work with class-labels - } - - ABORT_IF(words.empty(), "No words (not even EOS) found in string??"); - ABORT_IF(tup.size() != i, "Previous tuple elements are missing."); - tup.pushBack(words); + row.push_back(line); + } else { + return SentenceTupleImpl(); // return an empty tuple if above test does not pass(); } } - - if(tup.size() == files_.size()) // check if each input file provided an example - return SentenceTuple(tup); - else if(tup.size() == 0) // if no file provided examples we are done - return SentenceTupleImpl(); // return an empty tuple if above test does not pass(); - else // neither all nor none => we have at least on missing entry - ABORT("There are missing entries in the text tuples."); + return encode(row, curId); } } // namespace data diff --git a/src/data/text_input.h b/src/data/text_input.h index 98d991bcb..3a399b6d2 100644 --- a/src/data/text_input.h +++ b/src/data/text_input.h @@ -27,7 +27,7 @@ class TextIterator : public IteratorFacade { }; class TextInput : public DatasetBase { -private: +protected: std::vector> files_; std::vector> vocabs_; @@ -92,6 +92,24 @@ class TextInput : public DatasetBase { } void prepare() override {} + + SentenceTuple encode(std::vector& row, size_t id) { + ABORT_IF(row.size() != vocabs_.size(), "Number of fields does not match number of vocabs"); + // fill up the sentence tuple with source and/or target sentences + SentenceTupleImpl tup(id); + for(size_t i = 0; i < row.size(); ++i) { + std::string field = row[i]; + Words words = vocabs_[i]->encode(field, /*addEOS=*/true, /*inference=*/inference_); + if(this->maxLengthCrop_ && words.size() > this->maxLength_) { + words.resize(maxLength_); + words.back() = vocabs_.back()->getEosId(); // note: this will not work with class-labels + } + ABORT_IF(words.empty(), "No words (not even EOS) found in the input text. 
ID: " + std::to_string(id)); + tup.pushBack(words); + } + return SentenceTuple(tup); + } + }; } // namespace data } // namespace marian
diff --git a/src/embedder/vector_collector.cpp b/src/embedder/vector_collector.cpp index 1268de530..fcfbb02e7 100644 --- a/src/embedder/vector_collector.cpp +++ b/src/embedder/vector_collector.cpp @@ -109,6 +109,10 @@ Ptr VectorCollector::Create(Ptr options) { return collector; } +void BufferedVectorCollector::WriteVector(const std::vector& vec) { + buffer.push_back(vec); +} + const size_t VectorCollector::DEFAULT_WIDTH = 4; } // namespace marian
diff --git a/src/embedder/vector_collector.h b/src/embedder/vector_collector.h index 6c727203c..ff4c4dd9c 100644 --- a/src/embedder/vector_collector.h +++ b/src/embedder/vector_collector.h @@ -63,4 +63,29 @@ class AveragingVectorCollector : public VectorCollector { virtual void WriteAverage(); }; + +// collects vectors and holds them in memory +class BufferedVectorCollector : public VectorCollector { + +private: + std::vector> buffer; + +protected: + virtual void WriteVector(const std::vector& vec) override; + +public: + BufferedVectorCollector(bool binary=false, size_t width=DEFAULT_WIDTH) + : VectorCollector(binary, width) {} + + BufferedVectorCollector(std::string outFile, bool binary=false, size_t width=DEFAULT_WIDTH) + : VectorCollector(outFile, binary, width) {} + + auto getBuffer() -> decltype(buffer) { + return buffer; + } + + virtual ~BufferedVectorCollector() {} + +}; + } // namespace marian
diff --git a/src/evaluator/evaluator.h b/src/evaluator/evaluator.h index bfed80a53..022a8204c 100644 --- a/src/evaluator/evaluator.h +++ b/src/evaluator/evaluator.h @@ -47,7 +47,7 @@ template class Evaluate : public ModelTask { private: Ptr options_; - Ptr corpus_; + std::vector> graphs_; std::vector> models_; Ptr modelFile_; @@ -57,8 +57,12 @@ class Evaluate : public ModelTask { options_ = options_->with("inference", true, "shuffle", "none"); - corpus_ = New(options_); - corpus_->prepare(); + /* Number of embeddings parameter is determined at runtime based on the given vocabulary file. In addition, this parameter has to be set before initializing the model object. The Corpus initializer is the one that sets the number of embeddings in the options_ object. However, we do not need to use the corpus object here, so we just create a dummy corpus object. 
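+       (The corpus that is actually scored is constructed later, in run(); the dummy instance
+       here is only needed for that side effect on the options_ object.)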
+ */ + Ptr corpus = New(options_); auto devices = Config::getDevices(options_); @@ -94,11 +98,19 @@ class Evaluate : public ModelTask { LOG(info, "Evaluating"); timer::Timer timer; - auto batchGenerator = New>(corpus_, options_); + Ptr corpus = New(options_); + corpus->prepare(); + auto batchGenerator = New>(corpus, options_); batchGenerator->prepare(); Ptr output = VectorCollector::Create(options_); - + run(batchGenerator, output); + LOG(info, "Total time: {:.5f}s wall", timer.elapsed()); + } + + template + void run(Ptr> batchGenerator, Ptr collector) { + size_t batchId = 0; { ThreadPool pool(graphs_.size(), graphs_.size()); @@ -137,14 +149,13 @@ class Evaluate : public ModelTask { auto beg = i * numScores; auto end = (i + 1) * numScores; std::vector sentVector(sentVectors.begin() + beg, sentVectors.begin() + end); - output->Write((long)batch->getSentenceIds()[i], sentVector); + collector->Write((long)batch->getSentenceIds()[i], sentVector); } }; pool.enqueue(task, batchId++); } } - LOG(info, "Total time: {:.5f}s wall", timer.elapsed()); } }; diff --git a/src/models/model_task.h b/src/models/model_task.h index 96dfadd0c..798dd5546 100644 --- a/src/models/model_task.h +++ b/src/models/model_task.h @@ -11,6 +11,7 @@ struct ModelTask { struct ModelServiceTask { virtual ~ModelServiceTask() {} - virtual std::string run(const std::string&) = 0; + virtual std::string run(const std::string& /*input*/, const std::string& /*yaml*/) = 0; + virtual std::vector run(const std::vector& /*input*/, const std::string& /*yaml*/) = 0; }; } // namespace marian diff --git a/src/python/README.md b/src/python/README.md new file mode 100644 index 000000000..f8f00bdc5 --- /dev/null +++ b/src/python/README.md @@ -0,0 +1,185 @@ +# PyMarian + +* Python bindings to Marian (C++) is using [PyBind11] +* The python package is built using [scikit-build-core](https://github.com/scikit-build/scikit-build-core) + + +## Install + +```bash +# get source code +git clone https://github.com/marian-nmt/marian-dev +cd marian-dev + +# build marian with -DPYMARIAN=on option to create a pymarian wheel +cmake . -Bbuild -DCOMPILE_CUDA=off -DPYMARIAN=on -DCMAKE_BUILD_TYPE=Release +cmake --build build -j # -j option parallelizes build on all cpu cores +python -m pip install build/pymarian-*.whl +``` + +Since the above commands uses `python` executable in the PATH to determine Python version to compile marian native extension, make sure to have the desired `python` executable in your environment _before_ invoking these commands. + +## Python API + +Python API is designed to take same argument as marian CLI string. +> NOTE: these APIs are experimental only and not finalized. see `mtapi_server.py` for an example use of Translator API + +**Translator** +```python + +# Translator +from pymarian import Translator +cli_string = "..." +translator = Translator(cli_string) + +sources = ["sent1" , "sent2" ] +result = translator.translate(sources) +print(result) +``` + +**Evaluator** +```python +# Evaluator +from pymarian import Evaluator +cli_string = '-m path/to/model.npz -v path/to.vocab.spm path/to.vocab.spm --like comet-qe' +evaluator = Evaluator(cli_str) + +data = [ + ["Source1", "Hyp1"], + ["Source2", "Hyp2"] +] +scores = evaluator.run(data) +for score in scores: + print(score) +``` + +## CLI Usage +. `pymarian-evaluate` : CLI to download and use pretrained metrics such as COMETs, COMETOIDs, ChrFoid, and BLEURT +. `pymarian-mtapi` : REST API demo powered by Flask +. 
`pymarian-qtdemo` : GUI App demo powered by QT + + +### `pymarian-evaluate` + +```bash +$ pymarian-evaluate -h +usage: pymarian-evaluate [-h] [-m MODEL] [--stdin] [-t MT_FILE] [-s SRC_FILE] [-r REF_FILE] [-o OUT] [-a {skip,append,only}] [-w WIDTH] [--debug] [--mini-batch MINI_BATCH] [-d [DEVICES ...] | -c + CPU_THREADS] [-ws WORKSPACE] [--backend {subprocess,pymarian}] + +options: + -h, --help show this help message and exit + -m MODEL, --model MODEL + Model name, or path. Known models=['cometoid22-wmt21', 'cometoid22-wmt22', 'cometoid22-wmt23', 'chrfoid-wmt23', 'comet20-da-qe', 'bleurt20', 'comet20-da'] (default: + cometoid22-wmt22) + --stdin Read input from stdin. TSV file with following format: QE metrics: "srcmt", Comet with ref: "srcref; or BLEURT: "refmt" (default: False) + -t MT_FILE, --mt MT_FILE + MT output file. Ignored when --stdin. (default: None) + -s SRC_FILE, --src SRC_FILE + Source file. Ignored when --stdin (default: None) + -r REF_FILE, --ref REF_FILE + Ref file. Ignored when --stdin (default: None) + -o OUT, --out OUT output file. Default stdout (default: <_io.TextIOWrapper name='' mode='w' encoding='utf-8'>) + -a {skip,append,only}, --average {skip,append,only} + Average segment scores to produce system score. skip=do not output average (default; segment scores only); append=append average at the end; only=output the average only + (i.e system score only) (default: skip) + -w WIDTH, --width WIDTH + Output score width (default: 4) + --debug Verbose output (default: False) + --mini-batch MINI_BATCH + Mini-batch size (default: 16) + -d [DEVICES ...], --devices [DEVICES ...] + GPU device IDs (default: None) + -c CPU_THREADS, --cpu-threads CPU_THREADS + Use CPU threads. 0=use gpu device 0 (default: None) + -ws WORKSPACE, --workspace WORKSPACE + Workspace memory (default: 8000) + --backend {subprocess,pymarian} + Marian backend interface. subprocess looks for marian binary in PATH. pymarian is a pybind wrapper (default: pymarian) +``` + +**Performance Tuning Tips**: +* For CPU parallelization, `--cpu-threads ` +* For GPU parallelization, assuming pymarian was compiled with cuda support, e.g., `--devices 0 1 2 3` to use the specified 4 gpu devices. 
+* When OOM error: adjust `--mini-batch` argument +* To see full logs from marian, set `--debug` + + +*Example Usage* +```bash +# download sample dataset +langs=en-ru +prefix=tmp.$langs +teset=wmt21/systems +sysname=Online-B +sacrebleu -t $teset -l $langs --echo src > $prefix.src +sacrebleu -t $teset -l $langs --echo ref > $prefix.ref +sacrebleu -t $teset -l $langs --echo $sysname > $prefix.mt + +# chrfoid +paste $prefix.{src,mt} | head | pymarian-evaluate --stdin -m chrfoid-wmt23 + +# cometoid22-wmt{21,22,23} +paste $prefix.{src,mt} | head | pymarian-evaluate --stdin -m cometoid22-wmt22 + +# bleurt20 +paste $prefix.{ref,mt} | head | pymarian-evaluate --stdin -m bleurt20 --debug + +# FIXME: comet20-da-qe and comet20-da appear to be broken +# comet20-da-qe +paste $prefix.{src,mt} | head | pymarian-evaluate --stdin -m comet20-da-qe +# comet20-da +paste $prefix.{src,mt,ref} | pymarian-evaluate -m comet20-da + +``` + +### `pymarian-mtapi` + +Launch server +```bash +# example model: download and extract +wget http://data.statmt.org/romang/marian-regression-tests/models/wngt19.tar.gz +tar xvf wngt19.tar.gz + +# launch server +pymarian-mtapi -s en -t de "-m wngt19/model.base.npz -v wngt19/en-de.spm wngt19/en-de.spm" +``` + +Example request from client + +```bash +URL="http://127.0.0.1:5000/translate" +curl $URL --header "Content-Type: application/json" --request POST --data '[{"text":["Good Morning."]}]' +``` + +### `pymarian-qtdemo` +``` +pymarian-qtdemo +``` + +## Run Tests + +```bash +# install pytest if necessary +python -m pip install pytest + +# run tests in quiet mode +python -m pytest src/python/tests/ + +# or, add -s to see STDOUT/STDERR from tests +python -m pytest -s src/python/tests/ + +``` + + +## Known issues + +1. In conda or mamba environment, if you see `.../miniconda3/envs//bin/../lib/libstdc++.so.6: version 'GLIBCXX_3.4.30' not found` error, + install libstdcxx-ng + + ```bash + conda install -c conda-forge libstdcxx-ng + ``` + + + + diff --git a/src/python/binding/bind.cpp b/src/python/binding/bind.cpp new file mode 100644 index 000000000..9e8cc4464 --- /dev/null +++ b/src/python/binding/bind.cpp @@ -0,0 +1,47 @@ +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" +// if your IDE/vscode complains about missing paths +// pybind11 can be found by "python -m pybind11 --includes"; you may need to add both pybind11 and Python.h +#include "embedder.hpp" +#include "evaluator.hpp" +#include "trainer.hpp" +#include "translator.hpp" + + +#define PYBIND11_DETAILED_ERROR_MESSAGES + +namespace py = pybind11; +using namespace pymarian; + + +PYBIND11_MODULE(_pymarian, m) { + m.doc() = "Marian C++ API bindings via pybind11"; + + /** TODOS + * 1. API to check if gpu available: cuda_is_available() -> bool + * 2. 
API to check number of gpus:: cuda_device_count() -> int + */ + + py::class_(m, "Translator") + .def(py::init()) + .def("translate", py::overload_cast(&TranslateServicePyWrapper::run)) + .def("translate", py::overload_cast&, const py::kwargs&>(&TranslateServicePyWrapper::run)) + ; + + py::class_(m, "Evaluator") + .def(py::init()) + .def("evaluate", py::overload_cast(&EvaluatorPyWrapper::run)) + ; + + py::class_(m, "Trainer") + .def(py::init()) + .def("train", py::overload_cast<>(&PyTrainer::train)) + ; + + py::class_(m, "Embedder") + .def(py::init()) + .def("embed", py::overload_cast<>(&PyEmbedder::embed)) + ; + +} + diff --git a/src/python/binding/embedder.hpp b/src/python/binding/embedder.hpp new file mode 100644 index 000000000..12ae43c9d --- /dev/null +++ b/src/python/binding/embedder.hpp @@ -0,0 +1,29 @@ +#include "marian.h" + +#include "common/timer.h" +#include "embedder/embedder.h" +#include "models/model_task.h" + + +using namespace marian; + +namespace pymarian { + class PyEmbedder { + private: + Ptr options_; + Ptr> embedder_; + public: + PyEmbedder(const std::string& cliString) { + options_ = parseOptions(cliString, cli::mode::embedding, true); + embedder_ = New>(options_); + } + + int embed() { + //TODO: add options_ override from args to embed() + //TODO: read input from args instead of STDIN + embedder_->run(); + return 0; + } + }; + +} // namespace pymarian \ No newline at end of file diff --git a/src/python/binding/evaluator.hpp b/src/python/binding/evaluator.hpp new file mode 100644 index 000000000..f72ccd08a --- /dev/null +++ b/src/python/binding/evaluator.hpp @@ -0,0 +1,119 @@ +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +#include "marian.h" + +#include "common/logging.h" +#include "common/timer.h" +#include "data/batch_generator.h" +#include "data/corpus.h" +#include "data/text_input.h" +#include "evaluator/evaluator.h" +#include "models/model_task.h" + + +using namespace marian; + +namespace pymarian { + + //type aliases for convenience + using StrVector = std::vector; + using StrVectors = std::vector; + using FloatVector = std::vector; + using FloatVectors = std::vector; + using Evaluator = marian::Evaluate; + namespace py = pybind11; + + /** + * Wrapper for Marian Evaluator. + * + * This class is a wrapper for the Marian Evaluator class. + * It is used to run the evaluator on a given input. + * + **/ + class EvaluatorPyWrapper { + + private: + Ptr options_; + Ptr evaluator_; + std::vector> vocabs_; + + public: + /** + * Constructor for the EvaluatorPyWrapper class. + * @param cliString - the command line string to parse as Marian options + */ + EvaluatorPyWrapper(const std::string& cliString){ + options_ = parseOptions(cliString, cli::mode::evaluating, true) + ->with("inference", true, "shuffle", "none"); + evaluator_= New(options_); + vocabs_ = loadVocabs(options_); + } + + /** + * @brief Load the vocabularies from the given paths + * @param options - the options object + * @return vector of vocabularies + */ + static auto loadVocabs(Ptr options) -> std::vector> { + std::vector> vocabs; + auto vocabPaths = options->get>("vocabs"); + LOG(info, "Loading vocabularies from {}", utils::join(vocabPaths, ", ")); + for (size_t i = 0; i < vocabPaths.size(); ++i) { + Ptr vocab = New(options, i); + vocab->load(vocabPaths[i]); + vocabs.emplace_back(vocab); + } + return vocabs; + } + + /** + * Given a table of strings (i.e., rows x columns), concatenate each column into a single string. 
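+     * For example (illustrative values):
+     *   {{"src1", "mt1"}, {"src2", "mt2"}}  ->  {"src1\nsrc2", "mt1\nmt2"}
+     * i.e. each output string is one column joined by newlines, which run() below then feeds
+     * to the corpus as an in-memory "file" per input field.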
+ * + * @param data - table of strings : rows x columns + * @return List of strings, one string for each column, concatenated across rows. + */ + static auto concatColumns(const StrVectors& data) -> StrVector { + // Get the number of rows and columns in the data + int rows = data.size(); + int cols = data[0].size(); + StrVector result(cols); + + for (int j = 0; j < cols; j++) { + std::string column = ""; + for (int i = 0; i < rows; i++) { + column += data[i][j]; + // If it is not the last row, add a newline character + if (i != rows - 1) { column += "\n";} + } + result[j] = column; + } + return result; + } + + /** + * Run the evaluator on the given input. + * Input is transformed as (in memory) files by concatenating columns. + * + * @param inputs - table of strings : rows x columns + * @return table of floats : rows x columns + * + */ + auto run(const StrVectors& inputs) -> FloatVectors { + StrVector columnFiles = concatColumns(inputs); + auto corpus = New(columnFiles, vocabs_, options_); + corpus->prepare(); + + auto batchGenerator = New>(corpus, options_, nullptr, /*runAsync=*/false); + batchGenerator->prepare(); + + std::string output = options_->get("output"); + Ptr collector = New(output, /*binary=*/false); + evaluator_->run(batchGenerator, collector); + FloatVectors outputs = collector->getBuffer(); + return outputs; + } + + }; + +} diff --git a/src/python/binding/trainer.hpp b/src/python/binding/trainer.hpp new file mode 100644 index 000000000..35cb34113 --- /dev/null +++ b/src/python/binding/trainer.hpp @@ -0,0 +1,51 @@ +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" +#include +#include "marian.h" + +#include "common/signal_handling.h" +#include "training/graph_group_async.h" +#include "training/graph_group_singleton.h" +#include "training/graph_group_sync.h" +#include "training/training.h" + +#include "3rd_party/ExceptionWithCallStack.h" + + +namespace py = pybind11; +using namespace marian; + + +namespace pymarian { + + + class PyTrainer { + + private: + Ptr options_; + Ptr> trainer_; + + public: + PyTrainer(const std::string& cliString){ + options_ = parseOptions(cliString, cli::mode::training, true); + LOG(info, "Using synchronous SGD"); + trainer_ = New>(options_); + } + + int train() { + //TODO: add options_ override from args to train() + //TODO: read input from args instead of STDIN + + trainer_->run(); + // If we exit due to a graceful exit request via SIGTERM, exit with 128 + SIGTERM, + // as suggested for bash in http://tldp.org/LDP/abs/html/exitcodes.html. This allows parent + // scripts to determine if training terminated naturally or via SIGTERM. + // An alternative would be to exit with code 124, which is what the timeout command + // returns for timeout -s SIGTERM ...., because exiting after SIGTERM + // is not technically a fatal error (which is what the 128+x convention usually + // stands for). + return getSignalFlag(SIGTERM) ? 
128 + SIGTERM : EXIT_SUCCESS; + } + }; + +} \ No newline at end of file diff --git a/src/python/binding/translator.hpp b/src/python/binding/translator.hpp new file mode 100644 index 000000000..97864c3cc --- /dev/null +++ b/src/python/binding/translator.hpp @@ -0,0 +1,69 @@ +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +#include "marian.h" + +#include "common/logging.h" +#include "common/timer.h" +#include "evaluator/evaluator.h" +#include "models/model_task.h" +#include "translator/beam_search.h" +#include "translator/translator.h" + + +namespace py = pybind11; +using namespace marian; + +namespace pymarian { + + class TranslateServicePyWrapper { + private: + Ptr> pImpl_; + + /** + * @brief Convert a pybind11::kwargs object to a YAML string + * + * @param kwargs - the kwargs object from pybind11 + * @return std::string - the YAML string + */ + std::string convertKwargsToYamlString(const py::kwargs& kwargs) { + std::stringstream ss; + if (kwargs) { + for (auto& [key, value] : kwargs) { + // Depythonize the keys + std::string yamlKey = utils::findReplace(key.cast(), "_", "-"); + ss << yamlKey << ": " << value << std::endl; + } + } + return ss.str(); + } + + public: + TranslateServicePyWrapper(const std::string& cliString) + : pImpl_(New>(cliString)) {} + + /** + * @brief Translate a vector of strings + * + * @param inputs - the vector of strings to translate + * @param kwargs - the kwargs object from pybind11 + * @return std::vector - the vector of translated strings + */ + std::vector run(const std::vector& inputs, const py::kwargs& kwargs) { + return this->pImpl_->run(inputs, convertKwargsToYamlString(kwargs)); + } + + /** + * @brief Translate a single string + * + * @param input - the string to translate + * @param kwargs - the kwargs object from pybind11 + * @return std::string - the translated string + */ + std::string run(const std::string& input, const py::kwargs& kwargs) { + return this->pImpl_->run(input, convertKwargsToYamlString(kwargs)); + } + }; + +} + diff --git a/src/python/pymarian/__init__.py b/src/python/pymarian/__init__.py new file mode 100644 index 000000000..f08d00944 --- /dev/null +++ b/src/python/pymarian/__init__.py @@ -0,0 +1,48 @@ +import _pymarian + +from ._version import __version__ +from .utils import kwargs_to_cli + + +class Translator(_pymarian.Translator): + """Python wrapper for Marian Translator""" + + def __init__(self, cli_string='', **kwargs): + """Initializes the translator + :param kwargs: kwargs + """ + cli_string += ' ' + kwargs_to_cli(**kwargs) + super().__init__(cli_string.strip()) + + +class Evaluator(_pymarian.Evaluator): + """Python wrapper for Marian Evaluator""" + + def __init__(self, cli_string='', **kwargs): + """Initializes the evaluator + :param kwargs: kwargs + """ + cli_string += ' ' + kwargs_to_cli(**kwargs) + super().__init__(cli_string.strip()) + + +class Trainer(_pymarian.Trainer): + """Python wrapper for Marian Trainer""" + + def __init__(self, cli_string='', **kwargs): + """Initializes the trainer + :param kwargs: kwargs + """ + cli_string += ' ' + kwargs_to_cli(**kwargs) + super().__init__(cli_string.strip()) + + +class Embedder(_pymarian.Embedder): + """Python wrapper for Marian Embedder""" + + def __init__(self, cli_string='', **kwargs): + """Initializes the embedder + :param kwargs: kwargs + """ + cli_string += ' ' + kwargs_to_cli(**kwargs) + super().__init__(cli_string.stip()) diff --git a/src/python/pymarian/__main__.py b/src/python/pymarian/__main__.py new file mode 100644 index 000000000..e0b68cd65 --- 
/dev/null +++ b/src/python/pymarian/__main__.py @@ -0,0 +1,18 @@ + +import argparse + +from pymarian import __version__ + +def parse_args(): + parser = argparse.ArgumentParser(prog='pymarian', description="Python wrapper for Marian NMT", + epilog='URL: https://github.com/marian-nmt/marian-dev') + parser.add_argument('--version', '-v', action='version', version=__version__) + return parser.parse_args() + +def main(): + args = parse_args() + # prints version for -v/-version option. + # no other options are currently supported. Space left/intended for future use. + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/src/python/pymarian/constants.py b/src/python/pymarian/constants.py new file mode 100644 index 000000000..3d04abbba --- /dev/null +++ b/src/python/pymarian/constants.py @@ -0,0 +1,28 @@ +from pathlib import Path + + +class Defaults: + BASE_URL = "https://textmt.blob.core.windows.net/www/models/mt-metric" + CACHE_PATH = Path.home() / '.cache' / 'marian' / 'metrics' + MINI_BATCH = 16 + MAXI_BATCH = 256 + WORKSPACE = 8000 + AVERAGE = 'skip' + MAX_LENGTH = 512 + FLOAT_PRECISION = 4 + + # NOTE: model names must be lower case for caseless matching + KNOWN_METRICS = { + 'cometoid22-wmt21': "comet-qe", + 'cometoid22-wmt22': "comet-qe", + 'cometoid22-wmt23': "comet-qe", + 'chrfoid-wmt23': "comet-qe", + 'comet20-da-qe': "comet-qe", + 'bleurt20': "bleurt", + 'comet20-da': "comet", + } + + KNOWN_SCHEMA = {'comet-qe': 'src+mt', 'bleurt': 'ref+mt', 'comet': 'src+mt+ref'} + + DEF_MODEL = 'cometoid22-wmt22' + DEF_SCHEMA = KNOWN_METRICS[DEF_MODEL] diff --git a/src/python/pymarian/evaluate.py b/src/python/pymarian/evaluate.py new file mode 100755 index 000000000..be13f3f00 --- /dev/null +++ b/src/python/pymarian/evaluate.py @@ -0,0 +1,344 @@ +#!/usr/bin/env python +# +# This is a python wrapper for marian evaluate command +# +import argparse +import itertools +import logging as log +import shutil +import subprocess +import sys +import threading +from pathlib import Path +from typing import Iterator, List, Optional, Tuple, Union + +from .constants import Defaults +from .utils import get_known_model + +log.basicConfig(level=log.INFO) +DEBUG_MODE = False + + +def copy_lines_to_stdin(proc, lines: Iterator[str]): + """Write data to subproc stdin. Note: run this on another thread to avoid deadlock + This function reads streams, and write them as TSV record to the stdin of the sub process. + :param proc: subprocess object to write to + """ + + for line in lines: + # line = line.rstrip('\n') + '\n' + proc.stdin.write(line) + proc.stdin.flush() + proc.stdin.close() # close stdin to signal end of input + + +def marian_evaluate( + model: Path, + input_lines: Iterator[str], + vocab_file: Path = None, + devices: Optional[List[int]] = None, + width=Defaults.FLOAT_PRECISION, + mini_batch=Defaults.MINI_BATCH, + like=Defaults.DEF_SCHEMA, + maxi_batch=Defaults.MAXI_BATCH, + workspace=Defaults.WORKSPACE, + max_length=Defaults.MAX_LENGTH, + cpu_threads=0, + average: str = Defaults.AVERAGE, + backend='subprocess', +) -> Iterator[Union[float, Tuple[float, float]]]: + """Run 'marian evaluate' as a subprocess or using pymarian, read input and write scores + Depending on the `model` argument, either a single score or a tuple of scores is returned per input line. 
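+    Illustrative sketch of the expected data flow (formats as documented for --stdin in the
+    README): a QE-style metric such as comet-qe consumes tab-separated "src<TAB>mt" records,
+    reference-based metrics add a ref column (see Defaults.KNOWN_SCHEMA), and one score (or
+    tuple of scores) is yielded per record.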
+ :param model: path to model file, or directory containing model.npz.best-embed.npz + :param vocab: path to vocabulary file (optional; if not given, assumed to be in the same directory as the model) + :param devices: list of GPU devices to use (optional; if not given, decision is let to marian process) + :param width: float precision + :param mini_batch: mini-batch size (default: 16) + :param like: marian embedding model like (default: comet-qe) + :param cpu_threads: number of CPU threads to use (default: 0) + :param: average: average segment scores to produce system score. + skip=do not output average (default; segment scores only); + append=append average at the end; + only=output the average only (i.e. system score only) + :param backend: subprocess or pymarian + :return: iterator over scores. + """ + + assert model.exists() + if model.is_dir(): + model_dir = model + _model_files = list(model.glob("*.npz")) + assert len(_model_files) == 1, f'Expected exactly one model file in {model_dir}' + model_file = _model_files[0] + else: + assert model.is_file() + model_dir = model.parent + model_file = model + if not vocab_file: + _vocab_files = list(model_dir.glob('*.spm')) + assert len(_vocab_files) == 1, f'Expected exactly one vocab file in {model_dir}' + vocab_file = _vocab_files[0] + + assert model_file.exists(), f'Model file {model_file} does not exist' + assert vocab_file.exists(), f'Vocab file {vocab_file} does not exist' + + n_inputs = len(Defaults.KNOWN_SCHEMA[like].split('+')) + vocabs = [vocab_file] * n_inputs + kwargs = dict( + model=model_file, + vocabs=vocabs, + devices=devices, + width=width, + like=like, + mini_batch=mini_batch, + maxi_batch=maxi_batch, + max_length=max_length, + max_length_crop=True, + workspace=workspace, # negative memory => relative to total memory + cpu_threads=cpu_threads, + average=average, + ) + if backend == 'pymarian': + # handled separately for pymarian due to minibatching and iterator input + # TODO: remove this when iterator is supported in evaluator C++ API + kwargs['average'] = 'skip' + + cmd_line = [] + for key, val in kwargs.items(): + if val is None: # ignore this key / flag + continue + cmd_line.append(f"--{key.replace('_', '-')}") + if val is True: # boolean flag + cmd_line.append('true') + elif val is False: + cmd_line.append('false') + + elif isinstance(val, (list, tuple)): + cmd_line.extend(str(v) for v in val) + else: + cmd_line.append(str(val)) + if not DEBUG_MODE: + cmd_line.append('--quiet') + if backend == 'subprocess': + return subprocess_evaluate(cmd_line, input_lines) + elif backend == 'pymarian': + cmd_line = ' '.join(cmd_line) + batch_size = mini_batch * maxi_batch + return pymarian_evaluate(cmd_line, input_lines, batch_size=batch_size, average=average) + else: + raise ValueError(f'Unknown backend {backend}') + + +def pymarian_evaluate( + cmd_line: str, input_lines: Iterator[str], average=Defaults.AVERAGE, batch_size=int(Defaults.MINI_BATCH * Defaults.MAXI_BATCH) +): + try: + from pymarian import Evaluator + except: + raise ImportError('pymarian is not installed. 
Please install it and rerun') + + log.info(f'Marian CLI::\n\t{cmd_line}') + evaluator = Evaluator(cmd_line) + assert average in ('skip', 'append', 'only') + lines = (line.rstrip('\n').split('\t') for line in input_lines) + + # NOTE: pymarian doesn't support iterator input yet; so mini batching here + def make_mini_batches(lines, batch_size=batch_size): + assert batch_size > 0 + while True: + chunk = list(itertools.islice(lines, batch_size)) + if not chunk: + return + yield chunk + + total, count = 0.0, 0 + for batch in make_mini_batches(lines): + scores = evaluator.evaluate(batch) + assert len(scores) == len(batch) + for score in scores: + if isinstance(score, (tuple, list)): + score = score[0] + total += score + count += 1 + if average != 'only': # skip or append + yield score + + if average != 'skip': + yield total / count + + +def subprocess_evaluate(cmd_line: List[str], input_lines: Iterator[str]): + assert isinstance(cmd_line, list) + marian_bin_path = shutil.which('marian') + if marian_bin_path is None: + raise FileNotFoundError('marian binary not found in PATH. Please add it and rerun') + cmd_line = [marian_bin_path, 'evaluate'] + cmd_line + + proc = None + try: + proc = subprocess.Popen( + cmd_line, + shell=False, + stdout=subprocess.PIPE, + stdin=subprocess.PIPE, + stderr=sys.stderr, + text=True, + encoding='utf8', + errors='replace', + ) + log.info(f'Running command: {" ".join(cmd_line)}') + copy_thread = threading.Thread(target=copy_lines_to_stdin, args=(proc, input_lines)) + + copy_thread.start() + # read output and yield scores + for line in proc.stdout: + line = line.rstrip() + if ' ' in line: + yield tuple(float(x) for x in line.split(' ')) + else: + yield float(line) + + # wait for copy thread to finish + copy_thread.join() + # proc.stdin.close() + returncode = proc.wait() + if returncode != 0: + raise RuntimeError(f'Process exited with code {returncode}') + finally: + if proc is not None and proc.returncode is None: + log.warning(f'Killing process {proc.pid}') + proc.kill() + + +def parse_args(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument( + '-m', + '--model', + help=f'Model name, or path. Known models={list(Defaults.KNOWN_METRICS.keys())}', + default=Defaults.DEF_MODEL, + type=str, + ) + + parser.add_argument( + '--stdin', + action='store_true', + help='Read input from stdin. TSV file with following format: \ + QE metrics: "srcmt", Comet with ref: "srcref; or BLEURT: "refmt"', + ) + parser.add_argument('-t', '--mt', dest='mt_file', help='MT output file. Ignored when --stdin.', type=Path) + parser.add_argument('-s', '--src', dest='src_file', help='Source file. Ignored when --stdin', type=Path) + parser.add_argument('-r', '--ref', dest='ref_file', help='Ref file. Ignored when --stdin', type=Path) + parser.add_argument( + '-o', '--out', default=sys.stdout, help='output file. Default: stdout', type=argparse.FileType('w') + ) + parser.add_argument( + '-a', + '--average', + choices=('skip', 'append', 'only'), + default='skip', + help='Average segment scores to produce system score.' + ' skip=do not output average (default; segment scores only);' + ' append=append average at the end; ' + ' only=output the average only (i.e. 
system score only)', + ) + + parser.add_argument('-w', '--width', default=4, help='Output score width', type=int) + parser.add_argument('--debug', help='Verbose output', action='store_true') + parser.add_argument('--mini-batch', default=16, help='Mini-batch size', type=int) + group = parser.add_mutually_exclusive_group() + group.add_argument('-d', '--devices', nargs='*', type=int, help='GPU device IDs') + group.add_argument( + '-c', '--cpu-threads', default=None, type=int, help='Use CPU threads. 0=use GPU device 0' + ) + parser.add_argument('-ws', '--workspace', default=8000, help='Workspace memory', type=int) + parser.add_argument( + '--backend', + default='pymarian', + choices=['subprocess', 'pymarian'], + help='Marian backend interface. subprocess=look for marian binary in PATH. pymarian=pybind wrapper', + ) + + args = parser.parse_args() + return vars(args) + + +def read_input(args, model_id, schema=None): + model_schema = Defaults.KNOWN_METRICS.get(model_id, schema or Defaults.DEF_SCHEMA) + input_schema = Defaults.KNOWN_SCHEMA[model_schema] + n_inputs = len(input_schema.split('+')) + if args.pop('stdin'): + del args['mt_file'] + del args['src_file'] + del args['ref_file'] + return sys.stdin + + n_inputs = len(input_schema.split('+')) + mt_file = args.pop('mt_file') + src_file = args.pop('src_file') + ref_file = args.pop('ref_file') + assert mt_file.exists(), f'{mt_file} does not exist' + if 'src' in input_schema: + assert src_file, f'Source file is required for metric {model_id}' + assert src_file.exists(), f'{src_file} does not exist' + if 'ref' in input_schema: + assert ref_file, f'Reference file is required for metric {model_id}' + assert ref_file.exists(), f'{ref_file} does not exist' + if input_schema == 'src+mt': + input_lines = itertools.zip_longest(open(src_file), open(mt_file)) + elif input_schema == 'src+ref+mt': + input_lines = itertools.zip_longest(open(src_file), open(ref_file), open(mt_file)) + elif input_schema == 'src+mt+ref': + input_lines = itertools.zip_longest(open(src_file), open(mt_file), open(ref_file)) + elif input_schema == 'ref+mt': + input_lines = itertools.zip_longest(open(ref_file), open(mt_file)) + else: + raise ValueError(f'Unknown schema {input_schema}') + + def _validate_and_join(): + for row in input_lines: + assert len(row) == n_inputs, f'Expected {n_inputs} columns, but got {len(row)}' + for col in row: + assert col is not None, f'Expected {n_inputs} columns, but got {len(row)}' + yield '\t'.join(row) + + return _validate_and_join() + + +def main(**args): + args = args or parse_args() + if args.pop('debug'): + log.getLogger().setLevel(log.DEBUG) + global DEBUG_MODE + DEBUG_MODE = True + log.debug(args) + + model_id = args.pop('model') + if model_id.lower() in Defaults.KNOWN_METRICS: + model_path, vocab = get_known_model(model_id.lower()) + log.info(f'{model_id} --> {model_path}') + else: + model_path, vocab = Path(model_id), None + assert ( + model_path.exists() + ), f'{model_path} does not exist. 
Known models are {list(Defaults.KNOWN_METRICS.keys())}' + args['model'] = model_path + args['vocab_file'] = vocab + + args['input_lines'] = read_input(args, model_id=model_id) + args['like'] = Defaults.KNOWN_METRICS.get(model_id, Defaults.DEF_SCHEMA) + out = args.pop('out') + width = args.pop('width', Defaults.FLOAT_PRECISION) + scores = marian_evaluate(**args) + for i, score in enumerate(scores, start=1): + if isinstance(score, (tuple, list)): + score = score[0] # the first score + out.write(f'{score:.{width}f}\n') + out.close() + + log.info(f'Wrote {i} lines to {out.name}') + + +if '__main__' == __name__: + main() diff --git a/src/python/pymarian/mtapi_server.py b/src/python/pymarian/mtapi_server.py new file mode 100755 index 000000000..4391a3101 --- /dev/null +++ b/src/python/pymarian/mtapi_server.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +""" +Implements Microsoft's MTAPI (https://docs.microsoft.com/en-us/azure/cognitive-services/translator/quickstart-translator?tabs=python). +""" + +import argparse +import json +import logging as log +from typing import List + +from flask import Flask, request +from sacremoses import MosesPunctNormalizer +from sentence_splitter import SentenceSplitter + +import pymarian + +log.basicConfig(level=log.INFO) + + +class MarianService: + def __init__(self, source_lang: str, target_lang: str, cli_string: str = None): + self.source_lang = source_lang + self.target_lang = target_lang + self.cli_string = cli_string + self._translator = None # lazy init + + self.norm = MosesPunctNormalizer(lang="en") + self.splitter = SentenceSplitter(source_lang) + + @property + def translator(self): + if self._translator is None: + # lazy init + self._translator = pymarian.Translator(self.cli_string) + return self._translator + + def translate(self, text: List[str]) -> List[str]: + """Translates a list of sentences from source to target language.""" + text = self.norm.normalize(text) + input_lines = self.splitter.split(text) + output_lines = self.translator.translate(input_lines) + return " ".join(output_lines) + + +def attach_routes(app: Flask, service: MarianService): + @app.route('/translate', methods=["GET", "POST"]) + def translate(): + request_data = request.get_json() + outputs = [] + for source in request_data: + text = source["text"] + translation = service.translate(text) + outputs.append(translation) + response = [ + {"translations": [{"text": output, "to": service.target_lang} for output in outputs]}, + ] + return json.dumps(response), 200 + + +def parse_args(): + SOURCE_LANG = "en" + TARGET_LANG = "de" + + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--source-lang", "-s", type=str, default=SOURCE_LANG) + parser.add_argument("--target-lang", "-t", type=str, default=TARGET_LANG) + parser.add_argument('args', type=str, help="CLI string for loading marian model") + parser.add_argument("--port", "-p", type=int, default=5000) + return vars(parser.parse_args()) + + +def main(): + app = Flask(__name__) + args = parse_args() + service = MarianService( + source_lang=args["source_lang"], target_lang=args["target_lang"], cli_string=args["args"] + ) + attach_routes(app, service) + app.run(port=args["port"]) + + +if __name__ == '__main__': + main() diff --git a/src/python/pymarian/qtdemo.py b/src/python/pymarian/qtdemo.py new file mode 100644 index 000000000..e95d0bf12 --- /dev/null +++ b/src/python/pymarian/qtdemo.py @@ -0,0 +1,125 @@ +import sys +import time + +from PyQt5.QtGui import * +from 
PyQt5.QtWidgets import * +from sacremoses import MosesPunctNormalizer, MosesTokenizer +from sentence_splitter import SentenceSplitter + +import pymarian + + +class Example(QWidget): + def __init__(self): + super().__init__() + + self.cache = dict() + self.norm = MosesPunctNormalizer(lang="en") + self.tok = MosesTokenizer(lang="en") + self.splitter = SentenceSplitter("en") + + self.setWindowTitle("Live Translator") + self.setFont(QFont(self.font().family(), 11)) + # setting the geometry of window + self.setGeometry(300, 300, 1200, 800) + + # centering + qtRectangle = self.frameGeometry() + centerPoint = QDesktopWidget().availableGeometry().center() + qtRectangle.moveCenter(centerPoint) + self.move(qtRectangle.topLeft()) + + self.marian = None + + self.input = QPlainTextEdit(self) + self.input.textChanged.connect(self.onChanged) + self.output = QPlainTextEdit(self) + + hbox = QHBoxLayout() + self.cli = QLineEdit(self) + self.cli.setText( + "-c models/enu.deu.yml --cpu-threads 8 -b1 --mini-batch-words 256 --maxi-batch 100 --maxi-batch-sort src" + ) + + self.reload = QPushButton("Reload") + self.reload.clicked.connect(self.onClicked) + self.run = QPushButton("Translate") + self.run.clicked.connect(self.onChanged) + + hbox.addWidget(self.cli) + hbox.addWidget(self.reload) + hbox.addWidget(self.run) + + layout = QVBoxLayout() + layout.addLayout(hbox) + hbox2 = QHBoxLayout() + hbox2.addWidget(self.input) + hbox2.addWidget(self.output) + layout.addLayout(hbox2) + + self.statusBar = QStatusBar() + layout.addWidget(self.statusBar) + + self.setLayout(layout) + + self.reloadMarian() + self.show() + + def onChanged(self): + inputText = self.input.toPlainText() + if not self.current: + self.reloadMarian() + if self.current: + outputText = self.translate(inputText) + self.output.setPlainText(outputText) + + def onClicked(self): + self.reloadMarian() + + def reloadMarian(self): + command = self.cli.text() + print(command) + self.cache = dict() # clean instead of caching + if command not in self.cache: + self.cache[command] = dict() + self.cache[command]["#MODEL#"] = pymarian.Translator(command) + self.current = self.cache[command] + + def translate(self, inputText): + t0 = time.perf_counter() + + inputLines = [self.splitter.split(p) for p in inputText.split("\n")] + + unseenLines = [] + for paragraph in inputLines: + for line in paragraph: + if line not in self.current: + unseenLines.append(line) + + normLines = [self.norm.normalize(c) for c in unseenLines] + + t1 = time.perf_counter() + outputLines = self.current["#MODEL#"].translate(normLines) + t2 = time.perf_counter() + + totalStat = sum([len(self.tok.tokenize(line)) for line in unseenLines]) + + if totalStat: + self.statusBar.showMessage( + f"Translated {totalStat} tokens ({len(unseenLines)} lines) in {t2 - t1:.2f} second ({totalStat / (t2 - t1):.2f} tokens per second). Preprocessing took {t1 - t0:.2f} seconds. 
Total: {t2 - t0:.2f} seconds" + ) + + for src, trg in zip(unseenLines, outputLines): + self.current[src] = trg + + return "\n".join([" ".join([self.current[src] for src in paragraph]) for paragraph in inputLines]) + + +def main(): + app = QApplication(sys.argv) + ex = Example() + sys.exit(app.exec_()) + + +if __name__ == '__main__': + main() diff --git a/src/python/pymarian/utils.py b/src/python/pymarian/utils.py new file mode 100644 index 000000000..16e2e3c22 --- /dev/null +++ b/src/python/pymarian/utils.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python +# +# This is a python wrapper for marian evaluate command +# created by Thamme Gowda on 2023-09-07 +# + +import logging as log +import shutil +from pathlib import Path + +import requests +from tqdm.auto import tqdm + +from .constants import Defaults + +log.basicConfig(level=log.INFO) +DEBUG_MODE = False + + +def get_known_model(model_name): + """Given a known model name, this functin gets the checkpoint and vocabulary paths. + This function downloads and extracts model files to a local cache directory if necessary. + + Specifically, checkpoint file must have model*.npz and vocab*.spm files in the resolved model directory. + :param model_name: model name + :return: checkpoint path, vocabulary path + """ + assert model_name in Defaults.KNOWN_METRICS, f'Unknown model {model_name}' + + model_url = f'{Defaults.BASE_URL}/{model_name}.tgz' + local_file = Defaults.CACHE_PATH / f'{model_name}.tgz' + local_dir = Defaults.CACHE_PATH / model_name + maybe_download_file(model_url, local_file) + maybe_extract(local_file, local_dir) + checkpt_file = list(local_dir.glob('model*.npz')) + vocab_file = list(local_dir.glob('vocab*.spm')) + assert len(checkpt_file) == 1, f'Expected exactly one model file in {local_dir}' + assert len(vocab_file) == 1, f'Expected exactly one vocab file in {local_dir}' + checkpt_file = checkpt_file[0] + vocab_file = vocab_file[0] + return checkpt_file, vocab_file + + +def maybe_download_file(url, local_file: Path): + """Downloads the file if not already downloaded + :param url: url to download + :param local_file: local file path + """ + flag_file = local_file.with_name(local_file.name + '._OK') + if local_file.exists() and flag_file.exists(): + log.info(f'Using cached file {local_file}') + return + log.info(f'Downloading {url} to {local_file}') + local_file.parent.mkdir(parents=True, exist_ok=True) + with requests.get(url, stream=True) as r: + r.raise_for_status() + file_size = int(r.headers.get('Content-Length', 0)) + with tqdm.wrapattr(r.raw, "read", total=file_size, desc='Downloading', dynamic_ncols=True) as r_raw: + with open(local_file, "wb") as f: + shutil.copyfileobj(r_raw, f) + flag_file.touch() + + +def maybe_extract(archive: Path, outdir: Path) -> Path: + """Extracts the archive to outdir if not already extracted + :param archive: path to archive file + :param outdir: output directory + :return: output directory path + """ + assert archive.exists(), f'{archive} does not exist' + flag_file = outdir / '._EXTRACT_OK' + if not outdir.exists() or not flag_file.exists(): + shutil.rmtree(outdir, ignore_errors=True) + log.info(f'Extracting {archive} to {outdir}') + # assumption: root dir in tar matches model name + shutil.unpack_archive(archive, outdir.parent) + flag_file.touch() + return outdir + + +def kwargs_to_cli(**kwargs) -> str: + """Converts kwargs to cli args + :param kwargs: kwargs + :return: cli args + """ + args = [] + for k, v in kwargs.items(): + if v is None: + continue # ignore keys if values are None + k = 
k.replace('_', '-') + args.append(f'--{k}') + if v is '': + continue # only add keys for empty values + elif isinstance(v, bool): + args.append("true" if v else "false") + elif isinstance(v, (list, tuple)): + args.extend(str(x) for x in v) + else: + args.append(f'{v}') + + return ' '.join(args) diff --git a/src/python/pyproject.toml b/src/python/pyproject.toml new file mode 100644 index 000000000..a9cf413a7 --- /dev/null +++ b/src/python/pyproject.toml @@ -0,0 +1,63 @@ +[build-system] +requires = ["setuptools >= 61.0", "pip >= 23.0"] #NOTE: we had troubles with pip v22; it set name as UNKNOWN +build-backend = "setuptools.build_meta" + +[tool.setuptools.packages.find] +# where = ["."] # ["."] by default +include = ["pymarian*"] # ["*"] by default +# exclude = ["pymarian.tests*"] # empty by default +namespaces = true # true by default + + +[project] +name = "pymarian" +dynamic = ["version"] # see [tool.setuptools.dynamic] below +description = "Pymarian" +readme = "README.md" +authors = [ + { name = "Marian Developers", email = "noreply@email.com" }, +] +requires-python = ">=3.7" +classifiers = [ + "Development Status :: 4 - Beta", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", +] + +dependencies = [ + "tqdm", + "requests" +] + +[project.scripts] +pymarian-evaluate = "pymarian.evaluate:main" +pymarian-qtdemo = "pymarian.qtdemo:main" +pymarian-mtapi = "pymarian.mtapi_server:main" + +[project.optional-dependencies] +test = ["pytest"] +demos = [ + "flask", + "sacremoses", + "pyqt5", + "sentence-splitter@git+https://github.com/mediacloud/sentence-splitter", +] + +[tool.setuptools] +include-package-data = true + +[tool.black] +line-length = 110 +target-version = ['py37', 'py38', 'py39', 'py310', 'py311'] +include = 'src/python/.*\.pyi?$' +skip-string-normalization = true + +# black doesn't sort imports. So we use isort for that. See discussion https://github.com/psf/black/issues/333 +[tool.isort] +profile = "black" +src_paths = ["src/python"] +# isort --check --diff src/python/ \ No newline at end of file diff --git a/src/python/setup.py b/src/python/setup.py new file mode 100644 index 000000000..01d3a0f5f --- /dev/null +++ b/src/python/setup.py @@ -0,0 +1,102 @@ +import os +import platform +import shutil +import sys + +from pathlib import Path +from setuptools import setup, find_namespace_packages, Distribution + +""" +This script expects _pymarian.*.so to be present in $CMAKE_BINARY_DIR + +NOTE: Most of (static) metadata is set in pyproject.toml. +# This setup.py is for specifying dynamic aspect of build. 
All static metadata is in pyproject.toml +""" + +DEF_CMAKE_BINARY_DIR = (Path(__file__).parent / '../../build').resolve() +CMAKE_BINARY_DIR = os.getenv("CMAKE_BINARY_DIR", DEF_CMAKE_BINARY_DIR) +print("\t>>>CMAKE_BINARY_DIR is ", CMAKE_BINARY_DIR) + +if platform.system() == 'Windows': + NATIVE_EXT_GLOB = '_pymarian.*.pyd' +elif platform.system() == 'Darwin': + NATIVE_EXT_GLOB = '_pymarian.*.dylib' +else: + NATIVE_EXT_GLOB = '_pymarian.*.so' + + +def get_version(cuda_version=None) -> str: + vfile = Path(__file__).parent / '../../VERSION' + if not vfile.exists() and "CMAKE_SOURCE_DIR" in os.environ: + # some build tools may copy src/python into a temporary directory, which disconnects it from the source tree + # using CMAKE_SOURCE_DIR to find the source tree + vfile = Path(os.environ["CMAKE_SOURCE_DIR"]) / 'VERSION' + try: + assert vfile.exists(), f'Version file {vfile.resolve()} does not exist' + version = vfile.read_text().strip().lstrip("v") # gets rid of 'v' prefix in v1.17.5 etc. + except: + # FIXME: This is a hack. We need to read version from VERSION file + version = '0.0.0' + print( + f'WARNING: Could not read version from {vfile.resolve()}. Setting version to {version}', + file=sys.stderr, + ) + version = version if not cuda_version else f"{version}+cu{cuda_version.replace('.', '')}" + + print("\t>>>Marian version is ", version) + # we need to write version to _version.py file, so that it can be imported from python + vfile_lines = [ + "# File generated by setuptools; it will be overwritten on every build", + "# Hence, do not edit or track this file in version control", + f"__version__ = '{version}'", + "__cuda_version__ = %s" % ("None" if not cuda_version else f"'{cuda_version}'"), + ] + + vfile_py = Path(__file__).parent / 'pymarian' / '_version.py' + vfile_py.write_text("\n".join(vfile_lines)) + return version + + +def get_native_ext() -> Path: + + native_exts = list(Path(CMAKE_BINARY_DIR).glob(f'src/{NATIVE_EXT_GLOB}')) + if not native_exts: + raise Exception( + f'No native extension found; Looked at {CMAKE_BINARY_DIR}/src/{NATIVE_EXT_GLOB}. \ + Please run cmake build first with -DPYMARIAN=ON or set CMAKE_BINARY_DIR to the build dir' + ) + elif len(native_exts) >= 2: + raise Exception(f'Only one native extension expected, but found: {native_exts}') + + native_ext = native_exts[0] + # Pip does not allow inclusion of files from parent dir our outside of package context (for security reasons). 
+ # So, we copy the native extension to the package directory + native_ext_local = Path(__file__).parent / native_ext.name + print(f"\t>>>Found native extension at: {native_ext}") + print(f"\t >>>Making it available under scope at: {native_ext_local}") + if native_ext_local.exists(): + native_ext_local.unlink() + shutil.copy(native_ext, native_ext_local) + return native_ext_local + + +version = get_version(os.getenv("CUDA_VERSION", default="")) +native_ext = get_native_ext() + + +# Thanks to https://stackoverflow.com/a/62668026/1506477 +class BinaryDistribution(Distribution): + """Distribution which always forces a binary package with platform name""" + + def has_ext_modules(foo): + return True + + +setup( + version=version, + package_dir={"pymarian": "pymarian"}, + packages=find_namespace_packages(where=".", exclude=["tests", "binding"]), + include_package_data=True, + package_data={"": [str(native_ext)]}, + distclass=BinaryDistribution, +) diff --git a/src/python/tests/__init__.py b/src/python/tests/__init__.py new file mode 100644 index 000000000..4fa4672c7 --- /dev/null +++ b/src/python/tests/__init__.py @@ -0,0 +1,15 @@ +import os + +QUIET = os.getenv('MARIAN_QUIET', "").lower() in ("1", "yes", "y", "true", "on") +CPU_THREADS = int(os.getenv('MARIAN_CPU_THREADS', "4")) +WORKSPACE_MEMORY = int(os.getenv('MARIAN_WORKSPACE_MEMORY', "6000")) + +EPSILON = 0.0001 # the precision error we afford in float comparison + +BASE_ARGS = dict( + mini_batch=8, + maxi_batch=64, + cpu_threads=CPU_THREADS, + workspace=WORKSPACE_MEMORY, + quiet=QUIET, +) diff --git a/src/python/tests/test_evaluate.py b/src/python/tests/test_evaluate.py new file mode 100644 index 000000000..d79462901 --- /dev/null +++ b/src/python/tests/test_evaluate.py @@ -0,0 +1,148 @@ +""" +# silense marian log +export MARIAN_QUIET=yes + +# run all tests in this file + pytest -v src/python/tests/test_evaluate.py + pytest -vx src/python/tests/test_evaluate.py #stop on first failure + +# run a single test: + pytest -v src/python/tests/test_evaluate.py -k test_evaluator_chrfoid + pytest -vs src/python/tests/test_evaluate.py -k test_evaluator_chrfoid # see stdout and stderr +""" +import os + +from pymarian import Evaluator +from pymarian.utils import get_known_model + +from . 
import BASE_ARGS + +EPSILON = 0.0001 # the precision error we afford in float comparison + + +# dummy sentences for testing +SAMPLE_SRC_HYP = [ + ["This is a test", "This is a test A"], + ["This is a test B", "This is a test C"], + ["This is a test D", "This is a test E"], +] +SAMPLE_REF_HYP = SAMPLE_SRC_HYP # same for now +SAMPLE_SRC_HYP_REF = [ + ["This is a test", "This is a test A", "This is a test AA"], + ["This is a test B", "This is a test C", "This is a test CC"], + ["This is a test D", "This is a test E", "This is a test EE"], +] + + +def test_evaluator_chrfoid(): + model_path, vocab_path = get_known_model("chrfoid-wmt23") + args = BASE_ARGS | dict( + like="comet-qe", + model=model_path, + vocabs=[vocab_path, vocab_path], + ) + # args = dict(help='') # to get help message with all args + eval = Evaluator(**args) + data = SAMPLE_SRC_HYP + expected_scores = [0.0548, 0.0797, 0.0988] + + scores = eval.evaluate(data) + assert len(scores) == len(data) + for score, expected_score in zip(scores, expected_scores): + if isinstance(score, list): + score = score[0] + assert abs(score - expected_score) < EPSILON + + +def test_evaluator_cometoid22_wmt22(): + model_path, vocab_path = get_known_model("cometoid22-wmt22") + args = BASE_ARGS | dict( + like="comet-qe", + model=model_path, + vocabs=[vocab_path, vocab_path], + ) + # args = dict(help='') # to get help message with all args + eval = Evaluator(**args) + data = SAMPLE_SRC_HYP + expected_scores = [0.71845, 0.7906, 0.81549] + + scores = eval.evaluate(data) + assert len(scores) == len(data) + + for score, expected_score in zip(scores, expected_scores): + if isinstance(score, list): + score = score[0] + assert abs(score - expected_score) < EPSILON + + +def test_evaluator_cometoid22_wmt23(): + model_path, vocab_path = get_known_model("cometoid22-wmt23") + args = BASE_ARGS | dict( + like="comet-qe", + model=model_path, + vocabs=[vocab_path, vocab_path], + ) + eval = Evaluator(**args) + data = SAMPLE_SRC_HYP + expected_scores = [0.75715, 0.81395, 0.8361] + + scores = eval.evaluate(data) + assert len(scores) == len(data) + for score, expected_score in zip(scores, expected_scores): + if isinstance(score, list): + score = score[0] + assert abs(score - expected_score) < EPSILON + + +def test_evaluator_bleurt(): + model_path, vocab_path = get_known_model("bleurt20") + args = BASE_ARGS | dict( + like="bleurt", + model=model_path, + vocabs=[vocab_path, vocab_path], + ) + + eval = Evaluator(**args) + data = SAMPLE_REF_HYP + scores = eval.evaluate(data) + expected_scores = [0.30929, 0.3027, 0.3113] + assert len(scores) == len(data) + for score, expected_score in zip(scores, expected_scores): + if isinstance(score, list): + score = score[0] + assert abs(score - expected_score) < EPSILON + + +# TODO: These below tests are failing + + +def test_evaluator_comet20qe(): + model_path, vocab_path = get_known_model("comet20-da-qe") + args = BASE_ARGS | dict( + like="comet-qe", + model=model_path, + vocabs=[vocab_path, vocab_path], + ) + + eval = Evaluator(**args) + data = SAMPLE_SRC_HYP + scores = eval.evaluate(data) + assert len(scores) == len(data) + # TODO: add expected scores and asserts + + +def test_evaluator_comet20ref(): + model_path, vocab_path = get_known_model("comet20-da") + args = BASE_ARGS | dict( + like="comet", + model=model_path, + vocabs=[vocab_path, vocab_path], + ) + + eval = Evaluator(**args) + data = SAMPLE_SRC_HYP_REF + scores = eval.evaluate(data) + len(scores) == len(data) + + +# TODO: add expected scores and asserts diff --git 
a/src/python/tests/test_train.py b/src/python/tests/test_train.py new file mode 100644 index 000000000..543e45db5 --- /dev/null +++ b/src/python/tests/test_train.py @@ -0,0 +1,142 @@ +import tarfile +import tempfile +import urllib.request +from pathlib import Path + +from pymarian import Trainer +from pymarian.utils import get_known_model + +QUIET = False + +TMP_DATA_DIR = Path.home() / 'tmp' / 'marian-tests' +DATA_URL = "https://textmt.blob.core.windows.net/www/data/marian-tests-data.tgz" + + +def setup(): + ok_file = TMP_DATA_DIR / '_OK' + if not TMP_DATA_DIR.exists() or not ok_file.exists(): + TMP_DATA_DIR.mkdir(parents=True, exist_ok=True) + + print("Downloading data package...") + with urllib.request.urlopen(DATA_URL) as response: + with tarfile.open(fileobj=response, mode="r|gz") as tar: + tar.extractall(path=TMP_DATA_DIR) + ok_file.touch() + print("Done.") + + +setup() + + +def test_train_comet_qe(): + data_dir = TMP_DATA_DIR / 'marian-tests-data/deu-eng' + vocab_file = data_dir / 'vocab.8k.spm' + classe_file = data_dir / 'classes4f.txt' + train_file = data_dir / 'sample.5k.chrfoid-deu-eng.tsv' + # pretrained_model, vocab_file = get_known_model("chrfoid-wmt23") + assert classe_file.exists() + assert vocab_file.exists() + assert train_file.exists() + + args = { + 'dim_emb': 128, + 'enc_depth': 3, + 'dec_depth': 1, + 'tied_embeddings_all': True, + 'transformer_heads': 2, + 'transformer_dim_ffn': 256, + 'transformer_ffn_activation': 'relu', + 'transformer_dropout': 0.1, + 'cost_type': 'ce-mean', + 'max_length': 80, + 'mini_batch_fit': False, + 'maxi_batch': 256, + 'optimizer_params': [0.9, 0.98, '1e-09'], + 'sync_sgd': True, + 'learn_rate': 0.0003, + 'lr_decay_inv_sqrt': [16000], + 'lr_warmup': 16000, + 'label_smoothing': 0.1, + 'clip_norm': 0, + 'exponential_smoothing': 0.0001, + 'early_stopping': 2, + 'keep_best': True, + 'beam_size': 2, + 'normalize': 1, + 'valid_metrics': ['perplexity'], + 'valid_mini_batch': 16, + 'mini_batch': 8, + 'after': '400u', + 'valid_freq': '200u', + 'disp_freq': 100, + 'disp_first': 4, + 'save_freq': '200u', + 'quiet': QUIET, + #'like': 'comet-qe', # only supported at inference; for training, see task and input_types + 'task': 'comet-qe', + 'input_types': ['class', 'sequence', 'sequence'], # required for training + #'pretrained_model': pretrained_model, # for finetuning; not using it because its too big for tests + 'train_sets': [train_file], # TSV file having 3 columns: class sequence sequence + 'tsv': True, + 'tsv-fields': 3, # or it will complain that vocabs and train_sets should be one to one map + 'vocabs': [classe_file, vocab_file, vocab_file], # class sequence sequence + } + with tempfile.TemporaryDirectory() as tmpdir: + save_at = tmpdir + '/model.npz' + trainer = Trainer(model=save_at, **args) + trainer.train() + + +def test_train_transformer_nmt(): + data_dir = TMP_DATA_DIR / 'marian-tests-data/deu-eng' + vocab_file = data_dir / 'vocab.8k.spm' + train_prefix = str(data_dir / 'sample.5k') + src_lang = "deu" + tgt_lang = "eng" + train_src = train_prefix + "." + src_lang + train_tgt = train_prefix + "." 
+ tgt_lang + + # these are taken from regression-tests repo and simplified + args = { + 'type': 'transformer', + 'dim_emb': 128, + 'enc_depth': 3, + 'dec_depth': 1, + 'tied_embeddings_all': True, + 'transformer_heads': 2, + 'transformer_dim_ffn': 256, + 'transformer_ffn_activation': 'relu', + 'transformer_dropout': 0.1, + 'cost_type': 'ce-mean-words', + 'max_length': 80, + 'mini_batch_fit': False, + 'maxi_batch': 256, + 'optimizer_params': [0.9, 0.98, '1e-09'], + 'sync_sgd': True, + 'learn_rate': 0.0003, + 'lr_decay_inv_sqrt': [16000], + 'lr_warmup': 16000, + 'label_smoothing': 0.1, + 'clip_norm': 0, + 'exponential_smoothing': 0.0001, + 'early_stopping': 2, + 'keep_best': True, + 'beam_size': 2, + 'normalize': 1, + 'valid_metrics': ['ce-mean-words', 'bleu', 'perplexity'], + 'valid_mini_batch': 16, + 'mini_batch': 8, + 'after': '400u', # stop after 500 updates + 'valid_freq': '200u', # validate every 250 updates + 'disp_freq': 100, + 'disp_first': 4, + 'save_freq': '200u', + 'vocabs': [vocab_file, vocab_file], + 'train_sets': [train_src, train_tgt], + 'quiet': QUIET, + } + + with tempfile.TemporaryDirectory() as tmpdir: + save_at = tmpdir + '/model.npz' + trainer = Trainer(model=save_at, **args) + trainer.train() diff --git a/src/python/tests/test_translate.py b/src/python/tests/test_translate.py new file mode 100644 index 000000000..0ad5adc60 --- /dev/null +++ b/src/python/tests/test_translate.py @@ -0,0 +1,16 @@ +from pathlib import Path + +from pymarian import Translator + +from . import BASE_ARGS + + +def test_ende(): + # TODO: download model from blob storage + model_dir = Path.home() / 'tmp/marian-eng-deu' + model_file = str(model_dir / 'model.bin') + vocab_file = str(model_dir / 'vocab.spm') + args = BASE_ARGS | dict(models=model_file, vocabs=[vocab_file, vocab_file]) + translator = Translator(**args) + hyp = translator.translate("Hello. Good morning.") + assert hyp == "Hallo. Guten Morgen." diff --git a/src/translator/translator.h b/src/translator/translator.h index 081b06c42..b15683867 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -229,13 +229,17 @@ class TranslateService : public ModelServiceTask { Ptr trgVocab_; Ptr shortlistGenerator_; - std::vector> modelFiles_; + std::vector> modelWeights_; size_t numDevices_; + std::vector> model_items_; // non-mmap public: virtual ~TranslateService() {} + TranslateService(const std::string& cliString) + : TranslateService(parseOptions(cliString, cli::mode::translation, /*validate=*/true)) {} + TranslateService(Ptr options) : options_(options->clone()) { // initialize vocabs @@ -255,7 +259,7 @@ class TranslateService : public ModelServiceTask { trgVocab_->load(vocabPaths.back()); auto srcVocab = srcVocabs_.front(); - std::vector lshOpts = options_->get>("output-approx-knn"); + std::vector lshOpts = options_->get>("output-approx-knn", {}); ABORT_IF(lshOpts.size() != 0 && lshOpts.size() != 2, "--output-approx-knn takes 2 parameters"); // load lexical shortlist @@ -267,47 +271,74 @@ class TranslateService : public ModelServiceTask { auto devices = Config::getDevices(options_); numDevices_ = devices.size(); + ThreadPool threadPool(numDevices_, numDevices_); + scorers_.resize(numDevices_); + graphs_.resize(numDevices_); + + bool mmap = options_->get("model-mmap", false); + auto mmapMode = mmap ? 
io::MmapMode::RequiredMmap : io::MmapMode::OpportunisticMmap; + // preload models - auto models = options->get>("models"); - for(auto model : models) { - modelFiles_.push_back(New(model)); - } + auto modelPaths = options->get>("models"); + for(auto modelPath : modelPaths) + modelWeights_.push_back(New(modelPath, mmapMode)); // initialize scorers + size_t id = 0; for(auto device : devices) { - auto graph = New(true); - - auto precison = options_->get>("precision", {"float32"}); - graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph - graph->setDevice(device); - if (device.type == DeviceType::cpu) { - graph->getBackend()->setOptimized(options_->get("optimize")); - graph->getBackend()->setGemmType(options_->get("gemm-type")); - graph->getBackend()->setQuantizeRange(options_->get("quantize-range")); - } - graph->reserveWorkspaceMB(options_->get("workspace")); - graphs_.push_back(graph); - - auto scorers = createScorers(options_, modelFiles_); - for(auto scorer : scorers) { - scorer->init(graph); - if(shortlistGenerator_) - scorer->setShortlistGenerator(shortlistGenerator_); - } - scorers_.push_back(scorers); + auto task = [&](DeviceId device, size_t id) { + auto graph = New(true); + + auto precison = options_->get>("precision", {"float32"}); + graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph + graph->setDevice(device); + if (device.type == DeviceType::cpu) { + graph->getBackend()->setOptimized(options_->get("optimize")); + graph->getBackend()->setGemmType(options_->get("gemm-type")); + graph->getBackend()->setQuantizeRange(options_->get("quantize-range")); + } + graph->reserveWorkspaceMB(options_->get("workspace")); + graphs_[id] = graph; + + auto scorers = createScorers(options_, modelWeights_); + for(auto scorer : scorers) { + scorer->init(graph); + if(shortlistGenerator_) + scorer->setShortlistGenerator(shortlistGenerator_); + } + + scorers_[id] = scorers; + graph->forward(); + }; + + threadPool.enqueue(task, device, id++); } } - std::string run(const std::string& input) override { + std::vector run(const std::vector& inputs, const std::string& yamlOverridesStr="") override { + auto input = utils::join(inputs, "\n"); + auto translations = run(input, yamlOverridesStr); + return utils::split(translations, "\n", /*keepEmpty=*/true); + } + + std::string run(const std::string& input, const std::string& yamlOverridesStr="") override { + YAML::Node configOverrides = YAML::Load(yamlOverridesStr); + + auto currentOptions = New(options_->clone()); + if (!configOverrides.IsNull()) { + LOG(info, "Overriding options:\n {}", configOverrides); + currentOptions->merge(configOverrides, /*overwrite=*/true); + } + // split tab-separated input into fields if necessary - auto inputs = options_->get("tsv", false) - ? convertTsvToLists(input, options_->get("tsv-fields", 1)) + auto inputs = currentOptions->get("tsv", false) + ? 
convertTsvToLists(input, currentOptions->get("tsv-fields", 1)) : std::vector({input}); - auto corpus_ = New(inputs, srcVocabs_, options_); - data::BatchGenerator batchGenerator(corpus_, options_, nullptr, /*runAsync=*/false); + auto corpus_ = New(inputs, srcVocabs_, currentOptions); + data::BatchGenerator batchGenerator(corpus_, currentOptions, nullptr, /*runAsync=*/false); - auto collector = New(options_->get("quiet-translation", false)); - auto printer = New(options_, trgVocab_); + auto collector = New(currentOptions->get("quiet-translation", false)); + auto printer = New(currentOptions, trgVocab_); size_t batchId = 0; batchGenerator.prepare(); @@ -325,7 +356,7 @@ class TranslateService : public ModelServiceTask { scorers = scorers_[id % numDevices_]; } - auto search = New(options_, scorers, trgVocab_); + auto search = New(currentOptions, scorers, trgVocab_); auto histories = search->search(graph, batch); for(auto history : histories) { @@ -341,7 +372,7 @@ class TranslateService : public ModelServiceTask { } } - auto translations = collector->collect(options_->get("n-best")); + auto translations = collector->collect(currentOptions->get("n-best")); return utils::join(translations, "\n"); } From 5e6e1a04c25e18852d35932bf96f4d68d6a0ec0b Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Mon, 5 Feb 2024 22:47:19 +0000 Subject: [PATCH 12/26] Merged PR 32806: Various small changes and fixes to pybindings and pymarian-evaluate This PR add minor fixes to pybindings and pymarian-evaluate: * comet2marian.py script correctly handles the wmt23-cometkiwi-da-xl/xxl models. * pymarian-evaluate now correctly computes scores * evaluator now exposes an interface function to read the model config --- CMakeLists.txt | 28 ++++++++-------- VERSION | 2 +- scripts/comet/comet2marian.py | 34 +++++++++++-------- src/common/config.cpp | 2 +- src/data/text_input.cpp | 11 +++++-- src/data/text_input.h | 56 +++++++++++++++++++++++++++----- src/evaluator/evaluator.h | 17 +++++++--- src/python/binding/bind.cpp | 5 +-- src/python/binding/evaluator.hpp | 22 ++++++++----- src/python/pymarian/evaluate.py | 6 ++++ src/python/pyproject.toml | 5 +-- 11 files changed, 130 insertions(+), 58 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0ebe2b819..7c9ccc424 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -496,20 +496,22 @@ if(USE_STATIC_LIBS) endif() ############################################################################### -# Find Tcmalloc_minimal +# Find Tcmalloc_minimal # re-used from sentencepiece -if(NOT WIN32) - if(USE_STATIC_LIBS) - find_library(TCMALLOC_LIB NAMES libtcmalloc_minimal.a) - else() - find_library(TCMALLOC_LIB NAMES tcmalloc_minimal) - endif() - if (TCMALLOC_LIB) - message(STATUS "Found TCMalloc: ${TCMALLOC_LIB}") - set(EXT_LIBS ${EXT_LIBS} ${Tcmalloc_LIBRARIES}) - add_definitions(-fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free) - else() - message(STATUS "Not Found TCMalloc: ${TCMALLOC_LIB}") +if(USE_TCMALLOC) + if(NOT WIN32) + if(USE_STATIC_LIBS) + find_library(TCMALLOC_LIB NAMES libtcmalloc_minimal.a) + else() + find_library(TCMALLOC_LIB NAMES tcmalloc_minimal) + endif() + if (TCMALLOC_LIB) + message(STATUS "Found TCMalloc: ${TCMALLOC_LIB}") + set(EXT_LIBS ${EXT_LIBS} ${Tcmalloc_LIBRARIES}) + add_definitions(-fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free) + else() + message(STATUS "Not Found TCMalloc: ${TCMALLOC_LIB}") + endif() endif() endif() diff --git a/VERSION b/VERSION index cddff7b16..8b8e7fdd6 100644 
--- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.21 +v1.12.22 diff --git a/scripts/comet/comet2marian.py b/scripts/comet/comet2marian.py index 09c369260..68912befd 100755 --- a/scripts/comet/comet2marian.py +++ b/scripts/comet/comet2marian.py @@ -13,9 +13,11 @@ # from comet.models import available_metrics # supported_comets = [m for m in available_metrics if 'qe' in m.lower()] supported_comets = [ - 'wmt20-comet-qe-da', 'wmt20-comet-qe-da-v2', 'wmt21-comet-qe-mqm', 'wmt21-comet-qe-da', - 'wmt20-comet-da', 'wmt21-comet-da', 'Unbabel/wmt22-comet-da', 'Unbabel/wmt22-cometkiwi-da', - 'Unbabel/XCOMET-XL', 'Unbabel/XCOMET-XXL' + 'wmt20-comet-qe-da', 'wmt20-comet-qe-da-v2', 'wmt20-comet-da', + 'wmt21-comet-qe-mqm', 'wmt21-comet-qe-da', 'wmt21-comet-da', + 'Unbabel/wmt22-comet-da', 'Unbabel/wmt22-cometkiwi-da', + 'Unbabel/XCOMET-XL', 'Unbabel/XCOMET-XXL', + 'Unbabel/wmt23-cometkiwi-da-xl', 'Unbabel/wmt23-cometkiwi-da-xxl' ] log.basicConfig(level=log.INFO) @@ -87,18 +89,22 @@ def load_comet_model(model_path): config = dict() model_type = type(cometModel).__name__ +print("COMET model params:", cometModel.hparams, file=sys.stderr) + +# are we using the xml-roberta-xl or xml-roberta-xxl model? +isXlmXL = any(pre in cometModel.hparams.get("pretrained_model") for pre in ["xlm-roberta-xl", "xlm-roberta-xxl"]) + if model_type == "RegressionMetric": config["type"] = "comet" elif model_type == "ReferencelessRegression": config["type"] = "comet-qe" elif model_type == "XLMRobertaModel": config["type"] = "comet-qe" -elif model_type == "UnifiedMetric" or model_type == "XCOMETMetric": +elif model_type == "UnifiedMetric" or isXlmXL: config["type"] = "comet-unified" config["input-join-fields"] = True config["separator-symbol"] = "" config["comet-use-separator"] = True - config["comet-pool"] = "cls" else: raise Exception(f'Unknown type of model {model_type}') @@ -109,7 +115,7 @@ def load_comet_model(model_path): config["transformer-train-position-embeddings"] = True # Roberta-XXL (hence XCOMET-XXL) has pre-norm -if model_type == "XCOMETMetric": # @TODO: make this depend on RobertaXL/XXL rather than model_type +if isXlmXL: config["transformer-preprocess"] = "n" config["transformer-postprocess"] = "da" config["transformer-postprocess-emb"] = "" @@ -123,15 +129,16 @@ def load_comet_model(model_path): config["bert-type-vocab-size"] = 0 config["comet-prepend-zero"] = True -print(cometModel.hparams) - +config["comet-pool"] = cometModel.hparams.get("pool", "cls") config["comet-mix"] = cometModel.hparams.get("layer") == "mix" config["comet-mix-norm"] = cometModel.hparams.get('layer_norm', False) config["comet-mix-transformation"] = cometModel.hparams.get("layer_transformation", "softmax"); -# they have a bug in their code that makes this always true -if model_type == "UnifiedMetric" or model_type == "XCOMETMetric": +# there are several issues in their code that make the following always true regardless of values in hparams +# that was hard to find out +if model_type == "UnifiedMetric" or isXlmXL: config["comet-mix-transformation"] = "softmax" + config["comet-pool"] = "cls" if not args.roberta: config["comet-final-sigmoid"] = args.add_sigmoid @@ -206,7 +213,7 @@ def extract(layer, nth, level): convert(pd, ["attention.output.dense.bias"], f"{blockPrefix}->selfAttention->oProj->bias", bias=True) # self-attention layer-norm - if model_type == "XCOMETMetric": # @TODO: make this depend on RobertaXL/XXL rather than model_type + if isXlmXL: convert(pd, ["attention.self_attn_layer_norm.weight"], 
f"{blockPrefix}->preprocessor->norm->weight", bias=True) convert(pd, ["attention.self_attn_layer_norm.bias"], f"{blockPrefix}->preprocessor->norm->bias", bias=True) else: @@ -224,7 +231,7 @@ def extract(layer, nth, level): convert(pd, ["output.dense.bias"], f"{blockPrefix}->layers->at(3)->as()->bias", bias=True) # ffn layer-norm - if model_type == "XCOMETMetric": # @TODO: make this depend on RobertaXL/XXL rather than model_type + if isXlmXL: convert(pd, ["LayerNorm.weight"], f"{blockPrefix}->preprocessor->norm->weight", bias=True) convert(pd, ["LayerNorm.bias"], f"{blockPrefix}->preprocessor->norm->bias", bias=True) else: @@ -267,7 +274,7 @@ def extract(layer, nth, level): prefix = "CometEncoder" # post-embedding layer normalization - if model_type == "XCOMETMetric": # @TODO: make this depend on RobertaXL/XXL rather than model_type + if isXlmXL: convert(pd, ["encoder.LayerNorm.weight"], f"{prefix}->encoder->postprocessor->norm->weight", bias=True) convert(pd, ["encoder.LayerNorm.bias"], f"{prefix}->encoder->postprocessor->norm->bias", bias=True) else: @@ -309,7 +316,6 @@ def extract(layer, nth, level): # 3-layer FFN network that computes COMET regression prefix = "CometQEPooler" - # @TODO: make final sigmoid optional convert(pd, ["ff.0.weight"], f"{prefix}->layers->at(0)->as()->weight") convert(pd, ["ff.0.bias"], f"{prefix}->layers->at(0)->as()->bias", bias=True) diff --git a/src/common/config.cpp b/src/common/config.cpp index b6296a8b2..78c2aac1b 100644 --- a/src/common/config.cpp +++ b/src/common/config.cpp @@ -281,7 +281,7 @@ Ptr parseOptions(int argc, char** argv, cli::mode mode, bool validate) Ptr parseOptions(const std::string& args, cli::mode mode, bool validate) { std::vector vArgs = utils::split(args, " "); - + std::string dummy("marian"); std::vector cArgs = { &dummy[0] }; for(auto& arg : vArgs) diff --git a/src/data/text_input.cpp b/src/data/text_input.cpp index 0ccaedf14..e8801afb8 100644 --- a/src/data/text_input.cpp +++ b/src/data/text_input.cpp @@ -13,9 +13,9 @@ void TextIterator::increment() { } bool TextIterator::equal(TextIterator const& other) const { - // two iterators are equal if any of the following is true: + // two iterators are equal if any of the following is true: // 1. both are invalid (null ptrs) - // 2. both at the end of the stream (empty tuples as record, regardless of pos_) + // 2. both at the end of the stream (empty tuples as record, regardless of pos_) // 3. both are at the same position return (!this->tup_.valid() && !other.tup_.valid()) || (this->tup_.valid() && other.tup_.valid() && this->tup_.empty() && other.tup_.empty()) || @@ -32,7 +32,12 @@ TextInput::TextInput(std::vector inputs, : DatasetBase(inputs, options), vocabs_(vocabs), maxLength_(options_->get("max-length")), - maxLengthCrop_(options_->get("max-length-crop")) { + maxLengthCrop_(options_->get("max-length-crop")), + rightLeft_(options_->get("right-left")), + prependZero_(options_->get("comet-prepend-zero", false)), + joinFields_(options_->get("input-join-fields", false)), + insertSeparator_(options_->get("comet-use-separator", false)) + { // Note: inputs are automatically stored in the inherited variable named paths_, but these are // texts not paths! 
for(const auto& text : paths_) diff --git a/src/data/text_input.h b/src/data/text_input.h index 3a399b6d2..0e6d86e23 100644 --- a/src/data/text_input.h +++ b/src/data/text_input.h @@ -35,6 +35,13 @@ class TextInput : public DatasetBase { size_t maxLength_{0}; bool maxLengthCrop_{false}; + bool rightLeft_{false}; + + // copied from corpus.h - TODO: refactor or unify code between Corpus and TextInput + bool prependZero_{false}; + bool joinFields_{false}; // if true when given a TSV file or multiple inputs, join them together into a single sentence tuple, + // the already present separator will demark the fields (mostly used for BLEURT and COMET-KIWI) + bool insertSeparator_{false}; // when joining fields with joinFields_, additionally use this separator (mostly used for COMET-KIWI) public: TextInput(std::vector inputs, std::vector> vocabs, Ptr options); @@ -97,15 +104,48 @@ class TextInput : public DatasetBase { ABORT_IF(row.size() != vocabs_.size(), "Number of fields does not match number of vocabs"); // fill up the sentence tuple with source and/or target sentences SentenceTupleImpl tup(id); - for(size_t i = 0; i < row.size(); ++i) { - std::string field = row[i]; - Words words = vocabs_[i]->encode(field, /*addEOS=*/true, /*inference=*/inference_); - if(this->maxLengthCrop_ && words.size() > this->maxLength_) { - words.resize(maxLength_); - words.back() = vocabs_.back()->getEosId(); // note: this will not work with class-labels + + // copied and adapted from corpus.cpp - @TODO: refactor or unify code between Corpus and TextInput + for(size_t batchIndex = 0; batchIndex < row.size(); ++batchIndex) { + std::string& field = row[batchIndex]; + Words words = vocabs_[batchIndex]->encode(field, /*addEOS =*/true, inference_); + ABORT_IF(words.empty(), "Empty input sequences are presently untested"); + + // This handles adding starts symbols for COMET () and BERT/BLEURT ([CLS]) + bool prepend = prependZero_ && (!joinFields_ || (joinFields_ && batchIndex == 0)); + if(prepend) + words.insert(words.begin(), Word::fromWordIndex(0)); + + bool prependSep = insertSeparator_ && joinFields_ && batchIndex > 0; + if(prependSep) + words.insert(words.begin(), vocabs_[batchIndex]->getSepId()); + + // if fields are joined and the current sentence is not the first one, we need to make sure that + // the current sentence is not longer than the maximum length minus the length of the previous sentence + // (minus 1 for the separator token or 2 if we also add a separator token) + size_t localMaxLength = maxLength_; + if(joinFields_ && !tup.empty()) + localMaxLength = std::max(1 + (int)prependSep, (int)maxLength_ - (int)tup.back().size()); + + // if the current sentence is longer than the maximum length, we need to crop it + if(maxLengthCrop_ && words.size() > localMaxLength) { + words.resize(localMaxLength); + words.back() = vocabs_[batchIndex]->getEosId(); + } + + // if true, the words are reversed + if(rightLeft_) + std::reverse(words.begin(), words.end() - 1); + + // if true, the numeric indices get joined with the previous sentence, acts as a separator here + if(joinFields_) { + size_t currLength = tup.empty() ? 0 : tup.back().size(); + // if the current sentence would exceed the maximum length we don't add any more fields + if(currLength + words.size() < maxLength_) + tup.appendToBack(words); + } else { + tup.pushBack(words); } - ABORT_IF(words.empty(), "No words (not even EOS) found in the input text. 
ID: " + std::to_string(id)); - tup.pushBack(words); } return SentenceTuple(tup); } diff --git a/src/evaluator/evaluator.h b/src/evaluator/evaluator.h index 022a8204c..257804e41 100644 --- a/src/evaluator/evaluator.h +++ b/src/evaluator/evaluator.h @@ -50,7 +50,7 @@ class Evaluate : public ModelTask { std::vector> graphs_; std::vector> models_; - Ptr modelFile_; + Ptr modelWeights_; public: Evaluate(Ptr options) : options_(options) { @@ -69,7 +69,7 @@ class Evaluate : public ModelTask { auto modelPath = options_->get("model"); LOG(info, "Loading model from {}", modelPath); - modelFile_ = New(modelPath); + modelWeights_ = New(modelPath); graphs_.resize(devices.size()); models_.resize(devices.size()); @@ -85,7 +85,7 @@ class Evaluate : public ModelTask { graph->reserveWorkspaceMB(options_->get("workspace")); auto model = New(options_); - model->load(graph, modelFile_); + model->load(graph, modelWeights_); models_[j] = model; graphs_[j] = graph; @@ -107,10 +107,10 @@ class Evaluate : public ModelTask { run(batchGenerator, output); LOG(info, "Total time: {:.5f}s wall", timer.elapsed()); } - + template void run(Ptr> batchGenerator, Ptr collector) { - + size_t batchId = 0; { ThreadPool pool(graphs_.size(), graphs_.size()); @@ -158,6 +158,13 @@ class Evaluate : public ModelTask { } } + std::string getModelConfig() { + ABORT_IF(!modelWeights_, "Model weights are not loaded"); + YAML::Emitter outYaml; + cli::OutputYaml(modelWeights_->getYamlFromModel(), outYaml); + return outYaml.c_str(); + } + }; } // namespace marian diff --git a/src/python/binding/bind.cpp b/src/python/binding/bind.cpp index 9e8cc4464..38a1e3429 100644 --- a/src/python/binding/bind.cpp +++ b/src/python/binding/bind.cpp @@ -1,6 +1,6 @@ #include "pybind11/pybind11.h" #include "pybind11/stl.h" -// if your IDE/vscode complains about missing paths +// if your IDE/vscode complains about missing paths // pybind11 can be found by "python -m pybind11 --includes"; you may need to add both pybind11 and Python.h #include "embedder.hpp" #include "evaluator.hpp" @@ -17,7 +17,7 @@ using namespace pymarian; PYBIND11_MODULE(_pymarian, m) { m.doc() = "Marian C++ API bindings via pybind11"; - /** TODOS + /** TODOS * 1. API to check if gpu available: cuda_is_available() -> bool * 2. API to check number of gpus:: cuda_device_count() -> int */ @@ -31,6 +31,7 @@ PYBIND11_MODULE(_pymarian, m) { py::class_(m, "Evaluator") .def(py::init()) .def("evaluate", py::overload_cast(&EvaluatorPyWrapper::run)) + .def("get_model_config", py::overload_cast<>(&EvaluatorPyWrapper::getModelConfig)) ; py::class_(m, "Trainer") diff --git a/src/python/binding/evaluator.hpp b/src/python/binding/evaluator.hpp index f72ccd08a..37b687d21 100644 --- a/src/python/binding/evaluator.hpp +++ b/src/python/binding/evaluator.hpp @@ -26,13 +26,13 @@ namespace pymarian { /** * Wrapper for Marian Evaluator. - * + * * This class is a wrapper for the Marian Evaluator class. * It is used to run the evaluator on a given input. 
- * + * **/ class EvaluatorPyWrapper { - + private: Ptr options_; Ptr evaluator_; @@ -46,10 +46,10 @@ namespace pymarian { EvaluatorPyWrapper(const std::string& cliString){ options_ = parseOptions(cliString, cli::mode::evaluating, true) ->with("inference", true, "shuffle", "none"); - evaluator_= New(options_); + evaluator_ = New(options_); vocabs_ = loadVocabs(options_); } - + /** * @brief Load the vocabularies from the given paths * @param options - the options object @@ -69,7 +69,7 @@ namespace pymarian { /** * Given a table of strings (i.e., rows x columns), concatenate each column into a single string. - * + * * @param data - table of strings : rows x columns * @return List of strings, one string for each column, concatenated across rows. */ @@ -92,12 +92,12 @@ namespace pymarian { } /** - * Run the evaluator on the given input. + * Run the evaluator on the given input. * Input is transformed as (in memory) files by concatenating columns. - * + * * @param inputs - table of strings : rows x columns * @return table of floats : rows x columns - * + * */ auto run(const StrVectors& inputs) -> FloatVectors { StrVector columnFiles = concatColumns(inputs); @@ -114,6 +114,10 @@ namespace pymarian { return outputs; } + auto getModelConfig() -> std::string { + return evaluator_->getModelConfig(); + } + }; } diff --git a/src/python/pymarian/evaluate.py b/src/python/pymarian/evaluate.py index be13f3f00..371a37006 100755 --- a/src/python/pymarian/evaluate.py +++ b/src/python/pymarian/evaluate.py @@ -9,6 +9,8 @@ import subprocess import sys import threading +import yaml + from pathlib import Path from typing import Iterator, List, Optional, Tuple, Union @@ -138,7 +140,11 @@ def pymarian_evaluate( raise ImportError('pymarian is not installed. Please install it and rerun') log.info(f'Marian CLI::\n\t{cmd_line}') + evaluator = Evaluator(cmd_line) + config = yaml.safe_load(evaluator.get_model_config()) + log.info(f'Model config: {config}') + assert average in ('skip', 'append', 'only') lines = (line.rstrip('\n').split('\t') for line in input_lines) diff --git a/src/python/pyproject.toml b/src/python/pyproject.toml index a9cf413a7..84d1b0e8f 100644 --- a/src/python/pyproject.toml +++ b/src/python/pyproject.toml @@ -30,7 +30,8 @@ classifiers = [ dependencies = [ "tqdm", - "requests" + "requests", + "pyyaml" ] [project.scripts] @@ -56,7 +57,7 @@ target-version = ['py37', 'py38', 'py39', 'py310', 'py311'] include = 'src/python/.*\.pyi?$' skip-string-normalization = true -# black doesn't sort imports. So we use isort for that. See discussion https://github.com/psf/black/issues/333 +# black doesn't sort imports. So we use isort for that. 
See discussion https://github.com/psf/black/issues/333 [tool.isort] profile = "black" src_paths = ["src/python"] From 4cdf93a2c8b1d02f38faade4de73635a8d474f1b Mon Sep 17 00:00:00 2001 From: Thamme Gowda Date: Tue, 6 Feb 2024 05:36:43 +0000 Subject: [PATCH 13/26] Merged PR 32860: Azure CI: save disk space by disabling compilation for Ampere and Turing Ubuntu CI: ON to Maxwell, Pascal and Volta; OFF to Ampere and Turing * to fix space issue on CI vms --- azure-pipelines.yml | 27 ++++++++++++++++++++------- src/data/corpus_base.cpp | 4 +++- src/data/text_input.h | 4 +++- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index a1e9ea94f..4c7cd0bfd 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -247,6 +247,7 @@ stages: gcc: 9 unit_tests: true examples: false + pymarian: true static: true # Ubuntu GPU-only build "GPU-only": @@ -258,6 +259,7 @@ stages: gcc: 9 unit_tests: false examples: false + pymarian: true static: false ################################################################ # Ubuntu 22.04 supports CUDA 11+ @@ -271,20 +273,23 @@ stages: gpu: true cuda: 11.7 gcc: 11 - unit_tests: false # disable unit tests to minimize compilation time - examples: false # disable examples to minimize compilation time + unit_tests: true + examples: true + pymarian: true static: false ################################################################ # Ubuntu 20.04 supports CUDA 11+ "20.04 CUDA 11.1 gcc-9": image: ubuntu-20.04 - boost: true + boost: false cpu: true gpu: true cuda: 11.1 gcc: 9 - unit_tests: true - examples: true + # static cause large binaries so we turn off tests and examples + unit_tests: false + examples: false + pymarian: false static: true ################################################################ # Ubuntu 16.04 is no longer available on Azure-hosted machines @@ -324,8 +329,9 @@ stages: condition: eq(variables.gpu, true) # Some preinstalled versions of pip are bad for pymarian; see https://github.com/pypa/setuptools/issues/3269 - - bash: python3 -m pip install pip -U + - bash: python3 -m pip install pip -U displayName: Upgrade pip + condition: eq(variables.pymarian, true) - bash: | mkdir -p build @@ -334,6 +340,12 @@ stages: cmake .. \ -DCOMPILE_CPU=$(cpu) \ -DCOMPILE_CUDA=$(gpu) \ + -DCOMPILE_MAXWELL=$(gpu) \ + -DCOMPILE_PASCAL=$(gpu) \ + -DCOMPILE_VOLTA=$(gpu) \ + -DCOMPILE_AMPERE=OFF \ + -DCOMPILE_AMPERE_RTX=OFF \ + -DCOMPILE_TURING=OFF \ -DCOMPILE_EXAMPLES=$(examples) \ -DCOMPILE_SERVER=$(boost) \ -DCOMPILE_TESTS=$(unit_tests) \ @@ -343,7 +355,7 @@ stages: -DBoost_ARCHITECTURE=-x64 \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-$(cuda) \ -DUSE_TCMALLOC=off \ - -DPYMARIAN=ON \ + -DPYMARIAN=$(pymarian) \ -DPYTHON_EXECUTABLE=python3 displayName: Configure CMake @@ -374,6 +386,7 @@ stages: python3 -m pip install build/pymarian-*.whl python3 -m pymarian -v displayName: Build Pymarian + condition: eq(variables.pymarian, true) ###################################################################### - job: BuildMacOS diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp index 47381d9b9..e1b0aad62 100644 --- a/src/data/corpus_base.cpp +++ b/src/data/corpus_base.cpp @@ -461,8 +461,10 @@ void CorpusBase::addWordsToSentenceTuple(const std::string& line, if(joinFields_) { size_t currLength = tup.empty() ? 
0 : tup.back().size(); // if the current sentence would exceed the maximum length we don't add any more fields - if(currLength + words.size() < maxLength_) + if(currLength + words.size() <= maxLength_) tup.appendToBack(words); + + ABORT_IF(tup.empty(), "This should have content if we got here??"); } else { tup.pushBack(words); } diff --git a/src/data/text_input.h b/src/data/text_input.h index 0e6d86e23..f2e9831de 100644 --- a/src/data/text_input.h +++ b/src/data/text_input.h @@ -141,8 +141,10 @@ class TextInput : public DatasetBase { if(joinFields_) { size_t currLength = tup.empty() ? 0 : tup.back().size(); // if the current sentence would exceed the maximum length we don't add any more fields - if(currLength + words.size() < maxLength_) + if(currLength + words.size() <= maxLength_) tup.appendToBack(words); + + ABORT_IF(tup.empty(), "This should have content if we got here??"); } else { tup.pushBack(words); } From bd9a679396c304609ff7fce14bb7fc2e8535d840 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 6 Feb 2024 15:39:39 +0000 Subject: [PATCH 14/26] Merged PR 32636: Extending new layer framework to match production models This PR implements a bunch of missing functionality in the new layer framework. Among others: * Autoregressive self-attention * Guided alignment training * Decode-time alignment Minor refactoring of previous code to accommodate above changes. When setting `export TRANSFORMER_FLAVOR=experimental` all legacy transformer models are internally mapped to the new layer framework. With that enabled: Production regression tests all pass. Passes all public regression tests with the exception of: - tests/factors/test_factors_concat.sh - tests/factors/test_factors_decoder_concat.sh - tests/models/wnmt18/test_student_small_aan.sh - tests/models/wnmt18/test_student_small_aan_intgemm16.sh - tests/models/wnmt18/test_student_small_aan_intgemm8.sh and - tests/interface/input-tsv/test_tsv_train_with_align_and_weights.sh - tests/interface/input-tsv/test_tsv_train_with_align_and_weights_inputtypes.sh I could get these to work, but it doesn't seem to be worth it. I plan to remove both code paths in the future. The last two are -- I think -- just divergences due to mild model differences and probably don't need fixing, rather future adaptation. 
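For reference, a minimal standalone sketch of the optional scoped-lock idiom this patch introduces in ModelWeights::scopedLockGuard() (see the src/common/io.* hunks below). The class name Weights and the load() body are illustrative only, and the template arguments std::lock_guard<std::mutex> are assumed from the surrounding declarations; the real code avoids std::optional because of nvcc problems with CUDA 10.x.

#include <memory>
#include <mutex>

// Sketch of the optional-locking idiom: when locking is enabled the caller
// holds a real lock_guard for the current scope; when disabled the returned
// pointer is empty and no lock is taken. The lock (if any) is released
// automatically when the unique_ptr goes out of scope.
class Weights {                        // illustrative stand-in for ModelWeights
private:
  mutable std::mutex mutex_;
  bool locking_{true};
  bool loaded_{false};

public:
  explicit Weights(bool locking = true) : locking_(locking) {}

  std::unique_ptr<std::lock_guard<std::mutex>> scopedLockGuard() const {
    if(locking_)
      return std::unique_ptr<std::lock_guard<std::mutex>>(
          new std::lock_guard<std::mutex>(mutex_));
    return nullptr;
  }

  void load() {
    auto optionalLock = scopedLockGuard();  // empty if locking_ == false
    if(loaded_)
      return;
    // ... read or mmap the model items here ...
    loaded_ = true;
  }
};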
--- CMakeLists.txt | 7 +- VERSION | 2 +- src/common/io.cpp | 12 +- src/common/io.h | 23 +- src/data/corpus_base.h | 3 +- src/graph/cached_expression.h | 16 +- src/graph/expression_graph.h | 2 + src/graph/node_operators_unary.h | 28 +-- src/layers/output.cpp | 4 +- src/layers_new/alibi.cpp | 90 +++++--- src/layers_new/alibi.cu | 60 +++-- src/layers_new/alibi.h | 234 ++++++++++--------- src/layers_new/attention.cpp | 74 ++++-- src/layers_new/attention.h | 287 +++++++++++++++++------ src/layers_new/decoder.h | 42 +++- src/layers_new/interface.h | 77 ++++--- src/layers_new/neuralnet.h | 12 +- src/layers_new/rnn.h | 29 ++- src/layers_new/transformer.h | 378 +++++++++++++++++++++++-------- src/microsoft/quicksand.cpp | 2 +- src/models/amun.h | 5 +- src/models/bleurt.h | 9 +- src/models/nematus.h | 5 +- src/models/transformer.h | 54 ++--- src/models/transformer_factory.h | 34 +-- src/models/transformer_new.h | 45 ++-- 26 files changed, 1006 insertions(+), 528 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c9ccc424..e16876f78 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -479,8 +479,11 @@ if(NOT MSVC) list(APPEND CUDA_NVCC_FLAGS ${INTRINSICS_NVCC}) else() # c++17 doesn't work with CUDA 10 - # list(APPEND CUDA_NVCC_FLAGS -std=c++17; -Xcompiler "/std:c++17"; -Xcompiler\ /FS; -Xcompiler\ /MT$<$:d>; ) - list(APPEND CUDA_NVCC_FLAGS -Xcompiler\ /FS; -Xcompiler\ /MT$<$:d>; ) + if ((CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0")) + list(APPEND CUDA_NVCC_FLAGS -Xcompiler\ /std:c++17; -Xcompiler\ /FS; -Xcompiler\ /MT$<$:d>; ) + else() + list(APPEND CUDA_NVCC_FLAGS -Xcompiler\ /std:c++14; -Xcompiler\ /FS; -Xcompiler\ /MT$<$:d>; ) + endif() endif() list(REMOVE_DUPLICATES CUDA_NVCC_FLAGS) diff --git a/VERSION b/VERSION index 8b8e7fdd6..9db15f195 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.22 +v1.12.23 diff --git a/src/common/io.cpp b/src/common/io.cpp index 109b3a1ed..61a9054cd 100644 --- a/src/common/io.cpp +++ b/src/common/io.cpp @@ -139,8 +139,18 @@ std::vector ModelWeights::mmapItems(const void* ptr) { return items; } +std::unique_ptr> ModelWeights::scopedLockGuard() const { + // @TODO: this should use std::optional, but as long as we use CUDA 10.x there may be + // random problems with std::optional and nvcc compilation + if(locking_) + return std::unique_ptr>(new std::lock_guard(mutex_)); + else + return nullptr; +} + void ModelWeights::load() { - std::lock_guard lock(mutex_); + auto optionalLock = scopedLockGuard(); + if(loaded_) return; diff --git a/src/common/io.h b/src/common/io.h index 1db0a83fe..8eaf47665 100644 --- a/src/common/io.h +++ b/src/common/io.h @@ -9,6 +9,7 @@ #include "common/definitions.h" #include "common/io_item.h" +#include #include #include @@ -32,8 +33,6 @@ bool isBin(const std::string& fileName); class ModelWeights { private: - std::mutex mutex_; - std::string fileName_; const void* ptr_{nullptr}; @@ -48,14 +47,17 @@ class ModelWeights { std::vector items_; std::unique_ptr mmap_; + mutable std::mutex mutex_; + bool locking_{true}; // if true, the mutex will be locked when accessing the data, see scopedLockGuard() + std::vector loadItems(const std::string& fileName); std::vector mmapItems(const void* ptr); void load(); public: - ModelWeights(const std::string& fileName, MmapMode mmapMode = MmapMode::OpportunisticMmap) - : fileName_(fileName), fileType_(getFileType(fileName)), mmapMode_(mmapMode) { + ModelWeights(const std::string& fileName, MmapMode mmapMode = MmapMode::OpportunisticMmap, bool locking = true) + : 
fileName_(fileName), fileType_(getFileType(fileName)), mmapMode_(mmapMode), locking_(locking) { // NPZ files cannot be memory-mapped, so we switch opportunistic mmap off, but keep any other mmap mode if(fileType_ == FileType::isNpz && mmapMode_ == MmapMode::OpportunisticMmap) @@ -65,11 +67,11 @@ class ModelWeights { ABORT_IF(fileType_ == FileType::isNpz && mmapMode_ != MmapMode::DontMmap, "NPZ files cannot be memory-mapped"); } - ModelWeights(const void* ptr, MmapMode mmapMode = MmapMode::RequiredMmap) - : ptr_(ptr), fileType_(FileType::isBuf), mmapMode_(mmapMode) {} + ModelWeights(const void* ptr, MmapMode mmapMode = MmapMode::RequiredMmap, bool locking = true) + : ptr_(ptr), fileType_(FileType::isBuf), mmapMode_(mmapMode), locking_(locking) {} - ModelWeights() - : fileType_(FileType::isDummy), mmapMode_{MmapMode::DontMmap} {} + ModelWeights(bool locking = true) + : fileType_(FileType::isDummy), mmapMode_{MmapMode::DontMmap}, locking_(locking) {} ModelWeights(const ModelWeights&&) = delete; ModelWeights(const ModelWeights&) = delete; @@ -85,6 +87,11 @@ class ModelWeights { YAML::Node getYamlFromModel(const std::string& varName = "special:model.yml") const; + // If locking is set to false, the returned unique_ptr will be empty and no lock will be acquired. + // Otherwise the returned unique_ptr will contain a lock guard that will be released when the unique_ptr + // goes out of scope. So we have an optional scoped lock guard. + std::unique_ptr> scopedLockGuard() const; + void loadAndSync(Ptr mpi); }; diff --git a/src/data/corpus_base.h b/src/data/corpus_base.h index 074689804..b21da01c6 100644 --- a/src/data/corpus_base.h +++ b/src/data/corpus_base.h @@ -657,8 +657,7 @@ class CorpusBase : public DatasetBase separator will demark the fields (mostly used for BLEURT and COMET-KIWI) bool insertSeparator_{false}; // when joining fields with joinFields_, additionally use this separator (mostly used for COMET-KIWI) diff --git a/src/graph/cached_expression.h b/src/graph/cached_expression.h index f7adff8bc..376a9f14f 100644 --- a/src/graph/cached_expression.h +++ b/src/graph/cached_expression.h @@ -6,8 +6,8 @@ namespace marian { -// This class allows for simpler caching of Expr objects and automatic checking if the -// cached Expr needs to be updated/recreated. +// This class allows for simpler caching of Expr objects and automatic checking if the +// cached Expr needs to be updated/recreated. class CachedExpr { private: ENABLE_INTRUSIVE_PTR(CachedExpr); @@ -21,27 +21,27 @@ class CachedExpr { UPtr applyFun_; // function that creates the cached result UPtr equalFun_; // function that checks if the input changed. If yes, // the `apply_` functions gets reapplied and the new result - // is cached. - + // is cached. + public: // No functors are given; they will have to supplied when calling `apply`. CachedExpr() {}; // No apply functor is given; it will have to supplied when calling `apply`. - CachedExpr(EqualFunT equalFun) + CachedExpr(EqualFunT equalFun) : equalFun_(new EqualFunT(equalFun)) {}; // Both functors are given, and will be used by default. They can however be overriden // if supplied directly in `apply`. - CachedExpr(ApplyFunT applyFun, EqualFunT equalFun) + CachedExpr(ApplyFunT applyFun, EqualFunT equalFun) : applyFun_(new ApplyFunT(applyFun)), equalFun_(new EqualFunT(equalFun)) {}; - // lazily executes the factory `applyFun` if `equalFun` indicates that the input has changed. 
+ // lazily executes the factory `applyFun` if no value is cached or `equalFun` indicates that the input has changed. Expr apply(Expr key, ApplyFunT applyFun, EqualFunT equalFun) { if(!cachedKey_ || !equalFun(cachedKey_, key)) { cachedKey_ = key; cachedValue_ = applyFun(key); - } + } return cachedValue_; } diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h index 915c9df3f..239ecaeaf 100644 --- a/src/graph/expression_graph.h +++ b/src/graph/expression_graph.h @@ -799,6 +799,8 @@ class ExpressionGraph : public std::enable_shared_from_this { setReloaded(false); for(auto& item : modelWeights->items()) { + auto lockGuard = modelWeights->scopedLockGuard(); + std::string pName = item.name; // skip over special parameters starting with "special:" if(pName.substr(0, 8) == "special:") diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h index aa3f5004c..ba11eca0e 100644 --- a/src/graph/node_operators_unary.h +++ b/src/graph/node_operators_unary.h @@ -503,7 +503,7 @@ struct ReduceNodeOp : public UnaryNodeOp { : UnaryNodeOp(a, newShape(a, axis)), opCode_(opCode) { reducedDim_ = a->shape()[axis]; // e.g. used in mean() - ABORT_IF(reducedDim_ != a->shape().elements() / shape().elements(), + ABORT_IF(reducedDim_ != a->shape().elements() / shape().elements(), "Bug in determining reducedDim {} != {}", reducedDim_, a->shape().elements() / shape().elements()); @@ -619,13 +619,13 @@ class CumSumNodeOp : public UnaryNodeOp { int axis_; bool reverse_; bool exclusive_; - + public: - CumSumNodeOp(Expr a, int axis, bool reverse, bool exclusive) - : UnaryNodeOp(a), - axis_(a->shape().axis(axis)), + CumSumNodeOp(Expr a, int axis, bool reverse, bool exclusive) + : UnaryNodeOp(a), + axis_(a->shape().axis(axis)), reverse_(reverse), - exclusive_(exclusive) + exclusive_(exclusive) {} NodeOps forwardOps() override { @@ -685,10 +685,10 @@ class LogCumSumExpNodeOp : public UnaryNodeOp { public: LogCumSumExpNodeOp(Expr a, int axis, bool reverse, bool exclusive, bool fast=false) - : UnaryNodeOp(a), - axis_(a->shape().axis(axis)), + : UnaryNodeOp(a), + axis_(a->shape().axis(axis)), reverse_(reverse), - exclusive_(exclusive), + exclusive_(exclusive), fast_(fast) {} @@ -1019,10 +1019,10 @@ class CallbackNodeOp : public ReshapeNodeOp { private: typedef std::function LambdaNodeCallback; std::unique_ptr callback_; - + public: CallbackNodeOp(Expr node, LambdaNodeCallback callback) - : ReshapeNodeOp(node, node->shape()), + : ReshapeNodeOp(node, node->shape()), callback_(new LambdaNodeCallback(callback)) { } @@ -1053,10 +1053,10 @@ class CallbackNodeOp : public ReshapeNodeOp { class DropoutReluInplaceNodeOp : public ReshapeNodeOp { private: Expr mask_; - + public: DropoutReluInplaceNodeOp(Expr node, Expr mask = nullptr) - : ReshapeNodeOp(node, node->shape()), + : ReshapeNodeOp(node, node->shape()), mask_(mask) {} void forward() override { @@ -1312,7 +1312,7 @@ struct ShiftNodeOp : public UnaryNodeOp { if(!cnode) return false; if(shift_ != cnode->shift_) - return false; + return false; if(padValue_ != cnode->padValue_) return false; return true; diff --git a/src/layers/output.cpp b/src/layers/output.cpp index 8977464b1..05b70645b 100644 --- a/src/layers/output.cpp +++ b/src/layers/output.cpp @@ -92,7 +92,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { }; auto affineShortlist = [this](Expr x, Expr W, Expr b, bool transA, bool transB) { - /* + /* std::cerr << "affineShortlist.x=" << x->shape() << std::endl; std::cerr << "affineShortlist.W=" << W->shape() << std::endl; 
if (b) std::cerr << "affineShortlist.b=" << b->shape() << std::endl; @@ -114,7 +114,7 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ { else { // original shortlist. W always has 1 for beam & batch ret = dot(x, W, transA, transB); - } + } //std::cerr << "ret.x=" << ret->shape() << std::endl; return ret; diff --git a/src/layers_new/alibi.cpp b/src/layers_new/alibi.cpp index abffb6bae..44f0eb60b 100644 --- a/src/layers_new/alibi.cpp +++ b/src/layers_new/alibi.cpp @@ -56,7 +56,7 @@ Expr AlibiDecoderState::getAlibiShift(Ptr graph, bool decoding) std::vector shift; for(const auto& [trgPos, srcPos, batchIdx] : syncPoints_) shift.push_back((float)(srcPos - trgPos)); - + if(!shift.empty()) { int dimBeam = lastBeam_; ABORT_IF(dimBeam == 0, "dimBeam is 0??"); @@ -66,7 +66,7 @@ Expr AlibiDecoderState::getAlibiShift(Ptr graph, bool decoding) return nullptr; } } else { - ABORT_IF(getBatch()->sets() != 2, + ABORT_IF(getBatch()->sets() != 2, "--transformer-alibi-shift=true currently only works with batch sets=2"); return getAlibiShiftFromBatch(graph); } @@ -93,7 +93,7 @@ Expr AlibiDecoderState::getAlibiShiftFromBatch(Ptr graph) const int dimBatch = (int)targetBatch->batchSize(); int dimSrc = (int)sourceBatch->batchWidth(); int dimTrg = (int)targetBatch->batchWidth(); - + for(int batchIdx = 0; batchIdx < dimBatch; ++batchIdx) { int trgPos = -1, srcPos = -1; for(int i = 0; i < dimTrg; ++i) { @@ -148,7 +148,7 @@ std::vector AlibiDecoderState::computeSyncPoints( // If the current symbol is a sync symbol, the sync point target coordinate is updated to the current position // and the source coordinate is updated to the next sync symbol in the source sentence. for(int i = 0; i < hypIndices.size(); ++i) { - SyncCoord pos = syncPoints_.empty() + SyncCoord pos = syncPoints_.empty() ? SyncCoord({-1, -1, (int)batchIndices[i % dimBatch]}) // no sync points yet, initialize with -1 position and current batch index : syncPoints_[hypIndices[i]]; // carry over the sync point from the previous state at first auto& [trgPos, srcPos, batchIdx] = pos; @@ -168,7 +168,7 @@ std::vector AlibiDecoderState::computeSyncPoints( } return nextSyncPoints; -} +} Ptr NewDecoderState(Ptr options, @@ -185,16 +185,22 @@ Ptr NewDecoderState(Ptr options, } } -Ptr convertDecoderState(Ptr state, - Ptr graph, +Ptr convertDecoderState(Ptr state, + Ptr graph, bool decoding) { Expr shift; auto alibiState = std::dynamic_pointer_cast(state); if(alibiState) shift = alibiState->getAlibiShift(graph, decoding); - size_t position = state->getPosition(); - auto nnState = New(position); + // @TODO: allow for 0 encoder states, i.e. a decoder-only model + ABORT_IF(state->getEncoderStates().size() != 1, "Only supports exactly one encoder state"); + + size_t position = state->getPosition(); + auto encoderContext = state->getEncoderStates()[0]->getContext(); + auto encoderMask = state->getEncoderStates()[0]->getMask(); + + auto nnState = New(position, encoderContext, encoderMask); for(auto& layerState : state->getStates()) { if(alibiState) { nnState->append(New(layerState.cell, shift, position)); @@ -208,97 +214,108 @@ Ptr convertDecoderState(Ptr state, #ifdef CUDA_FOUND namespace gpu { template - void Alibi(int numHeads, int start, marian::Tensor out, Tensors... tensors); + void Alibi(int numHeads, int start, bool addCausalMask, marian::Tensor out, Tensors... tensors); } #endif namespace cpu { template - void Alibi(int numHeads, int start, marian::Tensor out, Tensors... 
tensors) { + void Alibi(int numHeads, int start, bool addCausalMask, marian::Tensor out, Tensors... tensors) { ABORT("Not implemented"); } } template -void Alibi(int numHeads, int start, marian::Tensor out, Tensors... tensors) { +void Alibi(int numHeads, int start, bool addCausalMask, marian::Tensor out, Tensors... tensors) { #ifdef CUDA_FOUND if(out->getBackend()->getDeviceId().type == DeviceType::gpu) - gpu::Alibi(numHeads, start, out, tensors...); + gpu::Alibi(numHeads, start, addCausalMask, out, tensors...); else #endif - cpu::Alibi(numHeads, start, out, tensors...); + cpu::Alibi(numHeads, start, addCausalMask, out, tensors...); } #ifdef CUDA_FOUND namespace gpu { template - void AlibiGrad(int numHeads, int start, marian::Tensor slopesGrad, marian::Tensor biasesGrad, Tensors... tensors); + void AlibiGrad(int numHeads, int start, bool addCausalMask, marian::Tensor slopesGrad, marian::Tensor biasesGrad, Tensors... tensors); } #endif namespace cpu { template - void AlibiGrad(int numHeads, int start, marian::Tensor slopesGrad, marian::Tensor biasesGrad, Tensors... tensors) { + void AlibiGrad(int numHeads, int start, bool addCausalMask, marian::Tensor slopesGrad, marian::Tensor biasesGrad, Tensors... tensors) { ABORT("Not implemented"); } } template -void AlibiGrad(int numHeads, int start, marian::Tensor slopesGrad, marian::Tensor biasesGrad, Tensors... inputs) { +void AlibiGrad(int numHeads, int start, bool addCausalMask, marian::Tensor slopesGrad, marian::Tensor biasesGrad, Tensors... inputs) { #ifdef CUDA_FOUND if(slopesGrad->getBackend()->getDeviceId().type == DeviceType::gpu) - gpu::AlibiGrad(numHeads, start, slopesGrad, biasesGrad, inputs...); + gpu::AlibiGrad(numHeads, start, addCausalMask, slopesGrad, biasesGrad, inputs...); else #endif - cpu::AlibiGrad(numHeads, start, slopesGrad, biasesGrad, inputs...); + cpu::AlibiGrad(numHeads, start, addCausalMask, slopesGrad, biasesGrad, inputs...); } class AlibiLogMaskNode : public NaryNodeOp { private: int numHeads_{8}; int start_{0}; + bool addCausalMask_{false}; - Shape newShape(Expr mask, Expr query, int numHeads) { + Shape newShape(Expr mask, Expr query, int numHeads, bool addCausalMask) { int dimBeam = query->shape()[-4]; int dimBatch = query->shape()[-3]; int dimQuery = query->shape()[-2]; int dimKeys = mask->shape()[-2]; + ABORT_IF(addCausalMask && dimQuery != dimKeys, "Causal mask only works for square attention matrices"); + return { dimBeam, dimBatch * numHeads, dimQuery, dimKeys }; } public: - AlibiLogMaskNode(const std::vector& nodes, int numHeads, int start) - : NaryNodeOp(nodes, newShape(/*mask=*/nodes[0], /*query=*/nodes[1], numHeads), nodes[0]->value_type()), - numHeads_(numHeads), start_{start} + AlibiLogMaskNode(const std::vector& nodes, int numHeads, int start, bool addCausalMask) + : NaryNodeOp(nodes, newShape(/*mask=*/nodes[0], /*query=*/nodes[1], numHeads, addCausalMask), nodes[0]->value_type()), + numHeads_(numHeads), start_{start}, addCausalMask_{addCausalMask} {} void forward() override { Alibi( - numHeads_, + numHeads_, start_, - val_, + addCausalMask_, + val_, /*mask=*/ child(0)->val(), - /*slopes=*/child(2)->val(), - /*biases=*/child(3)->val(), + /*slopes=*/child(2)->val(), + /*biases=*/child(3)->val(), /*shift=*/ children().size() == 5 ? 
child(4)->val() : nullptr); } void backward() override { if(!trainable()) return; - + + if(!child(2)->trainable()) + return; + + if(!child(3)->trainable()) + return; + AlibiGrad( - numHeads_, + numHeads_, start_, + addCausalMask_, // gradients - /*d_f/d_slopes=*/child(2)->grad(), - /*d_f/d_biases=*/child(3)->grad(), + /*d_f/d_slopes=*/child(2)->grad(), + /*d_f/d_biases=*/child(3)->grad(), // inputs /*mask=*/ child(0)->val(), - /*slopes=*/ child(2)->val(), - /*biases=*/ child(3)->val(), + /*slopes=*/ child(2)->val(), + /*biases=*/ child(3)->val(), /*shift=*/ children().size() == 5 ? child(4)->val() : nullptr, // adjoint /*d_J/d_f=*/adj_); @@ -308,6 +325,7 @@ class AlibiLogMaskNode : public NaryNodeOp { size_t seed = NaryNodeOp::hash(); util::hash_combine(seed, numHeads_); util::hash_combine(seed, start_); + util::hash_combine(seed, addCausalMask_); return seed; } @@ -321,18 +339,20 @@ class AlibiLogMaskNode : public NaryNodeOp { return false; if(start_ != cnode->start_) return false; + if(addCausalMask_ != cnode->addCausalMask_) + return false; return true; } const std::string type() override { return "alibi-log-mask"; } }; -Expr alibiLogMask(Expr mask, Expr query, Expr slopes, Expr biases, Expr shift, int numHeads, int start) { +Expr alibiLogMask(Expr mask, Expr query, Expr slopes, Expr biases, Expr shift, int numHeads, int start, bool addCausalMask) { std::vector nodes = {mask, query, slopes, biases}; if(shift) nodes.push_back(shift); - return Expression(nodes, numHeads, start); + return Expression(nodes, numHeads, start, addCausalMask); } diff --git a/src/layers_new/alibi.cu b/src/layers_new/alibi.cu index 07042699b..be4b30dea 100644 --- a/src/layers_new/alibi.cu +++ b/src/layers_new/alibi.cu @@ -15,7 +15,8 @@ __global__ void gAlibi( functional::Array, 4> inputs, int numHeads, int start, - float maskFactor) { + float maskFactor, + bool addCausalMask) { constexpr size_t N = functional::Shape::size(); functional::Array oDims; @@ -42,9 +43,9 @@ __global__ void gAlibi( int keyPos = keyIdx; int queryPos = queryIdx + start; - + float relPos = (float)keyPos - (float)queryPos; - + if(shift.data() != nullptr) relPos -= (float)shift[{beamIdx, batchIdx, queryIdx, 0}]; @@ -53,7 +54,12 @@ __global__ void gAlibi( float alibi = slope * abs(relPos + bias); float binMask = (float)mask[{0, batchIdx, keyIdx, 0}]; - float logMask = (2.f * binMask - 1.f) * maskFactor; // range (-maskFactor, maskFactor) + float logMask = binMask == 0 ? -maskFactor : maskFactor; // range (-maskFactor, maskFactor) + + if(addCausalMask) { + float causalMask = keyPos > queryPos ? -maskFactor : maskFactor; // range (-maskFactor, maskFactor) + logMask = min(logMask, causalMask); // range (-maskFactor, maskFactor) if any mask is set to -maskFactor then the result is -maskFactor + } out[index] = (T)min(logMask, alibi); } @@ -61,25 +67,23 @@ __global__ void gAlibi( } template -void Alibi(int numHeads, int start, Tensor out, Tensors... tensors) { +void Alibi(int numHeads, int start, bool addCausalMask, Tensor out, Tensors... 
tensors) { cudaSetDevice(out->getDeviceId().no); int length = out->size(); int threads = std::min(MAX_THREADS, length); int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); - float largest = NumericLimits(out->type()).max; - float maskFactor = std::min(largest / 2.f, 99999999.f); // to make sure we do not overflow for fp16 + float maskFactor = std::numeric_limits::infinity(); constexpr size_t K = sizeof...(tensors); - if(out->type() == Type::float32) { functional::Array, K> inputs = {tensors...}; - gAlibi<<>>(out, inputs, numHeads, start, maskFactor); + gAlibi<<>>(out, inputs, numHeads, start, maskFactor, addCausalMask); #if COMPILE_FP16 } else if(out->type() == Type::float16) { functional::Array, K> inputs = {tensors...}; - gAlibi<<>>(out, inputs, numHeads, start, maskFactor); + gAlibi<<>>(out, inputs, numHeads, start, maskFactor, addCausalMask); #endif } else { ABORT("Alibi for type {} not implemented", out->type()); @@ -87,7 +91,7 @@ void Alibi(int numHeads, int start, Tensor out, Tensors... tensors) { } // template specialization for h/cpp separation -template void Alibi(int, int, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void Alibi(int, int, bool, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); template __global__ void gAlibiGrad( @@ -95,7 +99,8 @@ __global__ void gAlibiGrad( functional::Tensor biasesGrad, functional::Array, 5> inputs, int numHeads, - int start) { + int start, + bool addCausalMask) { const auto& mask = inputs[0]; const auto& slopes = inputs[1]; @@ -120,7 +125,7 @@ __global__ void gAlibiGrad( A5 dims5; const int HEAD_DIM = 2; - + // compute single element derivate for slopes and biases auto dJ_dxy = [&](int headIdx, int colIdx) -> thrust::tuple { // get the location for one head @@ -130,7 +135,7 @@ __global__ void gAlibiGrad( dims5[HEAD_DIM] = headIdx; // get the index into the full tensor int index = fullShape5.index(dims5); - // get the value of the full adjoint + // get the value of the full adjoint float vadj = (float)adj[index]; // handle the rest @@ -141,9 +146,9 @@ __global__ void gAlibiGrad( int keyPos = keyIdx; int queryPos = queryIdx + start; - + float relPos = (float)keyPos - (float)queryPos; - + if(shift.data() != nullptr) relPos -= (float)shift[{beamIdx, batchIdx, queryIdx, 0}]; @@ -152,7 +157,12 @@ __global__ void gAlibiGrad( float binMask = (float)mask[{0, batchIdx, keyIdx, 0}]; float signedAlibi = relPos + bias; - + + if(addCausalMask) { + float causalMask = keyPos > queryPos ? 0.f : 1.f; + binMask = binMask * causalMask; + } + // compute derivative of slope float dslope = binMask * abs(signedAlibi) * vadj; @@ -168,7 +178,7 @@ __global__ void gAlibiGrad( return { dslope, dbias }; }; - + for(int bid = 0; bid < numHeads; bid += gridDim.x) { int headIdx = bid + blockIdx.x; if(headIdx < numHeads) { @@ -215,7 +225,7 @@ __global__ void gAlibiGrad( } template -void TypedAlibiGrad(int numHeads, int start, Tensor slopesGrad, Tensor biasesGrad, Tensors... tensors) { +void TypedAlibiGrad(int numHeads, int start, bool addCausalMask, Tensor slopesGrad, Tensor biasesGrad, Tensors... 
tensors) { cudaSetDevice(slopesGrad->getDeviceId().no); constexpr size_t K = sizeof...(tensors); @@ -223,22 +233,22 @@ void TypedAlibiGrad(int numHeads, int start, Tensor slopesGrad, Tensor biasesGra const auto& adj = inputs[K - 1]; // last one is adjoint and full broadcast shape int total = adj.size(); - + // we will reduce over each head int blocks = std::min(MAX_BLOCKS, numHeads); int threads = std::min(MAX_THREADS, total / numHeads); int shared = sizeof(float) * threads * 2; // Use float32 as accumulation type, we accumulate slopes and biases - gAlibiGrad<<>>(slopesGrad, biasesGrad, inputs, numHeads, start); + gAlibiGrad<<>>(slopesGrad, biasesGrad, inputs, numHeads, start, addCausalMask); } template -void AlibiGrad(int numHeads, int start, Tensor slopesGrad, Tensor biasesGrad, Tensors... tensors) { +void AlibiGrad(int numHeads, int start, bool addCausalMask, Tensor slopesGrad, Tensor biasesGrad, Tensors... tensors) { if(slopesGrad->type() == Type::float32) { - TypedAlibiGrad(numHeads, start, slopesGrad, biasesGrad, tensors...); + TypedAlibiGrad(numHeads, start, addCausalMask, slopesGrad, biasesGrad, tensors...); #if COMPILE_FP16 } else if(slopesGrad->type() == Type::float16) { - TypedAlibiGrad(numHeads, start, slopesGrad, biasesGrad, tensors...); + TypedAlibiGrad(numHeads, start, addCausalMask, slopesGrad, biasesGrad, tensors...); #endif } else { ABORT("AlibiGrad for type {} not implemented", slopesGrad->type()); @@ -246,6 +256,6 @@ void AlibiGrad(int numHeads, int start, Tensor slopesGrad, Tensor biasesGrad, Te } // template specialization for h/cpp separation -template void AlibiGrad(int, int, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void AlibiGrad(int, int, bool, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); } } diff --git a/src/layers_new/alibi.h b/src/layers_new/alibi.h index bec2da55d..66c102235 100644 --- a/src/layers_new/alibi.h +++ b/src/layers_new/alibi.h @@ -7,6 +7,8 @@ namespace marian { +const int ALIBI_REFERENCE_HEADS = 8; // number of heads in the reference model + // @TODO: this whole set of functions is currently somewhat akward in general, since we need to implement // old style and new style decoder state for this to work. We decoder with the old decoder framework, but // use the new style transformer layers. This will eventually be cleaned up. 
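To make the mask arithmetic easier to follow, here is a scalar sketch of what the gAlibi kernel in the alibi.cu hunks above computes for a single output element, assuming the stripped template argument of std::numeric_limits is float; the function name and flattened parameter list are illustrative, not part of the patch.

#include <algorithm>
#include <cmath>
#include <limits>

// One element of the ALIBI log mask: a slope-scaled absolute distance between
// key and query positions (optionally corrected by a decode-time shift),
// clamped by the padding mask and, if requested, by a causal mask.
inline float alibiLogMaskElement(float slope, float bias, float shift,
                                 int keyIdx, int queryIdx, int start,
                                 float binMask, bool addCausalMask) {
  const float inf = std::numeric_limits<float>::infinity();
  int keyPos    = keyIdx;
  int queryPos  = queryIdx + start;                  // offset by decoding position
  float relPos  = (float)keyPos - (float)queryPos - shift;
  float alibi   = slope * std::abs(relPos + bias);   // slopes are negative
  float logMask = (binMask == 0.f) ? -inf : inf;     // padding: -inf masks out
  if(addCausalMask) {
    float causal = (keyPos > queryPos) ? -inf : inf; // future key positions masked
    logMask = std::min(logMask, causal);
  }
  return std::min(logMask, alibi);                   // keep ALIBI value where unmasked
}

Applied over the [dimBeam, dimBatch * numHeads, dimQuery, dimKeys] output shape, this corresponds to the out[index] assignment in the kernel.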
@@ -70,12 +72,12 @@ Ptr NewDecoderState(Ptr options, bool isBatchMajor = false); // convert an old-style decoder state to an (alibi) decoder state -Ptr convertDecoderState(Ptr state, - Ptr graph, +Ptr convertDecoderState(Ptr state, + Ptr graph, bool decoding=false); // efficient operator for ALIBI log mask with shift and optionally learnable parameters -Expr alibiLogMask(Expr mask, Expr query, Expr shift, Expr slopes, Expr biases, int numHeads, int start); +Expr alibiLogMask(Expr mask, Expr query, Expr shift, Expr slopes, Expr biases, int numHeads, int start, bool addCausalMask = false); namespace nn { @@ -92,14 +94,16 @@ class AlibiDecoderStateItem : public DecoderStateItem { } }; -// Experimental implementation of the ALIBI attention mechanism (via masking) (https://arxiv.org/abs/2108.12409) +/** + * Experimental implementation of the ALIBI attention mechanism (via masking) (https://arxiv.org/abs/2108.12409) + */ class AlibiAttentionMaskProcessor : public AttentionMaskProcessor { public: - bool trainable{false}; // if true don't use learnable parameters + bool trainable{false}; // if true don't use learnable parameters Expr slopes; // learnable per head ALIBI slopes Expr biases; // learnable per head additive biases - + using AttentionMaskProcessor::numHeads; AlibiAttentionMaskProcessor(Ptr graph, @@ -110,87 +114,11 @@ class AlibiAttentionMaskProcessor : public AttentionMaskProcessor { virtual ~AlibiAttentionMaskProcessor() = default; -private: -// @TODO: eventually to be removed. This computes ALIBI log masks with multiple operators, replaced with more efficient version below. -// For now we keep this for documentation and experimentation puprposes. -// The same functionality is implemented in `alibiLogMask` above via a special operator -#if 0 - const float ALIBI_REFERENCE_HEADS{8.f}; // number of reference heads that ALIBI slopes are computed for - - // Compute the alibi mask for a given query and keys - Expr alibiMask(Expr query, int dimQuery, int dimKeys, Ptr state) const { - int start = 0; - Expr shift = nullptr; - - int dimBatch = query->shape()[-3]; - int dimBeam = query->shape()[-4]; - - if(state) { - start = (int)state->getPosition(); - auto alibiState = std::dynamic_pointer_cast(state); - shift = alibiState ? alibiState->getShift() : nullptr; // [dimBeam, dimBatch, dimQuery, 1] - } - - // Create constant tensors of reflecting the query and key positions. - // When decoding, we start with the decoding state position for the query. The key positions are just the indices for the whole sequence. - Expr queryPositions = graph()->constant({1, 1, dimQuery, 1}, inits::range((float)start, (float)(start + dimQuery))); // [1, 1, dimQuery, 1] - Expr keyPositions = graph()->constant({1, 1, 1, dimKeys}, inits::range(0.f, (float)dimKeys)); // [1, 1, 1, dimKeys] - - // Create matrix of distances between positions, rows are distances of current query position vs all key positions. 
- // Layout is the same as the attention distance matrix where we compute rowwise softmaxes of similarities between - // each target word and all the source words - Expr alibiBiases = keyPositions - queryPositions; // [1, 1, dimQuery, dimKeys] - - // apply the corrective shift if any sync-points are present - if(shift) { - alibiBiases = alibiBiases - shift; // [dimBeam, dimBatch, dimQuery, dimKeys] - alibiBiases = reshape(alibiBiases, {dimBeam * dimBatch, 1, dimQuery, dimKeys}); // [dimBeam * dimBatch, 1, dimQuery, dimKeys] - } - - Expr alibi = slopes * abs(alibiBiases + biases); // [(dimBeam * dimBatch)|1, numHeads, dimQuery, dimKeys] - return alibi; - }; - - // Compute the log mask for a given query and combine with the alibi mask - Expr logMask(Expr query, Expr mask, Ptr state) const { - ABORT_IF(!mask, "mask is expected!!"); - - // query: [dimBeam, dimBatch, dimQuery, dimModel] -> dimQuery == dimTrgWords - int dimBatch = query->shape()[-3]; - int dimBeam = query->shape()[-4]; - - int dimQuery = query->shape()[-2]; - int dimKeys = mask->shape()[-2]; - - // all this is bascially a copy of the normal attention mask computation, however we need to do some extra reshaping - // to make the alibi mask and the log mask broadcastable and then combine them via minimum - - // Note, this is not a typical logMask with values 0 (don't mask) and -inf (mask). Rather we use +inf (or a large value) - // and -inf and then compbine with the ALIBI mask via minimum. This way, we keep the original ALIBI values where the mask has - // +inf and have -inf for masking. - // largest useful value and making sure we do not overflow for fp16 - float maskFactor = std::min(NumericLimits(mask->value_type()).max / 2.f, 99999999.f); - // convert binary 0/1 mask to -1/1 mask and then muliply with inf, results in -inf/+inf mask. - auto logMask = (2.f * mask - 1.f) * maskFactor; // [1, dimBatch, dimKeys, 1] - logMask = reshape(logMask, {dimBatch, 1, 1, dimKeys}); // [dimBatch, 1, 1, dimKeys] - - - // make logMask broadcastable when decoding with beam search - logMask = repeat(logMask, /*repeats=*/dimBeam, /*axis=*/-4); // [dimBeam|1 * dimBatch, 1, 1, dimKeys] - - // make logMask and alibiBias broadcastable, then combine - auto alibiBias = alibiMask(query, dimQuery, dimKeys, state); // [(dimBeam * dimBatch)|1, numHeads, dimQuery, dimKeys] - logMask = minimum(logMask, alibiBias); // [dimBeam|1 * dimBatch, numHeads, dimQuery, dimKeys] - - // final reshape to match attention operation - logMask = reshape(logMask, {dimBeam, dimBatch * numHeads, dimQuery, dimKeys}); // [dimBeam|1, dimBatch * numHeads, dimQuery, dimKeys] - return logMask; - } -#endif +private: // Initialized the head-wise scaling factors from ALIBI (they are constant in the original paper, // we are making them optionally learnable here) - Ptr initSlopes(bool decoder = false) const { + Ptr initSlopes() const { // This is the original implementation of ALIBI slopes for LMs. We find our slopes and biases work better for Seq2seq models // Keep for now until we find a use, e.g. in LMs #if 0 @@ -200,69 +128,137 @@ class AlibiAttentionMaskProcessor : public AttentionMaskProcessor { // if there are more or less heads we scale back to 8 heads and interpolate. 
float exponent = (float)(i + 1) * (ALIBI_REFERENCE_HEADS / (float)numHeads); - // We multiply slopes with 2 for the symmetric mask to keep total probability mass the + // We multiply slopes with 2 for the symmetric mask to keep total probability mass the // same as in the causal mask (we have two symmetric halves instead of just one causal half) mVec[i] = -2.f / std::pow(2.f, exponent); if(decoder) mVec[i] *= 0.5f; } - + return inits::fromVector(mVec); #else // Magic numbers, for now don't ask. - std::vector init; - if(decoder) { - return inits::fromValue(-0.1f); - } else { - init = { -2.00f, -1.00f, -0.50f, -0.25f, -0.05f, -0.05f, -0.05f, -0.05f }; - init.resize(numHeads, -0.05f); - return inits::fromVector(init); - } + std::vector init = { -2.00f, -1.00f, -0.50f, -0.25f, -0.05f, -0.05f, -0.05f, -0.05f }; + init.resize(numHeads, -0.05f); + return inits::fromVector(init); #endif } // Head-wise biases for ALIBI, this does not occur in the paper, ignore the magic numbers - Ptr initBiases(bool decoder=false) const { - if(decoder) { - return inits::fromValue(0.3f); - } else { - std::vector init({ 1.00f, -2.00f, 3.00f, -4.00f, 5.00f, -6.00f, 7.00f, -8.00f }); - init.resize(numHeads, 0.f); - return inits::fromVector(init); - } + Ptr initBiases() const { + std::vector init({ 1.00f, -2.00f, 3.00f, -4.00f, 5.00f, -6.00f, 7.00f, -8.00f }); + init.resize(numHeads, 0.f); + return inits::fromVector(init); } public: + // Apply the alibi mask to the given query and mask virtual Expr apply(Expr query, Expr mask) const override { - return apply(query, mask, /*state=*/nullptr); - } - - // Apply the alibi mask to the given query and mask for decoder cross-attention - virtual Expr apply(Expr query, Expr mask, Ptr state) const override { - bool decoder = state != nullptr; - if(!trainable) { - const_cast(slopes) = graph()->constant({numHeads, 1, 1}, initSlopes(decoder)); - const_cast(biases) = graph()->constant({numHeads, 1, 1}, initBiases(decoder)); + const_cast(slopes) = graph()->constant({numHeads, 1, 1}, initSlopes()); + const_cast(biases) = graph()->constant({numHeads, 1, 1}, initBiases()); } else { - registerParameterLazy(slopes, Shape({numHeads, 1, 1}), initSlopes(decoder)); - registerParameterLazy(biases, Shape({numHeads, 1, 1}), initBiases(decoder)); + registerParameterLazy(slopes, Shape({numHeads, 1, 1}), initSlopes()); + registerParameterLazy(biases, Shape({numHeads, 1, 1}), initBiases()); } Expr shift = nullptr; int start = 0; - - if(state) { - start = (int)state->getPosition(); - auto alibiState = std::dynamic_pointer_cast(state); - shift = alibiState ? 
alibiState->getShift() : nullptr; // [dimBeam, dimBatch, dimQuery, 1] - } auto alibiMask = alibiLogMask(mask, query, slopes, biases, shift, numHeads, start); return alibiMask; } }; +/** + * Experimental implementation of the ALIBI attention mechanism for decoder layers + */ +class AlibiDecoderAttentionMaskProcessor : public DecoderAttentionMaskProcessor { +public: + bool trainable{false}; // if true don't use learnable parameters + + Expr slopes; // learnable per head ALIBI slopes + Expr biases; // learnable per head additive biases + + using DecoderAttentionMaskProcessor::numHeads; + + AlibiDecoderAttentionMaskProcessor(Ptr graph, + Ptr options, + bool addCausalMask = false) + : DecoderAttentionMaskProcessor(graph, options, addCausalMask), + trainable(options->get("transformer-alibi-trainable", false)) {} + + virtual ~AlibiDecoderAttentionMaskProcessor() = default; + +private: + // Initialized the head-wise scaling factors from ALIBI (they are constant in the original paper, + // we are making them optionally learnable here) + Ptr initSlopes() const { + if(addCausalMask) { + std::vector mVec(numHeads); + for(size_t i = 0; i < numHeads; ++i) { + // slopes in the paper go from 1/2^1 to 1/2^8 where 8 is the reference number of heads; + // if there are more or less heads we scale back to 8 heads and interpolate. + float exponent = (float)(i + 1) * (ALIBI_REFERENCE_HEADS / (float)numHeads); + mVec[i] = -1.f / std::pow(2.f, exponent); + } + return inits::fromVector(mVec); + } else { + return inits::fromValue(-0.1f); // Magic numbers, for now don't ask. + } + } + + // Head-wise biases for ALIBI, this does not occur in the paper, ignore the magic numbers + Ptr initBiases() const { + if(addCausalMask) { + return inits::fromValue(0.0f); + } else { + return inits::fromValue(0.3f); + } + } + +public: + // Apply the alibi mask to the given query and mask for decoder cross-attention + virtual Expr apply(Expr query, Expr mask, Ptr state) const override { + auto processMask = [this, query, state](Expr mask) { + if(!trainable) { + const_cast(slopes) = graph()->constant({numHeads, 1, 1}, initSlopes()); + const_cast(biases) = graph()->constant({numHeads, 1, 1}, initBiases()); + } else { + registerParameterLazy(slopes, Shape({numHeads, 1, 1}), initSlopes()); + registerParameterLazy(biases, Shape({numHeads, 1, 1}), initBiases()); + } + + Expr shift = nullptr; + int start = 0; + + if(state) { + start = (int)state->getPosition(); + auto alibiState = std::dynamic_pointer_cast(state); + shift = alibiState ? 
alibiState->getShift() : nullptr; // [dimBeam, dimBatch, dimQuery, 1] + } + + // @TODO: make sure that we never want to have a causal mask here if start > 0 (this should indicate decoding) + return alibiLogMask(mask, query, slopes, biases, shift, numHeads, start, addCausalMask && start == 0); + }; + + if(mask) { + // recompute the mask if input mask changes (different memory address), otherwise return cached version + auto equal = [](Expr a, Expr b) { return a == b; }; + return cachedMask_->apply(mask, processMask, equal); + } else { + // @TODO: avoid this mask recreation for every layer + int dimBatch = query->shape()[-3]; + int dimKeys = (int)state->getPosition() + 1; + mask = graph()->constant({1, dimBatch, dimKeys, 1}, inits::ones()); + + // recompute the ALIBI mask if shape changes, but still has to create the above temporary mask first + auto equal = [](Expr a, Expr b) { return a->shape() == b->shape(); }; + return cachedMask_->apply(mask, processMask, equal); + } + } +}; + } // namespace nn } // namespace marian \ No newline at end of file diff --git a/src/layers_new/attention.cpp b/src/layers_new/attention.cpp index c3758296e..2ec081a30 100644 --- a/src/layers_new/attention.cpp +++ b/src/layers_new/attention.cpp @@ -3,10 +3,10 @@ #include "layers_new/alibi.h" namespace marian { -namespace nn { +namespace nn { // Factory function to create attention layers from options -Ptr attentionFromOptions(Ptr graph, Ptr options) { +Ptr attentionFromOptions(Ptr graph, Ptr options, bool enableCache) { // @TODO: currently this does nothing as it isn't set anywhere std::string selfAttentionType = options->get("transformer-encoder-attention", "default"); // currently only default @@ -17,7 +17,7 @@ Ptr attentionFromOptions(Ptr graph, Ptrget("transformer-dropout-attention", 0.f); - return New>(graph, numHeads, modelDim, modelDim, attentionDropoutProbability); + return New(graph, numHeads, modelDim, modelDim, attentionDropoutProbability, enableCache); } else { ABORT("Unknown transformer encoder attention type: {}", selfAttentionType); @@ -25,9 +25,9 @@ Ptr attentionFromOptions(Ptr graph, Ptr attentionMaskProcessorFromOptions(Ptr graph, Ptr options) { +Ptr maskProcessorFromOptions(Ptr graph, Ptr options) { // currently only default or alibi - std::string processorType = options->get("transformer-attention-mask", "default"); + std::string processorType = options->get("transformer-attention-mask", "default"); if(processorType == "default") { return New(graph, options); } else if(processorType == "alibi") { @@ -37,6 +37,33 @@ Ptr attentionMaskProcessorFromOptions(Ptr selfMaskProcessorFromOptions(Ptr graph, Ptr options) { + auto autoRegType = options->get("transformer-decoder-autoreg", "self-attention"); + if(autoRegType == "rnn") { + // creates a dummy processor that returns an unprocessed mask + return New(graph, options); + } else if(autoRegType == "self-attention") { + // here we will return modified log masks for self-attention + std::string processorType = options->get("transformer-attention-mask", "default"); + if(processorType == "alibi") { + return New(graph, options, /*addCausalMask=*/true); + } else { + return New(graph, options, /*addCausalMask=*/true); + } + } else { + ABORT("Unknown transformer decoder autoregressive type: {}", autoRegType); + } +} + +Ptr contextDecoderMaskProcessorFromOptions(Ptr graph, Ptr options) { + std::string processorType = options->get("transformer-attention-mask", "default"); + if(processorType == "alibi") { + return New(graph, options, /*addCausalMask=*/false); + } 
else { + return New(graph, options, /*addCausalMask=*/false); + } +} + } // namespace nn // specialized faster operator for log-mask computation @@ -49,28 +76,27 @@ class LogMaskNode : public UnaryNodeOp { // see the reshape below in the logMask function int dimBatch = mask->shape()[-4]; int dimKeys = mask->shape()[-1]; - return { dimBatch, numHeads, 1, dimKeys }; + return { dimBatch, numHeads, 1, dimKeys }; } public: LogMaskNode(Expr mask, int numHeads) - : UnaryNodeOp(mask, newShape(mask, numHeads)), numHeads_(numHeads) + : UnaryNodeOp(mask, newShape(mask, numHeads)), + numHeads_(numHeads) {} NodeOps forwardOps() override { - float lowest = NumericLimits(value_type()).lowest; - float maskFactor = std::max(lowest / 2.f, -99999999.f); // to make sure we do not overflow for fp16 - - using namespace functional; // compared to the multi-operation code this does conversion and broadcasting in one step - return { NodeOp(Element(_1 = (1.f - _2) * maskFactor, val_, child(0)->val())) }; + using namespace functional; + return { NodeOp(Element(_1 = log(_2), val_, child(0)->val())) }; } NodeOps backwardOps() override { - float lowest = NumericLimits(value_type()).lowest; - float maskFactor = std::max(lowest / 2.f, -99999999.f); // to make sure we do not overflow for fp16 + if(!trainable()) + return { }; + using namespace functional; - return { NodeOp(Add(-maskFactor * _1, child(0)->grad(), adj_)) }; + return { NodeOp(Add(_1 / _2, child(0)->grad(), adj_, child(0)->val())) }; } virtual size_t hash() override { @@ -93,13 +119,27 @@ class LogMaskNode : public UnaryNodeOp { const std::string type() override { return "log-mask"; } }; -Expr logMask(Expr mask, int numHeads) { +Expr logMask(Expr mask, int numHeads, bool addCausalMask) { // incoming mask has shape [1, dimBatch, dimKeys, 1] int dimBatch = mask->shape()[-3]; int dimKeys = mask->shape()[-2]; mask = reshape(mask, {dimBatch, 1, 1, dimKeys}); auto logMask = Expression(mask, numHeads); // [dimBatch, numHeads, 1, dimKeys] - return reshape(logMask, {1, dimBatch * numHeads, 1, dimKeys}); + logMask = reshape(logMask, {1, dimBatch * numHeads, 1, dimKeys}); + + // @TODO: this is needlessly slow, integrate with the above in special kernel + if(addCausalMask) { + // add causal mask to logMask + std::vector vMask(dimKeys * dimKeys, 0.f); + for(int i = 0; i < dimKeys; ++i) + for(int j = i + 1; j < dimKeys; ++j) + vMask[i * dimKeys + j] = -std::numeric_limits::infinity(); + + auto triangle = mask->graph()->constant({1, 1, dimKeys, dimKeys}, inits::fromVector(vMask)); + logMask = minimum(logMask, triangle); // [1, dimBatch * numHeads, dimKeys, dimKeys] + } + + return logMask; } } // namespace marian diff --git a/src/layers_new/attention.h b/src/layers_new/attention.h index 6ddfaad2a..9bd31baa0 100644 --- a/src/layers_new/attention.h +++ b/src/layers_new/attention.h @@ -6,31 +6,88 @@ namespace marian { -// specialized operator for faster logMask computation -Expr logMask(Expr mask, int numHeads); +/** + * Specialized operator for faster logMask computation + */ +Expr logMask(Expr mask, int numHeads, bool addCausalMask); namespace nn { -// Abstract base class for attention mechanisms -class AttentionLayer : public Layer, +/** + * Abstract base class for attention mechanisms + */ +class AttentionLayer : public Layer, public IQuaternaryLayer { protected: using Layer::namedLayers_; - + public: AttentionLayer(Ptr graph) : Layer(graph) {} virtual ~AttentionLayer() = default; }; -class MultiplicativeAttention : public AttentionLayer { +/** + * Base class for attention 
layers that collect attention weights + */ +class AttentionCollector { +private: + mutable std::vector alignments_; // @TODO: rename to something more accurate + +public: + bool saveAttentionWeights{false}; + int numHeads{8}; + + AttentionCollector(bool saveAttentionWeights, int numHeads = 8) + : saveAttentionWeights(saveAttentionWeights), numHeads(numHeads) {} + + void collectOneHead(Expr weights) const { + // weights: [dimBeam, dimBatch * numHeads, dimQuery|1, dimKeys] + + int dimBeam = weights->shape()[-4]; + int dimBatchHeads = weights->shape()[-3]; + int dimQuery = weights->shape()[-2]; // (max) length of trg sequence, or 1 in decoding + int dimKeys = weights->shape()[-1]; // (max) length of src sequence + + int dimBatch = dimBatchHeads / numHeads; + + weights = reshape(weights, {dimBeam * dimBatch, numHeads, dimQuery, dimKeys}); + auto head0 = slice(weights, -3, 0); // [dimBeam * dimBatch, 1, dimQuery, dimKeys] + + // reshape and transpose to match the format guided_alignment expects + head0 = reshape(head0, {dimBeam, dimBatch, dimQuery, dimKeys}); + head0 = transpose(head0, {0, 3, 1, 2}); // [beam depth, dimKeys, dimBatch, dimQuery|1] + + // save only last alignment set. For training this will be all alignments, + // for translation only the last one. Also split alignments by target words. + // @TODO: make splitting obsolete + // @TODO: why is this even here? + alignments_.clear(); + for(int i = 0; i < dimQuery; ++i) { // loop over all trg positions. In decoding, there is only one. + alignments_.push_back(slice(head0, -1, i)); // [tgt index][beam depth, max src length, batch size, 1] P(src pos|trg pos, beam index, batch index) + } + } + + const std::vector& getAlignments() const { + return alignments_; + } + + void clear() { + alignments_.clear(); + } +}; + +/** + * Base class for multiplicative attention layers (can collect attention weights) + */ +class MultiplicativeAttention : public AttentionLayer, public AttentionCollector { protected: using AttentionLayer::namedLayers_; public: Ptr attentionDropout; - MultiplicativeAttention(Ptr graph, float dropoutProbability) - : AttentionLayer(graph) { + MultiplicativeAttention(Ptr graph, float dropoutProbability, bool saveAttentionWeights = false) + : AttentionLayer(graph), AttentionCollector(saveAttentionWeights) { attentionDropout = New(graph, dropoutProbability); registerLayer(attentionDropout); } @@ -45,7 +102,7 @@ class MultiplicativeAttention : public AttentionLayer { // multiplicative attention with flattened softmax float scale = 1.0f / std::sqrt((float)dimKeys); // scaling to avoid extreme values due to matrix multiplication - + // query, keys and values: [dimBeam, dimBatch * numHeads, (dimQuery|dimKeys=dimValues), dimHead] auto z = bdot(query, keys, false, true, scale); // [dimBeam, dimBatch * numHeads, dimQuery, dimKeys] @@ -55,11 +112,10 @@ class MultiplicativeAttention : public AttentionLayer { // take softmax along src sequence axis (-1) auto weights = softmax(z); // [dimBeam, dimBatch * numHeads, dimQuery, dimKeys] - -#if 0 // @TODO: make this work again - if(saveAttentionWeights) - collectOneHead(weights, dimBeam); -#endif + + if(saveAttentionWeights) { + collectOneHead(weights); + } // optional dropout for attention weights weights = attentionDropout->apply(weights); @@ -70,15 +126,25 @@ class MultiplicativeAttention : public AttentionLayer { auto output = bdot(weights, values); // [dimBeam, dimBatch * numHeads, dimQuery, dimHead] return output; } + + virtual void clear() override { + AttentionLayer::clear(); + 
AttentionCollector::clear(); + } }; -// Base class for multi-head attention -template // Currently only used for MultiplicativeAttention -class MultiHeadAttention : public AttentionType { +/** + * Extended multiplicative attention layer with multiple heads + * and separate query, key and value projections, as well as + * an output projection. + */ +class MultiHeadAttention : public MultiplicativeAttention { protected: - using AttentionType::namedLayers_; + using MultiplicativeAttention::namedLayers_; + using AttentionCollector::saveAttentionWeights; private: + bool enableCache_{false}; IPtr cachedKh_; // cached result of key projection IPtr cachedVh_; // cached result of value projection @@ -93,15 +159,17 @@ class MultiHeadAttention : public AttentionType { int modelDim; MultiHeadAttention(Ptr graph, - int numHeads, - int attDim, + int numHeads, + int attDim, int modelDim, - float dropoutProbability) - : AttentionType(graph, dropoutProbability), - cachedKh_(new CachedExpr()), + float dropoutProbability, + bool enableCache = false) + : MultiplicativeAttention(graph, dropoutProbability), + enableCache_(enableCache), + cachedKh_(new CachedExpr()), cachedVh_(new CachedExpr()), - numHeads(numHeads), - attDim(attDim), + numHeads(numHeads), + attDim(attDim), modelDim(modelDim) { qProj = New(graph, attDim); registerLayer(qProj); @@ -117,7 +185,7 @@ class MultiHeadAttention : public AttentionType { virtual ~MultiHeadAttention() = default; protected: - // join beam and batch dimension and split model dimension in to heads and head dimension. We also need to transpose to + // join beam and batch dimension and split model dimension in to heads and head dimension. We also need to transpose to // be able to do an efficient batched matmul. Expr splitHeads(Expr input) const { int dimSteps = input->shape()[-2]; @@ -149,22 +217,27 @@ class MultiHeadAttention : public AttentionType { public: // Apply the multi-head attention to the given query, keys and values virtual Expr apply(Expr query, Expr keys, Expr values, Expr mask) const override { + // @TODO: implement custom bdot to avoid splitHeads/joinHeads + // @TODO: explore FlashAttention-like cpu implementation auto qh = splitHeads(qProj->apply(query)); - // @TODO: in original implementation we use shape()->elements(), dunno why - auto equal = [](Expr a, Expr b) { return a->shape() == b->shape(); }; - - // these two get conditionally recomputed if their size changes according to criterion above - auto kh = cachedKh_->apply(keys, [this](Expr keys) { - return splitHeads(kProj->apply(keys)); - }, equal); - - auto vh = cachedVh_->apply(values, [this](Expr values) { - return splitHeads(vProj->apply(values)); - }, equal); - - auto output = AttentionType::apply(qh, kh, vh, mask); - + // if enabledCache_ is true, we cache the results of the key and value projections + // otherwise equal is always false and the key and value projections are recomputed + Expr kh, vh; + if(enableCache_) { + // @TODO: in original implementation we use shape()->elements(), dunno why + auto equal = [](Expr a, Expr b) { return a->shape() == b->shape(); }; + // these two get conditionally recomputed if their size changes according to criterion above + kh = cachedKh_->apply(keys, [this](Expr keys) { return splitHeads(kProj->apply(keys)); }, equal); + vh = cachedVh_->apply(values, [this](Expr values) { return splitHeads(vProj->apply(values)); }, equal); + } else { + kh = splitHeads(kProj->apply(keys)); + vh = splitHeads(vProj->apply(values)); + } + + auto output = 
MultiplicativeAttention::apply(qh, kh, vh, mask); + + // @TODO: combine joinHeads and apply in one matrix multiplication via striding output = joinHeads(output); output = oProj->apply(output); @@ -178,51 +251,133 @@ class MultiHeadAttention : public AttentionType { } }; -// Base class for attention mask processors -// Attention mask processors are used to process a given attention mask before it is used in an attention computation. -struct AttentionMaskProcessor : public LayerWithOptions, public IBinaryLayer, public IBinaryDecoderLayer { +/** + * Base class for mask processors. + */ +struct MaskProcessor : public LayerWithOptions, public IBinaryLayer { + IPtr cachedMask_; + + MaskProcessor(Ptr graph, + Ptr options) + : LayerWithOptions(graph, options), + cachedMask_(new CachedExpr()) {} + + virtual ~MaskProcessor() = default; + + void clear() override { + LayerWithOptions::clear(); + cachedMask_->clear(); + } +}; + +/** + * Base class for decoder mask processors. + */ +struct DecoderMaskProcessor : public LayerWithOptions, public IBinaryDecoderLayer { + bool addCausalMask{false}; + IPtr cachedMask_; + + DecoderMaskProcessor(Ptr graph, + Ptr options, + bool addCausalMask = false) + : LayerWithOptions(graph, options), + addCausalMask(addCausalMask), + cachedMask_(new CachedExpr()) {} + + virtual ~DecoderMaskProcessor() = default; + + void clear() override { + LayerWithOptions::clear(); + cachedMask_->clear(); + } +}; + +/** + * Attention mask processors are used to process a given attention mask + * before it is used in an attention computation. + */ +struct AttentionMaskProcessor : public MaskProcessor { int numHeads{1}; AttentionMaskProcessor(Ptr graph, Ptr options) - : LayerWithOptions(graph, options), + : MaskProcessor(graph, options), numHeads(opt("transformer-heads", 1)) {} virtual ~AttentionMaskProcessor() = default; - + virtual Expr apply(Expr /*query*/, Expr mask) const override { if(!mask) return nullptr; - // @TODO eventually remove this branch. For now we keep it for documentation purposes -#if 0 - // LayerAttention expects mask in a different layout - int dimBatch = mask->shape()[-3]; - int dimKeys = mask->shape()[-2]; - - mask = reshape(mask, {dimBatch, 1, 1, dimKeys}); // [batch size, num heads broadcast=1, max length broadcast=1, max length] - - float maskFactor = std::max(NumericLimits(mask->value_type()).lowest / 2.f, -99999999.f); // to make sure we do not overflow for fp16 - auto logMask = (1 - mask) * maskFactor; - logMask = reshape(repeat(logMask, numHeads, -3), {1, dimBatch * numHeads, 1, dimKeys}); - return logMask; -#else // shape of mask should be [1, dimBatch, dimKeys, 1] - // this does all the above work in one step - return marian::logMask(mask, numHeads); // [1, dimBatch * numHeads, 1, dimKeys] -#endif + return marian::logMask(mask, numHeads, /*addCausalMask=*/false); // [1, dimBatch * numHeads, 1, dimKeys] } +}; + +/** + * Base class for decoder attention mask processors. Attention mask processors are used to + * process a given attention mask before it is used in an attention computation. + * Decoder attention mask processors can take advantage of information from the decoder state. 
+ */ +struct DecoderAttentionMaskProcessor : public DecoderMaskProcessor { + int numHeads{1}; + + DecoderAttentionMaskProcessor(Ptr graph, + Ptr options, + bool addCausalMask = false) + : DecoderMaskProcessor(graph, options, addCausalMask), + numHeads(opt("transformer-heads", 1)) {} + + virtual ~DecoderAttentionMaskProcessor() = default; + + virtual void initState(Ptr /*state*/) const override {} virtual Expr apply(Expr query, Expr mask, Ptr /*state*/) const override { - return apply(query, mask); + if(!mask) + return nullptr; + + // shape of input `mask` should be [1, dimBatch, dimKeys, 1] + // output shape will be // [1, dimBatch * numHeads, 1, dimKeys] if addCausalMask is false + // or [1, dimBatch * numHeads, dimKeys, dimKeys] if addCausalMask is true + auto processMask = [this](Expr mask) { return marian::logMask(mask, numHeads, addCausalMask); }; + + // recompute the mask if input mask changes (different memory address), otherwise return cached version + auto equal = [](Expr a, Expr b) { return a == b; }; + + // recompute the mask if the shape changes, otherwise return cached version + return cachedMask_->apply(mask, processMask, equal); } }; -// Factory function to create attention layers from options -Ptr attentionFromOptions(Ptr graph, Ptr options); +/** + * Dummy decoder mask processor that returns the unprocessed mask, used for RNN autoregressive decoding + */ +struct DummyDecoderMaskProcessor : public DecoderMaskProcessor { + DummyDecoderMaskProcessor(Ptr graph, + Ptr options) + : DecoderMaskProcessor(graph, options, /*addCausalMask=*/false) {} + + virtual ~DummyDecoderMaskProcessor() = default; + + virtual void initState(Ptr /*state*/) const override {} + + virtual Expr apply(Expr /*query*/, Expr mask, Ptr /*state*/) const override { + return mask; + } +}; -// Factory function to create attention mask processors from options -Ptr attentionMaskProcessorFromOptions(Ptr graph, Ptr options); +/** + * Factory function to create attention layers from options + */ +Ptr attentionFromOptions(Ptr graph, Ptr options, bool enableCache = false); + +/** + * Factory function to create mask processors from options + */ +Ptr maskProcessorFromOptions(Ptr graph, Ptr options); +Ptr selfMaskProcessorFromOptions(Ptr graph, Ptr options); +Ptr contextDecoderMaskProcessorFromOptions(Ptr graph, Ptr options); } // namespace nn } // namespace marian diff --git a/src/layers_new/decoder.h b/src/layers_new/decoder.h index 406017d64..9ead145f9 100644 --- a/src/layers_new/decoder.h +++ b/src/layers_new/decoder.h @@ -11,7 +11,7 @@ namespace marian { namespace nn { // Interface: decoder state -struct DecoderState : public IClassName, public std::enable_shared_from_this { +class DecoderState : public IClassName, public std::enable_shared_from_this { protected: size_t position{0}; @@ -27,6 +27,10 @@ struct DecoderState : public IClassName, public std::enable_shared_from_thisposition = pos; + } + // Dynamic cast to requested layer type. 
Will return nullptr if not possible template Ptr as() { @@ -43,7 +47,7 @@ struct DecoderState : public IClassName, public std::enable_shared_from_this Ptr cast() { auto stateCast = as(); - ABORT_IF(!stateCast, "State {} cannot be cast to requested type {}", + ABORT_IF(!stateCast, "State {} cannot be cast to requested type {}", className(), utils::cxxTypeName()); return stateCast; @@ -57,10 +61,11 @@ struct DecoderState : public IClassName, public std::enable_shared_from_thissetPosition(pos); + } + void append(Ptr item) { ABORT_IF(position != item->getPosition(), "DecoderStateList.position ({}) != DecoderStateItem.position ({}) ?", position, item->getPosition()); items_.push_back(item); } - /** + /** * Retrieve DecoderStateItem at index i */ Ptr at(size_t i) const { @@ -106,29 +117,52 @@ class DecoderStateList : public DecoderState { size_t size() const { return items_.size(); } }; +class EncoderContext { +private: + Expr context_; + Expr contextMask_; + +public: + EncoderContext(Expr context, Expr contextMask) + : context_(context), contextMask_(contextMask) {} + + virtual Expr getContext() const { return context_; } + virtual Expr getContextMask() const { return contextMask_; } +}; + +class DecoderSeq2SeqState : public DecoderStateList, public EncoderContext { +public: + DecoderSeq2SeqState(size_t position, Expr context, Expr contextMask) + : DecoderStateList(position), EncoderContext(context, contextMask) {} +}; // Interface: Unary function struct IUnaryDecoderLayer { + virtual void initState(Ptr /*state*/) const = 0; virtual Expr apply(Expr /*input*/, Ptr /*state*/) const = 0; }; // Interface: Binary function struct IBinaryDecoderLayer { + virtual void initState(Ptr /*state*/) const = 0; virtual Expr apply(Expr, Expr, Ptr /*state*/) const = 0; }; // Interface: Ternary function struct ITernaryDecoderLayer { + virtual void initState(Ptr /*state*/) const = 0; virtual Expr apply(Expr, Expr, Expr, Ptr /*state*/) const = 0; }; // Interface: 4ary function struct IQuaternaryDecoderLayer { + virtual void initState(Ptr /*state*/) const = 0; virtual Expr apply(Expr, Expr, Expr, Expr, Ptr /*state*/) const = 0; }; // Interface: N-Ary function struct INaryLayerDecoderLayer { + virtual void initState(Ptr /*state*/) const = 0; virtual Expr apply(const std::vector& /*inputs*/, Ptr /*state*/) const = 0; }; diff --git a/src/layers_new/interface.h b/src/layers_new/interface.h index a938803ee..590348548 100644 --- a/src/layers_new/interface.h +++ b/src/layers_new/interface.h @@ -48,7 +48,6 @@ struct IClearable { virtual void clear() = 0; }; - // Helper macro to turn parameter C++ variable name into a string. #define registerParameter(paramArg, shape, init) \ do { \ @@ -58,7 +57,7 @@ do { \ } while(0); // Helper macro to turn parameter C++ variable name into a string. -// This version is meant to be used in apply(...) functions for lazy parameter inits +// This version is meant to be used in apply(...) functions for lazy parameter inits // hence has to cast away constness. #define registerParameterLazy(paramArg, shape, init) \ do { \ @@ -80,8 +79,8 @@ do { \ } \ } while(0); -// Helper macro that adds the layer as a named sublayer to the parent layer and uses the given name. Different from above as -// the C++ variable name itself is not used a name string. +// Helper macro that adds the layer as a named sublayer to the parent layer and uses the given name. Different from above as +// the C++ variable name itself is not used a name string. 
#define registerLayerWithName(layerArg, name) \ do { \ ABORT_IF(!layerArg, "Layer {} of type {} with name {} is not initialized", #layerArg, utils::cxxTypeName(layerArg), name); \ @@ -107,8 +106,8 @@ class Layer : public IClassName, public IClearable, public std::enable_shared_fr private: Weak graph_; - // Using naked pointer as a weak reference. Cannot use shared_ptr or weak_ptr - // as registration happens in constructor of parent layer and shared_from_this() + // Using naked pointer as a weak reference. Cannot use shared_ptr or weak_ptr + // as registration happens in constructor of parent layer and shared_from_this() // cannot be used before parent layer constructor exits. Layer* firstParent_{nullptr}; std::string name_; @@ -135,13 +134,13 @@ class Layer : public IClassName, public IClearable, public std::enable_shared_fr virtual ~Layer() = default; - Ptr graph() { + Ptr graph() { auto graph = graph_.lock(); ABORT_IF(!graph, "graph in layer {} expired?", path()); return graph; } - const Ptr graph() const { + const Ptr graph() const { auto graph = graph_.lock(); ABORT_IF(!graph, "graph in layer {} expired?", path()); return graph; @@ -172,7 +171,7 @@ class Layer : public IClassName, public IClearable, public std::enable_shared_fr template Ptr cast() { auto layerCast = as(); - ABORT_IF(!layerCast, "Layer {} cannot be cast to requested type {}", + ABORT_IF(!layerCast, "Layer {} cannot be cast to requested type {}", className(), utils::cxxTypeName()); return layerCast; @@ -182,7 +181,7 @@ class Layer : public IClassName, public IClearable, public std::enable_shared_fr Ptr cast() const { return const_cast(this)->cast(); } - + // Return all named parameters for this specific layer (not descending into sub-layers) std::vector& namedParameters() { return namedParameters_; } const std::vector& namedParameters() const { return namedParameters_; } @@ -192,7 +191,7 @@ class Layer : public IClassName, public IClearable, public std::enable_shared_fr const std::vector>& namedLayers() const { return namedLayers_; } // Return all named sub-layers for this layer and its sub-layers (descending recursively into sub-layers). - // Can be used with layer type e.g. allNamedLayers() to return only sub-layers of this type. + // Can be used with layer type e.g. allNamedLayers() to return only sub-layers of this type. // Returned layers will then have the given type and do not need to be cast anymore. template std::vector> allNamedLayers() { @@ -201,7 +200,7 @@ class Layer : public IClassName, public IClearable, public std::enable_shared_fr auto castLayer = namedLayer.second->as(); if(castLayer) layers.emplace_back(namedLayer.first, castLayer); - + auto subLayers = namedLayer.second->allNamedLayers(); layers.insert(layers.end(), subLayers.begin(), subLayers.end()); } @@ -213,8 +212,8 @@ class Layer : public IClassName, public IClearable, public std::enable_shared_fr return const_cast(this)->allNamedLayers(); } - // Returns all sub-layers (only the layers, not the names) for this layer and its sub-layers (descending - // recursively into sub-layers). Can be used with layer type e.g. allLayers() to return only + // Returns all sub-layers (only the layers, not the names) for this layer and its sub-layers (descending + // recursively into sub-layers). Can be used with layer type e.g. allLayers() to return only // sub-layers of this type. Returned layers will then have the given type and do not need to be cast anymore. 
template std::vector> allLayers() { @@ -230,18 +229,18 @@ class Layer : public IClassName, public IClearable, public std::enable_shared_fr } // Used by parent layers to set the name of a sub-layer. - // @TODO: make this private and only allow friend access from layers before merging with master. - // Currently misused for top layer that has no parent layer that can set its name. + // @TODO: make this private and only allow friend access from layers before merging with master. + // Currently misused for top layer that has no parent layer that can set its name. void setName(const std::string& name) { name_ = name; } const std::string& name() const { return name_; } // This sets the first parent of a sublayer (the layer a sublayer was first registered with). - // This is required to generate the correct path/name for layer parameters at saving time. - void setFirstParent(Layer* parent) { + // This is required to generate the correct path/name for layer parameters at saving time. + void setFirstParent(Layer* parent) { ABORT_IF(firstParent_ != nullptr, "Parent layer has already been set"); ABORT_IF(parent == this, "Parent layer has to be different from child"); - firstParent_ = parent; + firstParent_ = parent; } // The parent layer of a sublayer is the first layer the sublayer has been registered with. @@ -275,9 +274,9 @@ class Layer : public IClassName, public IClearable, public std::enable_shared_fr return ss.str(); } - // Return Mode::eval or Mode::train. This is used to determine if training only layer-internal actions + // Return Mode::eval or Mode::train. This is used to determine if training only layer-internal actions // like dropout should be run. This will not affect graph-internal gradient propagation unless somehow - // specified in a layer. + // specified in a layer. Mode getMode() const { #if 1 if(graph()->isInference()) { @@ -355,10 +354,10 @@ class LayerWithOptions : public Layer { /** * Wrapper to be used exclusively inside LayerList or other similar containers. This is allows to use the apply(...) functions * of a layer without having to cast to specific type (this is done internally based on the number of arguments). Inspired by - * boost::any_type which allows to construct containers that hold various types. + * boost::any_type which allows to construct containers that hold various types. * This should allow to use any layer and iterfaces will be added here as required. */ -class AnyLayer final : public IUnaryLayer, +class AnyLayer final : public IUnaryLayer, public IBinaryLayer, public ITernaryLayer, public IQuaternaryLayer, @@ -371,7 +370,7 @@ class AnyLayer final : public IUnaryLayer, // private/protected constructor, should only be created within listed classes with friendship AnyLayer(const Ptr& layer) : layer_(layer) {} - + friend class LayerList; public: @@ -385,7 +384,7 @@ class AnyLayer final : public IUnaryLayer, template Ptr cast() const { auto layerCast = as(); - ABORT_IF(!layerCast, "Layer {} cannot be cast to requested type {}", + ABORT_IF(!layerCast, "Layer {} cannot be cast to requested type {}", layer_->className(), utils::cxxTypeName()); return layerCast; @@ -416,12 +415,12 @@ class AnyLayer final : public IUnaryLayer, } }; -/** +/** * Holds sublayers in a list and performs correct registration of sublayers. Sublayers are indexed * and can be accessed like array elements, including iteration. - * `LayerList` -- in contrast to `Sequential` -- does not provide `apply` functions. 
+ * `LayerList` -- in contrast to `Sequential` -- does not provide `apply` functions. * You have to define the execution order and information flow in code. - * + * * See TransformerEncoder for an example where we hold the transformer layer stack in a LayerList, * but define a custom apply function (due to masks being external information and shared between layers). */ @@ -433,7 +432,7 @@ class LayerList : public Layer { void recursiveAppend(Last last) { append(last); } - + template void recursiveAppend(First first, Rest ...rest) { append(first); @@ -452,8 +451,8 @@ class LayerList : public Layer { virtual ~LayerList() = default; - /** - * This inserts an already existing sublayer from this or a different container which will result in + /** + * This inserts an already existing sublayer from this or a different container which will result in * parameter sharing if there are parameters. ``` auto layers = New(graph); @@ -471,7 +470,7 @@ class LayerList : public Layer { layers_.emplace_back(new AnyLayer(layer)); // not using New<...> because of missing friendship } - /** + /** * Retrieve sublayer at index i */ Ptr at(size_t i) const { @@ -494,19 +493,19 @@ class LayerList : public Layer { } }; -/** +/** * `Sequential` is a list of layers similar to `LayerList`, but does provide a set of `apply` functions. * These function assume that the first element in the container can be a unary, binary, ternary * or n-ary layer, but all subsequent layers have to be unary layers as they will consume the single - * output of their preceding layer. Non-unary layers will fail to execute during runtime if they are + * output of their preceding layer. Non-unary layers will fail to execute during runtime if they are * not the very first layer. - * + * * `Sequential` can be used to implement typical feed forward networks: - * + * ``` using namespace marian::nn; - auto seq = New(graph, + auto seq = New(graph, New(graph, 100), New(graph), New(graph, 0.1f), @@ -519,7 +518,7 @@ class LayerList : public Layer { ``` * For other application patterns use `LayerList` and implement them yourself by traversing the layers. 
*/ -class Sequential : public LayerList, +class Sequential : public LayerList, public IUnaryLayer, public IBinaryLayer, public ITernaryLayer, @@ -567,7 +566,7 @@ class Sequential : public LayerList, for(int i = 1; i < layers_.size(); ++i) output = layers_[i]->apply(output); return output; - } + } }; diff --git a/src/layers_new/neuralnet.h b/src/layers_new/neuralnet.h index c0912634f..04552501e 100644 --- a/src/layers_new/neuralnet.h +++ b/src/layers_new/neuralnet.h @@ -113,18 +113,10 @@ struct Linear : public Layer, public IUnaryLayer { registerParameterLazy(bias, Shape({ dimOut }), inits::zeros()); } - Type outputType = x->value_type(); if(useBias) - return marian::affine(x, - marian::cast(weight, outputType), - marian::cast(bias, outputType), - /*transA=*/false, - /*transB=*/transposed); + return marian::affine(x, weight, bias, /*transA=*/false, /*transB=*/transposed); else - return marian::dot(x, - marian::cast(weight, outputType), - /*transA=*/false, - /*transB=*/transposed); + return marian::dot(x, weight, /*transA=*/false, /*transB=*/transposed); } }; diff --git a/src/layers_new/rnn.h b/src/layers_new/rnn.h index 720fa50f7..9a9cd067f 100644 --- a/src/layers_new/rnn.h +++ b/src/layers_new/rnn.h @@ -12,6 +12,7 @@ struct CellState { }; struct ICell { + virtual void initState(Ptr state) const = 0; virtual std::vector applyToInput(Expr input) const = 0; virtual Expr applyToState(const std::vector& inputs, Expr mask, Ptr state) const = 0; }; @@ -36,12 +37,17 @@ class SSRU final : public Layer, public ICell { registerLayer(dropout); } + virtual void initState(Ptr state) const override { + state->recurrent = graph()->constant({1, 1, 1, dimState}, inits::zeros()); + state->position = 0; + } + std::vector applyToInput(Expr input) const override { int dimModel = input->shape()[-1]; ABORT_IF(dimModel != dimState, "Model dimension {} has to match state dimension {}", dimModel, dimState); input = dropout->apply(input); - + Expr output = iProj->apply(input); Expr forget = fProj->apply(input); @@ -73,7 +79,7 @@ class RNN final : public Layer, public IBinaryLayer, public IBinaryDecoderLayer Ptr cell; Ptr oProj; - RNN(Ptr graph, int dimState, bool outputProjection = false) + RNN(Ptr graph, int dimState, bool outputProjection = false) : Layer(graph) { cell = New(graph, dimState); registerLayer(cell); @@ -84,6 +90,14 @@ class RNN final : public Layer, public IBinaryLayer, public IBinaryDecoderLayer } } + virtual void initState(Ptr state) const override { + ABORT("Remove this abort once this is actually used in the decoder"); + auto cellState = New(); + cell->initState(/*in/out=*/cellState); + state->as()->set(cellState->recurrent); + state->setPosition(cellState->position); + } + virtual Expr apply(Expr input, Expr inputMask = nullptr) const override { auto state = New(graph()->constant({1, 1, 1, cell->dimState}, inits::zeros()), /*position=*/0); return apply(input, inputMask, state); @@ -93,13 +107,16 @@ class RNN final : public Layer, public IBinaryLayer, public IBinaryDecoderLayer auto cellState = New(); cellState->recurrent = state->as()->get(); + // during decoding time is of dimension 1, so this is a no-op (reshape in fact) input = swapTimeBatch(input); // [beam, time, batch, dim] if(inputMask) + // same here inputMask = swapTimeBatch(inputMask); int dimTimeAxis = -3; - + std::vector inputs = cell->applyToInput(input); + // @TODO: this could be implemented as a special kernel/operator std::vector outputs; for(int i = 0; i < input->shape()[dimTimeAxis]; ++i) { std::vector stepInputs(inputs.size()); 
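The RNN::apply loop in this hunk follows a precompute-then-step pattern: the input-dependent projections are computed once for the whole sequence, and only the cheap recurrent update runs per time step. A minimal sketch of that pattern on per-step float vectors, with Cell standing in for ICell and a placeholder recurrence (not the SSRU formula):
```
#include <cstddef>
#include <vector>

struct Cell {
  // precompute input-dependent projections for all steps at once (identity here)
  std::vector<std::vector<float>> applyToInput(const std::vector<std::vector<float>>& input) const {
    return input;
  }
  // one recurrence step: update the state from the projected step input
  std::vector<float> applyToState(const std::vector<float>& stepInput, std::vector<float>& state) const {
    for(std::size_t i = 0; i < state.size(); ++i)
      state[i] = 0.5f * state[i] + 0.5f * stepInput[i]; // placeholder recurrence
    return state;
  }
};

std::vector<std::vector<float>> applyRnn(const Cell& cell,
                                         const std::vector<std::vector<float>>& input, // [time][dim]
                                         std::size_t dimState) {
  auto projected = cell.applyToInput(input);   // done once for the whole sequence
  std::vector<float> state(dimState, 0.f);     // zero-initialized recurrent state
  std::vector<std::vector<float>> outputs;
  for(const auto& step : projected)            // slice along the time axis, one cell step each
    outputs.push_back(cell.applyToState(step, state));
  return outputs;                              // caller concatenates along time
}
```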
@@ -109,13 +126,15 @@ class RNN final : public Layer, public IBinaryLayer, public IBinaryDecoderLayer auto stepMask = inputMask; if(stepMask) stepMask = slice(inputMask, dimTimeAxis, i); - + Expr output = cell->applyToState(stepInputs, stepMask, /*in/out=*/cellState); outputs.push_back(output); } state->as()->set(cellState->recurrent); - + state->setPosition(cellState->position); + + // during decoding again, this is a no-op Expr output = swapTimeBatch(concatenate(outputs, dimTimeAxis)); if(oProj) output = oProj->apply(output); diff --git a/src/layers_new/transformer.h b/src/layers_new/transformer.h index d80fe102f..c358cd5c3 100644 --- a/src/layers_new/transformer.h +++ b/src/layers_new/transformer.h @@ -16,7 +16,8 @@ namespace nn { * Currently these are usually dropout, layer normalization and skip connections. * A transformer block will usually apply one of them. */ -struct TransformerPrePostProcessor final : public Layer, public IBinaryLayer { +class TransformerPrePostProcessor final : public Layer, public IBinaryLayer { +public: Ptr dropout; Ptr norm; std::string actionDesc; @@ -65,20 +66,24 @@ struct TransformerPrePostProcessor final : public Layer, public IBinaryLayer { }; /** - * This is a typical transformer self-attention block. The default configuration will + * This is a transformer self-attention block without state. The default configuration will * use a multi-head multiplicative self-attention layer, followed by dropout, the skip * connection and layer normalization (dan) in the post-processor. The pre-processor does - * nothing in the default configuration. + * nothing in the default configuration. See TransformerDecoderSelfAttentionBlock for a + * version that can be used in the decoder with state. */ class TransformerSelfAttentionBlock final : public LayerWithOptions, public IBinaryLayer { public: Ptr preprocessor; + Ptr selfMaskProcessor; Ptr selfAttention; Ptr postprocessor; TransformerSelfAttentionBlock(Ptr graph, - Ptr options) - : LayerWithOptions(graph, options) + Ptr options, + Ptr selfMaskProcessorInit = nullptr) + : LayerWithOptions(graph, options), + selfMaskProcessor(selfMaskProcessorInit) { preprocessor = New( graph, @@ -86,6 +91,11 @@ class TransformerSelfAttentionBlock final : public LayerWithOptions, public IBin opt("transformer-dropout", 0.f)); registerLayer(preprocessor); + if(!selfMaskProcessor) { + selfMaskProcessor = maskProcessorFromOptions(graph, options); + registerLayer(selfMaskProcessor); + } + selfAttention = attentionFromOptions(graph, options); registerLayer(selfAttention); @@ -96,8 +106,9 @@ class TransformerSelfAttentionBlock final : public LayerWithOptions, public IBin registerLayer(postprocessor); } - Expr apply(Expr input, Expr logMask = nullptr) const override { + Expr apply(Expr input, Expr inputMask = nullptr) const override { auto output = preprocessor->apply(input); // optional preprocessing + auto logMask = selfMaskProcessor->apply(output, inputMask); // mask out attention to padding symbols output = selfAttention->apply(output, output, output, logMask); // self attention, @TODO: make this a IBinaryLayer rather than IQuaternaryLayer output = postprocessor->apply(output, input); // optional postprocessing, optional skip connection return output; @@ -110,7 +121,8 @@ class TransformerSelfAttentionBlock final : public LayerWithOptions, public IBin * the skip connection and layer normalization (dan) in the post-processor. The pre-processor does * nothing in the default configuration. 
*/ -struct TransformerFilterBlock final : public LayerWithOptions, public IUnaryLayer { +class TransformerFilterBlock final : public LayerWithOptions, public IUnaryLayer { +public: Ptr preprocessor; Ptr layers; Ptr postprocessor; @@ -182,15 +194,17 @@ struct TransformerFilterBlock final : public LayerWithOptions, public IUnaryLaye * A full transformer encoder layer consists of a self-attention block followed by * a filter block. Skip connections etc. are handled inside the blocks, see above. */ -struct TransformerEncoderLayer final : public LayerWithOptions, public IBinaryLayer { +class TransformerEncoderLayer final : public LayerWithOptions, public IBinaryLayer { +public: Ptr selfAttentionBlock; Ptr filterBlock; TransformerEncoderLayer(Ptr graph, - Ptr options) + Ptr options, + Ptr selfMaskProcessorInit = nullptr) : LayerWithOptions(graph, options) { - selfAttentionBlock = New(graph, options); + selfAttentionBlock = New(graph, options, selfMaskProcessorInit); registerLayer(selfAttentionBlock); filterBlock = New(graph, options); @@ -213,10 +227,9 @@ struct TransformerEncoderLayer final : public LayerWithOptions, public IBinaryLa * with RNN models and for easier beam-search we transpose batch and time dimensions on input and output. * @TODO: get rid of these transposes. */ -struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { +class TransformerEncoder : public LayerWithOptions, public IBinaryLayer { public: Ptr positionEmbedding; - Ptr maskProcessor; Ptr preprocessor; Ptr layers; Ptr postprocessor; @@ -237,9 +250,6 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { registerLayer(positionEmbedding); } - maskProcessor = attentionMaskProcessorFromOptions(graph, options); - registerLayer(maskProcessor); - preprocessor = New( graph, opt("transformer-postprocess-emb", ""), @@ -248,8 +258,15 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { layers = New(graph); registerLayer(layers); + + Ptr selfMaskProcessor; // this will be initialized in the first encoder layer for(int i = 0; i < opt("enc-depth"); ++i) { - auto transformerEncoderLayer = New(graph, options); + auto transformerEncoderLayer = New(graph, options, selfMaskProcessor); + layers->append(transformerEncoderLayer); + + if(!selfMaskProcessor) + selfMaskProcessor = transformerEncoderLayer->selfAttentionBlock->selfMaskProcessor; + // example of changing linear layer init functions burried deep in the model if(opt("transformer-depth-scaling", false)) for(auto linear : transformerEncoderLayer->allLayers()) @@ -265,7 +282,6 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { norm->useBias = false; } } - layers->append(transformerEncoderLayer); } postprocessor = New( @@ -277,7 +293,7 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { virtual ~TransformerEncoder() = default; - Expr apply(Expr input, Expr mask = nullptr) const override { + Expr apply(Expr input, Expr inputMask = nullptr) const override { // first and last operations (see at the bottom of this function) switch the time and batch // dimensions. This order is more natural for the transformer, but more difficult to handle // during beam search or when using RNNs. Hence the input/output transpositions here. @@ -287,8 +303,8 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { // do that everywhere we can detect inconsistencies automatically. 
// reorganize batch and timestep auto output = swapTimeBatch(input); // [1, dimBatch, dimSrcWords, dimModel] - if(mask) - mask = swapTimeBatch(mask); // [1, dimBatch, dimSrcWords, 1] + if(inputMask) + inputMask = swapTimeBatch(inputMask); // [1, dimBatch, dimSrcWords, 1] // apply positional embeddings to contextual input if(positionEmbedding) @@ -301,13 +317,12 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { // apply dropout or layer-norm to embeddings if required output = preprocessor->apply(output); - auto logMask = maskProcessor->apply(output, mask); // traverse the layers, use the same mask for each for(auto layer : *layers) { if(keepHiddenStates) // note, with pre-norm, the hidden states will not be normed here. hiddenStates.push_back(hiddenTransformFn(output)); - output = layer->apply(output, logMask); + output = layer->apply(output, inputMask); } // apply final postprocessor if required, e.g. final layer-norm for pre-norm or final skip connection @@ -339,15 +354,18 @@ struct TransformerEncoder : public LayerWithOptions, public IBinaryLayer { * connection and layer normalization (dan) in the post-processor. The pre-processor does * nothing in the default configuration. */ -class TransformerCrossAttentionBlock final : public LayerWithOptions, public ITernaryLayer { +class TransformerDecoderCrossAttentionBlock final : public LayerWithOptions, public ITernaryDecoderLayer { public: Ptr preprocessor; + Ptr contextMaskProcessor; Ptr crossAttention; Ptr postprocessor; - TransformerCrossAttentionBlock(Ptr graph, - Ptr options) - : LayerWithOptions(graph, options) + TransformerDecoderCrossAttentionBlock(Ptr graph, + Ptr options, + Ptr contextMaskProcessorInit = nullptr) + : LayerWithOptions(graph, options), + contextMaskProcessor(contextMaskProcessorInit) { preprocessor = New( graph, @@ -355,8 +373,15 @@ class TransformerCrossAttentionBlock final : public LayerWithOptions, public ITe opt("transformer-dropout", 0.f)); registerLayer(preprocessor); + if(!contextMaskProcessor) { + contextMaskProcessor = contextDecoderMaskProcessorFromOptions(graph, options); + registerLayer(contextMaskProcessor); + } + // @TODO: factory to support different attention flavors? - crossAttention = attentionFromOptions(graph, options); + // for cross-attention, we cache the projected keys and values since they come from + // the encoder and are static during decoding unless the batch size changes. 
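Conceptually, the caching just described amounts to memoizing the projection and recomputing it only when a criterion (here, a change in input shape or batch size) reports that the cached result is stale. A self-contained sketch of that recompute-on-change pattern with hypothetical names, loosely mirroring how CachedExpr is used but not the Expr API itself:
```
#include <functional>
#include <optional>
#include <vector>

template <class T>
class CachedApply {
  std::optional<T> cachedInput_;
  std::optional<T> cachedOutput_;
public:
  T apply(const T& input,
          const std::function<T(const T&)>& fn,
          const std::function<bool(const T&, const T&)>& equal) {
    if(!cachedInput_ || !equal(*cachedInput_, input)) { // recompute only when the criterion fails
      cachedInput_  = input;
      cachedOutput_ = fn(input);
    }
    return *cachedOutput_;
  }
  void clear() { cachedInput_.reset(); cachedOutput_.reset(); }
};

// usage sketch: re-run the key projection only when the size (e.g. batch size) changes
//   CachedApply<std::vector<float>> cachedKh;
//   auto kh = cachedKh.apply(keys, project,
//                            [](const auto& a, const auto& b) { return a.size() == b.size(); });
```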
+ crossAttention = attentionFromOptions(graph, options, /*enableCache=*/true); registerLayer(crossAttention); postprocessor = New( @@ -366,37 +391,33 @@ class TransformerCrossAttentionBlock final : public LayerWithOptions, public ITe registerLayer(postprocessor); } - Expr apply(Expr input, Expr context, Expr logMask) const override { + void initState(Ptr state) const override {} + + Expr apply(Expr input, Expr context, Expr contextMask, Ptr state) const override { auto output = preprocessor->apply(input); // optional preprocessing + auto logMask = contextMaskProcessor->apply(output, contextMask, state); output = crossAttention->apply(output, context, context, logMask); // cross attention, @TODO: make this a ITernaryLayer rather than IQuaternaryLayer output = postprocessor->apply(output, input); // optional postprocessing, optional skip connection return output; } }; -class TransformerAutoRegressiveBlock : public LayerWithOptions, public IBinaryDecoderLayer { -public: - TransformerAutoRegressiveBlock(Ptr graph, - Ptr options) - : LayerWithOptions(graph, options) {} - - virtual ~TransformerAutoRegressiveBlock() = default; - - using IBinaryDecoderLayer::apply; -}; - /** - * This is a transformer RNN block. + * Base class for transformer auto-regressive blocks. These are blocks that can be used in the decoder + * and that take the previous step's output as input. Currently this is either a self-attention block + * or an RNN block. */ -class TransformerRNNBlock final : public TransformerAutoRegressiveBlock { +class TransformerDecoderAutoRegressiveBlock : public LayerWithOptions, public IBinaryDecoderLayer { public: Ptr preprocessor; - Ptr> rnn; + Ptr selfMaskProcessor; Ptr postprocessor; - TransformerRNNBlock(Ptr graph, - Ptr options) - : TransformerAutoRegressiveBlock(graph, options) + TransformerDecoderAutoRegressiveBlock(Ptr graph, + Ptr options, + Ptr selfMaskProcessorInit = nullptr) + : LayerWithOptions(graph, options), + selfMaskProcessor(selfMaskProcessorInit) { preprocessor = New( graph, @@ -404,10 +425,10 @@ class TransformerRNNBlock final : public TransformerAutoRegressiveBlock { opt("transformer-dropout", 0.f)); registerLayer(preprocessor); - // @TODO: factory to support different attention flavors? - int modelDim = opt("transformer-dim-model", opt("dim-emb")); - rnn = New>(graph, modelDim, opt("transformer-rnn-projection", false)); - registerLayer(rnn); + if(!selfMaskProcessor) { + selfMaskProcessor = selfMaskProcessorFromOptions(graph, options); + registerLayer(selfMaskProcessor); + } postprocessor = New( graph, @@ -416,6 +437,85 @@ class TransformerRNNBlock final : public TransformerAutoRegressiveBlock { registerLayer(postprocessor); } + virtual ~TransformerDecoderAutoRegressiveBlock() = default; + + using IBinaryDecoderLayer::initState; + using IBinaryDecoderLayer::apply; +}; + +/** + * This is a typical transformer self-attention block. The default configuration will + * use a multi-head multiplicative self-attention layer, followed by dropout, the skip + * connection and layer normalization (dan) in the post-processor. The pre-processor does + * nothing in the default configuration. 
+ */ +class TransformerDecoderSelfAttentionBlock final : public TransformerDecoderAutoRegressiveBlock { +public: + Ptr selfAttention; + + using TransformerDecoderAutoRegressiveBlock::preprocessor; + using TransformerDecoderAutoRegressiveBlock::selfMaskProcessor; + using TransformerDecoderAutoRegressiveBlock::postprocessor; + + TransformerDecoderSelfAttentionBlock(Ptr graph, + Ptr options, + Ptr selfMaskProcessorInit = nullptr) + : TransformerDecoderAutoRegressiveBlock(graph, options, selfMaskProcessorInit) + { + // no caching of keys and values for self-attention since they change at each step + selfAttention = attentionFromOptions(graph, options, /*enableCache=*/false); + registerLayer(selfAttention); + } + + void initState(Ptr state) const override { + state->setPosition(0); + } + + Expr apply(Expr input, Expr inputMask, Ptr state) const override { + auto output = preprocessor->apply(input); // optional preprocessing + + // Here we extend the state with the keys and values from the previous step. + auto query = output; + auto keysValues = output; + if(state->getPosition() > 0) { + auto kvHistory = state->as()->get(); // [dimBeam, dimBatch, dimHistory, dimModel] + keysValues = concatenate({kvHistory, keysValues}, /*axis=*/-2); // [dimBeam, dimBatch, dimHistory + 1, dimModel] + } + state->as()->set(keysValues); + + auto logMask = selfMaskProcessor->apply(query, inputMask, state); + output = selfAttention->apply(query, keysValues, keysValues, logMask); + output = postprocessor->apply(output, input); // optional postprocessing, optional skip connection + return output; + } +}; + +/** + * This is a transformer RNN block that can be used as a replacement for the self-attention + * block in the decoder. + */ +class TransformerDecoderRNNBlock final : public TransformerDecoderAutoRegressiveBlock { +public: + Ptr> rnn; // @TODO: support other RNN types like LSTM or GRU + + using TransformerDecoderAutoRegressiveBlock::preprocessor; + using TransformerDecoderAutoRegressiveBlock::postprocessor; + + TransformerDecoderRNNBlock(Ptr graph, + Ptr options, + Ptr selfMaskProcessorInit = nullptr) + : TransformerDecoderAutoRegressiveBlock(graph, options, selfMaskProcessorInit) + { + // @TODO: factory to support different attention flavors? + int modelDim = opt("transformer-dim-model", opt("dim-emb")); + rnn = New>(graph, modelDim, opt("transformer-rnn-projection", false)); + registerLayer(rnn); + } + + void initState(Ptr state) const override { + rnn->as()->initState(state); + } + Expr apply(Expr input, Expr inputMask, Ptr state) const override { auto output = preprocessor->apply(input); // optional preprocessing output = rnn->apply(output, inputMask, state); // rnn application with state extension @@ -425,44 +525,39 @@ class TransformerRNNBlock final : public TransformerAutoRegressiveBlock { }; /** - * A full transformer decoder layer consists of a self-attention block followed by - * cross-attention block and a filter block. Skip connections etc. are handled inside - * the blocks, see above. - * - * For the self-attention block we need a special mask, usually a triangle mask that - * prohibits to look into the future. - * @TODO: should the triangle mask be constructed locally here? Would make sense, but expensive - * for many layers. + * A full transformer (LM) decoder layer consists of a self-attention block followed by + * a filter block. Skip connections etc. are handled inside the blocks, see above. 
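The decoder self-attention block above grows its key/value state one step at a time: the current step is concatenated onto the cached history and the query attends over everything seen so far. A small sketch of that append-and-attend pattern on plain vectors (all names hypothetical, single head, key doubling as value):
```
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// one decoding step: append this step's key/value to the history, then let the
// current query attend over all steps seen so far (scaled dot-product attention).
std::vector<float> stepAttend(std::vector<std::vector<float>>& kvHistory, // [steps][dim]
                              const std::vector<float>& stepKeyValue,     // [dim]
                              const std::vector<float>& query) {          // [dim]
  kvHistory.push_back(stepKeyValue);                       // concatenate along the time axis
  std::vector<float> scores;
  for(const auto& k : kvHistory) {
    float s = 0.f;
    for(std::size_t i = 0; i < k.size(); ++i) s += query[i] * k[i];
    scores.push_back(s / std::sqrt((float)query.size()));  // scaled dot product
  }
  float maxS = scores[0];                                   // softmax over the history
  for(float s : scores) maxS = std::max(maxS, s);
  float sum = 0.f;
  for(float& s : scores) { s = std::exp(s - maxS); sum += s; }
  for(float& s : scores) s /= sum;
  std::vector<float> out(query.size(), 0.f);                // weighted sum of values (= keys here)
  for(std::size_t t = 0; t < kvHistory.size(); ++t)
    for(std::size_t i = 0; i < out.size(); ++i)
      out[i] += scores[t] * kvHistory[t][i];
  return out;
}
```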
*/ -struct TransformerDecoderLayer final : public LayerWithOptions, public IQuaternaryDecoderLayer { - Ptr autoRegressiveBlock; - Ptr crossAttentionBlock; +class TransformerDecoderLayer : public LayerWithOptions, public IBinaryDecoderLayer { +public: + Ptr autoRegressiveBlock; Ptr filterBlock; TransformerDecoderLayer(Ptr graph, - Ptr options) + Ptr options, + Ptr selfMaskProcessorInit = nullptr) : LayerWithOptions(graph, options) { auto autoRegressionType = opt("transformer-decoder-autoreg", "self-attention"); if(autoRegressionType == "self-attention") { - ABORT("Auto-regression block type {} not yet implemented", autoRegressionType); + autoRegressiveBlock = New(graph, options, selfMaskProcessorInit); } else if(autoRegressionType == "rnn") { - autoRegressiveBlock = New(graph, options); + autoRegressiveBlock = New(graph, options, selfMaskProcessorInit); } else { ABORT("Unknown auto-regression block type {}", autoRegressionType); } registerLayer(autoRegressiveBlock); - crossAttentionBlock = New(graph, options); - registerLayer(crossAttentionBlock); - filterBlock = New(graph, options, /*isDecoder=*/true); registerLayer(filterBlock); } - Expr apply(Expr input, Expr inputMask, Expr context, Expr logMask, Ptr state) const override { + void initState(Ptr state) const override { + autoRegressiveBlock->as()->initState(state); + } + + Expr apply(Expr input, Expr inputMask, Ptr state) const override { Expr output = autoRegressiveBlock->apply(input, inputMask, state); - output = crossAttentionBlock->apply(output, context, logMask); output = filterBlock->apply(output); checkpoint(output); // A full transformer block is a good point for gradient checkpointing (currently manual) @@ -470,18 +565,64 @@ struct TransformerDecoderLayer final : public LayerWithOptions, public IQuaterna } }; +/** + * A transformer (S2S) decoder layer consists of a self-attention block followed by + * cross-attention block and a filter block. Skip connections etc. are handled inside + * the blocks. We inherit from TransformerDecoderLayer and add the cross-attention block. 
+ * * @TODO: get rid of IQuaternaryDecoderLayer and use IBinaryDecoderLayer instead + */ +class TransformerDecoderLayerWithCrossAttention : public TransformerDecoderLayer, public IQuaternaryDecoderLayer { +public: + Ptr crossAttentionBlock; + using TransformerDecoderLayer::autoRegressiveBlock; + using TransformerDecoderLayer::filterBlock; + + TransformerDecoderLayerWithCrossAttention(Ptr graph, + Ptr options, + Ptr selfMaskProcessorInit = nullptr, + Ptr contextMaskProcessorInit = nullptr) + : TransformerDecoderLayer(graph, options, selfMaskProcessorInit) + { + crossAttentionBlock = New(graph, options, contextMaskProcessorInit); + registerLayer(crossAttentionBlock); + } + + void initState(Ptr state) const override { + TransformerDecoderLayer::initState(state); + } + + Expr apply(Expr input, Expr inputMask, Expr context, Expr contextMask, Ptr state) const override { + Expr output = autoRegressiveBlock->apply(input, inputMask, state); + output = crossAttentionBlock->apply(output, context, contextMask, state); + output = filterBlock->apply(output); + + checkpoint(output); // A full transformer block is a good point for gradient checkpointing (currently manual) + return output; + } + +private: + // @TODO: once we have correct decoder states we can change the interface to IBinaryDecoderLayer and remove this + // this is a dummy implementation to satisfy the interface, it should never be called + Expr apply(Expr input, Expr inputMask, Ptr state) const override { + ABORT("This should never be called"); + } +}; + /** * A full transformer decoder stack. Before applying multiple transformer layers (depth of the decoder), we * add positional embeddings and apply post-processing actions to the combined embeddings. Due to backward-compatiblity * with RNN models and for easier beam-search we transpose batch and time dimensions on input and output. * @TODO: get rid of these transposes. 
*/ -struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDecoderLayer { +class TransformerDecoder final : public LayerWithOptions, public IBinaryDecoderLayer { +private: + Ptr attentionCollector_; + +public: Ptr positionEmbedding; - Ptr maskProcessor; Ptr preprocessor; - Ptr layers; Ptr postprocessor; + Ptr layers; TransformerDecoder(Ptr graph, Ptr options) @@ -492,15 +633,18 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec registerLayer(positionEmbedding); } - maskProcessor = attentionMaskProcessorFromOptions(graph, options); - registerLayer(maskProcessor); - preprocessor = New( graph, opt("transformer-postprocess-emb", ""), opt("transformer-dropout", 0.f)); registerLayer(preprocessor); + postprocessor = New( + graph, + opt("transformer-postprocess-top", ""), + opt("transformer-dropout", 0.f)); + registerLayer(postprocessor); + size_t decDepth = opt("dec-depth"); std::vector tiedLayers = opt>("transformer-tied-layers", std::vector()); ABORT_IF(!tiedLayers.empty() && tiedLayers.size() != decDepth, @@ -513,23 +657,40 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec layers = New(graph); registerLayer(layers); + + Ptr selfMaskProcessor; // this will be initialized in the first decoder layer + Ptr contextMaskProcessor; // this will be initialized in the first decoder layer for(size_t i = 0; i < decDepth; ++i) { if(tiedLayers.empty() || tiedLayers[i] == i) { // not tied or tied to itself, so needs to be created first - auto transformerDecoderLayer = New(graph, options); + auto transformerDecoderLayer = New(graph, options, selfMaskProcessor, contextMaskProcessor); layers->append(transformerDecoderLayer); + + if(!selfMaskProcessor) + selfMaskProcessor = transformerDecoderLayer->autoRegressiveBlock->selfMaskProcessor; + if(!contextMaskProcessor) + contextMaskProcessor = transformerDecoderLayer->crossAttentionBlock->contextMaskProcessor; + } else { ABORT_IF(tiedLayers[i] > i, "Cannot tie to layer above this layer??"); layers->append(layers->at(tiedLayers[i])); // repeat layer to tie weights } - auto currentLayer = layers->at(i)->as(); + auto currentLayer = layers->at(i)->as(); + // example of changing linear layer init functions burried deep in the model if(opt("transformer-depth-scaling", false)) { - auto autoRegLayer = currentLayer->autoRegressiveBlock->as(); - autoRegLayer->rnn->oProj->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1)); + auto autoRegLayerRNN = currentLayer->autoRegressiveBlock->as(); + if(autoRegLayerRNN) + autoRegLayerRNN->rnn->oProj->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1)); + + auto autoRegLayerSA = currentLayer->autoRegressiveBlock->as(); + if(autoRegLayerSA) + for(auto linear : autoRegLayerSA->allLayers()) + linear->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1)); for(auto linear : currentLayer->crossAttentionBlock->allLayers()) linear->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1)); + for(auto linear : currentLayer->filterBlock->allLayers()) linear->init = inits::glorotUniform(true, true, /*scale=*/ 1.f / std::sqrt((float)i + 1)); } @@ -544,34 +705,55 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec norm->useBias = false; } } + + if(opt("guided-alignment", "none") != "none" || options_->hasAndNotEmpty("alignment")) { + std::string gaStr = opt("transformer-guided-alignment-layer", "last"); + + size_t 
attLayer = decDepth - 1; + if(gaStr != "last") + attLayer = std::stoull(gaStr) - 1; + + ABORT_IF(attLayer >= decDepth, "Chosen layer for guided attention ({}) larger than number of layers ({})", attLayer + 1, decDepth); + + if(i == attLayer) { + attentionCollector_ = currentLayer->crossAttentionBlock->crossAttention->as(); + attentionCollector_->saveAttentionWeights = true; // @TODO: ugly + attentionCollector_->numHeads = opt("transformer-heads"); // @TODO: ugly + } + } } + } - postprocessor = New( - graph, - opt("transformer-postprocess-top", ""), - opt("transformer-dropout", 0.f)); - registerLayer(postprocessor); + void initState(Ptr state) const override { + ABORT("Remove this abort once this is actually used in the decoder"); + size_t positiion = 0; + state->setPosition(positiion); + for(auto layer : *layers) { + Ptr layerState = New(positiion); + layer->as()->initState(layerState); + state->as()->append(layerState); + } } - Expr apply(Expr input, Expr inputMask, Expr context, Expr contextMask, Ptr state) const override { + Expr apply(Expr input, Expr inputMask, Ptr state) const override { // first and last operations (see at the bottom of this function) switch the time and batch // dimensions. This order is more natural for the transformer, but more difficult to handle // during beam search or when using RNNs. Hence the input/output transpositions here. Expr output = swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] - context = swapTimeBatch(context); // [dimBeam=1, dimBatch, dimSrcWords, dimModel] // set current target token position during decoding or training. At training // this should be 0. During translation the current length of the translation. // Used for position embeddings and creating new decoder states. int startPos = (int)state->getPosition(); - // @TODO: write function prepareMasks(); - // @TODO: create triangle mask here and combine with inputMask - LOG_ONCE(info, "Don't forget the triangle mask if required!"); - if(inputMask) inputMask = swapTimeBatch(inputMask); // [dimBeam=1, dimBatch, dimTrgWords, dimModel=1] + Expr context = state->as()->getContext(); + Expr contextMask = state->as()->getContextMask(); + + // @TODO: get rid of this + context = swapTimeBatch(context); // [dimBeam=1, dimBatch, dimSrcWords, dimModel] if(contextMask) contextMask = swapTimeBatch(contextMask); // [dimBeam=1, dimBatch, dimSrcWords, dimModel=1] @@ -589,11 +771,11 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec // get an iterator to per-layer states auto layerStateIt = state->as()->begin(); - auto logMask = maskProcessor->apply(output, contextMask, *layerStateIt); - // traverse the layers, use the same mask for each for(auto layer : *layers) { - output = layer->as()->apply(output, inputMask, context, logMask, /*in/out=*/*layerStateIt++); + // @TODO: can we put logmask computation inside this layer? Then we can reduce the number of arguments here + // and use only the decoder state to provide context and mask. + output = layer->as()->apply(output, inputMask, context, contextMask, /*in/out=*/*layerStateIt++); } // apply final postprocessor if requred, e.g. 
final layer-norm for pre-norm or final skip connection @@ -609,6 +791,18 @@ struct TransformerDecoder final : public LayerWithOptions, public IQuaternaryDec output = swapTimeBatch(output); // [beam depth=1, max length, batch size, vector dim] return output; } + + std::vector getAlignments() { + if(attentionCollector_) + return attentionCollector_->getAlignments(); + else + return {}; + } + + virtual void clear() override { + LayerWithOptions::clear(); + } + }; } // namespace nn diff --git a/src/microsoft/quicksand.cpp b/src/microsoft/quicksand.cpp index 6a09469fd..513639dd6 100644 --- a/src/microsoft/quicksand.cpp +++ b/src/microsoft/quicksand.cpp @@ -58,7 +58,7 @@ IBeamSearchDecoder::IBeamSearchDecoder(Ptr options, const std::vector& ptrs) : options_(options) { for(auto ptr : ptrs) - modelWeights_.push_back(New(ptr)); + modelWeights_.push_back(New(ptr, io::MmapMode::RequiredMmap, /*locking=*/false)); } class BeamSearchDecoder : public IBeamSearchDecoder { diff --git a/src/models/amun.h b/src/models/amun.h index d6b1209c6..65f5c6516 100644 --- a/src/models/amun.h +++ b/src/models/amun.h @@ -95,10 +95,7 @@ class Amun : public EncoderDecoder { // @TODO: get rid of all this eventually { // scope for lock_guard - // this is needed during loading since we modify the content of modelFile->items() directly - // This is quite ugly but this is legacy code anyway. - std::mutex mutex; - std::lock_guard lock(mutex); + auto lockGuard = modelFile->scopedLockGuard(); // only modify the first time. bool modify = false; diff --git a/src/models/bleurt.h b/src/models/bleurt.h index 74848b788..844f94609 100644 --- a/src/models/bleurt.h +++ b/src/models/bleurt.h @@ -70,8 +70,7 @@ struct BleurtEncoder final : public nn::TransformerEncoder { auto output = marian::nn::swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] auto binaryMask = marian::nn::swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] - auto logMask = maskProcessor->apply(output, binaryMask); // [beam depth=1, batch size * numHeads, max length, vector dim=1] - + // apply positional embeddings to contextual input output = positionEmbedding->apply(output); @@ -83,7 +82,7 @@ struct BleurtEncoder final : public nn::TransformerEncoder { // traverse the layers, use the same mask for each for(auto layer : *layers) - output = layer->apply(output, logMask); + output = layer->apply(output, binaryMask); return output; } @@ -97,7 +96,7 @@ struct BleurtBatchEncoder final : public nn::LayerWithOptions, Ptr encoder; BleurtBatchEncoder(Ptr graph, - Ptr options) + Ptr options) : LayerWithOptions(graph, options), EncoderBase(graph, options) { @@ -155,7 +154,7 @@ struct BleurtBatchEncoder final : public nn::LayerWithOptions, } virtual void clear() override { - Layer::clear(); + LayerWithOptions::clear(); } }; diff --git a/src/models/nematus.h b/src/models/nematus.h index 7d421ec5c..d0132bc9e 100644 --- a/src/models/nematus.h +++ b/src/models/nematus.h @@ -35,10 +35,7 @@ class Nematus : public EncoderDecoder { // @TODO: get rid of all this eventually { // scope for lock_guard - // this is needed during loading since we modify the content of modelFile->items() directly - // This is quite ugly but this is legacy code anyway. - std::mutex mutex; - std::lock_guard lock(mutex); + auto lockGuard = modelFile->scopedLockGuard(); // only modify the first time. 
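The amun.h and nematus.h hunks replace a function-local mutex, which never synchronized anything across callers, with a lock guard handed out by the model file itself, so every caller serializes on the same lock while mutating items(). A minimal sketch of that ownership pattern, assuming a std::mutex inside the file object (the real ModelWeights API may differ):
```
#include <mutex>
#include <vector>

class ModelFile {
  std::mutex mutex_;
  std::vector<int> items_;
public:
  // the returned guard holds the lock until it goes out of scope at the caller
  std::unique_lock<std::mutex> scopedLockGuard() {
    return std::unique_lock<std::mutex>(mutex_);
  }
  std::vector<int>& items() { return items_; }
};

// usage sketch at load time:
//   auto lockGuard = modelFile->scopedLockGuard();
//   for(auto& item : modelFile->items()) { /* modify in place under the lock */ }
```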
bool modify = false; diff --git a/src/models/transformer.h b/src/models/transformer.h index ad018b240..6feda24fe 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -40,16 +40,16 @@ class Transformer : public EncoderOrDecoderBase { std::vector alignments_; // [max tgt len or 1][beam depth, max src length, batch size, 1] // @TODO: make this go away - template - T opt(const char* const key) const { Ptr options = options_; return options->get(key); } + template + T opt(const char* const key) const { Ptr options = options_; return options->get(key); } - template - T opt(const std::string& key) const { return opt(key.c_str()); } + template + T opt(const std::string& key) const { return opt(key.c_str()); } - template + template T opt(const char* const key, const T& def) const { Ptr options = options_; return options->get(key, def); } - template + template T opt(const std::string& key, const T& def) const { opt(key.c_str(), def); } public: @@ -120,7 +120,7 @@ class Transformer : public EncoderOrDecoderBase { virtual Expr addSpecialEmbeddings(Expr input, int start = 0, Ptr /*batch*/ = nullptr) const { if(opt("transformer-disable-position-embeddings", false)) return input; - + bool trainPosEmbeddings = opt("transformer-train-positions", false); return addPositionalEmbeddings(input, start, trainPosEmbeddings); } @@ -248,7 +248,7 @@ class Transformer : public EncoderOrDecoderBase { // to avoid mistakenly using the old transformer framework for new features auto maskType = opt("transformer-attention-mask", "default"); - ABORT_IF(maskType != "default", + ABORT_IF(maskType != "default", "You specified --transformer-attention-mask={} which is not implemented for legacy Transformer", maskType ); // softmax over batched dot product of query and keys (applied over all @@ -263,7 +263,7 @@ class Transformer : public EncoderOrDecoderBase { // take softmax along src sequence axis (-1) auto weights = softmax(z); // [-4: beam depth * batch size, -3: num heads, -2: max tgt length, -1: max src length] - + if(saveAttentionWeights) collectOneHead(weights, dimBeam); @@ -290,7 +290,7 @@ class Transformer : public EncoderOrDecoderBase { auto Wq = graph_->param(prefix + "_Wq", {dimModel, dimModel}, inits::glorotUniform(true, true, depthScaling_ ? 
1.f / sqrtf((float)depth_) : 1.f)); auto bq = graph_->param(prefix + "_bq", { 1, dimModel}, inits::zeros()); auto qh = affine(q, Wq, bq); - + qh = SplitHeads(qh, dimHeads); // [-4: beam depth * batch size, -3: num heads, -2: max length, -1: split vector dim] Expr kh; @@ -313,8 +313,8 @@ class Transformer : public EncoderOrDecoderBase { } Expr vh; - if (cache - && cache_.count(prefix + "_values") > 0 + if (cache + && cache_.count(prefix + "_values") > 0 && cache_[prefix + "_values"]->shape().elements() == values->shape().elements()) { vh = cache_[prefix + "_values"]; } else { @@ -391,7 +391,7 @@ class Transformer : public EncoderOrDecoderBase { // multi-head self-attention over previous input output = MultiHead(prefix, dimModel, dimHeads, output, keys, values, mask, cache, saveAttentionWeights); - + auto opsPost = opt("transformer-postprocess"); output = postProcess(prefix + "_Wo", opsPost, output, input, dropProb); @@ -431,14 +431,14 @@ class Transformer : public EncoderOrDecoderBase { int decDimFfn = opt("transformer-decoder-dim-ffn", 0); if(decDimFfn != 0) dimFfn = decDimFfn; - + int decDepthFfn = opt("transformer-decoder-ffn-depth", 0); if(decDepthFfn != 0) - depthFfn = decDepthFfn; + depthFfn = decDepthFfn; } - + ABORT_IF(depthFfn < 1, "Filter depth {} is smaller than 1", depthFfn); - + float ffnDropProb = inference_ ? 0 : opt("transformer-dropout-ffn"); auto initFn = inits::glorotUniform(true, true, depthScaling_ ? 1.f / sqrtf((float)depth_) : 1.f); @@ -588,7 +588,7 @@ class EncoderTransformer : public Transformer { auto embeddingLayer = getEmbeddingLayer(opt("ulr", false)); std::tie(batchEmbeddings, batchMask) = embeddingLayer->apply((*batch)[batchIndex_]); batchEmbeddings = addSpecialEmbeddings(batchEmbeddings, /*start=*/0, batch); - + // reorganize batch and timestep batchEmbeddings = atleast_nd(batchEmbeddings, 4); // [beam depth=1, max length, batch size, vector dim] batchMask = atleast_nd(batchMask, 4); // [beam depth=1, max length, batch size, vector dim=1] @@ -623,7 +623,7 @@ class EncoderTransformer : public Transformer { } // this allows to run a final layernorm operation after going through the transformer layer stack. - // By default the operations are empty, but with prenorm (--transformer-preprocess n --transformer-postprocess da) + // By default the operations are empty, but with prenorm (--transformer-preprocess n --transformer-postprocess da) // it is recommended to normalize here. Can also be used to add a skip connection from the very bottom if requested. auto opsTop = opt("transformer-postprocess-top", ""); layer = postProcess(prefix_ + "_top", opsTop, layer, prevLayer, dropProb); @@ -763,8 +763,8 @@ class DecoderTransformer : public Transformer { // This would happen if something goes wrong during batch pruning. ABORT_IF(encoderContext->shape()[-3] != dimBatch, - "Context and query batch dimension do not match {} != {}", - encoderContext->shape()[-3], + "Context and query batch dimension do not match {} != {}", + encoderContext->shape()[-3], dimBatch); // LayerAttention expects mask in a different layout @@ -801,7 +801,7 @@ class DecoderTransformer : public Transformer { rnn::State prevDecoderState; if(prevDecoderStates.size() > 0) prevDecoderState = prevDecoderStates[i]; - + // self-attention std::string layerType = opt("transformer-decoder-autoreg", "self-attention"); rnn::State decoderState; @@ -871,7 +871,7 @@ class DecoderTransformer : public Transformer { } // This allows to run a final layernorm operation after going through the transformer layer stack. 
- // By default the operations are empty, but with prenorm (--transformer-preprocess n --transformer-postprocess da) + // By default the operations are empty, but with prenorm (--transformer-preprocess n --transformer-postprocess da) // it is recommended to normalize here. Can also be used to add a skip connection from the very bottom if requested. auto opsTop = opt("transformer-postprocess-top", ""); query = postProcess(prefix_ + "_top", opsTop, query, prevQuery, dropProb); @@ -883,7 +883,7 @@ class DecoderTransformer : public Transformer { if(shortlist_) output_->setShortlist(shortlist_); auto logits = output_->applyAsLogits(decoderContext); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vocab or shortlist dim] - + // return unormalized(!) probabilities Ptr nextState; if (opt("transformer-decoder-autoreg", "self-attention") == "rnn") { @@ -906,9 +906,9 @@ class DecoderTransformer : public Transformer { output_->clear(); cache_.clear(); alignments_.clear(); - perLayerRnn_.clear(); // this needs to be cleared between batches. - // @TODO: figure out how to detect stale nodes i.e. nodes that are referenced, - // but where underlying memory has been deallocated by dropping all tensors + perLayerRnn_.clear(); // this needs to be cleared between batches. + // @TODO: figure out how to detect stale nodes i.e. nodes that are referenced, + // but where underlying memory has been deallocated by dropping all tensors // from a TensorAllocator object. This can happen during ExpressionGraph::clear() } }; diff --git a/src/models/transformer_factory.h b/src/models/transformer_factory.h index ac86e4dc7..fcd90ad63 100644 --- a/src/models/transformer_factory.h +++ b/src/models/transformer_factory.h @@ -22,6 +22,8 @@ class TransformerLegacy : public EncoderDecoder { bool markedReloaded = true) override { for(auto& item : modelFile->items()) { + auto lockGuard = modelFile->scopedLockGuard(); + auto pair = nameMap_.find(item.name); if(pair != nameMap_.end()) { LOG(debug, "Mapping parameter {} to {}", item.name, pair->second); @@ -42,22 +44,28 @@ class TransformerLegacy : public EncoderDecoder { ABORT_IF(!encoder, "Could not cast to new type of encoder??"); for(auto& linear : encoder->allLayers()) linear->transposed = false; + for(auto& norm : encoder->allLayers()) + norm->eps = 1e-6f; // used in old code by default, so we need to set it here explicitly auto decoder = std::dynamic_pointer_cast(decoders_[0]); ABORT_IF(!decoder, "Could not cast to new type of decoder??"); for(auto& linear : decoder->allLayers()) linear->transposed = false; + for(auto& norm : decoder->allLayers()) + norm->eps = 1e-6f; // used in old code by default, so we need to set it here explicitly // load items into the graph graph->load(modelFile); } private: - std::map nameMap_; + const std::unordered_map nameMap_; - std::map createNameMap() { - std::map nameMap = { + std::unordered_map createNameMap() { + std::unordered_map nameMap = { {"Wemb", "Wemb"}, + // {"decoder_ff_logit_out_b", "decoder_ff_logit_out_b"}, for now no shape conversion + // {"special:model.yml", "special:model.yml"} }; // @TODO: This is going to change @@ -100,20 +108,20 @@ class TransformerLegacy : public EncoderDecoder { prefix = "TransformerBatchDecoder"; for(int layerNo = 0; layerNo < opt("dec-depth"); ++layerNo) { // name maps for decoder self-attention blocks - nameMap[fmt::format("decoder_l{}_self_Wq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->qProj->weight", prefix, layerNo); - 
nameMap[fmt::format("decoder_l{}_self_bq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->qProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->qProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_bq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->qProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_Wk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->kProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_bk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->kProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->kProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_bk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->kProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_Wv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->vProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_bv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->vProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->vProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_bv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->vProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_Wo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->oProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_bo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->selfAttention->oProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->oProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_bo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->oProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_Wo_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->postprocessor->norm->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_Wo_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->selfAttentionBlock->postprocessor->norm->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wo_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wo_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->bias", prefix, layerNo); // name maps for decoder SSRU nameMap[fmt::format("decoder_l{}_rnn_W", layerNo + 1)] = 
fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->iProj->weight", prefix, layerNo); diff --git a/src/models/transformer_new.h b/src/models/transformer_new.h index 61de01db2..1c7807727 100644 --- a/src/models/transformer_new.h +++ b/src/models/transformer_new.h @@ -12,12 +12,12 @@ namespace marian { // Wrapper for backwards compatibility that uses current encoder/decoder framework -struct TransformerBatchEncoder : public nn::LayerWithOptions, +struct TransformerBatchEncoder : public nn::LayerWithOptions, public nn::IEmbeddingLayer, // TransformerBatchEncoder is an IEmbeddingLayer that produces contextual embeddings public EncoderBase { // @TODO: should all encoders be IEmbeddingLayer? Ptr encoder; - TransformerBatchEncoder(Ptr graph, + TransformerBatchEncoder(Ptr graph, Ptr options) : LayerWithOptions(graph, options), EncoderBase(graph, options) @@ -55,10 +55,10 @@ struct TransformerBatchEncoder : public nn::LayerWithOptions, EncoderBase::graph_ = graph; setGraph(graph); // This makes sure that the graph passed into the model during construction and now evaluation are identical. - // A good check to have for catching weird situations early. + // A good check to have for catching weird situations early. ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match"); #endif - + const auto& [batchEmbedding, batchMask] = apply((*batch)[batchIndex_]); return New(batchEmbedding, batchMask, batch); } @@ -69,11 +69,11 @@ struct TransformerBatchEncoder : public nn::LayerWithOptions, }; // Wrapper for backwards compatibility that uses current encoder/decoder framework -class TransformerBatchDecoder : public nn::LayerWithOptions, +class TransformerBatchDecoder : public nn::LayerWithOptions, public DecoderBase { Ptr decoder; - Ptr output_; + Ptr output_; void lazyCreateOutputLayer() { @@ -101,9 +101,9 @@ class TransformerBatchDecoder : public nn::LayerWithOptions, } public: - TransformerBatchDecoder(Ptr graph, Ptr options) + TransformerBatchDecoder(Ptr graph, Ptr options) : LayerWithOptions(graph, options), DecoderBase(graph, options) { - + decoder = New(graph, options); registerLayer(decoder); @@ -118,7 +118,7 @@ class TransformerBatchDecoder : public nn::LayerWithOptions, DecoderBase::graph_ = graph; setGraph(graph); // This makes sure that the graph passed into the model during construction and now evaluation are identical. - // A good check to have for catching weird situations early. + // A good check to have for catching weird situations early. 
ABORT_IF(this->graph() != graph, "Graph used for construction and graph parameter do not match"); #endif @@ -127,6 +127,7 @@ class TransformerBatchDecoder : public nn::LayerWithOptions, int dimBatch = (int)batch->size(); int dim = DecoderBase::opt("dim-emb"); + // @TODO: use the actual initState function of the new state auto start = graph->constant({1, 1, dimBatch, dim}, inits::zeros()); rnn::States startStates(DecoderBase::opt("dec-depth"), {start, start}); @@ -134,7 +135,7 @@ class TransformerBatchDecoder : public nn::LayerWithOptions, return NewDecoderState(DecoderBase::options_, startStates, Logits(), encStates, batch, /*isBatchMajor=*/false); } else { - rnn::States startStates; + rnn::States startStates(DecoderBase::opt("dec-depth"), {nullptr, nullptr}); return NewDecoderState(DecoderBase::options_, startStates, Logits(), encStates, batch, /*isBatchMajor=*/true); } } @@ -157,20 +158,17 @@ class TransformerBatchDecoder : public nn::LayerWithOptions, //************************************************************************// - auto encoderContext = state->getEncoderStates()[0]->getContext(); // encoder output - auto encoderMask = state->getEncoderStates()[0]->getMask(); // note: may differ from Encoder self-attention mask in that additional positions are banned for cross-attention - // Convert old style decoder state to new decoder state using namespace models; usage modelUsage = (usage)db::opt("usage", (int)usage::translation); auto nnState = convertDecoderState(state, graph(), /*decoding=*/modelUsage == usage::translation); - auto decoderContext = decoder->apply(embeddings, decoderMask, encoderContext, encoderMask, nnState); + auto decoderContext = decoder->apply(embeddings, decoderMask, nnState); // final feed-forward layer (output) if(shortlist_) output_->setShortlist(shortlist_); auto logits = output_->applyAsLogits(decoderContext); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vocab or shortlist dim] - + // Convert new style decoder state to old decoder state // @TODO: This is such a mess! rnn::States decoderStates; @@ -185,8 +183,7 @@ class TransformerBatchDecoder : public nn::LayerWithOptions, // helper function for guided alignment // @TODO: const vector<> seems wrong. 
Either make it non-const or a const& (more efficient but dangerous) virtual const std::vector getAlignments(int /*i*/ = 0) override { - ABORT("Not implemented"); - return {}; + return decoder->getAlignments(); } virtual void clear() override { @@ -203,13 +200,13 @@ class TransformerBatchDecoder : public nn::LayerWithOptions, static void testme() { using namespace marian; using namespace nn; - + auto options = New( - "enc-depth", 12, - "transformer-heads", 8, - "dim-emb", 512, + "enc-depth", 12, + "transformer-heads", 8, + "dim-emb", 512, "transformer-ffn-depth", 2, - "transformer-dim-ffn", 2048, + "transformer-dim-ffn", 2048, "transformer-dropout", 0.1, "transformer-dropout-attention", 0.0, "transformer-postprocess", "dan", @@ -230,13 +227,13 @@ static void testme() { auto encoder = New(graph, options); encoder->setName("TransformerEncoder"); encoder->setEvalMode(); - + auto context = encoder->apply(input, mask); std::cerr << encoder->layerInfo(/*includeChildren=*/true) << std::endl; debug(context); - + graph->forward(); graph->save("test.npz"); } From b683f4b16561fb91fb11cb3182885f46fb54d344 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Thu, 8 Feb 2024 07:00:41 +0000 Subject: [PATCH 15/26] Merged PR 32882: Reorder inputs for kiwi-style metrics This PR adds `--input-reorder` which allows to swap the indices of batch subfields. Currently, this is used for comet-kiwi-style models to accomodate that the mt output comes first and not the source. --- CHANGELOG.md | 1 + VERSION | 2 +- scripts/comet/comet2marian.py | 2 ++ src/common/config_parser.cpp | 5 +++++ src/data/corpus.cpp | 13 ++++++++++--- src/data/corpus_base.cpp | 6 ++++-- src/data/corpus_nbest.cpp | 2 ++ src/data/corpus_sqlite.cpp | 2 ++ src/data/dataset.h | 6 ++++-- src/data/text_input.h | 23 ++++++++++++++++------- src/examples/mnist/dataset.h | 4 ++-- src/examples/mnist/training.h | 2 +- 12 files changed, 50 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 13dd5e301..5f297383d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Fixed compilation with clang 16.0.6 ### Added +- Added `--input-reorder pos1 pos2` option to re-ordering inputs internally when reading in batches. This is mostly a model property. - Added `pymarian`: python bindings based on pybind11 - Added implementation of COMET-KIWI - Added implementation of xCOMET-XL/XXL regressor parts (MQM interpolation missing for now) diff --git a/VERSION b/VERSION index 9db15f195..bb7e6dd0a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.23 +v1.12.24 diff --git a/scripts/comet/comet2marian.py b/scripts/comet/comet2marian.py index 68912befd..94098711f 100755 --- a/scripts/comet/comet2marian.py +++ b/scripts/comet/comet2marian.py @@ -3,6 +3,7 @@ This script converts Unbabel COMET-QE models to Marian weight file. """ +import sys import argparse import logging as log import numpy as np @@ -105,6 +106,7 @@ def load_comet_model(model_path): config["input-join-fields"] = True config["separator-symbol"] = "" config["comet-use-separator"] = True + config["input-reorder"] = [1, 0, 2] # reorder input fields from [src, mt, ref] to [mt, src, ref] for comet-kiwi etc. 
else: raise Exception(f'Unknown type of model {model_type}') diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 741a3915c..9c8b0776f 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -249,6 +249,11 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { "Possible values: sequence, class, alignment, weight. " "You need to provide one type per input file (if --train-sets) or per TSV field (if --tsv).", {}); + cli.add>("--input-reorder", + "Reorder input data to this order according to this permutation. If empty no reordering is done. " + "If non-empty, you need to provide one type per input file (if --train-sets) or per TSV field (if --tsv). " + "Usually, there should be no need to provide these on the command line, the model should have them saved.", + {}); cli.add("--input-join-fields", "Join input fields (from files or TSV) into a single sequence " "(mostly used single-encoder models like BLEURT and COMET-KIWI)", diff --git a/src/data/corpus.cpp b/src/data/corpus.cpp index 902f0c9f6..8f5b0035e 100644 --- a/src/data/corpus.cpp +++ b/src/data/corpus.cpp @@ -116,6 +116,9 @@ SentenceTuple Corpus::next() { fields.swap(tmpFields); } + ABORT_IF(inputPermutation_.size() != 0 && inputPermutation_.size() < fields.size(), + "Input permutation given, but not for every input field??"); + // fill up the sentence tuple with sentences from all input files SentenceTupleImpl tup(curId); size_t shift = 0; @@ -125,12 +128,16 @@ SentenceTuple Corpus::next() { if(i == alignFileIdx_ || i == weightFileIdx_) { ++shift; } else { - size_t vocabId = i - shift; + size_t permutedIndex = i; + if(!inputPermutation_.empty()) + permutedIndex = inputPermutation_[i]; + + size_t vocabId = permutedIndex - shift; bool altered; - preprocessLine(fields[i], vocabId, curId, /*out=*/altered); + preprocessLine(fields[permutedIndex], vocabId, curId, /*out=*/altered); if(altered) tup.markAltered(); - addWordsToSentenceTuple(fields[i], vocabId, tup); + addWordsToSentenceTuple(fields[permutedIndex], vocabId, tup); } } // weights are added last to the sentence tuple, because this runs a validation that needs diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp index e1b0aad62..d11b5f763 100644 --- a/src/data/corpus_base.cpp +++ b/src/data/corpus_base.cpp @@ -430,12 +430,14 @@ void CorpusBase::addWordsToSentenceTuple(const std::string& line, auto inputTypes = options_->get>("input-types", {}); // empty list by default + bool isFirst = tup.empty(); + // This handles adding starts symbols for COMET () and BERT/BLEURT ([CLS]) - bool prepend = prependZero_ && (!joinFields_ || (joinFields_ && batchIndex == 0)); + bool prepend = prependZero_ && (!joinFields_ || (joinFields_ && isFirst)); if(prepend && inputTypes[batchIndex] == "sequence") words.insert(words.begin(), Word::fromWordIndex(0)); - bool prependSep = insertSeparator_ && joinFields_ && batchIndex > 0; + bool prependSep = insertSeparator_ && joinFields_ && !isFirst; if(prependSep && inputTypes[batchIndex] == "sequence") words.insert(words.begin(), vocabs_[batchIndex]->getSepId()); diff --git a/src/data/corpus_nbest.cpp b/src/data/corpus_nbest.cpp index 8029d3516..3c795e19b 100644 --- a/src/data/corpus_nbest.cpp +++ b/src/data/corpus_nbest.cpp @@ -33,6 +33,8 @@ std::string lineFromNbest(const std::string& line) { } SentenceTuple CorpusNBest::next() { + ABORT_IF(!inputPermutation_.empty(), "Input permutation not supported for n-best lists"); + bool cont = true; while(cont) { // get index of the current sentence 
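For readers following the `--input-reorder` change above: the C++ in `corpus.cpp` and `text_input.h` reads field `inputPermutation_[i]` into slot `i`, and `comet2marian.py` stores the permutation `[1, 0, 2]` in the model config so that comet-kiwi-style models receive the MT output before the source. The following is an illustrative Python sketch of that permutation semantics only (the function name is made up here; it is not part of the patch or of the Marian/pymarian API):

```python
def reorder_fields(fields, permutation):
    """Illustrative sketch: output slot i receives input field permutation[i],
    mirroring the inputPermutation_ logic added in corpus.cpp / text_input.h."""
    if not permutation:  # an empty permutation means "no reordering"
        return list(fields)
    # the patch aborts if a non-empty permutation does not cover every field
    assert len(permutation) >= len(fields), "permutation shorter than field list"
    return [fields[permutation[i]] for i in range(len(fields))]

# [src, mt, ref] with permutation [1, 0, 2] becomes [mt, src, ref],
# i.e. MT output first, as comet-kiwi-style models expect.
print(reorder_fields(["src", "mt", "ref"], [1, 0, 2]))
```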
diff --git a/src/data/corpus_sqlite.cpp b/src/data/corpus_sqlite.cpp index f7c577f29..07cff6947 100644 --- a/src/data/corpus_sqlite.cpp +++ b/src/data/corpus_sqlite.cpp @@ -106,6 +106,8 @@ void CorpusSQLite::fillSQLite() { } SentenceTuple CorpusSQLite::next() { + ABORT_IF(!inputPermutation_.empty(), "Input permutation not supported for sqlite corpus"); + while(select_->executeStep()) { // fill up the sentence tuple with sentences from all input files size_t curId = select_->getColumn(0).getInt(); diff --git a/src/data/dataset.h b/src/data/dataset.h index 3cdccec99..d5098f3fa 100644 --- a/src/data/dataset.h +++ b/src/data/dataset.h @@ -15,7 +15,8 @@ class DatasetBase { protected: std::vector paths_; Ptr options_; - + std::vector inputPermutation_; // if not empty, this is used to reorder input fields/batches i.e. [1,0] swaps the first two fields + // currently this is used for comet-kiwi-style metrics where the mt output is the first field // Data processing may differ in training/inference settings bool inference_{false}; @@ -28,7 +29,8 @@ class DatasetBase { DatasetBase(std::vector paths, Ptr options) : paths_(paths), options_(options), - inference_(options != nullptr ? options->get("inference", false) : false) {} + inputPermutation_(options->get>("input-reorder", {})), + inference_(options->get("inference", false)) {} DatasetBase(Ptr options) : DatasetBase({}, options) {} diff --git a/src/data/text_input.h b/src/data/text_input.h index f2e9831de..4c4ea9f1b 100644 --- a/src/data/text_input.h +++ b/src/data/text_input.h @@ -100,15 +100,24 @@ class TextInput : public DatasetBase { void prepare() override {} - SentenceTuple encode(std::vector& row, size_t id) { - ABORT_IF(row.size() != vocabs_.size(), "Number of fields does not match number of vocabs"); + SentenceTuple encode(std::vector& fields, size_t id) { + ABORT_IF(fields.size() != vocabs_.size(), "Number of fields does not match number of vocabs"); + // fill up the sentence tuple with source and/or target sentences SentenceTupleImpl tup(id); + ABORT_IF(inputPermutation_.size() != 0 && inputPermutation_.size() < fields.size(), + "Input permutation given, but not for every input field??"); + // copied and adapted from corpus.cpp - @TODO: refactor or unify code between Corpus and TextInput - for(size_t batchIndex = 0; batchIndex < row.size(); ++batchIndex) { - std::string& field = row[batchIndex]; - Words words = vocabs_[batchIndex]->encode(field, /*addEOS =*/true, inference_); + for(size_t batchIndex = 0; batchIndex < fields.size(); ++batchIndex) { + size_t permutedBatchIndex = batchIndex; + if(inputPermutation_.size() > 0) + permutedBatchIndex = inputPermutation_[batchIndex]; + + std::string& field = fields[permutedBatchIndex]; + + Words words = vocabs_[permutedBatchIndex]->encode(field, /*addEOS =*/true, inference_); ABORT_IF(words.empty(), "Empty input sequences are presently untested"); // This handles adding starts symbols for COMET () and BERT/BLEURT ([CLS]) @@ -118,7 +127,7 @@ class TextInput : public DatasetBase { bool prependSep = insertSeparator_ && joinFields_ && batchIndex > 0; if(prependSep) - words.insert(words.begin(), vocabs_[batchIndex]->getSepId()); + words.insert(words.begin(), vocabs_[permutedBatchIndex]->getSepId()); // if fields are joined and the current sentence is not the first one, we need to make sure that // the current sentence is not longer than the maximum length minus the length of the previous sentence @@ -130,7 +139,7 @@ class TextInput : public DatasetBase { // if the current sentence is longer than the 
maximum length, we need to crop it if(maxLengthCrop_ && words.size() > localMaxLength) { words.resize(localMaxLength); - words.back() = vocabs_[batchIndex]->getEosId(); + words.back() = vocabs_[permutedBatchIndex]->getEosId(); } // if true, the words are reversed diff --git a/src/examples/mnist/dataset.h b/src/examples/mnist/dataset.h index c665fa655..8c02c3b4b 100644 --- a/src/examples/mnist/dataset.h +++ b/src/examples/mnist/dataset.h @@ -139,8 +139,8 @@ class MNISTData : public Dataset { public: MNISTData(std::vector paths, - std::vector> /*vocabs*/ = {}, - Ptr options = nullptr) + std::vector> /*vocabs*/, + Ptr options) : Dataset(paths, options), IMAGE_MAGIC_NUMBER(2051), LABEL_MAGIC_NUMBER(2049) { loadData(); } diff --git a/src/examples/mnist/training.h b/src/examples/mnist/training.h index eebcbf822..791e769be 100644 --- a/src/examples/mnist/training.h +++ b/src/examples/mnist/training.h @@ -22,7 +22,7 @@ class TrainMNIST : public ModelTask { // Prepare data set auto paths = options_->get>("train-sets"); - auto dataset = New(paths); + auto dataset = New(paths, std::vector>{}, options_); auto batchGenerator = New>(dataset, options_, nullptr); // Prepare scheduler with validators From 22ed792f867a63e3b6a6b534e7a12c38412cec2e Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Thu, 8 Feb 2024 23:14:55 +0000 Subject: [PATCH 16/26] Merged PR 32937: Fixes force-decoding for beam-size larger 1 It seems there was a shape mismatch for force-decoding with beams larger than 1. This PR fixes the problem. --- CHANGELOG.md | 1 + VERSION | 2 +- src/translator/sampling.h | 19 +++++++++++-------- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5f297383d..918deb710 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - New experimental layer framework for Transformer-like models. ### Fixed +- Fixed force-decoding for beam-size > 1 - Fixed lost node in mt-detect metrics - Fixed BLEURT logmask computation - Fixed wrong paramter name for norm in new layer framework diff --git a/VERSION b/VERSION index bb7e6dd0a..53dbb431e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.24 +v1.12.25 diff --git a/src/translator/sampling.h b/src/translator/sampling.h index 2b13791d9..184202229 100644 --- a/src/translator/sampling.h +++ b/src/translator/sampling.h @@ -199,9 +199,12 @@ class DistModifier { // get vocab index and probability for force-decoded tokens for the current time step Expr forceIndices = slice(forceBatch_, /*axis=*/-3, pos); // [1, 1, dimBatch, 1] - Expr forceVals = gather(scores, /*axis=*/-1, forceIndices); // [1, 1, dimBatch, 1] - // create dummy indices and values for beam entries other then the force-decoded value. This is required to ensure that the beam + // select scores from first beam entry for force-decoding + Expr b1stScores = slice(scores, /*axis=*/-4, 0); // [1, 1, dimBatch, dimVocab] + Expr forceVals = gather(b1stScores, /*axis=*/-1, forceIndices); // [1, 1, dimBatch, 1] + + // create dummy indices and values for beam entries other than the force-decoded value. This is required to ensure that the beam // does not collapse for hyps outside the forced hyps and can still do full beam-search once we finish force-decoding for a batch // entry. We initialize randomly (they are not going to be used anyway due to very low prob) and shift by 1 to have 0 at first postion. 
int dimVocab = scores->shape()[-1]; @@ -212,13 +215,13 @@ class DistModifier { Expr dummyVals = shift(graph->constant({1, 1, 1, beamSize}, inits::uniform(invalidPathScore_, invalidPathScore_ / 2.f)), {0, 0, 0, 1}, 0.f); // here we add the force-decoded entries back into the zeroed positions - dummyIndices = cast(cast(dummyIndices, Type::float32) + cast(forceIndices, Type::float32), Type::uint32); - dummyVals = dummyVals + forceVals; + dummyIndices = cast(cast(dummyIndices, Type::float32) + cast(forceIndices, Type::float32), Type::uint32); // [1, 1, dimBatch, dimBeam] + dummyVals = dummyVals + forceVals; // [1, 1, dimBatch, dimBeam] - // create a tensor of the same size as the original logits, initialize with invalidPathScore and then scatter the force-decoded and - // dummy values into the correct positions. - Expr forcedScores = constant_like(scores, inits::fromValue(invalidPathScore_)); - forcedScores = scatter(forcedScores, -1, dummyIndices, dummyVals); + // create a tensor of the same size as the original logits from the first beam entry, initialize with invalidPathScore and then scatter + // the force-decoded and dummy values into the correct positions. + Expr forcedScores = constant_like(b1stScores, inits::fromValue(invalidPathScore_)); // [1, 1, dimBatch, dimVocab] + forcedScores = scatter(forcedScores, -1, dummyIndices, dummyVals); // [1, 1, dimBatch, dimVocab] // for entries that have finished force-decoding (the batch has eosId as vocab id) use the original logits for the whole batch entry // via interpolating by a selector. In marian eosId is used for padding, so this works everywhere and eos for unfinished hyps means From 9e40ac3df46514740bcbdc559a06f1e0077828b8 Mon Sep 17 00:00:00 2001 From: Thamme Gowda Date: Thu, 15 Feb 2024 21:21:44 +0000 Subject: [PATCH 17/26] Merged PR 32883: Pymarian improvements List of changes/updates/fixes to pymarian * Rename model IDs to match with hugging face (e.g., comet22-da -> wmt22-comet-da) * Rename CLI to make it short pymarian-evaluate -> pymarian-eval. * Rename pymarian.evaluate.py -> pymarian.eval.py to reflect CLI * The functional code from pymarian.eval.py is moved to Evaluator class (goal: allow reuse of Evaluator object for scoring many small files like WMT metric task) * Use mmap *.bins instead of *.npz * Downloads *.bin and *.spm individually instead of .tgz. Future plan to support quantized / gemm models. Downloading .tgz is okay but it will get too expensive since we dont need all variants of model (.npz, .bin, fp32, fp16, avx512 ...) * Uses file locking mechanism (based on `portalocker`) to avoid race condition between parallel download processes * Added optional `-v/--vocab` argument to pymarian-eval. * Added `--fields|-f` argument: supports `src mt ref` or a subsequence of this. Raises an error when missing fields are detected, ignores that extra fields * pymarian build improvements: strict on python version match between package and native extension. 
Also removes custom logic for extension detection, instead uses EXT_SUFFIX from sysconfig * add `--like` argument for local models * Ran black and isort to fix code formatting issues * pypdl -- parallel download * Regression tests to pymarian -- Other scripts * Added `convert-all-models.sh` : convert pytorch to marian .npz, convert .npz to .bin and creates directory structure compatible with pymarian-eval * Added `compare.sh` to compare metrics between original implementation and pymarian --- .dockerignore | 7 + .gitignore | 4 + CHANGELOG.md | 1 + azure-regression-tests.yml | 13 +- scripts/bleurt/bleurt2marian.py | 1 + scripts/metrics/.gitignore | 6 +- scripts/metrics/Dockerfile | 26 +- scripts/metrics/README.md | 53 +-- scripts/metrics/compare.sh | 121 +++--- scripts/metrics/convert-all-models.sh | 94 +++++ scripts/metrics/docker-run.sh | 20 - scripts/metrics/known-models.txt | 13 + scripts/metrics/marian-score.sh | 126 ------- scripts/metrics/requirements.txt | 8 + scripts/metrics/run.sh | 33 ++ scripts/metrics/setup.sh | 15 - src/CMakeLists.txt | 4 +- src/models/bleurt.h | 3 +- src/python/README.md | 88 ++--- src/python/pymarian/__init__.py | 129 ++++++- src/python/pymarian/__main__.py | 15 +- src/python/pymarian/constants.py | 28 -- src/python/pymarian/defaults.py | 40 ++ src/python/pymarian/eval.py | 264 +++++++++++++ src/python/pymarian/evaluate.py | 350 ------------------ src/python/pymarian/mtapi_server.py | 3 +- src/python/pymarian/pypdl/__init__.py | 1 + src/python/pymarian/pypdl/downloader.py | 97 +++++ src/python/pymarian/pypdl/main.py | 234 ++++++++++++ src/python/pymarian/pypdl/utils.py | 127 +++++++ src/python/pymarian/qtdemo.py | 3 +- src/python/pymarian/utils.py | 151 ++++++-- src/python/pyproject.toml | 7 +- src/python/setup.py | 46 +-- src/python/tests/{ => regression}/__init__.py | 0 .../tests/regression/test_pymarian_eval.py | 91 +++++ .../tests/{ => regression}/test_train.py | 16 +- src/python/tests/regression/test_translate.py | 35 ++ src/python/tests/test_evaluate.py | 148 -------- src/python/tests/test_translate.py | 16 - 40 files changed, 1514 insertions(+), 923 deletions(-) create mode 100644 .dockerignore mode change 100644 => 100755 scripts/bleurt/bleurt2marian.py create mode 100755 scripts/metrics/convert-all-models.sh delete mode 100755 scripts/metrics/docker-run.sh create mode 100644 scripts/metrics/known-models.txt delete mode 100755 scripts/metrics/marian-score.sh create mode 100644 scripts/metrics/requirements.txt create mode 100644 scripts/metrics/run.sh delete mode 100755 scripts/metrics/setup.sh delete mode 100644 src/python/pymarian/constants.py create mode 100644 src/python/pymarian/defaults.py create mode 100755 src/python/pymarian/eval.py delete mode 100755 src/python/pymarian/evaluate.py create mode 100644 src/python/pymarian/pypdl/__init__.py create mode 100644 src/python/pymarian/pypdl/downloader.py create mode 100644 src/python/pymarian/pypdl/main.py create mode 100644 src/python/pymarian/pypdl/utils.py rename src/python/tests/{ => regression}/__init__.py (100%) create mode 100644 src/python/tests/regression/test_pymarian_eval.py rename src/python/tests/{ => regression}/test_train.py (89%) create mode 100644 src/python/tests/regression/test_translate.py delete mode 100644 src/python/tests/test_evaluate.py delete mode 100644 src/python/tests/test_translate.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..57d59853c --- /dev/null +++ b/.dockerignore @@ -0,0 +1,7 @@ +/regression-tests +/build* +/.pytest_cache 
+/.vscode +/dist +/doc +.history* diff --git a/.gitignore b/.gitignore index a55d45a39..736424f85 100644 --- a/.gitignore +++ b/.gitignore @@ -69,3 +69,7 @@ examples/mnist/*ubyte *.whl *.egg-info src/python/pymarian/_version.py +src/python/tests/data +__pycache__ +.pytest_cache + diff --git a/CHANGELOG.md b/CHANGELOG.md index 918deb710..9412de3a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Fixed compilation with clang 16.0.6 ### Added +- Added `pymarian-eval`, CLI for scoring metrics - Added `--input-reorder pos1 pos2` option to re-ordering inputs internally when reading in batches. This is mostly a model property. - Added `pymarian`: python bindings based on pybind11 - Added implementation of COMET-KIWI diff --git a/azure-regression-tests.yml b/azure-regression-tests.yml index 206c018a1..fb8a06f4e 100644 --- a/azure-regression-tests.yml +++ b/azure-regression-tests.yml @@ -42,7 +42,7 @@ stages: sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 sudo update-alternatives --set python3 /usr/bin/python3.8 sudo apt-get install -y python3-pip - python3 -m pip install --upgrade Cython + python3 -m pip install --upgrade Cython pip displayName: Clean and install packages # Collect details about CPU and GPU. @@ -105,7 +105,8 @@ stages: -DCOMPILE_SERVER=on \ -DCOMPILE_TESTS=on \ -DCOMPILE_MAXWELL=on -DCOMPILE_PASCAL=off -DCOMPILE_VOLTA=off -DCOMPILE_TURING=off -DCOMPILE_AMPERE=off -DCOMPILE_AMPERE_RTX=off \ - -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-11.1 + -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-11.1 \ + -DPYMARIAN=on -DUSE_TCMALLOC=off -DPYTHON_EXECUTABLE=python3 displayName: Configure CMake - bash: make -j5 @@ -141,6 +142,14 @@ stages: displayName: Collect outputs workingDirectory: regression-tests + - bash: | + python3 -m pip install build/pymarian-*.whl + python3 -m pymarian -v + python3 -m pip install pytest + python3 -m pytest src/python/tests/regression + displayName: Pymarian Install and Test + + - publish: regression-tests-ci-public_linux-x64-static_cuda_m60.zip artifact: regression-tests-ci-public_linux-x64-static_cuda_m60 displayName: Publish outputs diff --git a/scripts/bleurt/bleurt2marian.py b/scripts/bleurt/bleurt2marian.py old mode 100644 new mode 100755 index 25aa8206f..f02d3a833 --- a/scripts/bleurt/bleurt2marian.py +++ b/scripts/bleurt/bleurt2marian.py @@ -57,6 +57,7 @@ def load_bleurt_model(): config["bert-type-vocab-size"] = 2 config["comet-prepend-zero"] = True config["input-join-fields"] = True +config["input-reorder"] = [1, 0] # bleurt expects ref < hyp order while embedding, we are providing hyp < ref, hence the reordering config["version"] = "bleurt2marian.py conversion" config["enc-depth"] = 0 diff --git a/scripts/metrics/.gitignore b/scripts/metrics/.gitignore index 5d66dfcd9..0ab29db58 100644 --- a/scripts/metrics/.gitignore +++ b/scripts/metrics/.gitignore @@ -1,2 +1,4 @@ -bins/ -tmp.* \ No newline at end of file +/bins +tmp.* +/workspace +/marian-metric \ No newline at end of file diff --git a/scripts/metrics/Dockerfile b/scripts/metrics/Dockerfile index 25a3236a9..995586219 100644 --- a/scripts/metrics/Dockerfile +++ b/scripts/metrics/Dockerfile @@ -1,10 +1,13 @@ -FROM mcr.microsoft.com/azureml/minimal-ubuntu20.04-py38-cuda11.6.2-gpu-inference:20231102.v2 +# syntax = docker/dockerfile:experimental +FROM mcr.microsoft.com/azureml/minimal-ubuntu22.04-py39-cuda11.8-gpu-inference:20240205.v2 # use this if microsoft image is not accessible; #FROM 
nvidia/cuda:11.1.1-devel-ubuntu20.04 -LABEL description="Marian image - Ubuntu 20.04" +LABEL description="Marian image - Ubuntu 22.04" + +# required for microsoft cr image +USER root ARG DEBIAN_FRONTEND=noninteractive -ARG NCPU=24 ARG MARIAN_REPO="https://github.com/marian-nmt/marian-dev" ARG MARIAN_BRANCH=master @@ -18,9 +21,10 @@ RUN ln -sf /usr/bin/python3 /usr/bin/python && \ # install unbabel-comet (requires pytorch) and bleurt (requires tensorflow and cudnn) # note: unabel-comet 2.x is broken use 1.x. requires numpy < 1.24 + #&& pip install torch==1.13.1+cu117 -f https://download.pytorch.org/whl/torch_stable.html \ RUN pip install --upgrade pip \ - && pip install torch==1.13.1+cu117 -f https://download.pytorch.org/whl/torch_stable.html \ - && pip install sacrebleu unbabel-comet==1.1.3 numpy==1.23.5 nvidia-cudnn-cu11==8.6.0.163 git+https://github.com/google-research/bleurt.git \ + && pip install torch==2.1.2 --index-url https://download.pytorch.org/whl/cu118 \ + && pip install sacrebleu unbabel-comet==2.2.1 git+https://github.com/google-research/bleurt.git \ && rm -rf ~/.cache/pip/ # Install sentencepiece @@ -38,7 +42,11 @@ RUN pip3 uninstall -y sentencepiece && \ cd ../../.. && \ rm -rf src -RUN git clone -b ${MARIAN_BRANCH} ${MARIAN_REPO} /marian \ - && mkdir /marian/build && cd /marian/build \ - && cmake .. -DUSE_MPI=on -DUSE_STATIC_LIBS=off -DCOMPILE_PASCAL=on -DCOMPILE_VOLTA=on -DCOMPILE_AMPERE=off -DBUILD_ARCH=x86-64 -DCOMPILE_AVX512=off \ - && make -j $NCPU && cp -v marian spm_encode spm_decode /usr/bin/ \ +# add source repo (sans the .dockerignore files) +ADD . /marian-dev +# RUN git clone -b ${MARIAN_BRANCH} ${MARIAN_REPO} /marian \ + +RUN --mount=type=cache,target=/marian-dev/build mkdir -p /marian-dev/build && cd /marian-dev/build \ + && cmake .. -DUSE_MPI=on -DUSE_STATIC_LIBS=on -DCOMPILE_PASCAL=on -DCOMPILE_VOLTA=on -DCOMPILE_AMPERE=off -DBUILD_ARCH=x86-64 -DCOMPILE_AVX512=off -DPYMARIAN=on \ + && make -j && cp -v marian spm_encode spm_decode /usr/bin/ \ + && pip install -v pymarian-*.whl diff --git a/scripts/metrics/README.md b/scripts/metrics/README.md index 4d04c20b7..3148d3180 100644 --- a/scripts/metrics/README.md +++ b/scripts/metrics/README.md @@ -1,36 +1,41 @@ -# Marian Evaluate +# Marian Metrics + The main script is `compare.sh`, however it needs to be run in an environment where all three -- marian, unbabel-comet(pytorch), and bleurt(tensorflow) are available. -Hence, 1) we create a docker container with all the necessary libs. - and 2) run compare.sh inside the docker environment +Hence we create a new python environment using conda to run comparisons. -## Setup: build docker image +## Setup ```bash -./setup.sh +./run.sh +``` +This setups a conda environment named `metrics` which will have all the necessary requirements, except pymarian-eval, which you will have to install based on your CMAKE settings +```bash +# from the root dir of this repository +conda activate metrics +mkdir build; cd build +cmake .. -DPYMARIAN=on #.. other flags +pip install pymarian-*.whl ``` -## Run compare.sh in docker container +## Run Compare.sh ```bash -./docker-run.sh + +# option 1: +./run.sh + +# option 2 +conda activate metrics +bash compare.sh ``` -The `docker-run.sh` script mounts cache directory from the host to container. -The necessary files (weights and vocabularies) will be automatically downloaded and cached for unbabel-comet and Bleurt metrics. -However, for `marian-score.sh` expects the cache to be prepared under `$HOME/.cache/marian/metrics`. 
-The structure/format of the cache directory for marian-score.sh looks as follows: + +This script produces reports at `workspace/*.report.txt`, which shows average difference segment level scores between original implementation and `pymarian-eval` + +## Convert Metrics Weights to Marian format + ```bash -/home/$USER/.cache/marian/metrics/ -├── bleurt20-ref -│ ├── bleurt-20.model.npz -│ ├── bleurt.vocab.spm -├── comet20-da-src -│ ├── comet20-qe-da.model.npz -│ └── roberta.vocab.spm -└── comet20-da-src+ref - ├── comet20-da.model.npz - └── roberta.vocab.spm +conda activate metrics +MARIAN=../build/marian ./convert-all-models.sh ``` -Each metric subdir should have a `*model.npz` and a `*vocab.spm` files, and the name of metric directory should end with `-src|-qe|-ref|-src+ref` suffix to indicate the category of metric. - -> TODO: Upload Marian compatible comet and bleurt models to public blob storage and modify script to automatically download +To add a new model ID, edit `known-models.txt` file in the same directory as this README diff --git a/scripts/metrics/compare.sh b/scripts/metrics/compare.sh index 902258863..3d3799f5c 100755 --- a/scripts/metrics/compare.sh +++ b/scripts/metrics/compare.sh @@ -1,12 +1,39 @@ #!/usr/bin/env bash + +# This script compares the scores produced by +# original implementation (unbabel-score or BLEURT) and Marian NMT (pymarian-eval). + + MYDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -export PATH=$MYDIR:$PATH +OUT_DIR=$MYDIR/workspace +REGEN_ORIG=0 # 1 : clear and regenerate original scores. 0: reuse previous scores +REGEN_MARIAN=0 # 1 : to clear and regenerate marian scores (recommended). 0: reuse / resume from previous scores + +DEVICES=0 +cd $MYDIR +export CUDA_VISIBLE_DEVICES=0 + +# add source to python path to test changes before installing +# export PYTHONPATH=$(cd $MYDIR/../../src/python && pwd) log() { echo -e "\e[1;32m[$(date '+%Y-%m-%d %H:%M:%S')]\e[0m $@" >&2 } +for tool in comet-score pymarian-eval; do + which $tool > /dev/null || { + log "ERROR: $tool not found in PATH" + exit 1 + } +done + + +METRIC_NAMES=$(cat $MYDIR/known-models.txt | grep -v '^#' | awk '{print $1}') +# exclude xxl, they require more memory +METRIC_NAMES=$(grep -v -i '\-xxl\|xcomet' <<< $METRIC_NAMES) + get_sacrebleu_names(){ + set -eu # using sacrebleu to get the list of systems testset=wmt21/systems while read line; do @@ -14,7 +41,7 @@ get_sacrebleu_names(){ refs=() mts=() while read name; do - # skip if name starts with $pair or src or docid + # skip if name starts with $pair or src or docidq if [[ $name == $pair* || $name == src || $name == docid || $name == origlang ]]; then continue fi @@ -29,12 +56,15 @@ get_sacrebleu_names(){ for ref in ${refs[@]}; do for mt in ${mts[@]}; do echo -e "$testset\t$pair\t$ref\t$mt" + break # limit to one per lang pair done + break # limit to one per lang pair done done < <(sacrebleu -t $testset --list) } unbabel_score(){ + set -eu local metric=$1 local prefix=$2 log "Running $metric" @@ -45,6 +75,7 @@ unbabel_score(){ bleurt_score() { + set -eu local metric_name=$1 local prefix=$2 [[ $metric_name == "BLEURT-20" ]] || { @@ -63,54 +94,60 @@ bleurt_score() { # to check if cuda libs are configured and GPU is available # python -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))" - export LD_LIBRARY_PATH=/usr/local/lib/python3.8/dist-packages/nvidia/cudnn/lib/:$LD_LIBRARY_PATH + #export LD_LIBRARY_PATH=/usr/local/lib/python3.8/dist-packages/nvidia/cudnn/lib/:$LD_LIBRARY_PATH python -m 
bleurt.score_files --bleurt_checkpoint=$metric_path \ --candidate_file=$prefix.mt --reference_file=$prefix.ref \ --bleurt_batch_size 64 2> /dev/null } -marian_score() { - local metric=$1 - local prefix=$2 - case $metric in - wmt20-comet-qe-da) metric="comet20-da-src" ;; - wmt20-comet-da) metric="comet20-da-src+ref" ;; - BLEURT-20) metric="bleurt20-ref" ;; - *) echo "Unknown metric $metric"; exit 1;; - esac - marian-score.sh -d '0' -n $metric --src $prefix.src --ref $prefix.ref --mt $prefix.mt --seg -} +MAX_TESTS=10 +MAX_LINES=100 # in each testset +mkdir -p $OUT_DIR + +while IFS=$'\t' read tset pair ref mt; do + data=$(sacrebleu -t $tset -l $pair --echo src ref $mt) + prefix=$OUT_DIR/${tset//\//-}.$pair.$MAX_LINES + + [[ -s $prefix.src ]] || cut -f1 <<< "$data" | head -n $MAX_LINES > $prefix.src + [[ -s $prefix.ref ]] || cut -f2 <<< "$data" | head -n $MAX_LINES > $prefix.ref + [[ -s $prefix.mt ]] || cut -f3 <<< "$data" | head -n $MAX_LINES > $prefix.mt + + report_file=$prefix.report.txt + echo "####$(date '+%Y-%m-%d %H:%M:%S') :: $(pymarian-eval -V) :: Avg diffs" | tee -a $report_file -main() { - cd $MYDIR - local metric_names=(BLEURT-20 wmt20-comet-da wmt20-comet-qe-da) - export CUDA_VISIBLE_DEVICES=0 - local max_tests=10 - local max_lines=100 # in each testset - while IFS=$'\t' read tset pair ref mt; do - for mn in ${metric_names[@]}; do - log "Comparing >> $mn << on $tset $pair $ref $mt" - local data=$(sacrebleu -t $tset -l $pair --echo src ref $mt) - local tmp_pref=tmp.testset - rm -rf $tmp_pref.{src,ref,mt} - cut -f1 <<< "$data" | head -n $max_lines > $tmp_pref.src - cut -f2 <<< "$data" | head -n $max_lines > $tmp_pref.ref - cut -f3 <<< "$data" | head -n $max_lines > $tmp_pref.mt + for mn in ${METRIC_NAMES[@]}; do + log "Comparing >> $mn << on $tset $pair $ref $mt" + metric_id=$(basename $mn | tr '[:upper:]' '[:lower:]') + score_pref=$prefix.$metric_id + orig_file=$score_pref.orig + if [[ ! -s $orig_file || $REGEN_ORIG -eq 1 ]]; then + rm -f $score_pref # cleanup + log "Generating original scores for $mn :: $prefix" if [[ $mn =~ BLEURT* ]]; then - local orig_out=$(bleurt_score $mn $tmp_pref) + bleurt_score $mn $prefix > $orig_file else - local orig_out=$(unbabel_score $mn $tmp_pref 2> /dev/null) + unbabel_score $mn $prefix 2> /dev/null > $orig_file fi - local marian_out=$(marian_score $mn $tmp_pref) - paste <(echo "$marian_out") <(echo "$orig_out") \ - | awk -F '\t' -v OFS='\t' -v mn=$mn \ - 'BEGIN {tot=0.0} {diff=sqrt(($1-$2)^2); tot+=diff; print diff,$0} - END {printf "\n===Avg diff in %s: %f===\n\n", mn, tot/NR}' - #TODO1: extract averages and write to a report file - #TODO2: benchmark speeds - done - done < <(get_sacrebleu_names | head -n $max_tests) -} + fi + + out_file=$score_pref.pymarian + if [[ ! 
-s $out_file || $REGEN_MARIAN -eq 1 ]]; then + rm -f $out_file $out_file.log # cleanup + log "Generating Marian scores for $mn :: $prefix" + pymarian-eval -d $DEVICES -m $(basename $mn) -s $prefix.src -r $prefix.ref -t $prefix.mt -a skip --fp16 --debug > $out_file 2> $out_file.log || { + log "ERROR: Failed to generate scores for $mn" + cat $out_file.log + continue + } + fi + + # compute diffs + paste $out_file $orig_file \ + | awk -F '\t' -v OFS='\t' -v mn=$mn -v of=$out_file.diff 'BEGIN {tot=0.0} + {$2 = +sprintf("%.4f", $2); diff=sqrt(($1-$2)^2); tot+=diff; print diff, $0 > of} + END {printf "%s\t%f\n", mn, tot/NR}' | tee -a $report_file + done +done < <(get_sacrebleu_names | head -n $MAX_TESTS) -main "$@" \ No newline at end of file +cat $OUT_DIR/*.report.txt #| column -t diff --git a/scripts/metrics/convert-all-models.sh b/scripts/metrics/convert-all-models.sh new file mode 100755 index 000000000..29fe72ff5 --- /dev/null +++ b/scripts/metrics/convert-all-models.sh @@ -0,0 +1,94 @@ +#!/usr/bin/env bash +set -eu +MYDIR=$(cd $(dirname ${BASH_SOURCE[0]}) && pwd) +SCRIPTS=$(cd $MYDIR/.. && pwd) + +: " +This script converts all metrics models to Marian format (.npz) and converts them to memory maps (.bin). +This script expects comet2marian.py and bleurt2marian.py +The script also expects marian binary to be in PATH or set as MARIAN environment variable. + +Pre-requisites: + pip install unbabel-comet +Optionally, you may need to configure huggingface transformers, + specifically, hf-login for models that reqire login (e.g., wmt22-cometkiwi-da). + +To run bleurt2marian, install bleurt-pytorch package: + pip install git+https://github.com/lucadiliello/bleurt-pytorch.git +" + +OUT_DIR=${1:-$MYDIR/marian-metric} # NOTE: manually copy this to /mnt/textmt/www/marian/metric +COMET2MARIAN=$SCRIPTS/comet/comet2marian.py +BLEURT2MARIAN=$SCRIPTS/bleurt/bleurt2marian.py +MARIAN=${MARIAN:-} + +# locate marian binary +if [[ -z "$MARIAN" ]]; then + if [[ -f $SCRIPTS/../build/marian ]]; then + MARIAN=$SCRIPTS/../build/marian + elif which marian > /dev/null; then + MARIAN=$(which marian) + fi +fi +if [[ -z "$MARIAN" || ! -e $MARIAN ]]; then + echo -e "Error: marian binary not found." \ + "\n Option 1) export MARIAN=path/to/marian" \ + "\n Option 2) make sudo /build/marian exists" \ + "\n Option 2) add marian binary to PATH" >&2 + exit 1 +fi + +if [[ ! -f $COMET2MARIAN ]]; then + echo "comet2marian.py not found at $COMET2MARIAN"; exit 2 +fi +if [[ ! -f $BLEURT2MARIAN ]]; then + echo "bleurt2marian.py not found at $BLEURT2MARIAN"; exit 2 +fi + +MODEL_IDS=$(cat $MYDIR/known-models.txt | grep -v '^#' | awk '{print $1}') + + +######## convert to marian ######### +for model_id in ${MODEL_IDS[@]}; do + # lowercase model name + model_name=$(basename $model_id | tr '[:upper:]' '[:lower:]') + model_dir=$OUT_DIR/$model_name + ok_flag=$model_dir/._OK + if [[ -f $ok_flag ]]; then + echo "$model_id already exists at $model_dir, skipping." >&2 + continue + fi + echo "Creating $model_dir" + mkdir -p $model_dir + npz_file=$model_dir/model.$model_name.npz + bin_file=${npz_file%.npz}.bin + + # step 1 create .npz file + if [[ ! -f $npz_file || ! 
-f $npz_file.md5 ]]; then + CONVERT="" + if [[ $model_id =~ BLEURT ]]; then + # only one BLEURT model supported, so it does not take model ID + CONVERT="$BLEURT2MARIAN" + else + CONVERT="$COMET2MARIAN -c $model_id" + fi + rm -f $npz_file $npz_file.md5 # remove incomplete files + ${CONVERT} -m $npz_file --spm $model_dir/vocab.spm \ + || { echo "Error: failed to convert $model_id to Marian format" >&2; exit 3; } + md5sum $npz_file | awk '{print $1}' > $npz_file.md5 + fi + + # Step 2: convert to memory map + if [[ ! -f $bin_file || ! -f $bin_file.md5 ]]; then + echo "Convert $npz_file --> $bin_file" + rm -f $bin_file $bin_file.md5 # remove incomplete files + $MARIAN convert -f $npz_file -t $bin_file || { + echo "Error: failed to convert $npz_file to memory map" >&2; exit 4; + } + md5sum $bin_file | awk '{print $1}' > $bin_file.md5 + fi + touch $ok_flag +done + +# NOTE: only update the new/changed models +#cp -r $OUT_DIR/* /mnt/textmt/www/marian/metric \ No newline at end of file diff --git a/scripts/metrics/docker-run.sh b/scripts/metrics/docker-run.sh deleted file mode 100755 index c379c4415..000000000 --- a/scripts/metrics/docker-run.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env bash -MYDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -cd $MYDIR - -IMAGE="marian-dev" - -VISIBLE_GPUS="1" # exlcude 0 for now; run on single GPU - -MOUNTS="-v $PWD:$PWD" -for cache in .sacrebleu .cache/{marian,torch,huggingface,bleurt}; do - MOUNTS+=" -v $HOME/$cache:/root/$cache" -done - - -cmd="docker run --rm -i $MOUNTS --gpus "\"device=$VISIBLE_GPUS\"" -t $IMAGE" - -# uncomment for an interactive shell -# $cmd bash - -$cmd $PWD/compare.sh $@ diff --git a/scripts/metrics/known-models.txt b/scripts/metrics/known-models.txt new file mode 100644 index 000000000..7b7307cef --- /dev/null +++ b/scripts/metrics/known-models.txt @@ -0,0 +1,13 @@ +BLEURT-20 +wmt20-comet-qe-da +wmt20-comet-qe-da-v2 +wmt20-comet-da +wmt21-comet-qe-mqm +wmt21-comet-qe-da +wmt21-comet-da +Unbabel/wmt22-comet-da +Unbabel/wmt22-cometkiwi-da +Unbabel/wmt23-cometkiwi-da-xl +Unbabel/wmt23-cometkiwi-da-xxl +Unbabel/XCOMET-XL +Unbabel/XCOMET-XXL \ No newline at end of file diff --git a/scripts/metrics/marian-score.sh b/scripts/metrics/marian-score.sh deleted file mode 100755 index 873ef5921..000000000 --- a/scripts/metrics/marian-score.sh +++ /dev/null @@ -1,126 +0,0 @@ -#!/usr/bin/env bash -set -eu - -MYDIR=$(realpath $(dirname ${BASH_SOURCE[0]})) - - -METRICS_CACHE=$HOME/.cache/marian/metrics - -log() { - echo -e "[$(date '+%Y-%m-%d %H:%M:%S')] $@" >&2 -} - -which marian > /dev/null || { - log "marian not found in PATH. Please add marian binary to \$PATH and rerun" - exit 2 -} - -metric_name= -src_file= -ref_file= -hyp_file= -is_seg= -debug_mode= -batch_size=32 -pool_size=10 -max_length=256 -devices=0 -workspace=-4000 - -usage() { - log " ${BASH_SOURCE##*/} -n METRIC -m HYP [-s SRC] [-r REF] [-d DEVICES] [--seg] [--debug] [-h|--help] - -Args: - -n|--name|--metric NAME Metric name; required. See below for details. - -m|--mt|--hyp FILE MT hypothesis, required for all metrics. - -s|--src FILE Source file, required for source based metrics. - -r|--ref FILE Reference file, required for reference based metrics. - -d|--devices DEV IDs of GPU devices to use. Use quoted string to pass multiple values. Default: '$devices' - --seg Output segment-level scores. 
Default: print only the corpus-level score (mean of segment scores) - --debug Enable verbose mode (default is quiet) - -h|--help Print this help message - -Metric name (-n|--name) shuld be a subdir name under $METRICS_CACHE. -The metric name should have a suffix (-src|-qe|-ref|-src+ref) indicating the type of metric: - *-src|*-qe Source-based metric and requires --src arg, e.g., comet20-src or comet20-da-qe - *-ref Reference-based metric and requires --ref arg, e.g., bleurt20-ref - *-src+ref Both source and reference based and requires --src and --ref args e.g., comet20-src+ref -" -} - -while [[ $# -gt 0 ]]; do - case $1 in - -s|--src) src_file=$2; shift 2;; - -r|--ref) ref_file=$2; shift 2;; - -m|--mt|--hyp) hyp_file=$2; shift 2;; - -n|--name|--metric) metric_name=$2; shift 2;; - -d|--devices) devices=$2; shift 2;; - --seg) is_seg=1; shift 1;; - --debug) debug_mode=1; shift 1;; - -h|--help) usage; exit 0;; - *) log "ERROR: unknown option $1"; usage; exit 1;; - esac -done - -[[ -n $metric_name ]] || { log "ERROR: metric_name=$metric_name name not provided"; usage; exit 1; } -[[ -e $hyp_file ]] || { log "ERROR: hyp file not provided"; usage; exit 1; } - -metric_dir=$METRICS_CACHE/$metric_name -checkpoint=$(echo $metric_dir/*model.npz) # file model.npz or .model.npz -vocab=$(echo $metric_dir/*vocab.spm) -[[ -f $checkpoint && -f $vocab ]] || { - log "ERROR: metric $metric_name is not valid. See ls $METRICS_CACHE/$metric_name/{*model.npz,*vocab.spm}" - exit 1 -} - -# args common to all models -cmd="marian evaluate -w -4000" -[[ -n $devices ]] && cmd+=" -d $devices" -[[ -n $debug_mode ]] || cmd+=" --quiet" -cmd+=" -m $checkpoint --max-length $max_length --max-length-crop --mini-batch $batch_size --maxi-batch $pool_size -t stdin --tsv" -input= # to be filled later - - -check_file(){ - local name=$1 - local file=$2 - [[ -e $file ]] || { log "ERROR: $name file $file does not exist"; exit 1; } - [[ -s $file ]] || { log "ERROR: $name file $file is empty"; exit 1; } -} - -metric_type=${metric_name##*-} # suffix expected: src, ref, src+ref -case $metric_type in - src|qe) - # two sequences: src, hyp - check_file src $src_file - cmd+=" --like comet-qe -v $vocab $vocab" - input="paste $src_file $hyp_file" - ;; - ref) - check_file ref $ref_file - # two sequences: ref, hyp - cmd+=" --like bleurt -v $vocab $vocab" - input="paste $ref_file $hyp_file" - ;; - src+ref) - # three sequences: src, hyp, ref; three vocabularies - check_file src $src_file - check_file ref $ref_file - cmd+=" --like comet -v $vocab $vocab $vocab" - input="paste $src_file $hyp_file $ref_file" - ;; - *) - log "ERROR: $metric_name is not valid. 
Valid metrics have suffix '-{src|qe|ref|src+ref}'" - exit 3 - ;; -esac - -if [[ -z $is_seg ]]; then - cmd+=" --average only"; -fi -pipeline="$input | $cmd | cut -f1 -d' '" - -# mean (default) or segment-level scores - -log "Running: $pipeline" -eval $pipeline diff --git a/scripts/metrics/requirements.txt b/scripts/metrics/requirements.txt new file mode 100644 index 000000000..e9128e631 --- /dev/null +++ b/scripts/metrics/requirements.txt @@ -0,0 +1,8 @@ +# assume it downloads the correct pytorch +unbabel-comet==2.2.1 +sacrebleu +# this is the original bleurt; used for comparing scores +git+https://github.com/google-research/bleurt.git +# this is the pytorch version of bleurt; used in blert2marian +git+https://github.com/lucadiliello/bleurt-pytorch.git +huggingface_hub[cli] # required for login to hf to authenticate private models \ No newline at end of file diff --git a/scripts/metrics/run.sh b/scripts/metrics/run.sh new file mode 100644 index 000000000..7f067e013 --- /dev/null +++ b/scripts/metrics/run.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash +set -eu +MYDIR=$(dirname ${BASH_SOURCE[0]}) +cd $MYDIR + +ENV_NAME=metrics +which conda > /dev/null || (echo "conda not found" && exit 1) +# conda functions are not exported in non-interactive shell, so we source conda.sh +CONDA_BASE=$(conda info --base) +source $CONDA_BASE/etc/profile.d/conda.sh +FOUND="$(conda env list | awk -v name=$ENV_NAME '$1==name { print $1 }')" + +log() { + echo -e "\e[32m$@\e[0m" >&2 +} +#### SETUP ######### +if [[ -z "$FOUND" ]]; then + log "Creating conda environment $ENV_NAME" + # create conda environment and install requirements + conda create -n $ENV_NAME python=3.10 + conda activate $ENV_NAME + log "Installing requirements" + pip install -r $MYDIR/requirements.txt +else + log "Activating conda environment $ENV_NAME" + conda activate $ENV_NAME +fi + +which pymarian-eval > /dev/null || ( + echo "pymarian-eval not found. Please install and return" && exit 1 ) + +##################### +bash ./compare.sh \ No newline at end of file diff --git a/scripts/metrics/setup.sh b/scripts/metrics/setup.sh deleted file mode 100755 index df16563a6..000000000 --- a/scripts/metrics/setup.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env bash -MYDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -cd $MYDIR - -#SSH_KEY=$HOME/.ssh/id_rsa # for git clone inside docker build -IMAGE=marian-dev -echo "Building docker image $IMAGE" -#DOCKER_BUILDKIT=1 docker build --ssh default=$SSH_KEY . -f Dockerfile -t $IMAGE -DOCKER_BUILDKIT=1 docker build . -f Dockerfile -t $IMAGE - - -# Optional build args: -# --build-arg MARIAN_COMMIT=master \ -# --build-arg MARIAN_REPO=https://github.com/marian-nmt/marian-dev.git \ -# --build-arg NCPUS=16 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c40eabc76..c0c4f74b9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -307,9 +307,9 @@ if(PYMARIAN) install(TARGETS _pymarian DESTINATION .) 
# build pymarian wheel - add_custom_target(pymarian ALL + add_custom_target(pymarian ALL ${CMAKE_COMMAND} -E env "CMAKE_BINARY_DIR=${PROJECT_BINARY_DIR}" "CMAKE_SOURCE_DIR=${PROJECT_SOURCE_DIR}" "${PYTHON_EXECUTABLE}" -m pip wheel -v --no-input ${PROJECT_SOURCE_DIR}/src/python -w "${PROJECT_BINARY_DIR}" - DEPENDS _pymarian + DEPENDS _pymarian VERBATIM COMMENT "Building pymarian wheel") endif(PYMARIAN) diff --git a/src/models/bleurt.h b/src/models/bleurt.h index 844f94609..99cf7f7a1 100644 --- a/src/models/bleurt.h +++ b/src/models/bleurt.h @@ -70,7 +70,7 @@ struct BleurtEncoder final : public nn::TransformerEncoder { auto output = marian::nn::swapTimeBatch(input); // [beam depth=1, batch size, max length, vector dim] auto binaryMask = marian::nn::swapTimeBatch(mask); // [beam depth=1, batch size, max length, vector dim=1] - + // apply positional embeddings to contextual input output = positionEmbedding->apply(output); @@ -194,7 +194,6 @@ class BleurtPooler final : public nn::LayerWithOptions, auto modelType = LayerWithOptions::opt("type"); auto emb = slice(encoderStates[0]->getContext(), -2, 0); - emb = marian::cast(emb, Type::float32); Expr output; if(LayerWithOptions::opt("usage") == (int)models::usage::evaluating) { diff --git a/src/python/README.md b/src/python/README.md index f8f00bdc5..d3fc34e25 100644 --- a/src/python/README.md +++ b/src/python/README.md @@ -7,10 +7,6 @@ ## Install ```bash -# get source code -git clone https://github.com/marian-nmt/marian-dev -cd marian-dev - # build marian with -DPYMARIAN=on option to create a pymarian wheel cmake . -Bbuild -DCOMPILE_CUDA=off -DPYMARIAN=on -DCMAKE_BUILD_TYPE=Release cmake --build build -j # -j option parallelizes build on all cpu cores @@ -59,42 +55,50 @@ for score in scores: . `pymarian-qtdemo` : GUI App demo powered by QT -### `pymarian-evaluate` +### `pymarian-eval` ```bash -$ pymarian-evaluate -h -usage: pymarian-evaluate [-h] [-m MODEL] [--stdin] [-t MT_FILE] [-s SRC_FILE] [-r REF_FILE] [-o OUT] [-a {skip,append,only}] [-w WIDTH] [--debug] [--mini-batch MINI_BATCH] [-d [DEVICES ...] | -c - CPU_THREADS] [-ws WORKSPACE] [--backend {subprocess,pymarian}] +$ pymarian-eval -h +usage: pymarian-eval [-h] [-m MODEL] [-v VOCAB] [-l {comet-qe,bleurt,comet}] [-V] [-] [-t MT_FILE] [-s SRC_FILE] [-r REF_FILE] [-f FIELD [FIELD ...]] [-o OUT] [-a {skip,append,only}] [-w WIDTH] [--debug] [--fp16] [--mini-batch MINI_BATCH] [-d [DEVICES ...] | -c + CPU_THREADS] [-ws WORKSPACE] [-pc] options: -h, --help show this help message and exit -m MODEL, --model MODEL - Model name, or path. Known models=['cometoid22-wmt21', 'cometoid22-wmt22', 'cometoid22-wmt23', 'chrfoid-wmt23', 'comet20-da-qe', 'bleurt20', 'comet20-da'] (default: - cometoid22-wmt22) - --stdin Read input from stdin. TSV file with following format: QE metrics: "srcmt", Comet with ref: "srcref; or BLEURT: "refmt" (default: False) + Model name, or path. Known models: bleurt-20, wmt20-comet-da, wmt20-comet-qe-da, wmt20-comet-qe-da-v2, wmt21-comet-da, wmt21-comet-qe-da, wmt21-comet-qe-mqm, wmt22-comet-da, wmt22-cometkiwi-da, xcomet-xl, xcomet-xxL (default: wmt22-cometkiwi-da) + -v VOCAB, --vocab VOCAB + Vocabulary file (default: None) + -l {comet-qe,bleurt,comet}, --like {comet-qe,bleurt,comet} + Model type. Required if --model is a local file (auto inferred for known models) (default: None) + -V, --version show program's version number and exit + -, --stdin Read input from stdin. 
TSV file with following format: QE metrics: "srcmt", Ref based metrics ref: "srcmtref" or "mtref" (default: False) -t MT_FILE, --mt MT_FILE - MT output file. Ignored when --stdin. (default: None) + MT output file. Ignored when --stdin (default: None) -s SRC_FILE, --src SRC_FILE Source file. Ignored when --stdin (default: None) -r REF_FILE, --ref REF_FILE Ref file. Ignored when --stdin (default: None) - -o OUT, --out OUT output file. Default stdout (default: <_io.TextIOWrapper name='' mode='w' encoding='utf-8'>) + -f FIELD [FIELD ...], --fields FIELD [FIELD ...] + Input fields, an ordered sequence of {src, mt, ref} (default: ['src', 'mt', 'ref']) + -o OUT, --out OUT output file (default: <_io.TextIOWrapper name='' mode='w' encoding='utf-8'>) -a {skip,append,only}, --average {skip,append,only} - Average segment scores to produce system score. skip=do not output average (default; segment scores only); append=append average at the end; only=output the average only - (i.e system score only) (default: skip) + Average segment scores to produce system score. skip=do not output average (default; segment scores only); append=append average at the end; only=output the average only (i.e. system score only) (default: skip) -w WIDTH, --width WIDTH Output score width (default: 4) - --debug Verbose output (default: False) + --debug Debug or verbose mode (default: False) + --fp16 Enable FP16 mode (default: False) --mini-batch MINI_BATCH Mini-batch size (default: 16) -d [DEVICES ...], --devices [DEVICES ...] GPU device IDs (default: None) -c CPU_THREADS, --cpu-threads CPU_THREADS - Use CPU threads. 0=use gpu device 0 (default: None) + Use CPU threads. 0=use GPU device 0 (default: None) -ws WORKSPACE, --workspace WORKSPACE Workspace memory (default: 8000) - --backend {subprocess,pymarian} - Marian backend interface. subprocess looks for marian binary in PATH. pymarian is a pybind wrapper (default: pymarian) + -pc, --print-cmd Print marian evaluate command and exit (default: False) + +More info at https://github.com/marian-nmt/marian-dev. This CLI is loaded from .../python3.10/site-packages/pymarian/eval.py (version: 1.12.25) + ``` **Performance Tuning Tips**: @@ -104,33 +108,6 @@ options: * To see full logs from marian, set `--debug` -*Example Usage* -```bash -# download sample dataset -langs=en-ru -prefix=tmp.$langs -teset=wmt21/systems -sysname=Online-B -sacrebleu -t $teset -l $langs --echo src > $prefix.src -sacrebleu -t $teset -l $langs --echo ref > $prefix.ref -sacrebleu -t $teset -l $langs --echo $sysname > $prefix.mt - -# chrfoid -paste $prefix.{src,mt} | head | pymarian-evaluate --stdin -m chrfoid-wmt23 - -# cometoid22-wmt{21,22,23} -paste $prefix.{src,mt} | head | pymarian-evaluate --stdin -m cometoid22-wmt22 - -# bleurt20 -paste $prefix.{ref,mt} | head | pymarian-evaluate --stdin -m bleurt20 --debug - -# FIXME: comet20-da-qe and comet20-da appear to be broken -# comet20-da-qe -paste $prefix.{src,mt} | head | pymarian-evaluate --stdin -m comet20-da-qe -# comet20-da -paste $prefix.{src,mt,ref} | pymarian-evaluate -m comet20-da - -``` ### `pymarian-mtapi` @@ -156,23 +133,32 @@ curl $URL --header "Content-Type: application/json" --request POST --data '[{"te pymarian-qtdemo ``` +## Code Formatting + +```bash + +pip install black isort +isort . +black . 
+cd src/python +``` + ## Run Tests ```bash # install pytest if necessary -python -m pip install pytest +python -m pip install pytest # run tests in quiet mode -python -m pytest src/python/tests/ +python -m pytest src/python/tests/regression # or, add -s to see STDOUT/STDERR from tests -python -m pytest -s src/python/tests/ +python -m pytest -s src/python/tests/regression ``` - ## Known issues - + 1. In conda or mamba environment, if you see `.../miniconda3/envs//bin/../lib/libstdc++.so.6: version 'GLIBCXX_3.4.30' not found` error, install libstdcxx-ng diff --git a/src/python/pymarian/__init__.py b/src/python/pymarian/__init__.py index f08d00944..36011c203 100644 --- a/src/python/pymarian/__init__.py +++ b/src/python/pymarian/__init__.py @@ -1,6 +1,17 @@ +import logging +from itertools import islice +from pathlib import Path +from typing import Iterator, List, Optional, Tuple, Union + import _pymarian +import yaml + +# this log may be used by submodules, so we declare it here before submodule imports +log = logging.getLogger(__name__) +log.setLevel(logging.INFO) from ._version import __version__ +from .defaults import Defaults from .utils import kwargs_to_cli @@ -22,8 +33,122 @@ def __init__(self, cli_string='', **kwargs): """Initializes the evaluator :param kwargs: kwargs """ - cli_string += ' ' + kwargs_to_cli(**kwargs) - super().__init__(cli_string.strip()) + self._kwargs = kwargs + self._cli_string = (cli_string + ' ' + kwargs_to_cli(**kwargs)).strip() + super().__init__(self._cli_string) + self._config = yaml.safe_load(self.get_model_config()) + log.debug(f'Model config: {self._config}') + + @property + def model_type(self) -> str: + return self._config.get('type', None) + + @classmethod + def new( + cls, + model_file: Path, + vocab_file: Path = None, + devices: Optional[List[int]] = None, + width=Defaults.FLOAT_PRECISION, + mini_batch=Defaults.MINI_BATCH, + maxi_batch=Defaults.MAXI_BATCH, + like=Defaults.DEF_MODEL_TYPE, + workspace=Defaults.WORKSPACE, + max_length=Defaults.MAX_LENGTH, + cpu_threads=0, + average: str = Defaults.AVERAGE, + **kwargs, + ) -> Iterator[Union[float, Tuple[float, float]]]: + """A factory function to create an Evaluator with default values. + + :param model_file: path to model file + :param vocab_file: path to vocabulary file + :param devices: list of GPU devices to use (optional) + :param width: number of decimal places to have in output scores + :param mini_batch: mini-batch size + :param maxi_batch: maxi-batch size + :param like: marian metric model like + :param cpu_threads: number of CPU threads to use + :param: average: average segment scores to produce system score. + skip=do not output average (default; segment scores only); + append=append average at the end; + only=output the average only (i.e. 
system score only) + :return: iterator of scores + """ + + assert model_file.exists(), f'Model file {model_file} does not exist' + assert vocab_file.exists(), f'Vocab file {vocab_file} does not exist' + assert like in Defaults.MODEL_TYPES, f'Unknown model type: {like}' + n_inputs = len(Defaults.MODEL_TYPES[like]) + vocabs = [vocab_file] * n_inputs + if not kwargs: + kwargs = {} + kwargs.update( + model=model_file, + vocabs=vocabs, + devices=devices, + width=width, + like=like, + mini_batch=mini_batch, + maxi_batch=maxi_batch, + max_length=max_length, + max_length_crop=True, + workspace=workspace, # negative memory => relative to total memory + cpu_threads=cpu_threads, + average=average, + ) + if kwargs.pop('fp16'): + kwargs['fp16'] = '' # empty string for flag; i.e, "--fp16" and not "--fp16=true" + + # TODO: remove this when c++ bindings supports iterator + kwargs['average'] = 'skip' + return cls(**kwargs) + + def evaluate(self, input_lines: Iterator[str], average: str = 'skip', batch_size: Optional[int] = None): + """Evaluates the input lines and returns the scores + + This function creates mini batches in python and calls the C++ bindings to evaluate the input lines. + This is a workaround until the C++ bindings support iterator API. + + :param input_lines: iterator of input lines + :param average: average segment scores to produce system score. Options: + skip=do not output average (default; segment scores only); + append=append average at the end; + only=output the average only (i.e. system score only) + :param batch_size: batch size (optional; default=2*mini_batch*maxi_batch) + :return: iterator of scores + """ + assert average in ('skip', 'append', 'only') + lines = (line.rstrip('\r\n').split('\t') for line in input_lines) + if not batch_size: + mini_batch = self._kwargs.get('mini_batch', Defaults.MINI_BATCH) + maxi_batch = self._kwargs.get('maxi_batch', Defaults.MAXI_BATCH) + batch_size = 2 * mini_batch * maxi_batch + # Sending twice the batch size to avoid starving GPU backend + # This is a workaround until the C++ bindings support iterator API + # pymarian bindings does not yet support iterator input, so this function is mini batching here + def make_maxi_batches(lines, batch_size=batch_size): + assert batch_size > 0 + while True: + chunk = list(islice(lines, batch_size)) + if not chunk: + return + yield chunk + + total, count = 0.0, 0 + for batch in make_maxi_batches(lines): + scores = super().evaluate(batch) + assert len(scores) == len(batch) + for score in scores: + if isinstance(score, (tuple, list)): + score = score[0] + total += score + count += 1 + if average != 'only': # skip or append + yield score + + if average != 'skip': # append or only + yield total / count class Trainer(_pymarian.Trainer): diff --git a/src/python/pymarian/__main__.py b/src/python/pymarian/__main__.py index e0b68cd65..08d4e6fcb 100644 --- a/src/python/pymarian/__main__.py +++ b/src/python/pymarian/__main__.py @@ -1,18 +1,23 @@ - import argparse from pymarian import __version__ + def parse_args(): - parser = argparse.ArgumentParser(prog='pymarian', description="Python wrapper for Marian NMT", - epilog='URL: https://github.com/marian-nmt/marian-dev') + parser = argparse.ArgumentParser( + prog='pymarian', + description="Python wrapper for Marian NMT", + epilog='URL: https://github.com/marian-nmt/marian-dev', + ) parser.add_argument('--version', '-v', action='version', version=__version__) return parser.parse_args() + def main(): args = parse_args() # prints version for -v/-version 
option. - # no other options are currently supported. Space left/intended for future use. + # no other options are currently supported. Space left/intended for future use. + if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/src/python/pymarian/constants.py b/src/python/pymarian/constants.py deleted file mode 100644 index 3d04abbba..000000000 --- a/src/python/pymarian/constants.py +++ /dev/null @@ -1,28 +0,0 @@ -from pathlib import Path - - -class Defaults: - BASE_URL = "https://textmt.blob.core.windows.net/www/models/mt-metric" - CACHE_PATH = Path.home() / '.cache' / 'marian' / 'metrics' - MINI_BATCH = 16 - MAXI_BATCH = 256 - WORKSPACE = 8000 - AVERAGE = 'skip' - MAX_LENGTH = 512 - FLOAT_PRECISION = 4 - - # NOTE: model names must be lower case for caseless matching - KNOWN_METRICS = { - 'cometoid22-wmt21': "comet-qe", - 'cometoid22-wmt22': "comet-qe", - 'cometoid22-wmt23': "comet-qe", - 'chrfoid-wmt23': "comet-qe", - 'comet20-da-qe': "comet-qe", - 'bleurt20': "bleurt", - 'comet20-da': "comet", - } - - KNOWN_SCHEMA = {'comet-qe': 'src+mt', 'bleurt': 'ref+mt', 'comet': 'src+mt+ref'} - - DEF_MODEL = 'cometoid22-wmt22' - DEF_SCHEMA = KNOWN_METRICS[DEF_MODEL] diff --git a/src/python/pymarian/defaults.py b/src/python/pymarian/defaults.py new file mode 100644 index 000000000..2fdeff278 --- /dev/null +++ b/src/python/pymarian/defaults.py @@ -0,0 +1,40 @@ +from pathlib import Path + + +class Defaults: + BASE_URL = "https://textmt.blob.core.windows.net/www/marian/metric" + CACHE_PATH = Path.home() / '.cache' / 'marian' / 'metric' + MINI_BATCH = 16 + MAXI_BATCH = 256 + WORKSPACE = 8000 + AVERAGE = 'skip' + MAX_LENGTH = 512 + FLOAT_PRECISION = 4 + FILE_LOCK_TIMEOUT = 1 * 60 * 60 # seconds => 1 hour + PROGRESS_BAR = True + + # metric name to model type; lowercase all IDs + KNOWN_METRICS = { + "bleurt-20": "bleurt", + "wmt20-comet-da": "comet", + "wmt20-comet-qe-da": "comet-qe", + "wmt20-comet-qe-da-v2": "comet-qe", + "wmt21-comet-da": "comet", + "wmt21-comet-qe-da": "comet-qe", + "wmt21-comet-qe-mqm": "comet-qe", + "wmt22-comet-da": "comet", + "wmt22-cometkiwi-da": "comet-qe", + "xcomet-xl": "comet", + "xcomet-xxL": "comet", + } + + # model type to field order + MODEL_TYPES = { + 'comet-qe': ('src', 'mt'), + 'bleurt': ('mt', 'ref'), + 'comet': ('src', 'mt', 'ref'), + } + + DEF_MODEL = 'wmt22-cometkiwi-da' + DEF_MODEL_TYPE = 'comet-qe' + DEF_FIELD_ORDER = 'src mt ref'.split() diff --git a/src/python/pymarian/eval.py b/src/python/pymarian/eval.py new file mode 100755 index 000000000..4b5e5f02c --- /dev/null +++ b/src/python/pymarian/eval.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python +# +# This is a python wrapper for marian evaluate command +# +import argparse +import logging as log +import sys +from itertools import zip_longest +from pathlib import Path +from typing import Iterator, List + +from . import Evaluator, __version__ +from .defaults import Defaults +from .utils import get_model_path, get_vocab_path + +log.basicConfig(level=log.INFO) +DEBUG_MODE = False + + +def parse_args(): + parser = argparse.ArgumentParser( + "pymarian-eval", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + epilog='More info at https://github.com/marian-nmt/marian-dev. ' + f'This CLI is loaded from {__file__} (version: {__version__})', + ) + + known_metrics = ', '.join(Defaults.KNOWN_METRICS) + parser.add_argument( + '-m', + '--model', + help=f'Model name, or path. 
Known models: {known_metrics}', + default=Defaults.DEF_MODEL, + type=str, + ) + parser.add_argument('-v', '--vocab', help=f'Vocabulary file', type=Path) + parser.add_argument( + '-l', + '--like', + help='Model type. Required if --model is a local file (auto inferred for known models)', + type=str, + choices=list(Defaults.MODEL_TYPES.keys()), + ) + parser.add_argument('-V', '--version', action="version", version=f"%(prog)s {__version__}") + + parser.add_argument( + '-', + '--stdin', + action='store_true', + help='Read input from stdin. TSV file with following format: \ + QE metrics: "srcmt", Ref based metrics ref: "srcmtref" or "mtref"', + ) + parser.add_argument('-t', '--mt', dest='mt_file', help='MT output file. Ignored when --stdin', type=Path) + parser.add_argument('-s', '--src', dest='src_file', help='Source file. Ignored when --stdin', type=Path) + parser.add_argument('-r', '--ref', dest='ref_file', help='Ref file. Ignored when --stdin', type=Path) + parser.add_argument( + '-f', + '--fields', + dest='user_fields', + metavar='FIELD', + nargs='+', + choices=['src', 'mt', 'ref'], + help='Input fields, an ordered sequence of {src, mt, ref}', + default=Defaults.DEF_FIELD_ORDER, + type=str, + ) + parser.add_argument('-o', '--out', default=sys.stdout, help='output file', type=argparse.FileType('w')) + parser.add_argument( + '-a', + '--average', + choices=('skip', 'append', 'only'), + default='skip', + help='Average segment scores to produce system score.' + ' skip=do not output average (default; segment scores only);' + ' append=append average at the end; ' + ' only=output the average only (i.e. system score only)', + ) + + parser.add_argument('-w', '--width', default=4, help='Output score width', type=int) + parser.add_argument('--debug', help='Debug or verbose mode', action='store_true') + parser.add_argument('--fp16', help='Enable FP16 mode', action='store_true') + parser.add_argument('--mini-batch', default=16, help='Mini-batch size', type=int) + group = parser.add_mutually_exclusive_group() + group.add_argument('-d', '--devices', nargs='*', type=int, help='GPU device IDs') + group.add_argument( + '-c', '--cpu-threads', default=None, type=int, help='Use CPU threads. 0=use GPU device 0' + ) + parser.add_argument('-ws', '--workspace', default=8000, help='Workspace memory', type=int) + parser.add_argument( + '-pc', '--print-cmd', action="store_true", help="Print marian evaluate command and exit" + ) + + args = parser.parse_args() + return vars(args) + + +def find_field_ordering(expected_fields: List[str], given_fields: List[str]) -> List[int]: + """Find the order of fields in given_fields to match expected_fields + + :param expected_fields: list of expected fields + :param given_fields: list of given fields + :return: list of indices to select from given_fields to match expected_fields + :throws ValueError: if any expected field is missing in given_fields + """ + + missing_fields = set(expected_fields) - set(given_fields) + if missing_fields: + raise ValueError( + f'Required fields are missing: {missing_fields} [expected: {expected_fields}, given: {given_fields}]' + ) + field_order = [] + for name in expected_fields: + idx = given_fields.index(name) + assert idx >= 0, f'Field {name} not found in {given_fields}. 
Please check --fields argument' # this should never happen + field_order.append(idx) + return field_order + + +def reorder_fields(lines: Iterator[str], field_order: List[int]) -> Iterator[str]: + """Reorder fields in each line according to field_order + + :param lines: input lines + :param field_order: list of indices to reorder fields + :return: lines with fields reordered + :throws ValueError: if any line has missing fields + """ + max_column = max(field_order) + for line_num, line in enumerate(lines, start=1): + fields = line.rstrip('\r\n').split('\t') + if len(fields) <= max_column: + raise ValueError( + f'Expected at least {max_column} columns, but got {len(fields)} in line {line_num}' + ) + yield '\t'.join(fields[i] for i in field_order) + + +def read_input( + stdin=False, + src_file=None, + mt_file=None, + ref_file=None, + expected_fields=Defaults.DEF_FIELD_ORDER, + user_fields=Defaults.DEF_FIELD_ORDER, +): + """Read input files and reorder fields if necessary. + + This function modifies args dictionary in place. + :param args: command line arguments + :param model_id: model ID + :param schema: schema to use for the model + """ + + n_inputs = len(expected_fields) + assert 1 <= n_inputs <= 3, f'Invalid : {expected_fields}' + + if stdin: + assert 1 <= len(user_fields) <= 3 + reorder_idx = find_field_ordering(expected_fields, user_fields) + log.info(f'Input field mappings: {reorder_idx}; expected: {expected_fields}, given: {user_fields}') + return reorder_fields(sys.stdin, reorder_idx) + + n_inputs = len(expected_fields) + assert mt_file.exists(), 'File with hypotheses {mt_file} does not exist' + if 'src' in expected_fields: + assert src_file, f'Source file is required' + assert src_file.exists(), f'{src_file} does not exist' + if 'ref' in expected_fields: + assert ref_file, f'Reference file is required' + assert ref_file.exists(), f'{ref_file} does not exist' + + if expected_fields == ('src', 'mt'): + input_lines = zip_longest(open(src_file), open(mt_file)) + elif expected_fields == ('mt', 'ref'): + input_lines = zip_longest(open(mt_file), open(ref_file)) + elif expected_fields == ('src', 'mt', 'ref'): + input_lines = zip_longest(open(src_file), open(mt_file), open(ref_file)) + else: + raise ValueError(f'Unknown schema {expected_fields}') + + def _validate_and_join(): + for row in input_lines: + assert len(row) == n_inputs, f'Expected {n_inputs} columns, but got {len(row)}' + for col in row: + assert col is not None, f'Expected {n_inputs} columns, but got {len(row)}' + line = '\t'.join(col.strip() for col in row) + yield line + + return _validate_and_join() + + +def main(**args): + args = args or parse_args() + if args.pop('debug'): + log.getLogger().setLevel(log.DEBUG) + global DEBUG_MODE + DEBUG_MODE = True + log.debug(args) + else: + args['quiet'] = '' + + model_id = args.pop('model') + model_path = Path(model_id) + vocab_path = args.pop('vocab') + if vocab_path: # if user gave this arg, it must be a valid arg + assert vocab_path.exists(), f'Vocabulary file {vocab_path} does not exist' + + # if model arg is local path + if model_path.suffix.lower() in ('.npz', '.bin'): + assert model_path.exists() and model_path.is_file(), f'Model file {model_path} does not exist' + model_id = model_path.stem + assert args.get('like'), f'--like is required when --model is a local file' + if not vocab_path: # if vocab is not given, resolve it from model directory + vocab_path = model_path.parent / 'vocab.spm' + if not vocab_path.exists(): + raise Exception( + f'Vocabulary file {vocab_path} does not 
exist. Plese sepcify it with --vocab option.' + ) + else: # assume it is ID and resolve path from cache + model_id = model_id.lower() + try: + model_path = get_model_path(model_id) + if not vocab_path: # if vocab is not given, resolve it from cache + vocab_path = get_vocab_path(model_id) + args['like'] = Defaults.KNOWN_METRICS.get(model_id, Defaults.DEF_MODEL_TYPE) + except ValueError as e: + raise ValueError(f'Invalid model ID: {model_id}') from e + + args['model_file'] = model_path + args['vocab_file'] = vocab_path + + out = args.pop('out') + width = args.pop('width', Defaults.FLOAT_PRECISION) + average = args.pop('average', Defaults.AVERAGE) + print_cmd = args.pop('print_cmd', False) + + input_args = ('stdin', 'src_file', 'mt_file', 'ref_file', 'user_fields') + input_args = {k: args.pop(k) for k in input_args} + input_args['expected_fields'] = Defaults.MODEL_TYPES[args['like']] + model_args = args + + evaluator = Evaluator.new(**model_args) + if evaluator.model_type != args['like']: + log.warning(f'Config model type is {evaluator.model_type}, but given: {args["like"]}') + + input_lines = read_input(**input_args) + cmd_line = "marian evaluate " + evaluator._cli_string + if print_cmd: # print the command and exit + print(cmd_line) + return + else: + log.info("CLI:\t" + cmd_line) + + scores = evaluator.evaluate(input_lines, average=average) + + for i, score in enumerate(scores, start=1): + if isinstance(score, (tuple, list)): + score = score[0] # the first score + out.write(f'{score:.{width}f}\n') + out.close() + log.info(f'Wrote {i} lines to {out.name}') + + +if '__main__' == __name__: + main() diff --git a/src/python/pymarian/evaluate.py b/src/python/pymarian/evaluate.py deleted file mode 100755 index 371a37006..000000000 --- a/src/python/pymarian/evaluate.py +++ /dev/null @@ -1,350 +0,0 @@ -#!/usr/bin/env python -# -# This is a python wrapper for marian evaluate command -# -import argparse -import itertools -import logging as log -import shutil -import subprocess -import sys -import threading -import yaml - -from pathlib import Path -from typing import Iterator, List, Optional, Tuple, Union - -from .constants import Defaults -from .utils import get_known_model - -log.basicConfig(level=log.INFO) -DEBUG_MODE = False - - -def copy_lines_to_stdin(proc, lines: Iterator[str]): - """Write data to subproc stdin. Note: run this on another thread to avoid deadlock - This function reads streams, and write them as TSV record to the stdin of the sub process. - :param proc: subprocess object to write to - """ - - for line in lines: - # line = line.rstrip('\n') + '\n' - proc.stdin.write(line) - proc.stdin.flush() - proc.stdin.close() # close stdin to signal end of input - - -def marian_evaluate( - model: Path, - input_lines: Iterator[str], - vocab_file: Path = None, - devices: Optional[List[int]] = None, - width=Defaults.FLOAT_PRECISION, - mini_batch=Defaults.MINI_BATCH, - like=Defaults.DEF_SCHEMA, - maxi_batch=Defaults.MAXI_BATCH, - workspace=Defaults.WORKSPACE, - max_length=Defaults.MAX_LENGTH, - cpu_threads=0, - average: str = Defaults.AVERAGE, - backend='subprocess', -) -> Iterator[Union[float, Tuple[float, float]]]: - """Run 'marian evaluate' as a subprocess or using pymarian, read input and write scores - Depending on the `model` argument, either a single score or a tuple of scores is returned per input line. 
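For readers following the new `main()` flow above, here is a condensed sketch of the same path taken programmatically through `Evaluator.new`. The model and vocabulary locations below are only examples that follow the cache layout used by `get_model_path`/`get_vocab_path`; they are not guaranteed paths.

```python
# Sketch only: scoring src<TAB>mt pairs with a comet-qe style metric.
from pathlib import Path
from pymarian import Evaluator

cache = Path.home() / ".cache" / "marian" / "metric" / "wmt22-cometkiwi-da"
evaluator = Evaluator.new(
    model_file=cache / "model.wmt22-cometkiwi-da.bin",
    vocab_file=cache / "vocab.spm",
    like="comet-qe",   # field order for comet-qe is (src, mt)
    cpu_threads=4,
    fp16=False,        # must be passed explicitly: new() pops 'fp16' unconditionally
)
lines = ["Das ist ein Test.\tThis is a test."]
for score in evaluator.evaluate(lines, average="skip"):
    print(f"{score:.4f}")
```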
- :param model: path to model file, or directory containing model.npz.best-embed.npz - :param vocab: path to vocabulary file (optional; if not given, assumed to be in the same directory as the model) - :param devices: list of GPU devices to use (optional; if not given, decision is let to marian process) - :param width: float precision - :param mini_batch: mini-batch size (default: 16) - :param like: marian embedding model like (default: comet-qe) - :param cpu_threads: number of CPU threads to use (default: 0) - :param: average: average segment scores to produce system score. - skip=do not output average (default; segment scores only); - append=append average at the end; - only=output the average only (i.e. system score only) - :param backend: subprocess or pymarian - :return: iterator over scores. - """ - - assert model.exists() - if model.is_dir(): - model_dir = model - _model_files = list(model.glob("*.npz")) - assert len(_model_files) == 1, f'Expected exactly one model file in {model_dir}' - model_file = _model_files[0] - else: - assert model.is_file() - model_dir = model.parent - model_file = model - if not vocab_file: - _vocab_files = list(model_dir.glob('*.spm')) - assert len(_vocab_files) == 1, f'Expected exactly one vocab file in {model_dir}' - vocab_file = _vocab_files[0] - - assert model_file.exists(), f'Model file {model_file} does not exist' - assert vocab_file.exists(), f'Vocab file {vocab_file} does not exist' - - n_inputs = len(Defaults.KNOWN_SCHEMA[like].split('+')) - vocabs = [vocab_file] * n_inputs - kwargs = dict( - model=model_file, - vocabs=vocabs, - devices=devices, - width=width, - like=like, - mini_batch=mini_batch, - maxi_batch=maxi_batch, - max_length=max_length, - max_length_crop=True, - workspace=workspace, # negative memory => relative to total memory - cpu_threads=cpu_threads, - average=average, - ) - if backend == 'pymarian': - # handled separately for pymarian due to minibatching and iterator input - # TODO: remove this when iterator is supported in evaluator C++ API - kwargs['average'] = 'skip' - - cmd_line = [] - for key, val in kwargs.items(): - if val is None: # ignore this key / flag - continue - cmd_line.append(f"--{key.replace('_', '-')}") - if val is True: # boolean flag - cmd_line.append('true') - elif val is False: - cmd_line.append('false') - - elif isinstance(val, (list, tuple)): - cmd_line.extend(str(v) for v in val) - else: - cmd_line.append(str(val)) - if not DEBUG_MODE: - cmd_line.append('--quiet') - if backend == 'subprocess': - return subprocess_evaluate(cmd_line, input_lines) - elif backend == 'pymarian': - cmd_line = ' '.join(cmd_line) - batch_size = mini_batch * maxi_batch - return pymarian_evaluate(cmd_line, input_lines, batch_size=batch_size, average=average) - else: - raise ValueError(f'Unknown backend {backend}') - - -def pymarian_evaluate( - cmd_line: str, input_lines: Iterator[str], average=Defaults.AVERAGE, batch_size=int(Defaults.MINI_BATCH * Defaults.MAXI_BATCH) -): - try: - from pymarian import Evaluator - except: - raise ImportError('pymarian is not installed. 
Please install it and rerun') - - log.info(f'Marian CLI::\n\t{cmd_line}') - - evaluator = Evaluator(cmd_line) - config = yaml.safe_load(evaluator.get_model_config()) - log.info(f'Model config: {config}') - - assert average in ('skip', 'append', 'only') - lines = (line.rstrip('\n').split('\t') for line in input_lines) - - # NOTE: pymarian doesn't support iterator input yet; so mini batching here - def make_mini_batches(lines, batch_size=batch_size): - assert batch_size > 0 - while True: - chunk = list(itertools.islice(lines, batch_size)) - if not chunk: - return - yield chunk - - total, count = 0.0, 0 - for batch in make_mini_batches(lines): - scores = evaluator.evaluate(batch) - assert len(scores) == len(batch) - for score in scores: - if isinstance(score, (tuple, list)): - score = score[0] - total += score - count += 1 - if average != 'only': # skip or append - yield score - - if average != 'skip': - yield total / count - - -def subprocess_evaluate(cmd_line: List[str], input_lines: Iterator[str]): - assert isinstance(cmd_line, list) - marian_bin_path = shutil.which('marian') - if marian_bin_path is None: - raise FileNotFoundError('marian binary not found in PATH. Please add it and rerun') - cmd_line = [marian_bin_path, 'evaluate'] + cmd_line - - proc = None - try: - proc = subprocess.Popen( - cmd_line, - shell=False, - stdout=subprocess.PIPE, - stdin=subprocess.PIPE, - stderr=sys.stderr, - text=True, - encoding='utf8', - errors='replace', - ) - log.info(f'Running command: {" ".join(cmd_line)}') - copy_thread = threading.Thread(target=copy_lines_to_stdin, args=(proc, input_lines)) - - copy_thread.start() - # read output and yield scores - for line in proc.stdout: - line = line.rstrip() - if ' ' in line: - yield tuple(float(x) for x in line.split(' ')) - else: - yield float(line) - - # wait for copy thread to finish - copy_thread.join() - # proc.stdin.close() - returncode = proc.wait() - if returncode != 0: - raise RuntimeError(f'Process exited with code {returncode}') - finally: - if proc is not None and proc.returncode is None: - log.warning(f'Killing process {proc.pid}') - proc.kill() - - -def parse_args(): - parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - parser.add_argument( - '-m', - '--model', - help=f'Model name, or path. Known models={list(Defaults.KNOWN_METRICS.keys())}', - default=Defaults.DEF_MODEL, - type=str, - ) - - parser.add_argument( - '--stdin', - action='store_true', - help='Read input from stdin. TSV file with following format: \ - QE metrics: "srcmt", Comet with ref: "srcref; or BLEURT: "refmt"', - ) - parser.add_argument('-t', '--mt', dest='mt_file', help='MT output file. Ignored when --stdin.', type=Path) - parser.add_argument('-s', '--src', dest='src_file', help='Source file. Ignored when --stdin', type=Path) - parser.add_argument('-r', '--ref', dest='ref_file', help='Ref file. Ignored when --stdin', type=Path) - parser.add_argument( - '-o', '--out', default=sys.stdout, help='output file. Default: stdout', type=argparse.FileType('w') - ) - parser.add_argument( - '-a', - '--average', - choices=('skip', 'append', 'only'), - default='skip', - help='Average segment scores to produce system score.' - ' skip=do not output average (default; segment scores only);' - ' append=append average at the end; ' - ' only=output the average only (i.e. 
system score only)', - ) - - parser.add_argument('-w', '--width', default=4, help='Output score width', type=int) - parser.add_argument('--debug', help='Verbose output', action='store_true') - parser.add_argument('--mini-batch', default=16, help='Mini-batch size', type=int) - group = parser.add_mutually_exclusive_group() - group.add_argument('-d', '--devices', nargs='*', type=int, help='GPU device IDs') - group.add_argument( - '-c', '--cpu-threads', default=None, type=int, help='Use CPU threads. 0=use GPU device 0' - ) - parser.add_argument('-ws', '--workspace', default=8000, help='Workspace memory', type=int) - parser.add_argument( - '--backend', - default='pymarian', - choices=['subprocess', 'pymarian'], - help='Marian backend interface. subprocess=look for marian binary in PATH. pymarian=pybind wrapper', - ) - - args = parser.parse_args() - return vars(args) - - -def read_input(args, model_id, schema=None): - model_schema = Defaults.KNOWN_METRICS.get(model_id, schema or Defaults.DEF_SCHEMA) - input_schema = Defaults.KNOWN_SCHEMA[model_schema] - n_inputs = len(input_schema.split('+')) - if args.pop('stdin'): - del args['mt_file'] - del args['src_file'] - del args['ref_file'] - return sys.stdin - - n_inputs = len(input_schema.split('+')) - mt_file = args.pop('mt_file') - src_file = args.pop('src_file') - ref_file = args.pop('ref_file') - assert mt_file.exists(), f'{mt_file} does not exist' - if 'src' in input_schema: - assert src_file, f'Source file is required for metric {model_id}' - assert src_file.exists(), f'{src_file} does not exist' - if 'ref' in input_schema: - assert ref_file, f'Reference file is required for metric {model_id}' - assert ref_file.exists(), f'{ref_file} does not exist' - if input_schema == 'src+mt': - input_lines = itertools.zip_longest(open(src_file), open(mt_file)) - elif input_schema == 'src+ref+mt': - input_lines = itertools.zip_longest(open(src_file), open(ref_file), open(mt_file)) - elif input_schema == 'src+mt+ref': - input_lines = itertools.zip_longest(open(src_file), open(mt_file), open(ref_file)) - elif input_schema == 'ref+mt': - input_lines = itertools.zip_longest(open(ref_file), open(mt_file)) - else: - raise ValueError(f'Unknown schema {input_schema}') - - def _validate_and_join(): - for row in input_lines: - assert len(row) == n_inputs, f'Expected {n_inputs} columns, but got {len(row)}' - for col in row: - assert col is not None, f'Expected {n_inputs} columns, but got {len(row)}' - yield '\t'.join(row) - - return _validate_and_join() - - -def main(**args): - args = args or parse_args() - if args.pop('debug'): - log.getLogger().setLevel(log.DEBUG) - global DEBUG_MODE - DEBUG_MODE = True - log.debug(args) - - model_id = args.pop('model') - if model_id.lower() in Defaults.KNOWN_METRICS: - model_path, vocab = get_known_model(model_id.lower()) - log.info(f'{model_id} --> {model_path}') - else: - model_path, vocab = Path(model_id), None - assert ( - model_path.exists() - ), f'{model_path} does not exist. 
Known models are {list(Defaults.KNOWN_METRICS.keys())}' - args['model'] = model_path - args['vocab_file'] = vocab - - args['input_lines'] = read_input(args, model_id=model_id) - args['like'] = Defaults.KNOWN_METRICS.get(model_id, Defaults.DEF_SCHEMA) - out = args.pop('out') - width = args.pop('width', Defaults.FLOAT_PRECISION) - scores = marian_evaluate(**args) - for i, score in enumerate(scores, start=1): - if isinstance(score, (tuple, list)): - score = score[0] # the first score - out.write(f'{score:.{width}f}\n') - out.close() - - log.info(f'Wrote {i} lines to {out.name}') - - -if '__main__' == __name__: - main() diff --git a/src/python/pymarian/mtapi_server.py b/src/python/pymarian/mtapi_server.py index 4391a3101..fd11ba2bb 100755 --- a/src/python/pymarian/mtapi_server.py +++ b/src/python/pymarian/mtapi_server.py @@ -9,12 +9,11 @@ import logging as log from typing import List +import pymarian from flask import Flask, request from sacremoses import MosesPunctNormalizer from sentence_splitter import SentenceSplitter -import pymarian - log.basicConfig(level=log.INFO) diff --git a/src/python/pymarian/pypdl/__init__.py b/src/python/pymarian/pypdl/__init__.py new file mode 100644 index 000000000..6d670df6e --- /dev/null +++ b/src/python/pymarian/pypdl/__init__.py @@ -0,0 +1 @@ +from .main import Downloader diff --git a/src/python/pymarian/pypdl/downloader.py b/src/python/pymarian/pypdl/downloader.py new file mode 100644 index 000000000..ac4dd2f98 --- /dev/null +++ b/src/python/pymarian/pypdl/downloader.py @@ -0,0 +1,97 @@ +import copy +import logging +import time +from pathlib import Path +from threading import Event +from typing import Dict + +import requests + +MEGABYTE = 1048576 + + +class BasicDownloader: + """Base downloader class.""" + + def __init__(self, interrupt: Event): + self.curr = 0 # Downloaded size in bytes (current size) + self.completed = False + self.id = 0 + self.interrupt = interrupt + self.downloaded = 0 + + def download(self, url: str, path: str, mode: str, **kwargs) -> None: + """Download data in chunks.""" + try: + with open(path, mode) as file, requests.get(url, stream=True, **kwargs) as response: + for chunk in response.iter_content(MEGABYTE): + file.write(chunk) + self.curr += len(chunk) + self.downloaded += len(chunk) + + if self.interrupt.is_set(): + break + + except Exception as e: + self.interrupt.set() + time.sleep(1) + logging.error("(Thread: %d) [%s: %s]", self.id, type(e).__name__, e) + + +class Simpledown(BasicDownloader): + """Class for downloading the whole file in a single segment.""" + + def __init__( + self, + url: str, + file_path: str, + interrupt: Event, + **kwargs, + ): + super().__init__(interrupt) + self.url = url + self.file_path = file_path + self.kwargs = kwargs + + def worker(self) -> None: + self.download(self.url, self.file_path, mode="wb", **self.kwargs) + self.completed = True + + +class Multidown(BasicDownloader): + """Class for downloading a specific segment of the file.""" + + def __init__( + self, + segement_table: Dict, + segment_id: int, + interrupt: Event, + **kwargs, + ): + super().__init__(interrupt) + self.id = segment_id + self.segement_table = segement_table + self.kwargs = kwargs + + def worker(self) -> None: + url = self.segement_table["url"] + segment_path = Path(self.segement_table[self.id]["segment_path"]) + start = self.segement_table[self.id]["start"] + end = self.segement_table[self.id]["end"] + size = self.segement_table[self.id]["segment_size"] + + if segment_path.exists(): + downloaded_size = 
segment_path.stat().st_size + if downloaded_size > size: + segment_path.unlink() + else: + self.curr = downloaded_size + + if self.curr < size: + start = start + self.curr + kwargs = copy.deepcopy(self.kwargs) # since used by others + kwargs.setdefault("headers", {}).update({"range": f"bytes={start}-{end}"}) + self.download(url, segment_path, "ab", **kwargs) + + if self.curr == size: + self.completed = True diff --git a/src/python/pymarian/pypdl/main.py b/src/python/pymarian/pypdl/main.py new file mode 100644 index 000000000..1f85575d7 --- /dev/null +++ b/src/python/pymarian/pypdl/main.py @@ -0,0 +1,234 @@ +import logging +import sys +import time +from collections import deque +from concurrent.futures import ThreadPoolExecutor +from threading import Event +from typing import Callable, Optional, Union + +import requests +from tqdm import tqdm + +from .downloader import Multidown, Simpledown +from .utils import ( + AutoShutdownFuture, + FileValidator, + combine_files, + create_segment_table, + get_filepath, + seconds_to_hms, + to_mb, +) + +logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO) + + +class Downloader: + """ + A multi-threaded file downloader that supports progress tracking, retries, pause/resume functionality etc. + + Keyword Arguments: + params (dict, optional): A dictionary, list of tuples or bytes to send as a query string. Default is None. + allow_redirects (bool, optional): A Boolean to enable/disable redirection. Default is True. + auth (tuple, optional): A tuple to enable a certain HTTP authentication. Default is None. + cert (str or tuple, optional): A String or Tuple specifying a cert file or key. Default is None. + cookies (dict, optional): A dictionary of cookies to send to the specified url. Default is None. + headers (dict, optional): A dictionary of HTTP headers to send to the specified url. Default is None. + proxies (dict, optional): A dictionary of the protocol to the proxy url. Default is None. + timeout (number or tuple, optional): A number, or a tuple, indicating how many seconds to wait for the client to make a connection and/or send a response. Default is 10 seconds. + verify (bool or str, optional): A Boolean or a String indication to verify the servers TLS certificate or not. Default is True. 
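To make the role of the vendored downloader concrete, a small usage sketch follows; the URL and output path are placeholders, and the keyword arguments mirror how `maybe_download_file` in `pymarian/utils.py` calls it further below.

```python
# Sketch: multi-threaded download split into byte-range segments.
from pymarian.pypdl import Downloader

dl = Downloader(timeout=30)                   # extra kwargs are forwarded to `requests`
result = dl.start(
    url="https://example.com/big-model.bin",  # placeholder URL
    file_path="big-model.bin",
    segments=20,      # number of byte-range workers
    display=True,     # tqdm progress bar
    block=True,       # wait for completion
    retries=3,
    etag=False,
)
if result is not None:                        # FileValidator on success, None on failure
    print("sha256:", result.calculate_hash("sha256"))
```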
+ """ + + def __init__(self, **kwargs): + self._pool = None # ThreadPoolExecutor, initialized in _downloader + self._workers = [] + self._interrupt = Event() + self._stop = False + self._kwargs = {"timeout": 10, "allow_redirects": True} # request module kwargs + self._kwargs.update(kwargs) + + # public attributes + self.size = None + self.progress = 0 + self.speed = 0 + self.time_spent = 0 + self.current_size = 0 + self.eta = "99:59:59" + self.remaining = None + self.failed = False + self.completed = False + + def _display(self, dynamic_print): + dynamic_print.update(self.current_size - dynamic_print.n) + + def _calc_values(self, recent_queue, interval): + self.current_size = sum(worker.curr for worker in self._workers) + + # Speed calculation + recent_queue.append(sum(worker.downloaded for worker in self._workers)) + non_zero_list = [to_mb(value) for value in recent_queue if value] + if len(non_zero_list) < 1: + self.speed = 0 + elif len(non_zero_list) == 1: + self.speed = non_zero_list[0] / interval + else: + diff = [b - a for a, b in zip(non_zero_list, non_zero_list[1:])] + self.speed = (sum(diff) / len(diff)) / interval + + if self.size: + self.progress = int(100 * self.current_size / self.size) + self.remaining = to_mb(self.size - self.current_size) + + if self.speed: + self.eta = seconds_to_hms(self.remaining / self.speed) + else: + self.eta = "99:59:59" + + def _single_thread(self, url, file_path): + sd = Simpledown(url, file_path, self._interrupt, **self._kwargs) + self._workers.append(sd) + self._pool.submit(sd.worker) + + def _multi_thread(self, segments, segement_table): + for segment in range(segments): + md = Multidown( + segement_table, + segment, + self._interrupt, + **self._kwargs, + ) + self._workers.append(md) + self._pool.submit(md.worker) + + def _get_header(self, url): + kwargs = self._kwargs.copy() + kwargs.pop("params", None) + response = requests.head(url, **kwargs) + + if response.status_code != 200: + self._interrupt.set() + raise ConnectionError(f"Server Returned: {response.reason}({response.status_code}), Invalid URL") + + return response.headers + + def _get_info(self, url, file_path, multithread, etag): + header = self._get_header(url) + file_path = get_filepath(url, header, file_path) + + if size := int(header.get("Content-Length", 0)): + self.size = size + + return file_path, multithread, etag + + def _downloader(self, url, file_path, segments, display, multithread, etag): + start_time = time.time() + + file_path, multithread, etag = self._get_info(url, file_path, multithread, etag) + + if multithread: + segment_table = create_segment_table(url, file_path, segments, self.size, etag) + segments = segment_table["segments"] + self._pool = ThreadPoolExecutor(max_workers=segments) + self._multi_thread(segments, segment_table) + else: + self._pool = ThreadPoolExecutor(max_workers=1) + self._single_thread(url, file_path) + + interval = 0.15 + recent_queue = deque([0] * 12, maxlen=12) + download_mode = "Multi-Threaded" if multithread else "Single-Threaded" + + with tqdm(total=self.size, desc=f"Downloading ({download_mode})", dynamic_ncols=True, unit='B', unit_scale=True, miniters=1) as dynamic_print: + while True: + status = sum(worker.completed for worker in self._workers) + self._calc_values(recent_queue, interval) + + if display: + self._display(dynamic_print) + + if self._interrupt.is_set(): + self.time_spent = time.time() - start_time + return None + + if status == len(self._workers): + if multithread: + combine_files(file_path, segments) + self.completed = 
True + self.time_spent = time.time() - start_time + return FileValidator(file_path) + + time.sleep(interval) + + def stop(self) -> None: + """Stop the download process.""" + self._interrupt.set() + self._stop = True + time.sleep(1) # wait for threads + + def start( + self, + url: str, + file_path: Optional[str] = None, + segments: int = 10, + display: bool = True, + multithread: bool = True, + block: bool = True, + retries: int = 0, + mirror_func: Optional[Callable[[], str]] = None, + etag: bool = True, + ) -> Union[AutoShutdownFuture, FileValidator, None]: + """ + Start the download process. + + Parameters: + url (str): The URL to download from. + file_path (str, Optional): The path to save the downloaded file. If not provided, the file is saved in the current working directory. + If `file_path` is a directory, the file is saved in that directory. If `file_path` is a file name, the file is saved with that name. + segments (int, Optional): The number of segments to divide the file into for multi-threaded download. Default is 10. + display (bool, Optional): Whether to display download progress and other messages. Default is True. + multithread (bool, Optional): Whether to use multi-threaded download. Default is True. + block (bool, Optional): Whether to block the function until the download is complete. Default is True. + retries (int, Optional): The number of times to retry the download if it fails. Default is 0. + mirror_func (Callable[[], str], Optional): A function that returns a new download URL if the download fails. Default is None. + etag (bool, Optional): Whether to validate the ETag before resuming downloads. Default is True. + + Returns: + AutoShutdownFuture: If `block` is False. + FileValidator: If `block` is True and the download successful. + None: If `block` is True and the download fails. + """ + + def download(): + for i in range(retries + 1): + try: + _url = mirror_func() if i > 0 and callable(mirror_func) else url + if i > 0 and display: + logging.info("Retrying... 
(%d/%d)", i, retries) + + self.__init__(**self._kwargs) + result = self._downloader(_url, file_path, segments, display, multithread, etag) + + if self._stop or self.completed: + if display: + print(f"Time elapsed: {seconds_to_hms(self.time_spent)}", file=sys.stderr) + return result + + time.sleep(3) + + except Exception as e: + logging.error("(%s) [%s]", e.__class__.__name__, e) + + finally: + self._pool.shutdown() + + self.failed = True + return None + + ex = ThreadPoolExecutor(max_workers=1) + future = AutoShutdownFuture(ex.submit(download), ex) + + if block: + result = future.result() + return result + + return future diff --git a/src/python/pymarian/pypdl/utils.py b/src/python/pymarian/pypdl/utils.py new file mode 100644 index 000000000..0c48642d2 --- /dev/null +++ b/src/python/pymarian/pypdl/utils.py @@ -0,0 +1,127 @@ +import hashlib +import json +import time +from concurrent.futures import Executor, Future +from pathlib import Path +from typing import Dict, Union +from urllib.parse import unquote, urlparse + +MEGABYTE = 1048576 +BLOCKSIZE = 4096 +BLOCKS = 1024 +CHUNKSIZE = BLOCKSIZE * BLOCKS + + +def to_mb(size_in_bytes: int) -> float: + return size_in_bytes / MEGABYTE + + +def seconds_to_hms(sec: float) -> str: + time_struct = time.gmtime(sec) + return time.strftime("%H:%M:%S", time_struct) + + +def get_filepath(url: str, headers: Dict, file_path: str) -> str: + content_disposition = headers.get("Content-Disposition", None) + + if content_disposition and "filename=" in content_disposition: + filename_start = content_disposition.index("filename=") + len("filename=") + filename = content_disposition[filename_start:] # Get name from headers + filename = unquote(filename.strip('"')) # Decode URL encodings + else: + filename = unquote(urlparse(url).path.split("/")[-1]) # Generate name from url + + if file_path: + file_path = Path(file_path) + if file_path.is_dir(): + return str(file_path / filename) + return str(file_path) + return filename + + +def create_segment_table(url: str, file_path: str, segments: str, size: int, etag: Union[str, bool]) -> Dict: + """Create a segment table for multi-threaded download.""" + segments = 5 if (segments > 5) and (to_mb(size) < 50) else segments + progress_file = Path(file_path + ".json") + + try: + progress = json.loads(progress_file.read_text()) + if etag and progress["url"] == url and progress["etag"] == etag: + segments = progress["segments"] + except Exception: + pass + + progress_file.write_text( + json.dumps( + {"url": url, "etag": etag, "segments": segments}, + indent=4, + ) + ) + + dic = {"url": url, "segments": segments} + partition_size = size / segments + for segment in range(segments): + start = int(partition_size * segment) + end = int(partition_size * (segment + 1)) + segment_size = end - start + if segment != (segments - 1): + end -= 1 # [0-100, 100-200] -> [0-99, 100-200] + # No segment_size+=1 for last setgment since final byte is end byte + + dic[segment] = { + "start": start, + "end": end, + "segment_size": segment_size, + "segment_path": f"{file_path }.{segment}", + } + + return dic + + +def combine_files(file_path: str, segments: int) -> None: + """Combine the downloaded file segments into a single file.""" + with open(file_path, "wb") as dest: + for segment in range(segments): + segment_file = f"{file_path}.{segment}" + with open(segment_file, "rb") as src: + while True: + chunk = src.read(CHUNKSIZE) + if chunk: + dest.write(chunk) + else: + break + Path(segment_file).unlink() + + progress_file = Path(f"{file_path}.json") + 
progress_file.unlink() + + +class FileValidator: + """A class used to validate the integrity of the file.""" + + def __init__(self, path: str): + self.path = path + + def calculate_hash(self, algorithm: str, **kwargs) -> str: + hash_obj = hashlib.new(algorithm, **kwargs) + with open(self.path, "rb") as file: + for chunk in iter(lambda: file.read(4096), b""): + hash_obj.update(chunk) + return hash_obj.hexdigest() + + def validate_hash(self, correct_hash: str, algorithm: str, **kwargs) -> bool: + file_hash = self.calculate_hash(algorithm, **kwargs) + return file_hash == correct_hash + + +class AutoShutdownFuture: + """A Future object wrapper that shuts down the executor when the result is retrieved.""" + + def __init__(self, future: Future, executor: Executor): + self.future = future + self.executor = executor + + def result(self, timeout: float = None) -> Union[FileValidator, None]: + result = self.future.result(timeout) + self.executor.shutdown() + return result diff --git a/src/python/pymarian/qtdemo.py b/src/python/pymarian/qtdemo.py index e95d0bf12..fe99784c3 100644 --- a/src/python/pymarian/qtdemo.py +++ b/src/python/pymarian/qtdemo.py @@ -1,13 +1,12 @@ import sys import time +import pymarian from PyQt5.QtGui import * from PyQt5.QtWidgets import * from sacremoses import MosesPunctNormalizer, MosesTokenizer from sentence_splitter import SentenceSplitter -import pymarian - class Example(QWidget): def __init__(self): diff --git a/src/python/pymarian/utils.py b/src/python/pymarian/utils.py index 16e2e3c22..c3a4efab0 100644 --- a/src/python/pymarian/utils.py +++ b/src/python/pymarian/utils.py @@ -1,64 +1,135 @@ #!/usr/bin/env python # # This is a python wrapper for marian evaluate command -# created by Thamme Gowda on 2023-09-07 -# + import logging as log import shutil from pathlib import Path +from typing import List, Tuple +import portalocker import requests -from tqdm.auto import tqdm -from .constants import Defaults +from .defaults import Defaults +from .pypdl import Downloader log.basicConfig(level=log.INFO) + DEBUG_MODE = False +PROGRESS_BAR = Defaults.PROGRESS_BAR + + +class InvalidIDException(ValueError): + """Invalid model ID exception""" + + pass + + +def validate_id(id: str) -> bool: + invalid_chars = ['/', '\\', ':', '*', '?', '"', '<', '>', '|'] + for c in invalid_chars: + if c in id: + raise InvalidIDException( + f'Invalid model id {id}. It must not contain characters: {invalid_chars}' + ) -def get_known_model(model_name): - """Given a known model name, this functin gets the checkpoint and vocabulary paths. - This function downloads and extracts model files to a local cache directory if necessary. - - Specifically, checkpoint file must have model*.npz and vocab*.spm files in the resolved model directory. +def get_model_path(model_name, progress_bar: bool = PROGRESS_BAR) -> Path: + """Given the name of a (known) model, this function gets its checkpoint path. + If necessary, this function downloads checkpoint to a local cache directory. 
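The cache helpers defined here resolve a known metric name to local files; a short sketch of typical use is given below, with the model ID being one of the names listed in `Defaults.KNOWN_METRICS`. (Note in passing that `get_vocab_path` below is annotated as returning `Tuple[Path, Path]` although it returns a single vocabulary path.)

```python
# Sketch: resolving a known metric ID to cached checkpoint and vocabulary files.
# The cache layout follows Defaults.BASE_URL and Defaults.CACHE_PATH from defaults.py.
from pymarian.utils import get_model_path, get_vocab_path

model_id = "wmt22-cometkiwi-da"            # any key of Defaults.KNOWN_METRICS
model_path = get_model_path(model_id)      # ~/.cache/marian/metric/<id>/model.<id>.bin
vocab_path = get_vocab_path(model_id)      # ~/.cache/marian/metric/<id>/vocab.spm
print(model_path, vocab_path)
```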
+ :param model_name: model name - :return: checkpoint path, vocabulary path + :return: checkpoint path """ - assert model_name in Defaults.KNOWN_METRICS, f'Unknown model {model_name}' + validate_id(model_name) + chkpt_url = f'{Defaults.BASE_URL}/{model_name}/model.{model_name}.bin' - model_url = f'{Defaults.BASE_URL}/{model_name}.tgz' - local_file = Defaults.CACHE_PATH / f'{model_name}.tgz' local_dir = Defaults.CACHE_PATH / model_name - maybe_download_file(model_url, local_file) - maybe_extract(local_file, local_dir) - checkpt_file = list(local_dir.glob('model*.npz')) - vocab_file = list(local_dir.glob('vocab*.spm')) - assert len(checkpt_file) == 1, f'Expected exactly one model file in {local_dir}' - assert len(vocab_file) == 1, f'Expected exactly one vocab file in {local_dir}' - checkpt_file = checkpt_file[0] - vocab_file = vocab_file[0] - return checkpt_file, vocab_file - - -def maybe_download_file(url, local_file: Path): + chkpt_local = local_dir / f'model.{model_name}.bin' + + maybe_download_file(chkpt_url, chkpt_local) + assert chkpt_local.exists(), f'Checkpoint file {chkpt_local} does not exist' + return chkpt_local + + +def get_vocab_path(model_name, progress_bar: bool = PROGRESS_BAR) -> Tuple[Path, Path]: + """Given the name of a (known) model, this function gets its vocabulary path. + This function downloads vocabulary to a local cache directory, if necessary. + + :param model_name: model name + :param progress_bar: show progress bar while downloading + :return: checkpoint path, vocabulary path + """ + validate_id(model_name) + local_dir = Defaults.CACHE_PATH / model_name + vocab_local = local_dir / 'vocab.spm' + + vocab_url = f'{Defaults.BASE_URL}/{model_name}/vocab.spm' + maybe_download_file(vocab_url, vocab_local, progress_bar=progress_bar) + assert vocab_local.exists(), f'Vocabulary file {vocab_local} does not exist' + return vocab_local + + +def maybe_download_file(url: str, local_file: Path, progress_bar: bool = PROGRESS_BAR): """Downloads the file if not already downloaded :param url: url to download :param local_file: local file path + :param progress_bar: show progress bar while downloading + :return: None + :raises: ValueError if the url is invalid """ - flag_file = local_file.with_name(local_file.name + '._OK') - if local_file.exists() and flag_file.exists(): - log.info(f'Using cached file {local_file}') + lock_file = local_file.with_name('._LOCK_' + local_file.name) + if local_file.exists() and local_file.stat().st_size > 0: + log.debug(f'Using cached file {local_file}') return - log.info(f'Downloading {url} to {local_file}') + + # check if the url has OK status; avoid creating cache directories when url is invalid due to bad model ID + if not is_ok_url(url): + raise ValueError(f'Invalid URL: {url}') + local_file.parent.mkdir(parents=True, exist_ok=True) - with requests.get(url, stream=True) as r: - r.raise_for_status() - file_size = int(r.headers.get('Content-Length', 0)) - with tqdm.wrapattr(r.raw, "read", total=file_size, desc='Downloading', dynamic_ncols=True) as r_raw: - with open(local_file, "wb") as f: - shutil.copyfileobj(r_raw, f) - flag_file.touch() + with portalocker.Lock(lock_file, 'w', timeout=Defaults.FILE_LOCK_TIMEOUT) as fh: + # check again if it is downloaded by another process while we were waiting for the lock + if local_file.exists() and local_file.stat().st_size > 0: + log.debug(f'Using cached file {local_file}') + return + + # use file lock to avoid race of parallel downloads + local_file.parent.mkdir(parents=True, exist_ok=True) + + tmp_file = 
local_file.with_name(local_file.name + '.downloading') + log.info(f'Downloading {url} to {tmp_file}') + dl = Downloader() + dl.start( + url=url, + file_path=tmp_file, + segments=20, + display=progress_bar, + multithread=True, + block=True, + retries=3, + mirror_func=None, + etag=False, + ) + + if dl.completed: + # move the file to the final location + if local_file.exists(): + local_file.unlink() + shutil.move(tmp_file, local_file) + + +def is_ok_url(url: str) -> bool: + """Checks if the given url has OK status code by making a HEAD request + :param url: url + :return: True if status is OK, False otherwise + """ + try: + return requests.head(url).status_code == requests.codes.ok + except requests.exceptions.RequestException as e: + log.error(f'Invalid URL: {url}') + return False def maybe_extract(archive: Path, outdir: Path) -> Path: @@ -79,9 +150,9 @@ def maybe_extract(archive: Path, outdir: Path) -> Path: def kwargs_to_cli(**kwargs) -> str: - """Converts kwargs to cli args + """Converts kwargs to command line arguments string :param kwargs: kwargs - :return: cli args + :return: CLI string """ args = [] for k, v in kwargs.items(): @@ -89,7 +160,7 @@ def kwargs_to_cli(**kwargs) -> str: continue # ignore keys if values are None k = k.replace('_', '-') args.append(f'--{k}') - if v is '': + if v == '': continue # only add keys for empty values elif isinstance(v, bool): args.append("true" if v else "false") @@ -97,5 +168,5 @@ def kwargs_to_cli(**kwargs) -> str: args.extend(str(x) for x in v) else: args.append(f'{v}') - return ' '.join(args) + diff --git a/src/python/pyproject.toml b/src/python/pyproject.toml index 84d1b0e8f..f2008a924 100644 --- a/src/python/pyproject.toml +++ b/src/python/pyproject.toml @@ -29,13 +29,14 @@ classifiers = [ ] dependencies = [ + "portalocker", + "pyyaml", "tqdm", "requests", - "pyyaml" ] [project.scripts] -pymarian-evaluate = "pymarian.evaluate:main" +pymarian-eval = "pymarian.eval:main" pymarian-qtdemo = "pymarian.qtdemo:main" pymarian-mtapi = "pymarian.mtapi_server:main" @@ -54,7 +55,7 @@ include-package-data = true [tool.black] line-length = 110 target-version = ['py37', 'py38', 'py39', 'py310', 'py311'] -include = 'src/python/.*\.pyi?$' +include = '.*\.pyi?$' skip-string-normalization = true # black doesn't sort imports. So we use isort for that. 
See discussion https://github.com/psf/black/issues/333 diff --git a/src/python/setup.py b/src/python/setup.py index 01d3a0f5f..bcbca2c63 100644 --- a/src/python/setup.py +++ b/src/python/setup.py @@ -1,10 +1,10 @@ import os -import platform import shutil import sys - +import sysconfig from pathlib import Path -from setuptools import setup, find_namespace_packages, Distribution + +from setuptools import Distribution, find_namespace_packages, setup """ This script expects _pymarian.*.so to be present in $CMAKE_BINARY_DIR @@ -17,12 +17,9 @@ CMAKE_BINARY_DIR = os.getenv("CMAKE_BINARY_DIR", DEF_CMAKE_BINARY_DIR) print("\t>>>CMAKE_BINARY_DIR is ", CMAKE_BINARY_DIR) -if platform.system() == 'Windows': - NATIVE_EXT_GLOB = '_pymarian.*.pyd' -elif platform.system() == 'Darwin': - NATIVE_EXT_GLOB = '_pymarian.*.dylib' -else: - NATIVE_EXT_GLOB = '_pymarian.*.so' +EXT_SUFFIX = sysconfig.get_config_var('EXT_SUFFIX') # See also: python -m sysconfig | grep -i EXT_SUFFIX +assert EXT_SUFFIX, "EXT_SUFFIX not found in sysconfig" +NATIVE_EXT_NAME = '_pymarian' + EXT_SUFFIX def get_version(cuda_version=None) -> str: @@ -59,24 +56,27 @@ def get_version(cuda_version=None) -> str: def get_native_ext() -> Path: - native_exts = list(Path(CMAKE_BINARY_DIR).glob(f'src/{NATIVE_EXT_GLOB}')) - if not native_exts: - raise Exception( - f'No native extension found; Looked at {CMAKE_BINARY_DIR}/src/{NATIVE_EXT_GLOB}. \ - Please run cmake build first with -DPYMARIAN=ON or set CMAKE_BINARY_DIR to the build dir' - ) - elif len(native_exts) >= 2: - raise Exception(f'Only one native extension expected, but found: {native_exts}') - - native_ext = native_exts[0] + native_ext = Path(CMAKE_BINARY_DIR) / 'src' / NATIVE_EXT_NAME + if not native_ext.exists(): + msg = f"No native extension found at {native_ext}.\n \ + Please run cmake build with -DPYMARIAN=ON or set CMAKE_BINARY_DIR to the existing build dir." + other_exts = list(Path(CMAKE_BINARY_DIR).glob("src/_pymarian.*." + NATIVE_EXT_NAME.split(".")[-1])) + if other_exts: + msg += f"\nOther extension(s) found: {other_exts} but they are not compatible with this platform ({EXT_SUFFIX})." + raise RuntimeError(msg) # Pip does not allow inclusion of files from parent dir our outside of package context (for security reasons). 
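For context on the `sysconfig` change above: the compiled module name must carry the interpreter's ABI tag, which is exactly what `EXT_SUFFIX` encodes, so globbing per platform is no longer needed. A quick way to inspect the expected filename on the current interpreter (illustrative output only):

```python
# Print the native extension filename that setup.py will look for.
import sysconfig

ext_suffix = sysconfig.get_config_var("EXT_SUFFIX")
print("_pymarian" + ext_suffix)
# e.g. "_pymarian.cpython-310-x86_64-linux-gnu.so" on CPython 3.10 / Linux,
# or "_pymarian.cp310-win_amd64.pyd" on Windows.
```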
# So, we copy the native extension to the package directory native_ext_local = Path(__file__).parent / native_ext.name - print(f"\t>>>Found native extension at: {native_ext}") - print(f"\t >>>Making it available under scope at: {native_ext_local}") - if native_ext_local.exists(): - native_ext_local.unlink() + print(f"\t>>>Found the fresh native extension at: {native_ext}") + print(f"\t>>>Making it available under the package scope at: {native_ext_local}") shutil.copy(native_ext, native_ext_local) + + # remove incomaptible .so files from prior builds (if any) + for old_file in Path(__file__).parent.glob("_pymarian.*"): + if old_file.resolve() == native_ext_local.resolve(): + continue + print(f"\t>>>Removing old file: {old_file}") + old_file.unlink() return native_ext_local diff --git a/src/python/tests/__init__.py b/src/python/tests/regression/__init__.py similarity index 100% rename from src/python/tests/__init__.py rename to src/python/tests/regression/__init__.py diff --git a/src/python/tests/regression/test_pymarian_eval.py b/src/python/tests/regression/test_pymarian_eval.py new file mode 100644 index 000000000..8f207b149 --- /dev/null +++ b/src/python/tests/regression/test_pymarian_eval.py @@ -0,0 +1,91 @@ +import logging as log +import os +import shutil +import subprocess +import tarfile +import urllib.request +from pathlib import Path +from typing import List + +import pytest + +log.basicConfig(level=log.INFO) + +DATA_URL = "https://textmt.blob.core.windows.net/www/data/marian-regression-tests/metrics-regression.tgz" +DATA_DIR = Path(__file__).parent.parent / "data" / "metrics-regression" +SELECT_PREFIX = "wmt21-systems.en-de.100" +SYS_DIFF_OK = 0.01 +SEG_DIFF_OK = 0.05 + +N_CPUS = max(os.cpu_count() - 2, 2) + +USE_GPU = False +GPU_ARGS = "-d 0 --mini-batch 16" # --fp16 error margin is too high for bleurt-20 +CPU_ARGS = f"--cpu-threads {N_CPUS} --mini-batch 1" +# NOTE: --mini-batch > 1 on CPU deviates scores https://machinetranslation.visualstudio.com/DefaultCollection/Marian/_git/marian-dev/pullRequest/32883#1707853099 +BACKEND_ARGS = GPU_ARGS if USE_GPU else CPU_ARGS + +src_file = DATA_DIR / f"{SELECT_PREFIX}.src" +ref_file = DATA_DIR / f"{SELECT_PREFIX}.ref" +mt_file = DATA_DIR / f"{SELECT_PREFIX}.mt" + + +def setup(): + try: + flag_file = DATA_DIR / ".downloaded" + if flag_file.exists(): + log.info("Data already downloaded. 
Setup skipped...") + return + + DATA_DIR.mkdir(exist_ok=True, parents=True) + log.info(f"Downloading {DATA_URL} to {DATA_DIR}") + print("Downloading data package...") + with urllib.request.urlopen(DATA_URL) as response: + with tarfile.open(fileobj=response, mode="r|gz") as tar: + tar.extractall(path=DATA_DIR.parent) + + flag_file.touch() + log.info("Setup Done.") + finally: + if not shutil.which("pymarian-eval"): + raise FileNotFoundError("pymarian-eval not found in PATH") + for f in [src_file, ref_file, mt_file]: + if not f.exists(): + raise FileNotFoundError(f"File {f} not found.") + + +def compare_scores(tag: str, lhs: List[float], rhs: List[float]): + assert len(lhs) == len(rhs), f"{tag} :: length mismatch: {len(lhs)} != {len(rhs)}" + total_diff = sum(abs(l - r) for l, r in zip(lhs, rhs)) + avg_diff = total_diff / len(lhs) + + seg_err_count = 0 + for i, (l, r) in enumerate(zip(lhs, rhs)): + if abs(l - r) >= SEG_DIFF_OK: + log.warning(f"{tag} :: line {i}: {l:.4f} != {r:.4f} ({abs(l - r):.4f} > {SEG_DIFF_OK})") + seg_err_count += 1 + + assert avg_diff <= SYS_DIFF_OK, f"{tag} :: avg_diff: {avg_diff:.4f} > {SYS_DIFF_OK:.4f}" + assert seg_err_count == 0, f"{tag} :: seg_err_count: {seg_err_count:.4f} > 0" + + +setup() +# auto detect metric names +# metric_names = list(set(f.name.split(".")[-2] for f in DATA_DIR.glob(f"{select_prefix}*.orig"))) +# update: No need to run all metric names, select a few +metric_names = ["bleurt-20", "wmt20-comet-qe-da", "wmt22-comet-da", "wmt22-cometkiwi-da"] + + +@pytest.mark.parametrize("metric_name", metric_names) +def test_pymarian_cli(metric_name): + orig_file = DATA_DIR / f"{SELECT_PREFIX}.{metric_name}.orig" + assert orig_file.exists() + orig_scores = [float(x) for x in orig_file.read_text().splitlines() if x.strip()] + + pymarian_args = f"-a skip -s {src_file} -r {ref_file} -t {mt_file} {BACKEND_ARGS}" + cmd = f"pymarian-eval -m {metric_name} {pymarian_args} " + log.info(f"Running: {cmd}") + output = subprocess.check_output(cmd, shell=True) + out_lines = output.decode("utf-8").splitlines() + out_scores = [float(x) for x in out_lines if x.strip()] + compare_scores(metric_name, orig_scores, out_scores) diff --git a/src/python/tests/test_train.py b/src/python/tests/regression/test_train.py similarity index 89% rename from src/python/tests/test_train.py rename to src/python/tests/regression/test_train.py index 543e45db5..c538e716a 100644 --- a/src/python/tests/test_train.py +++ b/src/python/tests/regression/test_train.py @@ -4,23 +4,22 @@ from pathlib import Path from pymarian import Trainer -from pymarian.utils import get_known_model QUIET = False -TMP_DATA_DIR = Path.home() / 'tmp' / 'marian-tests' DATA_URL = "https://textmt.blob.core.windows.net/www/data/marian-tests-data.tgz" +DATA_DIR = Path(__file__).parent.parent / 'data' / 'marian-tests-data' def setup(): - ok_file = TMP_DATA_DIR / '_OK' - if not TMP_DATA_DIR.exists() or not ok_file.exists(): - TMP_DATA_DIR.mkdir(parents=True, exist_ok=True) + ok_file = DATA_DIR / '.downloaded' + if not ok_file.exists(): + DATA_DIR.mkdir(parents=True, exist_ok=True) print("Downloading data package...") with urllib.request.urlopen(DATA_URL) as response: with tarfile.open(fileobj=response, mode="r|gz") as tar: - tar.extractall(path=TMP_DATA_DIR) + tar.extractall(path=DATA_DIR.parent) ok_file.touch() print("Done.") @@ -29,11 +28,10 @@ def setup(): def test_train_comet_qe(): - data_dir = TMP_DATA_DIR / 'marian-tests-data/deu-eng' + data_dir = DATA_DIR / 'deu-eng' vocab_file = data_dir / 'vocab.8k.spm' classe_file = 
data_dir / 'classes4f.txt' train_file = data_dir / 'sample.5k.chrfoid-deu-eng.tsv' - # pretrained_model, vocab_file = get_known_model("chrfoid-wmt23") assert classe_file.exists() assert vocab_file.exists() assert train_file.exists() @@ -88,7 +86,7 @@ def test_train_comet_qe(): def test_train_transformer_nmt(): - data_dir = TMP_DATA_DIR / 'marian-tests-data/deu-eng' + data_dir = DATA_DIR / 'deu-eng' vocab_file = data_dir / 'vocab.8k.spm' train_prefix = str(data_dir / 'sample.5k') src_lang = "deu" diff --git a/src/python/tests/regression/test_translate.py b/src/python/tests/regression/test_translate.py new file mode 100644 index 000000000..b65ec49dc --- /dev/null +++ b/src/python/tests/regression/test_translate.py @@ -0,0 +1,35 @@ +import tarfile +import urllib.request +from pathlib import Path + +from pymarian import Translator + +from . import BASE_ARGS + +DATA_URL = "http://data.statmt.org/romang/marian-regression-tests/models/wngt19.tar.gz" +DATA_DIR = Path(__file__).parent.parent / "data" / "wngt19" + + +def setup(): + flag_file = DATA_DIR / ".downloaded" + if flag_file.exists(): + print("Data already downloaded. Setup skipped...") + return + print(f"Downloading {DATA_URL} to {DATA_DIR}") + request = urllib.request.urlopen(DATA_URL) + with tarfile.open(fileobj=request, mode="r|gz") as tar: + tar.extractall(path=DATA_DIR.parent) + flag_file.touch() + + +setup() + + +def test_ende(): + + model_file = str(DATA_DIR / 'model.base.npz') + vocab_file = str(DATA_DIR / 'en-de.spm') + args = BASE_ARGS | dict(models=model_file, vocabs=[vocab_file, vocab_file]) + translator = Translator(**args) + hyp = translator.translate("Hello. Good morning.") + assert hyp == "Hallo , Guten Morgen ." diff --git a/src/python/tests/test_evaluate.py b/src/python/tests/test_evaluate.py deleted file mode 100644 index d79462901..000000000 --- a/src/python/tests/test_evaluate.py +++ /dev/null @@ -1,148 +0,0 @@ -""" -# silense marian log -export MARIAN_QUIET=yes - -# run all tests in this file - pytest -v src/python/tests/test_evaluate.py - pytest -vx src/python/tests/test_evaluate.py #stop on first failure - -# run a single test: - pytest -v src/python/tests/test_evaluate.py -k test_evaluator_chrfoid - pytest -vs src/python/tests/test_evaluate.py -k test_evaluator_chrfoid # see stdout and stderr -""" -import os - -from pymarian import Evaluator -from pymarian.utils import get_known_model - -from . 
import BASE_ARGS - -EPSILON = 0.0001 # the precision error we afford in float comparison - - -# dummy sentences for testing -SAMPLE_SRC_HYP = [ - ["This is a test", "This is a test A"], - ["This is a test B", "This is a test C"], - ["This is a test D", "This is a test E"], -] -SAMPLE_REF_HYP = SAMPLE_SRC_HYP # same for now -SAMPLE_SRC_HYP_REF = [ - ["This is a test", "This is a test A", "This is a test AA"], - ["This is a test B", "This is a test C", "This is a test CC"], - ["This is a test D", "This is a test E", "This is a test EE"], -] - - -def test_evaluator_chrfoid(): - model_path, vocab_path = get_known_model("chrfoid-wmt23") - args = BASE_ARGS | dict( - like="comet-qe", - model=model_path, - vocabs=[vocab_path, vocab_path], - ) - # args = dict(help='') # to get help message with all args - eval = Evaluator(**args) - data = SAMPLE_SRC_HYP - expected_scores = [0.0548, 0.0797, 0.0988] - - scores = eval.evaluate(data) - assert len(scores) == len(data) - for score, expected_score in zip(scores, expected_scores): - if isinstance(score, list): - score = score[0] - assert abs(score - expected_score) < EPSILON - - -def test_evaluator_cometoid22_wmt22(): - model_path, vocab_path = get_known_model("cometoid22-wmt22") - args = BASE_ARGS | dict( - like="comet-qe", - model=model_path, - vocabs=[vocab_path, vocab_path], - ) - # args = dict(help='') # to get help message with all args - eval = Evaluator(**args) - data = SAMPLE_SRC_HYP - expected_scores = [0.71845, 0.7906, 0.81549] - - scores = eval.evaluate(data) - assert len(scores) == len(data) - - for score, expected_score in zip(scores, expected_scores): - if isinstance(score, list): - score = score[0] - assert abs(score - expected_score) < EPSILON - - -def test_evaluator_cometoid22_wmt23(): - model_path, vocab_path = get_known_model("cometoid22-wmt23") - args = BASE_ARGS | dict( - like="comet-qe", - model=model_path, - vocabs=[vocab_path, vocab_path], - ) - eval = Evaluator(**args) - data = SAMPLE_SRC_HYP - expected_scores = [0.75715, 0.81395, 0.8361] - - scores = eval.evaluate(data) - assert len(scores) == len(data) - for score, expected_score in zip(scores, expected_scores): - if isinstance(score, list): - score = score[0] - assert abs(score - expected_score) < EPSILON - - -def test_evaluator_bleurt(): - model_path, vocab_path = get_known_model("bleurt20") - args = BASE_ARGS | dict( - like="bleurt", - model=model_path, - vocabs=[vocab_path, vocab_path], - ) - - eval = Evaluator(**args) - data = SAMPLE_REF_HYP - scores = eval.evaluate(data) - expected_scores = [0.30929, 0.3027, 0.3113] - assert len(scores) == len(data) - for score, expected_score in zip(scores, expected_scores): - if isinstance(score, list): - score = score[0] - assert abs(score - expected_score) < EPSILON - - -# TODO: These below tests are failing - - -def test_evaluator_comet20qe(): - model_path, vocab_path = get_known_model("comet20-da-qe") - args = BASE_ARGS | dict( - like="comet-qe", - model=model_path, - vocabs=[vocab_path, vocab_path], - ) - - eval = Evaluator(**args) - data = SAMPLE_SRC_HYP - scores = eval.evaluate(data) - assert len(scores) == len(data) - # TODO: add expected scores and asserts - - -def test_evaluator_comet20ref(): - model_path, vocab_path = get_known_model("comet20-da") - args = BASE_ARGS | dict( - like="comet", - model=model_path, - vocabs=[vocab_path, vocab_path], - ) - - eval = Evaluator(**args) - data = SAMPLE_SRC_HYP_REF - scores = eval.evaluate(data) - len(scores) == len(data) - - -# TODO: add expected scores and asserts diff --git 
a/src/python/tests/test_translate.py b/src/python/tests/test_translate.py deleted file mode 100644 index 0ad5adc60..000000000 --- a/src/python/tests/test_translate.py +++ /dev/null @@ -1,16 +0,0 @@ -from pathlib import Path - -from pymarian import Translator - -from . import BASE_ARGS - - -def test_ende(): - # TODO: download model from blob storage - model_dir = Path.home() / 'tmp/marian-eng-deu' - model_file = str(model_dir / 'model.bin') - vocab_file = str(model_dir / 'vocab.spm') - args = BASE_ARGS | dict(models=model_file, vocabs=[vocab_file, vocab_file]) - translator = Translator(**args) - hyp = translator.translate("Hello. Good morning.") - assert hyp == "Hallo. Guten Morgen." From 39ade681112458957a1c4016d22622f0bdcdb489 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Thu, 22 Feb 2024 12:32:58 +0000 Subject: [PATCH 18/26] Merged PR 33078: Merge public master with internal master This mostly adds @'s changes from public master to internal. I did an automatic merge and need to go through those changes myself. I think there is an issue in translator.h which I will fix. @ can you check if things work for you here? --- .github/workflows/ios.yml | 43 + .github/workflows/release.yml | 2 +- .github/workflows/ubuntu.yml | 2 +- .gitmodules | 6 + CHANGELOG.md | 2 + CMakeLists.txt | 74 +- VERSION | 2 +- azure-pipelines.yml | 2 +- cmake/ios.toolchain.cmake | 1099 +++++++++++++++++++ scripts/ci/install_mkl.sh | 2 +- src/3rd_party/CMakeLists.txt | 15 +- src/3rd_party/faiss/VectorTransform.cpp | 6 + src/3rd_party/faiss/VectorTransform.h | 2 + src/3rd_party/ruy | 1 + src/3rd_party/sentencepiece | 2 +- src/3rd_party/simd_utils | 1 + src/common/binary.cpp | 2 +- src/common/types.h | 6 +- src/functional/operators.h | 5 +- src/tensors/cpu/expression_graph_packable.h | 2 +- src/tensors/cpu/fbgemm/packed_gemm.cpp | 8 +- src/tensors/cpu/integer_common.h | 14 +- src/tensors/cpu/intgemm_interface.h | 4 +- src/tensors/cpu/prod.cpp | 8 - src/tensors/cpu/prod_blas.h | 130 ++- src/translator/translator.h | 1 - 26 files changed, 1393 insertions(+), 48 deletions(-) create mode 100644 .github/workflows/ios.yml create mode 100644 cmake/ios.toolchain.cmake create mode 160000 src/3rd_party/ruy create mode 160000 src/3rd_party/simd_utils diff --git a/.github/workflows/ios.yml b/.github/workflows/ios.yml new file mode 100644 index 000000000..4dfa8905d --- /dev/null +++ b/.github/workflows/ios.yml @@ -0,0 +1,43 @@ +name: iOS + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + build-macos: + name: iOS CPU-only + runs-on: macos-12 + + steps: + - name: Checkout + uses: actions/checkout@v2 + with: + submodules: recursive + + - name: Install dependencies + run: brew install boost openblas openssl protobuf + + - name: Configure CMake + run: | + export LDFLAGS="-L/usr/local/opt/openblas/lib" + export CPPFLAGS="-I/usr/local/opt/openblas/include" + mkdir -p build + cd build + cmake .. \ + -DCOMPILE_CPU=on \ + -DCOMPILE_CUDA=off \ + -DCOMPILE_EXAMPLES=on \ + -DCOMPILE_SERVER=off \ + -DCOMPILE_TESTS=on \ + -DUSE_SENTENCEPIECE=on \ + -DCMAKE_TOOLCHAIN_FILE=../cmake/ios.toolchain.cmake \ + -DUSE_SENTENCEPIECE=on \ + -DPLATFORM=OS64 \ + -DDEPLOYMENT_TARGET=13.0 + + - name: Compile + working-directory: build + run: cmake --build . 
--config Release \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 5beab28f0..59bb2dc76 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -77,7 +77,7 @@ jobs: # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html - name: Install MKL run: | - wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB" | sudo apt-key add - + wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB" | sudo apt-key add - sudo sh -c "echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list" sudo apt-get update -o Dir::Etc::sourcelist="/etc/apt/sources.list.d/intel-mkl.list" sudo apt-get install -y --no-install-recommends intel-mkl-64bit-2020.0-088 diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index f2baae82d..3a4c65b31 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -91,7 +91,7 @@ jobs: # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html - name: Install MKL run: | - wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB" | sudo apt-key add - + wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB" | sudo apt-key add - sudo sh -c "echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list" sudo apt-get update -o Dir::Etc::sourcelist="/etc/apt/sources.list.d/intel-mkl.list" sudo apt-get install -y --no-install-recommends intel-mkl-64bit-2020.0-088 diff --git a/.gitmodules b/.gitmodules index 7a94dab1d..083aabe85 100644 --- a/.gitmodules +++ b/.gitmodules @@ -20,6 +20,12 @@ [submodule "src/3rd_party/simple-websocket-server"] path = src/3rd_party/simple-websocket-server url = https://github.com/marian-nmt/Simple-WebSocket-Server +[submodule "src/3rd_party/ruy"] + path = src/3rd_party/ruy + url = https://github.com/marian-nmt/ruy.git +[submodule "src/3rd_party/simd_utils"] + path = src/3rd_party/simd_utils + url = https://github.com/marian-nmt/simd_utils.git [submodule "src/3rd_party/pybind11"] path = src/3rd_party/pybind11 url = https://github.com/pybind/pybind11.git diff --git a/CHANGELOG.md b/CHANGELOG.md index 9412de3a2..3049e622e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] - Fixed compilation with clang 16.0.6 +- Added Threads::Threads to EXT_LIBS + ### Added - Added `pymarian-eval`, CLI for scoring metrics diff --git a/CMakeLists.txt b/CMakeLists.txt index e16876f78..b6aa74297 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,7 @@ if (POLICY CMP0074) endif () project(marian CXX C) + set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.") @@ -82,6 +83,48 @@ if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "Release") endif() +# iOS support +if(CMAKE_SYSTEM_NAME STREQUAL "iOS" ) + set(ARM ON) + # need to ignore this warning for Xcode to be happy + list(APPEND ALL_WARNINGS -Wno-shorten-64-to-32;) +endif() + +# ARM support: currently ONLY armv8. 
armv8 includes NEON by default +# we do not currently have good support for automatic architecture detection, including for cross-compilation +# this is planned for future PRs +if(ARM) + + # Apple by default has Apple Accelerate. Otherwise fallback to RUY for GEMM + if(APPLE) + message(STATUS "Using Apple Accelerate SGEMM") + option(USE_RUY_SGEMM "Compile with Ruy SGEMM" OFF) + else(APPLE) + message(STATUS "Using Ruy SGEMM") + set(EXT_LIBS ${EXT_LIBS} ruy) + option(USE_RUY_SGEMM "Compile with Ruy SGEMM" ON) + endif(APPLE) + + # Define that we are using ARM as required by simd_utils. See their README for info + add_compile_definitions(ARM FMA SSE) + # Some warnings as errors. I don't feel comfortable about the strict aliasing. + set(ARM_WARNINGS "-fno-strict-aliasing -Wno-comment") + + if(MSVC) + add_compile_options(/flax-vector-conversions) + else(MSVC) + add_compile_options(-flax-vector-conversions) + endif(MSVC) +endif(ARM) + +######## +# pThreads: consider it as EXT_LIBS for a more portable binary +set(CMAKE_THREAD_PREFER_PTHREAD TRUE) +set(THREADS_PREFER_PTHREAD_FLAG TRUE) +find_package(Threads REQUIRED) +set(EXT_LIBS ${EXT_LIBS} Threads::Threads) +######## + ############################################################################### # Set compilation flags if(MSVC) @@ -141,13 +184,16 @@ else(MSVC) set(INTRINSICS "") list(APPEND INTRINSICS_NVCC) - option(COMPILE_SSE2 "Compile CPU code with SSE2 support" ON) - option(COMPILE_SSE3 "Compile CPU code with SSE3 support" ON) - option(COMPILE_SSE4_1 "Compile CPU code with SSE4.1 support" ON) - option(COMPILE_SSE4_2 "Compile CPU code with SSE4.2 support" ON) - option(COMPILE_AVX "Compile CPU code with AVX support" ON) - option(COMPILE_AVX2 "Compile CPU code with AVX2 support" ON) - option(COMPILE_AVX512 "Compile CPU code with AVX512 support" ON) + if(NOT ARM) + # none of these options are available on ARM + option(COMPILE_SSE2 "Compile CPU code with SSE2 support" ON) + option(COMPILE_SSE3 "Compile CPU code with SSE3 support" ON) + option(COMPILE_SSE4_1 "Compile CPU code with SSE4.1 support" ON) + option(COMPILE_SSE4_2 "Compile CPU code with SSE4.2 support" ON) + option(COMPILE_AVX "Compile CPU code with AVX support" ON) + option(COMPILE_AVX2 "Compile CPU code with AVX2 support" ON) + option(COMPILE_AVX512 "Compile CPU code with AVX512 support" ON) + endif(NOT ARM) if(BUILD_ARCH STREQUAL "native") message(STATUS "Building with -march=native and intrinsics will be chosen automatically by the compiler to match the current machine.") @@ -223,7 +269,7 @@ else(MSVC) # Clang-10.0.0 complains when CUDA is newer than 10.1 set(CLANG_IGNORE_UNKNOWN_CUDA "-Wno-unknown-warning-option -Wno-unknown-cuda-version") endif() - set(DISABLE_GLOBALLY "-Wno-unused-result ${CLANG_IGNORE_UNKNOWN_CUDA}") + set(DISABLE_GLOBALLY "-Wno-unused-result ${CLANG_IGNORE_UNKNOWN_CUDA} ${ARM_WARNINGS}") # These are used in src/CMakeLists.txt on a per-target basis list(APPEND ALL_WARNINGS -Wall; -Werror; -Wextra; -Wno-unused-result; -Wno-deprecated; @@ -242,7 +288,7 @@ else(MSVC) set(CMAKE_RDYNAMIC_FLAG "-rdynamic") endif(CMAKE_COMPILER_IS_GNUCC) - set(CMAKE_CXX_FLAGS "-std=c++17 -pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}") + set(CMAKE_CXX_FLAGS "-std=c++17 -pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} ${INTRINSICS}") set(CMAKE_CXX_FLAGS_RELEASE "-O3 -m64 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}") set(CMAKE_CXX_FLAGS_SLIM "-O3 -m64 -funroll-loops -DNDEBUG") @@ 
-252,7 +298,7 @@ else(MSVC) set(CMAKE_CXX_FLAGS_PROFUSE "${CMAKE_CXX_FLAGS_RELEASE} -fprofile-use -fprofile-correction") # these need to be set separately - set(CMAKE_C_FLAGS "-pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}") + set(CMAKE_C_FLAGS "-pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} ${INTRINSICS}") set(CMAKE_C_FLAGS_RELEASE "-O3 -m64 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}") set(CMAKE_C_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}") set(CMAKE_C_FLAGS_SLIM "-O3 -m64 -funroll-loops -DNDEBUG") @@ -260,6 +306,12 @@ else(MSVC) set(CMAKE_C_FLAGS_PROFILE "${CMAKE_C_FLAGS_RELEASE} -pg") set(CMAKE_C_FLAGS_PROFGEN "${CMAKE_C_FLAGS_RELEASE} -fprofile-generate -fprofile-correction") set(CMAKE_C_FLAGS_PROFUSE "${CMAKE_C_FLAGS_RELEASE} -fprofile-use -fprofile-correction") + + # set -march for all builds except iOS cross compilation + if(NOT CMAKE_SYSTEM_NAME STREQUAL "iOS" ) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${BUILD_ARCH}") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=${BUILD_ARCH}") + endif() endif(MSVC) # with gcc 7.0 and above we need to mark fallthrough in switch case statements @@ -521,7 +573,7 @@ endif() ############################################################################### # Find BLAS library if(COMPILE_CPU) - if(NOT GENERATE_MARIAN_INSTALL_TARGETS) + if(NOT GENERATE_MARIAN_INSTALL_TARGETS AND NOT ARM) set(EXT_LIBS ${EXT_LIBS} intgemm) # Enable intgemm when compiling CPU add_definitions(-DCOMPILE_CPU=1) endif() diff --git a/VERSION b/VERSION index 53dbb431e..329143f69 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.25 +v1.12.26 diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4c7cd0bfd..d9c816928 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -68,7 +68,7 @@ variables: - name: MKL_DIR value: "$(Build.SourcesDirectory)/mkl" - name: MKL_URL - value: "https://data.statmt.org/romang/marian-regression-tests/ci/mkl-2020.1-windows-static.zip" + value: "https://mariandev.blob.core.windows.net/public/ci/mkl-2020.1-windows-static.zip" - name: VCPKG_COMMIT value: 2023.11.20 - name: VCPKG_DIR diff --git a/cmake/ios.toolchain.cmake b/cmake/ios.toolchain.cmake new file mode 100644 index 000000000..2131172fd --- /dev/null +++ b/cmake/ios.toolchain.cmake @@ -0,0 +1,1099 @@ +# This file is part of the ios-cmake project. It was retrieved from +# https://github.com/leetal/ios-cmake.git, which is a fork of +# https://github.com/gerstrong/ios-cmake.git, which is a fork of +# https://github.com/cristeab/ios-cmake.git, which is a fork of +# https://code.google.com/p/ios-cmake/. Which in turn is based off of +# the Platform/Darwin.cmake and Platform/UnixPaths.cmake files which +# are included with CMake 2.8.4 +# +# The ios-cmake project is licensed under the new BSD license. +# +# Copyright (c) 2014, Bogdan Cristea and LTE Engineering Software, +# Kitware, Inc., Insight Software Consortium. All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. 
Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# This file is based on the Platform/Darwin.cmake and +# Platform/UnixPaths.cmake files which are included with CMake 2.8.4 +# It has been altered for iOS development. +# +# Updated by Alex Stewart (alexs.mac@gmail.com) +# +# ***************************************************************************** +# Now maintained by Alexander Widerberg (widerbergaren [at] gmail.com) +# under the BSD-3-Clause license +# https://github.com/leetal/ios-cmake +# ***************************************************************************** +# +# INFORMATION / HELP +# +############################################################################### +# OPTIONS # +############################################################################### +# +# PLATFORM: (default "OS64") +# OS = Build for iPhoneOS. +# OS64 = Build for arm64 iphoneOS. +# OS64COMBINED = Build for arm64 x86_64 iphoneOS + iphoneOS Simulator. Combined into FAT STATIC lib (only supported on 3.14+ of CMake with "-G Xcode" argument in combination with the "cmake --install" CMake build step) +# SIMULATOR = Build for x86 i386 iphoneOS Simulator. +# SIMULATOR64 = Build for x86_64 iphoneOS Simulator. +# SIMULATORARM64 = Build for arm64 iphoneOS Simulator. +# SIMULATOR64COMBINED = Build for arm64 x86_64 iphoneOS Simulator. Combined into FAT STATIC lib (supported on 3.14+ of CMakewith "-G Xcode" argument ONLY) +# TVOS = Build for arm64 tvOS. +# TVOSCOMBINED = Build for arm64 x86_64 tvOS + tvOS Simulator. Combined into FAT STATIC lib (only supported on 3.14+ of CMake with "-G Xcode" argument in combination with the "cmake --install" CMake build step) +# SIMULATOR_TVOS = Build for x86_64 tvOS Simulator. +# SIMULATORARM64_TVOS = Build for arm64 tvOS Simulator. +# WATCHOS = Build for armv7k arm64_32 for watchOS. +# WATCHOSCOMBINED = Build for armv7k arm64_32 x86_64 watchOS + watchOS Simulator. Combined into FAT STATIC lib (only supported on 3.14+ of CMake with "-G Xcode" argument in combination with the "cmake --install" CMake build step) +# SIMULATOR_WATCHOS = Build for x86_64 for watchOS Simulator. +# MAC = Build for x86_64 macOS. +# MAC_ARM64 = Build for Apple Silicon macOS. +# MAC_UNIVERSAL = Combined build for x86_64 and Apple Silicon on macOS. +# MAC_CATALYST = Build for x86_64 macOS with Catalyst support (iOS toolchain on macOS). +# Note: The build argument "MACOSX_DEPLOYMENT_TARGET" can be used to control min-version of macOS +# MAC_CATALYST_ARM64 = Build for Apple Silicon macOS with Catalyst support (iOS toolchain on macOS). 
+# Note: The build argument "MACOSX_DEPLOYMENT_TARGET" can be used to control min-version of macOS +# +# CMAKE_OSX_SYSROOT: Path to the SDK to use. By default this is +# automatically determined from PLATFORM and xcodebuild, but +# can also be manually specified (although this should not be required). +# +# CMAKE_DEVELOPER_ROOT: Path to the Developer directory for the platform +# being compiled for. By default, this is automatically determined from +# CMAKE_OSX_SYSROOT, but can also be manually specified (although this should +# not be required). +# +# DEPLOYMENT_TARGET: Minimum SDK version to target. Default 2.0 on watchOS and 9.0 on tvOS+iOS +# +# NAMED_LANGUAGE_SUPPORT: +# ON (default) = Will require "enable_language(OBJC) and/or enable_language(OBJCXX)" for full OBJC|OBJCXX support +# OFF = Will embed the OBJC and OBJCXX flags into the CMAKE_C_FLAGS and CMAKE_CXX_FLAGS (legacy behavior, CMake version < 3.16) +# +# ENABLE_BITCODE: (ON|OFF) Enables or disables bitcode support. Default OFF +# +# ENABLE_ARC: (ON|OFF) Enables or disables ARC support. Default ON (ARC enabled by default) +# +# ENABLE_VISIBILITY: (ON|OFF) Enables or disables symbol visibility support. Default OFF (visibility hidden by default) +# +# ENABLE_STRICT_TRY_COMPILE: (ON|OFF) Enables or disables strict try_compile() on all Check* directives (will run linker +# to actually check if linking is possible). Default OFF (will set CMAKE_TRY_COMPILE_TARGET_TYPE to STATIC_LIBRARY) +# +# ARCHS: (armv7 armv7s armv7k arm64 arm64_32 i386 x86_64) If specified, will override the default architectures for the given PLATFORM +# OS = armv7 armv7s arm64 (if applicable) +# OS64 = arm64 (if applicable) +# SIMULATOR = i386 +# SIMULATOR64 = x86_64 +# SIMULATORARM64 = arm64 +# TVOS = arm64 +# SIMULATOR_TVOS = x86_64 (i386 has since long been deprecated) +# SIMULATORARM64_TVOS = arm64 +# WATCHOS = armv7k arm64_32 (if applicable) +# SIMULATOR_WATCHOS = x86_64 (i386 has since long been deprecated) +# MAC = x86_64 +# MAC_ARM64 = arm64 +# MAC_UNIVERSAL = x86_64 arm64 +# MAC_CATALYST = x86_64 +# MAC_CATALYST_ARM64 = arm64 +# +# NOTE: When manually specifying ARCHS, put a semi-colon between the entries. E.g., -DARCHS="armv7;arm64" +# +############################################################################### +# END OPTIONS # +############################################################################### +# +# This toolchain defines the following properties (available via get_property()) for use externally: +# +# PLATFORM: The currently targeted platform. +# XCODE_VERSION: Version number (not including Build version) of Xcode detected. +# SDK_VERSION: Version of SDK being used. +# OSX_ARCHITECTURES: Architectures being compiled for (generated from PLATFORM). +# APPLE_TARGET_TRIPLE: Used by autoconf build systems. NOTE: If "ARCHS" is overridden, this will *NOT* be set! +# +# This toolchain defines the following macros for use externally: +# +# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE XCODE_VARIANT) +# A convenience macro for setting xcode specific properties on targets. +# Available variants are: All, Release, RelWithDebInfo, Debug, MinSizeRel +# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1" "all"). +# +# find_host_package (PROGRAM ARGS) +# A macro used to find executable programs on the host system, not within the +# environment. Thanks to the android-cmake project for providing the +# command. 
+# + +cmake_minimum_required(VERSION 3.8.0) + +# CMake invokes the toolchain file twice during the first build, but only once during subsequent rebuilds. +if(DEFINED ENV{_IOS_TOOLCHAIN_HAS_RUN}) + return() +endif() +set(ENV{_IOS_TOOLCHAIN_HAS_RUN} true) + +# List of supported platform values +list(APPEND _supported_platforms + "OS" "OS64" "OS64COMBINED" "SIMULATOR" "SIMULATOR64" "SIMULATORARM64" "SIMULATOR64COMBINED" + "TVOS" "TVOSCOMBINED" "SIMULATOR_TVOS" "SIMULATORARM64_TVOS" + "WATCHOS" "WATCHOSCOMBINED" "SIMULATOR_WATCHOS" + "MAC" "MAC_ARM64" "MAC_UNIVERSAL" + "VISIONOS" "SIMULATOR_VISIONOS" "SIMULATOR64_VISIONOS" + "MAC_CATALYST" "MAC_CATALYST_ARM64") + +# Cache what generator is used +set(USED_CMAKE_GENERATOR "${CMAKE_GENERATOR}") + +# Check if using a CMake version capable of building combined FAT builds (simulator and target slices combined in one static lib) +if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.14") + set(MODERN_CMAKE YES) +endif() + +# Get the Xcode version being used. +# Problem: CMake runs toolchain files multiple times, but can't read cache variables on some runs. +# Workaround: On the first run (in which cache variables are always accessible), set an intermediary environment variable. +# +# NOTE: This pattern is used in many places in this toolchain to speed up checks of all sorts +if(DEFINED XCODE_VERSION_INT) + # Environment variables are always preserved. + set(ENV{_XCODE_VERSION_INT} "${XCODE_VERSION_INT}") +elseif(DEFINED ENV{_XCODE_VERSION_INT}) + set(XCODE_VERSION_INT "$ENV{_XCODE_VERSION_INT}") +elseif(NOT DEFINED XCODE_VERSION_INT) + find_program(XCODEBUILD_EXECUTABLE xcodebuild) + if(NOT XCODEBUILD_EXECUTABLE) + message(FATAL_ERROR "xcodebuild not found. Please install either the standalone commandline tools or Xcode.") + endif() + execute_process(COMMAND ${XCODEBUILD_EXECUTABLE} -version + OUTPUT_VARIABLE XCODE_VERSION_INT + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION_INT "${XCODE_VERSION_INT}") + string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION_INT "${XCODE_VERSION_INT}") + set(XCODE_VERSION_INT "${XCODE_VERSION_INT}" CACHE INTERNAL "") +endif() + +# Assuming that xcode 12.0 is installed you most probably have ios sdk 14.0 or later installed (tested on Big Sur) +# if you don't set a deployment target it will be set the way you only get 64-bit builds +#if(NOT DEFINED DEPLOYMENT_TARGET AND XCODE_VERSION_INT VERSION_GREATER 12.0) +# Temporarily fix the arm64 issues in CMake install-combined by excluding arm64 for simulator builds (needed for Apple Silicon...) +# set(CMAKE_XCODE_ATTRIBUTE_EXCLUDED_ARCHS[sdk=iphonesimulator*] "arm64") +#endif() + +# Check if the platform variable is set +if(DEFINED PLATFORM) + # Environment variables are always preserved. + set(ENV{_PLATFORM} "${PLATFORM}") +elseif(DEFINED ENV{_PLATFORM}) + set(PLATFORM "$ENV{_PLATFORM}") +elseif(NOT DEFINED PLATFORM) + message(FATAL_ERROR "PLATFORM argument not set. 
Bailing configure since I don't know what target you want to build for!") +endif () + +if(PLATFORM MATCHES ".*COMBINED" AND NOT CMAKE_GENERATOR MATCHES "Xcode") + message(FATAL_ERROR "The combined builds support requires Xcode to be used as a generator via '-G Xcode' command-line argument in CMake") +endif() + +# Safeguard that the platform value is set and is one of the supported values +list(FIND _supported_platforms ${PLATFORM} contains_PLATFORM) +if("${contains_PLATFORM}" EQUAL "-1") + string(REPLACE ";" "\n * " _supported_platforms_formatted "${_supported_platforms}") + message(FATAL_ERROR " Invalid PLATFORM specified! Current value: ${PLATFORM}.\n" + " Supported PLATFORM values: \n * ${_supported_platforms_formatted}") +endif() + +# Check if Apple Silicon is supported +if(PLATFORM MATCHES "^(MAC_ARM64)$|^(MAC_CATALYST_ARM64)$|^(MAC_UNIVERSAL)$" AND ${CMAKE_VERSION} VERSION_LESS "3.19.5") + message(FATAL_ERROR "Apple Silicon builds requires a minimum of CMake 3.19.5") +endif() + +# Touch the toolchain variable to suppress the "unused variable" warning. +# This happens if CMake is invoked with the same command line the second time. +if(CMAKE_TOOLCHAIN_FILE) +endif() + +# Fix for PThread library not in path +set(CMAKE_THREAD_LIBS_INIT "-lpthread") +set(CMAKE_HAVE_THREADS_LIBRARY 1) +set(CMAKE_USE_WIN32_THREADS_INIT 0) +set(CMAKE_USE_PTHREADS_INIT 1) + +# Specify named language support defaults. +if(NOT DEFINED NAMED_LANGUAGE_SUPPORT AND ${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.16") + set(NAMED_LANGUAGE_SUPPORT ON) + message(STATUS "[DEFAULTS] Using explicit named language support! E.g., enable_language(CXX) is needed in the project files.") +elseif(NOT DEFINED NAMED_LANGUAGE_SUPPORT AND ${CMAKE_VERSION} VERSION_LESS "3.16") + set(NAMED_LANGUAGE_SUPPORT OFF) + message(STATUS "[DEFAULTS] Disabling explicit named language support. Falling back to legacy behavior.") +elseif(DEFINED NAMED_LANGUAGE_SUPPORT AND ${CMAKE_VERSION} VERSION_LESS "3.16") + message(FATAL_ERROR "CMake named language support for OBJC and OBJCXX was added in CMake 3.16.") +endif() +set(NAMED_LANGUAGE_SUPPORT_INT ${NAMED_LANGUAGE_SUPPORT} CACHE BOOL + "Whether or not to enable explicit named language support" FORCE) + +# Specify the minimum version of the deployment target. +if(NOT DEFINED DEPLOYMENT_TARGET) + if (PLATFORM MATCHES "WATCHOS") + # Unless specified, SDK version 4.0 is used by default as minimum target version (watchOS). + set(DEPLOYMENT_TARGET "4.0") + elseif(PLATFORM STREQUAL "MAC") + # Unless specified, SDK version 10.13 (High Sierra) is used by default as the minimum target version (macos). + set(DEPLOYMENT_TARGET "10.13") + elseif(PLATFORM STREQUAL "VISIONOS" OR PLATFORM STREQUAL "SIMULATOR_VISIONOS" OR PLATFORM STREQUAL "SIMULATOR64_VISIONOS") + # Unless specified, SDK version 1.0 is used by default as minimum target version (visionOS). + set(DEPLOYMENT_TARGET "1.0") + elseif(PLATFORM STREQUAL "MAC_ARM64") + # Unless specified, SDK version 11.0 (Big Sur) is used by default as the minimum target version (macOS on arm). + set(DEPLOYMENT_TARGET "11.0") + elseif(PLATFORM STREQUAL "MAC_UNIVERSAL") + # Unless specified, SDK version 11.0 (Big Sur) is used by default as minimum target version for universal builds. + set(DEPLOYMENT_TARGET "11.0") + elseif(PLATFORM STREQUAL "MAC_CATALYST" OR PLATFORM STREQUAL "MAC_CATALYST_ARM64") + # Unless specified, SDK version 13.0 is used by default as the minimum target version (mac catalyst minimum requirement). 
+ set(DEPLOYMENT_TARGET "13.1") + else() + # Unless specified, SDK version 11.0 is used by default as the minimum target version (iOS, tvOS). + set(DEPLOYMENT_TARGET "11.0") + endif() + message(STATUS "[DEFAULTS] Using the default min-version since DEPLOYMENT_TARGET not provided!") +elseif(DEFINED DEPLOYMENT_TARGET AND PLATFORM MATCHES "^MAC_CATALYST" AND ${DEPLOYMENT_TARGET} VERSION_LESS "13.1") + message(FATAL_ERROR "Mac Catalyst builds requires a minimum deployment target of 13.1!") +endif() + +# Store the DEPLOYMENT_TARGET in the cache +set(DEPLOYMENT_TARGET "${DEPLOYMENT_TARGET}" CACHE INTERNAL "") + +# Handle the case where we are targeting iOS and a version above 10.3.4 (32-bit support dropped officially) +if(PLATFORM STREQUAL "OS" AND DEPLOYMENT_TARGET VERSION_GREATER_EQUAL 10.3.4) + set(PLATFORM "OS64") + message(STATUS "Targeting minimum SDK version ${DEPLOYMENT_TARGET}. Dropping 32-bit support.") +elseif(PLATFORM STREQUAL "SIMULATOR" AND DEPLOYMENT_TARGET VERSION_GREATER_EQUAL 10.3.4) + set(PLATFORM "SIMULATOR64") + message(STATUS "Targeting minimum SDK version ${DEPLOYMENT_TARGET}. Dropping 32-bit support.") +endif() + +set(PLATFORM_INT "${PLATFORM}") + +if(DEFINED ARCHS) + string(REPLACE ";" "-" ARCHS_SPLIT "${ARCHS}") +endif() + +# Determine the platform name and architectures for use in xcodebuild commands +# from the specified PLATFORM_INT name. +if(PLATFORM_INT STREQUAL "OS") + set(SDK_NAME iphoneos) + if(NOT ARCHS) + set(ARCHS armv7 armv7s arm64) + set(APPLE_TARGET_TRIPLE_INT arm-apple-ios${DEPLOYMENT_TARGET}) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}) + endif() +elseif(PLATFORM_INT STREQUAL "OS64") + set(SDK_NAME iphoneos) + if(NOT ARCHS) + if (XCODE_VERSION_INT VERSION_GREATER 10.0) + set(ARCHS arm64) # FIXME: Add arm64e when Apple has fixed the integration issues with it, libarclite_iphoneos.a is currently missing bitcode markers for example + else() + set(ARCHS arm64) + endif() + set(APPLE_TARGET_TRIPLE_INT arm64-apple-ios${DEPLOYMENT_TARGET}) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}) + endif() +elseif(PLATFORM_INT STREQUAL "OS64COMBINED") + set(SDK_NAME iphoneos) + if(MODERN_CMAKE) + if(NOT ARCHS) + if (XCODE_VERSION_INT VERSION_GREATER 12.0) + set(ARCHS arm64 x86_64) + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=iphoneos*] "arm64") + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=iphonesimulator*] "x86_64 arm64") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=iphoneos*] "arm64") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=iphonesimulator*] "x86_64 arm64") + else() + set(ARCHS arm64 x86_64) + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=iphoneos*] "arm64") + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=iphonesimulator*] "x86_64") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=iphoneos*] "arm64") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=iphonesimulator*] "x86_64") + endif() + set(APPLE_TARGET_TRIPLE_INT arm64-x86_64-apple-ios${DEPLOYMENT_TARGET}) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}) + endif() + else() + message(FATAL_ERROR "Please make sure that you are running CMake 3.14+ to make the OS64COMBINED setting work") + endif() +elseif(PLATFORM_INT STREQUAL "SIMULATOR64COMBINED") + set(SDK_NAME iphonesimulator) + if(MODERN_CMAKE) + if(NOT ARCHS) + if (XCODE_VERSION_INT VERSION_GREATER 12.0) + set(ARCHS arm64 x86_64) # FIXME: Add arm64e when Apple have fixed the integration issues with it, libarclite_iphoneos.a is currently missing bitcode markers for example + 
set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=iphoneos*] "") + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=iphonesimulator*] "x86_64 arm64") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=iphoneos*] "") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=iphonesimulator*] "x86_64 arm64") + else() + set(ARCHS arm64 x86_64) + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=iphoneos*] "") + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=iphonesimulator*] "x86_64") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=iphoneos*] "") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=iphonesimulator*] "x86_64") + endif() + set(APPLE_TARGET_TRIPLE_INT aarch64-x86_64-apple-ios${DEPLOYMENT_TARGET}-simulator) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}-simulator) + endif() + else() + message(FATAL_ERROR "Please make sure that you are running CMake 3.14+ to make the SIMULATOR64COMBINED setting work") + endif() +elseif(PLATFORM_INT STREQUAL "SIMULATOR") + set(SDK_NAME iphonesimulator) + if(NOT ARCHS) + set(ARCHS i386) + set(APPLE_TARGET_TRIPLE_INT i386-apple-ios${DEPLOYMENT_TARGET}-simulator) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}-simulator) + endif() + message(DEPRECATION "SIMULATOR IS DEPRECATED. Consider using SIMULATOR64 instead.") +elseif(PLATFORM_INT STREQUAL "SIMULATOR64") + set(SDK_NAME iphonesimulator) + if(NOT ARCHS) + set(ARCHS x86_64) + set(APPLE_TARGET_TRIPLE_INT x86_64-apple-ios${DEPLOYMENT_TARGET}-simulator) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}-simulator) + endif() +elseif(PLATFORM_INT STREQUAL "SIMULATORARM64") + set(SDK_NAME iphonesimulator) + if(NOT ARCHS) + set(ARCHS arm64) + set(APPLE_TARGET_TRIPLE_INT arm64-apple-ios${DEPLOYMENT_TARGET}-simulator) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}-simulator) + endif() +elseif(PLATFORM_INT STREQUAL "TVOS") + set(SDK_NAME appletvos) + if(NOT ARCHS) + set(ARCHS arm64) + set(APPLE_TARGET_TRIPLE_INT arm64-apple-tvos${DEPLOYMENT_TARGET}) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-tvos${DEPLOYMENT_TARGET}) + endif() +elseif (PLATFORM_INT STREQUAL "TVOSCOMBINED") + set(SDK_NAME appletvos) + if(MODERN_CMAKE) + if(NOT ARCHS) + set(ARCHS arm64 x86_64) + set(APPLE_TARGET_TRIPLE_INT arm64-x86_64-apple-tvos${DEPLOYMENT_TARGET}) + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=appletvos*] "arm64") + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=appletvsimulator*] "x86_64") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=appletvos*] "arm64") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=appletvsimulator*] "x86_64") + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-tvos${DEPLOYMENT_TARGET}) + endif() + else() + message(FATAL_ERROR "Please make sure that you are running CMake 3.14+ to make the TVOSCOMBINED setting work") + endif() +elseif(PLATFORM_INT STREQUAL "SIMULATOR_TVOS") + set(SDK_NAME appletvsimulator) + if(NOT ARCHS) + set(ARCHS x86_64) + set(APPLE_TARGET_TRIPLE_INT x86_64-apple-tvos${DEPLOYMENT_TARGET}-simulator) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-tvos${DEPLOYMENT_TARGET}-simulator) + endif() +elseif(PLATFORM_INT STREQUAL "SIMULATORARM64_TVOS") + set(SDK_NAME appletvsimulator) + if(NOT ARCHS) + set(ARCHS arm64) + set(APPLE_TARGET_TRIPLE_INT arm64-apple-tvos${DEPLOYMENT_TARGET}-simulator) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-tvos${DEPLOYMENT_TARGET}-simulator) + endif() +elseif(PLATFORM_INT STREQUAL "WATCHOS") + set(SDK_NAME watchos) + if(NOT ARCHS) + if (XCODE_VERSION_INT VERSION_GREATER 10.0) + 
set(ARCHS armv7k arm64_32) + set(APPLE_TARGET_TRIPLE_INT arm64_32-apple-watchos${DEPLOYMENT_TARGET}) + else() + set(ARCHS armv7k) + set(APPLE_TARGET_TRIPLE_INT arm-apple-watchos${DEPLOYMENT_TARGET}) + endif() + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-watchos${DEPLOYMENT_TARGET}) + endif() +elseif(PLATFORM_INT STREQUAL "WATCHOSCOMBINED") + set(SDK_NAME watchos) + if(MODERN_CMAKE) + if(NOT ARCHS) + if (XCODE_VERSION_INT VERSION_GREATER 10.0) + set(ARCHS armv7k arm64_32 i386) + set(APPLE_TARGET_TRIPLE_INT arm64_32-i386-apple-watchos${DEPLOYMENT_TARGET}) + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=watchos*] "armv7k arm64_32") + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=watchsimulator*] "i386") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=watchos*] "armv7k arm64_32") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=watchsimulator*] "i386") + else() + set(ARCHS armv7k i386) + set(APPLE_TARGET_TRIPLE_INT arm-i386-apple-watchos${DEPLOYMENT_TARGET}) + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=watchos*] "armv7k") + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=watchsimulator*] "i386") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=watchos*] "armv7k") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=watchsimulator*] "i386") + endif() + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-watchos${DEPLOYMENT_TARGET}) + endif() + else() + message(FATAL_ERROR "Please make sure that you are running CMake 3.14+ to make the WATCHOSCOMBINED setting work") + endif() +elseif(PLATFORM_INT STREQUAL "SIMULATOR_WATCHOS") + set(SDK_NAME watchsimulator) + if(NOT ARCHS) + set(ARCHS i386) + set(APPLE_TARGET_TRIPLE_INT i386-apple-watchos${DEPLOYMENT_TARGET}-simulator) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-watchos${DEPLOYMENT_TARGET}-simulator) + endif() +elseif(PLATFORM_INT STREQUAL "SIMULATOR64_VISIONOS") + set(SDK_NAME xrsimulator) + if(NOT ARCHS) + set(ARCHS x86_64) + set(APPLE_TARGET_TRIPLE_INT x86_64-apple-xros${DEPLOYMENT_TARGET}-simulator) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-xros${DEPLOYMENT_TARGET}-simulator) + endif() +elseif(PLATFORM_INT STREQUAL "SIMULATOR_VISIONOS") + set(SDK_NAME xrsimulator) + if(NOT ARCHS) + set(ARCHS arm64) + set(APPLE_TARGET_TRIPLE_INT arm64-apple-xros${DEPLOYMENT_TARGET}-simulator) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-xros${DEPLOYMENT_TARGET}-simulator) + endif() +elseif(PLATFORM_INT STREQUAL "VISIONOS") + set(SDK_NAME xros) + if(NOT ARCHS) + set(ARCHS arm64) + set(APPLE_TARGET_TRIPLE_INT arm64-apple-xros${DEPLOYMENT_TARGET}) + else() + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-xros${DEPLOYMENT_TARGET}) + endif() +elseif(PLATFORM_INT STREQUAL "MAC" OR PLATFORM_INT STREQUAL "MAC_CATALYST") + set(SDK_NAME macosx) + if(NOT ARCHS) + set(ARCHS x86_64) + endif() + string(REPLACE ";" "-" ARCHS_SPLIT "${ARCHS}") + if(PLATFORM_INT STREQUAL "MAC") + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-macosx${DEPLOYMENT_TARGET}) + elseif(PLATFORM_INT STREQUAL "MAC_CATALYST") + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}-macabi) + endif() +elseif(PLATFORM_INT MATCHES "^(MAC_ARM64)$|^(MAC_CATALYST_ARM64)$") + set(SDK_NAME macosx) + if(NOT ARCHS) + set(ARCHS arm64) + endif() + string(REPLACE ";" "-" ARCHS_SPLIT "${ARCHS}") + if(PLATFORM_INT STREQUAL "MAC_ARM64") + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-macosx${DEPLOYMENT_TARGET}) + elseif(PLATFORM_INT STREQUAL "MAC_CATALYST_ARM64") + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-ios${DEPLOYMENT_TARGET}-macabi) + endif() +elseif(PLATFORM_INT STREQUAL 
"MAC_UNIVERSAL") + set(SDK_NAME macosx) + if(NOT ARCHS) + set(ARCHS "x86_64;arm64") + endif() + string(REPLACE ";" "-" ARCHS_SPLIT "${ARCHS}") + set(APPLE_TARGET_TRIPLE_INT ${ARCHS_SPLIT}-apple-macosx${DEPLOYMENT_TARGET}) +else() + message(FATAL_ERROR "Invalid PLATFORM: ${PLATFORM_INT}") +endif() + +string(REPLACE ";" " " ARCHS_SPACED "${ARCHS}") + +if(MODERN_CMAKE AND PLATFORM_INT MATCHES ".*COMBINED" AND NOT CMAKE_GENERATOR MATCHES "Xcode") + message(FATAL_ERROR "The COMBINED options only work with Xcode generator, -G Xcode") +endif() + +if(CMAKE_GENERATOR MATCHES "Xcode" AND PLATFORM_INT MATCHES "^MAC_CATALYST") + set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++") + set(CMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS "macosx") + set(CMAKE_XCODE_ATTRIBUTE_SUPPORTS_MACCATALYST "YES") + if(NOT DEFINED MACOSX_DEPLOYMENT_TARGET) + set(CMAKE_XCODE_ATTRIBUTE_MACOSX_DEPLOYMENT_TARGET "10.15") + else() + set(CMAKE_XCODE_ATTRIBUTE_MACOSX_DEPLOYMENT_TARGET "${MACOSX_DEPLOYMENT_TARGET}") + endif() +elseif(CMAKE_GENERATOR MATCHES "Xcode") + set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++") + set(CMAKE_XCODE_ATTRIBUTE_IPHONEOS_DEPLOYMENT_TARGET "${DEPLOYMENT_TARGET}") + if(NOT PLATFORM_INT MATCHES ".*COMBINED") + set(CMAKE_XCODE_ATTRIBUTE_ARCHS[sdk=${SDK_NAME}*] "${ARCHS_SPACED}") + set(CMAKE_XCODE_ATTRIBUTE_VALID_ARCHS[sdk=${SDK_NAME}*] "${ARCHS_SPACED}") + endif() +endif() + +# If the user did not specify the SDK root to use, then query xcodebuild for it. +if(DEFINED CMAKE_OSX_SYSROOT_INT) + # Environment variables are always preserved. + set(ENV{_CMAKE_OSX_SYSROOT_INT} "${CMAKE_OSX_SYSROOT_INT}") +elseif(DEFINED ENV{_CMAKE_OSX_SYSROOT_INT}) + set(CMAKE_OSX_SYSROOT_INT "$ENV{_CMAKE_OSX_SYSROOT_INT}") +elseif(NOT DEFINED CMAKE_OSX_SYSROOT_INT) + execute_process(COMMAND ${XCODEBUILD_EXECUTABLE} -version -sdk ${SDK_NAME} Path + OUTPUT_VARIABLE CMAKE_OSX_SYSROOT_INT + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) +endif() + +if (NOT DEFINED CMAKE_OSX_SYSROOT_INT AND NOT DEFINED CMAKE_OSX_SYSROOT) + message(SEND_ERROR "Please make sure that Xcode is installed and that the toolchain" + "is pointing to the correct path. Please run:" + "sudo xcode-select -s /Applications/Xcode.app/Contents/Developer" + "and see if that fixes the problem for you.") + message(FATAL_ERROR "Invalid CMAKE_OSX_SYSROOT: ${CMAKE_OSX_SYSROOT} " + "does not exist.") +elseif(DEFINED CMAKE_OSX_SYSROOT_INT) + set(CMAKE_OSX_SYSROOT_INT "${CMAKE_OSX_SYSROOT_INT}" CACHE INTERNAL "") + # Specify the location or name of the platform SDK to be used in CMAKE_OSX_SYSROOT. + set(CMAKE_OSX_SYSROOT "${CMAKE_OSX_SYSROOT_INT}" CACHE INTERNAL "") +endif() + +# Use bitcode or not +if(NOT DEFINED ENABLE_BITCODE) + message(STATUS "[DEFAULTS] Disabling bitcode support by default. ENABLE_BITCODE not provided for override!") + set(ENABLE_BITCODE OFF) +endif() +set(ENABLE_BITCODE_INT ${ENABLE_BITCODE} CACHE BOOL + "Whether or not to enable bitcode" FORCE) +# Use ARC or not +if(NOT DEFINED ENABLE_ARC) + # Unless specified, enable ARC support by default + set(ENABLE_ARC ON) + message(STATUS "[DEFAULTS] Enabling ARC support by default. ENABLE_ARC not provided!") +endif() +set(ENABLE_ARC_INT ${ENABLE_ARC} CACHE BOOL "Whether or not to enable ARC" FORCE) +# Use hidden visibility or not +if(NOT DEFINED ENABLE_VISIBILITY) + # Unless specified, disable symbols visibility by default + set(ENABLE_VISIBILITY OFF) + message(STATUS "[DEFAULTS] Hiding symbols visibility by default. 
ENABLE_VISIBILITY not provided!") +endif() +set(ENABLE_VISIBILITY_INT ${ENABLE_VISIBILITY} CACHE BOOL "Whether or not to hide symbols from the dynamic linker (-fvisibility=hidden)" FORCE) +# Set strict compiler checks or not +if(NOT DEFINED ENABLE_STRICT_TRY_COMPILE) + # Unless specified, disable strict try_compile() + set(ENABLE_STRICT_TRY_COMPILE OFF) + message(STATUS "[DEFAULTS] Using NON-strict compiler checks by default. ENABLE_STRICT_TRY_COMPILE not provided!") +endif() +set(ENABLE_STRICT_TRY_COMPILE_INT ${ENABLE_STRICT_TRY_COMPILE} CACHE BOOL + "Whether or not to use strict compiler checks" FORCE) + +# Get the SDK version information. +if(DEFINED SDK_VERSION) + # Environment variables are always preserved. + set(ENV{_SDK_VERSION} "${SDK_VERSION}") +elseif(DEFINED ENV{_SDK_VERSION}) + set(SDK_VERSION "$ENV{_SDK_VERSION}") +elseif(NOT DEFINED SDK_VERSION) + execute_process(COMMAND ${XCODEBUILD_EXECUTABLE} -sdk ${CMAKE_OSX_SYSROOT_INT} -version SDKVersion + OUTPUT_VARIABLE SDK_VERSION + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) +endif() + +# Find the Developer root for the specific iOS platform being compiled for +# from CMAKE_OSX_SYSROOT. Should be ../../ from SDK specified in +# CMAKE_OSX_SYSROOT. There does not appear to be a direct way to obtain +# this information from xcrun or xcodebuild. +if (NOT DEFINED CMAKE_DEVELOPER_ROOT AND NOT CMAKE_GENERATOR MATCHES "Xcode") + get_filename_component(PLATFORM_SDK_DIR ${CMAKE_OSX_SYSROOT_INT} PATH) + get_filename_component(CMAKE_DEVELOPER_ROOT ${PLATFORM_SDK_DIR} PATH) + if (NOT EXISTS "${CMAKE_DEVELOPER_ROOT}") + message(FATAL_ERROR "Invalid CMAKE_DEVELOPER_ROOT: ${CMAKE_DEVELOPER_ROOT} does not exist.") + endif() +endif() + +# Find the C & C++ compilers for the specified SDK. +if(DEFINED CMAKE_C_COMPILER) + # Environment variables are always preserved. + set(ENV{_CMAKE_C_COMPILER} "${CMAKE_C_COMPILER}") +elseif(DEFINED ENV{_CMAKE_C_COMPILER}) + set(CMAKE_C_COMPILER "$ENV{_CMAKE_C_COMPILER}") + set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER}) +elseif(NOT DEFINED CMAKE_C_COMPILER) + execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT_INT} -find clang + OUTPUT_VARIABLE CMAKE_C_COMPILER + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) + set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER}) +endif() +if(DEFINED CMAKE_CXX_COMPILER) + # Environment variables are always preserved. + set(ENV{_CMAKE_CXX_COMPILER} "${CMAKE_CXX_COMPILER}") +elseif(DEFINED ENV{_CMAKE_CXX_COMPILER}) + set(CMAKE_CXX_COMPILER "$ENV{_CMAKE_CXX_COMPILER}") +elseif(NOT DEFINED CMAKE_CXX_COMPILER) + execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT_INT} -find clang++ + OUTPUT_VARIABLE CMAKE_CXX_COMPILER + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) +endif() +# Find (Apple's) libtool. +if(DEFINED BUILD_LIBTOOL) + # Environment variables are always preserved. + set(ENV{_BUILD_LIBTOOL} "${BUILD_LIBTOOL}") +elseif(DEFINED ENV{_BUILD_LIBTOOL}) + set(BUILD_LIBTOOL "$ENV{_BUILD_LIBTOOL}") +elseif(NOT DEFINED BUILD_LIBTOOL) + execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT_INT} -find libtool + OUTPUT_VARIABLE BUILD_LIBTOOL + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) +endif() +# Find the toolchain's provided install_name_tool if none is found on the host +if(DEFINED CMAKE_INSTALL_NAME_TOOL) + # Environment variables are always preserved. 
+ set(ENV{_CMAKE_INSTALL_NAME_TOOL} "${CMAKE_INSTALL_NAME_TOOL}") +elseif(DEFINED ENV{_CMAKE_INSTALL_NAME_TOOL}) + set(CMAKE_INSTALL_NAME_TOOL "$ENV{_CMAKE_INSTALL_NAME_TOOL}") +elseif(NOT DEFINED CMAKE_INSTALL_NAME_TOOL) + execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT_INT} -find install_name_tool + OUTPUT_VARIABLE CMAKE_INSTALL_NAME_TOOL_INT + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) + set(CMAKE_INSTALL_NAME_TOOL ${CMAKE_INSTALL_NAME_TOOL_INT} CACHE INTERNAL "") +endif() + +# Configure libtool to be used instead of ar + ranlib to build static libraries. +# This is required on Xcode 7+, but should also work on previous versions of +# Xcode. +get_property(languages GLOBAL PROPERTY ENABLED_LANGUAGES) +foreach(lang ${languages}) + set(CMAKE_${lang}_CREATE_STATIC_LIBRARY "${BUILD_LIBTOOL} -static -o " CACHE INTERNAL "") +endforeach() + +# CMake 3.14+ support building for iOS, watchOS, and tvOS out of the box. +if(MODERN_CMAKE) + if(SDK_NAME MATCHES "iphone") + set(CMAKE_SYSTEM_NAME iOS) + elseif(SDK_NAME MATCHES "xros") + set(CMAKE_SYSTEM_NAME visionOS) + elseif(SDK_NAME MATCHES "xrsimulator") + set(CMAKE_SYSTEM_NAME visionOS) + elseif(SDK_NAME MATCHES "macosx") + set(CMAKE_SYSTEM_NAME Darwin) + elseif(SDK_NAME MATCHES "appletv") + set(CMAKE_SYSTEM_NAME tvOS) + elseif(SDK_NAME MATCHES "watch") + set(CMAKE_SYSTEM_NAME watchOS) + endif() + # Provide flags for a combined FAT library build on newer CMake versions + if(PLATFORM_INT MATCHES ".*COMBINED") + set(CMAKE_IOS_INSTALL_COMBINED YES) + if(CMAKE_GENERATOR MATCHES "Xcode") + # Set the SDKROOT Xcode properties to a Xcode-friendly value (the SDK_NAME, E.g, iphoneos) + # This way, Xcode will automatically switch between the simulator and device SDK when building. + set(CMAKE_XCODE_ATTRIBUTE_SDKROOT "${SDK_NAME}") + # Force to not build just one ARCH, but all! + set(CMAKE_XCODE_ATTRIBUTE_ONLY_ACTIVE_ARCH "NO") + endif() + endif() +elseif(NOT DEFINED CMAKE_SYSTEM_NAME AND ${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.10") + # Legacy code path prior to CMake 3.14 or fallback if no CMAKE_SYSTEM_NAME specified + set(CMAKE_SYSTEM_NAME iOS) +elseif(NOT DEFINED CMAKE_SYSTEM_NAME) + # Legacy code path before CMake 3.14 or fallback if no CMAKE_SYSTEM_NAME specified + set(CMAKE_SYSTEM_NAME Darwin) +endif() +# Standard settings. +set(CMAKE_SYSTEM_VERSION ${SDK_VERSION} CACHE INTERNAL "") +set(UNIX ON CACHE BOOL "") +set(APPLE ON CACHE BOOL "") +if(PLATFORM STREQUAL "MAC" OR PLATFORM STREQUAL "MAC_ARM64" OR PLATFORM STREQUAL "MAC_UNIVERSAL") + set(IOS OFF CACHE BOOL "") + set(MACOS ON CACHE BOOL "") +elseif(PLATFORM STREQUAL "MAC_CATALYST" OR PLATFORM STREQUAL "MAC_CATALYST_ARM64") + set(IOS ON CACHE BOOL "") + set(MACOS ON CACHE BOOL "") +else() + set(IOS ON CACHE BOOL "") +endif() +# Set the architectures for which to build. +set(CMAKE_OSX_ARCHITECTURES ${ARCHS} CACHE INTERNAL "") +# Change the type of target generated for try_compile() so it'll work when cross-compiling, weak compiler checks +if(NOT ENABLE_STRICT_TRY_COMPILE_INT) + set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) +endif() +# All iOS/Darwin specific settings - some may be redundant. 
+if (NOT DEFINED CMAKE_MACOSX_BUNDLE) + set(CMAKE_MACOSX_BUNDLE YES) +endif() +set(CMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_REQUIRED "NO") +set(CMAKE_SHARED_LIBRARY_PREFIX "lib") +set(CMAKE_SHARED_LIBRARY_SUFFIX ".dylib") +set(CMAKE_SHARED_MODULE_PREFIX "lib") +set(CMAKE_SHARED_MODULE_SUFFIX ".so") +set(CMAKE_C_COMPILER_ABI ELF) +set(CMAKE_CXX_COMPILER_ABI ELF) +set(CMAKE_C_HAS_ISYSROOT 1) +set(CMAKE_CXX_HAS_ISYSROOT 1) +set(CMAKE_MODULE_EXISTS 1) +set(CMAKE_DL_LIBS "") +set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ") +set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ") +set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}") +set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}") + +if(ARCHS MATCHES "((^|;|, )(arm64|arm64e|x86_64))+") + set(CMAKE_C_SIZEOF_DATA_PTR 8) + set(CMAKE_CXX_SIZEOF_DATA_PTR 8) + if(ARCHS MATCHES "((^|;|, )(arm64|arm64e))+") + set(CMAKE_SYSTEM_PROCESSOR "aarch64") + else() + set(CMAKE_SYSTEM_PROCESSOR "x86_64") + endif() +else() + set(CMAKE_C_SIZEOF_DATA_PTR 4) + set(CMAKE_CXX_SIZEOF_DATA_PTR 4) + set(CMAKE_SYSTEM_PROCESSOR "arm") +endif() + +# Note that only Xcode 7+ supports the newer more specific: +# -m${SDK_NAME}-version-min flags, older versions of Xcode use: +# -m(ios/ios-simulator)-version-min instead. +if(${CMAKE_VERSION} VERSION_LESS "3.11") + if(PLATFORM_INT STREQUAL "OS" OR PLATFORM_INT STREQUAL "OS64") + if(XCODE_VERSION_INT VERSION_LESS 7.0) + set(SDK_NAME_VERSION_FLAGS + "-mios-version-min=${DEPLOYMENT_TARGET}") + else() + # Xcode 7.0+ uses flags we can build directly from SDK_NAME. + set(SDK_NAME_VERSION_FLAGS + "-m${SDK_NAME}-version-min=${DEPLOYMENT_TARGET}") + endif() + elseif(PLATFORM_INT STREQUAL "TVOS") + set(SDK_NAME_VERSION_FLAGS + "-mtvos-version-min=${DEPLOYMENT_TARGET}") + elseif(PLATFORM_INT STREQUAL "SIMULATOR_TVOS") + set(SDK_NAME_VERSION_FLAGS + "-mtvos-simulator-version-min=${DEPLOYMENT_TARGET}") +elseif(PLATFORM_INT STREQUAL "SIMULATORARM64_TVOS") + set(SDK_NAME_VERSION_FLAGS + "-mtvos-simulator-version-min=${DEPLOYMENT_TARGET}") + elseif(PLATFORM_INT STREQUAL "WATCHOS") + set(SDK_NAME_VERSION_FLAGS + "-mwatchos-version-min=${DEPLOYMENT_TARGET}") + elseif(PLATFORM_INT STREQUAL "SIMULATOR_WATCHOS") + set(SDK_NAME_VERSION_FLAGS + "-mwatchos-simulator-version-min=${DEPLOYMENT_TARGET}") + elseif(PLATFORM_INT STREQUAL "MAC") + set(SDK_NAME_VERSION_FLAGS + "-mmacosx-version-min=${DEPLOYMENT_TARGET}") + else() + # SIMULATOR or SIMULATOR64 both use -mios-simulator-version-min. 
+ set(SDK_NAME_VERSION_FLAGS + "-mios-simulator-version-min=${DEPLOYMENT_TARGET}") + endif() +elseif(NOT PLATFORM_INT MATCHES "^MAC_CATALYST") + # Newer versions of CMake sets the version min flags correctly, skip this for Mac Catalyst targets + set(CMAKE_OSX_DEPLOYMENT_TARGET ${DEPLOYMENT_TARGET}) +endif() + +if(DEFINED APPLE_TARGET_TRIPLE_INT) + set(APPLE_TARGET_TRIPLE ${APPLE_TARGET_TRIPLE_INT} CACHE INTERNAL "") + set(CMAKE_C_COMPILER_TARGET ${APPLE_TARGET_TRIPLE}) + set(CMAKE_CXX_COMPILER_TARGET ${APPLE_TARGET_TRIPLE}) + set(CMAKE_ASM_COMPILER_TARGET ${APPLE_TARGET_TRIPLE}) +endif() + +if(PLATFORM_INT MATCHES "^MAC_CATALYST") + set(C_TARGET_FLAGS "-isystem ${CMAKE_OSX_SYSROOT_INT}/System/iOSSupport/usr/include -iframework ${CMAKE_OSX_SYSROOT_INT}/System/iOSSupport/System/Library/Frameworks") +endif() + +if(ENABLE_BITCODE_INT) + set(BITCODE "-fembed-bitcode") + set(CMAKE_XCODE_ATTRIBUTE_BITCODE_GENERATION_MODE "bitcode") + set(CMAKE_XCODE_ATTRIBUTE_ENABLE_BITCODE "YES") +else() + set(BITCODE "") + set(CMAKE_XCODE_ATTRIBUTE_ENABLE_BITCODE "NO") +endif() + +if(ENABLE_ARC_INT) + set(FOBJC_ARC "-fobjc-arc") + set(CMAKE_XCODE_ATTRIBUTE_CLANG_ENABLE_OBJC_ARC "YES") +else() + set(FOBJC_ARC "-fno-objc-arc") + set(CMAKE_XCODE_ATTRIBUTE_CLANG_ENABLE_OBJC_ARC "NO") +endif() + +if(NAMED_LANGUAGE_SUPPORT_INT) + set(OBJC_VARS "-fobjc-abi-version=2 -DOBJC_OLD_DISPATCH_PROTOTYPES=0") + set(OBJC_LEGACY_VARS "") +else() + set(OBJC_VARS "") + set(OBJC_LEGACY_VARS "-fobjc-abi-version=2 -DOBJC_OLD_DISPATCH_PROTOTYPES=0") +endif() + +if(NOT ENABLE_VISIBILITY_INT) + foreach(lang ${languages}) + set(CMAKE_${lang}_VISIBILITY_PRESET "hidden" CACHE INTERNAL "") + endforeach() + set(CMAKE_XCODE_ATTRIBUTE_GCC_SYMBOLS_PRIVATE_EXTERN "YES") + set(VISIBILITY "-fvisibility=hidden -fvisibility-inlines-hidden") +else() + foreach(lang ${languages}) + set(CMAKE_${lang}_VISIBILITY_PRESET "default" CACHE INTERNAL "") + endforeach() + set(CMAKE_XCODE_ATTRIBUTE_GCC_SYMBOLS_PRIVATE_EXTERN "NO") + set(VISIBILITY "-fvisibility=default") +endif() + +if(DEFINED APPLE_TARGET_TRIPLE) + set(APPLE_TARGET_TRIPLE_FLAG "-target ${APPLE_TARGET_TRIPLE}") +endif() + +#Check if Xcode generator is used since that will handle these flags automagically +if(CMAKE_GENERATOR MATCHES "Xcode") + message(STATUS "Not setting any manual command-line buildflags, since Xcode is selected as the generator. 
Modifying the Xcode build-settings directly instead.") +else() + set(CMAKE_C_FLAGS "${C_TARGET_FLAGS} ${APPLE_TARGET_TRIPLE_FLAG} ${SDK_NAME_VERSION_FLAGS} ${OBJC_LEGACY_VARS} ${BITCODE} ${VISIBILITY} ${CMAKE_C_FLAGS}") + set(CMAKE_C_FLAGS_DEBUG "-O0 -g ${CMAKE_C_FLAGS_DEBUG}") + set(CMAKE_C_FLAGS_MINSIZEREL "-DNDEBUG -Os ${CMAKE_C_FLAGS_MINSIZEREL}") + set(CMAKE_C_FLAGS_RELWITHDEBINFO "-DNDEBUG -O2 -g ${CMAKE_C_FLAGS_RELWITHDEBINFO}") + set(CMAKE_C_FLAGS_RELEASE "-DNDEBUG -O3 ${CMAKE_C_FLAGS_RELEASE}") + set(CMAKE_CXX_FLAGS "${C_TARGET_FLAGS} ${APPLE_TARGET_TRIPLE_FLAG} ${SDK_NAME_VERSION_FLAGS} ${OBJC_LEGACY_VARS} ${BITCODE} ${VISIBILITY} ${CMAKE_CXX_FLAGS}") + set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g ${CMAKE_CXX_FLAGS_DEBUG}") + set(CMAKE_CXX_FLAGS_MINSIZEREL "-DNDEBUG -Os ${CMAKE_CXX_FLAGS_MINSIZEREL}") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-DNDEBUG -O2 -g ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}") + set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG -O3 ${CMAKE_CXX_FLAGS_RELEASE}") + if(NAMED_LANGUAGE_SUPPORT_INT) + set(CMAKE_OBJC_FLAGS "${C_TARGET_FLAGS} ${APPLE_TARGET_TRIPLE_FLAG} ${SDK_NAME_VERSION_FLAGS} ${BITCODE} ${VISIBILITY} ${FOBJC_ARC} ${OBJC_VARS} ${CMAKE_OBJC_FLAGS}") + set(CMAKE_OBJC_FLAGS_DEBUG "-O0 -g ${CMAKE_OBJC_FLAGS_DEBUG}") + set(CMAKE_OBJC_FLAGS_MINSIZEREL "-DNDEBUG -Os ${CMAKE_OBJC_FLAGS_MINSIZEREL}") + set(CMAKE_OBJC_FLAGS_RELWITHDEBINFO "-DNDEBUG -O2 -g ${CMAKE_OBJC_FLAGS_RELWITHDEBINFO}") + set(CMAKE_OBJC_FLAGS_RELEASE "-DNDEBUG -O3 ${CMAKE_OBJC_FLAGS_RELEASE}") + set(CMAKE_OBJCXX_FLAGS "${C_TARGET_FLAGS} ${APPLE_TARGET_TRIPLE_FLAG} ${SDK_NAME_VERSION_FLAGS} ${BITCODE} ${VISIBILITY} ${FOBJC_ARC} ${OBJC_VARS} ${CMAKE_OBJCXX_FLAGS}") + set(CMAKE_OBJCXX_FLAGS_DEBUG "-O0 -g ${CMAKE_OBJCXX_FLAGS_DEBUG}") + set(CMAKE_OBJCXX_FLAGS_MINSIZEREL "-DNDEBUG -Os ${CMAKE_OBJCXX_FLAGS_MINSIZEREL}") + set(CMAKE_OBJCXX_FLAGS_RELWITHDEBINFO "-DNDEBUG -O2 -g ${CMAKE_OBJCXX_FLAGS_RELWITHDEBINFO}") + set(CMAKE_OBJCXX_FLAGS_RELEASE "-DNDEBUG -O3 ${CMAKE_OBJCXX_FLAGS_RELEASE}") + endif() + set(CMAKE_C_LINK_FLAGS "${C_TARGET_FLAGS} ${SDK_NAME_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_C_LINK_FLAGS}") + set(CMAKE_CXX_LINK_FLAGS "${C_TARGET_FLAGS} ${SDK_NAME_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_CXX_LINK_FLAGS}") + if(NAMED_LANGUAGE_SUPPORT_INT) + set(CMAKE_OBJC_LINK_FLAGS "${C_TARGET_FLAGS} ${SDK_NAME_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_OBJC_LINK_FLAGS}") + set(CMAKE_OBJCXX_LINK_FLAGS "${C_TARGET_FLAGS} ${SDK_NAME_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_OBJCXX_LINK_FLAGS}") + endif() + set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -x assembler-with-cpp -arch ${CMAKE_OSX_ARCHITECTURES} ${APPLE_TARGET_TRIPLE_FLAG}") +endif() + +## Print status messages to inform of the current state +message(STATUS "Configuring ${SDK_NAME} build for platform: ${PLATFORM_INT}, architecture(s): ${ARCHS}") +message(STATUS "Using SDK: ${CMAKE_OSX_SYSROOT_INT}") +message(STATUS "Using C compiler: ${CMAKE_C_COMPILER}") +message(STATUS "Using CXX compiler: ${CMAKE_CXX_COMPILER}") +message(STATUS "Using libtool: ${BUILD_LIBTOOL}") +message(STATUS "Using install name tool: ${CMAKE_INSTALL_NAME_TOOL}") +if(DEFINED APPLE_TARGET_TRIPLE) + message(STATUS "Autoconf target triple: ${APPLE_TARGET_TRIPLE}") +endif() +message(STATUS "Using minimum deployment version: ${DEPLOYMENT_TARGET}" + " (SDK version: ${SDK_VERSION})") +if(MODERN_CMAKE) + message(STATUS "Merging integrated CMake 3.14+ iOS,tvOS,watchOS,macOS toolchain(s) with this toolchain!") + if(PLATFORM_INT MATCHES ".*COMBINED") + message(STATUS "Will combine built 
(static) artifacts into FAT lib...") + endif() +endif() +if(CMAKE_GENERATOR MATCHES "Xcode") + message(STATUS "Using Xcode version: ${XCODE_VERSION_INT}") +endif() +message(STATUS "CMake version: ${CMAKE_VERSION}") +if(DEFINED SDK_NAME_VERSION_FLAGS) + message(STATUS "Using version flags: ${SDK_NAME_VERSION_FLAGS}") +endif() +message(STATUS "Using a data_ptr size of: ${CMAKE_CXX_SIZEOF_DATA_PTR}") +if(ENABLE_BITCODE_INT) + message(STATUS "Bitcode: Enabled") +else() + message(STATUS "Bitcode: Disabled") +endif() + +if(ENABLE_ARC_INT) + message(STATUS "ARC: Enabled") +else() + message(STATUS "ARC: Disabled") +endif() + +if(ENABLE_VISIBILITY_INT) + message(STATUS "Hiding symbols: Disabled") +else() + message(STATUS "Hiding symbols: Enabled") +endif() + +# Set global properties +set_property(GLOBAL PROPERTY PLATFORM "${PLATFORM}") +set_property(GLOBAL PROPERTY APPLE_TARGET_TRIPLE "${APPLE_TARGET_TRIPLE_INT}") +set_property(GLOBAL PROPERTY SDK_VERSION "${SDK_VERSION}") +set_property(GLOBAL PROPERTY XCODE_VERSION "${XCODE_VERSION_INT}") +set_property(GLOBAL PROPERTY OSX_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}") + +# Export configurable variables for the try_compile() command. +set(CMAKE_TRY_COMPILE_PLATFORM_VARIABLES + PLATFORM + XCODE_VERSION_INT + SDK_VERSION + NAMED_LANGUAGE_SUPPORT + DEPLOYMENT_TARGET + CMAKE_DEVELOPER_ROOT + CMAKE_OSX_SYSROOT_INT + ENABLE_BITCODE + ENABLE_ARC + CMAKE_ASM_COMPILER + CMAKE_C_COMPILER + CMAKE_C_COMPILER_TARGET + CMAKE_CXX_COMPILER + CMAKE_CXX_COMPILER_TARGET + BUILD_LIBTOOL + CMAKE_INSTALL_NAME_TOOL + CMAKE_C_FLAGS + CMAKE_C_DEBUG + CMAKE_C_MINSIZEREL + CMAKE_C_RELWITHDEBINFO + CMAKE_C_RELEASE + CMAKE_CXX_FLAGS + CMAKE_CXX_FLAGS_DEBUG + CMAKE_CXX_FLAGS_MINSIZEREL + CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_CXX_FLAGS_RELEASE + CMAKE_C_LINK_FLAGS + CMAKE_CXX_LINK_FLAGS + CMAKE_ASM_FLAGS +) + +if(NAMED_LANGUAGE_SUPPORT_INT) + list(APPEND CMAKE_TRY_COMPILE_PLATFORM_VARIABLES + CMAKE_OBJC_FLAGS + CMAKE_OBJC_DEBUG + CMAKE_OBJC_MINSIZEREL + CMAKE_OBJC_RELWITHDEBINFO + CMAKE_OBJC_RELEASE + CMAKE_OBJCXX_FLAGS + CMAKE_OBJCXX_DEBUG + CMAKE_OBJCXX_MINSIZEREL + CMAKE_OBJCXX_RELWITHDEBINFO + CMAKE_OBJCXX_RELEASE + CMAKE_OBJC_LINK_FLAGS + CMAKE_OBJCXX_LINK_FLAGS + ) +endif() + +set(CMAKE_PLATFORM_HAS_INSTALLNAME 1) +set(CMAKE_SHARED_LINKER_FLAGS "-rpath @executable_path/Frameworks -rpath @loader_path/Frameworks") +set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -Wl,-headerpad_max_install_names") +set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -Wl,-headerpad_max_install_names") +set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,") +set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,") +set(CMAKE_FIND_LIBRARY_SUFFIXES ".tbd" ".dylib" ".so" ".a") +set(CMAKE_SHARED_LIBRARY_SONAME_C_FLAG "-install_name") + +# Set the find root to the SDK developer roots. +# Note: CMAKE_FIND_ROOT_PATH is only useful when cross-compiling. Thus, do not set on macOS builds. +if(NOT PLATFORM_INT MATCHES "^MAC.*$") + list(APPEND CMAKE_FIND_ROOT_PATH "${CMAKE_OSX_SYSROOT_INT}" CACHE INTERNAL "") + set(CMAKE_IGNORE_PATH "/System/Library/Frameworks;/usr/local/lib;/opt/homebrew" CACHE INTERNAL "") +endif() + +# Default to searching for frameworks first. +IF(NOT DEFINED CMAKE_FIND_FRAMEWORK) + set(CMAKE_FIND_FRAMEWORK FIRST) +ENDIF(NOT DEFINED CMAKE_FIND_FRAMEWORK) + +# Set up the default search directories for frameworks. 
+if(PLATFORM_INT MATCHES "^MAC_CATALYST") + set(CMAKE_FRAMEWORK_PATH + ${CMAKE_DEVELOPER_ROOT}/Library/PrivateFrameworks + ${CMAKE_OSX_SYSROOT_INT}/System/Library/Frameworks + ${CMAKE_OSX_SYSROOT_INT}/System/iOSSupport/System/Library/Frameworks + ${CMAKE_FRAMEWORK_PATH} CACHE INTERNAL "") +else() + set(CMAKE_FRAMEWORK_PATH + ${CMAKE_DEVELOPER_ROOT}/Library/PrivateFrameworks + ${CMAKE_OSX_SYSROOT_INT}/System/Library/Frameworks + ${CMAKE_FRAMEWORK_PATH} CACHE INTERNAL "") +endif() + +# By default, search both the specified iOS SDK and the remainder of the host filesystem. +if(NOT CMAKE_FIND_ROOT_PATH_MODE_PROGRAM) + set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM BOTH CACHE INTERNAL "") +endif() +if(NOT CMAKE_FIND_ROOT_PATH_MODE_LIBRARY) + set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH CACHE INTERNAL "") +endif() +if(NOT CMAKE_FIND_ROOT_PATH_MODE_INCLUDE) + set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH CACHE INTERNAL "") +endif() +if(NOT CMAKE_FIND_ROOT_PATH_MODE_PACKAGE) + set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE BOTH CACHE INTERNAL "") +endif() + +# +# Some helper-macros below to simplify and beautify the CMakeFile +# + +# This little macro lets you set any Xcode specific property. +macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE XCODE_RELVERSION) + set(XCODE_RELVERSION_I "${XCODE_RELVERSION}") + if(XCODE_RELVERSION_I STREQUAL "All") + set_property(TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} "${XCODE_VALUE}") + else() + set_property(TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY}[variant=${XCODE_RELVERSION_I}] "${XCODE_VALUE}") + endif() +endmacro(set_xcode_property) + +# This macro lets you find executable programs on the host system. +macro(find_host_package) + set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) + set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER) + set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER) + set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE NEVER) + set(_TOOLCHAIN_IOS ${IOS}) + set(IOS OFF) + find_package(${ARGN}) + set(IOS ${_TOOLCHAIN_IOS}) + set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM BOTH) + set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) + set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH) + set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE BOTH) +endmacro(find_host_package) \ No newline at end of file diff --git a/scripts/ci/install_mkl.sh b/scripts/ci/install_mkl.sh index 4037396e7..2333a75a7 100755 --- a/scripts/ci/install_mkl.sh +++ b/scripts/ci/install_mkl.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html -wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB" | sudo apt-key add - +wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB" | sudo apt-key add - sudo sh -c "echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list" sudo apt-get update -o Dir::Etc::sourcelist="/etc/apt/sources.list.d/intel-mkl.list" sudo apt-get install --no-install-recommends intel-mkl-64bit-2020.0-088 diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt index 6cf46533f..f57e746e1 100644 --- a/src/3rd_party/CMakeLists.txt +++ b/src/3rd_party/CMakeLists.txt @@ -9,10 +9,23 @@ add_subdirectory(./faiss) include_directories(./faiss) if(COMPILE_CPU) - if(NOT GENERATE_MARIAN_INSTALL_TARGETS) + # intgemm is not ARM-compatible. 
do not build it if we are on ARM + if(NOT GENERATE_MARIAN_INSTALL_TARGETS AND NOT ARM) set(INTGEMM_DONT_BUILD_TESTS ON CACHE BOOL "Disable intgemm tests") add_subdirectory(./intgemm) endif() + + # the default codepath does not use ruy so there is no need to add these directories + # to the build unless it is explicitly enabled. RUY is intended mostly for ARM support + if(USE_RUY_SGEMM) + set(CPUINFO_BUILD_MOCK_TESTS OFF CACHE BOOL " " FORCE) + set(CPUINFO_BUILD_BENCHMARKS OFF CACHE BOOL " " FORCE) + set(CPUINFO_BUILD_PKG_CONFIG OFF CACHE BOOL " " FORCE) + set(CPUINFO_BUILD_UNIT_TESTS OFF CACHE BOOL " " FORCE) + set(CPUINFO_BUILD_TOOLS OFF CACHE BOOL " " FORCE) + add_subdirectory(ruy/third_party/cpuinfo EXCLUDE_FROM_ALL) + add_subdirectory(ruy EXCLUDE_FROM_ALL) + endif(USE_RUY_SGEMM) endif(COMPILE_CPU) if(USE_FBGEMM) diff --git a/src/3rd_party/faiss/VectorTransform.cpp b/src/3rd_party/faiss/VectorTransform.cpp index 103b0910e..22fecbf78 100644 --- a/src/3rd_party/faiss/VectorTransform.cpp +++ b/src/3rd_party/faiss/VectorTransform.cpp @@ -19,6 +19,12 @@ using namespace faiss; +#ifdef ARM +// we use various AVX/SSE instructions in this file +// simd_utils translates these into ARM/NEON compatible instructions +#include "3rd_party/simd_utils/simd_utils.h" +#endif + extern "C" { diff --git a/src/3rd_party/faiss/VectorTransform.h b/src/3rd_party/faiss/VectorTransform.h index 5fc96bc46..e8689bc15 100644 --- a/src/3rd_party/faiss/VectorTransform.h +++ b/src/3rd_party/faiss/VectorTransform.h @@ -19,8 +19,10 @@ #include #ifdef __APPLE__ +#ifndef ARM #include #endif +#endif namespace faiss { diff --git a/src/3rd_party/ruy b/src/3rd_party/ruy new file mode 160000 index 000000000..c04e5e52a --- /dev/null +++ b/src/3rd_party/ruy @@ -0,0 +1 @@ +Subproject commit c04e5e52ae6b144f74ac032652e3c538bda15c9b diff --git a/src/3rd_party/sentencepiece b/src/3rd_party/sentencepiece index fb6f8e408..b09054fdd 160000 --- a/src/3rd_party/sentencepiece +++ b/src/3rd_party/sentencepiece @@ -1 +1 @@ -Subproject commit fb6f8e408d2078ebfedc8ccc33985fef03c50b0e +Subproject commit b09054fdd0ac18f1377b5f7c68807a86faada6c8 diff --git a/src/3rd_party/simd_utils b/src/3rd_party/simd_utils new file mode 160000 index 000000000..fe9fa82c9 --- /dev/null +++ b/src/3rd_party/simd_utils @@ -0,0 +1 @@ +Subproject commit fe9fa82c9d7e6297913bc6c98fe079acc6e157e9 diff --git a/src/common/binary.cpp b/src/common/binary.cpp index 0041275c5..fa98ef8bc 100644 --- a/src/common/binary.cpp +++ b/src/common/binary.cpp @@ -109,7 +109,7 @@ void loadItems(const std::string& fileName, std::vector& items) { io::Item getItem(const void* current, const std::string& varName) { std::vector items; - loadItems(current, items); + loadItems(current, items, /*mapped=*/true); for(auto& item : items) if(item.name == varName) diff --git a/src/common/types.h b/src/common/types.h index 7b50bb691..bd67fae71 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -17,7 +17,11 @@ #include #ifndef __CUDACC__ // NVCC is very unreliable when it comes to CPU intrinsics, we hide them completely from NVCC-compiled code -#include + #ifndef ARM + #include + #else + #include "3rd_party/simd_utils/simd_utils.h" + #endif #endif #ifdef __CUDACC__ // nvcc is compiling this code diff --git a/src/functional/operators.h b/src/functional/operators.h index 3628fdcb9..e7dcea3c6 100644 --- a/src/functional/operators.h +++ b/src/functional/operators.h @@ -217,8 +217,11 @@ struct Ops { // __CUDACC__ is defined when compiling with NVCC regardless of device type // __CUDA_ARCH__ is 
defined when compiling device (GPU) code #ifndef __CUDACC__ - +#ifndef ARM #include "3rd_party/sse_mathfun.h" +#else +#include "3rd_party/simd_utils/simd_utils.h" +#endif namespace marian { namespace functional { diff --git a/src/tensors/cpu/expression_graph_packable.h b/src/tensors/cpu/expression_graph_packable.h index f1a68210e..db526b626 100644 --- a/src/tensors/cpu/expression_graph_packable.h +++ b/src/tensors/cpu/expression_graph_packable.h @@ -152,7 +152,7 @@ class ExpressionGraphPackable : public ExpressionGraph { #endif } else if (isIntgemm(gemmElementType) && (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2 /* || pName.find("Wemb") != std::string::npos*/)) { -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) using cpu::integer::cols; using cpu::integer::rows; auto allocator = New(getBackend()); diff --git a/src/tensors/cpu/fbgemm/packed_gemm.cpp b/src/tensors/cpu/fbgemm/packed_gemm.cpp index dd81d0f7f..23ed559f1 100644 --- a/src/tensors/cpu/fbgemm/packed_gemm.cpp +++ b/src/tensors/cpu/fbgemm/packed_gemm.cpp @@ -2,16 +2,16 @@ #include "tensors/tensor_allocator.h" #include "tensors/tensor_operators.h" -#include -#include -#include -#include #include #include #include //#include #if USE_FBGEMM +#include +#include +#include +#include #ifdef _MSC_VER #pragma warning(disable: 4505) // 'fbgemmAlignedAlloc' in fbgemm.h: unreferenced local function has been removed (missing 'static inline') #pragma warning(disable: 4251) // 'fbgemm::CompressedSparseColumn::colptr_': class 'std::vector>' needs to have dll-interface to be used by clients of class 'fbgemm::CompressedSparseColumn' diff --git a/src/tensors/cpu/integer_common.h b/src/tensors/cpu/integer_common.h index f4e632b5c..8a00a7870 100644 --- a/src/tensors/cpu/integer_common.h +++ b/src/tensors/cpu/integer_common.h @@ -5,7 +5,7 @@ #include "tensors/cpu/aligned.h" #include "common/io_item.h" -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) #include "3rd_party/intgemm/intgemm/intgemm.h" #else namespace intgemm { @@ -31,10 +31,12 @@ namespace intgemm { } #endif +#ifndef ARM #include #include #include #include +#endif #include #include @@ -98,7 +100,7 @@ template <> struct intgemm_ { template static inline float& getQuantMult(marian::Tensor val) { -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) ABORT_IF(!isIntgemm(val->type()), "getQuantMult does not work for type {}", val->type()); typedef typename intgemm_::type Integer; return *(reinterpret_cast(val->data() + val->shape().elements())); @@ -109,7 +111,7 @@ static inline float& getQuantMult(marian::Tensor val) { } static inline Type getIntgemmType(Type vtype) { -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) if (vtype == Type::intgemm8) { if (intgemm::kCPU == intgemm::CPUType::AVX512VNNI) { return Type::intgemm8avx512vnni; @@ -142,7 +144,7 @@ static inline Type getIntgemmType(Type vtype) { } static inline bool passOrAbort(Type vtype) { -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) if (vtype == Type::intgemm8 || vtype == Type::intgemm16) { return true; } else if (vtype == Type::intgemm16sse2) { @@ -166,7 +168,7 @@ static inline bool passOrAbort(Type vtype) { template static inline float computeQuantMult(marian::Tensor val) { -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) if(sizeOf(vtype) == 1) return 127.0f / intgemm::MaxAbsolute(val->data(), val->data() + val->shape().elements()); else if(sizeOf(vtype) == 2) @@ -186,7 +188,7 @@ void AddBias(marian::Tensor C, const marian::Tensor Bias); // in our binary format. 
Then we copy the quantizationMultiplier information at the end template void prepareAndTransposeB(io::Item& item, const char * input) { -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) typedef typename intgemm_::type Integer; Integer * output_tensor = reinterpret_cast(&(*item.bytes.begin())); // Sometimes we will end up with misaligned intput (and output) so we can't use them directly. diff --git a/src/tensors/cpu/intgemm_interface.h b/src/tensors/cpu/intgemm_interface.h index 88408aa18..80784e0f6 100644 --- a/src/tensors/cpu/intgemm_interface.h +++ b/src/tensors/cpu/intgemm_interface.h @@ -9,7 +9,7 @@ namespace marian { namespace cpu { namespace integer { -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) /* * Prepare an activation matrix into intgemm8/16 format. For now the activation matrix is just quantized. * Expr input: The input tensor @@ -45,7 +45,7 @@ static inline Expr prepareA(Expr a) { */ template static inline Expr affineOrDotTyped(Expr a, Expr bQuant, Expr bias, bool transA, bool /*transB*/, float scale) { -#if COMPILE_CPU +#if COMPILE_CPU && !defined(ARM) ABORT_IF(!isFloat(a->value_type()), "Intgemm expects type of A to be float32 not {}", a->value_type()); ABORT_IF(!isIntgemm(bQuant->value_type()), "Intgemm expects type of B to be a variant of intgemm not {}", bQuant->value_type()); diff --git a/src/tensors/cpu/prod.cpp b/src/tensors/cpu/prod.cpp index 8fcca924b..639027d05 100755 --- a/src/tensors/cpu/prod.cpp +++ b/src/tensors/cpu/prod.cpp @@ -7,14 +7,6 @@ #include "tensors/tensor.h" #include "tensors/tensor_allocator.h" -#if MKL_FOUND -#include -#else -#if BLAS_FOUND -#include -#endif -#endif - #include "integer_common.h" #include "prod_blas.h" diff --git a/src/tensors/cpu/prod_blas.h b/src/tensors/cpu/prod_blas.h index a591fdd26..a281aa7bf 100644 --- a/src/tensors/cpu/prod_blas.h +++ b/src/tensors/cpu/prod_blas.h @@ -1,11 +1,117 @@ +#pragma once #if MKL_FOUND -#include -#else -#if BLAS_FOUND -#include -#endif + #include +#elif BLAS_FOUND + #include +#elif USE_RUY_SGEMM +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcomment" + #include "ruy/ruy.h" + #include "ruy/system_aligned_alloc.h" +#pragma GCC pop #endif +#if USE_RUY_SGEMM +// AlignedVector allocates aligned memory and cleans up after itself. RAII +// wrapper similar to intgemm's AlignedVector. +template +class AlignedVector { +public: + AlignedVector(size_t num_elem) + : size_(num_elem), + storage_(reinterpret_cast(ruy::detail::SystemAlignedAlloc(sizeof(T) * num_elem))) {} + + T *begin() { return storage_; } + T *data() { return storage_; } + size_t size() const { return size_; } + size_t memSize() const { return sizeof(T) * size_; } + + // Forbid copy + AlignedVector(const AlignedVector &) = delete; + AlignedVector &operator=(const AlignedVector &) = delete; + + ~AlignedVector() { ruy::detail::SystemAlignedFree(reinterpret_cast(storage_)); } + +private: + size_t size_; + T *storage_; +}; + + +inline void GemmRuy(const bool transA, + const bool transB, + const int M, + const int N, + const int K, + const float alpha, + const float *A, + const int lda, + const float *B, + const int ldb, + const float beta, + float *C, + const int ldc) { + ruy::Context context; + + // If we need to transpose, we can swap dimensions in layout claim the matrix + // is just column-major. Set ordering so transpose. + const auto orderA = (transA ? ruy::Order::kColMajor : ruy::Order::kRowMajor); + const auto orderB = (transB ? 
ruy::Order::kColMajor : ruy::Order::kRowMajor); + + ruy::Matrix lhs; + ruy::MakeSimpleLayout(M, K, orderA, lhs.mutable_layout()); + lhs.set_data(A); + + ruy::Matrix rhs; + ruy::MakeSimpleLayout(K, N, orderB, rhs.mutable_layout()); + rhs.set_data(B); + + ruy::Matrix dst; + ruy::MakeSimpleLayout(M, N, ruy::Order::kRowMajor, dst.mutable_layout()); + + if(beta == 0) { + // For beta = 0, we want to avoid the additional allocation. This is a + // large amount of our inference use-cases. sgemm is called with `beta` for + // accumulating gradients in backpropogation, which is 0.0 during + // inference. + + dst.set_data(C); + ruy::MulParams mul_params; + ruy::Mul(lhs, rhs, mul_params, &context, &dst); + + if(alpha != 1.0) { + // Write out C as C = alpha * [op(A) * op(B)] + beta * C + // Can we expect the compiler to autovectorize this? + // TODO: Come back and explicitly use SIMD. + const size_t size = M * N; + const float *opA_opB = C; // Alias. + for(size_t i = 0; i < size; i++) { + C[i] = alpha * opA_opB[i]; + } + } + + } else { + // No multiply-add in Ruy + // See also: https://github.com/google/ruy/issues/307 + + AlignedVector intermediate(M * N); + dst.set_data(intermediate.data()); + ruy::MulParams mul_params; + ruy::Mul(lhs, rhs, mul_params, &context, &dst); + + // Write out C as C = alpha * [op(A) * op(B)] + beta * C + // Can we expect the compiler to autovectorize this? + // TODO: Come back and explicitly use SIMD. + const size_t size = M * N; + const float *opA_opB = intermediate.data(); + for(size_t i = 0; i < size; i++) { + C[i] = alpha * opA_opB[i] + beta * C[i]; + } + } +} + +#endif // RUY_SGEMM + inline void sgemm(bool transA, bool transB, int rows_a, @@ -34,6 +140,20 @@ inline void sgemm(bool transA, beta, c, ldc); +#elif USE_RUY_SGEMM + GemmRuy(transA, + transB, + rows_a, + rows_b, + width, + alpha, + a, + lda, + b, + ldb, + beta, + c, + ldc); #else transA; transB; rows_a; rows_b; width; alpha; a; lda; b; ldb; beta; c; ldc; // make compiler happy ABORT("Marian must be compiled with a BLAS library"); diff --git a/src/translator/translator.h b/src/translator/translator.h index b15683867..28ff7b0c2 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -232,7 +232,6 @@ class TranslateService : public ModelServiceTask { std::vector> modelWeights_; size_t numDevices_; - std::vector> model_items_; // non-mmap public: virtual ~TranslateService() {} From 01bc6b04435fbe3c523243c12f3000403c30496c Mon Sep 17 00:00:00 2001 From: Thamme Gowda Date: Thu, 22 Feb 2024 19:31:30 +0000 Subject: [PATCH 19/26] Merged PR 33010: support force-decoding for pymarian Translator API support force-decoding for pymarian Translator API --- src/python/tests/regression/test_translate.py | 14 ++++++++++++++ src/translator/translator.h | 9 ++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/python/tests/regression/test_translate.py b/src/python/tests/regression/test_translate.py index b65ec49dc..49ddc14a2 100644 --- a/src/python/tests/regression/test_translate.py +++ b/src/python/tests/regression/test_translate.py @@ -33,3 +33,17 @@ def test_ende(): translator = Translator(**args) hyp = translator.translate("Hello. Good morning.") assert hyp == "Hallo , Guten Morgen ." + + +def test_ende_force_decode(): + + model_file = str(DATA_DIR / 'model.base.npz') + vocab_file = str(DATA_DIR / 'en-de.spm') + args = BASE_ARGS | dict(models=model_file, vocabs=[vocab_file, vocab_file], quiet=True) + translator = Translator(**args) + hyp = translator.translate("Hello. 
Good morning.") + assert hyp == "Hallo , Guten Morgen ." + + force_decode_config = dict(force_decode=True, tsv=True, tsv_fields=2) + hyp = translator.translate("Hello. Good morning.\tIsch", **force_decode_config) + assert hyp == "Isch am Guten Morgen ." diff --git a/src/translator/translator.h b/src/translator/translator.h index 28ff7b0c2..f3c1ac549 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -227,6 +227,7 @@ class TranslateService : public ModelServiceTask { std::vector> srcVocabs_; Ptr trgVocab_; + std::vector> allVocabs_; Ptr shortlistGenerator_; std::vector> modelWeights_; @@ -257,6 +258,8 @@ class TranslateService : public ModelServiceTask { trgVocab_ = New(options_, vocabPaths.size() - 1); trgVocab_->load(vocabPaths.back()); auto srcVocab = srcVocabs_.front(); + allVocabs_.insert(allVocabs_.end(), srcVocabs_.begin(), srcVocabs_.end()); + allVocabs_.emplace_back(trgVocab_); std::vector lshOpts = options_->get>("output-approx-knn", {}); ABORT_IF(lshOpts.size() != 0 && lshOpts.size() != 2, "--output-approx-knn takes 2 parameters"); @@ -333,7 +336,11 @@ class TranslateService : public ModelServiceTask { auto inputs = currentOptions->get("tsv", false) ? convertTsvToLists(input, currentOptions->get("tsv-fields", 1)) : std::vector({input}); - auto corpus_ = New(inputs, srcVocabs_, currentOptions); + // when force-decode is set, include trgVocab_ , otherwise use srcVocabs_ only + // for CLI, force-decode is implemented in data/corpus_base.cpp + auto forceDecoding = currentOptions->get("force-decode", false); + + auto corpus_ = New(inputs, forceDecoding ? allVocabs_ : srcVocabs_, currentOptions); data::BatchGenerator batchGenerator(corpus_, currentOptions, nullptr, /*runAsync=*/false); auto collector = New(currentOptions->get("quiet-translation", false)); From 4d184bbd5fad5356fdeac00aca339d5723de9e43 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 5 Mar 2024 18:05:01 +0000 Subject: [PATCH 20/26] Merged PR 33382: handle cusparse deprecation warnings with cuda 12.3 Cuda seems to have deprecated a whole bunch of its interface and it seems to interact weirdly with some gcc versions. Disabling warnings for this header via dummy include. 
--- src/tensors/gpu/backend.h | 3 ++- src/tensors/gpu/cusparse_include.h | 12 +++++++++ src/tensors/gpu/prod_sparse.cpp | 6 ++--- src/tensors/gpu/prod_sparse_cu10.h | 3 ++- src/tensors/gpu/prod_sparse_cu11.h | 41 +++++++++++++++--------------- 5 files changed, 40 insertions(+), 25 deletions(-) create mode 100644 src/tensors/gpu/cusparse_include.h diff --git a/src/tensors/gpu/backend.h b/src/tensors/gpu/backend.h index 022e4f3fe..1199055b6 100644 --- a/src/tensors/gpu/backend.h +++ b/src/tensors/gpu/backend.h @@ -3,12 +3,13 @@ #include "common/config.h" #include "tensors/backend.h" // note: this is one folder up #include "tensors/gpu/cuda_helpers.h" +#include "tensors/gpu/cusparse_include.h" #include "common/logging.h" #include #include #include -#include + namespace marian { namespace gpu { diff --git a/src/tensors/gpu/cusparse_include.h b/src/tensors/gpu/cusparse_include.h new file mode 100644 index 000000000..b3e68387d --- /dev/null +++ b/src/tensors/gpu/cusparse_include.h @@ -0,0 +1,12 @@ +// header file to include cusparse.h while ignoring deprecated warnings locally + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + +#include + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif \ No newline at end of file diff --git a/src/tensors/gpu/prod_sparse.cpp b/src/tensors/gpu/prod_sparse.cpp index dd66866c7..448a383b9 100644 --- a/src/tensors/gpu/prod_sparse.cpp +++ b/src/tensors/gpu/prod_sparse.cpp @@ -2,15 +2,15 @@ #pragma warning(disable: 4505) // warning C4505: '__float2half_rz': unreferenced local function has been removed (missing 'static inline') #endif -#include -#include - // clang-format off #include "tensors/gpu/prod.h" #include "tensors/gpu/backend.h" #include "tensors/gpu/cuda_helpers.h" +#include "tensors/gpu/cusparse_include.h" // clang-format on +#include + // what a nightmare #if CUDA_VERSION >= 11000 #include "tensors/gpu/prod_sparse_cu11.h" diff --git a/src/tensors/gpu/prod_sparse_cu10.h b/src/tensors/gpu/prod_sparse_cu10.h index d03097e34..0cfdcfed4 100644 --- a/src/tensors/gpu/prod_sparse_cu10.h +++ b/src/tensors/gpu/prod_sparse_cu10.h @@ -1,10 +1,11 @@ #include -#include // clang-format off #include "tensors/gpu/prod.h" #include "tensors/gpu/backend.h" #include "tensors/gpu/cuda_helpers.h" +#include "tensors/gpu/cusparse_include.h" + // clang-format on namespace marian { diff --git a/src/tensors/gpu/prod_sparse_cu11.h b/src/tensors/gpu/prod_sparse_cu11.h index d8659e862..ed3e6e4f5 100644 --- a/src/tensors/gpu/prod_sparse_cu11.h +++ b/src/tensors/gpu/prod_sparse_cu11.h @@ -3,12 +3,13 @@ #endif #include -#include // clang-format off #include "tensors/gpu/prod.h" #include "tensors/gpu/backend.h" #include "tensors/gpu/cuda_helpers.h" +#include "tensors/gpu/cusparse_include.h" + // clang-format on namespace marian { @@ -16,7 +17,7 @@ namespace gpu { // primary template for specialization with different element and compute types template -struct TypedSparseGemm { +struct TypedSparseGemm { static cudaDataType getCudaDataType(const float*) { return CUDA_R_32F; }; static cudaDataType getCudaDataType(const half*) { return CUDA_R_16F; }; @@ -36,7 +37,7 @@ static void CSRProdSwapped(marian::Tensor C, // interpret tensor dimensions as matrix dimensions const auto& shapeC = C->shape(); const auto& shapeD = D->shape(); - + auto colsC = shapeC[-1]; auto rowsC = shapeC.elements() / colsC; @@ -47,7 +48,7 @@ static void CSRProdSwapped(marian::Tensor C, auto colsS = rowsD; auto denseOrder = CUSPARSE_ORDER_COL; - auto 
algorithm = CUSPARSE_SPMM_ALG_DEFAULT; + auto algorithm = CUSPARSE_SPMM_ALG_DEFAULT; std::cerr << shapeC << std::endl; std::cerr << shapeD << std::endl; @@ -60,7 +61,7 @@ static void CSRProdSwapped(marian::Tensor C, auto numOffsets = S_offsets->shape().elements() - 1; // -1 since last value is length ABORT_IF(numOffsets != rowsS, "Unexpected number of rows in CSR argument"); ABORT_IF(S_values->shape() != S_indices->shape(), "CSR values and indices must have the same size"); - + ElementType alpha = 1.0; cusparseSpMatDescr_t descS; @@ -71,19 +72,19 @@ static void CSRProdSwapped(marian::Tensor C, S_offsets->data(), S_indices->data(), S_values ->data(), - CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, getCudaDataType(S_values->data()))); CUSPARSE_CHECK(cusparseCreateDnMat(&descD, - rowsD, colsD, /*ld=*/colsD, - D->data(), - getCudaDataType(D->data()), + rowsD, colsD, /*ld=*/colsD, + D->data(), + getCudaDataType(D->data()), denseOrder)); CUSPARSE_CHECK(cusparseCreateDnMat(&descC, rowsC, colsC, /*ld=*/colsC, - C->data(), - getCudaDataType(C->data()), + C->data(), + getCudaDataType(C->data()), denseOrder)); size_t bufferSize = 0; @@ -136,7 +137,7 @@ static void CSRProd(marian::Tensor C, // interpret tensor dimensions as matrix dimensions const auto& shapeC = C->shape(); const auto& shapeD = D->shape(); - + auto colsC = shapeC[-1]; auto rowsC = shapeC.elements() / colsC; @@ -147,7 +148,7 @@ static void CSRProd(marian::Tensor C, auto colsS = rowsD; auto denseOrder = CUSPARSE_ORDER_ROW; - auto algorithm = CUSPARSE_SPMM_CSR_ALG2; + auto algorithm = CUSPARSE_SPMM_CSR_ALG2; if(transS) std::swap(rowsS, colsS); @@ -157,7 +158,7 @@ static void CSRProd(marian::Tensor C, auto numOffsets = S_offsets->shape().elements() - 1; // -1 since last value is length ABORT_IF(numOffsets != rowsS, "Unexpected number of rows in CSR argument"); ABORT_IF(S_values->shape() != S_indices->shape(), "CSR values and indices must have the same size"); - + ElementType alpha = 1.0; cusparseSpMatDescr_t descS; @@ -168,19 +169,19 @@ static void CSRProd(marian::Tensor C, S_offsets->data(), S_indices->data(), S_values ->data(), - CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, getCudaDataType(S_values->data()))); CUSPARSE_CHECK(cusparseCreateDnMat(&descD, - rowsD, colsD, /*ld=*/colsD, - D->data(), - getCudaDataType(D->data()), + rowsD, colsD, /*ld=*/colsD, + D->data(), + getCudaDataType(D->data()), denseOrder)); CUSPARSE_CHECK(cusparseCreateDnMat(&descC, rowsC, colsC, /*ld=*/colsC, - C->data(), - getCudaDataType(C->data()), + C->data(), + getCudaDataType(C->data()), denseOrder)); size_t bufferSize = 0; From 00ff08680ea7961b0ebbb2a2f15a80bb9c72d1dc Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 2 Apr 2024 13:20:25 +0000 Subject: [PATCH 21/26] Merged PR 33692: Add --no-optimizer-reload option This PR adds a simple `--no-optimizer-reload` that allows to skip restoring optimizer state during continued training or divergence fallback. 
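
Besides the new flag, the same patch adds --normalize-gradient-by-ratio (see the CHANGELOG and graph_group changes below): the gradient is additionally scaled by the ratio of the current effective batch size to its running average, so a sudden growth in batch size temporarily damps the gradient until the average catches up. A rough, illustrative sketch of that scaling (the class name, smoothing constant, and moving-average update are assumptions for illustration; only the multiplicative structure mirrors computeNormalizationFactor in the diff below):

    #include <cstdio>

    struct GradNormalizer {
      bool normalizeGradient = false;        // --normalize-gradient
      bool normalizeByAverageRatio = false;  // --normalize-gradient-by-ratio
      double avgBatchSize = 0.0;             // running average of effective batch size
      double alpha = 0.1;                    // smoothing constant (assumed)

      // Returns the factor the raw gradient is divided by for this update.
      double factor(double effectiveBatchSize) {
        double f = 1.0;
        if(normalizeGradient)
          f *= effectiveBatchSize;           // plain normalization by effective batch size
        if(normalizeByAverageRatio) {
          if(avgBatchSize == 0.0)
            avgBatchSize = effectiveBatchSize;
          avgBatchSize += alpha * (effectiveBatchSize - avgBatchSize);  // assumed EMA update
          f *= effectiveBatchSize / avgBatchSize;  // >1 right after the batch grows
        }
        return f;
      }
    };

    int main() {
      GradNormalizer n;
      n.normalizeByAverageRatio = true;
      // A 4x jump in batch size raises the factor (i.e. shrinks the gradient),
      // then the factor decays back towards 1 as the running average catches up.
      for(double bs : {512.0, 512.0, 2048.0, 2048.0, 2048.0})
        std::printf("batch=%.0f  factor=%.3f\n", bs, n.factor(bs));
      return 0;
    }
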
--- CHANGELOG.md | 2 + VERSION | 2 +- src/common/config_parser.cpp | 10 +- src/graph/node_operators_binary.h | 60 ++++++------ src/tensors/gpu/tensor_operators.cu | 146 +++++++++++++++------------- src/training/graph_group.cpp | 48 +++++++-- src/training/graph_group.h | 15 ++- 7 files changed, 171 insertions(+), 112 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3049e622e..40ba6e0b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### Added +- Added `--normalize-gradient-by-ratio` to mildly adapt gradient magnitude if effective batch size diverges from running average effective batch size. +- Added `--no-optimizer-reload` to skip optimizer state loading during continued training or fallback. - Added `pymarian-eval`, CLI for scoring metrics - Added `--input-reorder pos1 pos2` option to re-ordering inputs internally when reading in batches. This is mostly a model property. - Added `pymarian`: python bindings based on pybind11 diff --git a/VERSION b/VERSION index 329143f69..8d44afc76 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.26 +v1.12.27 diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 9c8b0776f..e3172b47d 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -254,9 +254,9 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { "If non-empty, you need to provide one type per input file (if --train-sets) or per TSV field (if --tsv). " "Usually, there should be no need to provide these on the command line, the model should have them saved.", {}); - cli.add("--input-join-fields", + cli.add("--input-join-fields", "Join input fields (from files or TSV) into a single sequence " - "(mostly used single-encoder models like BLEURT and COMET-KIWI)", + "(mostly used single-encoder models like BLEURT and COMET-KIWI)", false); cli.add("--best-deep", "Use Edinburgh deep RNN configuration (s2s)"); @@ -436,6 +436,8 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { true); cli.add("--no-reload", "Do not load existing model specified in --model arg"); + cli.add("--no-optimizer-reload", + "Do not load existing optimizer state from checkpoint specified in --model arg"); cli.add>("--train-sets,-t", "Paths to training corpora: source target"); cli.add>("--vocabs,-v", @@ -650,7 +652,9 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { cli.add("--check-gradient-nan", "Skip parameter update in case of NaNs in gradient"); cli.add("--normalize-gradient", - "Normalize gradient by multiplying with no. 
devices / total labels (not recommended and to be removed in the future)"); + "Normalize gradient by dividing with efficient batch size"); + cli.add("--normalize-gradient-by-ratio", + "Normalize gradient by scaling with efficient batch size divided by running average batch size"); cli.add>("--train-embedder-rank", "Override model configuration and train a embedding similarity ranker with the model encoder, " diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index 8cf0af1a4..eb3381bb4 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -17,27 +17,27 @@ class LambdaNodeOp : public NaryNodeOp { private: typedef const std::vector& Inputs; typedef std::function LambdaNodeFunctor; - + std::unique_ptr forward_; std::unique_ptr backward_; - + size_t externalHash_; public: - LambdaNodeOp(Inputs inputs, Shape shape, Type type, - LambdaNodeFunctor forward, + LambdaNodeOp(Inputs inputs, Shape shape, Type type, + LambdaNodeFunctor forward, size_t externalHash = 0) - : NaryNodeOp(inputs, shape, type), + : NaryNodeOp(inputs, shape, type), forward_(new LambdaNodeFunctor(forward)), externalHash_(externalHash) { Node::trainable_ = !!backward_; } - LambdaNodeOp(Inputs inputs, Shape shape, Type type, + LambdaNodeOp(Inputs inputs, Shape shape, Type type, LambdaNodeFunctor forward, - LambdaNodeFunctor backward, - size_t externalHash = 0) - : NaryNodeOp(inputs, shape, type), + LambdaNodeFunctor backward, + size_t externalHash = 0) + : NaryNodeOp(inputs, shape, type), forward_(new LambdaNodeFunctor(forward)), backward_(new LambdaNodeFunctor(backward)), externalHash_(externalHash) { @@ -130,7 +130,7 @@ class DotNodeOp : public NaryNodeOp { // df/dB += alpha * dot(op(A).T, D) // beta set to 1.0 in gemm, C = alpha * dot(op(A), op(B)) + beta * C // to sum gradients from different graph parts - + auto isParameter = [](Expr p) { return std::dynamic_pointer_cast(p) != nullptr; }; @@ -276,7 +276,7 @@ class AffineNodeOp : public NaryNodeOp { NodeOps forwardOps() override { using namespace functional; - + return { NodeOp(Affine(val_, graph()->allocator(), @@ -431,8 +431,8 @@ class AffineWithReluNodeOp : public NaryNodeOp { float scalar_; public: - AffineWithReluNodeOp(Expr a, - Expr b, + AffineWithReluNodeOp(Expr a, + Expr b, Expr bias) : NaryNodeOp({a, b, bias}, newShape(a, b, false, false)), transA_(false), @@ -465,7 +465,7 @@ class AffineWithReluNodeOp : public NaryNodeOp { NodeOps forwardOps() override { ABORT_IF(!graph()->isInference(), "AffineWithReluNodeOp currently only supported for inference"); - + return { NodeOp(Affine(val_, graph()->allocator(), @@ -541,12 +541,12 @@ class DotBatchedNodeOp : public NaryNodeOp { ABORT_IF(shapeA[-1] != shapeB[-2], "Batched matrix product requires inner dimensions to match in {}{} * {}{}", std::string(shapeA), transA, std::string(shapeB), transB); - + // create shapes for batch dimensions only auto shapeBatchA = shapeA; shapeBatchA.set(-1, 1); shapeBatchA.set(-2, 1); - + auto shapeBatchB = shapeB; shapeBatchB.set(-1, 1); shapeBatchB.set(-2, 1); @@ -557,7 +557,7 @@ class DotBatchedNodeOp : public NaryNodeOp { // set non-batch dimensions in output shapeOut.set(-2, shapeA[-2]); shapeOut.set(-1, shapeB[-1]); - + return shapeOut; } @@ -579,7 +579,7 @@ class DotBatchedNodeOp : public NaryNodeOp { // df/dB += alpha * dot(op(A).T, D) // beta set to 1.0 in gemm, C = alpha * dot(op(A), op(B)) + beta * C // to sum gradients from different graph parts - + if(!transA_ && transB_) { return 
{NodeOp(ProdBatched(child(0)->grad(), graph()->allocator(), @@ -705,7 +705,7 @@ class DotBatchedLegacyNodeOp : public NaryNodeOp { shapeB.set(-2, b->shape()[-1]); shapeB.set(-1, b->shape()[-2]); } - + Shape outShape = shapeA; outShape.set(-1, shapeB[-1]); ABORT_IF(shapeA[-1] != shapeB[-2], @@ -1101,7 +1101,7 @@ struct ScatterNodeOp : public NaryNodeOp { auto backwardForVal = [this]() { auto allocator = graph()->allocator(); - // create temporary tensor of child(0)->grad().shape() == adj_.shape() + // create temporary tensor of child(0)->grad().shape() == adj_.shape() // copy adj_ to temporary auto grad = child(0)->grad(); auto tempGradMem = allocator->alloc(grad->memory()->size()); @@ -1116,7 +1116,7 @@ struct ScatterNodeOp : public NaryNodeOp { // insert tensor of zeros into temporary Insert(tempGrad, /*source=*/tempZero, /*indices*/child(1)->val(), axis_); - + // add temporary do child(0)->grad() Add(functional::_1, grad, tempGrad); @@ -1127,8 +1127,8 @@ struct ScatterNodeOp : public NaryNodeOp { return { // val - add gradients every where else to gradient of "a" - NodeOp(backwardForVal()), - + NodeOp(backwardForVal()), + NodeOp(/*no gradient*/[](){}), // indices // add gradients on indices to gradient of "source" @@ -1647,7 +1647,7 @@ struct RMSNormalizationOp : public NaryNodeOp { return {NodeOp( RMSNormalization(val_, child(0)->val(), - child(1)->val(), + (children_.size() >= 2) ? child(1)->val() : nullptr, (children_.size() == 3) ? child(2)->val() : nullptr, eps_))}; } @@ -1658,12 +1658,12 @@ struct RMSNormalizationOp : public NaryNodeOp { RMSNormalizationGrad( graph()->allocator(), child(0)->grad(), - child(1)->grad(), + (children_.size() >= 2) ? child(1)->grad() : nullptr, (children_.size() == 3) ? child(2)->grad() : nullptr, adj_, val_, child(0)->val(), - child(1)->val(), + (children_.size() >= 2) ? child(1)->val() : nullptr, (children_.size() == 3) ? child(2)->val() : nullptr, eps_))}; } @@ -1692,9 +1692,9 @@ struct RMSNormalizationOp : public NaryNodeOp { float eps_; }; -// @TODO: rewriting this fixes a bug for this one node. There should be exactly one -// NodeOp per gradient tensor many other nodes have that bug and need to be fixed. -// This will only manifest if the first op is not trainable, then gradients for the +// @TODO: rewriting this fixes a bug for this one node. There should be exactly one +// NodeOp per gradient tensor many other nodes have that bug and need to be fixed. +// This will only manifest if the first op is not trainable, then gradients for the // other nodes might get skipped despite being trainable. struct HighwayNodeOp : public NaryNodeOp { HighwayNodeOp(const std::vector& nodes) : NaryNodeOp(nodes, Shape::broadcast(nodes)) {} @@ -1703,7 +1703,7 @@ struct HighwayNodeOp : public NaryNodeOp { using namespace functional; auto alpha = sigmoid(_4); auto fwd = _1 = alpha * _2 + (1.f - alpha) * _3; - + return { NodeOp(Element(fwd, val_, child(0)->val(), child(1)->val(), child(2)->val())) }; diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index b7c80394b..c21f4d35f 100644 --- a/src/tensors/gpu/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -127,14 +127,14 @@ __global__ void gSanitizeGradient(T* in, int length, } } -// This function is meant to clean gradients, i.e. clip infinities and prune NaNs if required. -// If all NaNs and Infs have been removed we return `true` for indicating a sane gradient. -// If `clipInf` is set, infinities are replaced with the maximum/minimum non-inf value for the tensor. 
+// This function is meant to clean gradients, i.e. clip infinities and prune NaNs if required. +// If all NaNs and Infs have been removed we return `true` for indicating a sane gradient. +// If `clipInf` is set, infinities are replaced with the maximum/minimum non-inf value for the tensor. // In that case infinities do not result in a bad gradient, since they get clipped. -// If `pruneNaN` is set, NaNs are replaced with 0. Since NaNs get removed now they do not result +// If `pruneNaN` is set, NaNs are replaced with 0. Since NaNs get removed now they do not result // in a bad gradient. -// If NaNs or infinities are detected but not removed (either because of `pruneNaN=false` or `clipInf=false`), -// we return `false` indicating a bad gradient. +// If NaNs or infinities are detected but not removed (either because of `pruneNaN=false` or `clipInf=false`), +// we return `false` indicating a bad gradient. bool SanitizeGradient(marian::Tensor in, Ptr allocator, bool pruneNaN, bool clipInf) { cudaSetDevice(in->getDeviceId().no); @@ -180,7 +180,7 @@ __global__ void gCopyCastTo(To* out, const From* in, int length) { if(index < length) { if(add) out[index] += (To)in[index]; - else + else out[index] = (To)in[index]; } } @@ -702,7 +702,7 @@ __global__ void gSoftmax(T* out, // determine max (used below to improve numeric stability) T* _max = _share; - + // @TODO: what's going on here with fp16? _max[threadIdx.x] = -CUDA_FLT_MAX; // mask // find max over column indices that have the same relative column index (=threadIdx.x) across all blocks of columns @@ -857,7 +857,7 @@ __global__ void gLogSoftmax(T* out, // CUDA complains if type or size of shared memory changes, keep size constant. extern __shared__ uint8_t _sharedBytes[]; - T* _share = (T*)_sharedBytes; + T* _share = (T*)_sharedBytes; AccType* _shareAccType = (AccType*)_sharedBytes; T* _max = _share; // 16-bit is ok for max if applicable @@ -892,7 +892,7 @@ __global__ void gLogSoftmax(T* out, _sum[threadIdx.x] = 0.0; for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; - if(id < cols) { + if(id < cols) { // @TODO: would it be faster to recompute it below? Also better numeric stability with float? AccType sm = (AccType)sp[id] - (AccType)max; // subtract max for numeric stability so[id] = (T)sm; // assign numerator to output @@ -1327,7 +1327,7 @@ __global__ void gSelect(T* out, if(index < length) { outShape.dims(index, dims); int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor - dims[axis] = (int)d_indices[idxIndex]; + dims[axis] = (int)d_indices[idxIndex]; int inIndex = inShape.index(dims); if(add) out[index] += in[inIndex]; @@ -1353,12 +1353,12 @@ __global__ void gInsert(T* out, if(index < length) { inShape.dims(index, dims); int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor - dims[axis] = (int)d_indices[idxIndex]; + dims[axis] = (int)d_indices[idxIndex]; int outIndex = outShape.index(dims); if(add) out[outIndex] += in[index]; // this is probably wrong, atomicAdd? 
else - out[outIndex] = in[index]; + out[outIndex] = in[index]; } } } @@ -1385,7 +1385,7 @@ void Select(Tensor out, in->data(), in->shape(), axisGPU, - indices->data(), + indices->data(), indices->shape()); #if COMPILE_FP16 } else if (out->type() == Type::float16) { @@ -1403,7 +1403,7 @@ void Select(Tensor out, in->data(), in->shape(), axisGPU, - indices->data(), + indices->data(), indices->shape()); } else { ABORT("Select not implemented for type {}", out->type()); @@ -1640,7 +1640,7 @@ void GRUFastBackward(Ptr allocator, int blocks = std::min(MAX_BLOCKS, rows); int threads = std::min(MAX_THREADS, cols); - Tensor tempGradBias, tempOnes; + Tensor tempGradBias, tempOnes; MemoryPiece::PtrType tempGradBiasMemory, tempOnesMemory; if(outputs[3]) { Shape memShape = {rows, outputs[3]->shape()[-1]}; @@ -1692,7 +1692,7 @@ void GRUFastBackward(Ptr allocator, // We use this go get rid of the atomicAdd and perform a reduce of the gradients afterwards. // This is much faster for fp16 which seems to have a broken atomicAdd implementation. - // We reduce bias gradients with a matrix multiply, but use a 32-bit compute type. + // We reduce bias gradients with a matrix multiply, but use a 32-bit compute type. // This preserves precision with larger batches where all batch entries reduce into a single vector. // See also AffineNodeOp where we do the same for biases if(outputs[3]) { @@ -1917,7 +1917,7 @@ void CrossEntropyPickBackward(Tensor out, Tensor adj, Tensor a, Tensor indices, } } -// computes the L2Norm of tensor and returns value as flaot on the CPU, +// computes the L2Norm of tensor and returns value as flaot on the CPU, // this is mostly used for diagnostic purposes and gradient clipping float L2Norm(Tensor in, Ptr allocator) { // @TODO: reverse order of arguments cudaSetDevice(in->getDeviceId().no); @@ -1996,7 +1996,7 @@ void Att(Tensor out, Tensor va, Tensor context, Tensor state) { size_t batchDim = context->shape()[-2]; size_t contextWordsDim = context->shape()[-3]; - int blocks = std::min(MAX_BLOCKS, (int)totalRows); + int blocks = std::min(MAX_BLOCKS, (int)totalRows); int threads = std::min(MAX_THREADS, (int)modelDim); int shared = sizeof(float) * threads; @@ -2316,11 +2316,11 @@ __global__ void gLayerNormalizationGrad(T* gradX, AccType lv = (xv - mean) / sigma; AccType gradLv = N * adjv - lv * sum_adj_l[0] - sum_adj[0]; - gradLv /= N * sigma; + gradLv /= N * sigma; AccType gradXv = gammav * gradLv; - // Keep LN gradient between [-1000, 1000] for TensorOps, this currently used for making values fit into fp16. This wil also clip inf. + // Keep LN gradient between [-1000, 1000] for TensorOps, this currently used for making values fit into fp16. This wil also clip inf. // @TODO: to be fixed and removed. AccType sign = functional::Ops::sgn(gradXv); AccType cutoff = (AccType)1000.f; // @TODO: expose this somehow as an option? or better: make obsolete. @@ -2405,7 +2405,7 @@ void LayerNormalizationGrad(Ptr allocator, // We use this go get rid of the atomicAdd and perform a reduce of the gradients afterwards. // This is much faster for fp16 which seems to have a broken atomicAdd implementation. - // We reduce bias gradients with a matrix multiply, but use a 32-bit compute type. + // We reduce bias gradients with a matrix multiply, but use a 32-bit compute type. // This preserves precision with larger batches where all batch entries reduce into a single vector. 
// See also AffineNodeOp where we do the same for biases if(gradGamma) @@ -2462,7 +2462,7 @@ __global__ void gRMSNormalization(T* out, for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id < cols) { - AccType gammav = (AccType)gamma[id]; + AccType gammav = gamma ? (AccType)gamma[id] : (AccType)1.f; AccType xv = (AccType)xRow[id]; AccType betav = beta ? (AccType)beta[id] : (AccType)0.f; AccType rmsNorm = xv / rms; @@ -2492,7 +2492,7 @@ void RMSNormalization(Tensor out, if(out->type() == Type::float32) { gRMSNormalization<<>>(out->data(), in->data(), - gamma->data(), + gamma ? gamma->data() : nullptr, beta ? beta->data() : nullptr, rows, cols, @@ -2501,7 +2501,7 @@ void RMSNormalization(Tensor out, } else if (out->type() == Type::float16) { gRMSNormalization<<>>(out->data(), in->data(), - gamma->data(), + gamma ? gamma->data() : nullptr, beta ? beta->data() : nullptr, rows, cols, @@ -2547,7 +2547,7 @@ __global__ void gRMSNormalizationGrad(T* gradX, AccType xv = xRow[id]; AccType yv = yRow[id]; AccType betav = beta ? (AccType)beta[id] : (AccType)0.f; - AccType gammav = (AccType)gamma[id]; + AccType gammav = gamma ? (AccType)gamma[id] : (AccType)1.f; AccType adjv = adjRow[id]; AccType rv = (yv - betav) / gammav; // go back to RMSNorm(x) from scaled and shifted version for accumulation @@ -2580,16 +2580,16 @@ __global__ void gRMSNormalizationGrad(T* gradX, if(id < cols) { AccType xv = xRow[id]; - AccType gammav = (AccType)gamma[id]; + AccType gammav = gamma ? (AccType)gamma[id] : (AccType)1.f; AccType adjv = adjRow[id]; AccType rmsNorm = xv / rms; AccType gradNorm = N * adjv - rmsNorm * sum_adj_r[0]; - gradNorm /= N * rms; + gradNorm /= N * rms; AccType gradXv = gammav * gradNorm; - // Keep RMSN gradient between [-1000, 1000] for TensorOps, this currently used for making values fit into fp16. This wil also clip inf. + // Keep RMSN gradient between [-1000, 1000] for TensorOps, this currently used for making values fit into fp16. This wil also clip inf. // @TODO: to be fixed and removed. AccType sign = functional::Ops::sgn(gradXv); AccType cutoff = (AccType)1000.f; // @TODO: expose this somehow as an option? or better: make obsolete. 
@@ -2601,10 +2601,12 @@ __global__ void gRMSNormalizationGrad(T* gradX, T* gradXRow = gradX + j * cols; gradXRow[id] += (T)(gradXv); - T* gradGammaRow = gradGamma + j * cols; - // assignment is correct here as this gets summed up - // in the next kernel via matrix product - gradGammaRow[id] = (T)(adjv * rmsNorm); + if(gamma) { + T* gradGammaRow = gradGamma + j * cols; + // assignment is correct here as this gets summed up + // in the next kernel via matrix product + gradGammaRow[id] = (T)(adjv * rmsNorm); + } } } } @@ -2629,24 +2631,32 @@ void RMSNormalizationGrad(Ptr allocator, int threads = std::min(MAX_THREADS, cols); int blocks = std::min(MAX_BLOCKS, rows); - auto tempGradGammaMemory = allocator->alloc(adj->memory()->size()); - Tensor tempGradGamma = TensorBase::New(tempGradGammaMemory, adj->shape(), adj->type(), adj->getBackend()); - tempGradGamma->set(0.f); + MemoryPiece::PtrType tempGradGammaMemory; + Tensor tempGradGamma; + if(gamma) { + tempGradGammaMemory = allocator->alloc(adj->memory()->size()); + tempGradGamma = TensorBase::New(tempGradGammaMemory, adj->shape(), adj->type(), adj->getBackend()); + tempGradGamma->set(0.f); + } - auto tempOnesMemory = allocator->alloc(rows * sizeOf(adj->type())); - Tensor tempOnes = TensorBase::New(tempOnesMemory, Shape({1, rows}), adj->type(), adj->getBackend()); - tempOnes->set(1.f); + MemoryPiece::PtrType tempOnesMemory; + Tensor tempOnes; + if(gamma || beta) { + tempOnesMemory = allocator->alloc(rows * sizeOf(adj->type())); + tempOnes = TensorBase::New(tempOnesMemory, Shape({1, rows}), adj->type(), adj->getBackend()); + tempOnes->set(1.f); + } if(gradX->type() == Type::float32) { int shared = sizeof(float) * threads * 2; gRMSNormalizationGrad<<>>( gradX->data(), - tempGradGamma->data(), + gamma ? tempGradGamma->data() : nullptr, adj->data(), y->data(), x->data(), - gamma->data(), - (beta) ? beta->data() : nullptr, + gamma ? gamma->data() : nullptr, + beta ? beta->data() : nullptr, rows, cols, eps); @@ -2656,12 +2666,12 @@ void RMSNormalizationGrad(Ptr allocator, int shared = sizeof(float) * threads * 2; gRMSNormalizationGrad<<>>( gradX->data(), - tempGradGamma->data(), + gamma ? tempGradGamma->data() : nullptr, adj->data(), y->data(), x->data(), - gamma->data(), - (beta) ? beta->data() : nullptr, + gamma ? gamma->data() : nullptr, + beta ? beta->data() : nullptr, rows, cols, eps); @@ -2672,16 +2682,20 @@ void RMSNormalizationGrad(Ptr allocator, // We use this go get rid of the atomicAdd and perform a reduce of the gradients afterwards. // This is much faster for fp16 which seems to have a broken atomicAdd implementation. - // We reduce bias gradients with a matrix multiply, but use a 32-bit compute type. + // We reduce bias gradients with a matrix multiply, but use a 32-bit compute type. // This preserves precision with larger batches where all batch entries reduce into a single vector. 
// See also AffineNodeOp where we do the same for biases - gpu::Prod(gradGamma, tempOnes, tempGradGamma, false, false, 1, 1, Type::float32); // beta set to one to add + if(gamma) { + gpu::Prod(gradGamma, tempOnes, tempGradGamma, false, false, 1, 1, Type::float32); // beta set to one to add + allocator->free(tempGradGammaMemory); + } - if(gradBeta) // dC/dbeta = adj - inverse broadcasting (reduction) + if(beta) { // dC/dbeta = adj - inverse broadcasting (reduction) gpu::Prod(gradBeta, tempOnes, adj, false, false, 1, 1, Type::float32); // beta set to one to add + } - allocator->free(tempGradGammaMemory); - allocator->free(tempOnesMemory); + if(tempOnes) + allocator->free(tempOnesMemory); } @@ -3421,16 +3435,16 @@ __global__ void Float2Bit(const float *in, uint32_t *out, int batch, int dim, in int batchIdx = blockIdx.x; const float *inBatchOffset = in + batchIdx * dim; uint32_t *outBatchOffset = out + batchIdx * outDim; - + int outDimIdx = threadIdx.x; while (outDimIdx < outDim) { const float *inDimOffset = inBatchOffset + outDimIdx * 32; uint32_t &outDimOffset = outBatchOffset[outDimIdx]; uint32_t outVal = 0; uint32_t mask = 1; - + for (int bitIdx = 0; bitIdx < 32; ++bitIdx) { - if (inDimOffset[bitIdx] >= 0) + if (inDimOffset[bitIdx] >= 0) outVal |= mask; mask <<= 1; @@ -3458,12 +3472,12 @@ void Float2Bit(marian::Tensor output, const marian::Tensor input) ////////////////////////////////////////////////////////////////////////////////////////// // Calc hamming distance between input and weight hash. Return sorted indices and counts accoding to counting sort algo // https://www.geeksforgeeks.org/counting-sort/ -__global__ void HammmingAndSort(const uint32_t *weightHash, +__global__ void HammmingAndSort(const uint32_t *weightHash, const uint32_t *inputHash, uint16_t *hamming, - uint32_t *outCounts, - uint32_t *outIdx, - uint32_t kBest, uint16_t minVal, uint16_t maxVal, uint16_t range, + uint32_t *outCounts, + uint32_t *outIdx, + uint32_t kBest, uint16_t minVal, uint16_t maxVal, uint16_t range, int hashDim, int dim, int batch) { extern __shared__ uint32_t sharedCounts[]; @@ -3471,8 +3485,8 @@ __global__ void HammmingAndSort(const uint32_t *weightHash, int batchIdx = blockIdx.x; uint32_t *stopVal = sharedCounts + range; - uint16_t *hammingBatchOffset = hamming - ? hamming + batchIdx * dim + uint16_t *hammingBatchOffset = hamming + ? hamming + batchIdx * dim : (uint16_t*) (sharedCounts + range); uint32_t *outCountsBatchOffset = outCounts ? outCounts + batchIdx * kBest : nullptr; @@ -3550,7 +3564,7 @@ __global__ void HammmingAndSort(const uint32_t *weightHash, uint32_t countIdx = val - minVal; assert(countIdx < range); uint32_t &outIdx = sharedCounts[countIdx]; - + if (outIdx != NPP_MAX_32U) { uint32_t prevOutIdx; // Not supported in Maxwells or older @@ -3576,10 +3590,10 @@ __global__ void HammmingAndSort(const uint32_t *weightHash, // Calc hamming distance between input and weight hash. 
Return sorted indices and counts accoding to counting sort algo // https://www.geeksforgeeks.org/counting-sort/ void HammmingAndSort(marian::Tensor outIdx, marian::Tensor outCounts, - const marian::Tensor weightHash, + const marian::Tensor weightHash, const marian::Tensor inputHash, - uint32_t kBest, uint16_t minVal, uint16_t maxVal, - marian::Ptr &alloc, + uint32_t kBest, uint16_t minVal, uint16_t maxVal, + marian::Ptr &alloc, marian::Ptr &backend) { size_t SHARED_MEM_SIZE = 48000; @@ -3599,7 +3613,7 @@ void HammmingAndSort(marian::Tensor outIdx, marian::Tensor outCounts, size_t mem = range * sizeof(uint32_t) // counts + sizeof(uint32_t) // stopval + dim * sizeof(uint16_t); // hamming; - + marian::Tensor hamming; if (mem > SHARED_MEM_SIZE) { // shared memory too small. Write haming distance to global mem instead @@ -3613,12 +3627,12 @@ void HammmingAndSort(marian::Tensor outIdx, marian::Tensor outCounts, } HammmingAndSort<<>> - (weightHash->data(), + (weightHash->data(), inputHash->data(), hamming ? hamming->data() : nullptr, outCounts ? outCounts->data() : nullptr, outIdx ? outIdx->data() : nullptr, - kBest, minVal, maxVal, range, + kBest, minVal, maxVal, range, hashDim, dim, inputBatch); CUDA_CHECK(cudaGetLastError()); diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp index efada03ae..dbe6a7782 100644 --- a/src/training/graph_group.cpp +++ b/src/training/graph_group.cpp @@ -8,6 +8,10 @@ GraphGroup::GraphGroup(Ptr options, Ptr mpi) devices_(Config::getDevices(options, mpi->myMPIRank(), mpi->numMPIProcesses())), shardingMode_(getShardingMode(options_, mpi)), mbRoundUp_(options_->get("mini-batch-round-up", true)) { + + normalizeGradient_ = options_->get("normalize-gradient", false); + normalizeGradientByAverageRatio_ = options_->get("normalize-gradient-by-ratio", false); + if(options_->hasAndNotEmpty("cost-scaling")) { auto vcs = options_->get>("cost-scaling"); @@ -234,14 +238,23 @@ float GraphGroup::executeAndCollectNorm(const std::functionget("normalize-gradient")) - normalizationFactor *= updateTrgWords; + if(normalizeGradient_) + normalizationFactor *= effectiveBatchSize; + + if(normalizeGradientByAverageRatio_) { + // keep track of average effective batch size + updateAverageEffectiveBatchSize(effectiveBatchSize); + // this slightly adapts the gradient magnitude if the batch size changes drastically, + // in practice this will only matter if we grow the batch in larger steps. In that case + // the gradient magnitude is reduced until after a couple of updates that goes back to ~1. + normalizationFactor *= effectiveBatchSize / getAverageEffectiveBatchSize(); + } if(!isFinite(gNorm)) // we are checking the sanity of the gradient elsewhere return normalizationFactor; @@ -253,7 +266,7 @@ float GraphGroup::computeNormalizationFactor(float gNorm, size_t updateTrgWords) // Normalize gradient norm w.r.t. 
number of labels in batch for statistics, // there should be no gradient normalization before this point, @TODO: check this - gNorm = gNorm / updateTrgWords; + gNorm = gNorm / effectiveBatchSize; size_t window; float gNormAvgTransform, gNormVarTransform, gNormTransform, gNormAvg; if(dynamicGradientScalingUseLogs_) { @@ -368,8 +381,13 @@ void GraphGroup::load(const OptimizerBase::ScatterStateFunc& scatterFn) { models_[i++]->load(graph, modelWeights_, markReloaded); } - // try to restore everything from checkpoint now - loadOptimizerState(modelFileName, scatterFn); + bool noOptimizerReload = options_->get("no-optimizer-reload", false); + if(noOptimizerReload) { + LOG(info, "--no-optimizer-reload is specified, we are skipping optimizer state restoration"); + } else { + // try to restore everything from checkpoint now + loadOptimizerState(modelFileName, scatterFn); + } // @TODO: run another graph->forward() to allocate the weights from the checkpoint? // then we might not need to keep modelWeights_ around. @@ -673,15 +691,27 @@ Ptr GraphGroup::collectStats(Ptr graph, } void GraphGroup::setTypicalTrgBatchWords(size_t typicalTrgBatchWords) { // needed for dynamic MB scaling - typicalTrgBatchWords_ = (double)typicalTrgBatchWords; + typicalTrgBatchWords_ = (float)typicalTrgBatchWords; } -double GraphGroup::getTypicalTrgBatchWords() { +float GraphGroup::getTypicalTrgBatchWords() { return typicalTrgBatchWords_; } void GraphGroup::updateAverageTrgBatchWords(size_t trgBatchWords) { - typicalTrgBatchWords_ = 0.99 * typicalTrgBatchWords_ + 0.01 * (double)trgBatchWords; // record a running average of the batch size, factors are chosen empirically. + typicalTrgBatchWords_ = 0.99f * typicalTrgBatchWords_ + 0.01f * (float)trgBatchWords; // record a running average of the batch size, factors are chosen empirically. 
+} + +float GraphGroup::getAverageEffectiveBatchSize() { + return averageEffectiveBatchSize_; +} + +void GraphGroup::updateAverageEffectiveBatchSize(size_t effectiveBatchSize) { + if(averageEffectiveBatchSize_ == 0) + averageEffectiveBatchSize_ = (float)effectiveBatchSize; + + // record a running average of the effective batch size + averageEffectiveBatchSize_ = 0.9f * averageEffectiveBatchSize_ + 0.1f * (float)effectiveBatchSize; } size_t GraphGroup::numberOfInputFiles() { diff --git a/src/training/graph_group.h b/src/training/graph_group.h index 9f70ed81b..1a2794571 100644 --- a/src/training/graph_group.h +++ b/src/training/graph_group.h @@ -50,7 +50,8 @@ class GraphGroup { Ptr scheduler_; // scheduler that keeps track of how much has been processed bool finalized_{false}; // 'true' if training has completed (further updates are no longer allowed) - double typicalTrgBatchWords_{0}; // for dynamic batch sizing: typical batch size in words + float typicalTrgBatchWords_{0}; // for dynamic batch sizing: typical batch size in words + float averageEffectiveBatchSize_{0}; // record average effective batch size bool mbRoundUp_{true}; // round up batches for more efficient training but can make batch size less stable, disable with --mini-batch-round-up=false bool costScaling_{false}; @@ -64,6 +65,9 @@ class GraphGroup { bool checkGradientNan_{false}; + bool normalizeGradient_{false}; + bool normalizeGradientByAverageRatio_{true}; + bool dynamicGradientScaling_{false}; float dynamicGradientScalingFactor_{2.f}; bool dynamicGradientScalingUseLogs_{false}; @@ -133,7 +137,7 @@ class GraphGroup { float checkNanOrNorm(size_t i, size_t begin, size_t end); float executeAndCollectNorm(const std::function& task); - float computeNormalizationFactor(float gNorm, size_t updateTrgWords); + float computeNormalizationFactor(float gNorm, size_t effectiveBatchSize); /** * Determine maximal batch size that can fit into the given workspace @@ -152,9 +156,14 @@ class GraphGroup { virtual Ptr collectStats(const std::vector>& vocabs) = 0; + // used to estimate the number of words in a batch and figure out statistics for batch growing etc. void setTypicalTrgBatchWords(size_t typicalTrgBatchWords); - double getTypicalTrgBatchWords(); + float getTypicalTrgBatchWords(); void updateAverageTrgBatchWords(size_t trgBatchWords); + + // similar to above but counts the number of labels including delayed updates. This is used for gradient normalization. + float getAverageEffectiveBatchSize(); + void updateAverageEffectiveBatchSize(size_t effectiveBatchSize); }; } // namespace marian From 58a9150281a6cc2fef4324cc76124361f7d012ea Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 16 Apr 2024 14:59:45 +0000 Subject: [PATCH 22/26] Merged PR 33803: Fixes to force-decoding to enable LSH This PR includes various fixes to the force decoding code to make the LSH and beam search work. --- CHANGELOG.md | 1 + src/data/shortlist.cpp | 124 +++++++++++++++++++++++++-------- src/data/shortlist.h | 24 ++++--- src/layers_new/alibi.cpp | 4 +- src/microsoft/quicksand.cpp | 40 ++++++----- src/microsoft/quicksand.h | 4 +- src/tensors/cpu/topk.cpp | 53 ++++++++------ src/translator/beam_search.cpp | 37 +++++----- src/translator/sampling.h | 104 ++++++++++++++++++--------- 9 files changed, 259 insertions(+), 132 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 40ba6e0b6..caa3b8aa8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,6 +40,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
- New experimental layer framework for Transformer-like models. ### Fixed +- Fixed force-decoding with LSH - Fixed force-decoding for beam-size > 1 - Fixed lost node in mt-detect metrics - Fixed BLEURT logmask computation diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index 909734ea6..ac588a279 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -8,7 +8,7 @@ namespace marian { namespace data { -// cast current void pointer to T pointer and move forward by num elements +// cast current void pointer to T pointer and move forward by num elements template const T* get(const void*& current, size_t num = 1) { const T* ptr = (const T*)current; @@ -18,19 +18,22 @@ const T* get(const void*& current, size_t num = 1) { ////////////////////////////////////////////////////////////////////////////////////// Shortlist::Shortlist(const std::vector& indices) - : indices_(indices), + : indices_(indices), initialized_(false) {} Shortlist::~Shortlist() {} -WordIndex Shortlist::reverseMap(int /*beamIdx*/, int /*batchIdx*/, int idx) const { return indices_[idx]; } +WordIndex Shortlist::reverseMap(int /*beamIdx*/, int /*batchIdx*/, int idx) const { + return indices_[idx]; +} -WordIndex Shortlist::tryForwardMap(WordIndex wIdx) const { +WordIndex Shortlist::tryForwardMap(WordIndex wIdx, int /*batchIdx*/) const { auto first = std::lower_bound(indices_.begin(), indices_.end(), wIdx); - if(first != indices_.end() && *first == wIdx) // check if element not less than wIdx has been found and if equal to wIdx + if(first != indices_.end() && *first == wIdx) { // check if element not less than wIdx has been found and if equal to wIdx return (int)std::distance(indices_.begin(), first); // return coordinate if found - else + } else { return npos; // return npos if not found, @TODO: replace with std::optional once we switch to C++17? + } } void Shortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) { @@ -46,7 +49,7 @@ void Shortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Exp Shape kShape({k}); indicesExpr_ = lambda({input, weights}, kShape, Type::uint32, forward); - createCachedTensors(weights, isLegacyUntransposedW, b, lemmaEt, k); + createCachedTensors(weights, isLegacyUntransposedW, b, lemmaEt); initialized_ = true; } @@ -59,8 +62,7 @@ Expr Shortlist::getIndicesExpr() const { void Shortlist::createCachedTensors(Expr weights, bool isLegacyUntransposedW, Expr b, - Expr lemmaEt, - int k) { + Expr lemmaEt) { ABORT_IF(isLegacyUntransposedW, "Legacy untranspose W not yet tested"); cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? 
-1 : 0, indicesExpr_); cachedShortWt_ = reshape(cachedShortWt_, {1, 1, cachedShortWt_->shape()[0], cachedShortWt_->shape()[1]}); @@ -70,6 +72,7 @@ void Shortlist::createCachedTensors(Expr weights, } if (lemmaEt) { + int k = indicesExpr_->shape()[-1]; cachedShortLemmaEt_ = index_select(lemmaEt, -1, indicesExpr_); cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {1, 1, cachedShortLemmaEt_->shape()[0], k}); } @@ -78,60 +81,121 @@ void Shortlist::createCachedTensors(Expr weights, /////////////////////////////////////////////////////////////////////////////////// LSHShortlist::LSHShortlist(int k, int nbits, size_t lemmaSize, bool abortIfDynamic) -: Shortlist(std::vector()), +: Shortlist(std::vector()), k_(k), nbits_(nbits), lemmaSize_(lemmaSize), abortIfDynamic_(abortIfDynamic) { } WordIndex LSHShortlist::reverseMap(int beamIdx, int batchIdx, int idx) const { - //int currBeamSize = indicesExpr_->shape()[0]; int currBatchSize = indicesExpr_->shape()[1]; idx = (k_ * currBatchSize * beamIdx) + (k_ * batchIdx) + idx; assert(idx < indices_.size()); - return indices_[idx]; + return indices_[idx]; } Expr LSHShortlist::getIndicesExpr() const { return indicesExpr_; } +void LSHShortlist::setForcedIndices(Expr forcedIndices) { + if(forcedIndices) { + int dimBatch = forcedIndices->shape()[-2]; + forcedIndicesExpr_ = reshape(forcedIndices, {1, dimBatch, 1}); + } else { + forcedIndicesExpr_ = nullptr; + } +} + void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) { - indicesExpr_ = callback(lsh::search(input, weights, k_, nbits_, (int)lemmaSize_, abortIfDynamic_), - [this](Expr node) { + auto topk = lsh::search(input, weights, k_, nbits_, (int)lemmaSize_, abortIfDynamic_); // [beam, batch, k] + + bool addForced = forcedIndicesExpr_ != nullptr; + if(addForced) { + topk = callback(topk, + [this](Expr node) { + int dimBeam = node->shape()[-3]; + int dimBatch = node->shape()[-2]; + for(int batchIdx = 0; batchIdx < dimBatch; batchIdx++) { + for(int beamIdx = 0; beamIdx < dimBeam; beamIdx++) { + IndexType* begin = node->val()->data() + beamIdx * dimBatch * k_ + batchIdx * k_; + IndexType* end = begin + k_; + IndexType val = forcedIndicesExpr_->val()->data()[batchIdx]; + auto pos = std::lower_bound(begin, end, val); + if(pos != end) + *pos = val; + else + *(end-1) = val; + } + } + // we will correctly overwrite the indices used for reverse mapping in the next call back + + setForcedIndices(nullptr); // mark as done for this step + }); + } + + indicesExpr_ = callback(topk, + [this](Expr node) { node->val()->get(indices_); // set the value of the field indices_ whenever the graph traverses this node }); - createCachedTensors(weights, isLegacyUntransposedW, b, lemmaEt, k_); + createCachedTensors(weights, isLegacyUntransposedW, b, lemmaEt); +} + +WordIndex LSHShortlist::tryForwardMap(WordIndex wIdx, int batchIdx) const { + if(!indicesExpr_ || indices_.empty()) + return npos; + + int dimBatch = indicesExpr_->shape()[-2]; + int beamIdx = 0; + + IndexType* begin = indicesExpr_->val()->data() + beamIdx * dimBatch * k_ + batchIdx * k_; + IndexType* end = begin + k_; + + auto pos = std::lower_bound(begin, end, wIdx); + if(pos != end) + return (int)std::distance(begin, pos); + else + return npos; +} + +Expr LSHShortlist::tryForwardMap(Expr indices) const { + auto forward = [this](Expr out, const std::vector& inputs) { + ABORT_IF(out->val()->getDeviceId().type != DeviceType::cpu, "LSHShortlist::tryForwardMap(Expr) is only implemented for CPU"); + for(int batchIdx = 0; batchIdx 
< out->shape().elements(); batchIdx++) + out->val()->data()[batchIdx] = LSHShortlist::tryForwardMap(inputs[0]->val()->data()[batchIdx], batchIdx); + }; + + return lambda({indices}, indices->shape(), Type::uint32, forward); } void LSHShortlist::createCachedTensors(Expr weights, bool isLegacyUntransposedW, Expr b, - Expr lemmaEt, - int k) { + Expr lemmaEt) { int currBeamSize = indicesExpr_->shape()[0]; int batchSize = indicesExpr_->shape()[1]; ABORT_IF(isLegacyUntransposedW, "Legacy untranspose W not yet tested"); + int kPrime = indicesExpr_->shape()[-1]; Expr indicesExprFlatten = reshape(indicesExpr_, {indicesExpr_->shape().elements()}); cachedShortWt_ = index_select(weights, isLegacyUntransposedW ? -1 : 0, indicesExprFlatten); - cachedShortWt_ = reshape(cachedShortWt_, {currBeamSize, batchSize, k, cachedShortWt_->shape()[1]}); + cachedShortWt_ = reshape(cachedShortWt_, {currBeamSize, batchSize, kPrime, cachedShortWt_->shape()[1]}); if (b) { ABORT("Bias not supported with LSH"); cachedShortb_ = index_select(b, -1, indicesExprFlatten); - cachedShortb_ = reshape(cachedShortb_, {currBeamSize, batchSize, k, cachedShortb_->shape()[0]}); // not tested + cachedShortb_ = reshape(cachedShortb_, {currBeamSize, batchSize, kPrime, cachedShortb_->shape()[0]}); // not tested } if (lemmaEt) { int dim = lemmaEt->shape()[0]; cachedShortLemmaEt_ = index_select(lemmaEt, -1, indicesExprFlatten); - cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {dim, currBeamSize, batchSize, k}); + cachedShortLemmaEt_ = reshape(cachedShortLemmaEt_, {dim, currBeamSize, batchSize, kPrime}); cachedShortLemmaEt_ = transpose(cachedShortLemmaEt_, {1, 2, 0, 3}); } } -LSHShortlistGenerator::LSHShortlistGenerator(int k, int nbits, size_t lemmaSize, bool abortIfDynamic) +LSHShortlistGenerator::LSHShortlistGenerator(int k, int nbits, size_t lemmaSize, bool abortIfDynamic) : k_(k), nbits_(nbits), lemmaSize_(lemmaSize), abortIfDynamic_(abortIfDynamic) { } @@ -165,7 +229,7 @@ QuicksandShortlistGenerator::QuicksandShortlistGenerator(Ptr options, mmap_ = mio::mmap_source(fname); // memory-map the binary file once const void* current = mmap_.data(); // pointer iterator over binary file - + // compare magic number in binary file to make sure we are reading the right thing const int32_t MAGIC_NUMBER = 1234567890; int32_t header_magic_number = *get(current); @@ -173,7 +237,7 @@ QuicksandShortlistGenerator::QuicksandShortlistGenerator(Ptr options, auto config = marian::quicksand::ParameterTree::FromBinaryReader(current); use16bit_ = config->GetBoolReq("use_16_bit"); - + LOG(info, "[data] Mapping Quicksand shortlist from {}", fname); idSize_ = sizeof(int32_t); @@ -189,12 +253,12 @@ QuicksandShortlistGenerator::QuicksandShortlistGenerator(Ptr options, sourceOffsets_ = get(current, numSourceIds_); numShortlistIds_ = *get(current); sourceToShortlistIds_ = get(current, idSize_ * numShortlistIds_); - + // display parameters - LOG(info, + LOG(info, "[data] Quicksand shortlist has {} source ids, {} default ids and {} shortlist ids", - numSourceIds_, - numDefaultIds_, + numSourceIds_, + numDefaultIds_, numShortlistIds_); } @@ -225,12 +289,12 @@ Ptr QuicksandShortlistGenerator::generate(Ptr batc curShortlistIt->first = curShortlistIds; curShortlistIt->second = length; curShortlistIt++; - + if (length > maxLength) maxLength = length; } } - + // collect the actual shortlist mappings for (int32_t i = 0; i < maxLength && indexSet.size() < maxShortlistSize; i++) { for (int32_t j = 0; j < curShortlists.size() && indexSet.size() < maxShortlistSize; j++) { @@ 
-273,7 +337,7 @@ Ptr createShortlistGenerator(Ptr options, size_t lemmaSize = trgVocab->lemmaSize(); return New(lshOpts[0], lshOpts[1], lemmaSize, /*abortIfDynamic=*/false); } - else { + else { std::vector vals = options->get>("shortlist"); ABORT_IF(vals.empty(), "No path to shortlist given"); std::string fname = vals[0]; diff --git a/src/data/shortlist.h b/src/data/shortlist.h index bf185d570..484a3403e 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -26,26 +26,26 @@ class Shortlist { protected: std::vector indices_; // // [packed shortlist index] -> word index, used to select columns from output embeddings Expr indicesExpr_; // cache an expression that contains the short list indices + Expr forcedIndicesExpr_; Expr cachedShortWt_; // short-listed version, cached (cleared by clear()) Expr cachedShortb_; // these match the current value of shortlist_ Expr cachedShortLemmaEt_; bool initialized_; // used by batch-level shortlist. Only initialize with 1st call then skip all subsequent calls for same batch - + void createCachedTensors(Expr weights, bool isLegacyUntransposedW, Expr b, - Expr lemmaEt, - int k); + Expr lemmaEt); public: static constexpr WordIndex npos{std::numeric_limits::max()}; // used to identify invalid shortlist entries similar to std::string::npos Shortlist(const std::vector& indices); virtual ~Shortlist(); - + virtual bool isDynamic() const { return false; } virtual WordIndex reverseMap(int beamIdx, int batchIdx, int idx) const; - virtual WordIndex tryForwardMap(WordIndex wIdx) const; + virtual WordIndex tryForwardMap(WordIndex wIdx, int batchIdx=0) const; virtual void filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt); virtual Expr getIndicesExpr() const; @@ -72,7 +72,7 @@ class ShortlistGenerator { // https://arxiv.org/pdf/1903.03129.pdf https://arxiv.org/pdf/1806.00588.pdf class LSHShortlist: public Shortlist { private: - int k_; // number of candidates returned from each input + int k_; // number of candidates returned from each input int nbits_; // length of hash size_t lemmaSize_; // vocab size bool abortIfDynamic_; // if true disallow dynamic allocation for encoded weights and rotation matrix (only allow use of pre-allocated parameters) @@ -83,8 +83,8 @@ class LSHShortlist: public Shortlist { void createCachedTensors(Expr weights, bool isLegacyUntransposedW, Expr b, - Expr lemmaEt, - int k); + Expr lemmaEt); + virtual WordIndex tryForwardMap(WordIndex wIdx, int batchIdx=0) const override; public: LSHShortlist(int k, int nbits, size_t lemmaSize, bool abortIfDynamic = false); @@ -94,7 +94,9 @@ class LSHShortlist: public Shortlist { virtual void filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) override; virtual Expr getIndicesExpr() const override; + virtual void setForcedIndices(Expr forcedIndices); + virtual Expr tryForwardMap(Expr indices) const; }; class LSHShortlistGenerator : public ShortlistGenerator { @@ -349,7 +351,7 @@ class FakeShortlistGenerator : public ShortlistGenerator { }; /* -Legacy binary shortlist for Microsoft-internal use. +Legacy binary shortlist for Microsoft-internal use. 
*/ class QuicksandShortlistGenerator : public ShortlistGenerator { private: @@ -371,7 +373,7 @@ class QuicksandShortlistGenerator : public ShortlistGenerator { const int32_t* sourceOffsets_{nullptr}; int32_t numShortlistIds_{0}; const uint8_t* sourceToShortlistIds_{nullptr}; - + public: QuicksandShortlistGenerator(Ptr options, Ptr srcVocab, @@ -384,7 +386,7 @@ class QuicksandShortlistGenerator : public ShortlistGenerator { }; /* -Shortlist factory to create correct type of shortlist. Currently assumes everything is a text shortlist +Shortlist factory to create correct type of shortlist. Currently assumes everything is a text shortlist unless the extension is *.bin for which the Microsoft legacy binary shortlist is used. */ Ptr createShortlistGenerator(Ptr options, diff --git a/src/layers_new/alibi.cpp b/src/layers_new/alibi.cpp index 44f0eb60b..1eb3c861c 100644 --- a/src/layers_new/alibi.cpp +++ b/src/layers_new/alibi.cpp @@ -54,8 +54,10 @@ Ptr AlibiDecoderState::select( Expr AlibiDecoderState::getAlibiShift(Ptr graph, bool decoding) const { if(decoding) { std::vector shift; - for(const auto& [trgPos, srcPos, batchIdx] : syncPoints_) + for(const auto& [trgPos, srcPos, batchIdx] : syncPoints_) { + (void)batchIdx; // unused shift.push_back((float)(srcPos - trgPos)); + } if(!shift.empty()) { int dimBeam = lastBeam_; diff --git a/src/microsoft/quicksand.cpp b/src/microsoft/quicksand.cpp index 513639dd6..75f224d9a 100644 --- a/src/microsoft/quicksand.cpp +++ b/src/microsoft/quicksand.cpp @@ -48,6 +48,9 @@ class VocabWrapper : public IVocabWrapper { VocabWrapper(Ptr vocab) : pImpl_(vocab) {} virtual ~VocabWrapper() {} WordIndex encode(const std::string& word) const override { return (*pImpl_)[word].toWordIndex(); } + WordIndex getEosId() const override { return pImpl_->getEosId().toWordIndex(); }; + WordIndex getUnkId() const override { return pImpl_->getUnkId().toWordIndex(); }; + std::string decode(WordIndex id) const override { return (*pImpl_)[Word::fromWordIndex(id)]; } size_t size() const override { return pImpl_->size(); } void transcodeToShortlistInPlace(WordIndex* ptr, size_t num) const override { pImpl_->transcodeToShortlistInPlace(ptr, num); } @@ -145,10 +148,9 @@ class BeamSearchDecoder : public IBeamSearchDecoder { void setWorkspace(uint8_t* data, size_t size) override { device_->set(data, size); } - QSNBestBatch decode(const QSBatch& qsBatch, + QSNBestBatch decode(const std::vector& qsBatches, size_t maxLength, const std::unordered_set& shortlist) override { - std::vector lshOpts = options_->get>("output-approx-knn", {}); ABORT_IF(lshOpts.size() != 0 && lshOpts.size() != 2, "--output-approx-knn takes 2 parameters"); ABORT_IF(lshOpts.size() == 2 && shortlist.size() > 0, "LSH and shortlist cannot be used at the same time"); @@ -167,24 +169,30 @@ class BeamSearchDecoder : public IBeamSearchDecoder { scorer->setShortlistGenerator(shortListGen); } - // form source batch, by interleaving the words over sentences in the batch, and setting the mask - size_t batchSize = qsBatch.size(); - auto subBatch = New(batchSize, maxLength, vocabs_[0]); - for(size_t i = 0; i < maxLength; ++i) { - for(size_t j = 0; j < batchSize; ++j) { - const auto& sent = qsBatch[j]; - if(i < sent.size()) { - size_t idx = i * batchSize + j; - subBatch->data()[idx] = marian::Word::fromWordIndex(sent[i]); - subBatch->mask()[idx] = 1; + ABORT_IF(qsBatches.empty(), "No input batch provided"); + + auto createSubBatch = [maxLength](const QSBatch& qsBatch, Ptr vocab) { + size_t batchSize = qsBatch.size(); + auto subBatch = 
New(batchSize, qsBatch.front().size(), vocab); + for(size_t i = 0; i < maxLength; ++i) { + for(size_t j = 0; j < batchSize; ++j) { + const auto& sent = qsBatch[j]; + if(i < sent.size()) { + size_t idx = i * batchSize + j; + subBatch->data()[idx] = marian::Word::fromWordIndex(sent[i]); + subBatch->mask()[idx] = 1; + } } } - } - auto tgtSubBatch = New(batchSize, 0, vocabs_[1]); // only holds a vocab, but data is dummy - std::vector> subBatches{ subBatch, tgtSubBatch }; - std::vector sentIds(batchSize, 0); + return subBatch; + }; + + auto srcSubBatch = createSubBatch(qsBatches[0], vocabs_[0]); + auto tgtSubBatch = createSubBatch(qsBatches[1], vocabs_[1]); + std::vector> subBatches{ srcSubBatch, tgtSubBatch }; auto batch = New(subBatches); + std::vector sentIds(batch->size(), 0); batch->setSentenceIds(sentIds); // decode diff --git a/src/microsoft/quicksand.h b/src/microsoft/quicksand.h index 3ed866e83..cf452fb81 100644 --- a/src/microsoft/quicksand.h +++ b/src/microsoft/quicksand.h @@ -44,6 +44,8 @@ class IVocabWrapper { public: virtual WordIndex encode(const std::string& word) const = 0; virtual std::string decode(WordIndex id) const = 0; + virtual WordIndex getEosId() const = 0; + virtual WordIndex getUnkId() const = 0; virtual size_t size() const = 0; virtual void transcodeToShortlistInPlace(WordIndex* ptr, size_t num) const = 0; }; @@ -59,7 +61,7 @@ class IBeamSearchDecoder { virtual ~IBeamSearchDecoder() {} - virtual QSNBestBatch decode(const QSBatch& qsBatch, + virtual QSNBestBatch decode(const std::vector& qsBatches, size_t maxLength, const std::unordered_set& shortlist) = 0; diff --git a/src/tensors/cpu/topk.cpp b/src/tensors/cpu/topk.cpp index 73f0ce273..5cb1119bb 100644 --- a/src/tensors/cpu/topk.cpp +++ b/src/tensors/cpu/topk.cpp @@ -4,15 +4,15 @@ // CPU implementation of proper Marian top-k operator for TopkNodeOp // This file contains a lot of code-duplicaton with src/translator/nth_element.cpp -// the goal is to replace the beam-search specific topk search with this code. -// Currently this is only used in the unit tests, but we will move forward and +// the goal is to replace the beam-search specific topk search with this code. +// Currently this is only used in the unit tests, but we will move forward and // make the beam-search more graph and operator-based. namespace marian { -namespace cpu { +namespace cpu { void TopK(Tensor outVal, Tensor outInd, Ptr /*allocator*/, const Tensor in, int k, int axis, bool descending) { - + ABORT_IF(axis != in->shape().size() - 1, "Currently only works for last axis"); ABORT_IF(in->type() != Type::float32, "Input should have type {}", Type::float32); ABORT_IF(outInd->type() != Type::uint32, "Output should be have type {}", Type::uint32); @@ -29,61 +29,70 @@ void TopK(Tensor outVal, Tensor outInd, Ptr /*allocator*/, const Tens IndexType* outIndPtr = outInd->data(); float* outValPtr = outVal->data(); for(int i = 0; i < rows; ++i) { - std::partial_sort( + std::partial_sort( // sorts the top N (beam size) idxs by score to the front idxs.begin(), idxs.begin() + k, idxs.end(), - [&](int a, int b) { - return descending ? inDataPtr[a] > inDataPtr[b] : inDataPtr[a] < inDataPtr[b]; + [&](int a, int b) { + return descending ? 
inDataPtr[a] > inDataPtr[b] : inDataPtr[a] < inDataPtr[b]; } ); - + for(int j = 0; j < k; j++) { outIndPtr[j] = idxs[j]; outValPtr[j] = inDataPtr[idxs[j]]; } - + outIndPtr += k; outValPtr += k; inDataPtr += cols; } } -// CPU implementation of Marian sort operator for SortNodeOp -void Sort(Tensor outVal, Tensor outInd, Ptr /*allocator*/, const Tensor in, int axis, bool descending) { - ABORT_IF(axis != in->shape().size() - 1, "Currently only works for last axis"); - ABORT_IF(in->type() != Type::float32, "Input should have type {}", Type::float32); - ABORT_IF(outInd->type() != Type::uint32, "Output should be have type {}", Type::uint32); - +template +void SortTyped(Tensor outVal, Tensor outInd, Ptr /*allocator*/, const Tensor in, int axis, bool descending) { int cols = in->shape()[axis]; int rows = in->shape().elements() / cols; std::vector idxs(cols); std::iota(idxs.begin(), idxs.end(), 0); - const float* inDataPtr = in->data(); + const T* inDataPtr = in->data(); IndexType* outIndPtr = outInd->data(); - float* outValPtr = outVal->data(); + T* outValPtr = outVal->data(); for(int i = 0; i < rows; ++i) { - std::sort( + std::sort( idxs.begin(), idxs.end(), - [&](int a, int b) { - return descending ? inDataPtr[a] > inDataPtr[b] : inDataPtr[a] < inDataPtr[b]; + [&](int a, int b) { + return descending ? inDataPtr[a] > inDataPtr[b] : inDataPtr[a] < inDataPtr[b]; } ); - + for(int j = 0; j < cols; j++) { outIndPtr[j] = idxs[j]; outValPtr[j] = inDataPtr[idxs[j]]; } - + outIndPtr += cols; outValPtr += cols; inDataPtr += cols; } } +// CPU implementation of Marian sort operator for SortNodeOp +void Sort(Tensor outVal, Tensor outInd, Ptr /*allocator*/, const Tensor in, int axis, bool descending) { + ABORT_IF(axis != in->shape().size() - 1, "Currently only works for last axis"); + ABORT_IF(outInd->type() != Type::uint32, "Output indices should be have type {}", Type::uint32); + + if(in->type() == Type::float32) + SortTyped(outVal, outInd, nullptr, in, axis, descending); + else if(in->type() == Type::uint32) + SortTyped(outVal, outInd, nullptr, in, axis, descending); + else + ABORT("Unsupported type {}", in->type()); +} + } } diff --git a/src/translator/beam_search.cpp b/src/translator/beam_search.cpp index 63aa0ec8f..8b1312c2f 100644 --- a/src/translator/beam_search.cpp +++ b/src/translator/beam_search.cpp @@ -45,20 +45,20 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current // They can be between 0 and (vocabSize * nBestBeamSize * batchSize)-1. // (beamHypIdx refers to the GPU tensors, *not* the beams[] array; they are not the same in case of purging) const auto key = nBestKeys[i]; - + // decompose key into individual indices (batchIdx, beamHypIdx, wordIdx) const auto beamHypIdx = (key / vocabSize) % nBestBeamSize; const auto currentBatchIdx = (key / vocabSize) / nBestBeamSize; const auto origBatchIdx = reverseBatchIdxMap.empty() ? currentBatchIdx : reverseBatchIdxMap[currentBatchIdx]; // map currentBatchIdx back into original position within starting maximal batch size, required to find correct beam bool dropHyp = !dropBatchEntries.empty() && dropBatchEntries[origBatchIdx] && factorGroup == 0; - + WordIndex wordIdx; if(dropHyp) { // if we force=drop the hypothesis, assign EOS, otherwise the expected word id. 
if(factoredVocab) { // when using factoredVocab, extract the EOS lemma index from the word id, we predicting factors one by one here, hence lemma only std::vector eosFactors; factoredVocab->word2factors(factoredVocab->getEosId(), eosFactors); wordIdx = (WordIndex)eosFactors[0]; - } else { // without factoredVocab lemma index and word index are the same. Safe cruising. + } else { // without factoredVocab lemma index and word index are the same. Safe cruising. wordIdx = trgVocab_->getEosId().toWordIndex(); } } else { // we are not dropping anything, just assign the normal index @@ -66,9 +66,9 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current } // @TODO: We currently assign a log probability of 0 to all beam entries of the dropped batch entry, instead it might be a good idea to use - // the per Hyp pathScore without the current expansion (a bit hard to obtain). - // For the case where we drop empty inputs, 0 is fine. For other use cases like a forced stop, the penultimate pathScore might be better. - // For the empty hyp this would naturally result in 0, too. + // the per Hyp pathScore without the current expansion (a bit hard to obtain). + // For the case where we drop empty inputs, 0 is fine. For other use cases like a forced stop, the penultimate pathScore might be better. + // For the empty hyp this would naturally result in 0, too. const float pathScore = dropHyp ? 0.f : nBestPathScores[i]; // 0 (Prob = 1, maximum score) if dropped or expanded path score for (batchIdx, beamHypIdx, word) const auto& beam = beams[origBatchIdx]; @@ -78,7 +78,7 @@ Beams BeamSearch::toHyps(const std::vector& nBestKeys, // [current continue; if(pathScore == INVALID_PATH_SCORE) // (dummy slot or word that cannot be expanded by current factor) continue; - + ABORT_IF(beamHypIdx >= beam.size(), "Out of bounds beamHypIdx??"); // effectively this is equivalent to ABORT_IF(beams[origBatchIdx].empty(), ...) // map wordIdx to word @@ -298,23 +298,23 @@ Histories BeamSearch::search(Ptr graph, Ptr Expr suppressedWordIndices; bool suppressUnk = !options_->get("allow-unk", false); bool suppressSpecial = !options_->get("allow-special", false); + + auto shortlist = scorers_[0]->getShortlist(); // first shortlist is generally ok, @TODO: make sure they are the same across scorers? if (suppressUnk || suppressSpecial) { // do we need to suppress unk or special? std::vector suppressed = trgVocab_->suppressedIndices(suppressUnk, suppressSpecial); - - auto shortlist = scorers_[0]->getShortlist(); // first shortlist is generally ok, @TODO: make sure they are the same across scorers? 
if(shortlist) // check if suppressed words are allowed by the shortlist, if not, remove - suppressed.erase(std::remove_if(suppressed.begin(), - suppressed.end(), - [&](WordIndex i) { + suppressed.erase(std::remove_if(suppressed.begin(), + suppressed.end(), + [&](WordIndex i) { return shortlist->tryForwardMap(i) == data::Shortlist::npos; }), suppressed.end()); - + if(!suppressed.empty()) suppressedWordIndices = graph->indices(suppressed); } - auto distMod = New(options_, batch, INVALID_PATH_SCORE); + auto distMod = New(graph, options_, batch, INVALID_PATH_SCORE, shortlist); // the decoding process updates the following state information in each output time step: // - beams: array [origDimBatch] of array [maxBeamSize] of Hypothesis @@ -432,8 +432,7 @@ Histories BeamSearch::search(Ptr graph, Ptr if (numFactorGroups == 1) { // @TODO: this branch can go away logProbs = states[i]->getLogProbs().getLogits(); // [maxBeamSize, 1, currentDimBatch, dimVocab] } else { - auto shortlist = scorers_[i]->getShortlist(); - logProbs = states[i]->getLogProbs().getFactoredLogits(factorGroup, shortlist); // [maxBeamSize, 1, currentDimBatch, dimVocab] + logProbs = states[i]->getLogProbs().getFactoredLogits(factorGroup, scorers_[i]->getShortlist()); // [maxBeamSize, 1, currentDimBatch, dimVocab] } } else { @@ -456,10 +455,10 @@ Histories BeamSearch::search(Ptr graph, Ptr } } - // we cast (ensembled) scores to float32, as accumulated them into path scores; + // we cast (ensembled) scores to float32, as accumulated them into path scores; // also beneficial for sampling etc. // @TODO:: consider doing this before ensembling - stepScores = cast(stepScores, Type::float32); + stepScores = cast(stepScores, Type::float32); if(factorGroup == 0) { stepScores = distMod->force(stepScores, (int)t, (int)maxBeamSize, batchIndices); @@ -482,7 +481,7 @@ Histories BeamSearch::search(Ptr graph, Ptr // suppress specific symbols if not at right positions // @TODO: move this to DistributionModifier if(suppressedWordIndices && factorGroup == 0) - suppressWords(expandedPathScores, suppressedWordIndices); + suppressWords(expandedPathScores, suppressedWordIndices); // @TODO: this is probably not working correctly for LSH short list //********************************************************************** // perform beam search diff --git a/src/translator/sampling.h b/src/translator/sampling.h index 184202229..043f79bd6 100644 --- a/src/translator/sampling.h +++ b/src/translator/sampling.h @@ -10,7 +10,7 @@ namespace sampling { // Prune logits via top-k pruning Expr topkPruning(Expr scores, int k, bool normalize = false) { - Expr val, idx; + Expr val, idx; // note, for around k>200 topk is slower on the GPU than sorting and then selecting the top-k values std::tie(val, idx) = topk(scores, k, /*axis=*/-1, /*descending=*/true); @@ -24,14 +24,14 @@ Expr topkPruning(Expr scores, int k, bool normalize = false) { // Prune logits via nucleus pruning Expr nucleusPruning(Expr scores, float threshold, bool normalize = false) { - // normalization would make sense here since we compare against a meaningful threshold and + // normalization would make sense here since we compare against a meaningful threshold and // we don't know what other manipulations have been done to the logits before, but // leaving it to the user for now. 
We do set it to true in beam_search.cpp if(normalize) scores = logsoftmax(scores); // renormalize via logsoftmax // sort scores in descending order, this way we can use the cumulative sum to find the nucleus - Expr val, idx; + Expr val, idx; std::tie(val, idx) = sort(scores, /*axis=*/-1, /*descending=*/true); // logcumsumexp because we have logprobs, exclusive because we keep at least the first element @@ -51,11 +51,11 @@ Expr nucleusPruning(Expr scores, float threshold, bool normalize = false) { // Prune logits via epsilon pruning Expr epsilonPruning(Expr scores, float epsilon, bool normalize = false) { - // normalization would make sense here since we compare against a meaningful threshold and + // normalization would make sense here since we compare against a meaningful threshold and // we don't know what other manipulations have been done to the logits before if(normalize) scores = logsoftmax(scores); // renormalize via logsoftmax - + // make sure the epsilon is not larger than the largest value in the scores // otherwise we will mask out all values // equivalent to union of top-1 and log(epsilon) @@ -81,7 +81,10 @@ Expr gumbelMaxTrick(Expr scores, float temperature) { class DistModifier { private: + Ptr graph_; Ptr options_; + Ptr shortlist_; + bool forceDecode_{false}; bool sampling_{false}; @@ -91,12 +94,38 @@ class DistModifier { float invalidPathScore_; Expr forceBatch_; - + + void lazyCreateForceBatch() { + if(!forceBatch_) { + // turn the batch into a cached tensor that lives in the computation graph + std::vector forceWords; + for(auto& word : batch_->back()->data()) + forceWords.push_back(word.toWordIndex()); + int dimTime = (int)batch_->back()->batchWidth(); + int dimBatch = (int)batch_->back()->batchSize(); + forceBatch_ = graph_->constant({1, dimTime, dimBatch, 1}, inits::fromVector(forceWords), Type::uint32); // [1, dimTime, dimBatch, 1] + } + } + public: - DistModifier(Ptr options, Ptr batch, float invalidPathScore) : - options_(options), forceDecode_(options_->get("force-decode", false)), - batch_(batch), invalidPathScore_(invalidPathScore) { - + DistModifier(Ptr graph, Ptr options, Ptr batch, float invalidPathScore, Ptr shortlist = nullptr) : + graph_(graph), + options_(options), + shortlist_(shortlist), + forceDecode_(options_->get("force-decode", false)), + batch_(batch), + invalidPathScore_(invalidPathScore) { + + // if we are force-decoding with a short list we need to set the forced token ids early + if(shortlist_ && forceDecode_) { + lazyCreateForceBatch(); + auto lsh = std::dynamic_pointer_cast(shortlist_); + ABORT_IF(!lsh, "Force-decoding not supported with shortlists other than LSH"); + ABORT_IF(!forceBatch_, "forceBatch_ is undefined??"); + Expr forceIndices = slice(forceBatch_, /*axis=*/-3, 0); // [1, 1, dimBatch, 1] + lsh->setForcedIndices(forceIndices); + } + if(options_->hasAndNotEmpty("output-sampling")) { sampling_ = true; auto samplingOpts = options_->get>("output-sampling", {}); @@ -108,8 +137,8 @@ class DistModifier { } else if(samplingMethod == "1") { // for backcompat with boolean values sampling_ = true; samplingMethod = "full"; - } - + } + if(samplingMethod == "full") { float temperature = 1.f; if(samplingOpts.size() > 1) @@ -171,28 +200,23 @@ class DistModifier { Expr force(Expr scores, int pos, int beamSize, std::vector& batchIndices) { // we check the last field of the batch for force-decoding content + int dimTime = (int)batch_->back()->batchWidth(); - if(!forceDecode_ || pos >= dimTime) // nothing to force-decode, just return original scores 
+ if(!forceDecode_ || pos >= dimTime) { // nothing to force-decode, just return original scores return scores; + } LOG_ONCE(info, "Force-decoding with given prefixes"); - // if we get here, then we have to do force-decoding. We do this by "softly" modifying the scores and passing the + // if we get here, then we have to do force-decoding. We do this by "softly" modifying the scores and passing the // result to the normal top-k/beam search. "Softly" here means we add masking terms rather than making hard selections // which preserves the original tensor layout. - // This allows for beam-search and batched force-decoding with different length prefixes in a batch + // This allows for beam-search and batched force-decoding with different length prefixes in a batch // (way harder to do with actual index manipulation). We then return modified (masked) probabilities to the beam-search // which then continues as normal on the modified distribution. - if(!forceBatch_) { - // turn the batch into a cached tensor that lives in the computation graph - std::vector forceWords; - for(auto& word : batch_->back()->data()) - forceWords.push_back(word.toWordIndex()); - - int dimBatch = (int)batch_->back()->batchSize(); - forceBatch_ = scores->graph()->constant({1, dimTime, dimBatch, 1}, inits::fromVector(forceWords), Type::uint32); // [1, dimTime, dimBatch, 1] - } + lazyCreateForceBatch(); + ABORT_IF(!forceBatch_, "forceBatch_ is undefined??"); // if we remove batch entries during decoding (finished decoding) then adjust here if(forceBatch_->shape()[-2] != batchIndices.size()) forceBatch_ = index_select(forceBatch_, -2, batchIndices); @@ -200,6 +224,14 @@ class DistModifier { // get vocab index and probability for force-decoded tokens for the current time step Expr forceIndices = slice(forceBatch_, /*axis=*/-3, pos); // [1, 1, dimBatch, 1] + if(shortlist_) { + auto lsh = std::dynamic_pointer_cast(shortlist_); + ABORT_IF(!lsh, "Force-decoding not supported with shortlists other than LSH"); + // only get location for first beam slot, the other slots don't matter since we overwrite them later. + lsh->setForcedIndices(forceIndices); + forceIndices = lsh->tryForwardMap(forceIndices); // [1, 1, dimBatch, 1] + } + // select scores from first beam entry for force-decoding Expr b1stScores = slice(scores, /*axis=*/-4, 0); // [1, 1, dimBatch, dimVocab] Expr forceVals = gather(b1stScores, /*axis=*/-1, forceIndices); // [1, 1, dimBatch, 1] @@ -207,18 +239,26 @@ class DistModifier { // create dummy indices and values for beam entries other than the force-decoded value. This is required to ensure that the beam // does not collapse for hyps outside the forced hyps and can still do full beam-search once we finish force-decoding for a batch // entry. We initialize randomly (they are not going to be used anyway due to very low prob) and shift by 1 to have 0 at first postion. - int dimVocab = scores->shape()[-1]; + int dimVocab = scores->shape()[-1]; auto graph = scores->graph(); - // we start at 256 to skip over suppressed special words in SentencePiece @TODO: this should be somehow inferred. - Expr dummyIndices = shift(graph->constant({1, 1, 1, beamSize}, inits::uniform(256.f, (float)dimVocab)), {0, 0, 0, 1}, 0.f); - // we use a range of invalidPathScore_ to invalidPathScore_ / 2 to make sure that the probabilities stay low, but larger than invalidPathScore_ itself. 
- Expr dummyVals = shift(graph->constant({1, 1, 1, beamSize}, inits::uniform(invalidPathScore_, invalidPathScore_ / 2.f)), {0, 0, 0, 1}, 0.f); + + std::vector dummyIndicesVec(beamSize, 0); + std::vector dummyValsVec(beamSize, 0.f); + for(int i = 1; i < beamSize; ++i) { + // we use dimVocab - i - 1 to make sure that the dummy indices are different from the force-decoded index (last vocab indices) + dummyIndicesVec[i] = dimVocab - i - 1; + // we use invalidPathScore_ / (2.f + i) to make sure that the dummy values are very low and decrease with beam position + dummyValsVec[i] = invalidPathScore_ / (2.f + i); + } + + Expr dummyIndices = graph->constant({1, 1, 1, beamSize}, inits::fromVector(dummyIndicesVec), Type::uint32); + Expr dummyVals = graph->constant({1, 1, 1, beamSize}, inits::fromVector(dummyValsVec)); // here we add the force-decoded entries back into the zeroed positions - dummyIndices = cast(cast(dummyIndices, Type::float32) + cast(forceIndices, Type::float32), Type::uint32); // [1, 1, dimBatch, dimBeam] - dummyVals = dummyVals + forceVals; // [1, 1, dimBatch, dimBeam] + dummyIndices = cast(maximum(cast(dummyIndices, Type::float32), cast(forceIndices, Type::float32)), Type::uint32); // [1, 1, dimBatch, dimBeam] + dummyVals = dummyVals + forceVals; // [1, 1, dimBatch, dimBeam] - // create a tensor of the same size as the original logits from the first beam entry, initialize with invalidPathScore and then scatter + // create a tensor of the same size as the original logits from the first beam entry, initialize with invalidPathScore and then scatter // the force-decoded and dummy values into the correct positions. Expr forcedScores = constant_like(b1stScores, inits::fromValue(invalidPathScore_)); // [1, 1, dimBatch, dimVocab] forcedScores = scatter(forcedScores, -1, dummyIndices, dummyVals); // [1, 1, dimBatch, dimVocab] From b4ed6304e86bf4f10f9d121ae2df008506ff73b2 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Wed, 17 Apr 2024 13:23:37 +0000 Subject: [PATCH 23/26] Merged PR 34062: Add exception if force-decoding is used for FSM vocab Abort or throw an exception if we try force-decoding with a factored Vocab. --- CHANGELOG.md | 1 + VERSION | 2 +- src/data/shortlist.cpp | 44 ++++++++++++++++--------------------- src/microsoft/quicksand.cpp | 10 ++++++--- src/translator/sampling.h | 11 ++++++++-- 5 files changed, 37 insertions(+), 31 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index caa3b8aa8..0b52200ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,6 +40,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - New experimental layer framework for Transformer-like models. 
### Fixed +- Throw exception when forcing with FS vocabs - Fixed force-decoding with LSH - Fixed force-decoding for beam-size > 1 - Fixed lost node in mt-detect metrics diff --git a/VERSION b/VERSION index 8d44afc76..7542664ec 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.27 +v1.12.28 diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp index ac588a279..83ae435f4 100644 --- a/src/data/shortlist.cpp +++ b/src/data/shortlist.cpp @@ -107,33 +107,27 @@ void LSHShortlist::setForcedIndices(Expr forcedIndices) { void LSHShortlist::filter(Expr input, Expr weights, bool isLegacyUntransposedW, Expr b, Expr lemmaEt) { auto topk = lsh::search(input, weights, k_, nbits_, (int)lemmaSize_, abortIfDynamic_); // [beam, batch, k] - - bool addForced = forcedIndicesExpr_ != nullptr; - if(addForced) { - topk = callback(topk, - [this](Expr node) { - int dimBeam = node->shape()[-3]; - int dimBatch = node->shape()[-2]; - for(int batchIdx = 0; batchIdx < dimBatch; batchIdx++) { - for(int beamIdx = 0; beamIdx < dimBeam; beamIdx++) { - IndexType* begin = node->val()->data() + beamIdx * dimBatch * k_ + batchIdx * k_; - IndexType* end = begin + k_; - IndexType val = forcedIndicesExpr_->val()->data()[batchIdx]; - auto pos = std::lower_bound(begin, end, val); - if(pos != end) - *pos = val; - else - *(end-1) = val; - } - } - // we will correctly overwrite the indices used for reverse mapping in the next call back - - setForcedIndices(nullptr); // mark as done for this step - }); - } - indicesExpr_ = callback(topk, [this](Expr node) { + if(forcedIndicesExpr_) { + // if a forced index is set, we need to overwrite the relevant topk index with the forced index + int dimBeam = node->shape()[-3]; + int dimBatch = node->shape()[-2]; + for(int batchIdx = 0; batchIdx < dimBatch; batchIdx++) { + for(int beamIdx = 0; beamIdx < dimBeam; beamIdx++) { + IndexType* begin = node->val()->data() + beamIdx * dimBatch * k_ + batchIdx * k_; + IndexType* end = begin + k_; + IndexType val = forcedIndicesExpr_->val()->data()[batchIdx]; + auto pos = std::lower_bound(begin, end, val); + if(pos != end) + *pos = val; + else + *(end-1) = val; + } + } + // we will correctly overwrite the indices used for reverse mapping in the next call back + setForcedIndices(nullptr); + } node->val()->get(indices_); // set the value of the field indices_ whenever the graph traverses this node }); diff --git a/src/microsoft/quicksand.cpp b/src/microsoft/quicksand.cpp index 75f224d9a..3dce71d57 100644 --- a/src/microsoft/quicksand.cpp +++ b/src/microsoft/quicksand.cpp @@ -169,7 +169,8 @@ class BeamSearchDecoder : public IBeamSearchDecoder { scorer->setShortlistGenerator(shortListGen); } - ABORT_IF(qsBatches.empty(), "No input batch provided"); + ABORT_IF(qsBatches.empty(), "No input batch provided"); + ABORT_IF(qsBatches.size() > 2, "More than two sub-batches provided"); auto createSubBatch = [maxLength](const QSBatch& qsBatch, Ptr vocab) { size_t batchSize = qsBatch.size(); @@ -188,9 +189,12 @@ class BeamSearchDecoder : public IBeamSearchDecoder { }; auto srcSubBatch = createSubBatch(qsBatches[0], vocabs_[0]); - auto tgtSubBatch = createSubBatch(qsBatches[1], vocabs_[1]); + std::vector> subBatches{ srcSubBatch }; + if(qsBatches.size() == 2) { + auto tgtSubBatch = createSubBatch(qsBatches[1], vocabs_[1]); + subBatches.push_back(tgtSubBatch); + } - std::vector> subBatches{ srcSubBatch, tgtSubBatch }; auto batch = New(subBatches); std::vector sentIds(batch->size(), 0); batch->setSentenceIds(sentIds); diff --git a/src/translator/sampling.h 
b/src/translator/sampling.h index 043f79bd6..764ef1799 100644 --- a/src/translator/sampling.h +++ b/src/translator/sampling.h @@ -116,6 +116,8 @@ class DistModifier { batch_(batch), invalidPathScore_(invalidPathScore) { + forceDecode_ = forceDecode_ && batch->sets() > 1; // force-decoding if we have multiple sets in the batch + // if we are force-decoding with a short list we need to set the forced token ids early if(shortlist_ && forceDecode_) { lazyCreateForceBatch(); @@ -217,13 +219,18 @@ class DistModifier { lazyCreateForceBatch(); ABORT_IF(!forceBatch_, "forceBatch_ is undefined??"); + + auto factoredVocab = batch_->front()->vocab()->tryAs(); + ABORT_IF(factoredVocab, "Factored vocabularies are not supported for force-decoding"); + // if we remove batch entries during decoding (finished decoding) then adjust here if(forceBatch_->shape()[-2] != batchIndices.size()) forceBatch_ = index_select(forceBatch_, -2, batchIndices); // get vocab index and probability for force-decoded tokens for the current time step - Expr forceIndices = slice(forceBatch_, /*axis=*/-3, pos); // [1, 1, dimBatch, 1] + Expr posIndices = slice(forceBatch_, /*axis=*/-3, pos); // [1, 1, dimBatch, 1] + Expr forceIndices = posIndices; if(shortlist_) { auto lsh = std::dynamic_pointer_cast(shortlist_); ABORT_IF(!lsh, "Force-decoding not supported with shortlists other than LSH"); @@ -267,7 +274,7 @@ class DistModifier { // via interpolating by a selector. In marian eosId is used for padding, so this works everywhere and eos for unfinished hyps means // free decoding or sampling. WordIndex eosId = batch_->back()->vocab()->getEosId().toWordIndex(); - auto interpol = eq(cast(forceIndices, scores->value_type()), (float)eosId); + auto interpol = eq(cast(posIndices, scores->value_type()), (float)eosId); return interpol * scores + (1.f - interpol) * forcedScores; } From 2745b773bc8b8402ad536737b33d4948bbac8542 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Sat, 20 Apr 2024 15:24:52 +0000 Subject: [PATCH 24/26] Merged PR 34029: Fix regressions in new layer framework for ALIBI-based decoding * Fixes regressions in new layer framework for ALIBI-based decoding --- CHANGELOG.md | 1 + VERSION | 2 +- src/layers_new/alibi.h | 28 +++++++++------ src/layers_new/attention.h | 12 +++++-- src/layers_new/rnn.h | 1 - src/models/transformer_factory.h | 62 ++++++++++++++++---------------- 6 files changed, 60 insertions(+), 46 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0b52200ed..dbf50d5bd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,6 +40,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - New experimental layer framework for Transformer-like models. 
### Fixed +- Fixed ALiBI states and caching in new layer framework - Throw exception when forcing with FS vocabs - Fixed force-decoding with LSH - Fixed force-decoding for beam-size > 1 diff --git a/VERSION b/VERSION index 7542664ec..1e696b303 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.28 +v1.12.29 diff --git a/src/layers_new/alibi.h b/src/layers_new/alibi.h index 66c102235..e4d58e8b5 100644 --- a/src/layers_new/alibi.h +++ b/src/layers_new/alibi.h @@ -155,19 +155,25 @@ class AlibiAttentionMaskProcessor : public AttentionMaskProcessor { // Apply the alibi mask to the given query and mask virtual Expr apply(Expr query, Expr mask) const override { - if(!trainable) { - const_cast(slopes) = graph()->constant({numHeads, 1, 1}, initSlopes()); - const_cast(biases) = graph()->constant({numHeads, 1, 1}, initBiases()); - } else { - registerParameterLazy(slopes, Shape({numHeads, 1, 1}), initSlopes()); - registerParameterLazy(biases, Shape({numHeads, 1, 1}), initBiases()); - } - Expr shift = nullptr; - int start = 0; + auto processMask = [this, query](Expr mask) { + if(!trainable) { + const_cast(slopes) = graph()->constant({numHeads, 1, 1}, initSlopes()); + const_cast(biases) = graph()->constant({numHeads, 1, 1}, initBiases()); + } else { + registerParameterLazy(slopes, Shape({numHeads, 1, 1}), initSlopes()); + registerParameterLazy(biases, Shape({numHeads, 1, 1}), initBiases()); + } + + Expr shift = nullptr; + int start = 0; + auto alibiMask = alibiLogMask(mask, query, slopes, biases, shift, numHeads, start); + return alibiMask; + }; - auto alibiMask = alibiLogMask(mask, query, slopes, biases, shift, numHeads, start); - return alibiMask; + // recompute the mask if input mask changes (different memory address), otherwise return cached version + auto equal = [](Expr a, Expr b) { return a == b; }; + return cachedMask_->apply(mask, processMask, equal); } }; diff --git a/src/layers_new/attention.h b/src/layers_new/attention.h index 9bd31baa0..cc65c9fa7 100644 --- a/src/layers_new/attention.h +++ b/src/layers_new/attention.h @@ -310,8 +310,16 @@ struct AttentionMaskProcessor : public MaskProcessor { if(!mask) return nullptr; - // shape of mask should be [1, dimBatch, dimKeys, 1] - return marian::logMask(mask, numHeads, /*addCausalMask=*/false); // [1, dimBatch * numHeads, 1, dimKeys] + // shape of input `mask` should be [1, dimBatch, dimKeys, 1] + // output shape will be // [1, dimBatch * numHeads, 1, dimKeys] if addCausalMask is false + // or [1, dimBatch * numHeads, dimKeys, dimKeys] if addCausalMask is true + auto processMask = [this](Expr mask) { return marian::logMask(mask, numHeads, /*addCausalMask=*/false); }; + + // recompute the mask if input mask changes (different memory address), otherwise return cached version + auto equal = [](Expr a, Expr b) { return a == b; }; + + // recompute the mask if the shape changes, otherwise return cached version + return cachedMask_->apply(mask, processMask, equal); } }; diff --git a/src/layers_new/rnn.h b/src/layers_new/rnn.h index 9a9cd067f..3e3307476 100644 --- a/src/layers_new/rnn.h +++ b/src/layers_new/rnn.h @@ -132,7 +132,6 @@ class RNN final : public Layer, public IBinaryLayer, public IBinaryDecoderLayer } state->as()->set(cellState->recurrent); - state->setPosition(cellState->position); // during decoding again, this is a no-op Expr output = swapTimeBatch(concatenate(outputs, dimTimeAxis)); diff --git a/src/models/transformer_factory.h b/src/models/transformer_factory.h index fcd90ad63..06cf5a995 100644 --- a/src/models/transformer_factory.h +++ 
b/src/models/transformer_factory.h @@ -108,48 +108,48 @@ class TransformerLegacy : public EncoderDecoder { prefix = "TransformerBatchDecoder"; for(int layerNo = 0; layerNo < opt("dec-depth"); ++layerNo) { // name maps for decoder self-attention blocks - nameMap[fmt::format("decoder_l{}_self_Wq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->qProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_bq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->qProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->qProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_bq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->qProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_Wk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->kProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_bk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->kProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->kProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_bk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->kProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_Wv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->vProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_bv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->vProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->vProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_bv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->vProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_Wo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->oProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_bo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->oProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->oProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_bo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->selfAttention->oProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_Wo_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_self_Wo_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wo_ln_scale", layerNo + 1)] = 
fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_self_Wo_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->bias", prefix, layerNo); // name maps for decoder SSRU - nameMap[fmt::format("decoder_l{}_rnn_W", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->iProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_rnn_W", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->iProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_rnn_Wf", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->fProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_rnn_bf", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->fProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_rnn_Wf", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->fProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_rnn_bf", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->cell->fProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_rnn_Wo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->oProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_rnn_bo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->oProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_rnn_Wo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->oProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_rnn_bo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->rnn->oProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_rnn_ffn_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_rnn_ffn_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_rnn_ffn_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_rnn_ffn_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->autoRegressiveBlock->postprocessor->norm->bias", prefix, layerNo); // name maps for decoder cross-attention blocks - nameMap[fmt::format("decoder_l{}_context_Wq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->qProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_context_bq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->qProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_Wq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->qProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_bq", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->qProj->bias", prefix, layerNo); 
- nameMap[fmt::format("decoder_l{}_context_Wk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->kProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_context_bk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->kProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_Wk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->kProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_bk", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->kProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_context_Wv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->vProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_context_bv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->vProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_Wv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->vProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_bv", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->vProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_context_Wo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->oProj->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_context_bo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->oProj->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_Wo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->oProj->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_bo", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->crossAttention->oProj->bias", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_context_Wo_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->postprocessor->norm->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_context_Wo_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->postprocessor->norm->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_Wo_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->postprocessor->norm->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_context_Wo_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->crossAttentionBlock->postprocessor->norm->bias", prefix, layerNo); // name maps for decoder FFN blocks int mult = 3; @@ -160,11 +160,11 @@ class TransformerLegacy : public EncoderDecoder { mult = 1; layerType = "LinearReluDropout"; } - nameMap[fmt::format("decoder_l{}_ffn_W{}", layerNo + 1, ffnLayerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->filterBlock->layers->at({})->as()->weight", prefix, layerNo, mult * ffnLayerNo, layerType); - nameMap[fmt::format("decoder_l{}_ffn_b{}", layerNo + 1, ffnLayerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->filterBlock->layers->at({})->as()->bias", prefix, layerNo, mult * ffnLayerNo, layerType); + 
nameMap[fmt::format("decoder_l{}_ffn_W{}", layerNo + 1, ffnLayerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->filterBlock->layers->at({})->as()->weight", prefix, layerNo, mult * ffnLayerNo, layerType); + nameMap[fmt::format("decoder_l{}_ffn_b{}", layerNo + 1, ffnLayerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->filterBlock->layers->at({})->as()->bias", prefix, layerNo, mult * ffnLayerNo, layerType); } - nameMap[fmt::format("decoder_l{}_ffn_ffn_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->filterBlock->postprocessor->norm->weight", prefix, layerNo); - nameMap[fmt::format("decoder_l{}_ffn_ffn_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->filterBlock->postprocessor->norm->bias", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_ffn_ffn_ln_scale", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->filterBlock->postprocessor->norm->weight", prefix, layerNo); + nameMap[fmt::format("decoder_l{}_ffn_ffn_ln_bias", layerNo + 1)] = fmt::format("{}->decoder->layers->at({})->as()->filterBlock->postprocessor->norm->bias", prefix, layerNo); } return nameMap; From 07042cf2cea3d2b745a69bd14b76acaa1df0a913 Mon Sep 17 00:00:00 2001 From: Marcin Junczys-Dowmunt Date: Tue, 23 Apr 2024 12:31:12 +0000 Subject: [PATCH 25/26] Merged PR 34167: Do not mmap files for conversion in Quicksand API * Do not mmap files for conversion --- CHANGELOG.md | 1 + VERSION | 2 +- src/microsoft/quicksand.cpp | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dbf50d5bd..1b908d9cc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,6 +40,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - New experimental layer framework for Transformer-like models. ### Fixed +- Do not mmap files for conversion via Quicksand API - Fixed ALiBI states and caching in new layer framework - Throw exception when forcing with FS vocabs - Fixed force-decoding with LSH diff --git a/VERSION b/VERSION index 1e696b303..1501aad44 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.29 +v1.12.30 diff --git a/src/microsoft/quicksand.cpp b/src/microsoft/quicksand.cpp index 3dce71d57..67fef8592 100644 --- a/src/microsoft/quicksand.cpp +++ b/src/microsoft/quicksand.cpp @@ -303,7 +303,7 @@ DecoderCpuAvxVersion parseCpuAvxVersion(std::string name) { bool convertModel(std::string inputFile, std::string outputFile, int32_t targetPrec, int32_t lshNBits) { std::cerr << "Converting from: " << inputFile << ", to: " << outputFile << ", precision: " << targetPrec << std::endl; - auto modelFile = New(inputFile); + auto modelFile = New(inputFile, marian::io::MmapMode::DontMmap); YAML::Node config = modelFile->getYamlFromModel(); std::stringstream configStr; From a6ab8af8fc8f02c130819bfe7e07318ec958e323 Mon Sep 17 00:00:00 2001 From: Thamme Gowda Date: Thu, 27 Jun 2024 15:21:08 +0000 Subject: [PATCH 26/26] Merged PR 34540: pymarian: build for multiple python versions; disable tcmalloc; huggingface backed for gated COMETs pymarian upgrades * Support for build for multiple python versions at once; borrowed a cmake script from AMD * use "build" instead of "pip wheel"; build is more stable and leaves less junk on file system * Disable tcmalloc for pymarian * Added support for [huggingface backend](https://huggingface.co/collections/Unbabel/marian-comet-metrics-and-qe-664e28c82743db6709d022fc). Currently enabled for gated comet models only. 
* Added `--cache` argument to pymarian-eval CLI; Useful for accessing cache from blobstorage mount path for gated models --- CHANGELOG.md | 4 +- CMakeLists.txt | 6 ++ VERSION | 2 +- cmake/PythonModules.cmake | 119 ++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 45 +++++++----- src/python/README.md | 68 +++++++++++++++++- src/python/pymarian/defaults.py | 37 ++++++---- src/python/pymarian/eval.py | 6 +- src/python/pymarian/utils.py | 49 +++++++++---- src/python/pyproject.toml | 3 +- src/python/setup.py | 17 +++-- 11 files changed, 296 insertions(+), 60 deletions(-) create mode 100644 cmake/PythonModules.cmake diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b908d9cc..afa4465ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] - Fixed compilation with clang 16.0.6 -- Added Threads::Threads to EXT_LIBS - +- Added Threads::Threads to `EXT_LIBS` +- Updates to pymarian: building for multiple python versions; disabling tcmalloc; hosting gated COMETs on HuggingFace ### Added - Added `--normalize-gradient-by-ratio` to mildly adapt gradient magnitude if effective batch size diverges from running average effective batch size. diff --git a/CMakeLists.txt b/CMakeLists.txt index b6aa74297..ee05cf99b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -123,6 +123,12 @@ set(CMAKE_THREAD_PREFER_PTHREAD TRUE) set(THREADS_PREFER_PTHREAD_FLAG TRUE) find_package(Threads REQUIRED) set(EXT_LIBS ${EXT_LIBS} Threads::Threads) + +# disable tcmalloc if pymarian=on +if(USE_TCMALLOC AND PYMARIAN) + message(WARNING "TCMalloc can cause segfaults with some python libraries. Hence disabling TCMalloc for a robust pymarian build.") + set(USE_TCMALLOC off) +endif() ######## ############################################################################### diff --git a/VERSION b/VERSION index 1501aad44..7b4d55e09 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v1.12.30 +v1.12.31 diff --git a/cmake/PythonModules.cmake b/cmake/PythonModules.cmake new file mode 100644 index 000000000..062155647 --- /dev/null +++ b/cmake/PythonModules.cmake @@ -0,0 +1,119 @@ +# Retrieved from ROCm/AMDMIGraphX repo @ https://github.com/ROCm/AMDMIGraphX/blob/develop/cmake/PythonModules.cmake +##################################################################################### +# The MIT License (MIT) +# +# Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +##################################################################################### +if(COMMAND find_python) + return() +endif() + + +macro(py_exec) + execute_process(${ARGN} RESULT_VARIABLE RESULT) + if(NOT RESULT EQUAL 0) + message(FATAL_ERROR "Process failed: ${ARGN}") + endif() +endmacro() + +set(PYBIND11_NOPYTHON On) +# this wont work if pybind11 is git submodule +#find_package(pybind11 REQUIRED) + +## ===================== +set(PYTHON_SEARCH_VERSIONS 3.7 3.8 3.9 3.10 3.11 3.12 3.13) +set(PYTHON_DISABLE_VERSIONS "" CACHE STRING "") +foreach(PYTHON_DISABLE_VERSION ${PYTHON_DISABLE_VERSIONS}) + list(REMOVE_ITEM PYTHON_SEARCH_VERSIONS ${PYTHON_DISABLE_VERSION}) +endforeach() + +## ===================== + +macro(find_python version) + find_program(PYTHON_CONFIG_${version} python${version}-config) + if(EXISTS ${PYTHON_CONFIG_${version}}) + py_exec(COMMAND ${PYTHON_CONFIG_${version}} --includes OUTPUT_VARIABLE _python_include_args) + execute_process(COMMAND ${PYTHON_CONFIG_${version}} --ldflags --embed OUTPUT_VARIABLE _python_ldflags_args RESULT_VARIABLE _python_ldflags_result) + if(NOT _python_ldflags_result EQUAL 0) + py_exec(COMMAND ${PYTHON_CONFIG_${version}} --ldflags OUTPUT_VARIABLE _python_ldflags_args) + endif() + separate_arguments(_python_includes UNIX_COMMAND "${_python_include_args}") + separate_arguments(_python_ldflags UNIX_COMMAND "${_python_ldflags_args}") + string(REPLACE "-I" "" _python_includes "${_python_includes}") + add_library(python${version}::headers INTERFACE IMPORTED GLOBAL) + set_target_properties(python${version}::headers PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${_python_includes}" + ) + add_library(python${version}::runtime INTERFACE IMPORTED GLOBAL) + set_target_properties(python${version}::runtime PROPERTIES + INTERFACE_LINK_OPTIONS "${_python_ldflags}" + INTERFACE_LINK_LIBRARIES python${version}::headers + ) + py_exec(COMMAND ${PYTHON_CONFIG_${version}} --prefix OUTPUT_VARIABLE _python_prefix) + string(STRIP "${_python_prefix}" _python_prefix) + set(PYTHON_${version}_EXECUTABLE "${_python_prefix}/bin/python${version}" CACHE PATH "") + endif() +endmacro() + +####### +function(py_extension name version) + set(_python_module_extension ".so") + if(version VERSION_GREATER_EQUAL 3.0) + py_exec(COMMAND ${PYTHON_CONFIG_${version}} --extension-suffix OUTPUT_VARIABLE _python_module_extension) + string(STRIP "${_python_module_extension}" _python_module_extension) + endif() + set_target_properties(${name} PROPERTIES PREFIX "" SUFFIX "${_python_module_extension}") +endfunction() + +function(py_add_module NAME) + set(options) + set(oneValueArgs PYTHON_VERSION PYTHON_MODULE) + set(multiValueArgs) + + cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(PYTHON_VERSION ${PARSE_PYTHON_VERSION}) + + add_library(${NAME} MODULE ${PARSE_UNPARSED_ARGUMENTS}) + pybind11_strip(${NAME}) + py_extension(${NAME} ${PYTHON_VERSION}) + target_link_libraries(${NAME} PRIVATE pybind11::module pybind11::lto python${PYTHON_VERSION}::headers) + set_target_properties(${NAME} PROPERTIES + OUTPUT_NAME ${PARSE_PYTHON_MODULE} + C_VISIBILITY_PRESET hidden + CXX_VISIBILITY_PRESET hidden + ) + +endfunction() + +### +set(_PYTHON_VERSIONS) +foreach(PYTHON_VERSION ${PYTHON_SEARCH_VERSIONS}) + 
find_python(${PYTHON_VERSION}) + if(TARGET python${PYTHON_VERSION}::headers) + message(STATUS "Python ${PYTHON_VERSION} found.") + list(APPEND _PYTHON_VERSIONS ${PYTHON_VERSION}) + else() + message(STATUS "Python ${PYTHON_VERSION} not found.") + endif() +endforeach() +# Make the variable global +set(PYTHON_VERSIONS "${_PYTHON_VERSIONS}" CACHE INTERNAL "" FORCE) + diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c0c4f74b9..fb5bdca98 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -292,24 +292,35 @@ endif(GENERATE_MARIAN_INSTALL_TARGETS) if(PYMARIAN) - if(NOT PYTHON_EXECUTABLE) - set(PYTHON_EXECUTABLE python) # default to python in the environment - endif() - + # python libs which use different version of tcmalloc (e.g. pandas) can cause segfaults, so we disable it include_directories(3rd_party/pybind11/include) add_subdirectory(3rd_party/pybind11) + include(PythonModules) + # print all python versions + message(STATUS "Going to look for these Python versions: ${PYTHON_VERSIONS}") + add_custom_target(_pymarian) + foreach(PYTHON_VERSION ${PYTHON_VERSIONS}) # defined in PythonModules.cmake + py_add_module(_pymarian_${PYTHON_VERSION} python/binding/bind.cpp PYTHON_VERSION ${PYTHON_VERSION} PYTHON_MODULE _pymarian) + target_link_libraries(_pymarian_${PYTHON_VERSION} PUBLIC marian ${EXT_LIBS}) + if(CUDA_FOUND) + target_link_libraries(_pymarian_${PYTHON_VERSION} PUBLIC marian_cuda) + endif(CUDA_FOUND) + add_dependencies(_pymarian _pymarian_${PYTHON_VERSION}) + endforeach() + + # ==== make .whl files ==== + # IMPORTANT: do not parallelize the wheel builds; they conflict on a few directories (e.g. *.egg-info) + set(LAST_PYMARIAN_TGT "") + foreach(PYTHON_VERSION ${PYTHON_VERSIONS}) + add_custom_target( + pymarian_${PYTHON_VERSION} ALL + COMMAND ${PYTHON_${PYTHON_VERSION}_EXECUTABLE} -m pip install --upgrade pip build + COMMAND ${CMAKE_COMMAND} -E env "CMAKE_BINARY_DIR=${PROJECT_BINARY_DIR}" "CMAKE_SOURCE_DIR=${PROJECT_SOURCE_DIR}" + ${PYTHON_${PYTHON_VERSION}_EXECUTABLE} -m build --wheel ${PROJECT_SOURCE_DIR}/src/python -o "${PROJECT_BINARY_DIR}" + DEPENDS _pymarian_${PYTHON_VERSION} ${LAST_PYMARIAN_TGT} + VERBATIM COMMENT "===Building pymarian wheel for python${PYTHON_VERSION}===" + ) + set(LAST_PYMARIAN_TGT pymarian_${PYTHON_VERSION}) + endforeach(PYTHON_VERSION) - pybind11_add_module(_pymarian MODULE python/binding/bind.cpp) - target_link_libraries(_pymarian PUBLIC marian) - if(CUDA_FOUND) - target_link_libraries(_pymarian PUBLIC marian_cuda) - endif(CUDA_FOUND) - install(TARGETS _pymarian DESTINATION .) - - # build pymarian wheel - add_custom_target(pymarian ALL - ${CMAKE_COMMAND} -E env "CMAKE_BINARY_DIR=${PROJECT_BINARY_DIR}" "CMAKE_SOURCE_DIR=${PROJECT_SOURCE_DIR}" - "${PYTHON_EXECUTABLE}" -m pip wheel -v --no-input ${PROJECT_SOURCE_DIR}/src/python -w "${PROJECT_BINARY_DIR}" - DEPENDS _pymarian - VERBATIM COMMENT "Building pymarian wheel") endif(PYMARIAN) diff --git a/src/python/README.md b/src/python/README.md index d3fc34e25..82d300675 100644 --- a/src/python/README.md +++ b/src/python/README.md @@ -13,7 +13,7 @@ cmake --build build -j # -j option parallelizes build on all cpu cores python -m pip install build/pymarian-*.whl ``` -Since the above commands uses `python` executable in the PATH to determine Python version to compile marian native extension, make sure to have the desired `python` executable in your environment _before_ invoking these commands. 
+The above commands use `python` executable in the PATH to determine Python version for compiling marian native extension. Make sure to have the desired `python` executable in your environment _before_ invoking these cmake commands. ## Python API @@ -96,6 +96,7 @@ options: -ws WORKSPACE, --workspace WORKSPACE Workspace memory (default: 8000) -pc, --print-cmd Print marian evaluate command and exit (default: False) + --cache CACHE Cache directory for storing models (default: $HOME/.cache/marian/metric) More info at https://github.com/marian-nmt/marian-dev. This CLI is loaded from .../python3.10/site-packages/pymarian/eval.py (version: 1.12.25) @@ -157,6 +158,71 @@ python -m pytest -s src/python/tests/regression ``` +## Release Instructions + +### Building Pymarian for Multiple Python Versions + +Our CMake scripts detects `python3.*` available in PATH and builds pymarian for each. +To support a specific version of python, make the `python3.x` executable available in PATH prior to running cmake. +This can be achieved by (without conflicts) using `conda` or `mamba`. + + +```bash +# setup mamba if not already; Note: you may use conda as well +which mamba || { + name=Miniforge3-$(uname)-$(uname -m).sh + wget "https://github.com/conda-forge/miniforge/releases/latest/download/$name" \ + && bash $name -b -p ~/mambaforge && ~/mambaforge/bin/mamba init bash && rm $name +} + +# create environment for each version +versions="$(echo 3.{12,11,10,9,8,7})" +for version in $versions; do + echo "python $version" + mamba env list | grep -q "^py${version}" || mamba create -q -y -n py${version} python=${version} +done + +# stack all environments +for version in $versions; do mamba activate py${version} --stack; done +# check if all python versions are available +for version in $versions; do which python$version; done + + +# Build as usual +cmake . -B build -DCOMPILE_CUDA=off -DPYMARIAN=on +cmake --build build -j +ls build/pymarian*.whl +``` + +### Upload to PyPI +```bash +twine upload -r testpypi build/*.whl + +twine upload -r pypi build/*.whl +``` + +__Initial Setup:__ create `~/.pypirc` with following: + +```ini +[distutils] +index-servers = + pypi + testpypi + +[pypi] +repository: https://upload.pypi.org/legacy/ +username:__token__ +password: + +[testpypi] +repository: https://test.pypi.org/legacy/ +username:__token__ +password: +``` +Obtain token from https://pypi.org/manage/account/ + + + ## Known issues 1. 
In conda or mamba environment, if you see `.../miniconda3/envs//bin/../lib/libstdc++.so.6: version 'GLIBCXX_3.4.30' not found` error, diff --git a/src/python/pymarian/defaults.py b/src/python/pymarian/defaults.py index 2fdeff278..51c29c69d 100644 --- a/src/python/pymarian/defaults.py +++ b/src/python/pymarian/defaults.py @@ -1,9 +1,12 @@ from pathlib import Path - +import os class Defaults: BASE_URL = "https://textmt.blob.core.windows.net/www/marian/metric" - CACHE_PATH = Path.home() / '.cache' / 'marian' / 'metric' + + DEF_CACHE_PATH = Path.home() / '.cache' / 'marian' / 'metric' + # user might also change this from CLI at runtime + CACHE_PATH = Path(os.environ['MARIAN_CACHE']) if os.environ.get('MARIAN_CACHE', '').strip() else DEF_CACHE_PATH MINI_BATCH = 16 MAXI_BATCH = 256 WORKSPACE = 8000 @@ -12,20 +15,24 @@ class Defaults: FLOAT_PRECISION = 4 FILE_LOCK_TIMEOUT = 1 * 60 * 60 # seconds => 1 hour PROGRESS_BAR = True - - # metric name to model type; lowercase all IDs + HUGGINGFACE = "huggingface" + AZURE = "azure" + COMET_VOCAB_REPO = "microsoft/infoxlm-large" + # metric id -> (model_type, huggingface_org/model_id) + # unbabel agreed to host models within their org and added the same gating/licensing mechanism + # we hosted bleurt ourself (Apache2.0) on https://huggingface.co/marian-nmt KNOWN_METRICS = { - "bleurt-20": "bleurt", - "wmt20-comet-da": "comet", - "wmt20-comet-qe-da": "comet-qe", - "wmt20-comet-qe-da-v2": "comet-qe", - "wmt21-comet-da": "comet", - "wmt21-comet-qe-da": "comet-qe", - "wmt21-comet-qe-mqm": "comet-qe", - "wmt22-comet-da": "comet", - "wmt22-cometkiwi-da": "comet-qe", - "xcomet-xl": "comet", - "xcomet-xxL": "comet", + "bleurt-20": ["bleurt", "marian-nmt/bleurt-20"], + "wmt20-comet-da": ["comet", "unbabel/wmt20-comet-da-marian"], + "wmt20-comet-qe-da": ["comet-qe", "unbabel/wmt20-comet-qe-da-marian"], + "wmt20-comet-qe-da-v2": ["comet-qe", "unbabel/wmt20-comet-qe-da-v2-marian"], + "wmt21-comet-da": ["comet", "unbabel/wmt21-comet-da-marian"], + "wmt21-comet-qe-da": ["comet-qe", "unbabel/wmt21-comet-qe-da-marian"], + "wmt21-comet-qe-mqm": ["comet-qe", "unbabel/wmt21-comet-qe-mqm-marian"], + "wmt22-comet-da": ["comet", "unbabel/wmt22-comet-da-marian"], + "wmt22-cometkiwi-da": ["comet-qe", "unbabel/wmt22-cometkiwi-da-marian"], + "wmt23-cometkiwi-da-xl": ["comet-qe", "unbabel/wmt23-cometkiwi-da-xl-marian"], + "wmt23-cometkiwi-da-xxl": ["comet-qe", "unbabel/wmt23-cometkiwi-da-xxl-marian"], } # model type to field order diff --git a/src/python/pymarian/eval.py b/src/python/pymarian/eval.py index 4b5e5f02c..d355e100f 100755 --- a/src/python/pymarian/eval.py +++ b/src/python/pymarian/eval.py @@ -25,7 +25,7 @@ def parse_args(): f'This CLI is loaded from {__file__} (version: {__version__})', ) - known_metrics = ', '.join(Defaults.KNOWN_METRICS) + known_metrics = ', '.join(Defaults.KNOWN_METRICS.keys()) parser.add_argument( '-m', '--model', @@ -89,6 +89,7 @@ def parse_args(): parser.add_argument( '-pc', '--print-cmd', action="store_true", help="Print marian evaluate command and exit" ) + parser.add_argument('--cache', help='Cache directory for storing models', type=Path, default=Defaults.CACHE_PATH) args = parser.parse_args() return vars(args) @@ -197,6 +198,7 @@ def main(**args): log.debug(args) else: args['quiet'] = '' + Defaults.CACHE_PATH = args.pop('cache') model_id = args.pop('model') model_path = Path(model_id) @@ -221,7 +223,7 @@ def main(**args): model_path = get_model_path(model_id) if not vocab_path: # if vocab is not given, resolve it from cache vocab_path = 
get_vocab_path(model_id) - args['like'] = Defaults.KNOWN_METRICS.get(model_id, Defaults.DEF_MODEL_TYPE) + args['like'] = Defaults.KNOWN_METRICS.get(model_id, [Defaults.DEF_MODEL_TYPE])[0] except ValueError as e: raise ValueError(f'Invalid model ID: {model_id}') from e diff --git a/src/python/pymarian/utils.py b/src/python/pymarian/utils.py index c3a4efab0..0d33ae455 100644 --- a/src/python/pymarian/utils.py +++ b/src/python/pymarian/utils.py @@ -6,13 +6,14 @@ import logging as log import shutil from pathlib import Path -from typing import List, Tuple +from typing import Tuple import portalocker import requests from .defaults import Defaults from .pypdl import Downloader +from huggingface_hub import hf_hub_download log.basicConfig(level=log.INFO) @@ -40,15 +41,22 @@ def get_model_path(model_name, progress_bar: bool = PROGRESS_BAR) -> Path: If necessary, this function downloads checkpoint to a local cache directory. :param model_name: model name + :param progress_bar: show progress bar while downloading :return: checkpoint path """ validate_id(model_name) - chkpt_url = f'{Defaults.BASE_URL}/{model_name}/model.{model_name}.bin' - - local_dir = Defaults.CACHE_PATH / model_name - chkpt_local = local_dir / f'model.{model_name}.bin' - - maybe_download_file(chkpt_url, chkpt_local) + hf_repo_id = Defaults.KNOWN_METRICS.get(model_name, [None, None])[1] + if hf_repo_id: + # TODO: support progress bar switch + chkpt_local = hf_hub_download(repo_id=hf_repo_id, filename="checkpoints/marian.model.bin", + cache_dir=Defaults.CACHE_PATH) + chkpt_local = Path(chkpt_local) + else: + chkpt_url = f'{Defaults.BASE_URL}/{model_name}/model.{model_name}.bin' + local_dir = Defaults.CACHE_PATH / model_name + chkpt_local = local_dir / f'model.{model_name}.bin' + + maybe_download_file(chkpt_url, chkpt_local, progress_bar=progress_bar) assert chkpt_local.exists(), f'Checkpoint file {chkpt_local} does not exist' return chkpt_local @@ -61,12 +69,27 @@ def get_vocab_path(model_name, progress_bar: bool = PROGRESS_BAR) -> Tuple[Path, :param progress_bar: show progress bar while downloading :return: checkpoint path, vocabulary path """ - validate_id(model_name) - local_dir = Defaults.CACHE_PATH / model_name - vocab_local = local_dir / 'vocab.spm' - - vocab_url = f'{Defaults.BASE_URL}/{model_name}/vocab.spm' - maybe_download_file(vocab_url, vocab_local, progress_bar=progress_bar) + hf_repo_id = Defaults.KNOWN_METRICS.get(model_name, [None, None])[1] + if hf_repo_id: + filename = "vocab.spm" + if 'comet' in hf_repo_id.lower(): + hf_repo_id = Defaults.COMET_VOCAB_REPO + filename = "sentencepiece.bpe.model" + # TODO: support progress bar switch + vocab_local = hf_hub_download(repo_id=hf_repo_id, filename=filename, cache_dir=Defaults.CACHE_PATH) + vocab_local = Path(vocab_local) + if vocab_local.suffix != ".spm": # marian requires .spm extension + vocab_spm = vocab_local.with_suffix(".spm") + if not vocab_spm.exists(): + vocab_spm.symlink_to(Path(vocab_local.name), target_is_directory=False) + vocab_local = vocab_spm + else: + validate_id(model_name) + local_dir = Defaults.CACHE_PATH / model_name + vocab_local = local_dir / 'vocab.spm' + + vocab_url = f'{Defaults.BASE_URL}/{model_name}/vocab.spm' + maybe_download_file(vocab_url, vocab_local, progress_bar=progress_bar) assert vocab_local.exists(), f'Vocabulary file {vocab_local} does not exist' return vocab_local diff --git a/src/python/pyproject.toml b/src/python/pyproject.toml index f2008a924..30eb16f36 100644 --- a/src/python/pyproject.toml +++ b/src/python/pyproject.toml @@ 
-33,6 +33,7 @@ dependencies = [
     "pyyaml",
     "tqdm",
     "requests",
+    "huggingface-hub==0.23.1",
 ]
 
 [project.scripts]
@@ -49,8 +50,6 @@ demos = [
     "sentence-splitter@git+https://github.com/mediacloud/sentence-splitter",
 ]
 
-[tool.setuptools]
-include-package-data = true
 
 [tool.black]
 line-length = 110
diff --git a/src/python/setup.py b/src/python/setup.py
index bcbca2c63..0e34efd30 100644
--- a/src/python/setup.py
+++ b/src/python/setup.py
@@ -71,12 +71,15 @@ def get_native_ext() -> Path:
         print(f"\t>>>Making it available under the package scope at: {native_ext_local}")
         shutil.copy(native_ext, native_ext_local)
 
-    # remove incompatible .so files from prior builds (if any)
-    for old_file in Path(__file__).parent.glob("_pymarian.*"):
-        if old_file.resolve() == native_ext_local.resolve():
-            continue
-        print(f"\t>>>Removing old file: {old_file}")
-        old_file.unlink()
+    # wheel builder adds all *.so files into *.whl making the wheel bloated; so we remove them
+    remove_old_files = True
+    if remove_old_files:
+        # remove incompatible .so files from prior builds (if any)
+        for old_file in Path(__file__).parent.glob("_pymarian.*"):
+            if old_file.resolve() == native_ext_local.resolve():
+                continue
+            print(f"\t>>>INFO:: Removing incompatible extension: {old_file}")
+            old_file.unlink()
     return native_ext_local
 
@@ -97,6 +100,6 @@ def has_ext_modules(foo):
     package_dir={"pymarian": "pymarian"},
     packages=find_namespace_packages(where=".", exclude=["tests", "binding"]),
     include_package_data=True,
-    package_data={"": [str(native_ext)]},
+    package_data={"pymarian": [str(native_ext)]},
    distclass=BinaryDistribution,
 )
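
To illustrate the model-resolution flow introduced in the last patch (PR 34540), here is a minimal, self-contained Python sketch of how a pymarian metric checkpoint is expected to be located after these changes. It is an approximation, not the shipped code: `resolve_checkpoint` is a hypothetical stand-in for `pymarian.utils.get_model_path`, only two entries of `KNOWN_METRICS` are shown, and file locking, vocabulary handling, and error handling are omitted.

```python
# Hypothetical sketch of the checkpoint-resolution flow from PR 34540; names
# mirror src/python/pymarian/{defaults,utils}.py but this is not the actual module.
import os
from pathlib import Path

from huggingface_hub import hf_hub_download

BASE_URL = "https://textmt.blob.core.windows.net/www/marian/metric"
# overridable via the MARIAN_CACHE environment variable or the new `--cache` CLI flag
CACHE_PATH = Path(os.environ.get("MARIAN_CACHE") or Path.home() / ".cache" / "marian" / "metric")

# metric id -> [model type, Hugging Face repo id]; subset of Defaults.KNOWN_METRICS
KNOWN_METRICS = {
    "bleurt-20": ["bleurt", "marian-nmt/bleurt-20"],
    "wmt22-cometkiwi-da": ["comet-qe", "unbabel/wmt22-cometkiwi-da-marian"],
}


def resolve_checkpoint(metric_id: str) -> Path:
    """Return a local path to the model checkpoint, downloading it if needed."""
    hf_repo_id = KNOWN_METRICS.get(metric_id, [None, None])[1]
    if hf_repo_id:
        # known (possibly gated) models: huggingface_hub handles auth and caching
        local = hf_hub_download(repo_id=hf_repo_id,
                                filename="checkpoints/marian.model.bin",
                                cache_dir=CACHE_PATH)
        return Path(local)
    # other models: plain HTTPS download from the Azure blob container, as before
    chkpt_url = f"{BASE_URL}/{metric_id}/model.{metric_id}.bin"
    chkpt_local = CACHE_PATH / metric_id / f"model.{metric_id}.bin"
    # maybe_download_file(chkpt_url, chkpt_local)  # helper provided by pymarian.utils
    return chkpt_local


if __name__ == "__main__":
    print(resolve_checkpoint("bleurt-20"))
```

With the accompanying CLI change, the cache location is resolved in this order: the `--cache` argument to `pymarian-eval` if given, otherwise the `MARIAN_CACHE` environment variable, otherwise `$HOME/.cache/marian/metric`.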