Skip to content

Commit

Permalink
merge with internal master
Browse files Browse the repository at this point in the history
  • Loading branch information
emjotde committed Aug 5, 2024
2 parents 2d067af + a6ab8af commit 2f9b6df
Show file tree
Hide file tree
Showing 161 changed files with 8,143 additions and 2,226 deletions.
7 changes: 7 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
/regression-tests
/build*
/.pytest_cache
/.vscode
/dist
/doc
.history*
8 changes: 7 additions & 1 deletion .github/workflows/macos.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,5 +49,11 @@ jobs:
./marian --version
./marian-decoder --version
./marian-scorer --version
./spm_encode --version
ls -hlv $(find . -maxdepth 1 -type f -perm +ugo+x \( -name "marian*" -o -name "spm*" \))
- name: Install PyMarian
run: |
python3 -m pip install --upgrade pip setuptools wheel pytest
CMAKE_ARGS="" python3 -m pip install -v .
python3 -m pymarian -v
MARIAN_QUIET=YES python3 -m pytest -vs src/python/tests
11 changes: 11 additions & 0 deletions .github/workflows/ubuntu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ jobs:
-DCOMPILE_CPU=${{ matrix.cpu }} \
-DCOMPILE_CUDA=${{ matrix.gpu }} \
-DCOMPILE_EXAMPLES=${{ matrix.examples }} \
-DUSE_TCMALLOC=OFF \
-DCOMPILE_SERVER=on \
-DCOMPILE_TESTS=${{ matrix.unit_tests }} \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-${{ matrix.cuda }} \
Expand Down Expand Up @@ -143,3 +144,13 @@ jobs:
./marian-server --version
./spm_encode --version
ls -hlv $(find . -maxdepth 1 -type f -executable \( -name "marian*" -o -name "spm*" \))
- name: Install PyMarian
working-directory: build
env:
CUDA_VERSION: ${{ matrix.cuda }}
run: |
python3 -m pip install --upgrade pip setuptools wheel pytest
CMAKE_ARGS="" python3 -m pip install -v .
python3 -m pymarian -v
MARIAN_QUIET=YES python3 -m pytest -vs src/python/tests
12 changes: 12 additions & 0 deletions .github/workflows/windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -134,4 +134,16 @@ jobs:
.\marian-decoder.exe --version
.\marian-scorer.exe --version
dir *.exe
cd ..
shell: cmd

- name: Install PyMarian
working-directory: src/python
run: |
python3 -m pip install --upgrade pip setuptools wheel pytest
python3 -m pip install -v .
python3 -m pymarian -v
python3 -m pytest -vs src/python/tests
env:
CUDA_VERSION: ${{ matrix.cuda }}
shell: cmd
15 changes: 13 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Config files from CMake
.history*
src/common/project_version.h
src/common/git_revision.h
src/common/build_info.cpp
Expand Down Expand Up @@ -48,6 +48,8 @@ pingme.txt
# CMake files
build
build-*
# pymarian wheels
dist/

# Examples
examples/*/*.gz
Expand All @@ -61,4 +63,13 @@ examples/mnist/*ubyte
/vs/MarianDll.VC.VC.opendb

.vs
.vscode
.vscode

# Python : pymarian
*.whl
*.egg-info
src/python/pymarian/_version.py
src/python/tests/data
__pycache__
.pytest_cache

7 changes: 5 additions & 2 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@
url = https://github.com/marian-nmt/Simple-WebSocket-Server
[submodule "src/3rd_party/ruy"]
path = src/3rd_party/ruy
url = https://github.com/google/ruy.git
url = https://github.com/marian-nmt/ruy.git
[submodule "src/3rd_party/simd_utils"]
path = src/3rd_party/simd_utils
url = https://github.com/JishinMaster/simd_utils.git
url = https://github.com/marian-nmt/simd_utils.git
[submodule "src/3rd_party/pybind11"]
path = src/3rd_party/pybind11
url = https://github.com/pybind/pybind11.git
23 changes: 21 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,21 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

## [Unreleased]
- Fixed compilation with clang 16.0.6
- Added Threads::Threads to EXT_LIBS

- Added Threads::Threads to `EXT_LIBS`
- Updates to pymarian: building for multiple python versions; disabling tcmalloc; hosting gated COMETs on HuggingFace

### Added
- Added `--normalize-gradient-by-ratio` to mildly adapt gradient magnitude if effective batch size diverges from running average effective batch size.
- Added `--no-optimizer-reload` to skip optimizer state loading during continued training or fallback.
- Added `pymarian-eval`, CLI for scoring metrics
- Added `--input-reorder pos1 pos2` option to re-order inputs internally when reading in batches. This is mostly a model property.
- Added `pymarian`: python bindings based on pybind11
- Added implementation of COMET-KIWI
- Added implementation of xCOMET-XL/XXL regressor parts (MQM interpolation missing for now)
- Added implementation of COMET-22 (reference-based) model and conversion
- Added sparsemax operator (slow version)
- Added sampling variants nucleus and epsilon, e.g. `--output-sampling nucleus 0.9` and `--output-sampling epsilon 0.02`, respectively.
- Added ALIBI related options to new layer framework.
- Added `--no-spm-encode` option, allowing the model to use vocabulary IDs directly to train/decode.
- Added MSE and MAE costs to COMET-QE training.
- Added augmentation of shuffled examples to COMET-QE training via `--comet-augment-bad`.
Expand All @@ -29,6 +40,13 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- New experimental layer framework for Transformer-like models.

### Fixed
- Do not mmap files for conversion via Quicksand API
- Fixed ALiBI states and caching in new layer framework
- Throw exception when forcing with FS vocabs
- Fixed force-decoding with LSH
- Fixed force-decoding for beam-size > 1
- Fixed lost node in mt-detect metrics
- Fixed BLEURT logmask computation
- Fixed wrong parameter name for norm in new layer framework
- Fixed unit test for LayerNorm
- Only collect batch statistics during mini-batch-fit up to actual max-length.
Expand All @@ -37,6 +55,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Correct defaults for factored embeddings such that shared library use works (move out of config.h/cpp).

### Changed
- Refactoring of model loading: mmapping now happens opportunistically; `--mmap-models` for decoding forces mmap and fails with an error if that is not possible.
- Removed --num-devices N option that wasn't really used by anyone (I assume).


Expand Down
60 changes: 38 additions & 22 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,11 @@ option(USE_MKL "Compile with MKL support" ON)
option(USE_MPI "Use MPI library" OFF)
option(USE_NCCL "Use NCCL library" ON)
option(USE_SENTENCEPIECE "Download and compile SentencePiece" ON)
option(USE_TCMALLOC "Use TCMALLOC if available" ON)
option(USE_STATIC_LIBS "Link statically against non-system libs" OFF)
option(GENERATE_MARIAN_INSTALL_TARGETS "Generate Marian install targets (requires CMake 3.12+)" OFF)
option(DETERMINISTIC "Try to make training results as deterministic as possible (e.g. for testing)" OFF)
option(PYMARIAN "Build Pymarian package which is based on pybind11" OFF)

# fbgemm and sentencepiece are both defined with "non-local" installation targets (the source projects don't define them,
# so we define them in src\3rd_party\CMakeLists.txt), but that isn't supported until CMake 3.12. Prior to CMake 3.12,
Expand Down Expand Up @@ -121,6 +123,12 @@ set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
set(THREADS_PREFER_PTHREAD_FLAG TRUE)
find_package(Threads REQUIRED)
set(EXT_LIBS ${EXT_LIBS} Threads::Threads)

# A pymarian build must not use TCMalloc: when both options are enabled,
# warn the user and override USE_TCMALLOC for the rest of the configuration.
if(PYMARIAN AND USE_TCMALLOC)
  message(WARNING "TCMalloc can cause segfaults with some python libraries. Hence disabling TCMalloc for a robust pymarian build.")
  set(USE_TCMALLOC off)
endif()
########

###############################################################################
Expand Down Expand Up @@ -148,7 +156,7 @@ if(MSVC)
set(INTRINSICS "/arch:AVX2")
# set(INTRINSICS "/arch:AVX512")
# /bigobj is necessary for expression_operators.cpp. See https://stackoverflow.com/questions/15110580/penalty-of-the-msvs-compiler-flag-bigobj
set(CMAKE_CXX_FLAGS "/EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /bigobj ${DISABLE_GLOBALLY}")
set(CMAKE_CXX_FLAGS "/permissive- /EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /bigobj ${DISABLE_GLOBALLY}")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /MT /O2 ${INTRINSICS} /Zi /MP /GL /DNDEBUG")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} /MTd /Od /Ob0 ${INTRINSICS} /RTC1 /Zi /D_DEBUG")

Expand Down Expand Up @@ -286,8 +294,8 @@ else(MSVC)
set(CMAKE_RDYNAMIC_FLAG "-rdynamic")
endif(CMAKE_COMPILER_IS_GNUCC)

set(CMAKE_CXX_FLAGS "-std=c++11 -pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} ${INTRINSICS}")
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_CXX_FLAGS "-std=c++17 -pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} ${INTRINSICS}")
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -m64 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_CXX_FLAGS_SLIM "-O3 -funroll-loops -DNDEBUG")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELEASE}")
Expand All @@ -297,7 +305,7 @@ else(MSVC)

# these need to be set separately
set(CMAKE_C_FLAGS "-pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} ${INTRINSICS}")
set(CMAKE_C_FLAGS_RELEASE "-O3 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_C_FLAGS_RELEASE "-O3 -m64 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_C_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_C_FLAGS_SLIM "-O3 -funroll-loops -DNDEBUG")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELEASE}")
Expand Down Expand Up @@ -399,6 +407,7 @@ if(CUDA_FOUND)
LIST(APPEND COMPUTE -Wno-deprecated-gpu-targets)
endif()

message(STATUS "CUDA_VERSION=${CUDA_VERSION}; CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}")
if(COMPILE_KEPLER)
message(STATUS "Compiling code for Kepler GPUs")
LIST(APPEND COMPUTE -gencode=arch=compute_35,code=sm_35;) # Tesla K40 and above
Expand Down Expand Up @@ -464,7 +473,7 @@ if(CUDA_FOUND)
if ((CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0"))
find_library(CUDA_cublasLt_LIBRARY NAMES cublasLt PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 NO_DEFAULT_PATH)
if(NOT CUDA_cublasLt_LIBRARY)
message(FATAL_ERROR "cuBLASLt library not found")
message(FATAL_ERROR "cuBLASLt library not found. -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}")
endif()
set(EXT_LIBS ${EXT_LIBS} ${CUDA_cublasLt_LIBRARY})
set(CUDA_LIBS ${CUDA_LIBS} ${CUDA_cublasLt_LIBRARY})
Expand Down Expand Up @@ -518,16 +527,21 @@ endif(COMPILE_CUDA)

# TODO: make compatible with older CUDA versions
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -O0; -g; --use_fast_math; ${COMPUTE})
list(APPEND CUDA_NVCC_FLAGS --extended-lambda; --default-stream per-thread; -O0; -g; --use_fast_math; ${COMPUTE})
else(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS --default-stream per-thread; -O3; -g; --use_fast_math; ${COMPUTE})
list(APPEND CUDA_NVCC_FLAGS --extended-lambda; --default-stream per-thread; -O3; -g; --use_fast_math; ${COMPUTE})
endif(CMAKE_BUILD_TYPE STREQUAL "Debug")
if(NOT MSVC)
# @TODO: add warnings here too
list(APPEND CUDA_NVCC_FLAGS -ccbin ${CMAKE_C_COMPILER}; -std=c++11; -Xcompiler\ -fPIC; -Xcompiler\ -Wno-unused-result; -Xcompiler\ -Wno-deprecated; -Xcompiler\ -Wno-pragmas; -Xcompiler\ -Wno-unused-value; -Xcompiler\ -Werror;)
list(APPEND CUDA_NVCC_FLAGS -ccbin ${CMAKE_C_COMPILER}; -std=c++17; -Xcompiler\ -fPIC; -Xcompiler\ -Wno-unused-result; -Xcompiler\ -Wno-deprecated; -Xcompiler\ -Wno-pragmas; -Xcompiler\ -Wno-unused-value; -Xcompiler\ -Werror;)
list(APPEND CUDA_NVCC_FLAGS ${INTRINSICS_NVCC})
else()
list(APPEND CUDA_NVCC_FLAGS -Xcompiler\ /FS; -Xcompiler\ /MT$<$<CONFIG:Debug>:d>; )
# c++17 doesn't work with CUDA 10
if ((CUDA_VERSION VERSION_EQUAL "11.0" OR CUDA_VERSION VERSION_GREATER "11.0"))
list(APPEND CUDA_NVCC_FLAGS -Xcompiler\ /std:c++17; -Xcompiler\ /FS; -Xcompiler\ /MT$<$<CONFIG:Debug>:d>; )
else()
list(APPEND CUDA_NVCC_FLAGS -Xcompiler\ /std:c++14; -Xcompiler\ /FS; -Xcompiler\ /MT$<$<CONFIG:Debug>:d>; )
endif()
endif()

list(REMOVE_DUPLICATES CUDA_NVCC_FLAGS)
Expand All @@ -543,20 +557,22 @@ if(USE_STATIC_LIBS)
endif()

###############################################################################
# Find Tcmalloc_minimal
# Find Tcmalloc_minimal
# re-used from sentencepiece
if(NOT WIN32)
if(USE_STATIC_LIBS)
find_library(TCMALLOC_LIB NAMES libtcmalloc_minimal.a)
else()
find_library(TCMALLOC_LIB NAMES tcmalloc_minimal)
endif()
if (TCMALLOC_LIB)
message(STATUS "Found TCMalloc: ${TCMALLOC_LIB}")
set(EXT_LIBS ${EXT_LIBS} ${Tcmalloc_LIBRARIES})
add_definitions(-fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free)
else()
message(STATUS "Not Found TCMalloc: ${TCMALLOC_LIB}")
# Optionally link against tcmalloc_minimal.
# The whole lookup is skipped when USE_TCMALLOC is off and is never attempted on Windows.
if(USE_TCMALLOC)
  if(NOT WIN32)
    # With USE_STATIC_LIBS, search for the static archive by its exact file name;
    # otherwise let find_library() resolve the platform's default library name.
    if(USE_STATIC_LIBS)
      find_library(TCMALLOC_LIB NAMES libtcmalloc_minimal.a)
    else()
      find_library(TCMALLOC_LIB NAMES tcmalloc_minimal)
    endif()
    if(TCMALLOC_LIB)
      message(STATUS "Found TCMalloc: ${TCMALLOC_LIB}")
      # Append the path located by find_library() above. The previous code appended
      # ${Tcmalloc_LIBRARIES}, which nothing in this section sets, so the library
      # was reported as found but never actually added to EXT_LIBS.
      set(EXT_LIBS ${EXT_LIBS} ${TCMALLOC_LIB})
      # Turn off the compiler's built-in handling of the allocation functions
      # now that an alternative allocator is linked in.
      add_definitions(-fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free)
    else()
      # TCMALLOC_LIB holds find_library()'s NOTFOUND value here; TCMalloc is optional,
      # so this is only a status message and configuration continues.
      message(STATUS "Not Found TCMalloc: ${TCMALLOC_LIB}")
    endif()
  endif()
endif()

Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
v1.12.14
v1.12.31
Loading

0 comments on commit 2f9b6df

Please sign in to comment.