CANN Backend support
Co-authored-by: kandrio <[email protected]>
3manifold and kandrio committed Jan 29, 2024
1 parent c95fd4e commit 39d3229
Showing 72 changed files with 4,282 additions and 153 deletions.
107 changes: 107 additions & 0 deletions CMakeLists.txt
@@ -12,6 +12,7 @@ option(WITH_DNNL "Compile with DNNL backend" OFF)
option(WITH_ACCELERATE "Compile with Accelerate backend" OFF)
option(WITH_OPENBLAS "Compile with OpenBLAS backend" OFF)
option(WITH_RUY "Compile with Ruy backend" OFF)
option(WITH_CANN "Compile with CANN backend" OFF)
option(WITH_CUDA "Compile with CUDA backend" OFF)
option(WITH_CUDNN "Compile with cuDNN backend" OFF)
option(CUDA_DYNAMIC_LOADING "Dynamically load CUDA libraries at runtime" OFF)
@@ -21,6 +22,12 @@ option(BUILD_CLI "Compile the clients" ON)
option(BUILD_TESTS "Compile the tests" OFF)
option(BUILD_SHARED_LIBS "Build shared libraries" ON)

if(WITH_CUDA OR WITH_CUDNN)
if(WITH_CANN)
message( FATAL_ERROR "CANN backend cannot be combined with CUDA or CUDNN!" )
endif ()
endif ()

if(ENABLE_PROFILING)
message(STATUS "Enable profiling support")
add_definitions(-DCT2_ENABLE_PROFILING)
@@ -525,6 +532,105 @@ if (WITH_CUDA)
)
elseif(WITH_CUDNN)
message(FATAL_ERROR "WITH_CUDNN=ON requires WITH_CUDA=ON")
elseif(WITH_CANN)
add_definitions(-DCT2_WITH_CANN)

message(STATUS "ASCEND_TOOLKIT_HOME: $ENV{ASCEND_TOOLKIT_HOME}")
message(STATUS "LD_LIBRARY_PATH: $ENV{LD_LIBRARY_PATH}")
message(STATUS "PYTHONPATH: $ENV{PYTHONPATH}")
message(STATUS "ASCEND_AICPU_PATH: $ENV{ASCEND_AICPU_PATH}")
message(STATUS "ASCEND_OPP_PATH: $ENV{ASCEND_OPP_PATH}")
message(STATUS "TOOLCHAIN_HOME: $ENV{TOOLCHAIN_HOME}")
message(STATUS "ASCEND_HOME_PATH: $ENV{ASCEND_HOME_PATH}")
message(STATUS "PATH: $ENV{PATH}")

if(DEFINED ENV{ASCEND_CUSTOM_PATH})
set(ASCEND_DIR $ENV{ASCEND_CUSTOM_PATH})
else()
set(ASCEND_DIR /usr/local/Ascend)
endif()

message(STATUS "ASCEND_DIR: ${ASCEND_DIR}")

set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})

set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR})
set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR})

set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})

ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})

ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib})

ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})

set(extern_ascend ascend_ge ascend_graph atlas_acl CACHE INTERNAL "acllib runtime libs")

set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)

set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so)
set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so)
set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so)
set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(ACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)

message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}")
message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}")
INCLUDE_DIRECTORIES(${FWKACLLIB_INC_DIR})
INCLUDE_DIRECTORIES(${ACLLIB_INC_DIR})

ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})

ADD_LIBRARY(ascend_hccl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib})

ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib})

set(extern_ascend_cl ascendcl acl_op_compiler CACHE INTERNAL "acltoolkit libs")

list(APPEND SOURCES
src/cann/allocator.cc
src/cann/primitives.cc
src/cann/utils.cc
src/ops/topk_npu.cc
src/ops/dequantize_npu.cc
src/ops/gumbel_max_npu.cc
src/ops/topp_mask_npu.cc
src/ops/multinomial_npu.cc
src/ops/gather_npu.cc
src/ops/conv1d_npu.cc
src/ops/concat_split_slide_npu.cc
src/ops/alibi_add_npu.cc
src/ops/softmax_npu.cc
src/ops/tile_npu.cc
src/ops/rms_norm_npu.cc
src/ops/layer_norm_npu.cc
src/ops/rotary_npu.cc
src/ops/bias_add_npu.cc
src/ops/mean_npu.cc
src/ops/quantize_npu.cc)
add_library(${PROJECT_NAME} ${SOURCES})
list(APPEND LIBRARIES ${extern_ascend} ${extern_ascend_cl})
else()
add_library(${PROJECT_NAME} ${SOURCES})
endif()
@@ -540,6 +646,7 @@ set_property(TARGET ${PROJECT_NAME} APPEND PROPERTY
)

list(APPEND LIBRARIES ${CMAKE_DL_LIBS})

target_link_libraries(${PROJECT_NAME} PRIVATE ${LIBRARIES})
target_include_directories(${PROJECT_NAME} BEFORE
PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> $<INSTALL_INTERFACE:include>
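Taken together, this block wires the Ascend toolkit headers and shared libraries into the build whenever `WITH_CANN=ON`. A minimal configure sketch (the non-CANN flags are illustrative, and the guard above means `WITH_CUDA`/`WITH_CUDNN` must stay off):

```bash
# Configure and build CTranslate2 with the CANN backend from a fresh build directory.
# WITH_CANN is mutually exclusive with WITH_CUDA/WITH_CUDNN (enforced by the guard above).
mkdir build && cd build
cmake -DWITH_CANN=ON -DWITH_CUDA=OFF -DWITH_CUDNN=OFF -DCMAKE_BUILD_TYPE=Release ..
make -j"$(nproc)"
```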
6 changes: 3 additions & 3 deletions README.md
@@ -25,12 +25,12 @@ The project is production-oriented and comes with [backward compatibility guaran

## Key features

* **Fast and efficient execution on CPU and GPU**<br/>The execution [is significantly faster and requires fewer resources](#benchmarks) than general-purpose deep learning frameworks on supported models and tasks thanks to many advanced optimizations: layer fusion, padding removal, batch reordering, in-place operations, caching mechanism, etc.
* **Fast and efficient execution on CPU, GPU and NPU**<br/>The execution [is significantly faster and requires fewer resources](#benchmarks) than general-purpose deep learning frameworks on supported models and tasks thanks to many advanced optimizations: layer fusion, padding removal, batch reordering, in-place operations, caching mechanism, etc.
* **Quantization and reduced precision**<br/>The model serialization and computation support weights with [reduced precision](https://opennmt.net/CTranslate2/quantization.html): 16-bit floating points (FP16), 16-bit brain floating points (BF16), 16-bit integers (INT16), and 8-bit integers (INT8).
* **Multiple CPU architectures support**<br/>The project supports x86-64 and AArch64/ARM64 processors and integrates multiple backends that are optimized for these platforms: [Intel MKL](https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl.html), [oneDNN](https://github.com/oneapi-src/oneDNN), [OpenBLAS](https://www.openblas.net/), [Ruy](https://github.com/google/ruy), and [Apple Accelerate](https://developer.apple.com/documentation/accelerate).
* **Automatic CPU detection and code dispatch**<br/>One binary can include multiple backends (e.g. Intel MKL and oneDNN) and instruction set architectures (e.g. AVX, AVX2) that are automatically selected at runtime based on the CPU information.
* **Parallel and asynchronous execution**<br/>Multiple batches can be processed in parallel and asynchronously using multiple GPUs or CPU cores.
* **Dynamic memory usage**<br/>The memory usage changes dynamically depending on the request size while still meeting performance requirements thanks to caching allocators on both CPU and GPU.
* **Parallel and asynchronous execution**<br/>Multiple batches can be processed in parallel and asynchronously using multiple GPUs, NPUs, or CPU cores.
* **Dynamic memory usage**<br/>The memory usage changes dynamically depending on the request size while still meeting performance requirements thanks to caching allocators on CPU, GPU, and NPU.
* **Lightweight on disk**<br/>Quantization can make the models 4 times smaller on disk with minimal accuracy loss.
* **Simple integration**<br/>The project has few dependencies and exposes simple APIs in [Python](https://opennmt.net/CTranslate2/python/overview.html) and C++ to cover most integration needs.
* **Configurable and interactive decoding**<br/>[Advanced decoding features](https://opennmt.net/CTranslate2/decoding.html) allow autocompleting a partial sequence and returning alternatives at a specific location in the sequence.
8 changes: 7 additions & 1 deletion cli/translator.cc
@@ -30,7 +30,7 @@ int main(int argc, char* argv[]) {
cxxopts::value<size_t>()->default_value("1"))
("intra_threads", "Number of computation threads (set to 0 to use the default value).",
cxxopts::value<size_t>()->default_value("0"))
("device", "Device to use (can be cpu, cuda, auto).",
("device", "Device to use (can be cpu, cuda, cann, auto).",
cxxopts::value<std::string>()->default_value("cpu"))
("device_index", "Comma-separated list of device IDs to use.",
cxxopts::value<std::vector<int>>()->default_value("0"))
@@ -44,6 +44,8 @@
cxxopts::value<std::string>()->default_value("default"))
("cuda_compute_type", "Computation type on CUDA devices (overrides compute_type)",
cxxopts::value<std::string>())
("cann_compute_type", "Computation type on CANN devices (overrides compute_type)",
cxxopts::value<std::string>())
("cpu_compute_type", "Computation type on CPU devices (overrides compute_type)",
cxxopts::value<std::string>())
;
@@ -139,6 +141,10 @@
if (args.count("cuda_compute_type"))
compute_type = ctranslate2::str_to_compute_type(args["cuda_compute_type"].as<std::string>());
break;
case ctranslate2::Device::CANN:
if (args.count("cann_compute_type"))
compute_type = ctranslate2::str_to_compute_type(args["cann_compute_type"].as<std::string>());
break;
};

ctranslate2::ReplicaPoolConfig pool_config;
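With the new `--device cann` value and the `cann_compute_type` option wired into the switch above, an NPU translation can be requested from the client. A hypothetical invocation, assuming a converted model directory and a tokenized input file (both paths are placeholders); `float32` matches the compute type selected in the example output further below:

```bash
# Hypothetical invocation of the translation client on an NPU device.
# ende_ctranslate2 and input.tok.txt are placeholder paths.
./translate --model ende_ctranslate2 \
    --device cann --device_index 0 \
    --cann_compute_type float32 < input.tok.txt
```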
78 changes: 78 additions & 0 deletions docker/cann/Dockerfile_cann
@@ -0,0 +1,78 @@
# Extend/build an image for CANN support
# Ascend-cann-toolkit_<VERSION>.run is expected to exist in <project_root>/ascend_install_files

# preferably arm64
FROM ubuntu:20.04

RUN DEBIAN_FRONTEND="noninteractive" apt update && \
apt install --no-install-recommends net-tools -y && \
apt install --no-install-recommends libsqlite3-dev -y && \
apt install --no-install-recommends zlib1g -y && \
apt install --no-install-recommends openssl -y

RUN DEBIAN_FRONTEND="noninteractive" apt update && \
apt install --no-install-recommends ca-certificates -y && \
apt install --no-install-recommends bc wget -y && \
apt install --no-install-recommends curl gdb cmake gcc make g++ pkg-config unzip -y && \
apt install --no-install-recommends libblas3 liblapack3 gfortran vim -y && \
apt install --no-install-recommends liblapack-dev libblas-dev libhdf5-dev libffi-dev -y && \
apt install --no-install-recommends libssl-dev zlib1g-dev xz-utils cython3 python3-h5py -y && \
apt install --no-install-recommends libopenblas-dev libgmpxx4ldbl liblzma-dev -y && \
apt install --no-install-recommends pciutils -y


RUN DEBIAN_FRONTEND="noninteractive" apt update && \
apt-get install -y --no-install-recommends \
python3-dev \
python3-pip \
wget

RUN python3 -m pip --no-cache-dir install numpy && \
python3 -m pip --no-cache-dir install decorator && \
python3 -m pip --no-cache-dir install sympy && \
python3 -m pip --no-cache-dir install cffi && \
python3 -m pip --no-cache-dir install pyyaml && \
python3 -m pip --no-cache-dir install pathlib2 && \
python3 -m pip --no-cache-dir install protobuf && \
python3 -m pip --no-cache-dir install scipy

RUN python3 -m pip --no-cache-dir install psutil && \
python3 -m pip --no-cache-dir install requests absl-py

RUN python3 -m pip --no-cache-dir install attrs

# cleanup actions
RUN rm -rf /root/.cache/pip
RUN DEBIAN_FRONTEND="noninteractive" apt clean && rm -rf /var/lib/apt/lists/*
RUN DEBIAN_FRONTEND="noninteractive" apt autoremove -y && apt autoclean

# Install Ascend toolkit
COPY ascend_install_files ascend_install_files
RUN chmod +x ascend_install_files/Ascend-cann-toolkit_7.0.RC1.alpha001_linux-aarch64.run && \
ascend_install_files/Ascend-cann-toolkit_7.0.RC1.alpha001_linux-aarch64.run --install && \
rm -f ascend_install_files/Ascend-cann-toolkit_7.0.RC1.alpha001_linux-aarch64.run

# Add usergroup & user
RUN groupadd HwHiAiUser && useradd -g HwHiAiUser -m -d /home/HwHiAiUser HwHiAiUser

# This is copied from /usr/local/Ascend/ascend-toolkit/set_env.sh of the respective ascend-toolkit version
ENV LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:$LD_LIBRARY_PATH
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:$LD_LIBRARY_PATH
ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:$PYTHONPATH
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:$PATH
ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}

# ENV LD_LIBRARY_PATH=/usr/lib/aarch64-linux-gnu/hdf5/serial:$LD_LIBRARY_PATH
# ENV HCCL_CONNECT_TIMEOUT=7200
# ENV HCCL_WHITELIST_DISABLE=1
# ENV HCCL_SECURITY_MODE=1

ENV ASCEND_GLOBAL_LOG_LEVEL=3

# Set env vars again in case of interactive ssh connection (ascend-toolkit assumed to be already installed)
RUN cp /usr/local/Ascend/ascend-toolkit/set_env.sh /etc/profile.d/
RUN chmod 644 /etc/profile.d/set_env.sh
15 changes: 15 additions & 0 deletions docker/cann/run_container_cann.sh
@@ -0,0 +1,15 @@
#!/bin/bash

# build image that will host CANN environment
cd "$(dirname "$0")/../.." || exit
docker build -t ctranslate2-aarch64 -f docker/cann/Dockerfile_cann --platform linux/arm64 .

# run the respective container
docker run \
-d --cap-add sys_ptrace \
--pids-limit 409600 \
--privileged --shm-size=128G \
-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/dcmi:/usr/local/dcmi \
--name ctranslate2-aarch64 <container>
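Once the container is up, a shell can be attached for building and running; a short sketch (the environment line mirrors the `set_env.sh` the Dockerfile installs, for shells that do not read `/etc/profile.d`):

```bash
# Attach to the container started above.
docker exec -it ctranslate2-aarch64 bash
# Inside the container, load the Ascend toolkit environment:
source /usr/local/Ascend/ascend-toolkit/set_env.sh
```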
7 changes: 7 additions & 0 deletions docs/hardware_support.md
@@ -20,3 +20,10 @@ See the [environment variables](environment_variables.md) `CT2_USE_MKL` and `CT2
* NVIDIA GPUs with a Compute Capability greater or equal to 3.5

The driver requirement depends on the CUDA version. See the [CUDA Compatibility guide](https://docs.nvidia.com/deploy/cuda-compatibility/index.html) for more information.

## NPU

* AArch64/ARM64 processors
* Ascend AI Processor 910A or newer

`CANN` version `7.0.RC1.alpha001` or newer is required (depending on the NPU model). See the [CANN documentation](https://support.huawei.com/enterprise/en/ascend-computing/cann-pid-251168373) for more information.
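Before running CTranslate2 on an NPU it is worth checking that the device is visible to the driver. A quick check, assuming the `npu-smi` utility shipped with the Ascend driver is available (it is the tool mounted into the container by `docker/cann/run_container_cann.sh`):

```bash
# List Ascend NPUs together with their health, power and memory status.
npu-smi info
```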
10 changes: 10 additions & 0 deletions examples/cann/CMakeLists.txt
@@ -0,0 +1,10 @@
cmake_minimum_required(VERSION 3.7)
project(cann)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_BUILD_TYPE Release)
find_package(Threads)
add_executable(cann_run main.cc)
target_link_libraries(cann_run PRIVATE
${CMAKE_THREAD_LIBS_INIT}
ctranslate2
)
45 changes: 45 additions & 0 deletions examples/cann/README.md
@@ -0,0 +1,45 @@
# CANN example query
This example demonstrates a translation query running on a `CANN` device, using the English-German Transformer model trained with OpenNMT-py as described in the [CTranslate2 documentation](https://opennmt.net/CTranslate2/quickstart.html).

## Environment setup
- Create the environment: `docker/cann/Dockerfile_cann`
- Run the container: `docker/cann/run_container_cann.sh`

## Download model
```bash
wget https://s3.amazonaws.com/opennmt-models/transformer-ende-wmt-pyOnmt.tar.gz
tar xf transformer-ende-wmt-pyOnmt.tar.gz
```

## Build executable
Run `examples/cann/build_run.sh`

### Expected output

```
current path: "<current path>"
input data path: "<input data path>"
[<timestamp>] [ctranslate2] [thread 49835] [info] CPU: ARM (NEON=true)
[<timestamp>] [ctranslate2] [thread 49835] [info] - Selected ISA: NEON
[<timestamp>] [ctranslate2] [thread 49835] [info] - Use Intel MKL: false
[<timestamp>] [ctranslate2] [thread 49835] [info] - SGEMM backend: OpenBLAS (packed: false)
[<timestamp>] [ctranslate2] [thread 49835] [info] - GEMM_S16 backend: none (packed: false)
[<timestamp>] [ctranslate2] [thread 49835] [info] - GEMM_S8 backend: Ruy (packed: false, u8s8 preferred: false)
[<timestamp>] [ctranslate2] [thread 49835] [info] NPU:
[<timestamp>] [ctranslate2] [thread 49835] [info] - Number of NPU cores: 8
[<timestamp>] [ctranslate2] [thread 49835] [info] - aclrtRunMode: ACL_HOST
[<timestamp>] [ctranslate2] [thread 49835] [info] Loaded model <path> on device cann:0
[<timestamp>] [ctranslate2] [thread 49835] [info] - Binary version: 6
[<timestamp>] [ctranslate2] [thread 49835] [info] - Model specification revision: 7
[<timestamp>] [ctranslate2] [thread 49835] [info] - Selected compute type: float32
input data:
▁H ello ▁world !
Start: Warmup examples
output:
▁Hallo ▁Welt !
input data:
▁H ello ▁world !
Start: Query examples
output:
▁Hallo ▁Welt !
```
19 changes: 19 additions & 0 deletions examples/cann/build_run.sh
@@ -0,0 +1,19 @@
#!/bin/bash

# execute from project root

# first build ct2lib
rm -rf build-release/
mkdir build-release && cd build-release || exit

cmake -DWITH_CANN=ON -DCMAKE_BUILD_TYPE=Release -DBUILD_CLI=OFF -DWITH_MKL=OFF -DOPENMP_RUNTIME=COMP -DCMAKE_PREFIX_PATH="/opt/OpenBLAS" -DWITH_OPENBLAS=ON -DWITH_RUY=ON ..

make -j"$(nproc)"

rm CMakeCache.txt

# then build cann_run
cmake -DCMAKE_BUILD_TYPE=Release ../examples/cann/

make -j"$(nproc)"
# ./cann_run <ende_ctranslate2_path>
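Once both builds succeed, the example binary in `build-release/` can be pointed at a converted model directory, as the final comment suggests; a sketch with a placeholder path, to be run inside the CANN container so the Ascend libraries resolve:

```bash
# Run from build-release/; the model directory below is a placeholder.
./cann_run /path/to/ende_ctranslate2
```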