forked from jax-ml/jax
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Building on jax-ml#21925, this tutorial demonstrates the use of the FFI using `ffi_call` with a simple example. I don't think this should cover all of the most advanced use cases, but it should be sufficient for the most common examples. I think it would be useful to eventually replace the existing CUDA tutorial, but I'm not sure that it'll get there in the first draft. As an added benefit, this also runs a simple test (akin to `docs/cuda_custom_call`) which actually executes using a tool chain that open source users would use in practice.
- Loading branch information
Showing
9 changed files
with
1,304 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
CMake* | ||
cmake* | ||
Makefile | ||
*.so | ||
*.dylib |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# Build script for the `rms_norm` nanobind extension module.
cmake_minimum_required(VERSION 3.18...3.27)
project(rms_norm LANGUAGES CXX)
find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED)

# Ask the nanobind Python package where its CMake config files live and add
# that directory to the search path used by find_package().
execute_process(
  COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir
  OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE NB_DIR)
list(APPEND CMAKE_PREFIX_PATH "${NB_DIR}")
find_package(nanobind CONFIG REQUIRED)

# TODO(dfm): Remove this "FetchContent" version and replace with the python
# command which is commented out below once jaxlib 0.4.31 is released.
include(FetchContent)
FetchContent_Declare(
  xla
  GIT_REPOSITORY https://github.com/openxla/xla.git
  GIT_TAG 0b35a7fcb1c2b58d657994c588d049f9fe4ad048
)
FetchContent_MakeAvailable(xla)
set(XLA_DIR "${xla_SOURCE_DIR}")
# execute_process(
#   COMMAND "${Python_EXECUTABLE}"
#   "-c" "from jax.extend import ffi; print(ffi.include_dir())"
#   OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE XLA_DIR)
message(STATUS "XLA include directory: ${XLA_DIR}")

# Build the extension from rms_norm.cc, make the XLA headers visible to it,
# and install the resulting library next to this CMakeLists.txt. The paths are
# quoted so that directories containing spaces are passed as one argument.
nanobind_add_module(rms_norm NOMINSIZE "rms_norm.cc")
target_include_directories(rms_norm PUBLIC "${XLA_DIR}")
install(TARGETS rms_norm LIBRARY DESTINATION "${CMAKE_CURRENT_LIST_DIR}")
|
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
/* Copyright 2024 The JAX Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
|
||
#include <cmath> | ||
#include <cstdint> | ||
#include <functional> | ||
#include <numeric> | ||
#include <utility> | ||
#include <type_traits> | ||
|
||
#include "nanobind/nanobind.h" | ||
#include "xla/ffi/api/c_api.h" | ||
#include "xla/ffi/api/ffi.h" | ||
|
||
namespace ffi = xla::ffi; | ||
namespace nb = nanobind; | ||
|
||
// This is the example "library function" that we want to expose to JAX. This
// isn't meant to be a particularly good implementation, it's just here as a
// placeholder for the purposes of this tutorial.
//
// RMS-normalizes one contiguous vector:
//   scale = 1 / sqrt(mean(x^2) + eps),  y[n] = x[n] * scale.
//
// Args:
//   eps:  value added to the mean square before taking the square root.
//   size: number of elements in `x` and `y`. Expected to be positive; with
//         `size == 0` the mean is 0/0 and the returned scale is NaN.
//   x:    input vector of length `size`.
//   y:    output vector of length `size`.
//
// Returns the scale factor that was applied to every element.
float ComputeRmsNorm(float eps, int64_t size, const float *x, float *y) {
  // Accumulate the sum of squares of the input.
  float sm = 0.0f;
  for (int64_t n = 0; n < size; ++n) {
    sm += x[n] * x[n];
  }
  const float scale = 1.0f / std::sqrt(sm / static_cast<float>(size) + eps);
  for (int64_t n = 0; n < size; ++n) {
    y[n] = x[n] * scale;
  }
  return scale;
}
|
||
// A helper function for extracting the relevant dimensions from `ffi::Buffer`s. | ||
// In this example, we treat all leading dimensions as batch dimensions, so this | ||
// function returns the total number of elements in the buffer, and the size of | ||
// the last dimension. | ||
template <ffi::DataType T> | ||
std::pair<int64_t, int64_t> GetDims(const ffi::Buffer<T> &buffer) { | ||
auto dims = buffer.dimensions(); | ||
if (dims.size() == 0) { | ||
return std::make_pair(0, 0); | ||
} | ||
return std::make_pair(buffer.element_count(), dims.back()); | ||
} | ||
|
||
// A wrapper function providing the interface between the XLA FFI call and our | ||
// library function `ComputeRmsNorm` above. This function handles the batch | ||
// dimensions by calling `ComputeRmsNorm` within a loop. | ||
ffi::Error RmsNormImpl(float eps, ffi::Buffer<ffi::DataType::F32> x, | ||
ffi::Result<ffi::Buffer<ffi::DataType::F32>> y) { | ||
auto [totalSize, lastDim] = GetDims(x); | ||
if (lastDim == 0) { | ||
return ffi::Error(ffi::ErrorCode::kInvalidArgument, | ||
"RmsNorm input must be an array"); | ||
} | ||
for (int64_t n = 0; n < totalSize; n += lastDim) { | ||
ComputeRmsNorm(eps, lastDim, &(x.typed_data()[n]), &(y->typed_data()[n])); | ||
} | ||
return ffi::Error::Success(); | ||
} | ||
|
||
// Wrap `RmsNormImpl` and specify the interface to XLA. This defines the
// handler symbol `RmsNorm`. The binding order must match the parameter order
// of `RmsNormImpl`: the `eps` attribute, then the input buffer, then the
// output buffer.
XLA_FFI_DEFINE_HANDLER(RmsNorm, RmsNormImpl,
                       ffi::Ffi::Bind()
                           .Attr<float>("eps")
                           .Arg<ffi::Buffer<ffi::DataType::F32>>()  // x
                           .Ret<ffi::Buffer<ffi::DataType::F32>>()  // y
);
|
||
ffi::Error RmsNormFwdImpl(float eps, ffi::Buffer<ffi::DataType::F32> x, | ||
ffi::Result<ffi::Buffer<ffi::DataType::F32>> y, | ||
ffi::Result<ffi::Buffer<ffi::DataType::F32>> res) { | ||
auto [totalSize, lastDim] = GetDims(x); | ||
if (lastDim == 0) { | ||
return ffi::Error(ffi::ErrorCode::kInvalidArgument, | ||
"RmsNormFwd input must be an array"); | ||
} | ||
for (int64_t n = 0, idx = 0; n < totalSize; n += lastDim, ++idx) { | ||
res->typed_data()[idx] = ComputeRmsNorm(eps, lastDim, &(x.typed_data()[n]), | ||
&(y->typed_data()[n])); | ||
} | ||
return ffi::Error::Success(); | ||
} | ||
|
||
// Wrap `RmsNormFwdImpl` and specify the interface to XLA. This defines the
// handler symbol `RmsNormFwd`. The binding order must match the parameter
// order of `RmsNormFwdImpl`: the `eps` attribute, the input buffer, then the
// two output buffers.
XLA_FFI_DEFINE_HANDLER(RmsNormFwd, RmsNormFwdImpl,
                       ffi::Ffi::Bind()
                           .Attr<float>("eps")
                           .Arg<ffi::Buffer<ffi::DataType::F32>>()  // x
                           .Ret<ffi::Buffer<ffi::DataType::F32>>()  // y
                           .Ret<ffi::Buffer<ffi::DataType::F32>>()  // res
);
|
||
// Backward pass for a single vector. Computes
//   ct_x[n] = res * ct_y[n] - (sum_k x[k] * ct_y[k]) * res^3 / size * x[n].
//
// Args:
//   size: number of elements in `x`, `ct_y` and `ct_x`; expected to be
//         positive (the result is divided by `size`).
//   res:  residual for this vector; `RmsNormFwdImpl` stores the scale returned
//         by `ComputeRmsNorm` here.
//   x:    the input vector that was passed to the forward pass.
//   ct_y: cotangent of the forward output, length `size`.
//   ct_x: output, cotangent of the input, length `size`.
void ComputeRmsNormBwd(int64_t size, float res, const float *x,
                       const float *ct_y, float *ct_x) {
  // Dot product of the input with the incoming cotangent.
  float ct_res = 0.0f;
  for (int64_t n = 0; n < size; ++n) {
    ct_res += x[n] * ct_y[n];
  }
  const float factor = ct_res * res * res * res / static_cast<float>(size);
  for (int64_t n = 0; n < size; ++n) {
    ct_x[n] = res * ct_y[n] - factor * x[n];
  }
}
|
||
ffi::Error RmsNormBwdImpl(ffi::Buffer<ffi::DataType::F32> res, | ||
ffi::Buffer<ffi::DataType::F32> x, | ||
ffi::Buffer<ffi::DataType::F32> ct_y, | ||
ffi::Result<ffi::Buffer<ffi::DataType::F32>> ct_x) { | ||
auto [totalSize, lastDim] = GetDims(x); | ||
if (lastDim == 0) { | ||
return ffi::Error(ffi::ErrorCode::kInvalidArgument, | ||
"RmsNormBwd inputs must be arrays"); | ||
} | ||
for (int64_t n = 0, idx = 0; n < totalSize; n += lastDim, ++idx) { | ||
ComputeRmsNormBwd(lastDim, res.typed_data()[idx], &(x.typed_data()[n]), | ||
&(ct_y.typed_data()[n]), &(ct_x->typed_data()[n])); | ||
} | ||
return ffi::Error::Success(); | ||
} | ||
|
||
// Wrap `RmsNormBwdImpl` and specify the interface to XLA. This defines the
// handler symbol `RmsNormBwd`. The binding order must match the parameter
// order of `RmsNormBwdImpl`: three input buffers followed by one output
// buffer.
XLA_FFI_DEFINE_HANDLER(RmsNormBwd, RmsNormBwdImpl,
                       ffi::Ffi::Bind()
                           .Arg<ffi::Buffer<ffi::DataType::F32>>()  // res
                           .Arg<ffi::Buffer<ffi::DataType::F32>>()  // x
                           .Arg<ffi::Buffer<ffi::DataType::F32>>()  // ct_y
                           .Ret<ffi::Buffer<ffi::DataType::F32>>()  // ct_x
);
|
||
template <typename T> | ||
nb::capsule EncapsulateFfiCall(T *fn) { | ||
// This check is optional, but it can be useful to catch invalid function | ||
// pointers at compile time. | ||
static_assert(std::is_invocable_r_v<XLA_FFI_Error *, T, XLA_FFI_CallFrame *>, | ||
"Encapsulated function must be and XLA FFI handler"); | ||
return nb::capsule(reinterpret_cast<void *>(fn)); | ||
} | ||
|
||
// Python module definition. Each exported function takes no arguments and
// returns a capsule wrapping the corresponding FFI handler defined above.
NB_MODULE(rms_norm, m) {
  m.def("rms_norm", []() { return EncapsulateFfiCall(RmsNorm); });
  m.def("rms_norm_fwd", []() { return EncapsulateFfiCall(RmsNormFwd); });
  m.def("rms_norm_bwd", []() { return EncapsulateFfiCall(RmsNormBwd); });
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -120,6 +120,7 @@ Operators | |
neg | ||
nextafter | ||
pad | ||
platform_dependent | ||
polygamma | ||
population_count | ||
pow | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters