[Hexagon] Add HVX quant conv2d implementation
This patch adds a new HVX intrinsic implementation to perform quantized
convolution.

It assumes that the qnn.conv2d Relay op is not canonicalized, so all of the
quantization parameters (scales and zero points) are passed into the
intrinsic implementation.

It also uses the fixed-point computation function defined in the Hexagon
TOPI utils to compute a combined fixed-point scale, which is used to
perform the final requantization before returning the quantized output.
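For context, here is a minimal sketch of that requantization scheme (illustrative only; the helper names below are made up, and the patch's own fixed-point routine lives in the Hexagon TOPI utils). The combined scale act_scale * wgt_scale / out_scale is folded into an integer mantissa plus a right-shift, and each int32 accumulator is scaled, rounded, offset by the output zero point, and saturated:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Hypothetical helper: express 'scale' as mantissa * 2^-shift with a signed
// 16-bit mantissa. Assumes 0 < scale < 1, as is typical for combined scales.
inline void to_fixed_point(float scale, int16_t* mantissa, int* shift) {
  int exp = 0;
  float frac = std::frexp(scale, &exp);  // scale == frac * 2^exp, frac in [0.5, 1)
  *mantissa = static_cast<int16_t>(std::min(32767.0f, std::round(frac * (1 << 15))));
  *shift = 15 - exp;  // scale ~= mantissa * 2^-(15 - exp)
}

// Hypothetical requantization of one int32 accumulator to uint8 with
// round-to-nearest; assumes shift >= 1.
inline uint8_t requantize(int32_t acc, int16_t mantissa, int shift, int32_t out_zp) {
  int64_t scaled = static_cast<int64_t>(acc) * mantissa;
  int64_t rounded = (scaled + (int64_t{1} << (shift - 1))) >> shift;
  return static_cast<uint8_t>(std::clamp<int64_t>(rounded + out_zp, 0, 255));
}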
quic-sanirudh committed Nov 4, 2022
1 parent b1a099b commit 7048369
Showing 10 changed files with 1,171 additions and 206 deletions.
9 changes: 9 additions & 0 deletions cmake/modules/Hexagon.cmake
@@ -178,6 +178,15 @@ if(BUILD_FOR_HEXAGON)
"${TVMRT_SOURCE_DIR}/hexagon/ops/*.cc"
)

include_directories(
  "${TVMRT_SOURCE_DIR}/hexagon/ops"
)

set_source_files_properties(
  "${TVMRT_SOURCE_DIR}/hexagon/ops/conv2d_quant_hvx.cc"
  PROPERTIES COMPILE_FLAGS "-mhvx"
)

set_source_files_properties(
  "${TVMRT_SOURCE_DIR}/hexagon/ops/conv2d_fp16_hvx.cc"
  PROPERTIES COMPILE_FLAGS "-mhvx"
src/runtime/hexagon/ops/conv2d.h
@@ -20,6 +20,7 @@
#include <tvm/runtime/c_runtime_api.h>
#include <tvm/runtime/device_api.h>

#include <algorithm>
#include <cassert>

#ifndef TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_
@@ -28,6 +29,7 @@
namespace tvm {
namespace runtime {
namespace hexagon {
namespace conv_utils {
static constexpr auto hexagon_device = DLDevice{static_cast<DLDeviceType>(kDLHexagon), 0};

// Standalone DLTensor: the standalone-ness means that this object owns the shape
@@ -75,15 +77,31 @@ inline void* to_ptr(uintptr_t v) { return reinterpret_cast<void*>(v); }

inline uintptr_t to_uint(void* ptr) { return reinterpret_cast<uintptr_t>(ptr); }

constexpr int xyc_to_sm_16b(int y, int x, int c) {
inline constexpr int yxc_to_sm_16b(int y, int x, int c) {
  // Map y,x,c coordinates within a block to the offset (in 16-bit elements)
  // from the beginning of the block in spatial-major layout.
  // 10-bit spatial mask: yyyxcccccx
  assert(y >= 0 && x >= 0 && c >= 0);
  return y << 7 | (x & 2) << 5 | c << 1 | (x & 1);
}

constexpr int hwio_to_sm_16b(int width, int y, int x, int i, int o) {
inline constexpr int yxc_to_sm_8b(int y, int x, int c) {
  // Map y,x,c coordinates within a block to the offset (in 8-bit elements)
  // from the beginning of the block in spatial-major layout.
  // 11-bit spatial mask: yyyxxxccccc
  return y << 8 | x << 5 | c;
}

inline constexpr int hwio_to_sm_8b(int width, int y, int x, int i, int o) {
  // Map y,x,i,o coordinates within a chunk (assuming the origin at the
  // top-left spatial corner) to the offset (in 8-bit elements) from the
  // beginning of the chunk in spatial-major layout.
  // Spatial mask: p..piiioooooii, where p..p are position bits.
  int p = y * width + (width - 1 - x);
  return p << 10 | (i & 0x1c) << 5 | o << 2 | (i & 3);
}
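
As a quick compile-time sanity check of the masks above (a standalone sketch with local copies of the mappings, so it compiles without this header):

constexpr int yxc16(int y, int x, int c) { return y << 7 | (x & 2) << 5 | c << 1 | (x & 1); }
constexpr int yxc8(int y, int x, int c) { return y << 8 | x << 5 | c; }
constexpr int hwio8(int w, int y, int x, int i, int o) {
  int p = y * w + (w - 1 - x);
  return p << 10 | (i & 0x1c) << 5 | o << 2 | (i & 3);
}
// (y=1, x=2, c=3), 16-bit layout: 128 | 64 | 6 | 0 == 198.
static_assert(yxc16(1, 2, 3) == 198, "yyyxcccccx");
// (y=1, x=2, c=3), 8-bit layout: 256 | 64 | 3 == 323.
static_assert(yxc8(1, 2, 3) == 323, "yyyxxxccccc");
// (width=4, y=0, x=0, i=5, o=2): p == 3, so 3072 | 128 | 8 | 1 == 3209.
static_assert(hwio8(4, 0, 0, 5, 2) == 3209, "chunk layout");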

inline constexpr int hwio_to_sm_16b(int width, int y, int x, int i, int o) {
  // Map y,x,i,o coordinates within a chunk (assuming the origin at the
  // top-left spatial corner) to the offset (in 16-bit elements) from the
  // beginning of the chunk in spatial-major layout.
@@ -123,6 +141,10 @@ inline uintptr_t hwio_at(const DLTensor& f, int y, int x, int i, int o) {
* The input is mapped into the below mentioned layout (notation similar to index map used for
* transform layout):
*
* For uint8_t type
* lambda n, h, w, c: n, h//8, w//8, c//32, AXIS_SEPARATOR, h%8, w%8, c%32
*
* For uint16_t type
* lambda n, h, w, c: n, h//8, w//4, c//32, AXIS_SEPARATOR, h%8, (w%4)//2, c%32, w%2
*
* where AXIS_SEPARATOR represents split up in the physical layout
@@ -133,7 +155,48 @@ inline uintptr_t hwio_at(const DLTensor& f, int y, int x, int i, int o) {
* @param width
* @param depth
*/
void blockize_hwc_16b(void* out, void* inp_flat, int height, int width, int depth);
template <typename T, int block_height, int block_width, int block_depth>
void blockize_hwc(void* out, void* inp_flat, int height, int width, int depth) {
  int (*index_func)(int, int, int);
  if constexpr (std::is_same_v<T, uint8_t>)
    index_func = yxc_to_sm_8b;
  else if constexpr (std::is_same_v<T, uint16_t>)
    index_func = yxc_to_sm_16b;
  else
    LOG_ERROR << "blockize_hwc is only supported for uint8_t and uint16_t types";

  auto inp_data = static_cast<T*>(inp_flat);
  auto out_data = static_cast<uintptr_t*>(out);
  const int stride_x = depth;
  const int stride_y = stride_x * width;

  for (int cy = 0; cy < height; cy += block_height) {
    for (int cx = 0; cx < width; cx += block_width) {
      for (int cc = 0; cc < depth; cc += block_depth) {
        auto block = reinterpret_cast<T*>(*out_data++);
        int max_y = std::min(block_height, height - cy);
        int max_x = std::min(block_width, width - cx);
        int max_c = std::min(block_depth, depth - cc);
        for (int y = 0; y < max_y; ++y) {
          for (int x = 0; x < max_x; ++x) {
            for (int c = 0; c < max_c; ++c) {
              block[index_func(y, x, c)] =
                  inp_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)];
            }
            for (int c = max_c; c < block_depth; ++c) block[index_func(y, x, c)] = 0;
          }
          for (int x = max_x; x < block_width; ++x) {
            for (int c = 0; c < block_depth; ++c) block[index_func(y, x, c)] = 0;
          }
        }

        for (int y = max_y; y < block_height; ++y)
          for (int x = 0; x < block_width; ++x)
            for (int c = 0; c < block_depth; ++c) block[index_func(y, x, c)] = 0;
      }  // cc
    }  // cx
  }  // cy
}
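
As a worked instance of the uint16_t index map shown above: element (n=0, h=9, w=5, c=40) lands in block (0, 9//8, 5//4, 40//32) = (0, 1, 1, 1), and its intra-block coordinates (h%8, (w%4)//2, c%32, w%2) = (1, 0, 8, 1) correspond to the 16-bit offset yxc_to_sm_16b(1, 1, 8) = 128 + 0 + 16 + 1 = 145.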

/**
 * @brief Convert back from non-contiguous layout to a flat layout
@@ -144,7 +207,42 @@ void blockize_hwc_16b(void* out, void* inp_flat, int height, int width, int depth);
 * @param width
 * @param depth
 */
void deblockize_hwc_16b(void* out_flat, void* inp, int height, int width, int depth);
template <typename T, int block_height, int block_width, int block_depth>
void deblockize_hwc(void* out_flat, void* inp, int height, int width, int depth) {
  int (*index_func)(int, int, int);
  if constexpr (std::is_same_v<T, uint8_t>)
    index_func = yxc_to_sm_8b;
  else if constexpr (std::is_same_v<T, uint16_t>)
    index_func = yxc_to_sm_16b;
  else
    LOG_ERROR << "deblockize_hwc is only supported for uint8_t and uint16_t types";

  uintptr_t* inp_data = static_cast<uintptr_t*>(inp);
  T* out_data = static_cast<T*>(out_flat);
  const int stride_x = depth;
  const int stride_y = stride_x * width;

  for (int cy = 0; cy < height; cy += block_height) {
    for (int cx = 0; cx < width; cx += block_width) {
      for (int cc = 0; cc < depth; cc += block_depth) {
        auto block = reinterpret_cast<T*>(*inp_data);
        int max_y = std::min(block_height, height - cy);
        int max_x = std::min(block_width, width - cx);
        int max_c = std::min(block_depth, depth - cc);
        for (int y = 0; y < max_y; ++y) {
          for (int x = 0; x < max_x; ++x) {
            for (int c = 0; c < max_c; ++c) {
              out_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)] =
                  block[index_func(y, x, c)];
            }
          }
        }

        inp_data++;
      }
    }
  }
}
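
A minimal usage sketch of the blockize/deblockize pair (hypothetical driver: the runtime keeps the block table and block storage in VTCM via prepare_nhwc below, whereas plain heap buffers stand in here):

#include <cstdint>
#include <vector>

void roundtrip_example() {
  constexpr int H = 16, W = 16, C = 32;     // four 8x8x32 uint8 blocks in total
  std::vector<uint8_t> flat(H * W * C, 7);
  std::vector<uint8_t> storage(H * W * C);  // backing store for the blocks
  std::vector<uintptr_t> table;             // table of block pointers
  for (int b = 0; b < 4; ++b)
    table.push_back(reinterpret_cast<uintptr_t>(storage.data() + b * 8 * 8 * 32));

  using namespace tvm::runtime::hexagon::conv_utils;
  blockize_hwc<uint8_t, 8, 8, 32>(table.data(), flat.data(), H, W, C);

  std::vector<uint8_t> restored(H * W * C);
  deblockize_hwc<uint8_t, 8, 8, 32>(restored.data(), table.data(), H, W, C);
  // restored now equals flat; ragged edges (none here) would be zero-padded
  // on blockize and simply skipped on deblockize.
}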

/**
* @brief Convert the layout of weights from flat to "chunked". The term chunked is explained below:
@@ -175,22 +273,50 @@ void deblockize_hwc_16b(void* out_flat, void* inp, int height, int width, int depth);
*/
void chunkify_hwio_16b(void** out_ptr, int out_ptr_size, void* out, void* inp, int height,
                       int width, int idepth, int odepth);
void chunkify_hwio_8b(void** out_ptr, int out_ptr_size, void* out, void* inp, int height, int width,
                      int idepth, int odepth);

template <typename T, int block_height, int block_width, int block_depth>
SDLTensor<4> prepare_nhwc(tvm::runtime::DeviceAPI* device_api, const DLTensor* nhwc_flat,
                          bool copy_data);
                          bool copy_data) {
  tvm::runtime::String vtcm_scope = "global.vtcm";

int calculate_num_weight_chunks(int64_t* shape_hwio);

  // Allocate blocks for activations. We will use the block pointers
  // directly from the allocated area.
  int n = nhwc_flat->shape[0];
  int h = round_up(nhwc_flat->shape[1], block_height);
  int w = round_up(nhwc_flat->shape[2], block_width);
  int c = round_up(nhwc_flat->shape[3], block_depth);
  int64_t shape_2d[2] = {(n * h * w * c) / (block_height * block_width * block_depth),
                         block_height * block_width * block_depth};
  void* nhwc_vtcm =
      device_api->AllocDataSpace(hexagon_device, 2, shape_2d, nhwc_flat->dtype, vtcm_scope);
  if (copy_data) {
    blockize_hwc<T, block_height, block_width, block_depth>(
        nhwc_vtcm, nhwc_flat->data, nhwc_flat->shape[1], nhwc_flat->shape[2], nhwc_flat->shape[3]);
  }

  return SDLTensor<4>(nhwc_vtcm, nhwc_flat->dtype, nhwc_vtcm,
                      {n, h / block_height, w / block_width, c / block_depth});
}
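
For concreteness (illustrative numbers): a 1x56x56x64 uint8 activation with 8x8x32 blocks rounds to n=1, h=56, w=56, c=64, so shape_2d = {(1*56*56*64)/2048, 2048} = {98, 2048}, i.e. 98 two-kilobyte VTCM blocks, and the returned SDLTensor has shape {1, 7, 7, 2}.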

int calculate_num_weight_chunks(int64_t* shape_hwio, int chunk_height, int chunk_width,
                                int chunk_in_channel, int chunk_out_channel);
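
Presumably (the definition lives in the .cc file; this is inferred from the call sites below) the count is the product of the ceil-divisions along each axis: a 3x3x64x64 HWIO filter with 8x4x32x32 chunks would need ceil(3/8) * ceil(3/4) * ceil(64/32) * ceil(64/32) = 1 * 1 * 2 * 2 = 4 chunks.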

SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat,
                          int num_chunks, void** ptr_table);

SDLTensor<4> prepare_hwio_8b(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat,
                             int num_chunks, void** ptr_table, int wgt_zp = 0);

template <size_t N>
void release(tvm::runtime::DeviceAPI* device_api, const SDLTensor<N>& tensor) {
  if (auto* data_space = tensor.GetDataSpace()) {
    device_api->FreeDataSpace(hexagon_device, data_space);
  }
}

} // namespace conv_utils
} // namespace hexagon
} // namespace runtime
} // namespace tvm
57 changes: 31 additions & 26 deletions src/runtime/hexagon/ops/conv2d_fp16_hvx.cc
@@ -27,7 +27,7 @@
#include <cassert>
#include <cinttypes>

#include "tvm/runtime/hexagon/ops/conv2d.h"
#include "conv2d.h"

// Current limitations:
// - N in NHWC must be 1
@@ -68,7 +68,7 @@ namespace hexagon {
*/
static inline uint16_t* getElementPtr(int block_out_y, int block_out_x, int block_out_c, int yi,
                                      int xio, int ci, int xii, const DLTensor& tensor) {
  auto block_ptr = nhwc_at(tensor, 0, block_out_y, block_out_x, block_out_c);
  auto block_ptr = conv_utils::nhwc_at(tensor, 0, block_out_y, block_out_x, block_out_c);
  auto block_offset = yi * 128 + xio * 64 + ci * 2 + xii;
  auto first_element_ptr = reinterpret_cast<uint16_t*>(block_ptr);
  return first_element_ptr + block_offset;
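
Note that block_offset is yxc_to_sm_16b with the width coordinate pre-split into halves: for x = 2*xio + xii, the term (x & 2) << 5 equals xio * 64 and x & 1 equals xii, so yi * 128 + xio * 64 + ci * 2 + xii walks the same yyyxcccccx layout described in conv2d.h.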
@@ -279,10 +279,10 @@ void conv_layer_fp16_hvx(DLTensor& cr_out, const DLTensor& cr_act, // NOLINT(*)
    }
    int fx = (fw < wgt_chunk_thin_width) ? fw : ((fw - wgt_chunk_thin_width) % 4);
    int fy = fh % 8;
    for (int c = 0; c < round_up(filt_idepth, 2); c += 2) {
    for (int c = 0; c < conv_utils::round_up(filt_idepth, 2); c += 2) {
      int out_act_cc = c / 32;
      int ci = c % 32;
      auto wgt_chunk = hwio_at(cr_filt, fch, fcw, out_act_cc, out_c);
      auto wgt_chunk = conv_utils::hwio_at(cr_filt, fch, fcw, out_act_cc, out_c);

      // Find weight chunk offset ptr
      int max_x = (fcw == 0) ? wgt_chunk_thin_width : 4;
@@ -306,7 +306,7 @@ void conv_layer_fp16_hvx(DLTensor& cr_out, const DLTensor& cr_act, // NOLINT(*)
              true_wo, ci, true_wi, cr_act);
      HVX_Vector act_vec = getInputVector(act_element_ptr);

      auto wgt_chunk_offset = hwio_to_sm_16b(max_x, fy, fx, ci, 0);
      auto wgt_chunk_offset = conv_utils::hwio_to_sm_16b(max_x, fy, fx, ci, 0);
      auto base_chunk_ptr = reinterpret_cast<uint16_t*>(wgt_chunk);
      auto chunk_ptr = base_chunk_ptr + wgt_chunk_offset;
@@ -404,7 +404,7 @@ void conv_layer_fp16_hvx(DLTensor& cr_out, const DLTensor& cr_act, // NOLINT(*)

int conv2d_packed_fp16(TVMValue* args, int* type_codes, int num_args, TVMValue* out_val,
                       int out_code, void* res_handle) {
  namespace hexagonrt = tvm::runtime::hexagon;
  namespace conv_utils = tvm::runtime::hexagon::conv_utils;
  ICHECK_EQ(num_args, 7) << "Unexpected number of arguments";
  ICHECK_EQ(type_codes[0], kTVMDLTensorHandle)
      << "First argument is expected to be the input tensor";  // Input activations
@@ -440,50 +440,55 @@ int conv2d_packed_fp16(TVMValue* args, int* type_codes, int num_args, TVMValue*
      << wgt_flat->shape[2] << "x" << wgt_flat->shape[3] << ", pad_top=" << pad_top
      << ", pad_left=" << pad_left;

  auto* device_api = tvm::runtime::DeviceAPI::Get(hexagonrt::hexagon_device, false);
  auto* device_api = tvm::runtime::DeviceAPI::Get(conv_utils::hexagon_device, false);
  ICHECK(device_api != nullptr);
  tvm::runtime::String vtcm_scope = "global.vtcm";

  auto act_vtcm = hexagonrt::prepare_nhwc(device_api, act_flat, /*copy_data=*/true);
  auto act_vtcm =
      conv_utils::prepare_nhwc<uint16_t, 8, 4, 32>(device_api, act_flat, /*copy_data=*/true);

  ICHECK_NE(wgt_flat->shape[0], 0) << "Weights height should not be zero";
  ICHECK_NE(wgt_flat->shape[1], 0) << "Weights width should not be zero";
  ICHECK_NE(wgt_flat->shape[2], 0) << "Weights input channels should not be zero";
  ICHECK_NE(wgt_flat->shape[3], 0) << "Weights output channels should not be zero";
  int num_wgt_chunks = hexagonrt::calculate_num_weight_chunks(wgt_flat->shape);
  int num_wgt_chunks = conv_utils::calculate_num_weight_chunks(
      wgt_flat->shape, /* chunk_height */ 8, /* chunk_width */ 4, /* chunk_in_channel */ 32,
      /* chunk_out_channel */ 32);

  LOG_INFO << "num_wgt_chunks: " << num_wgt_chunks;
  auto wgt_ptr_table =
      reinterpret_cast<void**>(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t)));
  auto wgt_vtcm = hexagonrt::prepare_hwio(device_api, wgt_flat, num_wgt_chunks, wgt_ptr_table);
  auto wgt_vtcm = conv_utils::prepare_hwio(device_api, wgt_flat, num_wgt_chunks, wgt_ptr_table);

  auto out_vtcm = hexagonrt::prepare_nhwc(device_api, out_flat, /*copy_data=*/false);
  auto out_vtcm =
      conv_utils::prepare_nhwc<uint16_t, 8, 4, 32>(device_api, out_flat, /*copy_data=*/false);

  // Prepare zero_block
  int64_t block_nbytes = 2048;
  void* zero_block = device_api->AllocDataSpace(hexagonrt::hexagon_device, 1, &block_nbytes,
  void* zero_block = device_api->AllocDataSpace(conv_utils::hexagon_device, 1, &block_nbytes,
                                                tvm::runtime::DataType::UInt(8), vtcm_scope);
  memset(zero_block, 0, 2048);

  // FIXME: Setting bias to zero_block: this works for up to 256 output channels.
  auto bias_flat =
      hexagonrt::SDLTensor<1>(zero_block, wgt_flat->dtype, zero_block, &wgt_flat->shape[3]);
  auto act_shape = hexagonrt::SDLTensor<4>(nullptr, act_flat->dtype, nullptr, act_flat->shape);
  auto filt_shape = hexagonrt::SDLTensor<4>(nullptr, wgt_flat->dtype, nullptr, wgt_flat->shape);
  auto pad_shape = hexagonrt::SDLTensor<2>(nullptr, act_flat->dtype, nullptr, {pad_top, pad_left});
  auto out_shape = hexagonrt::SDLTensor<4>(nullptr, out_flat->dtype, nullptr, out_flat->shape);
      conv_utils::SDLTensor<1>(zero_block, wgt_flat->dtype, zero_block, &wgt_flat->shape[3]);
  auto act_shape = conv_utils::SDLTensor<4>(nullptr, act_flat->dtype, nullptr, act_flat->shape);
  auto filt_shape = conv_utils::SDLTensor<4>(nullptr, wgt_flat->dtype, nullptr, wgt_flat->shape);
  auto pad_shape = conv_utils::SDLTensor<2>(nullptr, act_flat->dtype, nullptr, {pad_top, pad_left});
  auto out_shape = conv_utils::SDLTensor<4>(nullptr, out_flat->dtype, nullptr, out_flat->shape);
  bool relu = false;

  hexagonrt::conv_layer_fp16_hvx(out_vtcm, act_vtcm, wgt_vtcm, out_shape, act_shape, bias_flat,
                                 filt_shape, pad_shape, relu, stride_h, stride_w,
                                 hexagonrt::to_uint(zero_block));
  tvm::runtime::hexagon::conv_layer_fp16_hvx(out_vtcm, act_vtcm, wgt_vtcm, out_shape, act_shape,
                                             bias_flat, filt_shape, pad_shape, relu, stride_h,
                                             stride_w, conv_utils::to_uint(zero_block));

  hexagonrt::deblockize_hwc_16b(out_flat->data, out_vtcm.data, out_flat->shape[1],
                                out_flat->shape[2], out_flat->shape[3]);
  conv_utils::deblockize_hwc<uint16_t, 8, 4, 32>(out_flat->data, out_vtcm.data, out_flat->shape[1],
                                                 out_flat->shape[2], out_flat->shape[3]);

  device_api->FreeDataSpace(hexagonrt::hexagon_device, zero_block);
  hexagonrt::release(device_api, out_vtcm);
  hexagonrt::release(device_api, wgt_vtcm);
  hexagonrt::release(device_api, act_vtcm);
  device_api->FreeDataSpace(conv_utils::hexagon_device, zero_block);
  conv_utils::release(device_api, out_vtcm);
  conv_utils::release(device_api, wgt_vtcm);
  conv_utils::release(device_api, act_vtcm);

  return 0;
}