Skip to content

Commit

Permalink
Xtensa LSTM: (#2150)
Browse files Browse the repository at this point in the history
Enabled LSTM kernel support for XTENSA target.
Updated xtensa_downloads script to use the latest HiFi NN Libraries.

The 8x16 unit test cases have a non-zero zero_point for the 16-bit output.
[tensorflow/lite/micro/kernels/testdata/lstm_test_data.cc#L255-L258](https://github.com/tensorflow/tflite-micro/blob/main/tensorflow/lite/micro/kernels/testdata/lstm_test_data.cc#L255C1-L258C61)

Result of a default run of all the 8x16 unit test cases: FAIL. This is due to the non-zero output offset value.

BUG=#1867
  • Loading branch information
cad-audio authored Jan 2, 2024
1 parent 6576ef7 commit 17d0e7f
Show file tree
Hide file tree
Showing 13 changed files with 2,124 additions and 2,905 deletions.
2 changes: 1 addition & 1 deletion tensorflow/lite/micro/kernels/lstm_eval_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,6 @@ TF_LITE_MICRO_TEST(TestLSTMEvalInt16) {
cell_state_tolerance,
int16_node_contents);
}

#endif // !defined(XTENSA)

TF_LITE_MICRO_TESTS_END
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,6 @@ void TestUnidirectionalLSTMFloat(
TF_LITE_MICRO_TESTS_BEGIN
// TODO(b/230666079) enable below tests for xtensa when the xtensa
// kernel is reconciled with reference kernel
#if !defined(XTENSA)
TF_LITE_MICRO_TEST(TestUnidirectionalLSTMFloat) {
const tflite::testing::LstmEvalCheckData<12, 4, 12> kernel_eval_data =
tflite::testing::Get2X2LstmEvalCheckData();
Expand Down Expand Up @@ -193,5 +192,4 @@ TF_LITE_MICRO_TEST(TestUnidirectionalLSTMInt16) {
kernel_eval_data, hidden_state_tolerance, cell_state_tolerance,
int16_node_contents);
}
#endif // !defined(XTENSA)
TF_LITE_MICRO_TESTS_END
1,564 changes: 419 additions & 1,145 deletions tensorflow/lite/micro/kernels/xtensa/lstm_eval.cc

Large diffs are not rendered by default.

992 changes: 800 additions & 192 deletions tensorflow/lite/micro/kernels/xtensa/lstm_eval.h

Large diffs are not rendered by default.

615 changes: 587 additions & 28 deletions tensorflow/lite/micro/kernels/xtensa/lstm_eval_hifi.cc

Large diffs are not rendered by default.

78 changes: 0 additions & 78 deletions tensorflow/lite/micro/kernels/xtensa/lstm_shared.h

This file was deleted.

2 changes: 1 addition & 1 deletion tensorflow/lite/micro/kernels/xtensa/svdf.cc
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ TfLiteStatus EvalIntegerSvdfHifi(TfLiteContext* context, TfLiteNode* node,
#if defined(HIFI5)
memcpy(state_ptr, state_ptr + 1, num_bytes);
#else
xa_nn_memmove_16(state_ptr, state_ptr + 1, num_bytes);
xa_nn_memmove_16(state_ptr, state_ptr + 1, (num_bytes >> 1));
#endif // defined(HIFI5)

// Note: no need to clear the latest activation, matmul is not accumulative.
Expand Down
104 changes: 100 additions & 4 deletions tensorflow/lite/micro/kernels/xtensa/transpose_conv.cc
Original file line number Diff line number Diff line change
Expand Up @@ -183,19 +183,57 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
// Quantized kernels use an int32 scratch buffer.
if (input->type == kTfLiteInt8) {
TFLITE_DCHECK(context->RequestScratchBufferInArena != nullptr);
#if defined(HIFI3) || defined(HIFI4) || defined(HIFI5)
const int stride_width = params->stride_width;
const int stride_height = params->stride_height;

const int input_height = SizeOfDimension(input, 1);
const int input_width = SizeOfDimension(input, 2);
const int input_depth = SizeOfDimension(input, 3);
const int output_height = height;
const int output_width = width;
int32_t scratch_buffer_size = 0;
scratch_buffer_size = xa_nn_transpose_conv_getsize(
input_height, input_width, input_depth, filter_height, filter_width,
stride_width, stride_height, output_height, output_width, num_channels,
PREC_SYM8S, PREC_ASYM8S);
TFLITE_DCHECK(context->RequestScratchBufferInArena(
context, scratch_buffer_size,
&(data->scratch_buffer_index)) == kTfLiteOk);
#else // #if defined(HIFI3) || defined(HIFI4) || defined(HIFI5)
TFLITE_DCHECK(context->RequestScratchBufferInArena(
context,
GetTensorShape(output).FlatSize() * sizeof(int32_t),
&(data->scratch_buffer_index)) == kTfLiteOk);
#endif
}

// Quantized 16x8 kernels use an int64 scratch buffer.
if (input->type == kTfLiteInt16) {
TFLITE_DCHECK(context->RequestScratchBufferInArena != nullptr);
#if defined(HIFI3) || defined(HIFI4) || defined(HIFI5)
const int stride_width = params->stride_width;
const int stride_height = params->stride_height;

const int input_height = SizeOfDimension(input, 1);
const int input_width = SizeOfDimension(input, 2);
const int input_depth = SizeOfDimension(input, 3);
const int output_height = height;
const int output_width = width;
int32_t scratch_buffer_size = 0;
scratch_buffer_size = xa_nn_transpose_conv_getsize(
input_height, input_width, input_depth, filter_height, filter_width,
stride_width, stride_height, output_height, output_width, num_channels,
PREC_SYM8S, PREC_SYM16S);
TFLITE_DCHECK(context->RequestScratchBufferInArena(
context, scratch_buffer_size,
&(data->scratch_buffer_index)) == kTfLiteOk);
#else // #if defined(HIFI3) || defined(HIFI4) || defined(HIFI5)
TFLITE_DCHECK(context->RequestScratchBufferInArena(
context,
GetTensorShape(output).FlatSize() * sizeof(std::int64_t),
&(data->scratch_buffer_index)) == kTfLiteOk);
#endif // #if defined(HIFI3) || defined(HIFI4) || defined(HIFI5)
}

// All per-channel quantized tensors need valid zero point and scale arrays.
Expand Down Expand Up @@ -282,6 +320,63 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
case kTfLiteInt8: {
int32_t* scratch_buffer = static_cast<int32_t*>(
context->GetScratchBuffer(context, data.scratch_buffer_index));
#if defined(HIFI3) || defined(HIFI4) || defined(HIFI5)
if (bias->type == kTfLiteInt32) {
const RuntimeShape& input_shape = tflite::micro::GetTensorShape(input);
const RuntimeShape& filter_shape =
tflite::micro::GetTensorShape(filter);
const RuntimeShape& output_shape =
tflite::micro::GetTensorShape(output);
const int stride_width = data.params.stride_width;
const int stride_height = data.params.stride_height;
const int pad_width = data.params.padding_values.width;
const int pad_height = data.params.padding_values.height;

const int batches = MatchingDim(input_shape, 0, output_shape, 0);
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);

const int input_height = input_shape.Dims(1);
const int input_width = input_shape.Dims(2);
const int filter_height = filter_shape.Dims(1);
const int filter_width = filter_shape.Dims(2);
const int output_height = output_shape.Dims(1);
const int output_width = output_shape.Dims(2);
const int8_t* input_data = tflite::micro::GetTensorData<int8_t>(input);
const int8_t* filter_data =
tflite::micro::GetTensorData<int8_t>(filter);
const int32_t* bias_data = tflite::micro::GetTensorData<int32_t>(bias);
int8_t* output_data = tflite::micro::GetTensorData<int8_t>(output);

const int num_elements = output_shape.FlatSize();

for (int b = 0; b < batches; b++) {
xa_nn_transpose_conv_sym8sxasym8s(
&output_data[b * output_height * output_width * output_depth],
const_cast<WORD8*>(
&input_data[b * input_height * input_width * input_depth]),
const_cast<WORD8*>(filter_data), const_cast<WORD32*>(bias_data),
stride_width, stride_height, pad_width, pad_height, input_depth,
output_depth, input_height, input_width, filter_height,
filter_width, output_height, output_width, num_elements / batches,
data.params.input_offset, data.params.output_offset,
data.per_channel_output_shift, data.per_channel_output_multiplier,
scratch_buffer);
}
} else {
reference_integer_ops::TransposeConv(
data.params, data.per_channel_output_multiplier,
data.per_channel_output_shift, tflite::micro::GetTensorShape(input),
tflite::micro::GetTensorData<int8_t>(input),
tflite::micro::GetTensorShape(filter),
tflite::micro::GetTensorData<int8_t>(filter),
tflite::micro::GetTensorShape(bias),
tflite::micro::GetTensorData<int32_t>(bias),
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<int8_t>(output),
tflite::micro::GetTensorShape(nullptr), nullptr, scratch_buffer);
}
#else
reference_integer_ops::TransposeConv(
data.params, data.per_channel_output_multiplier,
data.per_channel_output_shift, tflite::micro::GetTensorShape(input),
Expand All @@ -293,6 +388,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<int8_t>(output),
tflite::micro::GetTensorShape(nullptr), nullptr, scratch_buffer);
#endif
break;
}
case kTfLiteInt16: {
Expand All @@ -319,7 +415,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
tflite::micro::GetTensorData<int16_t>(output),
tflite::micro::GetTensorShape(nullptr), nullptr, scratch_buffer);
} else {
#if defined(HIFI3) || defined(HIFI4)
#if defined(HIFI3) || defined(HIFI4) || defined(HIFI5)
const RuntimeShape& input_shape = tflite::micro::GetTensorShape(input);
const RuntimeShape& filter_shape =
tflite::micro::GetTensorShape(filter);
Expand Down Expand Up @@ -359,9 +455,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
output_depth, input_height, input_width, filter_height,
filter_width, output_height, output_width, num_elements / batches,
data.per_channel_output_shift, data.per_channel_output_multiplier,
&scratch_buffer[b * output_height * output_width * output_depth]);
scratch_buffer);
}
#else
#else // #if defined(HIFI3) || defined(HIFI4) || defined(HIFI5)
reference_integer_ops::TransposeConv(
data.params, data.per_channel_output_multiplier,
data.per_channel_output_shift, tflite::micro::GetTensorShape(input),
Expand All @@ -373,7 +469,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
tflite::micro::GetTensorShape(output),
tflite::micro::GetTensorData<int16_t>(output),
tflite::micro::GetTensorShape(nullptr), nullptr, scratch_buffer);
#endif // defined(HIFI3) || defined(HIFI4)
#endif // #if defined(HIFI3) || defined(HIFI4) || defined(HIFI5)
}
break;
}
Expand Down
Loading

0 comments on commit 17d0e7f

Please sign in to comment.