From 0046b832c5e95035a52cc412b68c57025c0caead Mon Sep 17 00:00:00 2001
From: Alex Kogan
Date: Mon, 14 Nov 2022 10:55:59 -0500
Subject: [PATCH] Enable parallel output reordering in MlasReorderOutputNchw()

---
 onnxruntime/contrib_ops/cpu/nchwc_ops.cc      |   2 +-
 onnxruntime/core/mlas/inc/mlas.h              |   3 +-
 onnxruntime/core/mlas/lib/reorder.cpp         | 216 ++++++++++++++-----
 .../test/mlas/unittest/test_conv2d_nchwc.h    |   2 +-
 .../test/mlas/unittest/test_pool2d_nchwc.h    |   2 +-
 .../mlas/unittest/test_reorder_output.cpp     |   2 +-
 6 files changed, 167 insertions(+), 60 deletions(-)

diff --git a/onnxruntime/contrib_ops/cpu/nchwc_ops.cc b/onnxruntime/contrib_ops/cpu/nchwc_ops.cc
index 7c4ee548ccaca..c16aaca5e71ea 100644
--- a/onnxruntime/contrib_ops/cpu/nchwc_ops.cc
+++ b/onnxruntime/contrib_ops/cpu/nchwc_ops.cc
@@ -144,7 +144,7 @@ Status ReorderOutput::Compute(OpKernelContext* context) const {
   if (channels_last_) {
     MlasReorderOutputNhwc(Y_shape.data(), x_data, y_data);
   } else {
-    MlasReorderOutputNchw(Y_shape.data(), x_data, y_data);
+    MlasReorderOutputNchw(Y_shape.data(), x_data, y_data, context->GetOperatorThreadPool());
   }

   return Status::OK();
diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h
index 5b6756e4fb90b..c1a4d16fd44fb 100644
--- a/onnxruntime/core/mlas/inc/mlas.h
+++ b/onnxruntime/core/mlas/inc/mlas.h
@@ -1085,7 +1085,8 @@ MLASCALL
 MlasReorderOutputNchw(
     const int64_t* OutputShape,
     const float* S,
-    float* D
+    float* D,
+    MLAS_THREADPOOL* ThreadPool
     );

 void
diff --git a/onnxruntime/core/mlas/lib/reorder.cpp b/onnxruntime/core/mlas/lib/reorder.cpp
index 0d7fbd97a4a6f..99c1dbac3b692 100644
--- a/onnxruntime/core/mlas/lib/reorder.cpp
+++ b/onnxruntime/core/mlas/lib/reorder.cpp
@@ -1,6 +1,7 @@
 /*++

 Copyright (c) Microsoft Corporation. All rights reserved.
+Copyright (c) 2019, 2022, Oracle and/or its affiliates. All rights reserved.

 Licensed under the MIT License.

@@ -17,6 +18,20 @@ Module Name:

 #include "mlasi.h"

+//
+// Define the parameters to execute segments of a NCHW output reordering
+// operation on worker threads.
+//
+
+struct MLAS_REORDER_OUTPUT_NCHW_BLOCK {
+    ptrdiff_t TargetThreadCount;
+    const float* S;
+    float* D;
+    size_t OutputChannels;
+    size_t OutputSize;
+    size_t TasksCount;
+};
+
 MLAS_FORCEINLINE
 void
 MlasReorderGatherFloat32x4(
@@ -396,25 +411,22 @@ Return Value:
 }

 void
-MLASCALL
-MlasReorderOutputNchw(
-    const int64_t* OutputShape,
-    const float* S,
-    float* D
+MlasReorderOutputNchwThreaded(
+    void* Context,
+    ptrdiff_t Index
     )
 /*++

 Routine Description:

-    This routine reorders an output buffer from NCHWc to NCHW format.
+    This routine is invoked from a worker thread to execute a segment of a
+    NCHW output reordering operation.

 Arguments:

-    OutputShape - Supplies the shape of the output tensor.
-
-    S - Supplies the address of the source tensor.
+    Context - Supplies the pointer to the context for the threaded operation.

-    D - Supplies the address of the destination tensor.
+    Index - Supplies the current index of the threaded operation.
Return Value:

@@ -422,77 +434,171 @@ Return Value:

 --*/
 {
+    const auto* WorkBlock = (MLAS_REORDER_OUTPUT_NCHW_BLOCK*)Context;
+
+    const size_t OutputChannels = WorkBlock->OutputChannels;
+    const size_t OutputSize = WorkBlock->OutputSize;
+    const float* S = WorkBlock->S;
+    float* D = WorkBlock->D;
+
     const size_t BlockSize = MlasNchwcGetBlockSize();
+    const size_t TasksPerBatch = (OutputChannels + BlockSize - 1) / BlockSize;
+    const size_t LastTaskInBatchIndex = TasksPerBatch - 1;

-    const size_t BatchCount = size_t(OutputShape[0]);
-    const size_t OutputChannels = size_t(OutputShape[1]);
-    const size_t OutputSize = size_t(OutputShape[2]) * size_t(OutputShape[3]);
+    //
+    // Compute the range of task indices to use for this thread.
+    //
+
+    size_t TaskStart;
+    size_t TasksRemaining;
+    MlasPartitionWork(Index, WorkBlock->TargetThreadCount, WorkBlock->TasksCount,
+        &TaskStart, &TasksRemaining);
+
+    size_t TaskEnd = TaskStart + TasksRemaining;
+
     //
-    // Transpose NCHWc blocks from the source buffer to the destination buffer.
+    // Rebase the pointers to the source and destination buffers for this thread.
     //

-    for (size_t batch = 0; batch < BatchCount; batch++) {
+    size_t FirstBatchIndex = TaskStart / TasksPerBatch;
+    size_t FirstTaskInBatchIndex = TaskStart % TasksPerBatch;
+    S += BlockSize * OutputSize * (FirstBatchIndex * TasksPerBatch + FirstTaskInBatchIndex);
+    D += OutputSize * (FirstBatchIndex * OutputChannels + BlockSize * FirstTaskInBatchIndex);

-        for (size_t o = OutputChannels; o > 0;) {
+    //
+    // Transpose NCHWc blocks associated with tasks in the range [TaskStart, TaskEnd)
+    // from the source buffer to the destination buffer.
+    //

-            const size_t OutputChannelsThisIteration = std::min(o, BlockSize);
-            const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3);
-            o -= OutputChannelsThisIteration;
+    for (size_t t = TaskStart; t < TaskEnd; t++) {
+        size_t TaskInBatchIndex = t % TasksPerBatch;

-            const float* s = S;
-            float* d = D;
-            size_t OutputSizeRemaining = OutputSize;
+        const size_t OutputChannelsThisIteration = (TaskInBatchIndex < LastTaskInBatchIndex) ?
+            BlockSize : OutputChannels - BlockSize * LastTaskInBatchIndex;
+        const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3);

-            for (; OutputSizeRemaining >= 4; OutputSizeRemaining -= 4) {
+        const float* s = S;
+        float* d = D;
+        size_t OutputSizeRemaining = OutputSize;

-                const float* ss = s;
-                float* dd = d;
-                size_t bc = 0;
+        for (; OutputSizeRemaining >= 4; OutputSizeRemaining -= 4) {

-                for (; bc < AlignedOutputChannelsThisIteration; bc += 4) {
-                    MlasReorderTransposeFloat32x4x4(ss, dd, BlockSize, OutputSize);
-                    ss += 4;
-                    dd += 4 * OutputSize;
-                }
+            const float* ss = s;
+            float* dd = d;
+            size_t bc = 0;

-                for (; bc < OutputChannelsThisIteration; bc += 1) {
-                    MlasReorderGatherFloat32x4(ss, dd, BlockSize);
-                    ss += 1;
-                    dd += OutputSize;
-                }
+            for (; bc < AlignedOutputChannelsThisIteration; bc += 4) {
+                MlasReorderTransposeFloat32x4x4(ss, dd, BlockSize, OutputSize);
+                ss += 4;
+                dd += 4 * OutputSize;
+            }

-                s += 4 * BlockSize;
-                d += 4;
+            for (; bc < OutputChannelsThisIteration; bc += 1) {
+                MlasReorderGatherFloat32x4(ss, dd, BlockSize);
+                ss += 1;
+                dd += OutputSize;
             }

-            for (; OutputSizeRemaining > 0; OutputSizeRemaining--) {
+            s += 4 * BlockSize;
+            d += 4;
+        }

-                const float* ss = s;
-                float* dd = d;
-                size_t bc = 0;
+        for (; OutputSizeRemaining > 0; OutputSizeRemaining--) {

-                for (; bc < AlignedOutputChannelsThisIteration; bc += 4) {
-                    MlasReorderScatterFloat32x4(ss, dd, OutputSize);
-                    ss += 4;
-                    dd += 4 * OutputSize;
-                }
+            const float* ss = s;
+            float* dd = d;
+            size_t bc = 0;

-                for (; bc < OutputChannelsThisIteration; bc += 1) {
-                    *dd = *ss++;
-                    dd += OutputSize;
-                }
+            for (; bc < AlignedOutputChannelsThisIteration; bc += 4) {
+                MlasReorderScatterFloat32x4(ss, dd, OutputSize);
+                ss += 4;
+                dd += 4 * OutputSize;
+            }

-                s += BlockSize;
-                d += 1;
+            for (; bc < OutputChannelsThisIteration; bc += 1) {
+                *dd = *ss++;
+                dd += OutputSize;
             }

-            S += BlockSize * OutputSize;
-            D += OutputChannelsThisIteration * OutputSize;
+            s += BlockSize;
+            d += 1;
         }
+
+        S += BlockSize * OutputSize;
+        D += OutputChannelsThisIteration * OutputSize;
     }
 }
+
+void
+MLASCALL
+MlasReorderOutputNchw(
+    const int64_t* OutputShape,
+    const float* S,
+    float* D,
+    MLAS_THREADPOOL* ThreadPool
+    )
+/*++
+
+Routine Description:
+
+    This routine reorders an output buffer from NCHWc to NCHW format.
+
+Arguments:
+
+    OutputShape - Supplies the shape of the output tensor.
+
+    S - Supplies the address of the source tensor.
+
+    D - Supplies the address of the destination tensor.
+
+    ThreadPool - Supplies the thread pool object to use, else nullptr if the
+        base library threading support should be used.
+
+Return Value:
+
+    None.
+
+--*/
+{
+    MLAS_REORDER_OUTPUT_NCHW_BLOCK WorkBlock;
+
+    //
+    // Capture the NCHW reorder output operation parameters to the work block.
+    //
+
+    WorkBlock.S = S;
+    WorkBlock.D = D;
+
+    WorkBlock.OutputChannels = size_t(OutputShape[1]);
+    WorkBlock.OutputSize = size_t(OutputShape[2]) * size_t(OutputShape[3]);
+
+    const size_t BlockSize = MlasNchwcGetBlockSize();
+    const size_t TasksPerBatch = (WorkBlock.OutputChannels + BlockSize - 1) / BlockSize;
+    const size_t BatchCount = size_t(OutputShape[0]);
+    const size_t TasksCount = BatchCount * TasksPerBatch;
+    WorkBlock.TasksCount = TasksCount;
+
+    //
+    // Schedule the operation across a set of worker threads if the output
+    // tensor is sufficiently large. Limit the number of threads to at most
+    // the number of available tasks.
+    //
+
+    ptrdiff_t TargetThreadCount = 1;
+    const size_t BufferSize = BatchCount * WorkBlock.OutputChannels * WorkBlock.OutputSize;
+    if (BufferSize > 1024 && TasksCount > 1) {
+        TargetThreadCount = MlasGetMaximumThreadCount(ThreadPool);
+        if (size_t(TargetThreadCount) > TasksCount) {
+            TargetThreadCount = ptrdiff_t(TasksCount);
+        }
+    }
+    WorkBlock.TargetThreadCount = TargetThreadCount;
+
+    MlasExecuteThreaded(MlasReorderOutputNchwThreaded, &WorkBlock, TargetThreadCount, ThreadPool);
+}
+
 void
 MLASCALL
 MlasReorderOutputNhwc(
diff --git a/onnxruntime/test/mlas/unittest/test_conv2d_nchwc.h b/onnxruntime/test/mlas/unittest/test_conv2d_nchwc.h
index e516fa8a0b698..c125720668381 100644
--- a/onnxruntime/test/mlas/unittest/test_conv2d_nchwc.h
+++ b/onnxruntime/test/mlas/unittest/test_conv2d_nchwc.h
@@ -137,7 +137,7 @@ class MlasNchwcConv2DTest : public MlasConv2DTest {
     // Reorder the output buffer.
     //

-    MlasReorderOutputNchw(OutputShape, NchwcOutput, Output);
+    MlasReorderOutputNchw(OutputShape, NchwcOutput, Output, MlasConv2DTest::threadpool_);
   }

   const size_t BlockSize = MlasNchwcGetBlockSize();
diff --git a/onnxruntime/test/mlas/unittest/test_pool2d_nchwc.h b/onnxruntime/test/mlas/unittest/test_pool2d_nchwc.h
index 10e3f7f927aba..38ac63a68c843 100644
--- a/onnxruntime/test/mlas/unittest/test_pool2d_nchwc.h
+++ b/onnxruntime/test/mlas/unittest/test_pool2d_nchwc.h
@@ -49,7 +49,7 @@ class MlasNchwcPool2DTest : public MlasPool2DTest {
                    NchwcOutput,
                    nullptr);

-    MlasReorderOutputNchw(OutputShape, NchwcOutput, Output);
+    MlasReorderOutputNchw(OutputShape, NchwcOutput, Output, nullptr);
   }

   MatrixGuardBuffer BufferNchwcInput;
diff --git a/onnxruntime/test/mlas/unittest/test_reorder_output.cpp b/onnxruntime/test/mlas/unittest/test_reorder_output.cpp
index a87d2a0fa5721..704333fd27fa0 100644
--- a/onnxruntime/test/mlas/unittest/test_reorder_output.cpp
+++ b/onnxruntime/test/mlas/unittest/test_reorder_output.cpp
@@ -27,7 +27,7 @@ class MlasReorderOutputTest : public MlasTestBase {
     std::fill_n(Output, OutputBufferElements, -0.5f);
     std::fill_n(OutputReference, OutputBufferElements, -0.5f);

-    MlasReorderOutputNchw(NchwOutputShape, Input, Output);
+    MlasReorderOutputNchw(NchwOutputShape, Input, Output, GetMlasThreadPool());

     ReferenceReorderOutput(BatchCount, Channels, Height, Width, Input, OutputReference, false);
     ASSERT_EQ(memcmp(Output, OutputReference, OutputBufferElements * sizeof(float)), 0)
         << " [Nchw] batch=" << BatchCount << ", channels=" << Channels
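
The patch decomposes the reorder into independent tasks, one per NCHWc channel
block per batch image, so TasksCount = BatchCount * TasksPerBatch, and
MlasPartitionWork splits those tasks across the worker threads. The following
self-contained sketch models that decomposition; PartitionWork is a stand-in
written under the assumption that MlasPartitionWork assigns each thread
TotalWork / ThreadCount tasks and gives the first TotalWork % ThreadCount
threads one extra, and the block size and shape constants are hypothetical.

    #include <cstddef>
    #include <cstdio>

    // Stand-in for MlasPartitionWork (assumed even-split semantics).
    static void PartitionWork(ptrdiff_t ThreadId, ptrdiff_t ThreadCount,
                              size_t TotalWork, size_t* WorkIndex,
                              size_t* WorkRemaining) {
        const size_t WorkPerThread = TotalWork / ThreadCount;
        const size_t WorkExtra = TotalWork % ThreadCount;
        if (size_t(ThreadId) < WorkExtra) {
            *WorkIndex = (WorkPerThread + 1) * ThreadId;
            *WorkRemaining = WorkPerThread + 1;
        } else {
            *WorkIndex = WorkPerThread * ThreadId + WorkExtra;
            *WorkRemaining = WorkPerThread;
        }
    }

    int main() {
        const size_t BlockSize = 8;        // hypothetical NCHWc block size
        const size_t OutputChannels = 20;  // 3 tasks per batch (8 + 8 + 4)
        const size_t BatchCount = 2;
        const size_t TasksPerBatch = (OutputChannels + BlockSize - 1) / BlockSize;
        const size_t TasksCount = BatchCount * TasksPerBatch;

        for (ptrdiff_t Index = 0; Index < 4; Index++) {   // 4 worker threads
            size_t TaskStart, TasksRemaining;
            PartitionWork(Index, 4, TasksCount, &TaskStart, &TasksRemaining);
            for (size_t t = TaskStart; t < TaskStart + TasksRemaining; t++) {
                // Same decomposition as MlasReorderOutputNchwThreaded.
                printf("thread %td: task %zu -> batch %zu, channel block %zu\n",
                       Index, t, t / TasksPerBatch, t % TasksPerBatch);
            }
        }
        return 0;
    }

With the constants above, the 6 tasks spread as two each for threads 0 and 1
and one each for threads 2 and 3, matching the rebase math in the patch.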
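Each task reads a private slab of BlockSize * OutputSize source floats and
writes destination channel planes disjoint from every other task's, which is
why the worker threads need no synchronization. A scalar (non-SIMD) reference
for the data movement one task performs, with hypothetical names:

    #include <cstddef>

    // One reorder task: scatter OutputSize pixels of up to BlockSize
    // interleaved channels (NCHWc) into contiguous channel planes (NCHW).
    void ReorderTaskScalar(const float* S, float* D, size_t BlockSize,
                           size_t OutputSize, size_t ChannelsThisTask) {
        for (size_t i = 0; i < OutputSize; i++) {
            for (size_t c = 0; c < ChannelsThisTask; c++) {
                D[c * OutputSize + i] = S[i * BlockSize + c];
            }
        }
    }

The MlasReorderTransposeFloat32x4x4, MlasReorderGatherFloat32x4, and
MlasReorderScatterFloat32x4 paths in the patch vectorize this same loop nest,
handling four pixels and/or four channels at a time.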