Enable parallel output reordering in MlasReorderOutputNchw() (microsoft#13643)

### Description
This PR speeds up the output reordering operation (as implemented in
[MlasReorderOutputNchw](https://github.com/microsoft/onnxruntime/blob/9954454c65086c49b7c00f83b23ada76975f3546/onnxruntime/core/mlas/lib/reorder.cpp#L400))
by replacing the sequential implementation with a parallelized one.
Parallelization is achieved through the existing
[TryBatchParallelFor](https://github.com/microsoft/onnxruntime/blob/9954454c65086c49b7c00f83b23ada76975f3546/include/onnxruntime/core/platform/threadpool.h#L284)
construct.
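Outside of onnxruntime, the batching pattern behind TryBatchParallelFor can be sketched with plain `std::thread`. This is an illustrative stand-in only: the helper name `batch_parallel_for` and its balanced partition are assumptions for the sketch, not the actual MLAS or threadpool code.

```cpp
#include <algorithm>
#include <cstddef>
#include <functional>
#include <thread>
#include <vector>

// Illustrative stand-in for TryBatchParallelFor: split [0, total) into one
// contiguous batch per worker and run the batches concurrently.
// (Hypothetical helper; the real construct lives in onnxruntime's ThreadPool.)
void batch_parallel_for(std::ptrdiff_t total, std::ptrdiff_t num_workers,
                        const std::function<void(std::ptrdiff_t)>& fn) {
    num_workers = std::max<std::ptrdiff_t>(1, std::min(num_workers, total));
    std::vector<std::thread> workers;
    for (std::ptrdiff_t w = 0; w < num_workers; ++w) {
        // Balanced partition: the first (total % num_workers) workers
        // receive one extra iteration each.
        const std::ptrdiff_t base = total / num_workers;
        const std::ptrdiff_t extra = total % num_workers;
        const std::ptrdiff_t begin = w * base + std::min(w, extra);
        const std::ptrdiff_t end = begin + base + (w < extra ? 1 : 0);
        workers.emplace_back([=, &fn] {
            for (std::ptrdiff_t i = begin; i < end; ++i) fn(i);
        });
    }
    for (auto& t : workers) t.join();
}
```

Each worker owns a disjoint index range, so the functor may write to per-index output slots without synchronization, which mirrors how the reordering tasks below touch disjoint regions of the destination buffer.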



### Motivation and Context
The output reordering operation is executed frequently in image
processing models.
Its implementation is easy to parallelize, and therefore to speed up,
when executed on a multi-core machine.
The speedup achieved by this PR varies and depends on the actual input
shape.

The table below summarizes the results of some of the experiments I
conducted on a 16-core VM running on an AMD EPYC 7742 64-core processor.
The experiment is based on the existing [unit
test](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/test/mlas/unittest/test_reorder_output.cpp)
for the output reordering operation. The first column gives the shape of
the output as BatchCount:Channels:Height:Width; the remaining columns give
the average latency (in µs, over 100 runs) for each tested variant.
Specifically, I compare the sequential baseline (second column) against
the parallelized variants, each using 1, 2, 4, 8, or 16 worker threads
(as specified in [the constructor of the threadpool
object](https://github.com/microsoft/onnxruntime/blob/9954454c65086c49b7c00f83b23ada76975f3546/onnxruntime/test/mlas/unittest/test_main.cpp#L12)).
The numbers in parentheses give the speedup over the baseline.

| Input | baseline | 1 Thread | 2 Threads | 4 Threads | 8 Threads | 16 Threads |
| ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- |
| 1:1:112:112 | 20.8 | 21.5 (x0.97) | 21.9 (x0.95) | 22.2 (x0.94) | 22.5 (x0.92) | 23.0 (x0.90) |
| 1:128:160:84 | 540.4 | 712.5 (x0.76) | 404.0 (x1.34) | 327.8 (x1.65) | 377.9 (x1.43) | 371.8 (x1.45) |
| 13:240:4:314 | 1484.0 | 1851.1 (x0.80) | 1080.9 (x1.37) | 570.2 (x2.60) | 531.8 (x2.79) | 511.2 (x2.90) |
| 13:96:4:314 | 471.0 | 679.9 (x0.69) | 427.2 (x1.10) | 372.1 (x1.27) | 445.5 (x1.06) | 428.5 (x1.10) |
| 1:64:320:168 | 1215.1 | 1497.8 (x0.81) | 863.8 (x1.41) | 456.7 (x2.66) | 435.7 (x2.79) | 462.5 (x2.63) |
| 30:240:4:140 | 1711.5 | 2181.4 (x0.78) | 1182.6 (x1.45) | 657.4 (x2.60) | 592.5 (x2.89) | 578.0 (x2.96) |
| 30:336:4:140 | 2432.5 | 3039.2 (x0.80) | 1695.6 (x1.43) | 920.7 (x2.64) | 817.1 (x2.98) | 819.2 (x2.97) |
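Each latency figure in the table is an average over 100 runs. A minimal timing harness of that shape could look like the following; `average_latency_us` is a hypothetical helper illustrating the methodology, not the actual MLAS unit-test code.

```cpp
#include <chrono>
#include <cstddef>
#include <functional>
#include <ratio>

// Average wall-clock latency of fn over `runs` invocations, in microseconds.
// (Hypothetical helper mirroring the "average over 100 runs" methodology.)
double average_latency_us(const std::function<void()>& fn, std::size_t runs = 100) {
    using clock = std::chrono::steady_clock;
    fn();  // warm-up run, excluded from the average
    const auto start = clock::now();
    for (std::size_t i = 0; i < runs; ++i) fn();
    const std::chrono::duration<double, std::micro> elapsed = clock::now() - start;
    return elapsed.count() / double(runs);
}
```

A steady (monotonic) clock is used so that the measurement is unaffected by system clock adjustments during the run.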

The initial drop between the baseline and the variant using a single
worker thread can be attributed to the overhead of invoking the
reordering loop as a functor in TryBatchParallelFor. This overhead is
compensated for by the gains of parallel processing as the number of
worker threads increases.
sakogan authored and preetha-intel committed Feb 15, 2023
1 parent 8a7e605 commit 061b9fd
Showing 6 changed files with 164 additions and 60 deletions.
2 changes: 1 addition & 1 deletion onnxruntime/contrib_ops/cpu/nchwc_ops.cc
@@ -144,7 +144,7 @@ Status ReorderOutput::Compute(OpKernelContext* context) const {
if (channels_last_) {
MlasReorderOutputNhwc(Y_shape.data(), x_data, y_data);
} else {
-    MlasReorderOutputNchw(Y_shape.data(), x_data, y_data);
+    MlasReorderOutputNchw(Y_shape.data(), x_data, y_data, context->GetOperatorThreadPool());
}

return Status::OK();
3 changes: 2 additions & 1 deletion onnxruntime/core/mlas/inc/mlas.h
@@ -1085,7 +1085,8 @@ MLASCALL
MlasReorderOutputNchw(
const int64_t* OutputShape,
const float* S,
-    float* D
+    float* D,
+    MLAS_THREADPOOL* ThreadPool
);

void
213 changes: 158 additions & 55 deletions onnxruntime/core/mlas/lib/reorder.cpp
@@ -1,6 +1,7 @@
/*++
Copyright (c) Microsoft Corporation. All rights reserved.
Copyright (c) 2019, 2022, Oracle and/or its affiliates. All rights reserved.
Licensed under the MIT License.
@@ -17,6 +18,20 @@ Module Name:

#include "mlasi.h"

//
// Define the parameters to execute segments of a NCHW output reordering
// operation on worker threads.
//

struct MLAS_REORDER_OUTPUT_NCHW_BLOCK {
ptrdiff_t TargetThreadCount;
const float* S;
float* D;
size_t OutputChannels;
size_t OutputSize;
size_t TasksCount;
};

MLAS_FORCEINLINE
void
MlasReorderGatherFloat32x4(
@@ -396,103 +411,191 @@ Return Value:
}

void
MLASCALL
MlasReorderOutputNchw(
const int64_t* OutputShape,
const float* S,
float* D
MlasReorderOutputNchwThreaded(
void* Context,
ptrdiff_t Index
)
/*++
Routine Description:
This routine reorders an output buffer from NCHWc to NCHW format.
This routine is invoked from a worker thread to execute a segment of a
NCHW output reordering operation.
Arguments:
OutputShape - Supplies the shape of the output tensor.
S - Supplies the address of the source tensor.
Context - Supplies the pointer to the context for the threaded operation.
D - Supplies the address of the destination tensor.
Index - Supplies the current index of the threaded operation.
Return Value:
None.
--*/
{
const auto* WorkBlock = (MLAS_REORDER_OUTPUT_NCHW_BLOCK*)Context;

const size_t OutputChannels = WorkBlock->OutputChannels;
const size_t OutputSize = WorkBlock->OutputSize;
const float* S = WorkBlock->S;
float* D = WorkBlock->D;

const size_t BlockSize = MlasNchwcGetBlockSize();
const size_t TasksPerBatch = size_t(ceil(((float)OutputChannels) / BlockSize));
const size_t LastTaskInBatchIndex = TasksPerBatch - 1;

const size_t BatchCount = size_t(OutputShape[0]);
const size_t OutputChannels = size_t(OutputShape[1]);
const size_t OutputSize = size_t(OutputShape[2]) * size_t(OutputShape[3]);
//
// Compute the range of task indices to use for this thread.
//

size_t TaskStart;
size_t TasksRemaining;

MlasPartitionWork(Index, WorkBlock->TargetThreadCount, WorkBlock->TasksCount,
&TaskStart, &TasksRemaining);

size_t TaskEnd = TaskStart + TasksRemaining;

//
// Transpose NCHWc blocks from the source buffer to the destination buffer.
// Rebase the pointers to the source and destination buffers for this thread.
//

for (size_t batch = 0; batch < BatchCount; batch++) {
size_t FirstBatchIndex = TaskStart / TasksPerBatch;
size_t FirstTaskInBatchIndex = TaskStart % TasksPerBatch;
S += BlockSize * OutputSize * (FirstBatchIndex * TasksPerBatch + FirstTaskInBatchIndex);
D += OutputSize * (FirstBatchIndex * OutputChannels + BlockSize * FirstTaskInBatchIndex);

for (size_t o = OutputChannels; o > 0;) {
//
// Transpose NCHWc blocks associated with tasks in the range [TaskStart, TaskEnd)
// from the source buffer to the destination buffer.
//

const size_t OutputChannelsThisIteration = std::min(o, BlockSize);
const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3);
o -= OutputChannelsThisIteration;
for (size_t t = TaskStart; t < TaskEnd; t++) {
size_t TaskInBatchIndex = t % TasksPerBatch;

const float* s = S;
float* d = D;
size_t OutputSizeRemaining = OutputSize;
const size_t OutputChannelsThisIteration = (TaskInBatchIndex < LastTaskInBatchIndex) ?
BlockSize : OutputChannels - BlockSize * LastTaskInBatchIndex;
const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3);

for (; OutputSizeRemaining >= 4; OutputSizeRemaining -= 4) {
const float* s = S;
float* d = D;
size_t OutputSizeRemaining = OutputSize;

const float* ss = s;
float* dd = d;
size_t bc = 0;
for (; OutputSizeRemaining >= 4; OutputSizeRemaining -= 4) {

for (; bc < AlignedOutputChannelsThisIteration; bc += 4) {
MlasReorderTransposeFloat32x4x4(ss, dd, BlockSize, OutputSize);
ss += 4;
dd += 4 * OutputSize;
}
const float* ss = s;
float* dd = d;
size_t bc = 0;

for (; bc < OutputChannelsThisIteration; bc += 1) {
MlasReorderGatherFloat32x4(ss, dd, BlockSize);
ss += 1;
dd += OutputSize;
}
for (; bc < AlignedOutputChannelsThisIteration; bc += 4) {
MlasReorderTransposeFloat32x4x4(ss, dd, BlockSize, OutputSize);
ss += 4;
dd += 4 * OutputSize;
}

s += 4 * BlockSize;
d += 4;
for (; bc < OutputChannelsThisIteration; bc += 1) {
MlasReorderGatherFloat32x4(ss, dd, BlockSize);
ss += 1;
dd += OutputSize;
}

for (; OutputSizeRemaining > 0; OutputSizeRemaining--) {
s += 4 * BlockSize;
d += 4;
}

const float* ss = s;
float* dd = d;
size_t bc = 0;
for (; OutputSizeRemaining > 0; OutputSizeRemaining--) {

for (; bc < AlignedOutputChannelsThisIteration; bc += 4) {
MlasReorderScatterFloat32x4(ss, dd, OutputSize);
ss += 4;
dd += 4 * OutputSize;
}
const float* ss = s;
float* dd = d;
size_t bc = 0;

for (; bc < OutputChannelsThisIteration; bc += 1) {
*dd = *ss++;
dd += OutputSize;
}
for (; bc < AlignedOutputChannelsThisIteration; bc += 4) {
MlasReorderScatterFloat32x4(ss, dd, OutputSize);
ss += 4;
dd += 4 * OutputSize;
}

s += BlockSize;
d += 1;
for (; bc < OutputChannelsThisIteration; bc += 1) {
*dd = *ss++;
dd += OutputSize;
}

S += BlockSize * OutputSize;
D += OutputChannelsThisIteration * OutputSize;
s += BlockSize;
d += 1;
}

S += BlockSize * OutputSize;
D += OutputChannelsThisIteration * OutputSize;
}
}


void
MLASCALL
MlasReorderOutputNchw(
const int64_t* OutputShape,
const float* S,
float* D,
MLAS_THREADPOOL* ThreadPool
)
/*++
Routine Description:
This routine reorders an output buffer from NCHWc to NCHW format.
Arguments:
OutputShape - Supplies the shape of the output tensor.
S - Supplies the address of the source tensor.
D - Supplies the address of the destination tensor.
ThreadPool - Supplies the thread pool object to use, else nullptr if the
    base library threading support should be used.
Return Value:
None.
--*/
{
MLAS_REORDER_OUTPUT_NCHW_BLOCK WorkBlock;

//
// Capture the NCHW reorder output operation parameters to the work block.
//

WorkBlock.S = S;
WorkBlock.D = D;

WorkBlock.OutputChannels = size_t(OutputShape[1]);
WorkBlock.OutputSize = size_t(OutputShape[2]) * size_t(OutputShape[3]);

const size_t BlockSize = MlasNchwcGetBlockSize();
const size_t TasksPerBatch = size_t(ceil(((float)WorkBlock.OutputChannels) / BlockSize));
const size_t BatchCount = size_t(OutputShape[0]);
const size_t TasksCount = BatchCount * TasksPerBatch;
WorkBlock.TasksCount = TasksCount;

//
// Schedule the operation across a set of worker threads if the output
// tensor is sufficiently large. Limit the number of threads to at most
// the number of available tasks.
//

ptrdiff_t TargetThreadCount = 1;
const size_t BufferSize = BatchCount * WorkBlock.OutputChannels * WorkBlock.OutputSize;
if (BufferSize > 1024 && TasksCount > 1) {
TargetThreadCount = MlasGetMaximumThreadCount(ThreadPool);
if (size_t(TargetThreadCount) > TasksCount) {
TargetThreadCount = ptrdiff_t(TasksCount);
}
}
WorkBlock.TargetThreadCount = TargetThreadCount;

MlasExecuteThreaded(MlasReorderOutputNchwThreaded, &WorkBlock, TargetThreadCount, ThreadPool);
}

void
MLASCALL
MlasReorderOutputNhwc(
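For context on the new threaded routine above: MlasPartitionWork hands each thread a contiguous run of task indices, and each task covers up to one BlockSize-wide group of channels in one batch. The sketch below illustrates that arithmetic under the assumption that MlasPartitionWork uses the standard quotient/remainder split; `partition_work` and `tasks_per_batch` are illustrative names, not the MLAS source.

```cpp
#include <cstddef>

// Balanced split of `total_work` items over `thread_count` threads: the first
// (total_work % thread_count) threads receive one extra item each.
// (Sketch of the partitioning MlasPartitionWork is expected to perform.)
void partition_work(std::ptrdiff_t thread_id, std::ptrdiff_t thread_count,
                    std::size_t total_work, std::size_t* start, std::size_t* count) {
    const std::size_t base = total_work / std::size_t(thread_count);
    const std::size_t extra = total_work % std::size_t(thread_count);
    const std::size_t id = std::size_t(thread_id);
    if (id < extra) {
        *start = (base + 1) * id;
        *count = base + 1;
    } else {
        *start = base * id + extra;
        *count = base;
    }
}

// One task reorders up to BlockSize channels of one batch, so a global task
// index decomposes as: batch = t / tasks_per_batch, block = t % tasks_per_batch.
std::size_t tasks_per_batch(std::size_t channels, std::size_t block_size) {
    return (channels + block_size - 1) / block_size;  // integer ceiling
}
```

For example, 240 channels with a block size of 16 yield 15 tasks per batch, and 13 tasks split over 4 threads come out as runs of 4, 3, 3, and 3 tasks, so no thread is more than one task ahead of another.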
2 changes: 1 addition & 1 deletion onnxruntime/test/mlas/unittest/test_conv2d_nchwc.h
@@ -137,7 +137,7 @@ class MlasNchwcConv2DTest : public MlasConv2DTest<Threaded> {
// Reorder the output buffer.
//

-    MlasReorderOutputNchw(OutputShape, NchwcOutput, Output);
+    MlasReorderOutputNchw(OutputShape, NchwcOutput, Output, MlasConv2DTest<Threaded>::threadpool_);
}

const size_t BlockSize = MlasNchwcGetBlockSize();
2 changes: 1 addition & 1 deletion onnxruntime/test/mlas/unittest/test_pool2d_nchwc.h
@@ -49,7 +49,7 @@ class MlasNchwcPool2DTest : public MlasPool2DTest<PoolingKind, Threaded> {
NchwcOutput,
nullptr);

-    MlasReorderOutputNchw(OutputShape, NchwcOutput, Output);
+    MlasReorderOutputNchw(OutputShape, NchwcOutput, Output, nullptr);
}

MatrixGuardBuffer<float> BufferNchwcInput;
2 changes: 1 addition & 1 deletion onnxruntime/test/mlas/unittest/test_reorder_output.cpp
@@ -27,7 +27,7 @@ class MlasReorderOutputTest : public MlasTestBase {
std::fill_n(Output, OutputBufferElements, -0.5f);
std::fill_n(OutputReference, OutputBufferElements, -0.5f);

-    MlasReorderOutputNchw(NchwOutputShape, Input, Output);
+    MlasReorderOutputNchw(NchwOutputShape, Input, Output, GetMlasThreadPool());
ReferenceReorderOutput(BatchCount, Channels, Height, Width, Input, OutputReference, false);
ASSERT_EQ(memcmp(Output, OutputReference, OutputBufferElements * sizeof(float)), 0)
<< " [Nchw] batch=" << BatchCount << ", channels=" << Channels
