From 3abab34ef7b531f5b76367dc7d5ec0c22b36cf2c Mon Sep 17 00:00:00 2001
From: Alex Kogan
Date: Mon, 14 Nov 2022 10:55:59 -0500
Subject: [PATCH] Enable parallel output reordering in MlasReorderOutputNchw()

---
 onnxruntime/contrib_ops/cpu/nchwc_ops.cc      |   2 +-
 onnxruntime/core/mlas/inc/mlas.h              |   3 +-
 onnxruntime/core/mlas/lib/reorder.cpp         | 213 +++++++++++++-----
 .../test/mlas/unittest/test_conv2d_nchwc.h    |   2 +-
 .../test/mlas/unittest/test_pool2d_nchwc.h    |   2 +-
 .../mlas/unittest/test_reorder_output.cpp     |   2 +-
 6 files changed, 164 insertions(+), 60 deletions(-)

diff --git a/onnxruntime/contrib_ops/cpu/nchwc_ops.cc b/onnxruntime/contrib_ops/cpu/nchwc_ops.cc
index 7c4ee548ccaca..c16aaca5e71ea 100644
--- a/onnxruntime/contrib_ops/cpu/nchwc_ops.cc
+++ b/onnxruntime/contrib_ops/cpu/nchwc_ops.cc
@@ -144,7 +144,7 @@ Status ReorderOutput::Compute(OpKernelContext* context) const {
   if (channels_last_) {
     MlasReorderOutputNhwc(Y_shape.data(), x_data, y_data);
   } else {
-    MlasReorderOutputNchw(Y_shape.data(), x_data, y_data);
+    MlasReorderOutputNchw(Y_shape.data(), x_data, y_data, context->GetOperatorThreadPool());
   }
 
   return Status::OK();
diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h
index 5b6756e4fb90b..c1a4d16fd44fb 100644
--- a/onnxruntime/core/mlas/inc/mlas.h
+++ b/onnxruntime/core/mlas/inc/mlas.h
@@ -1085,7 +1085,8 @@ MLASCALL
 MlasReorderOutputNchw(
     const int64_t* OutputShape,
     const float* S,
-    float* D
+    float* D,
+    MLAS_THREADPOOL* ThreadPool
     );
 
 void
diff --git a/onnxruntime/core/mlas/lib/reorder.cpp b/onnxruntime/core/mlas/lib/reorder.cpp
index 0d7fbd97a4a6f..99c1dbac3b692 100644
--- a/onnxruntime/core/mlas/lib/reorder.cpp
+++ b/onnxruntime/core/mlas/lib/reorder.cpp
@@ -1,6 +1,7 @@
 /*++
 
 Copyright (c) Microsoft Corporation. All rights reserved.
+Copyright (c) 2019, 2022, Oracle and/or its affiliates. All rights reserved.
 
 Licensed under the MIT License.
 
@@ -17,6 +18,20 @@ Module Name:
 
 #include "mlasi.h"
 
+//
+// Define the parameters to execute segments of an NCHW output reordering
+// operation on worker threads.
+//
+
+struct MLAS_REORDER_OUTPUT_NCHW_BLOCK {
+    ptrdiff_t TargetThreadCount;
+    const float* S;
+    float* D;
+    size_t OutputChannels;
+    size_t OutputSize;
+    size_t TasksCount;
+};
+
 MLAS_FORCEINLINE
 void
 MlasReorderGatherFloat32x4(
@@ -396,25 +411,22 @@ Return Value:
 }
 
 void
-MLASCALL
-MlasReorderOutputNchw(
-    const int64_t* OutputShape,
-    const float* S,
-    float* D
+MlasReorderOutputNchwThreaded(
+    void* Context,
+    ptrdiff_t Index
     )
 /*++
 
 Routine Description:
 
-    This routine reorders an output buffer from NCHWc to NCHW format.
+    This routine is invoked from a worker thread to execute a segment of an
+    NCHW output reordering operation.
 
 Arguments:
 
-    OutputShape - Supplies the shape of the output tensor.
-
-    S - Supplies the address of the source tensor.
+    Context - Supplies the pointer to the context for the threaded operation.
 
-    D - Supplies the address of the destination tensor.
+    Index - Supplies the current index of the threaded operation.
 
 Return Value:
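The MLAS_REORDER_OUTPUT_NCHW_BLOCK structure above defines the unit of parallelism: one task per NCHWc channel block per batch image, so a tensor with N batches and ceil(C / BlockSize) channel blocks per batch yields N * ceil(C / BlockSize) independent tasks. A minimal standalone sketch of that arithmetic follows; the concrete BlockSize of 8 is an assumption for illustration (the real value comes from MlasNchwcGetBlockSize() at run time and varies by ISA), and the shape values are arbitrary.

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t BlockSize = 8;        // assumed; MLAS reports the real value per ISA
        const size_t BatchCount = 2;       // N
        const size_t OutputChannels = 20;  // C, deliberately not a multiple of BlockSize

        // One task per channel block per batch: ceil(20 / 8) = 3 blocks.
        const size_t TasksPerBatch = (OutputChannels + BlockSize - 1) / BlockSize;
        const size_t TasksCount = BatchCount * TasksPerBatch;  // 6 tasks total

        // The last task of each batch covers only the leftover 20 - 2 * 8 = 4 channels.
        const size_t LastBlockChannels = OutputChannels - BlockSize * (TasksPerBatch - 1);

        std::printf("tasks=%zu, leftover channels=%zu\n", TasksCount, LastBlockChannels);
        return 0;
    }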
@@ -422,77 +434,168 @@ Return Value:
 
 --*/
 {
+    const auto* WorkBlock = (MLAS_REORDER_OUTPUT_NCHW_BLOCK*)Context;
+
+    const size_t OutputChannels = WorkBlock->OutputChannels;
+    const size_t OutputSize = WorkBlock->OutputSize;
+    const float* S = WorkBlock->S;
+    float* D = WorkBlock->D;
+
     const size_t BlockSize = MlasNchwcGetBlockSize();
+    const size_t TasksPerBatch = (OutputChannels + BlockSize - 1) / BlockSize;
+    const size_t LastTaskInBatchIndex = TasksPerBatch - 1;
 
-    const size_t BatchCount = size_t(OutputShape[0]);
-    const size_t OutputChannels = size_t(OutputShape[1]);
-    const size_t OutputSize = size_t(OutputShape[2]) * size_t(OutputShape[3]);
+    //
+    // Compute the range of task indices to use for this thread.
+    //
+
+    size_t TaskStart;
+    size_t TasksRemaining;
+    MlasPartitionWork(Index, WorkBlock->TargetThreadCount, WorkBlock->TasksCount,
+                      &TaskStart, &TasksRemaining);
+
+    size_t TaskEnd = TaskStart + TasksRemaining;
+
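MlasPartitionWork() is an MLAS-internal helper declared in mlasi.h rather than in the public header. The sketch below is a reviewer's reconstruction of the contract the call above relies on, not the library's actual code: split TotalWork as evenly as possible across ThreadCount workers, handing one extra item to each of the first TotalWork % ThreadCount threads so that every task index is covered exactly once.

    #include <cstddef>

    // Reconstruction for illustration; not the patch's code.
    void PartitionWork(ptrdiff_t ThreadId, ptrdiff_t ThreadCount, size_t TotalWork,
                       size_t* WorkIndex, size_t* WorkRemaining) {
        const size_t WorkPerThread = TotalWork / ThreadCount;
        const size_t WorkExtra = TotalWork % ThreadCount;

        if (size_t(ThreadId) < WorkExtra) {
            // Leading threads absorb the remainder, one extra item each.
            *WorkIndex = (WorkPerThread + 1) * ThreadId;
            *WorkRemaining = WorkPerThread + 1;
        } else {
            *WorkIndex = WorkPerThread * ThreadId + WorkExtra;
            *WorkRemaining = WorkPerThread;
        }
    }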
     //
-    // Transpose NCHWc blocks from the source buffer to the destination buffer.
+    // Rebase the pointers to the source and destination buffers for this thread.
     //
 
-    for (size_t batch = 0; batch < BatchCount; batch++) {
+    size_t FirstBatchIndex = TaskStart / TasksPerBatch;
+    size_t FirstTaskInBatchIndex = TaskStart % TasksPerBatch;
+    S += BlockSize * OutputSize * (FirstBatchIndex * TasksPerBatch + FirstTaskInBatchIndex);
+    D += OutputSize * (FirstBatchIndex * OutputChannels + BlockSize * FirstTaskInBatchIndex);
 
-        for (size_t o = OutputChannels; o > 0;) {
+    //
+    // Transpose NCHWc blocks associated with tasks in the range [TaskStart, TaskEnd)
+    // from the source buffer to the destination buffer.
+    //
 
-            const size_t OutputChannelsThisIteration = std::min(o, BlockSize);
-            const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3);
-            o -= OutputChannelsThisIteration;
+    for (size_t t = TaskStart; t < TaskEnd; t++) {
+        size_t TaskInBatchIndex = t % TasksPerBatch;
 
-            const float* s = S;
-            float* d = D;
-            size_t OutputSizeRemaining = OutputSize;
+        const size_t OutputChannelsThisIteration = (TaskInBatchIndex < LastTaskInBatchIndex) ?
+            BlockSize : OutputChannels - BlockSize * LastTaskInBatchIndex;
+        const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3);
 
-            for (; OutputSizeRemaining >= 4; OutputSizeRemaining -= 4) {
+        const float* s = S;
+        float* d = D;
+        size_t OutputSizeRemaining = OutputSize;
 
-                const float* ss = s;
-                float* dd = d;
-                size_t bc = 0;
+        for (; OutputSizeRemaining >= 4; OutputSizeRemaining -= 4) {
 
-                for (; bc < AlignedOutputChannelsThisIteration; bc += 4) {
-                    MlasReorderTransposeFloat32x4x4(ss, dd, BlockSize, OutputSize);
-                    ss += 4;
-                    dd += 4 * OutputSize;
-                }
+            const float* ss = s;
+            float* dd = d;
+            size_t bc = 0;
 
-                for (; bc < OutputChannelsThisIteration; bc += 1) {
-                    MlasReorderGatherFloat32x4(ss, dd, BlockSize);
-                    ss += 1;
-                    dd += OutputSize;
-                }
+            for (; bc < AlignedOutputChannelsThisIteration; bc += 4) {
+                MlasReorderTransposeFloat32x4x4(ss, dd, BlockSize, OutputSize);
+                ss += 4;
+                dd += 4 * OutputSize;
+            }
 
-                s += 4 * BlockSize;
-                d += 4;
+            for (; bc < OutputChannelsThisIteration; bc += 1) {
+                MlasReorderGatherFloat32x4(ss, dd, BlockSize);
+                ss += 1;
+                dd += OutputSize;
             }
 
-            for (; OutputSizeRemaining > 0; OutputSizeRemaining--) {
+            s += 4 * BlockSize;
+            d += 4;
+        }
 
-                const float* ss = s;
-                float* dd = d;
-                size_t bc = 0;
+        for (; OutputSizeRemaining > 0; OutputSizeRemaining--) {
 
-                for (; bc < AlignedOutputChannelsThisIteration; bc += 4) {
-                    MlasReorderScatterFloat32x4(ss, dd, OutputSize);
-                    ss += 4;
-                    dd += 4 * OutputSize;
-                }
+            const float* ss = s;
+            float* dd = d;
+            size_t bc = 0;
 
-                for (; bc < OutputChannelsThisIteration; bc += 1) {
-                    *dd = *ss++;
-                    dd += OutputSize;
-                }
+            for (; bc < AlignedOutputChannelsThisIteration; bc += 4) {
+                MlasReorderScatterFloat32x4(ss, dd, OutputSize);
+                ss += 4;
+                dd += 4 * OutputSize;
+            }
 
-                s += BlockSize;
-                d += 1;
+            for (; bc < OutputChannelsThisIteration; bc += 1) {
+                *dd = *ss++;
+                dd += OutputSize;
             }
 
-            S += BlockSize * OutputSize;
-            D += OutputChannelsThisIteration * OutputSize;
+            s += BlockSize;
+            d += 1;
         }
+
+        S += BlockSize * OutputSize;
+        D += OutputChannelsThisIteration * OutputSize;
     }
 }
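To sanity-check the rebasing arithmetic above: the NCHWc source advances in whole blocks of BlockSize * OutputSize floats (TasksPerBatch of them per batch, padding included), while the NCHW destination has no channel padding, so a batch spans only OutputChannels * OutputSize floats. The worked example below uses the same illustrative shape as before (assumed BlockSize 8, C = 20, H * W = 4) and is not part of the patch.

    #include <cassert>
    #include <cstddef>

    int main() {
        const size_t BlockSize = 8, OutputChannels = 20, OutputSize = 4;
        const size_t TasksPerBatch = (OutputChannels + BlockSize - 1) / BlockSize;  // 3

        const size_t TaskStart = 4;  // global task 4 = second block of batch 1
        const size_t FirstBatchIndex = TaskStart / TasksPerBatch;        // 1
        const size_t FirstTaskInBatchIndex = TaskStart % TasksPerBatch;  // 1

        // Source (NCHWc): each batch holds TasksPerBatch padded blocks.
        const size_t SrcOffset = BlockSize * OutputSize *
            (FirstBatchIndex * TasksPerBatch + FirstTaskInBatchIndex);
        assert(SrcOffset == 128);  // (3 + 1) blocks of 32 floats

        // Destination (NCHW): batch 1 starts at channel 20, block 1 at channel 8.
        const size_t DstOffset = OutputSize *
            (FirstBatchIndex * OutputChannels + BlockSize * FirstTaskInBatchIndex);
        assert(DstOffset == 112);  // 4 * (20 + 8)
        return 0;
    }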
+
+void
+MLASCALL
+MlasReorderOutputNchw(
+    const int64_t* OutputShape,
+    const float* S,
+    float* D,
+    MLAS_THREADPOOL* ThreadPool
+    )
+/*++
+
+Routine Description:
+
+    This routine reorders an output buffer from NCHWc to NCHW format.
+
+Arguments:
+
+    OutputShape - Supplies the shape of the output tensor.
+
+    S - Supplies the address of the source tensor.
+
+    D - Supplies the address of the destination tensor.
+
+    ThreadPool - Supplies the thread pool object to use, else nullptr if the
+        base library threading support should be used.
+
+Return Value:
+
+    None.
+
+--*/
+{
+    MLAS_REORDER_OUTPUT_NCHW_BLOCK WorkBlock;
+
+    //
+    // Capture the NCHW reorder output operation parameters to the work block.
+    //
+
+    WorkBlock.S = S;
+    WorkBlock.D = D;
+
+    WorkBlock.OutputChannels = size_t(OutputShape[1]);
+    WorkBlock.OutputSize = size_t(OutputShape[2]) * size_t(OutputShape[3]);
+
+    const size_t BlockSize = MlasNchwcGetBlockSize();
+    const size_t TasksPerBatch = (WorkBlock.OutputChannels + BlockSize - 1) / BlockSize;
+    const size_t BatchCount = size_t(OutputShape[0]);
+    const size_t TasksCount = BatchCount * TasksPerBatch;
+    WorkBlock.TasksCount = TasksCount;
+
+    //
+    // Schedule the operation across a set of worker threads if the output
+    // tensor is sufficiently large. Limit the number of threads to at most
+    // the number of available tasks.
+    //
+
+    ptrdiff_t TargetThreadCount = 1;
+    const size_t BufferSize = BatchCount * WorkBlock.OutputChannels * WorkBlock.OutputSize;
+    if (BufferSize > 1024 && TasksCount > 1) {
+        TargetThreadCount = MlasGetMaximumThreadCount(ThreadPool);
+        if (size_t(TargetThreadCount) > TasksCount) {
+            TargetThreadCount = ptrdiff_t(TasksCount);
+        }
+    }
+    WorkBlock.TargetThreadCount = TargetThreadCount;
+
+    MlasExecuteThreaded(MlasReorderOutputNchwThreaded, &WorkBlock, TargetThreadCount, ThreadPool);
+}
+
 void
 MLASCALL
 MlasReorderOutputNhwc(
diff --git a/onnxruntime/test/mlas/unittest/test_conv2d_nchwc.h b/onnxruntime/test/mlas/unittest/test_conv2d_nchwc.h
index e516fa8a0b698..c125720668381 100644
--- a/onnxruntime/test/mlas/unittest/test_conv2d_nchwc.h
+++ b/onnxruntime/test/mlas/unittest/test_conv2d_nchwc.h
@@ -137,7 +137,7 @@ class MlasNchwcConv2DTest : public MlasConv2DTest<Threaded> {
       // Reorder the output buffer.
       //
 
-      MlasReorderOutputNchw(OutputShape, NchwcOutput, Output);
+      MlasReorderOutputNchw(OutputShape, NchwcOutput, Output, MlasConv2DTest<Threaded>::threadpool_);
     }
 
     const size_t BlockSize = MlasNchwcGetBlockSize();
diff --git a/onnxruntime/test/mlas/unittest/test_pool2d_nchwc.h b/onnxruntime/test/mlas/unittest/test_pool2d_nchwc.h
index 10e3f7f927aba..38ac63a68c843 100644
--- a/onnxruntime/test/mlas/unittest/test_pool2d_nchwc.h
+++ b/onnxruntime/test/mlas/unittest/test_pool2d_nchwc.h
@@ -49,7 +49,7 @@ class MlasNchwcPool2DTest : public MlasPool2DTest<Threaded> {
                 NchwcOutput,
                 nullptr);
 
-    MlasReorderOutputNchw(OutputShape, NchwcOutput, Output);
+    MlasReorderOutputNchw(OutputShape, NchwcOutput, Output, nullptr);
   }
 
   MatrixGuardBuffer<float> BufferNchwcInput;
diff --git a/onnxruntime/test/mlas/unittest/test_reorder_output.cpp b/onnxruntime/test/mlas/unittest/test_reorder_output.cpp
index a87d2a0fa5721..704333fd27fa0 100644
--- a/onnxruntime/test/mlas/unittest/test_reorder_output.cpp
+++ b/onnxruntime/test/mlas/unittest/test_reorder_output.cpp
@@ -27,7 +27,7 @@ class MlasReorderOutputTest : public MlasTestBase {
     std::fill_n(Output, OutputBufferElements, -0.5f);
     std::fill_n(OutputReference, OutputBufferElements, -0.5f);
 
-    MlasReorderOutputNchw(NchwOutputShape, Input, Output);
+    MlasReorderOutputNchw(NchwOutputShape, Input, Output, GetMlasThreadPool());
     ReferenceReorderOutput(BatchCount, Channels, Height, Width, Input, OutputReference, false);
 
     ASSERT_EQ(memcmp(Output, OutputReference, OutputBufferElements * sizeof(float)), 0)
         << " [Nchw] batch=" << BatchCount << ", channels=" << Channels
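For reference, the oracle these tests compare against is a plain scalar reorder. The sketch below is a reviewer's reconstruction of that behavior, assuming the NCHWc source pads the channel dimension up to a multiple of BlockSize and stores the per-position channel lanes innermost; it is not the ReferenceReorderOutput implementation itself.

    #include <cstddef>

    // Scalar NCHWc -> NCHW reorder (reconstruction; illustrative only).
    void ReorderOutputNchwReference(size_t BatchCount, size_t Channels, size_t OutputSize,
                                    size_t BlockSize, const float* S, float* D) {
        const size_t PaddedChannels = (Channels + BlockSize - 1) / BlockSize * BlockSize;
        for (size_t n = 0; n < BatchCount; n++) {
            for (size_t c = 0; c < Channels; c++) {
                const size_t Block = c / BlockSize;  // which NCHWc channel block
                const size_t Lane = c % BlockSize;   // lane within the block
                for (size_t hw = 0; hw < OutputSize; hw++) {
                    // Source layout: [N][Channels/BlockSize][H*W][BlockSize].
                    D[(n * Channels + c) * OutputSize + hw] =
                        S[(n * PaddedChannels + Block * BlockSize) * OutputSize +
                          hw * BlockSize + Lane];
                }
            }
        }
    }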