From e36a378891d56f58bcb10e40e33bc9112692f461 Mon Sep 17 00:00:00 2001
From: liujuncheng
Date: Wed, 14 Sep 2022 15:56:25 +0800
Subject: [PATCH] Add cublas_fused_mlp

---
 .../nodes/GPU/test_cublas_fused_mlp.py   | 54 +++++++++++++++++++
 oneflow_onnx/oneflow2onnx/handlers/nn.py | 39 ++++++++++++++
 2 files changed, 93 insertions(+)
 create mode 100644 examples/oneflow2onnx/nodes/GPU/test_cublas_fused_mlp.py

diff --git a/examples/oneflow2onnx/nodes/GPU/test_cublas_fused_mlp.py b/examples/oneflow2onnx/nodes/GPU/test_cublas_fused_mlp.py
new file mode 100644
index 0000000..b40081a
--- /dev/null
+++ b/examples/oneflow2onnx/nodes/GPU/test_cublas_fused_mlp.py
@@ -0,0 +1,54 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import tempfile
+import oneflow as flow
+from oneflow_onnx.oneflow2onnx.util import convert_to_onnx_and_check
+
+
+class MLP(flow.nn.Module):
+    def __init__(self) -> None:
+        super(MLP, self).__init__()
+        self.mlp = flow.nn.FusedMLP(in_features=8, hidden_features=[16, 32], out_features=16, skip_final_activation=True)
+
+    def forward(self, x: flow.Tensor) -> flow.Tensor:
+        return self.mlp(x)
+
+
+mlp = MLP()
+mlp = mlp.to("cuda")
+
+
+class TestGraph(flow.nn.Graph):
+    def __init__(self):
+        super().__init__()
+        self.m = mlp
+
+    def build(self, x):
+        out = self.m(x)
+        return out
+
+
+def test_cublas_fused_mlp():
+
+    graph = TestGraph()
+    graph._compile(flow.randn(32, 8).to("cuda"))
+
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        flow.save(mlp.state_dict(), tmpdirname)
+        convert_to_onnx_and_check(graph, onnx_model_path="/tmp", device="gpu")
+
+
+test_cublas_fused_mlp()
diff --git a/oneflow_onnx/oneflow2onnx/handlers/nn.py b/oneflow_onnx/oneflow2onnx/handlers/nn.py
index 40e7ce4..58c38e4 100644
--- a/oneflow_onnx/oneflow2onnx/handlers/nn.py
+++ b/oneflow_onnx/oneflow2onnx/handlers/nn.py
@@ -419,6 +419,45 @@ def Version_9(cls, ctx, node, **kwargs):
         cls.Version_6(ctx, node, **kwargs)
 
 
+@flow_op(["cublas_fused_mlp"])
+class CublasFusedMLP:
+    @classmethod
+    def Version_1(cls, ctx, node, **kwargs):
+        n_inputs = len(node.input_tensor_names)
+        n_layers = n_inputs // 2
+        assert n_layers * 2 + 1 == n_inputs
+        assert n_layers >= 0
+        x = node.input_tensor_names[0]
+        weights = node.input_tensor_names[1 : n_layers + 1]
+        biases = node.input_tensor_names[n_layers + 1 :]
+        n_outputs = len(node.output_tensor_names)
+        assert n_outputs == n_inputs
+        y = node.output_tensor_names[0]
+        for output in node.output_tensor_names[1:]:
+            assert len(ctx.FindOutputConsumers(output)) == 0
+        skip_final_act = node.attrs["skip_final_activation"]
+        next_x = x
+        scope = node.name
+        output_shape = ctx.get_shape(y)
+        output_dtype = ctx.get_dtype(y)
+        ctx.RemoveNode(node.name)
+        for layer_idx in range(n_layers):
+            transpose_node = ctx.MakeNode("Transpose", [weights[layer_idx]], op_name_scope=scope, name="transpose_{}".format(layer_idx))
+            matmul_node = ctx.MakeNode("MatMul", [next_x, transpose_node.output_tensor_names[0]], op_name_scope=scope, name="matmul_{}".format(layer_idx))
+            bias_attrs = {}
+            if ctx.opset < 7:
+                bias_attrs = {"broadcast": 1}
+            bias_node = ctx.MakeNode("Add", [matmul_node.output_tensor_names[0], biases[layer_idx]], attr=bias_attrs, op_name_scope=scope, name="bias_{}".format(layer_idx))
+            if layer_idx != n_layers - 1 or (not skip_final_act):
+                relu_node = ctx.MakeNode("Relu", [bias_node.output_tensor_names[0]], op_name_scope=scope, name="relu_{}".format(layer_idx))
+                next_x = relu_node.output_tensor_names[0]
+            else:
+                next_x = bias_node.output_tensor_names[0]
+        ctx.MakeNode("Identity", [next_x], outputs=[y], op_name_scope=scope)
+        ctx.set_shape(y, output_shape)
+        ctx.set_dtype(y, output_dtype)
+
+
 @flow_op("upsample_nearest_2d", onnx_op="Resize")
 class UpSampleNearest2D:
     @classmethod
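
For context on the handler above: each fused layer is lowered to Transpose -> MatMul -> Add -> Relu, with the final Relu skipped when skip_final_activation is set. Below is a minimal NumPy sketch of the per-layer math the exported ONNX graph computes; it is illustrative only and not part of the patch (the helper name and the random shapes are made up to mirror the test), under the assumption that each FusedMLP weight is stored as [out_features, in_features], which is why the handler inserts a Transpose before MatMul.

# Illustrative sketch only -- not part of the patch. Mirrors the decomposition
# emitted by CublasFusedMLP.Version_1: Transpose -> MatMul -> Add -> Relu per
# layer, with Relu skipped on the last layer when skip_final_activation is set.
import numpy as np


def fused_mlp_reference(x, weights, biases, skip_final_activation=True):
    # weights[i] has shape [out_features, in_features], hence the transpose.
    out = x
    n_layers = len(weights)
    for i, (w, b) in enumerate(zip(weights, biases)):
        out = out @ w.T + b  # Transpose + MatMul + Add
        if i != n_layers - 1 or not skip_final_activation:
            out = np.maximum(out, 0.0)  # Relu
    return out


# Shapes follow the test: in_features=8, hidden_features=[16, 32], out_features=16.
rng = np.random.default_rng(0)
dims = [8, 16, 32, 16]
x = rng.standard_normal((32, dims[0])).astype(np.float32)
weights = [rng.standard_normal((dims[i + 1], dims[i])).astype(np.float32) for i in range(len(dims) - 1)]
biases = [rng.standard_normal((dims[i + 1],)).astype(np.float32) for i in range(len(dims) - 1)]
print(fused_mlp_reference(x, weights, biases).shape)  # (32, 16)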