diff --git a/examples/oneflow2onnx/nodes/GPU/test_cublas_fused_mlp.py b/examples/oneflow2onnx/nodes/GPU/test_cublas_fused_mlp.py
new file mode 100644
index 0000000..b40081a
--- /dev/null
+++ b/examples/oneflow2onnx/nodes/GPU/test_cublas_fused_mlp.py
@@ -0,0 +1,54 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import tempfile
+import oneflow as flow
+from oneflow_onnx.oneflow2onnx.util import convert_to_onnx_and_check
+
+
+class MLP(flow.nn.Module):
+    def __init__(self) -> None:
+        super(MLP, self).__init__()
+        self.mlp = flow.nn.FusedMLP(in_features=8, hidden_features=[16, 32], out_features=16, skip_final_activation=True)
+
+    def forward(self, x: flow.Tensor) -> flow.Tensor:
+        return self.mlp(x)
+
+
+mlp = MLP()
+mlp = mlp.to("cuda")
+
+
+class TestGraph(flow.nn.Graph):
+    def __init__(self):
+        super().__init__()
+        self.m = mlp
+
+    def build(self, x):
+        out = self.m(x)
+        return out
+
+
+def test_cublas_fused_mlp():
+
+    graph = TestGraph()
+    graph._compile(flow.randn(32, 8).to("cuda"))
+
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        flow.save(mlp.state_dict(), tmpdirname)
+        convert_to_onnx_and_check(graph, onnx_model_path="/tmp", device="gpu")
+
+
+test_cublas_fused_mlp()
diff --git a/oneflow_onnx/oneflow2onnx/handlers/nn.py b/oneflow_onnx/oneflow2onnx/handlers/nn.py
index 40e7ce4..58c38e4 100644
--- a/oneflow_onnx/oneflow2onnx/handlers/nn.py
+++ b/oneflow_onnx/oneflow2onnx/handlers/nn.py
@@ -419,6 +419,55 @@ def Version_9(cls, ctx, node, **kwargs):
         cls.Version_6(ctx, node, **kwargs)
 
 
+@flow_op(["cublas_fused_mlp"])
+class CublasFusedMLP:
+    @classmethod
+    def Version_1(cls, ctx, node, **kwargs):
+        # Inputs are laid out as [x, weight_0..weight_{n-1}, bias_0..bias_{n-1}],
+        # so an op with n layers carries 2 * n + 1 input tensors.
+        n_inputs = len(node.input_tensor_names)
+        n_layers = n_inputs // 2
+        assert n_layers * 2 + 1 == n_inputs
+        assert n_layers >= 1
+        x = node.input_tensor_names[0]
+        weights = node.input_tensor_names[1 : n_layers + 1]
+        biases = node.input_tensor_names[n_layers + 1 :]
+        # Only the first output is the real result; the remaining outputs are
+        # auxiliary buffers that must not be consumed by downstream nodes.
+        n_outputs = len(node.output_tensor_names)
+        assert n_outputs == n_inputs
+        y = node.output_tensor_names[0]
+        for output in node.output_tensor_names[1:]:
+            assert len(ctx.FindOutputConsumers(output)) == 0
+        skip_final_act = node.attrs["skip_final_activation"]
+        next_x = x
+        scope = node.name
+        output_shape = ctx.get_shape(y)
+        output_dtype = ctx.get_dtype(y)
+        ctx.RemoveNode(node.name)
+        # Expand the fused op into a per-layer Transpose -> MatMul -> Add -> Relu chain.
+        for layer_idx in range(n_layers):
+            # Each weight is stored as (out_features, in_features), so transpose
+            # it before the MatMul.
+            transpose_node = ctx.MakeNode("Transpose", [weights[layer_idx]], op_name_scope=scope, name="transpose_{}".format(layer_idx))
+            matmul_node = ctx.MakeNode("MatMul", [next_x, transpose_node.output_tensor_names[0]], op_name_scope=scope, name="matmul_{}".format(layer_idx))
+            # Before opset 7, Add needs an explicit broadcast attribute to add
+            # the 1-D bias to the 2-D MatMul result.
+            bias_attrs = {}
+            if ctx.opset < 7:
+                bias_attrs = {"broadcast": 1}
+            bias_node = ctx.MakeNode("Add", [matmul_node.output_tensor_names[0], biases[layer_idx]], attr=bias_attrs, op_name_scope=scope, name="bias_{}".format(layer_idx))
+            # Apply ReLU after every layer except (optionally) the last one.
+            if layer_idx != n_layers - 1 or (not skip_final_act):
+                relu_node = ctx.MakeNode("Relu", [bias_node.output_tensor_names[0]], op_name_scope=scope, name="relu_{}".format(layer_idx))
+                next_x = relu_node.output_tensor_names[0]
+            else:
+                next_x = bias_node.output_tensor_names[0]
+        ctx.MakeNode("Identity", [next_x], outputs=[y], op_name_scope=scope)
+        ctx.set_shape(y, output_shape)
+        ctx.set_dtype(y, output_dtype)
+
+
 @flow_op("upsample_nearest_2d", onnx_op="Resize")
 class UpSampleNearest2D:
     @classmethod