From b624a6dc709590d7c407a41210cc925808a181e0 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Thu, 5 Nov 2020 10:01:30 -0500
Subject: [PATCH 01/75] Use dahlia.

---
 frontends/relay-futil/README.md               |   8 +-
 frontends/relay-futil/compiler.py             | 144 +++++---
 frontends/relay-futil/dahlia_functions.py     | 125 +++++++
 frontends/relay-futil/example.py              |  40 +--
 frontends/relay-futil/futil_ast.py            |  54 ++-
 frontends/relay-futil/pretty_print.py         |   3 +
 frontends/relay-futil/tests/add.expect        | 126 ++++---
 frontends/relay-futil/tests/data/add.expect   |   3 -
 .../relay-futil/tests/data/add.relay.data     |   6 +-
 .../tests/data/{let.expect => let1.expect}    |   3 -
 .../tests/data/{let.relay => let1.relay}      |   0
 .../data/{let.relay.data => let1.relay.data}  |   4 -
 frontends/relay-futil/tests/data/let2.expect  |   5 +-
 frontends/relay-futil/tests/data/let2.relay   |   2 +-
 .../relay-futil/tests/data/let2.relay.data    |   6 +-
 frontends/relay-futil/tests/data/sub.expect   |   3 -
 .../relay-futil/tests/data/sub.relay.data     |   4 -
 .../tests/data/tensor2d_add.expect            |  44 +++
 .../relay-futil/tests/data/tensor2d_add.relay |   5 +
 .../tests/data/tensor2d_add.relay.data        |  14 +
 .../tests/data/tensor3d_batch_flatten.expect  |  18 +
 .../tests/data/tensor3d_batch_flatten.relay   |   6 +
 .../data/tensor3d_batch_flatten.relay.data    |  10 +
 frontends/relay-futil/tests/let1.expect       | 134 ++++---
 frontends/relay-futil/tests/let2.expect       | 233 ++++++++-----
 frontends/relay-futil/tests/let3.expect       | 330 ++++++++++++------
 frontends/relay-futil/tests/sub.expect        | 126 ++++---
 .../relay-futil/tests/tensor2d_add.expect     | 135 +++++++
 .../relay-futil/tests/tensor2d_add.relay      |   6 +
 .../tests/tensor3d_batch_flatten.expect       | 166 +++++++++
 .../tests/tensor3d_batch_flatten.relay        |   6 +
 frontends/relay-futil/utilities.py            | 251 +++----------
 32 files changed, 1349 insertions(+), 671 deletions(-)
 create mode 100644 frontends/relay-futil/dahlia_functions.py
 rename frontends/relay-futil/tests/data/{let.expect => let1.expect} (78%)
 rename frontends/relay-futil/tests/data/{let.relay => let1.relay} (100%)
 rename frontends/relay-futil/tests/data/{let.relay.data => let1.relay.data} (81%)
 create mode 100644 frontends/relay-futil/tests/data/tensor2d_add.expect
 create mode 100644 frontends/relay-futil/tests/data/tensor2d_add.relay
 create mode 100644 frontends/relay-futil/tests/data/tensor2d_add.relay.data
 create mode 100644 frontends/relay-futil/tests/data/tensor3d_batch_flatten.expect
 create mode 100644 frontends/relay-futil/tests/data/tensor3d_batch_flatten.relay
 create mode 100644 frontends/relay-futil/tests/data/tensor3d_batch_flatten.relay.data
 create mode 100644 frontends/relay-futil/tests/tensor2d_add.expect
 create mode 100644 frontends/relay-futil/tests/tensor2d_add.relay
 create mode 100644 frontends/relay-futil/tests/tensor3d_batch_flatten.expect
 create mode 100644 frontends/relay-futil/tests/tensor3d_batch_flatten.relay

diff --git a/frontends/relay-futil/README.md b/frontends/relay-futil/README.md
index 427909c036..0136ec5436 100644
--- a/frontends/relay-futil/README.md
+++ b/frontends/relay-futil/README.md
@@ -42,12 +42,10 @@ Run an Example
 Try this to run a simple example:
 ```bash
 cd futil/frontends/relay-futil
-python3 example.py
+python3 example.py add
 ```     
-
-Pass the `-r` flag to this script to see the Relay code. Otherwise, we just print the FuTIL code. There is also an `-o` flag to try optimizing the Relay code a little bit.
-
-You can specify the name of an example as a command-line argument. Currently, the only option is `identity`.
+Pass the `-h` flag to this script for help.
+Pass the `-r` flag to this script to see the Relay IR. Otherwise, we just print the FuTIL output. 
 
 
 Run the Tests
diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index 89b8991798..b10932c239 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -1,90 +1,139 @@
 from tvm import relay, ir
 from tvm.relay.expr_functor import ExprFunctor
 from tvm.relay.function import Function
-import textwrap
-from collections import namedtuple, defaultdict
-import math
+from collections import defaultdict
 
 from pretty_print import *
 from utilities import *
 from futil_ast import *
+from dahlia_functions import *
 
-# Map standard Relay call to respective hardware name in FuTIL.
-BuiltInBinaryCalls = {'add': 'add', 'equal': 'eq', 'multiply': 'mult', 'subtract': 'sub'}
-
-EmitResult = namedtuple('EmitResult', ['cells', 'groups'])
+# Mapping from Relay binary calls to the respective Dahlia operator.
+BuiltInBinaryCalls = {'add': '+', 'multiply': '*', 'subtract': '-'}
 
 
 class Relay2Futil(ExprFunctor):
     """The main compilation visitor."""
 
+    def __init__(self):
+        super(Relay2Futil, self).__init__()
+        self.id_dictionary = defaultdict(int)
+        self.relay_id_dictionary = defaultdict(int)
+        self.dahlia_components = []
+        self.main = FComponent(name="main", cells=[], wires=[])
+
     def id(self, name):
         """
-        Provides unique identification for a given name.
+        Provides a unique identification for a given name.
         """
         id_number = self.id_dictionary[name]
         self.id_dictionary[name] += 1
         return name + str(id_number)
 
-    def __init__(self):
-        super(Relay2Futil, self).__init__()
-        self.id_dictionary = defaultdict(int)
-        self.main = FComponent(name="main", cells=[], wires=[])
+    def relay_id(self, name):
+        """
+        Relay does not explicitly differentiate a variable name if it is used twice. For example,
+        %x  = foo(%y);
+        %x1 = bar(%x); // Here, at this level, the name_hint associated with `x1` is still 'x'.
+        To avoid this, we provide Relay with its own identification dictionary. If 'x' is seen
+        three times, it will produce: 'x', 'x1', 'x2'.
+        """
+        id_number = self.relay_id_dictionary[name]
+        self.relay_id_dictionary[name] += 1
+        if id_number == 0: return name
+        return name + str(id_number)
+
+    def produce_dahlia_name(self, name, type):
+        """
+        Dahlia uses the following naming scheme for an arbitrary variable 'X':
+        Memory1D: 'X0', 'X1', 'X2', ...
+        Memory2D: 'X0_0', 'X1_0', 'X2_0', ...
+        Memory3D: 'X0_0_0', 'X1_0_0', 'X2_0_0', ...
+        """
+        dahlia_name = self.id(name)
+        if type == PrimitiveType.Memory1D: return dahlia_name
+        if type == PrimitiveType.Memory2D: return dahlia_name + "_0"
+        if type == PrimitiveType.Memory3D: return dahlia_name + "_0_0"
+        assert False, f'{name} with {type} is not supported yet.'
+
+    def get_dahlia_function_type(self, function_name, input_type):
+        """
+        Returns the corresponding name, Dahlia function type, and op (if it is a binary op, otherwise None).
+        If the function type isn't supported, fails with an assertion.
+        """
+        op = None
+        if function_name in BuiltInBinaryCalls:
+            op = BuiltInBinaryCalls[function_name]
+            if input_type == PrimitiveType.Memory1D:
+                return self.relay_id(f'tensor1d_{function_name}'), DahliaFunctionType.Tensor1DBinaryOp, op
+            if input_type == PrimitiveType.Memory2D:
+                return self.relay_id(f'tensor2d_{function_name}'), DahliaFunctionType.Tensor2DBinaryOp, op
+
+        if function_name == "nn.batch_flatten":
+            assert input_type == PrimitiveType.Memory3D, f'{input_type} not supported for batch flattening.'
+            return self.relay_id(f'tensor3d_batch_flatten'), DahliaFunctionType.Tensor3DBatchFlatten, op
+
+        assert False, f'{function_name} with {input_type} is not supported.'
 
     def visit_var(self, var):
-        name = var.name_hint
-        type = str(var.type_annotation)
-        data = [get_bitwidth(type), 1, 1]  # [width, size, index_size]
-        return [FCell(primitive=FPrimitive(name=name, data=data, type=PrimitiveType.Memory1D))]
+        name = self.relay_id(var.name_hint)
+        if self.main.contains_primitive(name): return [cell]
+
+        data, type = get_memory_parameters(var.type_annotation)
+        dahlia_name = self.produce_dahlia_name(name, type)
+        return [FCell(dahlia_name=dahlia_name, primitive=FPrimitive(name=name, data=data, type=type))]
 
     def visit_let(self, let):
-        variable = self.visit(let.var)[0]
+        variable = self.visit(let.var)
         body = self.visit(let.body)
         values = self.visit(let.value)
 
-        for value in values:
-            if not value.is_declaration(): continue
-            value.declaration.intermediary_output = FCell(
-                primitive=FPrimitive(name=variable.primitive.name, data=variable.primitive.data,
-                                     type=PrimitiveType.Memory1D))
+        output = variable[0]
+        for value in flatten(values):
+            if not value.is_dahlia_declaration(): continue
+            decl = value.dahlia_declaration
+            decl.output = output
+            # TODO(cgyurgyik): This shouldn't be necessary. To simplify, produce mapping
+            #                  between enum and corresponding function.
+            if decl.type == DahliaFunctionType.Tensor1DBinaryOp:
+                decl.program = tensor1d_op(decl)
+            elif decl.type == DahliaFunctionType.Tensor2DBinaryOp:
+                decl.program = tensor2d_op(decl)
+            elif decl.type == DahliaFunctionType.Tensor3DBatchFlatten:
+                decl.program = tensor3d_batch_flatten(decl)
         return [body, values]
 
     def visit_constant(self, const):
         type = const.data.dtype
         shape = const.data.shape
-        data = [get_bitwidth(type), int(const.data.asnumpy())]  # [width, value]
+        data = [get_bitwidth(type), int(const.data.asnumpy())]
         name = self.id("const")
         return [FCell(primitive=FPrimitive(name=name, data=data, type=PrimitiveType.Constant))]
 
     def visit_call(self, call):
-        assert call.op.name in BuiltInBinaryCalls, f'{call.op.name} not supported.'
-        op = BuiltInBinaryCalls[call.op.name]
-
+        cells = []
         args = []
-        for arg in call.args: args.append(self.visit(arg))
-        return [build_tensor_0D_binary_op(call, args, op)]
+        for arg in call.args:
+            result = self.visit(arg)
+            cells.append(result)
+            args.append(result)
+        cells = flatten(cells)
+        name, type, op = self.get_dahlia_function_type(call.op.name, cells[0].primitive.type)
+        dahlia_declaration = DahliaDeclaration(component_name=name, decl_name=self.id(name), op=op,
+                                               inputs=flatten(args), type=type)
+        cells.append(FCell(dahlia_declaration=dahlia_declaration))
+        return cells
 
     def visit_function(self, function):
-        fn: FComponent = FComponent(name=self.id("function"), cells=[], wires=[],
-                                    signature=FSignature(inputs=[], outputs=[]))
-        fn.signature.inputs, fn.signature.outputs = extract_function_arguments(function.params)
         body = self.visit(function.body)
 
-        components = [fn]
         for cell in flatten(body):
-            if cell.is_declaration():
-                fn.add_cell(cell)
-                components.append(cell.declaration.component)
-            elif cell.primitive.type == PrimitiveType.Constant:
-                # Include constants, but not function arguments.
-                fn.add_cell(cell)
-
-        build_function_body(fn)  # Groups, wires, connections.
-
-        # Add declaration to main.
-        self.main.add_cell(FCell(declaration=FDeclaration(name=self.id("fn"), component=fn)))
+            self.main.add_cell(cell)
+            if not cell.is_dahlia_declaration(): continue
+            self.dahlia_components.append(cell.dahlia_declaration.program)
 
-        return '\n'.join(pp_component(c) for c in reversed(components))
+        build_main(self.main)  # Groups, wires, connections.
+        return pp_component(self.main)
 
 
 def infer_type(expr: Function) -> Function:
@@ -93,7 +142,6 @@ def infer_type(expr: Function) -> Function:
     to_normal_pass = relay.transform.ToANormalForm()
     mod = ir.IRModule()
     mod['main'] = expr
-    # mod = fuse_op__pass(mod)
     mod = infer_types_pass(mod)
     ret = mod['main']
     return ret
@@ -103,12 +151,12 @@ def compile(program) -> str:
     """Translate a Relay function to a FuTIL program (as a string)."""
     program = infer_type(program)
     visitor = Relay2Futil()
-    src = visitor.visit(program)
 
-    build_main_body(visitor.main)
     PREAMBLE = """import "primitives/std.lib";"""
+    MAIN = visitor.visit(program)
+    DAHLIA_COMPONENTS = '\n'.join(visitor.dahlia_components)
     NEWL = "\n\n"
-    return f'{PREAMBLE}{NEWL}{src}{NEWL}{pp_component(visitor.main)}'
+    return f'{PREAMBLE}{NEWL}{DAHLIA_COMPONENTS}{NEWL}{MAIN}{NEWL}'
 
 
 if __name__ == '__main__':
diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
new file mode 100644
index 0000000000..36873c3106
--- /dev/null
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -0,0 +1,125 @@
+import subprocess
+
+from futil_ast import *
+
+
+def lower_dahlia_program(prog, component_name):
+    '''
+    Takes in a string that represents a Dahlia program, lowers it to FuTIL, and applies the `externalize` pass.
+    This is just for experimental purposes, and needs to be replaced.
+    More bluntly, this does the following:
+    1. Copies dahlia program `prog` to a temporary file `temp.fuse`.
+       $ echo `program_string` > temp.fuse
+
+    2. Lowers `temp.fuse` to FuTIL with the name changed to `component_name`, and saves it in `lowered.futil`.
+       $ ./fuse temp.fuse --lower -b=futil -n=component_name > lowered.futil
+
+    3. Runs the 'externalize' pass on the `lowered.futil` file.
+       $ cargo run -- lowered.futil -p externalize > temp.futil
+
+    4. Copies the output from `temp.futil`, except for the first line (we don't want another copy of the import).
+
+    TODO(cgyurgyik): As you'll see below, this only works on my local machine.
+                     I've explicitly silenced errors with `2>/dev/null` so they aren't inserted
+                     into the file too. However, this also makes debugging difficult.
+    '''
+    program_string = "\""
+    for line in prog.splitlines():
+        program_string += f'{line}\n'
+    program_string += "\""
+    no_err = "2>/dev/null"
+    command = \
+        f"""
+        echo {program_string} > temp.fuse &&
+        /Users/cgyurgyik/Projects/dahlia/fuse temp.fuse --lower -b=futil -n={component_name} > lowered.futil {no_err} -l error &&
+        cd ../../ &&
+        cargo run -- frontends/relay-futil/lowered.futil -p externalize > frontends/relay-futil/temp.futil {no_err} &&
+        cd frontends/relay-futil/ 
+        """
+    subprocess.Popen(command, stdout=subprocess.PIPE, shell=True).communicate()
+    dahlia_component = open('temp.futil', 'r').read()[29:]  # Skip over importing the primitives library.
+    subprocess.Popen("rm temp.fuse ; rm lowered.futil ; rm temp.futil", stdout=subprocess.PIPE,
+                     shell=True).communicate()
+    return dahlia_component
+
+
+def tensor1d_op(declaration):
+    op1 = declaration.inputs[0].primitive
+    op2 = declaration.inputs[1].primitive
+    res = declaration.output.primitive
+
+    assert op1.type == PrimitiveType.Memory1D and op1.type == op2.type and op2.type == res.type
+    assert op1.data[0] == op2.data[0] and op1.data[0] == res.data[0]
+    assert op1.data[1] == op2.data[1] and op2.data[1] == res.data[1]
+    assert op1.data[2] == op2.data[2] and op2.data[2] == res.data[2]
+    bitwidth = op1.data[0]
+    size = op1.data[1]
+    index_size = op1.data[2]
+    return lower_dahlia_program(f"""
+    decl {op1.name}: ubit<{bitwidth}>[{size}];
+    decl {op2.name}: ubit<{bitwidth}>[{size}];
+    decl {res.name}: ubit<{bitwidth}>[{size}];
+    for (let i: ubit<{index_size}> = 0..{size}) {{
+      {res.name}[i] := {op1.name}[i] {declaration.op} {op2.name}[i];
+    }}""", declaration.component_name)
+
+
+def tensor2d_op(declaration):
+    op1 = declaration.inputs[0].primitive
+    op2 = declaration.inputs[1].primitive
+    res = declaration.output.primitive
+
+    assert op1.type == PrimitiveType.Memory2D and op1.type == op2.type and op2.type == res.type
+    assert op1.data[0] == op2.data[0] and op1.data[0] == res.data[0]
+    assert op1.data[1] == op2.data[1] and op2.data[1] == res.data[1]
+    assert op1.data[2] == op2.data[2] and op2.data[2] == res.data[2]
+    assert op1.data[3] == op2.data[3] and op2.data[3] == res.data[3]
+    assert op1.data[4] == op2.data[4] and op2.data[4] == res.data[4]
+
+    bitwidth = op1.data[0]
+    size0 = op1.data[1]
+    size1 = op1.data[2]
+    index_size0 = op1.data[3]
+    index_size1 = op1.data[4]
+    return lower_dahlia_program(f"""
+    decl {op1.name}: ubit<{bitwidth}>[{size0}][{size1}];
+    decl {op2.name}: ubit<{bitwidth}>[{size0}][{size1}];
+    decl {res.name}: ubit<{bitwidth}>[{size0}][{size1}];
+    for (let i: ubit<{index_size0}> = 0..{size0}) {{
+      for (let j: ubit<{index_size1}> = 0..{size1}) {{
+        {res.name}[i][j] := {op1.name}[i][j] {declaration.op} {op2.name}[i][j];
+      }}
+    }}""", declaration.component_name)
+
+
+def tensor3d_batch_flatten(declaration):
+    op1 = declaration.inputs[0].primitive
+    res = declaration.output.primitive
+
+    bitwidth = op1.data[0]
+    op1_size0 = op1.data[1]
+    op1_size1 = op1.data[2]
+    op1_size2 = op1.data[3]
+    op1_index_size0 = op1.data[4]
+    op1_index_size1 = op1.data[5]
+    op1_index_size2 = op1.data[6]
+    res_bitwidth = res.data[0]
+    res_size0 = res.data[1]
+    res_size1 = res.data[2]
+    res_index_size0 = res.data[3]
+    res_index_size1 = res.data[4]
+
+    assert op1.type == PrimitiveType.Memory3D and res_size1 == op1_size1 * op1_size2 and res_size0 == op1_size0
+    assert res.type == PrimitiveType.Memory2D and res_bitwidth == bitwidth
+    return lower_dahlia_program(f"""
+        decl {op1.name}: ubit<{bitwidth}>[{op1_size0}][{op1_size1}][{op1_size2}];
+        decl {res.name}: ubit<{bitwidth}>[{res_size0}][{res_size1}];
+        let l: ubit<{res_index_size1}> = 0;
+        for (let i: ubit<{op1_index_size0}> = 0..{op1_size0}) {{
+          for (let j: ubit<{op1_index_size1}> = 0..{op1_size1}) {{
+            for (let k: ubit<{op1_index_size2}> = 0..{op1_size2}) {{
+              {res.name}[i][l] := {op1.name}[i][j][k];
+              l := l + 1;
+            }}
+          }}
+        }}""", declaration.component_name)
diff --git a/frontends/relay-futil/example.py b/frontends/relay-futil/example.py
index 6d51b5dba2..9d0c06d8f8 100644
--- a/frontends/relay-futil/example.py
+++ b/frontends/relay-futil/example.py
@@ -1,11 +1,10 @@
 import tvm
 from tvm import relay
-from tvm.relay import parser
 from compiler import *
 import sys
 
 
-def tensor_0d_add():
+def add():
     """Add together two variables in Relay.
     """
     x = relay.var('x', shape=(), dtype="int32")
@@ -13,15 +12,7 @@ def tensor_0d_add():
     return relay.Function([x, y], relay.add(x, y))
 
 
-def tensor_1d_add():
-    """Add together two 1-dimensional tensors in Relay.
-    """
-    x = relay.var("x", relay.TensorType((1, 4), "int32"))
-    y = relay.var("y", relay.TensorType((1, 4), "int32"))
-    return relay.Function([x, y], relay.add(x, y))
-
-
-def tensor_2d_add():
+def tensor_add():
     """Add together two 2-dimensional tensors in Relay.
     """
     x = relay.var("x", relay.TensorType((2, 4), "int32"))
@@ -29,13 +20,12 @@ def tensor_2d_add():
     return relay.Function([x, y], relay.add(x, y))
 
 
-def assign():
-    """Assign a const to a varible
+def batch_flatten():
+    """Flattens all dimensions except for the batch dimension.
     """
-    x = relay.var('x', shape=())
-    v1 = relay.log(x)
-    v2 = relay.add(v1, x)
-    return relay.Function([x], v2)
+    x = relay.var("x", relay.TensorType((2, 5, 5), "int32"))
+    return relay.Function([x], relay.nn.batch_flatten(x))
+
 
 def mlp_net():
     """The MLP test from Relay.
@@ -44,16 +34,27 @@ def mlp_net():
     return mlp.get_net(1)
 
 
-ALL_FUNCS = [tensor_0d_add, tensor_1d_add, tensor_2d_add, mlp_net]
+ALL_FUNCS = [add, tensor_add, batch_flatten, mlp_net]
+FUNC_NAMES = list(map(lambda x: x.__name__, ALL_FUNCS))
 
 
 def simple_example():
-    func = tensor_0d_add()  # Default if none provided.
+    if '-h' in sys.argv[1:]:
+        supported_functions = []
+        print("- To see FuTIL output:\n$ python3 example.py <function_name>")
+        print("- To see Relay IR:\n$ python3 example.py <function_name> -r")
+        print("\n- Supported function names:")
+        for f in FUNC_NAMES: print(f'    {f}')
+        return
+    func = None
     # See if the command line contains a function name.
     for option in ALL_FUNCS:
         if option.__name__ in sys.argv[1:]:
             func = option()
             break
+    if func == None:
+        print("For help:\n$ python3 example.py -h")
+        return
 
     # Try optimizing the Relay IR with a few built-in passes.
     seq = tvm.transform.Sequential([
@@ -65,7 +66,6 @@ def simple_example():
     mod = tvm.IRModule.from_expr(func)
     mod_opt = seq(mod)
     func = mod_opt['main']
-
     if '-r' in sys.argv[1:]:
         # Dump the Relay representation (for educational purposes).
         print(func)
diff --git a/frontends/relay-futil/futil_ast.py b/frontends/relay-futil/futil_ast.py
index 9b45774c16..dbdd21e666 100644
--- a/frontends/relay-futil/futil_ast.py
+++ b/frontends/relay-futil/futil_ast.py
@@ -1,7 +1,12 @@
 from dataclasses import dataclass
 from typing import List, Dict
 from enum import Enum
-import textwrap
+
+
+class DahliaFunctionType(Enum):
+    Tensor1DBinaryOp = 1
+    Tensor2DBinaryOp = 2
+    Tensor3DBatchFlatten = 3
 
 
 class PrimitiveType(Enum):
@@ -10,7 +15,6 @@ class PrimitiveType(Enum):
     Memory1D = 3
     Memory2D = 4
     Memory3D = 5
-    BinOp = 6
 
 
 class ControlType(Enum):
@@ -48,21 +52,6 @@ class FSignature:
     outputs: List[FPortDef]
 
 
-# @dataclass
-# class Atom:
-#     '''
-#     Atomic operations used in guard conditions and RHS of the guarded assignments.
-#     '''
-#     port: FPort
-#     num: int  # TODO(cgyurgyik): This uses a Bitnum structure.
-
-
-# @dataclass
-# class FGuard:
-#     guard_expression: str
-#     atom: Atom
-
-
 @dataclass
 class FWire:
     src: str  # FGuard
@@ -128,14 +117,38 @@ class FComponent:
     controls: FControl = None  # Control statement for this component.
     signature: FSignature = None  # Input and output ports.
 
+    def contains_primitive(self, name: str):
+        '''
+        Determines whether this component contains a primitive with the given name.
+        '''
+        # TODO(cgyurgyik): Rethink data structure here.
+        for cell in self.cells:
+            if not cell.is_primitive(): continue
+            if cell.primitive.name == name: return True
+        return False
+
     def add_cell(self, subcomponent: Cell):
         '''
         Appends a subcomponent to this component's list of FuTIL cells.
         '''
-        # TODO(cgyurgyik): If its already contained here, don't re-add it.
+        if not subcomponent.is_primitive():
+            self.cells.append(subcomponent)
+            return
+        if self.contains_primitive(subcomponent.primitive.name): return
         self.cells.append(subcomponent)
 
 
+@dataclass
+class DahliaDeclaration:
+    decl_name: str
+    component_name: str
+    type: DahliaFunctionType
+    op: str = None
+    program: str = None
+    inputs: List[Cell] = None
+    output: Cell = None
+
+
 @dataclass
 class FDeclaration:
     '''
@@ -149,11 +162,16 @@ class FDeclaration:
 
 @dataclass
 class FCell(Cell):
+    dahlia_name: str = None
     primitive: FPrimitive = None
     declaration: FDeclaration = None
+    dahlia_declaration: DahliaDeclaration = None
 
     def is_primitive(self):
         return self.primitive != None
 
     def is_declaration(self):
         return self.declaration != None
+
+    def is_dahlia_declaration(self):
+        return self.dahlia_declaration != None
diff --git a/frontends/relay-futil/pretty_print.py b/frontends/relay-futil/pretty_print.py
index 3b4c5481bb..6b8cbe4632 100644
--- a/frontends/relay-futil/pretty_print.py
+++ b/frontends/relay-futil/pretty_print.py
@@ -1,4 +1,5 @@
 from futil_ast import *
+import textwrap
 
 
 def mk_block(decl, contents, indent=2):
@@ -107,3 +108,5 @@ def pp_cell(cell: FCell):
             assert False, f'FCell pretty print unimplemented for {cell} with name {cell.primitive.name}'
     elif cell.is_declaration():
         return f'{cell.declaration.name} = {cell.declaration.component.name};'
+    elif cell.is_dahlia_declaration():
+        return f'{cell.dahlia_declaration.decl_name} = {cell.dahlia_declaration.component_name};'
diff --git a/frontends/relay-futil/tests/add.expect b/frontends/relay-futil/tests/add.expect
index c153759a33..c02365332b 100644
--- a/frontends/relay-futil/tests/add.expect
+++ b/frontends/relay-futil/tests/add.expect
@@ -1,84 +1,100 @@
 import "primitives/std.lib";
 
-component add (x_out: 32, y_out: 32, in_done: 1) -> (in_write_data: 32, in_write_en: 1, in_addr0: 1) {
+component tensor1d_add(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, y0_read_data: 32, y0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, x0_addr0: 1, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, y0_addr0: 1, y0_write_data: 32, y0_write_en: 1, y0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
   cells {
-    add = prim std_add(32);
-    c0 = prim std_const(1, 0);
+    add0 = prim std_add(32);
+    add1 = prim std_add(1);
+    const0 = prim std_const(1, 0);
+    const1 = prim std_const(1, 0);
+    const2 = prim std_const(1, 1);
+    i0 = prim std_reg(1);
+    le0 = prim std_le(1);
+    x_read0_0 = prim std_reg(32);
+    y_read0_0 = prim std_reg(32);
   }
   wires {
-    group process_add {
-      in_addr0 = c0.out;
-      add.left = x_out;
-      add.right = y_out;
-      in_write_en = 1'd1;
-      in_write_data = add.out;
-      process_add[done] = in_done ? 1'd1;
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
     }
-  }
-  control {
-    seq {
-      process_add;
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
     }
-  }
-}
-component function0 (x_out: 32, y_out: 32, in_done: 1) -> (in_write_data: 32, in_write_en: 1, in_addr0: 1) {
-  cells {
-    add_fn = add;
-    z = prim std_mem_d1(32, 1, 1);
-    c0 = prim std_const(1, 0);
-  }
-  wires {
-    group run_add_fn {
-      add_fn.x_out = x_out;
-      add_fn.y_out = y_out;
-      z.write_data = add_fn.in_write_data;
-      z.write_en = add_fn.in_write_en;
-      z.addr0 = add_fn.in_addr0;
-      add_fn.in_done = z.done;
-      add_fn.go = 1'd1;
-      run_add_fn[done] = add_fn.done ? 1'd1;
+    group upd0<"static"=1> {
+      x_read0_0.write_en = 1'd1;
+      x0_addr0 = i0.out;
+      x_read0_0.in = 1'd1 ? x0_read_data;
+      upd0[done] = x_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      y_read0_0.write_en = 1'd1;
+      y0_addr0 = i0.out;
+      y_read0_0.in = 1'd1 ? y0_read_data;
+      upd1[done] = y_read0_0.done ? 1'd1;
     }
-    group save_return_value {
-      z.addr0 = c0.out;
-      in_addr0 = c0.out;
-      in_write_en = 1'd1;
-      in_write_data = z.read_data;
-      save_return_value[done] = in_done ? 1'd1;
+    group upd2<"static"=1> {
+      z0_addr0 = i0.out;
+      z0_write_en = 1'd1;
+      add0.left = x_read0_0.out;
+      add0.right = y_read0_0.out;
+      z0_write_data = 1'd1 ? add0.out;
+      upd2[done] = z0_done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      i0.write_en = 1'd1;
+      add1.left = i0.out;
+      add1.right = const2.out;
+      i0.in = 1'd1 ? add1.out;
+      upd3[done] = i0.done ? 1'd1;
     }
   }
+
   control {
     seq {
-      run_add_fn;
-      save_return_value;
+      let0;
+      while le0.out with cond0 {
+        seq {
+          par {
+            upd0;
+            upd1;
+          }
+          upd2;
+          upd3;
+        }
+      }
     }
   }
 }
 
 component main () -> () {
   cells {
-    fn0 = function0;
-    c0 = prim std_const(1, 0);
-    main_ret = prim std_mem_d1(32, 1, 1);
+    z = prim std_mem_d1(32, 1, 1);
     x = prim std_mem_d1(32, 1, 1);
     y = prim std_mem_d1(32, 1, 1);
+    tensor1d_add0 = tensor1d_add;
   }
   wires {
-    group run_fn0 {
-      fn0.x_out = x.read_data;
-      x.addr0 = fn0.in_addr0;
-      fn0.y_out = y.read_data;
-      y.addr0 = fn0.in_addr0;
-      main_ret.addr0 = fn0.in_addr0;
-      main_ret.write_data = fn0.in_write_data;
-      main_ret.write_en = fn0.in_write_en;
-      fn0.in_done = main_ret.done;
-      fn0.go = 1'd1;
-      run_fn0[done] = fn0.done ? 1'd1;
+    group run_tensor1d_add {
+      x.addr0 = tensor1d_add0.x0_addr0;
+      tensor1d_add0.x0_read_data = x.read_data;
+      y.addr0 = tensor1d_add0.y0_addr0;
+      tensor1d_add0.y0_read_data = y.read_data;
+      z.addr0 = tensor1d_add0.z0_addr0;
+      z.write_data = tensor1d_add0.z0_write_data;
+      z.write_en = tensor1d_add0.z0_write_en;
+      tensor1d_add0.z0_done = z.done;
+      tensor1d_add0.go = 1'd1;
+      run_tensor1d_add[done] = tensor1d_add0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_fn0;
+      run_tensor1d_add;
     }
   }
 }
+
+
diff --git a/frontends/relay-futil/tests/data/add.expect b/frontends/relay-futil/tests/data/add.expect
index 64fea78c3e..8e6eaee89f 100644
--- a/frontends/relay-futil/tests/data/add.expect
+++ b/frontends/relay-futil/tests/data/add.expect
@@ -1,7 +1,4 @@
 {
-  "main_ret": [
-    49
-  ],
   "x": [
     42
   ],
diff --git a/frontends/relay-futil/tests/data/add.relay.data b/frontends/relay-futil/tests/data/add.relay.data
index 2ad0db6bf5..2a8177b129 100644
--- a/frontends/relay-futil/tests/data/add.relay.data
+++ b/frontends/relay-futil/tests/data/add.relay.data
@@ -10,9 +10,5 @@
   "z": {
     "data": [0],
     "bitwidth": 32
-  },
-    "main_ret": {
-      "data": [0],
-      "bitwidth": 32
-    }
+  }
 }
\ No newline at end of file
diff --git a/frontends/relay-futil/tests/data/let.expect b/frontends/relay-futil/tests/data/let1.expect
similarity index 78%
rename from frontends/relay-futil/tests/data/let.expect
rename to frontends/relay-futil/tests/data/let1.expect
index 4a8f9a7636..91c8cc0380 100644
--- a/frontends/relay-futil/tests/data/let.expect
+++ b/frontends/relay-futil/tests/data/let1.expect
@@ -13,8 +13,5 @@
   ],
   "e": [
     250
-  ],
-  "main_ret": [
-    250
   ]
 }
diff --git a/frontends/relay-futil/tests/data/let.relay b/frontends/relay-futil/tests/data/let1.relay
similarity index 100%
rename from frontends/relay-futil/tests/data/let.relay
rename to frontends/relay-futil/tests/data/let1.relay
diff --git a/frontends/relay-futil/tests/data/let.relay.data b/frontends/relay-futil/tests/data/let1.relay.data
similarity index 81%
rename from frontends/relay-futil/tests/data/let.relay.data
rename to frontends/relay-futil/tests/data/let1.relay.data
index 403a89d412..3fc10b83aa 100644
--- a/frontends/relay-futil/tests/data/let.relay.data
+++ b/frontends/relay-futil/tests/data/let1.relay.data
@@ -7,10 +7,6 @@
       "data": [5],
       "bitwidth": 32
     },
-    "main_ret": {
-      "data": [0],
-      "bitwidth": 32
-    },
     "c": {
       "data":[3],
       "bitwidth": 32
diff --git a/frontends/relay-futil/tests/data/let2.expect b/frontends/relay-futil/tests/data/let2.expect
index 8c1e6980e2..a4c655ebe8 100644
--- a/frontends/relay-futil/tests/data/let2.expect
+++ b/frontends/relay-futil/tests/data/let2.expect
@@ -9,9 +9,6 @@
     12
   ],
   "d": [
-    15
-  ],
-  "main_ret": [
-    15
+    36
   ]
 }
diff --git a/frontends/relay-futil/tests/data/let2.relay b/frontends/relay-futil/tests/data/let2.relay
index bcf94635a6..76db1c9722 100644
--- a/frontends/relay-futil/tests/data/let2.relay
+++ b/frontends/relay-futil/tests/data/let2.relay
@@ -1,6 +1,6 @@
 v0.0.4
 fn (%a: int32, %b: int32) {
   let %c = multiply(%a, %b);
-  let %d = add(%c, %a);
+  let %d = multiply(%c, %a);
   %d
 }
\ No newline at end of file
diff --git a/frontends/relay-futil/tests/data/let2.relay.data b/frontends/relay-futil/tests/data/let2.relay.data
index f3450e4b2c..128ed0c61d 100644
--- a/frontends/relay-futil/tests/data/let2.relay.data
+++ b/frontends/relay-futil/tests/data/let2.relay.data
@@ -7,12 +7,8 @@
       "data": [4],
       "bitwidth": 32
     },
-    "main_ret": {
-      "data": [0],
-      "bitwidth": 32
-    },
     "c": {
-      "data":[3],
+      "data":[0],
       "bitwidth": 32
     },
     "d": {
diff --git a/frontends/relay-futil/tests/data/sub.expect b/frontends/relay-futil/tests/data/sub.expect
index 6dfd9d1980..e313c7824d 100644
--- a/frontends/relay-futil/tests/data/sub.expect
+++ b/frontends/relay-futil/tests/data/sub.expect
@@ -7,8 +7,5 @@
   ],
   "c": [
     42
-  ],
-  "main_ret": [
-    42
   ]
 }
diff --git a/frontends/relay-futil/tests/data/sub.relay.data b/frontends/relay-futil/tests/data/sub.relay.data
index e008769748..219d0fa867 100644
--- a/frontends/relay-futil/tests/data/sub.relay.data
+++ b/frontends/relay-futil/tests/data/sub.relay.data
@@ -7,10 +7,6 @@
       "data": [7],
       "bitwidth": 32
     },
-    "main_ret": {
-      "data": [0],
-      "bitwidth": 32
-    },
     "c": {
       "data": [0],
       "bitwidth": 32
diff --git a/frontends/relay-futil/tests/data/tensor2d_add.expect b/frontends/relay-futil/tests/data/tensor2d_add.expect
new file mode 100644
index 0000000000..84e8e0c77c
--- /dev/null
+++ b/frontends/relay-futil/tests/data/tensor2d_add.expect
@@ -0,0 +1,44 @@
+{
+  "x": [
+    [
+      1,
+      2,
+      3,
+      4
+    ],
+    [
+      2,
+      4,
+      6,
+      8
+    ]
+  ],
+  "x1": [
+    [
+      42,
+      42,
+      42,
+      42
+    ],
+    [
+      42,
+      42,
+      42,
+      42
+    ]
+  ],
+  "y": [
+    [
+      41,
+      40,
+      39,
+      38
+    ],
+    [
+      40,
+      38,
+      36,
+      34
+    ]
+  ]
+}
diff --git a/frontends/relay-futil/tests/data/tensor2d_add.relay b/frontends/relay-futil/tests/data/tensor2d_add.relay
new file mode 100644
index 0000000000..4db9fcb099
--- /dev/null
+++ b/frontends/relay-futil/tests/data/tensor2d_add.relay
@@ -0,0 +1,5 @@
+v0.0.4
+fn (%x: Tensor[(2, 4), int32], %y: Tensor[(2, 4), int32]) {
+  let %x1 = add(%x, %y);
+  %x1
+}
diff --git a/frontends/relay-futil/tests/data/tensor2d_add.relay.data b/frontends/relay-futil/tests/data/tensor2d_add.relay.data
new file mode 100644
index 0000000000..0bf859a4c7
--- /dev/null
+++ b/frontends/relay-futil/tests/data/tensor2d_add.relay.data
@@ -0,0 +1,14 @@
+{
+  "x": {
+    "data": [[1, 2, 3, 4], [2, 4, 6, 8]],
+    "bitwidth": 32
+  },
+  "y": {
+      "data": [[41, 40, 39, 38], [40, 38, 36, 34]],
+      "bitwidth": 32
+    },
+  "x1": {
+    "data": [[0, 0, 0, 0], [0, 0, 0, 0]],
+    "bitwidth": 32
+  }
+}
\ No newline at end of file
diff --git a/frontends/relay-futil/tests/data/tensor3d_batch_flatten.expect b/frontends/relay-futil/tests/data/tensor3d_batch_flatten.expect
new file mode 100644
index 0000000000..4d55d4d415
--- /dev/null
+++ b/frontends/relay-futil/tests/data/tensor3d_batch_flatten.expect
@@ -0,0 +1,18 @@
+{
+  "x": [
+    [
+      1,
+      2
+    ],
+    [
+      3,
+      4
+    ]
+  ],
+  "x1": [
+    1,
+    2,
+    3,
+    4
+  ]
+}
diff --git a/frontends/relay-futil/tests/data/tensor3d_batch_flatten.relay b/frontends/relay-futil/tests/data/tensor3d_batch_flatten.relay
new file mode 100644
index 0000000000..2a5e223fec
--- /dev/null
+++ b/frontends/relay-futil/tests/data/tensor3d_batch_flatten.relay
@@ -0,0 +1,6 @@
+v0.0.4
+fn (%x: Tensor[(1, 2, 2), int32]) -> Tensor[(1, 4), int32] {
+  let %x1: Tensor[(1, 4), int32] = nn.batch_flatten(%x);
+  %x1
+}
+
diff --git a/frontends/relay-futil/tests/data/tensor3d_batch_flatten.relay.data b/frontends/relay-futil/tests/data/tensor3d_batch_flatten.relay.data
new file mode 100644
index 0000000000..b6c5eae239
--- /dev/null
+++ b/frontends/relay-futil/tests/data/tensor3d_batch_flatten.relay.data
@@ -0,0 +1,10 @@
+{
+  "x": {
+    "data": [[1, 2], [3, 4]],
+    "bitwidth": 32
+  },
+  "x1": {
+    "data": [1, 2, 3, 4],
+    "bitwidth": 32
+  }
+}
\ No newline at end of file
diff --git a/frontends/relay-futil/tests/let1.expect b/frontends/relay-futil/tests/let1.expect
index 1652474d30..4e82ded37e 100644
--- a/frontends/relay-futil/tests/let1.expect
+++ b/frontends/relay-futil/tests/let1.expect
@@ -1,84 +1,108 @@
 import "primitives/std.lib";
 
-component mult (a_out: 32, b_out: 32, in_done: 1) -> (in_write_data: 32, in_write_en: 1, in_addr0: 1) {
+component tensor1d_multiply(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, b0_read_data: 32, b0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, b0_addr0: 1, b0_write_data: 32, b0_write_en: 1, b0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
   cells {
-    mult = prim std_mult(32);
-    c0 = prim std_const(1, 0);
+    a_read0_0 = prim std_reg(32);
+    add0 = prim std_add(1);
+    b_read0_0 = prim std_reg(32);
+    bin_read0_0 = prim std_reg(32);
+    const0 = prim std_const(1, 0);
+    const1 = prim std_const(1, 0);
+    const2 = prim std_const(1, 1);
+    i0 = prim std_reg(1);
+    le0 = prim std_le(1);
+    mult_pipe0 = prim std_mult_pipe(32);
   }
   wires {
-    group process_mult {
-      in_addr0 = c0.out;
-      mult.left = a_out;
-      mult.right = b_out;
-      in_write_en = 1'd1;
-      in_write_data = mult.out;
-      process_mult[done] = in_done ? 1'd1;
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
     }
-  }
-  control {
-    seq {
-      process_mult;
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
     }
-  }
-}
-component function0 (a_out: 32, b_out: 32, in_done: 1) -> (in_write_data: 32, in_write_en: 1, in_addr0: 1) {
-  cells {
-    mult_fn = mult;
-    z = prim std_mem_d1(32, 1, 1);
-    c0 = prim std_const(1, 0);
-  }
-  wires {
-    group run_mult_fn {
-      mult_fn.a_out = a_out;
-      mult_fn.b_out = b_out;
-      z.write_data = mult_fn.in_write_data;
-      z.write_en = mult_fn.in_write_en;
-      z.addr0 = mult_fn.in_addr0;
-      mult_fn.in_done = z.done;
-      mult_fn.go = 1'd1;
-      run_mult_fn[done] = mult_fn.done ? 1'd1;
+    group let1<"static"=4> {
+      bin_read0_0.in = mult_pipe0.out;
+      bin_read0_0.write_en = mult_pipe0.done;
+      let1[done] = bin_read0_0.done;
+      mult_pipe0.left = a_read0_0.out;
+      mult_pipe0.right = b_read0_0.out;
+      mult_pipe0.go = !mult_pipe0.done ? 1'd1;
+    }
+    group upd0<"static"=1> {
+      a_read0_0.write_en = 1'd1;
+      a0_addr0 = i0.out;
+      a_read0_0.in = 1'd1 ? a0_read_data;
+      upd0[done] = a_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      b_read0_0.write_en = 1'd1;
+      b0_addr0 = i0.out;
+      b_read0_0.in = 1'd1 ? b0_read_data;
+      upd1[done] = b_read0_0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      z0_addr0 = i0.out;
+      z0_write_en = 1'd1;
+      z0_write_data = 1'd1 ? bin_read0_0.out;
+      upd2[done] = z0_done ? 1'd1;
     }
-    group save_return_value {
-      z.addr0 = c0.out;
-      in_addr0 = c0.out;
-      in_write_en = 1'd1;
-      in_write_data = z.read_data;
-      save_return_value[done] = in_done ? 1'd1;
+    group upd3<"static"=1> {
+      i0.write_en = 1'd1;
+      add0.left = i0.out;
+      add0.right = const2.out;
+      i0.in = 1'd1 ? add0.out;
+      upd3[done] = i0.done ? 1'd1;
     }
   }
+
   control {
     seq {
-      run_mult_fn;
-      save_return_value;
+      let0;
+      while le0.out with cond0 {
+        seq {
+          par {
+            upd0;
+            upd1;
+          }
+          let1;
+          upd2;
+          upd3;
+        }
+      }
     }
   }
 }
 
 component main () -> () {
   cells {
-    fn0 = function0;
-    c0 = prim std_const(1, 0);
-    main_ret = prim std_mem_d1(32, 1, 1);
+    z = prim std_mem_d1(32, 1, 1);
     a = prim std_mem_d1(32, 1, 1);
     b = prim std_mem_d1(32, 1, 1);
+    tensor1d_multiply0 = tensor1d_multiply;
   }
   wires {
-    group run_fn0 {
-      fn0.a_out = a.read_data;
-      a.addr0 = fn0.in_addr0;
-      fn0.b_out = b.read_data;
-      b.addr0 = fn0.in_addr0;
-      main_ret.addr0 = fn0.in_addr0;
-      main_ret.write_data = fn0.in_write_data;
-      main_ret.write_en = fn0.in_write_en;
-      fn0.in_done = main_ret.done;
-      fn0.go = 1'd1;
-      run_fn0[done] = fn0.done ? 1'd1;
+    group run_tensor1d_multiply {
+      a.addr0 = tensor1d_multiply0.a0_addr0;
+      tensor1d_multiply0.a0_read_data = a.read_data;
+      b.addr0 = tensor1d_multiply0.b0_addr0;
+      tensor1d_multiply0.b0_read_data = b.read_data;
+      z.addr0 = tensor1d_multiply0.z0_addr0;
+      z.write_data = tensor1d_multiply0.z0_write_data;
+      z.write_en = tensor1d_multiply0.z0_write_en;
+      tensor1d_multiply0.z0_done = z.done;
+      tensor1d_multiply0.go = 1'd1;
+      run_tensor1d_multiply[done] = tensor1d_multiply0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_fn0;
+      run_tensor1d_multiply;
     }
   }
 }
+
+
diff --git a/frontends/relay-futil/tests/let2.expect b/frontends/relay-futil/tests/let2.expect
index e5b099e95f..903681d9a8 100644
--- a/frontends/relay-futil/tests/let2.expect
+++ b/frontends/relay-futil/tests/let2.expect
@@ -1,118 +1,191 @@
 import "primitives/std.lib";
 
-component mult (a_out: 32, b_out: 32, in_done: 1) -> (in_write_data: 32, in_write_en: 1, in_addr0: 1) {
+component tensor1d_add(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32, d0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1, d0_addr0: 1, d0_write_data: 32, d0_write_en: 1, d0_clk: 1) {
   cells {
-    mult = prim std_mult(32);
-    c0 = prim std_const(1, 0);
+    a_read0_0 = prim std_reg(32);
+    add0 = prim std_add(32);
+    add1 = prim std_add(1);
+    c_read0_0 = prim std_reg(32);
+    const0 = prim std_const(1, 0);
+    const1 = prim std_const(1, 0);
+    const2 = prim std_const(1, 1);
+    i0 = prim std_reg(1);
+    le0 = prim std_le(1);
   }
   wires {
-    group process_mult {
-      in_addr0 = c0.out;
-      mult.left = a_out;
-      mult.right = b_out;
-      in_write_en = 1'd1;
-      in_write_data = mult.out;
-      process_mult[done] = in_done ? 1'd1;
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
     }
-  }
-  control {
-    seq {
-      process_mult;
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
     }
-  }
-}
-component add (c_out: 32, a_out: 32, in_done: 1) -> (in_write_data: 32, in_write_en: 1, in_addr0: 1) {
-  cells {
-    add = prim std_add(32);
-    c0 = prim std_const(1, 0);
-  }
-  wires {
-    group process_add {
-      in_addr0 = c0.out;
-      add.left = c_out;
-      add.right = a_out;
-      in_write_en = 1'd1;
-      in_write_data = add.out;
-      process_add[done] = in_done ? 1'd1;
+    group upd0<"static"=1> {
+      c_read0_0.write_en = 1'd1;
+      c0_addr0 = i0.out;
+      c_read0_0.in = 1'd1 ? c0_read_data;
+      upd0[done] = c_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      a_read0_0.write_en = 1'd1;
+      a0_addr0 = i0.out;
+      a_read0_0.in = 1'd1 ? a0_read_data;
+      upd1[done] = a_read0_0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      d0_addr0 = i0.out;
+      d0_write_en = 1'd1;
+      add0.left = c_read0_0.out;
+      add0.right = a_read0_0.out;
+      d0_write_data = 1'd1 ? add0.out;
+      upd2[done] = d0_done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      i0.write_en = 1'd1;
+      add1.left = i0.out;
+      add1.right = const2.out;
+      i0.in = 1'd1 ? add1.out;
+      upd3[done] = i0.done ? 1'd1;
     }
   }
+
   control {
     seq {
-      process_add;
+      let0;
+      while le0.out with cond0 {
+        seq {
+          par {
+            upd0;
+            upd1;
+          }
+          upd2;
+          upd3;
+        }
+      }
     }
   }
 }
-component function0 (a_out: 32, b_out: 32, in_done: 1) -> (in_write_data: 32, in_write_en: 1, in_addr0: 1) {
+component tensor1d_multiply(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, b0_read_data: 32, b0_done: 1, c0_read_data: 32, c0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, b0_addr0: 1, b0_write_data: 32, b0_write_en: 1, b0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1) {
   cells {
-    add_fn = add;
-    mult_fn = mult;
-    c = prim std_mem_d1(32, 1, 1);
-    d = prim std_mem_d1(32, 1, 1);
-    c0 = prim std_const(1, 0);
+    a_read0_0 = prim std_reg(32);
+    add0 = prim std_add(1);
+    b_read0_0 = prim std_reg(32);
+    bin_read0_0 = prim std_reg(32);
+    const0 = prim std_const(1, 0);
+    const1 = prim std_const(1, 0);
+    const2 = prim std_const(1, 1);
+    i0 = prim std_reg(1);
+    le0 = prim std_le(1);
+    mult_pipe0 = prim std_mult_pipe(32);
   }
   wires {
-    group run_mult_fn {
-      mult_fn.a_out = a_out;
-      mult_fn.b_out = b_out;
-      c.write_data = mult_fn.in_write_data;
-      c.write_en = mult_fn.in_write_en;
-      c.addr0 = mult_fn.in_addr0;
-      mult_fn.in_done = c.done;
-      mult_fn.go = 1'd1;
-      run_mult_fn[done] = mult_fn.done ? 1'd1;
-    }
-    group run_add_fn {
-      add_fn.c_out = c.read_data;
-      add_fn.a_out = a_out;
-      d.write_data = add_fn.in_write_data;
-      d.write_en = add_fn.in_write_en;
-      d.addr0 = add_fn.in_addr0;
-      add_fn.in_done = d.done;
-      add_fn.go = 1'd1;
-      run_add_fn[done] = add_fn.done ? 1'd1;
-    }
-    group save_return_value {
-      d.addr0 = c0.out;
-      in_addr0 = c0.out;
-      in_write_en = 1'd1;
-      in_write_data = d.read_data;
-      save_return_value[done] = in_done ? 1'd1;
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
+    }
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
+    }
+    group let1<"static"=4> {
+      bin_read0_0.in = mult_pipe0.out;
+      bin_read0_0.write_en = mult_pipe0.done;
+      let1[done] = bin_read0_0.done;
+      mult_pipe0.left = a_read0_0.out;
+      mult_pipe0.right = b_read0_0.out;
+      mult_pipe0.go = !mult_pipe0.done ? 1'd1;
+    }
+    group upd0<"static"=1> {
+      a_read0_0.write_en = 1'd1;
+      a0_addr0 = i0.out;
+      a_read0_0.in = 1'd1 ? a0_read_data;
+      upd0[done] = a_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      b_read0_0.write_en = 1'd1;
+      b0_addr0 = i0.out;
+      b_read0_0.in = 1'd1 ? b0_read_data;
+      upd1[done] = b_read0_0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      c0_addr0 = i0.out;
+      c0_write_en = 1'd1;
+      c0_write_data = 1'd1 ? bin_read0_0.out;
+      upd2[done] = c0_done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      i0.write_en = 1'd1;
+      add0.left = i0.out;
+      add0.right = const2.out;
+      i0.in = 1'd1 ? add0.out;
+      upd3[done] = i0.done ? 1'd1;
     }
   }
+
   control {
     seq {
-      run_mult_fn;
-      run_add_fn;
-      save_return_value;
+      let0;
+      while le0.out with cond0 {
+        seq {
+          par {
+            upd0;
+            upd1;
+          }
+          let1;
+          upd2;
+          upd3;
+        }
+      }
     }
   }
 }
 
 component main () -> () {
   cells {
-    fn0 = function0;
-    c0 = prim std_const(1, 0);
-    main_ret = prim std_mem_d1(32, 1, 1);
+    d = prim std_mem_d1(32, 1, 1);
+    c = prim std_mem_d1(32, 1, 1);
     a = prim std_mem_d1(32, 1, 1);
+    tensor1d_add0 = tensor1d_add;
     b = prim std_mem_d1(32, 1, 1);
+    tensor1d_multiply0 = tensor1d_multiply;
   }
   wires {
-    group run_fn0 {
-      fn0.a_out = a.read_data;
-      a.addr0 = fn0.in_addr0;
-      fn0.b_out = b.read_data;
-      b.addr0 = fn0.in_addr0;
-      main_ret.addr0 = fn0.in_addr0;
-      main_ret.write_data = fn0.in_write_data;
-      main_ret.write_en = fn0.in_write_en;
-      fn0.in_done = main_ret.done;
-      fn0.go = 1'd1;
-      run_fn0[done] = fn0.done ? 1'd1;
+    group run_tensor1d_multiply {
+      a.addr0 = tensor1d_multiply0.a0_addr0;
+      tensor1d_multiply0.a0_read_data = a.read_data;
+      b.addr0 = tensor1d_multiply0.b0_addr0;
+      tensor1d_multiply0.b0_read_data = b.read_data;
+      c.addr0 = tensor1d_multiply0.c0_addr0;
+      c.write_data = tensor1d_multiply0.c0_write_data;
+      c.write_en = tensor1d_multiply0.c0_write_en;
+      tensor1d_multiply0.c0_done = c.done;
+      tensor1d_multiply0.go = 1'd1;
+      run_tensor1d_multiply[done] = tensor1d_multiply0.done ? 1'd1;
+    }
+    group run_tensor1d_add {
+      c.addr0 = tensor1d_add0.c0_addr0;
+      tensor1d_add0.c0_read_data = c.read_data;
+      a.addr0 = tensor1d_add0.a0_addr0;
+      tensor1d_add0.a0_read_data = a.read_data;
+      d.addr0 = tensor1d_add0.d0_addr0;
+      d.write_data = tensor1d_add0.d0_write_data;
+      d.write_en = tensor1d_add0.d0_write_en;
+      tensor1d_add0.d0_done = d.done;
+      tensor1d_add0.go = 1'd1;
+      run_tensor1d_add[done] = tensor1d_add0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_fn0;
+      run_tensor1d_multiply;
+      run_tensor1d_add;
     }
   }
 }
+
+
diff --git a/frontends/relay-futil/tests/let3.expect b/frontends/relay-futil/tests/let3.expect
index 819b8da5da..dbaad55558 100644
--- a/frontends/relay-futil/tests/let3.expect
+++ b/frontends/relay-futil/tests/let3.expect
@@ -1,152 +1,274 @@
 import "primitives/std.lib";
 
-component sub (a_out: 32, b_out: 32, in_done: 1) -> (in_write_data: 32, in_write_en: 1, in_addr0: 1) {
+component tensor1d_multiply(go: 1, clk: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32, d0_done: 1, e0_read_data: 32, e0_done: 1) -> (done: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1, d0_addr0: 1, d0_write_data: 32, d0_write_en: 1, d0_clk: 1, e0_addr0: 1, e0_write_data: 32, e0_write_en: 1, e0_clk: 1) {
   cells {
-    sub = prim std_sub(32);
-    c0 = prim std_const(1, 0);
+    add0 = prim std_add(1);
+    bin_read0_0 = prim std_reg(32);
+    c_read0_0 = prim std_reg(32);
+    const0 = prim std_const(1, 0);
+    const1 = prim std_const(1, 0);
+    const2 = prim std_const(1, 1);
+    d_read0_0 = prim std_reg(32);
+    i0 = prim std_reg(1);
+    le0 = prim std_le(1);
+    mult_pipe0 = prim std_mult_pipe(32);
   }
   wires {
-    group process_sub {
-      in_addr0 = c0.out;
-      sub.left = a_out;
-      sub.right = b_out;
-      in_write_en = 1'd1;
-      in_write_data = sub.out;
-      process_sub[done] = in_done ? 1'd1;
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
     }
-  }
-  control {
-    seq {
-      process_sub;
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
     }
-  }
-}
-component add (c_out: 32, a_out: 32, in_done: 1) -> (in_write_data: 32, in_write_en: 1, in_addr0: 1) {
-  cells {
-    add = prim std_add(32);
-    c0 = prim std_const(1, 0);
-  }
-  wires {
-    group process_add {
-      in_addr0 = c0.out;
-      add.left = c_out;
-      add.right = a_out;
-      in_write_en = 1'd1;
-      in_write_data = add.out;
-      process_add[done] = in_done ? 1'd1;
+    group let1<"static"=4> {
+      bin_read0_0.in = mult_pipe0.out;
+      bin_read0_0.write_en = mult_pipe0.done;
+      let1[done] = bin_read0_0.done;
+      mult_pipe0.left = c_read0_0.out;
+      mult_pipe0.right = d_read0_0.out;
+      mult_pipe0.go = !mult_pipe0.done ? 1'd1;
+    }
+    group upd0<"static"=1> {
+      c_read0_0.write_en = 1'd1;
+      c0_addr0 = i0.out;
+      c_read0_0.in = 1'd1 ? c0_read_data;
+      upd0[done] = c_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      d_read0_0.write_en = 1'd1;
+      d0_addr0 = i0.out;
+      d_read0_0.in = 1'd1 ? d0_read_data;
+      upd1[done] = d_read0_0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      e0_addr0 = i0.out;
+      e0_write_en = 1'd1;
+      e0_write_data = 1'd1 ? bin_read0_0.out;
+      upd2[done] = e0_done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      i0.write_en = 1'd1;
+      add0.left = i0.out;
+      add0.right = const2.out;
+      i0.in = 1'd1 ? add0.out;
+      upd3[done] = i0.done ? 1'd1;
     }
   }
+
   control {
     seq {
-      process_add;
+      let0;
+      while le0.out with cond0 {
+        seq {
+          par {
+            upd0;
+            upd1;
+          }
+          let1;
+          upd2;
+          upd3;
+        }
+      }
     }
   }
 }
-component mult (c_out: 32, d_out: 32, in_done: 1) -> (in_write_data: 32, in_write_en: 1, in_addr0: 1) {
+component tensor1d_add(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32, d0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1, d0_addr0: 1, d0_write_data: 32, d0_write_en: 1, d0_clk: 1) {
   cells {
-    mult = prim std_mult(32);
-    c0 = prim std_const(1, 0);
+    a_read0_0 = prim std_reg(32);
+    add0 = prim std_add(32);
+    add1 = prim std_add(1);
+    c_read0_0 = prim std_reg(32);
+    const0 = prim std_const(1, 0);
+    const1 = prim std_const(1, 0);
+    const2 = prim std_const(1, 1);
+    i0 = prim std_reg(1);
+    le0 = prim std_le(1);
   }
   wires {
-    group process_mult {
-      in_addr0 = c0.out;
-      mult.left = c_out;
-      mult.right = d_out;
-      in_write_en = 1'd1;
-      in_write_data = mult.out;
-      process_mult[done] = in_done ? 1'd1;
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
+    }
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
+    }
+    group upd0<"static"=1> {
+      c_read0_0.write_en = 1'd1;
+      c0_addr0 = i0.out;
+      c_read0_0.in = 1'd1 ? c0_read_data;
+      upd0[done] = c_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      a_read0_0.write_en = 1'd1;
+      a0_addr0 = i0.out;
+      a_read0_0.in = 1'd1 ? a0_read_data;
+      upd1[done] = a_read0_0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      d0_addr0 = i0.out;
+      d0_write_en = 1'd1;
+      add0.left = c_read0_0.out;
+      add0.right = a_read0_0.out;
+      d0_write_data = 1'd1 ? add0.out;
+      upd2[done] = d0_done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      i0.write_en = 1'd1;
+      add1.left = i0.out;
+      add1.right = const2.out;
+      i0.in = 1'd1 ? add1.out;
+      upd3[done] = i0.done ? 1'd1;
     }
   }
+
   control {
     seq {
-      process_mult;
+      let0;
+      while le0.out with cond0 {
+        seq {
+          par {
+            upd0;
+            upd1;
+          }
+          upd2;
+          upd3;
+        }
+      }
     }
   }
 }
-component function0 (a_out: 32, b_out: 32, in_done: 1) -> (in_write_data: 32, in_write_en: 1, in_addr0: 1) {
+component tensor1d_subtract(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, b0_read_data: 32, b0_done: 1, c0_read_data: 32, c0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, b0_addr0: 1, b0_write_data: 32, b0_write_en: 1, b0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1) {
   cells {
-    mult_fn = mult;
-    add_fn = add;
-    sub_fn = sub;
-    c = prim std_mem_d1(32, 1, 1);
-    d = prim std_mem_d1(32, 1, 1);
-    e = prim std_mem_d1(32, 1, 1);
-    c0 = prim std_const(1, 0);
+    a_read0_0 = prim std_reg(32);
+    add0 = prim std_add(1);
+    b_read0_0 = prim std_reg(32);
+    const0 = prim std_const(1, 0);
+    const1 = prim std_const(1, 0);
+    const2 = prim std_const(1, 1);
+    i0 = prim std_reg(1);
+    le0 = prim std_le(1);
+    sub0 = prim std_sub(32);
   }
   wires {
-    group run_sub_fn {
-      sub_fn.a_out = a_out;
-      sub_fn.b_out = b_out;
-      c.write_data = sub_fn.in_write_data;
-      c.write_en = sub_fn.in_write_en;
-      c.addr0 = sub_fn.in_addr0;
-      sub_fn.in_done = c.done;
-      sub_fn.go = 1'd1;
-      run_sub_fn[done] = sub_fn.done ? 1'd1;
-    }
-    group run_add_fn {
-      add_fn.c_out = c.read_data;
-      add_fn.a_out = a_out;
-      d.write_data = add_fn.in_write_data;
-      d.write_en = add_fn.in_write_en;
-      d.addr0 = add_fn.in_addr0;
-      add_fn.in_done = d.done;
-      add_fn.go = 1'd1;
-      run_add_fn[done] = add_fn.done ? 1'd1;
-    }
-    group run_mult_fn {
-      mult_fn.c_out = c.read_data;
-      mult_fn.d_out = d.read_data;
-      e.write_data = mult_fn.in_write_data;
-      e.write_en = mult_fn.in_write_en;
-      e.addr0 = mult_fn.in_addr0;
-      mult_fn.in_done = e.done;
-      mult_fn.go = 1'd1;
-      run_mult_fn[done] = mult_fn.done ? 1'd1;
-    }
-    group save_return_value {
-      e.addr0 = c0.out;
-      in_addr0 = c0.out;
-      in_write_en = 1'd1;
-      in_write_data = e.read_data;
-      save_return_value[done] = in_done ? 1'd1;
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
+    }
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
+    }
+    group upd0<"static"=1> {
+      a_read0_0.write_en = 1'd1;
+      a0_addr0 = i0.out;
+      a_read0_0.in = 1'd1 ? a0_read_data;
+      upd0[done] = a_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      b_read0_0.write_en = 1'd1;
+      b0_addr0 = i0.out;
+      b_read0_0.in = 1'd1 ? b0_read_data;
+      upd1[done] = b_read0_0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      c0_addr0 = i0.out;
+      c0_write_en = 1'd1;
+      sub0.left = a_read0_0.out;
+      sub0.right = b_read0_0.out;
+      c0_write_data = 1'd1 ? sub0.out;
+      upd2[done] = c0_done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      i0.write_en = 1'd1;
+      add0.left = i0.out;
+      add0.right = const2.out;
+      i0.in = 1'd1 ? add0.out;
+      upd3[done] = i0.done ? 1'd1;
     }
   }
+
   control {
     seq {
-      run_sub_fn;
-      run_add_fn;
-      run_mult_fn;
-      save_return_value;
+      let0;
+      while le0.out with cond0 {
+        seq {
+          par {
+            upd0;
+            upd1;
+          }
+          upd2;
+          upd3;
+        }
+      }
     }
   }
 }
 
 component main () -> () {
   cells {
-    fn0 = function0;
-    c0 = prim std_const(1, 0);
-    main_ret = prim std_mem_d1(32, 1, 1);
+    e = prim std_mem_d1(32, 1, 1);
+    c = prim std_mem_d1(32, 1, 1);
+    d = prim std_mem_d1(32, 1, 1);
+    tensor1d_multiply0 = tensor1d_multiply;
     a = prim std_mem_d1(32, 1, 1);
+    tensor1d_add0 = tensor1d_add;
     b = prim std_mem_d1(32, 1, 1);
+    tensor1d_subtract0 = tensor1d_subtract;
   }
   wires {
-    group run_fn0 {
-      fn0.a_out = a.read_data;
-      a.addr0 = fn0.in_addr0;
-      fn0.b_out = b.read_data;
-      b.addr0 = fn0.in_addr0;
-      main_ret.addr0 = fn0.in_addr0;
-      main_ret.write_data = fn0.in_write_data;
-      main_ret.write_en = fn0.in_write_en;
-      fn0.in_done = main_ret.done;
-      fn0.go = 1'd1;
-      run_fn0[done] = fn0.done ? 1'd1;
+    group run_tensor1d_subtract {
+      a.addr0 = tensor1d_subtract0.a0_addr0;
+      tensor1d_subtract0.a0_read_data = a.read_data;
+      b.addr0 = tensor1d_subtract0.b0_addr0;
+      tensor1d_subtract0.b0_read_data = b.read_data;
+      c.addr0 = tensor1d_subtract0.c0_addr0;
+      c.write_data = tensor1d_subtract0.c0_write_data;
+      c.write_en = tensor1d_subtract0.c0_write_en;
+      tensor1d_subtract0.c0_done = c.done;
+      tensor1d_subtract0.go = 1'd1;
+      run_tensor1d_subtract[done] = tensor1d_subtract0.done ? 1'd1;
+    }
+    group run_tensor1d_add {
+      c.addr0 = tensor1d_add0.c0_addr0;
+      tensor1d_add0.c0_read_data = c.read_data;
+      a.addr0 = tensor1d_add0.a0_addr0;
+      tensor1d_add0.a0_read_data = a.read_data;
+      d.addr0 = tensor1d_add0.d0_addr0;
+      d.write_data = tensor1d_add0.d0_write_data;
+      d.write_en = tensor1d_add0.d0_write_en;
+      tensor1d_add0.d0_done = d.done;
+      tensor1d_add0.go = 1'd1;
+      run_tensor1d_add[done] = tensor1d_add0.done ? 1'd1;
+    }
+    group run_tensor1d_multiply {
+      c.addr0 = tensor1d_multiply0.c0_addr0;
+      tensor1d_multiply0.c0_read_data = c.read_data;
+      d.addr0 = tensor1d_multiply0.d0_addr0;
+      tensor1d_multiply0.d0_read_data = d.read_data;
+      e.addr0 = tensor1d_multiply0.e0_addr0;
+      e.write_data = tensor1d_multiply0.e0_write_data;
+      e.write_en = tensor1d_multiply0.e0_write_en;
+      tensor1d_multiply0.e0_done = e.done;
+      tensor1d_multiply0.go = 1'd1;
+      run_tensor1d_multiply[done] = tensor1d_multiply0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_fn0;
+      run_tensor1d_subtract;
+      run_tensor1d_add;
+      run_tensor1d_multiply;
     }
   }
 }
+
+
diff --git a/frontends/relay-futil/tests/sub.expect b/frontends/relay-futil/tests/sub.expect
index d4fb4c7691..ef4f0d69cc 100644
--- a/frontends/relay-futil/tests/sub.expect
+++ b/frontends/relay-futil/tests/sub.expect
@@ -1,84 +1,100 @@
 import "primitives/std.lib";
 
-component sub (x_out: 32, y_out: 32, in_done: 1) -> (in_write_data: 32, in_write_en: 1, in_addr0: 1) {
+component tensor1d_subtract(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, y0_read_data: 32, y0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, x0_addr0: 1, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, y0_addr0: 1, y0_write_data: 32, y0_write_en: 1, y0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
   cells {
-    sub = prim std_sub(32);
-    c0 = prim std_const(1, 0);
+    add0 = prim std_add(1);
+    const0 = prim std_const(1, 0);
+    const1 = prim std_const(1, 0);
+    const2 = prim std_const(1, 1);
+    i0 = prim std_reg(1);
+    le0 = prim std_le(1);
+    sub0 = prim std_sub(32);
+    x_read0_0 = prim std_reg(32);
+    y_read0_0 = prim std_reg(32);
   }
   wires {
-    group process_sub {
-      in_addr0 = c0.out;
-      sub.left = x_out;
-      sub.right = y_out;
-      in_write_en = 1'd1;
-      in_write_data = sub.out;
-      process_sub[done] = in_done ? 1'd1;
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
     }
-  }
-  control {
-    seq {
-      process_sub;
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
     }
-  }
-}
-component function0 (x_out: 32, y_out: 32, in_done: 1) -> (in_write_data: 32, in_write_en: 1, in_addr0: 1) {
-  cells {
-    sub_fn = sub;
-    z = prim std_mem_d1(32, 1, 1);
-    c0 = prim std_const(1, 0);
-  }
-  wires {
-    group run_sub_fn {
-      sub_fn.x_out = x_out;
-      sub_fn.y_out = y_out;
-      z.write_data = sub_fn.in_write_data;
-      z.write_en = sub_fn.in_write_en;
-      z.addr0 = sub_fn.in_addr0;
-      sub_fn.in_done = z.done;
-      sub_fn.go = 1'd1;
-      run_sub_fn[done] = sub_fn.done ? 1'd1;
+    group upd0<"static"=1> {
+      x_read0_0.write_en = 1'd1;
+      x0_addr0 = i0.out;
+      x_read0_0.in = 1'd1 ? x0_read_data;
+      upd0[done] = x_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      y_read0_0.write_en = 1'd1;
+      y0_addr0 = i0.out;
+      y_read0_0.in = 1'd1 ? y0_read_data;
+      upd1[done] = y_read0_0.done ? 1'd1;
     }
-    group save_return_value {
-      z.addr0 = c0.out;
-      in_addr0 = c0.out;
-      in_write_en = 1'd1;
-      in_write_data = z.read_data;
-      save_return_value[done] = in_done ? 1'd1;
+    group upd2<"static"=1> {
+      z0_addr0 = i0.out;
+      z0_write_en = 1'd1;
+      sub0.left = x_read0_0.out;
+      sub0.right = y_read0_0.out;
+      z0_write_data = 1'd1 ? sub0.out;
+      upd2[done] = z0_done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      i0.write_en = 1'd1;
+      add0.left = i0.out;
+      add0.right = const2.out;
+      i0.in = 1'd1 ? add0.out;
+      upd3[done] = i0.done ? 1'd1;
     }
   }
+
   control {
     seq {
-      run_sub_fn;
-      save_return_value;
+      let0;
+      while le0.out with cond0 {
+        seq {
+          par {
+            upd0;
+            upd1;
+          }
+          upd2;
+          upd3;
+        }
+      }
     }
   }
 }
 
 component main () -> () {
   cells {
-    fn0 = function0;
-    c0 = prim std_const(1, 0);
-    main_ret = prim std_mem_d1(32, 1, 1);
+    z = prim std_mem_d1(32, 1, 1);
     x = prim std_mem_d1(32, 1, 1);
     y = prim std_mem_d1(32, 1, 1);
+    tensor1d_subtract0 = tensor1d_subtract;
   }
   wires {
-    group run_fn0 {
-      fn0.x_out = x.read_data;
-      x.addr0 = fn0.in_addr0;
-      fn0.y_out = y.read_data;
-      y.addr0 = fn0.in_addr0;
-      main_ret.addr0 = fn0.in_addr0;
-      main_ret.write_data = fn0.in_write_data;
-      main_ret.write_en = fn0.in_write_en;
-      fn0.in_done = main_ret.done;
-      fn0.go = 1'd1;
-      run_fn0[done] = fn0.done ? 1'd1;
+    group run_tensor1d_subtract {
+      x.addr0 = tensor1d_subtract0.x0_addr0;
+      tensor1d_subtract0.x0_read_data = x.read_data;
+      y.addr0 = tensor1d_subtract0.y0_addr0;
+      tensor1d_subtract0.y0_read_data = y.read_data;
+      z.addr0 = tensor1d_subtract0.z0_addr0;
+      z.write_data = tensor1d_subtract0.z0_write_data;
+      z.write_en = tensor1d_subtract0.z0_write_en;
+      tensor1d_subtract0.z0_done = z.done;
+      tensor1d_subtract0.go = 1'd1;
+      run_tensor1d_subtract[done] = tensor1d_subtract0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_fn0;
+      run_tensor1d_subtract;
     }
   }
 }
+
+
diff --git a/frontends/relay-futil/tests/tensor2d_add.expect b/frontends/relay-futil/tests/tensor2d_add.expect
new file mode 100644
index 0000000000..052d44971c
--- /dev/null
+++ b/frontends/relay-futil/tests/tensor2d_add.expect
@@ -0,0 +1,135 @@
+import "primitives/std.lib";
+
+component tensor2d_add(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1, y0_0_read_data: 32, y0_0_done: 1) -> (done: 1, x0_0_addr0: 2, x0_0_addr1: 3, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 2, x10_0_addr1: 3, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1, y0_0_addr0: 2, y0_0_addr1: 3, y0_0_write_data: 32, y0_0_write_en: 1, y0_0_clk: 1) {
+  cells {
+    add0 = prim std_add(32);
+    add1 = prim std_add(3);
+    add2 = prim std_add(2);
+    const0 = prim std_const(2, 0);
+    const1 = prim std_const(2, 1);
+    const2 = prim std_const(3, 0);
+    const3 = prim std_const(3, 3);
+    const4 = prim std_const(3, 1);
+    const5 = prim std_const(2, 1);
+    i0 = prim std_reg(2);
+    j0 = prim std_reg(3);
+    le0 = prim std_le(2);
+    le1 = prim std_le(3);
+    x_read0_0 = prim std_reg(32);
+    y_read0_0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = j0.out;
+      le1.right = const3.out;
+    }
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
+    }
+    group let1<"static"=1> {
+      j0.in = const2.out;
+      j0.write_en = 1'd1;
+      let1[done] = j0.done;
+    }
+    group upd0<"static"=1> {
+      x_read0_0.write_en = 1'd1;
+      x0_0_addr1 = j0.out;
+      x0_0_addr0 = i0.out;
+      x_read0_0.in = 1'd1 ? x0_0_read_data;
+      upd0[done] = x_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      y_read0_0.write_en = 1'd1;
+      y0_0_addr1 = j0.out;
+      y0_0_addr0 = i0.out;
+      y_read0_0.in = 1'd1 ? y0_0_read_data;
+      upd1[done] = y_read0_0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      x10_0_addr1 = j0.out;
+      x10_0_addr0 = i0.out;
+      x10_0_write_en = 1'd1;
+      add0.left = x_read0_0.out;
+      add0.right = y_read0_0.out;
+      x10_0_write_data = 1'd1 ? add0.out;
+      upd2[done] = x10_0_done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      j0.write_en = 1'd1;
+      add1.left = j0.out;
+      add1.right = const4.out;
+      j0.in = 1'd1 ? add1.out;
+      upd3[done] = j0.done ? 1'd1;
+    }
+    group upd4<"static"=1> {
+      i0.write_en = 1'd1;
+      add2.left = i0.out;
+      add2.right = const5.out;
+      i0.in = 1'd1 ? add2.out;
+      upd4[done] = i0.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      while le0.out with cond0 {
+        seq {
+          let1;
+          while le1.out with cond1 {
+            seq {
+              par {
+                upd0;
+                upd1;
+              }
+              upd2;
+              upd3;
+            }
+          }
+          upd4;
+        }
+      }
+    }
+  }
+}
+
+component main () -> () {
+  cells {
+    x1 = prim std_mem_d2(32, 2, 4, 2, 3);
+    x = prim std_mem_d2(32, 2, 4, 2, 3);
+    y = prim std_mem_d2(32, 2, 4, 2, 3);
+    tensor2d_add0 = tensor2d_add;
+  }
+  wires {
+    group run_tensor2d_add {
+      x.addr0 = tensor2d_add0.x0_0_addr0;
+      tensor2d_add0.x0_0_read_data = x.read_data;
+      x.addr1 = tensor2d_add0.x0_0_addr1;
+      y.addr0 = tensor2d_add0.y0_0_addr0;
+      tensor2d_add0.y0_0_read_data = y.read_data;
+      y.addr1 = tensor2d_add0.y0_0_addr1;
+      x1.addr0 = tensor2d_add0.x10_0_addr0;
+      x1.addr1 = tensor2d_add0.x10_0_addr1;
+      x1.write_data = tensor2d_add0.x10_0_write_data;
+      x1.write_en = tensor2d_add0.x10_0_write_en;
+      tensor2d_add0.x10_0_done = x1.done;
+      tensor2d_add0.go = 1'd1;
+      run_tensor2d_add[done] = tensor2d_add0.done ? 1'd1;
+    }
+  }
+  control {
+    seq {
+      run_tensor2d_add;
+    }
+  }
+}
+
+
diff --git a/frontends/relay-futil/tests/tensor2d_add.relay b/frontends/relay-futil/tests/tensor2d_add.relay
new file mode 100644
index 0000000000..9aba018f7c
--- /dev/null
+++ b/frontends/relay-futil/tests/tensor2d_add.relay
@@ -0,0 +1,6 @@
+v0.0.4
+fn (%x: Tensor[(2, 4), int32], %y: Tensor[(2, 4), int32]) {
+  let %x1 = add(%x, %y);
+  %x1
+}
+
diff --git a/frontends/relay-futil/tests/tensor3d_batch_flatten.expect b/frontends/relay-futil/tests/tensor3d_batch_flatten.expect
new file mode 100644
index 0000000000..d647914125
--- /dev/null
+++ b/frontends/relay-futil/tests/tensor3d_batch_flatten.expect
@@ -0,0 +1,166 @@
+import "primitives/std.lib";
+
+component tensor3d_batch_flatten(go: 1, clk: 1, x0_0_0_read_data: 32, x0_0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1) -> (done: 1, x0_0_0_addr0: 1, x0_0_0_addr1: 2, x0_0_0_addr2: 2, x0_0_0_write_data: 32, x0_0_0_write_en: 1, x0_0_0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 3, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1) {
+  cells {
+    add0 = prim std_add(3);
+    add1 = prim std_add(2);
+    add2 = prim std_add(2);
+    add3 = prim std_add(1);
+    const0 = prim std_const(3, 0);
+    const1 = prim std_const(1, 0);
+    const10 = prim std_const(1, 1);
+    const2 = prim std_const(1, 0);
+    const3 = prim std_const(2, 0);
+    const4 = prim std_const(2, 1);
+    const5 = prim std_const(2, 0);
+    const6 = prim std_const(2, 1);
+    const7 = prim std_const(3, 1);
+    const8 = prim std_const(2, 1);
+    const9 = prim std_const(2, 1);
+    i0 = prim std_reg(1);
+    j0 = prim std_reg(2);
+    k0 = prim std_reg(2);
+    l_0 = prim std_reg(3);
+    le0 = prim std_le(1);
+    le1 = prim std_le(2);
+    le2 = prim std_le(2);
+    x_read0_0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const2.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = j0.out;
+      le1.right = const4.out;
+    }
+    group cond2<"static"=0> {
+      cond2[done] = 1'd1;
+      le2.left = k0.out;
+      le2.right = const6.out;
+    }
+    group let0<"static"=1> {
+      l_0.in = const0.out;
+      l_0.write_en = 1'd1;
+      let0[done] = l_0.done;
+    }
+    group let1<"static"=1> {
+      i0.in = const1.out;
+      i0.write_en = 1'd1;
+      let1[done] = i0.done;
+    }
+    group let2<"static"=1> {
+      j0.in = const3.out;
+      j0.write_en = 1'd1;
+      let2[done] = j0.done;
+    }
+    group let3<"static"=1> {
+      k0.in = const5.out;
+      k0.write_en = 1'd1;
+      let3[done] = k0.done;
+    }
+    group upd0<"static"=1> {
+      x_read0_0.write_en = 1'd1;
+      x0_0_0_addr2 = k0.out;
+      x0_0_0_addr1 = j0.out;
+      x0_0_0_addr0 = i0.out;
+      x_read0_0.in = 1'd1 ? x0_0_0_read_data;
+      upd0[done] = x_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      x10_0_addr1 = l_0.out;
+      x10_0_addr0 = i0.out;
+      x10_0_write_en = 1'd1;
+      x10_0_write_data = 1'd1 ? x_read0_0.out;
+      upd1[done] = x10_0_done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      l_0.write_en = 1'd1;
+      add0.left = l_0.out;
+      add0.right = const7.out;
+      l_0.in = 1'd1 ? add0.out;
+      upd2[done] = l_0.done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      k0.write_en = 1'd1;
+      add1.left = k0.out;
+      add1.right = const8.out;
+      k0.in = 1'd1 ? add1.out;
+      upd3[done] = k0.done ? 1'd1;
+    }
+    group upd4<"static"=1> {
+      j0.write_en = 1'd1;
+      add2.left = j0.out;
+      add2.right = const9.out;
+      j0.in = 1'd1 ? add2.out;
+      upd4[done] = j0.done ? 1'd1;
+    }
+    group upd5<"static"=1> {
+      i0.write_en = 1'd1;
+      add3.left = i0.out;
+      add3.right = const10.out;
+      i0.in = 1'd1 ? add3.out;
+      upd5[done] = i0.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      let1;
+      while le0.out with cond0 {
+        seq {
+          let2;
+          while le1.out with cond1 {
+            seq {
+              let3;
+              while le2.out with cond2 {
+                seq {
+                  upd0;
+                  upd1;
+                  upd2;
+                  upd3;
+                }
+              }
+              upd4;
+            }
+          }
+          upd5;
+        }
+      }
+    }
+  }
+}
+
+component main () -> () {
+  cells {
+    x1 = prim std_mem_d2(32, 1, 4, 1, 3);
+    x = prim std_mem_d3(32, 1, 2, 2, 1, 2, 2);
+    tensor3d_batch_flatten0 = tensor3d_batch_flatten;
+  }
+  wires {
+    group run_tensor3d_batch_flatten {
+      x.addr0 = tensor3d_batch_flatten0.x0_0_0_addr0;
+      tensor3d_batch_flatten0.x0_0_0_read_data = x.read_data;
+      x.addr1 = tensor3d_batch_flatten0.x0_0_0_addr1;
+      x.addr2 = tensor3d_batch_flatten0.x0_0_0_addr2;
+      x1.addr0 = tensor3d_batch_flatten0.x10_0_addr0;
+      x1.addr1 = tensor3d_batch_flatten0.x10_0_addr1;
+      x1.write_data = tensor3d_batch_flatten0.x10_0_write_data;
+      x1.write_en = tensor3d_batch_flatten0.x10_0_write_en;
+      tensor3d_batch_flatten0.x10_0_done = x1.done;
+      tensor3d_batch_flatten0.go = 1'd1;
+      run_tensor3d_batch_flatten[done] = tensor3d_batch_flatten0.done ? 1'd1;
+    }
+  }
+  control {
+    seq {
+      run_tensor3d_batch_flatten;
+    }
+  }
+}
+
+
diff --git a/frontends/relay-futil/tests/tensor3d_batch_flatten.relay b/frontends/relay-futil/tests/tensor3d_batch_flatten.relay
new file mode 100644
index 0000000000..2a5e223fec
--- /dev/null
+++ b/frontends/relay-futil/tests/tensor3d_batch_flatten.relay
@@ -0,0 +1,6 @@
+v0.0.4
+fn (%x: Tensor[(1, 2, 2), int32]) -> Tensor[(1, 4), int32] {
+  let %x1: Tensor[(1, 4), int32] = nn.batch_flatten(%x);
+  %x1
+}
+
diff --git a/frontends/relay-futil/utilities.py b/frontends/relay-futil/utilities.py
index 51f63c392b..8d34c080e5 100644
--- a/frontends/relay-futil/utilities.py
+++ b/frontends/relay-futil/utilities.py
@@ -1,5 +1,6 @@
 from futil_ast import *
 from itertools import chain
+import math
 
 
 def flatten(l):
@@ -22,218 +23,70 @@ def get_bitwidth(type):
     '''
     t = str(type)
     assert t[0:3] == 'int' or t[0:5] == 'float', f'{t} is not supported.'
-    begin = 3 if t[0:3] == 'int' else 5 # 'float'
+    begin = 3 if t[0:3] == 'int' else 5  # 'float'
     return int(t[begin:len(t)])
 
 
-def extract_function_arguments(args):
+def get_memory_parameters(type):
     '''
-    Extracts the arguments from a function as port definitions.
+    Acquires the memory parameters necessary to create a FuTIL memory primitive.
     '''
-    inputs = []
-    outputs = []
-    for arg in args:
-        name = arg.name_hint
-        bitwidth = get_bitwidth(arg.type_annotation)
-        out_port = f'{name}_out'
-        inputs.append(FPortDef(name=out_port, bitwidth=bitwidth))
-    inputs.append(FPortDef(name="in_done", bitwidth = 1))
-
-    write_data_port = f'in_write_data'
-    write_enable_port = f'in_write_en'
-    addr0_port = f'in_addr0'
-
-    outputs.append(FPortDef(name=write_data_port, bitwidth=bitwidth))
-    # TODO(cgyurgyik): Let's instead add a begin and end index. If begin == end, we can assume its 0D.
-    outputs.append(FPortDef(name=write_enable_port, bitwidth=1))
-    outputs.append(FPortDef(name=addr0_port, bitwidth=1))  # FIXME: Hardcoded for 0D tensors.
-    return inputs, outputs
-
-
-def build_main_body(c: FComponent):
-    '''
-    Builds the main function that will take the last function and run it.
-    '''
-    for cell in reversed(c.cells):
-        if cell.is_declaration():
-            bitwidth = cell.declaration.component.signature.outputs[0].bitwidth
-            inputs = cell.declaration.component.signature.inputs
-            outputs = cell.declaration.component.signature.outputs
-            function_name = cell.declaration.name
-            break
-
-    index = 0
-    cst = FCell(primitive=FPrimitive(name=f'c{index}', data=[1, index], type=PrimitiveType.Constant))
-    c.add_cell(cst)
-    ret = FCell(primitive=FPrimitive(name=f'{c.name}_ret', data=[32, 1, 1], type=PrimitiveType.Memory1D))
-    c.add_cell(ret)
+    t = str(type)
+    if t[0:3] == 'int' or t[0:5] == 'float':
+        return [get_bitwidth(type), 1, 1], PrimitiveType.Memory1D
+    assert t[0:6] == 'Tensor', f'{type} is not currently supported.'
 
-    input_arguments = []
-    for i in range(0, len(inputs) - 1):
-        input_name = (inputs[i].name).split('_')[0]
-        input_arguments.append(input_name)
-        c.add_cell(FCell(primitive=FPrimitive(name=input_name, data=[bitwidth, 1, 1], type=PrimitiveType.Memory1D)))
+    string_type = t[t.find(")") + 3:t.find("]")]
+    string_dimensions = t[t.find("(") + 1:t.find(")")]
 
-    group_name = f'run_{function_name}'
-    write_data_port = outputs[0].name
-    write_enable_port = outputs[1].name
-    addr0_port = outputs[2].name
+    tensor_dimensions = list(map(int, string_dimensions.split(',')))
+    data = [get_bitwidth(string_type)]
+    for dimension in tensor_dimensions: data.append(dimension)  # Size.
+    for dimension in tensor_dimensions: data.append(int(math.log2(dimension) + 1))  # Index size.
 
-    wires = []
-    for i in range(0, len(input_arguments)):
-        # Build connections for input arguments.
-        wires.append(FWire(f'{function_name}.{inputs[i].name}', f'{input_arguments[i]}.read_data'))
-        wires.append(FWire(f'{input_arguments[i]}.addr0', f'{function_name}.{addr0_port}'))
+    if len(tensor_dimensions) == 2:
+        type = PrimitiveType.Memory2D
+    elif len(tensor_dimensions) == 3:
+        type = PrimitiveType.Memory3D
+    return data, type
 
-    wires.append(FWire(f'{c.name}_ret.addr0', f'{function_name}.{addr0_port}'))
-    wires.append(FWire(f'{c.name}_ret.write_data', f'{function_name}.{write_data_port}'))
-    wires.append(FWire(f'{c.name}_ret.write_en', f'{function_name}.{write_enable_port}'))
-    wires.append(FWire(f'{function_name}.in_done', f'{ret.primitive.name}.done'))
-    wires.append(FWire(f'{function_name}.go', "1'd1"))
-    wires.append(FWire(f'{group_name}[done]', f'{function_name}.done ? ' + "1'd1"))
 
-    c.wires = [FConnection(group=FGroup(name=group_name, wires=wires, attributes=[]))]
-    connections = list(filter(lambda w: w.is_group(), c.wires))
-    c.controls = [Seq(stmts=list(map(lambda w: w.group.name, connections)))]
-
-
-def build_function_body(c: FComponent):
-    '''
-    Builds the body of the relay function. This is done by building function declarations,
-    and connecting them with wires.
-    '''
-    declarations = []
+def build_main(c: FComponent):
+    dahlia_declarations = []
     for cell in reversed(c.cells):
-        if cell.is_declaration():
-            declarations.append(cell.declaration)
-
-    for declaration in declarations:
-        intermediary_output = declaration.intermediary_output
-        c.add_cell(declaration.intermediary_output)
-        bitwidth = declaration.component.signature.outputs[0].bitwidth
-        inputs = declaration.component.signature.inputs
-        outputs = declaration.component.signature.outputs
-        function_name = declaration.name
-        group_name = f'run_{function_name}'
-        write_data_port = outputs[0].name
-        write_enable_port = outputs[1].name
-        addr0_port = outputs[2].name
-
-        wires = get_input_wires(c, declaration)
-        wires.append(FWire(f'{intermediary_output.primitive.name}.write_data', f'{function_name}.{write_data_port}'))
-        wires.append(FWire(f'{intermediary_output.primitive.name}.write_en', f'{function_name}.{write_enable_port}'))
-        wires.append(FWire(f'{intermediary_output.primitive.name}.addr0', f'{function_name}.{addr0_port}'))
-        wires.append(FWire(f'{function_name}.{inputs[-1].name}', f'{intermediary_output.primitive.name}.done'))
-        wires.append(FWire(f'{function_name}.go', "1'd1"))
-        wires.append(FWire(f'{group_name}[done]', f'{function_name}.done ? ' + "1'd1"))
+        if cell.is_dahlia_declaration():
+            dahlia_declarations.append(cell.dahlia_declaration)
+
+    for declaration in dahlia_declarations:
+        inputs = declaration.inputs
+        wires = []
+        group_name = f'run_{declaration.component_name}'
+        for input in flatten(inputs):
+            prim = input.primitive
+            wires.append(FWire(f'{prim.name}.addr0', f'{declaration.decl_name}.{input.dahlia_name}_addr0'))
+            wires.append(
+                FWire(f'{declaration.decl_name}.{input.dahlia_name}_read_data', f'{prim.name}.read_data'))
+            if not prim.type == PrimitiveType.Memory2D and not prim.type == PrimitiveType.Memory3D: continue
+            wires.append(FWire(f'{prim.name}.addr1', f'{declaration.decl_name}.{input.dahlia_name}_addr1'))
+            if not prim.type == PrimitiveType.Memory3D: continue
+            wires.append(FWire(f'{prim.name}.addr2', f'{declaration.decl_name}.{input.dahlia_name}_addr2'))
+
+        output = declaration.output
+        wires.append(FWire(f'{output.primitive.name}.addr0', f'{declaration.decl_name}.{output.dahlia_name}_addr0'))
+        if output.primitive.type == PrimitiveType.Memory2D or output.primitive.type == PrimitiveType.Memory3D:
+            wires.append(FWire(f'{output.primitive.name}.addr1', f'{declaration.decl_name}.{output.dahlia_name}_addr1'))
+        if output.primitive.type == PrimitiveType.Memory3D:
+            wires.append(FWire(f'{output.primitive.name}.addr2', f'{declaration.decl_name}.{output.dahlia_name}_addr2'))
+
+        wires.append(
+            FWire(f'{output.primitive.name}.write_data', f'{declaration.decl_name}.{output.dahlia_name}_write_data'))
+        wires.append(
+            FWire(f'{output.primitive.name}.write_en', f'{declaration.decl_name}.{output.dahlia_name}_write_en'))
+        wires.append(FWire(f'{declaration.decl_name}.{output.dahlia_name}_done', f'{output.primitive.name}.done'))
+        wires.append(FWire(f'{declaration.decl_name}.go', "1'd1"))
+        wires.append(FWire(f'{group_name}[done]', f"{declaration.decl_name}.done ? 1'd1"))
         c.wires.append(FConnection(group=FGroup(name=group_name, wires=wires, attributes=[])))
 
-    last = declarations[len(declarations) - 1].intermediary_output
-    build_return_connections(c, last)
-
     # Ensures that only group names make it into the Controls of a component.
     connections = list(filter(lambda w: w.is_group(), c.wires))
     c.controls = [Seq(stmts=list(map(lambda w: w.group.name, connections)))]
-    return c
-
-
-def get_input_wires(comp: FComponent, decl: FDeclaration):
-    '''
-    Produces the appropriate input wires for a declaration 'decl' within component 'c'.
-    This is necessary to avoid re-creating cells for intermediary inputs that
-    already exist. For example,
-
-    fn(%a, %b) {
-      let %c = multiply(%a, %b); // %a, %b already exist.
-      let %d = add(%a, %c);      // %c is an intermediary.
-    }
-    '''
-    function_name = decl.name
-    decl_inputs = decl.component.signature.inputs
-    intermediary_inputs = flatten(decl.intermediary_inputs)
-
-    finalized_inputs = []
-    # Determines whether an input is either an actual input of a previous function or an intermediary input.
-    # TODO(cgyurgyik): Clean this up once finalized, and use appropriate data structures.
-    for input in intermediary_inputs:
-        found = False
-        for cell in comp.cells:
-            if not cell.is_primitive() or cell.primitive.name != input.primitive.name: continue
-            found = True
-            finalized_inputs.append(f'{input.primitive.name}.read_data')
-            break
-        if not found:
-            finalized_inputs.append(f'{input.primitive.name}_out')
-
-    wires = []
-    for i in range(0, len(decl_inputs) - 1):
-        # Build connections for input arguments.
-        wires.append(FWire(f'{function_name}.{decl_inputs[i].name}', f'{finalized_inputs[i]}'))
-    return wires
-
-
-def build_return_connections(comp: FComponent, intermediary_output: FCell):
-    '''
-    Given a component `comp` and the final intermediary output `intermediary_output`, Creates a group to save the value in main.
-    Example:
-        Relay Function:
-        fn (%a, %b) {
-          let %c = add(%a, %b);
-          %c
-        }
-        This will create the group (and corresponding wires) to connect `c` to the return value in `main`.
-    '''
-    inputs = comp.signature.inputs
-    outputs = comp.signature.outputs
-    intermediary_output_name = intermediary_output.primitive.name
-
-    index = primitive = FPrimitive(name="c0", data=[1, 0], type=PrimitiveType.Constant)
-    comp.add_cell(FCell(primitive=index))
-
-    group_name = "save_return_value"
-    wires = []
-    wires.append(FWire(f'{intermediary_output_name}.addr0', f'{index.name}.out'))
-    wires.append(FWire(f'in_addr0', f'{index.name}.out'))
-    wires.append(FWire(f'in_write_en', "1'd1"))
-    wires.append(FWire(f'in_write_data', f'{intermediary_output_name}.read_data'))
-    wires.append(FWire(f'{group_name}[done]', f'{inputs[-1].name} ? ' + "1'd1"))
-    comp.wires.append((FConnection(group=FGroup(name=group_name, wires=wires, attributes=[]))))
-
-
-def build_tensor_0D_binary_op(call, args, op_name: str):
-    '''
-    Builds the component for a 0D tensor (scalar) binary operation.
-    '''
-    comp: FComponent = FComponent(name=op_name, cells=[], wires=[],
-                               signature=FSignature(inputs=[], outputs=[]))
-    inputs, outputs = extract_function_arguments(call.args)
-    comp.signature.inputs = inputs
-    comp.signature.outputs = outputs
-
-    op = op_name
-    assert inputs[0].bitwidth == inputs[1].bitwidth, \
-        f'Port definitions have different bitwidths for BinOp: {inputs[0].bitwidth}, {inputs[1].bitwidth}'
-
-    cst = FCell(primitive=FPrimitive(name="c0", data=[inputs[-1].bitwidth, 0], type=PrimitiveType.Constant))
-    adder = FCell(primitive=FPrimitive(name=op, data=[inputs[0].bitwidth, op_name], type=PrimitiveType.BinOp))
-    comp.add_cell(adder)
-    comp.add_cell(cst)
-
-    write_data_port = outputs[0].name
-    write_en_port = outputs[1].name
-    addr0_port = outputs[2].name
-
-    group_name = f'process_{op_name}'
-    wires = []
-    wires.append(FWire(addr0_port, f'{cst.primitive.name}.out'))
-    wires.append(FWire(f'{op}.left', inputs[0].name))
-    wires.append(FWire(f'{op}.right', inputs[1].name))
-    wires.append(FWire(write_en_port, "1'd1"))
-    wires.append(FWire(write_data_port, f'{op}.out'))
-
-    wires.append(FWire(f'{group_name}[done]', f'{inputs[-1].name} ? ' + "1'd1"))
-
-    connections = [FConnection(group=FGroup(name=group_name, wires=wires, attributes=[]))]
-    comp.wires = connections
-    comp.controls = [Seq(stmts=list(map(lambda w: w.group.name, connections)))]
-    return FCell(declaration=FDeclaration(name=op_name + "_fn", component=comp, intermediary_inputs=args))

From 981920213e35dd5ad00306f5f52a173560b54738 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Thu, 5 Nov 2020 12:01:55 -0500
Subject: [PATCH 02/75] cleanup.

---
 frontends/relay-futil/compiler.py  | 2 +-
 frontends/relay-futil/utilities.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index b10932c239..033506cc1d 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -156,7 +156,7 @@ def compile(program) -> str:
     MAIN = visitor.visit(program)
     DAHLIA_COMPONENTS = '\n'.join(visitor.dahlia_components)
     NEWL = "\n\n"
-    return f'{PREAMBLE}{NEWL}{DAHLIA_COMPONENTS}{NEWL}{MAIN}{NEWL}'
+    return f'{PREAMBLE}{NEWL}{DAHLIA_COMPONENTS}{NEWL}{MAIN}'
 
 
 if __name__ == '__main__':
diff --git a/frontends/relay-futil/utilities.py b/frontends/relay-futil/utilities.py
index 8d34c080e5..89adfec513 100644
--- a/frontends/relay-futil/utilities.py
+++ b/frontends/relay-futil/utilities.py
@@ -54,8 +54,8 @@ def get_memory_parameters(type):
 def build_main(c: FComponent):
     dahlia_declarations = []
     for cell in reversed(c.cells):
-        if cell.is_dahlia_declaration():
-            dahlia_declarations.append(cell.dahlia_declaration)
+        if not cell.is_dahlia_declaration(): continue
+        dahlia_declarations.append(cell.dahlia_declaration)
 
     for declaration in dahlia_declarations:
         inputs = declaration.inputs

From 9e83fa17d09f12843c9b891e671492eadfe5047b Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Thu, 5 Nov 2020 14:50:34 -0500
Subject: [PATCH 03/75] Add temporary file use.

---
 frontends/relay-futil/dahlia_functions.py     | 32 +++++++++----------
 frontends/relay-futil/tests/add.expect        |  2 --
 frontends/relay-futil/tests/let1.expect       |  2 --
 frontends/relay-futil/tests/let2.expect       |  2 --
 frontends/relay-futil/tests/let3.expect       |  2 --
 frontends/relay-futil/tests/sub.expect        |  2 --
 .../relay-futil/tests/tensor2d_add.expect     |  2 --
 .../tests/tensor3d_batch_flatten.expect       |  2 --
 8 files changed, 16 insertions(+), 30 deletions(-)

diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 36873c3106..5989d9af6c 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -1,5 +1,6 @@
 import subprocess
 
+from tempfile import NamedTemporaryFile, TemporaryFile
 from futil_ast import *
 
 
@@ -23,24 +24,23 @@ def lower_dahlia_program(prog, component_name):
                      I've explicitly removed errors with `2>/dev/null` so they aren't inserted
                      to the file as well. However, this makes debugging difficult as well.
     '''
-    program_string = "\""
+    program_string = ""
     for line in prog.splitlines():
         program_string += f'{line}\n'
-    program_string += "\""
-    no_err = "2>/dev/null"
-    command = \
-        f"""
-        echo {program_string} > temp.fuse &&
-        /Users/cgyurgyik/Projects/dahlia/fuse temp.fuse --lower -b=futil -n={component_name} > lowered.futil {no_err} -l error &&
-        cd ../../ &&
-        cargo run -- frontends/relay-futil/lowered.futil -p externalize > frontends/relay-futil/temp.futil {no_err} &&
-        cd frontends/relay-futil/ 
-        """
-    subprocess.Popen(command, stdout=subprocess.PIPE, shell=True).communicate()
-    dahlia_component = open('temp.futil', 'r').read()[29:]  # Skip over importing the primitives library.
-    subprocess.Popen("rm temp.fuse ; rm lowered.futil ; rm temp.futil", stdout=subprocess.PIPE,
-                     shell=True).communicate()
-    return dahlia_component
+
+    with NamedTemporaryFile() as tf0, NamedTemporaryFile() as tf1, NamedTemporaryFile() as tf2:
+        tf0.seek(0)
+        tf1.seek(0)
+        tf2.seek(0)
+        tf0.write(bytes(program_string, 'UTF-8'))
+        no_err = "2>/dev/null"
+        command = f"""
+            /Users/cgyurgyik/Projects/dahlia/fuse {tf0.name} --lower -b=futil -n={component_name} > {tf1.name} {no_err}
+            && cd ../../ && cargo run -- {tf1.name} -p externalize > {tf2.name} {no_err} 
+            """
+        subprocess.Popen(command, stdout=subprocess.PIPE, shell=True).communicate()
+        dahlia_component = open(tf2.name, 'r').read()[29:]  # Skip over importing the primitives library.
+        return dahlia_component
 
 
 def tensor1d_op(declaration):
diff --git a/frontends/relay-futil/tests/add.expect b/frontends/relay-futil/tests/add.expect
index c02365332b..a67d257997 100644
--- a/frontends/relay-futil/tests/add.expect
+++ b/frontends/relay-futil/tests/add.expect
@@ -96,5 +96,3 @@ component main () -> () {
     }
   }
 }
-
-
diff --git a/frontends/relay-futil/tests/let1.expect b/frontends/relay-futil/tests/let1.expect
index 4e82ded37e..77312716ef 100644
--- a/frontends/relay-futil/tests/let1.expect
+++ b/frontends/relay-futil/tests/let1.expect
@@ -104,5 +104,3 @@ component main () -> () {
     }
   }
 }
-
-
diff --git a/frontends/relay-futil/tests/let2.expect b/frontends/relay-futil/tests/let2.expect
index 903681d9a8..88da5412ff 100644
--- a/frontends/relay-futil/tests/let2.expect
+++ b/frontends/relay-futil/tests/let2.expect
@@ -187,5 +187,3 @@ component main () -> () {
     }
   }
 }
-
-
diff --git a/frontends/relay-futil/tests/let3.expect b/frontends/relay-futil/tests/let3.expect
index dbaad55558..8885ca1785 100644
--- a/frontends/relay-futil/tests/let3.expect
+++ b/frontends/relay-futil/tests/let3.expect
@@ -270,5 +270,3 @@ component main () -> () {
     }
   }
 }
-
-
diff --git a/frontends/relay-futil/tests/sub.expect b/frontends/relay-futil/tests/sub.expect
index ef4f0d69cc..9cac092744 100644
--- a/frontends/relay-futil/tests/sub.expect
+++ b/frontends/relay-futil/tests/sub.expect
@@ -96,5 +96,3 @@ component main () -> () {
     }
   }
 }
-
-
diff --git a/frontends/relay-futil/tests/tensor2d_add.expect b/frontends/relay-futil/tests/tensor2d_add.expect
index 052d44971c..46db3a2cab 100644
--- a/frontends/relay-futil/tests/tensor2d_add.expect
+++ b/frontends/relay-futil/tests/tensor2d_add.expect
@@ -131,5 +131,3 @@ component main () -> () {
     }
   }
 }
-
-
diff --git a/frontends/relay-futil/tests/tensor3d_batch_flatten.expect b/frontends/relay-futil/tests/tensor3d_batch_flatten.expect
index d647914125..b04fd2d7b3 100644
--- a/frontends/relay-futil/tests/tensor3d_batch_flatten.expect
+++ b/frontends/relay-futil/tests/tensor3d_batch_flatten.expect
@@ -162,5 +162,3 @@ component main () -> () {
     }
   }
 }
-
-

From 1afa5a179faa6ae372c23d475f8024b721eb6b8c Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Thu, 5 Nov 2020 16:34:50 -0500
Subject: [PATCH 04/75] Now only need changes in two places when adding a
 Dahlia function.

---
 frontends/relay-futil/compiler.py         | 42 ++++++++++-------------
 frontends/relay-futil/dahlia_functions.py | 14 ++++----
 frontends/relay-futil/futil_ast.py        | 14 ++++----
 3 files changed, 32 insertions(+), 38 deletions(-)

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index 033506cc1d..23d51b25d8 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -35,6 +35,7 @@ def relay_id(self, name):
         Relay does not explicitly differentiate a variable name if it is used twice. For example,
         %x  = foo(%y);
         %x1 = bar(%x); // Here, at this level, the name_hint associated with `x1` is still 'x'.
+
         To avoid this, we provide Relay with its own identification dictionary. If 'x' is seen
         three times, it will produce: 'x', 'x1', x2'.
         """
@@ -56,24 +57,28 @@ def produce_dahlia_name(self, name, type):
         if type == PrimitiveType.Memory3D: return dahlia_name + "_0_0"
         assert False, f'{name} with {type} is not supported yet.'
 
-    def get_dahlia_function_type(self, function_name, input_type):
+    def get_dahlia_declaration(self, function_name, cells, args):
         """
         Returns the corresponding name, Dahlia function type, and op (if it is a binary op, otherwise None).
         If the function type isn't supported, fails with an assertion.
         """
-        op = None
+        input_type = cells[0].primitive.type
+        function = name = op = None
         if function_name in BuiltInBinaryCalls:
             op = BuiltInBinaryCalls[function_name]
             if input_type == PrimitiveType.Memory1D:
-                return self.relay_id(f'tensor1d_{function_name}'), DahliaFunctionType.Tensor1DBinaryOp, op
+                name = self.relay_id(f'tensor1d_{function_name}')
+                function = tensor1d_op
             if input_type == PrimitiveType.Memory2D:
-                return self.relay_id(f'tensor2d_{function_name}'), DahliaFunctionType.Tensor2DBinaryOp, op
-
+                name = self.relay_id(f'tensor2d_{function_name}')
+                function = tensor2d_op
         if function_name == "nn.batch_flatten":
             assert input_type == PrimitiveType.Memory3D, f'{input_type} not supported for batch flattening.'
-            return self.relay_id(f'tensor3d_batch_flatten'), DahliaFunctionType.Tensor3DBatchFlatten, op
+            function = tensor3d_batch_flatten
+            name = self.relay_id(f'{function.__name__}')
 
-        assert False, f'{function_name} with {input_type} is not supported.'
+        assert function != None and name != None, f'{function_name} with type {input_type} is not supported.'
+        return DahliaDeclaration(component_name=name, decl_name=self.id(name), op=op, inputs=args, function=function)
 
     def visit_var(self, var):
         name = self.relay_id(var.name_hint)
@@ -91,16 +96,8 @@ def visit_let(self, let):
         output = variable[0]
         for value in flatten(values):
             if not value.is_dahlia_declaration(): continue
-            decl = value.dahlia_declaration
-            decl.output = output
-            # TODO(cgyurgyik): This shouldn't be necessary. To simplify, produce mapping
-            #                  between enum and corresponding function.
-            if decl.type == DahliaFunctionType.Tensor1DBinaryOp:
-                decl.program = tensor1d_op(decl)
-            elif decl.type == DahliaFunctionType.Tensor2DBinaryOp:
-                decl.program = tensor2d_op(decl)
-            elif decl.type == DahliaFunctionType.Tensor3DBatchFlatten:
-                decl.program = tensor3d_batch_flatten(decl)
+            value.dahlia_declaration.output = output
+            value.dahlia_declaration.invoke()
         return [body, values]
 
     def visit_constant(self, const):
@@ -114,14 +111,11 @@ def visit_call(self, call):
         cells = []
         args = []
         for arg in call.args:
-            result = self.visit(arg)
-            cells.append(result)
-            args.append(result)
+            argument = self.visit(arg)
+            cells.append(argument)
+            args.append(argument)
         cells = flatten(cells)
-        name, type, op = self.get_dahlia_function_type(call.op.name, cells[0].primitive.type)
-        dahlia_declaration = DahliaDeclaration(component_name=name, decl_name=self.id(name), op=op,
-                                               inputs=flatten(args), type=type)
-        cells.append(FCell(dahlia_declaration=dahlia_declaration))
+        cells.append(FCell(dahlia_declaration=self.get_dahlia_declaration(call.op.name, cells, flatten(args))))
         return cells
 
     def visit_function(self, function):
diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 5989d9af6c..4d401bfe3e 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -3,6 +3,9 @@
 from tempfile import NamedTemporaryFile, TemporaryFile
 from futil_ast import *
 
+IMPORT_STATEMENT = """import "primitives/std.lib";\n"""
+NO_ERR = "2>/dev/null"
+
 
 def lower_dahlia_program(prog, component_name):
     '''
@@ -29,18 +32,17 @@ def lower_dahlia_program(prog, component_name):
         program_string += f'{line}\n'
 
     with NamedTemporaryFile() as tf0, NamedTemporaryFile() as tf1, NamedTemporaryFile() as tf2:
+        tf0.write(bytes(program_string, 'UTF-8'))
         tf0.seek(0)
         tf1.seek(0)
         tf2.seek(0)
-        tf0.write(bytes(program_string, 'UTF-8'))
-        no_err = "2>/dev/null"
         command = f"""
-            /Users/cgyurgyik/Projects/dahlia/fuse {tf0.name} --lower -b=futil -n={component_name} > {tf1.name} {no_err}
-            && cd ../../ && cargo run -- {tf1.name} -p externalize > {tf2.name} {no_err} 
+            /Users/cgyurgyik/Projects/dahlia/fuse {tf0.name} --lower -b=futil -n={component_name} > {tf1.name} \
+            {NO_ERR} && cd ../../ && cargo run -- {tf1.name} -p externalize > {tf2.name} {NO_ERR} 
             """
         subprocess.Popen(command, stdout=subprocess.PIPE, shell=True).communicate()
-        dahlia_component = open(tf2.name, 'r').read()[29:]  # Skip over importing the primitives library.
-        return dahlia_component
+        component = open(tf2.name, 'r').read()[len(IMPORT_STATEMENT):]  # Skip over importing the primitives library.
+        return component
 
 
 def tensor1d_op(declaration):
diff --git a/frontends/relay-futil/futil_ast.py b/frontends/relay-futil/futil_ast.py
index dbdd21e666..61afa9d236 100644
--- a/frontends/relay-futil/futil_ast.py
+++ b/frontends/relay-futil/futil_ast.py
@@ -1,14 +1,9 @@
 from dataclasses import dataclass
 from typing import List, Dict
+from types import FunctionType
 from enum import Enum
 
 
-class DahliaFunctionType(Enum):
-    Tensor1DBinaryOp = 1
-    Tensor2DBinaryOp = 2
-    Tensor3DBatchFlatten = 3
-
-
 class PrimitiveType(Enum):
     Register = 1
     Constant = 2
@@ -142,11 +137,14 @@ def add_cell(self, subcomponent: Cell):
 class DahliaDeclaration:
     decl_name: str
     component_name: str
-    type: DahliaFunctionType
     op: str = None
-    program: str = None
     inputs: List[Cell] = None
     output: Cell = None
+    function: FunctionType = None
+    program: str = None
+
+    def invoke(self):
+        self.program = self.function(self)
 
 
 @dataclass

From 18a8361fc4c2530e2c0cba8ef7c24a973976df11 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Thu, 5 Nov 2020 18:29:09 -0500
Subject: [PATCH 05/75] Assume Dahlia is on path.

---
 frontends/relay-futil/README.md           |  8 ++++----
 frontends/relay-futil/compiler.py         | 11 ++++++-----
 frontends/relay-futil/dahlia_functions.py |  8 ++++----
 3 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/frontends/relay-futil/README.md b/frontends/relay-futil/README.md
index 0136ec5436..11d60d8a82 100644
--- a/frontends/relay-futil/README.md
+++ b/frontends/relay-futil/README.md
@@ -7,9 +7,7 @@ This is an in-progress compiler from [TVM][]'s intermediate representation, [Rel
 Installation
 ------------
 
-You will need to install TVM—and we depend on the latest source (unreleased changes for 0.7). There are [official instructions][tvm-install], but these might work for you:
-
-1. Clone the TVM repository (success was once attained with revision `ccacb1ec1`):
+1. Clone the TVM repository at commit hash `ccacb1ec1`:
 
         git clone --recursive git@github.com:apache/incubator-tvm.git
         cd incubator-tvm && git reset --hard ccacb1ec1
@@ -19,7 +17,7 @@ You will need to install TVM—and we depend on the latest source (unreleased ch
         mkdir build && cd build
         cp ../cmake/config.cmake .
 
-4. Build (takes about 9 minutes on my MacBook Pro):
+4. Build TVM:
 
         cmake -G Ninja .. && ninja
 
@@ -35,6 +33,8 @@ You will need to install TVM—and we depend on the latest source (unreleased ch
         python3 setup.py bdist_wheel
         pip3 install --user dist/topi-*.whl
 
+7. Install [Dahlia](https://github.com/cucapra/dahlia#set-it-up), which is used when lowering from Relay to FuTIL.
+The `fuse` executable is expected to be on your path.
 
 Run an Example
 --------------
diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index 23d51b25d8..e125a14d7c 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -64,18 +64,19 @@ def get_dahlia_declaration(self, function_name, cells, args):
         """
         input_type = cells[0].primitive.type
         function = name = op = None
+
         if function_name in BuiltInBinaryCalls:
             op = BuiltInBinaryCalls[function_name]
             if input_type == PrimitiveType.Memory1D:
-                name = self.relay_id(f'tensor1d_{function_name}')
                 function = tensor1d_op
+                name = self.relay_id(f'tensor1d_{function_name}')
             if input_type == PrimitiveType.Memory2D:
-                name = self.relay_id(f'tensor2d_{function_name}')
                 function = tensor2d_op
+                name = self.relay_id(f'tensor2d_{function_name}')
         if function_name == "nn.batch_flatten":
-            assert input_type == PrimitiveType.Memory3D, f'{input_type} not supported for batch flattening.'
-            function = tensor3d_batch_flatten
-            name = self.relay_id(f'{function.__name__}')
+            if input_type == PrimitiveType.Memory3D:
+                function = tensor3d_batch_flatten
+                name = self.relay_id(f'{function.__name__}')
 
         assert function != None and name != None, f'{function_name} with type {input_type} is not supported.'
         return DahliaDeclaration(component_name=name, decl_name=self.id(name), op=op, inputs=args, function=function)
diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 4d401bfe3e..f4e853a05b 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -28,16 +28,16 @@ def lower_dahlia_program(prog, component_name):
                      to the file as well. However, this makes debugging difficult as well.
     '''
     program_string = ""
-    for line in prog.splitlines():
-        program_string += f'{line}\n'
+    for line in prog.splitlines(): program_string += f'{line}\n'
 
     with NamedTemporaryFile() as tf0, NamedTemporaryFile() as tf1, NamedTemporaryFile() as tf2:
         tf0.write(bytes(program_string, 'UTF-8'))
         tf0.seek(0)
         tf1.seek(0)
         tf2.seek(0)
-        command = f"""
-            /Users/cgyurgyik/Projects/dahlia/fuse {tf0.name} --lower -b=futil -n={component_name} > {tf1.name} \
+        command = \
+            f"""
+            fuse {tf0.name} --lower -b=futil -n={component_name} > {tf1.name} \
             {NO_ERR} && cd ../../ && cargo run -- {tf1.name} -p externalize > {tf2.name} {NO_ERR} 
             """
         subprocess.Popen(command, stdout=subprocess.PIPE, shell=True).communicate()

From 3ae22eed973fcde3ee181b926a14c4b1083bf22d Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Thu, 5 Nov 2020 18:41:43 -0500
Subject: [PATCH 06/75] Add DAHLIA_EXEC environment variable.

---
 frontends/relay-futil/README.md           |  3 ++-
 frontends/relay-futil/dahlia_functions.py | 28 +++++++----------------
 2 files changed, 10 insertions(+), 21 deletions(-)

diff --git a/frontends/relay-futil/README.md b/frontends/relay-futil/README.md
index 11d60d8a82..38ed6ba368 100644
--- a/frontends/relay-futil/README.md
+++ b/frontends/relay-futil/README.md
@@ -34,7 +34,8 @@ Installation
         pip3 install --user dist/topi-*.whl
 
 7. Install [Dahlia](https://github.com/cucapra/dahlia#set-it-up), which is used when lowering from Relay to FuTIL.
-The `fuse` executable is expected to be on your path.
+The `fuse` executable is expected to be on your path. Alternatively, it will check to see if the environment variable
+`$DAHLIA_EXEC` is set. 
 
 Run an Example
 --------------
diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index f4e853a05b..fe16e71d98 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -1,4 +1,5 @@
 import subprocess
+import os
 
 from tempfile import NamedTemporaryFile, TemporaryFile
 from futil_ast import *
@@ -9,23 +10,9 @@
 
 def lower_dahlia_program(prog, component_name):
     '''
-    Takes in a string that represents a Dahlia program, lowers it to FuTIL, and applies the `externalize` pass.
-    This is just for experimental purposes, and needs to be replaced.
-    More bluntly, this does the following:
-    1. Copies dahlia program `prog` to a temporary file `temp.fuse`.
-       $ echo `program_string` > temp.fuse
-
-    2. Lowers `temp.fuse` to FuTIL with the name changed to `component_name`, and saves it in `lowered.futil`.
-       $ ./fuse temp.fuse --lower -b=futil -n=component_name > lowered.futil
-
-    3. Runs the 'externalize' pass on the `lowered.futil` file.
-       $ cargo run -- lowered.futil -p externalize > temp.futil
-
-    4. Copies the output from `lowered.futil`, except for the first line (we don't want another copy of the import).
-
-    TODO(cgyurgyik): As you'll see below, this only works on my local machine.
-                     I've explicitly removed errors with `2>/dev/null` so they aren't inserted
-                     to the file as well. However, this makes debugging difficult as well.
+    Takes in a string representation of a Dahlia program, lowers it to FuTIL with the given `component_name`,
+    and applies the `externalize` pass. This pass exposes the inputs and outputs of primitive types that are
+    declared external, e.g. `std_mem_d1_ext`, and places them in the inputs and outputs of the component.
     '''
     program_string = ""
     for line in prog.splitlines(): program_string += f'{line}\n'
@@ -35,13 +22,14 @@ def lower_dahlia_program(prog, component_name):
         tf0.seek(0)
         tf1.seek(0)
         tf2.seek(0)
+        fuse_binary = os.environ['DAHLIA_EXEC'] if 'DAHLIA_EXEC' in os.environ else 'fuse'
         command = \
             f"""
-            fuse {tf0.name} --lower -b=futil -n={component_name} > {tf1.name} \
-            {NO_ERR} && cd ../../ && cargo run -- {tf1.name} -p externalize > {tf2.name} {NO_ERR} 
+            {fuse_binary} {tf0.name} --lower -b=futil -n={component_name} > {tf1.name} {NO_ERR} \
+            && cd ../../ && cargo run -- {tf1.name} -p externalize > {tf2.name} {NO_ERR} 
             """
         subprocess.Popen(command, stdout=subprocess.PIPE, shell=True).communicate()
-        component = open(tf2.name, 'r').read()[len(IMPORT_STATEMENT):]  # Skip over importing the primitives library.
+        component = tf2.read().decode()[len(IMPORT_STATEMENT):]  # Skip over importing the primitives library.
         return component
 
 

From dc96f853261953378b6270c85c334e471acd4b04 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Thu, 5 Nov 2020 18:48:52 -0500
Subject: [PATCH 07/75] cleanup.

---
 frontends/relay-futil/compiler.py  | 3 ++-
 frontends/relay-futil/futil_ast.py | 9 +++------
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index e125a14d7c..7aa7321589 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -25,6 +25,7 @@ def __init__(self):
     def id(self, name):
         """
         Provides a unique identification for a given name.
+        For example, if 'a' is seen three times, it will produce: 'a0', 'a1', 'a2'.
         """
         id_number = self.id_dictionary[name]
         self.id_dictionary[name] += 1
@@ -150,7 +151,7 @@ def compile(program) -> str:
     PREAMBLE = """import "primitives/std.lib";"""
     MAIN = visitor.visit(program)
     DAHLIA_COMPONENTS = '\n'.join(visitor.dahlia_components)
-    NEWL = "\n\n"
+    NEWL = '\n\n'
     return f'{PREAMBLE}{NEWL}{DAHLIA_COMPONENTS}{NEWL}{MAIN}'
 
 
diff --git a/frontends/relay-futil/futil_ast.py b/frontends/relay-futil/futil_ast.py
index 61afa9d236..cef66aef0a 100644
--- a/frontends/relay-futil/futil_ast.py
+++ b/frontends/relay-futil/futil_ast.py
@@ -165,11 +165,8 @@ class FCell(Cell):
     declaration: FDeclaration = None
     dahlia_declaration: DahliaDeclaration = None
 
-    def is_primitive(self):
-        return self.primitive != None
+    def is_primitive(self): return self.primitive != None
 
-    def is_declaration(self):
-        return self.declaration != None
+    def is_declaration(self): return self.declaration != None
 
-    def is_dahlia_declaration(self):
-        return self.dahlia_declaration != None
+    def is_dahlia_declaration(self): return self.dahlia_declaration != None

From c8088eb2541613e1bec8b192880c46d082390e12 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Thu, 5 Nov 2020 20:28:03 -0500
Subject: [PATCH 08/75] Cleanup.

---
 frontends/relay-futil/README.md | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/frontends/relay-futil/README.md b/frontends/relay-futil/README.md
index 38ed6ba368..848f4abd3d 100644
--- a/frontends/relay-futil/README.md
+++ b/frontends/relay-futil/README.md
@@ -23,19 +23,16 @@ Installation
 
 5. Install the `tvm` Python package by building a [wheel][]:
 
-        cd ../python
-        python3 setup.py bdist_wheel
+        cd ../python && python3 setup.py bdist_wheel
         pip3 install --user dist/tvm-*.whl
 
 6. Install the accompanying `topi` Python package:
 
-        cd ../topi/python
-        python3 setup.py bdist_wheel
+        cd ../topi/python && python3 setup.py bdist_wheel
         pip3 install --user dist/topi-*.whl
 
-7. Install [Dahlia](https://github.com/cucapra/dahlia#set-it-up), which is used when lowering from Relay to FuTIL.
-The `fuse` executable is expected to be on your path. Alternatively, it will check to see if the environment variable
-`$DAHLIA_EXEC` is set. 
+7. Install [Dahlia][], which is used when lowering from Relay to FuTIL.
+The `fuse` executable is expected to be on your path. Alternatively, set the environment variable `$DAHLIA_EXEC` to the `fuse` executable that should be used instead.
 
 Run an Example
 --------------
@@ -63,7 +60,7 @@ The Relay text format parser requires ANTLR, so also do this:
 
 Then, just type `runt` to run the tests.
 
-
+[dahlia]: https://github.com/cucapra/dahlia#set-it-up
 [tvm]: https://tvm.apache.org
 [tvm-install]: https://tvm.apache.org/docs/install/from_source.html#developers-get-source-from-github
 [relay]: https://tvm.apache.org/docs/api/python/relay/index.html

From 7d753704c748b21ebef4b1eec0df67157277e590 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Fri, 6 Nov 2020 18:16:44 -0500
Subject: [PATCH 09/75] Add incorrect batch_matmul.

---
 frontends/relay-futil/compiler.py             |   3 +
 frontends/relay-futil/dahlia_functions.py     |  85 +++----
 frontends/relay-futil/example.py              |  10 +-
 .../relay-futil/tests/batch_matmul.expect     | 218 ++++++++++++++++++
 .../relay-futil/tests/batch_matmul.relay      |   6 +
 .../tests/data/batch_matmul.expect            | 161 +++++++++++++
 .../relay-futil/tests/data/batch_matmul.relay |   5 +
 .../tests/data/batch_matmul.relay.data        |  14 ++
 8 files changed, 461 insertions(+), 41 deletions(-)
 create mode 100644 frontends/relay-futil/tests/batch_matmul.expect
 create mode 100644 frontends/relay-futil/tests/batch_matmul.relay
 create mode 100644 frontends/relay-futil/tests/data/batch_matmul.expect
 create mode 100644 frontends/relay-futil/tests/data/batch_matmul.relay
 create mode 100644 frontends/relay-futil/tests/data/batch_matmul.relay.data

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index 7aa7321589..5f922522ae 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -78,6 +78,9 @@ def get_dahlia_declaration(self, function_name, cells, args):
             if input_type == PrimitiveType.Memory3D:
                 function = tensor3d_batch_flatten
                 name = self.relay_id(f'{function.__name__}')
+        elif function_name == "nn.batch_matmul":
+            function = batch_matmul
+            name = self.relay_id(f'{function.__name__}')
 
         assert function != None and name != None, f'{function_name} with type {input_type} is not supported.'
         return DahliaDeclaration(component_name=name, decl_name=self.id(name), op=op, inputs=args, function=function)
diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index fe16e71d98..450e914185 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -12,7 +12,7 @@ def lower_dahlia_program(prog, component_name):
     '''
     Takes in a string representation of a Dahlia program, lowers it to FuTIL with the given `component_name`,
     and applies the `externalize` pass. This pass exposes the inputs and outputs of primitive types that are
-    declared external, e.g. `std_mem_d1_ext`, and places them in the inputs and outputs of the component.
+    declared external, e.g. `std_mem_d1_ext`, and places them in the inputs and outputs of the respective component.
     '''
     program_string = ""
     for line in prog.splitlines(): program_string += f'{line}\n'
@@ -34,17 +34,12 @@ def lower_dahlia_program(prog, component_name):
 
 
 def tensor1d_op(declaration):
-    op1 = declaration.inputs[0].primitive
-    op2 = declaration.inputs[1].primitive
-    res = declaration.output.primitive
+    op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
 
     assert op1.type == PrimitiveType.Memory1D and op1.type == op2.type and op2.type == res.type
-    assert op1.data[0] == op2.data[0] and op1.data[0] == res.data[0]
-    assert op1.data[1] == op2.data[1] and op2.data[1] == res.data[1]
-    assert op1.data[2] == op2.data[2] and op2.data[2] == res.data[2]
-    bitwidth = op1.data[0]
-    size = op1.data[1]
-    index_size = op1.data[2]
+    assert op1.data[0] == op2.data[0] and op1.data[0] == res.data[0] and op1.data[1] == op2.data[1]
+    assert op1.data[2] == op2.data[2] and op2.data[2] == res.data[2] and op2.data[1] == res.data[1]
+    bitwidth, size, index_size = op1.data[0], op1.data[1], op1.data[2]
     return lower_dahlia_program(f"""
     decl {op1.name}: ubit<{bitwidth}>[{size}];
     decl {op2.name}: ubit<{bitwidth}>[{size}];
@@ -55,22 +50,13 @@ def tensor1d_op(declaration):
 
 
 def tensor2d_op(declaration):
-    op1 = declaration.inputs[0].primitive
-    op2 = declaration.inputs[1].primitive
-    res = declaration.output.primitive
-
+    op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
+    bitwidth, size0, size1, index_size0, index_size1 = op1.data[0], op1.data[1], op1.data[2], op1.data[3], op1.data[4]
     assert op1.type == PrimitiveType.Memory2D and op1.type == op2.type and op2.type == res.type
-    assert op1.data[0] == op2.data[0] and op1.data[0] == res.data[0]
-    assert op1.data[1] == op2.data[1] and op2.data[1] == res.data[1]
-    assert op1.data[2] == op2.data[2] and op2.data[2] == res.data[2]
-    assert op1.data[3] == op2.data[3] and op2.data[3] == res.data[3]
-    assert op1.data[4] == op2.data[4] and op2.data[4] == res.data[4]
-
-    bitwidth = op1.data[0]
-    size0 = op1.data[1]
-    size1 = op1.data[2]
-    index_size0 = op1.data[3]
-    index_size1 = op1.data[4]
+    assert bitwidth == op2.data[0] and op1.data[0] == res.data[0] and op2.data[4] == res.data[4]
+    assert size0 == op2.data[1] and op2.data[1] == res.data[1] and size1 == op2.data[2] and op2.data[2] == res.data[2]
+    assert index_size0 == op2.data[3] and op2.data[3] == res.data[3] and index_size1 == op2.data[4]
+
     return lower_dahlia_program(f"""
     decl {op1.name}: ubit<{bitwidth}>[{size0}][{size1}];
     decl {op2.name}: ubit<{bitwidth}>[{size0}][{size1}];
@@ -83,21 +69,12 @@ def tensor2d_op(declaration):
 
 
 def tensor3d_batch_flatten(declaration):
-    op1 = declaration.inputs[0].primitive
-    res = declaration.output.primitive
-
-    bitwidth = op1.data[0]
-    op1_size0 = op1.data[1]
-    op1_size1 = op1.data[2]
-    op1_size2 = op1.data[3]
-    op1_index_size0 = op1.data[4]
-    op1_index_size1 = op1.data[5]
-    op1_index_size2 = op1.data[6]
-    res_bitwidth = res.data[0]
-    res_size0 = res.data[1]
-    res_size1 = res.data[2]
-    res_index_size0 = res.data[3]
-    res_index_size1 = res.data[4]
+    """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.batch_flatten"""
+    op1, res = declaration.inputs[0].primitive, declaration.output.primitive
+    bitwidth, op1_size0, op1_size1, op1_size2 = op1.data[0], op1.data[1], op1.data[2], op1.data[3]
+    op1_index_size0, op1_index_size1, op1_index_size2 = op1.data[4], op1.data[5], op1.data[6]
+    res_bitwidth, res_size0, res_size1 = res.data[0], res.data[1], res.data[2]
+    res_index_size0, res_index_size1 = res.data[3], res.data[4]
 
     assert op1.type == PrimitiveType.Memory3D and res_size1 == op1_size1 * op1_size2 and res_size0 == op1_size0
     assert res.type == PrimitiveType.Memory2D and res_bitwidth == bitwidth
@@ -113,3 +90,31 @@ def tensor3d_batch_flatten(declaration):
             }}
           }}
         }}""", declaration.component_name)
+
+
+def batch_matmul(declaration):
+    """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.batch_matmul"""
+    assert False, "Unimplemented. nn.batch_matmul currently does not execute properly."
+    op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
+    bitwidth, M1_size0, M1_size1, M1_size2 = op1.data[0], op1.data[1], op1.data[2], op1.data[3]
+    M1_index_size0, M1_index_size1, M1_index_size2 = op1.data[4], op1.data[5], op1.data[6]
+    M2_size0, M2_size1, M2_size2 = op2.data[1], op2.data[2], op2.data[3]
+    M2_index_size0, M2_index_size1, M2_index_size2 = op2.data[4], op2.data[5], op2.data[6]
+    assert op1.type == PrimitiveType.Memory3D and op1.type == op2.type and op2.type == res.type
+    assert M2_size1 == M1_size2 and bitwidth == op2.data[0] and M1_size0 == M2_size0
+
+    return lower_dahlia_program(f"""
+    decl {op1.name}: ubit<{bitwidth}>[{M1_size0}][{M1_size1}][{M1_size2}];
+    decl {op2.name}: ubit<{bitwidth}>[{M2_size0}][{M2_size1}][{M2_size2}];
+    decl {res.name}: ubit<{bitwidth}>[{M1_size0}][{M1_size1}][{M2_size2}];
+    for (let i: ubit<{M1_index_size0}> = 0..{M1_size0}) {{
+      for (let j: ubit<{M1_index_size1}> = 0..{M1_size1}) {{
+        for (let k: ubit<{M2_index_size2}> = 0..{M2_size2}) {{
+          for (let l: ubit<{M1_index_size2}> = 0..{M1_size2}) {{
+            let prod = {op1.name}[i][j][l] * {op2.name}[i][l][k];
+          }} combine {{
+            {res.name}[i][j][k] += prod;
+          }}
+        }}
+      }}
+    }}""", declaration.component_name)
diff --git a/frontends/relay-futil/example.py b/frontends/relay-futil/example.py
index 9d0c06d8f8..d5f3d15011 100644
--- a/frontends/relay-futil/example.py
+++ b/frontends/relay-futil/example.py
@@ -27,6 +27,14 @@ def batch_flatten():
     return relay.Function([x], relay.nn.batch_flatten(x))
 
 
+def batch_matmul():
+    """Apply `relay.nn.batch_matmul` to two 3-dimensional int32 tensors in Relay.
+    """
+    x = relay.var("x", relay.TensorType((1, 3, 2), "int32"))
+    y = relay.var("y", relay.TensorType((1, 2, 3), "int32"))
+    return relay.Function([x, y], relay.nn.batch_matmul(x, y))
+
+
 def mlp_net():
     """The MLP test from Relay.
     """
@@ -34,7 +42,7 @@ def mlp_net():
     return mlp.get_net(1)
 
 
-ALL_FUNCS = [add, tensor_add, batch_flatten, mlp_net]
+ALL_FUNCS = [add, tensor_add, batch_flatten, batch_matmul, mlp_net]
 FUNC_NAMES = list(map(lambda x: x.__name__, ALL_FUNCS))
 
 
diff --git a/frontends/relay-futil/tests/batch_matmul.expect b/frontends/relay-futil/tests/batch_matmul.expect
new file mode 100644
index 0000000000..7e2e4c3cc9
--- /dev/null
+++ b/frontends/relay-futil/tests/batch_matmul.expect
@@ -0,0 +1,218 @@
+import "primitives/std.lib";
+
+component batch_matmul(go: 1, clk: 1, a0_0_0_read_data: 32, a0_0_0_done: 1, b0_0_0_read_data: 32, b0_0_0_done: 1, x0_0_0_read_data: 32, x0_0_0_done: 1) -> (done: 1, a0_0_0_addr0: 2, a0_0_0_addr1: 2, a0_0_0_addr2: 2, a0_0_0_write_data: 32, a0_0_0_write_en: 1, a0_0_0_clk: 1, b0_0_0_addr0: 2, b0_0_0_addr1: 2, b0_0_0_addr2: 2, b0_0_0_write_data: 32, b0_0_0_write_en: 1, b0_0_0_clk: 1, x0_0_0_addr0: 2, x0_0_0_addr1: 2, x0_0_0_addr2: 2, x0_0_0_write_data: 32, x0_0_0_write_en: 1, x0_0_0_clk: 1) {
+  cells {
+    a_read0_0 = prim std_reg(32);
+    add0 = prim std_add(32);
+    add1 = prim std_add(2);
+    add2 = prim std_add(2);
+    add3 = prim std_add(2);
+    add4 = prim std_add(2);
+    b_read0_0 = prim std_reg(32);
+    bin_read0_0 = prim std_reg(32);
+    const0 = prim std_const(2, 0);
+    const1 = prim std_const(2, 2);
+    const10 = prim std_const(2, 1);
+    const11 = prim std_const(2, 1);
+    const2 = prim std_const(2, 0);
+    const3 = prim std_const(2, 2);
+    const4 = prim std_const(2, 0);
+    const5 = prim std_const(2, 2);
+    const6 = prim std_const(2, 0);
+    const7 = prim std_const(2, 2);
+    const8 = prim std_const(2, 1);
+    const9 = prim std_const(2, 1);
+    i0 = prim std_reg(2);
+    j0 = prim std_reg(2);
+    k0 = prim std_reg(2);
+    l0 = prim std_reg(2);
+    le0 = prim std_le(2);
+    le1 = prim std_le(2);
+    le2 = prim std_le(2);
+    le3 = prim std_le(2);
+    mult_pipe0 = prim std_mult_pipe(32);
+    prod_0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = j0.out;
+      le1.right = const3.out;
+    }
+    group cond2<"static"=0> {
+      cond2[done] = 1'd1;
+      le2.left = k0.out;
+      le2.right = const5.out;
+    }
+    group cond3<"static"=0> {
+      cond3[done] = 1'd1;
+      le3.left = l0.out;
+      le3.right = const7.out;
+    }
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
+    }
+    group let1<"static"=1> {
+      j0.in = const2.out;
+      j0.write_en = 1'd1;
+      let1[done] = j0.done;
+    }
+    group let2<"static"=1> {
+      k0.in = const4.out;
+      k0.write_en = 1'd1;
+      let2[done] = k0.done;
+    }
+    group let3<"static"=1> {
+      l0.in = const6.out;
+      l0.write_en = 1'd1;
+      let3[done] = l0.done;
+    }
+    group let4<"static"=4> {
+      bin_read0_0.in = mult_pipe0.out;
+      bin_read0_0.write_en = mult_pipe0.done;
+      let4[done] = bin_read0_0.done;
+      mult_pipe0.left = a_read0_0.out;
+      mult_pipe0.right = b_read0_0.out;
+      mult_pipe0.go = !mult_pipe0.done ? 1'd1;
+    }
+    group let5<"static"=1> {
+      prod_0.in = bin_read0_0.out;
+      prod_0.write_en = 1'd1;
+      let5[done] = prod_0.done;
+    }
+    group upd0<"static"=1> {
+      a_read0_0.write_en = 1'd1;
+      a0_0_0_addr2 = l0.out;
+      a0_0_0_addr1 = j0.out;
+      a0_0_0_addr0 = i0.out;
+      a_read0_0.in = 1'd1 ? a0_0_0_read_data;
+      upd0[done] = a_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      b_read0_0.write_en = 1'd1;
+      b0_0_0_addr2 = k0.out;
+      b0_0_0_addr1 = l0.out;
+      b0_0_0_addr0 = i0.out;
+      b_read0_0.in = 1'd1 ? b0_0_0_read_data;
+      upd1[done] = b_read0_0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      x0_0_0_addr2 = k0.out;
+      x0_0_0_addr1 = j0.out;
+      x0_0_0_addr0 = i0.out;
+      x0_0_0_write_en = 1'd1;
+      add0.left = x0_0_0_read_data;
+      add0.right = prod_0.out;
+      x0_0_0_addr2 = k0.out;
+      x0_0_0_addr1 = j0.out;
+      x0_0_0_addr0 = i0.out;
+      x0_0_0_write_data = 1'd1 ? add0.out;
+      upd2[done] = x0_0_0_done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      l0.write_en = 1'd1;
+      add1.left = l0.out;
+      add1.right = const8.out;
+      l0.in = 1'd1 ? add1.out;
+      upd3[done] = l0.done ? 1'd1;
+    }
+    group upd4<"static"=1> {
+      k0.write_en = 1'd1;
+      add2.left = k0.out;
+      add2.right = const9.out;
+      k0.in = 1'd1 ? add2.out;
+      upd4[done] = k0.done ? 1'd1;
+    }
+    group upd5<"static"=1> {
+      j0.write_en = 1'd1;
+      add3.left = j0.out;
+      add3.right = const10.out;
+      j0.in = 1'd1 ? add3.out;
+      upd5[done] = j0.done ? 1'd1;
+    }
+    group upd6<"static"=1> {
+      i0.write_en = 1'd1;
+      add4.left = i0.out;
+      add4.right = const11.out;
+      i0.in = 1'd1 ? add4.out;
+      upd6[done] = i0.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      while le0.out with cond0 {
+        seq {
+          let1;
+          while le1.out with cond1 {
+            seq {
+              let2;
+              while le2.out with cond2 {
+                seq {
+                  let3;
+                  while le3.out with cond3 {
+                    seq {
+                      par {
+                        upd0;
+                        upd1;
+                      }
+                      let4;
+                      let5;
+                      upd2;
+                      upd3;
+                    }
+                  }
+                  upd4;
+                }
+              }
+              upd5;
+            }
+          }
+          upd6;
+        }
+      }
+    }
+  }
+}
+
+component main () -> () {
+  cells {
+    x = prim std_mem_d3(32, 3, 3, 3, 2, 2, 2);
+    a = prim std_mem_d3(32, 3, 3, 3, 2, 2, 2);
+    b = prim std_mem_d3(32, 3, 3, 3, 2, 2, 2);
+    batch_matmul0 = batch_matmul;
+  }
+  wires {
+    group run_batch_matmul {
+      a.addr0 = batch_matmul0.a0_0_0_addr0;
+      batch_matmul0.a0_0_0_read_data = a.read_data;
+      a.addr1 = batch_matmul0.a0_0_0_addr1;
+      a.addr2 = batch_matmul0.a0_0_0_addr2;
+      b.addr0 = batch_matmul0.b0_0_0_addr0;
+      batch_matmul0.b0_0_0_read_data = b.read_data;
+      b.addr1 = batch_matmul0.b0_0_0_addr1;
+      b.addr2 = batch_matmul0.b0_0_0_addr2;
+      x.addr0 = batch_matmul0.x0_0_0_addr0;
+      x.addr1 = batch_matmul0.x0_0_0_addr1;
+      x.addr2 = batch_matmul0.x0_0_0_addr2;
+      x.write_data = batch_matmul0.x0_0_0_write_data;
+      x.write_en = batch_matmul0.x0_0_0_write_en;
+      batch_matmul0.x0_0_0_done = x.done;
+      batch_matmul0.go = 1'd1;
+      run_batch_matmul[done] = batch_matmul0.done ? 1'd1;
+    }
+  }
+  control {
+    seq {
+      run_batch_matmul;
+    }
+  }
+}
diff --git a/frontends/relay-futil/tests/batch_matmul.relay b/frontends/relay-futil/tests/batch_matmul.relay
new file mode 100644
index 0000000000..3c33743956
--- /dev/null
+++ b/frontends/relay-futil/tests/batch_matmul.relay
@@ -0,0 +1,6 @@
+v0.0.4
+fn (%a: Tensor[(3, 3, 3), int32], %b: Tensor[(3, 3, 3), int32]) -> Tensor[(3, 3, 3), int32] {
+  let %x: Tensor[(3, 3, 3), int32] = nn.batch_matmul(%a, %b);
+  %x
+}
+
diff --git a/frontends/relay-futil/tests/data/batch_matmul.expect b/frontends/relay-futil/tests/data/batch_matmul.expect
new file mode 100644
index 0000000000..f8d5eb52f9
--- /dev/null
+++ b/frontends/relay-futil/tests/data/batch_matmul.expect
@@ -0,0 +1,161 @@
+{
+  "a": [
+    [
+      [
+        1,
+        1,
+        1
+      ],
+      [
+        1,
+        1,
+        1
+      ],
+      [
+        1,
+        1,
+        1
+      ]
+    ],
+    [
+      [
+        1,
+        1,
+        1
+      ],
+      [
+        1,
+        1,
+        1
+      ],
+      [
+        1,
+        1,
+        1
+      ]
+    ],
+    [
+      [
+        1,
+        1,
+        1
+      ],
+      [
+        1,
+        1,
+        1
+      ],
+      [
+        1,
+        1,
+        1
+      ]
+    ]
+  ],
+  "b": [
+    [
+      [
+        1,
+        1,
+        1
+      ],
+      [
+        1,
+        1,
+        1
+      ],
+      [
+        1,
+        1,
+        1
+      ]
+    ],
+    [
+      [
+        1,
+        1,
+        1
+      ],
+      [
+        1,
+        1,
+        1
+      ],
+      [
+        1,
+        1,
+        1
+      ]
+    ],
+    [
+      [
+        1,
+        1,
+        1
+      ],
+      [
+        1,
+        1,
+        1
+      ],
+      [
+        1,
+        1,
+        1
+      ]
+    ]
+  ],
+  "x": [
+    [
+      [
+        1,
+        1,
+        1
+      ],
+      [
+        1,
+        1,
+        1
+      ],
+      [
+        1,
+        1,
+        1
+      ]
+    ],
+    [
+      [
+        1,
+        1,
+        1
+      ],
+      [
+        1,
+        1,
+        1
+      ],
+      [
+        1,
+        1,
+        1
+      ]
+    ],
+    [
+      [
+        1,
+        1,
+        1
+      ],
+      [
+        1,
+        1,
+        1
+      ],
+      [
+        1,
+        1,
+        1
+      ]
+    ]
+  ]
+}
diff --git a/frontends/relay-futil/tests/data/batch_matmul.relay b/frontends/relay-futil/tests/data/batch_matmul.relay
new file mode 100644
index 0000000000..20f860a2b7
--- /dev/null
+++ b/frontends/relay-futil/tests/data/batch_matmul.relay
@@ -0,0 +1,5 @@
+v0.0.4
+fn (%a: Tensor[(3, 3, 3), int32], %b: Tensor[(3, 3, 3), int32]) -> Tensor[(3, 3, 3), int32] {
+  let %x: Tensor[(3, 3, 3), int32] = nn.batch_matmul(%a, %b);
+  %x
+}
\ No newline at end of file
diff --git a/frontends/relay-futil/tests/data/batch_matmul.relay.data b/frontends/relay-futil/tests/data/batch_matmul.relay.data
new file mode 100644
index 0000000000..172b2d3ac2
--- /dev/null
+++ b/frontends/relay-futil/tests/data/batch_matmul.relay.data
@@ -0,0 +1,14 @@
+{
+  "a": {
+    "data": [[[1, 1, 1], [1, 1, 1], [1, 1, 1]], [[1, 1, 1], [1, 1, 1], [1, 1, 1]], [[1, 1, 1], [1, 1, 1], [1, 1, 1]]],
+    "bitwidth": 32
+  },
+  "b": {
+    "data": [[[1, 1, 1], [1, 1, 1], [1, 1, 1]], [[1, 1, 1], [1, 1, 1], [1, 1, 1]], [[1, 1, 1], [1, 1, 1], [1, 1, 1]]],
+    "bitwidth": 32
+  },
+  "x": {
+    "data": [[[0, 0, 0], [0, 0, 0], [0, 0, 0]], [[0, 0, 0], [0, 0, 0], [0, 0, 0]], [[0, 0, 0], [0, 0, 0], [0, 0, 0]]],
+    "bitwidth": 32
+    }
+}
\ No newline at end of file

From d73ace84367bd55e7e73f5589f01e92a7e172999 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Fri, 6 Nov 2020 18:30:42 -0500
Subject: [PATCH 10/75] [Relay] Add actual expect for matrix multiply.

---
 .../tests/data/batch_matmul.expect            | 54 +++++++++----------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/frontends/relay-futil/tests/data/batch_matmul.expect b/frontends/relay-futil/tests/data/batch_matmul.expect
index f8d5eb52f9..f5bbcf1366 100644
--- a/frontends/relay-futil/tests/data/batch_matmul.expect
+++ b/frontends/relay-futil/tests/data/batch_matmul.expect
@@ -108,53 +108,53 @@
   "x": [
     [
       [
-        1,
-        1,
-        1
+        3,
+        3,
+        3
       ],
       [
-        1,
-        1,
-        1
+        3,
+        3,
+        3
       ],
       [
-        1,
-        1,
-        1
+        3,
+        3,
+        3
       ]
     ],
     [
       [
-        1,
-        1,
-        1
+        3,
+        3,
+        3
       ],
       [
-        1,
-        1,
-        1
+        3,
+        3,
+        3
       ],
       [
-        1,
-        1,
-        1
+        3,
+        3,
+        3
       ]
     ],
     [
       [
-        1,
-        1,
-        1
+        3,
+        3,
+        3
       ],
       [
-        1,
-        1,
-        1
+        3,
+        3,
+        3
       ],
       [
-        1,
-        1,
-        1
+        3,
+        3,
+        3
       ]
     ]
   ]

From 57f85be9568af360cd53e3292fd73615e225cde8 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Fri, 6 Nov 2020 21:52:44 -0500
Subject: [PATCH 11/75] Add succinct example.

---
 frontends/relay-futil/dahlia_functions.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 450e914185..6ed7a6df5c 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -13,6 +13,25 @@ def lower_dahlia_program(prog, component_name):
     Takes in a string representation of a Dahlia program, lowers it to FuTIL with the given `component_name`,
     and applies the `externalize` pass. This pass exposes the inputs and outputs of primitive types that are
     declared external, e.g. `std_mem_d1_ext`, and places them in the inputs and outputs of the respective component.
+
+    Example:
+        ------ Dahlia, component name: ProcessX ------
+        decl X: ubit<32>[4];
+        ...
+
+        ------------- Lower to FuTIL -----------------
+        component ProcessX() -> () {
+          X = prim std_mem_d1_ext(32, 4, 2);
+          ...
+        }
+
+        ------------- Externalize Pass ---------------
+        component ProcessX
+        (go: 1, clk: 1, X0_read_data: 32, X0_done: 1) ->
+        (done: 1, X0_addr0: 2, X0_write_data: 32, X0_write_en: 1, X0_clk: 1) {
+           ...
+        }
+
     '''
     program_string = ""
     for line in prog.splitlines(): program_string += f'{line}\n'

From d87b5f3e4db1725b7561c6acb947116240194360 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Thu, 12 Nov 2020 10:17:37 -0500
Subject: [PATCH 12/75] [relay] Add more functions, cleanup.

---
 frontends/relay-futil/README.md               |   8 +-
 frontends/relay-futil/compiler.py             |  66 ++-
 frontends/relay-futil/dahlia_functions.py     | 156 +++++--
 frontends/relay-futil/example.py              |  61 +--
 frontends/relay-futil/futil_ast.py            |   3 +-
 frontends/relay-futil/pretty_print.py         |  20 +-
 frontends/relay-futil/runt.toml               |   2 +-
 .../relay-futil/tests/batch_matmul.expect     | 430 +++++++++++++-----
 .../relay-futil/tests/batch_matmul.relay      |   4 +-
 frontends/relay-futil/tests/bias_add.expect   | 131 ++++++
 frontends/relay-futil/tests/bias_add.relay    |   7 +
 ...ch_flatten.expect => batch_flatten.expect} |   0
 ...atch_flatten.relay => batch_flatten.relay} |   0
 ...en.relay.data => batch_flatten.relay.data} |   0
 .../tests/data/batch_matmul.expect            | 141 +++---
 .../relay-futil/tests/data/batch_matmul.relay |   4 +-
 .../tests/data/batch_matmul.relay.data        |  14 +-
 .../relay-futil/tests/data/bias_add.expect    |  36 ++
 .../relay-futil/tests/data/bias_add.relay     |   5 +
 .../tests/data/bias_add.relay.data            |  14 +
 .../relay-futil/tests/fixed_point_add.expect  |  98 ++++
 .../relay-futil/tests/fixed_point_add.relay   |   5 +
 frontends/relay-futil/tests/let3.expect       |  52 ++-
 frontends/relay-futil/tests/let3.relay        |   2 +-
 frontends/relay-futil/tests/relu.expect       | 152 +++++++
 frontends/relay-futil/tests/relu.relay        |   6 +
 frontends/relay-futil/utilities.py            |  59 ++-
 27 files changed, 1115 insertions(+), 361 deletions(-)
 create mode 100644 frontends/relay-futil/tests/bias_add.expect
 create mode 100644 frontends/relay-futil/tests/bias_add.relay
 rename frontends/relay-futil/tests/data/{tensor3d_batch_flatten.expect => batch_flatten.expect} (100%)
 rename frontends/relay-futil/tests/data/{tensor3d_batch_flatten.relay => batch_flatten.relay} (100%)
 rename frontends/relay-futil/tests/data/{tensor3d_batch_flatten.relay.data => batch_flatten.relay.data} (100%)
 create mode 100644 frontends/relay-futil/tests/data/bias_add.expect
 create mode 100644 frontends/relay-futil/tests/data/bias_add.relay
 create mode 100644 frontends/relay-futil/tests/data/bias_add.relay.data
 create mode 100644 frontends/relay-futil/tests/fixed_point_add.expect
 create mode 100644 frontends/relay-futil/tests/fixed_point_add.relay
 create mode 100644 frontends/relay-futil/tests/relu.expect
 create mode 100644 frontends/relay-futil/tests/relu.relay

diff --git a/frontends/relay-futil/README.md b/frontends/relay-futil/README.md
index 848f4abd3d..a47b0b3e0b 100644
--- a/frontends/relay-futil/README.md
+++ b/frontends/relay-futil/README.md
@@ -31,7 +31,11 @@ Installation
         cd ../topi/python && python3 setup.py bdist_wheel
         pip3 install --user dist/topi-*.whl
 
-7. Install [Dahlia][], which is used when lowering from Relay to FuTIL.
+7. To run the [MLP net][] and [VGG net][] examples, install `pytest`:
+        
+        pip3 install pytest
+
+8. Install [Dahlia][], which is used when lowering from Relay to FuTIL.
 The `fuse` executable is expected to be on your path. Alternatively, it will check to see if the environment variable `$DAHLIA_EXEC` is set. 
 
 Run an Example
@@ -60,6 +64,8 @@ The Relay text format parser requires ANTLR, so also do this:
 
 Then, just type `runt` to run the tests.
 
+[vgg net]: https://github.com/apache/incubator-tvm/blob/main/python/tvm/relay/testing/vgg.py 
+[mlp net]: https://github.com/apache/incubator-tvm/blob/main/python/tvm/relay/testing/mlp.py
 [dahlia]: https://github.com/cucapra/dahlia#set-it-up
 [tvm]: https://tvm.apache.org
 [tvm-install]: https://tvm.apache.org/docs/install/from_source.html#developers-get-source-from-github
diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index 5f922522ae..2b66ac0f7e 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -9,7 +9,7 @@
 from dahlia_functions import *
 
 # Mapping from Relay binary calls to the respective Dahlia operator.
-BuiltInBinaryCalls = {'add': '+', 'multiply': '*', 'subtract': '-'}
+BuiltInBinaryCalls = {'add': '+', 'divide': '/', 'multiply': '*', 'subtract': '-'}
 
 
 class Relay2Futil(ExprFunctor):
@@ -69,81 +69,71 @@ def get_dahlia_declaration(self, function_name, cells, args):
         if function_name in BuiltInBinaryCalls:
             op = BuiltInBinaryCalls[function_name]
             if input_type == PrimitiveType.Memory1D:
-                function = tensor1d_op
-                name = self.relay_id(f'tensor1d_{function_name}')
-            if input_type == PrimitiveType.Memory2D:
-                function = tensor2d_op
-                name = self.relay_id(f'tensor2d_{function_name}')
+                function, name = tensor1d_op, f'tensor1d_{function_name}'
+            elif input_type == PrimitiveType.Memory2D:
+                function, name = tensor2d_op, f'tensor2d_{function_name}'
+
         if function_name == "nn.batch_flatten":
-            if input_type == PrimitiveType.Memory3D:
-                function = tensor3d_batch_flatten
-                name = self.relay_id(f'{function.__name__}')
+            if input_type == PrimitiveType.Memory3D: function = tensor3d_batch_flatten
         elif function_name == "nn.batch_matmul":
             function = batch_matmul
-            name = self.relay_id(f'{function.__name__}')
+        elif function_name == "nn.bias_add":
+            if input_type == PrimitiveType.Memory2D: function = tensor2d_bias_add
+        elif function_name == "nn.relu":
+            if input_type == PrimitiveType.Memory2D: function = tensor2d_relu
 
-        assert function != None and name != None, f'{function_name} with type {input_type} is not supported.'
-        return DahliaDeclaration(component_name=name, decl_name=self.id(name), op=op, inputs=args, function=function)
+        assert function != None, f'{function_name} with type {input_type} is not supported.'
+        if name == None: name = function.__name__
+        return DahliaDeclaration(component_name=self.relay_id(name), decl_name=self.id(name), op=op, inputs=args,
+                                 function=function)
 
     def visit_var(self, var):
         name = self.relay_id(var.name_hint)
-        if self.main.contains_primitive(name): return [cell]
-
-        data, type = get_memory_parameters(var.type_annotation)
+        # Do not add duplicate primitives to main.
+        if self.main.contains_primitive(name): return cell
+        data, type, data_type = get_memory_parameters(var.type_annotation)
         dahlia_name = self.produce_dahlia_name(name, type)
-        return [FCell(dahlia_name=dahlia_name, primitive=FPrimitive(name=name, data=data, type=type))]
+        return FCell(dahlia_name=dahlia_name,
+                     primitive=FPrimitive(name=name, data=data, data_type=data_type, type=type))
 
     def visit_let(self, let):
-        variable = self.visit(let.var)
-        body = self.visit(let.body)
-        values = self.visit(let.value)
-
-        output = variable[0]
-        for value in flatten(values):
+        output, body, values = self.visit(let.var), self.visit(let.body), self.visit(let.value)
+        for value in values:
             if not value.is_dahlia_declaration(): continue
             value.dahlia_declaration.output = output
             value.dahlia_declaration.invoke()
         return [body, values]
 
     def visit_constant(self, const):
-        type = const.data.dtype
-        shape = const.data.shape
-        data = [get_bitwidth(type), int(const.data.asnumpy())]
-        name = self.id("const")
-        return [FCell(primitive=FPrimitive(name=name, data=data, type=PrimitiveType.Constant))]
+        type, shape = const.data.dtype, const.data.shape
+        name, data, data_type = self.id("const"), [get_bitwidth(type), int(const.data.asnumpy())], get_type(type)
+        return FCell(primitive=FPrimitive(name=name, data=data, data_type=data_type, type=PrimitiveType.Constant))
 
     def visit_call(self, call):
-        cells = []
-        args = []
+        cells, args = [], []
         for arg in call.args:
             argument = self.visit(arg)
             cells.append(argument)
             args.append(argument)
-        cells = flatten(cells)
-        cells.append(FCell(dahlia_declaration=self.get_dahlia_declaration(call.op.name, cells, flatten(args))))
+        cells.append(FCell(dahlia_declaration=self.get_dahlia_declaration(call.op.name, cells, args)))
         return cells
 
     def visit_function(self, function):
         body = self.visit(function.body)
-
         for cell in flatten(body):
             self.main.add_cell(cell)
             if not cell.is_dahlia_declaration(): continue
             self.dahlia_components.append(cell.dahlia_declaration.program)
-
-        build_main(self.main)  # Groups, wires, connections.
+        build_main_controls(self.main)
         return pp_component(self.main)
 
 
 def infer_type(expr: Function) -> Function:
     infer_types_pass = relay.transform.InferType()
-    fuse_op__pass = relay.transform.FuseOps()
-    to_normal_pass = relay.transform.ToANormalForm()
     mod = ir.IRModule()
     mod['main'] = expr
     mod = infer_types_pass(mod)
-    ret = mod['main']
-    return ret
+    return mod['main']
 
 
 def compile(program) -> str:
diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 6ed7a6df5c..fd0b3db770 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -33,20 +33,14 @@ def lower_dahlia_program(prog, component_name):
         }
 
     '''
-    program_string = ""
-    for line in prog.splitlines(): program_string += f'{line}\n'
-
+    program_string = '\n'.join(prog.splitlines())
     with NamedTemporaryFile() as tf0, NamedTemporaryFile() as tf1, NamedTemporaryFile() as tf2:
         tf0.write(bytes(program_string, 'UTF-8'))
-        tf0.seek(0)
-        tf1.seek(0)
-        tf2.seek(0)
+        tf0.seek(0), tf1.seek(0), tf2.seek(0)
         fuse_binary = os.environ['DAHLIA_EXEC'] if 'DAHLIA_EXEC' in os.environ else 'fuse'
-        command = \
-            f"""
-            {fuse_binary} {tf0.name} --lower -b=futil -n={component_name} > {tf1.name} {NO_ERR} \
-            && cd ../../ && cargo run -- {tf1.name} -p externalize > {tf2.name} {NO_ERR} 
-            """
+        command = f"""
+                {fuse_binary} {tf0.name} --lower -b=futil -n={component_name} > {tf1.name} {NO_ERR} \
+                 && cargo run -- {tf1.name} -l ../../ -p externalize > {tf2.name} {NO_ERR}"""
         subprocess.Popen(command, stdout=subprocess.PIPE, shell=True).communicate()
         component = tf2.read().decode()[len(IMPORT_STATEMENT):]  # Skip over importing the primitives library.
         return component
@@ -55,17 +49,19 @@ def lower_dahlia_program(prog, component_name):
 def tensor1d_op(declaration):
     op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
 
+    assert op1.data_type == op2.data_type and op2.data_type == res.data_type
     assert op1.type == PrimitiveType.Memory1D and op1.type == op2.type and op2.type == res.type
     assert op1.data[0] == op2.data[0] and op1.data[0] == res.data[0] and op1.data[1] == op2.data[1]
     assert op1.data[2] == op2.data[2] and op2.data[2] == res.data[2] and op2.data[1] == res.data[1]
     bitwidth, size, index_size = op1.data[0], op1.data[1], op1.data[2]
-    return lower_dahlia_program(f"""
-    decl {op1.name}: ubit<{bitwidth}>[{size}];
-    decl {op2.name}: ubit<{bitwidth}>[{size}];
-    decl {res.name}: ubit<{bitwidth}>[{size}];
+    program = f"""
+    decl {op1.name}: {op1.data_type}<{bitwidth}>[{size}];
+    decl {op2.name}: {op2.data_type}<{bitwidth}>[{size}];
+    decl {res.name}: {res.data_type}<{bitwidth}>[{size}];
     for (let i: ubit<{index_size}> = 0..{size}) {{
       {res.name}[i] := {op1.name}[i] {declaration.op} {op2.name}[i];
-    }}""", declaration.component_name)
+    }}"""
+    return lower_dahlia_program(program, declaration.component_name)
 
 
 def tensor2d_op(declaration):
@@ -75,16 +71,16 @@ def tensor2d_op(declaration):
     assert bitwidth == op2.data[0] and op1.data[0] == res.data[0] and op2.data[4] == res.data[4]
     assert size0 == op2.data[1] and op2.data[1] == res.data[1] and size1 == op2.data[2] and op2.data[2] == res.data[2]
     assert index_size0 == op2.data[3] and op2.data[3] == res.data[3] and index_size1 == op2.data[4]
-
-    return lower_dahlia_program(f"""
-    decl {op1.name}: ubit<{bitwidth}>[{size0}][{size1}];
-    decl {op2.name}: ubit<{bitwidth}>[{size0}][{size1}];
-    decl {res.name}: ubit<{bitwidth}>[{size0}][{size1}];
+    program = f"""
+    decl {op1.name}: {op1.data_type}<{bitwidth}>[{size0}][{size1}];
+    decl {op2.name}: {op2.data_type}<{bitwidth}>[{size0}][{size1}];
+    decl {res.name}: {res.data_type}<{bitwidth}>[{size0}][{size1}];
     for (let i: ubit<{index_size0}> = 0..{size0}) {{
       for (let j: ubit<{index_size1}> = 0..{size1}) {{
         {res.name}[i][j] := {op1.name}[i][j] {declaration.op} {op2.name}[i][j];
       }}
-    }}""", declaration.component_name)
+    }}"""
+    return lower_dahlia_program(program, declaration.component_name)
 
 
 def tensor3d_batch_flatten(declaration):
@@ -96,10 +92,11 @@ def tensor3d_batch_flatten(declaration):
     res_index_size0, res_index_size1 = res.data[3], res.data[4]
 
     assert op1.type == PrimitiveType.Memory3D and res_size1 == op1_size1 * op1_size2 and res_size0 == op1_size0
-    assert res.type == PrimitiveType.Memory2D and res_bitwidth == bitwidth
-    return lower_dahlia_program(f"""
-        decl {op1.name}: ubit<{bitwidth}>[{op1_size0}][{op1_size1}][{op1_size2}];
-        decl {res.name}: ubit<{bitwidth}>[{res_size0}][{res_size1}];
+    assert res.type == PrimitiveType.Memory2D and res_bitwidth == bitwidth and op1.data_type == res.data_type
+    assert op1.data_type == res.data_type
+    program = f"""
+        decl {op1.name}: {op1.data_type}<{bitwidth}>[{op1_size0}][{op1_size1}][{op1_size2}];
+        decl {res.name}: {res.data_type}<{bitwidth}>[{res_size0}][{res_size1}];
         let l: ubit<{res_index_size1}> = 0;
         for (let i: ubit<{op1_index_size0}> = 0..{op1_size0}) {{
           for (let j: ubit<{op1_index_size1}> = 0..{op1_size1}) {{
@@ -108,32 +105,109 @@ def tensor3d_batch_flatten(declaration):
               l := l + 1;
             }}
           }}
-        }}""", declaration.component_name)
+        }}"""
+    return lower_dahlia_program(program, declaration.component_name)
+
+
+def tensor2d_bias_add(declaration):
+    """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.bias_add"""
+    # Assumes default value axis=1 is passed in.
+    data, bias, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
+    bitwidth = data.data[0]
+    size0, size1, index_size0, index_size1 = data.data[1], data.data[2], data.data[3], data.data[4]
+    bias_size, bias_index_size = bias.data[1], bias.data[2]
+
+    assert bitwidth == res.data[0] and bitwidth == bias.data[0]
+    assert size0 == res.data[1] and size1 == res.data[2] and bias_size == size1
+    assert bias.type == PrimitiveType.Memory1D and data.type == PrimitiveType.Memory2D and data.type == res.type
+    program = f"""
+    decl {data.name}: {data.data_type}<{bitwidth}>[{size0}][{size1}];
+    decl {bias.name}: {bias.data_type}<{bitwidth}>[{bias_size}];
+    decl {res.name}: {res.data_type}<{bitwidth}>[{size0}][{size1}];
+    for (let i: ubit<{index_size0}> = 0..{size0}) {{
+      for (let j: ubit<{index_size1}> = 0..{size1}) {{
+        {res.name}[i][j] := {data.name}[i][j] + {bias.name}[j];
+      }}
+    }}
+    """
+    return lower_dahlia_program(program, declaration.component_name)
+
+
+# TODO(cgyurgyik):
+#  1. This won't work for fixed point currently, since Dahlia
+#     will not take fixed point operands for the `>` operator.
+#  2. Without signed bit array support, this is also meaningless.
+def tensor2d_relu(declaration):
+    op1, res = declaration.inputs[0].primitive, declaration.output.primitive
+    assert res.data_type == 'ubit', f'{res.data_type} is not currently supported for ReLU.'
+    bitwidth, op1_size0, op1_size1 = op1.data[0], op1.data[1], op1.data[2]
+    op1_index_size0, op1_index_size1 = op1.data[3], op1.data[4]
+    res_bitwidth, res_size0, res_size1 = res.data[0], res.data[1], res.data[2]
+    res_index_size0, res_index_size1 = res.data[3], res.data[4]
+    program = f"""
+    decl {op1.name}: {op1.data_type}<{bitwidth}>[{op1_size0}][{op1_size1}];
+    decl {res.name}: {res.data_type}<{bitwidth}>[{res_size0}][{res_size1}];
+    let zero: {op1.data_type}<{bitwidth}> = 0;
+    for (let i: ubit<{op1_index_size0}> = 0..{op1_size0}) {{
+      for (let j: ubit<{op1_index_size1}> = 0..{op1_size1}) {{
+        if ({op1.name}[i][j] > zero) {{
+          {res.name}[i][j] := {op1.name}[i][j];
+        }} else {{
+          {res.name}[i][j] := 0;
+        }}
+      }}
+    }}
+    """
+    return lower_dahlia_program(program, declaration.component_name)
 
 
 def batch_matmul(declaration):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.batch_matmul"""
-    assert False, "Unimplemented. nn.batch_matmul currently does not execute properly."
     op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
     bitwidth, M1_size0, M1_size1, M1_size2 = op1.data[0], op1.data[1], op1.data[2], op1.data[3]
     M1_index_size0, M1_index_size1, M1_index_size2 = op1.data[4], op1.data[5], op1.data[6]
     M2_size0, M2_size1, M2_size2 = op2.data[1], op2.data[2], op2.data[3]
     M2_index_size0, M2_index_size1, M2_index_size2 = op2.data[4], op2.data[5], op2.data[6]
     assert op1.type == PrimitiveType.Memory3D and op1.type == op2.type and op2.type == res.type
-    assert M2_size1 == M1_size2 and bitwidth == op2.data[0] and M1_size0 == M2_size0
-
-    return lower_dahlia_program(f"""
-    decl {op1.name}: ubit<{bitwidth}>[{M1_size0}][{M1_size1}][{M1_size2}];
-    decl {op2.name}: ubit<{bitwidth}>[{M2_size0}][{M2_size1}][{M2_size2}];
-    decl {res.name}: ubit<{bitwidth}>[{M1_size0}][{M1_size1}][{M2_size2}];
-    for (let i: ubit<{M1_index_size0}> = 0..{M1_size0}) {{
-      for (let j: ubit<{M1_index_size1}> = 0..{M1_size1}) {{
-        for (let k: ubit<{M2_index_size2}> = 0..{M2_size2}) {{
-          for (let l: ubit<{M1_index_size2}> = 0..{M1_size2}) {{
-            let prod = {op1.name}[i][j][l] * {op2.name}[i][l][k];
+    assert op1.data_type == op2.data_type and op2.data_type == res.data_type
+
+    # 1. Get transpose of second operand.
+    # 2. Conduct temporary = op1 * transpose(op2).
+    # 3. Write temporary value to return value.*
+    #    * This third step may not be necessary, but trying to conduct the matrix multiply
+    #      directly with the return value declared resulted in incorrect outputs.
+    program = f"""
+    decl {op1.name}: {op1.data_type}<{bitwidth}>[{M1_size0}][{M1_size1}][{M1_size2}];
+    decl {op2.name}: {op2.data_type}<{bitwidth}>[{M2_size0}][{M2_size1}][{M2_size2}];
+    decl {res.name}: {res.data_type}<{bitwidth}>[{M1_size0}][{M1_size1}][{M2_size1}];
+    let transpose_{op2.name}: {op2.data_type}<{bitwidth}>[{M2_size0}][{M2_size2}][{M2_size1}];
+    let temporary_{res.name}: {res.data_type}<{bitwidth}>[{M1_size0}][{M1_size1}][{M2_size1}];
+    for (let batch: ubit<{M1_index_size0}> = 0..{M1_size0}) {{
+      for (let i: ubit<{M2_index_size1}> = 0..{M2_size1}) {{
+        for (let j: ubit<{M2_index_size2}> = 0..{M2_size2}) {{
+          transpose_{op2.name}[batch][j][i] := {op2.name}[batch][i][j];
+        }}
+      }}
+    }} 
+
+    for (let batch: ubit<{M1_index_size0}> = 0..{M1_size0}) {{
+      for (let i: ubit<{M1_index_size1}> = 0..{M1_size1}) {{
+        for (let j: ubit<{M2_index_size1}> = 0..{M2_size1}) {{
+          for (let k: ubit<{M2_index_size2}> = 0..{M2_size2}) {{
+            let product = {op1.name}[batch][i][k] * transpose_{op2.name}[batch][k][j];
           }} combine {{
-            {res.name}[i][j][k] += prod;
+            temporary_{res.name}[batch][i][j] += product;
           }}
         }}
       }}
-    }}""", declaration.component_name)
+    }}
+
+    for (let batch: ubit<{M1_index_size0}> = 0..{M1_size0}) {{
+      for (let i: ubit<{M1_index_size1}> = 0..{M1_size1}) {{
+        for (let j: ubit<{M2_index_size1}> = 0..{M2_size1}) {{
+          {res.name}[batch][i][j] := temporary_{res.name}[batch][i][j];
+        }}
+      }}
+    }} 
+    """
+    return lower_dahlia_program(program, declaration.component_name)
diff --git a/frontends/relay-futil/example.py b/frontends/relay-futil/example.py
index d5f3d15011..d0f2ebd730 100644
--- a/frontends/relay-futil/example.py
+++ b/frontends/relay-futil/example.py
@@ -5,63 +5,72 @@
 
 
 def add():
-    """Add together two variables in Relay.
-    """
     x = relay.var('x', shape=(), dtype="int32")
     y = relay.var('y', shape=(), dtype="int32")
     return relay.Function([x, y], relay.add(x, y))
 
 
-def tensor_add():
-    """Add together two 2-dimensional tensors in Relay.
-    """
+def tensor_subtract():
     x = relay.var("x", relay.TensorType((2, 4), "int32"))
     y = relay.var("y", relay.TensorType((2, 4), "int32"))
-    return relay.Function([x, y], relay.add(x, y))
+    return relay.Function([x, y], relay.subtract(x, y))
 
 
 def batch_flatten():
-    """Flattens all dimensions except for the batch dimension.
-    """
     x = relay.var("x", relay.TensorType((2, 5, 5), "int32"))
     return relay.Function([x], relay.nn.batch_flatten(x))
 
 
 def batch_matmul():
-    """Add together two 2-dimensional tensors in Relay.
-    """
-    x = relay.var("x", relay.TensorType((1, 3, 2), "int32"))
-    y = relay.var("y", relay.TensorType((1, 2, 3), "int32"))
+    x = relay.var('x', shape=[1, 3, 3], dtype='float32')
+    y = relay.var('y', shape=[1, 3, 3], dtype='float32')
     return relay.Function([x, y], relay.nn.batch_matmul(x, y))
 
 
+def bias_add():
+    x = relay.var('x', shape=[2, 4], dtype='float32')
+    bias = relay.var('bias', shape=[4], dtype='float32')
+    return relay.Function([x, bias], relay.nn.bias_add(data=x, bias=bias))
+
+
+def relu():
+    x = relay.var('x', shape=[2, 4], dtype='int32')
+    return relay.Function([x], relay.nn.relu(x))
+
+
 def mlp_net():
-    """The MLP test from Relay.
-    """
+    """The MLP test from Relay."""
     from tvm.relay.testing import mlp
     return mlp.get_net(1)
 
 
-ALL_FUNCS = [add, tensor_add, batch_flatten, batch_matmul, mlp_net]
+def vgg_net():
+    """The VGG test from Relay."""
+    from tvm.relay.testing import vgg
+    return vgg.get_net(batch_size=1, image_shape=(3, 224, 224), num_classes=10, dtype='int32', num_layers=11,
+                       batch_norm=True)
+
+
+ALL_FUNCS = [add, tensor_subtract, batch_flatten, batch_matmul, bias_add, relu, mlp_net, vgg_net]
 FUNC_NAMES = list(map(lambda x: x.__name__, ALL_FUNCS))
 
 
-def simple_example():
-    if '-h' in sys.argv[1:]:
-        supported_functions = []
+def run_example():
+    input = sys.argv[1:]
+    if '-h' in input or input == []:
         print("- To see FuTIL output:\n$ python3 example.py <function_name>")
         print("- To see Relay IR:\n$ python3 example.py <function_name> -r")
-        print("\n- Supported function names:")
-        for f in FUNC_NAMES: print(f'    {f}')
+        print("\n- Supported functions:")
+        (lambda x: print(', '.join(x)))(FUNC_NAMES)
         return
     func = None
     # See if the command line contains a function name.
     for option in ALL_FUNCS:
-        if option.__name__ in sys.argv[1:]:
+        if option.__name__ in input:
             func = option()
             break
     if func == None:
-        print("For help:\n$ python3 example.py -h")
+        print(f'Function {input} is not supported. To see a list of functions:\n$ python3 example.py -h')
         return
 
     # Try optimizing the Relay IR with a few built-in passes.
@@ -71,10 +80,10 @@ def simple_example():
         relay.transform.ToANormalForm(),
     ])
 
-    mod = tvm.IRModule.from_expr(func)
-    mod_opt = seq(mod)
+    mod_opt = tvm.IRModule.from_expr(func)
+    mod_opt = seq(mod_opt)
     func = mod_opt['main']
-    if '-r' in sys.argv[1:]:
+    if '-r' in input:
         # Dump the Relay representation (for educational purposes).
         print(func)
     else:
@@ -83,4 +92,4 @@ def simple_example():
 
 
 if __name__ == '__main__':
-    simple_example()
+    run_example()
diff --git a/frontends/relay-futil/futil_ast.py b/frontends/relay-futil/futil_ast.py
index cef66aef0a..1d469c8888 100644
--- a/frontends/relay-futil/futil_ast.py
+++ b/frontends/relay-futil/futil_ast.py
@@ -26,6 +26,7 @@ class FPrimitive:
     name: str
     data: List[int]
     type: PrimitiveType
+    data_type: str
 
 
 @dataclass
@@ -153,8 +154,6 @@ class FDeclaration:
     Represents a FuTIL declaration.
     '''
     name: str
-    intermediary_inputs: List[Cell] = None
-    intermediary_output: Cell = None
     component: FComponent = None
 
 
diff --git a/frontends/relay-futil/pretty_print.py b/frontends/relay-futil/pretty_print.py
index 6b8cbe4632..53eb47662d 100644
--- a/frontends/relay-futil/pretty_print.py
+++ b/frontends/relay-futil/pretty_print.py
@@ -61,9 +61,7 @@ def pp_component(component: FComponent):
             continue
         subcomponents.append(pp_cell(cell))
     cells = mk_block("cells", '\n'.join(subcomponents))
-
     inputs, outputs = pp_component_signature(component)
-
     wires = mk_block("wires", '\n'.join(pp_connections(component)))
 
     controls = "" if component.controls == None else '\n'.join(pp_control(component))
@@ -75,24 +73,27 @@ def pp_component(component: FComponent):
 def pp_cell(cell: FCell):
     if cell.is_primitive():
         data = cell.primitive.data
-        bitwidth = str(data[0])
+        data_type = cell.primitive.data_type
+        if data_type == 'ubit' or data_type == 'bit': bitwidth = str(data[0])
+        # `fix` / `ufix` will have bitwidth form: <TotalWidth, FractWidth>. We only want TotalWidth.
+        if data_type == 'ufix' or data_type == 'fix': bitwidth = str(data[0]).split(',')[0]
         if cell.primitive.type == PrimitiveType.Register:
             return f'{cell.primitive.name} = prim std_reg({bitwidth});'
-        elif cell.primitive.type == PrimitiveType.Constant:
+        if cell.primitive.type == PrimitiveType.Constant:
             value = str(data[1])
             return f'{cell.primitive.name} = prim std_const({bitwidth}, {value});'
-        elif cell.primitive.type == PrimitiveType.Memory1D:
+        if cell.primitive.type == PrimitiveType.Memory1D:
             size = str(data[1])
             index_size = str(data[2])
             return f'{cell.primitive.name} = prim std_mem_d1({bitwidth}, {size}, {index_size});'
-        elif cell.primitive.type == PrimitiveType.Memory2D:
+        if cell.primitive.type == PrimitiveType.Memory2D:
             size0 = str(data[1])
             size1 = str(data[2])
             index_size0 = str(data[3])
             index_size1 = str(data[4])
             return f'{cell.primitive.name} = prim std_mem_d2({bitwidth}, ' \
                    f'{size0}, {size1}, {index_size0}, {index_size1});'
-        elif cell.primitive.type == PrimitiveType.Memory3D:
+        if cell.primitive.type == PrimitiveType.Memory3D:
             size0 = str(data[1])
             size1 = str(data[2])
             size2 = str(data[3])
@@ -101,11 +102,10 @@ def pp_cell(cell: FCell):
             index_size2 = str(data[6])
             return f'{cell.primitive.name} = prim std_mem_d3({bitwidth}, ' \
                    f'{size0}, {size1}, {size2}, {index_size0}, {index_size1}, {index_size2});'
-        elif cell.primitive.type == PrimitiveType.BinOp:
+        if cell.primitive.type == PrimitiveType.BinOp:
             op = data[1]
             return f'{cell.primitive.name} = prim std_{op}({bitwidth});'
-        else:
-            assert False, f'FCell pretty print unimplemented for {cell} with name {cell.primitive.name}'
+        assert False, f'FCell pretty print unimplemented for {cell} with name {cell.primitive.name}'
     elif cell.is_declaration():
         return f'{cell.declaration.name} = {cell.declaration.component.name};'
     elif cell.is_dahlia_declaration():
diff --git a/frontends/relay-futil/runt.toml b/frontends/relay-futil/runt.toml
index 00e4dc96e6..26fd2172e8 100644
--- a/frontends/relay-futil/runt.toml
+++ b/frontends/relay-futil/runt.toml
@@ -6,7 +6,7 @@ paths = ["tests/*.relay"]
 cmd = "python3 compiler.py < {}"
 
 [[tests]]
-name = "FuTIL to dat"
+name = "Relay to FuTIL to dat"
 paths = ["tests/data/*.relay"]
 cmd = """
       python3 compiler.py < {} > {}.expect && \
diff --git a/frontends/relay-futil/tests/batch_matmul.expect b/frontends/relay-futil/tests/batch_matmul.expect
index 7e2e4c3cc9..0bf73d4754 100644
--- a/frontends/relay-futil/tests/batch_matmul.expect
+++ b/frontends/relay-futil/tests/batch_matmul.expect
@@ -1,148 +1,322 @@
 import "primitives/std.lib";
 
-component batch_matmul(go: 1, clk: 1, a0_0_0_read_data: 32, a0_0_0_done: 1, b0_0_0_read_data: 32, b0_0_0_done: 1, x0_0_0_read_data: 32, x0_0_0_done: 1) -> (done: 1, a0_0_0_addr0: 2, a0_0_0_addr1: 2, a0_0_0_addr2: 2, a0_0_0_write_data: 32, a0_0_0_write_en: 1, a0_0_0_clk: 1, b0_0_0_addr0: 2, b0_0_0_addr1: 2, b0_0_0_addr2: 2, b0_0_0_write_data: 32, b0_0_0_write_en: 1, b0_0_0_clk: 1, x0_0_0_addr0: 2, x0_0_0_addr1: 2, x0_0_0_addr2: 2, x0_0_0_write_data: 32, x0_0_0_write_en: 1, x0_0_0_clk: 1) {
+component batch_matmul(go: 1, clk: 1, a0_0_0_read_data: 32, a0_0_0_done: 1, b0_0_0_read_data: 32, b0_0_0_done: 1, x0_0_0_read_data: 32, x0_0_0_done: 1) -> (done: 1, a0_0_0_addr0: 3, a0_0_0_addr1: 3, a0_0_0_addr2: 3, a0_0_0_write_data: 32, a0_0_0_write_en: 1, a0_0_0_clk: 1, b0_0_0_addr0: 3, b0_0_0_addr1: 3, b0_0_0_addr2: 3, b0_0_0_write_data: 32, b0_0_0_write_en: 1, b0_0_0_clk: 1, x0_0_0_addr0: 3, x0_0_0_addr1: 3, x0_0_0_addr2: 3, x0_0_0_write_data: 32, x0_0_0_write_en: 1, x0_0_0_clk: 1) {
   cells {
     a_read0_0 = prim std_reg(32);
-    add0 = prim std_add(32);
-    add1 = prim std_add(2);
-    add2 = prim std_add(2);
-    add3 = prim std_add(2);
-    add4 = prim std_add(2);
+    add0 = prim std_add(3);
+    add1 = prim std_add(3);
+    add10 = prim std_add(3);
+    add2 = prim std_add(3);
+    add3 = prim std_add(32);
+    add4 = prim std_add(3);
+    add5 = prim std_add(3);
+    add6 = prim std_add(3);
+    add7 = prim std_add(3);
+    add8 = prim std_add(3);
+    add9 = prim std_add(3);
     b_read0_0 = prim std_reg(32);
+    batch0 = prim std_reg(3);
+    batch1 = prim std_reg(3);
+    batch2 = prim std_reg(3);
     bin_read0_0 = prim std_reg(32);
-    const0 = prim std_const(2, 0);
-    const1 = prim std_const(2, 2);
-    const10 = prim std_const(2, 1);
-    const11 = prim std_const(2, 1);
-    const2 = prim std_const(2, 0);
-    const3 = prim std_const(2, 2);
-    const4 = prim std_const(2, 0);
-    const5 = prim std_const(2, 2);
-    const6 = prim std_const(2, 0);
-    const7 = prim std_const(2, 2);
-    const8 = prim std_const(2, 1);
-    const9 = prim std_const(2, 1);
-    i0 = prim std_reg(2);
-    j0 = prim std_reg(2);
-    k0 = prim std_reg(2);
-    l0 = prim std_reg(2);
-    le0 = prim std_le(2);
-    le1 = prim std_le(2);
-    le2 = prim std_le(2);
-    le3 = prim std_le(2);
+    const0 = prim std_const(3, 0);
+    const1 = prim std_const(3, 3);
+    const10 = prim std_const(3, 3);
+    const11 = prim std_const(3, 0);
+    const12 = prim std_const(3, 6);
+    const13 = prim std_const(3, 0);
+    const14 = prim std_const(3, 6);
+    const15 = prim std_const(3, 0);
+    const16 = prim std_const(3, 4);
+    const17 = prim std_const(3, 1);
+    const18 = prim std_const(3, 1);
+    const19 = prim std_const(3, 1);
+    const2 = prim std_const(3, 0);
+    const20 = prim std_const(3, 1);
+    const21 = prim std_const(3, 0);
+    const22 = prim std_const(3, 3);
+    const23 = prim std_const(3, 0);
+    const24 = prim std_const(3, 6);
+    const25 = prim std_const(3, 0);
+    const26 = prim std_const(3, 6);
+    const27 = prim std_const(3, 1);
+    const28 = prim std_const(3, 1);
+    const29 = prim std_const(3, 1);
+    const3 = prim std_const(3, 6);
+    const4 = prim std_const(3, 0);
+    const5 = prim std_const(3, 4);
+    const6 = prim std_const(3, 1);
+    const7 = prim std_const(3, 1);
+    const8 = prim std_const(3, 1);
+    const9 = prim std_const(3, 0);
+    i0 = prim std_reg(3);
+    i1 = prim std_reg(3);
+    i2 = prim std_reg(3);
+    j0 = prim std_reg(3);
+    j1 = prim std_reg(3);
+    j2 = prim std_reg(3);
+    k0 = prim std_reg(3);
+    le0 = prim std_le(3);
+    le1 = prim std_le(3);
+    le2 = prim std_le(3);
+    le3 = prim std_le(3);
+    le4 = prim std_le(3);
+    le5 = prim std_le(3);
+    le6 = prim std_le(3);
+    le7 = prim std_le(3);
+    le8 = prim std_le(3);
+    le9 = prim std_le(3);
     mult_pipe0 = prim std_mult_pipe(32);
-    prod_0 = prim std_reg(32);
+    product_0 = prim std_reg(32);
+    temporary_x0_0_0 = prim std_mem_d3(32, 4, 7, 7, 3, 3, 3);
+    temporary_x_read0_0 = prim std_reg(32);
+    transpose_b0_0_0 = prim std_mem_d3(32, 4, 5, 7, 3, 3, 3);
+    transpose_b_read0_0 = prim std_reg(32);
   }
   wires {
     group cond0<"static"=0> {
       cond0[done] = 1'd1;
-      le0.left = i0.out;
+      le0.left = batch0.out;
       le0.right = const1.out;
     }
     group cond1<"static"=0> {
       cond1[done] = 1'd1;
-      le1.left = j0.out;
+      le1.left = i0.out;
       le1.right = const3.out;
     }
     group cond2<"static"=0> {
       cond2[done] = 1'd1;
-      le2.left = k0.out;
+      le2.left = j0.out;
       le2.right = const5.out;
     }
     group cond3<"static"=0> {
       cond3[done] = 1'd1;
-      le3.left = l0.out;
-      le3.right = const7.out;
+      le3.left = batch1.out;
+      le3.right = const10.out;
+    }
+    group cond4<"static"=0> {
+      cond4[done] = 1'd1;
+      le4.left = i1.out;
+      le4.right = const12.out;
+    }
+    group cond5<"static"=0> {
+      cond5[done] = 1'd1;
+      le5.left = j1.out;
+      le5.right = const14.out;
+    }
+    group cond6<"static"=0> {
+      cond6[done] = 1'd1;
+      le6.left = k0.out;
+      le6.right = const16.out;
+    }
+    group cond7<"static"=0> {
+      cond7[done] = 1'd1;
+      le7.left = batch2.out;
+      le7.right = const22.out;
+    }
+    group cond8<"static"=0> {
+      cond8[done] = 1'd1;
+      le8.left = i2.out;
+      le8.right = const24.out;
+    }
+    group cond9<"static"=0> {
+      cond9[done] = 1'd1;
+      le9.left = j2.out;
+      le9.right = const26.out;
     }
     group let0<"static"=1> {
-      i0.in = const0.out;
-      i0.write_en = 1'd1;
-      let0[done] = i0.done;
+      batch0.in = const0.out;
+      batch0.write_en = 1'd1;
+      let0[done] = batch0.done;
     }
     group let1<"static"=1> {
-      j0.in = const2.out;
-      j0.write_en = 1'd1;
-      let1[done] = j0.done;
+      i0.in = const2.out;
+      i0.write_en = 1'd1;
+      let1[done] = i0.done;
+    }
+    group let10<"static"=1> {
+      i2.in = const23.out;
+      i2.write_en = 1'd1;
+      let10[done] = i2.done;
+    }
+    group let11<"static"=1> {
+      j2.in = const25.out;
+      j2.write_en = 1'd1;
+      let11[done] = j2.done;
     }
     group let2<"static"=1> {
-      k0.in = const4.out;
-      k0.write_en = 1'd1;
-      let2[done] = k0.done;
+      j0.in = const4.out;
+      j0.write_en = 1'd1;
+      let2[done] = j0.done;
     }
     group let3<"static"=1> {
-      l0.in = const6.out;
-      l0.write_en = 1'd1;
-      let3[done] = l0.done;
+      batch1.in = const9.out;
+      batch1.write_en = 1'd1;
+      let3[done] = batch1.done;
+    }
+    group let4<"static"=1> {
+      i1.in = const11.out;
+      i1.write_en = 1'd1;
+      let4[done] = i1.done;
+    }
+    group let5<"static"=1> {
+      j1.in = const13.out;
+      j1.write_en = 1'd1;
+      let5[done] = j1.done;
     }
-    group let4<"static"=4> {
+    group let6<"static"=1> {
+      k0.in = const15.out;
+      k0.write_en = 1'd1;
+      let6[done] = k0.done;
+    }
+    group let7<"static"=4> {
       bin_read0_0.in = mult_pipe0.out;
       bin_read0_0.write_en = mult_pipe0.done;
-      let4[done] = bin_read0_0.done;
+      let7[done] = bin_read0_0.done;
       mult_pipe0.left = a_read0_0.out;
-      mult_pipe0.right = b_read0_0.out;
+      mult_pipe0.right = transpose_b_read0_0.out;
       mult_pipe0.go = !mult_pipe0.done ? 1'd1;
     }
-    group let5<"static"=1> {
-      prod_0.in = bin_read0_0.out;
-      prod_0.write_en = 1'd1;
-      let5[done] = prod_0.done;
+    group let8<"static"=1> {
+      product_0.in = bin_read0_0.out;
+      product_0.write_en = 1'd1;
+      let8[done] = product_0.done;
     }
-    group upd0<"static"=1> {
-      a_read0_0.write_en = 1'd1;
-      a0_0_0_addr2 = l0.out;
-      a0_0_0_addr1 = j0.out;
-      a0_0_0_addr0 = i0.out;
-      a_read0_0.in = 1'd1 ? a0_0_0_read_data;
-      upd0[done] = a_read0_0.done ? 1'd1;
+    group let9<"static"=1> {
+      batch2.in = const21.out;
+      batch2.write_en = 1'd1;
+      let9[done] = batch2.done;
     }
-    group upd1<"static"=1> {
+    group upd0<"static"=1> {
       b_read0_0.write_en = 1'd1;
-      b0_0_0_addr2 = k0.out;
-      b0_0_0_addr1 = l0.out;
-      b0_0_0_addr0 = i0.out;
+      b0_0_0_addr2 = j0.out;
+      b0_0_0_addr1 = i0.out;
+      b0_0_0_addr0 = batch0.out;
       b_read0_0.in = 1'd1 ? b0_0_0_read_data;
-      upd1[done] = b_read0_0.done ? 1'd1;
+      upd0[done] = b_read0_0.done ? 1'd1;
     }
-    group upd2<"static"=1> {
-      x0_0_0_addr2 = k0.out;
-      x0_0_0_addr1 = j0.out;
-      x0_0_0_addr0 = i0.out;
+    group upd1<"static"=1> {
+      transpose_b0_0_0.addr2 = i0.out;
+      transpose_b0_0_0.addr1 = j0.out;
+      transpose_b0_0_0.addr0 = batch0.out;
+      transpose_b0_0_0.write_en = 1'd1;
+      transpose_b0_0_0.write_data = 1'd1 ? b_read0_0.out;
+      upd1[done] = transpose_b0_0_0.done ? 1'd1;
+    }
+    group upd10<"static"=1> {
+      i1.write_en = 1'd1;
+      add6.left = i1.out;
+      add6.right = const19.out;
+      i1.in = 1'd1 ? add6.out;
+      upd10[done] = i1.done ? 1'd1;
+    }
+    group upd11<"static"=1> {
+      batch1.write_en = 1'd1;
+      add7.left = batch1.out;
+      add7.right = const20.out;
+      batch1.in = 1'd1 ? add7.out;
+      upd11[done] = batch1.done ? 1'd1;
+    }
+    group upd12<"static"=1> {
+      temporary_x_read0_0.write_en = 1'd1;
+      temporary_x0_0_0.addr2 = j2.out;
+      temporary_x0_0_0.addr1 = i2.out;
+      temporary_x0_0_0.addr0 = batch2.out;
+      temporary_x_read0_0.in = 1'd1 ? temporary_x0_0_0.read_data;
+      upd12[done] = temporary_x_read0_0.done ? 1'd1;
+    }
+    group upd13<"static"=1> {
+      x0_0_0_addr2 = j2.out;
+      x0_0_0_addr1 = i2.out;
+      x0_0_0_addr0 = batch2.out;
       x0_0_0_write_en = 1'd1;
-      add0.left = x0_0_0_read_data;
-      add0.right = prod_0.out;
-      x0_0_0_addr2 = k0.out;
-      x0_0_0_addr1 = j0.out;
-      x0_0_0_addr0 = i0.out;
-      x0_0_0_write_data = 1'd1 ? add0.out;
-      upd2[done] = x0_0_0_done ? 1'd1;
+      x0_0_0_write_data = 1'd1 ? temporary_x_read0_0.out;
+      upd13[done] = x0_0_0_done ? 1'd1;
+    }
+    group upd14<"static"=1> {
+      j2.write_en = 1'd1;
+      add8.left = j2.out;
+      add8.right = const27.out;
+      j2.in = 1'd1 ? add8.out;
+      upd14[done] = j2.done ? 1'd1;
+    }
+    group upd15<"static"=1> {
+      i2.write_en = 1'd1;
+      add9.left = i2.out;
+      add9.right = const28.out;
+      i2.in = 1'd1 ? add9.out;
+      upd15[done] = i2.done ? 1'd1;
+    }
+    group upd16<"static"=1> {
+      batch2.write_en = 1'd1;
+      add10.left = batch2.out;
+      add10.right = const29.out;
+      batch2.in = 1'd1 ? add10.out;
+      upd16[done] = batch2.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      j0.write_en = 1'd1;
+      add0.left = j0.out;
+      add0.right = const6.out;
+      j0.in = 1'd1 ? add0.out;
+      upd2[done] = j0.done ? 1'd1;
     }
     group upd3<"static"=1> {
-      l0.write_en = 1'd1;
-      add1.left = l0.out;
-      add1.right = const8.out;
-      l0.in = 1'd1 ? add1.out;
-      upd3[done] = l0.done ? 1'd1;
+      i0.write_en = 1'd1;
+      add1.left = i0.out;
+      add1.right = const7.out;
+      i0.in = 1'd1 ? add1.out;
+      upd3[done] = i0.done ? 1'd1;
     }
     group upd4<"static"=1> {
-      k0.write_en = 1'd1;
-      add2.left = k0.out;
-      add2.right = const9.out;
-      k0.in = 1'd1 ? add2.out;
-      upd4[done] = k0.done ? 1'd1;
+      batch0.write_en = 1'd1;
+      add2.left = batch0.out;
+      add2.right = const8.out;
+      batch0.in = 1'd1 ? add2.out;
+      upd4[done] = batch0.done ? 1'd1;
     }
     group upd5<"static"=1> {
-      j0.write_en = 1'd1;
-      add3.left = j0.out;
-      add3.right = const10.out;
-      j0.in = 1'd1 ? add3.out;
-      upd5[done] = j0.done ? 1'd1;
+      a_read0_0.write_en = 1'd1;
+      a0_0_0_addr2 = k0.out;
+      a0_0_0_addr1 = i1.out;
+      a0_0_0_addr0 = batch1.out;
+      a_read0_0.in = 1'd1 ? a0_0_0_read_data;
+      upd5[done] = a_read0_0.done ? 1'd1;
     }
     group upd6<"static"=1> {
-      i0.write_en = 1'd1;
-      add4.left = i0.out;
-      add4.right = const11.out;
-      i0.in = 1'd1 ? add4.out;
-      upd6[done] = i0.done ? 1'd1;
+      transpose_b_read0_0.write_en = 1'd1;
+      transpose_b0_0_0.addr2 = j1.out;
+      transpose_b0_0_0.addr1 = k0.out;
+      transpose_b0_0_0.addr0 = batch1.out;
+      transpose_b_read0_0.in = 1'd1 ? transpose_b0_0_0.read_data;
+      upd6[done] = transpose_b_read0_0.done ? 1'd1;
+    }
+    group upd7<"static"=1> {
+      temporary_x0_0_0.addr2 = j1.out;
+      temporary_x0_0_0.addr1 = i1.out;
+      temporary_x0_0_0.addr0 = batch1.out;
+      temporary_x0_0_0.write_en = 1'd1;
+      add3.left = temporary_x0_0_0.read_data;
+      add3.right = product_0.out;
+      temporary_x0_0_0.addr2 = j1.out;
+      temporary_x0_0_0.addr1 = i1.out;
+      temporary_x0_0_0.addr0 = batch1.out;
+      temporary_x0_0_0.write_data = 1'd1 ? add3.out;
+      upd7[done] = temporary_x0_0_0.done ? 1'd1;
+    }
+    group upd8<"static"=1> {
+      k0.write_en = 1'd1;
+      add4.left = k0.out;
+      add4.right = const17.out;
+      k0.in = 1'd1 ? add4.out;
+      upd8[done] = k0.done ? 1'd1;
+    }
+    group upd9<"static"=1> {
+      j1.write_en = 1'd1;
+      add5.left = j1.out;
+      add5.right = const18.out;
+      j1.in = 1'd1 ? add5.out;
+      upd9[done] = j1.done ? 1'd1;
     }
   }
 
@@ -157,26 +331,66 @@ component batch_matmul(go: 1, clk: 1, a0_0_0_read_data: 32, a0_0_0_done: 1, b0_0
               let2;
               while le2.out with cond2 {
                 seq {
-                  let3;
-                  while le3.out with cond3 {
+                  upd0;
+                  upd1;
+                  upd2;
+                }
+              }
+              upd3;
+            }
+          }
+          upd4;
+        }
+      }
+      let3;
+      while le3.out with cond3 {
+        seq {
+          let4;
+          while le4.out with cond4 {
+            seq {
+              let5;
+              while le5.out with cond5 {
+                seq {
+                  let6;
+                  while le6.out with cond6 {
                     seq {
                       par {
-                        upd0;
-                        upd1;
+                        upd5;
+                        upd6;
                       }
-                      let4;
-                      let5;
-                      upd2;
-                      upd3;
+                      let7;
+                      let8;
+                      upd7;
+                      upd8;
                     }
                   }
-                  upd4;
+                  upd9;
+                }
+              }
+              upd10;
+            }
+          }
+          upd11;
+        }
+      }
+      let9;
+      while le7.out with cond7 {
+        seq {
+          let10;
+          while le8.out with cond8 {
+            seq {
+              let11;
+              while le9.out with cond9 {
+                seq {
+                  upd12;
+                  upd13;
+                  upd14;
                 }
               }
-              upd5;
+              upd15;
             }
           }
-          upd6;
+          upd16;
         }
       }
     }
@@ -185,9 +399,9 @@ component batch_matmul(go: 1, clk: 1, a0_0_0_read_data: 32, a0_0_0_done: 1, b0_0
 
 component main () -> () {
   cells {
-    x = prim std_mem_d3(32, 3, 3, 3, 2, 2, 2);
-    a = prim std_mem_d3(32, 3, 3, 3, 2, 2, 2);
-    b = prim std_mem_d3(32, 3, 3, 3, 2, 2, 2);
+    x = prim std_mem_d3(32, 4, 7, 7, 3, 3, 3);
+    a = prim std_mem_d3(32, 4, 7, 5, 3, 3, 3);
+    b = prim std_mem_d3(32, 4, 7, 5, 3, 3, 3);
     batch_matmul0 = batch_matmul;
   }
   wires {
diff --git a/frontends/relay-futil/tests/batch_matmul.relay b/frontends/relay-futil/tests/batch_matmul.relay
index 3c33743956..cdb972e9b0 100644
--- a/frontends/relay-futil/tests/batch_matmul.relay
+++ b/frontends/relay-futil/tests/batch_matmul.relay
@@ -1,6 +1,6 @@
 v0.0.4
-fn (%a: Tensor[(3, 3, 3), int32], %b: Tensor[(3, 3, 3), int32]) -> Tensor[(3, 3, 3), int32] {
-  let %x: Tensor[(3, 3, 3), int32] = nn.batch_matmul(%a, %b);
+fn (%a: Tensor[(4, 7, 5), int32], %b: Tensor[(4, 7, 5), int32]) {
+  let %x: Tensor[(4, 7, 7), int32] = nn.batch_matmul(%a, %b);
   %x
 }
 
diff --git a/frontends/relay-futil/tests/bias_add.expect b/frontends/relay-futil/tests/bias_add.expect
new file mode 100644
index 0000000000..ee66f8942d
--- /dev/null
+++ b/frontends/relay-futil/tests/bias_add.expect
@@ -0,0 +1,131 @@
+import "primitives/std.lib";
+
+component tensor2d_bias_add(go: 1, clk: 1, bias0_read_data: 32, bias0_done: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1) -> (done: 1, bias0_addr0: 7, bias0_write_data: 32, bias0_write_en: 1, bias0_clk: 1, x0_0_addr0: 1, x0_0_addr1: 7, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 7, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1) {
+  cells {
+    add0 = prim fixed_p_std_add(32, 16, 16);
+    add1 = prim std_add(7);
+    add2 = prim std_add(1);
+    bias_read0_0 = prim std_reg(32);
+    const0 = prim std_const(1, 0);
+    const1 = prim std_const(1, 0);
+    const2 = prim std_const(7, 0);
+    const3 = prim std_const(7, 63);
+    const4 = prim std_const(7, 1);
+    const5 = prim std_const(1, 1);
+    i0 = prim std_reg(1);
+    j0 = prim std_reg(7);
+    le0 = prim std_le(1);
+    le1 = prim std_le(7);
+    x_read0_0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = j0.out;
+      le1.right = const3.out;
+    }
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
+    }
+    group let1<"static"=1> {
+      j0.in = const2.out;
+      j0.write_en = 1'd1;
+      let1[done] = j0.done;
+    }
+    group upd0<"static"=1> {
+      x_read0_0.write_en = 1'd1;
+      x0_0_addr1 = j0.out;
+      x0_0_addr0 = i0.out;
+      x_read0_0.in = 1'd1 ? x0_0_read_data;
+      upd0[done] = x_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      bias_read0_0.write_en = 1'd1;
+      bias0_addr0 = j0.out;
+      bias_read0_0.in = 1'd1 ? bias0_read_data;
+      upd1[done] = bias_read0_0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      x10_0_addr1 = j0.out;
+      x10_0_addr0 = i0.out;
+      x10_0_write_en = 1'd1;
+      add0.left = x_read0_0.out;
+      add0.right = bias_read0_0.out;
+      x10_0_write_data = 1'd1 ? add0.out;
+      upd2[done] = x10_0_done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      j0.write_en = 1'd1;
+      add1.left = j0.out;
+      add1.right = const4.out;
+      j0.in = 1'd1 ? add1.out;
+      upd3[done] = j0.done ? 1'd1;
+    }
+    group upd4<"static"=1> {
+      i0.write_en = 1'd1;
+      add2.left = i0.out;
+      add2.right = const5.out;
+      i0.in = 1'd1 ? add2.out;
+      upd4[done] = i0.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      while le0.out with cond0 {
+        seq {
+          let1;
+          while le1.out with cond1 {
+            seq {
+              par {
+                upd0;
+                upd1;
+              }
+              upd2;
+              upd3;
+            }
+          }
+          upd4;
+        }
+      }
+    }
+  }
+}
+
+component main () -> () {
+  cells {
+    x1 = prim std_mem_d2(32, 1, 64, 1, 7);
+    x = prim std_mem_d2(32, 1, 64, 1, 7);
+    bias = prim std_mem_d1(32, 64, 7);
+    tensor2d_bias_add0 = tensor2d_bias_add;
+  }
+  wires {
+    group run_tensor2d_bias_add {
+      x.addr0 = tensor2d_bias_add0.x0_0_addr0;
+      tensor2d_bias_add0.x0_0_read_data = x.read_data;
+      x.addr1 = tensor2d_bias_add0.x0_0_addr1;
+      bias.addr0 = tensor2d_bias_add0.bias0_addr0;
+      tensor2d_bias_add0.bias0_read_data = bias.read_data;
+      x1.addr0 = tensor2d_bias_add0.x10_0_addr0;
+      x1.addr1 = tensor2d_bias_add0.x10_0_addr1;
+      x1.write_data = tensor2d_bias_add0.x10_0_write_data;
+      x1.write_en = tensor2d_bias_add0.x10_0_write_en;
+      tensor2d_bias_add0.x10_0_done = x1.done;
+      tensor2d_bias_add0.go = 1'd1;
+      run_tensor2d_bias_add[done] = tensor2d_bias_add0.done ? 1'd1;
+    }
+  }
+  control {
+    seq {
+      run_tensor2d_bias_add;
+    }
+  }
+}
diff --git a/frontends/relay-futil/tests/bias_add.relay b/frontends/relay-futil/tests/bias_add.relay
new file mode 100644
index 0000000000..6b90020ed3
--- /dev/null
+++ b/frontends/relay-futil/tests/bias_add.relay
@@ -0,0 +1,7 @@
+v0.0.4
+fn (%x: Tensor[(1, 64), float32], %bias: Tensor[(64), float32])  {
+  let %x1: Tensor[(1, 64), float32] = nn.bias_add(%x, %bias);
+  %x1
+}
+
+
diff --git a/frontends/relay-futil/tests/data/tensor3d_batch_flatten.expect b/frontends/relay-futil/tests/data/batch_flatten.expect
similarity index 100%
rename from frontends/relay-futil/tests/data/tensor3d_batch_flatten.expect
rename to frontends/relay-futil/tests/data/batch_flatten.expect
diff --git a/frontends/relay-futil/tests/data/tensor3d_batch_flatten.relay b/frontends/relay-futil/tests/data/batch_flatten.relay
similarity index 100%
rename from frontends/relay-futil/tests/data/tensor3d_batch_flatten.relay
rename to frontends/relay-futil/tests/data/batch_flatten.relay
diff --git a/frontends/relay-futil/tests/data/tensor3d_batch_flatten.relay.data b/frontends/relay-futil/tests/data/batch_flatten.relay.data
similarity index 100%
rename from frontends/relay-futil/tests/data/tensor3d_batch_flatten.relay.data
rename to frontends/relay-futil/tests/data/batch_flatten.relay.data
diff --git a/frontends/relay-futil/tests/data/batch_matmul.expect b/frontends/relay-futil/tests/data/batch_matmul.expect
index f5bbcf1366..cfc67febca 100644
--- a/frontends/relay-futil/tests/data/batch_matmul.expect
+++ b/frontends/relay-futil/tests/data/batch_matmul.expect
@@ -3,52 +3,25 @@
     [
       [
         1,
-        1,
-        1
-      ],
-      [
-        1,
-        1,
-        1
-      ],
-      [
-        1,
-        1,
-        1
-      ]
-    ],
-    [
-      [
-        1,
-        1,
-        1
-      ],
-      [
-        1,
-        1,
-        1
+        2,
+        3
       ],
       [
         1,
-        1,
-        1
+        2,
+        3
       ]
     ],
     [
       [
         1,
-        1,
-        1
-      ],
-      [
-        1,
-        1,
-        1
+        2,
+        3
       ],
       [
         1,
-        1,
-        1
+        2,
+        3
       ]
     ]
   ],
@@ -56,105 +29,99 @@
     [
       [
         1,
-        1,
-        1
-      ],
-      [
-        1,
-        1,
-        1
+        2,
+        3
       ],
       [
         1,
-        1,
-        1
+        2,
+        3
       ]
     ],
     [
       [
         1,
-        1,
-        1
+        2,
+        3
       ],
       [
         1,
-        1,
-        1
+        2,
+        3
+      ]
+    ]
+  ],
+  "temporary_x0_0_0": [
+    [
+      [
+        14,
+        14
       ],
       [
-        1,
-        1,
-        1
+        14,
+        14
       ]
     ],
     [
       [
-        1,
-        1,
-        1
-      ],
-      [
-        1,
-        1,
-        1
+        14,
+        14
       ],
       [
-        1,
-        1,
-        1
+        14,
+        14
       ]
     ]
   ],
-  "x": [
+  "transpose_b0_0_0": [
     [
       [
-        3,
-        3,
-        3
+        1,
+        1
       ],
       [
-        3,
-        3,
-        3
+        2,
+        2
       ],
       [
-        3,
         3,
         3
       ]
     ],
     [
       [
-        3,
-        3,
-        3
+        1,
+        1
       ],
       [
-        3,
-        3,
-        3
+        2,
+        2
       ],
       [
-        3,
         3,
         3
       ]
-    ],
+    ]
+  ],
+  "x": [
     [
       [
-        3,
-        3,
-        3
+        14,
+        14
       ],
       [
-        3,
-        3,
-        3
+        14,
+        14
+      ]
+    ],
+    [
+      [
+        14,
+        14
       ],
       [
-        3,
-        3,
-        3
+        14,
+        14
       ]
     ]
   ]
diff --git a/frontends/relay-futil/tests/data/batch_matmul.relay b/frontends/relay-futil/tests/data/batch_matmul.relay
index 20f860a2b7..0ab8e09c9f 100644
--- a/frontends/relay-futil/tests/data/batch_matmul.relay
+++ b/frontends/relay-futil/tests/data/batch_matmul.relay
@@ -1,5 +1,5 @@
 v0.0.4
-fn (%a: Tensor[(3, 3, 3), int32], %b: Tensor[(3, 3, 3), int32]) -> Tensor[(3, 3, 3), int32] {
-  let %x: Tensor[(3, 3, 3), int32] = nn.batch_matmul(%a, %b);
+fn (%a: Tensor[(2, 2, 3), int32], %b: Tensor[(2, 2, 3), int32]) {
+  let %x = nn.batch_matmul(%a, %b);
   %x
 }
\ No newline at end of file
diff --git a/frontends/relay-futil/tests/data/batch_matmul.relay.data b/frontends/relay-futil/tests/data/batch_matmul.relay.data
index 172b2d3ac2..6e7ef55c45 100644
--- a/frontends/relay-futil/tests/data/batch_matmul.relay.data
+++ b/frontends/relay-futil/tests/data/batch_matmul.relay.data
@@ -1,14 +1,22 @@
 {
   "a": {
-    "data": [[[1, 1, 1], [1, 1, 1], [1, 1, 1]], [[1, 1, 1], [1, 1, 1], [1, 1, 1]], [[1, 1, 1], [1, 1, 1], [1, 1, 1]]],
+    "data": [[[1,2,3], [1,2,3]], [[1,2,3], [1,2,3]]],
     "bitwidth": 32
   },
   "b": {
-    "data": [[[1, 1, 1], [1, 1, 1], [1, 1, 1]], [[1, 1, 1], [1, 1, 1], [1, 1, 1]], [[1, 1, 1], [1, 1, 1], [1, 1, 1]]],
+    "data": [[[1,2,3], [1,2,3]], [[1,2,3], [1,2,3]]],
     "bitwidth": 32
   },
   "x": {
-    "data": [[[0, 0, 0], [0, 0, 0], [0, 0, 0]], [[0, 0, 0], [0, 0, 0], [0, 0, 0]], [[0, 0, 0], [0, 0, 0], [0, 0, 0]]],
+   "data": [[[0,0], [0,0]], [[0,0], [0,0]]],
+   "bitwidth": 32
+  },
+    "temporary_x0_0_0": {
+     "data": [[[0,0], [0,0]], [[0,0], [0,0]]],
+     "bitwidth": 32
+    },
+  "transpose_b0_0_0": {
+    "data": [[[0,0], [0,0], [0,0]], [[0,0], [0,0], [0,0]]],
     "bitwidth": 32
     }
 }
\ No newline at end of file
diff --git a/frontends/relay-futil/tests/data/bias_add.expect b/frontends/relay-futil/tests/data/bias_add.expect
new file mode 100644
index 0000000000..4f6da23238
--- /dev/null
+++ b/frontends/relay-futil/tests/data/bias_add.expect
@@ -0,0 +1,36 @@
+{
+  "bias": [
+    1,
+    1,
+    1,
+    1
+  ],
+  "x": [
+    [
+      41,
+      41,
+      41,
+      41
+    ],
+    [
+      41,
+      41,
+      41,
+      41
+    ]
+  ],
+  "x1": [
+    [
+      42,
+      42,
+      42,
+      42
+    ],
+    [
+      42,
+      42,
+      42,
+      42
+    ]
+  ]
+}
diff --git a/frontends/relay-futil/tests/data/bias_add.relay b/frontends/relay-futil/tests/data/bias_add.relay
new file mode 100644
index 0000000000..4a1c58a64a
--- /dev/null
+++ b/frontends/relay-futil/tests/data/bias_add.relay
@@ -0,0 +1,5 @@
+v0.0.4
+fn (%x: Tensor[(2, 4), float32], %bias: Tensor[(4), float32])  {
+  let %x1: Tensor[(2, 4), float32] = nn.bias_add(%x, %bias);
+  %x1
+}
\ No newline at end of file
diff --git a/frontends/relay-futil/tests/data/bias_add.relay.data b/frontends/relay-futil/tests/data/bias_add.relay.data
new file mode 100644
index 0000000000..f1ef184ad6
--- /dev/null
+++ b/frontends/relay-futil/tests/data/bias_add.relay.data
@@ -0,0 +1,14 @@
+{
+  "x": {
+    "data": [[41,41,41,41], [41,41,41,41]],
+    "bitwidth": 32
+  },
+  "bias": {
+    "data": [1,1,1,1],
+    "bitwidth": 32
+  },
+  "x1": {
+    "data": [[0,0,0,0], [0,0,0,0]],
+    "bitwidth": 32
+    }
+}
\ No newline at end of file
diff --git a/frontends/relay-futil/tests/fixed_point_add.expect b/frontends/relay-futil/tests/fixed_point_add.expect
new file mode 100644
index 0000000000..3a08f950e8
--- /dev/null
+++ b/frontends/relay-futil/tests/fixed_point_add.expect
@@ -0,0 +1,98 @@
+import "primitives/std.lib";
+
+component tensor1d_add(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, y0_read_data: 32, y0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, x0_addr0: 1, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, y0_addr0: 1, y0_write_data: 32, y0_write_en: 1, y0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
+  cells {
+    add0 = prim fixed_p_std_add(32, 16, 16);
+    add1 = prim std_add(1);
+    const0 = prim std_const(1, 0);
+    const1 = prim std_const(1, 0);
+    const2 = prim std_const(1, 1);
+    i0 = prim std_reg(1);
+    le0 = prim std_le(1);
+    x_read0_0 = prim std_reg(32);
+    y_read0_0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
+    }
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
+    }
+    group upd0<"static"=1> {
+      x_read0_0.write_en = 1'd1;
+      x0_addr0 = i0.out;
+      x_read0_0.in = 1'd1 ? x0_read_data;
+      upd0[done] = x_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      y_read0_0.write_en = 1'd1;
+      y0_addr0 = i0.out;
+      y_read0_0.in = 1'd1 ? y0_read_data;
+      upd1[done] = y_read0_0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      z0_addr0 = i0.out;
+      z0_write_en = 1'd1;
+      add0.left = x_read0_0.out;
+      add0.right = y_read0_0.out;
+      z0_write_data = 1'd1 ? add0.out;
+      upd2[done] = z0_done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      i0.write_en = 1'd1;
+      add1.left = i0.out;
+      add1.right = const2.out;
+      i0.in = 1'd1 ? add1.out;
+      upd3[done] = i0.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      while le0.out with cond0 {
+        seq {
+          par {
+            upd0;
+            upd1;
+          }
+          upd2;
+          upd3;
+        }
+      }
+    }
+  }
+}
+
+component main () -> () {
+  cells {
+    z = prim std_mem_d1(32, 1, 1);
+    x = prim std_mem_d1(32, 1, 1);
+    y = prim std_mem_d1(32, 1, 1);
+    tensor1d_add0 = tensor1d_add;
+  }
+  wires {
+    group run_tensor1d_add {
+      x.addr0 = tensor1d_add0.x0_addr0;
+      tensor1d_add0.x0_read_data = x.read_data;
+      y.addr0 = tensor1d_add0.y0_addr0;
+      tensor1d_add0.y0_read_data = y.read_data;
+      z.addr0 = tensor1d_add0.z0_addr0;
+      z.write_data = tensor1d_add0.z0_write_data;
+      z.write_en = tensor1d_add0.z0_write_en;
+      tensor1d_add0.z0_done = z.done;
+      tensor1d_add0.go = 1'd1;
+      run_tensor1d_add[done] = tensor1d_add0.done ? 1'd1;
+    }
+  }
+  control {
+    seq {
+      run_tensor1d_add;
+    }
+  }
+}
diff --git a/frontends/relay-futil/tests/fixed_point_add.relay b/frontends/relay-futil/tests/fixed_point_add.relay
new file mode 100644
index 0000000000..4de39a28fb
--- /dev/null
+++ b/frontends/relay-futil/tests/fixed_point_add.relay
@@ -0,0 +1,5 @@
+v0.0.4
+fn (%x: float32, %y: float32) {
+  let %z = add(%x, %y);
+  %z
+}
diff --git a/frontends/relay-futil/tests/let3.expect b/frontends/relay-futil/tests/let3.expect
index 8885ca1785..3ef8c350f2 100644
--- a/frontends/relay-futil/tests/let3.expect
+++ b/frontends/relay-futil/tests/let3.expect
@@ -76,15 +76,16 @@ component tensor1d_multiply(go: 1, clk: 1, c0_read_data: 32, c0_done: 1, d0_read
     }
   }
 }
-component tensor1d_add(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32, d0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1, d0_addr0: 1, d0_write_data: 32, d0_write_en: 1, d0_clk: 1) {
+component tensor1d_divide(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32, d0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1, d0_addr0: 1, d0_write_data: 32, d0_write_en: 1, d0_clk: 1) {
   cells {
     a_read0_0 = prim std_reg(32);
-    add0 = prim std_add(32);
-    add1 = prim std_add(1);
+    add0 = prim std_add(1);
+    bin_read0_0 = prim std_reg(32);
     c_read0_0 = prim std_reg(32);
     const0 = prim std_const(1, 0);
     const1 = prim std_const(1, 0);
     const2 = prim std_const(1, 1);
+    div_pipe0 = prim std_div_pipe(32);
     i0 = prim std_reg(1);
     le0 = prim std_le(1);
   }
@@ -99,6 +100,14 @@ component tensor1d_add(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data
       i0.write_en = 1'd1;
       let0[done] = i0.done;
     }
+    group let1<> {
+      bin_read0_0.in = div_pipe0.out;
+      bin_read0_0.write_en = div_pipe0.done;
+      let1[done] = bin_read0_0.done;
+      div_pipe0.left = c_read0_0.out;
+      div_pipe0.right = a_read0_0.out;
+      div_pipe0.go = !div_pipe0.done ? 1'd1;
+    }
     group upd0<"static"=1> {
       c_read0_0.write_en = 1'd1;
       c0_addr0 = i0.out;
@@ -114,16 +123,14 @@ component tensor1d_add(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data
     group upd2<"static"=1> {
       d0_addr0 = i0.out;
       d0_write_en = 1'd1;
-      add0.left = c_read0_0.out;
-      add0.right = a_read0_0.out;
-      d0_write_data = 1'd1 ? add0.out;
+      d0_write_data = 1'd1 ? bin_read0_0.out;
       upd2[done] = d0_done ? 1'd1;
     }
     group upd3<"static"=1> {
       i0.write_en = 1'd1;
-      add1.left = i0.out;
-      add1.right = const2.out;
-      i0.in = 1'd1 ? add1.out;
+      add0.left = i0.out;
+      add0.right = const2.out;
+      i0.in = 1'd1 ? add0.out;
       upd3[done] = i0.done ? 1'd1;
     }
   }
@@ -137,6 +144,7 @@ component tensor1d_add(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data
             upd0;
             upd1;
           }
+          let1;
           upd2;
           upd3;
         }
@@ -220,7 +228,7 @@ component main () -> () {
     d = prim std_mem_d1(32, 1, 1);
     tensor1d_multiply0 = tensor1d_multiply;
     a = prim std_mem_d1(32, 1, 1);
-    tensor1d_add0 = tensor1d_add;
+    tensor1d_divide0 = tensor1d_divide;
     b = prim std_mem_d1(32, 1, 1);
     tensor1d_subtract0 = tensor1d_subtract;
   }
@@ -237,17 +245,17 @@ component main () -> () {
       tensor1d_subtract0.go = 1'd1;
       run_tensor1d_subtract[done] = tensor1d_subtract0.done ? 1'd1;
     }
-    group run_tensor1d_add {
-      c.addr0 = tensor1d_add0.c0_addr0;
-      tensor1d_add0.c0_read_data = c.read_data;
-      a.addr0 = tensor1d_add0.a0_addr0;
-      tensor1d_add0.a0_read_data = a.read_data;
-      d.addr0 = tensor1d_add0.d0_addr0;
-      d.write_data = tensor1d_add0.d0_write_data;
-      d.write_en = tensor1d_add0.d0_write_en;
-      tensor1d_add0.d0_done = d.done;
-      tensor1d_add0.go = 1'd1;
-      run_tensor1d_add[done] = tensor1d_add0.done ? 1'd1;
+    group run_tensor1d_divide {
+      c.addr0 = tensor1d_divide0.c0_addr0;
+      tensor1d_divide0.c0_read_data = c.read_data;
+      a.addr0 = tensor1d_divide0.a0_addr0;
+      tensor1d_divide0.a0_read_data = a.read_data;
+      d.addr0 = tensor1d_divide0.d0_addr0;
+      d.write_data = tensor1d_divide0.d0_write_data;
+      d.write_en = tensor1d_divide0.d0_write_en;
+      tensor1d_divide0.d0_done = d.done;
+      tensor1d_divide0.go = 1'd1;
+      run_tensor1d_divide[done] = tensor1d_divide0.done ? 1'd1;
     }
     group run_tensor1d_multiply {
       c.addr0 = tensor1d_multiply0.c0_addr0;
@@ -265,7 +273,7 @@ component main () -> () {
   control {
     seq {
       run_tensor1d_subtract;
-      run_tensor1d_add;
+      run_tensor1d_divide;
       run_tensor1d_multiply;
     }
   }
diff --git a/frontends/relay-futil/tests/let3.relay b/frontends/relay-futil/tests/let3.relay
index 29d96dd3e9..50aa9a8064 100644
--- a/frontends/relay-futil/tests/let3.relay
+++ b/frontends/relay-futil/tests/let3.relay
@@ -1,7 +1,7 @@
 v0.0.4
 fn (%a: int32, %b: int32) {
   let %c = subtract(%a, %b);
-  let %d = add(%c, %a);
+  let %d = divide(%c, %a);
   let %e = multiply(%c, %d);
   %e
 }
diff --git a/frontends/relay-futil/tests/relu.expect b/frontends/relay-futil/tests/relu.expect
new file mode 100644
index 0000000000..098c7bcc1d
--- /dev/null
+++ b/frontends/relay-futil/tests/relu.expect
@@ -0,0 +1,152 @@
+import "primitives/std.lib";
+
+component tensor2d_relu(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1) -> (done: 1, x0_0_addr0: 2, x0_0_addr1: 3, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 2, x10_0_addr1: 3, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1) {
+  cells {
+    add0 = prim std_add(3);
+    add1 = prim std_add(2);
+    const0 = prim std_const(32, 0);
+    const1 = prim std_const(2, 0);
+    const2 = prim std_const(2, 1);
+    const3 = prim std_const(3, 0);
+    const4 = prim std_const(3, 3);
+    const5 = prim std_const(32, 0);
+    const6 = prim std_const(3, 1);
+    const7 = prim std_const(2, 1);
+    gt0 = prim std_gt(32);
+    i0 = prim std_reg(2);
+    j0 = prim std_reg(3);
+    le0 = prim std_le(2);
+    le1 = prim std_le(3);
+    x_read0_0 = prim std_reg(32);
+    x_read1_0 = prim std_reg(32);
+    zero_0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const2.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = j0.out;
+      le1.right = const4.out;
+    }
+    group cond2<"static"=0> {
+      cond2[done] = 1'd1;
+      gt0.left = x_read0_0.out;
+      gt0.right = zero_0.out;
+    }
+    group let0<"static"=1> {
+      zero_0.in = const0.out;
+      zero_0.write_en = 1'd1;
+      let0[done] = zero_0.done;
+    }
+    group let1<"static"=1> {
+      i0.in = const1.out;
+      i0.write_en = 1'd1;
+      let1[done] = i0.done;
+    }
+    group let2<"static"=1> {
+      j0.in = const3.out;
+      j0.write_en = 1'd1;
+      let2[done] = j0.done;
+    }
+    group upd0<"static"=1> {
+      x_read0_0.write_en = 1'd1;
+      x0_0_addr1 = j0.out;
+      x0_0_addr0 = i0.out;
+      x_read0_0.in = 1'd1 ? x0_0_read_data;
+      upd0[done] = x_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      x_read1_0.write_en = 1'd1;
+      x0_0_addr1 = j0.out;
+      x0_0_addr0 = i0.out;
+      x_read1_0.in = 1'd1 ? x0_0_read_data;
+      upd1[done] = x_read1_0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      x10_0_addr1 = j0.out;
+      x10_0_addr0 = i0.out;
+      x10_0_write_en = 1'd1;
+      x10_0_write_data = 1'd1 ? x_read1_0.out;
+      upd2[done] = x10_0_done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      x10_0_addr1 = j0.out;
+      x10_0_addr0 = i0.out;
+      x10_0_write_en = 1'd1;
+      x10_0_write_data = 1'd1 ? const5.out;
+      upd3[done] = x10_0_done ? 1'd1;
+    }
+    group upd4<"static"=1> {
+      j0.write_en = 1'd1;
+      add0.left = j0.out;
+      add0.right = const6.out;
+      j0.in = 1'd1 ? add0.out;
+      upd4[done] = j0.done ? 1'd1;
+    }
+    group upd5<"static"=1> {
+      i0.write_en = 1'd1;
+      add1.left = i0.out;
+      add1.right = const7.out;
+      i0.in = 1'd1 ? add1.out;
+      upd5[done] = i0.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      let1;
+      while le0.out with cond0 {
+        seq {
+          let2;
+          while le1.out with cond1 {
+            seq {
+              upd0;
+              if gt0.out with cond2 {
+                seq {
+                  upd1;
+                  upd2;
+                }
+              } else {
+                upd3;
+              }
+              upd4;
+            }
+          }
+          upd5;
+        }
+      }
+    }
+  }
+}
+
+component main () -> () {
+  cells {
+    x1 = prim std_mem_d2(32, 2, 4, 2, 3);
+    x = prim std_mem_d2(32, 2, 4, 2, 3);
+    tensor2d_relu0 = tensor2d_relu;
+  }
+  wires {
+    group run_tensor2d_relu {
+      x.addr0 = tensor2d_relu0.x0_0_addr0;
+      tensor2d_relu0.x0_0_read_data = x.read_data;
+      x.addr1 = tensor2d_relu0.x0_0_addr1;
+      x1.addr0 = tensor2d_relu0.x10_0_addr0;
+      x1.addr1 = tensor2d_relu0.x10_0_addr1;
+      x1.write_data = tensor2d_relu0.x10_0_write_data;
+      x1.write_en = tensor2d_relu0.x10_0_write_en;
+      tensor2d_relu0.x10_0_done = x1.done;
+      tensor2d_relu0.go = 1'd1;
+      run_tensor2d_relu[done] = tensor2d_relu0.done ? 1'd1;
+    }
+  }
+  control {
+    seq {
+      run_tensor2d_relu;
+    }
+  }
+}
diff --git a/frontends/relay-futil/tests/relu.relay b/frontends/relay-futil/tests/relu.relay
new file mode 100644
index 0000000000..f8f324a033
--- /dev/null
+++ b/frontends/relay-futil/tests/relu.relay
@@ -0,0 +1,6 @@
+v0.0.4
+fn (%x: Tensor[(2, 4), int32]) {
+  let %x1: Tensor[(2, 4), int32] = nn.relu(%x);
+  %x1
+}
+
diff --git a/frontends/relay-futil/utilities.py b/frontends/relay-futil/utilities.py
index 89adfec513..8deeb217c1 100644
--- a/frontends/relay-futil/utilities.py
+++ b/frontends/relay-futil/utilities.py
@@ -17,25 +17,45 @@ def flatten(l):
     return new_list
 
 
-def get_bitwidth(type):
+def get_dahlia_data_type(relay_type):
     '''
-    Quick and dirty way to get the bitwidth.
+    Gets the Dahlia data type from the given Relay type.
+    NOTE: Currently, Dahlia does not support signed types for arrays.
     '''
-    t = str(type)
-    assert t[0:3] == 'int' or t[0:5] == 'float', f'{t} is not supported.'
-    begin = 3 if t[0:3] == 'int' else 5  # 'float'
-    return int(t[begin:len(t)])
+    if 'int' in relay_type: return 'ubit'
+    if 'float' in relay_type: return 'ufix'
+    assert False, f'{relay_type} is not supported.'
+
+
+def get_bitwidth(relay_type):
+    '''
+    Gets the bitwidth from a Relay type.
+    If the relay_type is floating point of size N, returns a fixed point of size <N, N/2>.
+    This lowers to a fixed point cell with `int_width` of size N/2, and a `fract_width` of size N/2.
+    '''
+    type = str(relay_type)
+    length = len(type)
+    if 'int' in type: return type[3:length]
+    if 'float' in type:
+        width = int(type[5:length])
+        return f'{width}, {int(width / 2)}'
+    assert False, f'{relay_type} is not supported.'
 
 
 def get_memory_parameters(type):
     '''
     Acquires the memory parameters necessary to create a FuTIL memory primitive.
+
+    A Tensor type in Relay is presented as: `Tensor[(dim1, dim2, ...), type]`.
+    For example, `Tensor[(2, 4), int32]` is a 2-dimensional tensor with data type int32.
+
+    We then parse this to determine the corresponding FuTIL and Dahlia types.
     '''
     t = str(type)
+    data_type = get_dahlia_data_type(t)
     if t[0:3] == 'int' or t[0:5] == 'float':
-        return [get_bitwidth(type), 1, 1], PrimitiveType.Memory1D
+        return [get_bitwidth(type), 1, 1], PrimitiveType.Memory1D, data_type
     assert t[0:6] == 'Tensor', f'{type} is not currently supported.'
-
     string_type = t[t.find(")") + 3:t.find("]")]
     string_dimensions = t[t.find("(") + 1:t.find(")")]
 
@@ -44,14 +64,19 @@ def get_memory_parameters(type):
     for dimension in tensor_dimensions: data.append(dimension)  # Size.
     for dimension in tensor_dimensions: data.append(int(math.log2(dimension) + 1))  # Index size.
 
-    if len(tensor_dimensions) == 2:
-        type = PrimitiveType.Memory2D
-    elif len(tensor_dimensions) == 3:
-        type = PrimitiveType.Memory3D
-    return data, type
+    if len(tensor_dimensions) == 1: primitive_type = PrimitiveType.Memory1D
+    if len(tensor_dimensions) == 2: primitive_type = PrimitiveType.Memory2D
+    if len(tensor_dimensions) == 3: primitive_type = PrimitiveType.Memory3D
+    return data, primitive_type, data_type
 
 
-def build_main(c: FComponent):
+def build_main_controls(c: FComponent):
+    '''
+    Builds the wires and control for the `main` component.
+    This is done by creating a group run_* with its respective
+    wiring for each Dahlia declaration, and adding it to the
+    control.
+    '''
     dahlia_declarations = []
     for cell in reversed(c.cells):
         if not cell.is_dahlia_declaration(): continue
@@ -66,9 +91,9 @@ def build_main(c: FComponent):
             wires.append(FWire(f'{prim.name}.addr0', f'{declaration.decl_name}.{input.dahlia_name}_addr0'))
             wires.append(
                 FWire(f'{declaration.decl_name}.{input.dahlia_name}_read_data', f'{prim.name}.read_data'))
-            if not prim.type == PrimitiveType.Memory2D and not prim.type == PrimitiveType.Memory3D: continue
+            if prim.type == PrimitiveType.Memory1D: continue
             wires.append(FWire(f'{prim.name}.addr1', f'{declaration.decl_name}.{input.dahlia_name}_addr1'))
-            if not prim.type == PrimitiveType.Memory3D: continue
+            if prim.type == PrimitiveType.Memory2D: continue
             wires.append(FWire(f'{prim.name}.addr2', f'{declaration.decl_name}.{input.dahlia_name}_addr2'))
 
         output = declaration.output
@@ -87,6 +112,6 @@ def build_main(c: FComponent):
         wires.append(FWire(f'{group_name}[done]', f"{declaration.decl_name}.done ? 1'd1"))
         c.wires.append(FConnection(group=FGroup(name=group_name, wires=wires, attributes=[])))
 
-    # Ensures that only group names make it into the Controls of a component.
+    # Ensures that only group names make it into the controls of a component.
     connections = list(filter(lambda w: w.is_group(), c.wires))
     c.controls = [Seq(stmts=list(map(lambda w: w.group.name, connections)))]

From 8980ed9b134ccd66e9299be0318190b5e267c7fd Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Thu, 12 Nov 2020 10:36:13 -0500
Subject: [PATCH 13/75] [relay] Clarify comment.

---
 frontends/relay-futil/dahlia_functions.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index fd0b3db770..e2c5729d16 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -172,8 +172,8 @@ def batch_matmul(declaration):
     assert op1.data_type == op2.data_type and op2.data_type == res.data_type
 
     # 1. Get transpose of second operand.
-    # 2. Conduct temporary = op1 * transpose(op2).
-    # 3. Write temporary value to return value.*
+    # 2. Create temporary value `t`. Then, t = op1 * transpose(op2).
+    # 3. Copy temporary value to return value.*
     #    * This third step may not be necessary, but trying to conduct the matrix multiply
     #      directly with the return value declared resulted in incorrect outputs.
     program = f"""

From 9b08d295a5612535d8c6f52f46fab6ee9fb2cd22 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Sat, 14 Nov 2020 20:55:40 -0500
Subject: [PATCH 14/75] Rename to batch_flatten.

---
 frontends/relay-futil/compiler.py              |  2 +-
 frontends/relay-futil/dahlia_functions.py      |  2 +-
 ...tch_flatten.expect => batch_flatten.expect} |  0
 ...batch_flatten.relay => batch_flatten.relay} |  0
 .../tests/data/tensor3d_batch_flatten.expect   | 18 ------------------
 .../data/tensor3d_batch_flatten.relay.data     | 10 ----------
 .../tests/tensor3d_batch_flatten.relay         |  6 ------
 7 files changed, 2 insertions(+), 36 deletions(-)
 rename frontends/relay-futil/tests/{tensor3d_batch_flatten.expect => batch_flatten.expect} (100%)
 rename frontends/relay-futil/tests/{data/tensor3d_batch_flatten.relay => batch_flatten.relay} (100%)
 delete mode 100644 frontends/relay-futil/tests/data/tensor3d_batch_flatten.expect
 delete mode 100644 frontends/relay-futil/tests/data/tensor3d_batch_flatten.relay.data
 delete mode 100644 frontends/relay-futil/tests/tensor3d_batch_flatten.relay

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index 2b66ac0f7e..93da8c9f5e 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -74,7 +74,7 @@ def get_dahlia_declaration(self, function_name, cells, args):
                 function, name = tensor2d_op, f'tensor2d_{function_name}'
 
         if function_name == "nn.batch_flatten":
-            if input_type == PrimitiveType.Memory3D: function = tensor3d_batch_flatten
+            if input_type == PrimitiveType.Memory3D: function = batch_flatten
         elif function_name == "nn.batch_matmul":
             function = batch_matmul
         elif function_name == "nn.bias_add":
diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index e2c5729d16..2c72f19b87 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -83,7 +83,7 @@ def tensor2d_op(declaration):
     return lower_dahlia_program(program, declaration.component_name)
 
 
-def tensor3d_batch_flatten(declaration):
+def batch_flatten(declaration):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.batch_flatten"""
     op1, res = declaration.inputs[0].primitive, declaration.output.primitive
     bitwidth, op1_size0, op1_size1, op1_size2 = op1.data[0], op1.data[1], op1.data[2], op1.data[3]
diff --git a/frontends/relay-futil/tests/tensor3d_batch_flatten.expect b/frontends/relay-futil/tests/batch_flatten.expect
similarity index 100%
rename from frontends/relay-futil/tests/tensor3d_batch_flatten.expect
rename to frontends/relay-futil/tests/batch_flatten.expect
diff --git a/frontends/relay-futil/tests/data/tensor3d_batch_flatten.relay b/frontends/relay-futil/tests/batch_flatten.relay
similarity index 100%
rename from frontends/relay-futil/tests/data/tensor3d_batch_flatten.relay
rename to frontends/relay-futil/tests/batch_flatten.relay
diff --git a/frontends/relay-futil/tests/data/tensor3d_batch_flatten.expect b/frontends/relay-futil/tests/data/tensor3d_batch_flatten.expect
deleted file mode 100644
index 4d55d4d415..0000000000
--- a/frontends/relay-futil/tests/data/tensor3d_batch_flatten.expect
+++ /dev/null
@@ -1,18 +0,0 @@
-{
-  "x": [
-    [
-      1,
-      2
-    ],
-    [
-      3,
-      4
-    ]
-  ],
-  "x1": [
-    1,
-    2,
-    3,
-    4
-  ]
-}
diff --git a/frontends/relay-futil/tests/data/tensor3d_batch_flatten.relay.data b/frontends/relay-futil/tests/data/tensor3d_batch_flatten.relay.data
deleted file mode 100644
index b6c5eae239..0000000000
--- a/frontends/relay-futil/tests/data/tensor3d_batch_flatten.relay.data
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-  "x": {
-    "data": [[1, 2], [3, 4]],
-    "bitwidth": 32
-  },
-  "x1": {
-    "data": [1, 2, 3, 4],
-    "bitwidth": 32
-  }
-}
\ No newline at end of file
diff --git a/frontends/relay-futil/tests/tensor3d_batch_flatten.relay b/frontends/relay-futil/tests/tensor3d_batch_flatten.relay
deleted file mode 100644
index 2a5e223fec..0000000000
--- a/frontends/relay-futil/tests/tensor3d_batch_flatten.relay
+++ /dev/null
@@ -1,6 +0,0 @@
-v0.0.4
-fn (%x: Tensor[(1, 2, 2), int32]) -> Tensor[(1, 4), int32] {
-  let %x1: Tensor[(1, 4), int32] = nn.batch_flatten(%x);
-  %x1
-}
-

From 806e93d2ca8a1ff657ec9965db56850d718217bc Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Sat, 14 Nov 2020 20:58:59 -0500
Subject: [PATCH 15/75] Fix batch_flatten, add test for 1d tensor binary ops

---
 .../relay-futil/tests/batch_flatten.expect    |  30 ++---
 .../relay-futil/tests/tensor1d_mult.expect    | 106 ++++++++++++++++++
 .../relay-futil/tests/tensor1d_mult.relay     |   6 +
 3 files changed, 127 insertions(+), 15 deletions(-)
 create mode 100644 frontends/relay-futil/tests/tensor1d_mult.expect
 create mode 100644 frontends/relay-futil/tests/tensor1d_mult.relay

diff --git a/frontends/relay-futil/tests/batch_flatten.expect b/frontends/relay-futil/tests/batch_flatten.expect
index b04fd2d7b3..6927e4ad85 100644
--- a/frontends/relay-futil/tests/batch_flatten.expect
+++ b/frontends/relay-futil/tests/batch_flatten.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component tensor3d_batch_flatten(go: 1, clk: 1, x0_0_0_read_data: 32, x0_0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1) -> (done: 1, x0_0_0_addr0: 1, x0_0_0_addr1: 2, x0_0_0_addr2: 2, x0_0_0_write_data: 32, x0_0_0_write_en: 1, x0_0_0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 3, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1) {
+component batch_flatten(go: 1, clk: 1, x0_0_0_read_data: 32, x0_0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1) -> (done: 1, x0_0_0_addr0: 1, x0_0_0_addr1: 2, x0_0_0_addr2: 2, x0_0_0_write_data: 32, x0_0_0_write_en: 1, x0_0_0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 3, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1) {
   cells {
     add0 = prim std_add(3);
     add1 = prim std_add(2);
@@ -139,26 +139,26 @@ component main () -> () {
   cells {
     x1 = prim std_mem_d2(32, 1, 4, 1, 3);
     x = prim std_mem_d3(32, 1, 2, 2, 1, 2, 2);
-    tensor3d_batch_flatten0 = tensor3d_batch_flatten;
+    batch_flatten0 = batch_flatten;
   }
   wires {
-    group run_tensor3d_batch_flatten {
-      x.addr0 = tensor3d_batch_flatten0.x0_0_0_addr0;
-      tensor3d_batch_flatten0.x0_0_0_read_data = x.read_data;
-      x.addr1 = tensor3d_batch_flatten0.x0_0_0_addr1;
-      x.addr2 = tensor3d_batch_flatten0.x0_0_0_addr2;
-      x1.addr0 = tensor3d_batch_flatten0.x10_0_addr0;
-      x1.addr1 = tensor3d_batch_flatten0.x10_0_addr1;
-      x1.write_data = tensor3d_batch_flatten0.x10_0_write_data;
-      x1.write_en = tensor3d_batch_flatten0.x10_0_write_en;
-      tensor3d_batch_flatten0.x10_0_done = x1.done;
-      tensor3d_batch_flatten0.go = 1'd1;
-      run_tensor3d_batch_flatten[done] = tensor3d_batch_flatten0.done ? 1'd1;
+    group run_batch_flatten {
+      x.addr0 = batch_flatten0.x0_0_0_addr0;
+      batch_flatten0.x0_0_0_read_data = x.read_data;
+      x.addr1 = batch_flatten0.x0_0_0_addr1;
+      x.addr2 = batch_flatten0.x0_0_0_addr2;
+      x1.addr0 = batch_flatten0.x10_0_addr0;
+      x1.addr1 = batch_flatten0.x10_0_addr1;
+      x1.write_data = batch_flatten0.x10_0_write_data;
+      x1.write_en = batch_flatten0.x10_0_write_en;
+      batch_flatten0.x10_0_done = x1.done;
+      batch_flatten0.go = 1'd1;
+      run_batch_flatten[done] = batch_flatten0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_tensor3d_batch_flatten;
+      run_batch_flatten;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/tensor1d_mult.expect b/frontends/relay-futil/tests/tensor1d_mult.expect
new file mode 100644
index 0000000000..d6086cd33d
--- /dev/null
+++ b/frontends/relay-futil/tests/tensor1d_mult.expect
@@ -0,0 +1,106 @@
+import "primitives/std.lib";
+
+component tensor1d_multiply(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, x10_read_data: 32, x10_done: 1, y0_read_data: 32, y0_done: 1) -> (done: 1, x0_addr0: 3, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, x10_addr0: 3, x10_write_data: 32, x10_write_en: 1, x10_clk: 1, y0_addr0: 3, y0_write_data: 32, y0_write_en: 1, y0_clk: 1) {
+  cells {
+    add0 = prim std_add(3);
+    bin_read0_0 = prim std_reg(32);
+    const0 = prim std_const(3, 0);
+    const1 = prim std_const(3, 3);
+    const2 = prim std_const(3, 1);
+    i0 = prim std_reg(3);
+    le0 = prim std_le(3);
+    mult_pipe0 = prim std_mult_pipe(32);
+    x_read0_0 = prim std_reg(32);
+    y_read0_0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
+    }
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
+    }
+    group let1<"static"=4> {
+      bin_read0_0.in = mult_pipe0.out;
+      bin_read0_0.write_en = mult_pipe0.done;
+      let1[done] = bin_read0_0.done;
+      mult_pipe0.left = x_read0_0.out;
+      mult_pipe0.right = y_read0_0.out;
+      mult_pipe0.go = !mult_pipe0.done ? 1'd1;
+    }
+    group upd0<"static"=1> {
+      x_read0_0.write_en = 1'd1;
+      x0_addr0 = i0.out;
+      x_read0_0.in = 1'd1 ? x0_read_data;
+      upd0[done] = x_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      y_read0_0.write_en = 1'd1;
+      y0_addr0 = i0.out;
+      y_read0_0.in = 1'd1 ? y0_read_data;
+      upd1[done] = y_read0_0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      x10_addr0 = i0.out;
+      x10_write_en = 1'd1;
+      x10_write_data = 1'd1 ? bin_read0_0.out;
+      upd2[done] = x10_done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      i0.write_en = 1'd1;
+      add0.left = i0.out;
+      add0.right = const2.out;
+      i0.in = 1'd1 ? add0.out;
+      upd3[done] = i0.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      while le0.out with cond0 {
+        seq {
+          par {
+            upd0;
+            upd1;
+          }
+          let1;
+          upd2;
+          upd3;
+        }
+      }
+    }
+  }
+}
+
+component main () -> () {
+  cells {
+    x1 = prim std_mem_d1(32, 4, 3);
+    x = prim std_mem_d1(32, 4, 3);
+    y = prim std_mem_d1(32, 4, 3);
+    tensor1d_multiply0 = tensor1d_multiply;
+  }
+  wires {
+    group run_tensor1d_multiply {
+      x.addr0 = tensor1d_multiply0.x0_addr0;
+      tensor1d_multiply0.x0_read_data = x.read_data;
+      y.addr0 = tensor1d_multiply0.y0_addr0;
+      tensor1d_multiply0.y0_read_data = y.read_data;
+      x1.addr0 = tensor1d_multiply0.x10_addr0;
+      x1.write_data = tensor1d_multiply0.x10_write_data;
+      x1.write_en = tensor1d_multiply0.x10_write_en;
+      tensor1d_multiply0.x10_done = x1.done;
+      tensor1d_multiply0.go = 1'd1;
+      run_tensor1d_multiply[done] = tensor1d_multiply0.done ? 1'd1;
+    }
+  }
+  control {
+    seq {
+      run_tensor1d_multiply;
+    }
+  }
+}
diff --git a/frontends/relay-futil/tests/tensor1d_mult.relay b/frontends/relay-futil/tests/tensor1d_mult.relay
new file mode 100644
index 0000000000..5c87584cf4
--- /dev/null
+++ b/frontends/relay-futil/tests/tensor1d_mult.relay
@@ -0,0 +1,6 @@
+v0.0.4
+fn (%x: Tensor[(4), int32], %y: Tensor[(4), int32]) {
+  let %x1 = multiply(%x, %y);
+  %x1
+}
+

From 425a87c80d01b43ad9dbb85b33cf7aaf67d41be6 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Sat, 14 Nov 2020 21:08:16 -0500
Subject: [PATCH 16/75] [relay] add binary ops for 3d tensors.

---
 frontends/relay-futil/compiler.py             |   2 +
 frontends/relay-futil/dahlia_functions.py     |  23 +++
 .../relay-futil/tests/tensor3d_divide.expect  | 176 ++++++++++++++++++
 .../relay-futil/tests/tensor3d_divide.relay   |   6 +
 4 files changed, 207 insertions(+)
 create mode 100644 frontends/relay-futil/tests/tensor3d_divide.expect
 create mode 100644 frontends/relay-futil/tests/tensor3d_divide.relay

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index 93da8c9f5e..3ff627b37b 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -72,6 +72,8 @@ def get_dahlia_declaration(self, function_name, cells, args):
                 function, name = tensor1d_op, f'tensor1d_{function_name}'
             elif input_type == PrimitiveType.Memory2D:
                 function, name = tensor2d_op, f'tensor2d_{function_name}'
+            elif input_type == PrimitiveType.Memory3D:
+                function, name = tensor3d_op, f'tensor3d_{function_name}'
 
         if function_name == "nn.batch_flatten":
             if input_type == PrimitiveType.Memory3D: function = batch_flatten
diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 2c72f19b87..63a4c09e42 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -83,6 +83,29 @@ def tensor2d_op(declaration):
     return lower_dahlia_program(program, declaration.component_name)
 
 
+def tensor3d_op(declaration):
+    op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
+    bitwidth, size0, size1, size2, = op1.data[0], op1.data[1], op1.data[2], op1.data[3]
+    index_size0, index_size1, index_size2 = op1.data[4], op1.data[5], op1.data[6]
+    assert op1.type == PrimitiveType.Memory3D and op1.type == op2.type and op2.type == res.type
+    assert bitwidth == op2.data[0] and op1.data[0] == res.data[0] and op2.data[4] == res.data[4]
+    assert size0 == op2.data[1] and op2.data[1] == res.data[1] and size1 == op2.data[2] and op2.data[2] == res.data[2]
+    assert index_size0 == op2.data[4] and op2.data[4] == res.data[4] and index_size1 == op2.data[5]
+    assert index_size2 == op2.data[6] and op2.data[6] == res.data[6]
+    program = f"""
+    decl {op1.name}: {op1.data_type}<{bitwidth}>[{size0}][{size1}][{size2}];
+    decl {op2.name}: {op2.data_type}<{bitwidth}>[{size0}][{size1}][{size2}];
+    decl {res.name}: {res.data_type}<{bitwidth}>[{size0}][{size1}][{size2}];
+    for (let i: ubit<{index_size0}> = 0..{size0}) {{
+      for (let j: ubit<{index_size1}> = 0..{size1}) {{
+        for (let k: ubit<{index_size2}> = 0..{size2}) {{
+          {res.name}[i][j][k] := {op1.name}[i][j][k] {declaration.op} {op2.name}[i][j][k];
+        }}
+      }}
+    }}"""
+    return lower_dahlia_program(program, declaration.component_name)
+
+
 def batch_flatten(declaration):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.batch_flatten"""
     op1, res = declaration.inputs[0].primitive, declaration.output.primitive
diff --git a/frontends/relay-futil/tests/tensor3d_divide.expect b/frontends/relay-futil/tests/tensor3d_divide.expect
new file mode 100644
index 0000000000..10eb243cc1
--- /dev/null
+++ b/frontends/relay-futil/tests/tensor3d_divide.expect
@@ -0,0 +1,176 @@
+import "primitives/std.lib";
+
+component tensor3d_divide(go: 1, clk: 1, x0_0_0_read_data: 32, x0_0_0_done: 1, x10_0_0_read_data: 32, x10_0_0_done: 1, y0_0_0_read_data: 32, y0_0_0_done: 1) -> (done: 1, x0_0_0_addr0: 3, x0_0_0_addr1: 3, x0_0_0_addr2: 3, x0_0_0_write_data: 32, x0_0_0_write_en: 1, x0_0_0_clk: 1, x10_0_0_addr0: 3, x10_0_0_addr1: 3, x10_0_0_addr2: 3, x10_0_0_write_data: 32, x10_0_0_write_en: 1, x10_0_0_clk: 1, y0_0_0_addr0: 3, y0_0_0_addr1: 3, y0_0_0_addr2: 3, y0_0_0_write_data: 32, y0_0_0_write_en: 1, y0_0_0_clk: 1) {
+  cells {
+    add0 = prim std_add(3);
+    add1 = prim std_add(3);
+    add2 = prim std_add(3);
+    bin_read0_0 = prim std_reg(32);
+    const0 = prim std_const(3, 0);
+    const1 = prim std_const(3, 3);
+    const2 = prim std_const(3, 0);
+    const3 = prim std_const(3, 4);
+    const4 = prim std_const(3, 0);
+    const5 = prim std_const(3, 5);
+    const6 = prim std_const(3, 1);
+    const7 = prim std_const(3, 1);
+    const8 = prim std_const(3, 1);
+    div_pipe0 = prim std_div_pipe(32);
+    i0 = prim std_reg(3);
+    j0 = prim std_reg(3);
+    k0 = prim std_reg(3);
+    le0 = prim std_le(3);
+    le1 = prim std_le(3);
+    le2 = prim std_le(3);
+    x_read0_0 = prim std_reg(32);
+    y_read0_0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = j0.out;
+      le1.right = const3.out;
+    }
+    group cond2<"static"=0> {
+      cond2[done] = 1'd1;
+      le2.left = k0.out;
+      le2.right = const5.out;
+    }
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
+    }
+    group let1<"static"=1> {
+      j0.in = const2.out;
+      j0.write_en = 1'd1;
+      let1[done] = j0.done;
+    }
+    group let2<"static"=1> {
+      k0.in = const4.out;
+      k0.write_en = 1'd1;
+      let2[done] = k0.done;
+    }
+    group let3<> {
+      bin_read0_0.in = div_pipe0.out;
+      bin_read0_0.write_en = div_pipe0.done;
+      let3[done] = bin_read0_0.done;
+      div_pipe0.left = x_read0_0.out;
+      div_pipe0.right = y_read0_0.out;
+      div_pipe0.go = !div_pipe0.done ? 1'd1;
+    }
+    group upd0<"static"=1> {
+      x_read0_0.write_en = 1'd1;
+      x0_0_0_addr2 = k0.out;
+      x0_0_0_addr1 = j0.out;
+      x0_0_0_addr0 = i0.out;
+      x_read0_0.in = 1'd1 ? x0_0_0_read_data;
+      upd0[done] = x_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      y_read0_0.write_en = 1'd1;
+      y0_0_0_addr2 = k0.out;
+      y0_0_0_addr1 = j0.out;
+      y0_0_0_addr0 = i0.out;
+      y_read0_0.in = 1'd1 ? y0_0_0_read_data;
+      upd1[done] = y_read0_0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      x10_0_0_addr2 = k0.out;
+      x10_0_0_addr1 = j0.out;
+      x10_0_0_addr0 = i0.out;
+      x10_0_0_write_en = 1'd1;
+      x10_0_0_write_data = 1'd1 ? bin_read0_0.out;
+      upd2[done] = x10_0_0_done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      k0.write_en = 1'd1;
+      add0.left = k0.out;
+      add0.right = const6.out;
+      k0.in = 1'd1 ? add0.out;
+      upd3[done] = k0.done ? 1'd1;
+    }
+    group upd4<"static"=1> {
+      j0.write_en = 1'd1;
+      add1.left = j0.out;
+      add1.right = const7.out;
+      j0.in = 1'd1 ? add1.out;
+      upd4[done] = j0.done ? 1'd1;
+    }
+    group upd5<"static"=1> {
+      i0.write_en = 1'd1;
+      add2.left = i0.out;
+      add2.right = const8.out;
+      i0.in = 1'd1 ? add2.out;
+      upd5[done] = i0.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      while le0.out with cond0 {
+        seq {
+          let1;
+          while le1.out with cond1 {
+            seq {
+              let2;
+              while le2.out with cond2 {
+                seq {
+                  par {
+                    upd0;
+                    upd1;
+                  }
+                  let3;
+                  upd2;
+                  upd3;
+                }
+              }
+              upd4;
+            }
+          }
+          upd5;
+        }
+      }
+    }
+  }
+}
+
+component main () -> () {
+  cells {
+    x1 = prim std_mem_d3(32, 4, 5, 6, 3, 3, 3);
+    x = prim std_mem_d3(32, 4, 5, 6, 3, 3, 3);
+    y = prim std_mem_d3(32, 4, 5, 6, 3, 3, 3);
+    tensor3d_divide0 = tensor3d_divide;
+  }
+  wires {
+    group run_tensor3d_divide {
+      x.addr0 = tensor3d_divide0.x0_0_0_addr0;
+      tensor3d_divide0.x0_0_0_read_data = x.read_data;
+      x.addr1 = tensor3d_divide0.x0_0_0_addr1;
+      x.addr2 = tensor3d_divide0.x0_0_0_addr2;
+      y.addr0 = tensor3d_divide0.y0_0_0_addr0;
+      tensor3d_divide0.y0_0_0_read_data = y.read_data;
+      y.addr1 = tensor3d_divide0.y0_0_0_addr1;
+      y.addr2 = tensor3d_divide0.y0_0_0_addr2;
+      x1.addr0 = tensor3d_divide0.x10_0_0_addr0;
+      x1.addr1 = tensor3d_divide0.x10_0_0_addr1;
+      x1.addr2 = tensor3d_divide0.x10_0_0_addr2;
+      x1.write_data = tensor3d_divide0.x10_0_0_write_data;
+      x1.write_en = tensor3d_divide0.x10_0_0_write_en;
+      tensor3d_divide0.x10_0_0_done = x1.done;
+      tensor3d_divide0.go = 1'd1;
+      run_tensor3d_divide[done] = tensor3d_divide0.done ? 1'd1;
+    }
+  }
+  control {
+    seq {
+      run_tensor3d_divide;
+    }
+  }
+}
diff --git a/frontends/relay-futil/tests/tensor3d_divide.relay b/frontends/relay-futil/tests/tensor3d_divide.relay
new file mode 100644
index 0000000000..3a9c5a995d
--- /dev/null
+++ b/frontends/relay-futil/tests/tensor3d_divide.relay
@@ -0,0 +1,6 @@
+v0.0.4
+fn (%x: Tensor[(4,5,6), int32], %y: Tensor[(4,5,6), int32]) {
+  let %x1 = divide(%x, %y);
+  %x1
+}
+

From a24a4dbe48b2b2e8eea9651705017281ba3443b8 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Sat, 14 Nov 2020 21:13:52 -0500
Subject: [PATCH 17/75] Add additional assert statement.

---
 frontends/relay-futil/dahlia_functions.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 63a4c09e42..2782620cf4 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -90,6 +90,7 @@ def tensor3d_op(declaration):
     assert op1.type == PrimitiveType.Memory3D and op1.type == op2.type and op2.type == res.type
     assert bitwidth == op2.data[0] and op1.data[0] == res.data[0] and op2.data[4] == res.data[4]
     assert size0 == op2.data[1] and op2.data[1] == res.data[1] and size1 == op2.data[2] and op2.data[2] == res.data[2]
+    assert size2 == op2.data[3] and op2.data[3] == res.data[3]
     assert index_size0 == op2.data[4] and op2.data[4] == res.data[4] and index_size1 == op2.data[5]
     assert index_size2 == op2.data[6] and op2.data[6] == res.data[6]
     program = f"""

From 46dc53648bc6e569db9831316938d38bdd17f79b Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Fri, 20 Nov 2020 15:10:36 -0500
Subject: [PATCH 18/75] Add 4d tensor ops.

---
 frontends/relay-futil/compiler.py         |   6 +-
 frontends/relay-futil/dahlia_functions.py | 116 +++++++++------
 frontends/relay-futil/futil_ast.py        |   1 +
 frontends/relay-futil/pretty_print.py     |  11 ++
 frontends/relay-futil/tests/relu.expect   | 174 +++++++++++++++-------
 frontends/relay-futil/tests/relu.relay    |   4 +-
 frontends/relay-futil/utilities.py        |   1 +
 7 files changed, 210 insertions(+), 103 deletions(-)

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index 3ff627b37b..dc5c47df93 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -56,6 +56,7 @@ def produce_dahlia_name(self, name, type):
         if type == PrimitiveType.Memory1D: return dahlia_name
         if type == PrimitiveType.Memory2D: return dahlia_name + "_0"
         if type == PrimitiveType.Memory3D: return dahlia_name + "_0_0"
+        if type == PrimitiveType.Memory4D: return dahlia_name + "_0_0_0"
         assert False, f'{name} with {type} is not supported yet.'
 
     def get_dahlia_declaration(self, function_name, cells, args):
@@ -74,6 +75,8 @@ def get_dahlia_declaration(self, function_name, cells, args):
                 function, name = tensor2d_op, f'tensor2d_{function_name}'
             elif input_type == PrimitiveType.Memory3D:
                 function, name = tensor3d_op, f'tensor3d_{function_name}'
+            elif input_type == PrimitiveType.Memory4D:
+                function, name = tensor4d_op, f'tensor4d_{function_name}'
 
         if function_name == "nn.batch_flatten":
             if input_type == PrimitiveType.Memory3D: function = batch_flatten
@@ -82,7 +85,8 @@ def get_dahlia_declaration(self, function_name, cells, args):
         elif function_name == "nn.bias_add":
             if input_type == PrimitiveType.Memory2D: function = tensor2d_bias_add
         elif function_name == "nn.relu":
-            if input_type == PrimitiveType.Memory2D: function = tensor2d_relu
+            assert input_type == PrimitiveType.Memory2D or input_type == PrimitiveType.Memory4D
+            function = relu
 
         assert function != None, f'{function_name} with type {input_type} is not supported.'
         if name == None: name = function.__name__
diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 2782620cf4..1f4bf8b80d 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -48,11 +48,6 @@ def lower_dahlia_program(prog, component_name):
 
 def tensor1d_op(declaration):
     op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
-
-    assert op1.data_type == op2.data_type and op2.data_type == res.data_type
-    assert op1.type == PrimitiveType.Memory1D and op1.type == op2.type and op2.type == res.type
-    assert op1.data[0] == op2.data[0] and op1.data[0] == res.data[0] and op1.data[1] == op2.data[1]
-    assert op1.data[2] == op2.data[2] and op2.data[2] == res.data[2] and op2.data[1] == res.data[1]
     bitwidth, size, index_size = op1.data[0], op1.data[1], op1.data[2]
     program = f"""
     decl {op1.name}: {op1.data_type}<{bitwidth}>[{size}];
@@ -67,10 +62,6 @@ def tensor1d_op(declaration):
 def tensor2d_op(declaration):
     op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
     bitwidth, size0, size1, index_size0, index_size1 = op1.data[0], op1.data[1], op1.data[2], op1.data[3], op1.data[4]
-    assert op1.type == PrimitiveType.Memory2D and op1.type == op2.type and op2.type == res.type
-    assert bitwidth == op2.data[0] and op1.data[0] == res.data[0] and op2.data[4] == res.data[4]
-    assert size0 == op2.data[1] and op2.data[1] == res.data[1] and size1 == op2.data[2] and op2.data[2] == res.data[2]
-    assert index_size0 == op2.data[3] and op2.data[3] == res.data[3] and index_size1 == op2.data[4]
     program = f"""
     decl {op1.name}: {op1.data_type}<{bitwidth}>[{size0}][{size1}];
     decl {op2.name}: {op2.data_type}<{bitwidth}>[{size0}][{size1}];
@@ -85,14 +76,8 @@ def tensor2d_op(declaration):
 
 def tensor3d_op(declaration):
     op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
-    bitwidth, size0, size1, size2, = op1.data[0], op1.data[1], op1.data[2], op1.data[3]
+    bitwidth, size0, size1, size2 = op1.data[0], op1.data[1], op1.data[2], op1.data[3]
     index_size0, index_size1, index_size2 = op1.data[4], op1.data[5], op1.data[6]
-    assert op1.type == PrimitiveType.Memory3D and op1.type == op2.type and op2.type == res.type
-    assert bitwidth == op2.data[0] and op1.data[0] == res.data[0] and op2.data[4] == res.data[4]
-    assert size0 == op2.data[1] and op2.data[1] == res.data[1] and size1 == op2.data[2] and op2.data[2] == res.data[2]
-    assert size2 == op2.data[3] and op2.data[3] == res.data[3]
-    assert index_size0 == op2.data[4] and op2.data[4] == res.data[4] and index_size1 == op2.data[5]
-    assert index_size2 == op2.data[6] and op2.data[6] == res.data[6]
     program = f"""
     decl {op1.name}: {op1.data_type}<{bitwidth}>[{size0}][{size1}][{size2}];
     decl {op2.name}: {op2.data_type}<{bitwidth}>[{size0}][{size1}][{size2}];
@@ -107,6 +92,26 @@ def tensor3d_op(declaration):
     return lower_dahlia_program(program, declaration.component_name)
 
 
+def tensor4d_op(declaration):
+    op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
+    bitwidth, size0, size1, size2, size3 = op1.data[0], op1.data[1], op1.data[2], op1.data[3], op1.data[4]
+    index_size0, index_size1, index_size2, index_size3 = op1.data[5], op1.data[6], op1.data[7], op1.data[8]
+    program = f"""
+    decl {op1.name}: {op1.data_type}<{bitwidth}>[{size0}][{size1}][{size2}][{size3}];
+    decl {op2.name}: {op2.data_type}<{bitwidth}>[{size0}][{size1}][{size2}][{size3}];
+    decl {res.name}: {res.data_type}<{bitwidth}>[{size0}][{size1}][{size2}][{size3}];
+    for (let i: ubit<{index_size0}> = 0..{size0}) {{
+      for (let j: ubit<{index_size1}> = 0..{size1}) {{
+        for (let k: ubit<{index_size2}> = 0..{size2}) {{
+          for (let l: ubit<{index_size3}> = 0..{size3}) {{
+            {res.name}[i][j][k][l] := {op1.name}[i][j][k][l] {declaration.op} {op2.name}[i][j][k][l];
+          }}
+        }}
+      }}
+    }}"""
+    return lower_dahlia_program(program, declaration.component_name)
+
+
 def batch_flatten(declaration):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.batch_flatten"""
     op1, res = declaration.inputs[0].primitive, declaration.output.primitive
@@ -114,10 +119,6 @@ def batch_flatten(declaration):
     op1_index_size0, op1_index_size1, op1_index_size2 = op1.data[4], op1.data[5], op1.data[6]
     res_bitwidth, res_size0, res_size1 = res.data[0], res.data[1], res.data[2]
     res_index_size0, res_index_size1 = res.data[3], res.data[4]
-
-    assert op1.type == PrimitiveType.Memory3D and res_size1 == op1_size1 * op1_size2 and res_size0 == op1_size0
-    assert res.type == PrimitiveType.Memory2D and res_bitwidth == bitwidth and op1.data_type == res.data_type
-    assert op1.data_type == res.data_type
     program = f"""
         decl {op1.name}: {op1.data_type}<{bitwidth}>[{op1_size0}][{op1_size1}][{op1_size2}];
         decl {res.name}: {res.data_type}<{bitwidth}>[{res_size0}][{res_size1}];
@@ -140,10 +141,6 @@ def tensor2d_bias_add(declaration):
     bitwidth = data.data[0]
     size0, size1, index_size0, index_size1 = data.data[1], data.data[2], data.data[3], data.data[4]
     bias_size, bias_index_size = bias.data[1], bias.data[2]
-
-    assert bitwidth == res.data[0] and bitwidth == bias.data[0]
-    assert size0 == res.data[1] and size1 == res.data[2] and bias_size == size1
-    assert bias.type == PrimitiveType.Memory1D and data.type == PrimitiveType.Memory2D and data.type == res.type
     program = f"""
     decl {data.name}: {data.data_type}<{bitwidth}>[{size0}][{size1}];
     decl {bias.name}: {bias.data_type}<{bitwidth}>[{bias_size}];
@@ -161,28 +158,58 @@ def tensor2d_bias_add(declaration):
 #  1. This won't work for fixed point currently, since Dahlia
 #     will not take fixed point operands for the `>` operator.
 #  2. Without signed bit array support, this is also meaningless.
-def tensor2d_relu(declaration):
+def relu(declaration):
     op1, res = declaration.inputs[0].primitive, declaration.output.primitive
     assert res.data_type == 'ubit', f'{res.data_type} is not currently supported for ReLU.'
-    bitwidth, op1_size0, op1_size1 = op1.data[0], op1.data[1], op1.data[2]
-    op1_index_size0, op1_index_size1 = op1.data[3], op1.data[4]
-    res_bitwidth, res_size0, res_size1 = res.data[0], res.data[1], res.data[2]
-    res_index_size0, res_index_size1 = res.data[3], res.data[4]
-    program = f"""
-    decl {op1.name}: {op1.data_type}<{bitwidth}>[{op1_size0}][{op1_size1}];
-    decl {res.name}: {res.data_type}<{bitwidth}>[{res_size0}][{res_size1}];
-    let zero: {op1.data_type}<{bitwidth}> = 0;
-    for (let i: ubit<{op1_index_size0}> = 0..{op1_size0}) {{
-      for (let j: ubit<{op1_index_size1}> = 0..{op1_size1}) {{
-        if ({op1.name}[i][j] > zero) {{
-          {res.name}[i][j] := {op1.name}[i][j];
-        }} else {{
-          {res.name}[i][j] := 0;
+
+    if op1.type == PrimitiveType.Memory2D:
+        bitwidth, op1_size0, op1_size1 = op1.data[0], op1.data[1], op1.data[2]
+        op1_index_size0, op1_index_size1 = op1.data[3], op1.data[4]
+        res_bitwidth, res_size0, res_size1 = res.data[0], res.data[1], res.data[2]
+        res_index_size0, res_index_size1 = res.data[3], res.data[4]
+        program = f"""
+        decl {op1.name}: {op1.data_type}<{bitwidth}>[{op1_size0}][{op1_size1}];
+        decl {res.name}: {res.data_type}<{bitwidth}>[{res_size0}][{res_size1}];
+        let zero: {op1.data_type}<{bitwidth}> = 0;
+        for (let i: ubit<{op1_index_size0}> = 0..{op1_size0}) {{
+          for (let j: ubit<{op1_index_size1}> = 0..{op1_size1}) {{
+            if ({op1.name}[i][j] > zero) {{
+              {res.name}[i][j] := {op1.name}[i][j];
+            }} else {{
+              {res.name}[i][j] := 0;
+            }}
+          }}
         }}
-      }}
-    }}
-    """
-    return lower_dahlia_program(program, declaration.component_name)
+        """
+        return lower_dahlia_program(program, declaration.component_name)
+
+    elif op1.type == PrimitiveType.Memory4D:
+        bitwidth, op1_size0, op1_size1 = op1.data[0], op1.data[1], op1.data[2]
+        op1_size2, op1_size3, op1_index_size0, = op1.data[3], op1.data[4], op1.data[5]
+        op1_index_size1, op1_index_size2, op1_index_size3 = op1.data[6], op1.data[7], op1.data[8]
+        res_bitwidth, res_size0, res_size1 = res.data[0], res.data[1], res.data[2]
+        res_size2, res_size3, res_index_size0, res_index_size1 = res.data[3], res.data[4], res.data[5], res.data[6]
+        res_index_size2, res_index_size3 = res.data[7], res.data[8]
+
+        program = f"""
+                decl {op1.name}: {op1.data_type}<{bitwidth}>[{op1_size0}][{op1_size1}][{op1_size2}][{op1_size3}];
+                decl {res.name}: {res.data_type}<{bitwidth}>[{res_size0}][{res_size1}][{op1_size2}][{op1_size3}];
+                let zero: {op1.data_type}<{bitwidth}> = 0;
+                for (let i: ubit<{op1_index_size0}> = 0..{op1_size0}) {{
+                  for (let j: ubit<{op1_index_size1}> = 0..{op1_size1}) {{
+                    for (let k: ubit<{op1_index_size2}> = 0..{op1_size2}) {{
+                      for (let l: ubit<{op1_index_size3}> = 0..{op1_size3}) {{
+                        if ({op1.name}[i][j][k][l] > zero) {{
+                          {res.name}[i][j][k][l] := {op1.name}[i][j][k][l];
+                        }} else {{
+                          {res.name}[i][j][k][l] := 0;
+                        }}
+                      }} 
+                    }}
+                  }}
+                }}
+                """
+        return lower_dahlia_program(program, declaration.component_name)
 
 
 def batch_matmul(declaration):
@@ -192,9 +219,6 @@ def batch_matmul(declaration):
     M1_index_size0, M1_index_size1, M1_index_size2 = op1.data[4], op1.data[5], op1.data[6]
     M2_size0, M2_size1, M2_size2 = op2.data[1], op2.data[2], op2.data[3]
     M2_index_size0, M2_index_size1, M2_index_size2 = op2.data[4], op2.data[5], op2.data[6]
-    assert op1.type == PrimitiveType.Memory3D and op1.type == op2.type and op2.type == res.type
-    assert op1.data_type == op2.data_type and op2.data_type == res.data_type
-
     # 1. Get transpose of second operand.
     # 2. Create temporary value `t`. Then, t = op1 * transpose(op2).
     # 3. Copy temporary value to return value.*
diff --git a/frontends/relay-futil/futil_ast.py b/frontends/relay-futil/futil_ast.py
index 1d469c8888..5522ca22e7 100644
--- a/frontends/relay-futil/futil_ast.py
+++ b/frontends/relay-futil/futil_ast.py
@@ -10,6 +10,7 @@ class PrimitiveType(Enum):
     Memory1D = 3
     Memory2D = 4
     Memory3D = 5
+    Memory4D = 6
 
 
 class ControlType(Enum):
diff --git a/frontends/relay-futil/pretty_print.py b/frontends/relay-futil/pretty_print.py
index 53eb47662d..6c65dea0dc 100644
--- a/frontends/relay-futil/pretty_print.py
+++ b/frontends/relay-futil/pretty_print.py
@@ -102,6 +102,17 @@ def pp_cell(cell: FCell):
             index_size2 = str(data[6])
             return f'{cell.primitive.name} = prim std_mem_d3({bitwidth}, ' \
                    f'{size0}, {size1}, {size2}, {index_size0}, {index_size1}, {index_size2});'
+        if cell.primitive.type == PrimitiveType.Memory4D:
+            size0 = str(data[1])
+            size1 = str(data[2])
+            size2 = str(data[3])
+            size3 = str(data[4])
+            index_size0 = str(data[4])
+            index_size1 = str(data[5])
+            index_size2 = str(data[6])
+            index_size3 = str(data[7])
+            return f'{cell.primitive.name} = prim std_mem_d4({bitwidth}, ' \
+                   f'{size0}, {size1}, {size2}, {size3}, {index_size0}, {index_size1}, {index_size2}, {index_size3});'
         if cell.primitive.type == PrimitiveType.BinOp:
             op = data[1]
             return f'{cell.primitive.name} = prim std_{op}({bitwidth});'
diff --git a/frontends/relay-futil/tests/relu.expect b/frontends/relay-futil/tests/relu.expect
index 098c7bcc1d..a0998dd229 100644
--- a/frontends/relay-futil/tests/relu.expect
+++ b/frontends/relay-futil/tests/relu.expect
@@ -1,22 +1,34 @@
 import "primitives/std.lib";
 
-component tensor2d_relu(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1) -> (done: 1, x0_0_addr0: 2, x0_0_addr1: 3, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 2, x10_0_addr1: 3, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1) {
+component relu(go: 1, clk: 1, x0_0_0_0_read_data: 32, x0_0_0_0_done: 1, x10_0_0_0_read_data: 32, x10_0_0_0_done: 1) -> (done: 1, x0_0_0_0_addr0: 2, x0_0_0_0_addr1: 3, x0_0_0_0_addr2: 4, x0_0_0_0_addr3: 6, x0_0_0_0_write_data: 32, x0_0_0_0_write_en: 1, x0_0_0_0_clk: 1, x10_0_0_0_addr0: 2, x10_0_0_0_addr1: 3, x10_0_0_0_addr2: 4, x10_0_0_0_addr3: 6, x10_0_0_0_write_data: 32, x10_0_0_0_write_en: 1, x10_0_0_0_clk: 1) {
   cells {
-    add0 = prim std_add(3);
-    add1 = prim std_add(2);
+    add0 = prim std_add(6);
+    add1 = prim std_add(4);
+    add2 = prim std_add(3);
+    add3 = prim std_add(2);
     const0 = prim std_const(32, 0);
     const1 = prim std_const(2, 0);
+    const10 = prim std_const(6, 1);
+    const11 = prim std_const(4, 1);
+    const12 = prim std_const(3, 1);
+    const13 = prim std_const(2, 1);
     const2 = prim std_const(2, 1);
     const3 = prim std_const(3, 0);
     const4 = prim std_const(3, 3);
-    const5 = prim std_const(32, 0);
-    const6 = prim std_const(3, 1);
-    const7 = prim std_const(2, 1);
+    const5 = prim std_const(4, 0);
+    const6 = prim std_const(4, 7);
+    const7 = prim std_const(6, 0);
+    const8 = prim std_const(6, 31);
+    const9 = prim std_const(32, 0);
     gt0 = prim std_gt(32);
     i0 = prim std_reg(2);
     j0 = prim std_reg(3);
+    k0 = prim std_reg(4);
+    l0 = prim std_reg(6);
     le0 = prim std_le(2);
     le1 = prim std_le(3);
+    le2 = prim std_le(4);
+    le3 = prim std_le(6);
     x_read0_0 = prim std_reg(32);
     x_read1_0 = prim std_reg(32);
     zero_0 = prim std_reg(32);
@@ -34,6 +46,16 @@ component tensor2d_relu(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_r
     }
     group cond2<"static"=0> {
       cond2[done] = 1'd1;
+      le2.left = k0.out;
+      le2.right = const6.out;
+    }
+    group cond3<"static"=0> {
+      cond3[done] = 1'd1;
+      le3.left = l0.out;
+      le3.right = const8.out;
+    }
+    group cond4<"static"=0> {
+      cond4[done] = 1'd1;
       gt0.left = x_read0_0.out;
       gt0.right = zero_0.out;
     }
@@ -52,47 +74,79 @@ component tensor2d_relu(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_r
       j0.write_en = 1'd1;
       let2[done] = j0.done;
     }
+    group let3<"static"=1> {
+      k0.in = const5.out;
+      k0.write_en = 1'd1;
+      let3[done] = k0.done;
+    }
+    group let4<"static"=1> {
+      l0.in = const7.out;
+      l0.write_en = 1'd1;
+      let4[done] = l0.done;
+    }
     group upd0<"static"=1> {
       x_read0_0.write_en = 1'd1;
-      x0_0_addr1 = j0.out;
-      x0_0_addr0 = i0.out;
-      x_read0_0.in = 1'd1 ? x0_0_read_data;
+      x0_0_0_0_addr3 = l0.out;
+      x0_0_0_0_addr2 = k0.out;
+      x0_0_0_0_addr1 = j0.out;
+      x0_0_0_0_addr0 = i0.out;
+      x_read0_0.in = 1'd1 ? x0_0_0_0_read_data;
       upd0[done] = x_read0_0.done ? 1'd1;
     }
     group upd1<"static"=1> {
       x_read1_0.write_en = 1'd1;
-      x0_0_addr1 = j0.out;
-      x0_0_addr0 = i0.out;
-      x_read1_0.in = 1'd1 ? x0_0_read_data;
+      x0_0_0_0_addr3 = l0.out;
+      x0_0_0_0_addr2 = k0.out;
+      x0_0_0_0_addr1 = j0.out;
+      x0_0_0_0_addr0 = i0.out;
+      x_read1_0.in = 1'd1 ? x0_0_0_0_read_data;
       upd1[done] = x_read1_0.done ? 1'd1;
     }
     group upd2<"static"=1> {
-      x10_0_addr1 = j0.out;
-      x10_0_addr0 = i0.out;
-      x10_0_write_en = 1'd1;
-      x10_0_write_data = 1'd1 ? x_read1_0.out;
-      upd2[done] = x10_0_done ? 1'd1;
+      x10_0_0_0_addr3 = l0.out;
+      x10_0_0_0_addr2 = k0.out;
+      x10_0_0_0_addr1 = j0.out;
+      x10_0_0_0_addr0 = i0.out;
+      x10_0_0_0_write_en = 1'd1;
+      x10_0_0_0_write_data = 1'd1 ? x_read1_0.out;
+      upd2[done] = x10_0_0_0_done ? 1'd1;
     }
     group upd3<"static"=1> {
-      x10_0_addr1 = j0.out;
-      x10_0_addr0 = i0.out;
-      x10_0_write_en = 1'd1;
-      x10_0_write_data = 1'd1 ? const5.out;
-      upd3[done] = x10_0_done ? 1'd1;
+      x10_0_0_0_addr3 = l0.out;
+      x10_0_0_0_addr2 = k0.out;
+      x10_0_0_0_addr1 = j0.out;
+      x10_0_0_0_addr0 = i0.out;
+      x10_0_0_0_write_en = 1'd1;
+      x10_0_0_0_write_data = 1'd1 ? const9.out;
+      upd3[done] = x10_0_0_0_done ? 1'd1;
     }
     group upd4<"static"=1> {
-      j0.write_en = 1'd1;
-      add0.left = j0.out;
-      add0.right = const6.out;
-      j0.in = 1'd1 ? add0.out;
-      upd4[done] = j0.done ? 1'd1;
+      l0.write_en = 1'd1;
+      add0.left = l0.out;
+      add0.right = const10.out;
+      l0.in = 1'd1 ? add0.out;
+      upd4[done] = l0.done ? 1'd1;
     }
     group upd5<"static"=1> {
+      k0.write_en = 1'd1;
+      add1.left = k0.out;
+      add1.right = const11.out;
+      k0.in = 1'd1 ? add1.out;
+      upd5[done] = k0.done ? 1'd1;
+    }
+    group upd6<"static"=1> {
+      j0.write_en = 1'd1;
+      add2.left = j0.out;
+      add2.right = const12.out;
+      j0.in = 1'd1 ? add2.out;
+      upd6[done] = j0.done ? 1'd1;
+    }
+    group upd7<"static"=1> {
       i0.write_en = 1'd1;
-      add1.left = i0.out;
-      add1.right = const7.out;
-      i0.in = 1'd1 ? add1.out;
-      upd5[done] = i0.done ? 1'd1;
+      add3.left = i0.out;
+      add3.right = const13.out;
+      i0.in = 1'd1 ? add3.out;
+      upd7[done] = i0.done ? 1'd1;
     }
   }
 
@@ -105,19 +159,31 @@ component tensor2d_relu(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_r
           let2;
           while le1.out with cond1 {
             seq {
-              upd0;
-              if gt0.out with cond2 {
+              let3;
+              while le2.out with cond2 {
                 seq {
-                  upd1;
-                  upd2;
+                  let4;
+                  while le3.out with cond3 {
+                    seq {
+                      upd0;
+                      if gt0.out with cond4 {
+                        seq {
+                          upd1;
+                          upd2;
+                        }
+                      } else {
+                        upd3;
+                      }
+                      upd4;
+                    }
+                  }
+                  upd5;
                 }
-              } else {
-                upd3;
               }
-              upd4;
+              upd6;
             }
           }
-          upd5;
+          upd7;
         }
       }
     }
@@ -126,27 +192,27 @@ component tensor2d_relu(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_r
 
 component main () -> () {
   cells {
-    x1 = prim std_mem_d2(32, 2, 4, 2, 3);
-    x = prim std_mem_d2(32, 2, 4, 2, 3);
-    tensor2d_relu0 = tensor2d_relu;
+    x1 = prim std_mem_d4(32, 2, 4, 8, 32, 32, 2, 3, 4);
+    x = prim std_mem_d4(32, 2, 4, 8, 32, 32, 2, 3, 4);
+    relu0 = relu;
   }
   wires {
-    group run_tensor2d_relu {
-      x.addr0 = tensor2d_relu0.x0_0_addr0;
-      tensor2d_relu0.x0_0_read_data = x.read_data;
-      x.addr1 = tensor2d_relu0.x0_0_addr1;
-      x1.addr0 = tensor2d_relu0.x10_0_addr0;
-      x1.addr1 = tensor2d_relu0.x10_0_addr1;
-      x1.write_data = tensor2d_relu0.x10_0_write_data;
-      x1.write_en = tensor2d_relu0.x10_0_write_en;
-      tensor2d_relu0.x10_0_done = x1.done;
-      tensor2d_relu0.go = 1'd1;
-      run_tensor2d_relu[done] = tensor2d_relu0.done ? 1'd1;
+    group run_relu {
+      x.addr0 = relu0.x0_0_0_0_addr0;
+      relu0.x0_0_0_0_read_data = x.read_data;
+      x.addr1 = relu0.x0_0_0_0_addr1;
+      x.addr2 = relu0.x0_0_0_0_addr2;
+      x1.addr0 = relu0.x10_0_0_0_addr0;
+      x1.write_data = relu0.x10_0_0_0_write_data;
+      x1.write_en = relu0.x10_0_0_0_write_en;
+      relu0.x10_0_0_0_done = x1.done;
+      relu0.go = 1'd1;
+      run_relu[done] = relu0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_tensor2d_relu;
+      run_relu;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/relu.relay b/frontends/relay-futil/tests/relu.relay
index f8f324a033..fd5278c4a8 100644
--- a/frontends/relay-futil/tests/relu.relay
+++ b/frontends/relay-futil/tests/relu.relay
@@ -1,6 +1,6 @@
 v0.0.4
-fn (%x: Tensor[(2, 4), int32]) {
-  let %x1: Tensor[(2, 4), int32] = nn.relu(%x);
+fn (%x: Tensor[(2, 4, 8, 32), int32]) {
+  let %x1: Tensor[(2, 4, 8, 32), int32] = nn.relu(%x);
   %x1
 }
 
diff --git a/frontends/relay-futil/utilities.py b/frontends/relay-futil/utilities.py
index 8deeb217c1..39a4591af1 100644
--- a/frontends/relay-futil/utilities.py
+++ b/frontends/relay-futil/utilities.py
@@ -67,6 +67,7 @@ def get_memory_parameters(type):
     if len(tensor_dimensions) == 1: primitive_type = PrimitiveType.Memory1D
     if len(tensor_dimensions) == 2: primitive_type = PrimitiveType.Memory2D
     if len(tensor_dimensions) == 3: primitive_type = PrimitiveType.Memory3D
+    if len(tensor_dimensions) == 4: primitive_type = PrimitiveType.Memory4D
     return data, primitive_type, data_type
 
 

From 40db68f0ae901b3dd74548a7c7cec0efca339833 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Fri, 20 Nov 2020 15:21:35 -0500
Subject: [PATCH 19/75] Remove comma.

---
 primitives/std.lib | 75 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/primitives/std.lib b/primitives/std.lib
index 805384ceee..7f4d887efa 100644
--- a/primitives/std.lib
+++ b/primitives/std.lib
@@ -158,6 +158,81 @@ primitive std_mem_d3_ext[
   clk: 1
 ) -> (read_data: width, done: 1) {}
 
+primitive std_mem_d4[
+    width,
+    d0_size,
+    d1_size,
+    d2_size,
+    d3_size,
+    d0_idx_size,
+    d1_idx_size,
+    d2_idx_size,
+    d3_idx_size
+] (
+  addr0: d0_idx_size,
+  addr1: d1_idx_size,
+  addr2: d2_idx_size,
+  addr3: d3_idx_size,
+  write_data: width,
+  write_en: 1,
+  clk: 1
+) -> (read_data: width, done: 1) {
+  verilog {
+    module std_mem_d4
+      #(parameter width = 32,
+        parameter d0_size = 16,
+        parameter d1_size = 16,
+        parameter d2_size = 16,
+        parameter d3_size = 16,
+        parameter d0_idx_size = 4,
+        parameter d1_idx_size = 4,
+        parameter d2_idx_size = 4,
+        parameter d3_idx_size = 4)
+       (input logic [d0_idx_size-1:0] addr0,
+        input logic [d1_idx_size-1:0] addr1,
+        input logic [d2_idx_size-1:0] addr2,
+        input logic [d3_idx_size-1:0] addr3,
+        input logic [width-1:0]   write_data,
+        input logic               write_en,
+        input logic               clk,
+        output logic [width-1:0]  read_data,
+        output logic done);
+
+      /* verilator lint_off WIDTH */
+      logic [width-1:0]  mem[d0_size-1:0][d1_size-1:0][d2_size-1:0][d3_size-1:0];
+
+      assign read_data = mem[addr0][addr1][addr2][addr3];
+      always_ff @(posedge clk) begin
+        if (write_en) begin
+          mem[addr0][addr1][addr2][addr3] <= write_data;
+          done <= 1'd1;
+        end else
+          done <= 1'd0;
+      end
+    endmodule
+  }
+}
+
+primitive std_mem_d4_ext[
+    width,
+    d0_size,
+    d1_size,
+    d2_size,
+    d3_size,
+    d0_idx_size,
+    d1_idx_size,
+    d2_idx_size,
+    d3_idx_size
+] (
+  addr0: d0_idx_size,
+  addr1: d1_idx_size,
+  addr2: d2_idx_size,
+  addr3: d3_idx_size,
+  write_data: width,
+  write_en: 1,
+  clk: 1
+) -> (read_data: width, done: 1) {}
+
 primitive std_logsize_mem_d1[width, logsize](
   addr0: logsize,
   write_data: width,

From 63535b2241acc85ab0e838a604975e2a9f37ea30 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Fri, 20 Nov 2020 20:20:03 -0500
Subject: [PATCH 20/75] Add negative, non-working expand_dims.

---
 frontends/relay-futil/compiler.py             | 51 +++++------
 frontends/relay-futil/dahlia_functions.py     | 35 +++++++-
 frontends/relay-futil/example.py              |  8 +-
 frontends/relay-futil/futil_ast.py            | 12 +--
 frontends/relay-futil/tests/add.expect        | 28 +++----
 frontends/relay-futil/tests/bias_add.expect   | 32 +++----
 .../relay-futil/tests/expand_dims.expect      |  0
 frontends/relay-futil/tests/expand_dims.relay |  6 ++
 .../relay-futil/tests/fixed_point_add.expect  | 28 +++----
 frontends/relay-futil/tests/let1.expect       | 28 +++----
 frontends/relay-futil/tests/let2.expect       | 58 ++++++-------
 frontends/relay-futil/tests/let3.expect       | 84 +++++++++----------
 frontends/relay-futil/tests/sub.expect        | 28 +++----
 .../relay-futil/tests/tensor1d_mult.expect    | 28 +++----
 .../relay-futil/tests/tensor2d_add.expect     | 34 ++++----
 .../relay-futil/tests/tensor3d_divide.expect  | 40 ++++-----
 16 files changed, 268 insertions(+), 232 deletions(-)
 create mode 100644 frontends/relay-futil/tests/expand_dims.expect
 create mode 100644 frontends/relay-futil/tests/expand_dims.relay

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index dc5c47df93..9f97f7ac11 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -9,7 +9,15 @@
 from dahlia_functions import *
 
 # Mapping from Relay binary calls to the respective Dahlia operator.
-BuiltInBinaryCalls = {'add': '+', 'divide': '/', 'multiply': '*', 'subtract': '-'}
+BuiltInBinaryOps = {'add': '+', 'divide': '/', 'multiply': '*', 'subtract': '-'}
+
+# Mapping from Tensor dimensions to function type.
+BinaryOpTensorDimensions = {PrimitiveType.Memory1D: tensor1d_op, PrimitiveType.Memory2D: tensor2d_op,
+                            PrimitiveType.Memory3D: tensor3d_op, PrimitiveType.Memory4D: tensor4d_op}
+
+# Mapping from Relay function names to their respective Dahlia lowering.
+RelayFunctionCalls = {'nn.batch_flatten': batch_flatten, 'nn.batch_matmul': batch_matmul,
+                      'nn.bias_add': bias_add, 'nn.relu': relu, 'negative': negative, 'expand_dims': expand_dims}
 
 
 class Relay2Futil(ExprFunctor):
@@ -52,12 +60,11 @@ def produce_dahlia_name(self, name, type):
         Memory2D: 'X0_0', 'X1_0', 'X2_0', ...
         Memory3D: 'X0_0_0', 'X1_0_0', 'X2_0_0', ...
         """
+        DahliaNameMapping = {PrimitiveType.Memory1D: '', PrimitiveType.Memory2D: '_0',
+                             PrimitiveType.Memory3D: '_0_0', PrimitiveType.Memory4D: '_0_0_0'}
         dahlia_name = self.id(name)
-        if type == PrimitiveType.Memory1D: return dahlia_name
-        if type == PrimitiveType.Memory2D: return dahlia_name + "_0"
-        if type == PrimitiveType.Memory3D: return dahlia_name + "_0_0"
-        if type == PrimitiveType.Memory4D: return dahlia_name + "_0_0_0"
-        assert False, f'{name} with {type} is not supported yet.'
+        assert type in DahliaNameMapping, f'{name} with {type} is not supported yet.'
+        return dahlia_name + DahliaNameMapping[type]  # base name plus the per-dimension suffix
 
     def get_dahlia_declaration(self, function_name, cells, args):
         """
@@ -66,30 +73,14 @@ def get_dahlia_declaration(self, function_name, cells, args):
         """
         input_type = cells[0].primitive.type
         function = name = op = None
-
-        if function_name in BuiltInBinaryCalls:
-            op = BuiltInBinaryCalls[function_name]
-            if input_type == PrimitiveType.Memory1D:
-                function, name = tensor1d_op, f'tensor1d_{function_name}'
-            elif input_type == PrimitiveType.Memory2D:
-                function, name = tensor2d_op, f'tensor2d_{function_name}'
-            elif input_type == PrimitiveType.Memory3D:
-                function, name = tensor3d_op, f'tensor3d_{function_name}'
-            elif input_type == PrimitiveType.Memory4D:
-                function, name = tensor4d_op, f'tensor4d_{function_name}'
-
-        if function_name == "nn.batch_flatten":
-            if input_type == PrimitiveType.Memory3D: function = batch_flatten
-        elif function_name == "nn.batch_matmul":
-            function = batch_matmul
-        elif function_name == "nn.bias_add":
-            if input_type == PrimitiveType.Memory2D: function = tensor2d_bias_add
-        elif function_name == "nn.relu":
-            assert input_type == PrimitiveType.Memory2D or input_type == PrimitiveType.Memory4D
-            function = relu
-
-        assert function != None, f'{function_name} with type {input_type} is not supported.'
-        if name == None: name = function.__name__
+        if function_name in BuiltInBinaryOps:
+            op = BuiltInBinaryOps[function_name]
+            function, name = BinaryOpTensorDimensions[input_type], function_name
+        elif function_name in RelayFunctionCalls:
+            function = RelayFunctionCalls[function_name]
+            name = function.__name__
+        else:
+            assert False, f'{function_name} with type {input_type} is not supported.'
         return DahliaDeclaration(component_name=self.relay_id(name), decl_name=self.id(name), op=op, inputs=args,
                                  function=function)
 
diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index ac08cca3af..50dd7100d4 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -91,6 +91,7 @@ def tensor3d_op(declaration):
     }}"""
     return lower_dahlia_program(program, declaration.component_name)
 
+
 def tensor4d_op(declaration):
     op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
     bitwidth, size0, size1, size2, size3 = op1.data[0], op1.data[1], op1.data[2], op1.data[3], op1.data[4]
@@ -133,7 +134,7 @@ def batch_flatten(declaration):
     return lower_dahlia_program(program, declaration.component_name)
 
 
-def tensor2d_bias_add(declaration):
+def bias_add(declaration):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.bias_add"""
     # Assumes default value axis=1 is passed in.
     data, bias, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
@@ -158,6 +159,7 @@ def tensor2d_bias_add(declaration):
 #     will not take fixed point operands for the `>` operator.
 #  2. Without signed bit array support, this is also meaningless.
 def relu(declaration):
+    """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.relu"""
     op1, res = declaration.inputs[0].primitive, declaration.output.primitive
     assert res.data_type == 'ubit', f'{res.data_type} is not currently supported for ReLU.'
 
@@ -211,6 +213,37 @@ def relu(declaration):
         return lower_dahlia_program(program, declaration.component_name)
 
 
+# TODO(cgyurgyik): Similar to ReLU, this requires signed operands.
+def negative(declaration):
+    """https://tvm.apache.org/docs/api/python/relay/index.html#tvm.relay.negative"""
+    op1, res = declaration.inputs[0].primitive, declaration.output.primitive
+    bitwidth, size, index_size = op1.data[0], op1.data[1], op1.data[2]
+    program = f"""
+        decl {op1.name}: {op1.data_type}<{bitwidth}>[{size}];
+        decl {res.name}: {res.data_type}<{bitwidth}>[{size}];
+        for (let i: ubit<{index_size}> = 0..{size}) {{
+          {res.name}[i] := -{op1.name}[i];
+        }}
+    """
+    return lower_dahlia_program(program, declaration.component_name)
+
+
+def expand_dims(declaration):
+    """https://tvm.apache.org/docs/api/python/relay/index.html#tvm.relay.expand_dims"""
+    op1, res = declaration.inputs[0].primitive, declaration.output.primitive
+    bitwidth, size, index_size = op1.data[0], op1.data[1], op1.data[2]
+    size0, size1, size2 = res.data[1], res.data[2], res.data[3]
+    index_size0, index_size1, index_size2 = res.data[4], res.data[5], res.data[6]
+    program = f"""
+        decl {op1.name}: {op1.data_type}<{bitwidth}>[{size}];
+        decl {res.name}: {res.data_type}<{bitwidth}>[{size0}][{size1}][{size2}];
+        for (let i: ubit<{index_size}> = 0..{size}) {{
+          {res.name}[i][0][0] := {op1.name}[i];
+        }}
+    """
+    return lower_dahlia_program(program, declaration.component_name)
+
+
 def batch_matmul(declaration):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.batch_matmul"""
     op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
diff --git a/frontends/relay-futil/example.py b/frontends/relay-futil/example.py
index 3531e082b7..779493b1a4 100644
--- a/frontends/relay-futil/example.py
+++ b/frontends/relay-futil/example.py
@@ -16,6 +16,11 @@ def tensor_subtract():
     return relay.Function([x, y], relay.subtract(x, y))
 
 
+def expand_dims():
+    x = relay.var('x', shape=[4], dtype='int32')
+    return relay.Function([x], relay.expand_dims(x, axis=1, num_newaxis=2))
+
+
 def batch_flatten():
     x = relay.var("x", relay.TensorType((2, 5, 5), "int32"))
     return relay.Function([x], relay.nn.batch_flatten(x))
@@ -37,6 +42,7 @@ def relu():
     x = relay.var('x', shape=[2, 4], dtype='int32')
     return relay.Function([x], relay.nn.relu(x))
 
+
 def mlp_net():
     """The MLP test from Relay."""
     from tvm.relay.testing import mlp
@@ -50,7 +56,7 @@ def vgg_net():
                        batch_norm=True)
 
 
-ALL_FUNCS = [add, tensor_subtract, batch_flatten, batch_matmul, bias_add, relu, mlp_net, vgg_net]
+ALL_FUNCS = [add, tensor_subtract, expand_dims, batch_flatten, batch_matmul, bias_add, relu, mlp_net, vgg_net]
 FUNC_NAMES = list(map(lambda x: x.__name__, ALL_FUNCS))
 
 
diff --git a/frontends/relay-futil/futil_ast.py b/frontends/relay-futil/futil_ast.py
index 5522ca22e7..868e6ffbcf 100644
--- a/frontends/relay-futil/futil_ast.py
+++ b/frontends/relay-futil/futil_ast.py
@@ -5,12 +5,12 @@
 
 
 class PrimitiveType(Enum):
-    Register = 1
-    Constant = 2
-    Memory1D = 3
-    Memory2D = 4
-    Memory3D = 5
-    Memory4D = 6
+    Memory1D = 1
+    Memory2D = 2
+    Memory3D = 3
+    Memory4D = 4
+    Register = 5
+    Constant = 6
 
 
 class ControlType(Enum):
diff --git a/frontends/relay-futil/tests/add.expect b/frontends/relay-futil/tests/add.expect
index a67d257997..f239d18b42 100644
--- a/frontends/relay-futil/tests/add.expect
+++ b/frontends/relay-futil/tests/add.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component tensor1d_add(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, y0_read_data: 32, y0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, x0_addr0: 1, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, y0_addr0: 1, y0_write_data: 32, y0_write_en: 1, y0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
+component add(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, y0_read_data: 32, y0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, x0_addr0: 1, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, y0_addr0: 1, y0_write_data: 32, y0_write_en: 1, y0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
   cells {
     add0 = prim std_add(32);
     add1 = prim std_add(1);
@@ -74,25 +74,25 @@ component main () -> () {
     z = prim std_mem_d1(32, 1, 1);
     x = prim std_mem_d1(32, 1, 1);
     y = prim std_mem_d1(32, 1, 1);
-    tensor1d_add0 = tensor1d_add;
+    add0 = add;
   }
   wires {
-    group run_tensor1d_add {
-      x.addr0 = tensor1d_add0.x0_addr0;
-      tensor1d_add0.x0_read_data = x.read_data;
-      y.addr0 = tensor1d_add0.y0_addr0;
-      tensor1d_add0.y0_read_data = y.read_data;
-      z.addr0 = tensor1d_add0.z0_addr0;
-      z.write_data = tensor1d_add0.z0_write_data;
-      z.write_en = tensor1d_add0.z0_write_en;
-      tensor1d_add0.z0_done = z.done;
-      tensor1d_add0.go = 1'd1;
-      run_tensor1d_add[done] = tensor1d_add0.done ? 1'd1;
+    group run_add {
+      x.addr0 = add0.x0_addr0;
+      add0.x0_read_data = x.read_data;
+      y.addr0 = add0.y0_addr0;
+      add0.y0_read_data = y.read_data;
+      z.addr0 = add0.z0_addr0;
+      z.write_data = add0.z0_write_data;
+      z.write_en = add0.z0_write_en;
+      add0.z0_done = z.done;
+      add0.go = 1'd1;
+      run_add[done] = add0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_tensor1d_add;
+      run_add;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/bias_add.expect b/frontends/relay-futil/tests/bias_add.expect
index ee66f8942d..08566e1f14 100644
--- a/frontends/relay-futil/tests/bias_add.expect
+++ b/frontends/relay-futil/tests/bias_add.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component tensor2d_bias_add(go: 1, clk: 1, bias0_read_data: 32, bias0_done: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1) -> (done: 1, bias0_addr0: 7, bias0_write_data: 32, bias0_write_en: 1, bias0_clk: 1, x0_0_addr0: 1, x0_0_addr1: 7, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 7, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1) {
+component bias_add(go: 1, clk: 1, bias0_read_data: 32, bias0_done: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1) -> (done: 1, bias0_addr0: 7, bias0_write_data: 32, bias0_write_en: 1, bias0_clk: 1, x0_0_addr0: 1, x0_0_addr1: 7, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 7, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1) {
   cells {
     add0 = prim fixed_p_std_add(32, 16, 16);
     add1 = prim std_add(7);
@@ -105,27 +105,27 @@ component main () -> () {
     x1 = prim std_mem_d2(32, 1, 64, 1, 7);
     x = prim std_mem_d2(32, 1, 64, 1, 7);
     bias = prim std_mem_d1(32, 64, 7);
-    tensor2d_bias_add0 = tensor2d_bias_add;
+    bias_add0 = bias_add;
   }
   wires {
-    group run_tensor2d_bias_add {
-      x.addr0 = tensor2d_bias_add0.x0_0_addr0;
-      tensor2d_bias_add0.x0_0_read_data = x.read_data;
-      x.addr1 = tensor2d_bias_add0.x0_0_addr1;
-      bias.addr0 = tensor2d_bias_add0.bias0_addr0;
-      tensor2d_bias_add0.bias0_read_data = bias.read_data;
-      x1.addr0 = tensor2d_bias_add0.x10_0_addr0;
-      x1.addr1 = tensor2d_bias_add0.x10_0_addr1;
-      x1.write_data = tensor2d_bias_add0.x10_0_write_data;
-      x1.write_en = tensor2d_bias_add0.x10_0_write_en;
-      tensor2d_bias_add0.x10_0_done = x1.done;
-      tensor2d_bias_add0.go = 1'd1;
-      run_tensor2d_bias_add[done] = tensor2d_bias_add0.done ? 1'd1;
+    group run_bias_add {
+      x.addr0 = bias_add0.x0_0_addr0;
+      bias_add0.x0_0_read_data = x.read_data;
+      x.addr1 = bias_add0.x0_0_addr1;
+      bias.addr0 = bias_add0.bias0_addr0;
+      bias_add0.bias0_read_data = bias.read_data;
+      x1.addr0 = bias_add0.x10_0_addr0;
+      x1.addr1 = bias_add0.x10_0_addr1;
+      x1.write_data = bias_add0.x10_0_write_data;
+      x1.write_en = bias_add0.x10_0_write_en;
+      bias_add0.x10_0_done = x1.done;
+      bias_add0.go = 1'd1;
+      run_bias_add[done] = bias_add0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_tensor2d_bias_add;
+      run_bias_add;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/expand_dims.expect b/frontends/relay-futil/tests/expand_dims.expect
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/frontends/relay-futil/tests/expand_dims.relay b/frontends/relay-futil/tests/expand_dims.relay
new file mode 100644
index 0000000000..47ae5ce31a
--- /dev/null
+++ b/frontends/relay-futil/tests/expand_dims.relay
@@ -0,0 +1,6 @@
+v0.0.4
+fn (%x: Tensor[(4), int32]) {
+  let %x1 = expand_dims(%x, axis=1, num_newaxis=2);
+  %x1
+}
+
diff --git a/frontends/relay-futil/tests/fixed_point_add.expect b/frontends/relay-futil/tests/fixed_point_add.expect
index 3a08f950e8..aa8240b4cf 100644
--- a/frontends/relay-futil/tests/fixed_point_add.expect
+++ b/frontends/relay-futil/tests/fixed_point_add.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component tensor1d_add(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, y0_read_data: 32, y0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, x0_addr0: 1, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, y0_addr0: 1, y0_write_data: 32, y0_write_en: 1, y0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
+component add(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, y0_read_data: 32, y0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, x0_addr0: 1, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, y0_addr0: 1, y0_write_data: 32, y0_write_en: 1, y0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
   cells {
     add0 = prim fixed_p_std_add(32, 16, 16);
     add1 = prim std_add(1);
@@ -74,25 +74,25 @@ component main () -> () {
     z = prim std_mem_d1(32, 1, 1);
     x = prim std_mem_d1(32, 1, 1);
     y = prim std_mem_d1(32, 1, 1);
-    tensor1d_add0 = tensor1d_add;
+    add0 = add;
   }
   wires {
-    group run_tensor1d_add {
-      x.addr0 = tensor1d_add0.x0_addr0;
-      tensor1d_add0.x0_read_data = x.read_data;
-      y.addr0 = tensor1d_add0.y0_addr0;
-      tensor1d_add0.y0_read_data = y.read_data;
-      z.addr0 = tensor1d_add0.z0_addr0;
-      z.write_data = tensor1d_add0.z0_write_data;
-      z.write_en = tensor1d_add0.z0_write_en;
-      tensor1d_add0.z0_done = z.done;
-      tensor1d_add0.go = 1'd1;
-      run_tensor1d_add[done] = tensor1d_add0.done ? 1'd1;
+    group run_add {
+      x.addr0 = add0.x0_addr0;
+      add0.x0_read_data = x.read_data;
+      y.addr0 = add0.y0_addr0;
+      add0.y0_read_data = y.read_data;
+      z.addr0 = add0.z0_addr0;
+      z.write_data = add0.z0_write_data;
+      z.write_en = add0.z0_write_en;
+      add0.z0_done = z.done;
+      add0.go = 1'd1;
+      run_add[done] = add0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_tensor1d_add;
+      run_add;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/let1.expect b/frontends/relay-futil/tests/let1.expect
index 77312716ef..cf228003ae 100644
--- a/frontends/relay-futil/tests/let1.expect
+++ b/frontends/relay-futil/tests/let1.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component tensor1d_multiply(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, b0_read_data: 32, b0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, b0_addr0: 1, b0_write_data: 32, b0_write_en: 1, b0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
+component multiply(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, b0_read_data: 32, b0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, b0_addr0: 1, b0_write_data: 32, b0_write_en: 1, b0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
   cells {
     a_read0_0 = prim std_reg(32);
     add0 = prim std_add(1);
@@ -82,25 +82,25 @@ component main () -> () {
     z = prim std_mem_d1(32, 1, 1);
     a = prim std_mem_d1(32, 1, 1);
     b = prim std_mem_d1(32, 1, 1);
-    tensor1d_multiply0 = tensor1d_multiply;
+    multiply0 = multiply;
   }
   wires {
-    group run_tensor1d_multiply {
-      a.addr0 = tensor1d_multiply0.a0_addr0;
-      tensor1d_multiply0.a0_read_data = a.read_data;
-      b.addr0 = tensor1d_multiply0.b0_addr0;
-      tensor1d_multiply0.b0_read_data = b.read_data;
-      z.addr0 = tensor1d_multiply0.z0_addr0;
-      z.write_data = tensor1d_multiply0.z0_write_data;
-      z.write_en = tensor1d_multiply0.z0_write_en;
-      tensor1d_multiply0.z0_done = z.done;
-      tensor1d_multiply0.go = 1'd1;
-      run_tensor1d_multiply[done] = tensor1d_multiply0.done ? 1'd1;
+    group run_multiply {
+      a.addr0 = multiply0.a0_addr0;
+      multiply0.a0_read_data = a.read_data;
+      b.addr0 = multiply0.b0_addr0;
+      multiply0.b0_read_data = b.read_data;
+      z.addr0 = multiply0.z0_addr0;
+      z.write_data = multiply0.z0_write_data;
+      z.write_en = multiply0.z0_write_en;
+      multiply0.z0_done = z.done;
+      multiply0.go = 1'd1;
+      run_multiply[done] = multiply0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_tensor1d_multiply;
+      run_multiply;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/let2.expect b/frontends/relay-futil/tests/let2.expect
index 88da5412ff..b9a9bfd9ec 100644
--- a/frontends/relay-futil/tests/let2.expect
+++ b/frontends/relay-futil/tests/let2.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component tensor1d_add(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32, d0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1, d0_addr0: 1, d0_write_data: 32, d0_write_en: 1, d0_clk: 1) {
+component add(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32, d0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1, d0_addr0: 1, d0_write_data: 32, d0_write_en: 1, d0_clk: 1) {
   cells {
     a_read0_0 = prim std_reg(32);
     add0 = prim std_add(32);
@@ -68,7 +68,7 @@ component tensor1d_add(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data
     }
   }
 }
-component tensor1d_multiply(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, b0_read_data: 32, b0_done: 1, c0_read_data: 32, c0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, b0_addr0: 1, b0_write_data: 32, b0_write_en: 1, b0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1) {
+component multiply(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, b0_read_data: 32, b0_done: 1, c0_read_data: 32, c0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, b0_addr0: 1, b0_write_data: 32, b0_write_en: 1, b0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1) {
   cells {
     a_read0_0 = prim std_reg(32);
     add0 = prim std_add(1);
@@ -150,40 +150,40 @@ component main () -> () {
     d = prim std_mem_d1(32, 1, 1);
     c = prim std_mem_d1(32, 1, 1);
     a = prim std_mem_d1(32, 1, 1);
-    tensor1d_add0 = tensor1d_add;
+    add0 = add;
     b = prim std_mem_d1(32, 1, 1);
-    tensor1d_multiply0 = tensor1d_multiply;
+    multiply0 = multiply;
   }
   wires {
-    group run_tensor1d_multiply {
-      a.addr0 = tensor1d_multiply0.a0_addr0;
-      tensor1d_multiply0.a0_read_data = a.read_data;
-      b.addr0 = tensor1d_multiply0.b0_addr0;
-      tensor1d_multiply0.b0_read_data = b.read_data;
-      c.addr0 = tensor1d_multiply0.c0_addr0;
-      c.write_data = tensor1d_multiply0.c0_write_data;
-      c.write_en = tensor1d_multiply0.c0_write_en;
-      tensor1d_multiply0.c0_done = c.done;
-      tensor1d_multiply0.go = 1'd1;
-      run_tensor1d_multiply[done] = tensor1d_multiply0.done ? 1'd1;
-    }
-    group run_tensor1d_add {
-      c.addr0 = tensor1d_add0.c0_addr0;
-      tensor1d_add0.c0_read_data = c.read_data;
-      a.addr0 = tensor1d_add0.a0_addr0;
-      tensor1d_add0.a0_read_data = a.read_data;
-      d.addr0 = tensor1d_add0.d0_addr0;
-      d.write_data = tensor1d_add0.d0_write_data;
-      d.write_en = tensor1d_add0.d0_write_en;
-      tensor1d_add0.d0_done = d.done;
-      tensor1d_add0.go = 1'd1;
-      run_tensor1d_add[done] = tensor1d_add0.done ? 1'd1;
+    group run_multiply {
+      a.addr0 = multiply0.a0_addr0;
+      multiply0.a0_read_data = a.read_data;
+      b.addr0 = multiply0.b0_addr0;
+      multiply0.b0_read_data = b.read_data;
+      c.addr0 = multiply0.c0_addr0;
+      c.write_data = multiply0.c0_write_data;
+      c.write_en = multiply0.c0_write_en;
+      multiply0.c0_done = c.done;
+      multiply0.go = 1'd1;
+      run_multiply[done] = multiply0.done ? 1'd1;
+    }
+    group run_add {
+      c.addr0 = add0.c0_addr0;
+      add0.c0_read_data = c.read_data;
+      a.addr0 = add0.a0_addr0;
+      add0.a0_read_data = a.read_data;
+      d.addr0 = add0.d0_addr0;
+      d.write_data = add0.d0_write_data;
+      d.write_en = add0.d0_write_en;
+      add0.d0_done = d.done;
+      add0.go = 1'd1;
+      run_add[done] = add0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_tensor1d_multiply;
-      run_tensor1d_add;
+      run_multiply;
+      run_add;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/let3.expect b/frontends/relay-futil/tests/let3.expect
index 3ef8c350f2..11b79b4180 100644
--- a/frontends/relay-futil/tests/let3.expect
+++ b/frontends/relay-futil/tests/let3.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component tensor1d_multiply(go: 1, clk: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32, d0_done: 1, e0_read_data: 32, e0_done: 1) -> (done: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1, d0_addr0: 1, d0_write_data: 32, d0_write_en: 1, d0_clk: 1, e0_addr0: 1, e0_write_data: 32, e0_write_en: 1, e0_clk: 1) {
+component multiply(go: 1, clk: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32, d0_done: 1, e0_read_data: 32, e0_done: 1) -> (done: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1, d0_addr0: 1, d0_write_data: 32, d0_write_en: 1, d0_clk: 1, e0_addr0: 1, e0_write_data: 32, e0_write_en: 1, e0_clk: 1) {
   cells {
     add0 = prim std_add(1);
     bin_read0_0 = prim std_reg(32);
@@ -76,7 +76,7 @@ component tensor1d_multiply(go: 1, clk: 1, c0_read_data: 32, c0_done: 1, d0_read
     }
   }
 }
-component tensor1d_divide(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32, d0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1, d0_addr0: 1, d0_write_data: 32, d0_write_en: 1, d0_clk: 1) {
+component divide(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32, d0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1, d0_addr0: 1, d0_write_data: 32, d0_write_en: 1, d0_clk: 1) {
   cells {
     a_read0_0 = prim std_reg(32);
     add0 = prim std_add(1);
@@ -152,7 +152,7 @@ component tensor1d_divide(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_d
     }
   }
 }
-component tensor1d_subtract(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, b0_read_data: 32, b0_done: 1, c0_read_data: 32, c0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, b0_addr0: 1, b0_write_data: 32, b0_write_en: 1, b0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1) {
+component subtract(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, b0_read_data: 32, b0_done: 1, c0_read_data: 32, c0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, b0_addr0: 1, b0_write_data: 32, b0_write_en: 1, b0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1) {
   cells {
     a_read0_0 = prim std_reg(32);
     add0 = prim std_add(1);
@@ -226,55 +226,55 @@ component main () -> () {
     e = prim std_mem_d1(32, 1, 1);
     c = prim std_mem_d1(32, 1, 1);
     d = prim std_mem_d1(32, 1, 1);
-    tensor1d_multiply0 = tensor1d_multiply;
+    multiply0 = multiply;
     a = prim std_mem_d1(32, 1, 1);
-    tensor1d_divide0 = tensor1d_divide;
+    divide0 = divide;
     b = prim std_mem_d1(32, 1, 1);
-    tensor1d_subtract0 = tensor1d_subtract;
+    subtract0 = subtract;
   }
   wires {
-    group run_tensor1d_subtract {
-      a.addr0 = tensor1d_subtract0.a0_addr0;
-      tensor1d_subtract0.a0_read_data = a.read_data;
-      b.addr0 = tensor1d_subtract0.b0_addr0;
-      tensor1d_subtract0.b0_read_data = b.read_data;
-      c.addr0 = tensor1d_subtract0.c0_addr0;
-      c.write_data = tensor1d_subtract0.c0_write_data;
-      c.write_en = tensor1d_subtract0.c0_write_en;
-      tensor1d_subtract0.c0_done = c.done;
-      tensor1d_subtract0.go = 1'd1;
-      run_tensor1d_subtract[done] = tensor1d_subtract0.done ? 1'd1;
+    group run_subtract {
+      a.addr0 = subtract0.a0_addr0;
+      subtract0.a0_read_data = a.read_data;
+      b.addr0 = subtract0.b0_addr0;
+      subtract0.b0_read_data = b.read_data;
+      c.addr0 = subtract0.c0_addr0;
+      c.write_data = subtract0.c0_write_data;
+      c.write_en = subtract0.c0_write_en;
+      subtract0.c0_done = c.done;
+      subtract0.go = 1'd1;
+      run_subtract[done] = subtract0.done ? 1'd1;
     }
-    group run_tensor1d_divide {
-      c.addr0 = tensor1d_divide0.c0_addr0;
-      tensor1d_divide0.c0_read_data = c.read_data;
-      a.addr0 = tensor1d_divide0.a0_addr0;
-      tensor1d_divide0.a0_read_data = a.read_data;
-      d.addr0 = tensor1d_divide0.d0_addr0;
-      d.write_data = tensor1d_divide0.d0_write_data;
-      d.write_en = tensor1d_divide0.d0_write_en;
-      tensor1d_divide0.d0_done = d.done;
-      tensor1d_divide0.go = 1'd1;
-      run_tensor1d_divide[done] = tensor1d_divide0.done ? 1'd1;
+    group run_divide {
+      c.addr0 = divide0.c0_addr0;
+      divide0.c0_read_data = c.read_data;
+      a.addr0 = divide0.a0_addr0;
+      divide0.a0_read_data = a.read_data;
+      d.addr0 = divide0.d0_addr0;
+      d.write_data = divide0.d0_write_data;
+      d.write_en = divide0.d0_write_en;
+      divide0.d0_done = d.done;
+      divide0.go = 1'd1;
+      run_divide[done] = divide0.done ? 1'd1;
     }
-    group run_tensor1d_multiply {
-      c.addr0 = tensor1d_multiply0.c0_addr0;
-      tensor1d_multiply0.c0_read_data = c.read_data;
-      d.addr0 = tensor1d_multiply0.d0_addr0;
-      tensor1d_multiply0.d0_read_data = d.read_data;
-      e.addr0 = tensor1d_multiply0.e0_addr0;
-      e.write_data = tensor1d_multiply0.e0_write_data;
-      e.write_en = tensor1d_multiply0.e0_write_en;
-      tensor1d_multiply0.e0_done = e.done;
-      tensor1d_multiply0.go = 1'd1;
-      run_tensor1d_multiply[done] = tensor1d_multiply0.done ? 1'd1;
+    group run_multiply {
+      c.addr0 = multiply0.c0_addr0;
+      multiply0.c0_read_data = c.read_data;
+      d.addr0 = multiply0.d0_addr0;
+      multiply0.d0_read_data = d.read_data;
+      e.addr0 = multiply0.e0_addr0;
+      e.write_data = multiply0.e0_write_data;
+      e.write_en = multiply0.e0_write_en;
+      multiply0.e0_done = e.done;
+      multiply0.go = 1'd1;
+      run_multiply[done] = multiply0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_tensor1d_subtract;
-      run_tensor1d_divide;
-      run_tensor1d_multiply;
+      run_subtract;
+      run_divide;
+      run_multiply;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/sub.expect b/frontends/relay-futil/tests/sub.expect
index 9cac092744..c74af4fb2e 100644
--- a/frontends/relay-futil/tests/sub.expect
+++ b/frontends/relay-futil/tests/sub.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component tensor1d_subtract(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, y0_read_data: 32, y0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, x0_addr0: 1, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, y0_addr0: 1, y0_write_data: 32, y0_write_en: 1, y0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
+component subtract(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, y0_read_data: 32, y0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, x0_addr0: 1, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, y0_addr0: 1, y0_write_data: 32, y0_write_en: 1, y0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
   cells {
     add0 = prim std_add(1);
     const0 = prim std_const(1, 0);
@@ -74,25 +74,25 @@ component main () -> () {
     z = prim std_mem_d1(32, 1, 1);
     x = prim std_mem_d1(32, 1, 1);
     y = prim std_mem_d1(32, 1, 1);
-    tensor1d_subtract0 = tensor1d_subtract;
+    subtract0 = subtract;
   }
   wires {
-    group run_tensor1d_subtract {
-      x.addr0 = tensor1d_subtract0.x0_addr0;
-      tensor1d_subtract0.x0_read_data = x.read_data;
-      y.addr0 = tensor1d_subtract0.y0_addr0;
-      tensor1d_subtract0.y0_read_data = y.read_data;
-      z.addr0 = tensor1d_subtract0.z0_addr0;
-      z.write_data = tensor1d_subtract0.z0_write_data;
-      z.write_en = tensor1d_subtract0.z0_write_en;
-      tensor1d_subtract0.z0_done = z.done;
-      tensor1d_subtract0.go = 1'd1;
-      run_tensor1d_subtract[done] = tensor1d_subtract0.done ? 1'd1;
+    group run_subtract {
+      x.addr0 = subtract0.x0_addr0;
+      subtract0.x0_read_data = x.read_data;
+      y.addr0 = subtract0.y0_addr0;
+      subtract0.y0_read_data = y.read_data;
+      z.addr0 = subtract0.z0_addr0;
+      z.write_data = subtract0.z0_write_data;
+      z.write_en = subtract0.z0_write_en;
+      subtract0.z0_done = z.done;
+      subtract0.go = 1'd1;
+      run_subtract[done] = subtract0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_tensor1d_subtract;
+      run_subtract;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/tensor1d_mult.expect b/frontends/relay-futil/tests/tensor1d_mult.expect
index d6086cd33d..dac0e76d85 100644
--- a/frontends/relay-futil/tests/tensor1d_mult.expect
+++ b/frontends/relay-futil/tests/tensor1d_mult.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component tensor1d_multiply(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, x10_read_data: 32, x10_done: 1, y0_read_data: 32, y0_done: 1) -> (done: 1, x0_addr0: 3, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, x10_addr0: 3, x10_write_data: 32, x10_write_en: 1, x10_clk: 1, y0_addr0: 3, y0_write_data: 32, y0_write_en: 1, y0_clk: 1) {
+component multiply(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, x10_read_data: 32, x10_done: 1, y0_read_data: 32, y0_done: 1) -> (done: 1, x0_addr0: 3, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, x10_addr0: 3, x10_write_data: 32, x10_write_en: 1, x10_clk: 1, y0_addr0: 3, y0_write_data: 32, y0_write_en: 1, y0_clk: 1) {
   cells {
     add0 = prim std_add(3);
     bin_read0_0 = prim std_reg(32);
@@ -82,25 +82,25 @@ component main () -> () {
     x1 = prim std_mem_d1(32, 4, 3);
     x = prim std_mem_d1(32, 4, 3);
     y = prim std_mem_d1(32, 4, 3);
-    tensor1d_multiply0 = tensor1d_multiply;
+    multiply0 = multiply;
   }
   wires {
-    group run_tensor1d_multiply {
-      x.addr0 = tensor1d_multiply0.x0_addr0;
-      tensor1d_multiply0.x0_read_data = x.read_data;
-      y.addr0 = tensor1d_multiply0.y0_addr0;
-      tensor1d_multiply0.y0_read_data = y.read_data;
-      x1.addr0 = tensor1d_multiply0.x10_addr0;
-      x1.write_data = tensor1d_multiply0.x10_write_data;
-      x1.write_en = tensor1d_multiply0.x10_write_en;
-      tensor1d_multiply0.x10_done = x1.done;
-      tensor1d_multiply0.go = 1'd1;
-      run_tensor1d_multiply[done] = tensor1d_multiply0.done ? 1'd1;
+    group run_multiply {
+      x.addr0 = multiply0.x0_addr0;
+      multiply0.x0_read_data = x.read_data;
+      y.addr0 = multiply0.y0_addr0;
+      multiply0.y0_read_data = y.read_data;
+      x1.addr0 = multiply0.x10_addr0;
+      x1.write_data = multiply0.x10_write_data;
+      x1.write_en = multiply0.x10_write_en;
+      multiply0.x10_done = x1.done;
+      multiply0.go = 1'd1;
+      run_multiply[done] = multiply0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_tensor1d_multiply;
+      run_multiply;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/tensor2d_add.expect b/frontends/relay-futil/tests/tensor2d_add.expect
index 46db3a2cab..d289badb27 100644
--- a/frontends/relay-futil/tests/tensor2d_add.expect
+++ b/frontends/relay-futil/tests/tensor2d_add.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component tensor2d_add(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1, y0_0_read_data: 32, y0_0_done: 1) -> (done: 1, x0_0_addr0: 2, x0_0_addr1: 3, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 2, x10_0_addr1: 3, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1, y0_0_addr0: 2, y0_0_addr1: 3, y0_0_write_data: 32, y0_0_write_en: 1, y0_0_clk: 1) {
+component add(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1, y0_0_read_data: 32, y0_0_done: 1) -> (done: 1, x0_0_addr0: 2, x0_0_addr1: 3, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 2, x10_0_addr1: 3, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1, y0_0_addr0: 2, y0_0_addr1: 3, y0_0_write_data: 32, y0_0_write_en: 1, y0_0_clk: 1) {
   cells {
     add0 = prim std_add(32);
     add1 = prim std_add(3);
@@ -106,28 +106,28 @@ component main () -> () {
     x1 = prim std_mem_d2(32, 2, 4, 2, 3);
     x = prim std_mem_d2(32, 2, 4, 2, 3);
     y = prim std_mem_d2(32, 2, 4, 2, 3);
-    tensor2d_add0 = tensor2d_add;
+    add0 = add;
   }
   wires {
-    group run_tensor2d_add {
-      x.addr0 = tensor2d_add0.x0_0_addr0;
-      tensor2d_add0.x0_0_read_data = x.read_data;
-      x.addr1 = tensor2d_add0.x0_0_addr1;
-      y.addr0 = tensor2d_add0.y0_0_addr0;
-      tensor2d_add0.y0_0_read_data = y.read_data;
-      y.addr1 = tensor2d_add0.y0_0_addr1;
-      x1.addr0 = tensor2d_add0.x10_0_addr0;
-      x1.addr1 = tensor2d_add0.x10_0_addr1;
-      x1.write_data = tensor2d_add0.x10_0_write_data;
-      x1.write_en = tensor2d_add0.x10_0_write_en;
-      tensor2d_add0.x10_0_done = x1.done;
-      tensor2d_add0.go = 1'd1;
-      run_tensor2d_add[done] = tensor2d_add0.done ? 1'd1;
+    group run_add {
+      x.addr0 = add0.x0_0_addr0;
+      add0.x0_0_read_data = x.read_data;
+      x.addr1 = add0.x0_0_addr1;
+      y.addr0 = add0.y0_0_addr0;
+      add0.y0_0_read_data = y.read_data;
+      y.addr1 = add0.y0_0_addr1;
+      x1.addr0 = add0.x10_0_addr0;
+      x1.addr1 = add0.x10_0_addr1;
+      x1.write_data = add0.x10_0_write_data;
+      x1.write_en = add0.x10_0_write_en;
+      add0.x10_0_done = x1.done;
+      add0.go = 1'd1;
+      run_add[done] = add0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_tensor2d_add;
+      run_add;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/tensor3d_divide.expect b/frontends/relay-futil/tests/tensor3d_divide.expect
index 10eb243cc1..5058296dd8 100644
--- a/frontends/relay-futil/tests/tensor3d_divide.expect
+++ b/frontends/relay-futil/tests/tensor3d_divide.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component tensor3d_divide(go: 1, clk: 1, x0_0_0_read_data: 32, x0_0_0_done: 1, x10_0_0_read_data: 32, x10_0_0_done: 1, y0_0_0_read_data: 32, y0_0_0_done: 1) -> (done: 1, x0_0_0_addr0: 3, x0_0_0_addr1: 3, x0_0_0_addr2: 3, x0_0_0_write_data: 32, x0_0_0_write_en: 1, x0_0_0_clk: 1, x10_0_0_addr0: 3, x10_0_0_addr1: 3, x10_0_0_addr2: 3, x10_0_0_write_data: 32, x10_0_0_write_en: 1, x10_0_0_clk: 1, y0_0_0_addr0: 3, y0_0_0_addr1: 3, y0_0_0_addr2: 3, y0_0_0_write_data: 32, y0_0_0_write_en: 1, y0_0_0_clk: 1) {
+component divide(go: 1, clk: 1, x0_0_0_read_data: 32, x0_0_0_done: 1, x10_0_0_read_data: 32, x10_0_0_done: 1, y0_0_0_read_data: 32, y0_0_0_done: 1) -> (done: 1, x0_0_0_addr0: 3, x0_0_0_addr1: 3, x0_0_0_addr2: 3, x0_0_0_write_data: 32, x0_0_0_write_en: 1, x0_0_0_clk: 1, x10_0_0_addr0: 3, x10_0_0_addr1: 3, x10_0_0_addr2: 3, x10_0_0_write_data: 32, x10_0_0_write_en: 1, x10_0_0_clk: 1, y0_0_0_addr0: 3, y0_0_0_addr1: 3, y0_0_0_addr2: 3, y0_0_0_write_data: 32, y0_0_0_write_en: 1, y0_0_0_clk: 1) {
   cells {
     add0 = prim std_add(3);
     add1 = prim std_add(3);
@@ -146,31 +146,31 @@ component main () -> () {
     x1 = prim std_mem_d3(32, 4, 5, 6, 3, 3, 3);
     x = prim std_mem_d3(32, 4, 5, 6, 3, 3, 3);
     y = prim std_mem_d3(32, 4, 5, 6, 3, 3, 3);
-    tensor3d_divide0 = tensor3d_divide;
+    divide0 = divide;
   }
   wires {
-    group run_tensor3d_divide {
-      x.addr0 = tensor3d_divide0.x0_0_0_addr0;
-      tensor3d_divide0.x0_0_0_read_data = x.read_data;
-      x.addr1 = tensor3d_divide0.x0_0_0_addr1;
-      x.addr2 = tensor3d_divide0.x0_0_0_addr2;
-      y.addr0 = tensor3d_divide0.y0_0_0_addr0;
-      tensor3d_divide0.y0_0_0_read_data = y.read_data;
-      y.addr1 = tensor3d_divide0.y0_0_0_addr1;
-      y.addr2 = tensor3d_divide0.y0_0_0_addr2;
-      x1.addr0 = tensor3d_divide0.x10_0_0_addr0;
-      x1.addr1 = tensor3d_divide0.x10_0_0_addr1;
-      x1.addr2 = tensor3d_divide0.x10_0_0_addr2;
-      x1.write_data = tensor3d_divide0.x10_0_0_write_data;
-      x1.write_en = tensor3d_divide0.x10_0_0_write_en;
-      tensor3d_divide0.x10_0_0_done = x1.done;
-      tensor3d_divide0.go = 1'd1;
-      run_tensor3d_divide[done] = tensor3d_divide0.done ? 1'd1;
+    group run_divide {
+      x.addr0 = divide0.x0_0_0_addr0;
+      divide0.x0_0_0_read_data = x.read_data;
+      x.addr1 = divide0.x0_0_0_addr1;
+      x.addr2 = divide0.x0_0_0_addr2;
+      y.addr0 = divide0.y0_0_0_addr0;
+      divide0.y0_0_0_read_data = y.read_data;
+      y.addr1 = divide0.y0_0_0_addr1;
+      y.addr2 = divide0.y0_0_0_addr2;
+      x1.addr0 = divide0.x10_0_0_addr0;
+      x1.addr1 = divide0.x10_0_0_addr1;
+      x1.addr2 = divide0.x10_0_0_addr2;
+      x1.write_data = divide0.x10_0_0_write_data;
+      x1.write_en = divide0.x10_0_0_write_en;
+      divide0.x10_0_0_done = x1.done;
+      divide0.go = 1'd1;
+      run_divide[done] = divide0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_tensor3d_divide;
+      run_divide;
     }
   }
 }

From 8614cc6f249d0053082ea3cc2feec49ed952abde Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Fri, 20 Nov 2020 20:32:28 -0500
Subject: [PATCH 21/75] Fix dahlia name.

---
 frontends/relay-futil/compiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index 9f97f7ac11..2d48d270e0 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -64,7 +64,7 @@ def produce_dahlia_name(self, name, type):
                              PrimitiveType.Memory3D: '_0_0', PrimitiveType.Memory4D: '_0_0_0'}
         dahlia_name = self.id(name)
         assert type in DahliaNameMapping, f'{name} with {type} is not supported yet.'
-        return DahliaNameMapping[type]
+        return dahlia_name + DahliaNameMapping[type]
 
     def get_dahlia_declaration(self, function_name, cells, args):
         """

From 6657d2f763d91fa09893340b08b70c6a3ebeb863 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Fri, 20 Nov 2020 20:48:52 -0500
Subject: [PATCH 22/75] Add axis=1.

---
 frontends/relay-futil/compiler.py          | 3 ++-
 frontends/relay-futil/tests/bias_add.relay | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index 2d48d270e0..cf7af47ab6 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -103,7 +103,8 @@ def visit_let(self, let):
 
     def visit_constant(self, const):
         type, shape = const.data.dtype, const.data.shape
-        name, data, data_type = self.id("const"), [get_bitwidth(type), int(const.data.asnumpy())], get_type(type)
+        name, data = self.id("const"), [get_bitwidth(type), int(const.data.asnumpy())]
+        data_type = get_memory_parameters(type)
         return FCell(primitive=FPrimitive(name=name, data=data, data_type=data_type, type=PrimitiveType.Constant))
 
     def visit_call(self, call):
diff --git a/frontends/relay-futil/tests/bias_add.relay b/frontends/relay-futil/tests/bias_add.relay
index 6b90020ed3..f71e872e09 100644
--- a/frontends/relay-futil/tests/bias_add.relay
+++ b/frontends/relay-futil/tests/bias_add.relay
@@ -1,6 +1,6 @@
 v0.0.4
 fn (%x: Tensor[(1, 64), float32], %bias: Tensor[(64), float32])  {
-  let %x1: Tensor[(1, 64), float32] = nn.bias_add(%x, %bias);
+  let %x1: Tensor[(1, 64), float32] = nn.bias_add(%x, %bias, axis=1);
   %x1
 }
 

From b4a373ae9acf4a38c5702f73d67a1112d940a065 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Fri, 20 Nov 2020 20:51:22 -0500
Subject: [PATCH 23/75] Add "Does not work." placeholder to expand_dims.expect.

---
 frontends/relay-futil/tests/expand_dims.expect | 1 +
 1 file changed, 1 insertion(+)

diff --git a/frontends/relay-futil/tests/expand_dims.expect b/frontends/relay-futil/tests/expand_dims.expect
index e69de29bb2..7a959bb71d 100644
--- a/frontends/relay-futil/tests/expand_dims.expect
+++ b/frontends/relay-futil/tests/expand_dims.expect
@@ -0,0 +1 @@
+// Does not work.
\ No newline at end of file

From 21a4921ffe79bcfebe18353bebee183cb1776fa8 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Fri, 20 Nov 2020 21:14:51 -0500
Subject: [PATCH 24/75] Add transforms.

---
 frontends/relay-futil/compiler.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index cf7af47ab6..0f37c0142d 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -1,3 +1,4 @@
+import tvm
 from tvm import relay, ir
 from tvm.relay.expr_functor import ExprFunctor
 from tvm.relay.function import Function
@@ -126,17 +127,22 @@ def visit_function(self, function):
         return pp_component(self.main)
 
 
-def infer_type(expr: Function) -> Function:
-    infer_types_pass = relay.transform.InferType()
+def relay_transforms(expr: Function) -> Function:
+    """https://tvm.apache.org/docs/api/python/relay/transform.html"""
+    transform = tvm.transform.Sequential([
+        relay.transform.SimplifyExpr(),
+        relay.transform.SimplifyInference(),
+        relay.transform.InferType()
+    ])
     mod = ir.IRModule()
     mod['main'] = expr
-    mod = infer_types_pass(mod)
+    mod = transform(mod)
     return mod['main']
 
 
 def compile(program) -> str:
     """Translate a Relay function to a FuTIL program (as a string)."""
-    program = infer_type(program)
+    program = relay_transforms(program)
     visitor = Relay2Futil()
 
     PREAMBLE = """import "primitives/std.lib";"""

From 40b93920f8fac6302b186e12ff0bd9efcb114622 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Sat, 21 Nov 2020 08:43:42 -0500
Subject: [PATCH 25/75] Add attributes, bias_add along different axis.

---
 frontends/relay-futil/compiler.py             |  9 ++---
 frontends/relay-futil/dahlia_functions.py     | 24 ++++++++-----
 frontends/relay-futil/futil_ast.py            |  2 ++
 .../relay-futil/tests/data/bias_add.relay     |  2 +-
 .../relay-futil/tests/data/bias_add2.expect   | 34 +++++++++++++++++++
 .../relay-futil/tests/data/bias_add2.relay    |  5 +++
 .../tests/data/bias_add2.relay.data           | 14 ++++++++
 frontends/relay-futil/utilities.py            | 14 ++++----
 8 files changed, 84 insertions(+), 20 deletions(-)
 create mode 100644 frontends/relay-futil/tests/data/bias_add2.expect
 create mode 100644 frontends/relay-futil/tests/data/bias_add2.relay
 create mode 100644 frontends/relay-futil/tests/data/bias_add2.relay.data

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index 0f37c0142d..d8afe5474b 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -67,7 +67,7 @@ def produce_dahlia_name(self, name, type):
         assert type in DahliaNameMapping, f'{name} with {type} is not supported yet.'
         return dahlia_name + DahliaNameMapping[type]
 
-    def get_dahlia_declaration(self, function_name, cells, args):
+    def get_dahlia_declaration(self, function_name, cells, args, attrs):
         """
         Returns the corresponding name, Dahlia function type, and op (if it is a binary op, otherwise None).
         If the function type isn't supported, fails with an assertion.
@@ -82,8 +82,8 @@ def get_dahlia_declaration(self, function_name, cells, args):
             name = function.__name__
         else:
             assert False, f'{function_name} with type {input_type} is not supported.'
-        return DahliaDeclaration(component_name=self.relay_id(name), decl_name=self.id(name), op=op, inputs=args,
-                                 function=function)
+        return DahliaDeclaration(component_name=self.relay_id(name), decl_name=self.id(name),
+                                 op=op, inputs=args, attributes=attrs, function=function)
 
     def visit_var(self, var):
         name = self.relay_id(var.name_hint)
@@ -109,12 +109,13 @@ def visit_constant(self, const):
         return FCell(primitive=FPrimitive(name=name, data=data, data_type=data_type, type=PrimitiveType.Constant))
 
     def visit_call(self, call):
+        attributes = call.attrs
         cells, args = [], []
         for arg in call.args:
             argument = self.visit(arg)
             cells.append(argument)
             args.append(argument)
-        cells.append(FCell(dahlia_declaration=self.get_dahlia_declaration(call.op.name, cells, args)))
+        cells.append(FCell(dahlia_declaration=self.get_dahlia_declaration(call.op.name, cells, args, call.attrs)))
         return cells
 
     def visit_function(self, function):
diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 50dd7100d4..c2c69c61bb 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -136,7 +136,7 @@ def batch_flatten(declaration):
 
 def bias_add(declaration):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.bias_add"""
-    # Assumes default value axis=1 is passed in.
+    axis = declaration.attributes.get_int("axis")
     data, bias, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
     bitwidth = data.data[0]
     size0, size1, index_size0, index_size1 = data.data[1], data.data[2], data.data[3], data.data[4]
@@ -144,13 +144,21 @@ def bias_add(declaration):
     program = f"""
     decl {data.name}: {data.data_type}<{bitwidth}>[{size0}][{size1}];
     decl {bias.name}: {bias.data_type}<{bitwidth}>[{bias_size}];
-    decl {res.name}: {res.data_type}<{bitwidth}>[{size0}][{size1}];
-    for (let i: ubit<{index_size0}> = 0..{size0}) {{
-      for (let j: ubit<{index_size1}> = 0..{size1}) {{
-        {res.name}[i][j] := {data.name}[i][j] + {bias.name}[j];
-      }}
-    }}
-    """
+    decl {res.name}: {res.data_type}<{bitwidth}>[{size0}][{size1}];"""
+    if axis == 1:
+        program += f"""
+        for (let i: ubit<{index_size0}> = 0..{size0}) {{
+          for (let j: ubit<{index_size1}> = 0..{size1}) {{
+            {res.name}[i][j] := {data.name}[i][j] + {bias.name}[j];
+          }}
+        }}"""
+    elif axis == 0:
+        program += f"""
+        for (let j: ubit<{index_size1}> = 0..{size1}) {{
+          for (let i: ubit<{index_size0}> = 0..{size0}) {{
+            {res.name}[i][j] := {data.name}[i][j] + {bias.name}[i];
+          }}
+        }}"""
     return lower_dahlia_program(program, declaration.component_name)
 
 
diff --git a/frontends/relay-futil/futil_ast.py b/frontends/relay-futil/futil_ast.py
index 868e6ffbcf..9432a62a4d 100644
--- a/frontends/relay-futil/futil_ast.py
+++ b/frontends/relay-futil/futil_ast.py
@@ -1,3 +1,4 @@
+import tvm
 from dataclasses import dataclass
 from typing import List, Dict
 from types import FunctionType
@@ -142,6 +143,7 @@ class DahliaDeclaration:
     op: str = None
     inputs: List[Cell] = None
     output: Cell = None
+    attributes: tvm.ir.Attrs = None
     function: FunctionType = None
     program: str = None
 
diff --git a/frontends/relay-futil/tests/data/bias_add.relay b/frontends/relay-futil/tests/data/bias_add.relay
index 4a1c58a64a..1f9b35120c 100644
--- a/frontends/relay-futil/tests/data/bias_add.relay
+++ b/frontends/relay-futil/tests/data/bias_add.relay
@@ -1,5 +1,5 @@
 v0.0.4
 fn (%x: Tensor[(2, 4), float32], %bias: Tensor[(4), float32])  {
-  let %x1: Tensor[(2, 4), float32] = nn.bias_add(%x, %bias);
+  let %x1: Tensor[(2, 4), float32] = nn.bias_add(%x, %bias, axis=1);
   %x1
 }
\ No newline at end of file
diff --git a/frontends/relay-futil/tests/data/bias_add2.expect b/frontends/relay-futil/tests/data/bias_add2.expect
new file mode 100644
index 0000000000..1d6ef587cb
--- /dev/null
+++ b/frontends/relay-futil/tests/data/bias_add2.expect
@@ -0,0 +1,34 @@
+{
+  "bias": [
+    42,
+    5
+  ],
+  "x": [
+    [
+      0,
+      0,
+      0,
+      0
+    ],
+    [
+      0,
+      0,
+      0,
+      0
+    ]
+  ],
+  "x1": [
+    [
+      42,
+      42,
+      42,
+      42
+    ],
+    [
+      5,
+      5,
+      5,
+      5
+    ]
+  ]
+}
diff --git a/frontends/relay-futil/tests/data/bias_add2.relay b/frontends/relay-futil/tests/data/bias_add2.relay
new file mode 100644
index 0000000000..302ede8c97
--- /dev/null
+++ b/frontends/relay-futil/tests/data/bias_add2.relay
@@ -0,0 +1,5 @@
+v0.0.4
+fn (%x: Tensor[(2, 4), float32], %bias: Tensor[(2), float32])  {
+  let %x1: Tensor[(2, 4), float32] = nn.bias_add(%x, %bias, axis=0);
+  %x1
+}
\ No newline at end of file
diff --git a/frontends/relay-futil/tests/data/bias_add2.relay.data b/frontends/relay-futil/tests/data/bias_add2.relay.data
new file mode 100644
index 0000000000..b96b5aa3e7
--- /dev/null
+++ b/frontends/relay-futil/tests/data/bias_add2.relay.data
@@ -0,0 +1,14 @@
+{
+  "x": {
+    "data": [[0,0,0,0], [0,0,0,0]],
+    "bitwidth": 32
+  },
+  "bias": {
+    "data": [42,5],
+    "bitwidth": 32
+  },
+  "x1": {
+    "data": [[0,0,0,0], [0,0,0,0]],
+    "bitwidth": 32
+    }
+}
\ No newline at end of file
diff --git a/frontends/relay-futil/utilities.py b/frontends/relay-futil/utilities.py
index 39a4591af1..faaccf1fe7 100644
--- a/frontends/relay-futil/utilities.py
+++ b/frontends/relay-futil/utilities.py
@@ -2,6 +2,10 @@
 from itertools import chain
 import math
 
+# Mapping from the tensor dimensions to the corresponding FuTIL memory type.
+TensorToMemoryDimensionMapping = {1: PrimitiveType.Memory1D, 2: PrimitiveType.Memory2D,
+                                  3: PrimitiveType.Memory3D, 4: PrimitiveType.Memory4D}
+
 
 def flatten(l):
     '''
@@ -60,15 +64,11 @@ def get_memory_parameters(type):
     string_dimensions = t[t.find("(") + 1:t.find(")")]
 
     tensor_dimensions = list(map(int, string_dimensions.split(',')))
-    data = [get_bitwidth(string_type)]
+    data, num_dimensions = [get_bitwidth(string_type)], len(tensor_dimensions)
+    assert num_dimensions in TensorToMemoryDimensionMapping, f'{num_dimensions} dimensions is not supported.'
     for dimension in tensor_dimensions: data.append(dimension)  # Size.
     for dimension in tensor_dimensions: data.append(int(math.log2(dimension) + 1))  # Index size.
-
-    if len(tensor_dimensions) == 1: primitive_type = PrimitiveType.Memory1D
-    if len(tensor_dimensions) == 2: primitive_type = PrimitiveType.Memory2D
-    if len(tensor_dimensions) == 3: primitive_type = PrimitiveType.Memory3D
-    if len(tensor_dimensions) == 4: primitive_type = PrimitiveType.Memory4D
-    return data, primitive_type, data_type
+    return data, TensorToMemoryDimensionMapping[num_dimensions], data_type
 
 
 def build_main_controls(c: FComponent):

From 3555c6241d93913a9139489d0f979e8f46edb388 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Sat, 21 Nov 2020 11:17:58 -0500
Subject: [PATCH 26/75] Add bias add.

---
 frontends/relay-futil/dahlia_functions.py   |  62 ++++++---
 frontends/relay-futil/tests/bias_add.expect | 138 ++++++++++++++------
 frontends/relay-futil/tests/bias_add.relay  |   4 +-
 3 files changed, 143 insertions(+), 61 deletions(-)

diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index c2c69c61bb..00eb769fa4 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -31,7 +31,6 @@ def lower_dahlia_program(prog, component_name):
         (done: 1, X0_addr0: 2, X0_write_data: 32, X0_write_en: 1, X0_clk: 1) {
            ...
         }
-
     '''
     program_string = '\n'.join(prog.splitlines())
     with NamedTemporaryFile() as tf0, NamedTemporaryFile() as tf1, NamedTemporaryFile() as tf2:
@@ -139,26 +138,47 @@ def bias_add(declaration):
     axis = declaration.attributes.get_int("axis")
     data, bias, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
     bitwidth = data.data[0]
-    size0, size1, index_size0, index_size1 = data.data[1], data.data[2], data.data[3], data.data[4]
-    bias_size, bias_index_size = bias.data[1], bias.data[2]
-    program = f"""
-    decl {data.name}: {data.data_type}<{bitwidth}>[{size0}][{size1}];
-    decl {bias.name}: {bias.data_type}<{bitwidth}>[{bias_size}];
-    decl {res.name}: {res.data_type}<{bitwidth}>[{size0}][{size1}];"""
-    if axis == 1:
-        program += f"""
-        for (let i: ubit<{index_size0}> = 0..{size0}) {{
-          for (let j: ubit<{index_size1}> = 0..{size1}) {{
-            {res.name}[i][j] := {data.name}[i][j] + {bias.name}[j];
-          }}
-        }}"""
-    elif axis == 0:
-        program += f"""
-        for (let j: ubit<{index_size1}> = 0..{size1}) {{
-          for (let i: ubit<{index_size0}> = 0..{size0}) {{
-            {res.name}[i][j] := {data.name}[i][j] + {bias.name}[i];
-          }}
-        }}"""
+    if data.type == PrimitiveType.Memory2D:
+        size0, size1, index_size0, index_size1 = data.data[1], data.data[2], data.data[3], data.data[4]
+        bias_size, bias_index_size = bias.data[1], bias.data[2]
+        program = f"""
+        decl {data.name}: {data.data_type}<{bitwidth}>[{size0}][{size1}];
+        decl {bias.name}: {bias.data_type}<{bitwidth}>[{bias_size}];
+        decl {res.name}: {res.data_type}<{bitwidth}>[{size0}][{size1}];"""
+        if axis == 1:
+            program += f"""
+            for (let i: ubit<{index_size0}> = 0..{size0}) {{
+              for (let j: ubit<{index_size1}> = 0..{size1}) {{
+                {res.name}[i][j] := {data.name}[i][j] + {bias.name}[j];
+              }}
+            }}"""
+        elif axis == 0:
+            program += f"""
+            for (let j: ubit<{index_size1}> = 0..{size1}) {{
+              for (let i: ubit<{index_size0}> = 0..{size0}) {{
+                {res.name}[i][j] := {data.name}[i][j] + {bias.name}[i];
+              }}
+            }}"""
+    elif data.type == PrimitiveType.Memory4D:
+        bitwidth, size0, size1, size2, size3 = data.data[0], data.data[1], data.data[2], data.data[3], data.data[4]
+        index_size0, index_size1, index_size2, index_size3 = data.data[5], data.data[6], data.data[7], data.data[8]
+        bias_size, bias_index_size = bias.data[1], bias.data[2]
+        program = f"""
+        decl {data.name}: {data.data_type}<{bitwidth}>[{size0}][{size1}][{size2}][{size3}];
+        decl {bias.name}: {bias.data_type}<{bitwidth}>[{bias_size}];
+        decl {res.name}: {res.data_type}<{bitwidth}>[{size0}][{size1}][{size2}][{size3}];"""
+        if axis == 1:
+            program += f"""
+            for (let i: ubit<{index_size0}> = 0..{size0}) {{
+              for (let j: ubit<{index_size1}> = 0..{size1}) {{
+                for (let k: ubit<{index_size2}> = 0..{size2}) {{
+                  for (let l: ubit<{index_size3}> = 0..{size3}) {{
+                    {res.name}[i][j][k][l] := {data.name}[i][j][k][l] + {bias.name}[j];
+                  }}
+                }}
+              }}
+            }}"""
+
     return lower_dahlia_program(program, declaration.component_name)
 
 
diff --git a/frontends/relay-futil/tests/bias_add.expect b/frontends/relay-futil/tests/bias_add.expect
index 08566e1f14..562f55dfe0 100644
--- a/frontends/relay-futil/tests/bias_add.expect
+++ b/frontends/relay-futil/tests/bias_add.expect
@@ -1,21 +1,33 @@
 import "primitives/std.lib";
 
-component bias_add(go: 1, clk: 1, bias0_read_data: 32, bias0_done: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1) -> (done: 1, bias0_addr0: 7, bias0_write_data: 32, bias0_write_en: 1, bias0_clk: 1, x0_0_addr0: 1, x0_0_addr1: 7, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 7, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1) {
+component bias_add(go: 1, clk: 1, bias0_read_data: 32, bias0_done: 1, x0_0_0_0_read_data: 32, x0_0_0_0_done: 1, x10_0_0_0_read_data: 32, x10_0_0_0_done: 1) -> (done: 1, bias0_addr0: 7, bias0_write_data: 32, bias0_write_en: 1, bias0_clk: 1, x0_0_0_0_addr0: 1, x0_0_0_0_addr1: 7, x0_0_0_0_addr2: 10, x0_0_0_0_addr3: 9, x0_0_0_0_write_data: 32, x0_0_0_0_write_en: 1, x0_0_0_0_clk: 1, x10_0_0_0_addr0: 1, x10_0_0_0_addr1: 7, x10_0_0_0_addr2: 10, x10_0_0_0_addr3: 9, x10_0_0_0_write_data: 32, x10_0_0_0_write_en: 1, x10_0_0_0_clk: 1) {
   cells {
     add0 = prim fixed_p_std_add(32, 16, 16);
-    add1 = prim std_add(7);
-    add2 = prim std_add(1);
+    add1 = prim std_add(9);
+    add2 = prim std_add(10);
+    add3 = prim std_add(7);
+    add4 = prim std_add(1);
     bias_read0_0 = prim std_reg(32);
     const0 = prim std_const(1, 0);
     const1 = prim std_const(1, 0);
+    const10 = prim std_const(7, 1);
+    const11 = prim std_const(1, 1);
     const2 = prim std_const(7, 0);
     const3 = prim std_const(7, 63);
-    const4 = prim std_const(7, 1);
-    const5 = prim std_const(1, 1);
+    const4 = prim std_const(10, 0);
+    const5 = prim std_const(10, 511);
+    const6 = prim std_const(9, 0);
+    const7 = prim std_const(9, 255);
+    const8 = prim std_const(9, 1);
+    const9 = prim std_const(10, 1);
     i0 = prim std_reg(1);
     j0 = prim std_reg(7);
+    k0 = prim std_reg(10);
+    l0 = prim std_reg(9);
     le0 = prim std_le(1);
     le1 = prim std_le(7);
+    le2 = prim std_le(10);
+    le3 = prim std_le(9);
     x_read0_0 = prim std_reg(32);
   }
   wires {
@@ -29,6 +41,16 @@ component bias_add(go: 1, clk: 1, bias0_read_data: 32, bias0_done: 1, x0_0_read_
       le1.left = j0.out;
       le1.right = const3.out;
     }
+    group cond2<"static"=0> {
+      cond2[done] = 1'd1;
+      le2.left = k0.out;
+      le2.right = const5.out;
+    }
+    group cond3<"static"=0> {
+      cond3[done] = 1'd1;
+      le3.left = l0.out;
+      le3.right = const7.out;
+    }
     group let0<"static"=1> {
       i0.in = const0.out;
       i0.write_en = 1'd1;
@@ -39,11 +61,23 @@ component bias_add(go: 1, clk: 1, bias0_read_data: 32, bias0_done: 1, x0_0_read_
       j0.write_en = 1'd1;
       let1[done] = j0.done;
     }
+    group let2<"static"=1> {
+      k0.in = const4.out;
+      k0.write_en = 1'd1;
+      let2[done] = k0.done;
+    }
+    group let3<"static"=1> {
+      l0.in = const6.out;
+      l0.write_en = 1'd1;
+      let3[done] = l0.done;
+    }
     group upd0<"static"=1> {
       x_read0_0.write_en = 1'd1;
-      x0_0_addr1 = j0.out;
-      x0_0_addr0 = i0.out;
-      x_read0_0.in = 1'd1 ? x0_0_read_data;
+      x0_0_0_0_addr3 = l0.out;
+      x0_0_0_0_addr2 = k0.out;
+      x0_0_0_0_addr1 = j0.out;
+      x0_0_0_0_addr0 = i0.out;
+      x_read0_0.in = 1'd1 ? x0_0_0_0_read_data;
       upd0[done] = x_read0_0.done ? 1'd1;
     }
     group upd1<"static"=1> {
@@ -53,27 +87,43 @@ component bias_add(go: 1, clk: 1, bias0_read_data: 32, bias0_done: 1, x0_0_read_
       upd1[done] = bias_read0_0.done ? 1'd1;
     }
     group upd2<"static"=1> {
-      x10_0_addr1 = j0.out;
-      x10_0_addr0 = i0.out;
-      x10_0_write_en = 1'd1;
+      x10_0_0_0_addr3 = l0.out;
+      x10_0_0_0_addr2 = k0.out;
+      x10_0_0_0_addr1 = j0.out;
+      x10_0_0_0_addr0 = i0.out;
+      x10_0_0_0_write_en = 1'd1;
       add0.left = x_read0_0.out;
       add0.right = bias_read0_0.out;
-      x10_0_write_data = 1'd1 ? add0.out;
-      upd2[done] = x10_0_done ? 1'd1;
+      x10_0_0_0_write_data = 1'd1 ? add0.out;
+      upd2[done] = x10_0_0_0_done ? 1'd1;
     }
     group upd3<"static"=1> {
-      j0.write_en = 1'd1;
-      add1.left = j0.out;
-      add1.right = const4.out;
-      j0.in = 1'd1 ? add1.out;
-      upd3[done] = j0.done ? 1'd1;
+      l0.write_en = 1'd1;
+      add1.left = l0.out;
+      add1.right = const8.out;
+      l0.in = 1'd1 ? add1.out;
+      upd3[done] = l0.done ? 1'd1;
     }
     group upd4<"static"=1> {
+      k0.write_en = 1'd1;
+      add2.left = k0.out;
+      add2.right = const9.out;
+      k0.in = 1'd1 ? add2.out;
+      upd4[done] = k0.done ? 1'd1;
+    }
+    group upd5<"static"=1> {
+      j0.write_en = 1'd1;
+      add3.left = j0.out;
+      add3.right = const10.out;
+      j0.in = 1'd1 ? add3.out;
+      upd5[done] = j0.done ? 1'd1;
+    }
+    group upd6<"static"=1> {
       i0.write_en = 1'd1;
-      add2.left = i0.out;
-      add2.right = const5.out;
-      i0.in = 1'd1 ? add2.out;
-      upd4[done] = i0.done ? 1'd1;
+      add4.left = i0.out;
+      add4.right = const11.out;
+      i0.in = 1'd1 ? add4.out;
+      upd6[done] = i0.done ? 1'd1;
     }
   }
 
@@ -85,15 +135,27 @@ component bias_add(go: 1, clk: 1, bias0_read_data: 32, bias0_done: 1, x0_0_read_
           let1;
           while le1.out with cond1 {
             seq {
-              par {
-                upd0;
-                upd1;
+              let2;
+              while le2.out with cond2 {
+                seq {
+                  let3;
+                  while le3.out with cond3 {
+                    seq {
+                      par {
+                        upd0;
+                        upd1;
+                      }
+                      upd2;
+                      upd3;
+                    }
+                  }
+                  upd4;
+                }
               }
-              upd2;
-              upd3;
+              upd5;
             }
           }
-          upd4;
+          upd6;
         }
       }
     }
@@ -102,23 +164,23 @@ component bias_add(go: 1, clk: 1, bias0_read_data: 32, bias0_done: 1, x0_0_read_
 
 component main () -> () {
   cells {
-    x1 = prim std_mem_d2(32, 1, 64, 1, 7);
-    x = prim std_mem_d2(32, 1, 64, 1, 7);
+    x1 = prim std_mem_d4(32, 1, 64, 512, 256, 256, 1, 7, 10);
+    x = prim std_mem_d4(32, 1, 64, 512, 256, 256, 1, 7, 10);
     bias = prim std_mem_d1(32, 64, 7);
     bias_add0 = bias_add;
   }
   wires {
     group run_bias_add {
-      x.addr0 = bias_add0.x0_0_addr0;
-      bias_add0.x0_0_read_data = x.read_data;
-      x.addr1 = bias_add0.x0_0_addr1;
+      x.addr0 = bias_add0.x0_0_0_0_addr0;
+      bias_add0.x0_0_0_0_read_data = x.read_data;
+      x.addr1 = bias_add0.x0_0_0_0_addr1;
+      x.addr2 = bias_add0.x0_0_0_0_addr2;
       bias.addr0 = bias_add0.bias0_addr0;
       bias_add0.bias0_read_data = bias.read_data;
-      x1.addr0 = bias_add0.x10_0_addr0;
-      x1.addr1 = bias_add0.x10_0_addr1;
-      x1.write_data = bias_add0.x10_0_write_data;
-      x1.write_en = bias_add0.x10_0_write_en;
-      bias_add0.x10_0_done = x1.done;
+      x1.addr0 = bias_add0.x10_0_0_0_addr0;
+      x1.write_data = bias_add0.x10_0_0_0_write_data;
+      x1.write_en = bias_add0.x10_0_0_0_write_en;
+      bias_add0.x10_0_0_0_done = x1.done;
       bias_add0.go = 1'd1;
       run_bias_add[done] = bias_add0.done ? 1'd1;
     }
diff --git a/frontends/relay-futil/tests/bias_add.relay b/frontends/relay-futil/tests/bias_add.relay
index f71e872e09..84d9c6d54e 100644
--- a/frontends/relay-futil/tests/bias_add.relay
+++ b/frontends/relay-futil/tests/bias_add.relay
@@ -1,6 +1,6 @@
 v0.0.4
-fn (%x: Tensor[(1, 64), float32], %bias: Tensor[(64), float32])  {
-  let %x1: Tensor[(1, 64), float32] = nn.bias_add(%x, %bias, axis=1);
+fn (%x: Tensor[(1, 64, 512, 256), float32], %bias: Tensor[(64), float32])  {
+  let %x1: Tensor[(1, 64, 512, 256), float32] = nn.bias_add(%x, %bias, axis=1);
   %x1
 }
 

From a959fa5992de374edc0a414c7bfc8945c3d8336c Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Sat, 21 Nov 2020 13:00:13 -0500
Subject: [PATCH 27/75] Fix name ordering for visit let.

---
 frontends/relay-futil/compiler.py              |  3 ++-
 frontends/relay-futil/dahlia_functions.py      | 14 ++++++++------
 frontends/relay-futil/example.py               |  2 +-
 frontends/relay-futil/tests/expand_dims.expect |  1 -
 frontends/relay-futil/tests/expand_dims.relay  |  6 ------
 5 files changed, 11 insertions(+), 15 deletions(-)
 delete mode 100644 frontends/relay-futil/tests/expand_dims.expect
 delete mode 100644 frontends/relay-futil/tests/expand_dims.relay

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index d8afe5474b..77e23e4a6e 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -95,11 +95,12 @@ def visit_var(self, var):
                      primitive=FPrimitive(name=name, data=data, data_type=data_type, type=type))
 
     def visit_let(self, let):
-        output, body, values = self.visit(let.var), self.visit(let.body), self.visit(let.value)
+        values, output = self.visit(let.value), self.visit(let.var)
         for value in values:
             if not value.is_dahlia_declaration(): continue
             value.dahlia_declaration.output = output
             value.dahlia_declaration.invoke()
+        body = self.visit(let.body)
         return [body, values]
 
     def visit_constant(self, const):
diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 00eb769fa4..a197bb93ec 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -258,17 +258,19 @@ def negative(declaration):
 
 def expand_dims(declaration):
     """https://tvm.apache.org/docs/api/python/relay/index.html#tvm.relay.expand_dims"""
-    op1, res = declaration.inputs[0].primitive, declaration.output.primitive
-    bitwidth, size, index_size = op1.data[0], op1.data[1], op1.data[2]
+    axis, num_newaxis = declaration.attributes.get_int("axis"), declaration.attributes.get_int("num_newaxis")
+    data, res = declaration.inputs[0].primitive, declaration.output.primitive
+    bitwidth, size, index_size = data.data[0], data.data[1], data.data[2]
     size0, size1, size2 = res.data[1], res.data[2], res.data[3]
     index_size0, index_size1, index_size2 = res.data[4], res.data[5], res.data[6]
-    program = f"""
-        decl {op1.name}: {op1.data_type}<{bitwidth}>[{size}];
+    program = f"""decl {data.name}: {data.data_type}<{bitwidth}>[{size}];"""
+    if axis == 1 and num_newaxis == 2:
+        program += f"""
         decl {res.name}: {res.data_type}<{bitwidth}>[{size0}][{size1}][{size2}];
         for (let i: ubit<{index_size}> = 0..{size}) {{
-          {res.name}[i][0][0] := {op1.name}[i];
+          {res.name}[i][0][0] := {data.name}[i];
         }}
-    """
+        """
     return lower_dahlia_program(program, declaration.component_name)
 
 
diff --git a/frontends/relay-futil/example.py b/frontends/relay-futil/example.py
index 779493b1a4..f7e8526627 100644
--- a/frontends/relay-futil/example.py
+++ b/frontends/relay-futil/example.py
@@ -17,7 +17,7 @@ def tensor_subtract():
 
 
 def expand_dims():
-    x = relay.var('x', shape=[4], dtype='int32')
+    x = relay.var('x', shape=[512], dtype='int32')
     return relay.Function([x], relay.expand_dims(x, axis=1, num_newaxis=2))
 
 
diff --git a/frontends/relay-futil/tests/expand_dims.expect b/frontends/relay-futil/tests/expand_dims.expect
deleted file mode 100644
index 7a959bb71d..0000000000
--- a/frontends/relay-futil/tests/expand_dims.expect
+++ /dev/null
@@ -1 +0,0 @@
-// Does not work.
\ No newline at end of file
diff --git a/frontends/relay-futil/tests/expand_dims.relay b/frontends/relay-futil/tests/expand_dims.relay
deleted file mode 100644
index 47ae5ce31a..0000000000
--- a/frontends/relay-futil/tests/expand_dims.relay
+++ /dev/null
@@ -1,6 +0,0 @@
-v0.0.4
-fn (%x: Tensor[(4), int32]) {
-  let %x1 = expand_dims(%x, axis=1, num_newaxis=2);
-  %x1
-}
-

From 2c8b72ec6b3502b20a39acfbfaa62d45588afbd1 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Sat, 21 Nov 2020 13:22:31 -0500
Subject: [PATCH 28/75] Cleanup!

---
 frontends/relay-futil/compiler.py         | 6 +++---
 frontends/relay-futil/dahlia_functions.py | 5 +++--
 frontends/relay-futil/example.py          | 6 +++---
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index 77e23e4a6e..1459bdef64 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -136,7 +136,7 @@ def relay_transforms(expr: Function) -> Function:
         relay.transform.SimplifyInference(),
         relay.transform.InferType()
     ])
-    mod = ir.IRModule()
+    mod = ir.IRModule.from_expr(expr)
     mod['main'] = expr
     mod = transform(mod)
     return mod['main']
@@ -157,5 +157,5 @@ def compile(program) -> str:
 if __name__ == '__main__':
     import sys
 
-    relay_func = relay.fromtext(sys.stdin.read())
-    print(compile(relay_func))
+    relay_function = relay.fromtext(sys.stdin.read())
+    print(compile(relay_function))
diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index a197bb93ec..029cde5f4b 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -263,14 +263,15 @@ def expand_dims(declaration):
     bitwidth, size, index_size = data.data[0], data.data[1], data.data[2]
     size0, size1, size2 = res.data[1], res.data[2], res.data[3]
     index_size0, index_size1, index_size2 = res.data[4], res.data[5], res.data[6]
-    program = f"""decl {data.name}: {data.data_type}<{bitwidth}>[{size}];"""
     if axis == 1 and num_newaxis == 2:
-        program += f"""
+        program = f"""
+        decl {data.name}: {data.data_type}<{bitwidth}>[{size}];
         decl {res.name}: {res.data_type}<{bitwidth}>[{size0}][{size1}][{size2}];
         for (let i: ubit<{index_size}> = 0..{size}) {{
           {res.name}[i][0][0] := {data.name}[i];
         }}
         """
+    print(program)
     return lower_dahlia_program(program, declaration.component_name)
 
 
diff --git a/frontends/relay-futil/example.py b/frontends/relay-futil/example.py
index f7e8526627..95c8dd5cca 100644
--- a/frontends/relay-futil/example.py
+++ b/frontends/relay-futil/example.py
@@ -87,13 +87,13 @@ def run_example():
 
     mod_opt = tvm.IRModule.from_expr(func)
     mod_opt = seq(mod_opt)
-    func = mod_opt['main']
+    relay_IR = mod_opt['main']
     if '-r' in input:
         # Dump the Relay representation (for educational purposes).
-        print(func)
+        print(relay_IR)
     else:
         # Compile the function and print the FuTIL.
-        print(compile(func))
+        print(compile(relay_IR))
 
 
 if __name__ == '__main__':

From 77f01c488a263a6b1f7e36e47139f4859d84a678 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Sat, 21 Nov 2020 13:24:07 -0500
Subject: [PATCH 29/75] More cleanup.

---
 frontends/relay-futil/compiler.py         | 4 ++--
 frontends/relay-futil/dahlia_functions.py | 1 -
 frontends/relay-futil/example.py          | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index 1459bdef64..1857b7a0ad 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -142,7 +142,7 @@ def relay_transforms(expr: Function) -> Function:
     return mod['main']
 
 
-def compile(program) -> str:
+def lower_to_futil(program) -> str:
     """Translate a Relay function to a FuTIL program (as a string)."""
     program = relay_transforms(program)
     visitor = Relay2Futil()
@@ -158,4 +158,4 @@ def compile(program) -> str:
     import sys
 
     relay_function = relay.fromtext(sys.stdin.read())
-    print(compile(relay_function))
+    print(lower_to_futil(relay_function))
diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 029cde5f4b..0ddb778b16 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -271,7 +271,6 @@ def expand_dims(declaration):
           {res.name}[i][0][0] := {data.name}[i];
         }}
         """
-    print(program)
     return lower_dahlia_program(program, declaration.component_name)
 
 
diff --git a/frontends/relay-futil/example.py b/frontends/relay-futil/example.py
index 95c8dd5cca..c09c4f3c36 100644
--- a/frontends/relay-futil/example.py
+++ b/frontends/relay-futil/example.py
@@ -93,7 +93,7 @@ def run_example():
         print(relay_IR)
     else:
         # Compile the function and print the FuTIL.
-        print(compile(relay_IR))
+        print(lower_to_futil(relay_IR))
 
 
 if __name__ == '__main__':

From e5aa60d3e4a2d1da02dd8576535781064a843817 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Sat, 21 Nov 2020 14:22:34 -0500
Subject: [PATCH 30/75] Add element-wise with a single value.

---
 frontends/relay-futil/compiler.py             |  5 ++--
 frontends/relay-futil/dahlia_functions.py     | 28 +++++++++++++------
 frontends/relay-futil/example.py              |  7 ++++-
 frontends/relay-futil/pretty_print.py         |  5 ++--
 frontends/relay-futil/tests/add.relay         |  5 ++--
 frontends/relay-futil/tests/data/sub.expect   | 18 ++++++++----
 frontends/relay-futil/tests/data/sub.relay    |  7 +++--
 .../relay-futil/tests/data/sub.relay.data     | 12 ++++----
 8 files changed, 56 insertions(+), 31 deletions(-)

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index 1857b7a0ad..3a60bced51 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -96,14 +96,15 @@ def visit_var(self, var):
 
     def visit_let(self, let):
         values, output = self.visit(let.value), self.visit(let.var)
+        if not isinstance(values, list): return [self.visit(let.body), values]
         for value in values:
             if not value.is_dahlia_declaration(): continue
             value.dahlia_declaration.output = output
             value.dahlia_declaration.invoke()
-        body = self.visit(let.body)
-        return [body, values]
+        return [self.visit(let.body), values]
 
     def visit_constant(self, const):
+        # Note: We're currently treating constants defined in a `let` statement in Relay IR as 1D Memory.
         type, shape = const.data.dtype, const.data.shape
         name, data = self.id("const"), [get_bitwidth(type), int(const.data.asnumpy())]
         data_type = get_memory_parameters(type)
diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 0ddb778b16..ba4c903aee 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -47,14 +47,26 @@ def lower_dahlia_program(prog, component_name):
 
 def tensor1d_op(declaration):
     op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
-    bitwidth, size, index_size = op1.data[0], op1.data[1], op1.data[2]
-    program = f"""
-    decl {op1.name}: {op1.data_type}<{bitwidth}>[{size}];
-    decl {op2.name}: {op2.data_type}<{bitwidth}>[{size}];
-    decl {res.name}: {res.data_type}<{bitwidth}>[{size}];
-    for (let i: ubit<{index_size}> = 0..{size}) {{
-      {res.name}[i] := {op1.name}[i] {declaration.op} {op2.name}[i];
-    }}"""
+    bitwidth, size, index_size, op2_size = op1.data[0], op1.data[1], op1.data[2], op2.data[1]
+    if op2_size != size:
+        # Element-wise operation using a single value, e.g.
+        # let %a = 42;
+        # let %c = add(%b: Tensor[(512)], %a);
+        program = f"""
+        decl {op1.name}: {op1.data_type}<{bitwidth}>[{size}];
+        decl {op2.name}: {op2.data_type}<{bitwidth}>[{op2_size}];
+        decl {res.name}: {res.data_type}<{bitwidth}>[{size}];
+        for (let i: ubit<{index_size}> = 0..{size}) {{
+            {res.name}[i] := {op1.name}[i] {declaration.op} {op2.name}[0];
+        }}"""
+    else:
+        program = f"""
+        decl {op1.name}: {op1.data_type}<{bitwidth}>[{size}];
+        decl {op2.name}: {op2.data_type}<{bitwidth}>[{size}];
+        decl {res.name}: {res.data_type}<{bitwidth}>[{size}];
+        for (let i: ubit<{index_size}> = 0..{size}) {{
+          {res.name}[i] := {op1.name}[i] {declaration.op} {op2.name}[i];
+        }}"""
     return lower_dahlia_program(program, declaration.component_name)
 
 
diff --git a/frontends/relay-futil/example.py b/frontends/relay-futil/example.py
index c09c4f3c36..116f91b887 100644
--- a/frontends/relay-futil/example.py
+++ b/frontends/relay-futil/example.py
@@ -10,6 +10,11 @@ def add():
     return relay.Function([x, y], relay.add(x, y))
 
 
+def add2():
+    x = relay.var('x', shape=(), dtype="int32")
+    return relay.Function([x], relay.add(x, relay.const(42)))
+
+
 def tensor_subtract():
     x = relay.var("x", relay.TensorType((2, 4), "int32"))
     y = relay.var("y", relay.TensorType((2, 4), "int32"))
@@ -56,7 +61,7 @@ def vgg_net():
                        batch_norm=True)
 
 
-ALL_FUNCS = [add, tensor_subtract, expand_dims, batch_flatten, batch_matmul, bias_add, relu, mlp_net, vgg_net]
+ALL_FUNCS = [add, add2, tensor_subtract, expand_dims, batch_flatten, batch_matmul, bias_add, relu, mlp_net, vgg_net]
 FUNC_NAMES = list(map(lambda x: x.__name__, ALL_FUNCS))
 
 
diff --git a/frontends/relay-futil/pretty_print.py b/frontends/relay-futil/pretty_print.py
index 6c65dea0dc..8a2b13a1d9 100644
--- a/frontends/relay-futil/pretty_print.py
+++ b/frontends/relay-futil/pretty_print.py
@@ -73,10 +73,9 @@ def pp_component(component: FComponent):
 def pp_cell(cell: FCell):
     if cell.is_primitive():
         data = cell.primitive.data
-        data_type = cell.primitive.data_type
-        if data_type == 'ubit' or data_type == 'bit': bitwidth = str(data[0])
+        data_type, bitwidth = cell.primitive.data_type, data[0]
         # `fix` / `ufix` will have bitwidth form: <TotalWidth, FractWidth>. We only want TotalWidth.
-        if data_type == 'ufix' or data_type == 'fix': bitwidth = str(data[0]).split(',')[0]
+        if data_type == 'ufix' or data_type == 'fix': bitwidth = str(bitwidth).split(',')[0]
         if cell.primitive.type == PrimitiveType.Register:
             return f'{cell.primitive.name} = prim std_reg({bitwidth});'
         if cell.primitive.type == PrimitiveType.Constant:
diff --git a/frontends/relay-futil/tests/add.relay b/frontends/relay-futil/tests/add.relay
index 378c62b4fd..51e09388ed 100644
--- a/frontends/relay-futil/tests/add.relay
+++ b/frontends/relay-futil/tests/add.relay
@@ -1,5 +1,6 @@
 v0.0.4
-fn (%x: int32, %y: int32) {
-  let %z = add(%x, %y);
+fn (%x: Tensor[(512), int32]) {
+  let %l: int32 = 42;
+  let %z = add(%x, %l);
   %z
 }
diff --git a/frontends/relay-futil/tests/data/sub.expect b/frontends/relay-futil/tests/data/sub.expect
index e313c7824d..523f8c2430 100644
--- a/frontends/relay-futil/tests/data/sub.expect
+++ b/frontends/relay-futil/tests/data/sub.expect
@@ -1,11 +1,17 @@
 {
-  "a": [
-    49
+  "x": [
+    16,
+    16,
+    16,
+    16
   ],
-  "b": [
-    7
+  "x1": [
+    8,
+    8,
+    8,
+    8
   ],
-  "c": [
-    42
+  "y": [
+    8
   ]
 }
diff --git a/frontends/relay-futil/tests/data/sub.relay b/frontends/relay-futil/tests/data/sub.relay
index 0e0df9fb24..f73c4da85d 100644
--- a/frontends/relay-futil/tests/data/sub.relay
+++ b/frontends/relay-futil/tests/data/sub.relay
@@ -1,5 +1,6 @@
 v0.0.4
-fn (%a: int32, %b: int32) {
-  let %c = subtract(%a, %b);
-  %c
+fn (%x: Tensor[(4), int32]) {
+  let %y = 8;
+  let %x1 = subtract(%x, %y);
+  %x1
 }
diff --git a/frontends/relay-futil/tests/data/sub.relay.data b/frontends/relay-futil/tests/data/sub.relay.data
index 219d0fa867..6f7ef00e0f 100644
--- a/frontends/relay-futil/tests/data/sub.relay.data
+++ b/frontends/relay-futil/tests/data/sub.relay.data
@@ -1,14 +1,14 @@
 {
-  "a": {
-    "data": [49],
+  "x": {
+    "data": [16, 16, 16, 16],
     "bitwidth": 32
   },
-  "b": {
-      "data": [7],
+  "y": {
+      "data": [8],
       "bitwidth": 32
     },
-    "c": {
-      "data": [0],
+    "x1": {
+      "data": [0, 0, 0, 0],
       "bitwidth": 32
     }
 }
\ No newline at end of file

From 5a317313e10dc6163f3573fd6d8b566458c9e457 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Sat, 21 Nov 2020 16:53:08 -0500
Subject: [PATCH 31/75] Revert add.

---
 frontends/relay-futil/tests/add.relay | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/frontends/relay-futil/tests/add.relay b/frontends/relay-futil/tests/add.relay
index 51e09388ed..378c62b4fd 100644
--- a/frontends/relay-futil/tests/add.relay
+++ b/frontends/relay-futil/tests/add.relay
@@ -1,6 +1,5 @@
 v0.0.4
-fn (%x: Tensor[(512), int32]) {
-  let %l: int32 = 42;
-  let %z = add(%x, %l);
+fn (%x: int32, %y: int32) {
+  let %z = add(%x, %y);
   %z
 }

From 6a9f170fc8aedad4e691369a226327d18395aad5 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Sat, 21 Nov 2020 22:04:42 -0500
Subject: [PATCH 32/75] Fix PP for D4.

---
 frontends/relay-futil/dahlia_functions.py | 59 +++++++++++++++--------
 frontends/relay-futil/example.py          |  8 +--
 frontends/relay-futil/pretty_print.py     |  8 +--
 3 files changed, 44 insertions(+), 31 deletions(-)

diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index ba4c903aee..0b42dfa13f 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -48,24 +48,24 @@ def lower_dahlia_program(prog, component_name):
 def tensor1d_op(declaration):
     op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
     bitwidth, size, index_size, op2_size = op1.data[0], op1.data[1], op1.data[2], op2.data[1]
-    if op2_size != size:
-        # Element-wise operation using a single value, e.g.
-        # let %a = 42;
-        # let %c = add(%b: Tensor[(512)], %a);
+    if size == op2_size:
         program = f"""
         decl {op1.name}: {op1.data_type}<{bitwidth}>[{size}];
-        decl {op2.name}: {op2.data_type}<{bitwidth}>[{op2_size}];
+        decl {op2.name}: {op2.data_type}<{bitwidth}>[{size}];
         decl {res.name}: {res.data_type}<{bitwidth}>[{size}];
         for (let i: ubit<{index_size}> = 0..{size}) {{
-            {res.name}[i] := {op1.name}[i] {declaration.op} {op2.name}[0];
+          {res.name}[i] := {op1.name}[i] {declaration.op} {op2.name}[i];
         }}"""
     else:
+        # Broadcasting using a single value, e.g.
+        #   let %a = 42;
+        #   let %c = add(%b: Tensor[(512)], %a);
         program = f"""
         decl {op1.name}: {op1.data_type}<{bitwidth}>[{size}];
-        decl {op2.name}: {op2.data_type}<{bitwidth}>[{size}];
+        decl {op2.name}: {op2.data_type}<{bitwidth}>[{op2_size}];
         decl {res.name}: {res.data_type}<{bitwidth}>[{size}];
         for (let i: ubit<{index_size}> = 0..{size}) {{
-          {res.name}[i] := {op1.name}[i] {declaration.op} {op2.name}[i];
+            {res.name}[i] := {op1.name}[i] {declaration.op} {op2.name}[0];
         }}"""
     return lower_dahlia_program(program, declaration.component_name)
 
@@ -107,19 +107,38 @@ def tensor4d_op(declaration):
     op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
     bitwidth, size0, size1, size2, size3 = op1.data[0], op1.data[1], op1.data[2], op1.data[3], op1.data[4]
     index_size0, index_size1, index_size2, index_size3 = op1.data[5], op1.data[6], op1.data[7], op1.data[8]
-    program = f"""
-    decl {op1.name}: {op1.data_type}<{bitwidth}>[{size0}][{size1}][{size2}][{size3}];
-    decl {op2.name}: {op2.data_type}<{bitwidth}>[{size0}][{size1}][{size2}][{size3}];
-    decl {res.name}: {res.data_type}<{bitwidth}>[{size0}][{size1}][{size2}][{size3}];
-    for (let i: ubit<{index_size0}> = 0..{size0}) {{
-      for (let j: ubit<{index_size1}> = 0..{size1}) {{
-        for (let k: ubit<{index_size2}> = 0..{size2}) {{
-          for (let l: ubit<{index_size3}> = 0..{size3}) {{
-            {res.name}[i][j][k][l] := {op1.name}[i][j][k][l] {declaration.op} {op2.name}[i][j][k][l];
+    if op1.type == op2.type:
+        program = f"""
+        decl {op1.name}: {op1.data_type}<{bitwidth}>[{size0}][{size1}][{size2}][{size3}];
+        decl {op2.name}: {op2.data_type}<{bitwidth}>[{size0}][{size1}][{size2}][{size3}];
+        decl {res.name}: {res.data_type}<{bitwidth}>[{size0}][{size1}][{size2}][{size3}];
+        for (let i: ubit<{index_size0}> = 0..{size0}) {{
+          for (let j: ubit<{index_size1}> = 0..{size1}) {{
+            for (let k: ubit<{index_size2}> = 0..{size2}) {{
+              for (let l: ubit<{index_size3}> = 0..{size3}) {{
+                {res.name}[i][j][k][l] := {op1.name}[i][j][k][l] {declaration.op} {op2.name}[i][j][k][l];
+              }}
+            }}
           }}
-        }}
-      }}
-    }}"""
+        }}"""
+    else:  # Broadcasting.
+        op2_size0, op2_size1, op2_size2 = op2.data[1], op2.data[2], op2.data[3]
+        op2_index_size0, op2_index_size1, op2_index_size2 = op2.data[4], op2.data[5], op2.data[6]  # 3D memory: data[1..3] are sizes, data[4..6] are index sizes.
+        # TODO(cgyurgyik): This is defaulted to: `Tensor(X, Y, 1, 1) op Tensor(Y, 1, 1)` for VGG Net.
+        # This should be generalized.
+        program = f"""
+        decl {op1.name}: {op1.data_type}<{bitwidth}>[{size0}][{size1}][{size2}][{size3}];
+        decl {op2.name}: {op2.data_type}<{bitwidth}>[{op2_size0}][{op2_size1}][{op2_size2}];
+        decl {res.name}: {res.data_type}<{bitwidth}>[{size0}][{size1}][{size2}][{size3}];
+        for (let i: ubit<{index_size0}> = 0..{size0}) {{
+          for (let j: ubit<{index_size1}> = 0..{size1}) {{
+            for (let k: ubit<{index_size2}> = 0..{size2}) {{
+              for (let l: ubit<{index_size3}> = 0..{size3}) {{
+                {res.name}[i][j][k][l] := {op1.name}[i][j][k][l] {declaration.op} {op2.name}[j][0][0];
+              }}
+            }}
+          }}
+        }}"""
     return lower_dahlia_program(program, declaration.component_name)
 
 
diff --git a/frontends/relay-futil/example.py b/frontends/relay-futil/example.py
index 116f91b887..45e18b387c 100644
--- a/frontends/relay-futil/example.py
+++ b/frontends/relay-futil/example.py
@@ -9,12 +9,6 @@ def add():
     y = relay.var('y', shape=(), dtype="int32")
     return relay.Function([x, y], relay.add(x, y))
 
-
-def add2():
-    x = relay.var('x', shape=(), dtype="int32")
-    return relay.Function([x], relay.add(x, relay.const(42)))
-
-
 def tensor_subtract():
     x = relay.var("x", relay.TensorType((2, 4), "int32"))
     y = relay.var("y", relay.TensorType((2, 4), "int32"))
@@ -61,7 +55,7 @@ def vgg_net():
                        batch_norm=True)
 
 
-ALL_FUNCS = [add, add2, tensor_subtract, expand_dims, batch_flatten, batch_matmul, bias_add, relu, mlp_net, vgg_net]
+ALL_FUNCS = [add, tensor_subtract, expand_dims, batch_flatten, batch_matmul, bias_add, relu, mlp_net, vgg_net]
 FUNC_NAMES = list(map(lambda x: x.__name__, ALL_FUNCS))
 
 
diff --git a/frontends/relay-futil/pretty_print.py b/frontends/relay-futil/pretty_print.py
index 8a2b13a1d9..6c5b03dbd5 100644
--- a/frontends/relay-futil/pretty_print.py
+++ b/frontends/relay-futil/pretty_print.py
@@ -106,10 +106,10 @@ def pp_cell(cell: FCell):
             size1 = str(data[2])
             size2 = str(data[3])
             size3 = str(data[4])
-            index_size0 = str(data[4])
-            index_size1 = str(data[5])
-            index_size2 = str(data[6])
-            index_size3 = str(data[7])
+            index_size0 = str(data[5])
+            index_size1 = str(data[6])
+            index_size2 = str(data[7])
+            index_size3 = str(data[8])
             return f'{cell.primitive.name} = prim std_mem_d4({bitwidth}, ' \
                    f'{size0}, {size1}, {size2}, {size3}, {index_size0}, {index_size1}, {index_size2}, {index_size3});'
         if cell.primitive.type == PrimitiveType.BinOp:

From 655a439a128f1e121c42e3d82fb930e81248e849 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Sat, 21 Nov 2020 22:08:11 -0500
Subject: [PATCH 33/75] Fix 4d case.

---
 frontends/relay-futil/tests/bias_add.expect | 4 ++--
 frontends/relay-futil/tests/relu.expect     | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/frontends/relay-futil/tests/bias_add.expect b/frontends/relay-futil/tests/bias_add.expect
index 562f55dfe0..620da35d44 100644
--- a/frontends/relay-futil/tests/bias_add.expect
+++ b/frontends/relay-futil/tests/bias_add.expect
@@ -164,8 +164,8 @@ component bias_add(go: 1, clk: 1, bias0_read_data: 32, bias0_done: 1, x0_0_0_0_r
 
 component main () -> () {
   cells {
-    x1 = prim std_mem_d4(32, 1, 64, 512, 256, 256, 1, 7, 10);
-    x = prim std_mem_d4(32, 1, 64, 512, 256, 256, 1, 7, 10);
+    x1 = prim std_mem_d4(32, 1, 64, 512, 256, 1, 7, 10, 9);
+    x = prim std_mem_d4(32, 1, 64, 512, 256, 1, 7, 10, 9);
     bias = prim std_mem_d1(32, 64, 7);
     bias_add0 = bias_add;
   }
diff --git a/frontends/relay-futil/tests/relu.expect b/frontends/relay-futil/tests/relu.expect
index a0998dd229..7c2ac6e96a 100644
--- a/frontends/relay-futil/tests/relu.expect
+++ b/frontends/relay-futil/tests/relu.expect
@@ -192,8 +192,8 @@ component relu(go: 1, clk: 1, x0_0_0_0_read_data: 32, x0_0_0_0_done: 1, x10_0_0_
 
 component main () -> () {
   cells {
-    x1 = prim std_mem_d4(32, 2, 4, 8, 32, 32, 2, 3, 4);
-    x = prim std_mem_d4(32, 2, 4, 8, 32, 32, 2, 3, 4);
+    x1 = prim std_mem_d4(32, 2, 4, 8, 32, 2, 3, 4, 6);
+    x = prim std_mem_d4(32, 2, 4, 8, 32, 2, 3, 4, 6);
     relu0 = relu;
   }
   wires {

From caf01128e753ee8f01ea2f6e773d20bc7007012b Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Sun, 22 Nov 2020 08:19:25 -0500
Subject: [PATCH 34/75] Remove extra comma.

---
 primitives/std.lib | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/primitives/std.lib b/primitives/std.lib
index 7f4d887efa..ecd352640a 100644
--- a/primitives/std.lib
+++ b/primitives/std.lib
@@ -187,7 +187,7 @@ primitive std_mem_d4[
         parameter d0_idx_size = 4,
         parameter d1_idx_size = 4,
         parameter d2_idx_size = 4,
-        parameter d3_idx_size = 4,)
+        parameter d3_idx_size = 4)
        (input logic [d0_idx_size-1:0] addr0,
         input logic [d1_idx_size-1:0] addr1,
         input logic [d2_idx_size-1:0] addr2,

From f0496458224dd37bc32972217e21f7ef945764bc Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Sun, 22 Nov 2020 08:35:21 -0500
Subject: [PATCH 35/75] Simplify expr.

---
 frontends/relay-futil/compiler.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index 3a60bced51..6026dcfcb2 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -96,11 +96,11 @@ def visit_var(self, var):
 
     def visit_let(self, let):
         values, output = self.visit(let.value), self.visit(let.var)
-        if not isinstance(values, list): return [self.visit(let.body), values]
-        for value in values:
-            if not value.is_dahlia_declaration(): continue
-            value.dahlia_declaration.output = output
-            value.dahlia_declaration.invoke()
+        if isinstance(values, list):
+            for value in values:
+                if not value.is_dahlia_declaration(): continue
+                value.dahlia_declaration.output = output
+                value.dahlia_declaration.invoke()
         return [self.visit(let.body), values]
 
     def visit_constant(self, const):

From 318b6f5d84696bcee05898a5f1b50f7edde144f0 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Sun, 22 Nov 2020 08:37:17 -0500
Subject: [PATCH 36/75] Change to dahlia_name.

---
 frontends/relay-futil/compiler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index 6026dcfcb2..125d0d8092 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -54,7 +54,7 @@ def relay_id(self, name):
         if id_number == 0: return name
         return name + str(id_number)
 
-    def produce_dahlia_name(self, name, type):
+    def dahlia_name(self, name, type):
         """
         Dahlia uses the following naming scheme for an arbitrary variable 'X':
         Memory1D: 'X0', 'X1', 'X2', ...
@@ -90,7 +90,7 @@ def visit_var(self, var):
         # Do not add duplicate primitives to main.
         if self.main.contains_primitive(name): return cell
         data, type, data_type = get_memory_parameters(var.type_annotation)
-        dahlia_name = self.produce_dahlia_name(name, type)
+        dahlia_name = self.dahlia_name(name, type)
         return FCell(dahlia_name=dahlia_name,
                      primitive=FPrimitive(name=name, data=data, data_type=data_type, type=type))
 

From 6305a0f5e399c38b3ed41cd2cadce2fa1cade79b Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Sun, 22 Nov 2020 08:48:20 -0500
Subject: [PATCH 37/75] Clean up PP.

---
 frontends/relay-futil/pretty_print.py | 31 +++++++--------------------
 1 file changed, 8 insertions(+), 23 deletions(-)

diff --git a/frontends/relay-futil/pretty_print.py b/frontends/relay-futil/pretty_print.py
index 6c5b03dbd5..f711e78546 100644
--- a/frontends/relay-futil/pretty_print.py
+++ b/frontends/relay-futil/pretty_print.py
@@ -14,8 +14,7 @@ def mk_block(decl, contents, indent=2):
 
 def pp_component_signature(component: FComponent):
     inputs = []
-    if component.signature == None:
-        return "", ""
+    if component.signature == None: return "", ""
 
     for input in component.signature.inputs:
         inputs.append(f'{input.name}: {input.bitwidth}')
@@ -74,7 +73,7 @@ def pp_cell(cell: FCell):
     if cell.is_primitive():
         data = cell.primitive.data
         data_type, bitwidth = cell.primitive.data_type, data[0]
-        # `fix` / `ufix` will have bitwidth form: <TotalWidth, FractWidth>. We only want TotalWidth.
+        # `fix` / `ufix` will have bitwidth in the form: <TotalWidth, FractWidth>. We only want TotalWidth.
         if data_type == 'ufix' or data_type == 'fix': bitwidth = str(bitwidth).split(',')[0]
         if cell.primitive.type == PrimitiveType.Register:
             return f'{cell.primitive.name} = prim std_reg({bitwidth});'
@@ -82,34 +81,20 @@ def pp_cell(cell: FCell):
             value = str(data[1])
             return f'{cell.primitive.name} = prim std_const({bitwidth}, {value});'
         if cell.primitive.type == PrimitiveType.Memory1D:
-            size = str(data[1])
-            index_size = str(data[2])
+            size, index_size = str(data[1]), str(data[2])
             return f'{cell.primitive.name} = prim std_mem_d1({bitwidth}, {size}, {index_size});'
         if cell.primitive.type == PrimitiveType.Memory2D:
-            size0 = str(data[1])
-            size1 = str(data[2])
-            index_size0 = str(data[3])
-            index_size1 = str(data[4])
+            size0, size1, index_size0, index_size1 = str(data[1]), str(data[2]), str(data[3]), str(data[4])
             return f'{cell.primitive.name} = prim std_mem_d2({bitwidth}, ' \
                    f'{size0}, {size1}, {index_size0}, {index_size1});'
         if cell.primitive.type == PrimitiveType.Memory3D:
-            size0 = str(data[1])
-            size1 = str(data[2])
-            size2 = str(data[3])
-            index_size0 = str(data[4])
-            index_size1 = str(data[5])
-            index_size2 = str(data[6])
+            size0, size1, size2 = str(data[1]), str(data[2]), str(data[3])
+            index_size0, index_size1, index_size2 = str(data[4]), str(data[5]), str(data[6])
             return f'{cell.primitive.name} = prim std_mem_d3({bitwidth}, ' \
                    f'{size0}, {size1}, {size2}, {index_size0}, {index_size1}, {index_size2});'
         if cell.primitive.type == PrimitiveType.Memory4D:
-            size0 = str(data[1])
-            size1 = str(data[2])
-            size2 = str(data[3])
-            size3 = str(data[4])
-            index_size0 = str(data[5])
-            index_size1 = str(data[6])
-            index_size2 = str(data[7])
-            index_size3 = str(data[8])
+            size0, size1, size2, size3 = str(data[1]), str(data[2]), str(data[3]), str(data[4])
+            index_size0, index_size1, index_size2, index_size3 = str(data[5]), str(data[6]), str(data[7]), str(data[8])
             return f'{cell.primitive.name} = prim std_mem_d4({bitwidth}, ' \
                    f'{size0}, {size1}, {size2}, {size3}, {index_size0}, {index_size1}, {index_size2}, {index_size3});'
         if cell.primitive.type == PrimitiveType.BinOp:

From a99384f54421c4da425d96089a4fdef25709a2d1 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Sun, 22 Nov 2020 08:51:22 -0500
Subject: [PATCH 38/75] Place mapping outside of function call.

---
 frontends/relay-futil/compiler.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index 125d0d8092..ae328b4144 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -20,6 +20,11 @@
 RelayFunctionCalls = {'nn.batch_flatten': batch_flatten, 'nn.batch_matmul': batch_matmul,
                       'nn.bias_add': bias_add, 'nn.relu': relu, 'negative': negative, 'expand_dims': expand_dims}
 
+# Mapping between primitive type and associated Dahlia name extension.
+# E.g. A 2D memory array named `A` will be lowered to `A_0`.
+DahliaNameExtension = {PrimitiveType.Memory1D: '', PrimitiveType.Memory2D: '_0',
+                       PrimitiveType.Memory3D: '_0_0', PrimitiveType.Memory4D: '_0_0_0'}
+
 
 class Relay2Futil(ExprFunctor):
     """The main compilation visitor."""
@@ -61,11 +66,9 @@ def dahlia_name(self, name, type):
         Memory2D: 'X0_0', 'X1_0', 'X2_0', ...
         Memory3D: 'X0_0_0', 'X1_0_0', 'X2_0_0', ...
         """
-        DahliaNameMapping = {PrimitiveType.Memory1D: '', PrimitiveType.Memory2D: '_0',
-                             PrimitiveType.Memory3D: '_0_0', PrimitiveType.Memory4D: '_0_0_0'}
         dahlia_name = self.id(name)
-        assert type in DahliaNameMapping, f'{name} with {type} is not supported yet.'
-        return dahlia_name + DahliaNameMapping[type]
+        assert type in DahliaNameExtension, f'{name} with {type} is not supported yet.'
+        return dahlia_name + DahliaNameExtension[type]
 
     def get_dahlia_declaration(self, function_name, cells, args, attrs):
         """

From 3dd6a44debd0b67deec76301ae7fa5a73db2bd9b Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Sun, 22 Nov 2020 14:58:19 -0500
Subject: [PATCH 39/75] Add support for dense operator.

---
 frontends/relay-futil/tests/data/dense.expect |  43 +++
 frontends/relay-futil/tests/data/dense.relay  |   5 +
 .../relay-futil/tests/data/dense.relay.data   |  22 ++
 frontends/relay-futil/tests/dense.expect      | 334 ++++++++++++++++++
 frontends/relay-futil/tests/dense.relay       |   5 +
 5 files changed, 409 insertions(+)
 create mode 100644 frontends/relay-futil/tests/data/dense.expect
 create mode 100644 frontends/relay-futil/tests/data/dense.relay
 create mode 100644 frontends/relay-futil/tests/data/dense.relay.data
 create mode 100644 frontends/relay-futil/tests/dense.expect
 create mode 100644 frontends/relay-futil/tests/dense.relay

diff --git a/frontends/relay-futil/tests/data/dense.expect b/frontends/relay-futil/tests/data/dense.expect
new file mode 100644
index 0000000000..279a161a27
--- /dev/null
+++ b/frontends/relay-futil/tests/data/dense.expect
@@ -0,0 +1,43 @@
+{
+  "temporary_x20_0": [
+    3,
+    3
+  ],
+  "transpose_x10_0": [
+    [
+      1,
+      1
+    ],
+    [
+      1,
+      1
+    ],
+    [
+      1,
+      1
+    ]
+  ],
+  "x": [
+    [
+      1,
+      1,
+      1
+    ]
+  ],
+  "x1": [
+    [
+      1,
+      1,
+      1
+    ],
+    [
+      1,
+      1,
+      1
+    ]
+  ],
+  "x2": [
+    3,
+    3
+  ]
+}
diff --git a/frontends/relay-futil/tests/data/dense.relay b/frontends/relay-futil/tests/data/dense.relay
new file mode 100644
index 0000000000..edf56c4d30
--- /dev/null
+++ b/frontends/relay-futil/tests/data/dense.relay
@@ -0,0 +1,5 @@
+v0.0.4
+fn (%x: Tensor[(1, 3), int32], %x1: Tensor[(2, 3), int32]) -> Tensor[(1, 2), int32] {
+  let %x2: Tensor[(1, 2), int32] = nn.dense(%x, %x1) /* ty=Tensor[(1, 2), int32] */;
+  %x2
+}
diff --git a/frontends/relay-futil/tests/data/dense.relay.data b/frontends/relay-futil/tests/data/dense.relay.data
new file mode 100644
index 0000000000..5ac288db59
--- /dev/null
+++ b/frontends/relay-futil/tests/data/dense.relay.data
@@ -0,0 +1,22 @@
+{
+  "x": {
+   "data": [[1, 1, 1]],
+   "bitwidth": 32
+  },
+  "x1": {
+   "data": [[1, 1, 1], [1, 1, 1]],
+   "bitwidth": 32
+  },
+  "x2": {
+   "data": [0, 0],
+   "bitwidth": 32
+  },
+  "temporary_x20_0": {
+   "data": [0, 0],
+   "bitwidth": 32
+  },
+  "transpose_x10_0": {
+   "data": [[0, 0], [0, 0], [0, 0]],
+   "bitwidth": 32
+  }
+}
\ No newline at end of file
diff --git a/frontends/relay-futil/tests/dense.expect b/frontends/relay-futil/tests/dense.expect
new file mode 100644
index 0000000000..a0d5ead2b1
--- /dev/null
+++ b/frontends/relay-futil/tests/dense.expect
@@ -0,0 +1,334 @@
+import "primitives/std.lib";
+
+component dense(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1, y0_0_read_data: 32, y0_0_done: 1) -> (done: 1, x0_0_addr0: 1, x0_0_addr1: 13, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 4, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1, y0_0_addr0: 4, y0_0_addr1: 13, y0_0_write_data: 32, y0_0_write_en: 1, y0_0_clk: 1) {
+  cells {
+    add0 = prim std_add(13);
+    add1 = prim std_add(4);
+    add2 = prim std_add(32);
+    add3 = prim std_add(13);
+    add4 = prim std_add(4);
+    add5 = prim std_add(1);
+    add6 = prim std_add(4);
+    add7 = prim std_add(1);
+    bin_read0_0 = prim std_reg(32);
+    const0 = prim std_const(4, 0);
+    const1 = prim std_const(4, 9);
+    const10 = prim std_const(13, 0);
+    const11 = prim std_const(13, 4095);
+    const12 = prim std_const(13, 1);
+    const13 = prim std_const(4, 1);
+    const14 = prim std_const(1, 1);
+    const15 = prim std_const(1, 0);
+    const16 = prim std_const(1, 0);
+    const17 = prim std_const(4, 0);
+    const18 = prim std_const(4, 9);
+    const19 = prim std_const(4, 1);
+    const2 = prim std_const(13, 0);
+    const20 = prim std_const(1, 1);
+    const3 = prim std_const(13, 4095);
+    const4 = prim std_const(13, 1);
+    const5 = prim std_const(4, 1);
+    const6 = prim std_const(1, 0);
+    const7 = prim std_const(1, 0);
+    const8 = prim std_const(4, 0);
+    const9 = prim std_const(4, 9);
+    i0 = prim std_reg(4);
+    i1 = prim std_reg(1);
+    i2 = prim std_reg(1);
+    j0 = prim std_reg(13);
+    j1 = prim std_reg(4);
+    j2 = prim std_reg(4);
+    k0 = prim std_reg(13);
+    le0 = prim std_le(4);
+    le1 = prim std_le(13);
+    le2 = prim std_le(1);
+    le3 = prim std_le(4);
+    le4 = prim std_le(13);
+    le5 = prim std_le(1);
+    le6 = prim std_le(4);
+    mult_pipe0 = prim std_mult_pipe(32);
+    product_0 = prim std_reg(32);
+    temporary_x10_0 = prim std_mem_d2(32, 1, 10, 1, 4);
+    temporary_x1_read0_0 = prim std_reg(32);
+    transpose_y0_0 = prim std_mem_d2(32, 4096, 10, 13, 4);
+    transpose_y_read0_0 = prim std_reg(32);
+    x_read0_0 = prim std_reg(32);
+    y_read0_0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = j0.out;
+      le1.right = const3.out;
+    }
+    group cond2<"static"=0> {
+      cond2[done] = 1'd1;
+      le2.left = i1.out;
+      le2.right = const7.out;
+    }
+    group cond3<"static"=0> {
+      cond3[done] = 1'd1;
+      le3.left = j1.out;
+      le3.right = const9.out;
+    }
+    group cond4<"static"=0> {
+      cond4[done] = 1'd1;
+      le4.left = k0.out;
+      le4.right = const11.out;
+    }
+    group cond5<"static"=0> {
+      cond5[done] = 1'd1;
+      le5.left = i2.out;
+      le5.right = const16.out;
+    }
+    group cond6<"static"=0> {
+      cond6[done] = 1'd1;
+      le6.left = j2.out;
+      le6.right = const18.out;
+    }
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
+    }
+    group let1<"static"=1> {
+      j0.in = const2.out;
+      j0.write_en = 1'd1;
+      let1[done] = j0.done;
+    }
+    group let2<"static"=1> {
+      i1.in = const6.out;
+      i1.write_en = 1'd1;
+      let2[done] = i1.done;
+    }
+    group let3<"static"=1> {
+      j1.in = const8.out;
+      j1.write_en = 1'd1;
+      let3[done] = j1.done;
+    }
+    group let4<"static"=1> {
+      k0.in = const10.out;
+      k0.write_en = 1'd1;
+      let4[done] = k0.done;
+    }
+    group let5<"static"=4> {
+      bin_read0_0.in = mult_pipe0.out;
+      bin_read0_0.write_en = mult_pipe0.done;
+      let5[done] = bin_read0_0.done;
+      mult_pipe0.left = x_read0_0.out;
+      mult_pipe0.right = transpose_y_read0_0.out;
+      mult_pipe0.go = !mult_pipe0.done ? 1'd1;
+    }
+    group let6<"static"=1> {
+      product_0.in = bin_read0_0.out;
+      product_0.write_en = 1'd1;
+      let6[done] = product_0.done;
+    }
+    group let7<"static"=1> {
+      i2.in = const15.out;
+      i2.write_en = 1'd1;
+      let7[done] = i2.done;
+    }
+    group let8<"static"=1> {
+      j2.in = const17.out;
+      j2.write_en = 1'd1;
+      let8[done] = j2.done;
+    }
+    group upd0<"static"=1> {
+      y_read0_0.write_en = 1'd1;
+      y0_0_addr1 = j0.out;
+      y0_0_addr0 = i0.out;
+      y_read0_0.in = 1'd1 ? y0_0_read_data;
+      upd0[done] = y_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      transpose_y0_0.addr1 = i0.out;
+      transpose_y0_0.addr0 = j0.out;
+      transpose_y0_0.write_en = 1'd1;
+      transpose_y0_0.write_data = 1'd1 ? y_read0_0.out;
+      upd1[done] = transpose_y0_0.done ? 1'd1;
+    }
+    group upd10<"static"=1> {
+      temporary_x1_read0_0.write_en = 1'd1;
+      temporary_x10_0.addr1 = j2.out;
+      temporary_x10_0.addr0 = i2.out;
+      temporary_x1_read0_0.in = 1'd1 ? temporary_x10_0.read_data;
+      upd10[done] = temporary_x1_read0_0.done ? 1'd1;
+    }
+    group upd11<"static"=1> {
+      x10_0_addr1 = j2.out;
+      x10_0_addr0 = i2.out;
+      x10_0_write_en = 1'd1;
+      x10_0_write_data = 1'd1 ? temporary_x1_read0_0.out;
+      upd11[done] = x10_0_done ? 1'd1;
+    }
+    group upd12<"static"=1> {
+      j2.write_en = 1'd1;
+      add6.left = j2.out;
+      add6.right = const19.out;
+      j2.in = 1'd1 ? add6.out;
+      upd12[done] = j2.done ? 1'd1;
+    }
+    group upd13<"static"=1> {
+      i2.write_en = 1'd1;
+      add7.left = i2.out;
+      add7.right = const20.out;
+      i2.in = 1'd1 ? add7.out;
+      upd13[done] = i2.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      j0.write_en = 1'd1;
+      add0.left = j0.out;
+      add0.right = const4.out;
+      j0.in = 1'd1 ? add0.out;
+      upd2[done] = j0.done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      i0.write_en = 1'd1;
+      add1.left = i0.out;
+      add1.right = const5.out;
+      i0.in = 1'd1 ? add1.out;
+      upd3[done] = i0.done ? 1'd1;
+    }
+    group upd4<"static"=1> {
+      x_read0_0.write_en = 1'd1;
+      x0_0_addr1 = k0.out;
+      x0_0_addr0 = i1.out;
+      x_read0_0.in = 1'd1 ? x0_0_read_data;
+      upd4[done] = x_read0_0.done ? 1'd1;
+    }
+    group upd5<"static"=1> {
+      transpose_y_read0_0.write_en = 1'd1;
+      transpose_y0_0.addr1 = j1.out;
+      transpose_y0_0.addr0 = k0.out;
+      transpose_y_read0_0.in = 1'd1 ? transpose_y0_0.read_data;
+      upd5[done] = transpose_y_read0_0.done ? 1'd1;
+    }
+    group upd6<"static"=1> {
+      temporary_x10_0.addr1 = j1.out;
+      temporary_x10_0.addr0 = i1.out;
+      temporary_x10_0.write_en = 1'd1;
+      add2.left = temporary_x10_0.read_data;
+      add2.right = product_0.out;
+      temporary_x10_0.addr1 = j1.out;
+      temporary_x10_0.addr0 = i1.out;
+      temporary_x10_0.write_data = 1'd1 ? add2.out;
+      upd6[done] = temporary_x10_0.done ? 1'd1;
+    }
+    group upd7<"static"=1> {
+      k0.write_en = 1'd1;
+      add3.left = k0.out;
+      add3.right = const12.out;
+      k0.in = 1'd1 ? add3.out;
+      upd7[done] = k0.done ? 1'd1;
+    }
+    group upd8<"static"=1> {
+      j1.write_en = 1'd1;
+      add4.left = j1.out;
+      add4.right = const13.out;
+      j1.in = 1'd1 ? add4.out;
+      upd8[done] = j1.done ? 1'd1;
+    }
+    group upd9<"static"=1> {
+      i1.write_en = 1'd1;
+      add5.left = i1.out;
+      add5.right = const14.out;
+      i1.in = 1'd1 ? add5.out;
+      upd9[done] = i1.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      while le0.out with cond0 {
+        seq {
+          let1;
+          while le1.out with cond1 {
+            seq {
+              upd0;
+              upd1;
+              upd2;
+            }
+          }
+          upd3;
+        }
+      }
+      let2;
+      while le2.out with cond2 {
+        seq {
+          let3;
+          while le3.out with cond3 {
+            seq {
+              let4;
+              while le4.out with cond4 {
+                seq {
+                  par {
+                    upd4;
+                    upd5;
+                  }
+                  let5;
+                  let6;
+                  upd6;
+                  upd7;
+                }
+              }
+              upd8;
+            }
+          }
+          upd9;
+        }
+      }
+      let7;
+      while le5.out with cond5 {
+        seq {
+          let8;
+          while le6.out with cond6 {
+            seq {
+              upd10;
+              upd11;
+              upd12;
+            }
+          }
+          upd13;
+        }
+      }
+    }
+  }
+}
+
+component main () -> () {
+  cells {
+    x1 = prim std_mem_d2(32, 1, 10, 1, 4);
+    x = prim std_mem_d2(32, 1, 4096, 1, 13);
+    y = prim std_mem_d2(32, 10, 4096, 4, 13);
+    dense0 = dense;
+  }
+  wires {
+    group run_dense {
+      x.addr0 = dense0.x0_0_addr0;
+      dense0.x0_0_read_data = x.read_data;
+      x.addr1 = dense0.x0_0_addr1;
+      y.addr0 = dense0.y0_0_addr0;
+      dense0.y0_0_read_data = y.read_data;
+      y.addr1 = dense0.y0_0_addr1;
+      x1.addr0 = dense0.x10_0_addr0;
+      x1.addr1 = dense0.x10_0_addr1;
+      x1.write_data = dense0.x10_0_write_data;
+      x1.write_en = dense0.x10_0_write_en;
+      dense0.x10_0_done = x1.done;
+      dense0.go = 1'd1;
+      run_dense[done] = dense0.done ? 1'd1;
+    }
+  }
+  control {
+    seq {
+      run_dense;
+    }
+  }
+}
diff --git a/frontends/relay-futil/tests/dense.relay b/frontends/relay-futil/tests/dense.relay
new file mode 100644
index 0000000000..5bd91be7a2
--- /dev/null
+++ b/frontends/relay-futil/tests/dense.relay
@@ -0,0 +1,5 @@
+v0.0.4
+fn (%x: Tensor[(1, 4096), int32], %y: Tensor[(10, 4096), int32]) -> Tensor[(1, 10), int32] {
+  let %x1: Tensor[(1, 10), int32] = nn.dense(%x, %y, units=10) /* ty=Tensor[(1, 10), int32] */;
+  %x1
+}

From c51c41a26fe3da41094b341a45f7b2e75fc6eb46 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Mon, 23 Nov 2020 16:18:04 -0500
Subject: [PATCH 40/75] Implement broadcasting.

---
 frontends/relay-futil/compiler.py             |   8 +-
 frontends/relay-futil/dahlia_functions.py     | 278 +++++++++++-------
 frontends/relay-futil/example.py              |   9 +-
 frontends/relay-futil/futil_ast.py            |   6 +-
 frontends/relay-futil/tests/broadcast.expect  | 171 +++++++++++
 frontends/relay-futil/tests/broadcast.relay   |   5 +
 .../relay-futil/tests/data/broadcast.expect   |  70 +++++
 .../relay-futil/tests/data/broadcast.relay    |   5 +
 .../tests/data/broadcast.relay.data           |  14 +
 9 files changed, 448 insertions(+), 118 deletions(-)
 create mode 100644 frontends/relay-futil/tests/broadcast.expect
 create mode 100644 frontends/relay-futil/tests/broadcast.relay
 create mode 100644 frontends/relay-futil/tests/data/broadcast.expect
 create mode 100644 frontends/relay-futil/tests/data/broadcast.relay
 create mode 100644 frontends/relay-futil/tests/data/broadcast.relay.data

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index ae328b4144..cd9bcb77b1 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -12,12 +12,8 @@
 # Mapping from Relay binary calls to the respective Dahlia operator.
 BuiltInBinaryOps = {'add': '+', 'divide': '/', 'multiply': '*', 'subtract': '-'}
 
-# Mapping from Tensor dimensions to function type.
-BinaryOpTensorDimensions = {PrimitiveType.Memory1D: tensor1d_op, PrimitiveType.Memory2D: tensor2d_op,
-                            PrimitiveType.Memory3D: tensor3d_op, PrimitiveType.Memory4D: tensor4d_op}
-
 # Mapping from Relay function names to their respective Dahlia lowering.
-RelayFunctionCalls = {'nn.batch_flatten': batch_flatten, 'nn.batch_matmul': batch_matmul,
+RelayFunctionCalls = {'nn.dense': dense, 'nn.batch_flatten': batch_flatten, 'nn.batch_matmul': batch_matmul,
                       'nn.bias_add': bias_add, 'nn.relu': relu, 'negative': negative, 'expand_dims': expand_dims}
 
 # Mapping between primitive type and associated Dahlia name extension.
@@ -79,7 +75,7 @@ def get_dahlia_declaration(self, function_name, cells, args, attrs):
         function = name = op = None
         if function_name in BuiltInBinaryOps:
             op = BuiltInBinaryOps[function_name]
-            function, name = BinaryOpTensorDimensions[input_type], function_name
+            function, name = broadcast, function_name
         elif function_name in RelayFunctionCalls:
             function = RelayFunctionCalls[function_name]
             name = function.__name__
diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 0b42dfa13f..bf7d69c7df 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -6,10 +6,11 @@
 
 IMPORT_STATEMENT = """import "primitives/std.lib";\n"""
 NO_ERR = "2>/dev/null"
+CHARACTER_I = chr(ord('i'))
 
 
 def lower_dahlia_program(prog, component_name):
-    '''
+    """
     Takes in a string representation of a Dahlia program, lowers it to FuTIL with the given `component_name`,
     and applies the `externalize` pass. This pass exposes the inputs and outputs of primitive types that are
     declared external, e.g. `std_mem_d1_ext`, and places them in the inputs and outputs of the respective component.
@@ -31,7 +32,7 @@ def lower_dahlia_program(prog, component_name):
         (done: 1, X0_addr0: 2, X0_write_data: 32, X0_write_en: 1, X0_clk: 1) {
            ...
         }
-    '''
+    """
     program_string = '\n'.join(prog.splitlines())
     with NamedTemporaryFile() as tf0, NamedTemporaryFile() as tf1, NamedTemporaryFile() as tf2:
         tf0.write(bytes(program_string, 'UTF-8'))
@@ -45,123 +46,143 @@ def lower_dahlia_program(prog, component_name):
         return component
 
 
-def tensor1d_op(declaration):
-    op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
-    bitwidth, size, index_size, op2_size = op1.data[0], op1.data[1], op1.data[2], op2.data[1]
-    if size == op2_size:
-        program = f"""
-        decl {op1.name}: {op1.data_type}<{bitwidth}>[{size}];
-        decl {op2.name}: {op2.data_type}<{bitwidth}>[{size}];
-        decl {res.name}: {res.data_type}<{bitwidth}>[{size}];
-        for (let i: ubit<{index_size}> = 0..{size}) {{
-          {res.name}[i] := {op1.name}[i] {declaration.op} {op2.name}[i];
-        }}"""
-    else:
-        # Broadcasting using a single value, e.g.
-        #   let %a = 42;
-        #   let %c = add(%b: Tensor[(512)], %a);
-        program = f"""
-        decl {op1.name}: {op1.data_type}<{bitwidth}>[{size}];
-        decl {op2.name}: {op2.data_type}<{bitwidth}>[{op2_size}];
-        decl {res.name}: {res.data_type}<{bitwidth}>[{size}];
-        for (let i: ubit<{index_size}> = 0..{size}) {{
-            {res.name}[i] := {op1.name}[i] {declaration.op} {op2.name}[0];
-        }}"""
-    return lower_dahlia_program(program, declaration.component_name)
-
+def next_character(ch, dir=1):
+    """
+    Returns the next character after 'ch'.
+    If dir is positive, then will return 'ch' + 1. Otherwise, it will return 'ch' - 1.
+    """
+    return chr(ord(ch) + dir) if dir > 0 else chr(ord(ch) - 1)
 
-def tensor2d_op(declaration):
-    op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
-    bitwidth, size0, size1, index_size0, index_size1 = op1.data[0], op1.data[1], op1.data[2], op1.data[3], op1.data[4]
-    program = f"""
-    decl {op1.name}: {op1.data_type}<{bitwidth}>[{size0}][{size1}];
-    decl {op2.name}: {op2.data_type}<{bitwidth}>[{size0}][{size1}];
-    decl {res.name}: {res.data_type}<{bitwidth}>[{size0}][{size1}];
-    for (let i: ubit<{index_size0}> = 0..{size0}) {{
-      for (let j: ubit<{index_size1}> = 0..{size1}) {{
-        {res.name}[i][j] := {op1.name}[i][j] {declaration.op} {op2.name}[i][j];
-      }}
-    }}"""
-    return lower_dahlia_program(program, declaration.component_name)
 
+def broadcast(declaration):
+    """
+    https://numpy.org/doc/stable/user/basics.broadcasting.html
+    Implements array broadcasting:
+    Two dimensions are compatible when either (1) they're equal, or (2) one of them is 1.
+    It is not required that both operands have the same number of dimensions either.
+    - When lowering from Relay IR, we are guaranteed the arrays are compatible for broadcasting.
+    - Variable names for indexing through the array begin with `i`, and continue alphabetically.
 
-def tensor3d_op(declaration):
-    op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
-    bitwidth, size0, size1, size2 = op1.data[0], op1.data[1], op1.data[2], op1.data[3]
-    index_size0, index_size1, index_size2 = op1.data[4], op1.data[5], op1.data[6]
+    Example:
+         first operand:  64 x  1 x 32
+        second operand:       16 x  1
+                result:  64 x 16 x 32
+        ->
+        for (i = 0...64) {
+          for (j = 0..16) {
+            for (k = 0..32) {
+              result[i][j][k] := op1[i][0][k] + op2[j][0];
+              ...
+    """
+    operand1, operand2 = declaration.inputs[0].primitive, declaration.inputs[1].primitive
+    res = declaration.output.primitive
+    op1 = operand1 if operand1.type >= operand2.type else operand2
+    op2 = operand2 if op1 == operand1 else operand1
+
+    op1_offset, op2_offset = op1.type, op2.type
+    op1_sizes, op2_sizes, res_sizes = [], [], []
+    for i in reversed(range(1, op1_offset + 1)): op1_sizes.append(op1.data[i])
+    for i in reversed(range(1, op2_offset + 1)): op2_sizes.append(op2.data[i])
+    for i in range(0, len(op1_sizes)):
+        size = op1_sizes[i]
+        res_sizes.append(max(size, op2_sizes[i]) if i < len(op2_sizes) else size)
+
+    op1_indices, op2_indices, res_indices = [], [], []
+    # Get the character associated with 'i' + N, where N == Memory Dimensions
+    variable_name = chr(ord(CHARACTER_I) + op1_offset - 1)
+    for i in range(0, len(op1_sizes)):
+        current_dimension, index_zero = f'[{variable_name}]', '[0]'
+        res_indices.append(current_dimension)
+        if len(op2_sizes) <= i:
+            op1_indices.append(current_dimension)
+            continue
+        elif op1_sizes[i] == op2_sizes[i]:
+            op1_indices.append(current_dimension)
+            op2_indices.append(current_dimension)
+        elif op1_sizes[i] > op2_sizes[i]:
+            op1_indices.append(current_dimension)
+            op2_indices.append(index_zero)
+        else:  # op2_sizes[i] < op1_sizes[i]
+            op1_indices.append(index_zero)
+            op2_indices.append(current_dimension)
+        variable_name = next_character(variable_name, -1)
+
+    op1_nth_index, op2_nth_index = ''.join(reversed(op1_indices)), ''.join(reversed(op2_indices))
+    res_nth_index = ''.join(reversed(res_indices))
+
+    # Declarations for op1, op2, res.
+    op1_decl = f'decl {op1.name}: {op1.data_type}<{op1.data[0]}>'
+    for i in reversed(range(0, len(op1_sizes))): op1_decl += f'[{op1_sizes[i]}]'
+
+    op2_decl = f'decl {op2.name}: {op2.data_type}<{op2.data[0]}>'
+    for i in reversed(range(0, len(op2_sizes))): op2_decl += f'[{op2_sizes[i]}]'
+
+    res_decl = f'decl {res.name}: {res.data_type}<{res.data[0]}>'
+    for i in reversed(range(0, len(res_sizes))): res_decl += f'[{res_sizes[i]}]'
+
+    # For loop(s).
+    variable_name = CHARACTER_I
+    loop_body = []
+    for i in range(1, len(op1_sizes) + 1):
+        size, index_size = res.data[i], res.data[i + op1_offset]
+        if (i + op2_offset < len(op2_sizes)):
+            op2_size, op2_index_size = op2.data[i], op2.data[i + op2_offset]
+            size, index_size = max(size, op2_size), max(size, op2_index_size)
+        loop_body.append(f'for (let {variable_name}: ubit<{index_size}> = 0..{size}) {{')
+        variable_name = next_character(variable_name)
+    loop_body.append(
+        f'{res.name}{res_nth_index} := {op1.name}{op1_nth_index} {declaration.op} {op2.name}{op2_nth_index};')
+    for i in range(1, len(op1_sizes) + 1): loop_body.append('}')
     program = f"""
-    decl {op1.name}: {op1.data_type}<{bitwidth}>[{size0}][{size1}][{size2}];
-    decl {op2.name}: {op2.data_type}<{bitwidth}>[{size0}][{size1}][{size2}];
-    decl {res.name}: {res.data_type}<{bitwidth}>[{size0}][{size1}][{size2}];
-    for (let i: ubit<{index_size0}> = 0..{size0}) {{
-      for (let j: ubit<{index_size1}> = 0..{size1}) {{
-        for (let k: ubit<{index_size2}> = 0..{size2}) {{
-          {res.name}[i][j][k] := {op1.name}[i][j][k] {declaration.op} {op2.name}[i][j][k];
-        }}
-      }}
-    }}"""
-    return lower_dahlia_program(program, declaration.component_name)
-
-
-def tensor4d_op(declaration):
-    op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
-    bitwidth, size0, size1, size2, size3 = op1.data[0], op1.data[1], op1.data[2], op1.data[3], op1.data[4]
-    index_size0, index_size1, index_size2, index_size3 = op1.data[5], op1.data[6], op1.data[7], op1.data[8]
-    if op1.type == op2.type:
-        program = f"""
-        decl {op1.name}: {op1.data_type}<{bitwidth}>[{size0}][{size1}][{size2}][{size3}];
-        decl {op2.name}: {op2.data_type}<{bitwidth}>[{size0}][{size1}][{size2}][{size3}];
-        decl {res.name}: {res.data_type}<{bitwidth}>[{size0}][{size1}][{size2}][{size3}];
-        for (let i: ubit<{index_size0}> = 0..{size0}) {{
-          for (let j: ubit<{index_size1}> = 0..{size1}) {{
-            for (let k: ubit<{index_size2}> = 0..{size2}) {{
-              for (let l: ubit<{index_size3}> = 0..{size3}) {{
-                {res.name}[i][j][k][l] := {op1.name}[i][j][k][l] {declaration.op} {op2.name}[i][j][k][l];
-              }}
-            }}
-          }}
-        }}"""
-    else:  # Broadcasting.
-        op2_size0, op2_size1, op2_size2 = op2.data[1], op2.data[2], op2.data[3]
-        op2_index_size0, op2_index_size1, op2_index_size2 = op2.data[3], op2.data[5], op2.data[6]
-        # TODO(cgyurgyik): This is defaulted to: `Tensor(X, Y, 1, 1) op Tensor(Y, 1, 1)` for VGG Net.
-        # This should be generalized.
-        program = f"""
-        decl {op1.name}: {op1.data_type}<{bitwidth}>[{size0}][{size1}][{size2}][{size3}];
-        decl {op2.name}: {op2.data_type}<{bitwidth}>[{op2_size0}][{op2_size1}][{op2_size2}];
-        decl {res.name}: {res.data_type}<{bitwidth}>[{size0}][{size1}][{size2}][{size3}];
-        for (let i: ubit<{index_size0}> = 0..{size0}) {{
-          for (let j: ubit<{index_size1}> = 0..{size1}) {{
-            for (let k: ubit<{index_size2}> = 0..{size2}) {{
-              for (let l: ubit<{index_size3}> = 0..{size3}) {{
-                {res.name}[i][j][k][l] := {op1.name}[i][j][k][l] {declaration.op} {op2.name}[j][0][0];
-              }}
-            }}
-          }}
-        }}"""
+    {op1_decl};
+    {op2_decl};
+    {res_decl};
+    {' '.join(loop_body)}
+    """
     return lower_dahlia_program(program, declaration.component_name)
 
 
 def batch_flatten(declaration):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.batch_flatten"""
     op1, res = declaration.inputs[0].primitive, declaration.output.primitive
-    bitwidth, op1_size0, op1_size1, op1_size2 = op1.data[0], op1.data[1], op1.data[2], op1.data[3]
-    op1_index_size0, op1_index_size1, op1_index_size2 = op1.data[4], op1.data[5], op1.data[6]
-    res_bitwidth, res_size0, res_size1 = res.data[0], res.data[1], res.data[2]
+    bitwidth, res_bitwidth, res_size0, res_size1 = op1.data[0], res.data[0], res.data[1], res.data[2]
     res_index_size0, res_index_size1 = res.data[3], res.data[4]
-    program = f"""
-        decl {op1.name}: {op1.data_type}<{bitwidth}>[{op1_size0}][{op1_size1}][{op1_size2}];
-        decl {res.name}: {res.data_type}<{bitwidth}>[{res_size0}][{res_size1}];
-        let l: ubit<{res_index_size1}> = 0;
-        for (let i: ubit<{op1_index_size0}> = 0..{op1_size0}) {{
-          for (let j: ubit<{op1_index_size1}> = 0..{op1_size1}) {{
-            for (let k: ubit<{op1_index_size2}> = 0..{op1_size2}) {{
-              {res.name}[i][l] := {op1.name}[i][j][k];
-              l := l + 1;
-            }}
-          }}
-        }}"""
-    return lower_dahlia_program(program, declaration.component_name)
+
+    if op1.type == PrimitiveType.Memory3D:
+        op1_size0, op1_size1, op1_size2 = op1.data[1], op1.data[2], op1.data[3]
+        op1_index_size0, op1_index_size1, op1_index_size2 = op1.data[4], op1.data[5], op1.data[6]
+        program = f"""
+            decl {op1.name}: {op1.data_type}<{bitwidth}>[{op1_size0}][{op1_size1}][{op1_size2}];
+            decl {res.name}: {res.data_type}<{bitwidth}>[{res_size0}][{res_size1}];
+            let l: ubit<{res_index_size1}> = 0;
+            for (let i: ubit<{op1_index_size0}> = 0..{op1_size0}) {{
+              for (let j: ubit<{op1_index_size1}> = 0..{op1_size1}) {{
+                for (let k: ubit<{op1_index_size2}> = 0..{op1_size2}) {{
+                  {res.name}[i][l] := {op1.name}[i][j][k];
+                  l := l + 1;
+                }}
+              }}
+            }}"""
+        return lower_dahlia_program(program, declaration.component_name)
+    if op1.type == PrimitiveType.Memory4D:
+        op1_size0, op1_size1, op1_size2, op1_size3 = op1.data[1], op1.data[2], op1.data[3], op1.data[4]
+        op1_index_size0, op1_index_size1 = op1.data[5], op1.data[6]
+        op1_index_size2, op1_index_size3 = op1.data[7], op1.data[8]
+        program = f"""
+            decl {op1.name}: {op1.data_type}<{bitwidth}>[{op1_size0}][{op1_size1}][{op1_size2}][{op1_size3}];
+            decl {res.name}: {res.data_type}<{bitwidth}>[{res_size0}][{res_size1}];
+            let l: ubit<{res_index_size1}> = 0;
+            for (let i: ubit<{op1_index_size0}> = 0..{op1_size0}) {{
+              for (let j: ubit<{op1_index_size1}> = 0..{op1_size1}) {{
+                for (let k: ubit<{op1_index_size2}> = 0..{op1_size2}) {{
+                  for (let l: ubit<{op1_index_size3}> = 0..{op1_size3}) {{
+                    {res.name}[i][l] := {op1.name}[i][j][k][l];
+                    l := l + 1;
+                  }}
+                }}
+              }}
+            }}"""
+        return lower_dahlia_program(program, declaration.component_name)
 
 
 def bias_add(declaration):
@@ -352,3 +373,44 @@ def batch_matmul(declaration):
     }} 
     """
     return lower_dahlia_program(program, declaration.component_name)
+
+
+# TODO(cgyurgyik): Similar to batch_matmul, this requires a temporary memory to store the output
+# of the matrix multiply. Otherwise, the values aren't computed properly. Look deeper into this.
+def dense(declaration):
+    """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.batch_matmul"""
+    # TODO(cgyurgyik): Add support for `units`.
+    units = declaration.attributes.get_int("units")
+    op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
+    bitwidth, M1_size0, M1_size1 = op1.data[0], op1.data[1], op1.data[2]
+    M1_index_size0, M1_index_size1 = op1.data[3], op1.data[4]
+    M2_size0, M2_size1, M2_index_size0, M2_index_size1 = op2.data[1], op2.data[2], op2.data[3], op2.data[4]
+    program = f"""
+    decl {op1.name}: {op1.data_type}<{bitwidth}>[{M1_size0}][{M1_size1}];
+    decl {op2.name}: {op2.data_type}<{bitwidth}>[{M2_size0}][{M2_size1}];
+    decl {res.name}: {res.data_type}<{bitwidth}>[{M1_size0}][{M2_size0}];
+    let transpose_{op2.name}: {op2.data_type}<{bitwidth}>[{M2_size1}][{M2_size0}];
+    let temporary_{res.name}: {res.data_type}<{bitwidth}>[{M1_size0}][{M2_size0}];
+    for (let i: ubit<{M2_index_size0}> = 0..{M2_size0}) {{
+      for (let j: ubit<{M2_index_size1}> = 0..{M2_size1}) {{
+        transpose_{op2.name}[j][i] := {op2.name}[i][j];
+      }}
+    }} 
+
+    for (let i: ubit<{M1_index_size0}> = 0..{M1_size0}) {{
+      for (let j: ubit<{M2_index_size0}> = 0..{M2_size0}) {{
+        for (let k: ubit<{M1_index_size1}> = 0..{M1_size1}) {{
+          let product = {op1.name}[i][k] * transpose_{op2.name}[k][j];
+        }} combine {{
+          temporary_{res.name}[i][j] += product;
+        }}
+      }}
+    }}
+
+    for (let i: ubit<{M1_index_size0}> = 0..{M1_size0}) {{
+      for (let j: ubit<{M2_index_size0}> = 0..{M2_size0}) {{
+        {res.name}[i][j] := temporary_{res.name}[i][j];
+      }}
+    }}
+    """
+    return lower_dahlia_program(program, declaration.component_name)
diff --git a/frontends/relay-futil/example.py b/frontends/relay-futil/example.py
index 45e18b387c..0a986e9453 100644
--- a/frontends/relay-futil/example.py
+++ b/frontends/relay-futil/example.py
@@ -9,6 +9,7 @@ def add():
     y = relay.var('y', shape=(), dtype="int32")
     return relay.Function([x, y], relay.add(x, y))
 
+
 def tensor_subtract():
     x = relay.var("x", relay.TensorType((2, 4), "int32"))
     y = relay.var("y", relay.TensorType((2, 4), "int32"))
@@ -42,6 +43,12 @@ def relu():
     return relay.Function([x], relay.nn.relu(x))
 
 
+def dense():
+    x = relay.var('x', shape=[1, 4096], dtype='int32')
+    y = relay.var('y', shape=[10, 4096], dtype='int32')
+    return relay.Function([x, y], relay.nn.dense(x, y, units=10))
+
+
 def mlp_net():
     """The MLP test from Relay."""
     from tvm.relay.testing import mlp
@@ -55,7 +62,7 @@ def vgg_net():
                        batch_norm=True)
 
 
-ALL_FUNCS = [add, tensor_subtract, expand_dims, batch_flatten, batch_matmul, bias_add, relu, mlp_net, vgg_net]
+ALL_FUNCS = [add, tensor_subtract, expand_dims, batch_flatten, batch_matmul, bias_add, relu, dense, mlp_net, vgg_net]
 FUNC_NAMES = list(map(lambda x: x.__name__, ALL_FUNCS))
 
 
diff --git a/frontends/relay-futil/futil_ast.py b/frontends/relay-futil/futil_ast.py
index 9432a62a4d..03f2e8fa0e 100644
--- a/frontends/relay-futil/futil_ast.py
+++ b/frontends/relay-futil/futil_ast.py
@@ -2,10 +2,10 @@
 from dataclasses import dataclass
 from typing import List, Dict
 from types import FunctionType
-from enum import Enum
+from enum import Enum, IntEnum
 
-
-class PrimitiveType(Enum):
+# Note: The integer value N for Memory with dimension N is used; these should remain unchanged.
+class PrimitiveType(IntEnum):
     Memory1D = 1
     Memory2D = 2
     Memory3D = 3
diff --git a/frontends/relay-futil/tests/broadcast.expect b/frontends/relay-futil/tests/broadcast.expect
new file mode 100644
index 0000000000..9527534871
--- /dev/null
+++ b/frontends/relay-futil/tests/broadcast.expect
@@ -0,0 +1,171 @@
+import "primitives/std.lib";
+
+component add(go: 1, clk: 1, x10_0_0_read_data: 32, x10_0_0_done: 1, x20_0_0_read_data: 32, x20_0_0_done: 1, x30_0_0_read_data: 32, x30_0_0_done: 1) -> (done: 1, x10_0_0_addr0: 1, x10_0_0_addr1: 2, x10_0_0_addr2: 2, x10_0_0_write_data: 32, x10_0_0_write_en: 1, x10_0_0_clk: 1, x20_0_0_addr0: 2, x20_0_0_addr1: 1, x20_0_0_addr2: 1, x20_0_0_write_data: 32, x20_0_0_write_en: 1, x20_0_0_clk: 1, x30_0_0_addr0: 2, x30_0_0_addr1: 2, x30_0_0_addr2: 2, x30_0_0_write_data: 32, x30_0_0_write_en: 1, x30_0_0_clk: 1) {
+  cells {
+    add0 = prim std_add(32);
+    add1 = prim std_add(2);
+    add2 = prim std_add(2);
+    add3 = prim std_add(2);
+    const0 = prim std_const(2, 0);
+    const1 = prim std_const(2, 1);
+    const10 = prim std_const(2, 1);
+    const11 = prim std_const(2, 1);
+    const2 = prim std_const(2, 0);
+    const3 = prim std_const(2, 1);
+    const4 = prim std_const(2, 0);
+    const5 = prim std_const(2, 1);
+    const6 = prim std_const(1, 0);
+    const7 = prim std_const(1, 0);
+    const8 = prim std_const(1, 0);
+    const9 = prim std_const(2, 1);
+    i0 = prim std_reg(2);
+    j0 = prim std_reg(2);
+    k0 = prim std_reg(2);
+    le0 = prim std_le(2);
+    le1 = prim std_le(2);
+    le2 = prim std_le(2);
+    x1_read0_0 = prim std_reg(32);
+    x2_read0_0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = j0.out;
+      le1.right = const3.out;
+    }
+    group cond2<"static"=0> {
+      cond2[done] = 1'd1;
+      le2.left = k0.out;
+      le2.right = const5.out;
+    }
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
+    }
+    group let1<"static"=1> {
+      j0.in = const2.out;
+      j0.write_en = 1'd1;
+      let1[done] = j0.done;
+    }
+    group let2<"static"=1> {
+      k0.in = const4.out;
+      k0.write_en = 1'd1;
+      let2[done] = k0.done;
+    }
+    group upd0<"static"=1> {
+      x1_read0_0.write_en = 1'd1;
+      x10_0_0_addr2 = k0.out;
+      x10_0_0_addr1 = j0.out;
+      x10_0_0_addr0 = const6.out;
+      x1_read0_0.in = 1'd1 ? x10_0_0_read_data;
+      upd0[done] = x1_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      x2_read0_0.write_en = 1'd1;
+      x20_0_0_addr2 = const8.out;
+      x20_0_0_addr1 = const7.out;
+      x20_0_0_addr0 = i0.out;
+      x2_read0_0.in = 1'd1 ? x20_0_0_read_data;
+      upd1[done] = x2_read0_0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      x30_0_0_addr2 = k0.out;
+      x30_0_0_addr1 = j0.out;
+      x30_0_0_addr0 = i0.out;
+      x30_0_0_write_en = 1'd1;
+      add0.left = x1_read0_0.out;
+      add0.right = x2_read0_0.out;
+      x30_0_0_write_data = 1'd1 ? add0.out;
+      upd2[done] = x30_0_0_done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      k0.write_en = 1'd1;
+      add1.left = k0.out;
+      add1.right = const9.out;
+      k0.in = 1'd1 ? add1.out;
+      upd3[done] = k0.done ? 1'd1;
+    }
+    group upd4<"static"=1> {
+      j0.write_en = 1'd1;
+      add2.left = j0.out;
+      add2.right = const10.out;
+      j0.in = 1'd1 ? add2.out;
+      upd4[done] = j0.done ? 1'd1;
+    }
+    group upd5<"static"=1> {
+      i0.write_en = 1'd1;
+      add3.left = i0.out;
+      add3.right = const11.out;
+      i0.in = 1'd1 ? add3.out;
+      upd5[done] = i0.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      while le0.out with cond0 {
+        seq {
+          let1;
+          while le1.out with cond1 {
+            seq {
+              let2;
+              while le2.out with cond2 {
+                seq {
+                  par {
+                    upd0;
+                    upd1;
+                  }
+                  upd2;
+                  upd3;
+                }
+              }
+              upd4;
+            }
+          }
+          upd5;
+        }
+      }
+    }
+  }
+}
+
+component main () -> () {
+  cells {
+    x3 = prim std_mem_d3(32, 2, 2, 2, 2, 2, 2);
+    x1 = prim std_mem_d3(32, 1, 2, 2, 1, 2, 2);
+    x2 = prim std_mem_d3(32, 2, 1, 1, 2, 1, 1);
+    add0 = add;
+  }
+  wires {
+    group run_add {
+      x1.addr0 = add0.x10_0_0_addr0;
+      add0.x10_0_0_read_data = x1.read_data;
+      x1.addr1 = add0.x10_0_0_addr1;
+      x1.addr2 = add0.x10_0_0_addr2;
+      x2.addr0 = add0.x20_0_0_addr0;
+      add0.x20_0_0_read_data = x2.read_data;
+      x2.addr1 = add0.x20_0_0_addr1;
+      x2.addr2 = add0.x20_0_0_addr2;
+      x3.addr0 = add0.x30_0_0_addr0;
+      x3.addr1 = add0.x30_0_0_addr1;
+      x3.addr2 = add0.x30_0_0_addr2;
+      x3.write_data = add0.x30_0_0_write_data;
+      x3.write_en = add0.x30_0_0_write_en;
+      add0.x30_0_0_done = x3.done;
+      add0.go = 1'd1;
+      run_add[done] = add0.done ? 1'd1;
+    }
+  }
+  control {
+    seq {
+      run_add;
+    }
+  }
+}
diff --git a/frontends/relay-futil/tests/broadcast.relay b/frontends/relay-futil/tests/broadcast.relay
new file mode 100644
index 0000000000..bacd708118
--- /dev/null
+++ b/frontends/relay-futil/tests/broadcast.relay
@@ -0,0 +1,5 @@
+v0.0.4
+fn (%x1: Tensor[(1, 2, 2), int32], %x2: Tensor[(2, 1, 1), int32]) {
+  let %x3 = add(%x1, %x2);
+  %x3
+}
diff --git a/frontends/relay-futil/tests/data/broadcast.expect b/frontends/relay-futil/tests/data/broadcast.expect
new file mode 100644
index 0000000000..b44c484e80
--- /dev/null
+++ b/frontends/relay-futil/tests/data/broadcast.expect
@@ -0,0 +1,70 @@
+{
+  "x1": [
+    [
+      [
+        1
+      ],
+      [
+        1
+      ]
+    ],
+    [
+      [
+        2
+      ],
+      [
+        2
+      ]
+    ]
+  ],
+  "x2": [
+    [
+      [
+        1
+      ]
+    ],
+    [
+      [
+        1
+      ]
+    ]
+  ],
+  "x3": [
+    [
+      [
+        [
+          2
+        ],
+        [
+          2
+        ]
+      ],
+      [
+        [
+          3
+        ],
+        [
+          3
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          2
+        ],
+        [
+          2
+        ]
+      ],
+      [
+        [
+          3
+        ],
+        [
+          3
+        ]
+      ]
+    ]
+  ]
+}
diff --git a/frontends/relay-futil/tests/data/broadcast.relay b/frontends/relay-futil/tests/data/broadcast.relay
new file mode 100644
index 0000000000..bacd708118
--- /dev/null
+++ b/frontends/relay-futil/tests/data/broadcast.relay
@@ -0,0 +1,5 @@
+v0.0.4
+fn (%x1: Tensor[(1, 2, 2), int32], %x2: Tensor[(2, 1, 1), int32]) {
+  let %x3 = add(%x1, %x2);
+  %x3
+}
diff --git a/frontends/relay-futil/tests/data/broadcast.relay.data b/frontends/relay-futil/tests/data/broadcast.relay.data
new file mode 100644
index 0000000000..e2f914b5c5
--- /dev/null
+++ b/frontends/relay-futil/tests/data/broadcast.relay.data
@@ -0,0 +1,14 @@
+{
+  "x1": {
+    "data": [[[1], [1]], [[2], [2]]],
+    "bitwidth": 32
+  },
+  "x2": {
+      "data": [[[1]], [[1]]],
+      "bitwidth": 32
+    },
+  "x3": {
+    "data": [[ [[0], [0]], [[0], [0]] ], [ [[0], [0]], [[0], [0]] ]],
+    "bitwidth": 32
+  }
+}
\ No newline at end of file

From 96654d463bd4fae3086c5a82e3e45e9d52d47cf0 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Mon, 23 Nov 2020 16:23:33 -0500
Subject: [PATCH 41/75] Rename mapping.

---
 frontends/relay-futil/utilities.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/frontends/relay-futil/utilities.py b/frontends/relay-futil/utilities.py
index faaccf1fe7..44e9f9c4f9 100644
--- a/frontends/relay-futil/utilities.py
+++ b/frontends/relay-futil/utilities.py
@@ -3,7 +3,7 @@
 import math
 
 # Mapping from the tensor dimensions to the corresponding FuTIL memory type.
-TensorToMemoryDimensionMapping = {1: PrimitiveType.Memory1D, 2: PrimitiveType.Memory2D,
+NumDimensionsToPrimitive = {1: PrimitiveType.Memory1D, 2: PrimitiveType.Memory2D,
                                   3: PrimitiveType.Memory3D, 4: PrimitiveType.Memory4D}
 
 
@@ -65,10 +65,10 @@ def get_memory_parameters(type):
 
     tensor_dimensions = list(map(int, string_dimensions.split(',')))
     data, num_dimensions = [get_bitwidth(string_type)], len(tensor_dimensions)
-    assert num_dimensions in TensorToMemoryDimensionMapping, f'{num_dimensions} dimensions is not supported.'
+    assert num_dimensions in NumDimensionsToPrimitive, f'{num_dimensions} dimensions is not supported.'
     for dimension in tensor_dimensions: data.append(dimension)  # Size.
     for dimension in tensor_dimensions: data.append(int(math.log2(dimension) + 1))  # Index size.
-    return data, TensorToMemoryDimensionMapping[num_dimensions], data_type
+    return data, NumDimensionsToPrimitive[num_dimensions], data_type
 
 
 def build_main_controls(c: FComponent):

From a8a90395a6f1b70543ead3355a3c6ef827b5c12f Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Mon, 23 Nov 2020 17:04:12 -0500
Subject: [PATCH 42/75] Fix spacing.

---
 frontends/relay-futil/utilities.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontends/relay-futil/utilities.py b/frontends/relay-futil/utilities.py
index 44e9f9c4f9..aab5fd2a72 100644
--- a/frontends/relay-futil/utilities.py
+++ b/frontends/relay-futil/utilities.py
@@ -4,7 +4,7 @@
 
 # Mapping from the tensor dimensions to the corresponding FuTIL memory type.
 NumDimensionsToPrimitive = {1: PrimitiveType.Memory1D, 2: PrimitiveType.Memory2D,
-                                  3: PrimitiveType.Memory3D, 4: PrimitiveType.Memory4D}
+                            3: PrimitiveType.Memory3D, 4: PrimitiveType.Memory4D}
 
 
 def flatten(l):

From 52d0b9014d3c33002acc022173692167545c6bdf Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Mon, 23 Nov 2020 17:09:32 -0500
Subject: [PATCH 43/75] Fix dahlia naming.

---
 frontends/relay-futil/compiler.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index cd9bcb77b1..75dc565df9 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -17,9 +17,9 @@
                       'nn.bias_add': bias_add, 'nn.relu': relu, 'negative': negative, 'expand_dims': expand_dims}
 
 # Mapping between primitive type and associated Dahlia name extension.
-# E.g. A 2D memory array named `A` will be lowered to `A_0`.
-DahliaNameExtension = {PrimitiveType.Memory1D: '', PrimitiveType.Memory2D: '_0',
-                       PrimitiveType.Memory3D: '_0_0', PrimitiveType.Memory4D: '_0_0_0'}
+# E.g. A 2D memory primitive named `A` will be lowered to `A0_0`.
+DahliaNameExtension = {PrimitiveType.Memory1D: '0', PrimitiveType.Memory2D: '0_0',
+                       PrimitiveType.Memory3D: '0_0_0', PrimitiveType.Memory4D: '0_0_0_0'}
 
 
 class Relay2Futil(ExprFunctor):
@@ -62,9 +62,8 @@ def dahlia_name(self, name, type):
         Memory2D: 'X0_0', 'X1_0', 'X2_0', ...
         Memory3D: 'X0_0_0', 'X1_0_0', 'X2_0_0', ...
         """
-        dahlia_name = self.id(name)
         assert type in DahliaNameExtension, f'{name} with {type} is not supported yet.'
-        return dahlia_name + DahliaNameExtension[type]
+        return name + DahliaNameExtension[type]
 
     def get_dahlia_declaration(self, function_name, cells, args, attrs):
         """

From 0ff11ece949ca6343fcad044947b04756867cb37 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Mon, 23 Nov 2020 17:17:46 -0500
Subject: [PATCH 44/75] Cleanup.

---
 frontends/relay-futil/dahlia_functions.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index bf7d69c7df..5b6b397f0a 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -88,7 +88,7 @@ def broadcast(declaration):
         res_sizes.append(max(size, op2_sizes[i]) if i < len(op2_sizes) else size)
 
     op1_indices, op2_indices, res_indices = [], [], []
-    # Get the character associated with 'i' + N, where N == Memory Dimensions
+    # Get the character associated with 'i' + N, where N == number of dimensions in `op1`.
     variable_name = chr(ord(CHARACTER_I) + op1_offset - 1)
     for i in range(0, len(op1_sizes)):
         current_dimension, index_zero = f'[{variable_name}]', '[0]'
@@ -112,12 +112,10 @@ def broadcast(declaration):
 
     # Declarations for op1, op2, res.
     op1_decl = f'decl {op1.name}: {op1.data_type}<{op1.data[0]}>'
-    for i in reversed(range(0, len(op1_sizes))): op1_decl += f'[{op1_sizes[i]}]'
-
     op2_decl = f'decl {op2.name}: {op2.data_type}<{op2.data[0]}>'
-    for i in reversed(range(0, len(op2_sizes))): op2_decl += f'[{op2_sizes[i]}]'
-
     res_decl = f'decl {res.name}: {res.data_type}<{res.data[0]}>'
+    for i in reversed(range(0, len(op1_sizes))): op1_decl += f'[{op1_sizes[i]}]'
+    for i in reversed(range(0, len(op2_sizes))): op2_decl += f'[{op2_sizes[i]}]'
     for i in reversed(range(0, len(res_sizes))): res_decl += f'[{res_sizes[i]}]'
 
     # For loop(s).

From 09a20193cf39185512a9df092cb5cb1fa893a865 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Mon, 23 Nov 2020 17:25:56 -0500
Subject: [PATCH 45/75] Cleanup.

---
 frontends/relay-futil/dahlia_functions.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 5b6b397f0a..2af92cd2a8 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -107,8 +107,9 @@ def broadcast(declaration):
             op2_indices.append(current_dimension)
         variable_name = next_character(variable_name, -1)
 
-    op1_nth_index, op2_nth_index = ''.join(reversed(op1_indices)), ''.join(reversed(op2_indices))
-    res_nth_index = ''.join(reversed(res_indices))
+    # Resulting index in the nested for loop, e.g. for op1[i][j][0][k], this is `[i][j][0][k]`.
+    op1_index, op2_index = ''.join(reversed(op1_indices)), ''.join(reversed(op2_indices))
+    res_index = ''.join(reversed(res_indices))
 
     # Declarations for op1, op2, res.
     op1_decl = f'decl {op1.name}: {op1.data_type}<{op1.data[0]}>'
@@ -128,8 +129,8 @@ def broadcast(declaration):
             size, index_size = max(size, op2_size), max(size, op2_index_size)
         loop_body.append(f'for (let {variable_name}: ubit<{index_size}> = 0..{size}) {{')
         variable_name = next_character(variable_name)
-    loop_body.append(
-        f'{res.name}{res_nth_index} := {op1.name}{op1_nth_index} {declaration.op} {op2.name}{op2_nth_index};')
+    loop_body.append(f'{res.name}{res_index} := {op1.name}{op1_index} {declaration.op} {op2.name}{op2_index};')
+
     for i in range(1, len(op1_sizes) + 1): loop_body.append('}')
     program = f"""
     {op1_decl};

From 9c41278da21080d803d3aea331cc74b1db437bd6 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Mon, 23 Nov 2020 17:30:09 -0500
Subject: [PATCH 46/75] Add todo for supporting axis=-1.

---
 frontends/relay-futil/dahlia_functions.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 2af92cd2a8..ef76e9a709 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -187,6 +187,8 @@ def batch_flatten(declaration):
 def bias_add(declaration):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.bias_add"""
     axis = declaration.attributes.get_int("axis")
+    # TODO(cgyurgyik): Supported axis = -1.
+    assert axis == 0 or axis == 1, f'bias_add with axis: {axis} is not currently supported.'
     data, bias, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
     bitwidth = data.data[0]
     if data.type == PrimitiveType.Memory2D:

From 98589f2a6dac048e13076963dfe7c76a96e86923 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Tue, 24 Nov 2020 08:10:18 -0500
Subject: [PATCH 47/75] Begin generalizing functions for any tensor size.

---
 frontends/relay-futil/dahlia_functions.py | 241 ++++++----------------
 frontends/relay-futil/pretty_print.py     |  51 +++++
 2 files changed, 118 insertions(+), 174 deletions(-)

diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index ef76e9a709..6b0e7496a9 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -3,10 +3,12 @@
 
 from tempfile import NamedTemporaryFile, TemporaryFile
 from futil_ast import *
+from pretty_print import *
 
 IMPORT_STATEMENT = """import "primitives/std.lib";\n"""
 NO_ERR = "2>/dev/null"
 CHARACTER_I = chr(ord('i'))
+NEWL = '\n'
 
 
 def lower_dahlia_program(prog, component_name):
@@ -79,18 +81,17 @@ def broadcast(declaration):
     op1 = operand1 if operand1.type >= operand2.type else operand2
     op2 = operand2 if op1 == operand1 else operand1
 
-    op1_offset, op2_offset = op1.type, op2.type
+    op1_dims, op2_dims, res_dims = op1.type, op2.type, res.type
     op1_sizes, op2_sizes, res_sizes = [], [], []
-    for i in reversed(range(1, op1_offset + 1)): op1_sizes.append(op1.data[i])
-    for i in reversed(range(1, op2_offset + 1)): op2_sizes.append(op2.data[i])
-    for i in range(0, len(op1_sizes)):
-        size = op1_sizes[i]
-        res_sizes.append(max(size, op2_sizes[i]) if i < len(op2_sizes) else size)
+    # Get memory sizes in reversed order.
+    for i in reversed(range(0, op1_dims)): op1_sizes.append(op1.data[i + 1])
+    for i in reversed(range(0, op2_dims)): op2_sizes.append(op2.data[i + 1])
+    for i in reversed(range(0, res_dims)): res_sizes.append(res.data[i + 1])
 
     op1_indices, op2_indices, res_indices = [], [], []
     # Get the character associated with 'i' + N, where N == number of dimensions in `op1`.
-    variable_name = chr(ord(CHARACTER_I) + op1_offset - 1)
-    for i in range(0, len(op1_sizes)):
+    variable_name = chr(ord(CHARACTER_I) + op1_dims - 1)
+    for i in range(0, len(res_sizes)):
         current_dimension, index_zero = f'[{variable_name}]', '[0]'
         res_indices.append(current_dimension)
         if len(op2_sizes) <= i:
@@ -110,129 +111,60 @@ def broadcast(declaration):
     # Resulting index in the nested for loop, e.g. for op1[i][j][0][k], this is `[i][j][0][k]`.
     op1_index, op2_index = ''.join(reversed(op1_indices)), ''.join(reversed(op2_indices))
     res_index = ''.join(reversed(res_indices))
+    loop_body = f'{res.name}{res_index} := {op1.name}{op1_index} {declaration.op} {op2.name}{op2_index};'
 
-    # Declarations for op1, op2, res.
-    op1_decl = f'decl {op1.name}: {op1.data_type}<{op1.data[0]}>'
-    op2_decl = f'decl {op2.name}: {op2.data_type}<{op2.data[0]}>'
-    res_decl = f'decl {res.name}: {res.data_type}<{res.data[0]}>'
-    for i in reversed(range(0, len(op1_sizes))): op1_decl += f'[{op1_sizes[i]}]'
-    for i in reversed(range(0, len(op2_sizes))): op2_decl += f'[{op2_sizes[i]}]'
-    for i in reversed(range(0, len(res_sizes))): res_decl += f'[{res_sizes[i]}]'
-
-    # For loop(s).
-    variable_name = CHARACTER_I
-    loop_body = []
-    for i in range(1, len(op1_sizes) + 1):
-        size, index_size = res.data[i], res.data[i + op1_offset]
-        if (i + op2_offset < len(op2_sizes)):
-            op2_size, op2_index_size = op2.data[i], op2.data[i + op2_offset]
-            size, index_size = max(size, op2_size), max(size, op2_index_size)
-        loop_body.append(f'for (let {variable_name}: ubit<{index_size}> = 0..{size}) {{')
-        variable_name = next_character(variable_name)
-    loop_body.append(f'{res.name}{res_index} := {op1.name}{op1_index} {declaration.op} {op2.name}{op2_index};')
-
-    for i in range(1, len(op1_sizes) + 1): loop_body.append('}')
-    program = f"""
-    {op1_decl};
-    {op2_decl};
-    {res_decl};
-    {' '.join(loop_body)}
-    """
+    program_body = pp_dahlia_loop(res, loop_body)
+    declarations = pp_dahlia_memory_declarations([res, op1, op2])
+    program = f"""{declarations}{NEWL}{program_body}"""
     return lower_dahlia_program(program, declaration.component_name)
 
 
 def batch_flatten(declaration):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.batch_flatten"""
-    op1, res = declaration.inputs[0].primitive, declaration.output.primitive
-    bitwidth, res_bitwidth, res_size0, res_size1 = op1.data[0], res.data[0], res.data[1], res.data[2]
-    res_index_size0, res_index_size1 = res.data[3], res.data[4]
+    data, res = declaration.inputs[0].primitive, declaration.output.primitive
+    bitwidth, num_dimensions = data.data[0], data.type
+    res_index_size1 = res.data[4]
 
-    if op1.type == PrimitiveType.Memory3D:
-        op1_size0, op1_size1, op1_size2 = op1.data[1], op1.data[2], op1.data[3]
-        op1_index_size0, op1_index_size1, op1_index_size2 = op1.data[4], op1.data[5], op1.data[6]
-        program = f"""
-            decl {op1.name}: {op1.data_type}<{bitwidth}>[{op1_size0}][{op1_size1}][{op1_size2}];
-            decl {res.name}: {res.data_type}<{bitwidth}>[{res_size0}][{res_size1}];
-            let l: ubit<{res_index_size1}> = 0;
-            for (let i: ubit<{op1_index_size0}> = 0..{op1_size0}) {{
-              for (let j: ubit<{op1_index_size1}> = 0..{op1_size1}) {{
-                for (let k: ubit<{op1_index_size2}> = 0..{op1_size2}) {{
-                  {res.name}[i][l] := {op1.name}[i][j][k];
-                  l := l + 1;
-                }}
-              }}
-            }}"""
-        return lower_dahlia_program(program, declaration.component_name)
-    if op1.type == PrimitiveType.Memory4D:
-        op1_size0, op1_size1, op1_size2, op1_size3 = op1.data[1], op1.data[2], op1.data[3], op1.data[4]
-        op1_index_size0, op1_index_size1 = op1.data[5], op1.data[6]
-        op1_index_size2, op1_index_size3 = op1.data[7], op1.data[8]
-        program = f"""
-            decl {op1.name}: {op1.data_type}<{bitwidth}>[{op1_size0}][{op1_size1}][{op1_size2}][{op1_size3}];
-            decl {res.name}: {res.data_type}<{bitwidth}>[{res_size0}][{res_size1}];
-            let l: ubit<{res_index_size1}> = 0;
-            for (let i: ubit<{op1_index_size0}> = 0..{op1_size0}) {{
-              for (let j: ubit<{op1_index_size1}> = 0..{op1_size1}) {{
-                for (let k: ubit<{op1_index_size2}> = 0..{op1_size2}) {{
-                  for (let l: ubit<{op1_index_size3}> = 0..{op1_size3}) {{
-                    {res.name}[i][l] := {op1.name}[i][j][k][l];
-                    l := l + 1;
-                  }}
-                }}
-              }}
-            }}"""
-        return lower_dahlia_program(program, declaration.component_name)
+    variable_name = CHARACTER_I
+    data_indices, res_indices = "", f'[{variable_name}]'
+    for i in range(0, num_dimensions):
+        # Determine loop body indices: one index variable per dimension of `data`.
+        size, index_size = data.data[i + 1], data.data[i + num_dimensions + 1]
+        index = f'[{variable_name}]'
+        data_indices += index
+        variable_name = next_character(variable_name)
+    res_indices += f'[{variable_name}]'
+
+    declarations = pp_dahlia_memory_declarations([data, res])
+    let_flattened = f'let {variable_name}: ubit<{res_index_size1}> = 0;'
+    body = (f"{res.name}{res_indices} := {data.name}{data_indices}; {variable_name} := {variable_name} + 1;")
+    loops = pp_dahlia_loop(data, body)
+    program = f"""{declarations}{NEWL}{let_flattened}{NEWL}{loops}"""
+    return lower_dahlia_program(program, declaration.component_name)
 
 
 def bias_add(declaration):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.bias_add"""
-    axis = declaration.attributes.get_int("axis")
-    # TODO(cgyurgyik): Supported axis = -1.
-    assert axis == 0 or axis == 1, f'bias_add with axis: {axis} is not currently supported.'
     data, bias, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
-    bitwidth = data.data[0]
-    if data.type == PrimitiveType.Memory2D:
-        size0, size1, index_size0, index_size1 = data.data[1], data.data[2], data.data[3], data.data[4]
-        bias_size, bias_index_size = bias.data[1], bias.data[2]
-        program = f"""
-        decl {data.name}: {data.data_type}<{bitwidth}>[{size0}][{size1}];
-        decl {bias.name}: {bias.data_type}<{bitwidth}>[{bias_size}];
-        decl {res.name}: {res.data_type}<{bitwidth}>[{size0}][{size1}];"""
-        if axis == 1:
-            program += f"""
-            for (let i: ubit<{index_size0}> = 0..{size0}) {{
-              for (let j: ubit<{index_size1}> = 0..{size1}) {{
-                {res.name}[i][j] := {data.name}[i][j] + {bias.name}[j];
-              }}
-            }}"""
-        elif axis == 0:
-            program += f"""
-            for (let j: ubit<{index_size1}> = 0..{size1}) {{
-              for (let i: ubit<{index_size0}> = 0..{size0}) {{
-                {res.name}[i][j] := {data.name}[i][j] + {bias.name}[i];
-              }}
-            }}"""
-    elif data.type == PrimitiveType.Memory4D:
-        bitwidth, size0, size1, size2, size3 = data.data[0], data.data[1], data.data[2], data.data[3], data.data[4]
-        index_size0, index_size1, index_size2, index_size3 = data.data[5], data.data[6], data.data[7], data.data[8]
-        bias_size, bias_index_size = bias.data[1], bias.data[2]
-        program = f"""
-        decl {data.name}: {data.data_type}<{bitwidth}>[{size0}][{size1}][{size2}][{size3}];
-        decl {bias.name}: {bias.data_type}<{bitwidth}>[{bias_size}];
-        decl {res.name}: {res.data_type}<{bitwidth}>[{size0}][{size1}][{size2}][{size3}];"""
-        if axis == 1:
-            program += f"""
-            for (let i: ubit<{index_size0}> = 0..{size0}) {{
-              for (let j: ubit<{index_size1}> = 0..{size1}) {{
-                for (let k: ubit<{index_size2}> = 0..{size2}) {{
-                  for (let l: ubit<{index_size3}> = 0..{size3}) {{
-                    {res.name}[i][j][k][l] := {data.name}[i][j][k][l] + {bias.name}[j];
-                  }}
-                }}
-              }}
-            }}"""
+    bitwidth, num_dimensions = data.data[0], data.type
 
-    return lower_dahlia_program(program, declaration.component_name)
+    axis_attribute = declaration.attributes.get_int("axis")
+    axis = num_dimensions - 1 if axis_attribute == -1 else axis_attribute
+
+    variable_name = CHARACTER_I
+    data_indices = ""
+    for i in range(0, num_dimensions):
+        # Determine loop body indices based on `axis` provided.
+        size, index_size = data.data[i + 1], data.data[i + num_dimensions + 1]
+        index = f'[{variable_name}]'
+        if axis == i: bias_index = index
+        data_indices += index
+        variable_name = next_character(variable_name)
+
+    declarations = pp_dahlia_memory_declarations([data, bias, res])
+    body = (f"{res.name}{data_indices} := {data.name}{data_indices} + {bias.name}{bias_index};")
+    loops = pp_dahlia_loop(data, body)
+    return lower_dahlia_program(f"""{declarations}{NEWL}{loops}""", declaration.component_name)
 
 
 # TODO(cgyurgyik):
@@ -242,56 +174,23 @@ def bias_add(declaration):
 def relu(declaration):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.relu"""
     op1, res = declaration.inputs[0].primitive, declaration.output.primitive
+    bitwidth, num_dimensions = op1.data[0], op1.type
     assert res.data_type == 'ubit', f'{res.data_type} is not currently supported for ReLU.'
 
-    if op1.type == PrimitiveType.Memory2D:
-        bitwidth, op1_size0, op1_size1 = op1.data[0], op1.data[1], op1.data[2]
-        op1_index_size0, op1_index_size1 = op1.data[3], op1.data[4]
-        res_bitwidth, res_size0, res_size1 = res.data[0], res.data[1], res.data[2]
-        res_index_size0, res_index_size1 = res.data[3], res.data[4]
-        program = f"""
-        decl {op1.name}: {op1.data_type}<{bitwidth}>[{op1_size0}][{op1_size1}];
-        decl {res.name}: {res.data_type}<{bitwidth}>[{res_size0}][{res_size1}];
-        let zero: {op1.data_type}<{bitwidth}> = 0;
-        for (let i: ubit<{op1_index_size0}> = 0..{op1_size0}) {{
-          for (let j: ubit<{op1_index_size1}> = 0..{op1_size1}) {{
-            if ({op1.name}[i][j] > zero) {{
-              {res.name}[i][j] := {op1.name}[i][j];
-            }} else {{
-              {res.name}[i][j] := 0;
-            }}
-          }}
-        }}
-        """
-        return lower_dahlia_program(program, declaration.component_name)
+    let_zero = f'let zero: {op1.data_type}<{bitwidth}> = 0;'
+    declarations = pp_dahlia_memory_declarations([op1, res])
 
-    elif op1.type == PrimitiveType.Memory4D:
-        bitwidth, op1_size0, op1_size1 = op1.data[0], op1.data[1], op1.data[2]
-        op1_size2, op1_size3, op1_index_size0, = op1.data[3], op1.data[4], op1.data[5]
-        op1_index_size1, op1_index_size2, op1_index_size3 = op1.data[6], op1.data[7], op1.data[8]
-        res_bitwidth, res_size0, res_size1 = res.data[0], res.data[1], res.data[2]
-        res_size2, res_size3, res_index_size0, res_index_size1 = res.data[3], res.data[4], res.data[5], res.data[6]
-        res_index_size2, res_index_size3 = res.data[7], res.data[8]
+    indices = ""
+    variable_name = CHARACTER_I
+    for i in range(0, num_dimensions):
+        # Determine loop body indices.
+        indices += f'[{variable_name}]'
+        variable_name = next_character(variable_name)
 
-        program = f"""
-                decl {op1.name}: {op1.data_type}<{bitwidth}>[{op1_size0}][{op1_size1}][{op1_size2}][{op1_size3}];
-                decl {res.name}: {res.data_type}<{bitwidth}>[{res_size0}][{res_size1}][{op1_size2}][{op1_size3}];
-                let zero: {op1.data_type}<{bitwidth}> = 0;
-                for (let i: ubit<{op1_index_size0}> = 0..{op1_size0}) {{
-                  for (let j: ubit<{op1_index_size1}> = 0..{op1_size1}) {{
-                    for (let k: ubit<{op1_index_size2}> = 0..{op1_size2}) {{
-                      for (let l: ubit<{op1_index_size3}> = 0..{op1_size3}) {{
-                        if ({op1.name}[i][j][k][l] > zero) {{
-                          {res.name}[i][j][k][l] := {op1.name}[i][j][k][l];
-                        }} else {{
-                          {res.name}[i][j][k][l] := 0;
-                        }}
-                      }} 
-                    }}
-                  }}
-                }}
-                """
-        return lower_dahlia_program(program, declaration.component_name)
+    body = f"""if ({op1.name}{indices} > zero) {{ {res.name}{indices} := {op1.name}{indices}; }} 
+        else {{ {res.name}{indices} := 0; }}"""
+    loops = pp_dahlia_loop(op1, body)
+    return lower_dahlia_program(f"""{declarations}{NEWL}{let_zero}{NEWL}{loops}""", declaration.component_name)
 
 
 # TODO(cgyurgyik): Similar to ReLU, this requires signed operands.
@@ -300,8 +199,7 @@ def negative(declaration):
     op1, res = declaration.inputs[0].primitive, declaration.output.primitive
     bitwidth, size, index_size = op1.data[0], op1.data[1], op1.data[2]
     program = f"""
-        decl {op1.name}: {op1.data_type}<{bitwidth}>[{size}];
-        decl {res.name}: {res.data_type}<{bitwidth}>[{size}];
+        {pp_dahlia_memory_declarations([res, op1])}
         for (let i: ubit<{index_size}> = 0..{size}) {{
           {res.name}[i] := -{op1.name}[i];
         }}
@@ -318,8 +216,7 @@ def expand_dims(declaration):
     index_size0, index_size1, index_size2 = res.data[4], res.data[5], res.data[6]
     if axis == 1 and num_newaxis == 2:
         program = f"""
-        decl {data.name}: {data.data_type}<{bitwidth}>[{size}];
-        decl {res.name}: {res.data_type}<{bitwidth}>[{size0}][{size1}][{size2}];
+        {pp_dahlia_memory_declarations([res, data])}
         for (let i: ubit<{index_size}> = 0..{size}) {{
           {res.name}[i][0][0] := {data.name}[i];
         }}
@@ -340,9 +237,7 @@ def batch_matmul(declaration):
     #    * This third step may not be necessary, but trying to conduct the matrix multiply
     #      directly with the return value declared resulted in incorrect outputs.
     program = f"""
-    decl {op1.name}: {op1.data_type}<{bitwidth}>[{M1_size0}][{M1_size1}][{M1_size2}];
-    decl {op2.name}: {op2.data_type}<{bitwidth}>[{M2_size0}][{M2_size1}][{M2_size2}];
-    decl {res.name}: {res.data_type}<{bitwidth}>[{M1_size0}][{M1_size1}][{M2_size1}];
+    {pp_dahlia_memory_declarations([res, op1, op2])}
     let transpose_{op2.name}: {op2.data_type}<{bitwidth}>[{M2_size0}][{M2_size2}][{M2_size1}];
     let temporary_{res.name}: {res.data_type}<{bitwidth}>[{M1_size0}][{M1_size1}][{M2_size1}];
     for (let batch: ubit<{M1_index_size0}> = 0..{M1_size0}) {{
@@ -387,9 +282,7 @@ def dense(declaration):
     M1_index_size0, M1_index_size1 = op1.data[3], op1.data[4]
     M2_size0, M2_size1, M2_index_size0, M2_index_size1 = op2.data[1], op2.data[2], op2.data[3], op2.data[4]
     program = f"""
-    decl {op1.name}: {op1.data_type}<{bitwidth}>[{M1_size0}][{M1_size1}];
-    decl {op2.name}: {op2.data_type}<{bitwidth}>[{M2_size0}][{M2_size1}];
-    decl {res.name}: {res.data_type}<{bitwidth}>[{M1_size0}][{M2_size0}];
+    {pp_dahlia_memory_declarations([res, op1, op2])}
     let transpose_{op2.name}: {op2.data_type}<{bitwidth}>[{M2_size1}][{M2_size0}];
     let temporary_{res.name}: {res.data_type}<{bitwidth}>[{M1_size0}][{M2_size0}];
     for (let i: ubit<{M2_index_size0}> = 0..{M2_size0}) {{
diff --git a/frontends/relay-futil/pretty_print.py b/frontends/relay-futil/pretty_print.py
index f711e78546..7a57e6e6b7 100644
--- a/frontends/relay-futil/pretty_print.py
+++ b/frontends/relay-futil/pretty_print.py
@@ -105,3 +105,54 @@ def pp_cell(cell: FCell):
         return f'{cell.declaration.name} = {cell.declaration.component.name};'
     elif cell.is_dahlia_declaration():
         return f'{cell.dahlia_declaration.decl_name} = {cell.dahlia_declaration.component_name};'
+
+
+# Dahlia Pretty Printing.
+
+def next_character(ch, dir=1):
+    """
+    Returns the character adjacent to 'ch'.
+    If dir is positive, this returns 'ch' + dir. Otherwise, it returns 'ch' - 1 (the magnitude of dir is ignored).
+    """
+    return chr(ord(ch) + dir) if dir > 0 else chr(ord(ch) - 1)
+
+
+def pp_dahlia_memory_declarations(declaration_list):
+    declarations = []
+    for decl in declaration_list:
+        decl_string = f'decl {decl.name}: {decl.data_type}<{decl.data[0]}>'
+        for i in range(0, decl.type): decl_string += f'[{decl.data[i + 1]}]'
+        declarations.append(f'{decl_string};')
+    return '\n'.join(declarations)
+
+
+def pp_dahlia_loop(data, body):
+    """
+    Returns an iteration over data with `body` as the work done within the nested loop(s).
+    Many tensor functions share the same control flow: (1) Iterate over `data`, and (2) do some work in body.
+    For example, if `data` is a 2D primitive of size (M, N) and body == `BODY;`, then this will return:
+
+    ```
+    for (let i: ubit<X> = 0..M) {
+      for (let j: ubit<Y> = 0..N) {
+        BODY;
+      }
+    }
+    ```
+    """
+    variable_name = chr(ord('i'))
+    num_dimensions = data.type
+
+    program = []
+    SPACING = ''
+    for i in range(0, num_dimensions):
+        size, index_size = data.data[i + 1], data.data[i + num_dimensions + 1]
+        program.append(f'{SPACING}for (let {variable_name}: ubit<{index_size}> = 0..{size}) {{')
+        variable_name = next_character(variable_name)
+        SPACING += '  '
+    program.append(f'{SPACING}{body}')
+
+    for i in range(0, num_dimensions):
+        SPACING = SPACING[:-2]
+        program.append(f'{SPACING}}}')
+    return '\n'.join(program)

From 1b186732f4db39f295c2aec321aaca5d1ab361a4 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Tue, 24 Nov 2020 08:14:55 -0500
Subject: [PATCH 48/75] Fix comment.

---
 frontends/relay-futil/dahlia_functions.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 6b0e7496a9..c9e5077c22 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -89,8 +89,10 @@ def broadcast(declaration):
     for i in reversed(range(0, res_dims)): res_sizes.append(res.data[i + 1])
 
     op1_indices, op2_indices, res_indices = [], [], []
-    # Get the character associated with 'i' + N, where N == number of dimensions in `op1`.
+    # Gets the last variable name since we will compare sizes in the reverse direction.
     variable_name = chr(ord(CHARACTER_I) + op1_dims - 1)
+    # Determine the value at the N'th index. This will either be `[x]` or `[0]`
+    # depending on the relationship between the dimensions sizes.
     for i in range(0, len(res_sizes)):
         current_dimension, index_zero = f'[{variable_name}]', '[0]'
         res_indices.append(current_dimension)

From 01bfe7411694db81977253fe0151d84eb381aa87 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Tue, 24 Nov 2020 10:58:35 -0500
Subject: [PATCH 49/75] Generalize functions.

---
 frontends/relay-futil/dahlia_functions.py    | 90 +++++++++++++-------
 frontends/relay-futil/tests/broadcast.expect | 31 +++----
 frontends/relay-futil/tests/broadcast.relay  |  2 +-
 3 files changed, 73 insertions(+), 50 deletions(-)

diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index c9e5077c22..7ce89e0d44 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -76,10 +76,7 @@ def broadcast(declaration):
               result[i][j][k] := op1[i][0][k] + op2[j][0];
               ...
     """
-    operand1, operand2 = declaration.inputs[0].primitive, declaration.inputs[1].primitive
-    res = declaration.output.primitive
-    op1 = operand1 if operand1.type >= operand2.type else operand2
-    op2 = operand2 if op1 == operand1 else operand1
+    op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
 
     op1_dims, op2_dims, res_dims = op1.type, op2.type, res.type
     op1_sizes, op2_sizes, res_sizes = [], [], []
@@ -88,18 +85,22 @@ def broadcast(declaration):
     for i in reversed(range(0, op2_dims)): op2_sizes.append(op2.data[i + 1])
     for i in reversed(range(0, res_dims)): res_sizes.append(res.data[i + 1])
 
-    op1_indices, op2_indices, res_indices = [], [], []
     # Gets the last variable name since we will compare sizes in the reverse direction.
-    variable_name = chr(ord(CHARACTER_I) + op1_dims - 1)
-    # Determine the value at the N'th index. This will either be `[x]` or `[0]`
+    variable_name = chr(ord(CHARACTER_I) + res_dims - 1)
+    # Determine the value at the indices in reverse order.
+    # For each dimension, this will either be `[x]` for index_variable `x`, or `[0]`
     # depending on the relationship between the dimensions sizes.
+    op1_indices, op2_indices, res_indices = [], [], []
     for i in range(0, len(res_sizes)):
         current_dimension, index_zero = f'[{variable_name}]', '[0]'
         res_indices.append(current_dimension)
-        if len(op2_sizes) <= i:
+        if op1_dims > op2_dims and len(op2_sizes) <= i:
             op1_indices.append(current_dimension)
             continue
-        elif op1_sizes[i] == op2_sizes[i]:
+        if op2_dims > op1_dims and len(op1_sizes) <= i:
+            op2_indices.append(current_dimension)
+            continue
+        if op1_sizes[i] == op2_sizes[i]:
             op1_indices.append(current_dimension)
             op2_indices.append(current_dimension)
         elif op1_sizes[i] > op2_sizes[i]:
@@ -110,8 +111,9 @@ def broadcast(declaration):
             op2_indices.append(current_dimension)
         variable_name = next_character(variable_name, -1)
 
-    # Resulting index in the nested for loop, e.g. for op1[i][j][0][k], this is `[i][j][0][k]`.
-    op1_index, op2_index = ''.join(reversed(op1_indices)), ''.join(reversed(op2_indices))
+    # Resulting index in the nested for loop, e.g. for `op1[i][j][0][k]`, this is `[i][j][0][k]`.
+    op1_index = ''.join(reversed(op1_indices))
+    op2_index = ''.join(reversed(op2_indices))
     res_index = ''.join(reversed(res_indices))
     loop_body = f'{res.name}{res_index} := {op1.name}{op1_index} {declaration.op} {op2.name}{op2_index};'
 
@@ -139,9 +141,9 @@ def batch_flatten(declaration):
 
     declarations = pp_dahlia_memory_declarations([data, res])
     let_flattened = f'let {variable_name}: ubit<{res_index_size1}> = 0;'
-    body = (f"{res.name}{res_indices} := {data.name}{data_indices}; {variable_name} := {variable_name} + 1;")
-    loops = pp_dahlia_loop(data, body)
-    program = f"""{declarations}{NEWL}{let_flattened}{NEWL}{loops}"""
+    body = f"{res.name}{res_indices} := {data.name}{data_indices}; {variable_name} := {variable_name} + 1;"
+    program_body = pp_dahlia_loop(data, body)
+    program = f"""{declarations}{NEWL}{let_flattened}{NEWL}{program_body}"""
     return lower_dahlia_program(program, declaration.component_name)
 
 
@@ -165,8 +167,8 @@ def bias_add(declaration):
 
     declarations = pp_dahlia_memory_declarations([data, bias, res])
     body = (f"{res.name}{data_indices} := {data.name}{data_indices} + {bias.name}{bias_index};")
-    loops = pp_dahlia_loop(data, body)
-    return lower_dahlia_program(f"""{declarations}{NEWL}{loops}""", declaration.component_name)
+    program_body = pp_dahlia_loop(data, body)
+    return lower_dahlia_program(f"""{declarations}{NEWL}{program_body}""", declaration.component_name)
 
 
 # TODO(cgyurgyik):
@@ -175,12 +177,12 @@ def bias_add(declaration):
 #  2. Without signed bit array support, this is also meaningless.
 def relu(declaration):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.relu"""
-    op1, res = declaration.inputs[0].primitive, declaration.output.primitive
-    bitwidth, num_dimensions = op1.data[0], op1.type
+    data, res = declaration.inputs[0].primitive, declaration.output.primitive
+    bitwidth, num_dimensions = data.data[0], data.type
     assert res.data_type == 'ubit', f'{res.data_type} is not currently supported for ReLU.'
 
-    let_zero = f'let zero: {op1.data_type}<{bitwidth}> = 0;'
-    declarations = pp_dahlia_memory_declarations([op1, res])
+    declarations = pp_dahlia_memory_declarations([data, res])
+    let_zero = f'let zero: {data.data_type}<{bitwidth}> = 0;'
 
     indices = ""
     variable_name = CHARACTER_I
@@ -189,33 +191,57 @@ def relu(declaration):
         indices += f'[{variable_name}]'
         variable_name = next_character(variable_name)
 
-    body = f"""if ({op1.name}{indices} > zero) {{ {res.name}{indices} := {op1.name}{indices}; }} 
+    body = f"""if ({data.name}{indices} > zero) {{ {res.name}{indices} := {data.name}{indices}; }} 
         else {{ {res.name}{indices} := 0; }}"""
-    loops = pp_dahlia_loop(op1, body)
-    return lower_dahlia_program(f"""{declarations}{NEWL}{let_zero}{NEWL}{loops}""", declaration.component_name)
+    program_body = pp_dahlia_loop(data, body)
+    return lower_dahlia_program(f"""{declarations}{NEWL}{let_zero}{NEWL}{program_body}""", declaration.component_name)
 
 
 # TODO(cgyurgyik): Similar to ReLU, this requires signed operands.
 def negative(declaration):
     """https://tvm.apache.org/docs/api/python/relay/index.html#tvm.relay.negative"""
-    op1, res = declaration.inputs[0].primitive, declaration.output.primitive
-    bitwidth, size, index_size = op1.data[0], op1.data[1], op1.data[2]
-    program = f"""
-        {pp_dahlia_memory_declarations([res, op1])}
-        for (let i: ubit<{index_size}> = 0..{size}) {{
-          {res.name}[i] := -{op1.name}[i];
-        }}
-    """
-    return lower_dahlia_program(program, declaration.component_name)
+    op, res = declaration.inputs[0].primitive, declaration.output.primitive
+    bitwidth, num_dimensions = op.data[0], op.type
+
+    indices = ""
+    variable_name = CHARACTER_I
+    for i in range(0, num_dimensions):
+        # Determine loop body indices.
+        indices += f'[{variable_name}]'
+        variable_name = next_character(variable_name)
+
+    declarations = pp_dahlia_memory_declarations([op, res])
+    program_body = pp_dahlia_loop(op, f"""{res.name}{indices} := -{op.name}{indices};""")
+    return lower_dahlia_program(f"""{declarations}{NEWL}{program_body}""", declaration.component_name)
 
 
 def expand_dims(declaration):
     """https://tvm.apache.org/docs/api/python/relay/index.html#tvm.relay.expand_dims"""
     axis, num_newaxis = declaration.attributes.get_int("axis"), declaration.attributes.get_int("num_newaxis")
     data, res = declaration.inputs[0].primitive, declaration.output.primitive
+    bitwidth, num_dimensions = data.data[0], data.type
+
+    declarations = pp_dahlia_memory_declarations([data, res])
+
+    res_indices, data_indices = "", ""
+    variable_name = CHARACTER_I
+    for i in range(0, num_dimensions):
+        # Determine loop body indices.
+        index = f'[{variable_name}]'
+        res_indices += index
+        data_indices += index
+        if axis == i + 1:
+            for _ in range(0, num_newaxis): res_indices += '[0]'
+        variable_name = next_character(variable_name)
+
+    program_body = pp_dahlia_loop(data, f'{res.name}{res_indices} := {data.name}{data_indices}')
+    program = f"""{declarations}{NEWL}{program_body}"""
+    return lower_dahlia_program(program, declaration.component_name)
+
     bitwidth, size, index_size = data.data[0], data.data[1], data.data[2]
     size0, size1, size2 = res.data[1], res.data[2], res.data[3]
     index_size0, index_size1, index_size2 = res.data[4], res.data[5], res.data[6]
+
     if axis == 1 and num_newaxis == 2:
         program = f"""
         {pp_dahlia_memory_declarations([res, data])}
diff --git a/frontends/relay-futil/tests/broadcast.expect b/frontends/relay-futil/tests/broadcast.expect
index 9527534871..84f5962b54 100644
--- a/frontends/relay-futil/tests/broadcast.expect
+++ b/frontends/relay-futil/tests/broadcast.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component add(go: 1, clk: 1, x10_0_0_read_data: 32, x10_0_0_done: 1, x20_0_0_read_data: 32, x20_0_0_done: 1, x30_0_0_read_data: 32, x30_0_0_done: 1) -> (done: 1, x10_0_0_addr0: 1, x10_0_0_addr1: 2, x10_0_0_addr2: 2, x10_0_0_write_data: 32, x10_0_0_write_en: 1, x10_0_0_clk: 1, x20_0_0_addr0: 2, x20_0_0_addr1: 1, x20_0_0_addr2: 1, x20_0_0_write_data: 32, x20_0_0_write_en: 1, x20_0_0_clk: 1, x30_0_0_addr0: 2, x30_0_0_addr1: 2, x30_0_0_addr2: 2, x30_0_0_write_data: 32, x30_0_0_write_en: 1, x30_0_0_clk: 1) {
+component add(go: 1, clk: 1, x10_0_read_data: 32, x10_0_done: 1, x20_0_0_read_data: 32, x20_0_0_done: 1, x30_0_0_read_data: 32, x30_0_0_done: 1) -> (done: 1, x10_0_addr0: 2, x10_0_addr1: 2, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1, x20_0_0_addr0: 2, x20_0_0_addr1: 1, x20_0_0_addr2: 1, x20_0_0_write_data: 32, x20_0_0_write_en: 1, x20_0_0_clk: 1, x30_0_0_addr0: 2, x30_0_0_addr1: 2, x30_0_0_addr2: 2, x30_0_0_write_data: 32, x30_0_0_write_en: 1, x30_0_0_clk: 1) {
   cells {
     add0 = prim std_add(32);
     add1 = prim std_add(2);
@@ -9,14 +9,13 @@ component add(go: 1, clk: 1, x10_0_0_read_data: 32, x10_0_0_done: 1, x20_0_0_rea
     const0 = prim std_const(2, 0);
     const1 = prim std_const(2, 1);
     const10 = prim std_const(2, 1);
-    const11 = prim std_const(2, 1);
     const2 = prim std_const(2, 0);
     const3 = prim std_const(2, 1);
     const4 = prim std_const(2, 0);
     const5 = prim std_const(2, 1);
     const6 = prim std_const(1, 0);
     const7 = prim std_const(1, 0);
-    const8 = prim std_const(1, 0);
+    const8 = prim std_const(2, 1);
     const9 = prim std_const(2, 1);
     i0 = prim std_reg(2);
     j0 = prim std_reg(2);
@@ -60,16 +59,15 @@ component add(go: 1, clk: 1, x10_0_0_read_data: 32, x10_0_0_done: 1, x20_0_0_rea
     }
     group upd0<"static"=1> {
       x1_read0_0.write_en = 1'd1;
-      x10_0_0_addr2 = k0.out;
-      x10_0_0_addr1 = j0.out;
-      x10_0_0_addr0 = const6.out;
-      x1_read0_0.in = 1'd1 ? x10_0_0_read_data;
+      x10_0_addr1 = k0.out;
+      x10_0_addr0 = j0.out;
+      x1_read0_0.in = 1'd1 ? x10_0_read_data;
       upd0[done] = x1_read0_0.done ? 1'd1;
     }
     group upd1<"static"=1> {
       x2_read0_0.write_en = 1'd1;
-      x20_0_0_addr2 = const8.out;
-      x20_0_0_addr1 = const7.out;
+      x20_0_0_addr2 = const7.out;
+      x20_0_0_addr1 = const6.out;
       x20_0_0_addr0 = i0.out;
       x2_read0_0.in = 1'd1 ? x20_0_0_read_data;
       upd1[done] = x2_read0_0.done ? 1'd1;
@@ -87,21 +85,21 @@ component add(go: 1, clk: 1, x10_0_0_read_data: 32, x10_0_0_done: 1, x20_0_0_rea
     group upd3<"static"=1> {
       k0.write_en = 1'd1;
       add1.left = k0.out;
-      add1.right = const9.out;
+      add1.right = const8.out;
       k0.in = 1'd1 ? add1.out;
       upd3[done] = k0.done ? 1'd1;
     }
     group upd4<"static"=1> {
       j0.write_en = 1'd1;
       add2.left = j0.out;
-      add2.right = const10.out;
+      add2.right = const9.out;
       j0.in = 1'd1 ? add2.out;
       upd4[done] = j0.done ? 1'd1;
     }
     group upd5<"static"=1> {
       i0.write_en = 1'd1;
       add3.left = i0.out;
-      add3.right = const11.out;
+      add3.right = const10.out;
       i0.in = 1'd1 ? add3.out;
       upd5[done] = i0.done ? 1'd1;
     }
@@ -139,16 +137,15 @@ component add(go: 1, clk: 1, x10_0_0_read_data: 32, x10_0_0_done: 1, x20_0_0_rea
 component main () -> () {
   cells {
     x3 = prim std_mem_d3(32, 2, 2, 2, 2, 2, 2);
-    x1 = prim std_mem_d3(32, 1, 2, 2, 1, 2, 2);
+    x1 = prim std_mem_d2(32, 2, 2, 2, 2);
     x2 = prim std_mem_d3(32, 2, 1, 1, 2, 1, 1);
     add0 = add;
   }
   wires {
     group run_add {
-      x1.addr0 = add0.x10_0_0_addr0;
-      add0.x10_0_0_read_data = x1.read_data;
-      x1.addr1 = add0.x10_0_0_addr1;
-      x1.addr2 = add0.x10_0_0_addr2;
+      x1.addr0 = add0.x10_0_addr0;
+      add0.x10_0_read_data = x1.read_data;
+      x1.addr1 = add0.x10_0_addr1;
       x2.addr0 = add0.x20_0_0_addr0;
       add0.x20_0_0_read_data = x2.read_data;
       x2.addr1 = add0.x20_0_0_addr1;
diff --git a/frontends/relay-futil/tests/broadcast.relay b/frontends/relay-futil/tests/broadcast.relay
index bacd708118..9dfdf5d721 100644
--- a/frontends/relay-futil/tests/broadcast.relay
+++ b/frontends/relay-futil/tests/broadcast.relay
@@ -1,5 +1,5 @@
 v0.0.4
-fn (%x1: Tensor[(1, 2, 2), int32], %x2: Tensor[(2, 1, 1), int32]) {
+fn (%x1: Tensor[(2, 2), int32], %x2: Tensor[(2, 1, 1), int32]) {
   let %x3 = add(%x1, %x2);
   %x3
 }

From 9f3a75661f5ad7fbb4b11076155a5ac3e7ce9378 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Tue, 24 Nov 2020 11:44:49 -0500
Subject: [PATCH 50/75] Fix batch_matmul.

---
 frontends/relay-futil/dahlia_functions.py | 25 ++---------------------
 1 file changed, 2 insertions(+), 23 deletions(-)

diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 7ce89e0d44..084524618b 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -48,14 +48,6 @@ def lower_dahlia_program(prog, component_name):
         return component
 
 
-def next_character(ch, dir=1):
-    """
-    Returns the next character after 'ch'.
-    If dir is positive, then will return 'ch' + 1. Otherwise, it will return 'ch' - 1.
-    """
-    return chr(ord(ch) + dir) if dir > 0 else chr(ord(ch) - 1)
-
-
 def broadcast(declaration):
     """
     https://numpy.org/doc/stable/user/basics.broadcasting.html
@@ -238,19 +230,6 @@ def expand_dims(declaration):
     program = f"""{declarations}{NEWL}{program_body}"""
     return lower_dahlia_program(program, declaration.component_name)
 
-    bitwidth, size, index_size = data.data[0], data.data[1], data.data[2]
-    size0, size1, size2 = res.data[1], res.data[2], res.data[3]
-    index_size0, index_size1, index_size2 = res.data[4], res.data[5], res.data[6]
-
-    if axis == 1 and num_newaxis == 2:
-        program = f"""
-        {pp_dahlia_memory_declarations([res, data])}
-        for (let i: ubit<{index_size}> = 0..{size}) {{
-          {res.name}[i][0][0] := {data.name}[i];
-        }}
-        """
-    return lower_dahlia_program(program, declaration.component_name)
-
 
 def batch_matmul(declaration):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.batch_matmul"""
@@ -264,8 +243,8 @@ def batch_matmul(declaration):
     # 3. Copy temporary value to return value.*
     #    * This third step may not be necessary, but trying to conduct the matrix multiply
     #      directly with the return value declared resulted in incorrect outputs.
-    program = f"""
-    {pp_dahlia_memory_declarations([res, op1, op2])}
+    declarations = pp_dahlia_memory_declarations([res, op1, op2])
+    program = f"""{declarations}
     let transpose_{op2.name}: {op2.data_type}<{bitwidth}>[{M2_size0}][{M2_size2}][{M2_size1}];
     let temporary_{res.name}: {res.data_type}<{bitwidth}>[{M1_size0}][{M1_size1}][{M2_size1}];
     for (let batch: ubit<{M1_index_size0}> = 0..{M1_size0}) {{

From 05a3935eaeba00ea16c0517ddbe72bc441b82b15 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Tue, 24 Nov 2020 14:02:51 -0500
Subject: [PATCH 51/75] Add mlp_net (incomplete example).

---
 frontends/relay-futil/tests/mlp_net.expect | 1781 ++++++++++++++++++++
 frontends/relay-futil/tests/mlp_net.relay  |   17 +
 2 files changed, 1798 insertions(+)
 create mode 100644 frontends/relay-futil/tests/mlp_net.expect
 create mode 100644 frontends/relay-futil/tests/mlp_net.relay

diff --git a/frontends/relay-futil/tests/mlp_net.expect b/frontends/relay-futil/tests/mlp_net.expect
new file mode 100644
index 0000000000..ea30df19af
--- /dev/null
+++ b/frontends/relay-futil/tests/mlp_net.expect
@@ -0,0 +1,1781 @@
+import "primitives/std.lib";
+
+component bias_add2(go: 1, clk: 1, fc3_bias0_read_data: 32, fc3_bias0_done: 1, x70_0_read_data: 32, x70_0_done: 1, x80_0_read_data: 32, x80_0_done: 1) -> (done: 1, fc3_bias0_addr0: 4, fc3_bias0_write_data: 32, fc3_bias0_write_en: 1, fc3_bias0_clk: 1, x70_0_addr0: 1, x70_0_addr1: 4, x70_0_write_data: 32, x70_0_write_en: 1, x70_0_clk: 1, x80_0_addr0: 1, x80_0_addr1: 4, x80_0_write_data: 32, x80_0_write_en: 1, x80_0_clk: 1) {
+  cells {
+    add0 = prim std_add(32);
+    add1 = prim std_add(4);
+    add2 = prim std_add(1);
+    const0 = prim std_const(1, 0);
+    const1 = prim std_const(1, 0);
+    const2 = prim std_const(4, 0);
+    const3 = prim std_const(4, 9);
+    const4 = prim std_const(4, 1);
+    const5 = prim std_const(1, 1);
+    fc3_bias_read0_0 = prim std_reg(32);
+    i0 = prim std_reg(1);
+    j0 = prim std_reg(4);
+    le0 = prim std_le(1);
+    le1 = prim std_le(4);
+    x7_read0_0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = j0.out;
+      le1.right = const3.out;
+    }
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
+    }
+    group let1<"static"=1> {
+      j0.in = const2.out;
+      j0.write_en = 1'd1;
+      let1[done] = j0.done;
+    }
+    group upd0<"static"=1> {
+      x7_read0_0.write_en = 1'd1;
+      x70_0_addr1 = j0.out;
+      x70_0_addr0 = i0.out;
+      x7_read0_0.in = 1'd1 ? x70_0_read_data;
+      upd0[done] = x7_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      fc3_bias_read0_0.write_en = 1'd1;
+      fc3_bias0_addr0 = j0.out;
+      fc3_bias_read0_0.in = 1'd1 ? fc3_bias0_read_data;
+      upd1[done] = fc3_bias_read0_0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      x80_0_addr1 = j0.out;
+      x80_0_addr0 = i0.out;
+      x80_0_write_en = 1'd1;
+      add0.left = x7_read0_0.out;
+      add0.right = fc3_bias_read0_0.out;
+      x80_0_write_data = 1'd1 ? add0.out;
+      upd2[done] = x80_0_done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      j0.write_en = 1'd1;
+      add1.left = j0.out;
+      add1.right = const4.out;
+      j0.in = 1'd1 ? add1.out;
+      upd3[done] = j0.done ? 1'd1;
+    }
+    group upd4<"static"=1> {
+      i0.write_en = 1'd1;
+      add2.left = i0.out;
+      add2.right = const5.out;
+      i0.in = 1'd1 ? add2.out;
+      upd4[done] = i0.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      while le0.out with cond0 {
+        seq {
+          let1;
+          while le1.out with cond1 {
+            seq {
+              par {
+                upd0;
+                upd1;
+              }
+              upd2;
+              upd3;
+            }
+          }
+          upd4;
+        }
+      }
+    }
+  }
+}
+component dense2(go: 1, clk: 1, fc3_weight0_0_read_data: 32, fc3_weight0_0_done: 1, x60_0_read_data: 32, x60_0_done: 1, x70_0_read_data: 32, x70_0_done: 1) -> (done: 1, fc3_weight0_0_addr0: 4, fc3_weight0_0_addr1: 7, fc3_weight0_0_write_data: 32, fc3_weight0_0_write_en: 1, fc3_weight0_0_clk: 1, x60_0_addr0: 1, x60_0_addr1: 7, x60_0_write_data: 32, x60_0_write_en: 1, x60_0_clk: 1, x70_0_addr0: 1, x70_0_addr1: 4, x70_0_write_data: 32, x70_0_write_en: 1, x70_0_clk: 1) {
+  cells {
+    add0 = prim std_add(7);
+    add1 = prim std_add(4);
+    add2 = prim std_add(32);
+    add3 = prim std_add(7);
+    add4 = prim std_add(4);
+    add5 = prim std_add(1);
+    add6 = prim std_add(4);
+    add7 = prim std_add(1);
+    bin_read0_0 = prim std_reg(32);
+    const0 = prim std_const(4, 0);
+    const1 = prim std_const(4, 9);
+    const10 = prim std_const(7, 0);
+    const11 = prim std_const(7, 63);
+    const12 = prim std_const(7, 1);
+    const13 = prim std_const(4, 1);
+    const14 = prim std_const(1, 1);
+    const15 = prim std_const(1, 0);
+    const16 = prim std_const(1, 0);
+    const17 = prim std_const(4, 0);
+    const18 = prim std_const(4, 9);
+    const19 = prim std_const(4, 1);
+    const2 = prim std_const(7, 0);
+    const20 = prim std_const(1, 1);
+    const3 = prim std_const(7, 63);
+    const4 = prim std_const(7, 1);
+    const5 = prim std_const(4, 1);
+    const6 = prim std_const(1, 0);
+    const7 = prim std_const(1, 0);
+    const8 = prim std_const(4, 0);
+    const9 = prim std_const(4, 9);
+    fc3_weight_read0_0 = prim std_reg(32);
+    i0 = prim std_reg(4);
+    i1 = prim std_reg(1);
+    i2 = prim std_reg(1);
+    j0 = prim std_reg(7);
+    j1 = prim std_reg(4);
+    j2 = prim std_reg(4);
+    k0 = prim std_reg(7);
+    le0 = prim std_le(4);
+    le1 = prim std_le(7);
+    le2 = prim std_le(1);
+    le3 = prim std_le(4);
+    le4 = prim std_le(7);
+    le5 = prim std_le(1);
+    le6 = prim std_le(4);
+    mult_pipe0 = prim std_mult_pipe(32);
+    product_0 = prim std_reg(32);
+    temporary_x70_0 = prim std_mem_d2(32, 1, 10, 1, 4);
+    temporary_x7_read0_0 = prim std_reg(32);
+    transpose_fc3_weight0_0 = prim std_mem_d2(32, 64, 10, 7, 4);
+    transpose_fc3_weight_read0_0 = prim std_reg(32);
+    x6_read0_0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = j0.out;
+      le1.right = const3.out;
+    }
+    group cond2<"static"=0> {
+      cond2[done] = 1'd1;
+      le2.left = i1.out;
+      le2.right = const7.out;
+    }
+    group cond3<"static"=0> {
+      cond3[done] = 1'd1;
+      le3.left = j1.out;
+      le3.right = const9.out;
+    }
+    group cond4<"static"=0> {
+      cond4[done] = 1'd1;
+      le4.left = k0.out;
+      le4.right = const11.out;
+    }
+    group cond5<"static"=0> {
+      cond5[done] = 1'd1;
+      le5.left = i2.out;
+      le5.right = const16.out;
+    }
+    group cond6<"static"=0> {
+      cond6[done] = 1'd1;
+      le6.left = j2.out;
+      le6.right = const18.out;
+    }
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
+    }
+    group let1<"static"=1> {
+      j0.in = const2.out;
+      j0.write_en = 1'd1;
+      let1[done] = j0.done;
+    }
+    group let2<"static"=1> {
+      i1.in = const6.out;
+      i1.write_en = 1'd1;
+      let2[done] = i1.done;
+    }
+    group let3<"static"=1> {
+      j1.in = const8.out;
+      j1.write_en = 1'd1;
+      let3[done] = j1.done;
+    }
+    group let4<"static"=1> {
+      k0.in = const10.out;
+      k0.write_en = 1'd1;
+      let4[done] = k0.done;
+    }
+    group let5<"static"=4> {
+      bin_read0_0.in = mult_pipe0.out;
+      bin_read0_0.write_en = mult_pipe0.done;
+      let5[done] = bin_read0_0.done;
+      mult_pipe0.left = x6_read0_0.out;
+      mult_pipe0.right = transpose_fc3_weight_read0_0.out;
+      mult_pipe0.go = !mult_pipe0.done ? 1'd1;
+    }
+    group let6<"static"=1> {
+      product_0.in = bin_read0_0.out;
+      product_0.write_en = 1'd1;
+      let6[done] = product_0.done;
+    }
+    group let7<"static"=1> {
+      i2.in = const15.out;
+      i2.write_en = 1'd1;
+      let7[done] = i2.done;
+    }
+    group let8<"static"=1> {
+      j2.in = const17.out;
+      j2.write_en = 1'd1;
+      let8[done] = j2.done;
+    }
+    group upd0<"static"=1> {
+      fc3_weight_read0_0.write_en = 1'd1;
+      fc3_weight0_0_addr1 = j0.out;
+      fc3_weight0_0_addr0 = i0.out;
+      fc3_weight_read0_0.in = 1'd1 ? fc3_weight0_0_read_data;
+      upd0[done] = fc3_weight_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      transpose_fc3_weight0_0.addr1 = i0.out;
+      transpose_fc3_weight0_0.addr0 = j0.out;
+      transpose_fc3_weight0_0.write_en = 1'd1;
+      transpose_fc3_weight0_0.write_data = 1'd1 ? fc3_weight_read0_0.out;
+      upd1[done] = transpose_fc3_weight0_0.done ? 1'd1;
+    }
+    group upd10<"static"=1> {
+      temporary_x7_read0_0.write_en = 1'd1;
+      temporary_x70_0.addr1 = j2.out;
+      temporary_x70_0.addr0 = i2.out;
+      temporary_x7_read0_0.in = 1'd1 ? temporary_x70_0.read_data;
+      upd10[done] = temporary_x7_read0_0.done ? 1'd1;
+    }
+    group upd11<"static"=1> {
+      x70_0_addr1 = j2.out;
+      x70_0_addr0 = i2.out;
+      x70_0_write_en = 1'd1;
+      x70_0_write_data = 1'd1 ? temporary_x7_read0_0.out;
+      upd11[done] = x70_0_done ? 1'd1;
+    }
+    group upd12<"static"=1> {
+      j2.write_en = 1'd1;
+      add6.left = j2.out;
+      add6.right = const19.out;
+      j2.in = 1'd1 ? add6.out;
+      upd12[done] = j2.done ? 1'd1;
+    }
+    group upd13<"static"=1> {
+      i2.write_en = 1'd1;
+      add7.left = i2.out;
+      add7.right = const20.out;
+      i2.in = 1'd1 ? add7.out;
+      upd13[done] = i2.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      j0.write_en = 1'd1;
+      add0.left = j0.out;
+      add0.right = const4.out;
+      j0.in = 1'd1 ? add0.out;
+      upd2[done] = j0.done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      i0.write_en = 1'd1;
+      add1.left = i0.out;
+      add1.right = const5.out;
+      i0.in = 1'd1 ? add1.out;
+      upd3[done] = i0.done ? 1'd1;
+    }
+    group upd4<"static"=1> {
+      x6_read0_0.write_en = 1'd1;
+      x60_0_addr1 = k0.out;
+      x60_0_addr0 = i1.out;
+      x6_read0_0.in = 1'd1 ? x60_0_read_data;
+      upd4[done] = x6_read0_0.done ? 1'd1;
+    }
+    group upd5<"static"=1> {
+      transpose_fc3_weight_read0_0.write_en = 1'd1;
+      transpose_fc3_weight0_0.addr1 = j1.out;
+      transpose_fc3_weight0_0.addr0 = k0.out;
+      transpose_fc3_weight_read0_0.in = 1'd1 ? transpose_fc3_weight0_0.read_data;
+      upd5[done] = transpose_fc3_weight_read0_0.done ? 1'd1;
+    }
+    group upd6<"static"=1> {
+      temporary_x70_0.addr1 = j1.out;
+      temporary_x70_0.addr0 = i1.out;
+      temporary_x70_0.write_en = 1'd1;
+      add2.left = temporary_x70_0.read_data;
+      add2.right = product_0.out;
+      temporary_x70_0.addr1 = j1.out;
+      temporary_x70_0.addr0 = i1.out;
+      temporary_x70_0.write_data = 1'd1 ? add2.out;
+      upd6[done] = temporary_x70_0.done ? 1'd1;
+    }
+    group upd7<"static"=1> {
+      k0.write_en = 1'd1;
+      add3.left = k0.out;
+      add3.right = const12.out;
+      k0.in = 1'd1 ? add3.out;
+      upd7[done] = k0.done ? 1'd1;
+    }
+    group upd8<"static"=1> {
+      j1.write_en = 1'd1;
+      add4.left = j1.out;
+      add4.right = const13.out;
+      j1.in = 1'd1 ? add4.out;
+      upd8[done] = j1.done ? 1'd1;
+    }
+    group upd9<"static"=1> {
+      i1.write_en = 1'd1;
+      add5.left = i1.out;
+      add5.right = const14.out;
+      i1.in = 1'd1 ? add5.out;
+      upd9[done] = i1.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      while le0.out with cond0 {
+        seq {
+          let1;
+          while le1.out with cond1 {
+            seq {
+              upd0;
+              upd1;
+              upd2;
+            }
+          }
+          upd3;
+        }
+      }
+      let2;
+      while le2.out with cond2 {
+        seq {
+          let3;
+          while le3.out with cond3 {
+            seq {
+              let4;
+              while le4.out with cond4 {
+                seq {
+                  par {
+                    upd4;
+                    upd5;
+                  }
+                  let5;
+                  let6;
+                  upd6;
+                  upd7;
+                }
+              }
+              upd8;
+            }
+          }
+          upd9;
+        }
+      }
+      let7;
+      while le5.out with cond5 {
+        seq {
+          let8;
+          while le6.out with cond6 {
+            seq {
+              upd10;
+              upd11;
+              upd12;
+            }
+          }
+          upd13;
+        }
+      }
+    }
+  }
+}
+component relu1(go: 1, clk: 1, x50_0_read_data: 32, x50_0_done: 1, x60_0_read_data: 32, x60_0_done: 1) -> (done: 1, x50_0_addr0: 1, x50_0_addr1: 7, x50_0_write_data: 32, x50_0_write_en: 1, x50_0_clk: 1, x60_0_addr0: 1, x60_0_addr1: 7, x60_0_write_data: 32, x60_0_write_en: 1, x60_0_clk: 1) {
+  cells {
+    add0 = prim std_add(7);
+    add1 = prim std_add(1);
+    const0 = prim std_const(32, 0);
+    const1 = prim std_const(1, 0);
+    const2 = prim std_const(1, 0);
+    const3 = prim std_const(7, 0);
+    const4 = prim std_const(7, 63);
+    const5 = prim std_const(32, 0);
+    const6 = prim std_const(7, 1);
+    const7 = prim std_const(1, 1);
+    gt0 = prim std_gt(32);
+    i0 = prim std_reg(1);
+    j0 = prim std_reg(7);
+    le0 = prim std_le(1);
+    le1 = prim std_le(7);
+    x5_read0_0 = prim std_reg(32);
+    x5_read1_0 = prim std_reg(32);
+    zero_0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const2.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = j0.out;
+      le1.right = const4.out;
+    }
+    group cond2<"static"=0> {
+      cond2[done] = 1'd1;
+      gt0.left = x5_read0_0.out;
+      gt0.right = zero_0.out;
+    }
+    group let0<"static"=1> {
+      zero_0.in = const0.out;
+      zero_0.write_en = 1'd1;
+      let0[done] = zero_0.done;
+    }
+    group let1<"static"=1> {
+      i0.in = const1.out;
+      i0.write_en = 1'd1;
+      let1[done] = i0.done;
+    }
+    group let2<"static"=1> {
+      j0.in = const3.out;
+      j0.write_en = 1'd1;
+      let2[done] = j0.done;
+    }
+    group upd0<"static"=1> {
+      x5_read0_0.write_en = 1'd1;
+      x50_0_addr1 = j0.out;
+      x50_0_addr0 = i0.out;
+      x5_read0_0.in = 1'd1 ? x50_0_read_data;
+      upd0[done] = x5_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      x5_read1_0.write_en = 1'd1;
+      x50_0_addr1 = j0.out;
+      x50_0_addr0 = i0.out;
+      x5_read1_0.in = 1'd1 ? x50_0_read_data;
+      upd1[done] = x5_read1_0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      x60_0_addr1 = j0.out;
+      x60_0_addr0 = i0.out;
+      x60_0_write_en = 1'd1;
+      x60_0_write_data = 1'd1 ? x5_read1_0.out;
+      upd2[done] = x60_0_done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      x60_0_addr1 = j0.out;
+      x60_0_addr0 = i0.out;
+      x60_0_write_en = 1'd1;
+      x60_0_write_data = 1'd1 ? const5.out;
+      upd3[done] = x60_0_done ? 1'd1;
+    }
+    group upd4<"static"=1> {
+      j0.write_en = 1'd1;
+      add0.left = j0.out;
+      add0.right = const6.out;
+      j0.in = 1'd1 ? add0.out;
+      upd4[done] = j0.done ? 1'd1;
+    }
+    group upd5<"static"=1> {
+      i0.write_en = 1'd1;
+      add1.left = i0.out;
+      add1.right = const7.out;
+      i0.in = 1'd1 ? add1.out;
+      upd5[done] = i0.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      let1;
+      while le0.out with cond0 {
+        seq {
+          let2;
+          while le1.out with cond1 {
+            seq {
+              upd0;
+              if gt0.out with cond2 {
+                seq {
+                  upd1;
+                  upd2;
+                }
+              } else {
+                upd3;
+              }
+              upd4;
+            }
+          }
+          upd5;
+        }
+      }
+    }
+  }
+}
+component bias_add1(go: 1, clk: 1, fc2_bias0_read_data: 32, fc2_bias0_done: 1, x40_0_read_data: 32, x40_0_done: 1, x50_0_read_data: 32, x50_0_done: 1) -> (done: 1, fc2_bias0_addr0: 7, fc2_bias0_write_data: 32, fc2_bias0_write_en: 1, fc2_bias0_clk: 1, x40_0_addr0: 1, x40_0_addr1: 7, x40_0_write_data: 32, x40_0_write_en: 1, x40_0_clk: 1, x50_0_addr0: 1, x50_0_addr1: 7, x50_0_write_data: 32, x50_0_write_en: 1, x50_0_clk: 1) {
+  cells {
+    add0 = prim std_add(32);
+    add1 = prim std_add(7);
+    add2 = prim std_add(1);
+    const0 = prim std_const(1, 0);
+    const1 = prim std_const(1, 0);
+    const2 = prim std_const(7, 0);
+    const3 = prim std_const(7, 63);
+    const4 = prim std_const(7, 1);
+    const5 = prim std_const(1, 1);
+    fc2_bias_read0_0 = prim std_reg(32);
+    i0 = prim std_reg(1);
+    j0 = prim std_reg(7);
+    le0 = prim std_le(1);
+    le1 = prim std_le(7);
+    x4_read0_0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = j0.out;
+      le1.right = const3.out;
+    }
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
+    }
+    group let1<"static"=1> {
+      j0.in = const2.out;
+      j0.write_en = 1'd1;
+      let1[done] = j0.done;
+    }
+    group upd0<"static"=1> {
+      x4_read0_0.write_en = 1'd1;
+      x40_0_addr1 = j0.out;
+      x40_0_addr0 = i0.out;
+      x4_read0_0.in = 1'd1 ? x40_0_read_data;
+      upd0[done] = x4_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      fc2_bias_read0_0.write_en = 1'd1;
+      fc2_bias0_addr0 = j0.out;
+      fc2_bias_read0_0.in = 1'd1 ? fc2_bias0_read_data;
+      upd1[done] = fc2_bias_read0_0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      x50_0_addr1 = j0.out;
+      x50_0_addr0 = i0.out;
+      x50_0_write_en = 1'd1;
+      add0.left = x4_read0_0.out;
+      add0.right = fc2_bias_read0_0.out;
+      x50_0_write_data = 1'd1 ? add0.out;
+      upd2[done] = x50_0_done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      j0.write_en = 1'd1;
+      add1.left = j0.out;
+      add1.right = const4.out;
+      j0.in = 1'd1 ? add1.out;
+      upd3[done] = j0.done ? 1'd1;
+    }
+    group upd4<"static"=1> {
+      i0.write_en = 1'd1;
+      add2.left = i0.out;
+      add2.right = const5.out;
+      i0.in = 1'd1 ? add2.out;
+      upd4[done] = i0.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      while le0.out with cond0 {
+        seq {
+          let1;
+          while le1.out with cond1 {
+            seq {
+              par {
+                upd0;
+                upd1;
+              }
+              upd2;
+              upd3;
+            }
+          }
+          upd4;
+        }
+      }
+    }
+  }
+}
+component dense1(go: 1, clk: 1, fc2_weight0_0_read_data: 32, fc2_weight0_0_done: 1, x30_0_read_data: 32, x30_0_done: 1, x40_0_read_data: 32, x40_0_done: 1) -> (done: 1, fc2_weight0_0_addr0: 7, fc2_weight0_0_addr1: 8, fc2_weight0_0_write_data: 32, fc2_weight0_0_write_en: 1, fc2_weight0_0_clk: 1, x30_0_addr0: 1, x30_0_addr1: 8, x30_0_write_data: 32, x30_0_write_en: 1, x30_0_clk: 1, x40_0_addr0: 1, x40_0_addr1: 7, x40_0_write_data: 32, x40_0_write_en: 1, x40_0_clk: 1) {
+  cells {
+    add0 = prim std_add(8);
+    add1 = prim std_add(7);
+    add2 = prim std_add(32);
+    add3 = prim std_add(8);
+    add4 = prim std_add(7);
+    add5 = prim std_add(1);
+    add6 = prim std_add(7);
+    add7 = prim std_add(1);
+    bin_read0_0 = prim std_reg(32);
+    const0 = prim std_const(7, 0);
+    const1 = prim std_const(7, 63);
+    const10 = prim std_const(8, 0);
+    const11 = prim std_const(8, 127);
+    const12 = prim std_const(8, 1);
+    const13 = prim std_const(7, 1);
+    const14 = prim std_const(1, 1);
+    const15 = prim std_const(1, 0);
+    const16 = prim std_const(1, 0);
+    const17 = prim std_const(7, 0);
+    const18 = prim std_const(7, 63);
+    const19 = prim std_const(7, 1);
+    const2 = prim std_const(8, 0);
+    const20 = prim std_const(1, 1);
+    const3 = prim std_const(8, 127);
+    const4 = prim std_const(8, 1);
+    const5 = prim std_const(7, 1);
+    const6 = prim std_const(1, 0);
+    const7 = prim std_const(1, 0);
+    const8 = prim std_const(7, 0);
+    const9 = prim std_const(7, 63);
+    fc2_weight_read0_0 = prim std_reg(32);
+    i0 = prim std_reg(7);
+    i1 = prim std_reg(1);
+    i2 = prim std_reg(1);
+    j0 = prim std_reg(8);
+    j1 = prim std_reg(7);
+    j2 = prim std_reg(7);
+    k0 = prim std_reg(8);
+    le0 = prim std_le(7);
+    le1 = prim std_le(8);
+    le2 = prim std_le(1);
+    le3 = prim std_le(7);
+    le4 = prim std_le(8);
+    le5 = prim std_le(1);
+    le6 = prim std_le(7);
+    mult_pipe0 = prim std_mult_pipe(32);
+    product_0 = prim std_reg(32);
+    temporary_x40_0 = prim std_mem_d2(32, 1, 64, 1, 7);
+    temporary_x4_read0_0 = prim std_reg(32);
+    transpose_fc2_weight0_0 = prim std_mem_d2(32, 128, 64, 8, 7);
+    transpose_fc2_weight_read0_0 = prim std_reg(32);
+    x3_read0_0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = j0.out;
+      le1.right = const3.out;
+    }
+    group cond2<"static"=0> {
+      cond2[done] = 1'd1;
+      le2.left = i1.out;
+      le2.right = const7.out;
+    }
+    group cond3<"static"=0> {
+      cond3[done] = 1'd1;
+      le3.left = j1.out;
+      le3.right = const9.out;
+    }
+    group cond4<"static"=0> {
+      cond4[done] = 1'd1;
+      le4.left = k0.out;
+      le4.right = const11.out;
+    }
+    group cond5<"static"=0> {
+      cond5[done] = 1'd1;
+      le5.left = i2.out;
+      le5.right = const16.out;
+    }
+    group cond6<"static"=0> {
+      cond6[done] = 1'd1;
+      le6.left = j2.out;
+      le6.right = const18.out;
+    }
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
+    }
+    group let1<"static"=1> {
+      j0.in = const2.out;
+      j0.write_en = 1'd1;
+      let1[done] = j0.done;
+    }
+    group let2<"static"=1> {
+      i1.in = const6.out;
+      i1.write_en = 1'd1;
+      let2[done] = i1.done;
+    }
+    group let3<"static"=1> {
+      j1.in = const8.out;
+      j1.write_en = 1'd1;
+      let3[done] = j1.done;
+    }
+    group let4<"static"=1> {
+      k0.in = const10.out;
+      k0.write_en = 1'd1;
+      let4[done] = k0.done;
+    }
+    group let5<"static"=4> {
+      bin_read0_0.in = mult_pipe0.out;
+      bin_read0_0.write_en = mult_pipe0.done;
+      let5[done] = bin_read0_0.done;
+      mult_pipe0.left = x3_read0_0.out;
+      mult_pipe0.right = transpose_fc2_weight_read0_0.out;
+      mult_pipe0.go = !mult_pipe0.done ? 1'd1;
+    }
+    group let6<"static"=1> {
+      product_0.in = bin_read0_0.out;
+      product_0.write_en = 1'd1;
+      let6[done] = product_0.done;
+    }
+    group let7<"static"=1> {
+      i2.in = const15.out;
+      i2.write_en = 1'd1;
+      let7[done] = i2.done;
+    }
+    group let8<"static"=1> {
+      j2.in = const17.out;
+      j2.write_en = 1'd1;
+      let8[done] = j2.done;
+    }
+    group upd0<"static"=1> {
+      fc2_weight_read0_0.write_en = 1'd1;
+      fc2_weight0_0_addr1 = j0.out;
+      fc2_weight0_0_addr0 = i0.out;
+      fc2_weight_read0_0.in = 1'd1 ? fc2_weight0_0_read_data;
+      upd0[done] = fc2_weight_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      transpose_fc2_weight0_0.addr1 = i0.out;
+      transpose_fc2_weight0_0.addr0 = j0.out;
+      transpose_fc2_weight0_0.write_en = 1'd1;
+      transpose_fc2_weight0_0.write_data = 1'd1 ? fc2_weight_read0_0.out;
+      upd1[done] = transpose_fc2_weight0_0.done ? 1'd1;
+    }
+    group upd10<"static"=1> {
+      temporary_x4_read0_0.write_en = 1'd1;
+      temporary_x40_0.addr1 = j2.out;
+      temporary_x40_0.addr0 = i2.out;
+      temporary_x4_read0_0.in = 1'd1 ? temporary_x40_0.read_data;
+      upd10[done] = temporary_x4_read0_0.done ? 1'd1;
+    }
+    group upd11<"static"=1> {
+      x40_0_addr1 = j2.out;
+      x40_0_addr0 = i2.out;
+      x40_0_write_en = 1'd1;
+      x40_0_write_data = 1'd1 ? temporary_x4_read0_0.out;
+      upd11[done] = x40_0_done ? 1'd1;
+    }
+    group upd12<"static"=1> {
+      j2.write_en = 1'd1;
+      add6.left = j2.out;
+      add6.right = const19.out;
+      j2.in = 1'd1 ? add6.out;
+      upd12[done] = j2.done ? 1'd1;
+    }
+    group upd13<"static"=1> {
+      i2.write_en = 1'd1;
+      add7.left = i2.out;
+      add7.right = const20.out;
+      i2.in = 1'd1 ? add7.out;
+      upd13[done] = i2.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      j0.write_en = 1'd1;
+      add0.left = j0.out;
+      add0.right = const4.out;
+      j0.in = 1'd1 ? add0.out;
+      upd2[done] = j0.done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      i0.write_en = 1'd1;
+      add1.left = i0.out;
+      add1.right = const5.out;
+      i0.in = 1'd1 ? add1.out;
+      upd3[done] = i0.done ? 1'd1;
+    }
+    group upd4<"static"=1> {
+      x3_read0_0.write_en = 1'd1;
+      x30_0_addr1 = k0.out;
+      x30_0_addr0 = i1.out;
+      x3_read0_0.in = 1'd1 ? x30_0_read_data;
+      upd4[done] = x3_read0_0.done ? 1'd1;
+    }
+    group upd5<"static"=1> {
+      transpose_fc2_weight_read0_0.write_en = 1'd1;
+      transpose_fc2_weight0_0.addr1 = j1.out;
+      transpose_fc2_weight0_0.addr0 = k0.out;
+      transpose_fc2_weight_read0_0.in = 1'd1 ? transpose_fc2_weight0_0.read_data;
+      upd5[done] = transpose_fc2_weight_read0_0.done ? 1'd1;
+    }
+    group upd6<"static"=1> {
+      temporary_x40_0.addr1 = j1.out;
+      temporary_x40_0.addr0 = i1.out;
+      temporary_x40_0.write_en = 1'd1;
+      add2.left = temporary_x40_0.read_data;
+      add2.right = product_0.out;
+      temporary_x40_0.addr1 = j1.out;
+      temporary_x40_0.addr0 = i1.out;
+      temporary_x40_0.write_data = 1'd1 ? add2.out;
+      upd6[done] = temporary_x40_0.done ? 1'd1;
+    }
+    group upd7<"static"=1> {
+      k0.write_en = 1'd1;
+      add3.left = k0.out;
+      add3.right = const12.out;
+      k0.in = 1'd1 ? add3.out;
+      upd7[done] = k0.done ? 1'd1;
+    }
+    group upd8<"static"=1> {
+      j1.write_en = 1'd1;
+      add4.left = j1.out;
+      add4.right = const13.out;
+      j1.in = 1'd1 ? add4.out;
+      upd8[done] = j1.done ? 1'd1;
+    }
+    group upd9<"static"=1> {
+      i1.write_en = 1'd1;
+      add5.left = i1.out;
+      add5.right = const14.out;
+      i1.in = 1'd1 ? add5.out;
+      upd9[done] = i1.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      while le0.out with cond0 {
+        seq {
+          let1;
+          while le1.out with cond1 {
+            seq {
+              upd0;
+              upd1;
+              upd2;
+            }
+          }
+          upd3;
+        }
+      }
+      let2;
+      while le2.out with cond2 {
+        seq {
+          let3;
+          while le3.out with cond3 {
+            seq {
+              let4;
+              while le4.out with cond4 {
+                seq {
+                  par {
+                    upd4;
+                    upd5;
+                  }
+                  let5;
+                  let6;
+                  upd6;
+                  upd7;
+                }
+              }
+              upd8;
+            }
+          }
+          upd9;
+        }
+      }
+      let7;
+      while le5.out with cond5 {
+        seq {
+          let8;
+          while le6.out with cond6 {
+            seq {
+              upd10;
+              upd11;
+              upd12;
+            }
+          }
+          upd13;
+        }
+      }
+    }
+  }
+}
+component relu(go: 1, clk: 1, x20_0_read_data: 32, x20_0_done: 1, x30_0_read_data: 32, x30_0_done: 1) -> (done: 1, x20_0_addr0: 1, x20_0_addr1: 8, x20_0_write_data: 32, x20_0_write_en: 1, x20_0_clk: 1, x30_0_addr0: 1, x30_0_addr1: 8, x30_0_write_data: 32, x30_0_write_en: 1, x30_0_clk: 1) {
+  cells {
+    add0 = prim std_add(8);
+    add1 = prim std_add(1);
+    const0 = prim std_const(32, 0);
+    const1 = prim std_const(1, 0);
+    const2 = prim std_const(1, 0);
+    const3 = prim std_const(8, 0);
+    const4 = prim std_const(8, 127);
+    const5 = prim std_const(32, 0);
+    const6 = prim std_const(8, 1);
+    const7 = prim std_const(1, 1);
+    gt0 = prim std_gt(32);
+    i0 = prim std_reg(1);
+    j0 = prim std_reg(8);
+    le0 = prim std_le(1);
+    le1 = prim std_le(8);
+    x2_read0_0 = prim std_reg(32);
+    x2_read1_0 = prim std_reg(32);
+    zero_0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const2.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = j0.out;
+      le1.right = const4.out;
+    }
+    group cond2<"static"=0> {
+      cond2[done] = 1'd1;
+      gt0.left = x2_read0_0.out;
+      gt0.right = zero_0.out;
+    }
+    group let0<"static"=1> {
+      zero_0.in = const0.out;
+      zero_0.write_en = 1'd1;
+      let0[done] = zero_0.done;
+    }
+    group let1<"static"=1> {
+      i0.in = const1.out;
+      i0.write_en = 1'd1;
+      let1[done] = i0.done;
+    }
+    group let2<"static"=1> {
+      j0.in = const3.out;
+      j0.write_en = 1'd1;
+      let2[done] = j0.done;
+    }
+    group upd0<"static"=1> {
+      x2_read0_0.write_en = 1'd1;
+      x20_0_addr1 = j0.out;
+      x20_0_addr0 = i0.out;
+      x2_read0_0.in = 1'd1 ? x20_0_read_data;
+      upd0[done] = x2_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      x2_read1_0.write_en = 1'd1;
+      x20_0_addr1 = j0.out;
+      x20_0_addr0 = i0.out;
+      x2_read1_0.in = 1'd1 ? x20_0_read_data;
+      upd1[done] = x2_read1_0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      x30_0_addr1 = j0.out;
+      x30_0_addr0 = i0.out;
+      x30_0_write_en = 1'd1;
+      x30_0_write_data = 1'd1 ? x2_read1_0.out;
+      upd2[done] = x30_0_done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      x30_0_addr1 = j0.out;
+      x30_0_addr0 = i0.out;
+      x30_0_write_en = 1'd1;
+      x30_0_write_data = 1'd1 ? const5.out;
+      upd3[done] = x30_0_done ? 1'd1;
+    }
+    group upd4<"static"=1> {
+      j0.write_en = 1'd1;
+      add0.left = j0.out;
+      add0.right = const6.out;
+      j0.in = 1'd1 ? add0.out;
+      upd4[done] = j0.done ? 1'd1;
+    }
+    group upd5<"static"=1> {
+      i0.write_en = 1'd1;
+      add1.left = i0.out;
+      add1.right = const7.out;
+      i0.in = 1'd1 ? add1.out;
+      upd5[done] = i0.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      let1;
+      while le0.out with cond0 {
+        seq {
+          let2;
+          while le1.out with cond1 {
+            seq {
+              upd0;
+              if gt0.out with cond2 {
+                seq {
+                  upd1;
+                  upd2;
+                }
+              } else {
+                upd3;
+              }
+              upd4;
+            }
+          }
+          upd5;
+        }
+      }
+    }
+  }
+}
+component bias_add(go: 1, clk: 1, fc1_bias0_read_data: 32, fc1_bias0_done: 1, x10_0_read_data: 32, x10_0_done: 1, x20_0_read_data: 32, x20_0_done: 1) -> (done: 1, fc1_bias0_addr0: 8, fc1_bias0_write_data: 32, fc1_bias0_write_en: 1, fc1_bias0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 8, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1, x20_0_addr0: 1, x20_0_addr1: 8, x20_0_write_data: 32, x20_0_write_en: 1, x20_0_clk: 1) {
+  cells {
+    add0 = prim std_add(32);
+    add1 = prim std_add(8);
+    add2 = prim std_add(1);
+    const0 = prim std_const(1, 0);
+    const1 = prim std_const(1, 0);
+    const2 = prim std_const(8, 0);
+    const3 = prim std_const(8, 127);
+    const4 = prim std_const(8, 1);
+    const5 = prim std_const(1, 1);
+    fc1_bias_read0_0 = prim std_reg(32);
+    i0 = prim std_reg(1);
+    j0 = prim std_reg(8);
+    le0 = prim std_le(1);
+    le1 = prim std_le(8);
+    x1_read0_0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = j0.out;
+      le1.right = const3.out;
+    }
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
+    }
+    group let1<"static"=1> {
+      j0.in = const2.out;
+      j0.write_en = 1'd1;
+      let1[done] = j0.done;
+    }
+    group upd0<"static"=1> {
+      x1_read0_0.write_en = 1'd1;
+      x10_0_addr1 = j0.out;
+      x10_0_addr0 = i0.out;
+      x1_read0_0.in = 1'd1 ? x10_0_read_data;
+      upd0[done] = x1_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      fc1_bias_read0_0.write_en = 1'd1;
+      fc1_bias0_addr0 = j0.out;
+      fc1_bias_read0_0.in = 1'd1 ? fc1_bias0_read_data;
+      upd1[done] = fc1_bias_read0_0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      x20_0_addr1 = j0.out;
+      x20_0_addr0 = i0.out;
+      x20_0_write_en = 1'd1;
+      add0.left = x1_read0_0.out;
+      add0.right = fc1_bias_read0_0.out;
+      x20_0_write_data = 1'd1 ? add0.out;
+      upd2[done] = x20_0_done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      j0.write_en = 1'd1;
+      add1.left = j0.out;
+      add1.right = const4.out;
+      j0.in = 1'd1 ? add1.out;
+      upd3[done] = j0.done ? 1'd1;
+    }
+    group upd4<"static"=1> {
+      i0.write_en = 1'd1;
+      add2.left = i0.out;
+      add2.right = const5.out;
+      i0.in = 1'd1 ? add2.out;
+      upd4[done] = i0.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      while le0.out with cond0 {
+        seq {
+          let1;
+          while le1.out with cond1 {
+            seq {
+              par {
+                upd0;
+                upd1;
+              }
+              upd2;
+              upd3;
+            }
+          }
+          upd4;
+        }
+      }
+    }
+  }
+}
+component dense(go: 1, clk: 1, fc1_weight0_0_read_data: 32, fc1_weight0_0_done: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1) -> (done: 1, fc1_weight0_0_addr0: 8, fc1_weight0_0_addr1: 10, fc1_weight0_0_write_data: 32, fc1_weight0_0_write_en: 1, fc1_weight0_0_clk: 1, x0_0_addr0: 1, x0_0_addr1: 10, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 8, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1) {
+  cells {
+    add0 = prim std_add(10);
+    add1 = prim std_add(8);
+    add2 = prim std_add(32);
+    add3 = prim std_add(10);
+    add4 = prim std_add(8);
+    add5 = prim std_add(1);
+    add6 = prim std_add(8);
+    add7 = prim std_add(1);
+    bin_read0_0 = prim std_reg(32);
+    const0 = prim std_const(8, 0);
+    const1 = prim std_const(8, 127);
+    const10 = prim std_const(10, 0);
+    const11 = prim std_const(10, 783);
+    const12 = prim std_const(10, 1);
+    const13 = prim std_const(8, 1);
+    const14 = prim std_const(1, 1);
+    const15 = prim std_const(1, 0);
+    const16 = prim std_const(1, 0);
+    const17 = prim std_const(8, 0);
+    const18 = prim std_const(8, 127);
+    const19 = prim std_const(8, 1);
+    const2 = prim std_const(10, 0);
+    const20 = prim std_const(1, 1);
+    const3 = prim std_const(10, 783);
+    const4 = prim std_const(10, 1);
+    const5 = prim std_const(8, 1);
+    const6 = prim std_const(1, 0);
+    const7 = prim std_const(1, 0);
+    const8 = prim std_const(8, 0);
+    const9 = prim std_const(8, 127);
+    fc1_weight_read0_0 = prim std_reg(32);
+    i0 = prim std_reg(8);
+    i1 = prim std_reg(1);
+    i2 = prim std_reg(1);
+    j0 = prim std_reg(10);
+    j1 = prim std_reg(8);
+    j2 = prim std_reg(8);
+    k0 = prim std_reg(10);
+    le0 = prim std_le(8);
+    le1 = prim std_le(10);
+    le2 = prim std_le(1);
+    le3 = prim std_le(8);
+    le4 = prim std_le(10);
+    le5 = prim std_le(1);
+    le6 = prim std_le(8);
+    mult_pipe0 = prim std_mult_pipe(32);
+    product_0 = prim std_reg(32);
+    temporary_x10_0 = prim std_mem_d2(32, 1, 128, 1, 8);
+    temporary_x1_read0_0 = prim std_reg(32);
+    transpose_fc1_weight0_0 = prim std_mem_d2(32, 784, 128, 10, 8);
+    transpose_fc1_weight_read0_0 = prim std_reg(32);
+    x_read0_0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = j0.out;
+      le1.right = const3.out;
+    }
+    group cond2<"static"=0> {
+      cond2[done] = 1'd1;
+      le2.left = i1.out;
+      le2.right = const7.out;
+    }
+    group cond3<"static"=0> {
+      cond3[done] = 1'd1;
+      le3.left = j1.out;
+      le3.right = const9.out;
+    }
+    group cond4<"static"=0> {
+      cond4[done] = 1'd1;
+      le4.left = k0.out;
+      le4.right = const11.out;
+    }
+    group cond5<"static"=0> {
+      cond5[done] = 1'd1;
+      le5.left = i2.out;
+      le5.right = const16.out;
+    }
+    group cond6<"static"=0> {
+      cond6[done] = 1'd1;
+      le6.left = j2.out;
+      le6.right = const18.out;
+    }
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
+    }
+    group let1<"static"=1> {
+      j0.in = const2.out;
+      j0.write_en = 1'd1;
+      let1[done] = j0.done;
+    }
+    group let2<"static"=1> {
+      i1.in = const6.out;
+      i1.write_en = 1'd1;
+      let2[done] = i1.done;
+    }
+    group let3<"static"=1> {
+      j1.in = const8.out;
+      j1.write_en = 1'd1;
+      let3[done] = j1.done;
+    }
+    group let4<"static"=1> {
+      k0.in = const10.out;
+      k0.write_en = 1'd1;
+      let4[done] = k0.done;
+    }
+    group let5<"static"=4> {
+      bin_read0_0.in = mult_pipe0.out;
+      bin_read0_0.write_en = mult_pipe0.done;
+      let5[done] = bin_read0_0.done;
+      mult_pipe0.left = x_read0_0.out;
+      mult_pipe0.right = transpose_fc1_weight_read0_0.out;
+      mult_pipe0.go = !mult_pipe0.done ? 1'd1;
+    }
+    group let6<"static"=1> {
+      product_0.in = bin_read0_0.out;
+      product_0.write_en = 1'd1;
+      let6[done] = product_0.done;
+    }
+    group let7<"static"=1> {
+      i2.in = const15.out;
+      i2.write_en = 1'd1;
+      let7[done] = i2.done;
+    }
+    group let8<"static"=1> {
+      j2.in = const17.out;
+      j2.write_en = 1'd1;
+      let8[done] = j2.done;
+    }
+    group upd0<"static"=1> {
+      fc1_weight_read0_0.write_en = 1'd1;
+      fc1_weight0_0_addr1 = j0.out;
+      fc1_weight0_0_addr0 = i0.out;
+      fc1_weight_read0_0.in = 1'd1 ? fc1_weight0_0_read_data;
+      upd0[done] = fc1_weight_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      transpose_fc1_weight0_0.addr1 = i0.out;
+      transpose_fc1_weight0_0.addr0 = j0.out;
+      transpose_fc1_weight0_0.write_en = 1'd1;
+      transpose_fc1_weight0_0.write_data = 1'd1 ? fc1_weight_read0_0.out;
+      upd1[done] = transpose_fc1_weight0_0.done ? 1'd1;
+    }
+    group upd10<"static"=1> {
+      temporary_x1_read0_0.write_en = 1'd1;
+      temporary_x10_0.addr1 = j2.out;
+      temporary_x10_0.addr0 = i2.out;
+      temporary_x1_read0_0.in = 1'd1 ? temporary_x10_0.read_data;
+      upd10[done] = temporary_x1_read0_0.done ? 1'd1;
+    }
+    group upd11<"static"=1> {
+      x10_0_addr1 = j2.out;
+      x10_0_addr0 = i2.out;
+      x10_0_write_en = 1'd1;
+      x10_0_write_data = 1'd1 ? temporary_x1_read0_0.out;
+      upd11[done] = x10_0_done ? 1'd1;
+    }
+    group upd12<"static"=1> {
+      j2.write_en = 1'd1;
+      add6.left = j2.out;
+      add6.right = const19.out;
+      j2.in = 1'd1 ? add6.out;
+      upd12[done] = j2.done ? 1'd1;
+    }
+    group upd13<"static"=1> {
+      i2.write_en = 1'd1;
+      add7.left = i2.out;
+      add7.right = const20.out;
+      i2.in = 1'd1 ? add7.out;
+      upd13[done] = i2.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      j0.write_en = 1'd1;
+      add0.left = j0.out;
+      add0.right = const4.out;
+      j0.in = 1'd1 ? add0.out;
+      upd2[done] = j0.done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      i0.write_en = 1'd1;
+      add1.left = i0.out;
+      add1.right = const5.out;
+      i0.in = 1'd1 ? add1.out;
+      upd3[done] = i0.done ? 1'd1;
+    }
+    group upd4<"static"=1> {
+      x_read0_0.write_en = 1'd1;
+      x0_0_addr1 = k0.out;
+      x0_0_addr0 = i1.out;
+      x_read0_0.in = 1'd1 ? x0_0_read_data;
+      upd4[done] = x_read0_0.done ? 1'd1;
+    }
+    group upd5<"static"=1> {
+      transpose_fc1_weight_read0_0.write_en = 1'd1;
+      transpose_fc1_weight0_0.addr1 = j1.out;
+      transpose_fc1_weight0_0.addr0 = k0.out;
+      transpose_fc1_weight_read0_0.in = 1'd1 ? transpose_fc1_weight0_0.read_data;
+      upd5[done] = transpose_fc1_weight_read0_0.done ? 1'd1;
+    }
+    group upd6<"static"=1> {
+      temporary_x10_0.addr1 = j1.out;
+      temporary_x10_0.addr0 = i1.out;
+      temporary_x10_0.write_en = 1'd1;
+      add2.left = temporary_x10_0.read_data;
+      add2.right = product_0.out;
+      temporary_x10_0.addr1 = j1.out;
+      temporary_x10_0.addr0 = i1.out;
+      temporary_x10_0.write_data = 1'd1 ? add2.out;
+      upd6[done] = temporary_x10_0.done ? 1'd1;
+    }
+    group upd7<"static"=1> {
+      k0.write_en = 1'd1;
+      add3.left = k0.out;
+      add3.right = const12.out;
+      k0.in = 1'd1 ? add3.out;
+      upd7[done] = k0.done ? 1'd1;
+    }
+    group upd8<"static"=1> {
+      j1.write_en = 1'd1;
+      add4.left = j1.out;
+      add4.right = const13.out;
+      j1.in = 1'd1 ? add4.out;
+      upd8[done] = j1.done ? 1'd1;
+    }
+    group upd9<"static"=1> {
+      i1.write_en = 1'd1;
+      add5.left = i1.out;
+      add5.right = const14.out;
+      i1.in = 1'd1 ? add5.out;
+      upd9[done] = i1.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      while le0.out with cond0 {
+        seq {
+          let1;
+          while le1.out with cond1 {
+            seq {
+              upd0;
+              upd1;
+              upd2;
+            }
+          }
+          upd3;
+        }
+      }
+      let2;
+      while le2.out with cond2 {
+        seq {
+          let3;
+          while le3.out with cond3 {
+            seq {
+              let4;
+              while le4.out with cond4 {
+                seq {
+                  par {
+                    upd4;
+                    upd5;
+                  }
+                  let5;
+                  let6;
+                  upd6;
+                  upd7;
+                }
+              }
+              upd8;
+            }
+          }
+          upd9;
+        }
+      }
+      let7;
+      while le5.out with cond5 {
+        seq {
+          let8;
+          while le6.out with cond6 {
+            seq {
+              upd10;
+              upd11;
+              upd12;
+            }
+          }
+          upd13;
+        }
+      }
+    }
+  }
+}
+component batch_flatten(go: 1, clk: 1, data0_0_0_0_read_data: 32, data0_0_0_0_done: 1, x0_0_read_data: 32, x0_0_done: 1) -> (done: 1, data0_0_0_0_addr0: 1, data0_0_0_0_addr1: 1, data0_0_0_0_addr2: 5, data0_0_0_0_addr3: 5, data0_0_0_0_write_data: 32, data0_0_0_0_write_en: 1, data0_0_0_0_clk: 1, x0_0_addr0: 1, x0_0_addr1: 10, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1) {
+  cells {
+    add0 = prim std_add(10);
+    add1 = prim std_add(5);
+    add2 = prim std_add(5);
+    add3 = prim std_add(1);
+    add4 = prim std_add(1);
+    const0 = prim std_const(10, 0);
+    const1 = prim std_const(1, 0);
+    const10 = prim std_const(5, 1);
+    const11 = prim std_const(5, 1);
+    const12 = prim std_const(1, 1);
+    const13 = prim std_const(1, 1);
+    const2 = prim std_const(1, 0);
+    const3 = prim std_const(1, 0);
+    const4 = prim std_const(1, 0);
+    const5 = prim std_const(5, 0);
+    const6 = prim std_const(5, 27);
+    const7 = prim std_const(5, 0);
+    const8 = prim std_const(5, 27);
+    const9 = prim std_const(10, 1);
+    data_read0_0 = prim std_reg(32);
+    i0 = prim std_reg(1);
+    j0 = prim std_reg(1);
+    k0 = prim std_reg(5);
+    l0 = prim std_reg(5);
+    le0 = prim std_le(1);
+    le1 = prim std_le(1);
+    le2 = prim std_le(5);
+    le3 = prim std_le(5);
+    m_0 = prim std_reg(10);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const2.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = j0.out;
+      le1.right = const4.out;
+    }
+    group cond2<"static"=0> {
+      cond2[done] = 1'd1;
+      le2.left = k0.out;
+      le2.right = const6.out;
+    }
+    group cond3<"static"=0> {
+      cond3[done] = 1'd1;
+      le3.left = l0.out;
+      le3.right = const8.out;
+    }
+    group let0<"static"=1> {
+      m_0.in = const0.out;
+      m_0.write_en = 1'd1;
+      let0[done] = m_0.done;
+    }
+    group let1<"static"=1> {
+      i0.in = const1.out;
+      i0.write_en = 1'd1;
+      let1[done] = i0.done;
+    }
+    group let2<"static"=1> {
+      j0.in = const3.out;
+      j0.write_en = 1'd1;
+      let2[done] = j0.done;
+    }
+    group let3<"static"=1> {
+      k0.in = const5.out;
+      k0.write_en = 1'd1;
+      let3[done] = k0.done;
+    }
+    group let4<"static"=1> {
+      l0.in = const7.out;
+      l0.write_en = 1'd1;
+      let4[done] = l0.done;
+    }
+    group upd0<"static"=1> {
+      data_read0_0.write_en = 1'd1;
+      data0_0_0_0_addr3 = l0.out;
+      data0_0_0_0_addr2 = k0.out;
+      data0_0_0_0_addr1 = j0.out;
+      data0_0_0_0_addr0 = i0.out;
+      data_read0_0.in = 1'd1 ? data0_0_0_0_read_data;
+      upd0[done] = data_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      x0_0_addr1 = m_0.out;
+      x0_0_addr0 = i0.out;
+      x0_0_write_en = 1'd1;
+      x0_0_write_data = 1'd1 ? data_read0_0.out;
+      upd1[done] = x0_0_done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      m_0.write_en = 1'd1;
+      add0.left = m_0.out;
+      add0.right = const9.out;
+      m_0.in = 1'd1 ? add0.out;
+      upd2[done] = m_0.done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      l0.write_en = 1'd1;
+      add1.left = l0.out;
+      add1.right = const10.out;
+      l0.in = 1'd1 ? add1.out;
+      upd3[done] = l0.done ? 1'd1;
+    }
+    group upd4<"static"=1> {
+      k0.write_en = 1'd1;
+      add2.left = k0.out;
+      add2.right = const11.out;
+      k0.in = 1'd1 ? add2.out;
+      upd4[done] = k0.done ? 1'd1;
+    }
+    group upd5<"static"=1> {
+      j0.write_en = 1'd1;
+      add3.left = j0.out;
+      add3.right = const12.out;
+      j0.in = 1'd1 ? add3.out;
+      upd5[done] = j0.done ? 1'd1;
+    }
+    group upd6<"static"=1> {
+      i0.write_en = 1'd1;
+      add4.left = i0.out;
+      add4.right = const13.out;
+      i0.in = 1'd1 ? add4.out;
+      upd6[done] = i0.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      let1;
+      while le0.out with cond0 {
+        seq {
+          let2;
+          while le1.out with cond1 {
+            seq {
+              let3;
+              while le2.out with cond2 {
+                seq {
+                  let4;
+                  while le3.out with cond3 {
+                    seq {
+                      upd0;
+                      upd1;
+                      upd2;
+                      upd3;
+                    }
+                  }
+                  upd4;
+                }
+              }
+              upd5;
+            }
+          }
+          upd6;
+        }
+      }
+    }
+  }
+}
+
+component main () -> () {
+  cells {
+    x8 = prim std_mem_d2(32, 1, 10, 1, 4);
+    x7 = prim std_mem_d2(32, 1, 10, 1, 4);
+    fc3_bias = prim std_mem_d1(32, 10, 4);
+    bias_add2 = bias_add2;
+    x6 = prim std_mem_d2(32, 1, 64, 1, 7);
+    fc3_weight = prim std_mem_d2(32, 10, 64, 4, 7);
+    dense2 = dense2;
+    x5 = prim std_mem_d2(32, 1, 64, 1, 7);
+    relu1 = relu1;
+    x4 = prim std_mem_d2(32, 1, 64, 1, 7);
+    fc2_bias = prim std_mem_d1(32, 64, 7);
+    bias_add1 = bias_add1;
+    x3 = prim std_mem_d2(32, 1, 128, 1, 8);
+    fc2_weight = prim std_mem_d2(32, 64, 128, 7, 8);
+    dense1 = dense1;
+    x2 = prim std_mem_d2(32, 1, 128, 1, 8);
+    relu0 = relu;
+    x1 = prim std_mem_d2(32, 1, 128, 1, 8);
+    fc1_bias = prim std_mem_d1(32, 128, 8);
+    bias_add0 = bias_add;
+    x = prim std_mem_d2(32, 1, 784, 1, 10);
+    fc1_weight = prim std_mem_d2(32, 128, 784, 8, 10);
+    dense0 = dense;
+    data = prim std_mem_d4(32, 1, 1, 28, 28, 1, 1, 5, 5);
+    batch_flatten0 = batch_flatten;
+  }
+  wires {
+    group run_batch_flatten {
+      data.addr0 = batch_flatten0.data0_0_0_0_addr0;
+      batch_flatten0.data0_0_0_0_read_data = data.read_data;
+      data.addr1 = batch_flatten0.data0_0_0_0_addr1;
+      data.addr2 = batch_flatten0.data0_0_0_0_addr2;
+      x.addr0 = batch_flatten0.x0_0_addr0;
+      x.addr1 = batch_flatten0.x0_0_addr1;
+      x.write_data = batch_flatten0.x0_0_write_data;
+      x.write_en = batch_flatten0.x0_0_write_en;
+      batch_flatten0.x0_0_done = x.done;
+      batch_flatten0.go = 1'd1;
+      run_batch_flatten[done] = batch_flatten0.done ? 1'd1;
+    }
+    group run_dense {
+      x.addr0 = dense0.x0_0_addr0;
+      dense0.x0_0_read_data = x.read_data;
+      x.addr1 = dense0.x0_0_addr1;
+      fc1_weight.addr0 = dense0.fc1_weight0_0_addr0;
+      dense0.fc1_weight0_0_read_data = fc1_weight.read_data;
+      fc1_weight.addr1 = dense0.fc1_weight0_0_addr1;
+      x1.addr0 = dense0.x10_0_addr0;
+      x1.addr1 = dense0.x10_0_addr1;
+      x1.write_data = dense0.x10_0_write_data;
+      x1.write_en = dense0.x10_0_write_en;
+      dense0.x10_0_done = x1.done;
+      dense0.go = 1'd1;
+      run_dense[done] = dense0.done ? 1'd1;
+    }
+    group run_bias_add {
+      x1.addr0 = bias_add0.x10_0_addr0;
+      bias_add0.x10_0_read_data = x1.read_data;
+      x1.addr1 = bias_add0.x10_0_addr1;
+      fc1_bias.addr0 = bias_add0.fc1_bias0_addr0;
+      bias_add0.fc1_bias0_read_data = fc1_bias.read_data;
+      x2.addr0 = bias_add0.x20_0_addr0;
+      x2.addr1 = bias_add0.x20_0_addr1;
+      x2.write_data = bias_add0.x20_0_write_data;
+      x2.write_en = bias_add0.x20_0_write_en;
+      bias_add0.x20_0_done = x2.done;
+      bias_add0.go = 1'd1;
+      run_bias_add[done] = bias_add0.done ? 1'd1;
+    }
+    group run_relu {
+      x2.addr0 = relu0.x20_0_addr0;
+      relu0.x20_0_read_data = x2.read_data;
+      x2.addr1 = relu0.x20_0_addr1;
+      x3.addr0 = relu0.x30_0_addr0;
+      x3.addr1 = relu0.x30_0_addr1;
+      x3.write_data = relu0.x30_0_write_data;
+      x3.write_en = relu0.x30_0_write_en;
+      relu0.x30_0_done = x3.done;
+      relu0.go = 1'd1;
+      run_relu[done] = relu0.done ? 1'd1;
+    }
+    group run_dense1 {
+      x3.addr0 = dense1.x30_0_addr0;
+      dense1.x30_0_read_data = x3.read_data;
+      x3.addr1 = dense1.x30_0_addr1;
+      fc2_weight.addr0 = dense1.fc2_weight0_0_addr0;
+      dense1.fc2_weight0_0_read_data = fc2_weight.read_data;
+      fc2_weight.addr1 = dense1.fc2_weight0_0_addr1;
+      x4.addr0 = dense1.x40_0_addr0;
+      x4.addr1 = dense1.x40_0_addr1;
+      x4.write_data = dense1.x40_0_write_data;
+      x4.write_en = dense1.x40_0_write_en;
+      dense1.x40_0_done = x4.done;
+      dense1.go = 1'd1;
+      run_dense1[done] = dense1.done ? 1'd1;
+    }
+    group run_bias_add1 {
+      x4.addr0 = bias_add1.x40_0_addr0;
+      bias_add1.x40_0_read_data = x4.read_data;
+      x4.addr1 = bias_add1.x40_0_addr1;
+      fc2_bias.addr0 = bias_add1.fc2_bias0_addr0;
+      bias_add1.fc2_bias0_read_data = fc2_bias.read_data;
+      x5.addr0 = bias_add1.x50_0_addr0;
+      x5.addr1 = bias_add1.x50_0_addr1;
+      x5.write_data = bias_add1.x50_0_write_data;
+      x5.write_en = bias_add1.x50_0_write_en;
+      bias_add1.x50_0_done = x5.done;
+      bias_add1.go = 1'd1;
+      run_bias_add1[done] = bias_add1.done ? 1'd1;
+    }
+    group run_relu1 {
+      x5.addr0 = relu1.x50_0_addr0;
+      relu1.x50_0_read_data = x5.read_data;
+      x5.addr1 = relu1.x50_0_addr1;
+      x6.addr0 = relu1.x60_0_addr0;
+      x6.addr1 = relu1.x60_0_addr1;
+      x6.write_data = relu1.x60_0_write_data;
+      x6.write_en = relu1.x60_0_write_en;
+      relu1.x60_0_done = x6.done;
+      relu1.go = 1'd1;
+      run_relu1[done] = relu1.done ? 1'd1;
+    }
+    group run_dense2 {
+      x6.addr0 = dense2.x60_0_addr0;
+      dense2.x60_0_read_data = x6.read_data;
+      x6.addr1 = dense2.x60_0_addr1;
+      fc3_weight.addr0 = dense2.fc3_weight0_0_addr0;
+      dense2.fc3_weight0_0_read_data = fc3_weight.read_data;
+      fc3_weight.addr1 = dense2.fc3_weight0_0_addr1;
+      x7.addr0 = dense2.x70_0_addr0;
+      x7.addr1 = dense2.x70_0_addr1;
+      x7.write_data = dense2.x70_0_write_data;
+      x7.write_en = dense2.x70_0_write_en;
+      dense2.x70_0_done = x7.done;
+      dense2.go = 1'd1;
+      run_dense2[done] = dense2.done ? 1'd1;
+    }
+    group run_bias_add2 {
+      x7.addr0 = bias_add2.x70_0_addr0;
+      bias_add2.x70_0_read_data = x7.read_data;
+      x7.addr1 = bias_add2.x70_0_addr1;
+      fc3_bias.addr0 = bias_add2.fc3_bias0_addr0;
+      bias_add2.fc3_bias0_read_data = fc3_bias.read_data;
+      x8.addr0 = bias_add2.x80_0_addr0;
+      x8.addr1 = bias_add2.x80_0_addr1;
+      x8.write_data = bias_add2.x80_0_write_data;
+      x8.write_en = bias_add2.x80_0_write_en;
+      bias_add2.x80_0_done = x8.done;
+      bias_add2.go = 1'd1;
+      run_bias_add2[done] = bias_add2.done ? 1'd1;
+    }
+  }
+  control {
+    seq {
+      run_batch_flatten;
+      run_dense;
+      run_bias_add;
+      run_relu;
+      run_dense1;
+      run_bias_add1;
+      run_relu1;
+      run_dense2;
+      run_bias_add2;
+    }
+  }
+}
diff --git a/frontends/relay-futil/tests/mlp_net.relay b/frontends/relay-futil/tests/mlp_net.relay
new file mode 100644
index 0000000000..ef53158f58
--- /dev/null
+++ b/frontends/relay-futil/tests/mlp_net.relay
@@ -0,0 +1,17 @@
+v0.0.4
+fn (%data: Tensor[(1, 1, 28, 28), int32], %fc1_weight: Tensor[(128, 784), int32], %fc1_bias: Tensor[(128), int32],
+    %fc2_weight: Tensor[(64, 128), int32], %fc2_bias: Tensor[(64), int32], %fc3_weight: Tensor[(10, 64), int32],
+    %fc3_bias: Tensor[(10), int32]) -> Tensor[(1, 10), int32] {
+  let %x: Tensor[(1, 784), int32] = nn.batch_flatten(%data) /* ty=Tensor[(1, 784), int32] */;
+  let %x1: Tensor[(1, 128), int32] = nn.dense(%x, %fc1_weight, units=128) /* ty=Tensor[(1, 128), int32] */;
+  let %x2: Tensor[(1, 128), int32] = nn.bias_add(%x1, %fc1_bias, axis=-1) /* ty=Tensor[(1, 128), int32] */;
+  let %x3: Tensor[(1, 128), int32] = nn.relu(%x2) /* ty=Tensor[(1, 128), int32] */;
+  let %x4: Tensor[(1, 64), int32] = nn.dense(%x3, %fc2_weight, units=64) /* ty=Tensor[(1, 64), int32] */;
+  let %x5: Tensor[(1, 64), int32] = nn.bias_add(%x4, %fc2_bias, axis=-1) /* ty=Tensor[(1, 64), int32] */;
+  let %x6: Tensor[(1, 64), int32] = nn.relu(%x5) /* ty=Tensor[(1, 64), int32] */;
+  let %x7: Tensor[(1, 10), int32] = nn.dense(%x6, %fc3_weight, units=10) /* ty=Tensor[(1, 10), int32] */;
+  let %x8: Tensor[(1, 10), int32] = nn.bias_add(%x7, %fc3_bias, axis=-1) /* ty=Tensor[(1, 10), int32] */;
+  %x8
+  // let %x9: Tensor[(1, 10), int32] = nn.softmax(%x8) /* ty=Tensor[(1, 10), int32] */;
+  // %x9
+}

From fe78aee732e236d1c408bd760c007f9a9c51dcb8 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Fri, 27 Nov 2020 08:07:29 -0500
Subject: [PATCH 52/75] Add mem_d4 to remove externals pass

---
 calyx/src/passes/remove_external_memories.rs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/calyx/src/passes/remove_external_memories.rs b/calyx/src/passes/remove_external_memories.rs
index 381f878181..529312900b 100644
--- a/calyx/src/passes/remove_external_memories.rs
+++ b/calyx/src/passes/remove_external_memories.rs
@@ -14,9 +14,10 @@ impl Default for RemoveExternalMemories<'_> {
             ("std_mem_d1_ext", "std_mem_d1"),
             ("std_mem_d2_ext", "std_mem_d2"),
             ("std_mem_d3_ext", "std_mem_d3"),
+            ("std_mem_d4_ext", "std_mem_d4")
         ]
-        .into_iter()
-        .collect();
+            .into_iter()
+            .collect();
         Self { changeable }
     }
 }

From e9b3db37c7104f17f114b7243fba1bae8047e6f9 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Fri, 27 Nov 2020 08:14:34 -0500
Subject: [PATCH 53/75] Add ,

---
 calyx/src/passes/remove_external_memories.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/calyx/src/passes/remove_external_memories.rs b/calyx/src/passes/remove_external_memories.rs
index 529312900b..4c2abdd842 100644
--- a/calyx/src/passes/remove_external_memories.rs
+++ b/calyx/src/passes/remove_external_memories.rs
@@ -14,10 +14,10 @@ impl Default for RemoveExternalMemories<'_> {
             ("std_mem_d1_ext", "std_mem_d1"),
             ("std_mem_d2_ext", "std_mem_d2"),
             ("std_mem_d3_ext", "std_mem_d3"),
-            ("std_mem_d4_ext", "std_mem_d4")
+            ("std_mem_d4_ext", "std_mem_d4"),
         ]
-            .into_iter()
-            .collect();
+        .into_iter()
+        .collect();
         Self { changeable }
     }
 }

From 2ebe1eccb53e43c3be536f915d17b2b68f65d5f8 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Fri, 27 Nov 2020 08:45:19 -0500
Subject: [PATCH 54/75] Use op instead of +

---
 frontends/relay-futil/dahlia_functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 084524618b..4839a452ec 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -65,7 +65,7 @@ def broadcast(declaration):
         for (i = 0...64) {
           for (j = 0..16) {
             for (k = 0..32) {
-              result[i][j][k] := op1[i][0][k] + op2[j][0];
+              result[i][j][k] := op1[i][0][k] op op2[j][0];
               ...
     """
     op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive

From 62a20de4a617aa17b72c854c3918f884ed43f83a Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Fri, 27 Nov 2020 13:34:18 -0500
Subject: [PATCH 55/75] Add fixed_p_std_gt.

---
 frontends/relay-futil/dahlia_functions.py |  6 +--
 frontends/relay-futil/tests/relu.expect   | 57 +++++++++++------------
 frontends/relay-futil/tests/relu.relay    |  4 +-
 primitives/std.lib                        | 18 ++++++-
 4 files changed, 49 insertions(+), 36 deletions(-)

diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 4839a452ec..1eb29a9b53 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -171,10 +171,10 @@ def relu(declaration):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.relu"""
     data, res = declaration.inputs[0].primitive, declaration.output.primitive
     bitwidth, num_dimensions = data.data[0], data.type
-    assert res.data_type == 'ubit', f'{res.data_type} is not currently supported for ReLU.'
 
     declarations = pp_dahlia_memory_declarations([data, res])
-    let_zero = f'let zero: {data.data_type}<{bitwidth}> = 0;'
+    zero = '0.0' if data.data_type == 'ufix' else '0'
+    let_zero = f'let zero: {data.data_type}<{bitwidth}> = {zero};'
 
     indices = ""
     variable_name = CHARACTER_I
@@ -184,7 +184,7 @@ def relu(declaration):
         variable_name = next_character(variable_name)
 
     body = f"""if ({data.name}{indices} > zero) {{ {res.name}{indices} := {data.name}{indices}; }} 
-        else {{ {res.name}{indices} := 0; }}"""
+        else {{ {res.name}{indices} := zero; }}"""
     program_body = pp_dahlia_loop(data, body)
     return lower_dahlia_program(f"""{declarations}{NEWL}{let_zero}{NEWL}{program_body}""", declaration.component_name)
 
diff --git a/frontends/relay-futil/tests/relu.expect b/frontends/relay-futil/tests/relu.expect
index 7c2ac6e96a..74b5646d9b 100644
--- a/frontends/relay-futil/tests/relu.expect
+++ b/frontends/relay-futil/tests/relu.expect
@@ -6,21 +6,20 @@ component relu(go: 1, clk: 1, x0_0_0_0_read_data: 32, x0_0_0_0_done: 1, x10_0_0_
     add1 = prim std_add(4);
     add2 = prim std_add(3);
     add3 = prim std_add(2);
-    const0 = prim std_const(32, 0);
-    const1 = prim std_const(2, 0);
-    const10 = prim std_const(6, 1);
-    const11 = prim std_const(4, 1);
-    const12 = prim std_const(3, 1);
-    const13 = prim std_const(2, 1);
-    const2 = prim std_const(2, 1);
-    const3 = prim std_const(3, 0);
-    const4 = prim std_const(3, 3);
-    const5 = prim std_const(4, 0);
-    const6 = prim std_const(4, 7);
-    const7 = prim std_const(6, 0);
-    const8 = prim std_const(6, 31);
-    const9 = prim std_const(32, 0);
-    gt0 = prim std_gt(32);
+    const0 = prim std_const(2, 0);
+    const1 = prim std_const(2, 1);
+    const10 = prim std_const(3, 1);
+    const11 = prim std_const(2, 1);
+    const2 = prim std_const(3, 0);
+    const3 = prim std_const(3, 3);
+    const4 = prim std_const(4, 0);
+    const5 = prim std_const(4, 7);
+    const6 = prim std_const(6, 0);
+    const7 = prim std_const(6, 31);
+    const8 = prim std_const(6, 1);
+    const9 = prim std_const(4, 1);
+    fpconst0 = prim fixed_p_std_const(32, 16, 16, 0, 0);
+    gt0 = prim fixed_p_std_gt(32, 16, 16);
     i0 = prim std_reg(2);
     j0 = prim std_reg(3);
     k0 = prim std_reg(4);
@@ -37,22 +36,22 @@ component relu(go: 1, clk: 1, x0_0_0_0_read_data: 32, x0_0_0_0_done: 1, x10_0_0_
     group cond0<"static"=0> {
       cond0[done] = 1'd1;
       le0.left = i0.out;
-      le0.right = const2.out;
+      le0.right = const1.out;
     }
     group cond1<"static"=0> {
       cond1[done] = 1'd1;
       le1.left = j0.out;
-      le1.right = const4.out;
+      le1.right = const3.out;
     }
     group cond2<"static"=0> {
       cond2[done] = 1'd1;
       le2.left = k0.out;
-      le2.right = const6.out;
+      le2.right = const5.out;
     }
     group cond3<"static"=0> {
       cond3[done] = 1'd1;
       le3.left = l0.out;
-      le3.right = const8.out;
+      le3.right = const7.out;
     }
     group cond4<"static"=0> {
       cond4[done] = 1'd1;
@@ -60,27 +59,27 @@ component relu(go: 1, clk: 1, x0_0_0_0_read_data: 32, x0_0_0_0_done: 1, x10_0_0_
       gt0.right = zero_0.out;
     }
     group let0<"static"=1> {
-      zero_0.in = const0.out;
+      zero_0.in = fpconst0.out;
       zero_0.write_en = 1'd1;
       let0[done] = zero_0.done;
     }
     group let1<"static"=1> {
-      i0.in = const1.out;
+      i0.in = const0.out;
       i0.write_en = 1'd1;
       let1[done] = i0.done;
     }
     group let2<"static"=1> {
-      j0.in = const3.out;
+      j0.in = const2.out;
       j0.write_en = 1'd1;
       let2[done] = j0.done;
     }
     group let3<"static"=1> {
-      k0.in = const5.out;
+      k0.in = const4.out;
       k0.write_en = 1'd1;
       let3[done] = k0.done;
     }
     group let4<"static"=1> {
-      l0.in = const7.out;
+      l0.in = const6.out;
       l0.write_en = 1'd1;
       let4[done] = l0.done;
     }
@@ -117,34 +116,34 @@ component relu(go: 1, clk: 1, x0_0_0_0_read_data: 32, x0_0_0_0_done: 1, x10_0_0_
       x10_0_0_0_addr1 = j0.out;
       x10_0_0_0_addr0 = i0.out;
       x10_0_0_0_write_en = 1'd1;
-      x10_0_0_0_write_data = 1'd1 ? const9.out;
+      x10_0_0_0_write_data = 1'd1 ? zero_0.out;
       upd3[done] = x10_0_0_0_done ? 1'd1;
     }
     group upd4<"static"=1> {
       l0.write_en = 1'd1;
       add0.left = l0.out;
-      add0.right = const10.out;
+      add0.right = const8.out;
       l0.in = 1'd1 ? add0.out;
       upd4[done] = l0.done ? 1'd1;
     }
     group upd5<"static"=1> {
       k0.write_en = 1'd1;
       add1.left = k0.out;
-      add1.right = const11.out;
+      add1.right = const9.out;
       k0.in = 1'd1 ? add1.out;
       upd5[done] = k0.done ? 1'd1;
     }
     group upd6<"static"=1> {
       j0.write_en = 1'd1;
       add2.left = j0.out;
-      add2.right = const12.out;
+      add2.right = const10.out;
       j0.in = 1'd1 ? add2.out;
       upd6[done] = j0.done ? 1'd1;
     }
     group upd7<"static"=1> {
       i0.write_en = 1'd1;
       add3.left = i0.out;
-      add3.right = const13.out;
+      add3.right = const11.out;
       i0.in = 1'd1 ? add3.out;
       upd7[done] = i0.done ? 1'd1;
     }
diff --git a/frontends/relay-futil/tests/relu.relay b/frontends/relay-futil/tests/relu.relay
index fd5278c4a8..98c9dc8df1 100644
--- a/frontends/relay-futil/tests/relu.relay
+++ b/frontends/relay-futil/tests/relu.relay
@@ -1,6 +1,6 @@
 v0.0.4
-fn (%x: Tensor[(2, 4, 8, 32), int32]) {
-  let %x1: Tensor[(2, 4, 8, 32), int32] = nn.relu(%x);
+fn (%x: Tensor[(2, 4, 8, 32), float32]) {
+  let %x1: Tensor[(2, 4, 8, 32), float32] = nn.relu(%x);
   %x1
 }
 
diff --git a/primitives/std.lib b/primitives/std.lib
index ecd352640a..ad63d71ff8 100644
--- a/primitives/std.lib
+++ b/primitives/std.lib
@@ -891,14 +891,28 @@ primitive fixed_p_std_div[width, int_width, fract_width](left: width, right: wid
           logic [2*width-2:0] result;
 
           assign result = left / right;
-          // result bit= 2*width, 1 is placed at fract_width,
-          //the valid bit would be width amount of bits starting at fract_width
+          // result bit = 2 * width, 1 is placed at fract_width,
+          // the valid bit would be width amount of bits starting at fract_width
           assign out = result[width+fract_width-1:fract_width];
         endmodule
 
     }
 }
 
+primitive fixed_p_std_gt<"share"=1>[width, int_width, fract_width](left: width, right: width) -> (out: 1) {
+  verilog {
+    module fixed_p_std_gt
+     #(parameter width = 32,
+       parameter int_width = 8,
+       parameter fract_width = 24)
+       (input  logic [width-1:0] left,
+        input  logic [width-1:0] right,
+        output logic             out);
+      assign out = left > right; // unsigned compare of the raw bits; int_width/fract_width are unused here
+    endmodule
+  }
+}
+
 // the bigger integer bit always comes left, one with bigger fractional bit comes right
 primitive fixed_p_std_add_dbit[width, int_width1, fract_width1, int_width2, fract_width2, out_width] (left: width, right: width)
 ->(out: out_width){

From 4482dd59ca4c77052908aa53118187408985341e Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Fri, 27 Nov 2020 13:50:33 -0500
Subject: [PATCH 56/75] Float mlp_net

---
 frontends/relay-futil/tests/mlp_net.expect | 110 +++++++++++----------
 frontends/relay-futil/tests/mlp_net.relay  |  26 ++---
 2 files changed, 73 insertions(+), 63 deletions(-)

diff --git a/frontends/relay-futil/tests/mlp_net.expect b/frontends/relay-futil/tests/mlp_net.expect
index ea30df19af..812a0381fd 100644
--- a/frontends/relay-futil/tests/mlp_net.expect
+++ b/frontends/relay-futil/tests/mlp_net.expect
@@ -2,7 +2,7 @@ import "primitives/std.lib";
 
 component bias_add2(go: 1, clk: 1, fc3_bias0_read_data: 32, fc3_bias0_done: 1, x70_0_read_data: 32, x70_0_done: 1, x80_0_read_data: 32, x80_0_done: 1) -> (done: 1, fc3_bias0_addr0: 4, fc3_bias0_write_data: 32, fc3_bias0_write_en: 1, fc3_bias0_clk: 1, x70_0_addr0: 1, x70_0_addr1: 4, x70_0_write_data: 32, x70_0_write_en: 1, x70_0_clk: 1, x80_0_addr0: 1, x80_0_addr1: 4, x80_0_write_data: 32, x80_0_write_en: 1, x80_0_clk: 1) {
   cells {
-    add0 = prim std_add(32);
+    add0 = prim fixed_p_std_add(32, 16, 16);
     add1 = prim std_add(4);
     add2 = prim std_add(1);
     const0 = prim std_const(1, 0);
@@ -103,7 +103,7 @@ component dense2(go: 1, clk: 1, fc3_weight0_0_read_data: 32, fc3_weight0_0_done:
   cells {
     add0 = prim std_add(7);
     add1 = prim std_add(4);
-    add2 = prim std_add(32);
+    add2 = prim fixed_p_std_add(32, 16, 16);
     add3 = prim std_add(7);
     add4 = prim std_add(4);
     add5 = prim std_add(1);
@@ -148,6 +148,8 @@ component dense2(go: 1, clk: 1, fc3_weight0_0_read_data: 32, fc3_weight0_0_done:
     le6 = prim std_le(4);
     mult_pipe0 = prim std_mult_pipe(32);
     product_0 = prim std_reg(32);
+    slice0 = prim std_slice(32, 32);
+    slice1 = prim std_slice(32, 32);
     temporary_x70_0 = prim std_mem_d2(32, 1, 10, 1, 4);
     temporary_x7_read0_0 = prim std_reg(32);
     transpose_fc3_weight0_0 = prim std_mem_d2(32, 64, 10, 7, 4);
@@ -215,18 +217,20 @@ component dense2(go: 1, clk: 1, fc3_weight0_0_read_data: 32, fc3_weight0_0_done:
       k0.write_en = 1'd1;
       let4[done] = k0.done;
     }
-    group let5<"static"=4> {
-      bin_read0_0.in = mult_pipe0.out;
-      bin_read0_0.write_en = mult_pipe0.done;
+    group let5<"static"=1> {
+      bin_read0_0.in = slice0.out;
+      bin_read0_0.write_en = 1'd1;
       let5[done] = bin_read0_0.done;
+      slice0.in = mult_pipe0.out;
       mult_pipe0.left = x6_read0_0.out;
       mult_pipe0.right = transpose_fc3_weight_read0_0.out;
       mult_pipe0.go = !mult_pipe0.done ? 1'd1;
     }
     group let6<"static"=1> {
-      product_0.in = bin_read0_0.out;
+      product_0.in = slice1.out;
       product_0.write_en = 1'd1;
       let6[done] = product_0.done;
+      slice1.in = bin_read0_0.out;
     }
     group let7<"static"=1> {
       i2.in = const15.out;
@@ -404,15 +408,14 @@ component relu1(go: 1, clk: 1, x50_0_read_data: 32, x50_0_done: 1, x60_0_read_da
   cells {
     add0 = prim std_add(7);
     add1 = prim std_add(1);
-    const0 = prim std_const(32, 0);
+    const0 = prim std_const(1, 0);
     const1 = prim std_const(1, 0);
-    const2 = prim std_const(1, 0);
-    const3 = prim std_const(7, 0);
-    const4 = prim std_const(7, 63);
-    const5 = prim std_const(32, 0);
-    const6 = prim std_const(7, 1);
-    const7 = prim std_const(1, 1);
-    gt0 = prim std_gt(32);
+    const2 = prim std_const(7, 0);
+    const3 = prim std_const(7, 63);
+    const4 = prim std_const(7, 1);
+    const5 = prim std_const(1, 1);
+    fpconst0 = prim fixed_p_std_const(32, 16, 16, 0, 0);
+    gt0 = prim fixed_p_std_gt(32, 16, 16);
     i0 = prim std_reg(1);
     j0 = prim std_reg(7);
     le0 = prim std_le(1);
@@ -425,12 +428,12 @@ component relu1(go: 1, clk: 1, x50_0_read_data: 32, x50_0_done: 1, x60_0_read_da
     group cond0<"static"=0> {
       cond0[done] = 1'd1;
       le0.left = i0.out;
-      le0.right = const2.out;
+      le0.right = const1.out;
     }
     group cond1<"static"=0> {
       cond1[done] = 1'd1;
       le1.left = j0.out;
-      le1.right = const4.out;
+      le1.right = const3.out;
     }
     group cond2<"static"=0> {
       cond2[done] = 1'd1;
@@ -438,17 +441,17 @@ component relu1(go: 1, clk: 1, x50_0_read_data: 32, x50_0_done: 1, x60_0_read_da
       gt0.right = zero_0.out;
     }
     group let0<"static"=1> {
-      zero_0.in = const0.out;
+      zero_0.in = fpconst0.out;
       zero_0.write_en = 1'd1;
       let0[done] = zero_0.done;
     }
     group let1<"static"=1> {
-      i0.in = const1.out;
+      i0.in = const0.out;
       i0.write_en = 1'd1;
       let1[done] = i0.done;
     }
     group let2<"static"=1> {
-      j0.in = const3.out;
+      j0.in = const2.out;
       j0.write_en = 1'd1;
       let2[done] = j0.done;
     }
@@ -477,20 +480,20 @@ component relu1(go: 1, clk: 1, x50_0_read_data: 32, x50_0_done: 1, x60_0_read_da
       x60_0_addr1 = j0.out;
       x60_0_addr0 = i0.out;
       x60_0_write_en = 1'd1;
-      x60_0_write_data = 1'd1 ? const5.out;
+      x60_0_write_data = 1'd1 ? zero_0.out;
       upd3[done] = x60_0_done ? 1'd1;
     }
     group upd4<"static"=1> {
       j0.write_en = 1'd1;
       add0.left = j0.out;
-      add0.right = const6.out;
+      add0.right = const4.out;
       j0.in = 1'd1 ? add0.out;
       upd4[done] = j0.done ? 1'd1;
     }
     group upd5<"static"=1> {
       i0.write_en = 1'd1;
       add1.left = i0.out;
-      add1.right = const7.out;
+      add1.right = const5.out;
       i0.in = 1'd1 ? add1.out;
       upd5[done] = i0.done ? 1'd1;
     }
@@ -525,7 +528,7 @@ component relu1(go: 1, clk: 1, x50_0_read_data: 32, x50_0_done: 1, x60_0_read_da
 }
 component bias_add1(go: 1, clk: 1, fc2_bias0_read_data: 32, fc2_bias0_done: 1, x40_0_read_data: 32, x40_0_done: 1, x50_0_read_data: 32, x50_0_done: 1) -> (done: 1, fc2_bias0_addr0: 7, fc2_bias0_write_data: 32, fc2_bias0_write_en: 1, fc2_bias0_clk: 1, x40_0_addr0: 1, x40_0_addr1: 7, x40_0_write_data: 32, x40_0_write_en: 1, x40_0_clk: 1, x50_0_addr0: 1, x50_0_addr1: 7, x50_0_write_data: 32, x50_0_write_en: 1, x50_0_clk: 1) {
   cells {
-    add0 = prim std_add(32);
+    add0 = prim fixed_p_std_add(32, 16, 16);
     add1 = prim std_add(7);
     add2 = prim std_add(1);
     const0 = prim std_const(1, 0);
@@ -626,7 +629,7 @@ component dense1(go: 1, clk: 1, fc2_weight0_0_read_data: 32, fc2_weight0_0_done:
   cells {
     add0 = prim std_add(8);
     add1 = prim std_add(7);
-    add2 = prim std_add(32);
+    add2 = prim fixed_p_std_add(32, 16, 16);
     add3 = prim std_add(8);
     add4 = prim std_add(7);
     add5 = prim std_add(1);
@@ -671,6 +674,8 @@ component dense1(go: 1, clk: 1, fc2_weight0_0_read_data: 32, fc2_weight0_0_done:
     le6 = prim std_le(7);
     mult_pipe0 = prim std_mult_pipe(32);
     product_0 = prim std_reg(32);
+    slice0 = prim std_slice(32, 32);
+    slice1 = prim std_slice(32, 32);
     temporary_x40_0 = prim std_mem_d2(32, 1, 64, 1, 7);
     temporary_x4_read0_0 = prim std_reg(32);
     transpose_fc2_weight0_0 = prim std_mem_d2(32, 128, 64, 8, 7);
@@ -738,18 +743,20 @@ component dense1(go: 1, clk: 1, fc2_weight0_0_read_data: 32, fc2_weight0_0_done:
       k0.write_en = 1'd1;
       let4[done] = k0.done;
     }
-    group let5<"static"=4> {
-      bin_read0_0.in = mult_pipe0.out;
-      bin_read0_0.write_en = mult_pipe0.done;
+    group let5<"static"=1> {
+      bin_read0_0.in = slice0.out;
+      bin_read0_0.write_en = 1'd1;
       let5[done] = bin_read0_0.done;
+      slice0.in = mult_pipe0.out;
       mult_pipe0.left = x3_read0_0.out;
       mult_pipe0.right = transpose_fc2_weight_read0_0.out;
       mult_pipe0.go = !mult_pipe0.done ? 1'd1;
     }
     group let6<"static"=1> {
-      product_0.in = bin_read0_0.out;
+      product_0.in = slice1.out;
       product_0.write_en = 1'd1;
       let6[done] = product_0.done;
+      slice1.in = bin_read0_0.out;
     }
     group let7<"static"=1> {
       i2.in = const15.out;
@@ -927,15 +934,14 @@ component relu(go: 1, clk: 1, x20_0_read_data: 32, x20_0_done: 1, x30_0_read_dat
   cells {
     add0 = prim std_add(8);
     add1 = prim std_add(1);
-    const0 = prim std_const(32, 0);
+    const0 = prim std_const(1, 0);
     const1 = prim std_const(1, 0);
-    const2 = prim std_const(1, 0);
-    const3 = prim std_const(8, 0);
-    const4 = prim std_const(8, 127);
-    const5 = prim std_const(32, 0);
-    const6 = prim std_const(8, 1);
-    const7 = prim std_const(1, 1);
-    gt0 = prim std_gt(32);
+    const2 = prim std_const(8, 0);
+    const3 = prim std_const(8, 127);
+    const4 = prim std_const(8, 1);
+    const5 = prim std_const(1, 1);
+    fpconst0 = prim fixed_p_std_const(32, 16, 16, 0, 0);
+    gt0 = prim fixed_p_std_gt(32, 16, 16);
     i0 = prim std_reg(1);
     j0 = prim std_reg(8);
     le0 = prim std_le(1);
@@ -948,12 +954,12 @@ component relu(go: 1, clk: 1, x20_0_read_data: 32, x20_0_done: 1, x30_0_read_dat
     group cond0<"static"=0> {
       cond0[done] = 1'd1;
       le0.left = i0.out;
-      le0.right = const2.out;
+      le0.right = const1.out;
     }
     group cond1<"static"=0> {
       cond1[done] = 1'd1;
       le1.left = j0.out;
-      le1.right = const4.out;
+      le1.right = const3.out;
     }
     group cond2<"static"=0> {
       cond2[done] = 1'd1;
@@ -961,17 +967,17 @@ component relu(go: 1, clk: 1, x20_0_read_data: 32, x20_0_done: 1, x30_0_read_dat
       gt0.right = zero_0.out;
     }
     group let0<"static"=1> {
-      zero_0.in = const0.out;
+      zero_0.in = fpconst0.out;
       zero_0.write_en = 1'd1;
       let0[done] = zero_0.done;
     }
     group let1<"static"=1> {
-      i0.in = const1.out;
+      i0.in = const0.out;
       i0.write_en = 1'd1;
       let1[done] = i0.done;
     }
     group let2<"static"=1> {
-      j0.in = const3.out;
+      j0.in = const2.out;
       j0.write_en = 1'd1;
       let2[done] = j0.done;
     }
@@ -1000,20 +1006,20 @@ component relu(go: 1, clk: 1, x20_0_read_data: 32, x20_0_done: 1, x30_0_read_dat
       x30_0_addr1 = j0.out;
       x30_0_addr0 = i0.out;
       x30_0_write_en = 1'd1;
-      x30_0_write_data = 1'd1 ? const5.out;
+      x30_0_write_data = 1'd1 ? zero_0.out;
       upd3[done] = x30_0_done ? 1'd1;
     }
     group upd4<"static"=1> {
       j0.write_en = 1'd1;
       add0.left = j0.out;
-      add0.right = const6.out;
+      add0.right = const4.out;
       j0.in = 1'd1 ? add0.out;
       upd4[done] = j0.done ? 1'd1;
     }
     group upd5<"static"=1> {
       i0.write_en = 1'd1;
       add1.left = i0.out;
-      add1.right = const7.out;
+      add1.right = const5.out;
       i0.in = 1'd1 ? add1.out;
       upd5[done] = i0.done ? 1'd1;
     }
@@ -1048,7 +1054,7 @@ component relu(go: 1, clk: 1, x20_0_read_data: 32, x20_0_done: 1, x30_0_read_dat
 }
 component bias_add(go: 1, clk: 1, fc1_bias0_read_data: 32, fc1_bias0_done: 1, x10_0_read_data: 32, x10_0_done: 1, x20_0_read_data: 32, x20_0_done: 1) -> (done: 1, fc1_bias0_addr0: 8, fc1_bias0_write_data: 32, fc1_bias0_write_en: 1, fc1_bias0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 8, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1, x20_0_addr0: 1, x20_0_addr1: 8, x20_0_write_data: 32, x20_0_write_en: 1, x20_0_clk: 1) {
   cells {
-    add0 = prim std_add(32);
+    add0 = prim fixed_p_std_add(32, 16, 16);
     add1 = prim std_add(8);
     add2 = prim std_add(1);
     const0 = prim std_const(1, 0);
@@ -1149,7 +1155,7 @@ component dense(go: 1, clk: 1, fc1_weight0_0_read_data: 32, fc1_weight0_0_done:
   cells {
     add0 = prim std_add(10);
     add1 = prim std_add(8);
-    add2 = prim std_add(32);
+    add2 = prim fixed_p_std_add(32, 16, 16);
     add3 = prim std_add(10);
     add4 = prim std_add(8);
     add5 = prim std_add(1);
@@ -1194,6 +1200,8 @@ component dense(go: 1, clk: 1, fc1_weight0_0_read_data: 32, fc1_weight0_0_done:
     le6 = prim std_le(8);
     mult_pipe0 = prim std_mult_pipe(32);
     product_0 = prim std_reg(32);
+    slice0 = prim std_slice(32, 32);
+    slice1 = prim std_slice(32, 32);
     temporary_x10_0 = prim std_mem_d2(32, 1, 128, 1, 8);
     temporary_x1_read0_0 = prim std_reg(32);
     transpose_fc1_weight0_0 = prim std_mem_d2(32, 784, 128, 10, 8);
@@ -1261,18 +1269,20 @@ component dense(go: 1, clk: 1, fc1_weight0_0_read_data: 32, fc1_weight0_0_done:
       k0.write_en = 1'd1;
       let4[done] = k0.done;
     }
-    group let5<"static"=4> {
-      bin_read0_0.in = mult_pipe0.out;
-      bin_read0_0.write_en = mult_pipe0.done;
+    group let5<"static"=1> {
+      bin_read0_0.in = slice0.out;
+      bin_read0_0.write_en = 1'd1;
       let5[done] = bin_read0_0.done;
+      slice0.in = mult_pipe0.out;
       mult_pipe0.left = x_read0_0.out;
       mult_pipe0.right = transpose_fc1_weight_read0_0.out;
       mult_pipe0.go = !mult_pipe0.done ? 1'd1;
     }
     group let6<"static"=1> {
-      product_0.in = bin_read0_0.out;
+      product_0.in = slice1.out;
       product_0.write_en = 1'd1;
       let6[done] = product_0.done;
+      slice1.in = bin_read0_0.out;
     }
     group let7<"static"=1> {
       i2.in = const15.out;
diff --git a/frontends/relay-futil/tests/mlp_net.relay b/frontends/relay-futil/tests/mlp_net.relay
index ef53158f58..8943360100 100644
--- a/frontends/relay-futil/tests/mlp_net.relay
+++ b/frontends/relay-futil/tests/mlp_net.relay
@@ -1,17 +1,17 @@
 v0.0.4
-fn (%data: Tensor[(1, 1, 28, 28), int32], %fc1_weight: Tensor[(128, 784), int32], %fc1_bias: Tensor[(128), int32],
-    %fc2_weight: Tensor[(64, 128), int32], %fc2_bias: Tensor[(64), int32], %fc3_weight: Tensor[(10, 64), int32],
-    %fc3_bias: Tensor[(10), int32]) -> Tensor[(1, 10), int32] {
-  let %x: Tensor[(1, 784), int32] = nn.batch_flatten(%data) /* ty=Tensor[(1, 784), int32] */;
-  let %x1: Tensor[(1, 128), int32] = nn.dense(%x, %fc1_weight, units=128) /* ty=Tensor[(1, 128), int32] */;
-  let %x2: Tensor[(1, 128), int32] = nn.bias_add(%x1, %fc1_bias, axis=-1) /* ty=Tensor[(1, 128), int32] */;
-  let %x3: Tensor[(1, 128), int32] = nn.relu(%x2) /* ty=Tensor[(1, 128), int32] */;
-  let %x4: Tensor[(1, 64), int32] = nn.dense(%x3, %fc2_weight, units=64) /* ty=Tensor[(1, 64), int32] */;
-  let %x5: Tensor[(1, 64), int32] = nn.bias_add(%x4, %fc2_bias, axis=-1) /* ty=Tensor[(1, 64), int32] */;
-  let %x6: Tensor[(1, 64), int32] = nn.relu(%x5) /* ty=Tensor[(1, 64), int32] */;
-  let %x7: Tensor[(1, 10), int32] = nn.dense(%x6, %fc3_weight, units=10) /* ty=Tensor[(1, 10), int32] */;
-  let %x8: Tensor[(1, 10), int32] = nn.bias_add(%x7, %fc3_bias, axis=-1) /* ty=Tensor[(1, 10), int32] */;
+fn (%data: Tensor[(1, 1, 28, 28), float32], %fc1_weight: Tensor[(128, 784), float32], %fc1_bias: Tensor[(128), float32],
+    %fc2_weight: Tensor[(64, 128), float32], %fc2_bias: Tensor[(64), float32], %fc3_weight: Tensor[(10, 64), float32],
+    %fc3_bias: Tensor[(10), float32]) -> Tensor[(1, 10), float32] {
+  let %x: Tensor[(1, 784), float32] = nn.batch_flatten(%data) /* ty=Tensor[(1, 784), float32] */;
+  let %x1: Tensor[(1, 128), float32] = nn.dense(%x, %fc1_weight, units=128) /* ty=Tensor[(1, 128), float32] */;
+  let %x2: Tensor[(1, 128), float32] = nn.bias_add(%x1, %fc1_bias, axis=-1) /* ty=Tensor[(1, 128), float32] */;
+  let %x3: Tensor[(1, 128), float32] = nn.relu(%x2) /* ty=Tensor[(1, 128), float32] */;
+  let %x4: Tensor[(1, 64), float32] = nn.dense(%x3, %fc2_weight, units=64) /* ty=Tensor[(1, 64), float32] */;
+  let %x5: Tensor[(1, 64), float32] = nn.bias_add(%x4, %fc2_bias, axis=-1) /* ty=Tensor[(1, 64), float32] */;
+  let %x6: Tensor[(1, 64), float32] = nn.relu(%x5) /* ty=Tensor[(1, 64), float32] */;
+  let %x7: Tensor[(1, 10), float32] = nn.dense(%x6, %fc3_weight, units=10) /* ty=Tensor[(1, 10), float32] */;
+  let %x8: Tensor[(1, 10), float32] = nn.bias_add(%x7, %fc3_bias, axis=-1) /* ty=Tensor[(1, 10), float32] */;
   %x8
-  // let %x9: Tensor[(1, 10), int32] = nn.softmax(%x8) /* ty=Tensor[(1, 10), int32] */;
+  // let %x9: Tensor[(1, 10), float32] = nn.softmax(%x8) /* ty=Tensor[(1, 10), float32] */;
   // %x9
 }

From 822f1f32beb1c571c3dc60cbb6e5c7ccbcababb9 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Fri, 27 Nov 2020 14:13:57 -0500
Subject: [PATCH 57/75] Add element-wise sqrt.

---
 frontends/relay-futil/compiler.py         |   3 +-
 frontends/relay-futil/dahlia_functions.py |  20 +++
 frontends/relay-futil/tests/sqrt.expect   | 180 ++++++++++++++++++++++
 frontends/relay-futil/tests/sqrt.relay    |   6 +
 4 files changed, 208 insertions(+), 1 deletion(-)
 create mode 100644 frontends/relay-futil/tests/sqrt.expect
 create mode 100644 frontends/relay-futil/tests/sqrt.relay

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index 75dc565df9..86fd5bd3e3 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -14,7 +14,8 @@
 
 # Mapping from Relay function names to their respective Dahlia lowering.
 RelayFunctionCalls = {'nn.dense': dense, 'nn.batch_flatten': batch_flatten, 'nn.batch_matmul': batch_matmul,
-                      'nn.bias_add': bias_add, 'nn.relu': relu, 'negative': negative, 'expand_dims': expand_dims}
+                      'nn.bias_add': bias_add, 'nn.relu': relu, 'negative': negative, 'expand_dims': expand_dims,
+                      'sqrt': sqrt}
 
 # Mapping between primitive type and associated Dahlia name extension.
 # E.g. A 2D memory primitive named `A` will be lowered to `A0_0`.
diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 1eb29a9b53..106c000205 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -207,6 +207,26 @@ def negative(declaration):
     return lower_dahlia_program(f"""{declarations}{NEWL}{program_body}""", declaration.component_name)
 
 
+# TODO(cgyurgyik): Similar to ReLU, this requires signed operands.
+def sqrt(declaration):
+    """https://tvm.apache.org/docs/api/python/relay/index.html#tvm.relay.sqrt"""
+    op, res = declaration.inputs[0].primitive, declaration.output.primitive
+    bitwidth, num_dimensions, data_type = op.data[0], op.type, op.data_type
+    include_sqrt = f"""import "fxp_sqrt.h" {{ def sqrt(value: {data_type}<{bitwidth}>): {data_type}<{bitwidth}>; }}"""
+
+    indices = ""
+    variable_name = CHARACTER_I
+    for i in range(0, num_dimensions):
+        # Append one `[<loop variable>]` subscript per dimension; used to index both `op` and `res` below.
+        indices += f'[{variable_name}]'
+        variable_name = next_character(variable_name)
+
+    declarations = pp_dahlia_memory_declarations([op, res])
+    program_body = pp_dahlia_loop(op, f"""{res.name}{indices} := sqrt({op.name}{indices});""")
+    return lower_dahlia_program(f"""{include_sqrt}{NEWL}{declarations}{NEWL}{program_body}""",
+                                declaration.component_name)
+
+
 def expand_dims(declaration):
     """https://tvm.apache.org/docs/api/python/relay/index.html#tvm.relay.expand_dims"""
     axis, num_newaxis = declaration.attributes.get_int("axis"), declaration.attributes.get_int("num_newaxis")
diff --git a/frontends/relay-futil/tests/sqrt.expect b/frontends/relay-futil/tests/sqrt.expect
new file mode 100644
index 0000000000..edb40c6259
--- /dev/null
+++ b/frontends/relay-futil/tests/sqrt.expect
@@ -0,0 +1,180 @@
+import "primitives/std.lib";
+
+component sqrt(go: 1, clk: 1, x0_0_0_0_read_data: 32, x0_0_0_0_done: 1, x10_0_0_0_read_data: 32, x10_0_0_0_done: 1) -> (done: 1, x0_0_0_0_addr0: 4, x0_0_0_0_addr1: 4, x0_0_0_0_addr2: 5, x0_0_0_0_addr3: 7, x0_0_0_0_write_data: 32, x0_0_0_0_write_en: 1, x0_0_0_0_clk: 1, x10_0_0_0_addr0: 4, x10_0_0_0_addr1: 4, x10_0_0_0_addr2: 5, x10_0_0_0_addr3: 7, x10_0_0_0_write_data: 32, x10_0_0_0_write_en: 1, x10_0_0_0_clk: 1) {
+  cells {
+    add0 = prim std_add(7);
+    add1 = prim std_add(5);
+    add2 = prim std_add(4);
+    add3 = prim std_add(4);
+    const0 = prim std_const(4, 0);
+    const1 = prim std_const(4, 7);
+    const10 = prim std_const(4, 1);
+    const11 = prim std_const(4, 1);
+    const2 = prim std_const(4, 0);
+    const3 = prim std_const(4, 7);
+    const4 = prim std_const(5, 0);
+    const5 = prim std_const(5, 15);
+    const6 = prim std_const(7, 0);
+    const7 = prim std_const(7, 63);
+    const8 = prim std_const(7, 1);
+    const9 = prim std_const(5, 1);
+    i0 = prim std_reg(4);
+    j0 = prim std_reg(4);
+    k0 = prim std_reg(5);
+    l0 = prim std_reg(7);
+    le0 = prim std_le(4);
+    le1 = prim std_le(4);
+    le2 = prim std_le(5);
+    le3 = prim std_le(7);
+    sqrt0 = prim std_sqrt();
+    x_read0_0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = j0.out;
+      le1.right = const3.out;
+    }
+    group cond2<"static"=0> {
+      cond2[done] = 1'd1;
+      le2.left = k0.out;
+      le2.right = const5.out;
+    }
+    group cond3<"static"=0> {
+      cond3[done] = 1'd1;
+      le3.left = l0.out;
+      le3.right = const7.out;
+    }
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
+    }
+    group let1<"static"=1> {
+      j0.in = const2.out;
+      j0.write_en = 1'd1;
+      let1[done] = j0.done;
+    }
+    group let2<"static"=1> {
+      k0.in = const4.out;
+      k0.write_en = 1'd1;
+      let2[done] = k0.done;
+    }
+    group let3<"static"=1> {
+      l0.in = const6.out;
+      l0.write_en = 1'd1;
+      let3[done] = l0.done;
+    }
+    group upd0<"static"=1> {
+      x_read0_0.write_en = 1'd1;
+      x0_0_0_0_addr3 = l0.out;
+      x0_0_0_0_addr2 = k0.out;
+      x0_0_0_0_addr1 = j0.out;
+      x0_0_0_0_addr0 = i0.out;
+      x_read0_0.in = 1'd1 ? x0_0_0_0_read_data;
+      upd0[done] = x_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=18> {
+      x10_0_0_0_addr3 = l0.out;
+      x10_0_0_0_addr2 = k0.out;
+      x10_0_0_0_addr1 = j0.out;
+      x10_0_0_0_addr0 = i0.out;
+      x10_0_0_0_write_en = sqrt0.done;
+      sqrt0.in = x_read0_0.out;
+      sqrt0.go = !sqrt0.done ? 1'd1;
+      x10_0_0_0_write_data = sqrt0.done ? sqrt0.out;
+      upd1[done] = x10_0_0_0_done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      l0.write_en = 1'd1;
+      add0.left = l0.out;
+      add0.right = const8.out;
+      l0.in = 1'd1 ? add0.out;
+      upd2[done] = l0.done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      k0.write_en = 1'd1;
+      add1.left = k0.out;
+      add1.right = const9.out;
+      k0.in = 1'd1 ? add1.out;
+      upd3[done] = k0.done ? 1'd1;
+    }
+    group upd4<"static"=1> {
+      j0.write_en = 1'd1;
+      add2.left = j0.out;
+      add2.right = const10.out;
+      j0.in = 1'd1 ? add2.out;
+      upd4[done] = j0.done ? 1'd1;
+    }
+    group upd5<"static"=1> {
+      i0.write_en = 1'd1;
+      add3.left = i0.out;
+      add3.right = const11.out;
+      i0.in = 1'd1 ? add3.out;
+      upd5[done] = i0.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      while le0.out with cond0 {
+        seq {
+          let1;
+          while le1.out with cond1 {
+            seq {
+              let2;
+              while le2.out with cond2 {
+                seq {
+                  let3;
+                  while le3.out with cond3 {
+                    seq {
+                      upd0;
+                      upd1;
+                      upd2;
+                    }
+                  }
+                  upd3;
+                }
+              }
+              upd4;
+            }
+          }
+          upd5;
+        }
+      }
+    }
+  }
+}
+
+component main () -> () {
+  cells {
+    x1 = prim std_mem_d4(32, 8, 8, 16, 64, 4, 4, 5, 7);
+    x = prim std_mem_d4(32, 8, 8, 16, 64, 4, 4, 5, 7);
+    sqrt0 = sqrt;
+  }
+  wires {
+    group run_sqrt {
+      x.addr0 = sqrt0.x0_0_0_0_addr0;
+      sqrt0.x0_0_0_0_read_data = x.read_data;
+      x.addr1 = sqrt0.x0_0_0_0_addr1;
+      x.addr2 = sqrt0.x0_0_0_0_addr2;
+      x1.addr0 = sqrt0.x10_0_0_0_addr0;
+      x1.write_data = sqrt0.x10_0_0_0_write_data;
+      x1.write_en = sqrt0.x10_0_0_0_write_en;
+      sqrt0.x10_0_0_0_done = x1.done;
+      sqrt0.go = 1'd1;
+      run_sqrt[done] = sqrt0.done ? 1'd1;
+    }
+  }
+  control {
+    seq {
+      run_sqrt;
+    }
+  }
+}
diff --git a/frontends/relay-futil/tests/sqrt.relay b/frontends/relay-futil/tests/sqrt.relay
new file mode 100644
index 0000000000..abb0faaae8
--- /dev/null
+++ b/frontends/relay-futil/tests/sqrt.relay
@@ -0,0 +1,6 @@
+v0.0.4
+fn (%x: Tensor[(8, 8, 16, 64), int32]) {
+  let %x1 = sqrt(%x);
+  %x1
+}
+

From 87dfc0365439bcda16848e50e492e4548f56e370 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Wed, 2 Dec 2020 13:37:50 -0500
Subject: [PATCH 58/75] Add initial softmax and max_pool2d lowering.

---
 frontends/relay-futil/compiler.py             |  14 +-
 frontends/relay-futil/dahlia_functions.py     |  88 +++-
 frontends/relay-futil/example.py              |  14 +-
 .../relay-futil/tests/data/max_pool2d.expect  | 162 ++++++++
 .../relay-futil/tests/data/max_pool2d.relay   |   6 +
 .../tests/data/max_pool2d.relay.data          |  20 +
 .../relay-futil/tests/data/softmax.expect     |  14 +
 .../relay-futil/tests/data/softmax.relay      |   6 +
 .../relay-futil/tests/data/softmax.relay.data |  10 +
 .../tests/data/tensor4d_multiply.expect       |   0
 .../tests/data/tensor4d_multiply.relay        |   5 +
 .../tests/data/tensor4d_multiply.relay.data   |  23 ++
 frontends/relay-futil/tests/max_pool2d.expect | 379 ++++++++++++++++++
 frontends/relay-futil/tests/max_pool2d.relay  |   6 +
 frontends/relay-futil/tests/mlp_net.expect    | 179 +++++++++
 frontends/relay-futil/tests/mlp_net.relay     |   5 +-
 frontends/relay-futil/tests/softmax.expect    | 193 +++++++++
 frontends/relay-futil/tests/softmax.relay     |   6 +
 frontends/relay-futil/utilities.py            |   9 +-
 19 files changed, 1116 insertions(+), 23 deletions(-)
 create mode 100644 frontends/relay-futil/tests/data/max_pool2d.expect
 create mode 100644 frontends/relay-futil/tests/data/max_pool2d.relay
 create mode 100644 frontends/relay-futil/tests/data/max_pool2d.relay.data
 create mode 100644 frontends/relay-futil/tests/data/softmax.expect
 create mode 100644 frontends/relay-futil/tests/data/softmax.relay
 create mode 100644 frontends/relay-futil/tests/data/softmax.relay.data
 create mode 100644 frontends/relay-futil/tests/data/tensor4d_multiply.expect
 create mode 100644 frontends/relay-futil/tests/data/tensor4d_multiply.relay
 create mode 100644 frontends/relay-futil/tests/data/tensor4d_multiply.relay.data
 create mode 100644 frontends/relay-futil/tests/max_pool2d.expect
 create mode 100644 frontends/relay-futil/tests/max_pool2d.relay
 create mode 100644 frontends/relay-futil/tests/softmax.expect
 create mode 100644 frontends/relay-futil/tests/softmax.relay

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index 86fd5bd3e3..5212354512 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -14,8 +14,8 @@
 
 # Mapping from Relay function names to their respective Dahlia lowering.
 RelayFunctionCalls = {'nn.dense': dense, 'nn.batch_flatten': batch_flatten, 'nn.batch_matmul': batch_matmul,
-                      'nn.bias_add': bias_add, 'nn.relu': relu, 'negative': negative, 'expand_dims': expand_dims,
-                      'sqrt': sqrt}
+                      'nn.bias_add': bias_add, 'nn.relu': relu, 'nn.softmax': softmax, 'nn.max_pool2d': max_pool2d,
+                      'negative': negative, 'expand_dims': expand_dims, 'sqrt': sqrt}
 
 # Mapping between primitive type and associated Dahlia name extension.
 # E.g. A 2D memory primitive named `A` will be lowered to `A0_0`.
@@ -58,10 +58,10 @@ def relay_id(self, name):
 
     def dahlia_name(self, name, type):
         """
-        Dahlia uses the following naming scheme for an arbitrary variable 'X':
-        Memory1D: 'X0', 'X1', 'X2', ...
-        Memory2D: 'X0_0', 'X1_0', 'X2_0', ...
-        Memory3D: 'X0_0_0', 'X1_0_0', 'X2_0_0', ...
+        Dahlia uses the following naming scheme for arbitrary variables `X`, `Y`:
+        Memory1D: `X0`, `Y0`, ...
+        Memory2D: `X0_0`, `Y0_0`, ...
+        Memory3D: `X0_0_0`, `Y0_0_0`, ...
         """
         assert type in DahliaNameExtension, f'{name} with {type} is not supported yet.'
         return name + DahliaNameExtension[type]
@@ -80,7 +80,7 @@ def get_dahlia_declaration(self, function_name, cells, args, attrs):
             function = RelayFunctionCalls[function_name]
             name = function.__name__
         else:
-            assert False, f'{function_name} with type {input_type} is not supported.'
+            assert False, f'{function_name} is not supported for lowering to FuTIL.'
         return DahliaDeclaration(component_name=self.relay_id(name), decl_name=self.id(name),
                                  op=op, inputs=args, attributes=attrs, function=function)
 
diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 106c000205..e266904fb7 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -52,7 +52,7 @@ def broadcast(declaration):
     """
     https://numpy.org/doc/stable/user/basics.broadcasting.html
     Implements array broadcasting:
-    Two dimensions are compatible when either (1) they're equal, or (2) one of them is 1.
+    Two dimensions are compatible when either (1) they're equal, or (2) one of them is `1`.
     It is not required that both operands have the same number of dimensions either.
     - When lowering from Relay IR, we are guaranteed the arrays are compatible for broadcasting.
     - Variable names for indexing through the array begin with `i`, and continue alphabetically.
@@ -88,11 +88,9 @@ def broadcast(declaration):
         res_indices.append(current_dimension)
         if op1_dims > op2_dims and len(op2_sizes) <= i:
             op1_indices.append(current_dimension)
-            continue
-        if op2_dims > op1_dims and len(op1_sizes) <= i:
+        elif op2_dims > op1_dims and len(op1_sizes) <= i:
             op2_indices.append(current_dimension)
-            continue
-        if op1_sizes[i] == op2_sizes[i]:
+        elif op1_sizes[i] == op2_sizes[i]:
             op1_indices.append(current_dimension)
             op2_indices.append(current_dimension)
         elif op1_sizes[i] > op2_sizes[i]:
@@ -170,11 +168,11 @@ def bias_add(declaration):
 def relu(declaration):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.relu"""
     data, res = declaration.inputs[0].primitive, declaration.output.primitive
-    bitwidth, num_dimensions = data.data[0], data.type
+    bitwidth, num_dimensions, data_type = data.data[0], data.type, data.data_type
 
     declarations = pp_dahlia_memory_declarations([data, res])
-    zero = '0.0' if data.data_type == 'ufix' else '0'
-    let_zero = f'let zero: {data.data_type}<{bitwidth}> = {zero};'
+    zero = '0.0' if data_type == 'ufix' or data_type == 'fix' else '0'
+    let_zero = f'let zero: {data_type}<{bitwidth}> = {zero};'
 
     indices = ""
     variable_name = CHARACTER_I
@@ -301,7 +299,7 @@ def batch_matmul(declaration):
 # TODO(cgyurgyik): Similar to batch_matmul, this requires a temporary memory to store the output
 # of the matrix multiply. Otherwise, the values aren't computed properly. Look deeper into this.
 def dense(declaration):
-    """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.batch_matmul"""
+    """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.dense"""
     # TODO(cgyurgyik): Add support for `units`.
     units = declaration.attributes.get_int("units")
     op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
@@ -335,3 +333,75 @@ def dense(declaration):
     }}
     """
     return lower_dahlia_program(program, declaration.component_name)
+
+
+# TODO(cgyurgyik): Currently, only supports a small subset (namely those used in our VGG net and MLP net examples).
+def softmax(declaration):
+    """Lowers 2D softmax to Dahlia: https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.softmax"""
+    op, res = declaration.inputs[0].primitive, declaration.output.primitive
+    axis = declaration.attributes.get_int("axis")  # Must be -1 or 1 (asserted below).
+    data_type = op.data_type
+    assert op.type == PrimitiveType.Memory2D, f'nn.softmax with primitive type Memory{op.type}D is not supported.'
+    assert axis == -1 or axis == 1, f'nn.softmax with axis = {axis} is not supported.'
+    bitwidth, size0, size1, index_size0, index_size1 = op.data[0], op.data[1], op.data[2], op.data[3], op.data[4]
+
+    import_exp = f"""import "std_exp.h" {{ def exp(x: {data_type}<{bitwidth}>): {data_type}<{bitwidth}>; }}"""
+    declarations = pp_dahlia_memory_declarations([res, op])
+
+    zero = '0.0' if data_type == 'ufix' or data_type == 'fix' else '0'  # Initial value of the exp sum.
+    body = f"""
+    for (let i: ubit<{index_size0}> = 0..{size0}) {{
+      let {op.name}_expsum: {data_type}<{bitwidth}> = {zero};
+      for (let j: ubit<{index_size1}> = 0..{size1}) {{ {op.name}_expsum += exp({op.name}[i][j]); }}
+      for (let k: ubit<{index_size1}> = 0..{size1}) {{ 
+        {res.name}[i][k] := exp({op.name}[i][k]); 
+        ---
+        {res.name}[i][k] := {res.name}[i][k] / {op.name}_expsum;
+      }}
+    }}
+    """
+    program = f"""{import_exp}{NEWL}{declarations}{body}"""
+    return lower_dahlia_program(program, declaration.component_name)
+
+
+def max_pool2d(declaration):
+    """Lowers max_pool2d to Dahlia: https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.max_pool2d"""
+    data, res = declaration.inputs[0].primitive, declaration.output.primitive
+
+    strides = declaration.attributes.get_int_tuple("strides")  # Window step along k and l.
+    pool_size = declaration.attributes.get_int_tuple("pool_size")  # Bounds of the m and n window loops.
+    padding = declaration.attributes.get_int_tuple("padding")
+    layout = declaration.attributes.get_str("layout")
+    ceil_mode = declaration.attributes.get_int("ceil_mode")  # Fetched as an int; compared to False below.
+    for p in padding: assert p == 0, f"Non-zero padding: {padding} is not currently supported for nn.max_pool2d"
+    assert layout == 'NCHW', f"Layout \'{layout}\' is not currently supported for nn.max_pool2d; please use `NCHW`"
+    assert ceil_mode == False, "`ceil_mode` is not currently supported for nn.max_pool2d"
+    bitwidth, data_type = data.data[0], data.data_type  # Element width and type of the input.
+    size0, size1, size2, size3 = res.data[1], res.data[2], res.data[3], res.data[4]  # i/j/k/l loop bounds.
+
+    declarations = pp_dahlia_memory_declarations([res, data])
+    program_body = f"""
+    for (let i: ubit<32> = 0..{size0}) {{
+      for (let j: ubit<32> = 0..{size1}) {{
+        for (let k: ubit<32> = 0..{size2}) {{
+          for (let l: ubit<32> = 0..{size3}) {{
+            let stride_x: ubit<32> = k * {strides[0]}/*stride[0]*/;
+            let stride_y: ubit<32> = l * {strides[1]}/*stride[1]*/;
+            
+            let max: {data_type}<{bitwidth}> = {data.name}[i][j][stride_x][stride_y];
+            for (let m: ubit<32> = 0..{pool_size[0]}/*pool_size[0]*/) {{
+              for (let n: ubit<32> = 0..{pool_size[1]}/*pool_size[1]*/) {{
+                let pool_x: ubit<32> = stride_x + m;
+                let pool_y: ubit<32> = stride_y + n;
+                let current: {data_type}<{bitwidth}> = {data.name}[i][j][pool_x][pool_y]; 
+                if (current > max) {{ max := current; }} else {{ max := max; }}
+              }}
+            }}
+            {res.name}[i][j][k][l] := max;
+          }} 
+        }} 
+      }} 
+    }} 
+    """
+    program = f"""{declarations}{NEWL}{program_body}"""  # NOTE(review): confirm result write runs after the m/n loops.
+    return lower_dahlia_program(program, declaration.component_name)
diff --git a/frontends/relay-futil/example.py b/frontends/relay-futil/example.py
index 0a986e9453..69db563726 100644
--- a/frontends/relay-futil/example.py
+++ b/frontends/relay-futil/example.py
@@ -49,6 +49,15 @@ def dense():
     return relay.Function([x, y], relay.nn.dense(x, y, units=10))
 
 
+def softmax():
+    x = relay.var('x', shape=[1, 10], dtype='float32')
+    return relay.Function([x], relay.nn.softmax(x))  # No `axis` passed; Relay's default applies.
+
+
+def max_pool2d():
+    data = relay.var('data', dtype='int32', shape=[2, 2, 4, 4])
+    return relay.Function([data], relay.nn.max_pool2d(data, pool_size=[2, 2], strides=[2, 2], padding=[0, 0, 0, 0]))
+
 def mlp_net():
     """The MLP test from Relay."""
     from tvm.relay.testing import mlp
@@ -58,11 +67,12 @@ def mlp_net():
 def vgg_net():
     """The VGG test from Relay."""
     from tvm.relay.testing import vgg
-    return vgg.get_net(batch_size=1, image_shape=(3, 224, 224), num_classes=10, dtype='int32', num_layers=11,
+    return vgg.get_net(batch_size=5, image_shape=(3, 224, 224), num_classes=10, dtype='int32', num_layers=13,
                        batch_norm=True)
 
 
-ALL_FUNCS = [add, tensor_subtract, expand_dims, batch_flatten, batch_matmul, bias_add, relu, dense, mlp_net, vgg_net]
+ALL_FUNCS = [add, tensor_subtract, expand_dims, batch_flatten, batch_matmul, bias_add, relu, dense, softmax, mlp_net,
+             vgg_net, max_pool2d]
 FUNC_NAMES = list(map(lambda x: x.__name__, ALL_FUNCS))
 
 
diff --git a/frontends/relay-futil/tests/data/max_pool2d.expect b/frontends/relay-futil/tests/data/max_pool2d.expect
new file mode 100644
index 0000000000..a1c95c7c88
--- /dev/null
+++ b/frontends/relay-futil/tests/data/max_pool2d.expect
@@ -0,0 +1,162 @@
+{
+  "data": [
+    [
+      [
+        [
+          12,
+          30,
+          34,
+          37
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          8,
+          2,
+          112,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          20,
+          0,
+          70,
+          25
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          12,
+          0,
+          100,
+          12
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ]
+      ]
+    ]
+  ],
+  "result": [
+    [
+      [
+        [
+          30,
+          37
+        ],
+        [
+          2,
+          4
+        ]
+      ],
+      [
+        [
+          8,
+          112
+        ],
+        [
+          2,
+          4
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          20,
+          70
+        ],
+        [
+          2,
+          4
+        ]
+      ],
+      [
+        [
+          12,
+          100
+        ],
+        [
+          2,
+          4
+        ]
+      ]
+    ]
+  ]
+}
diff --git a/frontends/relay-futil/tests/data/max_pool2d.relay b/frontends/relay-futil/tests/data/max_pool2d.relay
new file mode 100644
index 0000000000..e1ba79d351
--- /dev/null
+++ b/frontends/relay-futil/tests/data/max_pool2d.relay
@@ -0,0 +1,6 @@
+v0.0.4
+fn (%data: Tensor[(2, 2, 4, 4), int32]) {
+  let %result: Tensor[(2, 2, 2, 2), int32] = nn.max_pool2d(%data, pool_size=[2, 2], strides=[2, 2], padding=[0,0,0,0]);
+  %result
+}
+
diff --git a/frontends/relay-futil/tests/data/max_pool2d.relay.data b/frontends/relay-futil/tests/data/max_pool2d.relay.data
new file mode 100644
index 0000000000..9e19e8a053
--- /dev/null
+++ b/frontends/relay-futil/tests/data/max_pool2d.relay.data
@@ -0,0 +1,20 @@
+{
+  "data": {
+    "data": [
+            [[[12,30,34,37], [1,2,3,4], [1,2,3,4], [1,2,3,4]]], [[[8,2,112,4], [1,2,3,4], [1,2,3,4], [1,2,3,4]]],
+            [[[20,0,70,25], [1,2,3,4], [1,2,3,4], [1,2,3,4]]], [[[12,0,100,12], [1,2,3,4], [1,2,3,4], [1,2,3,4]]]
+            ],
+    "bitwidth": 32
+  },
+  "max": {
+      "data": [0],
+      "bitwidth": 32
+  },
+  "result": {
+    "data": [
+             [[[0,0], [0,0]], [[0,0], [0,0]]],
+             [[[0,0], [0,0]], [[0,0], [0,0]]]
+            ],
+    "bitwidth": 32
+  }
+}
\ No newline at end of file
diff --git a/frontends/relay-futil/tests/data/softmax.expect b/frontends/relay-futil/tests/data/softmax.expect
new file mode 100644
index 0000000000..1073dc7c6c
--- /dev/null
+++ b/frontends/relay-futil/tests/data/softmax.expect
@@ -0,0 +1,14 @@
+{
+  "x": [
+    [
+      4,
+      16
+    ]
+  ],
+  "x1": [
+    [
+      0,
+      0
+    ]
+  ]
+}
diff --git a/frontends/relay-futil/tests/data/softmax.relay b/frontends/relay-futil/tests/data/softmax.relay
new file mode 100644
index 0000000000..858ae52126
--- /dev/null
+++ b/frontends/relay-futil/tests/data/softmax.relay
@@ -0,0 +1,6 @@
+v0.0.4
+fn (%x: Tensor[(1, 2), float32]) {
+  let %x1: Tensor[(1, 2), float32] = nn.softmax(%x);
+  %x1
+}
+
diff --git a/frontends/relay-futil/tests/data/softmax.relay.data b/frontends/relay-futil/tests/data/softmax.relay.data
new file mode 100644
index 0000000000..f0d81e4e55
--- /dev/null
+++ b/frontends/relay-futil/tests/data/softmax.relay.data
@@ -0,0 +1,10 @@
+{
+  "x": {
+    "data": [[4, 16]],
+    "bitwidth": 32
+  },
+  "x1": {
+    "data": [[0, 0]],
+    "bitwidth": 32
+  }
+}
\ No newline at end of file
diff --git a/frontends/relay-futil/tests/data/tensor4d_multiply.expect b/frontends/relay-futil/tests/data/tensor4d_multiply.expect
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/frontends/relay-futil/tests/data/tensor4d_multiply.relay b/frontends/relay-futil/tests/data/tensor4d_multiply.relay
new file mode 100644
index 0000000000..197d3c9564
--- /dev/null
+++ b/frontends/relay-futil/tests/data/tensor4d_multiply.relay
@@ -0,0 +1,5 @@
+v0.0.4
+fn (%x: Tensor[(2, 2, 4, 4), int32], %x1: Tensor[(2, 2, 4, 4), int32]) {
+  let %x2: Tensor[(2, 2, 4, 4), int32] = multiply(%x, %x1);
+  %x2
+}
diff --git a/frontends/relay-futil/tests/data/tensor4d_multiply.relay.data b/frontends/relay-futil/tests/data/tensor4d_multiply.relay.data
new file mode 100644
index 0000000000..6cdaa8c7a7
--- /dev/null
+++ b/frontends/relay-futil/tests/data/tensor4d_multiply.relay.data
@@ -0,0 +1,23 @@
+{
+  "x": {
+    "data": [
+            [[[1,2,3,4], [1,2,3,4], [1,2,3,4], [1,2,3,4]]], [[[1,2,3,4], [1,2,3,4], [1,2,3,4], [1,2,3,4]]],
+            [[[1,2,3,4], [1,2,3,4], [1,2,3,4], [1,2,3,4]]], [[[1,2,3,4], [1,2,3,4], [1,2,3,4], [1,2,3,4]]]
+            ],
+    "bitwidth": 32
+  },
+  "x1": {
+    "data": [
+            [[[1,2,3,4], [1,2,3,4], [1,2,3,4], [1,2,3,4]]], [[[1,2,3,4], [1,2,3,4], [1,2,3,4], [1,2,3,4]]],
+            [[[1,2,3,4], [1,2,3,4], [1,2,3,4], [1,2,3,4]]], [[[1,2,3,4], [1,2,3,4], [1,2,3,4], [1,2,3,4]]]
+            ],
+    "bitwidth": 32
+  },
+  "x2": {
+    "data": [
+            [[[0,0,0,0], [0,0,0,0], [0,0,0,0], [0,0,0,0]]], [[[0,0,0,0], [0,0,0,0], [0,0,0,0], [0,0,0,0]]],
+            [[[0,0,0,0], [0,0,0,0], [0,0,0,0], [0,0,0,0]]], [[[0,0,0,0], [0,0,0,0], [0,0,0,0], [0,0,0,0]]]
+            ],
+    "bitwidth": 32
+  }
+}
\ No newline at end of file
diff --git a/frontends/relay-futil/tests/max_pool2d.expect b/frontends/relay-futil/tests/max_pool2d.expect
new file mode 100644
index 0000000000..cbd0da89b2
--- /dev/null
+++ b/frontends/relay-futil/tests/max_pool2d.expect
@@ -0,0 +1,379 @@
+import "primitives/std.lib";
+
+component max_pool2d(go: 1, clk: 1, data0_0_0_0_read_data: 32, data0_0_0_0_done: 1, result0_0_0_0_read_data: 32, result0_0_0_0_done: 1) -> (done: 1, data0_0_0_0_addr0: 2, data0_0_0_0_addr1: 2, data0_0_0_0_addr2: 3, data0_0_0_0_addr3: 3, data0_0_0_0_write_data: 32, data0_0_0_0_write_en: 1, data0_0_0_0_clk: 1, result0_0_0_0_addr0: 2, result0_0_0_0_addr1: 2, result0_0_0_0_addr2: 2, result0_0_0_0_addr3: 2, result0_0_0_0_write_data: 32, result0_0_0_0_write_en: 1, result0_0_0_0_clk: 1) {
+  cells {
+    add0 = prim std_add(32);
+    add1 = prim std_add(32);
+    add2 = prim std_add(32);
+    add3 = prim std_add(32);
+    add4 = prim std_add(32);
+    add5 = prim std_add(32);
+    add6 = prim std_add(32);
+    add7 = prim std_add(32);
+    bin_read0_0 = prim std_reg(32);
+    bin_read1_0 = prim std_reg(32);
+    const0 = prim std_const(32, 0);
+    const1 = prim std_const(32, 1);
+    const10 = prim std_const(32, 0);
+    const11 = prim std_const(32, 1);
+    const12 = prim std_const(32, 0);
+    const13 = prim std_const(32, 1);
+    const14 = prim std_const(32, 1);
+    const15 = prim std_const(32, 1);
+    const16 = prim std_const(32, 1);
+    const17 = prim std_const(32, 1);
+    const18 = prim std_const(32, 1);
+    const19 = prim std_const(32, 1);
+    const2 = prim std_const(32, 0);
+    const3 = prim std_const(32, 1);
+    const4 = prim std_const(32, 0);
+    const5 = prim std_const(32, 1);
+    const6 = prim std_const(32, 0);
+    const7 = prim std_const(32, 1);
+    const8 = prim std_const(32, 2);
+    const9 = prim std_const(32, 2);
+    data_read0_0 = prim std_reg(32);
+    data_read1_0 = prim std_reg(32);
+    gt0 = prim std_gt(32);
+    i0 = prim std_reg(32);
+    j0 = prim std_reg(32);
+    k0 = prim std_reg(32);
+    l0 = prim std_reg(32);
+    le0 = prim std_le(32);
+    le1 = prim std_le(32);
+    le2 = prim std_le(32);
+    le3 = prim std_le(32);
+    le4 = prim std_le(32);
+    le5 = prim std_le(32);
+    m0 = prim std_reg(32);
+    max_0 = prim std_reg(32);
+    mult_pipe0 = prim std_mult_pipe(32);
+    mult_pipe1 = prim std_mult_pipe(32);
+    n0 = prim std_reg(32);
+    pool_x_0 = prim std_reg(32);
+    pool_y_0 = prim std_reg(32);
+    slice0 = prim std_slice(32, 2);
+    slice1 = prim std_slice(32, 2);
+    slice10 = prim std_slice(32, 3);
+    slice11 = prim std_slice(32, 3);
+    slice12 = prim std_slice(32, 2);
+    slice13 = prim std_slice(32, 2);
+    slice14 = prim std_slice(32, 2);
+    slice15 = prim std_slice(32, 2);
+    slice2 = prim std_slice(32, 3);
+    slice3 = prim std_slice(32, 3);
+    slice4 = prim std_slice(32, 2);
+    slice5 = prim std_slice(32, 2);
+    slice6 = prim std_slice(32, 3);
+    slice7 = prim std_slice(32, 3);
+    slice8 = prim std_slice(32, 2);
+    slice9 = prim std_slice(32, 2);
+    stride_k_0 = prim std_reg(32);
+    stride_l_0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = j0.out;
+      le1.right = const3.out;
+    }
+    group cond2<"static"=0> {
+      cond2[done] = 1'd1;
+      le2.left = k0.out;
+      le2.right = const5.out;
+    }
+    group cond3<"static"=0> {
+      cond3[done] = 1'd1;
+      le3.left = l0.out;
+      le3.right = const7.out;
+    }
+    group cond4<"static"=0> {
+      cond4[done] = 1'd1;
+      le4.left = m0.out;
+      le4.right = const11.out;
+    }
+    group cond5<"static"=0> {
+      cond5[done] = 1'd1;
+      le5.left = n0.out;
+      le5.right = const13.out;
+    }
+    group cond6<"static"=0> {
+      cond6[done] = 1'd1;
+      gt0.left = data_read0_0.out;
+      gt0.right = max_0.out;
+    }
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
+    }
+    group let1<"static"=1> {
+      j0.in = const2.out;
+      j0.write_en = 1'd1;
+      let1[done] = j0.done;
+    }
+    group let10<"static"=1> {
+      pool_x_0.in = add0.out;
+      pool_x_0.write_en = 1'd1;
+      let10[done] = pool_x_0.done;
+      add0.left = stride_k_0.out;
+      add0.right = m0.out;
+    }
+    group let11<"static"=1> {
+      pool_y_0.in = add1.out;
+      pool_y_0.write_en = 1'd1;
+      let11[done] = pool_y_0.done;
+      add1.left = stride_l_0.out;
+      add1.right = n0.out;
+    }
+    group let2<"static"=1> {
+      k0.in = const4.out;
+      k0.write_en = 1'd1;
+      let2[done] = k0.done;
+    }
+    group let3<"static"=1> {
+      l0.in = const6.out;
+      l0.write_en = 1'd1;
+      let3[done] = l0.done;
+    }
+    group let4<"static"=4> {
+      bin_read0_0.in = mult_pipe0.out;
+      bin_read0_0.write_en = mult_pipe0.done;
+      let4[done] = bin_read0_0.done;
+      mult_pipe0.left = k0.out;
+      mult_pipe0.right = const8.out;
+      mult_pipe0.go = !mult_pipe0.done ? 1'd1;
+    }
+    group let5<"static"=1> {
+      stride_k_0.in = bin_read0_0.out;
+      stride_k_0.write_en = 1'd1;
+      let5[done] = stride_k_0.done;
+    }
+    group let6<"static"=4> {
+      bin_read1_0.in = mult_pipe1.out;
+      bin_read1_0.write_en = mult_pipe1.done;
+      let6[done] = bin_read1_0.done;
+      mult_pipe1.left = l0.out;
+      mult_pipe1.right = const9.out;
+      mult_pipe1.go = !mult_pipe1.done ? 1'd1;
+    }
+    group let7<"static"=1> {
+      stride_l_0.in = bin_read1_0.out;
+      stride_l_0.write_en = 1'd1;
+      let7[done] = stride_l_0.done;
+    }
+    group let8<"static"=1> {
+      m0.in = const10.out;
+      m0.write_en = 1'd1;
+      let8[done] = m0.done;
+    }
+    group let9<"static"=1> {
+      n0.in = const12.out;
+      n0.write_en = 1'd1;
+      let9[done] = n0.done;
+    }
+    group upd0<"static"=1> {
+      max_0.write_en = 1'd1;
+      data0_0_0_0_addr3 = slice3.out;
+      slice3.in = stride_l_0.out;
+      data0_0_0_0_addr2 = slice2.out;
+      slice2.in = stride_k_0.out;
+      data0_0_0_0_addr1 = slice1.out;
+      slice1.in = j0.out;
+      data0_0_0_0_addr0 = slice0.out;
+      slice0.in = i0.out;
+      max_0.in = 1'd1 ? data0_0_0_0_read_data;
+      upd0[done] = max_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      data_read0_0.write_en = 1'd1;
+      data0_0_0_0_addr3 = slice7.out;
+      slice7.in = pool_y_0.out;
+      data0_0_0_0_addr2 = slice6.out;
+      slice6.in = pool_x_0.out;
+      data0_0_0_0_addr1 = slice5.out;
+      slice5.in = j0.out;
+      data0_0_0_0_addr0 = slice4.out;
+      slice4.in = i0.out;
+      data_read0_0.in = 1'd1 ? data0_0_0_0_read_data;
+      upd1[done] = data_read0_0.done ? 1'd1;
+    }
+    group upd10<"static"=1> {
+      i0.write_en = 1'd1;
+      add7.left = i0.out;
+      add7.right = const19.out;
+      i0.in = 1'd1 ? add7.out;
+      upd10[done] = i0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      data_read1_0.write_en = 1'd1;
+      data0_0_0_0_addr3 = slice11.out;
+      slice11.in = pool_y_0.out;
+      data0_0_0_0_addr2 = slice10.out;
+      slice10.in = pool_x_0.out;
+      data0_0_0_0_addr1 = slice9.out;
+      slice9.in = j0.out;
+      data0_0_0_0_addr0 = slice8.out;
+      slice8.in = i0.out;
+      data_read1_0.in = 1'd1 ? data0_0_0_0_read_data;
+      upd2[done] = data_read1_0.done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      max_0.write_en = 1'd1;
+      max_0.in = 1'd1 ? data_read1_0.out;
+      upd3[done] = max_0.done ? 1'd1;
+    }
+    group upd4<"static"=1> {
+      n0.write_en = 1'd1;
+      add2.left = n0.out;
+      add2.right = const14.out;
+      n0.in = 1'd1 ? add2.out;
+      upd4[done] = n0.done ? 1'd1;
+    }
+    group upd5<"static"=1> {
+      m0.write_en = 1'd1;
+      add3.left = m0.out;
+      add3.right = const15.out;
+      m0.in = 1'd1 ? add3.out;
+      upd5[done] = m0.done ? 1'd1;
+    }
+    group upd6<"static"=1> {
+      result0_0_0_0_addr3 = slice15.out;
+      slice15.in = l0.out;
+      result0_0_0_0_addr2 = slice14.out;
+      slice14.in = k0.out;
+      result0_0_0_0_addr1 = slice13.out;
+      slice13.in = j0.out;
+      result0_0_0_0_addr0 = slice12.out;
+      slice12.in = i0.out;
+      result0_0_0_0_write_en = 1'd1;
+      result0_0_0_0_write_data = 1'd1 ? max_0.out;
+      upd6[done] = result0_0_0_0_done ? 1'd1;
+    }
+    group upd7<"static"=1> {
+      l0.write_en = 1'd1;
+      add4.left = l0.out;
+      add4.right = const16.out;
+      l0.in = 1'd1 ? add4.out;
+      upd7[done] = l0.done ? 1'd1;
+    }
+    group upd8<"static"=1> {
+      k0.write_en = 1'd1;
+      add5.left = k0.out;
+      add5.right = const17.out;
+      k0.in = 1'd1 ? add5.out;
+      upd8[done] = k0.done ? 1'd1;
+    }
+    group upd9<"static"=1> {
+      j0.write_en = 1'd1;
+      add6.left = j0.out;
+      add6.right = const18.out;
+      j0.in = 1'd1 ? add6.out;
+      upd9[done] = j0.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      while le0.out with cond0 {
+        seq {
+          let1;
+          while le1.out with cond1 {
+            seq {
+              let2;
+              while le2.out with cond2 {
+                seq {
+                  let3;
+                  while le3.out with cond3 {
+                    seq {
+                      par {
+                        seq {
+                          let4;
+                          let5;
+                        }
+                        seq {
+                          let6;
+                          let7;
+                        }
+                      }
+                      upd0;
+                      par {
+                        seq {
+                          let8;
+                          while le4.out with cond4 {
+                            seq {
+                              let9;
+                              while le5.out with cond5 {
+                                seq {
+                                  par {
+                                    let10;
+                                    let11;
+                                  }
+                                  upd1;
+                                  if gt0.out with cond6 {
+                                    seq {
+                                      upd2;
+                                      upd3;
+                                    }
+                                  }
+                                  upd4;
+                                }
+                              }
+                              upd5;
+                            }
+                          }
+                        }
+                        upd6;
+                      }
+                      upd7;
+                    }
+                  }
+                  upd8;
+                }
+              }
+              upd9;
+            }
+          }
+          upd10;
+        }
+      }
+    }
+  }
+}
+
+component main () -> () {
+  cells {
+    result = prim std_mem_d4(32, 2, 2, 2, 2, 2, 2, 2, 2);
+    data = prim std_mem_d4(32, 2, 2, 4, 4, 2, 2, 3, 3);
+    max_pool2d0 = max_pool2d;
+  }
+  wires {
+    group run_max_pool2d {
+      data.addr0 = max_pool2d0.data0_0_0_0_addr0;
+      max_pool2d0.data0_0_0_0_read_data = data.read_data;
+      data.addr1 = max_pool2d0.data0_0_0_0_addr1;
+      data.addr2 = max_pool2d0.data0_0_0_0_addr2;
+      data.addr3 = max_pool2d0.data0_0_0_0_addr3;
+      result.addr0 = max_pool2d0.result0_0_0_0_addr0;
+      result.addr1 = max_pool2d0.result0_0_0_0_addr1;
+      result.addr2 = max_pool2d0.result0_0_0_0_addr2;
+      result.addr3 = max_pool2d0.result0_0_0_0_addr3;
+      result.write_data = max_pool2d0.result0_0_0_0_write_data;
+      result.write_en = max_pool2d0.result0_0_0_0_write_en;
+      max_pool2d0.result0_0_0_0_done = result.done;
+      max_pool2d0.go = 1'd1;
+      run_max_pool2d[done] = max_pool2d0.done ? 1'd1;
+    }
+  }
+  control {
+    seq {
+      run_max_pool2d;
+    }
+  }
+}
diff --git a/frontends/relay-futil/tests/max_pool2d.relay b/frontends/relay-futil/tests/max_pool2d.relay
new file mode 100644
index 0000000000..e1ba79d351
--- /dev/null
+++ b/frontends/relay-futil/tests/max_pool2d.relay
@@ -0,0 +1,6 @@
+v0.0.4
+fn (%data: Tensor[(2, 2, 4, 4), int32]) {
+  let %result: Tensor[(2, 2, 2, 2), int32] = nn.max_pool2d(%data, pool_size=[2, 2], strides=[2, 2], padding=[0,0,0,0]);
+  %result
+}
+
diff --git a/frontends/relay-futil/tests/mlp_net.expect b/frontends/relay-futil/tests/mlp_net.expect
index 812a0381fd..b780037235 100644
--- a/frontends/relay-futil/tests/mlp_net.expect
+++ b/frontends/relay-futil/tests/mlp_net.expect
@@ -1,5 +1,169 @@
 import "primitives/std.lib";
 
+component softmax(go: 1, clk: 1, x80_0_read_data: 32, x80_0_done: 1, x90_0_read_data: 32, x90_0_done: 1) -> (done: 1, x80_0_addr0: 1, x80_0_addr1: 4, x80_0_write_data: 32, x80_0_write_en: 1, x80_0_clk: 1, x90_0_addr0: 1, x90_0_addr1: 4, x90_0_write_data: 32, x90_0_write_en: 1, x90_0_clk: 1) {
+  cells {
+    add0 = prim fixed_p_std_add(32, 16, 16);
+    add1 = prim std_add(4);
+    add2 = prim std_add(4);
+    add3 = prim std_add(1);
+    bin_read0_0 = prim std_reg(32);
+    const0 = prim std_const(1, 0);
+    const1 = prim std_const(1, 0);
+    const2 = prim std_const(4, 0);
+    const3 = prim std_const(4, 9);
+    const4 = prim std_const(4, 1);
+    const5 = prim std_const(4, 0);
+    const6 = prim std_const(4, 9);
+    const7 = prim std_const(4, 1);
+    const8 = prim std_const(1, 1);
+    div_pipe0 = prim std_div_pipe(32);
+    exp0 = prim std_exp();
+    exp1 = prim std_exp();
+    fpconst0 = prim fixed_p_std_const(32, 16, 16, 0, 0);
+    i0 = prim std_reg(1);
+    j0 = prim std_reg(4);
+    k0 = prim std_reg(4);
+    le0 = prim std_le(1);
+    le1 = prim std_le(4);
+    le2 = prim std_le(4);
+    slice0 = prim std_slice(32, 32);
+    x8_expsum_0 = prim std_reg(32);
+    x8_read0_0 = prim std_reg(32);
+    x8_read1_0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = j0.out;
+      le1.right = const3.out;
+    }
+    group cond2<"static"=0> {
+      cond2[done] = 1'd1;
+      le2.left = k0.out;
+      le2.right = const6.out;
+    }
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
+    }
+    group let1<"static"=1> {
+      x8_expsum_0.in = fpconst0.out;
+      x8_expsum_0.write_en = 1'd1;
+      let1[done] = x8_expsum_0.done;
+    }
+    group let2<"static"=1> {
+      j0.in = const2.out;
+      j0.write_en = 1'd1;
+      let2[done] = j0.done;
+    }
+    group let3<"static"=1> {
+      k0.in = const5.out;
+      k0.write_en = 1'd1;
+      let3[done] = k0.done;
+    }
+    group let4<"static"=1> {
+      bin_read0_0.in = slice0.out;
+      bin_read0_0.write_en = 1'd1;
+      let4[done] = bin_read0_0.done;
+      slice0.in = div_pipe0.out;
+      div_pipe0.left = exp1.out;
+      div_pipe0.right = x8_expsum_0.out;
+      div_pipe0.go = !div_pipe0.done ? 1'd1;
+      exp1.exponent = x8_read1_0.out;
+      exp1.go = !exp1.done ? 1'd1;
+    }
+    group upd0<"static"=1> {
+      x8_read0_0.write_en = 1'd1;
+      x80_0_addr1 = j0.out;
+      x80_0_addr0 = i0.out;
+      x8_read0_0.in = 1'd1 ? x80_0_read_data;
+      upd0[done] = x8_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=2> {
+      x8_expsum_0.write_en = 1'd1;
+      add0.left = x8_expsum_0.out;
+      add0.right = exp0.out;
+      exp0.exponent = x8_read0_0.out;
+      exp0.go = !exp0.done ? 1'd1;
+      x8_expsum_0.in = 1'd1 ? add0.out;
+      upd1[done] = x8_expsum_0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      j0.write_en = 1'd1;
+      add1.left = j0.out;
+      add1.right = const4.out;
+      j0.in = 1'd1 ? add1.out;
+      upd2[done] = j0.done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      x8_read1_0.write_en = 1'd1;
+      x80_0_addr1 = k0.out;
+      x80_0_addr0 = i0.out;
+      x8_read1_0.in = 1'd1 ? x80_0_read_data;
+      upd3[done] = x8_read1_0.done ? 1'd1;
+    }
+    group upd4<"static"=1> {
+      x90_0_addr1 = k0.out;
+      x90_0_addr0 = i0.out;
+      x90_0_write_en = 1'd1;
+      x90_0_write_data = 1'd1 ? bin_read0_0.out;
+      upd4[done] = x90_0_done ? 1'd1;
+    }
+    group upd5<"static"=1> {
+      k0.write_en = 1'd1;
+      add2.left = k0.out;
+      add2.right = const7.out;
+      k0.in = 1'd1 ? add2.out;
+      upd5[done] = k0.done ? 1'd1;
+    }
+    group upd6<"static"=1> {
+      i0.write_en = 1'd1;
+      add3.left = i0.out;
+      add3.right = const8.out;
+      i0.in = 1'd1 ? add3.out;
+      upd6[done] = i0.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      while le0.out with cond0 {
+        seq {
+          par {
+            let1;
+            seq {
+              let2;
+              while le1.out with cond1 {
+                seq {
+                  upd0;
+                  upd1;
+                  upd2;
+                }
+              }
+            }
+          }
+          let3;
+          while le2.out with cond2 {
+            seq {
+              upd3;
+              let4;
+              upd4;
+              upd5;
+            }
+          }
+          upd6;
+        }
+      }
+    }
+  }
+}
 component bias_add2(go: 1, clk: 1, fc3_bias0_read_data: 32, fc3_bias0_done: 1, x70_0_read_data: 32, x70_0_done: 1, x80_0_read_data: 32, x80_0_done: 1) -> (done: 1, fc3_bias0_addr0: 4, fc3_bias0_write_data: 32, fc3_bias0_write_en: 1, fc3_bias0_clk: 1, x70_0_addr0: 1, x70_0_addr1: 4, x70_0_write_data: 32, x70_0_write_en: 1, x70_0_clk: 1, x80_0_addr0: 1, x80_0_addr1: 4, x80_0_write_data: 32, x80_0_write_en: 1, x80_0_clk: 1) {
   cells {
     add0 = prim fixed_p_std_add(32, 16, 16);
@@ -1623,7 +1787,9 @@ component batch_flatten(go: 1, clk: 1, data0_0_0_0_read_data: 32, data0_0_0_0_do
 
 component main () -> () {
   cells {
+    x9 = prim std_mem_d2(32, 1, 10, 1, 4);
     x8 = prim std_mem_d2(32, 1, 10, 1, 4);
+    softmax0 = softmax;
     x7 = prim std_mem_d2(32, 1, 10, 1, 4);
     fc3_bias = prim std_mem_d1(32, 10, 4);
     bias_add2 = bias_add2;
@@ -1774,6 +1940,18 @@ component main () -> () {
       bias_add2.go = 1'd1;
       run_bias_add2[done] = bias_add2.done ? 1'd1;
     }
+    group run_softmax {
+      x8.addr0 = softmax0.x80_0_addr0;
+      softmax0.x80_0_read_data = x8.read_data;
+      x8.addr1 = softmax0.x80_0_addr1;
+      x9.addr0 = softmax0.x90_0_addr0;
+      x9.addr1 = softmax0.x90_0_addr1;
+      x9.write_data = softmax0.x90_0_write_data;
+      x9.write_en = softmax0.x90_0_write_en;
+      softmax0.x90_0_done = x9.done;
+      softmax0.go = 1'd1;
+      run_softmax[done] = softmax0.done ? 1'd1;
+    }
   }
   control {
     seq {
@@ -1786,6 +1964,7 @@ component main () -> () {
       run_relu1;
       run_dense2;
       run_bias_add2;
+      run_softmax;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/mlp_net.relay b/frontends/relay-futil/tests/mlp_net.relay
index 8943360100..4368b51016 100644
--- a/frontends/relay-futil/tests/mlp_net.relay
+++ b/frontends/relay-futil/tests/mlp_net.relay
@@ -11,7 +11,6 @@ fn (%data: Tensor[(1, 1, 28, 28), float32], %fc1_weight: Tensor[(128, 784), floa
   let %x6: Tensor[(1, 64), float32] = nn.relu(%x5) /* ty=Tensor[(1, 64), float32] */;
   let %x7: Tensor[(1, 10), float32] = nn.dense(%x6, %fc3_weight, units=10) /* ty=Tensor[(1, 10), float32] */;
   let %x8: Tensor[(1, 10), float32] = nn.bias_add(%x7, %fc3_bias, axis=-1) /* ty=Tensor[(1, 10), float32] */;
-  %x8
-  // let %x9: Tensor[(1, 10), float32] = nn.softmax(%x8) /* ty=Tensor[(1, 10), float32] */;
-  // %x9
+  let %x9: Tensor[(1, 10), float32] = nn.softmax(%x8) /* ty=Tensor[(1, 10), float32] */;
+  %x9
 }
diff --git a/frontends/relay-futil/tests/softmax.expect b/frontends/relay-futil/tests/softmax.expect
new file mode 100644
index 0000000000..8fc76f5b1c
--- /dev/null
+++ b/frontends/relay-futil/tests/softmax.expect
@@ -0,0 +1,193 @@
+import "primitives/std.lib";
+
+component softmax(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1) -> (done: 1, x0_0_addr0: 1, x0_0_addr1: 4, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 4, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1) {
+  cells {
+    add0 = prim fixed_p_std_add(32, 16, 16);
+    add1 = prim std_add(4);
+    add2 = prim std_add(4);
+    add3 = prim std_add(1);
+    bin_read0_0 = prim std_reg(32);
+    const0 = prim std_const(1, 0);
+    const1 = prim std_const(1, 0);
+    const2 = prim std_const(4, 0);
+    const3 = prim std_const(4, 9);
+    const4 = prim std_const(4, 1);
+    const5 = prim std_const(4, 0);
+    const6 = prim std_const(4, 9);
+    const7 = prim std_const(4, 1);
+    const8 = prim std_const(1, 1);
+    div_pipe0 = prim std_div_pipe(32);
+    exp0 = prim std_exp();
+    exp1 = prim std_exp();
+    fpconst0 = prim fixed_p_std_const(32, 16, 16, 0, 0);
+    i0 = prim std_reg(1);
+    j0 = prim std_reg(4);
+    k0 = prim std_reg(4);
+    le0 = prim std_le(1);
+    le1 = prim std_le(4);
+    le2 = prim std_le(4);
+    slice0 = prim std_slice(32, 32);
+    x_expsum_0 = prim std_reg(32);
+    x_read0_0 = prim std_reg(32);
+    x_read1_0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = j0.out;
+      le1.right = const3.out;
+    }
+    group cond2<"static"=0> {
+      cond2[done] = 1'd1;
+      le2.left = k0.out;
+      le2.right = const6.out;
+    }
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
+    }
+    group let1<"static"=1> {
+      x_expsum_0.in = fpconst0.out;
+      x_expsum_0.write_en = 1'd1;
+      let1[done] = x_expsum_0.done;
+    }
+    group let2<"static"=1> {
+      j0.in = const2.out;
+      j0.write_en = 1'd1;
+      let2[done] = j0.done;
+    }
+    group let3<"static"=1> {
+      k0.in = const5.out;
+      k0.write_en = 1'd1;
+      let3[done] = k0.done;
+    }
+    group let4<"static"=1> {
+      bin_read0_0.in = slice0.out;
+      bin_read0_0.write_en = 1'd1;
+      let4[done] = bin_read0_0.done;
+      slice0.in = div_pipe0.out;
+      div_pipe0.left = exp1.out;
+      div_pipe0.right = x_expsum_0.out;
+      div_pipe0.go = !div_pipe0.done ? 1'd1;
+      exp1.exponent = x_read1_0.out;
+      exp1.go = !exp1.done ? 1'd1;
+    }
+    group upd0<"static"=1> {
+      x_read0_0.write_en = 1'd1;
+      x0_0_addr1 = j0.out;
+      x0_0_addr0 = i0.out;
+      x_read0_0.in = 1'd1 ? x0_0_read_data;
+      upd0[done] = x_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=2> {
+      x_expsum_0.write_en = 1'd1;
+      add0.left = x_expsum_0.out;
+      add0.right = exp0.out;
+      exp0.exponent = x_read0_0.out;
+      exp0.go = !exp0.done ? 1'd1;
+      x_expsum_0.in = 1'd1 ? add0.out;
+      upd1[done] = x_expsum_0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      j0.write_en = 1'd1;
+      add1.left = j0.out;
+      add1.right = const4.out;
+      j0.in = 1'd1 ? add1.out;
+      upd2[done] = j0.done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      x_read1_0.write_en = 1'd1;
+      x0_0_addr1 = k0.out;
+      x0_0_addr0 = i0.out;
+      x_read1_0.in = 1'd1 ? x0_0_read_data;
+      upd3[done] = x_read1_0.done ? 1'd1;
+    }
+    group upd4<"static"=1> {
+      x10_0_addr1 = k0.out;
+      x10_0_addr0 = i0.out;
+      x10_0_write_en = 1'd1;
+      x10_0_write_data = 1'd1 ? bin_read0_0.out;
+      upd4[done] = x10_0_done ? 1'd1;
+    }
+    group upd5<"static"=1> {
+      k0.write_en = 1'd1;
+      add2.left = k0.out;
+      add2.right = const7.out;
+      k0.in = 1'd1 ? add2.out;
+      upd5[done] = k0.done ? 1'd1;
+    }
+    group upd6<"static"=1> {
+      i0.write_en = 1'd1;
+      add3.left = i0.out;
+      add3.right = const8.out;
+      i0.in = 1'd1 ? add3.out;
+      upd6[done] = i0.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      while le0.out with cond0 {
+        seq {
+          par {
+            let1;
+            seq {
+              let2;
+              while le1.out with cond1 {
+                seq {
+                  upd0;
+                  upd1;
+                  upd2;
+                }
+              }
+            }
+          }
+          let3;
+          while le2.out with cond2 {
+            seq {
+              upd3;
+              let4;
+              upd4;
+              upd5;
+            }
+          }
+          upd6;
+        }
+      }
+    }
+  }
+}
+
+component main () -> () {
+  cells {
+    x1 = prim std_mem_d2(32, 1, 10, 1, 4);
+    x = prim std_mem_d2(32, 1, 10, 1, 4);
+    softmax0 = softmax;
+  }
+  wires {
+    group run_softmax {
+      x.addr0 = softmax0.x0_0_addr0;
+      softmax0.x0_0_read_data = x.read_data;
+      x.addr1 = softmax0.x0_0_addr1;
+      x1.addr0 = softmax0.x10_0_addr0;
+      x1.addr1 = softmax0.x10_0_addr1;
+      x1.write_data = softmax0.x10_0_write_data;
+      x1.write_en = softmax0.x10_0_write_en;
+      softmax0.x10_0_done = x1.done;
+      softmax0.go = 1'd1;
+      run_softmax[done] = softmax0.done ? 1'd1;
+    }
+  }
+  control {
+    seq {
+      run_softmax;
+    }
+  }
+}
diff --git a/frontends/relay-futil/tests/softmax.relay b/frontends/relay-futil/tests/softmax.relay
new file mode 100644
index 0000000000..df46a20d70
--- /dev/null
+++ b/frontends/relay-futil/tests/softmax.relay
@@ -0,0 +1,6 @@
+v0.0.4
+fn (%x: Tensor[(1, 10), float32]) {
+  let %x1: Tensor[(1, 10), float32] = nn.softmax(%x);
+  %x1
+}
+
diff --git a/frontends/relay-futil/utilities.py b/frontends/relay-futil/utilities.py
index aab5fd2a72..fc171f60e9 100644
--- a/frontends/relay-futil/utilities.py
+++ b/frontends/relay-futil/utilities.py
@@ -96,13 +96,18 @@ def build_main_controls(c: FComponent):
             wires.append(FWire(f'{prim.name}.addr1', f'{declaration.decl_name}.{input.dahlia_name}_addr1'))
             if prim.type == PrimitiveType.Memory2D: continue
             wires.append(FWire(f'{prim.name}.addr2', f'{declaration.decl_name}.{input.dahlia_name}_addr2'))
+            if prim.type == PrimitiveType.Memory3D: continue
+            wires.append(FWire(f'{prim.name}.addr3', f'{declaration.decl_name}.{input.dahlia_name}_addr3'))
 
         output = declaration.output
+        type = output.primitive.type
         wires.append(FWire(f'{output.primitive.name}.addr0', f'{declaration.decl_name}.{output.dahlia_name}_addr0'))
-        if output.primitive.type == PrimitiveType.Memory2D or output.primitive.type == PrimitiveType.Memory3D:
+        if type == PrimitiveType.Memory2D or type == PrimitiveType.Memory3D or type == PrimitiveType.Memory4D:
             wires.append(FWire(f'{output.primitive.name}.addr1', f'{declaration.decl_name}.{output.dahlia_name}_addr1'))
-        if output.primitive.type == PrimitiveType.Memory3D:
+        if type == PrimitiveType.Memory3D or type == PrimitiveType.Memory4D:
             wires.append(FWire(f'{output.primitive.name}.addr2', f'{declaration.decl_name}.{output.dahlia_name}_addr2'))
+        if type == PrimitiveType.Memory4D:
+            wires.append(FWire(f'{output.primitive.name}.addr3', f'{declaration.decl_name}.{output.dahlia_name}_addr3'))
 
         wires.append(
             FWire(f'{output.primitive.name}.write_data', f'{declaration.decl_name}.{output.dahlia_name}_write_data'))

From 4935912db0aa25f17cb8b6b78a20032ee965d46f Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Wed, 2 Dec 2020 13:40:22 -0500
Subject: [PATCH 59/75] Add stdlib

---
 primitives/std.lib | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/primitives/std.lib b/primitives/std.lib
index ad63d71ff8..61e65d7e66 100644
--- a/primitives/std.lib
+++ b/primitives/std.lib
@@ -63,7 +63,7 @@ primitive std_mem_d2[width, d0_size, d1_size, d0_idx_size, d1_idx_size](
         parameter d1_idx_size = 4)
        (input logic [d0_idx_size-1:0] addr0,
         input logic [d1_idx_size-1:0] addr1,
-        input logic [width-1:0]   write_data,
+        input logic [width-1:0]   write_data /*verilator public*/,
         input logic               write_en,
         input logic               clk,
         output logic [width-1:0]  read_data,
@@ -697,6 +697,28 @@ primitive std_le<"share"=1>[width](left: width, right: width) -> (out: 1) {
   }
 }
 
+primitive std_exp(exponent: 32, go: 1, clk: 1) -> (out: 32, done: 1) {
+   verilog {
+     module std_exp  // computes 2.718281 ** exponent; out and done are registered together
+       (input  logic [31:0]  exponent,
+        input  logic        go,
+        input  logic        clk,
+        output logic [31:0] out,
+        output logic        done);
+        always_ff @(posedge clk) begin  // clocked state: use non-blocking assignments only
+          if (go) begin
+            /* verilator lint_off REALCVT */
+            out <= 2.718281 ** exponent;  // real-valued power is converted to the 32-bit output
+            done <= 1;
+          end else begin
+            out <= 0;  // go is low: clear both the result and the done flag
+            done <= 0;
+          end
+        end
+     endmodule
+  }
+}
+
 primitive std_sqrt(in: 32, go: 1, clk: 1) -> (out: 32, done: 1) {
   verilog {
     module std_sqrt
@@ -810,11 +832,12 @@ primitive fixed_p_std_const[width, int_width, fract_width, value1, value2] () ->
         module fixed_p_std_const
             #(parameter width=32,
             parameter int_width = 8,
-            parameter fract_width= 24,
+            parameter fract_width = 24,
             parameter value1 = 0,
             parameter value2 = 0)
 
         (output logic [width-1:0] out);
+        /* verilator lint_off WIDTHCONCAT */
         assign out = {value1, value2};
         endmodule
     }
@@ -966,4 +989,4 @@ primitive fixed_p_std_add_dbit[width, int_width1, fract_width1, int_width2, frac
         assign out = {whole_int, whole_fract};
       endmodule
     }
-}
+}
\ No newline at end of file

From 1bcfefe4b7997bb53b5eeb3a4ec66bbdc762c4c0 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Wed, 2 Dec 2020 17:08:44 -0500
Subject: [PATCH 60/75] Softmax for integer type fixed.

---
 frontends/relay-futil/dahlia_functions.py  |  4 +--
 frontends/relay-futil/tests/softmax.expect | 41 +++++++++++++++-------
 2 files changed, 31 insertions(+), 14 deletions(-)

diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index e266904fb7..56ed424b51 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -393,8 +393,8 @@ def max_pool2d(declaration):
               for (let n: ubit<32> = 0..{pool_size[1]}/*pool_size[1]*/) {{
                 let pool_x: ubit<32> = stride_x + m;
                 let pool_y: ubit<32> = stride_y + n;
-                let current: {data_type}<{bitwidth}> = {data.name}[i][j][pool_x][pool_y]; 
-                if (current > max) {{ max := current; }} else {{ max := max; }}
+                let current: {data_type}<{bitwidth}> = {data.name}[i][j][pool_x][pool_y];
+                if (current > max) {{ max := current; }} else {{ max := max; }} 
               }}
             }}
             {res.name}[i][j][k][l] := max;
diff --git a/frontends/relay-futil/tests/softmax.expect b/frontends/relay-futil/tests/softmax.expect
index 8fc76f5b1c..92246625a7 100644
--- a/frontends/relay-futil/tests/softmax.expect
+++ b/frontends/relay-futil/tests/softmax.expect
@@ -27,6 +27,7 @@ component softmax(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_da
     le1 = prim std_le(4);
     le2 = prim std_le(4);
     slice0 = prim std_slice(32, 32);
+    x1_read0_0 = prim std_reg(32);
     x_expsum_0 = prim std_reg(32);
     x_read0_0 = prim std_reg(32);
     x_read1_0 = prim std_reg(32);
@@ -72,11 +73,9 @@ component softmax(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_da
       bin_read0_0.write_en = 1'd1;
       let4[done] = bin_read0_0.done;
       slice0.in = div_pipe0.out;
-      div_pipe0.left = exp1.out;
+      div_pipe0.left = x1_read0_0.out;
       div_pipe0.right = x_expsum_0.out;
       div_pipe0.go = !div_pipe0.done ? 1'd1;
-      exp1.exponent = x_read1_0.out;
-      exp1.go = !exp1.done ? 1'd1;
     }
     group upd0<"static"=1> {
       x_read0_0.write_en = 1'd1;
@@ -85,7 +84,7 @@ component softmax(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_da
       x_read0_0.in = 1'd1 ? x0_0_read_data;
       upd0[done] = x_read0_0.done ? 1'd1;
     }
-    group upd1<"static"=2> {
+    group upd1 {
       x_expsum_0.write_en = 1'd1;
       add0.left = x_expsum_0.out;
       add0.right = exp0.out;
@@ -108,26 +107,42 @@ component softmax(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_da
       x_read1_0.in = 1'd1 ? x0_0_read_data;
       upd3[done] = x_read1_0.done ? 1'd1;
     }
-    group upd4<"static"=1> {
+    group upd4 {
       x10_0_addr1 = k0.out;
       x10_0_addr0 = i0.out;
-      x10_0_write_en = 1'd1;
-      x10_0_write_data = 1'd1 ? bin_read0_0.out;
+      x10_0_write_en = exp1.done;
+      exp1.exponent = x_read1_0.out;
+      exp1.go = !exp1.done ? 1'd1;
+      x10_0_write_data = exp1.done ? exp1.out;
       upd4[done] = x10_0_done ? 1'd1;
     }
     group upd5<"static"=1> {
+      x1_read0_0.write_en = 1'd1;
+      x10_0_addr1 = k0.out;
+      x10_0_addr0 = i0.out;
+      x1_read0_0.in = 1'd1 ? x10_0_read_data;
+      upd5[done] = x1_read0_0.done ? 1'd1;
+    }
+    group upd6<"static"=1> {
+      x10_0_addr1 = k0.out;
+      x10_0_addr0 = i0.out;
+      x10_0_write_en = 1'd1;
+      x10_0_write_data = 1'd1 ? bin_read0_0.out;
+      upd6[done] = x10_0_done ? 1'd1;
+    }
+    group upd7<"static"=1> {
       k0.write_en = 1'd1;
       add2.left = k0.out;
       add2.right = const7.out;
       k0.in = 1'd1 ? add2.out;
-      upd5[done] = k0.done ? 1'd1;
+      upd7[done] = k0.done ? 1'd1;
     }
-    group upd6<"static"=1> {
+    group upd8<"static"=1> {
       i0.write_en = 1'd1;
       add3.left = i0.out;
       add3.right = const8.out;
       i0.in = 1'd1 ? add3.out;
-      upd6[done] = i0.done ? 1'd1;
+      upd8[done] = i0.done ? 1'd1;
     }
   }
 
@@ -153,12 +168,14 @@ component softmax(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_da
           while le2.out with cond2 {
             seq {
               upd3;
-              let4;
               upd4;
               upd5;
+              let4;
+              upd6;
+              upd7;
             }
           }
-          upd6;
+          upd8;
         }
       }
     }

From 78d7168afbdfd1b5901f5b9490e7ce6a3d387f54 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Wed, 2 Dec 2020 17:09:48 -0500
Subject: [PATCH 61/75] Remove unnecessary else.

---
 frontends/relay-futil/dahlia_functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 56ed424b51..cfb482d3f5 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -394,7 +394,7 @@ def max_pool2d(declaration):
                 let pool_x: ubit<32> = stride_x + m;
                 let pool_y: ubit<32> = stride_y + n;
                 let current: {data_type}<{bitwidth}> = {data.name}[i][j][pool_x][pool_y];
-                if (current > max) {{ max := current; }} else {{ max := max; }} 
+                if (current > max) {{ max := current; }} 
               }}
             }}
             {res.name}[i][j][k][l] := max;

From 75c149948e6d44ab70068b10b6680ce20efe1c44 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Wed, 2 Dec 2020 20:18:06 -0500
Subject: [PATCH 62/75] Add working max_pool2d.

---
 frontends/relay-futil/dahlia_functions.py     |  22 +--
 .../relay-futil/tests/data/max_pool2d.expect  | 140 +++++++++---------
 .../tests/data/max_pool2d.relay.data          |  43 ++++--
 3 files changed, 112 insertions(+), 93 deletions(-)

diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index cfb482d3f5..aab69765e7 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -381,23 +381,23 @@ def max_pool2d(declaration):
 
     declarations = pp_dahlia_memory_declarations([res, data])
     program_body = f"""
-    for (let i: ubit<32> = 0..{size0}) {{
-      for (let j: ubit<32> = 0..{size1}) {{
-        for (let k: ubit<32> = 0..{size2}) {{
-          for (let l: ubit<32> = 0..{size3}) {{
-            let stride_x: ubit<32> = k * {strides[0]}/*stride[0]*/;
-            let stride_y: ubit<32> = l * {strides[1]}/*stride[1]*/;
+    for (let b: ubit<32> = 0..{size0}) {{
+      for (let c: ubit<32> = 0..{size1}) {{
+        for (let y: ubit<32> = 0..{size2}) {{
+          for (let x: ubit<32> = 0..{size3}) {{
+            let stride_y: ubit<32> = y * {strides[1]}/*strides[1]*/;
+            let stride_x: ubit<32> = x * {strides[0]}/*strides[0]*/;
             
-            let max: {data_type}<{bitwidth}> = {data.name}[i][j][stride_x][stride_y];
+            let max: {data_type}<{bitwidth}> = {data.name}[b][c][stride_y][stride_x];
             for (let m: ubit<32> = 0..{pool_size[0]}/*pool_size[0]*/) {{
               for (let n: ubit<32> = 0..{pool_size[1]}/*pool_size[1]*/) {{
-                let pool_x: ubit<32> = stride_x + m;
-                let pool_y: ubit<32> = stride_y + n;
-                let current: {data_type}<{bitwidth}> = {data.name}[i][j][pool_x][pool_y];
+                let pool_y: ubit<32> = stride_y + m;
+                let pool_x: ubit<32> = stride_x + n;
+                let current: {data_type}<{bitwidth}> = {data.name}[b][c][pool_y][pool_x];
                 if (current > max) {{ max := current; }} 
               }}
             }}
-            {res.name}[i][j][k][l] := max;
+            {res.name}[b][c][y][x] := max;
           }} 
         }} 
       }} 
diff --git a/frontends/relay-futil/tests/data/max_pool2d.expect b/frontends/relay-futil/tests/data/max_pool2d.expect
index a1c95c7c88..2e4f5739ae 100644
--- a/frontends/relay-futil/tests/data/max_pool2d.expect
+++ b/frontends/relay-futil/tests/data/max_pool2d.expect
@@ -3,112 +3,108 @@
     [
       [
         [
-          12,
-          30,
-          34,
-          37
+          10,
+          20,
+          100,
+          101
         ],
         [
-          1,
-          2,
-          3,
-          4
+          30,
+          40,
+          102,
+          103
         ],
         [
-          1,
-          2,
-          3,
-          4
+          20,
+          30,
+          100,
+          103
         ],
         [
-          1,
-          2,
-          3,
-          4
+          10,
+          40,
+          103,
+          100
         ]
-      ]
-    ],
-    [
+      ],
       [
         [
-          8,
-          2,
-          112,
-          4
+          20,
+          0,
+          70,
+          25
         ],
         [
           1,
           2,
-          3,
+          13,
           4
         ],
         [
           1,
           2,
-          3,
-          4
+          5,
+          6
         ],
         [
-          1,
-          2,
           3,
-          4
+          4,
+          7,
+          8
         ]
       ]
     ],
     [
       [
         [
-          20,
-          0,
-          70,
-          25
+          11,
+          21,
+          109,
+          10
         ],
         [
-          1,
-          2,
-          3,
-          4
+          31,
+          41,
+          0,
+          14
         ],
         [
+          19,
+          42,
           1,
-          2,
-          3,
-          4
+          103
         ],
         [
           1,
-          2,
-          3,
-          4
+          18,
+          10,
+          101
         ]
-      ]
-    ],
-    [
+      ],
       [
-        [
-          12,
-          0,
-          100,
-          12
-        ],
         [
           1,
           2,
+          4,
+          3
+        ],
+        [
           3,
-          4
+          4,
+          2,
+          1
         ],
         [
-          1,
+          4,
+          2,
           2,
-          3,
           4
         ],
         [
           1,
-          2,
           3,
-          4
+          3,
+          1
         ]
       ]
     ]
@@ -117,43 +113,43 @@
     [
       [
         [
-          30,
-          37
+          40,
+          103
         ],
         [
-          2,
-          4
+          40,
+          103
         ]
       ],
       [
         [
-          8,
-          112
+          20,
+          70
         ],
         [
-          2,
-          4
+          4,
+          8
         ]
       ]
     ],
     [
       [
         [
-          20,
-          70
+          41,
+          109
         ],
         [
-          2,
-          4
+          42,
+          103
         ]
       ],
       [
         [
-          12,
-          100
+          4,
+          4
         ],
         [
-          2,
+          4,
           4
         ]
       ]
diff --git a/frontends/relay-futil/tests/data/max_pool2d.relay.data b/frontends/relay-futil/tests/data/max_pool2d.relay.data
index 9e19e8a053..517b34c9cc 100644
--- a/frontends/relay-futil/tests/data/max_pool2d.relay.data
+++ b/frontends/relay-futil/tests/data/max_pool2d.relay.data
@@ -1,20 +1,43 @@
 {
   "data": {
     "data": [
-            [[[12,30,34,37], [1,2,3,4], [1,2,3,4], [1,2,3,4]]], [[[8,2,112,4], [1,2,3,4], [1,2,3,4], [1,2,3,4]]],
-            [[[20,0,70,25], [1,2,3,4], [1,2,3,4], [1,2,3,4]]], [[[12,0,100,12], [1,2,3,4], [1,2,3,4], [1,2,3,4]]]
+             [
+              [
+               [10,20,  100,101],
+               [30,40,  102,103],
+
+               [20,30,  100,103],
+               [10,40,  103,100]
+              ],
+              [
+               [20,0,     70,25],
+               [1, 2,      13,4],
+
+               [1,2,        5,6],
+               [3,4,        7,8]
+              ]
+             ],
+             [
+              [
+               [11,21,   109,10],
+               [31,41,     0,14],
+
+               [19,42,    1,103],
+               [1,18,    10,101]
+              ],
+              [
+               [1,2,        4,3],
+               [3,4,        2,1],
+
+               [4,2,        2,4],
+               [1,3,        3,1]
+              ]
+             ]
             ],
     "bitwidth": 32
   },
-  "max": {
-      "data": [0],
-      "bitwidth": 32
-  },
   "result": {
-    "data": [
-             [[[0,0], [0,0]], [[0,0], [0,0]]],
-             [[[0,0], [0,0]], [[0,0], [0,0]]]
-            ],
+    "data": [ [[[0,0], [0,0]], [[0,0], [0,0]]], [[[0,0], [0,0]], [[0,0], [0,0]]] ],
     "bitwidth": 32
   }
 }
\ No newline at end of file

From d63fd81487946f03630867cf12170f830f3103e2 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Wed, 2 Dec 2020 20:29:54 -0500
Subject: [PATCH 63/75] Add expected output.

---
 .../tests/data/tensor4d_multiply.expect       | 344 ++++++++++++++++++
 1 file changed, 344 insertions(+)

diff --git a/frontends/relay-futil/tests/data/tensor4d_multiply.expect b/frontends/relay-futil/tests/data/tensor4d_multiply.expect
index e69de29bb2..bd548739d2 100644
--- a/frontends/relay-futil/tests/data/tensor4d_multiply.expect
+++ b/frontends/relay-futil/tests/data/tensor4d_multiply.expect
@@ -0,0 +1,344 @@
+{
+  "x": [
+    [
+      [
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ]
+      ]
+    ]
+  ],
+  "x1": [
+    [
+      [
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ]
+      ]
+    ]
+  ],
+  "x2": [
+    [
+      [
+        [
+          1,
+          4,
+          9,
+          16
+        ],
+        [
+          1,
+          4,
+          9,
+          16
+        ],
+        [
+          1,
+          4,
+          9,
+          16
+        ],
+        [
+          1,
+          4,
+          9,
+          16
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          1,
+          4,
+          9,
+          16
+        ],
+        [
+          1,
+          4,
+          9,
+          16
+        ],
+        [
+          1,
+          4,
+          9,
+          16
+        ],
+        [
+          1,
+          4,
+          9,
+          16
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          1,
+          4,
+          9,
+          16
+        ],
+        [
+          1,
+          4,
+          9,
+          16
+        ],
+        [
+          1,
+          4,
+          9,
+          16
+        ],
+        [
+          1,
+          4,
+          9,
+          16
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          1,
+          4,
+          9,
+          16
+        ],
+        [
+          1,
+          4,
+          9,
+          16
+        ],
+        [
+          1,
+          4,
+          9,
+          16
+        ],
+        [
+          1,
+          4,
+          9,
+          16
+        ]
+      ]
+    ]
+  ]
+}

From 82363ad00d08abeb5f22b559e550b6e6ba8a1493 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Wed, 2 Dec 2020 20:35:52 -0500
Subject: [PATCH 64/75] Remove ellipsis

---
 frontends/relay-futil/compiler.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index 5212354512..b432882e04 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -59,9 +59,9 @@ def relay_id(self, name):
     def dahlia_name(self, name, type):
         """
         Dahlia uses the following naming scheme for arbitrary variables `X`, `Y`:
-        Memory1D: `X0`, `Y0`, ...
-        Memory2D: `X0_0`, `Y0_0`, ...
-        Memory3D: `X0_0_0`, `Y0_0_0`, ...
+        Memory1D: `X0`, `Y0`
+        Memory2D: `X0_0`, `Y0_0`
+        Memory3D: `X0_0_0`, `Y0_0_0`
         """
         assert type in DahliaNameExtension, f'{name} with {type} is not supported yet.'
         return name + DahliaNameExtension[type]

From 83a0cc5357c55132b92d06dc2d4aae194d1561b7 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Thu, 3 Dec 2020 18:25:32 -0500
Subject: [PATCH 65/75] Cleanup, add externalize registry to fud.

---
 frontends/relay-futil/compiler.py             |   4 +-
 frontends/relay-futil/dahlia_functions.py     |   2 +-
 frontends/relay-futil/example.py              |   7 +-
 frontends/relay-futil/tests/bias_add.expect   |   4 +
 frontends/relay-futil/tests/let2.expect       |   1 +
 frontends/relay-futil/tests/let3.expect       |   4 +-
 frontends/relay-futil/tests/max_pool2d.expect | 243 ++++++++----------
 frontends/relay-futil/tests/mlp_net.expect    |  51 +++-
 frontends/relay-futil/tests/relu.expect       |   4 +
 frontends/relay-futil/tests/sqrt.expect       |   4 +
 .../relay-futil/tests/tensor3d_divide.expect  |   2 +-
 fud/fud/main.py                               |   4 +
 12 files changed, 176 insertions(+), 154 deletions(-)

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index b432882e04..a6cd71c690 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -147,10 +147,10 @@ def lower_to_futil(program) -> str:
     program = relay_transforms(program)
     visitor = Relay2Futil()
 
-    PREAMBLE = """import "primitives/std.lib";"""
+    PREAMBLE = """import "primitives/std.lib";\n"""
     MAIN = visitor.visit(program)
     DAHLIA_COMPONENTS = '\n'.join(visitor.dahlia_components)
-    NEWL = '\n\n'
+    NEWL = '\n'
     return f'{PREAMBLE}{NEWL}{DAHLIA_COMPONENTS}{NEWL}{MAIN}'
 
 
diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index aab69765e7..14dc98ddd4 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -42,7 +42,7 @@ def lower_dahlia_program(prog, component_name):
         fuse_binary = os.environ['DAHLIA_EXEC'] if 'DAHLIA_EXEC' in os.environ else 'fuse'
         command = f"""
                 {fuse_binary} {tf0.name} --lower -b=futil -n={component_name} > {tf1.name} {NO_ERR} \
-                 && cargo run -- {tf1.name} -l ../../ -p externalize > {tf2.name} {NO_ERR}"""
+                 && fud e --from futil {tf1.name} --to futil-externalize > {tf2.name} {NO_ERR}"""
         subprocess.Popen(command, stdout=subprocess.PIPE, shell=True).communicate()
         component = tf2.read().decode()[len(IMPORT_STATEMENT):]  # Skip over importing the primitives library.
         return component
diff --git a/frontends/relay-futil/example.py b/frontends/relay-futil/example.py
index 69db563726..ca25a9bebd 100644
--- a/frontends/relay-futil/example.py
+++ b/frontends/relay-futil/example.py
@@ -56,7 +56,8 @@ def softmax():
 
 def max_pool2d():
     data = relay.var('data', shape=[2, 2, 4, 4], dtype='int32')
-    return relay.Function([data], relay.nn.max_pool2d(data, padding=[0,0,0,0], strides=[2,2], pool_size=[2,2]))
+    return relay.Function([data], relay.nn.max_pool2d(data, padding=[0, 0, 0, 0], strides=[2, 2], pool_size=[2, 2]))
+
 
 def mlp_net():
     """The MLP test from Relay."""
@@ -71,8 +72,8 @@ def vgg_net():
                        batch_norm=True)
 
 
-ALL_FUNCS = [add, tensor_subtract, expand_dims, batch_flatten, batch_matmul, bias_add, relu, dense, softmax, mlp_net,
-             vgg_net, max_pool2d]
+ALL_FUNCS = [add, tensor_subtract, expand_dims, batch_flatten, batch_matmul,
+             bias_add, relu, dense, softmax, mlp_net, vgg_net, max_pool2d]
 FUNC_NAMES = list(map(lambda x: x.__name__, ALL_FUNCS))
 
 
diff --git a/frontends/relay-futil/tests/bias_add.expect b/frontends/relay-futil/tests/bias_add.expect
index 620da35d44..c181b95bbf 100644
--- a/frontends/relay-futil/tests/bias_add.expect
+++ b/frontends/relay-futil/tests/bias_add.expect
@@ -175,9 +175,13 @@ component main () -> () {
       bias_add0.x0_0_0_0_read_data = x.read_data;
       x.addr1 = bias_add0.x0_0_0_0_addr1;
       x.addr2 = bias_add0.x0_0_0_0_addr2;
+      x.addr3 = bias_add0.x0_0_0_0_addr3;
       bias.addr0 = bias_add0.bias0_addr0;
       bias_add0.bias0_read_data = bias.read_data;
       x1.addr0 = bias_add0.x10_0_0_0_addr0;
+      x1.addr1 = bias_add0.x10_0_0_0_addr1;
+      x1.addr2 = bias_add0.x10_0_0_0_addr2;
+      x1.addr3 = bias_add0.x10_0_0_0_addr3;
       x1.write_data = bias_add0.x10_0_0_0_write_data;
       x1.write_en = bias_add0.x10_0_0_0_write_en;
       bias_add0.x10_0_0_0_done = x1.done;
diff --git a/frontends/relay-futil/tests/let2.expect b/frontends/relay-futil/tests/let2.expect
index b9a9bfd9ec..c4b8afc3cb 100644
--- a/frontends/relay-futil/tests/let2.expect
+++ b/frontends/relay-futil/tests/let2.expect
@@ -68,6 +68,7 @@ component add(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32, c0_
     }
   }
 }
+
 component multiply(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, b0_read_data: 32, b0_done: 1, c0_read_data: 32, c0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, b0_addr0: 1, b0_write_data: 32, b0_write_en: 1, b0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1) {
   cells {
     a_read0_0 = prim std_reg(32);
diff --git a/frontends/relay-futil/tests/let3.expect b/frontends/relay-futil/tests/let3.expect
index 11b79b4180..cbb0783fa8 100644
--- a/frontends/relay-futil/tests/let3.expect
+++ b/frontends/relay-futil/tests/let3.expect
@@ -76,6 +76,7 @@ component multiply(go: 1, clk: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32
     }
   }
 }
+
 component divide(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32, d0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1, d0_addr0: 1, d0_write_data: 32, d0_write_en: 1, d0_clk: 1) {
   cells {
     a_read0_0 = prim std_reg(32);
@@ -100,7 +101,7 @@ component divide(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32,
       i0.write_en = 1'd1;
       let0[done] = i0.done;
     }
-    group let1<> {
+    group let1 {
       bin_read0_0.in = div_pipe0.out;
       bin_read0_0.write_en = div_pipe0.done;
       let1[done] = bin_read0_0.done;
@@ -152,6 +153,7 @@ component divide(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32,
     }
   }
 }
+
 component subtract(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, b0_read_data: 32, b0_done: 1, c0_read_data: 32, c0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, b0_addr0: 1, b0_write_data: 32, b0_write_en: 1, b0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1) {
   cells {
     a_read0_0 = prim std_reg(32);
diff --git a/frontends/relay-futil/tests/max_pool2d.expect b/frontends/relay-futil/tests/max_pool2d.expect
index cbd0da89b2..47decb83ba 100644
--- a/frontends/relay-futil/tests/max_pool2d.expect
+++ b/frontends/relay-futil/tests/max_pool2d.expect
@@ -10,8 +10,10 @@ component max_pool2d(go: 1, clk: 1, data0_0_0_0_read_data: 32, data0_0_0_0_done:
     add5 = prim std_add(32);
     add6 = prim std_add(32);
     add7 = prim std_add(32);
+    b0 = prim std_reg(32);
     bin_read0_0 = prim std_reg(32);
     bin_read1_0 = prim std_reg(32);
+    c0 = prim std_reg(32);
     const0 = prim std_const(32, 0);
     const1 = prim std_const(32, 1);
     const10 = prim std_const(32, 0);
@@ -32,13 +34,8 @@ component max_pool2d(go: 1, clk: 1, data0_0_0_0_read_data: 32, data0_0_0_0_done:
     const7 = prim std_const(32, 1);
     const8 = prim std_const(32, 2);
     const9 = prim std_const(32, 2);
-    data_read0_0 = prim std_reg(32);
-    data_read1_0 = prim std_reg(32);
+    current_0 = prim std_reg(32);
     gt0 = prim std_gt(32);
-    i0 = prim std_reg(32);
-    j0 = prim std_reg(32);
-    k0 = prim std_reg(32);
-    l0 = prim std_reg(32);
     le0 = prim std_le(32);
     le1 = prim std_le(32);
     le2 = prim std_le(32);
@@ -54,12 +51,8 @@ component max_pool2d(go: 1, clk: 1, data0_0_0_0_read_data: 32, data0_0_0_0_done:
     pool_y_0 = prim std_reg(32);
     slice0 = prim std_slice(32, 2);
     slice1 = prim std_slice(32, 2);
-    slice10 = prim std_slice(32, 3);
-    slice11 = prim std_slice(32, 3);
-    slice12 = prim std_slice(32, 2);
-    slice13 = prim std_slice(32, 2);
-    slice14 = prim std_slice(32, 2);
-    slice15 = prim std_slice(32, 2);
+    slice10 = prim std_slice(32, 2);
+    slice11 = prim std_slice(32, 2);
     slice2 = prim std_slice(32, 3);
     slice3 = prim std_slice(32, 3);
     slice4 = prim std_slice(32, 2);
@@ -68,28 +61,30 @@ component max_pool2d(go: 1, clk: 1, data0_0_0_0_read_data: 32, data0_0_0_0_done:
     slice7 = prim std_slice(32, 3);
     slice8 = prim std_slice(32, 2);
     slice9 = prim std_slice(32, 2);
-    stride_k_0 = prim std_reg(32);
-    stride_l_0 = prim std_reg(32);
+    stride_x_0 = prim std_reg(32);
+    stride_y_0 = prim std_reg(32);
+    x0 = prim std_reg(32);
+    y0 = prim std_reg(32);
   }
   wires {
     group cond0<"static"=0> {
       cond0[done] = 1'd1;
-      le0.left = i0.out;
+      le0.left = b0.out;
       le0.right = const1.out;
     }
     group cond1<"static"=0> {
       cond1[done] = 1'd1;
-      le1.left = j0.out;
+      le1.left = c0.out;
       le1.right = const3.out;
     }
     group cond2<"static"=0> {
       cond2[done] = 1'd1;
-      le2.left = k0.out;
+      le2.left = y0.out;
       le2.right = const5.out;
     }
     group cond3<"static"=0> {
       cond3[done] = 1'd1;
-      le3.left = l0.out;
+      le3.left = x0.out;
       le3.right = const7.out;
     }
     group cond4<"static"=0> {
@@ -104,68 +99,68 @@ component max_pool2d(go: 1, clk: 1, data0_0_0_0_read_data: 32, data0_0_0_0_done:
     }
     group cond6<"static"=0> {
       cond6[done] = 1'd1;
-      gt0.left = data_read0_0.out;
+      gt0.left = current_0.out;
       gt0.right = max_0.out;
     }
     group let0<"static"=1> {
-      i0.in = const0.out;
-      i0.write_en = 1'd1;
-      let0[done] = i0.done;
+      b0.in = const0.out;
+      b0.write_en = 1'd1;
+      let0[done] = b0.done;
     }
     group let1<"static"=1> {
-      j0.in = const2.out;
-      j0.write_en = 1'd1;
-      let1[done] = j0.done;
+      c0.in = const2.out;
+      c0.write_en = 1'd1;
+      let1[done] = c0.done;
     }
     group let10<"static"=1> {
-      pool_x_0.in = add0.out;
-      pool_x_0.write_en = 1'd1;
-      let10[done] = pool_x_0.done;
-      add0.left = stride_k_0.out;
+      pool_y_0.in = add0.out;
+      pool_y_0.write_en = 1'd1;
+      let10[done] = pool_y_0.done;
+      add0.left = stride_y_0.out;
       add0.right = m0.out;
     }
     group let11<"static"=1> {
-      pool_y_0.in = add1.out;
-      pool_y_0.write_en = 1'd1;
-      let11[done] = pool_y_0.done;
-      add1.left = stride_l_0.out;
+      pool_x_0.in = add1.out;
+      pool_x_0.write_en = 1'd1;
+      let11[done] = pool_x_0.done;
+      add1.left = stride_x_0.out;
       add1.right = n0.out;
     }
     group let2<"static"=1> {
-      k0.in = const4.out;
-      k0.write_en = 1'd1;
-      let2[done] = k0.done;
+      y0.in = const4.out;
+      y0.write_en = 1'd1;
+      let2[done] = y0.done;
     }
     group let3<"static"=1> {
-      l0.in = const6.out;
-      l0.write_en = 1'd1;
-      let3[done] = l0.done;
+      x0.in = const6.out;
+      x0.write_en = 1'd1;
+      let3[done] = x0.done;
     }
     group let4<"static"=4> {
       bin_read0_0.in = mult_pipe0.out;
       bin_read0_0.write_en = mult_pipe0.done;
       let4[done] = bin_read0_0.done;
-      mult_pipe0.left = k0.out;
+      mult_pipe0.left = y0.out;
       mult_pipe0.right = const8.out;
       mult_pipe0.go = !mult_pipe0.done ? 1'd1;
     }
     group let5<"static"=1> {
-      stride_k_0.in = bin_read0_0.out;
-      stride_k_0.write_en = 1'd1;
-      let5[done] = stride_k_0.done;
+      stride_y_0.in = bin_read0_0.out;
+      stride_y_0.write_en = 1'd1;
+      let5[done] = stride_y_0.done;
     }
     group let6<"static"=4> {
       bin_read1_0.in = mult_pipe1.out;
       bin_read1_0.write_en = mult_pipe1.done;
       let6[done] = bin_read1_0.done;
-      mult_pipe1.left = l0.out;
+      mult_pipe1.left = x0.out;
       mult_pipe1.right = const9.out;
       mult_pipe1.go = !mult_pipe1.done ? 1'd1;
     }
     group let7<"static"=1> {
-      stride_l_0.in = bin_read1_0.out;
-      stride_l_0.write_en = 1'd1;
-      let7[done] = stride_l_0.done;
+      stride_x_0.in = bin_read1_0.out;
+      stride_x_0.write_en = 1'd1;
+      let7[done] = stride_x_0.done;
     }
     group let8<"static"=1> {
       m0.in = const10.out;
@@ -180,101 +175,88 @@ component max_pool2d(go: 1, clk: 1, data0_0_0_0_read_data: 32, data0_0_0_0_done:
     group upd0<"static"=1> {
       max_0.write_en = 1'd1;
       data0_0_0_0_addr3 = slice3.out;
-      slice3.in = stride_l_0.out;
+      slice3.in = stride_x_0.out;
       data0_0_0_0_addr2 = slice2.out;
-      slice2.in = stride_k_0.out;
+      slice2.in = stride_y_0.out;
       data0_0_0_0_addr1 = slice1.out;
-      slice1.in = j0.out;
+      slice1.in = c0.out;
       data0_0_0_0_addr0 = slice0.out;
-      slice0.in = i0.out;
+      slice0.in = b0.out;
       max_0.in = 1'd1 ? data0_0_0_0_read_data;
       upd0[done] = max_0.done ? 1'd1;
     }
     group upd1<"static"=1> {
-      data_read0_0.write_en = 1'd1;
+      current_0.write_en = 1'd1;
       data0_0_0_0_addr3 = slice7.out;
-      slice7.in = pool_y_0.out;
+      slice7.in = pool_x_0.out;
       data0_0_0_0_addr2 = slice6.out;
-      slice6.in = pool_x_0.out;
+      slice6.in = pool_y_0.out;
       data0_0_0_0_addr1 = slice5.out;
-      slice5.in = j0.out;
+      slice5.in = c0.out;
       data0_0_0_0_addr0 = slice4.out;
-      slice4.in = i0.out;
-      data_read0_0.in = 1'd1 ? data0_0_0_0_read_data;
-      upd1[done] = data_read0_0.done ? 1'd1;
-    }
-    group upd10<"static"=1> {
-      i0.write_en = 1'd1;
-      add7.left = i0.out;
-      add7.right = const19.out;
-      i0.in = 1'd1 ? add7.out;
-      upd10[done] = i0.done ? 1'd1;
+      slice4.in = b0.out;
+      current_0.in = 1'd1 ? data0_0_0_0_read_data;
+      upd1[done] = current_0.done ? 1'd1;
     }
     group upd2<"static"=1> {
-      data_read1_0.write_en = 1'd1;
-      data0_0_0_0_addr3 = slice11.out;
-      slice11.in = pool_y_0.out;
-      data0_0_0_0_addr2 = slice10.out;
-      slice10.in = pool_x_0.out;
-      data0_0_0_0_addr1 = slice9.out;
-      slice9.in = j0.out;
-      data0_0_0_0_addr0 = slice8.out;
-      slice8.in = i0.out;
-      data_read1_0.in = 1'd1 ? data0_0_0_0_read_data;
-      upd2[done] = data_read1_0.done ? 1'd1;
-    }
-    group upd3<"static"=1> {
       max_0.write_en = 1'd1;
-      max_0.in = 1'd1 ? data_read1_0.out;
-      upd3[done] = max_0.done ? 1'd1;
+      max_0.in = 1'd1 ? current_0.out;
+      upd2[done] = max_0.done ? 1'd1;
     }
-    group upd4<"static"=1> {
+    group upd3<"static"=1> {
       n0.write_en = 1'd1;
       add2.left = n0.out;
       add2.right = const14.out;
       n0.in = 1'd1 ? add2.out;
-      upd4[done] = n0.done ? 1'd1;
+      upd3[done] = n0.done ? 1'd1;
     }
-    group upd5<"static"=1> {
+    group upd4<"static"=1> {
       m0.write_en = 1'd1;
       add3.left = m0.out;
       add3.right = const15.out;
       m0.in = 1'd1 ? add3.out;
-      upd5[done] = m0.done ? 1'd1;
+      upd4[done] = m0.done ? 1'd1;
     }
-    group upd6<"static"=1> {
-      result0_0_0_0_addr3 = slice15.out;
-      slice15.in = l0.out;
-      result0_0_0_0_addr2 = slice14.out;
-      slice14.in = k0.out;
-      result0_0_0_0_addr1 = slice13.out;
-      slice13.in = j0.out;
-      result0_0_0_0_addr0 = slice12.out;
-      slice12.in = i0.out;
+    group upd5<"static"=1> {
+      result0_0_0_0_addr3 = slice11.out;
+      slice11.in = x0.out;
+      result0_0_0_0_addr2 = slice10.out;
+      slice10.in = y0.out;
+      result0_0_0_0_addr1 = slice9.out;
+      slice9.in = c0.out;
+      result0_0_0_0_addr0 = slice8.out;
+      slice8.in = b0.out;
       result0_0_0_0_write_en = 1'd1;
       result0_0_0_0_write_data = 1'd1 ? max_0.out;
-      upd6[done] = result0_0_0_0_done ? 1'd1;
+      upd5[done] = result0_0_0_0_done ? 1'd1;
     }
-    group upd7<"static"=1> {
-      l0.write_en = 1'd1;
-      add4.left = l0.out;
+    group upd6<"static"=1> {
+      x0.write_en = 1'd1;
+      add4.left = x0.out;
       add4.right = const16.out;
-      l0.in = 1'd1 ? add4.out;
-      upd7[done] = l0.done ? 1'd1;
+      x0.in = 1'd1 ? add4.out;
+      upd6[done] = x0.done ? 1'd1;
     }
-    group upd8<"static"=1> {
-      k0.write_en = 1'd1;
-      add5.left = k0.out;
+    group upd7<"static"=1> {
+      y0.write_en = 1'd1;
+      add5.left = y0.out;
       add5.right = const17.out;
-      k0.in = 1'd1 ? add5.out;
-      upd8[done] = k0.done ? 1'd1;
+      y0.in = 1'd1 ? add5.out;
+      upd7[done] = y0.done ? 1'd1;
     }
-    group upd9<"static"=1> {
-      j0.write_en = 1'd1;
-      add6.left = j0.out;
+    group upd8<"static"=1> {
+      c0.write_en = 1'd1;
+      add6.left = c0.out;
       add6.right = const18.out;
-      j0.in = 1'd1 ? add6.out;
-      upd9[done] = j0.done ? 1'd1;
+      c0.in = 1'd1 ? add6.out;
+      upd8[done] = c0.done ? 1'd1;
+    }
+    group upd9<"static"=1> {
+      b0.write_en = 1'd1;
+      add7.left = b0.out;
+      add7.right = const19.out;
+      b0.in = 1'd1 ? add7.out;
+      upd9[done] = b0.done ? 1'd1;
     }
   }
 
@@ -303,44 +285,37 @@ component max_pool2d(go: 1, clk: 1, data0_0_0_0_read_data: 32, data0_0_0_0_done:
                         }
                       }
                       upd0;
-                      par {
+                      let8;
+                      while le4.out with cond4 {
                         seq {
-                          let8;
-                          while le4.out with cond4 {
+                          let9;
+                          while le5.out with cond5 {
                             seq {
-                              let9;
-                              while le5.out with cond5 {
-                                seq {
-                                  par {
-                                    let10;
-                                    let11;
-                                  }
-                                  upd1;
-                                  if gt0.out with cond6 {
-                                    seq {
-                                      upd2;
-                                      upd3;
-                                    }
-                                  }
-                                  upd4;
-                                }
+                              par {
+                                let10;
+                                let11;
+                              }
+                              upd1;
+                              if gt0.out with cond6 {
+                                upd2;
                               }
-                              upd5;
+                              upd3;
                             }
                           }
+                          upd4;
                         }
-                        upd6;
                       }
-                      upd7;
+                      upd5;
+                      upd6;
                     }
                   }
-                  upd8;
+                  upd7;
                 }
               }
-              upd9;
+              upd8;
             }
           }
-          upd10;
+          upd9;
         }
       }
     }
diff --git a/frontends/relay-futil/tests/mlp_net.expect b/frontends/relay-futil/tests/mlp_net.expect
index b780037235..9f7c781f88 100644
--- a/frontends/relay-futil/tests/mlp_net.expect
+++ b/frontends/relay-futil/tests/mlp_net.expect
@@ -30,6 +30,7 @@ component softmax(go: 1, clk: 1, x80_0_read_data: 32, x80_0_done: 1, x90_0_read_
     x8_expsum_0 = prim std_reg(32);
     x8_read0_0 = prim std_reg(32);
     x8_read1_0 = prim std_reg(32);
+    x9_read0_0 = prim std_reg(32);
   }
   wires {
     group cond0<"static"=0> {
@@ -72,11 +73,9 @@ component softmax(go: 1, clk: 1, x80_0_read_data: 32, x80_0_done: 1, x90_0_read_
       bin_read0_0.write_en = 1'd1;
       let4[done] = bin_read0_0.done;
       slice0.in = div_pipe0.out;
-      div_pipe0.left = exp1.out;
+      div_pipe0.left = x9_read0_0.out;
       div_pipe0.right = x8_expsum_0.out;
       div_pipe0.go = !div_pipe0.done ? 1'd1;
-      exp1.exponent = x8_read1_0.out;
-      exp1.go = !exp1.done ? 1'd1;
     }
     group upd0<"static"=1> {
       x8_read0_0.write_en = 1'd1;
@@ -85,7 +84,7 @@ component softmax(go: 1, clk: 1, x80_0_read_data: 32, x80_0_done: 1, x90_0_read_
       x8_read0_0.in = 1'd1 ? x80_0_read_data;
       upd0[done] = x8_read0_0.done ? 1'd1;
     }
-    group upd1<"static"=2> {
+    group upd1 {
       x8_expsum_0.write_en = 1'd1;
       add0.left = x8_expsum_0.out;
       add0.right = exp0.out;
@@ -108,26 +107,42 @@ component softmax(go: 1, clk: 1, x80_0_read_data: 32, x80_0_done: 1, x90_0_read_
       x8_read1_0.in = 1'd1 ? x80_0_read_data;
       upd3[done] = x8_read1_0.done ? 1'd1;
     }
-    group upd4<"static"=1> {
+    group upd4 {
       x90_0_addr1 = k0.out;
       x90_0_addr0 = i0.out;
-      x90_0_write_en = 1'd1;
-      x90_0_write_data = 1'd1 ? bin_read0_0.out;
+      x90_0_write_en = exp1.done;
+      exp1.exponent = x8_read1_0.out;
+      exp1.go = !exp1.done ? 1'd1;
+      x90_0_write_data = exp1.done ? exp1.out;
       upd4[done] = x90_0_done ? 1'd1;
     }
     group upd5<"static"=1> {
+      x9_read0_0.write_en = 1'd1;
+      x90_0_addr1 = k0.out;
+      x90_0_addr0 = i0.out;
+      x9_read0_0.in = 1'd1 ? x90_0_read_data;
+      upd5[done] = x9_read0_0.done ? 1'd1;
+    }
+    group upd6<"static"=1> {
+      x90_0_addr1 = k0.out;
+      x90_0_addr0 = i0.out;
+      x90_0_write_en = 1'd1;
+      x90_0_write_data = 1'd1 ? bin_read0_0.out;
+      upd6[done] = x90_0_done ? 1'd1;
+    }
+    group upd7<"static"=1> {
       k0.write_en = 1'd1;
       add2.left = k0.out;
       add2.right = const7.out;
       k0.in = 1'd1 ? add2.out;
-      upd5[done] = k0.done ? 1'd1;
+      upd7[done] = k0.done ? 1'd1;
     }
-    group upd6<"static"=1> {
+    group upd8<"static"=1> {
       i0.write_en = 1'd1;
       add3.left = i0.out;
       add3.right = const8.out;
       i0.in = 1'd1 ? add3.out;
-      upd6[done] = i0.done ? 1'd1;
+      upd8[done] = i0.done ? 1'd1;
     }
   }
 
@@ -153,17 +168,20 @@ component softmax(go: 1, clk: 1, x80_0_read_data: 32, x80_0_done: 1, x90_0_read_
           while le2.out with cond2 {
             seq {
               upd3;
-              let4;
               upd4;
               upd5;
+              let4;
+              upd6;
+              upd7;
             }
           }
-          upd6;
+          upd8;
         }
       }
     }
   }
 }
+
 component bias_add2(go: 1, clk: 1, fc3_bias0_read_data: 32, fc3_bias0_done: 1, x70_0_read_data: 32, x70_0_done: 1, x80_0_read_data: 32, x80_0_done: 1) -> (done: 1, fc3_bias0_addr0: 4, fc3_bias0_write_data: 32, fc3_bias0_write_en: 1, fc3_bias0_clk: 1, x70_0_addr0: 1, x70_0_addr1: 4, x70_0_write_data: 32, x70_0_write_en: 1, x70_0_clk: 1, x80_0_addr0: 1, x80_0_addr1: 4, x80_0_write_data: 32, x80_0_write_en: 1, x80_0_clk: 1) {
   cells {
     add0 = prim fixed_p_std_add(32, 16, 16);
@@ -263,6 +281,7 @@ component bias_add2(go: 1, clk: 1, fc3_bias0_read_data: 32, fc3_bias0_done: 1, x
     }
   }
 }
+
 component dense2(go: 1, clk: 1, fc3_weight0_0_read_data: 32, fc3_weight0_0_done: 1, x60_0_read_data: 32, x60_0_done: 1, x70_0_read_data: 32, x70_0_done: 1) -> (done: 1, fc3_weight0_0_addr0: 4, fc3_weight0_0_addr1: 7, fc3_weight0_0_write_data: 32, fc3_weight0_0_write_en: 1, fc3_weight0_0_clk: 1, x60_0_addr0: 1, x60_0_addr1: 7, x60_0_write_data: 32, x60_0_write_en: 1, x60_0_clk: 1, x70_0_addr0: 1, x70_0_addr1: 4, x70_0_write_data: 32, x70_0_write_en: 1, x70_0_clk: 1) {
   cells {
     add0 = prim std_add(7);
@@ -568,6 +587,7 @@ component dense2(go: 1, clk: 1, fc3_weight0_0_read_data: 32, fc3_weight0_0_done:
     }
   }
 }
+
 component relu1(go: 1, clk: 1, x50_0_read_data: 32, x50_0_done: 1, x60_0_read_data: 32, x60_0_done: 1) -> (done: 1, x50_0_addr0: 1, x50_0_addr1: 7, x50_0_write_data: 32, x50_0_write_en: 1, x50_0_clk: 1, x60_0_addr0: 1, x60_0_addr1: 7, x60_0_write_data: 32, x60_0_write_en: 1, x60_0_clk: 1) {
   cells {
     add0 = prim std_add(7);
@@ -690,6 +710,7 @@ component relu1(go: 1, clk: 1, x50_0_read_data: 32, x50_0_done: 1, x60_0_read_da
     }
   }
 }
+
 component bias_add1(go: 1, clk: 1, fc2_bias0_read_data: 32, fc2_bias0_done: 1, x40_0_read_data: 32, x40_0_done: 1, x50_0_read_data: 32, x50_0_done: 1) -> (done: 1, fc2_bias0_addr0: 7, fc2_bias0_write_data: 32, fc2_bias0_write_en: 1, fc2_bias0_clk: 1, x40_0_addr0: 1, x40_0_addr1: 7, x40_0_write_data: 32, x40_0_write_en: 1, x40_0_clk: 1, x50_0_addr0: 1, x50_0_addr1: 7, x50_0_write_data: 32, x50_0_write_en: 1, x50_0_clk: 1) {
   cells {
     add0 = prim fixed_p_std_add(32, 16, 16);
@@ -789,6 +810,7 @@ component bias_add1(go: 1, clk: 1, fc2_bias0_read_data: 32, fc2_bias0_done: 1, x
     }
   }
 }
+
 component dense1(go: 1, clk: 1, fc2_weight0_0_read_data: 32, fc2_weight0_0_done: 1, x30_0_read_data: 32, x30_0_done: 1, x40_0_read_data: 32, x40_0_done: 1) -> (done: 1, fc2_weight0_0_addr0: 7, fc2_weight0_0_addr1: 8, fc2_weight0_0_write_data: 32, fc2_weight0_0_write_en: 1, fc2_weight0_0_clk: 1, x30_0_addr0: 1, x30_0_addr1: 8, x30_0_write_data: 32, x30_0_write_en: 1, x30_0_clk: 1, x40_0_addr0: 1, x40_0_addr1: 7, x40_0_write_data: 32, x40_0_write_en: 1, x40_0_clk: 1) {
   cells {
     add0 = prim std_add(8);
@@ -1094,6 +1116,7 @@ component dense1(go: 1, clk: 1, fc2_weight0_0_read_data: 32, fc2_weight0_0_done:
     }
   }
 }
+
 component relu(go: 1, clk: 1, x20_0_read_data: 32, x20_0_done: 1, x30_0_read_data: 32, x30_0_done: 1) -> (done: 1, x20_0_addr0: 1, x20_0_addr1: 8, x20_0_write_data: 32, x20_0_write_en: 1, x20_0_clk: 1, x30_0_addr0: 1, x30_0_addr1: 8, x30_0_write_data: 32, x30_0_write_en: 1, x30_0_clk: 1) {
   cells {
     add0 = prim std_add(8);
@@ -1216,6 +1239,7 @@ component relu(go: 1, clk: 1, x20_0_read_data: 32, x20_0_done: 1, x30_0_read_dat
     }
   }
 }
+
 component bias_add(go: 1, clk: 1, fc1_bias0_read_data: 32, fc1_bias0_done: 1, x10_0_read_data: 32, x10_0_done: 1, x20_0_read_data: 32, x20_0_done: 1) -> (done: 1, fc1_bias0_addr0: 8, fc1_bias0_write_data: 32, fc1_bias0_write_en: 1, fc1_bias0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 8, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1, x20_0_addr0: 1, x20_0_addr1: 8, x20_0_write_data: 32, x20_0_write_en: 1, x20_0_clk: 1) {
   cells {
     add0 = prim fixed_p_std_add(32, 16, 16);
@@ -1315,6 +1339,7 @@ component bias_add(go: 1, clk: 1, fc1_bias0_read_data: 32, fc1_bias0_done: 1, x1
     }
   }
 }
+
 component dense(go: 1, clk: 1, fc1_weight0_0_read_data: 32, fc1_weight0_0_done: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1) -> (done: 1, fc1_weight0_0_addr0: 8, fc1_weight0_0_addr1: 10, fc1_weight0_0_write_data: 32, fc1_weight0_0_write_en: 1, fc1_weight0_0_clk: 1, x0_0_addr0: 1, x0_0_addr1: 10, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 8, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1) {
   cells {
     add0 = prim std_add(10);
@@ -1620,6 +1645,7 @@ component dense(go: 1, clk: 1, fc1_weight0_0_read_data: 32, fc1_weight0_0_done:
     }
   }
 }
+
 component batch_flatten(go: 1, clk: 1, data0_0_0_0_read_data: 32, data0_0_0_0_done: 1, x0_0_read_data: 32, x0_0_done: 1) -> (done: 1, data0_0_0_0_addr0: 1, data0_0_0_0_addr1: 1, data0_0_0_0_addr2: 5, data0_0_0_0_addr3: 5, data0_0_0_0_write_data: 32, data0_0_0_0_write_en: 1, data0_0_0_0_clk: 1, x0_0_addr0: 1, x0_0_addr1: 10, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1) {
   cells {
     add0 = prim std_add(10);
@@ -1821,6 +1847,7 @@ component main () -> () {
       batch_flatten0.data0_0_0_0_read_data = data.read_data;
       data.addr1 = batch_flatten0.data0_0_0_0_addr1;
       data.addr2 = batch_flatten0.data0_0_0_0_addr2;
+      data.addr3 = batch_flatten0.data0_0_0_0_addr3;
       x.addr0 = batch_flatten0.x0_0_addr0;
       x.addr1 = batch_flatten0.x0_0_addr1;
       x.write_data = batch_flatten0.x0_0_write_data;
diff --git a/frontends/relay-futil/tests/relu.expect b/frontends/relay-futil/tests/relu.expect
index 74b5646d9b..7a65c37f5a 100644
--- a/frontends/relay-futil/tests/relu.expect
+++ b/frontends/relay-futil/tests/relu.expect
@@ -201,7 +201,11 @@ component main () -> () {
       relu0.x0_0_0_0_read_data = x.read_data;
       x.addr1 = relu0.x0_0_0_0_addr1;
       x.addr2 = relu0.x0_0_0_0_addr2;
+      x.addr3 = relu0.x0_0_0_0_addr3;
       x1.addr0 = relu0.x10_0_0_0_addr0;
+      x1.addr1 = relu0.x10_0_0_0_addr1;
+      x1.addr2 = relu0.x10_0_0_0_addr2;
+      x1.addr3 = relu0.x10_0_0_0_addr3;
       x1.write_data = relu0.x10_0_0_0_write_data;
       x1.write_en = relu0.x10_0_0_0_write_en;
       relu0.x10_0_0_0_done = x1.done;
diff --git a/frontends/relay-futil/tests/sqrt.expect b/frontends/relay-futil/tests/sqrt.expect
index edb40c6259..2963943f4f 100644
--- a/frontends/relay-futil/tests/sqrt.expect
+++ b/frontends/relay-futil/tests/sqrt.expect
@@ -164,7 +164,11 @@ component main () -> () {
       sqrt0.x0_0_0_0_read_data = x.read_data;
       x.addr1 = sqrt0.x0_0_0_0_addr1;
       x.addr2 = sqrt0.x0_0_0_0_addr2;
+      x.addr3 = sqrt0.x0_0_0_0_addr3;
       x1.addr0 = sqrt0.x10_0_0_0_addr0;
+      x1.addr1 = sqrt0.x10_0_0_0_addr1;
+      x1.addr2 = sqrt0.x10_0_0_0_addr2;
+      x1.addr3 = sqrt0.x10_0_0_0_addr3;
       x1.write_data = sqrt0.x10_0_0_0_write_data;
       x1.write_en = sqrt0.x10_0_0_0_write_en;
       sqrt0.x10_0_0_0_done = x1.done;
diff --git a/frontends/relay-futil/tests/tensor3d_divide.expect b/frontends/relay-futil/tests/tensor3d_divide.expect
index 5058296dd8..a823a0ff96 100644
--- a/frontends/relay-futil/tests/tensor3d_divide.expect
+++ b/frontends/relay-futil/tests/tensor3d_divide.expect
@@ -56,7 +56,7 @@ component divide(go: 1, clk: 1, x0_0_0_read_data: 32, x0_0_0_done: 1, x10_0_0_re
       k0.write_en = 1'd1;
       let2[done] = k0.done;
     }
-    group let3<> {
+    group let3 {
       bin_read0_0.in = div_pipe0.out;
       bin_read0_0.write_en = div_pipe0.done;
       let3[done] = bin_read0_0.done;
diff --git a/fud/fud/main.py b/fud/fud/main.py
index 8d4cd2687c..6f2ba5810c 100644
--- a/fud/fud/main.py
+++ b/fud/fud/main.py
@@ -36,6 +36,10 @@ def register_stages(registry, config):
         futil.FutilStage(config, 'futil-noinline', '-b futil -d hole-inliner',
                          'Compile FuTIL to FuTIL to remove all control and inline groups'))
 
+    registry.register(
+        futil.FutilStage(config, 'futil-externalize', '-b futil -p externalize',
+                         'Compile FuTIL to FuTIL to externalize all external memories primitives'))
+
     # Verilator
     registry.register(
         verilator.VerilatorStage(config, 'vcd',

From fd3f133e3082f0637be37797e10995c5df2ad2a5 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Thu, 3 Dec 2020 21:49:00 -0500
Subject: [PATCH 66/75] Initial conv2d commit.

---
 frontends/relay-futil/compiler.py             |  2 +-
 frontends/relay-futil/dahlia_functions.py     | 54 ++++++++++++++++---
 frontends/relay-futil/example.py              |  8 ++-
 frontends/relay-futil/tests/conv2d.expect     |  0
 frontends/relay-futil/tests/conv2d.relay      |  6 +++
 .../relay-futil/tests/data/conv2d.expect      |  0
 frontends/relay-futil/tests/data/conv2d.relay |  5 ++
 .../relay-futil/tests/data/conv2d.relay.data  | 15 ++++++
 8 files changed, 81 insertions(+), 9 deletions(-)
 create mode 100644 frontends/relay-futil/tests/conv2d.expect
 create mode 100644 frontends/relay-futil/tests/conv2d.relay
 create mode 100644 frontends/relay-futil/tests/data/conv2d.expect
 create mode 100644 frontends/relay-futil/tests/data/conv2d.relay
 create mode 100644 frontends/relay-futil/tests/data/conv2d.relay.data

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index a6cd71c690..4b8b5c22e8 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -15,7 +15,7 @@
 # Mapping from Relay function names to their respective Dahlia lowering.
 RelayFunctionCalls = {'nn.dense': dense, 'nn.batch_flatten': batch_flatten, 'nn.batch_matmul': batch_matmul,
                       'nn.bias_add': bias_add, 'nn.relu': relu, 'nn.softmax': softmax, 'nn.max_pool2d': max_pool2d,
-                      'negative': negative, 'expand_dims': expand_dims, 'sqrt': sqrt}
+                      'nn.conv2d': conv2d, 'negative': negative, 'expand_dims': expand_dims, 'sqrt': sqrt}
 
 # Mapping between primitive type and associated Dahlia name extension.
 # E.g. A 2D memory primitive named `A` will be lowered to `A0_0`.
diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 14dc98ddd4..15cc65e38b 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -300,8 +300,6 @@ def batch_matmul(declaration):
 # of the matrix multiply. Otherwise, the values aren't computed properly. Look deeper into this.
 def dense(declaration):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.dense"""
-    # TODO(cgyurgyik): Add support for `units`.
-    units = declaration.attributes.get_int("units")
     op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
     bitwidth, M1_size0, M1_size1 = op1.data[0], op1.data[1], op1.data[2]
     M1_index_size0, M1_index_size1 = op1.data[3], op1.data[4]
@@ -352,7 +350,9 @@ def softmax(declaration):
     body = f"""
     for (let i: ubit<{index_size0}> = 0..{size0}) {{
       let {op.name}_expsum: {data_type}<{bitwidth}> = {zero};
-      for (let j: ubit<{index_size1}> = 0..{size1}) {{ {op.name}_expsum += exp({op.name}[i][j]); }}
+      for (let j: ubit<{index_size1}> = 0..{size1}) {{ 
+        {op.name}_expsum += exp({op.name}[i][j]); 
+      }}
       for (let k: ubit<{index_size1}> = 0..{size1}) {{ 
         {res.name}[i][k] := exp({op.name}[i][k]); 
         ---
@@ -370,10 +370,8 @@ def max_pool2d(declaration):
 
     strides = declaration.attributes.get_int_tuple("strides")
     pool_size = declaration.attributes.get_int_tuple("pool_size")
-    padding = declaration.attributes.get_int_tuple("padding")
     layout = declaration.attributes.get_str("layout")
     ceil_mode = declaration.attributes.get_int("ceil_mode")
-    for p in padding: assert p == 0, f"Non-zero padding: {padding} is not currently supported for nn.max_pool2d"
     assert layout == 'NCHW', f"Layout \'{layout}\' is not currently supported for nn.max_pool2d; please use `NCHW`"
     assert ceil_mode == False, "`ceil_mode` is not currently supported for nn.max_pool2d"
     bitwidth, data_type = data.data[0], data.data_type
@@ -385,8 +383,8 @@ def max_pool2d(declaration):
       for (let c: ubit<32> = 0..{size1}) {{
         for (let y: ubit<32> = 0..{size2}) {{
           for (let x: ubit<32> = 0..{size3}) {{
-            let stride_y: ubit<32> = y * {strides[1]}/*strides[1]*/;
-            let stride_x: ubit<32> = x * {strides[0]}/*strides[0]*/;
+            let stride_y: ubit<32> = y * {strides[0]}/*strides[0]*/;
+            let stride_x: ubit<32> = x * {strides[1]}/*strides[1]*/;
             
             let max: {data_type}<{bitwidth}> = {data.name}[b][c][stride_y][stride_x];
             for (let m: ubit<32> = 0..{pool_size[0]}/*pool_size[0]*/) {{
@@ -405,3 +403,45 @@ def max_pool2d(declaration):
     """
     program = f"""{declarations}{NEWL}{program_body}"""
     return lower_dahlia_program(program, declaration.component_name)
+
+
+# Only supports a small subset of the `conv2d` function. For example,
+# dilation and grouped convlution are not supported.
+def conv2d(declaration):
+    """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.conv2d"""
+    data, weight, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
+
+    strides = declaration.attributes.get_int_tuple("strides")
+    kernel_size = declaration.attributes.get_int_tuple("kernel_size")
+    channels = declaration.attributes.get_int("channels")
+    bitwidth, data_type = data.data[0], data.data_type
+    size0, size1, size2, size3 = res.data[1], res.data[2], res.data[3], res.data[4]
+
+    declarations = pp_dahlia_memory_declarations([res, data, weight])
+
+    zero = '0.0' if data_type == 'ufix' or data_type == 'fix' else '0'
+    program_body = f"""
+    for (let b: ubit<32> = 0..{size0}) {{
+      for (let c: ubit<32> = 0..{size1}) {{
+        for (let y: ubit<32> = 0..{size2}) {{
+          for (let x: ubit<32> = 0..{size3}) {{
+            let weighted_sum: {data_type}<{bitwidth}> = {zero};
+            
+            for (let k: ubit<32> = 0..{channels}) {{
+              for (let dy: ubit<32> = 0..{kernel_size[1]}/*kernel_size[1]*/) {{
+                for (let dx: ubit<32> = 0..{kernel_size[0]}/*kernel_size[0]*/) {{
+                  let kernel_y: ubit<32> = /*strides[0]*/{strides[0]} * y + dy;
+                  let kernel_x: ubit<32> = /*strides[1]*/{strides[1]} * x + dx;
+                  weighted_sum += {data.name}[b][k][kernel_y][kernel_x] * {weight.name}[c][k][dy][dx];
+                }}
+              }}
+            }}
+            {res.name}[b][c][y][x] := weighted_sum;
+          }} 
+        }} 
+      }} 
+    }} 
+    """
+    program = f"""{declarations}{NEWL}{program_body}"""
+
+    return lower_dahlia_program(program, declaration.component_name)
diff --git a/frontends/relay-futil/example.py b/frontends/relay-futil/example.py
index ca25a9bebd..34dc9120ee 100644
--- a/frontends/relay-futil/example.py
+++ b/frontends/relay-futil/example.py
@@ -59,6 +59,12 @@ def max_pool2d():
     return relay.Function([data], relay.nn.max_pool2d(data, padding=[0, 0, 0, 0], strides=[2, 2], pool_size=[2, 2]))
 
 
+def conv2d():
+    d = relay.var('data', shape=[1,2,2,2], dtype='int32')
+    w = relay.var('weight', shape=[1,2,2,2], dtype='int32')
+    return relay.Function([d, w], relay.nn.conv2d(d, w, padding=[1, 1, 1, 1], channels=1, kernel_size=[2,2]))
+
+
 def mlp_net():
     """The MLP test from Relay."""
     from tvm.relay.testing import mlp
@@ -73,7 +79,7 @@ def vgg_net():
 
 
 ALL_FUNCS = [add, tensor_subtract, expand_dims, batch_flatten, batch_matmul,
-             bias_add, relu, dense, softmax, mlp_net, vgg_net, max_pool2d]
+             bias_add, relu, dense, softmax, conv2d, max_pool2d, mlp_net, vgg_net]
 FUNC_NAMES = list(map(lambda x: x.__name__, ALL_FUNCS))
 
 
diff --git a/frontends/relay-futil/tests/conv2d.expect b/frontends/relay-futil/tests/conv2d.expect
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/frontends/relay-futil/tests/conv2d.relay b/frontends/relay-futil/tests/conv2d.relay
new file mode 100644
index 0000000000..f59a6cef8f
--- /dev/null
+++ b/frontends/relay-futil/tests/conv2d.relay
@@ -0,0 +1,6 @@
+v0.0.4
+fn (%data: Tensor[(5, 512, 14, 14), int32], %weight: Tensor[(512, 512, 3, 3), int32]) -> Tensor[(5, 512, 14, 14), int32] {
+  let %x: Tensor[(5, 512, 14, 14), int32] = nn.conv2d(%data, %weight, padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3]) /* ty=Tensor[(5, 512, 14, 14), int32] */;
+  %x
+}
+
diff --git a/frontends/relay-futil/tests/data/conv2d.expect b/frontends/relay-futil/tests/data/conv2d.expect
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/frontends/relay-futil/tests/data/conv2d.relay b/frontends/relay-futil/tests/data/conv2d.relay
new file mode 100644
index 0000000000..d85f4aae0e
--- /dev/null
+++ b/frontends/relay-futil/tests/data/conv2d.relay
@@ -0,0 +1,5 @@
+v0.0.4
+fn (%data: Tensor[(2, 2, 2, 2), int32], %weight: Tensor[(2, 2, 2, 2), int32]) {
+  let %x = nn.conv2d(%data, %weight, padding=[1, 1, 1, 1], channels=2, kernel_size=[2, 2]);
+  %x
+}
diff --git a/frontends/relay-futil/tests/data/conv2d.relay.data b/frontends/relay-futil/tests/data/conv2d.relay.data
new file mode 100644
index 0000000000..c149ae31ea
--- /dev/null
+++ b/frontends/relay-futil/tests/data/conv2d.relay.data
@@ -0,0 +1,15 @@
+{
+  "data": {
+    "data": [ [[[1,0], [0,11]], [[10,4], [11,14]]], [[[10,100], [0,0]], [[0,10], [0,4]]] ],
+    "bitwidth": 32
+  },
+  "weight": {
+    "data": [ [[[2,1], [3,4]], [[5,5], [0,3]]], [[[2,1], [4,4]], [[19,0], [20,0]]] ],
+    "bitwidth": 32
+  },
+  "x": {
+    "data": [ [[[0,0,0], [0,0,0], [0,0,0]], [[0,0,0], [0,0,0], [0,0,0]]],
+              [[[0,0,0], [0,0,0], [0,0,0]], [[0,0,0], [0,0,0], [0,0,0]]] ],
+    "bitwidth": 32
+  }
+}
\ No newline at end of file

From 2bc240eee34d886495f0248bc24c15fc613ccae1 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Thu, 3 Dec 2020 21:51:39 -0500
Subject: [PATCH 67/75] Update conv2d expect.

---
 frontends/relay-futil/tests/conv2d.expect | 395 ++++++++++++++++++++++
 frontends/relay-futil/tests/conv2d.relay  |   4 +-
 2 files changed, 397 insertions(+), 2 deletions(-)

diff --git a/frontends/relay-futil/tests/conv2d.expect b/frontends/relay-futil/tests/conv2d.expect
index e69de29bb2..a9fa6b4dfd 100644
--- a/frontends/relay-futil/tests/conv2d.expect
+++ b/frontends/relay-futil/tests/conv2d.expect
@@ -0,0 +1,395 @@
+import "primitives/std.lib";
+
+component conv2d(go: 1, clk: 1, data0_0_0_0_read_data: 32, data0_0_0_0_done: 1, weight0_0_0_0_read_data: 32, weight0_0_0_0_done: 1, x0_0_0_0_read_data: 32, x0_0_0_0_done: 1) -> (done: 1, data0_0_0_0_addr0: 3, data0_0_0_0_addr1: 10, data0_0_0_0_addr2: 4, data0_0_0_0_addr3: 4, data0_0_0_0_write_data: 32, data0_0_0_0_write_en: 1, data0_0_0_0_clk: 1, weight0_0_0_0_addr0: 10, weight0_0_0_0_addr1: 10, weight0_0_0_0_addr2: 2, weight0_0_0_0_addr3: 2, weight0_0_0_0_write_data: 32, weight0_0_0_0_write_en: 1, weight0_0_0_0_clk: 1, x0_0_0_0_addr0: 3, x0_0_0_0_addr1: 10, x0_0_0_0_addr2: 4, x0_0_0_0_addr3: 4, x0_0_0_0_write_data: 32, x0_0_0_0_write_en: 1, x0_0_0_0_clk: 1) {
+  cells {
+    add0 = prim std_add(32);
+    add1 = prim std_add(32);
+    add2 = prim fixed_p_std_add(32, 16, 16);
+    add3 = prim std_add(32);
+    add4 = prim std_add(32);
+    add5 = prim std_add(32);
+    add6 = prim std_add(32);
+    add7 = prim std_add(32);
+    add8 = prim std_add(32);
+    add9 = prim std_add(32);
+    b0 = prim std_reg(32);
+    bin_read0_0 = prim std_reg(32);
+    bin_read1_0 = prim std_reg(32);
+    bin_read2_0 = prim std_reg(32);
+    c0 = prim std_reg(32);
+    const0 = prim std_const(32, 0);
+    const1 = prim std_const(32, 4);
+    const10 = prim std_const(32, 0);
+    const11 = prim std_const(32, 2);
+    const12 = prim std_const(32, 0);
+    const13 = prim std_const(32, 2);
+    const14 = prim std_const(32, 1);
+    const15 = prim std_const(32, 1);
+    const16 = prim std_const(32, 1);
+    const17 = prim std_const(32, 1);
+    const18 = prim std_const(32, 1);
+    const19 = prim std_const(32, 1);
+    const2 = prim std_const(32, 0);
+    const20 = prim std_const(32, 1);
+    const21 = prim std_const(32, 1);
+    const22 = prim std_const(32, 1);
+    const3 = prim std_const(32, 511);
+    const4 = prim std_const(32, 0);
+    const5 = prim std_const(32, 13);
+    const6 = prim std_const(32, 0);
+    const7 = prim std_const(32, 13);
+    const8 = prim std_const(32, 0);
+    const9 = prim std_const(32, 511);
+    data_read0_0 = prim std_reg(32);
+    dx0 = prim std_reg(32);
+    dy0 = prim std_reg(32);
+    fpconst0 = prim fixed_p_std_const(32, 16, 16, 0, 0);
+    k0 = prim std_reg(32);
+    kernel_x_0 = prim std_reg(32);
+    kernel_y_0 = prim std_reg(32);
+    le0 = prim std_le(32);
+    le1 = prim std_le(32);
+    le2 = prim std_le(32);
+    le3 = prim std_le(32);
+    le4 = prim std_le(32);
+    le5 = prim std_le(32);
+    le6 = prim std_le(32);
+    mult_pipe0 = prim std_mult_pipe(32);
+    mult_pipe1 = prim std_mult_pipe(32);
+    mult_pipe2 = prim std_mult_pipe(32);
+    slice0 = prim std_slice(32, 3);
+    slice1 = prim std_slice(32, 10);
+    slice10 = prim std_slice(32, 10);
+    slice11 = prim std_slice(32, 4);
+    slice12 = prim std_slice(32, 4);
+    slice2 = prim std_slice(32, 4);
+    slice3 = prim std_slice(32, 4);
+    slice4 = prim std_slice(32, 10);
+    slice5 = prim std_slice(32, 10);
+    slice6 = prim std_slice(32, 2);
+    slice7 = prim std_slice(32, 2);
+    slice8 = prim std_slice(32, 32);
+    slice9 = prim std_slice(32, 3);
+    weight_read0_0 = prim std_reg(32);
+    weighted_sum_0 = prim std_reg(32);
+    x0 = prim std_reg(32);
+    y0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = b0.out;
+      le0.right = const1.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = c0.out;
+      le1.right = const3.out;
+    }
+    group cond2<"static"=0> {
+      cond2[done] = 1'd1;
+      le2.left = y0.out;
+      le2.right = const5.out;
+    }
+    group cond3<"static"=0> {
+      cond3[done] = 1'd1;
+      le3.left = x0.out;
+      le3.right = const7.out;
+    }
+    group cond4<"static"=0> {
+      cond4[done] = 1'd1;
+      le4.left = k0.out;
+      le4.right = const9.out;
+    }
+    group cond5<"static"=0> {
+      cond5[done] = 1'd1;
+      le5.left = dy0.out;
+      le5.right = const11.out;
+    }
+    group cond6<"static"=0> {
+      cond6[done] = 1'd1;
+      le6.left = dx0.out;
+      le6.right = const13.out;
+    }
+    group let0<"static"=1> {
+      b0.in = const0.out;
+      b0.write_en = 1'd1;
+      let0[done] = b0.done;
+    }
+    group let1<"static"=1> {
+      c0.in = const2.out;
+      c0.write_en = 1'd1;
+      let1[done] = c0.done;
+    }
+    group let10<"static"=4> {
+      bin_read1_0.in = mult_pipe1.out;
+      bin_read1_0.write_en = mult_pipe1.done;
+      let10[done] = bin_read1_0.done;
+      mult_pipe1.left = const15.out;
+      mult_pipe1.right = x0.out;
+      mult_pipe1.go = !mult_pipe1.done ? 1'd1;
+    }
+    group let11<"static"=1> {
+      kernel_x_0.in = add1.out;
+      kernel_x_0.write_en = 1'd1;
+      let11[done] = kernel_x_0.done;
+      add1.left = bin_read1_0.out;
+      add1.right = dx0.out;
+    }
+    group let12<"static"=1> {
+      bin_read2_0.in = slice8.out;
+      bin_read2_0.write_en = 1'd1;
+      let12[done] = bin_read2_0.done;
+      slice8.in = mult_pipe2.out;
+      mult_pipe2.left = data_read0_0.out;
+      mult_pipe2.right = weight_read0_0.out;
+      mult_pipe2.go = !mult_pipe2.done ? 1'd1;
+    }
+    group let2<"static"=1> {
+      y0.in = const4.out;
+      y0.write_en = 1'd1;
+      let2[done] = y0.done;
+    }
+    group let3<"static"=1> {
+      x0.in = const6.out;
+      x0.write_en = 1'd1;
+      let3[done] = x0.done;
+    }
+    group let4<"static"=1> {
+      weighted_sum_0.in = fpconst0.out;
+      weighted_sum_0.write_en = 1'd1;
+      let4[done] = weighted_sum_0.done;
+    }
+    group let5<"static"=1> {
+      k0.in = const8.out;
+      k0.write_en = 1'd1;
+      let5[done] = k0.done;
+    }
+    group let6<"static"=1> {
+      dy0.in = const10.out;
+      dy0.write_en = 1'd1;
+      let6[done] = dy0.done;
+    }
+    group let7<"static"=1> {
+      dx0.in = const12.out;
+      dx0.write_en = 1'd1;
+      let7[done] = dx0.done;
+    }
+    group let8<"static"=4> {
+      bin_read0_0.in = mult_pipe0.out;
+      bin_read0_0.write_en = mult_pipe0.done;
+      let8[done] = bin_read0_0.done;
+      mult_pipe0.left = const14.out;
+      mult_pipe0.right = y0.out;
+      mult_pipe0.go = !mult_pipe0.done ? 1'd1;
+    }
+    group let9<"static"=1> {
+      kernel_y_0.in = add0.out;
+      kernel_y_0.write_en = 1'd1;
+      let9[done] = kernel_y_0.done;
+      add0.left = bin_read0_0.out;
+      add0.right = dy0.out;
+    }
+    group upd0<"static"=1> {
+      data_read0_0.write_en = 1'd1;
+      data0_0_0_0_addr3 = slice3.out;
+      slice3.in = kernel_x_0.out;
+      data0_0_0_0_addr2 = slice2.out;
+      slice2.in = kernel_y_0.out;
+      data0_0_0_0_addr1 = slice1.out;
+      slice1.in = k0.out;
+      data0_0_0_0_addr0 = slice0.out;
+      slice0.in = b0.out;
+      data_read0_0.in = 1'd1 ? data0_0_0_0_read_data;
+      upd0[done] = data_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      weight_read0_0.write_en = 1'd1;
+      weight0_0_0_0_addr3 = slice7.out;
+      slice7.in = dx0.out;
+      weight0_0_0_0_addr2 = slice6.out;
+      slice6.in = dy0.out;
+      weight0_0_0_0_addr1 = slice5.out;
+      slice5.in = k0.out;
+      weight0_0_0_0_addr0 = slice4.out;
+      slice4.in = c0.out;
+      weight_read0_0.in = 1'd1 ? weight0_0_0_0_read_data;
+      upd1[done] = weight_read0_0.done ? 1'd1;
+    }
+    group upd10<"static"=1> {
+      b0.write_en = 1'd1;
+      add9.left = b0.out;
+      add9.right = const22.out;
+      b0.in = 1'd1 ? add9.out;
+      upd10[done] = b0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      weighted_sum_0.write_en = 1'd1;
+      add2.left = weighted_sum_0.out;
+      add2.right = bin_read2_0.out;
+      weighted_sum_0.in = 1'd1 ? add2.out;
+      upd2[done] = weighted_sum_0.done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      dx0.write_en = 1'd1;
+      add3.left = dx0.out;
+      add3.right = const16.out;
+      dx0.in = 1'd1 ? add3.out;
+      upd3[done] = dx0.done ? 1'd1;
+    }
+    group upd4<"static"=1> {
+      dy0.write_en = 1'd1;
+      add4.left = dy0.out;
+      add4.right = const17.out;
+      dy0.in = 1'd1 ? add4.out;
+      upd4[done] = dy0.done ? 1'd1;
+    }
+    group upd5<"static"=1> {
+      k0.write_en = 1'd1;
+      add5.left = k0.out;
+      add5.right = const18.out;
+      k0.in = 1'd1 ? add5.out;
+      upd5[done] = k0.done ? 1'd1;
+    }
+    group upd6<"static"=1> {
+      x0_0_0_0_addr3 = slice12.out;
+      slice12.in = x0.out;
+      x0_0_0_0_addr2 = slice11.out;
+      slice11.in = y0.out;
+      x0_0_0_0_addr1 = slice10.out;
+      slice10.in = c0.out;
+      x0_0_0_0_addr0 = slice9.out;
+      slice9.in = b0.out;
+      x0_0_0_0_write_en = 1'd1;
+      x0_0_0_0_write_data = 1'd1 ? weighted_sum_0.out;
+      upd6[done] = x0_0_0_0_done ? 1'd1;
+    }
+    group upd7<"static"=1> {
+      x0.write_en = 1'd1;
+      add6.left = x0.out;
+      add6.right = const19.out;
+      x0.in = 1'd1 ? add6.out;
+      upd7[done] = x0.done ? 1'd1;
+    }
+    group upd8<"static"=1> {
+      y0.write_en = 1'd1;
+      add7.left = y0.out;
+      add7.right = const20.out;
+      y0.in = 1'd1 ? add7.out;
+      upd8[done] = y0.done ? 1'd1;
+    }
+    group upd9<"static"=1> {
+      c0.write_en = 1'd1;
+      add8.left = c0.out;
+      add8.right = const21.out;
+      c0.in = 1'd1 ? add8.out;
+      upd9[done] = c0.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      while le0.out with cond0 {
+        seq {
+          let1;
+          while le1.out with cond1 {
+            seq {
+              let2;
+              while le2.out with cond2 {
+                seq {
+                  let3;
+                  while le3.out with cond3 {
+                    seq {
+                      par {
+                        let4;
+                        seq {
+                          let5;
+                          while le4.out with cond4 {
+                            seq {
+                              let6;
+                              while le5.out with cond5 {
+                                seq {
+                                  let7;
+                                  while le6.out with cond6 {
+                                    seq {
+                                      par {
+                                        seq {
+                                          let8;
+                                          let9;
+                                        }
+                                        seq {
+                                          let10;
+                                          let11;
+                                        }
+                                      }
+                                      par {
+                                        upd0;
+                                        upd1;
+                                      }
+                                      let12;
+                                      upd2;
+                                      upd3;
+                                    }
+                                  }
+                                  upd4;
+                                }
+                              }
+                              upd5;
+                            }
+                          }
+                        }
+                      }
+                      upd6;
+                      upd7;
+                    }
+                  }
+                  upd8;
+                }
+              }
+              upd9;
+            }
+          }
+          upd10;
+        }
+      }
+    }
+  }
+}
+
+component main () -> () {
+  cells {
+    x = prim std_mem_d4(32, 5, 512, 14, 14, 3, 10, 4, 4);
+    data = prim std_mem_d4(32, 5, 512, 14, 14, 3, 10, 4, 4);
+    weight = prim std_mem_d4(32, 512, 512, 3, 3, 10, 10, 2, 2);
+    conv2d0 = conv2d;
+  }
+  wires {
+    group run_conv2d {
+      data.addr0 = conv2d0.data0_0_0_0_addr0;
+      conv2d0.data0_0_0_0_read_data = data.read_data;
+      data.addr1 = conv2d0.data0_0_0_0_addr1;
+      data.addr2 = conv2d0.data0_0_0_0_addr2;
+      data.addr3 = conv2d0.data0_0_0_0_addr3;
+      weight.addr0 = conv2d0.weight0_0_0_0_addr0;
+      conv2d0.weight0_0_0_0_read_data = weight.read_data;
+      weight.addr1 = conv2d0.weight0_0_0_0_addr1;
+      weight.addr2 = conv2d0.weight0_0_0_0_addr2;
+      weight.addr3 = conv2d0.weight0_0_0_0_addr3;
+      x.addr0 = conv2d0.x0_0_0_0_addr0;
+      x.addr1 = conv2d0.x0_0_0_0_addr1;
+      x.addr2 = conv2d0.x0_0_0_0_addr2;
+      x.addr3 = conv2d0.x0_0_0_0_addr3;
+      x.write_data = conv2d0.x0_0_0_0_write_data;
+      x.write_en = conv2d0.x0_0_0_0_write_en;
+      conv2d0.x0_0_0_0_done = x.done;
+      conv2d0.go = 1'd1;
+      run_conv2d[done] = conv2d0.done ? 1'd1;
+    }
+  }
+  control {
+    seq {
+      run_conv2d;
+    }
+  }
+}
diff --git a/frontends/relay-futil/tests/conv2d.relay b/frontends/relay-futil/tests/conv2d.relay
index f59a6cef8f..e759bab61a 100644
--- a/frontends/relay-futil/tests/conv2d.relay
+++ b/frontends/relay-futil/tests/conv2d.relay
@@ -1,6 +1,6 @@
 v0.0.4
-fn (%data: Tensor[(5, 512, 14, 14), int32], %weight: Tensor[(512, 512, 3, 3), int32]) -> Tensor[(5, 512, 14, 14), int32] {
-  let %x: Tensor[(5, 512, 14, 14), int32] = nn.conv2d(%data, %weight, padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3]) /* ty=Tensor[(5, 512, 14, 14), int32] */;
+fn (%data: Tensor[(5, 512, 14, 14), float32], %weight: Tensor[(512, 512, 3, 3), float32]) -> Tensor[(5, 512, 14, 14), float32] {
+  let %x: Tensor[(5, 512, 14, 14), float32] = nn.conv2d(%data, %weight, padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3]) /* ty=Tensor[(5, 512, 14, 14), float32] */;
   %x
 }
 

From 839a40e6d0d4ba044150b22a24e1f2a34a28f4f5 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Fri, 4 Dec 2020 12:34:29 -0500
Subject: [PATCH 68/75] Singular.

---
 fud/fud/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fud/fud/main.py b/fud/fud/main.py
index 6f2ba5810c..04fdee2709 100644
--- a/fud/fud/main.py
+++ b/fud/fud/main.py
@@ -38,7 +38,7 @@ def register_stages(registry, config):
 
     registry.register(
         futil.FutilStage(config, 'futil-externalize', '-b futil -p externalize',
-                         'Compile FuTIL to FuTIL to externalize all external memories primitives'))
+                         'Compile FuTIL to FuTIL to externalize all external memory primitives'))
 
     # Verilator
     registry.register(

From 3dc4b291c5892f9825b3db3c6b22a573f988281d Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Sat, 5 Dec 2020 14:52:07 -0500
Subject: [PATCH 69/75] conv2d.

---
 frontends/relay-futil/dahlia_functions.py     |  19 ++-
 frontends/relay-futil/example.py              |   6 +-
 frontends/relay-futil/tests/conv2d.expect     |  18 +--
 .../relay-futil/tests/data/conv2d.expect      | 120 ++++++++++++++++++
 frontends/relay-futil/tests/data/conv2d.relay |   2 +-
 .../relay-futil/tests/data/conv2d.relay.data  |   7 +-
 6 files changed, 145 insertions(+), 27 deletions(-)

diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 15cc65e38b..1822c45c4f 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -191,7 +191,7 @@ def relu(declaration):
 def negative(declaration):
     """https://tvm.apache.org/docs/api/python/relay/index.html#tvm.relay.negative"""
     op, res = declaration.inputs[0].primitive, declaration.output.primitive
-    bitwidth, num_dimensions = op.data[0], op.type
+    bitwidth, num_dimensions, data_type = op.data[0], op.type, op.data_type
 
     indices = ""
     variable_name = CHARACTER_I
@@ -201,7 +201,8 @@ def negative(declaration):
         variable_name = next_character(variable_name)
 
     declarations = pp_dahlia_memory_declarations([op, res])
-    program_body = pp_dahlia_loop(op, f"""{res.name}{indices} := -{op.name}{indices};""")
+    zero = '0.0' if data_type == 'ufix' or data_type == 'fix' else '0'
+    program_body = pp_dahlia_loop(op, f"""{res.name}{indices} := {zero} - {op.name}{indices};""")
     return lower_dahlia_program(f"""{declarations}{NEWL}{program_body}""", declaration.component_name)
 
 
@@ -406,7 +407,7 @@ def max_pool2d(declaration):
 
 
 # Only supports a small subset of the `conv2d` function. For example,
-# dilation and grouped convlution are not supported.
+# dilation and grouped convolution are not supported.
 def conv2d(declaration):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.conv2d"""
     data, weight, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
@@ -425,23 +426,21 @@ def conv2d(declaration):
       for (let c: ubit<32> = 0..{size1}) {{
         for (let y: ubit<32> = 0..{size2}) {{
           for (let x: ubit<32> = 0..{size3}) {{
-            let weighted_sum: {data_type}<{bitwidth}> = {zero};
+            let sum: {data_type}<{bitwidth}> = {zero};
             
             for (let k: ubit<32> = 0..{channels}) {{
               for (let dy: ubit<32> = 0..{kernel_size[1]}/*kernel_size[1]*/) {{
                 for (let dx: ubit<32> = 0..{kernel_size[0]}/*kernel_size[0]*/) {{
-                  let kernel_y: ubit<32> = /*strides[0]*/{strides[0]} * y + dy;
-                  let kernel_x: ubit<32> = /*strides[1]*/{strides[1]} * x + dx;
-                  weighted_sum += {data.name}[b][k][kernel_y][kernel_x] * {weight.name}[c][k][dy][dx];
-                }}
+                  let kernel_y: ubit<32> = (/*strides[0]*/{strides[0]} * y) + dy;
+                  let kernel_x: ubit<32> = (/*strides[1]*/{strides[1]} * x) + dx;     
+                }} combine {{ sum += {data.name}[b][k][kernel_y][kernel_x] * {weight.name}[c][k][dy][dx]; }}
               }}
             }}
-            {res.name}[b][c][y][x] := weighted_sum;
+            {res.name}[b][c][y][x] := sum;
           }} 
         }} 
       }} 
     }} 
     """
     program = f"""{declarations}{NEWL}{program_body}"""
-
     return lower_dahlia_program(program, declaration.component_name)
diff --git a/frontends/relay-futil/example.py b/frontends/relay-futil/example.py
index 34dc9120ee..1028e7cb47 100644
--- a/frontends/relay-futil/example.py
+++ b/frontends/relay-futil/example.py
@@ -60,9 +60,9 @@ def max_pool2d():
 
 
 def conv2d():
-    d = relay.var('data', shape=[1,2,2,2], dtype='int32')
-    w = relay.var('weight', shape=[1,2,2,2], dtype='int32')
-    return relay.Function([d, w], relay.nn.conv2d(d, w, padding=[1, 1, 1, 1], channels=1, kernel_size=[2,2]))
+    d = relay.var('data', shape=[5, 512, 14, 14], dtype='int32')
+    w = relay.var('weight', shape=[512, 512, 3, 3], dtype='int32')
+    return relay.Function([d, w], relay.nn.conv2d(d, w, padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3]))
 
 
 def mlp_net():
diff --git a/frontends/relay-futil/tests/conv2d.expect b/frontends/relay-futil/tests/conv2d.expect
index a9fa6b4dfd..1d2163c61e 100644
--- a/frontends/relay-futil/tests/conv2d.expect
+++ b/frontends/relay-futil/tests/conv2d.expect
@@ -70,8 +70,8 @@ component conv2d(go: 1, clk: 1, data0_0_0_0_read_data: 32, data0_0_0_0_done: 1,
     slice7 = prim std_slice(32, 2);
     slice8 = prim std_slice(32, 32);
     slice9 = prim std_slice(32, 3);
+    sum_0 = prim std_reg(32);
     weight_read0_0 = prim std_reg(32);
-    weighted_sum_0 = prim std_reg(32);
     x0 = prim std_reg(32);
     y0 = prim std_reg(32);
   }
@@ -156,9 +156,9 @@ component conv2d(go: 1, clk: 1, data0_0_0_0_read_data: 32, data0_0_0_0_done: 1,
       let3[done] = x0.done;
     }
     group let4<"static"=1> {
-      weighted_sum_0.in = fpconst0.out;
-      weighted_sum_0.write_en = 1'd1;
-      let4[done] = weighted_sum_0.done;
+      sum_0.in = fpconst0.out;
+      sum_0.write_en = 1'd1;
+      let4[done] = sum_0.done;
     }
     group let5<"static"=1> {
       k0.in = const8.out;
@@ -224,11 +224,11 @@ component conv2d(go: 1, clk: 1, data0_0_0_0_read_data: 32, data0_0_0_0_done: 1,
       upd10[done] = b0.done ? 1'd1;
     }
     group upd2<"static"=1> {
-      weighted_sum_0.write_en = 1'd1;
-      add2.left = weighted_sum_0.out;
+      sum_0.write_en = 1'd1;
+      add2.left = sum_0.out;
       add2.right = bin_read2_0.out;
-      weighted_sum_0.in = 1'd1 ? add2.out;
-      upd2[done] = weighted_sum_0.done ? 1'd1;
+      sum_0.in = 1'd1 ? add2.out;
+      upd2[done] = sum_0.done ? 1'd1;
     }
     group upd3<"static"=1> {
       dx0.write_en = 1'd1;
@@ -261,7 +261,7 @@ component conv2d(go: 1, clk: 1, data0_0_0_0_read_data: 32, data0_0_0_0_done: 1,
       x0_0_0_0_addr0 = slice9.out;
       slice9.in = b0.out;
       x0_0_0_0_write_en = 1'd1;
-      x0_0_0_0_write_data = 1'd1 ? weighted_sum_0.out;
+      x0_0_0_0_write_data = 1'd1 ? sum_0.out;
       upd6[done] = x0_0_0_0_done ? 1'd1;
     }
     group upd7<"static"=1> {
diff --git a/frontends/relay-futil/tests/data/conv2d.expect b/frontends/relay-futil/tests/data/conv2d.expect
index e69de29bb2..2f8cb5e0be 100644
--- a/frontends/relay-futil/tests/data/conv2d.expect
+++ b/frontends/relay-futil/tests/data/conv2d.expect
@@ -0,0 +1,120 @@
+{
+  "data": [
+    [
+      [
+        [
+          1,
+          1
+        ],
+        [
+          4,
+          1
+        ]
+      ],
+      [
+        [
+          1,
+          1
+        ],
+        [
+          1,
+          1
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          1,
+          1
+        ],
+        [
+          1,
+          1
+        ]
+      ],
+      [
+        [
+          1,
+          1
+        ],
+        [
+          1,
+          1
+        ]
+      ]
+    ]
+  ],
+  "weight": [
+    [
+      [
+        [
+          2,
+          1
+        ],
+        [
+          1,
+          1
+        ]
+      ],
+      [
+        [
+          1,
+          1
+        ],
+        [
+          1,
+          1
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          1,
+          1
+        ],
+        [
+          1,
+          1
+        ]
+      ],
+      [
+        [
+          1,
+          3
+        ],
+        [
+          1,
+          4
+        ]
+      ]
+    ]
+  ],
+  "x": [
+    [
+      [
+        [
+          12
+        ]
+      ],
+      [
+        [
+          16
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          9
+        ]
+      ],
+      [
+        [
+          13
+        ]
+      ]
+    ]
+  ]
+}
diff --git a/frontends/relay-futil/tests/data/conv2d.relay b/frontends/relay-futil/tests/data/conv2d.relay
index d85f4aae0e..168e53e418 100644
--- a/frontends/relay-futil/tests/data/conv2d.relay
+++ b/frontends/relay-futil/tests/data/conv2d.relay
@@ -1,5 +1,5 @@
 v0.0.4
 fn (%data: Tensor[(2, 2, 2, 2), int32], %weight: Tensor[(2, 2, 2, 2), int32]) {
-  let %x = nn.conv2d(%data, %weight, padding=[1, 1, 1, 1], channels=2, kernel_size=[2, 2]);
+  let %x = nn.conv2d(%data, %weight, channels=2, kernel_size=[2,2]);
   %x
 }
diff --git a/frontends/relay-futil/tests/data/conv2d.relay.data b/frontends/relay-futil/tests/data/conv2d.relay.data
index c149ae31ea..81591e0997 100644
--- a/frontends/relay-futil/tests/data/conv2d.relay.data
+++ b/frontends/relay-futil/tests/data/conv2d.relay.data
@@ -1,15 +1,14 @@
 {
   "data": {
-    "data": [ [[[1,0], [0,11]], [[10,4], [11,14]]], [[[10,100], [0,0]], [[0,10], [0,4]]] ],
+    "data": [ [[[1,1], [4,1]], [[1,1], [1,1]]], [[[1,1], [1,1]], [[1,1], [1,1]]] ],
     "bitwidth": 32
   },
   "weight": {
-    "data": [ [[[2,1], [3,4]], [[5,5], [0,3]]], [[[2,1], [4,4]], [[19,0], [20,0]]] ],
+    "data": [ [[[2,1], [1,1]], [[1,1], [1,1]]], [[[1,1], [1,1]], [[1,3], [1,4]]] ],
     "bitwidth": 32
   },
   "x": {
-    "data": [ [[[0,0,0], [0,0,0], [0,0,0]], [[0,0,0], [0,0,0], [0,0,0]]],
-              [[[0,0,0], [0,0,0], [0,0,0]], [[0,0,0], [0,0,0], [0,0,0]]] ],
+    "data": [ [[[0]], [[0]]], [[[0]], [[0]]] ],
     "bitwidth": 32
   }
 }
\ No newline at end of file

From 13ec4ff31824f5a00f48143530881c24cf01a710 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Sat, 5 Dec 2020 14:57:55 -0500
Subject: [PATCH 70/75] Remove mlp from test suite.

---
 frontends/relay-futil/tests/mlp_net.expect | 1997 --------------------
 frontends/relay-futil/tests/mlp_net.relay  |   16 -
 2 files changed, 2013 deletions(-)
 delete mode 100644 frontends/relay-futil/tests/mlp_net.expect
 delete mode 100644 frontends/relay-futil/tests/mlp_net.relay

diff --git a/frontends/relay-futil/tests/mlp_net.expect b/frontends/relay-futil/tests/mlp_net.expect
deleted file mode 100644
index 9f7c781f88..0000000000
--- a/frontends/relay-futil/tests/mlp_net.expect
+++ /dev/null
@@ -1,1997 +0,0 @@
-import "primitives/std.lib";
-
-component softmax(go: 1, clk: 1, x80_0_read_data: 32, x80_0_done: 1, x90_0_read_data: 32, x90_0_done: 1) -> (done: 1, x80_0_addr0: 1, x80_0_addr1: 4, x80_0_write_data: 32, x80_0_write_en: 1, x80_0_clk: 1, x90_0_addr0: 1, x90_0_addr1: 4, x90_0_write_data: 32, x90_0_write_en: 1, x90_0_clk: 1) {
-  cells {
-    add0 = prim fixed_p_std_add(32, 16, 16);
-    add1 = prim std_add(4);
-    add2 = prim std_add(4);
-    add3 = prim std_add(1);
-    bin_read0_0 = prim std_reg(32);
-    const0 = prim std_const(1, 0);
-    const1 = prim std_const(1, 0);
-    const2 = prim std_const(4, 0);
-    const3 = prim std_const(4, 9);
-    const4 = prim std_const(4, 1);
-    const5 = prim std_const(4, 0);
-    const6 = prim std_const(4, 9);
-    const7 = prim std_const(4, 1);
-    const8 = prim std_const(1, 1);
-    div_pipe0 = prim std_div_pipe(32);
-    exp0 = prim std_exp();
-    exp1 = prim std_exp();
-    fpconst0 = prim fixed_p_std_const(32, 16, 16, 0, 0);
-    i0 = prim std_reg(1);
-    j0 = prim std_reg(4);
-    k0 = prim std_reg(4);
-    le0 = prim std_le(1);
-    le1 = prim std_le(4);
-    le2 = prim std_le(4);
-    slice0 = prim std_slice(32, 32);
-    x8_expsum_0 = prim std_reg(32);
-    x8_read0_0 = prim std_reg(32);
-    x8_read1_0 = prim std_reg(32);
-    x9_read0_0 = prim std_reg(32);
-  }
-  wires {
-    group cond0<"static"=0> {
-      cond0[done] = 1'd1;
-      le0.left = i0.out;
-      le0.right = const1.out;
-    }
-    group cond1<"static"=0> {
-      cond1[done] = 1'd1;
-      le1.left = j0.out;
-      le1.right = const3.out;
-    }
-    group cond2<"static"=0> {
-      cond2[done] = 1'd1;
-      le2.left = k0.out;
-      le2.right = const6.out;
-    }
-    group let0<"static"=1> {
-      i0.in = const0.out;
-      i0.write_en = 1'd1;
-      let0[done] = i0.done;
-    }
-    group let1<"static"=1> {
-      x8_expsum_0.in = fpconst0.out;
-      x8_expsum_0.write_en = 1'd1;
-      let1[done] = x8_expsum_0.done;
-    }
-    group let2<"static"=1> {
-      j0.in = const2.out;
-      j0.write_en = 1'd1;
-      let2[done] = j0.done;
-    }
-    group let3<"static"=1> {
-      k0.in = const5.out;
-      k0.write_en = 1'd1;
-      let3[done] = k0.done;
-    }
-    group let4<"static"=1> {
-      bin_read0_0.in = slice0.out;
-      bin_read0_0.write_en = 1'd1;
-      let4[done] = bin_read0_0.done;
-      slice0.in = div_pipe0.out;
-      div_pipe0.left = x9_read0_0.out;
-      div_pipe0.right = x8_expsum_0.out;
-      div_pipe0.go = !div_pipe0.done ? 1'd1;
-    }
-    group upd0<"static"=1> {
-      x8_read0_0.write_en = 1'd1;
-      x80_0_addr1 = j0.out;
-      x80_0_addr0 = i0.out;
-      x8_read0_0.in = 1'd1 ? x80_0_read_data;
-      upd0[done] = x8_read0_0.done ? 1'd1;
-    }
-    group upd1 {
-      x8_expsum_0.write_en = 1'd1;
-      add0.left = x8_expsum_0.out;
-      add0.right = exp0.out;
-      exp0.exponent = x8_read0_0.out;
-      exp0.go = !exp0.done ? 1'd1;
-      x8_expsum_0.in = 1'd1 ? add0.out;
-      upd1[done] = x8_expsum_0.done ? 1'd1;
-    }
-    group upd2<"static"=1> {
-      j0.write_en = 1'd1;
-      add1.left = j0.out;
-      add1.right = const4.out;
-      j0.in = 1'd1 ? add1.out;
-      upd2[done] = j0.done ? 1'd1;
-    }
-    group upd3<"static"=1> {
-      x8_read1_0.write_en = 1'd1;
-      x80_0_addr1 = k0.out;
-      x80_0_addr0 = i0.out;
-      x8_read1_0.in = 1'd1 ? x80_0_read_data;
-      upd3[done] = x8_read1_0.done ? 1'd1;
-    }
-    group upd4 {
-      x90_0_addr1 = k0.out;
-      x90_0_addr0 = i0.out;
-      x90_0_write_en = exp1.done;
-      exp1.exponent = x8_read1_0.out;
-      exp1.go = !exp1.done ? 1'd1;
-      x90_0_write_data = exp1.done ? exp1.out;
-      upd4[done] = x90_0_done ? 1'd1;
-    }
-    group upd5<"static"=1> {
-      x9_read0_0.write_en = 1'd1;
-      x90_0_addr1 = k0.out;
-      x90_0_addr0 = i0.out;
-      x9_read0_0.in = 1'd1 ? x90_0_read_data;
-      upd5[done] = x9_read0_0.done ? 1'd1;
-    }
-    group upd6<"static"=1> {
-      x90_0_addr1 = k0.out;
-      x90_0_addr0 = i0.out;
-      x90_0_write_en = 1'd1;
-      x90_0_write_data = 1'd1 ? bin_read0_0.out;
-      upd6[done] = x90_0_done ? 1'd1;
-    }
-    group upd7<"static"=1> {
-      k0.write_en = 1'd1;
-      add2.left = k0.out;
-      add2.right = const7.out;
-      k0.in = 1'd1 ? add2.out;
-      upd7[done] = k0.done ? 1'd1;
-    }
-    group upd8<"static"=1> {
-      i0.write_en = 1'd1;
-      add3.left = i0.out;
-      add3.right = const8.out;
-      i0.in = 1'd1 ? add3.out;
-      upd8[done] = i0.done ? 1'd1;
-    }
-  }
-
-  control {
-    seq {
-      let0;
-      while le0.out with cond0 {
-        seq {
-          par {
-            let1;
-            seq {
-              let2;
-              while le1.out with cond1 {
-                seq {
-                  upd0;
-                  upd1;
-                  upd2;
-                }
-              }
-            }
-          }
-          let3;
-          while le2.out with cond2 {
-            seq {
-              upd3;
-              upd4;
-              upd5;
-              let4;
-              upd6;
-              upd7;
-            }
-          }
-          upd8;
-        }
-      }
-    }
-  }
-}
-
-component bias_add2(go: 1, clk: 1, fc3_bias0_read_data: 32, fc3_bias0_done: 1, x70_0_read_data: 32, x70_0_done: 1, x80_0_read_data: 32, x80_0_done: 1) -> (done: 1, fc3_bias0_addr0: 4, fc3_bias0_write_data: 32, fc3_bias0_write_en: 1, fc3_bias0_clk: 1, x70_0_addr0: 1, x70_0_addr1: 4, x70_0_write_data: 32, x70_0_write_en: 1, x70_0_clk: 1, x80_0_addr0: 1, x80_0_addr1: 4, x80_0_write_data: 32, x80_0_write_en: 1, x80_0_clk: 1) {
-  cells {
-    add0 = prim fixed_p_std_add(32, 16, 16);
-    add1 = prim std_add(4);
-    add2 = prim std_add(1);
-    const0 = prim std_const(1, 0);
-    const1 = prim std_const(1, 0);
-    const2 = prim std_const(4, 0);
-    const3 = prim std_const(4, 9);
-    const4 = prim std_const(4, 1);
-    const5 = prim std_const(1, 1);
-    fc3_bias_read0_0 = prim std_reg(32);
-    i0 = prim std_reg(1);
-    j0 = prim std_reg(4);
-    le0 = prim std_le(1);
-    le1 = prim std_le(4);
-    x7_read0_0 = prim std_reg(32);
-  }
-  wires {
-    group cond0<"static"=0> {
-      cond0[done] = 1'd1;
-      le0.left = i0.out;
-      le0.right = const1.out;
-    }
-    group cond1<"static"=0> {
-      cond1[done] = 1'd1;
-      le1.left = j0.out;
-      le1.right = const3.out;
-    }
-    group let0<"static"=1> {
-      i0.in = const0.out;
-      i0.write_en = 1'd1;
-      let0[done] = i0.done;
-    }
-    group let1<"static"=1> {
-      j0.in = const2.out;
-      j0.write_en = 1'd1;
-      let1[done] = j0.done;
-    }
-    group upd0<"static"=1> {
-      x7_read0_0.write_en = 1'd1;
-      x70_0_addr1 = j0.out;
-      x70_0_addr0 = i0.out;
-      x7_read0_0.in = 1'd1 ? x70_0_read_data;
-      upd0[done] = x7_read0_0.done ? 1'd1;
-    }
-    group upd1<"static"=1> {
-      fc3_bias_read0_0.write_en = 1'd1;
-      fc3_bias0_addr0 = j0.out;
-      fc3_bias_read0_0.in = 1'd1 ? fc3_bias0_read_data;
-      upd1[done] = fc3_bias_read0_0.done ? 1'd1;
-    }
-    group upd2<"static"=1> {
-      x80_0_addr1 = j0.out;
-      x80_0_addr0 = i0.out;
-      x80_0_write_en = 1'd1;
-      add0.left = x7_read0_0.out;
-      add0.right = fc3_bias_read0_0.out;
-      x80_0_write_data = 1'd1 ? add0.out;
-      upd2[done] = x80_0_done ? 1'd1;
-    }
-    group upd3<"static"=1> {
-      j0.write_en = 1'd1;
-      add1.left = j0.out;
-      add1.right = const4.out;
-      j0.in = 1'd1 ? add1.out;
-      upd3[done] = j0.done ? 1'd1;
-    }
-    group upd4<"static"=1> {
-      i0.write_en = 1'd1;
-      add2.left = i0.out;
-      add2.right = const5.out;
-      i0.in = 1'd1 ? add2.out;
-      upd4[done] = i0.done ? 1'd1;
-    }
-  }
-
-  control {
-    seq {
-      let0;
-      while le0.out with cond0 {
-        seq {
-          let1;
-          while le1.out with cond1 {
-            seq {
-              par {
-                upd0;
-                upd1;
-              }
-              upd2;
-              upd3;
-            }
-          }
-          upd4;
-        }
-      }
-    }
-  }
-}
-
-component dense2(go: 1, clk: 1, fc3_weight0_0_read_data: 32, fc3_weight0_0_done: 1, x60_0_read_data: 32, x60_0_done: 1, x70_0_read_data: 32, x70_0_done: 1) -> (done: 1, fc3_weight0_0_addr0: 4, fc3_weight0_0_addr1: 7, fc3_weight0_0_write_data: 32, fc3_weight0_0_write_en: 1, fc3_weight0_0_clk: 1, x60_0_addr0: 1, x60_0_addr1: 7, x60_0_write_data: 32, x60_0_write_en: 1, x60_0_clk: 1, x70_0_addr0: 1, x70_0_addr1: 4, x70_0_write_data: 32, x70_0_write_en: 1, x70_0_clk: 1) {
-  cells {
-    add0 = prim std_add(7);
-    add1 = prim std_add(4);
-    add2 = prim fixed_p_std_add(32, 16, 16);
-    add3 = prim std_add(7);
-    add4 = prim std_add(4);
-    add5 = prim std_add(1);
-    add6 = prim std_add(4);
-    add7 = prim std_add(1);
-    bin_read0_0 = prim std_reg(32);
-    const0 = prim std_const(4, 0);
-    const1 = prim std_const(4, 9);
-    const10 = prim std_const(7, 0);
-    const11 = prim std_const(7, 63);
-    const12 = prim std_const(7, 1);
-    const13 = prim std_const(4, 1);
-    const14 = prim std_const(1, 1);
-    const15 = prim std_const(1, 0);
-    const16 = prim std_const(1, 0);
-    const17 = prim std_const(4, 0);
-    const18 = prim std_const(4, 9);
-    const19 = prim std_const(4, 1);
-    const2 = prim std_const(7, 0);
-    const20 = prim std_const(1, 1);
-    const3 = prim std_const(7, 63);
-    const4 = prim std_const(7, 1);
-    const5 = prim std_const(4, 1);
-    const6 = prim std_const(1, 0);
-    const7 = prim std_const(1, 0);
-    const8 = prim std_const(4, 0);
-    const9 = prim std_const(4, 9);
-    fc3_weight_read0_0 = prim std_reg(32);
-    i0 = prim std_reg(4);
-    i1 = prim std_reg(1);
-    i2 = prim std_reg(1);
-    j0 = prim std_reg(7);
-    j1 = prim std_reg(4);
-    j2 = prim std_reg(4);
-    k0 = prim std_reg(7);
-    le0 = prim std_le(4);
-    le1 = prim std_le(7);
-    le2 = prim std_le(1);
-    le3 = prim std_le(4);
-    le4 = prim std_le(7);
-    le5 = prim std_le(1);
-    le6 = prim std_le(4);
-    mult_pipe0 = prim std_mult_pipe(32);
-    product_0 = prim std_reg(32);
-    slice0 = prim std_slice(32, 32);
-    slice1 = prim std_slice(32, 32);
-    temporary_x70_0 = prim std_mem_d2(32, 1, 10, 1, 4);
-    temporary_x7_read0_0 = prim std_reg(32);
-    transpose_fc3_weight0_0 = prim std_mem_d2(32, 64, 10, 7, 4);
-    transpose_fc3_weight_read0_0 = prim std_reg(32);
-    x6_read0_0 = prim std_reg(32);
-  }
-  wires {
-    group cond0<"static"=0> {
-      cond0[done] = 1'd1;
-      le0.left = i0.out;
-      le0.right = const1.out;
-    }
-    group cond1<"static"=0> {
-      cond1[done] = 1'd1;
-      le1.left = j0.out;
-      le1.right = const3.out;
-    }
-    group cond2<"static"=0> {
-      cond2[done] = 1'd1;
-      le2.left = i1.out;
-      le2.right = const7.out;
-    }
-    group cond3<"static"=0> {
-      cond3[done] = 1'd1;
-      le3.left = j1.out;
-      le3.right = const9.out;
-    }
-    group cond4<"static"=0> {
-      cond4[done] = 1'd1;
-      le4.left = k0.out;
-      le4.right = const11.out;
-    }
-    group cond5<"static"=0> {
-      cond5[done] = 1'd1;
-      le5.left = i2.out;
-      le5.right = const16.out;
-    }
-    group cond6<"static"=0> {
-      cond6[done] = 1'd1;
-      le6.left = j2.out;
-      le6.right = const18.out;
-    }
-    group let0<"static"=1> {
-      i0.in = const0.out;
-      i0.write_en = 1'd1;
-      let0[done] = i0.done;
-    }
-    group let1<"static"=1> {
-      j0.in = const2.out;
-      j0.write_en = 1'd1;
-      let1[done] = j0.done;
-    }
-    group let2<"static"=1> {
-      i1.in = const6.out;
-      i1.write_en = 1'd1;
-      let2[done] = i1.done;
-    }
-    group let3<"static"=1> {
-      j1.in = const8.out;
-      j1.write_en = 1'd1;
-      let3[done] = j1.done;
-    }
-    group let4<"static"=1> {
-      k0.in = const10.out;
-      k0.write_en = 1'd1;
-      let4[done] = k0.done;
-    }
-    group let5<"static"=1> {
-      bin_read0_0.in = slice0.out;
-      bin_read0_0.write_en = 1'd1;
-      let5[done] = bin_read0_0.done;
-      slice0.in = mult_pipe0.out;
-      mult_pipe0.left = x6_read0_0.out;
-      mult_pipe0.right = transpose_fc3_weight_read0_0.out;
-      mult_pipe0.go = !mult_pipe0.done ? 1'd1;
-    }
-    group let6<"static"=1> {
-      product_0.in = slice1.out;
-      product_0.write_en = 1'd1;
-      let6[done] = product_0.done;
-      slice1.in = bin_read0_0.out;
-    }
-    group let7<"static"=1> {
-      i2.in = const15.out;
-      i2.write_en = 1'd1;
-      let7[done] = i2.done;
-    }
-    group let8<"static"=1> {
-      j2.in = const17.out;
-      j2.write_en = 1'd1;
-      let8[done] = j2.done;
-    }
-    group upd0<"static"=1> {
-      fc3_weight_read0_0.write_en = 1'd1;
-      fc3_weight0_0_addr1 = j0.out;
-      fc3_weight0_0_addr0 = i0.out;
-      fc3_weight_read0_0.in = 1'd1 ? fc3_weight0_0_read_data;
-      upd0[done] = fc3_weight_read0_0.done ? 1'd1;
-    }
-    group upd1<"static"=1> {
-      transpose_fc3_weight0_0.addr1 = i0.out;
-      transpose_fc3_weight0_0.addr0 = j0.out;
-      transpose_fc3_weight0_0.write_en = 1'd1;
-      transpose_fc3_weight0_0.write_data = 1'd1 ? fc3_weight_read0_0.out;
-      upd1[done] = transpose_fc3_weight0_0.done ? 1'd1;
-    }
-    group upd10<"static"=1> {
-      temporary_x7_read0_0.write_en = 1'd1;
-      temporary_x70_0.addr1 = j2.out;
-      temporary_x70_0.addr0 = i2.out;
-      temporary_x7_read0_0.in = 1'd1 ? temporary_x70_0.read_data;
-      upd10[done] = temporary_x7_read0_0.done ? 1'd1;
-    }
-    group upd11<"static"=1> {
-      x70_0_addr1 = j2.out;
-      x70_0_addr0 = i2.out;
-      x70_0_write_en = 1'd1;
-      x70_0_write_data = 1'd1 ? temporary_x7_read0_0.out;
-      upd11[done] = x70_0_done ? 1'd1;
-    }
-    group upd12<"static"=1> {
-      j2.write_en = 1'd1;
-      add6.left = j2.out;
-      add6.right = const19.out;
-      j2.in = 1'd1 ? add6.out;
-      upd12[done] = j2.done ? 1'd1;
-    }
-    group upd13<"static"=1> {
-      i2.write_en = 1'd1;
-      add7.left = i2.out;
-      add7.right = const20.out;
-      i2.in = 1'd1 ? add7.out;
-      upd13[done] = i2.done ? 1'd1;
-    }
-    group upd2<"static"=1> {
-      j0.write_en = 1'd1;
-      add0.left = j0.out;
-      add0.right = const4.out;
-      j0.in = 1'd1 ? add0.out;
-      upd2[done] = j0.done ? 1'd1;
-    }
-    group upd3<"static"=1> {
-      i0.write_en = 1'd1;
-      add1.left = i0.out;
-      add1.right = const5.out;
-      i0.in = 1'd1 ? add1.out;
-      upd3[done] = i0.done ? 1'd1;
-    }
-    group upd4<"static"=1> {
-      x6_read0_0.write_en = 1'd1;
-      x60_0_addr1 = k0.out;
-      x60_0_addr0 = i1.out;
-      x6_read0_0.in = 1'd1 ? x60_0_read_data;
-      upd4[done] = x6_read0_0.done ? 1'd1;
-    }
-    group upd5<"static"=1> {
-      transpose_fc3_weight_read0_0.write_en = 1'd1;
-      transpose_fc3_weight0_0.addr1 = j1.out;
-      transpose_fc3_weight0_0.addr0 = k0.out;
-      transpose_fc3_weight_read0_0.in = 1'd1 ? transpose_fc3_weight0_0.read_data;
-      upd5[done] = transpose_fc3_weight_read0_0.done ? 1'd1;
-    }
-    group upd6<"static"=1> {
-      temporary_x70_0.addr1 = j1.out;
-      temporary_x70_0.addr0 = i1.out;
-      temporary_x70_0.write_en = 1'd1;
-      add2.left = temporary_x70_0.read_data;
-      add2.right = product_0.out;
-      temporary_x70_0.addr1 = j1.out;
-      temporary_x70_0.addr0 = i1.out;
-      temporary_x70_0.write_data = 1'd1 ? add2.out;
-      upd6[done] = temporary_x70_0.done ? 1'd1;
-    }
-    group upd7<"static"=1> {
-      k0.write_en = 1'd1;
-      add3.left = k0.out;
-      add3.right = const12.out;
-      k0.in = 1'd1 ? add3.out;
-      upd7[done] = k0.done ? 1'd1;
-    }
-    group upd8<"static"=1> {
-      j1.write_en = 1'd1;
-      add4.left = j1.out;
-      add4.right = const13.out;
-      j1.in = 1'd1 ? add4.out;
-      upd8[done] = j1.done ? 1'd1;
-    }
-    group upd9<"static"=1> {
-      i1.write_en = 1'd1;
-      add5.left = i1.out;
-      add5.right = const14.out;
-      i1.in = 1'd1 ? add5.out;
-      upd9[done] = i1.done ? 1'd1;
-    }
-  }
-
-  control {
-    seq {
-      let0;
-      while le0.out with cond0 {
-        seq {
-          let1;
-          while le1.out with cond1 {
-            seq {
-              upd0;
-              upd1;
-              upd2;
-            }
-          }
-          upd3;
-        }
-      }
-      let2;
-      while le2.out with cond2 {
-        seq {
-          let3;
-          while le3.out with cond3 {
-            seq {
-              let4;
-              while le4.out with cond4 {
-                seq {
-                  par {
-                    upd4;
-                    upd5;
-                  }
-                  let5;
-                  let6;
-                  upd6;
-                  upd7;
-                }
-              }
-              upd8;
-            }
-          }
-          upd9;
-        }
-      }
-      let7;
-      while le5.out with cond5 {
-        seq {
-          let8;
-          while le6.out with cond6 {
-            seq {
-              upd10;
-              upd11;
-              upd12;
-            }
-          }
-          upd13;
-        }
-      }
-    }
-  }
-}
-
-component relu1(go: 1, clk: 1, x50_0_read_data: 32, x50_0_done: 1, x60_0_read_data: 32, x60_0_done: 1) -> (done: 1, x50_0_addr0: 1, x50_0_addr1: 7, x50_0_write_data: 32, x50_0_write_en: 1, x50_0_clk: 1, x60_0_addr0: 1, x60_0_addr1: 7, x60_0_write_data: 32, x60_0_write_en: 1, x60_0_clk: 1) {
-  cells {
-    add0 = prim std_add(7);
-    add1 = prim std_add(1);
-    const0 = prim std_const(1, 0);
-    const1 = prim std_const(1, 0);
-    const2 = prim std_const(7, 0);
-    const3 = prim std_const(7, 63);
-    const4 = prim std_const(7, 1);
-    const5 = prim std_const(1, 1);
-    fpconst0 = prim fixed_p_std_const(32, 16, 16, 0, 0);
-    gt0 = prim fixed_p_std_gt(32, 16, 16);
-    i0 = prim std_reg(1);
-    j0 = prim std_reg(7);
-    le0 = prim std_le(1);
-    le1 = prim std_le(7);
-    x5_read0_0 = prim std_reg(32);
-    x5_read1_0 = prim std_reg(32);
-    zero_0 = prim std_reg(32);
-  }
-  wires {
-    group cond0<"static"=0> {
-      cond0[done] = 1'd1;
-      le0.left = i0.out;
-      le0.right = const1.out;
-    }
-    group cond1<"static"=0> {
-      cond1[done] = 1'd1;
-      le1.left = j0.out;
-      le1.right = const3.out;
-    }
-    group cond2<"static"=0> {
-      cond2[done] = 1'd1;
-      gt0.left = x5_read0_0.out;
-      gt0.right = zero_0.out;
-    }
-    group let0<"static"=1> {
-      zero_0.in = fpconst0.out;
-      zero_0.write_en = 1'd1;
-      let0[done] = zero_0.done;
-    }
-    group let1<"static"=1> {
-      i0.in = const0.out;
-      i0.write_en = 1'd1;
-      let1[done] = i0.done;
-    }
-    group let2<"static"=1> {
-      j0.in = const2.out;
-      j0.write_en = 1'd1;
-      let2[done] = j0.done;
-    }
-    group upd0<"static"=1> {
-      x5_read0_0.write_en = 1'd1;
-      x50_0_addr1 = j0.out;
-      x50_0_addr0 = i0.out;
-      x5_read0_0.in = 1'd1 ? x50_0_read_data;
-      upd0[done] = x5_read0_0.done ? 1'd1;
-    }
-    group upd1<"static"=1> {
-      x5_read1_0.write_en = 1'd1;
-      x50_0_addr1 = j0.out;
-      x50_0_addr0 = i0.out;
-      x5_read1_0.in = 1'd1 ? x50_0_read_data;
-      upd1[done] = x5_read1_0.done ? 1'd1;
-    }
-    group upd2<"static"=1> {
-      x60_0_addr1 = j0.out;
-      x60_0_addr0 = i0.out;
-      x60_0_write_en = 1'd1;
-      x60_0_write_data = 1'd1 ? x5_read1_0.out;
-      upd2[done] = x60_0_done ? 1'd1;
-    }
-    group upd3<"static"=1> {
-      x60_0_addr1 = j0.out;
-      x60_0_addr0 = i0.out;
-      x60_0_write_en = 1'd1;
-      x60_0_write_data = 1'd1 ? zero_0.out;
-      upd3[done] = x60_0_done ? 1'd1;
-    }
-    group upd4<"static"=1> {
-      j0.write_en = 1'd1;
-      add0.left = j0.out;
-      add0.right = const4.out;
-      j0.in = 1'd1 ? add0.out;
-      upd4[done] = j0.done ? 1'd1;
-    }
-    group upd5<"static"=1> {
-      i0.write_en = 1'd1;
-      add1.left = i0.out;
-      add1.right = const5.out;
-      i0.in = 1'd1 ? add1.out;
-      upd5[done] = i0.done ? 1'd1;
-    }
-  }
-
-  control {
-    seq {
-      let0;
-      let1;
-      while le0.out with cond0 {
-        seq {
-          let2;
-          while le1.out with cond1 {
-            seq {
-              upd0;
-              if gt0.out with cond2 {
-                seq {
-                  upd1;
-                  upd2;
-                }
-              } else {
-                upd3;
-              }
-              upd4;
-            }
-          }
-          upd5;
-        }
-      }
-    }
-  }
-}
-
-component bias_add1(go: 1, clk: 1, fc2_bias0_read_data: 32, fc2_bias0_done: 1, x40_0_read_data: 32, x40_0_done: 1, x50_0_read_data: 32, x50_0_done: 1) -> (done: 1, fc2_bias0_addr0: 7, fc2_bias0_write_data: 32, fc2_bias0_write_en: 1, fc2_bias0_clk: 1, x40_0_addr0: 1, x40_0_addr1: 7, x40_0_write_data: 32, x40_0_write_en: 1, x40_0_clk: 1, x50_0_addr0: 1, x50_0_addr1: 7, x50_0_write_data: 32, x50_0_write_en: 1, x50_0_clk: 1) {
-  cells {
-    add0 = prim fixed_p_std_add(32, 16, 16);
-    add1 = prim std_add(7);
-    add2 = prim std_add(1);
-    const0 = prim std_const(1, 0);
-    const1 = prim std_const(1, 0);
-    const2 = prim std_const(7, 0);
-    const3 = prim std_const(7, 63);
-    const4 = prim std_const(7, 1);
-    const5 = prim std_const(1, 1);
-    fc2_bias_read0_0 = prim std_reg(32);
-    i0 = prim std_reg(1);
-    j0 = prim std_reg(7);
-    le0 = prim std_le(1);
-    le1 = prim std_le(7);
-    x4_read0_0 = prim std_reg(32);
-  }
-  wires {
-    group cond0<"static"=0> {
-      cond0[done] = 1'd1;
-      le0.left = i0.out;
-      le0.right = const1.out;
-    }
-    group cond1<"static"=0> {
-      cond1[done] = 1'd1;
-      le1.left = j0.out;
-      le1.right = const3.out;
-    }
-    group let0<"static"=1> {
-      i0.in = const0.out;
-      i0.write_en = 1'd1;
-      let0[done] = i0.done;
-    }
-    group let1<"static"=1> {
-      j0.in = const2.out;
-      j0.write_en = 1'd1;
-      let1[done] = j0.done;
-    }
-    group upd0<"static"=1> {
-      x4_read0_0.write_en = 1'd1;
-      x40_0_addr1 = j0.out;
-      x40_0_addr0 = i0.out;
-      x4_read0_0.in = 1'd1 ? x40_0_read_data;
-      upd0[done] = x4_read0_0.done ? 1'd1;
-    }
-    group upd1<"static"=1> {
-      fc2_bias_read0_0.write_en = 1'd1;
-      fc2_bias0_addr0 = j0.out;
-      fc2_bias_read0_0.in = 1'd1 ? fc2_bias0_read_data;
-      upd1[done] = fc2_bias_read0_0.done ? 1'd1;
-    }
-    group upd2<"static"=1> {
-      x50_0_addr1 = j0.out;
-      x50_0_addr0 = i0.out;
-      x50_0_write_en = 1'd1;
-      add0.left = x4_read0_0.out;
-      add0.right = fc2_bias_read0_0.out;
-      x50_0_write_data = 1'd1 ? add0.out;
-      upd2[done] = x50_0_done ? 1'd1;
-    }
-    group upd3<"static"=1> {
-      j0.write_en = 1'd1;
-      add1.left = j0.out;
-      add1.right = const4.out;
-      j0.in = 1'd1 ? add1.out;
-      upd3[done] = j0.done ? 1'd1;
-    }
-    group upd4<"static"=1> {
-      i0.write_en = 1'd1;
-      add2.left = i0.out;
-      add2.right = const5.out;
-      i0.in = 1'd1 ? add2.out;
-      upd4[done] = i0.done ? 1'd1;
-    }
-  }
-
-  control {
-    seq {
-      let0;
-      while le0.out with cond0 {
-        seq {
-          let1;
-          while le1.out with cond1 {
-            seq {
-              par {
-                upd0;
-                upd1;
-              }
-              upd2;
-              upd3;
-            }
-          }
-          upd4;
-        }
-      }
-    }
-  }
-}
-
-component dense1(go: 1, clk: 1, fc2_weight0_0_read_data: 32, fc2_weight0_0_done: 1, x30_0_read_data: 32, x30_0_done: 1, x40_0_read_data: 32, x40_0_done: 1) -> (done: 1, fc2_weight0_0_addr0: 7, fc2_weight0_0_addr1: 8, fc2_weight0_0_write_data: 32, fc2_weight0_0_write_en: 1, fc2_weight0_0_clk: 1, x30_0_addr0: 1, x30_0_addr1: 8, x30_0_write_data: 32, x30_0_write_en: 1, x30_0_clk: 1, x40_0_addr0: 1, x40_0_addr1: 7, x40_0_write_data: 32, x40_0_write_en: 1, x40_0_clk: 1) {
-  cells {
-    add0 = prim std_add(8);
-    add1 = prim std_add(7);
-    add2 = prim fixed_p_std_add(32, 16, 16);
-    add3 = prim std_add(8);
-    add4 = prim std_add(7);
-    add5 = prim std_add(1);
-    add6 = prim std_add(7);
-    add7 = prim std_add(1);
-    bin_read0_0 = prim std_reg(32);
-    const0 = prim std_const(7, 0);
-    const1 = prim std_const(7, 63);
-    const10 = prim std_const(8, 0);
-    const11 = prim std_const(8, 127);
-    const12 = prim std_const(8, 1);
-    const13 = prim std_const(7, 1);
-    const14 = prim std_const(1, 1);
-    const15 = prim std_const(1, 0);
-    const16 = prim std_const(1, 0);
-    const17 = prim std_const(7, 0);
-    const18 = prim std_const(7, 63);
-    const19 = prim std_const(7, 1);
-    const2 = prim std_const(8, 0);
-    const20 = prim std_const(1, 1);
-    const3 = prim std_const(8, 127);
-    const4 = prim std_const(8, 1);
-    const5 = prim std_const(7, 1);
-    const6 = prim std_const(1, 0);
-    const7 = prim std_const(1, 0);
-    const8 = prim std_const(7, 0);
-    const9 = prim std_const(7, 63);
-    fc2_weight_read0_0 = prim std_reg(32);
-    i0 = prim std_reg(7);
-    i1 = prim std_reg(1);
-    i2 = prim std_reg(1);
-    j0 = prim std_reg(8);
-    j1 = prim std_reg(7);
-    j2 = prim std_reg(7);
-    k0 = prim std_reg(8);
-    le0 = prim std_le(7);
-    le1 = prim std_le(8);
-    le2 = prim std_le(1);
-    le3 = prim std_le(7);
-    le4 = prim std_le(8);
-    le5 = prim std_le(1);
-    le6 = prim std_le(7);
-    mult_pipe0 = prim std_mult_pipe(32);
-    product_0 = prim std_reg(32);
-    slice0 = prim std_slice(32, 32);
-    slice1 = prim std_slice(32, 32);
-    temporary_x40_0 = prim std_mem_d2(32, 1, 64, 1, 7);
-    temporary_x4_read0_0 = prim std_reg(32);
-    transpose_fc2_weight0_0 = prim std_mem_d2(32, 128, 64, 8, 7);
-    transpose_fc2_weight_read0_0 = prim std_reg(32);
-    x3_read0_0 = prim std_reg(32);
-  }
-  wires {
-    group cond0<"static"=0> {
-      cond0[done] = 1'd1;
-      le0.left = i0.out;
-      le0.right = const1.out;
-    }
-    group cond1<"static"=0> {
-      cond1[done] = 1'd1;
-      le1.left = j0.out;
-      le1.right = const3.out;
-    }
-    group cond2<"static"=0> {
-      cond2[done] = 1'd1;
-      le2.left = i1.out;
-      le2.right = const7.out;
-    }
-    group cond3<"static"=0> {
-      cond3[done] = 1'd1;
-      le3.left = j1.out;
-      le3.right = const9.out;
-    }
-    group cond4<"static"=0> {
-      cond4[done] = 1'd1;
-      le4.left = k0.out;
-      le4.right = const11.out;
-    }
-    group cond5<"static"=0> {
-      cond5[done] = 1'd1;
-      le5.left = i2.out;
-      le5.right = const16.out;
-    }
-    group cond6<"static"=0> {
-      cond6[done] = 1'd1;
-      le6.left = j2.out;
-      le6.right = const18.out;
-    }
-    group let0<"static"=1> {
-      i0.in = const0.out;
-      i0.write_en = 1'd1;
-      let0[done] = i0.done;
-    }
-    group let1<"static"=1> {
-      j0.in = const2.out;
-      j0.write_en = 1'd1;
-      let1[done] = j0.done;
-    }
-    group let2<"static"=1> {
-      i1.in = const6.out;
-      i1.write_en = 1'd1;
-      let2[done] = i1.done;
-    }
-    group let3<"static"=1> {
-      j1.in = const8.out;
-      j1.write_en = 1'd1;
-      let3[done] = j1.done;
-    }
-    group let4<"static"=1> {
-      k0.in = const10.out;
-      k0.write_en = 1'd1;
-      let4[done] = k0.done;
-    }
-    group let5<"static"=1> {
-      bin_read0_0.in = slice0.out;
-      bin_read0_0.write_en = 1'd1;
-      let5[done] = bin_read0_0.done;
-      slice0.in = mult_pipe0.out;
-      mult_pipe0.left = x3_read0_0.out;
-      mult_pipe0.right = transpose_fc2_weight_read0_0.out;
-      mult_pipe0.go = !mult_pipe0.done ? 1'd1;
-    }
-    group let6<"static"=1> {
-      product_0.in = slice1.out;
-      product_0.write_en = 1'd1;
-      let6[done] = product_0.done;
-      slice1.in = bin_read0_0.out;
-    }
-    group let7<"static"=1> {
-      i2.in = const15.out;
-      i2.write_en = 1'd1;
-      let7[done] = i2.done;
-    }
-    group let8<"static"=1> {
-      j2.in = const17.out;
-      j2.write_en = 1'd1;
-      let8[done] = j2.done;
-    }
-    group upd0<"static"=1> {
-      fc2_weight_read0_0.write_en = 1'd1;
-      fc2_weight0_0_addr1 = j0.out;
-      fc2_weight0_0_addr0 = i0.out;
-      fc2_weight_read0_0.in = 1'd1 ? fc2_weight0_0_read_data;
-      upd0[done] = fc2_weight_read0_0.done ? 1'd1;
-    }
-    group upd1<"static"=1> {
-      transpose_fc2_weight0_0.addr1 = i0.out;
-      transpose_fc2_weight0_0.addr0 = j0.out;
-      transpose_fc2_weight0_0.write_en = 1'd1;
-      transpose_fc2_weight0_0.write_data = 1'd1 ? fc2_weight_read0_0.out;
-      upd1[done] = transpose_fc2_weight0_0.done ? 1'd1;
-    }
-    group upd10<"static"=1> {
-      temporary_x4_read0_0.write_en = 1'd1;
-      temporary_x40_0.addr1 = j2.out;
-      temporary_x40_0.addr0 = i2.out;
-      temporary_x4_read0_0.in = 1'd1 ? temporary_x40_0.read_data;
-      upd10[done] = temporary_x4_read0_0.done ? 1'd1;
-    }
-    group upd11<"static"=1> {
-      x40_0_addr1 = j2.out;
-      x40_0_addr0 = i2.out;
-      x40_0_write_en = 1'd1;
-      x40_0_write_data = 1'd1 ? temporary_x4_read0_0.out;
-      upd11[done] = x40_0_done ? 1'd1;
-    }
-    group upd12<"static"=1> {
-      j2.write_en = 1'd1;
-      add6.left = j2.out;
-      add6.right = const19.out;
-      j2.in = 1'd1 ? add6.out;
-      upd12[done] = j2.done ? 1'd1;
-    }
-    group upd13<"static"=1> {
-      i2.write_en = 1'd1;
-      add7.left = i2.out;
-      add7.right = const20.out;
-      i2.in = 1'd1 ? add7.out;
-      upd13[done] = i2.done ? 1'd1;
-    }
-    group upd2<"static"=1> {
-      j0.write_en = 1'd1;
-      add0.left = j0.out;
-      add0.right = const4.out;
-      j0.in = 1'd1 ? add0.out;
-      upd2[done] = j0.done ? 1'd1;
-    }
-    group upd3<"static"=1> {
-      i0.write_en = 1'd1;
-      add1.left = i0.out;
-      add1.right = const5.out;
-      i0.in = 1'd1 ? add1.out;
-      upd3[done] = i0.done ? 1'd1;
-    }
-    group upd4<"static"=1> {
-      x3_read0_0.write_en = 1'd1;
-      x30_0_addr1 = k0.out;
-      x30_0_addr0 = i1.out;
-      x3_read0_0.in = 1'd1 ? x30_0_read_data;
-      upd4[done] = x3_read0_0.done ? 1'd1;
-    }
-    group upd5<"static"=1> {
-      transpose_fc2_weight_read0_0.write_en = 1'd1;
-      transpose_fc2_weight0_0.addr1 = j1.out;
-      transpose_fc2_weight0_0.addr0 = k0.out;
-      transpose_fc2_weight_read0_0.in = 1'd1 ? transpose_fc2_weight0_0.read_data;
-      upd5[done] = transpose_fc2_weight_read0_0.done ? 1'd1;
-    }
-    group upd6<"static"=1> {
-      temporary_x40_0.addr1 = j1.out;
-      temporary_x40_0.addr0 = i1.out;
-      temporary_x40_0.write_en = 1'd1;
-      add2.left = temporary_x40_0.read_data;
-      add2.right = product_0.out;
-      temporary_x40_0.addr1 = j1.out;
-      temporary_x40_0.addr0 = i1.out;
-      temporary_x40_0.write_data = 1'd1 ? add2.out;
-      upd6[done] = temporary_x40_0.done ? 1'd1;
-    }
-    group upd7<"static"=1> {
-      k0.write_en = 1'd1;
-      add3.left = k0.out;
-      add3.right = const12.out;
-      k0.in = 1'd1 ? add3.out;
-      upd7[done] = k0.done ? 1'd1;
-    }
-    group upd8<"static"=1> {
-      j1.write_en = 1'd1;
-      add4.left = j1.out;
-      add4.right = const13.out;
-      j1.in = 1'd1 ? add4.out;
-      upd8[done] = j1.done ? 1'd1;
-    }
-    group upd9<"static"=1> {
-      i1.write_en = 1'd1;
-      add5.left = i1.out;
-      add5.right = const14.out;
-      i1.in = 1'd1 ? add5.out;
-      upd9[done] = i1.done ? 1'd1;
-    }
-  }
-
-  control {
-    seq {
-      let0;
-      while le0.out with cond0 {
-        seq {
-          let1;
-          while le1.out with cond1 {
-            seq {
-              upd0;
-              upd1;
-              upd2;
-            }
-          }
-          upd3;
-        }
-      }
-      let2;
-      while le2.out with cond2 {
-        seq {
-          let3;
-          while le3.out with cond3 {
-            seq {
-              let4;
-              while le4.out with cond4 {
-                seq {
-                  par {
-                    upd4;
-                    upd5;
-                  }
-                  let5;
-                  let6;
-                  upd6;
-                  upd7;
-                }
-              }
-              upd8;
-            }
-          }
-          upd9;
-        }
-      }
-      let7;
-      while le5.out with cond5 {
-        seq {
-          let8;
-          while le6.out with cond6 {
-            seq {
-              upd10;
-              upd11;
-              upd12;
-            }
-          }
-          upd13;
-        }
-      }
-    }
-  }
-}
-
-component relu(go: 1, clk: 1, x20_0_read_data: 32, x20_0_done: 1, x30_0_read_data: 32, x30_0_done: 1) -> (done: 1, x20_0_addr0: 1, x20_0_addr1: 8, x20_0_write_data: 32, x20_0_write_en: 1, x20_0_clk: 1, x30_0_addr0: 1, x30_0_addr1: 8, x30_0_write_data: 32, x30_0_write_en: 1, x30_0_clk: 1) {
-  cells {
-    add0 = prim std_add(8);
-    add1 = prim std_add(1);
-    const0 = prim std_const(1, 0);
-    const1 = prim std_const(1, 0);
-    const2 = prim std_const(8, 0);
-    const3 = prim std_const(8, 127);
-    const4 = prim std_const(8, 1);
-    const5 = prim std_const(1, 1);
-    fpconst0 = prim fixed_p_std_const(32, 16, 16, 0, 0);
-    gt0 = prim fixed_p_std_gt(32, 16, 16);
-    i0 = prim std_reg(1);
-    j0 = prim std_reg(8);
-    le0 = prim std_le(1);
-    le1 = prim std_le(8);
-    x2_read0_0 = prim std_reg(32);
-    x2_read1_0 = prim std_reg(32);
-    zero_0 = prim std_reg(32);
-  }
-  wires {
-    group cond0<"static"=0> {
-      cond0[done] = 1'd1;
-      le0.left = i0.out;
-      le0.right = const1.out;
-    }
-    group cond1<"static"=0> {
-      cond1[done] = 1'd1;
-      le1.left = j0.out;
-      le1.right = const3.out;
-    }
-    group cond2<"static"=0> {
-      cond2[done] = 1'd1;
-      gt0.left = x2_read0_0.out;
-      gt0.right = zero_0.out;
-    }
-    group let0<"static"=1> {
-      zero_0.in = fpconst0.out;
-      zero_0.write_en = 1'd1;
-      let0[done] = zero_0.done;
-    }
-    group let1<"static"=1> {
-      i0.in = const0.out;
-      i0.write_en = 1'd1;
-      let1[done] = i0.done;
-    }
-    group let2<"static"=1> {
-      j0.in = const2.out;
-      j0.write_en = 1'd1;
-      let2[done] = j0.done;
-    }
-    group upd0<"static"=1> {
-      x2_read0_0.write_en = 1'd1;
-      x20_0_addr1 = j0.out;
-      x20_0_addr0 = i0.out;
-      x2_read0_0.in = 1'd1 ? x20_0_read_data;
-      upd0[done] = x2_read0_0.done ? 1'd1;
-    }
-    group upd1<"static"=1> {
-      x2_read1_0.write_en = 1'd1;
-      x20_0_addr1 = j0.out;
-      x20_0_addr0 = i0.out;
-      x2_read1_0.in = 1'd1 ? x20_0_read_data;
-      upd1[done] = x2_read1_0.done ? 1'd1;
-    }
-    group upd2<"static"=1> {
-      x30_0_addr1 = j0.out;
-      x30_0_addr0 = i0.out;
-      x30_0_write_en = 1'd1;
-      x30_0_write_data = 1'd1 ? x2_read1_0.out;
-      upd2[done] = x30_0_done ? 1'd1;
-    }
-    group upd3<"static"=1> {
-      x30_0_addr1 = j0.out;
-      x30_0_addr0 = i0.out;
-      x30_0_write_en = 1'd1;
-      x30_0_write_data = 1'd1 ? zero_0.out;
-      upd3[done] = x30_0_done ? 1'd1;
-    }
-    group upd4<"static"=1> {
-      j0.write_en = 1'd1;
-      add0.left = j0.out;
-      add0.right = const4.out;
-      j0.in = 1'd1 ? add0.out;
-      upd4[done] = j0.done ? 1'd1;
-    }
-    group upd5<"static"=1> {
-      i0.write_en = 1'd1;
-      add1.left = i0.out;
-      add1.right = const5.out;
-      i0.in = 1'd1 ? add1.out;
-      upd5[done] = i0.done ? 1'd1;
-    }
-  }
-
-  control {
-    seq {
-      let0;
-      let1;
-      while le0.out with cond0 {
-        seq {
-          let2;
-          while le1.out with cond1 {
-            seq {
-              upd0;
-              if gt0.out with cond2 {
-                seq {
-                  upd1;
-                  upd2;
-                }
-              } else {
-                upd3;
-              }
-              upd4;
-            }
-          }
-          upd5;
-        }
-      }
-    }
-  }
-}
-
-component bias_add(go: 1, clk: 1, fc1_bias0_read_data: 32, fc1_bias0_done: 1, x10_0_read_data: 32, x10_0_done: 1, x20_0_read_data: 32, x20_0_done: 1) -> (done: 1, fc1_bias0_addr0: 8, fc1_bias0_write_data: 32, fc1_bias0_write_en: 1, fc1_bias0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 8, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1, x20_0_addr0: 1, x20_0_addr1: 8, x20_0_write_data: 32, x20_0_write_en: 1, x20_0_clk: 1) {
-  cells {
-    add0 = prim fixed_p_std_add(32, 16, 16);
-    add1 = prim std_add(8);
-    add2 = prim std_add(1);
-    const0 = prim std_const(1, 0);
-    const1 = prim std_const(1, 0);
-    const2 = prim std_const(8, 0);
-    const3 = prim std_const(8, 127);
-    const4 = prim std_const(8, 1);
-    const5 = prim std_const(1, 1);
-    fc1_bias_read0_0 = prim std_reg(32);
-    i0 = prim std_reg(1);
-    j0 = prim std_reg(8);
-    le0 = prim std_le(1);
-    le1 = prim std_le(8);
-    x1_read0_0 = prim std_reg(32);
-  }
-  wires {
-    group cond0<"static"=0> {
-      cond0[done] = 1'd1;
-      le0.left = i0.out;
-      le0.right = const1.out;
-    }
-    group cond1<"static"=0> {
-      cond1[done] = 1'd1;
-      le1.left = j0.out;
-      le1.right = const3.out;
-    }
-    group let0<"static"=1> {
-      i0.in = const0.out;
-      i0.write_en = 1'd1;
-      let0[done] = i0.done;
-    }
-    group let1<"static"=1> {
-      j0.in = const2.out;
-      j0.write_en = 1'd1;
-      let1[done] = j0.done;
-    }
-    group upd0<"static"=1> {
-      x1_read0_0.write_en = 1'd1;
-      x10_0_addr1 = j0.out;
-      x10_0_addr0 = i0.out;
-      x1_read0_0.in = 1'd1 ? x10_0_read_data;
-      upd0[done] = x1_read0_0.done ? 1'd1;
-    }
-    group upd1<"static"=1> {
-      fc1_bias_read0_0.write_en = 1'd1;
-      fc1_bias0_addr0 = j0.out;
-      fc1_bias_read0_0.in = 1'd1 ? fc1_bias0_read_data;
-      upd1[done] = fc1_bias_read0_0.done ? 1'd1;
-    }
-    group upd2<"static"=1> {
-      x20_0_addr1 = j0.out;
-      x20_0_addr0 = i0.out;
-      x20_0_write_en = 1'd1;
-      add0.left = x1_read0_0.out;
-      add0.right = fc1_bias_read0_0.out;
-      x20_0_write_data = 1'd1 ? add0.out;
-      upd2[done] = x20_0_done ? 1'd1;
-    }
-    group upd3<"static"=1> {
-      j0.write_en = 1'd1;
-      add1.left = j0.out;
-      add1.right = const4.out;
-      j0.in = 1'd1 ? add1.out;
-      upd3[done] = j0.done ? 1'd1;
-    }
-    group upd4<"static"=1> {
-      i0.write_en = 1'd1;
-      add2.left = i0.out;
-      add2.right = const5.out;
-      i0.in = 1'd1 ? add2.out;
-      upd4[done] = i0.done ? 1'd1;
-    }
-  }
-
-  control {
-    seq {
-      let0;
-      while le0.out with cond0 {
-        seq {
-          let1;
-          while le1.out with cond1 {
-            seq {
-              par {
-                upd0;
-                upd1;
-              }
-              upd2;
-              upd3;
-            }
-          }
-          upd4;
-        }
-      }
-    }
-  }
-}
-
-component dense(go: 1, clk: 1, fc1_weight0_0_read_data: 32, fc1_weight0_0_done: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1) -> (done: 1, fc1_weight0_0_addr0: 8, fc1_weight0_0_addr1: 10, fc1_weight0_0_write_data: 32, fc1_weight0_0_write_en: 1, fc1_weight0_0_clk: 1, x0_0_addr0: 1, x0_0_addr1: 10, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 8, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1) {
-  cells {
-    add0 = prim std_add(10);
-    add1 = prim std_add(8);
-    add2 = prim fixed_p_std_add(32, 16, 16);
-    add3 = prim std_add(10);
-    add4 = prim std_add(8);
-    add5 = prim std_add(1);
-    add6 = prim std_add(8);
-    add7 = prim std_add(1);
-    bin_read0_0 = prim std_reg(32);
-    const0 = prim std_const(8, 0);
-    const1 = prim std_const(8, 127);
-    const10 = prim std_const(10, 0);
-    const11 = prim std_const(10, 783);
-    const12 = prim std_const(10, 1);
-    const13 = prim std_const(8, 1);
-    const14 = prim std_const(1, 1);
-    const15 = prim std_const(1, 0);
-    const16 = prim std_const(1, 0);
-    const17 = prim std_const(8, 0);
-    const18 = prim std_const(8, 127);
-    const19 = prim std_const(8, 1);
-    const2 = prim std_const(10, 0);
-    const20 = prim std_const(1, 1);
-    const3 = prim std_const(10, 783);
-    const4 = prim std_const(10, 1);
-    const5 = prim std_const(8, 1);
-    const6 = prim std_const(1, 0);
-    const7 = prim std_const(1, 0);
-    const8 = prim std_const(8, 0);
-    const9 = prim std_const(8, 127);
-    fc1_weight_read0_0 = prim std_reg(32);
-    i0 = prim std_reg(8);
-    i1 = prim std_reg(1);
-    i2 = prim std_reg(1);
-    j0 = prim std_reg(10);
-    j1 = prim std_reg(8);
-    j2 = prim std_reg(8);
-    k0 = prim std_reg(10);
-    le0 = prim std_le(8);
-    le1 = prim std_le(10);
-    le2 = prim std_le(1);
-    le3 = prim std_le(8);
-    le4 = prim std_le(10);
-    le5 = prim std_le(1);
-    le6 = prim std_le(8);
-    mult_pipe0 = prim std_mult_pipe(32);
-    product_0 = prim std_reg(32);
-    slice0 = prim std_slice(32, 32);
-    slice1 = prim std_slice(32, 32);
-    temporary_x10_0 = prim std_mem_d2(32, 1, 128, 1, 8);
-    temporary_x1_read0_0 = prim std_reg(32);
-    transpose_fc1_weight0_0 = prim std_mem_d2(32, 784, 128, 10, 8);
-    transpose_fc1_weight_read0_0 = prim std_reg(32);
-    x_read0_0 = prim std_reg(32);
-  }
-  wires {
-    group cond0<"static"=0> {
-      cond0[done] = 1'd1;
-      le0.left = i0.out;
-      le0.right = const1.out;
-    }
-    group cond1<"static"=0> {
-      cond1[done] = 1'd1;
-      le1.left = j0.out;
-      le1.right = const3.out;
-    }
-    group cond2<"static"=0> {
-      cond2[done] = 1'd1;
-      le2.left = i1.out;
-      le2.right = const7.out;
-    }
-    group cond3<"static"=0> {
-      cond3[done] = 1'd1;
-      le3.left = j1.out;
-      le3.right = const9.out;
-    }
-    group cond4<"static"=0> {
-      cond4[done] = 1'd1;
-      le4.left = k0.out;
-      le4.right = const11.out;
-    }
-    group cond5<"static"=0> {
-      cond5[done] = 1'd1;
-      le5.left = i2.out;
-      le5.right = const16.out;
-    }
-    group cond6<"static"=0> {
-      cond6[done] = 1'd1;
-      le6.left = j2.out;
-      le6.right = const18.out;
-    }
-    group let0<"static"=1> {
-      i0.in = const0.out;
-      i0.write_en = 1'd1;
-      let0[done] = i0.done;
-    }
-    group let1<"static"=1> {
-      j0.in = const2.out;
-      j0.write_en = 1'd1;
-      let1[done] = j0.done;
-    }
-    group let2<"static"=1> {
-      i1.in = const6.out;
-      i1.write_en = 1'd1;
-      let2[done] = i1.done;
-    }
-    group let3<"static"=1> {
-      j1.in = const8.out;
-      j1.write_en = 1'd1;
-      let3[done] = j1.done;
-    }
-    group let4<"static"=1> {
-      k0.in = const10.out;
-      k0.write_en = 1'd1;
-      let4[done] = k0.done;
-    }
-    group let5<"static"=1> {
-      bin_read0_0.in = slice0.out;
-      bin_read0_0.write_en = 1'd1;
-      let5[done] = bin_read0_0.done;
-      slice0.in = mult_pipe0.out;
-      mult_pipe0.left = x_read0_0.out;
-      mult_pipe0.right = transpose_fc1_weight_read0_0.out;
-      mult_pipe0.go = !mult_pipe0.done ? 1'd1;
-    }
-    group let6<"static"=1> {
-      product_0.in = slice1.out;
-      product_0.write_en = 1'd1;
-      let6[done] = product_0.done;
-      slice1.in = bin_read0_0.out;
-    }
-    group let7<"static"=1> {
-      i2.in = const15.out;
-      i2.write_en = 1'd1;
-      let7[done] = i2.done;
-    }
-    group let8<"static"=1> {
-      j2.in = const17.out;
-      j2.write_en = 1'd1;
-      let8[done] = j2.done;
-    }
-    group upd0<"static"=1> {
-      fc1_weight_read0_0.write_en = 1'd1;
-      fc1_weight0_0_addr1 = j0.out;
-      fc1_weight0_0_addr0 = i0.out;
-      fc1_weight_read0_0.in = 1'd1 ? fc1_weight0_0_read_data;
-      upd0[done] = fc1_weight_read0_0.done ? 1'd1;
-    }
-    group upd1<"static"=1> {
-      transpose_fc1_weight0_0.addr1 = i0.out;
-      transpose_fc1_weight0_0.addr0 = j0.out;
-      transpose_fc1_weight0_0.write_en = 1'd1;
-      transpose_fc1_weight0_0.write_data = 1'd1 ? fc1_weight_read0_0.out;
-      upd1[done] = transpose_fc1_weight0_0.done ? 1'd1;
-    }
-    group upd10<"static"=1> {
-      temporary_x1_read0_0.write_en = 1'd1;
-      temporary_x10_0.addr1 = j2.out;
-      temporary_x10_0.addr0 = i2.out;
-      temporary_x1_read0_0.in = 1'd1 ? temporary_x10_0.read_data;
-      upd10[done] = temporary_x1_read0_0.done ? 1'd1;
-    }
-    group upd11<"static"=1> {
-      x10_0_addr1 = j2.out;
-      x10_0_addr0 = i2.out;
-      x10_0_write_en = 1'd1;
-      x10_0_write_data = 1'd1 ? temporary_x1_read0_0.out;
-      upd11[done] = x10_0_done ? 1'd1;
-    }
-    group upd12<"static"=1> {
-      j2.write_en = 1'd1;
-      add6.left = j2.out;
-      add6.right = const19.out;
-      j2.in = 1'd1 ? add6.out;
-      upd12[done] = j2.done ? 1'd1;
-    }
-    group upd13<"static"=1> {
-      i2.write_en = 1'd1;
-      add7.left = i2.out;
-      add7.right = const20.out;
-      i2.in = 1'd1 ? add7.out;
-      upd13[done] = i2.done ? 1'd1;
-    }
-    group upd2<"static"=1> {
-      j0.write_en = 1'd1;
-      add0.left = j0.out;
-      add0.right = const4.out;
-      j0.in = 1'd1 ? add0.out;
-      upd2[done] = j0.done ? 1'd1;
-    }
-    group upd3<"static"=1> {
-      i0.write_en = 1'd1;
-      add1.left = i0.out;
-      add1.right = const5.out;
-      i0.in = 1'd1 ? add1.out;
-      upd3[done] = i0.done ? 1'd1;
-    }
-    group upd4<"static"=1> {
-      x_read0_0.write_en = 1'd1;
-      x0_0_addr1 = k0.out;
-      x0_0_addr0 = i1.out;
-      x_read0_0.in = 1'd1 ? x0_0_read_data;
-      upd4[done] = x_read0_0.done ? 1'd1;
-    }
-    group upd5<"static"=1> {
-      transpose_fc1_weight_read0_0.write_en = 1'd1;
-      transpose_fc1_weight0_0.addr1 = j1.out;
-      transpose_fc1_weight0_0.addr0 = k0.out;
-      transpose_fc1_weight_read0_0.in = 1'd1 ? transpose_fc1_weight0_0.read_data;
-      upd5[done] = transpose_fc1_weight_read0_0.done ? 1'd1;
-    }
-    group upd6<"static"=1> {
-      temporary_x10_0.addr1 = j1.out;
-      temporary_x10_0.addr0 = i1.out;
-      temporary_x10_0.write_en = 1'd1;
-      add2.left = temporary_x10_0.read_data;
-      add2.right = product_0.out;
-      temporary_x10_0.addr1 = j1.out;
-      temporary_x10_0.addr0 = i1.out;
-      temporary_x10_0.write_data = 1'd1 ? add2.out;
-      upd6[done] = temporary_x10_0.done ? 1'd1;
-    }
-    group upd7<"static"=1> {
-      k0.write_en = 1'd1;
-      add3.left = k0.out;
-      add3.right = const12.out;
-      k0.in = 1'd1 ? add3.out;
-      upd7[done] = k0.done ? 1'd1;
-    }
-    group upd8<"static"=1> {
-      j1.write_en = 1'd1;
-      add4.left = j1.out;
-      add4.right = const13.out;
-      j1.in = 1'd1 ? add4.out;
-      upd8[done] = j1.done ? 1'd1;
-    }
-    group upd9<"static"=1> {
-      i1.write_en = 1'd1;
-      add5.left = i1.out;
-      add5.right = const14.out;
-      i1.in = 1'd1 ? add5.out;
-      upd9[done] = i1.done ? 1'd1;
-    }
-  }
-
-  control {
-    seq {
-      let0;
-      while le0.out with cond0 {
-        seq {
-          let1;
-          while le1.out with cond1 {
-            seq {
-              upd0;
-              upd1;
-              upd2;
-            }
-          }
-          upd3;
-        }
-      }
-      let2;
-      while le2.out with cond2 {
-        seq {
-          let3;
-          while le3.out with cond3 {
-            seq {
-              let4;
-              while le4.out with cond4 {
-                seq {
-                  par {
-                    upd4;
-                    upd5;
-                  }
-                  let5;
-                  let6;
-                  upd6;
-                  upd7;
-                }
-              }
-              upd8;
-            }
-          }
-          upd9;
-        }
-      }
-      let7;
-      while le5.out with cond5 {
-        seq {
-          let8;
-          while le6.out with cond6 {
-            seq {
-              upd10;
-              upd11;
-              upd12;
-            }
-          }
-          upd13;
-        }
-      }
-    }
-  }
-}
-
-component batch_flatten(go: 1, clk: 1, data0_0_0_0_read_data: 32, data0_0_0_0_done: 1, x0_0_read_data: 32, x0_0_done: 1) -> (done: 1, data0_0_0_0_addr0: 1, data0_0_0_0_addr1: 1, data0_0_0_0_addr2: 5, data0_0_0_0_addr3: 5, data0_0_0_0_write_data: 32, data0_0_0_0_write_en: 1, data0_0_0_0_clk: 1, x0_0_addr0: 1, x0_0_addr1: 10, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1) {
-  cells {
-    add0 = prim std_add(10);
-    add1 = prim std_add(5);
-    add2 = prim std_add(5);
-    add3 = prim std_add(1);
-    add4 = prim std_add(1);
-    const0 = prim std_const(10, 0);
-    const1 = prim std_const(1, 0);
-    const10 = prim std_const(5, 1);
-    const11 = prim std_const(5, 1);
-    const12 = prim std_const(1, 1);
-    const13 = prim std_const(1, 1);
-    const2 = prim std_const(1, 0);
-    const3 = prim std_const(1, 0);
-    const4 = prim std_const(1, 0);
-    const5 = prim std_const(5, 0);
-    const6 = prim std_const(5, 27);
-    const7 = prim std_const(5, 0);
-    const8 = prim std_const(5, 27);
-    const9 = prim std_const(10, 1);
-    data_read0_0 = prim std_reg(32);
-    i0 = prim std_reg(1);
-    j0 = prim std_reg(1);
-    k0 = prim std_reg(5);
-    l0 = prim std_reg(5);
-    le0 = prim std_le(1);
-    le1 = prim std_le(1);
-    le2 = prim std_le(5);
-    le3 = prim std_le(5);
-    m_0 = prim std_reg(10);
-  }
-  wires {
-    group cond0<"static"=0> {
-      cond0[done] = 1'd1;
-      le0.left = i0.out;
-      le0.right = const2.out;
-    }
-    group cond1<"static"=0> {
-      cond1[done] = 1'd1;
-      le1.left = j0.out;
-      le1.right = const4.out;
-    }
-    group cond2<"static"=0> {
-      cond2[done] = 1'd1;
-      le2.left = k0.out;
-      le2.right = const6.out;
-    }
-    group cond3<"static"=0> {
-      cond3[done] = 1'd1;
-      le3.left = l0.out;
-      le3.right = const8.out;
-    }
-    group let0<"static"=1> {
-      m_0.in = const0.out;
-      m_0.write_en = 1'd1;
-      let0[done] = m_0.done;
-    }
-    group let1<"static"=1> {
-      i0.in = const1.out;
-      i0.write_en = 1'd1;
-      let1[done] = i0.done;
-    }
-    group let2<"static"=1> {
-      j0.in = const3.out;
-      j0.write_en = 1'd1;
-      let2[done] = j0.done;
-    }
-    group let3<"static"=1> {
-      k0.in = const5.out;
-      k0.write_en = 1'd1;
-      let3[done] = k0.done;
-    }
-    group let4<"static"=1> {
-      l0.in = const7.out;
-      l0.write_en = 1'd1;
-      let4[done] = l0.done;
-    }
-    group upd0<"static"=1> {
-      data_read0_0.write_en = 1'd1;
-      data0_0_0_0_addr3 = l0.out;
-      data0_0_0_0_addr2 = k0.out;
-      data0_0_0_0_addr1 = j0.out;
-      data0_0_0_0_addr0 = i0.out;
-      data_read0_0.in = 1'd1 ? data0_0_0_0_read_data;
-      upd0[done] = data_read0_0.done ? 1'd1;
-    }
-    group upd1<"static"=1> {
-      x0_0_addr1 = m_0.out;
-      x0_0_addr0 = i0.out;
-      x0_0_write_en = 1'd1;
-      x0_0_write_data = 1'd1 ? data_read0_0.out;
-      upd1[done] = x0_0_done ? 1'd1;
-    }
-    group upd2<"static"=1> {
-      m_0.write_en = 1'd1;
-      add0.left = m_0.out;
-      add0.right = const9.out;
-      m_0.in = 1'd1 ? add0.out;
-      upd2[done] = m_0.done ? 1'd1;
-    }
-    group upd3<"static"=1> {
-      l0.write_en = 1'd1;
-      add1.left = l0.out;
-      add1.right = const10.out;
-      l0.in = 1'd1 ? add1.out;
-      upd3[done] = l0.done ? 1'd1;
-    }
-    group upd4<"static"=1> {
-      k0.write_en = 1'd1;
-      add2.left = k0.out;
-      add2.right = const11.out;
-      k0.in = 1'd1 ? add2.out;
-      upd4[done] = k0.done ? 1'd1;
-    }
-    group upd5<"static"=1> {
-      j0.write_en = 1'd1;
-      add3.left = j0.out;
-      add3.right = const12.out;
-      j0.in = 1'd1 ? add3.out;
-      upd5[done] = j0.done ? 1'd1;
-    }
-    group upd6<"static"=1> {
-      i0.write_en = 1'd1;
-      add4.left = i0.out;
-      add4.right = const13.out;
-      i0.in = 1'd1 ? add4.out;
-      upd6[done] = i0.done ? 1'd1;
-    }
-  }
-
-  control {
-    seq {
-      let0;
-      let1;
-      while le0.out with cond0 {
-        seq {
-          let2;
-          while le1.out with cond1 {
-            seq {
-              let3;
-              while le2.out with cond2 {
-                seq {
-                  let4;
-                  while le3.out with cond3 {
-                    seq {
-                      upd0;
-                      upd1;
-                      upd2;
-                      upd3;
-                    }
-                  }
-                  upd4;
-                }
-              }
-              upd5;
-            }
-          }
-          upd6;
-        }
-      }
-    }
-  }
-}
-
-component main () -> () {
-  cells {
-    x9 = prim std_mem_d2(32, 1, 10, 1, 4);
-    x8 = prim std_mem_d2(32, 1, 10, 1, 4);
-    softmax0 = softmax;
-    x7 = prim std_mem_d2(32, 1, 10, 1, 4);
-    fc3_bias = prim std_mem_d1(32, 10, 4);
-    bias_add2 = bias_add2;
-    x6 = prim std_mem_d2(32, 1, 64, 1, 7);
-    fc3_weight = prim std_mem_d2(32, 10, 64, 4, 7);
-    dense2 = dense2;
-    x5 = prim std_mem_d2(32, 1, 64, 1, 7);
-    relu1 = relu1;
-    x4 = prim std_mem_d2(32, 1, 64, 1, 7);
-    fc2_bias = prim std_mem_d1(32, 64, 7);
-    bias_add1 = bias_add1;
-    x3 = prim std_mem_d2(32, 1, 128, 1, 8);
-    fc2_weight = prim std_mem_d2(32, 64, 128, 7, 8);
-    dense1 = dense1;
-    x2 = prim std_mem_d2(32, 1, 128, 1, 8);
-    relu0 = relu;
-    x1 = prim std_mem_d2(32, 1, 128, 1, 8);
-    fc1_bias = prim std_mem_d1(32, 128, 8);
-    bias_add0 = bias_add;
-    x = prim std_mem_d2(32, 1, 784, 1, 10);
-    fc1_weight = prim std_mem_d2(32, 128, 784, 8, 10);
-    dense0 = dense;
-    data = prim std_mem_d4(32, 1, 1, 28, 28, 1, 1, 5, 5);
-    batch_flatten0 = batch_flatten;
-  }
-  wires {
-    group run_batch_flatten {
-      data.addr0 = batch_flatten0.data0_0_0_0_addr0;
-      batch_flatten0.data0_0_0_0_read_data = data.read_data;
-      data.addr1 = batch_flatten0.data0_0_0_0_addr1;
-      data.addr2 = batch_flatten0.data0_0_0_0_addr2;
-      data.addr3 = batch_flatten0.data0_0_0_0_addr3;
-      x.addr0 = batch_flatten0.x0_0_addr0;
-      x.addr1 = batch_flatten0.x0_0_addr1;
-      x.write_data = batch_flatten0.x0_0_write_data;
-      x.write_en = batch_flatten0.x0_0_write_en;
-      batch_flatten0.x0_0_done = x.done;
-      batch_flatten0.go = 1'd1;
-      run_batch_flatten[done] = batch_flatten0.done ? 1'd1;
-    }
-    group run_dense {
-      x.addr0 = dense0.x0_0_addr0;
-      dense0.x0_0_read_data = x.read_data;
-      x.addr1 = dense0.x0_0_addr1;
-      fc1_weight.addr0 = dense0.fc1_weight0_0_addr0;
-      dense0.fc1_weight0_0_read_data = fc1_weight.read_data;
-      fc1_weight.addr1 = dense0.fc1_weight0_0_addr1;
-      x1.addr0 = dense0.x10_0_addr0;
-      x1.addr1 = dense0.x10_0_addr1;
-      x1.write_data = dense0.x10_0_write_data;
-      x1.write_en = dense0.x10_0_write_en;
-      dense0.x10_0_done = x1.done;
-      dense0.go = 1'd1;
-      run_dense[done] = dense0.done ? 1'd1;
-    }
-    group run_bias_add {
-      x1.addr0 = bias_add0.x10_0_addr0;
-      bias_add0.x10_0_read_data = x1.read_data;
-      x1.addr1 = bias_add0.x10_0_addr1;
-      fc1_bias.addr0 = bias_add0.fc1_bias0_addr0;
-      bias_add0.fc1_bias0_read_data = fc1_bias.read_data;
-      x2.addr0 = bias_add0.x20_0_addr0;
-      x2.addr1 = bias_add0.x20_0_addr1;
-      x2.write_data = bias_add0.x20_0_write_data;
-      x2.write_en = bias_add0.x20_0_write_en;
-      bias_add0.x20_0_done = x2.done;
-      bias_add0.go = 1'd1;
-      run_bias_add[done] = bias_add0.done ? 1'd1;
-    }
-    group run_relu {
-      x2.addr0 = relu0.x20_0_addr0;
-      relu0.x20_0_read_data = x2.read_data;
-      x2.addr1 = relu0.x20_0_addr1;
-      x3.addr0 = relu0.x30_0_addr0;
-      x3.addr1 = relu0.x30_0_addr1;
-      x3.write_data = relu0.x30_0_write_data;
-      x3.write_en = relu0.x30_0_write_en;
-      relu0.x30_0_done = x3.done;
-      relu0.go = 1'd1;
-      run_relu[done] = relu0.done ? 1'd1;
-    }
-    group run_dense1 {
-      x3.addr0 = dense1.x30_0_addr0;
-      dense1.x30_0_read_data = x3.read_data;
-      x3.addr1 = dense1.x30_0_addr1;
-      fc2_weight.addr0 = dense1.fc2_weight0_0_addr0;
-      dense1.fc2_weight0_0_read_data = fc2_weight.read_data;
-      fc2_weight.addr1 = dense1.fc2_weight0_0_addr1;
-      x4.addr0 = dense1.x40_0_addr0;
-      x4.addr1 = dense1.x40_0_addr1;
-      x4.write_data = dense1.x40_0_write_data;
-      x4.write_en = dense1.x40_0_write_en;
-      dense1.x40_0_done = x4.done;
-      dense1.go = 1'd1;
-      run_dense1[done] = dense1.done ? 1'd1;
-    }
-    group run_bias_add1 {
-      x4.addr0 = bias_add1.x40_0_addr0;
-      bias_add1.x40_0_read_data = x4.read_data;
-      x4.addr1 = bias_add1.x40_0_addr1;
-      fc2_bias.addr0 = bias_add1.fc2_bias0_addr0;
-      bias_add1.fc2_bias0_read_data = fc2_bias.read_data;
-      x5.addr0 = bias_add1.x50_0_addr0;
-      x5.addr1 = bias_add1.x50_0_addr1;
-      x5.write_data = bias_add1.x50_0_write_data;
-      x5.write_en = bias_add1.x50_0_write_en;
-      bias_add1.x50_0_done = x5.done;
-      bias_add1.go = 1'd1;
-      run_bias_add1[done] = bias_add1.done ? 1'd1;
-    }
-    group run_relu1 {
-      x5.addr0 = relu1.x50_0_addr0;
-      relu1.x50_0_read_data = x5.read_data;
-      x5.addr1 = relu1.x50_0_addr1;
-      x6.addr0 = relu1.x60_0_addr0;
-      x6.addr1 = relu1.x60_0_addr1;
-      x6.write_data = relu1.x60_0_write_data;
-      x6.write_en = relu1.x60_0_write_en;
-      relu1.x60_0_done = x6.done;
-      relu1.go = 1'd1;
-      run_relu1[done] = relu1.done ? 1'd1;
-    }
-    group run_dense2 {
-      x6.addr0 = dense2.x60_0_addr0;
-      dense2.x60_0_read_data = x6.read_data;
-      x6.addr1 = dense2.x60_0_addr1;
-      fc3_weight.addr0 = dense2.fc3_weight0_0_addr0;
-      dense2.fc3_weight0_0_read_data = fc3_weight.read_data;
-      fc3_weight.addr1 = dense2.fc3_weight0_0_addr1;
-      x7.addr0 = dense2.x70_0_addr0;
-      x7.addr1 = dense2.x70_0_addr1;
-      x7.write_data = dense2.x70_0_write_data;
-      x7.write_en = dense2.x70_0_write_en;
-      dense2.x70_0_done = x7.done;
-      dense2.go = 1'd1;
-      run_dense2[done] = dense2.done ? 1'd1;
-    }
-    group run_bias_add2 {
-      x7.addr0 = bias_add2.x70_0_addr0;
-      bias_add2.x70_0_read_data = x7.read_data;
-      x7.addr1 = bias_add2.x70_0_addr1;
-      fc3_bias.addr0 = bias_add2.fc3_bias0_addr0;
-      bias_add2.fc3_bias0_read_data = fc3_bias.read_data;
-      x8.addr0 = bias_add2.x80_0_addr0;
-      x8.addr1 = bias_add2.x80_0_addr1;
-      x8.write_data = bias_add2.x80_0_write_data;
-      x8.write_en = bias_add2.x80_0_write_en;
-      bias_add2.x80_0_done = x8.done;
-      bias_add2.go = 1'd1;
-      run_bias_add2[done] = bias_add2.done ? 1'd1;
-    }
-    group run_softmax {
-      x8.addr0 = softmax0.x80_0_addr0;
-      softmax0.x80_0_read_data = x8.read_data;
-      x8.addr1 = softmax0.x80_0_addr1;
-      x9.addr0 = softmax0.x90_0_addr0;
-      x9.addr1 = softmax0.x90_0_addr1;
-      x9.write_data = softmax0.x90_0_write_data;
-      x9.write_en = softmax0.x90_0_write_en;
-      softmax0.x90_0_done = x9.done;
-      softmax0.go = 1'd1;
-      run_softmax[done] = softmax0.done ? 1'd1;
-    }
-  }
-  control {
-    seq {
-      run_batch_flatten;
-      run_dense;
-      run_bias_add;
-      run_relu;
-      run_dense1;
-      run_bias_add1;
-      run_relu1;
-      run_dense2;
-      run_bias_add2;
-      run_softmax;
-    }
-  }
-}
diff --git a/frontends/relay-futil/tests/mlp_net.relay b/frontends/relay-futil/tests/mlp_net.relay
deleted file mode 100644
index 4368b51016..0000000000
--- a/frontends/relay-futil/tests/mlp_net.relay
+++ /dev/null
@@ -1,16 +0,0 @@
-v0.0.4
-fn (%data: Tensor[(1, 1, 28, 28), float32], %fc1_weight: Tensor[(128, 784), float32], %fc1_bias: Tensor[(128), float32],
-    %fc2_weight: Tensor[(64, 128), float32], %fc2_bias: Tensor[(64), float32], %fc3_weight: Tensor[(10, 64), float32],
-    %fc3_bias: Tensor[(10), float32]) -> Tensor[(1, 10), float32] {
-  let %x: Tensor[(1, 784), float32] = nn.batch_flatten(%data) /* ty=Tensor[(1, 784), float32] */;
-  let %x1: Tensor[(1, 128), float32] = nn.dense(%x, %fc1_weight, units=128) /* ty=Tensor[(1, 128), float32] */;
-  let %x2: Tensor[(1, 128), float32] = nn.bias_add(%x1, %fc1_bias, axis=-1) /* ty=Tensor[(1, 128), float32] */;
-  let %x3: Tensor[(1, 128), float32] = nn.relu(%x2) /* ty=Tensor[(1, 128), float32] */;
-  let %x4: Tensor[(1, 64), float32] = nn.dense(%x3, %fc2_weight, units=64) /* ty=Tensor[(1, 64), float32] */;
-  let %x5: Tensor[(1, 64), float32] = nn.bias_add(%x4, %fc2_bias, axis=-1) /* ty=Tensor[(1, 64), float32] */;
-  let %x6: Tensor[(1, 64), float32] = nn.relu(%x5) /* ty=Tensor[(1, 64), float32] */;
-  let %x7: Tensor[(1, 10), float32] = nn.dense(%x6, %fc3_weight, units=10) /* ty=Tensor[(1, 10), float32] */;
-  let %x8: Tensor[(1, 10), float32] = nn.bias_add(%x7, %fc3_bias, axis=-1) /* ty=Tensor[(1, 10), float32] */;
-  let %x9: Tensor[(1, 10), float32] = nn.softmax(%x8) /* ty=Tensor[(1, 10), float32] */;
-  %x9
-}

From 89cda694dd377dcd267af08d36f76a250a555dd9 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Mon, 7 Dec 2020 22:02:13 -0500
Subject: [PATCH 71/75] Cleanup.

---
 frontends/relay-futil/compiler.py         |  82 +++++------------
 frontends/relay-futil/dahlia_functions.py |  69 +++++++++++----
 frontends/relay-futil/futil_ast.py        |  59 ++++---------
 frontends/relay-futil/pretty_print.py     |  80 +++++++++++------
 frontends/relay-futil/utilities.py        | 103 +++++++++++-----------
 5 files changed, 197 insertions(+), 196 deletions(-)

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index 4b8b5c22e8..4dd0de566e 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -9,19 +9,6 @@
 from futil_ast import *
 from dahlia_functions import *
 
-# Mapping from Relay binary calls to the respective Dahlia operator.
-BuiltInBinaryOps = {'add': '+', 'divide': '/', 'multiply': '*', 'subtract': '-'}
-
-# Mapping from Relay function names to their respective Dahlia lowering.
-RelayFunctionCalls = {'nn.dense': dense, 'nn.batch_flatten': batch_flatten, 'nn.batch_matmul': batch_matmul,
-                      'nn.bias_add': bias_add, 'nn.relu': relu, 'nn.softmax': softmax, 'nn.max_pool2d': max_pool2d,
-                      'nn.conv2d': conv2d, 'negative': negative, 'expand_dims': expand_dims, 'sqrt': sqrt}
-
-# Mapping between primitive type and associated Dahlia name extension.
-# E.g. A 2D memory primitive named `A` will be lowered to `A0_0`.
-DahliaNameExtension = {PrimitiveType.Memory1D: '0', PrimitiveType.Memory2D: '0_0',
-                       PrimitiveType.Memory3D: '0_0_0', PrimitiveType.Memory4D: '0_0_0_0'}
-
 
 class Relay2Futil(ExprFunctor):
     """The main compilation visitor."""
@@ -30,17 +17,16 @@ def __init__(self):
         super(Relay2Futil, self).__init__()
         self.id_dictionary = defaultdict(int)
         self.relay_id_dictionary = defaultdict(int)
-        self.dahlia_components = []
-        self.main = FComponent(name="main", cells=[], wires=[])
+        self.main = FComponent(name="main")
 
     def id(self, name):
         """
         Provides a unique identification for a given name.
         For example, if 'a' is seen three times, it will produce: 'a0', 'a1', 'a2'.
         """
-        id_number = self.id_dictionary[name]
+        id_number = str(self.id_dictionary[name])
         self.id_dictionary[name] += 1
-        return name + str(id_number)
+        return ''.join((name, id_number))
 
     def relay_id(self, name):
         """
@@ -54,7 +40,7 @@ def relay_id(self, name):
         id_number = self.relay_id_dictionary[name]
         self.relay_id_dictionary[name] += 1
         if id_number == 0: return name
-        return name + str(id_number)
+        return ''.join((name, str(id_number)))
 
     def dahlia_name(self, name, type):
         """
@@ -64,69 +50,47 @@ def dahlia_name(self, name, type):
         Memory3D: `X0_0_0`, `Y0_0_0`
         """
         assert type in DahliaNameExtension, f'{name} with {type} is not supported yet.'
-        return name + DahliaNameExtension[type]
+        return ''.join((name, DahliaNameExtension[type]))
 
-    def get_dahlia_declaration(self, function_name, cells, args, attrs):
-        """
-        Returns the corresponding name, Dahlia function type, and op (if it is a binary op, otherwise None).
-        If the function type isn't supported, fails with an assertion.
-        """
-        input_type = cells[0].primitive.type
-        function = name = op = None
-        if function_name in BuiltInBinaryOps:
-            op = BuiltInBinaryOps[function_name]
-            function, name = broadcast, function_name
-        elif function_name in RelayFunctionCalls:
-            function = RelayFunctionCalls[function_name]
-            name = function.__name__
-        else:
-            assert False, f'{function_name} is not supported for lowering to FuTIL.'
-        return DahliaDeclaration(component_name=self.relay_id(name), decl_name=self.id(name),
-                                 op=op, inputs=args, attributes=attrs, function=function)
-
-    def visit_var(self, var):
+    def visit_var(self, var) -> FCell:
         name = self.relay_id(var.name_hint)
-        # Do not add duplicate primitives to main.
-        if self.main.contains_primitive(name): return cell
+        if name in self.main.cells: return cell
         data, type, data_type = get_memory_parameters(var.type_annotation)
-        dahlia_name = self.dahlia_name(name, type)
-        return FCell(dahlia_name=dahlia_name,
+        return FCell(dahlia_name=self.dahlia_name(name, type),
                      primitive=FPrimitive(name=name, data=data, data_type=data_type, type=type))
 
     def visit_let(self, let):
         values, output = self.visit(let.value), self.visit(let.var)
         if isinstance(values, list):
             for value in values:
-                if not value.is_dahlia_declaration(): continue
-                value.dahlia_declaration.output = output
-                value.dahlia_declaration.invoke()
+                if value.is_relay_function(): value.relay_function.output = output
         return [self.visit(let.body), values]
 
-    def visit_constant(self, const):
+    def visit_constant(self, const) -> FCell:
         # Note: We're currently treating constants defined in a `let` statement in Relay IR as 1D Memory.
-        type, shape = const.data.dtype, const.data.shape
-        name, data = self.id("const"), [get_bitwidth(type), int(const.data.asnumpy())]
-        data_type = get_memory_parameters(type)
-        return FCell(primitive=FPrimitive(name=name, data=data, data_type=data_type, type=PrimitiveType.Constant))
+        # type, shape = const.data.dtype, const.data.shape
+        pass
 
-    def visit_call(self, call):
+    def visit_call(self, call) -> List[FCell]:
         attributes = call.attrs
         cells, args = [], []
         for arg in call.args:
             argument = self.visit(arg)
             cells.append(argument)
             args.append(argument)
-        cells.append(FCell(dahlia_declaration=self.get_dahlia_declaration(call.op.name, cells, args, call.attrs)))
+        # We are representing all function calls in Relay IR at the Dahlia level, which will then be lowered to FuTIL.
+        # Note, the Relay function's output is not defined until the `let` statement is visited.
+        function, name, op = GetRelayFunctionCall(call.op.name)
+        relay_function_call = RelayFunctionCall(component_name=self.relay_id(name), name=self.id(name), op=op,
+                                                inputs=args, attributes=call.attrs, lowering_function=function)
+        cells.append(FCell(relay_function=relay_function_call))
         return cells
 
     def visit_function(self, function):
         body = self.visit(function.body)
-        for cell in flatten(body):
-            self.main.add_cell(cell)
-            if not cell.is_dahlia_declaration(): continue
-            self.dahlia_components.append(cell.dahlia_declaration.program)
+        for cell in flatten(body): self.main.add_cell(cell)
         build_main_controls(self.main)
-        return pp_component(self.main)
+        return pp_lowered_relay_function(self.main)
 
 
 def relay_transforms(expr: Function) -> Function:
@@ -149,9 +113,7 @@ def lower_to_futil(program) -> str:
 
     PREAMBLE = """import "primitives/std.lib";\n"""
     MAIN = visitor.visit(program)
-    DAHLIA_COMPONENTS = '\n'.join(visitor.dahlia_components)
-    NEWL = '\n'
-    return f'{PREAMBLE}{NEWL}{DAHLIA_COMPONENTS}{NEWL}{MAIN}'
+    return '\n'.join((PREAMBLE, MAIN))
 
 
 if __name__ == '__main__':
diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
index 1822c45c4f..a3af6206e6 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_functions.py
@@ -7,11 +7,11 @@
 
 IMPORT_STATEMENT = """import "primitives/std.lib";\n"""
 NO_ERR = "2>/dev/null"
-CHARACTER_I = chr(ord('i'))
 NEWL = '\n'
+CHARACTER_I = chr(ord('i'))  # Starting index variable name for Dahlia array iteration.
 
 
-def lower_dahlia_program(prog, component_name):
+def LowerDahliaProgramToFuTIL(program, component_name):
     """
     Takes in a string representation of a Dahlia program, lowers it to FuTIL with the given `component_name`,
     and applies the `externalize` pass. This pass exposes the inputs and outputs of primitive types that are
@@ -35,7 +35,7 @@ def lower_dahlia_program(prog, component_name):
            ...
         }
     """
-    program_string = '\n'.join(prog.splitlines())
+    program_string = '\n'.join(program.splitlines())
     with NamedTemporaryFile() as tf0, NamedTemporaryFile() as tf1, NamedTemporaryFile() as tf2:
         tf0.write(bytes(program_string, 'UTF-8'))
         tf0.seek(0), tf1.seek(0), tf2.seek(0)
@@ -48,6 +48,10 @@ def lower_dahlia_program(prog, component_name):
         return component
 
 
+####################################################################################################
+################################ Dahlia Implementations ############################################
+####################################################################################################
+
 def broadcast(declaration):
     """
     https://numpy.org/doc/stable/user/basics.broadcasting.html
@@ -110,7 +114,7 @@ def broadcast(declaration):
     program_body = pp_dahlia_loop(res, loop_body)
     declarations = pp_dahlia_memory_declarations([res, op1, op2])
     program = f"""{declarations}{NEWL}{program_body}"""
-    return lower_dahlia_program(program, declaration.component_name)
+    return LowerDahliaProgramToFuTIL(program, declaration.component_name)
 
 
 def batch_flatten(declaration):
@@ -134,7 +138,7 @@ def batch_flatten(declaration):
     body = f"{res.name}{res_indices} := {data.name}{data_indices}; {variable_name} := {variable_name} + 1;"
     program_body = pp_dahlia_loop(data, body)
     program = f"""{declarations}{NEWL}{let_flattened}{NEWL}{program_body}"""
-    return lower_dahlia_program(program, declaration.component_name)
+    return LowerDahliaProgramToFuTIL(program, declaration.component_name)
 
 
 def bias_add(declaration):
@@ -158,7 +162,7 @@ def bias_add(declaration):
     declarations = pp_dahlia_memory_declarations([data, bias, res])
     body = (f"{res.name}{data_indices} := {data.name}{data_indices} + {bias.name}{bias_index};")
     program_body = pp_dahlia_loop(data, body)
-    return lower_dahlia_program(f"""{declarations}{NEWL}{program_body}""", declaration.component_name)
+    return LowerDahliaProgramToFuTIL(f"""{declarations}{NEWL}{program_body}""", declaration.component_name)
 
 
 # TODO(cgyurgyik):
@@ -184,7 +188,8 @@ def relu(declaration):
     body = f"""if ({data.name}{indices} > zero) {{ {res.name}{indices} := {data.name}{indices}; }} 
         else {{ {res.name}{indices} := zero; }}"""
     program_body = pp_dahlia_loop(data, body)
-    return lower_dahlia_program(f"""{declarations}{NEWL}{let_zero}{NEWL}{program_body}""", declaration.component_name)
+    return LowerDahliaProgramToFuTIL(f"""{declarations}{NEWL}{let_zero}{NEWL}{program_body}""",
+                                     declaration.component_name)
 
 
 # TODO(cgyurgyik): Similar to ReLU, this requires signed operands.
@@ -203,12 +208,11 @@ def negative(declaration):
     declarations = pp_dahlia_memory_declarations([op, res])
     zero = '0.0' if data_type == 'ufix' or data_type == 'fix' else '0'
     program_body = pp_dahlia_loop(op, f"""{res.name}{indices} := {zero} - {op.name}{indices};""")
-    return lower_dahlia_program(f"""{declarations}{NEWL}{program_body}""", declaration.component_name)
+    return LowerDahliaProgramToFuTIL(f"""{declarations}{NEWL}{program_body}""", declaration.component_name)
 
 
-# TODO(cgyurgyik): Similar to ReLU, this requires signed operands.
 def sqrt(declaration):
-    """https://tvm.apache.org/docs/api/python/relay/index.html#tvm.relay.negative"""
+    """https://tvm.apache.org/docs/api/python/relay/index.html#tvm.relay.sqrt"""
     op, res = declaration.inputs[0].primitive, declaration.output.primitive
     bitwidth, num_dimensions, data_type = op.data[0], op.type, op.data_type
     include_sqrt = f"""import "fxp_sqrt.h" {{ def sqrt(value: {data_type}<{bitwidth}>): {data_type}<{bitwidth}>; }}"""
@@ -222,8 +226,8 @@ def sqrt(declaration):
 
     declarations = pp_dahlia_memory_declarations([op, res])
     program_body = pp_dahlia_loop(op, f"""{res.name}{indices} := sqrt({op.name}{indices});""")
-    return lower_dahlia_program(f"""{include_sqrt}{NEWL}{declarations}{NEWL}{program_body}""",
-                                declaration.component_name)
+    return LowerDahliaProgramToFuTIL(f"""{include_sqrt}{NEWL}{declarations}{NEWL}{program_body}""",
+                                     declaration.component_name)
 
 
 def expand_dims(declaration):
@@ -246,8 +250,7 @@ def expand_dims(declaration):
         variable_name = next_character(variable_name)
 
     program_body = pp_dahlia_loop(data, f'{res.name}{res_indices} := {data.name}{data_indices}')
-    program = f"""{declarations}{NEWL}{program_body}"""
-    return lower_dahlia_program(program, declaration.component_name)
+    return LowerDahliaProgramToFuTIL(f"""{declarations}{NEWL}{program_body}""", declaration.component_name)
 
 
 def batch_matmul(declaration):
@@ -294,7 +297,7 @@ def batch_matmul(declaration):
       }}
     }} 
     """
-    return lower_dahlia_program(program, declaration.component_name)
+    return LowerDahliaProgramToFuTIL(program, declaration.component_name)
 
 
 # TODO(cgyurgyik): Similar to batch_matmul, this requires a temporary memory to store the output
@@ -331,7 +334,7 @@ def dense(declaration):
       }}
     }}
     """
-    return lower_dahlia_program(program, declaration.component_name)
+    return LowerDahliaProgramToFuTIL(program, declaration.component_name)
 
 
 # TODO(cgyurgyik): Currently, only supports a small subset (namely those used in our VGG net and MLP net examples).
@@ -362,7 +365,8 @@ def softmax(declaration):
     }}
     """
     program = f"""{import_exp}{NEWL}{declarations}{body}"""
-    return lower_dahlia_program(program, declaration.component_name)
+
+    return LowerDahliaProgramToFuTIL(program, declaration.component_name)
 
 
 def max_pool2d(declaration):
@@ -403,7 +407,7 @@ def max_pool2d(declaration):
     }} 
     """
     program = f"""{declarations}{NEWL}{program_body}"""
-    return lower_dahlia_program(program, declaration.component_name)
+    return LowerDahliaProgramToFuTIL(program, declaration.component_name)
 
 
 # Only supports a small subset of the `conv2d` function. For example,
@@ -443,4 +447,31 @@ def conv2d(declaration):
     }} 
     """
     program = f"""{declarations}{NEWL}{program_body}"""
-    return lower_dahlia_program(program, declaration.component_name)
+    return LowerDahliaProgramToFuTIL(program, declaration.component_name)
+
+
+# Mapping from Relay function names to their respective Dahlia lowering.
+RelayFunctionCalls = {'nn.dense': dense, 'nn.batch_flatten': batch_flatten, 'nn.batch_matmul': batch_matmul,
+                      'nn.bias_add': bias_add, 'nn.relu': relu, 'nn.softmax': softmax, 'nn.max_pool2d': max_pool2d,
+                      'nn.conv2d': conv2d, 'negative': negative, 'expand_dims': expand_dims, 'sqrt': sqrt}
+
+# Mapping from Relay binary calls to the respective Dahlia operator.
+BuiltInBinaryOps = {'add': '+', 'divide': '/', 'multiply': '*', 'subtract': '-'}
+
+
+def GetRelayFunctionCall(function_name) -> RelayFunctionCall:
+    """
+    Returns the corresponding function, name, and op (if it is a binary op, otherwise None).
+    If the function isn't supported, fails with an assertion.
+    """
+    function = name = op = None
+    assert function_name in BuiltInBinaryOps or function_name in RelayFunctionCalls, \
+        f'{function_name} is not supported for lowering from Relay IR to FuTIL.'
+    if function_name in BuiltInBinaryOps:
+        op = BuiltInBinaryOps[function_name]
+        function = broadcast
+        name = function_name
+    else:
+        function = RelayFunctionCalls[function_name]
+        name = function.__name__
+    return function, name, op
diff --git a/frontends/relay-futil/futil_ast.py b/frontends/relay-futil/futil_ast.py
index 03f2e8fa0e..e1194a826c 100644
--- a/frontends/relay-futil/futil_ast.py
+++ b/frontends/relay-futil/futil_ast.py
@@ -4,6 +4,7 @@
 from types import FunctionType
 from enum import Enum, IntEnum
 
+
 # Note: The integer value N for Memory with dimension N is used; these should remain unchanged.
 class PrimitiveType(IntEnum):
     Memory1D = 1
@@ -110,65 +111,43 @@ class FComponent:
     Represents a FuTIL component.
     '''
     name: str
-    cells: List[Cell]  # Instantiated sub-components.
-    wires: List[FConnection]  # Wire connections between components.
+    wires = []  # Wire connections between components.
+    cells = {}  # Instantiated sub-components. Maps a cell's name (primitive or Relay function name) to its FCell.
     controls: FControl = None  # Control statement for this component.
     signature: FSignature = None  # Input and output ports.
 
-    def contains_primitive(self, name: str):
-        '''
-        Determines whether this component contains a primitive with the given name.
-        '''
-        # TODO(cgyurgyik): Rethink data structure here.
-        for cell in self.cells:
-            if not cell.is_primitive(): continue
-            if cell.primitive.name == name: return True
-        return False
-
     def add_cell(self, subcomponent: Cell):
         '''
         Appends a subcomponent to this component's list of FuTIL cells.
         '''
-        if not subcomponent.is_primitive():
-            self.cells.append(subcomponent)
-            return
-        if self.contains_primitive(subcomponent.primitive.name): return
-        self.cells.append(subcomponent)
+        if subcomponent == None: return
+        if subcomponent.is_primitive():
+            self.cells[subcomponent.primitive.name] = subcomponent
+        elif subcomponent.is_relay_function():
+            self.cells[subcomponent.relay_function.name] = subcomponent
 
 
 @dataclass
-class DahliaDeclaration:
-    decl_name: str
+class RelayFunctionCall:
+    """
+    Represents a Relay function call. This will eventually be translated to Dahlia and subsequently lowered to FuTIL.
+    """
+    name: str
     component_name: str
-    op: str = None
+    op: str = None  # Binary operation associated with the Relay function call, if it exists.
+    attributes: tvm.ir.Attrs = None  # Attributes associated with the Relay function call, e.g. `axis`, `padding`.
+    lowering_function: FunctionType = None  # The function used to convert the Dahlia representation to FuTIL.
     inputs: List[Cell] = None
     output: Cell = None
-    attributes: tvm.ir.Attrs = None
-    function: FunctionType = None
-    program: str = None
-
-    def invoke(self):
-        self.program = self.function(self)
-
-
-@dataclass
-class FDeclaration:
-    '''
-    Represents a FuTIL declaration.
-    '''
-    name: str
-    component: FComponent = None
 
 
 @dataclass
 class FCell(Cell):
     dahlia_name: str = None
     primitive: FPrimitive = None
-    declaration: FDeclaration = None
-    dahlia_declaration: DahliaDeclaration = None
+    relay_function: RelayFunctionCall = None
 
+    # TODO(cgyurgyik): Is there a better way to do this, such as std::variant in C++?
     def is_primitive(self): return self.primitive != None
 
-    def is_declaration(self): return self.declaration != None
-
-    def is_dahlia_declaration(self): return self.dahlia_declaration != None
+    def is_relay_function(self): return self.relay_function != None
diff --git a/frontends/relay-futil/pretty_print.py b/frontends/relay-futil/pretty_print.py
index 7a57e6e6b7..fa3935f6e4 100644
--- a/frontends/relay-futil/pretty_print.py
+++ b/frontends/relay-futil/pretty_print.py
@@ -2,14 +2,14 @@
 import textwrap
 
 
-def mk_block(decl, contents, indent=2):
+def pp_block(decl, contents, indent=2):
     """Format a block like this:
         decl {
           contents
         }
     where `decl` is one line but contents can be multiple lines.
     """
-    return decl + ' {\n' + textwrap.indent(contents, indent * ' ') + '\n}'
+    return ''.join((decl, ' {\n', textwrap.indent(contents, indent * ' '), '\n}'))
 
 
 def pp_component_signature(component: FComponent):
@@ -39,7 +39,7 @@ def pp_connections(component: FConnection):
             wires = []
             for wire in connection.group.wires:
                 wires.append(pp_wire(wire))
-            connections.append(mk_block(f'group {connection.group.name}', '\n'.join(wires)))
+            connections.append(pp_block(f'group {connection.group.name}', '\n'.join(wires)))
     return connections
 
 
@@ -49,24 +49,57 @@ def pp_control(component: FComponent):
         groups = []
         for group_name in control.stmts:
             groups.append(f'{group_name};')
-        ctrls.append(mk_block(control.name, '\n'.join(groups)))
+        ctrls.append(pp_block(control.name, '\n'.join(groups)))
     return ctrls
 
 
-def pp_component(component: FComponent):
+def pp_lowered_dahlia_components(component: FComponent):
+    relay_functions = []
+    for cell in component.cells.values():
+        if cell == None or not cell.is_relay_function(): continue
+        relay_call = cell.relay_function
+        relay_functions.append(relay_call.lowering_function(relay_call))
+    return '\n'.join(relay_functions)
+
+
+def pp_lowered_relay_function(component: FComponent):
+    """
+    Pretty prints the main program. This consists of the following:
+    1. Relay functions lowered from Dahlia -> FuTIL.
+    2. The `main` component.
+
+    Example:
+    ------------------------------------
+    Input
+    ```
+      fn (%x: int32, %y: int32) { let %z = add(%x, %y); %z }
+    ```
+    ------------------------------------
+    Output
+    ```
+      component add(...) -> (...) { ... }
+
+      component main() -> () {
+        ...
+        control { run_add; }
+      }
+    ```
+    """
+    relay_function_components = pp_lowered_dahlia_components(component)
+
     subcomponents = []
-    for cell in component.cells:
-        if cell == None:
-            continue
+    for cell in component.cells.values():
+        if cell == None: continue
         subcomponents.append(pp_cell(cell))
-    cells = mk_block("cells", '\n'.join(subcomponents))
+    cells = pp_block("cells", '\n'.join(subcomponents))
     inputs, outputs = pp_component_signature(component)
-    wires = mk_block("wires", '\n'.join(pp_connections(component)))
-
-    controls = "" if component.controls == None else '\n'.join(pp_control(component))
-    control = mk_block("control", controls)
+    wires = pp_block("wires", '\n'.join(pp_connections(component)))
 
-    return mk_block(f'component {component.name} ({inputs}) -> ({outputs})', '\n'.join([cells, wires, control]))
+    controls = '\n'.join(pp_control(component))
+    control = pp_block("control", controls)
+    main_component = pp_block(f'component {component.name} ({inputs}) -> ({outputs})',
+                              '\n'.join([cells, wires, control]))
+    return '\n'.join((relay_function_components, main_component))
 
 
 def pp_cell(cell: FCell):
@@ -100,11 +133,8 @@ def pp_cell(cell: FCell):
         if cell.primitive.type == PrimitiveType.BinOp:
             op = data[1]
             return f'{cell.primitive.name} = prim std_{op}({bitwidth});'
-        assert False, f'FCell pretty print unimplemented for {cell} with name {cell.primitive.name}'
-    elif cell.is_declaration():
-        return f'{cell.declaration.name} = {cell.declaration.component.name};'
-    elif cell.is_dahlia_declaration():
-        return f'{cell.dahlia_declaration.decl_name} = {cell.dahlia_declaration.component_name};'
+    if cell.is_relay_function(): return f'{cell.relay_function.name} = {cell.relay_function.component_name};'
+    assert False, f'FCell pretty print unimplemented for {cell} with name {cell.primitive.name}'
 
 
 # Dahlia Pretty Printing.
@@ -112,17 +142,17 @@ def pp_cell(cell: FCell):
 def next_character(ch, dir=1):
     """
     Returns the next character after 'ch'.
-    If dir is positive, then will return 'ch' + 1. Otherwise, it will return 'ch' - 1.
+    If `dir` is positive, then will return 'ch' + 1. Otherwise, it will return 'ch' - 1.
     """
-    return chr(ord(ch) + dir) if dir > 0 else chr(ord(ch) - 1)
+    return chr(ord(ch) + 1) if dir > 0 else chr(ord(ch) - 1)
 
 
 def pp_dahlia_memory_declarations(declaration_list):
     declarations = []
-    for decl in declaration_list:
-        decl_string = f'decl {decl.name}: {decl.data_type}<{decl.data[0]}>'
-        for i in range(0, decl.type): decl_string += f'[{decl.data[i + 1]}]'
-        declarations.append(f'{decl_string};')
+    for declaration in declaration_list:
+        string = f'decl {declaration.name}: {declaration.data_type}<{declaration.data[0]}>'
+        for i in range(0, declaration.type): string += f'[{declaration.data[i + 1]}]'
+        declarations.append(string + ";")
     return '\n'.join(declarations)
 
 
diff --git a/frontends/relay-futil/utilities.py b/frontends/relay-futil/utilities.py
index fc171f60e9..69cc8c4e0a 100644
--- a/frontends/relay-futil/utilities.py
+++ b/frontends/relay-futil/utilities.py
@@ -6,6 +6,11 @@
 NumDimensionsToPrimitive = {1: PrimitiveType.Memory1D, 2: PrimitiveType.Memory2D,
                             3: PrimitiveType.Memory3D, 4: PrimitiveType.Memory4D}
 
+# Mapping between primitive type and associated Dahlia name extension.
+# E.g. A 2D memory primitive named `A` will be lowered to `A0_0`.
+DahliaNameExtension = {PrimitiveType.Memory1D: '0', PrimitiveType.Memory2D: '0_0',
+                       PrimitiveType.Memory3D: '0_0_0', PrimitiveType.Memory4D: '0_0_0_0'}
+
 
 def flatten(l):
     '''
@@ -26,8 +31,9 @@ def get_dahlia_data_type(relay_type):
     Gets the Dahlia data type from the given Relay type.
     NOTE: Currently, Dahlia does not support signed types for arrays.
     '''
-    if 'int' in relay_type: return 'ubit'
-    if 'float' in relay_type: return 'ufix'
+    dtype = relay_type.dtype
+    if 'int' in dtype: return 'ubit'
+    if 'float' in dtype: return 'ufix'
     assert False, f'{relay_type} is not supported.'
 
 
@@ -37,12 +43,12 @@ def get_bitwidth(relay_type):
     If the relay_type is floating point of size N, returns a fixed point of size <N, N/2>.
     This lowers to a fixed point cell with `int_width` of size N/2, and a `fract_width` of size N/2.
     '''
-    type = str(relay_type)
-    length = len(type)
-    if 'int' in type: return type[3:length]
-    if 'float' in type:
-        width = int(type[5:length])
-        return f'{width}, {int(width / 2)}'
+    dtype = relay_type.dtype
+    length = len(dtype)
+    if 'int' in dtype: return dtype[3:length]
+    if 'float' in dtype:
+        width = dtype[5:length]
+        return f'{width}, {int(width) // 2}'
     assert False, f'{relay_type} is not supported.'
 
 
@@ -55,16 +61,16 @@ def get_memory_parameters(type):
 
     We then parse this to determine the corresponding FuTIL and Dahlia types.
     '''
-    t = str(type)
-    data_type = get_dahlia_data_type(t)
-    if t[0:3] == 'int' or t[0:5] == 'float':
+    typ = str(type)
+    data_type = get_dahlia_data_type(type)
+
+    if typ[0:3] == 'int' or typ[0:5] == 'float':
+        # Currently, we are treating scalar values as 1D Memory primitives.
         return [get_bitwidth(type), 1, 1], PrimitiveType.Memory1D, data_type
-    assert t[0:6] == 'Tensor', f'{type} is not currently supported.'
-    string_type = t[t.find(")") + 3:t.find("]")]
-    string_dimensions = t[t.find("(") + 1:t.find(")")]
+    assert typ[0:6] == 'Tensor', f'{type} is not currently supported.'
 
-    tensor_dimensions = list(map(int, string_dimensions.split(',')))
-    data, num_dimensions = [get_bitwidth(string_type)], len(tensor_dimensions)
+    tensor_dimensions = type.concrete_shape
+    data, num_dimensions = [get_bitwidth(type)], len(tensor_dimensions)
     assert num_dimensions in NumDimensionsToPrimitive, f'{num_dimensions} dimensions is not supported.'
     for dimension in tensor_dimensions: data.append(dimension)  # Size.
     for dimension in tensor_dimensions: data.append(int(math.log2(dimension) + 1))  # Index size.
@@ -73,51 +79,44 @@ def get_memory_parameters(type):
 
 def build_main_controls(c: FComponent):
     '''
-    Builds the wires and control for the `main` component.
-    This is done by creating a group run_* with its respective
-    wiring for each Dahlia declaration, and adding it to the
-    control.
+    Builds the wires and control for the `main` component. This is done by creating a group `run_*`
+    with its respective wiring for each Relay function call, and adding it to the control.
     '''
-    dahlia_declarations = []
-    for cell in reversed(c.cells):
-        if not cell.is_dahlia_declaration(): continue
-        dahlia_declarations.append(cell.dahlia_declaration)
-
-    for declaration in dahlia_declarations:
-        inputs = declaration.inputs
+    for cell in reversed(c.cells.values()):
+        if not cell.is_relay_function(): continue
+        function = cell.relay_function
+        inputs, output = function.inputs, function.output
         wires = []
-        group_name = f'run_{declaration.component_name}'
+        group_name = f'run_{function.component_name}'
         for input in flatten(inputs):
             prim = input.primitive
-            wires.append(FWire(f'{prim.name}.addr0', f'{declaration.decl_name}.{input.dahlia_name}_addr0'))
+            wires.append(FWire(f'{prim.name}.addr0', f'{function.name}.{input.dahlia_name}_addr0'))
             wires.append(
-                FWire(f'{declaration.decl_name}.{input.dahlia_name}_read_data', f'{prim.name}.read_data'))
+                FWire(f'{function.name}.{input.dahlia_name}_read_data', f'{prim.name}.read_data'))
             if prim.type == PrimitiveType.Memory1D: continue
-            wires.append(FWire(f'{prim.name}.addr1', f'{declaration.decl_name}.{input.dahlia_name}_addr1'))
+            wires.append(FWire(f'{prim.name}.addr1', f'{function.name}.{input.dahlia_name}_addr1'))
             if prim.type == PrimitiveType.Memory2D: continue
-            wires.append(FWire(f'{prim.name}.addr2', f'{declaration.decl_name}.{input.dahlia_name}_addr2'))
+            wires.append(FWire(f'{prim.name}.addr2', f'{function.name}.{input.dahlia_name}_addr2'))
             if prim.type == PrimitiveType.Memory3D: continue
-            wires.append(FWire(f'{prim.name}.addr3', f'{declaration.decl_name}.{input.dahlia_name}_addr3'))
-
-        output = declaration.output
-        type = output.primitive.type
-        wires.append(FWire(f'{output.primitive.name}.addr0', f'{declaration.decl_name}.{output.dahlia_name}_addr0'))
-        if type == PrimitiveType.Memory2D or type == PrimitiveType.Memory3D or type == PrimitiveType.Memory4D:
-            wires.append(FWire(f'{output.primitive.name}.addr1', f'{declaration.decl_name}.{output.dahlia_name}_addr1'))
-        if type == PrimitiveType.Memory3D or type == PrimitiveType.Memory4D:
-            wires.append(FWire(f'{output.primitive.name}.addr2', f'{declaration.decl_name}.{output.dahlia_name}_addr2'))
-        if type == PrimitiveType.Memory4D:
-            wires.append(FWire(f'{output.primitive.name}.addr3', f'{declaration.decl_name}.{output.dahlia_name}_addr3'))
-
-        wires.append(
-            FWire(f'{output.primitive.name}.write_data', f'{declaration.decl_name}.{output.dahlia_name}_write_data'))
-        wires.append(
-            FWire(f'{output.primitive.name}.write_en', f'{declaration.decl_name}.{output.dahlia_name}_write_en'))
-        wires.append(FWire(f'{declaration.decl_name}.{output.dahlia_name}_done', f'{output.primitive.name}.done'))
-        wires.append(FWire(f'{declaration.decl_name}.go', "1'd1"))
-        wires.append(FWire(f'{group_name}[done]', f"{declaration.decl_name}.done ? 1'd1"))
+            wires.append(FWire(f'{prim.name}.addr3', f'{function.name}.{input.dahlia_name}_addr3'))
+
+        output_type, output_name = output.primitive.type, output.primitive.name
+        for i in range(0, 1):
+            wires.append(FWire(f'{output_name}.addr0', f'{function.name}.{output.dahlia_name}_addr0'))
+            if output_type == PrimitiveType.Memory1D: break
+            wires.append(FWire(f'{output_name}.addr1', f'{function.name}.{output.dahlia_name}_addr1'))
+            if output_type == PrimitiveType.Memory2D: break
+            wires.append(FWire(f'{output_name}.addr2', f'{function.name}.{output.dahlia_name}_addr2'))
+            if output_type == PrimitiveType.Memory3D: break
+            wires.append(FWire(f'{output_name}.addr3', f'{function.name}.{output.dahlia_name}_addr3'))
+
+        wires.append(FWire(f'{output_name}.write_data', f'{function.name}.{output.dahlia_name}_write_data'))
+        wires.append(FWire(f'{output_name}.write_en', f'{function.name}.{output.dahlia_name}_write_en'))
+        wires.append(FWire(f'{function.name}.{output.dahlia_name}_done', f'{output_name}.done'))
+        wires.append(FWire(f'{function.name}.go', "1'd1"))
+        wires.append(FWire(f'{group_name}[done]', f"{function.name}.done ? 1'd1"))
         c.wires.append(FConnection(group=FGroup(name=group_name, wires=wires, attributes=[])))
 
-    # Ensures that only group names make it into the controls of a component.
+    # Ensures that only group names make it into the controls of a FuTIL component.
     connections = list(filter(lambda w: w.is_group(), c.wires))
     c.controls = [Seq(stmts=list(map(lambda w: w.group.name, connections)))]

From cc13e7cc59ef48064559dca8dbecfc960b1908d3 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Tue, 8 Dec 2020 20:31:29 -0500
Subject: [PATCH 72/75] Continued modular efforts.

---
 frontends/relay-futil/compiler.py             |  20 +-
 ...dahlia_functions.py => dahlia_lowering.py} | 217 ++++++++++--------
 frontends/relay-futil/example.py              |   1 +
 frontends/relay-futil/pretty_print.py         |  51 ----
 frontends/relay-futil/tests/add.expect        |  28 +--
 .../relay-futil/tests/batch_flatten.expect    |  30 +--
 .../relay-futil/tests/batch_matmul.expect     |  40 ++--
 frontends/relay-futil/tests/bias_add.expect   |  40 ++--
 frontends/relay-futil/tests/broadcast.expect  |  38 +--
 frontends/relay-futil/tests/conv2d.expect     |  46 ++--
 frontends/relay-futil/tests/dense.expect      |  34 +--
 .../relay-futil/tests/fixed_point_add.expect  |  28 +--
 frontends/relay-futil/tests/let1.expect       |  28 +--
 frontends/relay-futil/tests/let2.expect       |  58 ++---
 frontends/relay-futil/tests/let3.expect       | 100 ++++----
 frontends/relay-futil/tests/let3.relay        |   2 +-
 frontends/relay-futil/tests/max_pool2d.expect |  36 +--
 frontends/relay-futil/tests/relu.expect       |  36 +--
 frontends/relay-futil/tests/softmax.expect    |  28 +--
 frontends/relay-futil/tests/sqrt.expect       |  36 +--
 frontends/relay-futil/tests/sub.expect        |  28 +--
 .../relay-futil/tests/tensor1d_mult.expect    |  28 +--
 .../relay-futil/tests/tensor2d_add.expect     |  34 +--
 .../relay-futil/tests/tensor3d_divide.expect  |  40 ++--
 frontends/relay-futil/utilities.py            |  10 +-
 25 files changed, 508 insertions(+), 529 deletions(-)
 rename frontends/relay-futil/{dahlia_functions.py => dahlia_lowering.py} (72%)

diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index 4dd0de566e..d3eb4461d3 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -7,7 +7,7 @@
 from pretty_print import *
 from utilities import *
 from futil_ast import *
-from dahlia_functions import *
+from dahlia_lowering import *
 
 
 class Relay2Futil(ExprFunctor):
@@ -34,8 +34,8 @@ def relay_id(self, name):
         %x  = foo(%y);
         %x1 = bar(%x); // Here, at this level, the name_hint associated with `x1` is still 'x'.
 
-        To avoid this, we provide Relay with its own identification dictionary. If 'x' is seen
-        three times, it will produce: 'x', 'x1', x2'.
+        To avoid this, we provide Relay with its own identification dictionary.
+        If 'x' is seen three times, it will produce: 'x', 'x1', 'x2'.
         """
         id_number = self.relay_id_dictionary[name]
         self.relay_id_dictionary[name] += 1
@@ -62,7 +62,7 @@ def visit_var(self, var) -> FCell:
     def visit_let(self, let):
         values, output = self.visit(let.value), self.visit(let.var)
         if isinstance(values, list):
-            for value in values:
+            for value in flatten(values):
                 if value.is_relay_function(): value.relay_function.output = output
         return [self.visit(let.body), values]
 
@@ -81,8 +81,9 @@ def visit_call(self, call) -> List[FCell]:
         # We are representing all function calls in Relay IR at the Dahlia level, which will then be lowered to FuTIL.
         # Note, the Relay function's output is not defined until the `let` statement is visited.
         function, name, op = GetRelayFunctionCall(call.op.name)
-        relay_function_call = RelayFunctionCall(component_name=self.relay_id(name), name=self.id(name), op=op,
-                                                inputs=args, attributes=call.attrs, lowering_function=function)
+        component_name = self.id(name)
+        relay_function_call = RelayFunctionCall(component_name=component_name, name=f'comp_{component_name}',
+                                                op=op, inputs=args, attributes=call.attrs, lowering_function=function)
         cells.append(FCell(relay_function=relay_function_call))
         return cells
 
@@ -95,14 +96,13 @@ def visit_function(self, function):
 
 def relay_transforms(expr: Function) -> Function:
     """https://tvm.apache.org/docs/api/python/relay/transform.html"""
-    transform = tvm.transform.Sequential([
+    transforms = tvm.transform.Sequential([
         relay.transform.SimplifyExpr(),
         relay.transform.SimplifyInference(),
-        relay.transform.InferType()
+        relay.transform.InferType(),
     ])
     mod = ir.IRModule.from_expr(expr)
-    mod['main'] = expr
-    mod = transform(mod)
+    mod = transforms(mod)
     return mod['main']
 
 
diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_lowering.py
similarity index 72%
rename from frontends/relay-futil/dahlia_functions.py
rename to frontends/relay-futil/dahlia_lowering.py
index a3af6206e6..3bb822e681 100644
--- a/frontends/relay-futil/dahlia_functions.py
+++ b/frontends/relay-futil/dahlia_lowering.py
@@ -3,7 +3,6 @@
 
 from tempfile import NamedTemporaryFile, TemporaryFile
 from futil_ast import *
-from pretty_print import *
 
 IMPORT_STATEMENT = """import "primitives/std.lib";\n"""
 NO_ERR = "2>/dev/null"
@@ -11,7 +10,68 @@
 CHARACTER_I = chr(ord('i'))  # Starting index variable name for Dahlia array iteration.
 
 
-def LowerDahliaProgramToFuTIL(program, component_name):
+def next_character(ch, dir=1):
+    """
+    Steps one code point away from the character 'ch'.
+    A positive `dir` gives the character after 'ch'; zero or a negative `dir` gives the one before it.
+    """
+    return chr(ord(ch) + (1 if dir > 0 else -1))
+
+
+def PPDahliaMemoryDeclarations(relay_function):
+    """
+    Pretty print for the Dahlia memory declarations of a Relay function's inputs followed by its output, e.g.
+    `decl X: ubit<32>[1][10];`. One declaration is emitted per line; `relay_function` is left unmodified.
+    """
+    # Build a new list: appending to `relay_function.inputs` directly would leave the output cell in the
+    # caller's input list, and would add it again on every subsequent call.
+    cell_list = [*relay_function.inputs, relay_function.output]
+
+    declarations = []
+    for cell in cell_list:
+        declaration = cell.primitive
+        dimensions = ''.join(f'[{declaration.data[i + 1]}]' for i in range(0, declaration.type))
+        declarations.append(f'decl {declaration.name}: {declaration.data_type}<{declaration.data[0]}>{dimensions};')
+    return '\n'.join(declarations)
+
+
+def PPDahliaLoop(relay_function, body, num_dimensions, data=None):
+    """
+    Returns a string of `num_dimensions` nested Dahlia `for` loops with `body` as the innermost statement.
+    Many tensor functions share the same control flow: (1) Iterate `num_dimensions` times, and (2) do some work in body.
+    For example, if the primitive is 2D with sizes (M, N), index widths (A, B), and body == `S;`, this returns:
+
+    ```
+    for (let i: ubit<A> = 0..M) {
+      for (let j: ubit<B> = 0..N) {
+        S;
+      }
+    }
+    ```
+
+    Notes:
+    Loop variables are named consecutively starting at `CHARACTER_I`. Sizes are read from `data[1 + k]` and index
+    widths from `data[1 + num_dimensions + k]` of the primitive, which is `data` when it is given and the
+    `output` primitive of the `relay_function` otherwise; `num_dimensions` is always taken from the argument.
+    """
+    # `is None` (identity) rather than `== None`, so a primitive's own `__eq__` is never consulted.
+    primitive = relay_function.output.primitive if data is None else data
+    variable_name = CHARACTER_I
+    program = []
+    for depth in range(num_dimensions):
+        size, index_size = primitive.data[depth + 1], primitive.data[depth + num_dimensions + 1]
+        indent = '  ' * depth
+        program.append(f'{indent}for (let {variable_name}: ubit<{index_size}> = 0..{size}) {{')
+        variable_name = next_character(variable_name)
+    program.append(f"{'  ' * num_dimensions}{body}")
+
+    # Close the loops from the innermost outwards, dedenting one level per brace.
+    for depth in reversed(range(num_dimensions)):
+        program.append('  ' * depth + '}')
+    return '\n'.join(program)
+
+
+def LowerDahliaProgramToFuTIL(relay_function, dahlia_body, dahlia_imports=None):
     """
     Takes in a string representation of a Dahlia program, lowers it to FuTIL with the given `component_name`,
     and applies the `externalize` pass. This pass exposes the inputs and outputs of primitive types that are
@@ -19,6 +79,7 @@ def LowerDahliaProgramToFuTIL(program, component_name):
 
     Example:
         ------ Dahlia, component name: ProcessX ------
+        import "foo.h" { ... }
         decl X: ubit<32>[4];
         ...
 
@@ -35,13 +96,15 @@ def LowerDahliaProgramToFuTIL(program, component_name):
            ...
         }
     """
-    program_string = '\n'.join(program.splitlines())
+    if dahlia_imports == None: dahlia_imports = ''
+    program_string = '\n'.join((dahlia_imports, PPDahliaMemoryDeclarations(relay_function), dahlia_body))
+
     with NamedTemporaryFile() as tf0, NamedTemporaryFile() as tf1, NamedTemporaryFile() as tf2:
         tf0.write(bytes(program_string, 'UTF-8'))
         tf0.seek(0), tf1.seek(0), tf2.seek(0)
         fuse_binary = os.environ['DAHLIA_EXEC'] if 'DAHLIA_EXEC' in os.environ else 'fuse'
         command = f"""
-                {fuse_binary} {tf0.name} --lower -b=futil -n={component_name} > {tf1.name} {NO_ERR} \
+                {fuse_binary} {tf0.name} --lower -b=futil -n={relay_function.component_name} > {tf1.name} {NO_ERR} \
                  && fud e --from futil {tf1.name} --to futil-externalize > {tf2.name} {NO_ERR}"""
         subprocess.Popen(command, stdout=subprocess.PIPE, shell=True).communicate()
         component = tf2.read().decode()[len(IMPORT_STATEMENT):]  # Skip over importing the primitives library.
@@ -49,10 +112,10 @@ def LowerDahliaProgramToFuTIL(program, component_name):
 
 
 ####################################################################################################
-################################ Dahlia Implementations ############################################
+################## Dahlia Implementations for Relay Function Calls #################################
 ####################################################################################################
 
-def broadcast(declaration):
+def broadcast(function: RelayFunctionCall):
     """
     https://numpy.org/doc/stable/user/basics.broadcasting.html
     Implements array broadcasting:
@@ -72,8 +135,7 @@ def broadcast(declaration):
               result[i][j][k] := op1[i][0][k] op op2[j][0];
               ...
     """
-    op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
-
+    op1, op2, res = function.inputs[0].primitive, function.inputs[1].primitive, function.output.primitive
     op1_dims, op2_dims, res_dims = op1.type, op2.type, res.type
     op1_sizes, op2_sizes, res_sizes = [], [], []
     # Get memory sizes in reversed order.
@@ -109,18 +171,15 @@ def broadcast(declaration):
     op1_index = ''.join(reversed(op1_indices))
     op2_index = ''.join(reversed(op2_indices))
     res_index = ''.join(reversed(res_indices))
-    loop_body = f'{res.name}{res_index} := {op1.name}{op1_index} {declaration.op} {op2.name}{op2_index};'
+    loop_body = f'{res.name}{res_index} := {op1.name}{op1_index} {function.op} {op2.name}{op2_index};'
 
-    program_body = pp_dahlia_loop(res, loop_body)
-    declarations = pp_dahlia_memory_declarations([res, op1, op2])
-    program = f"""{declarations}{NEWL}{program_body}"""
-    return LowerDahliaProgramToFuTIL(program, declaration.component_name)
+    return LowerDahliaProgramToFuTIL(function, PPDahliaLoop(function, loop_body, num_dimensions=res_dims))
 
 
-def batch_flatten(declaration):
+def batch_flatten(function):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.batch_flatten"""
-    data, res = declaration.inputs[0].primitive, declaration.output.primitive
-    bitwidth, num_dimensions = data.data[0], data.type
+    data, res = function.inputs[0].primitive, function.output.primitive
+    bitwidth, num_dimensions = res.data[0], data.type
     res_index_size1 = res.data[4]
 
     variable_name = CHARACTER_I
@@ -133,20 +192,18 @@ def batch_flatten(declaration):
         variable_name = next_character(variable_name)
     res_indices += f'[{variable_name}]'
 
-    declarations = pp_dahlia_memory_declarations([data, res])
     let_flattened = f'let {variable_name}: ubit<{res_index_size1}> = 0;'
     body = f"{res.name}{res_indices} := {data.name}{data_indices}; {variable_name} := {variable_name} + 1;"
-    program_body = pp_dahlia_loop(data, body)
-    program = f"""{declarations}{NEWL}{let_flattened}{NEWL}{program_body}"""
-    return LowerDahliaProgramToFuTIL(program, declaration.component_name)
+    program_body = '\n'.join((let_flattened, PPDahliaLoop(function, body, num_dimensions, data)))
+    return LowerDahliaProgramToFuTIL(function, program_body)
 
 
-def bias_add(declaration):
+def bias_add(function):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.bias_add"""
-    data, bias, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
+    data, bias, res = function.inputs[0].primitive, function.inputs[1].primitive, function.output.primitive
     bitwidth, num_dimensions = data.data[0], data.type
 
-    axis_attribute = declaration.attributes.get_int("axis")
+    axis_attribute = function.attributes.get_int("axis")
     axis = num_dimensions - 1 if axis_attribute == -1 else axis_attribute
 
     variable_name = CHARACTER_I
@@ -159,22 +216,19 @@ def bias_add(declaration):
         data_indices += index
         variable_name = next_character(variable_name)
 
-    declarations = pp_dahlia_memory_declarations([data, bias, res])
-    body = (f"{res.name}{data_indices} := {data.name}{data_indices} + {bias.name}{bias_index};")
-    program_body = pp_dahlia_loop(data, body)
-    return LowerDahliaProgramToFuTIL(f"""{declarations}{NEWL}{program_body}""", declaration.component_name)
+    body = f"{res.name}{data_indices} := {data.name}{data_indices} + {bias.name}{bias_index};"
+    return LowerDahliaProgramToFuTIL(function, PPDahliaLoop(function, body, num_dimensions))
 
 
 # TODO(cgyurgyik):
 #  1. This won't work for fixed point currently, since Dahlia
 #     will not take fixed point operands for the `>` operator.
 #  2. Without signed bit array support, this is also meaningless.
-def relu(declaration):
+def relu(function):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.relu"""
-    data, res = declaration.inputs[0].primitive, declaration.output.primitive
+    data, res = function.inputs[0].primitive, function.output.primitive
     bitwidth, num_dimensions, data_type = data.data[0], data.type, data.data_type
 
-    declarations = pp_dahlia_memory_declarations([data, res])
     zero = '0.0' if data_type == 'ufix' or data_type == 'fix' else '0'
     let_zero = f'let zero: {data_type}<{bitwidth}> = {zero};'
 
@@ -186,16 +240,15 @@ def relu(declaration):
         variable_name = next_character(variable_name)
 
     body = f"""if ({data.name}{indices} > zero) {{ {res.name}{indices} := {data.name}{indices}; }} 
-        else {{ {res.name}{indices} := zero; }}"""
-    program_body = pp_dahlia_loop(data, body)
-    return LowerDahliaProgramToFuTIL(f"""{declarations}{NEWL}{let_zero}{NEWL}{program_body}""",
-                                     declaration.component_name)
+               else {{ {res.name}{indices} := zero; }}"""
+    program_body = '\n'.join((let_zero, PPDahliaLoop(function, body, num_dimensions)))
+    return LowerDahliaProgramToFuTIL(function, program_body)
 
 
 # TODO(cgyurgyik): Similar to ReLU, this requires signed operands.
-def negative(declaration):
+def negative(function):
     """https://tvm.apache.org/docs/api/python/relay/index.html#tvm.relay.negative"""
-    op, res = declaration.inputs[0].primitive, declaration.output.primitive
+    op, res = function.inputs[0].primitive, function.output.primitive
     bitwidth, num_dimensions, data_type = op.data[0], op.type, op.data_type
 
     indices = ""
@@ -205,15 +258,14 @@ def negative(declaration):
         indices += f'[{variable_name}]'
         variable_name = next_character(variable_name)
 
-    declarations = pp_dahlia_memory_declarations([op, res])
     zero = '0.0' if data_type == 'ufix' or data_type == 'fix' else '0'
-    program_body = pp_dahlia_loop(op, f"""{res.name}{indices} := {zero} - {op.name}{indices};""")
-    return LowerDahliaProgramToFuTIL(f"""{declarations}{NEWL}{program_body}""", declaration.component_name)
+    program_body = PPDahliaLoop(function, f"""{res.name}{indices} := {zero} - {op.name}{indices};""", num_dimensions)
+    return LowerDahliaProgramToFuTIL(function, program_body)
 
 
-def sqrt(declaration):
+def sqrt(function):
     """https://tvm.apache.org/docs/api/python/relay/index.html#tvm.relay.sqrt"""
-    op, res = declaration.inputs[0].primitive, declaration.output.primitive
+    op, res = function.inputs[0].primitive, function.output.primitive
     bitwidth, num_dimensions, data_type = op.data[0], op.type, op.data_type
     include_sqrt = f"""import "fxp_sqrt.h" {{ def sqrt(value: {data_type}<{bitwidth}>): {data_type}<{bitwidth}>; }}"""
 
@@ -224,20 +276,16 @@ def sqrt(declaration):
         indices += f'[{variable_name}]'
         variable_name = next_character(variable_name)
 
-    declarations = pp_dahlia_memory_declarations([op, res])
-    program_body = pp_dahlia_loop(op, f"""{res.name}{indices} := sqrt({op.name}{indices});""")
-    return LowerDahliaProgramToFuTIL(f"""{include_sqrt}{NEWL}{declarations}{NEWL}{program_body}""",
-                                     declaration.component_name)
+    program_body = PPDahliaLoop(function, f"""{res.name}{indices} := sqrt({op.name}{indices});""", num_dimensions)
+    return LowerDahliaProgramToFuTIL(function, program_body, include_sqrt)
 
 
-def expand_dims(declaration):
+def expand_dims(function):
     """https://tvm.apache.org/docs/api/python/relay/index.html#tvm.relay.expand_dims"""
-    axis, num_newaxis = declaration.attributes.get_int("axis"), declaration.attributes.get_int("num_newaxis")
-    data, res = declaration.inputs[0].primitive, declaration.output.primitive
+    axis, num_newaxis = function.attributes.get_int("axis"), function.attributes.get_int("num_newaxis")
+    data, res = function.inputs[0].primitive, function.output.primitive
     bitwidth, num_dimensions = data.data[0], data.type
 
-    declarations = pp_dahlia_memory_declarations([data, res])
-
     res_indices, data_indices = "", ""
     variable_name = CHARACTER_I
     for i in range(0, num_dimensions):
@@ -249,13 +297,13 @@ def expand_dims(declaration):
             for _ in range(0, num_newaxis): res_indices += '[0]'
         variable_name = next_character(variable_name)
 
-    program_body = pp_dahlia_loop(data, f'{res.name}{res_indices} := {data.name}{data_indices}')
-    return LowerDahliaProgramToFuTIL(f"""{declarations}{NEWL}{program_body}""", declaration.component_name)
+    program_body = PPDahliaLoop(function, f'{res.name}{res_indices} := {data.name}{data_indices}', num_dimensions, data)
+    return LowerDahliaProgramToFuTIL(function, program_body)
 
 
-def batch_matmul(declaration):
+def batch_matmul(function):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.batch_matmul"""
-    op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
+    op1, op2, res = function.inputs[0].primitive, function.inputs[1].primitive, function.output.primitive
     bitwidth, M1_size0, M1_size1, M1_size2 = op1.data[0], op1.data[1], op1.data[2], op1.data[3]
     M1_index_size0, M1_index_size1, M1_index_size2 = op1.data[4], op1.data[5], op1.data[6]
     M2_size0, M2_size1, M2_size2 = op2.data[1], op2.data[2], op2.data[3]
@@ -265,8 +313,7 @@ def batch_matmul(declaration):
     # 3. Copy temporary value to return value.*
     #    * This third step may not be necessary, but trying to conduct the matrix multiply
     #      directly with the return value declared resulted in incorrect outputs.
-    declarations = pp_dahlia_memory_declarations([res, op1, op2])
-    program = f"""{declarations}
+    program_body = f"""
     let transpose_{op2.name}: {op2.data_type}<{bitwidth}>[{M2_size0}][{M2_size2}][{M2_size1}];
     let temporary_{res.name}: {res.data_type}<{bitwidth}>[{M1_size0}][{M1_size1}][{M2_size1}];
     for (let batch: ubit<{M1_index_size0}> = 0..{M1_size0}) {{
@@ -297,19 +344,18 @@ def batch_matmul(declaration):
       }}
     }} 
     """
-    return LowerDahliaProgramToFuTIL(program, declaration.component_name)
+    return LowerDahliaProgramToFuTIL(function, program_body)
 
 
 # TODO(cgyurgyik): Similar to batch_matmul, this requires a temporary memory to store the output
 # of the matrix multiply. Otherwise, the values aren't computed properly. Look deeper into this.
-def dense(declaration):
+def dense(function):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.dense"""
-    op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
+    op1, op2, res = function.inputs[0].primitive, function.inputs[1].primitive, function.output.primitive
     bitwidth, M1_size0, M1_size1 = op1.data[0], op1.data[1], op1.data[2]
     M1_index_size0, M1_index_size1 = op1.data[3], op1.data[4]
     M2_size0, M2_size1, M2_index_size0, M2_index_size1 = op2.data[1], op2.data[2], op2.data[3], op2.data[4]
     program = f"""
-    {pp_dahlia_memory_declarations([res, op1, op2])}
     let transpose_{op2.name}: {op2.data_type}<{bitwidth}>[{M2_size1}][{M2_size0}];
     let temporary_{res.name}: {res.data_type}<{bitwidth}>[{M1_size0}][{M2_size0}];
     for (let i: ubit<{M2_index_size0}> = 0..{M2_size0}) {{
@@ -334,24 +380,22 @@ def dense(declaration):
       }}
     }}
     """
-    return LowerDahliaProgramToFuTIL(program, declaration.component_name)
+    return LowerDahliaProgramToFuTIL(function, program)
 
 
 # TODO(cgyurgyik): Currently, only supports a small subset (namely those used in our VGG net and MLP net examples).
-def softmax(declaration):
+def softmax(function):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.softmax"""
-    op, res = declaration.inputs[0].primitive, declaration.output.primitive
-    axis = declaration.attributes.get_int("axis")
+    op, res = function.inputs[0].primitive, function.output.primitive
+    axis = function.attributes.get_int("axis")
     data_type = op.data_type
     assert op.type == PrimitiveType.Memory2D, f'nn.softmax with pritmive type Memory{op.type}D is not supported.'
     assert axis == -1 or axis == 1, f'nn.softmax with axis = {axis} is not supported.'
     bitwidth, size0, size1, index_size0, index_size1 = op.data[0], op.data[1], op.data[2], op.data[3], op.data[4]
 
     import_exp = f"""import "std_exp.h" {{ def exp(x: {data_type}<{bitwidth}>): {data_type}<{bitwidth}>; }}"""
-    declarations = pp_dahlia_memory_declarations([res, op])
-
     zero = '0.0' if data_type == 'ufix' or data_type == 'fix' else '0'
-    body = f"""
+    program_body = f"""
     for (let i: ubit<{index_size0}> = 0..{size0}) {{
       let {op.name}_expsum: {data_type}<{bitwidth}> = {zero};
       for (let j: ubit<{index_size1}> = 0..{size1}) {{ 
@@ -364,25 +408,22 @@ def softmax(declaration):
       }}
     }}
     """
-    program = f"""{import_exp}{NEWL}{declarations}{body}"""
-
-    return LowerDahliaProgramToFuTIL(program, declaration.component_name)
+    return LowerDahliaProgramToFuTIL(function, program_body, import_exp)
 
 
-def max_pool2d(declaration):
+def max_pool2d(function):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.max_pool2d"""
-    data, res = declaration.inputs[0].primitive, declaration.output.primitive
+    data, res = function.inputs[0].primitive, function.output.primitive
 
-    strides = declaration.attributes.get_int_tuple("strides")
-    pool_size = declaration.attributes.get_int_tuple("pool_size")
-    layout = declaration.attributes.get_str("layout")
-    ceil_mode = declaration.attributes.get_int("ceil_mode")
+    strides = function.attributes.get_int_tuple("strides")
+    pool_size = function.attributes.get_int_tuple("pool_size")
+    layout = function.attributes.get_str("layout")
+    ceil_mode = function.attributes.get_int("ceil_mode")
     assert layout == 'NCHW', f"Layout \'{layout}\' is not currently supported for nn.max_pool2d; please use `NCHW`"
     assert ceil_mode == False, "`ceil_mode` is not currently supported for nn.max_pool2d"
     bitwidth, data_type = data.data[0], data.data_type
     size0, size1, size2, size3 = res.data[1], res.data[2], res.data[3], res.data[4]
 
-    declarations = pp_dahlia_memory_declarations([res, data])
     program_body = f"""
     for (let b: ubit<32> = 0..{size0}) {{
       for (let c: ubit<32> = 0..{size1}) {{
@@ -406,24 +447,21 @@ def max_pool2d(declaration):
       }} 
     }} 
     """
-    program = f"""{declarations}{NEWL}{program_body}"""
-    return LowerDahliaProgramToFuTIL(program, declaration.component_name)
+    return LowerDahliaProgramToFuTIL(function, program_body)
 
 
 # Only supports a small subset of the `conv2d` function. For example,
 # dilation and grouped convolution are not supported.
-def conv2d(declaration):
+def conv2d(function):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.conv2d"""
-    data, weight, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
+    data, weight, res = function.inputs[0].primitive, function.inputs[1].primitive, function.output.primitive
 
-    strides = declaration.attributes.get_int_tuple("strides")
-    kernel_size = declaration.attributes.get_int_tuple("kernel_size")
-    channels = declaration.attributes.get_int("channels")
+    strides = function.attributes.get_int_tuple("strides")
+    kernel_size = function.attributes.get_int_tuple("kernel_size")
+    channels = function.attributes.get_int("channels")
     bitwidth, data_type = data.data[0], data.data_type
     size0, size1, size2, size3 = res.data[1], res.data[2], res.data[3], res.data[4]
 
-    declarations = pp_dahlia_memory_declarations([res, data, weight])
-
     zero = '0.0' if data_type == 'ufix' or data_type == 'fix' else '0'
     program_body = f"""
     for (let b: ubit<32> = 0..{size0}) {{
@@ -446,8 +484,7 @@ def conv2d(declaration):
       }} 
     }} 
     """
-    program = f"""{declarations}{NEWL}{program_body}"""
-    return LowerDahliaProgramToFuTIL(program, declaration.component_name)
+    return LowerDahliaProgramToFuTIL(function, program_body)
 
 
 # Mapping from Relay function names to their respective Dahlia lowering.
@@ -461,8 +498,8 @@ def conv2d(declaration):
 
 def GetRelayFunctionCall(function_name) -> RelayFunctionCall:
     """
-    Returns the corresponding name, function, and op (if it is a binary op, otherwise None).
-    If the function isn't supported, fails with an assertion.
+    Returns the corresponding name, function, and `op` type (if it is a binary op, otherwise None)
+    of the Relay function call. If the function call isn't supported, fails with an assertion.
     """
     function = name = op = None
     assert function_name in BuiltInBinaryOps or function_name in RelayFunctionCalls, \
diff --git a/frontends/relay-futil/example.py b/frontends/relay-futil/example.py
index 1028e7cb47..078e90f248 100644
--- a/frontends/relay-futil/example.py
+++ b/frontends/relay-futil/example.py
@@ -16,6 +16,7 @@ def tensor_subtract():
     return relay.Function([x, y], relay.subtract(x, y))
 
 
+# Trying to read in a function that uses `expand_dims` with relay.fromtext() leads to some peculiar errors.
 def expand_dims():
     x = relay.var('x', shape=[512], dtype='int32')
     return relay.Function([x], relay.expand_dims(x, axis=1, num_newaxis=2))
diff --git a/frontends/relay-futil/pretty_print.py b/frontends/relay-futil/pretty_print.py
index fa3935f6e4..2ce59f4139 100644
--- a/frontends/relay-futil/pretty_print.py
+++ b/frontends/relay-futil/pretty_print.py
@@ -135,54 +135,3 @@ def pp_cell(cell: FCell):
             return f'{cell.primitive.name} = prim std_{op}({bitwidth});'
     if cell.is_relay_function(): return f'{cell.relay_function.name} = {cell.relay_function.component_name};'
     assert False, f'FCell pretty print unimplemented for {cell} with name {cell.primitive.name}'
-
-
-# Dahlia Pretty Printing.
-
-def next_character(ch, dir=1):
-    """
-    Returns the next character after 'ch'.
-    If `dir` is positive, then will return 'ch' + 1. Otherwise, it will return 'ch' - 1.
-    """
-    return chr(ord(ch) + 1) if dir > 0 else chr(ord(ch) - 1)
-
-
-def pp_dahlia_memory_declarations(declaration_list):
-    declarations = []
-    for declaration in declaration_list:
-        string = f'decl {declaration.name}: {declaration.data_type}<{declaration.data[0]}>'
-        for i in range(0, declaration.type): string += f'[{declaration.data[i + 1]}]'
-        declarations.append(string + ";")
-    return '\n'.join(declarations)
-
-
-def pp_dahlia_loop(data, body):
-    """
-    Returns an iteration over data with `body` as the work done within the nested loop(s).
-    Many tensor functions share the same control flow: (1) Iterate over `data`, and (2) do some work in body.
-    For example, if `data` is a 2D primitive of size (M, N) and body == `X;`, then this will return:
-
-    ```
-    for (let i: ubit<X> = 0..M) {
-      for (let j: ubit<Y> = 0..N) {
-        X;
-      }
-    }
-    ```
-    """
-    variable_name = chr(ord('i'))
-    num_dimensions = data.type
-
-    program = []
-    SPACING = ''
-    for i in range(0, num_dimensions):
-        size, index_size = data.data[i + 1], data.data[i + num_dimensions + 1]
-        program.append(f'{SPACING}for (let {variable_name}: ubit<{index_size}> = 0..{size}) {{')
-        variable_name = next_character(variable_name)
-        SPACING += '  '
-    program.append(f'{SPACING}{body}')
-
-    for i in range(0, num_dimensions):
-        SPACING = SPACING[:-2]
-        program.append(f'{SPACING}}}')
-    return '\n'.join(program)
diff --git a/frontends/relay-futil/tests/add.expect b/frontends/relay-futil/tests/add.expect
index f239d18b42..8c08e35f31 100644
--- a/frontends/relay-futil/tests/add.expect
+++ b/frontends/relay-futil/tests/add.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component add(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, y0_read_data: 32, y0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, x0_addr0: 1, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, y0_addr0: 1, y0_write_data: 32, y0_write_en: 1, y0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
+component add0(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, y0_read_data: 32, y0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, x0_addr0: 1, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, y0_addr0: 1, y0_write_data: 32, y0_write_en: 1, y0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
   cells {
     add0 = prim std_add(32);
     add1 = prim std_add(1);
@@ -74,25 +74,25 @@ component main () -> () {
     z = prim std_mem_d1(32, 1, 1);
     x = prim std_mem_d1(32, 1, 1);
     y = prim std_mem_d1(32, 1, 1);
-    add0 = add;
+    comp_add0 = add0;
   }
   wires {
-    group run_add {
-      x.addr0 = add0.x0_addr0;
-      add0.x0_read_data = x.read_data;
-      y.addr0 = add0.y0_addr0;
-      add0.y0_read_data = y.read_data;
-      z.addr0 = add0.z0_addr0;
-      z.write_data = add0.z0_write_data;
-      z.write_en = add0.z0_write_en;
-      add0.z0_done = z.done;
-      add0.go = 1'd1;
-      run_add[done] = add0.done ? 1'd1;
+    group run_add0 {
+      x.addr0 = comp_add0.x0_addr0;
+      comp_add0.x0_read_data = x.read_data;
+      y.addr0 = comp_add0.y0_addr0;
+      comp_add0.y0_read_data = y.read_data;
+      z.addr0 = comp_add0.z0_addr0;
+      z.write_data = comp_add0.z0_write_data;
+      z.write_en = comp_add0.z0_write_en;
+      comp_add0.z0_done = z.done;
+      comp_add0.go = 1'd1;
+      run_add0[done] = comp_add0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_add;
+      run_add0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/batch_flatten.expect b/frontends/relay-futil/tests/batch_flatten.expect
index 6927e4ad85..c1d01a7bae 100644
--- a/frontends/relay-futil/tests/batch_flatten.expect
+++ b/frontends/relay-futil/tests/batch_flatten.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component batch_flatten(go: 1, clk: 1, x0_0_0_read_data: 32, x0_0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1) -> (done: 1, x0_0_0_addr0: 1, x0_0_0_addr1: 2, x0_0_0_addr2: 2, x0_0_0_write_data: 32, x0_0_0_write_en: 1, x0_0_0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 3, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1) {
+component batch_flatten0(go: 1, clk: 1, x0_0_0_read_data: 32, x0_0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1) -> (done: 1, x0_0_0_addr0: 1, x0_0_0_addr1: 2, x0_0_0_addr2: 2, x0_0_0_write_data: 32, x0_0_0_write_en: 1, x0_0_0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 3, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1) {
   cells {
     add0 = prim std_add(3);
     add1 = prim std_add(2);
@@ -139,26 +139,26 @@ component main () -> () {
   cells {
     x1 = prim std_mem_d2(32, 1, 4, 1, 3);
     x = prim std_mem_d3(32, 1, 2, 2, 1, 2, 2);
-    batch_flatten0 = batch_flatten;
+    comp_batch_flatten0 = batch_flatten0;
   }
   wires {
-    group run_batch_flatten {
-      x.addr0 = batch_flatten0.x0_0_0_addr0;
-      batch_flatten0.x0_0_0_read_data = x.read_data;
-      x.addr1 = batch_flatten0.x0_0_0_addr1;
-      x.addr2 = batch_flatten0.x0_0_0_addr2;
-      x1.addr0 = batch_flatten0.x10_0_addr0;
-      x1.addr1 = batch_flatten0.x10_0_addr1;
-      x1.write_data = batch_flatten0.x10_0_write_data;
-      x1.write_en = batch_flatten0.x10_0_write_en;
-      batch_flatten0.x10_0_done = x1.done;
-      batch_flatten0.go = 1'd1;
-      run_batch_flatten[done] = batch_flatten0.done ? 1'd1;
+    group run_batch_flatten0 {
+      x.addr0 = comp_batch_flatten0.x0_0_0_addr0;
+      comp_batch_flatten0.x0_0_0_read_data = x.read_data;
+      x.addr1 = comp_batch_flatten0.x0_0_0_addr1;
+      x.addr2 = comp_batch_flatten0.x0_0_0_addr2;
+      x1.addr0 = comp_batch_flatten0.x10_0_addr0;
+      x1.addr1 = comp_batch_flatten0.x10_0_addr1;
+      x1.write_data = comp_batch_flatten0.x10_0_write_data;
+      x1.write_en = comp_batch_flatten0.x10_0_write_en;
+      comp_batch_flatten0.x10_0_done = x1.done;
+      comp_batch_flatten0.go = 1'd1;
+      run_batch_flatten0[done] = comp_batch_flatten0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_batch_flatten;
+      run_batch_flatten0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/batch_matmul.expect b/frontends/relay-futil/tests/batch_matmul.expect
index 0bf73d4754..93a95d5712 100644
--- a/frontends/relay-futil/tests/batch_matmul.expect
+++ b/frontends/relay-futil/tests/batch_matmul.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component batch_matmul(go: 1, clk: 1, a0_0_0_read_data: 32, a0_0_0_done: 1, b0_0_0_read_data: 32, b0_0_0_done: 1, x0_0_0_read_data: 32, x0_0_0_done: 1) -> (done: 1, a0_0_0_addr0: 3, a0_0_0_addr1: 3, a0_0_0_addr2: 3, a0_0_0_write_data: 32, a0_0_0_write_en: 1, a0_0_0_clk: 1, b0_0_0_addr0: 3, b0_0_0_addr1: 3, b0_0_0_addr2: 3, b0_0_0_write_data: 32, b0_0_0_write_en: 1, b0_0_0_clk: 1, x0_0_0_addr0: 3, x0_0_0_addr1: 3, x0_0_0_addr2: 3, x0_0_0_write_data: 32, x0_0_0_write_en: 1, x0_0_0_clk: 1) {
+component batch_matmul0(go: 1, clk: 1, a0_0_0_read_data: 32, a0_0_0_done: 1, b0_0_0_read_data: 32, b0_0_0_done: 1, x0_0_0_read_data: 32, x0_0_0_done: 1) -> (done: 1, a0_0_0_addr0: 3, a0_0_0_addr1: 3, a0_0_0_addr2: 3, a0_0_0_write_data: 32, a0_0_0_write_en: 1, a0_0_0_clk: 1, b0_0_0_addr0: 3, b0_0_0_addr1: 3, b0_0_0_addr2: 3, b0_0_0_write_data: 32, b0_0_0_write_en: 1, b0_0_0_clk: 1, x0_0_0_addr0: 3, x0_0_0_addr1: 3, x0_0_0_addr2: 3, x0_0_0_write_data: 32, x0_0_0_write_en: 1, x0_0_0_clk: 1) {
   cells {
     a_read0_0 = prim std_reg(32);
     add0 = prim std_add(3);
@@ -402,31 +402,31 @@ component main () -> () {
     x = prim std_mem_d3(32, 4, 7, 7, 3, 3, 3);
     a = prim std_mem_d3(32, 4, 7, 5, 3, 3, 3);
     b = prim std_mem_d3(32, 4, 7, 5, 3, 3, 3);
-    batch_matmul0 = batch_matmul;
+    comp_batch_matmul0 = batch_matmul0;
   }
   wires {
-    group run_batch_matmul {
-      a.addr0 = batch_matmul0.a0_0_0_addr0;
-      batch_matmul0.a0_0_0_read_data = a.read_data;
-      a.addr1 = batch_matmul0.a0_0_0_addr1;
-      a.addr2 = batch_matmul0.a0_0_0_addr2;
-      b.addr0 = batch_matmul0.b0_0_0_addr0;
-      batch_matmul0.b0_0_0_read_data = b.read_data;
-      b.addr1 = batch_matmul0.b0_0_0_addr1;
-      b.addr2 = batch_matmul0.b0_0_0_addr2;
-      x.addr0 = batch_matmul0.x0_0_0_addr0;
-      x.addr1 = batch_matmul0.x0_0_0_addr1;
-      x.addr2 = batch_matmul0.x0_0_0_addr2;
-      x.write_data = batch_matmul0.x0_0_0_write_data;
-      x.write_en = batch_matmul0.x0_0_0_write_en;
-      batch_matmul0.x0_0_0_done = x.done;
-      batch_matmul0.go = 1'd1;
-      run_batch_matmul[done] = batch_matmul0.done ? 1'd1;
+    group run_batch_matmul0 {
+      a.addr0 = comp_batch_matmul0.a0_0_0_addr0;
+      comp_batch_matmul0.a0_0_0_read_data = a.read_data;
+      a.addr1 = comp_batch_matmul0.a0_0_0_addr1;
+      a.addr2 = comp_batch_matmul0.a0_0_0_addr2;
+      b.addr0 = comp_batch_matmul0.b0_0_0_addr0;
+      comp_batch_matmul0.b0_0_0_read_data = b.read_data;
+      b.addr1 = comp_batch_matmul0.b0_0_0_addr1;
+      b.addr2 = comp_batch_matmul0.b0_0_0_addr2;
+      x.addr0 = comp_batch_matmul0.x0_0_0_addr0;
+      x.addr1 = comp_batch_matmul0.x0_0_0_addr1;
+      x.addr2 = comp_batch_matmul0.x0_0_0_addr2;
+      x.write_data = comp_batch_matmul0.x0_0_0_write_data;
+      x.write_en = comp_batch_matmul0.x0_0_0_write_en;
+      comp_batch_matmul0.x0_0_0_done = x.done;
+      comp_batch_matmul0.go = 1'd1;
+      run_batch_matmul0[done] = comp_batch_matmul0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_batch_matmul;
+      run_batch_matmul0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/bias_add.expect b/frontends/relay-futil/tests/bias_add.expect
index c181b95bbf..18ba0a8d0e 100644
--- a/frontends/relay-futil/tests/bias_add.expect
+++ b/frontends/relay-futil/tests/bias_add.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component bias_add(go: 1, clk: 1, bias0_read_data: 32, bias0_done: 1, x0_0_0_0_read_data: 32, x0_0_0_0_done: 1, x10_0_0_0_read_data: 32, x10_0_0_0_done: 1) -> (done: 1, bias0_addr0: 7, bias0_write_data: 32, bias0_write_en: 1, bias0_clk: 1, x0_0_0_0_addr0: 1, x0_0_0_0_addr1: 7, x0_0_0_0_addr2: 10, x0_0_0_0_addr3: 9, x0_0_0_0_write_data: 32, x0_0_0_0_write_en: 1, x0_0_0_0_clk: 1, x10_0_0_0_addr0: 1, x10_0_0_0_addr1: 7, x10_0_0_0_addr2: 10, x10_0_0_0_addr3: 9, x10_0_0_0_write_data: 32, x10_0_0_0_write_en: 1, x10_0_0_0_clk: 1) {
+component bias_add0(go: 1, clk: 1, bias0_read_data: 32, bias0_done: 1, x0_0_0_0_read_data: 32, x0_0_0_0_done: 1, x10_0_0_0_read_data: 32, x10_0_0_0_done: 1) -> (done: 1, bias0_addr0: 7, bias0_write_data: 32, bias0_write_en: 1, bias0_clk: 1, x0_0_0_0_addr0: 1, x0_0_0_0_addr1: 7, x0_0_0_0_addr2: 10, x0_0_0_0_addr3: 9, x0_0_0_0_write_data: 32, x0_0_0_0_write_en: 1, x0_0_0_0_clk: 1, x10_0_0_0_addr0: 1, x10_0_0_0_addr1: 7, x10_0_0_0_addr2: 10, x10_0_0_0_addr3: 9, x10_0_0_0_write_data: 32, x10_0_0_0_write_en: 1, x10_0_0_0_clk: 1) {
   cells {
     add0 = prim fixed_p_std_add(32, 16, 16);
     add1 = prim std_add(9);
@@ -167,31 +167,31 @@ component main () -> () {
     x1 = prim std_mem_d4(32, 1, 64, 512, 256, 1, 7, 10, 9);
     x = prim std_mem_d4(32, 1, 64, 512, 256, 1, 7, 10, 9);
     bias = prim std_mem_d1(32, 64, 7);
-    bias_add0 = bias_add;
+    comp_bias_add0 = bias_add0;
   }
   wires {
-    group run_bias_add {
-      x.addr0 = bias_add0.x0_0_0_0_addr0;
-      bias_add0.x0_0_0_0_read_data = x.read_data;
-      x.addr1 = bias_add0.x0_0_0_0_addr1;
-      x.addr2 = bias_add0.x0_0_0_0_addr2;
-      x.addr3 = bias_add0.x0_0_0_0_addr3;
-      bias.addr0 = bias_add0.bias0_addr0;
-      bias_add0.bias0_read_data = bias.read_data;
-      x1.addr0 = bias_add0.x10_0_0_0_addr0;
-      x1.addr1 = bias_add0.x10_0_0_0_addr1;
-      x1.addr2 = bias_add0.x10_0_0_0_addr2;
-      x1.addr3 = bias_add0.x10_0_0_0_addr3;
-      x1.write_data = bias_add0.x10_0_0_0_write_data;
-      x1.write_en = bias_add0.x10_0_0_0_write_en;
-      bias_add0.x10_0_0_0_done = x1.done;
-      bias_add0.go = 1'd1;
-      run_bias_add[done] = bias_add0.done ? 1'd1;
+    group run_bias_add0 {
+      x.addr0 = comp_bias_add0.x0_0_0_0_addr0;
+      comp_bias_add0.x0_0_0_0_read_data = x.read_data;
+      x.addr1 = comp_bias_add0.x0_0_0_0_addr1;
+      x.addr2 = comp_bias_add0.x0_0_0_0_addr2;
+      x.addr3 = comp_bias_add0.x0_0_0_0_addr3;
+      bias.addr0 = comp_bias_add0.bias0_addr0;
+      comp_bias_add0.bias0_read_data = bias.read_data;
+      x1.addr0 = comp_bias_add0.x10_0_0_0_addr0;
+      x1.addr1 = comp_bias_add0.x10_0_0_0_addr1;
+      x1.addr2 = comp_bias_add0.x10_0_0_0_addr2;
+      x1.addr3 = comp_bias_add0.x10_0_0_0_addr3;
+      x1.write_data = comp_bias_add0.x10_0_0_0_write_data;
+      x1.write_en = comp_bias_add0.x10_0_0_0_write_en;
+      comp_bias_add0.x10_0_0_0_done = x1.done;
+      comp_bias_add0.go = 1'd1;
+      run_bias_add0[done] = comp_bias_add0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_bias_add;
+      run_bias_add0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/broadcast.expect b/frontends/relay-futil/tests/broadcast.expect
index 84f5962b54..5eb74f0ca1 100644
--- a/frontends/relay-futil/tests/broadcast.expect
+++ b/frontends/relay-futil/tests/broadcast.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component add(go: 1, clk: 1, x10_0_read_data: 32, x10_0_done: 1, x20_0_0_read_data: 32, x20_0_0_done: 1, x30_0_0_read_data: 32, x30_0_0_done: 1) -> (done: 1, x10_0_addr0: 2, x10_0_addr1: 2, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1, x20_0_0_addr0: 2, x20_0_0_addr1: 1, x20_0_0_addr2: 1, x20_0_0_write_data: 32, x20_0_0_write_en: 1, x20_0_0_clk: 1, x30_0_0_addr0: 2, x30_0_0_addr1: 2, x30_0_0_addr2: 2, x30_0_0_write_data: 32, x30_0_0_write_en: 1, x30_0_0_clk: 1) {
+component add0(go: 1, clk: 1, x10_0_read_data: 32, x10_0_done: 1, x20_0_0_read_data: 32, x20_0_0_done: 1, x30_0_0_read_data: 32, x30_0_0_done: 1) -> (done: 1, x10_0_addr0: 2, x10_0_addr1: 2, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1, x20_0_0_addr0: 2, x20_0_0_addr1: 1, x20_0_0_addr2: 1, x20_0_0_write_data: 32, x20_0_0_write_en: 1, x20_0_0_clk: 1, x30_0_0_addr0: 2, x30_0_0_addr1: 2, x30_0_0_addr2: 2, x30_0_0_write_data: 32, x30_0_0_write_en: 1, x30_0_0_clk: 1) {
   cells {
     add0 = prim std_add(32);
     add1 = prim std_add(2);
@@ -139,30 +139,30 @@ component main () -> () {
     x3 = prim std_mem_d3(32, 2, 2, 2, 2, 2, 2);
     x1 = prim std_mem_d2(32, 2, 2, 2, 2);
     x2 = prim std_mem_d3(32, 2, 1, 1, 2, 1, 1);
-    add0 = add;
+    comp_add0 = add0;
   }
   wires {
-    group run_add {
-      x1.addr0 = add0.x10_0_addr0;
-      add0.x10_0_read_data = x1.read_data;
-      x1.addr1 = add0.x10_0_addr1;
-      x2.addr0 = add0.x20_0_0_addr0;
-      add0.x20_0_0_read_data = x2.read_data;
-      x2.addr1 = add0.x20_0_0_addr1;
-      x2.addr2 = add0.x20_0_0_addr2;
-      x3.addr0 = add0.x30_0_0_addr0;
-      x3.addr1 = add0.x30_0_0_addr1;
-      x3.addr2 = add0.x30_0_0_addr2;
-      x3.write_data = add0.x30_0_0_write_data;
-      x3.write_en = add0.x30_0_0_write_en;
-      add0.x30_0_0_done = x3.done;
-      add0.go = 1'd1;
-      run_add[done] = add0.done ? 1'd1;
+    group run_add0 {
+      x1.addr0 = comp_add0.x10_0_addr0;
+      comp_add0.x10_0_read_data = x1.read_data;
+      x1.addr1 = comp_add0.x10_0_addr1;
+      x2.addr0 = comp_add0.x20_0_0_addr0;
+      comp_add0.x20_0_0_read_data = x2.read_data;
+      x2.addr1 = comp_add0.x20_0_0_addr1;
+      x2.addr2 = comp_add0.x20_0_0_addr2;
+      x3.addr0 = comp_add0.x30_0_0_addr0;
+      x3.addr1 = comp_add0.x30_0_0_addr1;
+      x3.addr2 = comp_add0.x30_0_0_addr2;
+      x3.write_data = comp_add0.x30_0_0_write_data;
+      x3.write_en = comp_add0.x30_0_0_write_en;
+      comp_add0.x30_0_0_done = x3.done;
+      comp_add0.go = 1'd1;
+      run_add0[done] = comp_add0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_add;
+      run_add0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/conv2d.expect b/frontends/relay-futil/tests/conv2d.expect
index 1d2163c61e..aa5ca04744 100644
--- a/frontends/relay-futil/tests/conv2d.expect
+++ b/frontends/relay-futil/tests/conv2d.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component conv2d(go: 1, clk: 1, data0_0_0_0_read_data: 32, data0_0_0_0_done: 1, weight0_0_0_0_read_data: 32, weight0_0_0_0_done: 1, x0_0_0_0_read_data: 32, x0_0_0_0_done: 1) -> (done: 1, data0_0_0_0_addr0: 3, data0_0_0_0_addr1: 10, data0_0_0_0_addr2: 4, data0_0_0_0_addr3: 4, data0_0_0_0_write_data: 32, data0_0_0_0_write_en: 1, data0_0_0_0_clk: 1, weight0_0_0_0_addr0: 10, weight0_0_0_0_addr1: 10, weight0_0_0_0_addr2: 2, weight0_0_0_0_addr3: 2, weight0_0_0_0_write_data: 32, weight0_0_0_0_write_en: 1, weight0_0_0_0_clk: 1, x0_0_0_0_addr0: 3, x0_0_0_0_addr1: 10, x0_0_0_0_addr2: 4, x0_0_0_0_addr3: 4, x0_0_0_0_write_data: 32, x0_0_0_0_write_en: 1, x0_0_0_0_clk: 1) {
+component conv2d0(go: 1, clk: 1, data0_0_0_0_read_data: 32, data0_0_0_0_done: 1, weight0_0_0_0_read_data: 32, weight0_0_0_0_done: 1, x0_0_0_0_read_data: 32, x0_0_0_0_done: 1) -> (done: 1, data0_0_0_0_addr0: 3, data0_0_0_0_addr1: 10, data0_0_0_0_addr2: 4, data0_0_0_0_addr3: 4, data0_0_0_0_write_data: 32, data0_0_0_0_write_en: 1, data0_0_0_0_clk: 1, weight0_0_0_0_addr0: 10, weight0_0_0_0_addr1: 10, weight0_0_0_0_addr2: 2, weight0_0_0_0_addr3: 2, weight0_0_0_0_write_data: 32, weight0_0_0_0_write_en: 1, weight0_0_0_0_clk: 1, x0_0_0_0_addr0: 3, x0_0_0_0_addr1: 10, x0_0_0_0_addr2: 4, x0_0_0_0_addr3: 4, x0_0_0_0_write_data: 32, x0_0_0_0_write_en: 1, x0_0_0_0_clk: 1) {
   cells {
     add0 = prim std_add(32);
     add1 = prim std_add(32);
@@ -362,34 +362,34 @@ component main () -> () {
     x = prim std_mem_d4(32, 5, 512, 14, 14, 3, 10, 4, 4);
     data = prim std_mem_d4(32, 5, 512, 14, 14, 3, 10, 4, 4);
     weight = prim std_mem_d4(32, 512, 512, 3, 3, 10, 10, 2, 2);
-    conv2d0 = conv2d;
+    comp_conv2d0 = conv2d0;
   }
   wires {
-    group run_conv2d {
-      data.addr0 = conv2d0.data0_0_0_0_addr0;
-      conv2d0.data0_0_0_0_read_data = data.read_data;
-      data.addr1 = conv2d0.data0_0_0_0_addr1;
-      data.addr2 = conv2d0.data0_0_0_0_addr2;
-      data.addr3 = conv2d0.data0_0_0_0_addr3;
-      weight.addr0 = conv2d0.weight0_0_0_0_addr0;
-      conv2d0.weight0_0_0_0_read_data = weight.read_data;
-      weight.addr1 = conv2d0.weight0_0_0_0_addr1;
-      weight.addr2 = conv2d0.weight0_0_0_0_addr2;
-      weight.addr3 = conv2d0.weight0_0_0_0_addr3;
-      x.addr0 = conv2d0.x0_0_0_0_addr0;
-      x.addr1 = conv2d0.x0_0_0_0_addr1;
-      x.addr2 = conv2d0.x0_0_0_0_addr2;
-      x.addr3 = conv2d0.x0_0_0_0_addr3;
-      x.write_data = conv2d0.x0_0_0_0_write_data;
-      x.write_en = conv2d0.x0_0_0_0_write_en;
-      conv2d0.x0_0_0_0_done = x.done;
-      conv2d0.go = 1'd1;
-      run_conv2d[done] = conv2d0.done ? 1'd1;
+    group run_conv2d0 {
+      data.addr0 = comp_conv2d0.data0_0_0_0_addr0;
+      comp_conv2d0.data0_0_0_0_read_data = data.read_data;
+      data.addr1 = comp_conv2d0.data0_0_0_0_addr1;
+      data.addr2 = comp_conv2d0.data0_0_0_0_addr2;
+      data.addr3 = comp_conv2d0.data0_0_0_0_addr3;
+      weight.addr0 = comp_conv2d0.weight0_0_0_0_addr0;
+      comp_conv2d0.weight0_0_0_0_read_data = weight.read_data;
+      weight.addr1 = comp_conv2d0.weight0_0_0_0_addr1;
+      weight.addr2 = comp_conv2d0.weight0_0_0_0_addr2;
+      weight.addr3 = comp_conv2d0.weight0_0_0_0_addr3;
+      x.addr0 = comp_conv2d0.x0_0_0_0_addr0;
+      x.addr1 = comp_conv2d0.x0_0_0_0_addr1;
+      x.addr2 = comp_conv2d0.x0_0_0_0_addr2;
+      x.addr3 = comp_conv2d0.x0_0_0_0_addr3;
+      x.write_data = comp_conv2d0.x0_0_0_0_write_data;
+      x.write_en = comp_conv2d0.x0_0_0_0_write_en;
+      comp_conv2d0.x0_0_0_0_done = x.done;
+      comp_conv2d0.go = 1'd1;
+      run_conv2d0[done] = comp_conv2d0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_conv2d;
+      run_conv2d0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/dense.expect b/frontends/relay-futil/tests/dense.expect
index a0d5ead2b1..9ca0f57adb 100644
--- a/frontends/relay-futil/tests/dense.expect
+++ b/frontends/relay-futil/tests/dense.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component dense(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1, y0_0_read_data: 32, y0_0_done: 1) -> (done: 1, x0_0_addr0: 1, x0_0_addr1: 13, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 4, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1, y0_0_addr0: 4, y0_0_addr1: 13, y0_0_write_data: 32, y0_0_write_en: 1, y0_0_clk: 1) {
+component dense0(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1, y0_0_read_data: 32, y0_0_done: 1) -> (done: 1, x0_0_addr0: 1, x0_0_addr1: 13, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 4, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1, y0_0_addr0: 4, y0_0_addr1: 13, y0_0_write_data: 32, y0_0_write_en: 1, y0_0_clk: 1) {
   cells {
     add0 = prim std_add(13);
     add1 = prim std_add(4);
@@ -307,28 +307,28 @@ component main () -> () {
     x1 = prim std_mem_d2(32, 1, 10, 1, 4);
     x = prim std_mem_d2(32, 1, 4096, 1, 13);
     y = prim std_mem_d2(32, 10, 4096, 4, 13);
-    dense0 = dense;
+    comp_dense0 = dense0;
   }
   wires {
-    group run_dense {
-      x.addr0 = dense0.x0_0_addr0;
-      dense0.x0_0_read_data = x.read_data;
-      x.addr1 = dense0.x0_0_addr1;
-      y.addr0 = dense0.y0_0_addr0;
-      dense0.y0_0_read_data = y.read_data;
-      y.addr1 = dense0.y0_0_addr1;
-      x1.addr0 = dense0.x10_0_addr0;
-      x1.addr1 = dense0.x10_0_addr1;
-      x1.write_data = dense0.x10_0_write_data;
-      x1.write_en = dense0.x10_0_write_en;
-      dense0.x10_0_done = x1.done;
-      dense0.go = 1'd1;
-      run_dense[done] = dense0.done ? 1'd1;
+    group run_dense0 {
+      x.addr0 = comp_dense0.x0_0_addr0;
+      comp_dense0.x0_0_read_data = x.read_data;
+      x.addr1 = comp_dense0.x0_0_addr1;
+      y.addr0 = comp_dense0.y0_0_addr0;
+      comp_dense0.y0_0_read_data = y.read_data;
+      y.addr1 = comp_dense0.y0_0_addr1;
+      x1.addr0 = comp_dense0.x10_0_addr0;
+      x1.addr1 = comp_dense0.x10_0_addr1;
+      x1.write_data = comp_dense0.x10_0_write_data;
+      x1.write_en = comp_dense0.x10_0_write_en;
+      comp_dense0.x10_0_done = x1.done;
+      comp_dense0.go = 1'd1;
+      run_dense0[done] = comp_dense0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_dense;
+      run_dense0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/fixed_point_add.expect b/frontends/relay-futil/tests/fixed_point_add.expect
index aa8240b4cf..9c4910177e 100644
--- a/frontends/relay-futil/tests/fixed_point_add.expect
+++ b/frontends/relay-futil/tests/fixed_point_add.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component add(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, y0_read_data: 32, y0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, x0_addr0: 1, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, y0_addr0: 1, y0_write_data: 32, y0_write_en: 1, y0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
+component add0(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, y0_read_data: 32, y0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, x0_addr0: 1, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, y0_addr0: 1, y0_write_data: 32, y0_write_en: 1, y0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
   cells {
     add0 = prim fixed_p_std_add(32, 16, 16);
     add1 = prim std_add(1);
@@ -74,25 +74,25 @@ component main () -> () {
     z = prim std_mem_d1(32, 1, 1);
     x = prim std_mem_d1(32, 1, 1);
     y = prim std_mem_d1(32, 1, 1);
-    add0 = add;
+    comp_add0 = add0;
   }
   wires {
-    group run_add {
-      x.addr0 = add0.x0_addr0;
-      add0.x0_read_data = x.read_data;
-      y.addr0 = add0.y0_addr0;
-      add0.y0_read_data = y.read_data;
-      z.addr0 = add0.z0_addr0;
-      z.write_data = add0.z0_write_data;
-      z.write_en = add0.z0_write_en;
-      add0.z0_done = z.done;
-      add0.go = 1'd1;
-      run_add[done] = add0.done ? 1'd1;
+    group run_add0 {
+      x.addr0 = comp_add0.x0_addr0;
+      comp_add0.x0_read_data = x.read_data;
+      y.addr0 = comp_add0.y0_addr0;
+      comp_add0.y0_read_data = y.read_data;
+      z.addr0 = comp_add0.z0_addr0;
+      z.write_data = comp_add0.z0_write_data;
+      z.write_en = comp_add0.z0_write_en;
+      comp_add0.z0_done = z.done;
+      comp_add0.go = 1'd1;
+      run_add0[done] = comp_add0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_add;
+      run_add0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/let1.expect b/frontends/relay-futil/tests/let1.expect
index cf228003ae..e59cbebcd9 100644
--- a/frontends/relay-futil/tests/let1.expect
+++ b/frontends/relay-futil/tests/let1.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component multiply(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, b0_read_data: 32, b0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, b0_addr0: 1, b0_write_data: 32, b0_write_en: 1, b0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
+component multiply0(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, b0_read_data: 32, b0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, b0_addr0: 1, b0_write_data: 32, b0_write_en: 1, b0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
   cells {
     a_read0_0 = prim std_reg(32);
     add0 = prim std_add(1);
@@ -82,25 +82,25 @@ component main () -> () {
     z = prim std_mem_d1(32, 1, 1);
     a = prim std_mem_d1(32, 1, 1);
     b = prim std_mem_d1(32, 1, 1);
-    multiply0 = multiply;
+    comp_multiply0 = multiply0;
   }
   wires {
-    group run_multiply {
-      a.addr0 = multiply0.a0_addr0;
-      multiply0.a0_read_data = a.read_data;
-      b.addr0 = multiply0.b0_addr0;
-      multiply0.b0_read_data = b.read_data;
-      z.addr0 = multiply0.z0_addr0;
-      z.write_data = multiply0.z0_write_data;
-      z.write_en = multiply0.z0_write_en;
-      multiply0.z0_done = z.done;
-      multiply0.go = 1'd1;
-      run_multiply[done] = multiply0.done ? 1'd1;
+    group run_multiply0 {
+      a.addr0 = comp_multiply0.a0_addr0;
+      comp_multiply0.a0_read_data = a.read_data;
+      b.addr0 = comp_multiply0.b0_addr0;
+      comp_multiply0.b0_read_data = b.read_data;
+      z.addr0 = comp_multiply0.z0_addr0;
+      z.write_data = comp_multiply0.z0_write_data;
+      z.write_en = comp_multiply0.z0_write_en;
+      comp_multiply0.z0_done = z.done;
+      comp_multiply0.go = 1'd1;
+      run_multiply0[done] = comp_multiply0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_multiply;
+      run_multiply0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/let2.expect b/frontends/relay-futil/tests/let2.expect
index c4b8afc3cb..451a17e8df 100644
--- a/frontends/relay-futil/tests/let2.expect
+++ b/frontends/relay-futil/tests/let2.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component add(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32, d0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1, d0_addr0: 1, d0_write_data: 32, d0_write_en: 1, d0_clk: 1) {
+component add0(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32, d0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1, d0_addr0: 1, d0_write_data: 32, d0_write_en: 1, d0_clk: 1) {
   cells {
     a_read0_0 = prim std_reg(32);
     add0 = prim std_add(32);
@@ -69,7 +69,7 @@ component add(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32, c0_
   }
 }
 
-component multiply(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, b0_read_data: 32, b0_done: 1, c0_read_data: 32, c0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, b0_addr0: 1, b0_write_data: 32, b0_write_en: 1, b0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1) {
+component multiply0(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, b0_read_data: 32, b0_done: 1, c0_read_data: 32, c0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, b0_addr0: 1, b0_write_data: 32, b0_write_en: 1, b0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1) {
   cells {
     a_read0_0 = prim std_reg(32);
     add0 = prim std_add(1);
@@ -151,40 +151,40 @@ component main () -> () {
     d = prim std_mem_d1(32, 1, 1);
     c = prim std_mem_d1(32, 1, 1);
     a = prim std_mem_d1(32, 1, 1);
-    add0 = add;
+    comp_add0 = add0;
     b = prim std_mem_d1(32, 1, 1);
-    multiply0 = multiply;
+    comp_multiply0 = multiply0;
   }
   wires {
-    group run_multiply {
-      a.addr0 = multiply0.a0_addr0;
-      multiply0.a0_read_data = a.read_data;
-      b.addr0 = multiply0.b0_addr0;
-      multiply0.b0_read_data = b.read_data;
-      c.addr0 = multiply0.c0_addr0;
-      c.write_data = multiply0.c0_write_data;
-      c.write_en = multiply0.c0_write_en;
-      multiply0.c0_done = c.done;
-      multiply0.go = 1'd1;
-      run_multiply[done] = multiply0.done ? 1'd1;
-    }
-    group run_add {
-      c.addr0 = add0.c0_addr0;
-      add0.c0_read_data = c.read_data;
-      a.addr0 = add0.a0_addr0;
-      add0.a0_read_data = a.read_data;
-      d.addr0 = add0.d0_addr0;
-      d.write_data = add0.d0_write_data;
-      d.write_en = add0.d0_write_en;
-      add0.d0_done = d.done;
-      add0.go = 1'd1;
-      run_add[done] = add0.done ? 1'd1;
+    group run_multiply0 {
+      a.addr0 = comp_multiply0.a0_addr0;
+      comp_multiply0.a0_read_data = a.read_data;
+      b.addr0 = comp_multiply0.b0_addr0;
+      comp_multiply0.b0_read_data = b.read_data;
+      c.addr0 = comp_multiply0.c0_addr0;
+      c.write_data = comp_multiply0.c0_write_data;
+      c.write_en = comp_multiply0.c0_write_en;
+      comp_multiply0.c0_done = c.done;
+      comp_multiply0.go = 1'd1;
+      run_multiply0[done] = comp_multiply0.done ? 1'd1;
+    }
+    group run_add0 {
+      c.addr0 = comp_add0.c0_addr0;
+      comp_add0.c0_read_data = c.read_data;
+      a.addr0 = comp_add0.a0_addr0;
+      comp_add0.a0_read_data = a.read_data;
+      d.addr0 = comp_add0.d0_addr0;
+      d.write_data = comp_add0.d0_write_data;
+      d.write_en = comp_add0.d0_write_en;
+      comp_add0.d0_done = d.done;
+      comp_add0.go = 1'd1;
+      run_add0[done] = comp_add0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_multiply;
-      run_add;
+      run_multiply0;
+      run_add0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/let3.expect b/frontends/relay-futil/tests/let3.expect
index cbb0783fa8..222268b304 100644
--- a/frontends/relay-futil/tests/let3.expect
+++ b/frontends/relay-futil/tests/let3.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component multiply(go: 1, clk: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32, d0_done: 1, e0_read_data: 32, e0_done: 1) -> (done: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1, d0_addr0: 1, d0_write_data: 32, d0_write_en: 1, d0_clk: 1, e0_addr0: 1, e0_write_data: 32, e0_write_en: 1, e0_clk: 1) {
+component multiply0(go: 1, clk: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32, d0_done: 1, e0_read_data: 32, e0_done: 1) -> (done: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1, d0_addr0: 1, d0_write_data: 32, d0_write_en: 1, d0_clk: 1, e0_addr0: 1, e0_write_data: 32, e0_write_en: 1, e0_clk: 1) {
   cells {
     add0 = prim std_add(1);
     bin_read0_0 = prim std_reg(32);
@@ -77,18 +77,17 @@ component multiply(go: 1, clk: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32
   }
 }
 
-component divide(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32, d0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1, d0_addr0: 1, d0_write_data: 32, d0_write_en: 1, d0_clk: 1) {
+component subtract1(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32, d0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1, d0_addr0: 1, d0_write_data: 32, d0_write_en: 1, d0_clk: 1) {
   cells {
     a_read0_0 = prim std_reg(32);
     add0 = prim std_add(1);
-    bin_read0_0 = prim std_reg(32);
     c_read0_0 = prim std_reg(32);
     const0 = prim std_const(1, 0);
     const1 = prim std_const(1, 0);
     const2 = prim std_const(1, 1);
-    div_pipe0 = prim std_div_pipe(32);
     i0 = prim std_reg(1);
     le0 = prim std_le(1);
+    sub0 = prim std_sub(32);
   }
   wires {
     group cond0<"static"=0> {
@@ -101,14 +100,6 @@ component divide(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32,
       i0.write_en = 1'd1;
       let0[done] = i0.done;
     }
-    group let1 {
-      bin_read0_0.in = div_pipe0.out;
-      bin_read0_0.write_en = div_pipe0.done;
-      let1[done] = bin_read0_0.done;
-      div_pipe0.left = c_read0_0.out;
-      div_pipe0.right = a_read0_0.out;
-      div_pipe0.go = !div_pipe0.done ? 1'd1;
-    }
     group upd0<"static"=1> {
       c_read0_0.write_en = 1'd1;
       c0_addr0 = i0.out;
@@ -124,7 +115,9 @@ component divide(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32,
     group upd2<"static"=1> {
       d0_addr0 = i0.out;
       d0_write_en = 1'd1;
-      d0_write_data = 1'd1 ? bin_read0_0.out;
+      sub0.left = c_read0_0.out;
+      sub0.right = a_read0_0.out;
+      d0_write_data = 1'd1 ? sub0.out;
       upd2[done] = d0_done ? 1'd1;
     }
     group upd3<"static"=1> {
@@ -145,7 +138,6 @@ component divide(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32,
             upd0;
             upd1;
           }
-          let1;
           upd2;
           upd3;
         }
@@ -154,7 +146,7 @@ component divide(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32,
   }
 }
 
-component subtract(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, b0_read_data: 32, b0_done: 1, c0_read_data: 32, c0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, b0_addr0: 1, b0_write_data: 32, b0_write_en: 1, b0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1) {
+component subtract0(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, b0_read_data: 32, b0_done: 1, c0_read_data: 32, c0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, b0_addr0: 1, b0_write_data: 32, b0_write_en: 1, b0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1) {
   cells {
     a_read0_0 = prim std_reg(32);
     add0 = prim std_add(1);
@@ -228,55 +220,55 @@ component main () -> () {
     e = prim std_mem_d1(32, 1, 1);
     c = prim std_mem_d1(32, 1, 1);
     d = prim std_mem_d1(32, 1, 1);
-    multiply0 = multiply;
+    comp_multiply0 = multiply0;
     a = prim std_mem_d1(32, 1, 1);
-    divide0 = divide;
+    comp_subtract1 = subtract1;
     b = prim std_mem_d1(32, 1, 1);
-    subtract0 = subtract;
+    comp_subtract0 = subtract0;
   }
   wires {
-    group run_subtract {
-      a.addr0 = subtract0.a0_addr0;
-      subtract0.a0_read_data = a.read_data;
-      b.addr0 = subtract0.b0_addr0;
-      subtract0.b0_read_data = b.read_data;
-      c.addr0 = subtract0.c0_addr0;
-      c.write_data = subtract0.c0_write_data;
-      c.write_en = subtract0.c0_write_en;
-      subtract0.c0_done = c.done;
-      subtract0.go = 1'd1;
-      run_subtract[done] = subtract0.done ? 1'd1;
+    group run_subtract0 {
+      a.addr0 = comp_subtract0.a0_addr0;
+      comp_subtract0.a0_read_data = a.read_data;
+      b.addr0 = comp_subtract0.b0_addr0;
+      comp_subtract0.b0_read_data = b.read_data;
+      c.addr0 = comp_subtract0.c0_addr0;
+      c.write_data = comp_subtract0.c0_write_data;
+      c.write_en = comp_subtract0.c0_write_en;
+      comp_subtract0.c0_done = c.done;
+      comp_subtract0.go = 1'd1;
+      run_subtract0[done] = comp_subtract0.done ? 1'd1;
     }
-    group run_divide {
-      c.addr0 = divide0.c0_addr0;
-      divide0.c0_read_data = c.read_data;
-      a.addr0 = divide0.a0_addr0;
-      divide0.a0_read_data = a.read_data;
-      d.addr0 = divide0.d0_addr0;
-      d.write_data = divide0.d0_write_data;
-      d.write_en = divide0.d0_write_en;
-      divide0.d0_done = d.done;
-      divide0.go = 1'd1;
-      run_divide[done] = divide0.done ? 1'd1;
+    group run_subtract1 {
+      c.addr0 = comp_subtract1.c0_addr0;
+      comp_subtract1.c0_read_data = c.read_data;
+      a.addr0 = comp_subtract1.a0_addr0;
+      comp_subtract1.a0_read_data = a.read_data;
+      d.addr0 = comp_subtract1.d0_addr0;
+      d.write_data = comp_subtract1.d0_write_data;
+      d.write_en = comp_subtract1.d0_write_en;
+      comp_subtract1.d0_done = d.done;
+      comp_subtract1.go = 1'd1;
+      run_subtract1[done] = comp_subtract1.done ? 1'd1;
     }
-    group run_multiply {
-      c.addr0 = multiply0.c0_addr0;
-      multiply0.c0_read_data = c.read_data;
-      d.addr0 = multiply0.d0_addr0;
-      multiply0.d0_read_data = d.read_data;
-      e.addr0 = multiply0.e0_addr0;
-      e.write_data = multiply0.e0_write_data;
-      e.write_en = multiply0.e0_write_en;
-      multiply0.e0_done = e.done;
-      multiply0.go = 1'd1;
-      run_multiply[done] = multiply0.done ? 1'd1;
+    group run_multiply0 {
+      c.addr0 = comp_multiply0.c0_addr0;
+      comp_multiply0.c0_read_data = c.read_data;
+      d.addr0 = comp_multiply0.d0_addr0;
+      comp_multiply0.d0_read_data = d.read_data;
+      e.addr0 = comp_multiply0.e0_addr0;
+      e.write_data = comp_multiply0.e0_write_data;
+      e.write_en = comp_multiply0.e0_write_en;
+      comp_multiply0.e0_done = e.done;
+      comp_multiply0.go = 1'd1;
+      run_multiply0[done] = comp_multiply0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_subtract;
-      run_divide;
-      run_multiply;
+      run_subtract0;
+      run_subtract1;
+      run_multiply0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/let3.relay b/frontends/relay-futil/tests/let3.relay
index 50aa9a8064..725e75ab94 100644
--- a/frontends/relay-futil/tests/let3.relay
+++ b/frontends/relay-futil/tests/let3.relay
@@ -1,7 +1,7 @@
 v0.0.4
 fn (%a: int32, %b: int32) {
   let %c = subtract(%a, %b);
-  let %d = divide(%c, %a);
+  let %d = subtract(%c, %a);
   let %e = multiply(%c, %d);
   %e
 }
diff --git a/frontends/relay-futil/tests/max_pool2d.expect b/frontends/relay-futil/tests/max_pool2d.expect
index 47decb83ba..ee3eb04a8b 100644
--- a/frontends/relay-futil/tests/max_pool2d.expect
+++ b/frontends/relay-futil/tests/max_pool2d.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component max_pool2d(go: 1, clk: 1, data0_0_0_0_read_data: 32, data0_0_0_0_done: 1, result0_0_0_0_read_data: 32, result0_0_0_0_done: 1) -> (done: 1, data0_0_0_0_addr0: 2, data0_0_0_0_addr1: 2, data0_0_0_0_addr2: 3, data0_0_0_0_addr3: 3, data0_0_0_0_write_data: 32, data0_0_0_0_write_en: 1, data0_0_0_0_clk: 1, result0_0_0_0_addr0: 2, result0_0_0_0_addr1: 2, result0_0_0_0_addr2: 2, result0_0_0_0_addr3: 2, result0_0_0_0_write_data: 32, result0_0_0_0_write_en: 1, result0_0_0_0_clk: 1) {
+component max_pool2d0(go: 1, clk: 1, data0_0_0_0_read_data: 32, data0_0_0_0_done: 1, result0_0_0_0_read_data: 32, result0_0_0_0_done: 1) -> (done: 1, data0_0_0_0_addr0: 2, data0_0_0_0_addr1: 2, data0_0_0_0_addr2: 3, data0_0_0_0_addr3: 3, data0_0_0_0_write_data: 32, data0_0_0_0_write_en: 1, data0_0_0_0_clk: 1, result0_0_0_0_addr0: 2, result0_0_0_0_addr1: 2, result0_0_0_0_addr2: 2, result0_0_0_0_addr3: 2, result0_0_0_0_write_data: 32, result0_0_0_0_write_en: 1, result0_0_0_0_clk: 1) {
   cells {
     add0 = prim std_add(32);
     add1 = prim std_add(32);
@@ -326,29 +326,29 @@ component main () -> () {
   cells {
     result = prim std_mem_d4(32, 2, 2, 2, 2, 2, 2, 2, 2);
     data = prim std_mem_d4(32, 2, 2, 4, 4, 2, 2, 3, 3);
-    max_pool2d0 = max_pool2d;
+    comp_max_pool2d0 = max_pool2d0;
   }
   wires {
-    group run_max_pool2d {
-      data.addr0 = max_pool2d0.data0_0_0_0_addr0;
-      max_pool2d0.data0_0_0_0_read_data = data.read_data;
-      data.addr1 = max_pool2d0.data0_0_0_0_addr1;
-      data.addr2 = max_pool2d0.data0_0_0_0_addr2;
-      data.addr3 = max_pool2d0.data0_0_0_0_addr3;
-      result.addr0 = max_pool2d0.result0_0_0_0_addr0;
-      result.addr1 = max_pool2d0.result0_0_0_0_addr1;
-      result.addr2 = max_pool2d0.result0_0_0_0_addr2;
-      result.addr3 = max_pool2d0.result0_0_0_0_addr3;
-      result.write_data = max_pool2d0.result0_0_0_0_write_data;
-      result.write_en = max_pool2d0.result0_0_0_0_write_en;
-      max_pool2d0.result0_0_0_0_done = result.done;
-      max_pool2d0.go = 1'd1;
-      run_max_pool2d[done] = max_pool2d0.done ? 1'd1;
+    group run_max_pool2d0 {
+      data.addr0 = comp_max_pool2d0.data0_0_0_0_addr0;
+      comp_max_pool2d0.data0_0_0_0_read_data = data.read_data;
+      data.addr1 = comp_max_pool2d0.data0_0_0_0_addr1;
+      data.addr2 = comp_max_pool2d0.data0_0_0_0_addr2;
+      data.addr3 = comp_max_pool2d0.data0_0_0_0_addr3;
+      result.addr0 = comp_max_pool2d0.result0_0_0_0_addr0;
+      result.addr1 = comp_max_pool2d0.result0_0_0_0_addr1;
+      result.addr2 = comp_max_pool2d0.result0_0_0_0_addr2;
+      result.addr3 = comp_max_pool2d0.result0_0_0_0_addr3;
+      result.write_data = comp_max_pool2d0.result0_0_0_0_write_data;
+      result.write_en = comp_max_pool2d0.result0_0_0_0_write_en;
+      comp_max_pool2d0.result0_0_0_0_done = result.done;
+      comp_max_pool2d0.go = 1'd1;
+      run_max_pool2d0[done] = comp_max_pool2d0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_max_pool2d;
+      run_max_pool2d0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/relu.expect b/frontends/relay-futil/tests/relu.expect
index 7a65c37f5a..66f3dd53e8 100644
--- a/frontends/relay-futil/tests/relu.expect
+++ b/frontends/relay-futil/tests/relu.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component relu(go: 1, clk: 1, x0_0_0_0_read_data: 32, x0_0_0_0_done: 1, x10_0_0_0_read_data: 32, x10_0_0_0_done: 1) -> (done: 1, x0_0_0_0_addr0: 2, x0_0_0_0_addr1: 3, x0_0_0_0_addr2: 4, x0_0_0_0_addr3: 6, x0_0_0_0_write_data: 32, x0_0_0_0_write_en: 1, x0_0_0_0_clk: 1, x10_0_0_0_addr0: 2, x10_0_0_0_addr1: 3, x10_0_0_0_addr2: 4, x10_0_0_0_addr3: 6, x10_0_0_0_write_data: 32, x10_0_0_0_write_en: 1, x10_0_0_0_clk: 1) {
+component relu0(go: 1, clk: 1, x0_0_0_0_read_data: 32, x0_0_0_0_done: 1, x10_0_0_0_read_data: 32, x10_0_0_0_done: 1) -> (done: 1, x0_0_0_0_addr0: 2, x0_0_0_0_addr1: 3, x0_0_0_0_addr2: 4, x0_0_0_0_addr3: 6, x0_0_0_0_write_data: 32, x0_0_0_0_write_en: 1, x0_0_0_0_clk: 1, x10_0_0_0_addr0: 2, x10_0_0_0_addr1: 3, x10_0_0_0_addr2: 4, x10_0_0_0_addr3: 6, x10_0_0_0_write_data: 32, x10_0_0_0_write_en: 1, x10_0_0_0_clk: 1) {
   cells {
     add0 = prim std_add(6);
     add1 = prim std_add(4);
@@ -193,29 +193,29 @@ component main () -> () {
   cells {
     x1 = prim std_mem_d4(32, 2, 4, 8, 32, 2, 3, 4, 6);
     x = prim std_mem_d4(32, 2, 4, 8, 32, 2, 3, 4, 6);
-    relu0 = relu;
+    comp_relu0 = relu0;
   }
   wires {
-    group run_relu {
-      x.addr0 = relu0.x0_0_0_0_addr0;
-      relu0.x0_0_0_0_read_data = x.read_data;
-      x.addr1 = relu0.x0_0_0_0_addr1;
-      x.addr2 = relu0.x0_0_0_0_addr2;
-      x.addr3 = relu0.x0_0_0_0_addr3;
-      x1.addr0 = relu0.x10_0_0_0_addr0;
-      x1.addr1 = relu0.x10_0_0_0_addr1;
-      x1.addr2 = relu0.x10_0_0_0_addr2;
-      x1.addr3 = relu0.x10_0_0_0_addr3;
-      x1.write_data = relu0.x10_0_0_0_write_data;
-      x1.write_en = relu0.x10_0_0_0_write_en;
-      relu0.x10_0_0_0_done = x1.done;
-      relu0.go = 1'd1;
-      run_relu[done] = relu0.done ? 1'd1;
+    group run_relu0 {
+      x.addr0 = comp_relu0.x0_0_0_0_addr0;
+      comp_relu0.x0_0_0_0_read_data = x.read_data;
+      x.addr1 = comp_relu0.x0_0_0_0_addr1;
+      x.addr2 = comp_relu0.x0_0_0_0_addr2;
+      x.addr3 = comp_relu0.x0_0_0_0_addr3;
+      x1.addr0 = comp_relu0.x10_0_0_0_addr0;
+      x1.addr1 = comp_relu0.x10_0_0_0_addr1;
+      x1.addr2 = comp_relu0.x10_0_0_0_addr2;
+      x1.addr3 = comp_relu0.x10_0_0_0_addr3;
+      x1.write_data = comp_relu0.x10_0_0_0_write_data;
+      x1.write_en = comp_relu0.x10_0_0_0_write_en;
+      comp_relu0.x10_0_0_0_done = x1.done;
+      comp_relu0.go = 1'd1;
+      run_relu0[done] = comp_relu0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_relu;
+      run_relu0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/softmax.expect b/frontends/relay-futil/tests/softmax.expect
index 92246625a7..128b2ebc0d 100644
--- a/frontends/relay-futil/tests/softmax.expect
+++ b/frontends/relay-futil/tests/softmax.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component softmax(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1) -> (done: 1, x0_0_addr0: 1, x0_0_addr1: 4, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 4, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1) {
+component softmax0(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1) -> (done: 1, x0_0_addr0: 1, x0_0_addr1: 4, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 4, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1) {
   cells {
     add0 = prim fixed_p_std_add(32, 16, 16);
     add1 = prim std_add(4);
@@ -186,25 +186,25 @@ component main () -> () {
   cells {
     x1 = prim std_mem_d2(32, 1, 10, 1, 4);
     x = prim std_mem_d2(32, 1, 10, 1, 4);
-    softmax0 = softmax;
+    comp_softmax0 = softmax0;
   }
   wires {
-    group run_softmax {
-      x.addr0 = softmax0.x0_0_addr0;
-      softmax0.x0_0_read_data = x.read_data;
-      x.addr1 = softmax0.x0_0_addr1;
-      x1.addr0 = softmax0.x10_0_addr0;
-      x1.addr1 = softmax0.x10_0_addr1;
-      x1.write_data = softmax0.x10_0_write_data;
-      x1.write_en = softmax0.x10_0_write_en;
-      softmax0.x10_0_done = x1.done;
-      softmax0.go = 1'd1;
-      run_softmax[done] = softmax0.done ? 1'd1;
+    group run_softmax0 {
+      x.addr0 = comp_softmax0.x0_0_addr0;
+      comp_softmax0.x0_0_read_data = x.read_data;
+      x.addr1 = comp_softmax0.x0_0_addr1;
+      x1.addr0 = comp_softmax0.x10_0_addr0;
+      x1.addr1 = comp_softmax0.x10_0_addr1;
+      x1.write_data = comp_softmax0.x10_0_write_data;
+      x1.write_en = comp_softmax0.x10_0_write_en;
+      comp_softmax0.x10_0_done = x1.done;
+      comp_softmax0.go = 1'd1;
+      run_softmax0[done] = comp_softmax0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_softmax;
+      run_softmax0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/sqrt.expect b/frontends/relay-futil/tests/sqrt.expect
index 2963943f4f..ab67351192 100644
--- a/frontends/relay-futil/tests/sqrt.expect
+++ b/frontends/relay-futil/tests/sqrt.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component sqrt(go: 1, clk: 1, x0_0_0_0_read_data: 32, x0_0_0_0_done: 1, x10_0_0_0_read_data: 32, x10_0_0_0_done: 1) -> (done: 1, x0_0_0_0_addr0: 4, x0_0_0_0_addr1: 4, x0_0_0_0_addr2: 5, x0_0_0_0_addr3: 7, x0_0_0_0_write_data: 32, x0_0_0_0_write_en: 1, x0_0_0_0_clk: 1, x10_0_0_0_addr0: 4, x10_0_0_0_addr1: 4, x10_0_0_0_addr2: 5, x10_0_0_0_addr3: 7, x10_0_0_0_write_data: 32, x10_0_0_0_write_en: 1, x10_0_0_0_clk: 1) {
+component sqrt0(go: 1, clk: 1, x0_0_0_0_read_data: 32, x0_0_0_0_done: 1, x10_0_0_0_read_data: 32, x10_0_0_0_done: 1) -> (done: 1, x0_0_0_0_addr0: 4, x0_0_0_0_addr1: 4, x0_0_0_0_addr2: 5, x0_0_0_0_addr3: 7, x0_0_0_0_write_data: 32, x0_0_0_0_write_en: 1, x0_0_0_0_clk: 1, x10_0_0_0_addr0: 4, x10_0_0_0_addr1: 4, x10_0_0_0_addr2: 5, x10_0_0_0_addr3: 7, x10_0_0_0_write_data: 32, x10_0_0_0_write_en: 1, x10_0_0_0_clk: 1) {
   cells {
     add0 = prim std_add(7);
     add1 = prim std_add(5);
@@ -156,29 +156,29 @@ component main () -> () {
   cells {
     x1 = prim std_mem_d4(32, 8, 8, 16, 64, 4, 4, 5, 7);
     x = prim std_mem_d4(32, 8, 8, 16, 64, 4, 4, 5, 7);
-    sqrt0 = sqrt;
+    comp_sqrt0 = sqrt0;
   }
   wires {
-    group run_sqrt {
-      x.addr0 = sqrt0.x0_0_0_0_addr0;
-      sqrt0.x0_0_0_0_read_data = x.read_data;
-      x.addr1 = sqrt0.x0_0_0_0_addr1;
-      x.addr2 = sqrt0.x0_0_0_0_addr2;
-      x.addr3 = sqrt0.x0_0_0_0_addr3;
-      x1.addr0 = sqrt0.x10_0_0_0_addr0;
-      x1.addr1 = sqrt0.x10_0_0_0_addr1;
-      x1.addr2 = sqrt0.x10_0_0_0_addr2;
-      x1.addr3 = sqrt0.x10_0_0_0_addr3;
-      x1.write_data = sqrt0.x10_0_0_0_write_data;
-      x1.write_en = sqrt0.x10_0_0_0_write_en;
-      sqrt0.x10_0_0_0_done = x1.done;
-      sqrt0.go = 1'd1;
-      run_sqrt[done] = sqrt0.done ? 1'd1;
+    group run_sqrt0 {
+      x.addr0 = comp_sqrt0.x0_0_0_0_addr0;
+      comp_sqrt0.x0_0_0_0_read_data = x.read_data;
+      x.addr1 = comp_sqrt0.x0_0_0_0_addr1;
+      x.addr2 = comp_sqrt0.x0_0_0_0_addr2;
+      x.addr3 = comp_sqrt0.x0_0_0_0_addr3;
+      x1.addr0 = comp_sqrt0.x10_0_0_0_addr0;
+      x1.addr1 = comp_sqrt0.x10_0_0_0_addr1;
+      x1.addr2 = comp_sqrt0.x10_0_0_0_addr2;
+      x1.addr3 = comp_sqrt0.x10_0_0_0_addr3;
+      x1.write_data = comp_sqrt0.x10_0_0_0_write_data;
+      x1.write_en = comp_sqrt0.x10_0_0_0_write_en;
+      comp_sqrt0.x10_0_0_0_done = x1.done;
+      comp_sqrt0.go = 1'd1;
+      run_sqrt0[done] = comp_sqrt0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_sqrt;
+      run_sqrt0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/sub.expect b/frontends/relay-futil/tests/sub.expect
index c74af4fb2e..7b3f88385d 100644
--- a/frontends/relay-futil/tests/sub.expect
+++ b/frontends/relay-futil/tests/sub.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component subtract(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, y0_read_data: 32, y0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, x0_addr0: 1, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, y0_addr0: 1, y0_write_data: 32, y0_write_en: 1, y0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
+component subtract0(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, y0_read_data: 32, y0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, x0_addr0: 1, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, y0_addr0: 1, y0_write_data: 32, y0_write_en: 1, y0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
   cells {
     add0 = prim std_add(1);
     const0 = prim std_const(1, 0);
@@ -74,25 +74,25 @@ component main () -> () {
     z = prim std_mem_d1(32, 1, 1);
     x = prim std_mem_d1(32, 1, 1);
     y = prim std_mem_d1(32, 1, 1);
-    subtract0 = subtract;
+    comp_subtract0 = subtract0;
   }
   wires {
-    group run_subtract {
-      x.addr0 = subtract0.x0_addr0;
-      subtract0.x0_read_data = x.read_data;
-      y.addr0 = subtract0.y0_addr0;
-      subtract0.y0_read_data = y.read_data;
-      z.addr0 = subtract0.z0_addr0;
-      z.write_data = subtract0.z0_write_data;
-      z.write_en = subtract0.z0_write_en;
-      subtract0.z0_done = z.done;
-      subtract0.go = 1'd1;
-      run_subtract[done] = subtract0.done ? 1'd1;
+    group run_subtract0 {
+      x.addr0 = comp_subtract0.x0_addr0;
+      comp_subtract0.x0_read_data = x.read_data;
+      y.addr0 = comp_subtract0.y0_addr0;
+      comp_subtract0.y0_read_data = y.read_data;
+      z.addr0 = comp_subtract0.z0_addr0;
+      z.write_data = comp_subtract0.z0_write_data;
+      z.write_en = comp_subtract0.z0_write_en;
+      comp_subtract0.z0_done = z.done;
+      comp_subtract0.go = 1'd1;
+      run_subtract0[done] = comp_subtract0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_subtract;
+      run_subtract0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/tensor1d_mult.expect b/frontends/relay-futil/tests/tensor1d_mult.expect
index dac0e76d85..a2b7d5d802 100644
--- a/frontends/relay-futil/tests/tensor1d_mult.expect
+++ b/frontends/relay-futil/tests/tensor1d_mult.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component multiply(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, x10_read_data: 32, x10_done: 1, y0_read_data: 32, y0_done: 1) -> (done: 1, x0_addr0: 3, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, x10_addr0: 3, x10_write_data: 32, x10_write_en: 1, x10_clk: 1, y0_addr0: 3, y0_write_data: 32, y0_write_en: 1, y0_clk: 1) {
+component multiply0(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, x10_read_data: 32, x10_done: 1, y0_read_data: 32, y0_done: 1) -> (done: 1, x0_addr0: 3, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, x10_addr0: 3, x10_write_data: 32, x10_write_en: 1, x10_clk: 1, y0_addr0: 3, y0_write_data: 32, y0_write_en: 1, y0_clk: 1) {
   cells {
     add0 = prim std_add(3);
     bin_read0_0 = prim std_reg(32);
@@ -82,25 +82,25 @@ component main () -> () {
     x1 = prim std_mem_d1(32, 4, 3);
     x = prim std_mem_d1(32, 4, 3);
     y = prim std_mem_d1(32, 4, 3);
-    multiply0 = multiply;
+    comp_multiply0 = multiply0;
   }
   wires {
-    group run_multiply {
-      x.addr0 = multiply0.x0_addr0;
-      multiply0.x0_read_data = x.read_data;
-      y.addr0 = multiply0.y0_addr0;
-      multiply0.y0_read_data = y.read_data;
-      x1.addr0 = multiply0.x10_addr0;
-      x1.write_data = multiply0.x10_write_data;
-      x1.write_en = multiply0.x10_write_en;
-      multiply0.x10_done = x1.done;
-      multiply0.go = 1'd1;
-      run_multiply[done] = multiply0.done ? 1'd1;
+    group run_multiply0 {
+      x.addr0 = comp_multiply0.x0_addr0;
+      comp_multiply0.x0_read_data = x.read_data;
+      y.addr0 = comp_multiply0.y0_addr0;
+      comp_multiply0.y0_read_data = y.read_data;
+      x1.addr0 = comp_multiply0.x10_addr0;
+      x1.write_data = comp_multiply0.x10_write_data;
+      x1.write_en = comp_multiply0.x10_write_en;
+      comp_multiply0.x10_done = x1.done;
+      comp_multiply0.go = 1'd1;
+      run_multiply0[done] = comp_multiply0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_multiply;
+      run_multiply0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/tensor2d_add.expect b/frontends/relay-futil/tests/tensor2d_add.expect
index d289badb27..c6a409b5af 100644
--- a/frontends/relay-futil/tests/tensor2d_add.expect
+++ b/frontends/relay-futil/tests/tensor2d_add.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component add(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1, y0_0_read_data: 32, y0_0_done: 1) -> (done: 1, x0_0_addr0: 2, x0_0_addr1: 3, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 2, x10_0_addr1: 3, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1, y0_0_addr0: 2, y0_0_addr1: 3, y0_0_write_data: 32, y0_0_write_en: 1, y0_0_clk: 1) {
+component add0(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1, y0_0_read_data: 32, y0_0_done: 1) -> (done: 1, x0_0_addr0: 2, x0_0_addr1: 3, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 2, x10_0_addr1: 3, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1, y0_0_addr0: 2, y0_0_addr1: 3, y0_0_write_data: 32, y0_0_write_en: 1, y0_0_clk: 1) {
   cells {
     add0 = prim std_add(32);
     add1 = prim std_add(3);
@@ -106,28 +106,28 @@ component main () -> () {
     x1 = prim std_mem_d2(32, 2, 4, 2, 3);
     x = prim std_mem_d2(32, 2, 4, 2, 3);
     y = prim std_mem_d2(32, 2, 4, 2, 3);
-    add0 = add;
+    comp_add0 = add0;
   }
   wires {
-    group run_add {
-      x.addr0 = add0.x0_0_addr0;
-      add0.x0_0_read_data = x.read_data;
-      x.addr1 = add0.x0_0_addr1;
-      y.addr0 = add0.y0_0_addr0;
-      add0.y0_0_read_data = y.read_data;
-      y.addr1 = add0.y0_0_addr1;
-      x1.addr0 = add0.x10_0_addr0;
-      x1.addr1 = add0.x10_0_addr1;
-      x1.write_data = add0.x10_0_write_data;
-      x1.write_en = add0.x10_0_write_en;
-      add0.x10_0_done = x1.done;
-      add0.go = 1'd1;
-      run_add[done] = add0.done ? 1'd1;
+    group run_add0 {
+      x.addr0 = comp_add0.x0_0_addr0;
+      comp_add0.x0_0_read_data = x.read_data;
+      x.addr1 = comp_add0.x0_0_addr1;
+      y.addr0 = comp_add0.y0_0_addr0;
+      comp_add0.y0_0_read_data = y.read_data;
+      y.addr1 = comp_add0.y0_0_addr1;
+      x1.addr0 = comp_add0.x10_0_addr0;
+      x1.addr1 = comp_add0.x10_0_addr1;
+      x1.write_data = comp_add0.x10_0_write_data;
+      x1.write_en = comp_add0.x10_0_write_en;
+      comp_add0.x10_0_done = x1.done;
+      comp_add0.go = 1'd1;
+      run_add0[done] = comp_add0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_add;
+      run_add0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/tensor3d_divide.expect b/frontends/relay-futil/tests/tensor3d_divide.expect
index a823a0ff96..0476ac946a 100644
--- a/frontends/relay-futil/tests/tensor3d_divide.expect
+++ b/frontends/relay-futil/tests/tensor3d_divide.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component divide(go: 1, clk: 1, x0_0_0_read_data: 32, x0_0_0_done: 1, x10_0_0_read_data: 32, x10_0_0_done: 1, y0_0_0_read_data: 32, y0_0_0_done: 1) -> (done: 1, x0_0_0_addr0: 3, x0_0_0_addr1: 3, x0_0_0_addr2: 3, x0_0_0_write_data: 32, x0_0_0_write_en: 1, x0_0_0_clk: 1, x10_0_0_addr0: 3, x10_0_0_addr1: 3, x10_0_0_addr2: 3, x10_0_0_write_data: 32, x10_0_0_write_en: 1, x10_0_0_clk: 1, y0_0_0_addr0: 3, y0_0_0_addr1: 3, y0_0_0_addr2: 3, y0_0_0_write_data: 32, y0_0_0_write_en: 1, y0_0_0_clk: 1) {
+component divide0(go: 1, clk: 1, x0_0_0_read_data: 32, x0_0_0_done: 1, x10_0_0_read_data: 32, x10_0_0_done: 1, y0_0_0_read_data: 32, y0_0_0_done: 1) -> (done: 1, x0_0_0_addr0: 3, x0_0_0_addr1: 3, x0_0_0_addr2: 3, x0_0_0_write_data: 32, x0_0_0_write_en: 1, x0_0_0_clk: 1, x10_0_0_addr0: 3, x10_0_0_addr1: 3, x10_0_0_addr2: 3, x10_0_0_write_data: 32, x10_0_0_write_en: 1, x10_0_0_clk: 1, y0_0_0_addr0: 3, y0_0_0_addr1: 3, y0_0_0_addr2: 3, y0_0_0_write_data: 32, y0_0_0_write_en: 1, y0_0_0_clk: 1) {
   cells {
     add0 = prim std_add(3);
     add1 = prim std_add(3);
@@ -146,31 +146,31 @@ component main () -> () {
     x1 = prim std_mem_d3(32, 4, 5, 6, 3, 3, 3);
     x = prim std_mem_d3(32, 4, 5, 6, 3, 3, 3);
     y = prim std_mem_d3(32, 4, 5, 6, 3, 3, 3);
-    divide0 = divide;
+    comp_divide0 = divide0;
   }
   wires {
-    group run_divide {
-      x.addr0 = divide0.x0_0_0_addr0;
-      divide0.x0_0_0_read_data = x.read_data;
-      x.addr1 = divide0.x0_0_0_addr1;
-      x.addr2 = divide0.x0_0_0_addr2;
-      y.addr0 = divide0.y0_0_0_addr0;
-      divide0.y0_0_0_read_data = y.read_data;
-      y.addr1 = divide0.y0_0_0_addr1;
-      y.addr2 = divide0.y0_0_0_addr2;
-      x1.addr0 = divide0.x10_0_0_addr0;
-      x1.addr1 = divide0.x10_0_0_addr1;
-      x1.addr2 = divide0.x10_0_0_addr2;
-      x1.write_data = divide0.x10_0_0_write_data;
-      x1.write_en = divide0.x10_0_0_write_en;
-      divide0.x10_0_0_done = x1.done;
-      divide0.go = 1'd1;
-      run_divide[done] = divide0.done ? 1'd1;
+    group run_divide0 {
+      x.addr0 = comp_divide0.x0_0_0_addr0;
+      comp_divide0.x0_0_0_read_data = x.read_data;
+      x.addr1 = comp_divide0.x0_0_0_addr1;
+      x.addr2 = comp_divide0.x0_0_0_addr2;
+      y.addr0 = comp_divide0.y0_0_0_addr0;
+      comp_divide0.y0_0_0_read_data = y.read_data;
+      y.addr1 = comp_divide0.y0_0_0_addr1;
+      y.addr2 = comp_divide0.y0_0_0_addr2;
+      x1.addr0 = comp_divide0.x10_0_0_addr0;
+      x1.addr1 = comp_divide0.x10_0_0_addr1;
+      x1.addr2 = comp_divide0.x10_0_0_addr2;
+      x1.write_data = comp_divide0.x10_0_0_write_data;
+      x1.write_en = comp_divide0.x10_0_0_write_en;
+      comp_divide0.x10_0_0_done = x1.done;
+      comp_divide0.go = 1'd1;
+      run_divide0[done] = comp_divide0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_divide;
+      run_divide0;
     }
   }
 }
diff --git a/frontends/relay-futil/utilities.py b/frontends/relay-futil/utilities.py
index 69cc8c4e0a..58fbb1ebc1 100644
--- a/frontends/relay-futil/utilities.py
+++ b/frontends/relay-futil/utilities.py
@@ -77,12 +77,12 @@ def get_memory_parameters(type):
     return data, NumDimensionsToPrimitive[num_dimensions], data_type
 
 
-def build_main_controls(c: FComponent):
+def build_main_controls(component: FComponent):
     '''
     Builds the wires and control for the `main` component. This is done by creating a group `run_*`
     with its respective wiring for each Relay function call, and adding it to the control.
     '''
-    for cell in reversed(c.cells.values()):
+    for cell in reversed(component.cells.values()):
         if not cell.is_relay_function(): continue
         function = cell.relay_function
         inputs, output = function.inputs, function.output
@@ -115,8 +115,8 @@ def build_main_controls(c: FComponent):
         wires.append(FWire(f'{function.name}.{output.dahlia_name}_done', f'{output_name}.done'))
         wires.append(FWire(f'{function.name}.go', "1'd1"))
         wires.append(FWire(f'{group_name}[done]', f"{function.name}.done ? 1'd1"))
-        c.wires.append(FConnection(group=FGroup(name=group_name, wires=wires, attributes=[])))
+        component.wires.append(FConnection(group=FGroup(name=group_name, wires=wires, attributes=[])))
 
     # Ensures that only group names make it into the controls of a FuTIL component.
-    connections = list(filter(lambda w: w.is_group(), c.wires))
-    c.controls = [Seq(stmts=list(map(lambda w: w.group.name, connections)))]
+    connections = list(filter(lambda w: w.is_group(), component.wires))
+    component.controls = [Seq(stmts=list(map(lambda w: w.group.name, connections)))]

From 06dff4f5794b19d142bd8475d46b780a3264f107 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Thu, 10 Dec 2020 20:11:08 -0500
Subject: [PATCH 73/75] Fix fud externalize stage.

---
 fud/fud/main.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fud/fud/main.py b/fud/fud/main.py
index 6b78b1db62..5bcbb3d7a3 100644
--- a/fud/fud/main.py
+++ b/fud/fud/main.py
@@ -40,10 +40,10 @@ def register_stages(registry, cfg):
             cfg, 'futil-noinline', '-b futil -d hole-inliner',
             'Compile FuTIL to FuTIL to remove all control and inline groups'
         ))
-
     registry.register(
-        futil.FutilStage(config, 'futil-externalize', '-b futil -p externalize',
-                         'Compile FuTIL to FuTIL to externalize all external memory primitives'))
+        futil.FutilStage(cfg, 'futil-externalize', '-b futil -p externalize',
+                         'Compile FuTIL to FuTIL to externalize all external memory primitives'
+        ))
 
     # Verilator
     registry.register(

From f2290cb84b384f19e6e0d1d926c71dbcb1da7c9b Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Sun, 20 Dec 2020 08:06:56 -0500
Subject: [PATCH 74/75] Remove primitive library changes.

---
 primitives/std.lib | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/primitives/std.lib b/primitives/std.lib
index 30648aa7d8..2386742b88 100644
--- a/primitives/std.lib
+++ b/primitives/std.lib
@@ -63,7 +63,7 @@ primitive std_mem_d2[width, d0_size, d1_size, d0_idx_size, d1_idx_size](
         parameter d1_idx_size = 4)
        (input logic [d0_idx_size-1:0] addr0,
         input logic [d1_idx_size-1:0] addr1,
-        input logic [width-1:0]   write_data /*verilator public*/,
+        input logic [width-1:0]   write_data,
         input logic               write_en,
         input logic               clk,
         output logic [width-1:0]  read_data,
@@ -849,7 +849,6 @@ primitive fixed_p_std_const[width, int_width, fract_width, value1, value2] () ->
             parameter value2 = 0)
 
         (output logic [width-1:0] out);
-        /* verilator lint_off WIDTHCONCAT */
         assign out = {value1, value2};
         endmodule
     }

From c3018f5dd2e6218a37ca18d5774aab2775ad8597 Mon Sep 17 00:00:00 2001
From: cgyurgyik <gyurgyikcp@gmail.com>
Date: Sun, 20 Dec 2020 08:10:12 -0500
Subject: [PATCH 75/75] Mark softmax as unimplemented.

---
 frontends/relay-futil/dahlia_lowering.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/frontends/relay-futil/dahlia_lowering.py b/frontends/relay-futil/dahlia_lowering.py
index 3bb822e681..6b83a28784 100644
--- a/frontends/relay-futil/dahlia_lowering.py
+++ b/frontends/relay-futil/dahlia_lowering.py
@@ -386,6 +386,7 @@ def dense(function):
 # TODO(cgyurgyik): Currently, only supports a small subset (namely those used in our VGG net and MLP net examples).
 def softmax(function):
     """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.softmax"""
+    assert False, "Unimplemented."
     op, res = function.inputs[0].primitive, function.output.primitive
     axis = function.attributes.get_int("axis")
     data_type = op.data_type