diff --git a/frontends/relay-futil/compiler.py b/frontends/relay-futil/compiler.py
index 86fd5bd3e3..d3eb4461d3 100644
--- a/frontends/relay-futil/compiler.py
+++ b/frontends/relay-futil/compiler.py
@@ -7,20 +7,7 @@
 from pretty_print import *
 from utilities import *
 from futil_ast import *
-from dahlia_functions import *
-
-# Mapping from Relay binary calls to the respective Dahlia operator.
-BuiltInBinaryOps = {'add': '+', 'divide': '/', 'multiply': '*', 'subtract': '-'}
-
-# Mapping from Relay function names to their respective Dahlia lowering.
-RelayFunctionCalls = {'nn.dense': dense, 'nn.batch_flatten': batch_flatten, 'nn.batch_matmul': batch_matmul,
-                      'nn.bias_add': bias_add, 'nn.relu': relu, 'negative': negative, 'expand_dims': expand_dims,
-                      'sqrt': sqrt}
-
-# Mapping between primitive type and associated Dahlia name extension.
-# E.g. A 2D memory primitive named `A` will be lowered to `A0_0`.
-DahliaNameExtension = {PrimitiveType.Memory1D: '0', PrimitiveType.Memory2D: '0_0',
-                       PrimitiveType.Memory3D: '0_0_0', PrimitiveType.Memory4D: '0_0_0_0'}
+from dahlia_lowering import *
 
 
 class Relay2Futil(ExprFunctor):
@@ -30,17 +17,16 @@ def __init__(self):
         super(Relay2Futil, self).__init__()
         self.id_dictionary = defaultdict(int)
         self.relay_id_dictionary = defaultdict(int)
-        self.dahlia_components = []
-        self.main = FComponent(name="main", cells=[], wires=[])
+        self.main = FComponent(name="main")
 
     def id(self, name):
         """
         Provides a unique identification for a given name.
         For example, if 'a' is seen three times, it will produce: 'a0', 'a1', 'a2'.
         """
-        id_number = self.id_dictionary[name]
+        id_number = str(self.id_dictionary[name])
         self.id_dictionary[name] += 1
-        return name + str(id_number)
+        return ''.join((name, id_number))
 
     def relay_id(self, name):
         """
@@ -48,97 +34,75 @@ def relay_id(self, name):
         %x  = foo(%y);
         %x1 = bar(%x); // Here, at this level, the name_hint associated with `x1` is still 'x'.
 
-        To avoid this, we provide Relay with its own identification dictionary. If 'x' is seen
-        three times, it will produce: 'x', 'x1', x2'.
+        To avoid this, we provide Relay with its own identification dictionary.
+        If 'x' is seen three times, it will produce: 'x', 'x1', 'x2'.
         """
         id_number = self.relay_id_dictionary[name]
         self.relay_id_dictionary[name] += 1
         if id_number == 0: return name
-        return name + str(id_number)
+        return ''.join((name, str(id_number)))
 
     def dahlia_name(self, name, type):
         """
-        Dahlia uses the following naming scheme for an arbitrary variable 'X':
-        Memory1D: 'X0', 'X1', 'X2', ...
-        Memory2D: 'X0_0', 'X1_0', 'X2_0', ...
-        Memory3D: 'X0_0_0', 'X1_0_0', 'X2_0_0', ...
+        Dahlia uses the following naming scheme for arbitrary variables `X`, `Y`:
+        Memory1D: `X0`, `Y0`
+        Memory2D: `X0_0`, `Y0_0`
+        Memory3D: `X0_0_0`, `Y0_0_0`
         """
         assert type in DahliaNameExtension, f'{name} with {type} is not supported yet.'
-        return name + DahliaNameExtension[type]
+        return ''.join((name, DahliaNameExtension[type]))
 
-    def get_dahlia_declaration(self, function_name, cells, args, attrs):
-        """
-        Returns the corresponding name, Dahlia function type, and op (if it is a binary op, otherwise None).
-        If the function type isn't supported, fails with an assertion.
-        """
-        input_type = cells[0].primitive.type
-        function = name = op = None
-        if function_name in BuiltInBinaryOps:
-            op = BuiltInBinaryOps[function_name]
-            function, name = broadcast, function_name
-        elif function_name in RelayFunctionCalls:
-            function = RelayFunctionCalls[function_name]
-            name = function.__name__
-        else:
-            assert False, f'{function_name} with type {input_type} is not supported.'
-        return DahliaDeclaration(component_name=self.relay_id(name), decl_name=self.id(name),
-                                 op=op, inputs=args, attributes=attrs, function=function)
-
-    def visit_var(self, var):
+    def visit_var(self, var) -> FCell:
         name = self.relay_id(var.name_hint)
-        # Do not add duplicate primitives to main.
-        if self.main.contains_primitive(name): return cell
+        if name in self.main.cells: return cell
         data, type, data_type = get_memory_parameters(var.type_annotation)
-        dahlia_name = self.dahlia_name(name, type)
-        return FCell(dahlia_name=dahlia_name,
+        return FCell(dahlia_name=self.dahlia_name(name, type),
                      primitive=FPrimitive(name=name, data=data, data_type=data_type, type=type))
 
     def visit_let(self, let):
         values, output = self.visit(let.value), self.visit(let.var)
         if isinstance(values, list):
-            for value in values:
-                if not value.is_dahlia_declaration(): continue
-                value.dahlia_declaration.output = output
-                value.dahlia_declaration.invoke()
+            for value in flatten(values):
+                if value.is_relay_function(): value.relay_function.output = output
         return [self.visit(let.body), values]
 
-    def visit_constant(self, const):
+    def visit_constant(self, const) -> FCell:
         # Note: We're currently treating constants defined in a `let` statement in Relay IR as 1D Memory.
-        type, shape = const.data.dtype, const.data.shape
-        name, data = self.id("const"), [get_bitwidth(type), int(const.data.asnumpy())]
-        data_type = get_memory_parameters(type)
-        return FCell(primitive=FPrimitive(name=name, data=data, data_type=data_type, type=PrimitiveType.Constant))
+        # type, shape = const.data.dtype, const.data.shape
+        pass
 
-    def visit_call(self, call):
+    def visit_call(self, call) -> List[FCell]:
         attributes = call.attrs
         cells, args = [], []
         for arg in call.args:
             argument = self.visit(arg)
             cells.append(argument)
             args.append(argument)
-        cells.append(FCell(dahlia_declaration=self.get_dahlia_declaration(call.op.name, cells, args, call.attrs)))
+        # We are representing all function calls in Relay IR at the Dahlia level, which will then be lowered to FuTIL.
+        # Note, the Relay function's output is not defined until the `let` statement is visited.
+        function, name, op = GetRelayFunctionCall(call.op.name)
+        component_name = self.id(name)
+        relay_function_call = RelayFunctionCall(component_name=component_name, name=f'comp_{component_name}',
+                                                op=op, inputs=args, attributes=call.attrs, lowering_function=function)
+        cells.append(FCell(relay_function=relay_function_call))
         return cells
 
     def visit_function(self, function):
         body = self.visit(function.body)
-        for cell in flatten(body):
-            self.main.add_cell(cell)
-            if not cell.is_dahlia_declaration(): continue
-            self.dahlia_components.append(cell.dahlia_declaration.program)
+        for cell in flatten(body): self.main.add_cell(cell)
         build_main_controls(self.main)
-        return pp_component(self.main)
+        return pp_lowered_relay_function(self.main)
 
 
 def relay_transforms(expr: Function) -> Function:
     """https://tvm.apache.org/docs/api/python/relay/transform.html"""
-    transform = tvm.transform.Sequential([
+    transforms = tvm.transform.Sequential([
         relay.transform.SimplifyExpr(),
         relay.transform.SimplifyInference(),
-        relay.transform.InferType()
+        relay.transform.InferType(),
     ])
     mod = ir.IRModule.from_expr(expr)
-    mod['main'] = expr
-    mod = transform(mod)
+    mod = transforms(mod)
     return mod['main']
 
 
@@ -147,11 +111,9 @@ def lower_to_futil(program) -> str:
     program = relay_transforms(program)
     visitor = Relay2Futil()
 
-    PREAMBLE = """import "primitives/std.lib";"""
+    PREAMBLE = """import "primitives/std.lib";\n"""
     MAIN = visitor.visit(program)
-    DAHLIA_COMPONENTS = '\n'.join(visitor.dahlia_components)
-    NEWL = '\n\n'
-    return f'{PREAMBLE}{NEWL}{DAHLIA_COMPONENTS}{NEWL}{MAIN}'
+    return '\n'.join((PREAMBLE, MAIN))
 
 
 if __name__ == '__main__':
diff --git a/frontends/relay-futil/dahlia_functions.py b/frontends/relay-futil/dahlia_functions.py
deleted file mode 100644
index 106c000205..0000000000
--- a/frontends/relay-futil/dahlia_functions.py
+++ /dev/null
@@ -1,337 +0,0 @@
-import subprocess
-import os
-
-from tempfile import NamedTemporaryFile, TemporaryFile
-from futil_ast import *
-from pretty_print import *
-
-IMPORT_STATEMENT = """import "primitives/std.lib";\n"""
-NO_ERR = "2>/dev/null"
-CHARACTER_I = chr(ord('i'))
-NEWL = '\n'
-
-
-def lower_dahlia_program(prog, component_name):
-    """
-    Takes in a string representation of a Dahlia program, lowers it to FuTIL with the given `component_name`,
-    and applies the `externalize` pass. This pass exposes the inputs and outputs of primitive types that are
-    declared external, e.g. `std_mem_d1_ext`, and places them in the inputs and outputs of the respective component.
-
-    Example:
-        ------ Dahlia, component name: ProcessX ------
-        decl X: ubit<32>[4];
-        ...
-
-        ------------- Lower to FuTIL -----------------
-        component ProcessX() -> () {
-          X = prim std_mem_d1_ext(32, 4, 2);
-          ...
-        }
-
-        ------------- Externalize Pass ---------------
-        component ProcessX
-        (go: 1, clk: 1, X0_read_data: 32, X0_done: 1) ->
-        (done: 1, X0_addr0: 2, X0_write_data: 32, X0_write_en: 1, X0_clk: 1) {
-           ...
-        }
-    """
-    program_string = '\n'.join(prog.splitlines())
-    with NamedTemporaryFile() as tf0, NamedTemporaryFile() as tf1, NamedTemporaryFile() as tf2:
-        tf0.write(bytes(program_string, 'UTF-8'))
-        tf0.seek(0), tf1.seek(0), tf2.seek(0)
-        fuse_binary = os.environ['DAHLIA_EXEC'] if 'DAHLIA_EXEC' in os.environ else 'fuse'
-        command = f"""
-                {fuse_binary} {tf0.name} --lower -b=futil -n={component_name} > {tf1.name} {NO_ERR} \
-                 && cargo run -- {tf1.name} -l ../../ -p externalize > {tf2.name} {NO_ERR}"""
-        subprocess.Popen(command, stdout=subprocess.PIPE, shell=True).communicate()
-        component = tf2.read().decode()[len(IMPORT_STATEMENT):]  # Skip over importing the primitives library.
-        return component
-
-
-def broadcast(declaration):
-    """
-    https://numpy.org/doc/stable/user/basics.broadcasting.html
-    Implements array broadcasting:
-    Two dimensions are compatible when either (1) they're equal, or (2) one of them is 1.
-    It is not required that both operands have the same number of dimensions either.
-    - When lowering from Relay IR, we are guaranteed the arrays are compatible for broadcasting.
-    - Variable names for indexing through the array begin with `i`, and continue alphabetically.
-
-    Example:
-         first operand:  64 x  1 x 32
-        second operand:       16 x  1
-                result:  64 x 16 x 32
-        ->
-        for (i = 0...64) {
-          for (j = 0..16) {
-            for (k = 0..32) {
-              result[i][j][k] := op1[i][0][k] op op2[j][0];
-              ...
-    """
-    op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
-
-    op1_dims, op2_dims, res_dims = op1.type, op2.type, res.type
-    op1_sizes, op2_sizes, res_sizes = [], [], []
-    # Get memory sizes in reversed order.
-    for i in reversed(range(0, op1_dims)): op1_sizes.append(op1.data[i + 1])
-    for i in reversed(range(0, op2_dims)): op2_sizes.append(op2.data[i + 1])
-    for i in reversed(range(0, res_dims)): res_sizes.append(res.data[i + 1])
-
-    # Gets the last variable name since we will compare sizes in the reverse direction.
-    variable_name = chr(ord(CHARACTER_I) + res_dims - 1)
-    # Determine the value at the indices in reverse order.
-    # For each dimension, this will either be `[x]` for index_variable `x`, or `[0]`
-    # depending on the relationship between the dimensions sizes.
-    op1_indices, op2_indices, res_indices = [], [], []
-    for i in range(0, len(res_sizes)):
-        current_dimension, index_zero = f'[{variable_name}]', '[0]'
-        res_indices.append(current_dimension)
-        if op1_dims > op2_dims and len(op2_sizes) <= i:
-            op1_indices.append(current_dimension)
-            continue
-        if op2_dims > op1_dims and len(op1_sizes) <= i:
-            op2_indices.append(current_dimension)
-            continue
-        if op1_sizes[i] == op2_sizes[i]:
-            op1_indices.append(current_dimension)
-            op2_indices.append(current_dimension)
-        elif op1_sizes[i] > op2_sizes[i]:
-            op1_indices.append(current_dimension)
-            op2_indices.append(index_zero)
-        else:  # op2_sizes[i] < op1_sizes[i]
-            op1_indices.append(index_zero)
-            op2_indices.append(current_dimension)
-        variable_name = next_character(variable_name, -1)
-
-    # Resulting index in the nested for loop, e.g. for `op1[i][j][0][k]`, this is `[i][j][0][k]`.
-    op1_index = ''.join(reversed(op1_indices))
-    op2_index = ''.join(reversed(op2_indices))
-    res_index = ''.join(reversed(res_indices))
-    loop_body = f'{res.name}{res_index} := {op1.name}{op1_index} {declaration.op} {op2.name}{op2_index};'
-
-    program_body = pp_dahlia_loop(res, loop_body)
-    declarations = pp_dahlia_memory_declarations([res, op1, op2])
-    program = f"""{declarations}{NEWL}{program_body}"""
-    return lower_dahlia_program(program, declaration.component_name)
-
-
-def batch_flatten(declaration):
-    """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.batch_flatten"""
-    data, res = declaration.inputs[0].primitive, declaration.output.primitive
-    bitwidth, num_dimensions = data.data[0], data.type
-    res_index_size1 = res.data[4]
-
-    variable_name = CHARACTER_I
-    data_indices, res_indices = "", f'[{variable_name}]'
-    for i in range(0, num_dimensions):
-        # Determine loop body indices based on `axis` provided.
-        size, index_size = data.data[i + 1], data.data[i + num_dimensions + 1]
-        index = f'[{variable_name}]'
-        data_indices += index
-        variable_name = next_character(variable_name)
-    res_indices += f'[{variable_name}]'
-
-    declarations = pp_dahlia_memory_declarations([data, res])
-    let_flattened = f'let {variable_name}: ubit<{res_index_size1}> = 0;'
-    body = f"{res.name}{res_indices} := {data.name}{data_indices}; {variable_name} := {variable_name} + 1;"
-    program_body = pp_dahlia_loop(data, body)
-    program = f"""{declarations}{NEWL}{let_flattened}{NEWL}{program_body}"""
-    return lower_dahlia_program(program, declaration.component_name)
-
-
-def bias_add(declaration):
-    """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.bias_add"""
-    data, bias, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
-    bitwidth, num_dimensions = data.data[0], data.type
-
-    axis_attribute = declaration.attributes.get_int("axis")
-    axis = num_dimensions - 1 if axis_attribute == -1 else axis_attribute
-
-    variable_name = CHARACTER_I
-    data_indices = ""
-    for i in range(0, num_dimensions):
-        # Determine loop body indices based on `axis` provided.
-        size, index_size = data.data[i + 1], data.data[i + num_dimensions + 1]
-        index = f'[{variable_name}]'
-        if axis == i: bias_index = index
-        data_indices += index
-        variable_name = next_character(variable_name)
-
-    declarations = pp_dahlia_memory_declarations([data, bias, res])
-    body = (f"{res.name}{data_indices} := {data.name}{data_indices} + {bias.name}{bias_index};")
-    program_body = pp_dahlia_loop(data, body)
-    return lower_dahlia_program(f"""{declarations}{NEWL}{program_body}""", declaration.component_name)
-
-
-# TODO(cgyurgyik):
-#  1. This won't work for fixed point currently, since Dahlia
-#     will not take fixed point operands for the `>` operator.
-#  2. Without signed bit array support, this is also meaningless.
-def relu(declaration):
-    """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.relu"""
-    data, res = declaration.inputs[0].primitive, declaration.output.primitive
-    bitwidth, num_dimensions = data.data[0], data.type
-
-    declarations = pp_dahlia_memory_declarations([data, res])
-    zero = '0.0' if data.data_type == 'ufix' else '0'
-    let_zero = f'let zero: {data.data_type}<{bitwidth}> = {zero};'
-
-    indices = ""
-    variable_name = CHARACTER_I
-    for i in range(0, num_dimensions):
-        # Determine loop body indices.
-        indices += f'[{variable_name}]'
-        variable_name = next_character(variable_name)
-
-    body = f"""if ({data.name}{indices} > zero) {{ {res.name}{indices} := {data.name}{indices}; }} 
-        else {{ {res.name}{indices} := zero; }}"""
-    program_body = pp_dahlia_loop(data, body)
-    return lower_dahlia_program(f"""{declarations}{NEWL}{let_zero}{NEWL}{program_body}""", declaration.component_name)
-
-
-# TODO(cgyurgyik): Similar to ReLU, this requires signed operands.
-def negative(declaration):
-    """https://tvm.apache.org/docs/api/python/relay/index.html#tvm.relay.negative"""
-    op, res = declaration.inputs[0].primitive, declaration.output.primitive
-    bitwidth, num_dimensions = op.data[0], op.type
-
-    indices = ""
-    variable_name = CHARACTER_I
-    for i in range(0, num_dimensions):
-        # Determine loop body indices.
-        indices += f'[{variable_name}]'
-        variable_name = next_character(variable_name)
-
-    declarations = pp_dahlia_memory_declarations([op, res])
-    program_body = pp_dahlia_loop(op, f"""{res.name}{indices} := -{op.name}{indices};""")
-    return lower_dahlia_program(f"""{declarations}{NEWL}{program_body}""", declaration.component_name)
-
-
-# TODO(cgyurgyik): Similar to ReLU, this requires signed operands.
-def sqrt(declaration):
-    """https://tvm.apache.org/docs/api/python/relay/index.html#tvm.relay.negative"""
-    op, res = declaration.inputs[0].primitive, declaration.output.primitive
-    bitwidth, num_dimensions, data_type = op.data[0], op.type, op.data_type
-    include_sqrt = f"""import "fxp_sqrt.h" {{ def sqrt(value: {data_type}<{bitwidth}>): {data_type}<{bitwidth}>; }}"""
-
-    indices = ""
-    variable_name = CHARACTER_I
-    for i in range(0, num_dimensions):
-        # Determine loop body indices.
-        indices += f'[{variable_name}]'
-        variable_name = next_character(variable_name)
-
-    declarations = pp_dahlia_memory_declarations([op, res])
-    program_body = pp_dahlia_loop(op, f"""{res.name}{indices} := sqrt({op.name}{indices});""")
-    return lower_dahlia_program(f"""{include_sqrt}{NEWL}{declarations}{NEWL}{program_body}""",
-                                declaration.component_name)
-
-
-def expand_dims(declaration):
-    """https://tvm.apache.org/docs/api/python/relay/index.html#tvm.relay.expand_dims"""
-    axis, num_newaxis = declaration.attributes.get_int("axis"), declaration.attributes.get_int("num_newaxis")
-    data, res = declaration.inputs[0].primitive, declaration.output.primitive
-    bitwidth, num_dimensions = data.data[0], data.type
-
-    declarations = pp_dahlia_memory_declarations([data, res])
-
-    res_indices, data_indices = "", ""
-    variable_name = CHARACTER_I
-    for i in range(0, num_dimensions):
-        # Determine loop body indices.
-        index = f'[{variable_name}]'
-        res_indices += index
-        data_indices += index
-        if axis == i + 1:
-            for _ in range(0, num_newaxis): res_indices += '[0]'
-        variable_name = next_character(variable_name)
-
-    program_body = pp_dahlia_loop(data, f'{res.name}{res_indices} := {data.name}{data_indices}')
-    program = f"""{declarations}{NEWL}{program_body}"""
-    return lower_dahlia_program(program, declaration.component_name)
-
-
-def batch_matmul(declaration):
-    """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.batch_matmul"""
-    op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
-    bitwidth, M1_size0, M1_size1, M1_size2 = op1.data[0], op1.data[1], op1.data[2], op1.data[3]
-    M1_index_size0, M1_index_size1, M1_index_size2 = op1.data[4], op1.data[5], op1.data[6]
-    M2_size0, M2_size1, M2_size2 = op2.data[1], op2.data[2], op2.data[3]
-    M2_index_size0, M2_index_size1, M2_index_size2 = op2.data[4], op2.data[5], op2.data[6]
-    # 1. Get transpose of second operand.
-    # 2. Create temporary value `t`. Then, t = op1 * transpose(op2).
-    # 3. Copy temporary value to return value.*
-    #    * This third step may not be necessary, but trying to conduct the matrix multiply
-    #      directly with the return value declared resulted in incorrect outputs.
-    declarations = pp_dahlia_memory_declarations([res, op1, op2])
-    program = f"""{declarations}
-    let transpose_{op2.name}: {op2.data_type}<{bitwidth}>[{M2_size0}][{M2_size2}][{M2_size1}];
-    let temporary_{res.name}: {res.data_type}<{bitwidth}>[{M1_size0}][{M1_size1}][{M2_size1}];
-    for (let batch: ubit<{M1_index_size0}> = 0..{M1_size0}) {{
-      for (let i: ubit<{M2_index_size1}> = 0..{M2_size1}) {{
-        for (let j: ubit<{M2_index_size2}> = 0..{M2_size2}) {{
-          transpose_{op2.name}[batch][j][i] := {op2.name}[batch][i][j];
-        }}
-      }}
-    }} 
-
-    for (let batch: ubit<{M1_index_size0}> = 0..{M1_size0}) {{
-      for (let i: ubit<{M1_index_size1}> = 0..{M1_size1}) {{
-        for (let j: ubit<{M2_index_size1}> = 0..{M2_size1}) {{
-          for (let k: ubit<{M2_index_size2}> = 0..{M2_size2}) {{
-            let product = {op1.name}[batch][i][k] * transpose_{op2.name}[batch][k][j];
-          }} combine {{
-            temporary_{res.name}[batch][i][j] += product;
-          }}
-        }}
-      }}
-    }}
-
-    for (let batch: ubit<{M1_index_size0}> = 0..{M1_size0}) {{
-      for (let i: ubit<{M1_index_size1}> = 0..{M1_size1}) {{
-        for (let j: ubit<{M2_index_size1}> = 0..{M2_size1}) {{
-          {res.name}[batch][i][j] := temporary_{res.name}[batch][i][j];
-        }}
-      }}
-    }} 
-    """
-    return lower_dahlia_program(program, declaration.component_name)
-
-
-# TODO(cgyurgyik): Similar to batch_matmul, this requires a temporary memory to store the output
-# of the matrix multiply. Otherwise, the values aren't computed properly. Look deeper into this.
-def dense(declaration):
-    """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.batch_matmul"""
-    # TODO(cgyurgyik): Add support for `units`.
-    units = declaration.attributes.get_int("units")
-    op1, op2, res = declaration.inputs[0].primitive, declaration.inputs[1].primitive, declaration.output.primitive
-    bitwidth, M1_size0, M1_size1 = op1.data[0], op1.data[1], op1.data[2]
-    M1_index_size0, M1_index_size1 = op1.data[3], op1.data[4]
-    M2_size0, M2_size1, M2_index_size0, M2_index_size1 = op2.data[1], op2.data[2], op2.data[3], op2.data[4]
-    program = f"""
-    {pp_dahlia_memory_declarations([res, op1, op2])}
-    let transpose_{op2.name}: {op2.data_type}<{bitwidth}>[{M2_size1}][{M2_size0}];
-    let temporary_{res.name}: {res.data_type}<{bitwidth}>[{M1_size0}][{M2_size0}];
-    for (let i: ubit<{M2_index_size0}> = 0..{M2_size0}) {{
-      for (let j: ubit<{M2_index_size1}> = 0..{M2_size1}) {{
-        transpose_{op2.name}[j][i] := {op2.name}[i][j];
-      }}
-    }} 
-
-    for (let i: ubit<{M1_index_size0}> = 0..{M1_size0}) {{
-      for (let j: ubit<{M2_index_size0}> = 0..{M2_size0}) {{
-        for (let k: ubit<{M1_index_size1}> = 0..{M1_size1}) {{
-          let product = {op1.name}[i][k] * transpose_{op2.name}[k][j];
-        }} combine {{
-          temporary_{res.name}[i][j] += product;
-        }}
-      }}
-    }}
-
-    for (let i: ubit<{M1_index_size0}> = 0..{M1_size0}) {{
-      for (let j: ubit<{M2_index_size0}> = 0..{M2_size0}) {{
-        {res.name}[i][j] := temporary_{res.name}[i][j];
-      }}
-    }}
-    """
-    return lower_dahlia_program(program, declaration.component_name)
diff --git a/frontends/relay-futil/dahlia_lowering.py b/frontends/relay-futil/dahlia_lowering.py
new file mode 100644
index 0000000000..6b83a28784
--- /dev/null
+++ b/frontends/relay-futil/dahlia_lowering.py
@@ -0,0 +1,515 @@
+import subprocess
+import os
+
+from tempfile import NamedTemporaryFile, TemporaryFile
+from futil_ast import *
+
+IMPORT_STATEMENT = """import "primitives/std.lib";\n"""
+NO_ERR = "2>/dev/null"
+NEWL = '\n'
+CHARACTER_I = chr(ord('i'))  # Starting index variable name for Dahlia array iteration.
+
+
+def next_character(ch, dir=1):
+    """
+    Returns the character adjacent to `ch` by code point.
+    A positive `dir` yields the successor of `ch`; any other `dir` yields its predecessor.
+    """
+    return chr(ord(ch) + (1 if dir > 0 else -1))
+
+
+def PPDahliaMemoryDeclarations(relay_function):
+    """
+    Pretty prints the Dahlia memory declarations for every input of `relay_function`, followed by
+    its output, one per line, e.g. `decl X: ubit<32>[1][10];`
+    """
+    # Build a fresh list: appending to `relay_function.inputs` itself would add the output
+    # cell to the caller's inputs again on every call.
+    cell_list = list(relay_function.inputs) + [relay_function.output]
+    declarations = []
+    for cell in cell_list:
+        declaration = cell.primitive
+        # `data[0]` is the bitwidth; the next `declaration.type` entries are the size of each dimension.
+        dimensions = ''.join(f'[{declaration.data[i + 1]}]' for i in range(declaration.type))
+        declarations.append(f'decl {declaration.name}: {declaration.data_type}<{declaration.data[0]}>{dimensions};')
+    return '\n'.join(declarations)
+
+
+def PPDahliaLoop(relay_function, body, num_dimensions, data=None):
+    """
+    Returns an iteration over data with `body` as the work done within the nested loop(s).
+    Many tensor functions share the same control flow: (1) Iterate `num_dimensions` times, and (2) do some work in body.
+    For example, if `data` is a 2D primitive of size (M, N) and body == `X;`, then this will return:
+
+    ```
+    for (let i: ubit<X> = 0..M) {
+      for (let j: ubit<Y> = 0..N) {
+        X;
+      }
+    }
+    ```
+
+    Notes:
+    If `data` is provided, its sizes and index bitwidths are used for the loop bounds; otherwise those of the
+    `output` of the `relay_function` are used. In both cases the index bitwidths are read starting at offset
+    `num_dimensions + 1`, so `num_dimensions` is expected to match the primitive being iterated.
+    """
+    variable_name = CHARACTER_I
+    program = []
+    SPACING = ''
+    output = relay_function.output.primitive if data is None else data
+    for i in range(num_dimensions):
+        size, index_size = output.data[i + 1], output.data[i + num_dimensions + 1]
+        program.append(f'{SPACING}for (let {variable_name}: ubit<{index_size}> = 0..{size}) {{')
+        variable_name = next_character(variable_name)
+        SPACING += '  '
+    program.append(f'{SPACING}{body}')
+
+    for _ in range(num_dimensions):
+        SPACING = SPACING[:-2]
+        program.append(SPACING + '}')
+    return '\n'.join(program)
+
+
+def LowerDahliaProgramToFuTIL(relay_function, dahlia_body, dahlia_imports=None):
+    """
+    Takes in a string representation of a Dahlia program, lowers it to FuTIL with the component name of the
+    given `relay_function`, and applies the `externalize` pass. This pass exposes the inputs and outputs of primitive
+    types that are declared external, e.g. `std_mem_d1_ext`, and places them in the inputs and outputs of the component.
+
+    Example:
+        ------ Dahlia, component name: ProcessX ------
+        import "foo.h" { ... }
+        decl X: ubit<32>[4];
+        ...
+
+        ------------- Lower to FuTIL -----------------
+        component ProcessX() -> () {
+          X = prim std_mem_d1_ext(32, 4, 2);
+          ...
+        }
+
+        ------------- Externalize Pass ---------------
+        component ProcessX
+        (go: 1, clk: 1, X0_read_data: 32, X0_done: 1) ->
+        (done: 1, X0_addr0: 2, X0_write_data: 32, X0_write_en: 1, X0_clk: 1) {
+           ...
+        }
+    """
+    if dahlia_imports is None: dahlia_imports = ''
+    program_string = '\n'.join((dahlia_imports, PPDahliaMemoryDeclarations(relay_function), dahlia_body))
+
+    with NamedTemporaryFile() as tf0, NamedTemporaryFile() as tf1, NamedTemporaryFile() as tf2:
+        tf0.write(bytes(program_string, 'UTF-8'))
+        tf0.seek(0), tf1.seek(0), tf2.seek(0)
+        fuse_binary = os.environ.get('DAHLIA_EXEC', 'fuse')  # `DAHLIA_EXEC` overrides the Dahlia compiler binary.
+        command = f"""
+                {fuse_binary} {tf0.name} --lower -b=futil -n={relay_function.component_name} > {tf1.name} {NO_ERR} \
+                 && fud e --from futil {tf1.name} --to futil-externalize > {tf2.name} {NO_ERR}"""
+        subprocess.Popen(command, stdout=subprocess.PIPE, shell=True).communicate()
+        component = tf2.read().decode()[len(IMPORT_STATEMENT):]  # Skip over importing the primitives library.
+        return component
+
+
+####################################################################################################
+################## Dahlia Implementations for Relay Function Calls #################################
+####################################################################################################
+
+def broadcast(function: RelayFunctionCall):
+    """
+    https://numpy.org/doc/stable/user/basics.broadcasting.html
+    Implements array broadcasting:
+    Two dimensions are compatible when either (1) they're equal, or (2) one of them is `1`.
+    It is not required that both operands have the same number of dimensions either.
+    - When lowering from Relay IR, we are guaranteed the arrays are compatible for broadcasting.
+    - Variable names for indexing through the array begin with `i`, and continue alphabetically.
+
+    Example:
+         first operand:  64 x  1 x 32
+        second operand:       16 x  1
+                result:  64 x 16 x 32
+        ->
+        for (i = 0..64) {
+          for (j = 0..16) {
+            for (k = 0..32) {
+              result[i][j][k] := op1[i][0][k] op op2[j][0];
+              ...
+    """
+    op1, op2, res = function.inputs[0].primitive, function.inputs[1].primitive, function.output.primitive
+    op1_dims, op2_dims, res_dims = op1.type, op2.type, res.type
+    op1_sizes, op2_sizes, res_sizes = [], [], []
+    # Get memory sizes in reversed order, i.e. starting from the last dimension.
+    for i in reversed(range(0, op1_dims)): op1_sizes.append(op1.data[i + 1])
+    for i in reversed(range(0, op2_dims)): op2_sizes.append(op2.data[i + 1])
+    for i in reversed(range(0, res_dims)): res_sizes.append(res.data[i + 1])
+
+    # Gets the last variable name since we will compare sizes in the reverse direction.
+    variable_name = chr(ord(CHARACTER_I) + res_dims - 1)
+    # Determine the value at the indices in reverse order.
+    # For each dimension, this will either be `[x]` for index_variable `x`, or `[0]`
+    # depending on the relationship between the dimensions sizes.
+    op1_indices, op2_indices, res_indices = [], [], []
+    for i in range(0, len(res_sizes)):
+        current_dimension, index_zero = f'[{variable_name}]', '[0]'
+        res_indices.append(current_dimension)
+        if op1_dims > op2_dims and len(op2_sizes) <= i:
+            op1_indices.append(current_dimension)
+        elif op2_dims > op1_dims and len(op1_sizes) <= i:
+            op2_indices.append(current_dimension)
+        elif op1_sizes[i] == op2_sizes[i]:
+            op1_indices.append(current_dimension)
+            op2_indices.append(current_dimension)
+        elif op1_sizes[i] > op2_sizes[i]:
+            op1_indices.append(current_dimension)
+            op2_indices.append(index_zero)
+        else:  # op1_sizes[i] < op2_sizes[i]
+            op1_indices.append(index_zero)
+            op2_indices.append(current_dimension)
+        variable_name = next_character(variable_name, -1)
+
+    # Resulting index in the nested for loop, e.g. for `op1[i][j][0][k]`, this is `[i][j][0][k]`.
+    op1_index = ''.join(reversed(op1_indices))
+    op2_index = ''.join(reversed(op2_indices))
+    res_index = ''.join(reversed(res_indices))
+    loop_body = f'{res.name}{res_index} := {op1.name}{op1_index} {function.op} {op2.name}{op2_index};'
+
+    return LowerDahliaProgramToFuTIL(function, PPDahliaLoop(function, loop_body, num_dimensions=res_dims))
+
+
+def batch_flatten(function):
+    """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.batch_flatten"""
+    data, res = function.inputs[0].primitive, function.output.primitive
+    num_dimensions = data.type
+    # Index bitwidth of the second dimension of `res` (assumes `res` is a 2D memory).
+    res_index_size1 = res.data[4]
+
+    # One index variable per dimension of `data`, named alphabetically from `i`: `[i][j][k]...`.
+    index_names = [chr(ord(CHARACTER_I) + offset) for offset in range(num_dimensions)]
+    data_indices = ''.join(f'[{name}]' for name in index_names)
+    # The next unused variable name is the running index into the flattened dimension of `res`.
+    variable_name = chr(ord(CHARACTER_I) + num_dimensions)
+    res_indices = f'[{CHARACTER_I}][{variable_name}]'
+
+    # NOTE(review): the flattened index is only ever incremented, never reset when `i` advances;
+    # confirm this is correct when the first (batch) dimension is larger than 1.
+    let_flattened = f'let {variable_name}: ubit<{res_index_size1}> = 0;'
+    body = f"{res.name}{res_indices} := {data.name}{data_indices}; {variable_name} := {variable_name} + 1;"
+    program_body = '\n'.join((let_flattened, PPDahliaLoop(function, body, num_dimensions, data)))
+    return LowerDahliaProgramToFuTIL(function, program_body)
+
+
+def bias_add(function):
+    """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.bias_add"""
+    data, bias, res = function.inputs[0].primitive, function.inputs[1].primitive, function.output.primitive
+    num_dimensions = data.type
+
+    # Normalize a negative `axis` (counted from the last dimension) to its non-negative equivalent.
+    axis = function.attributes.get_int("axis")
+    if axis < 0: axis += num_dimensions
+    assert 0 <= axis < num_dimensions, f'nn.bias_add with axis = {axis} is out of range for Memory{num_dimensions}D.'
+
+    variable_name = CHARACTER_I
+    data_indices, bias_index = "", ""
+    for i in range(num_dimensions):
+        # `bias` takes a single index: the loop variable that iterates over dimension `axis` of `data`.
+        index = f'[{variable_name}]'
+        if axis == i: bias_index = index
+        data_indices += index
+        variable_name = next_character(variable_name)
+    body = f"{res.name}{data_indices} := {data.name}{data_indices} + {bias.name}{bias_index};"
+    return LowerDahliaProgramToFuTIL(function, PPDahliaLoop(function, body, num_dimensions))
+
+
+# TODO(cgyurgyik):
+#  1. This won't work for fixed point currently, since Dahlia
+#     will not take fixed point operands for the `>` operator.
+#  2. Without signed bit array support, this is also meaningless.
+def relu(function):
+    """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.relu"""
+    data, res = function.inputs[0].primitive, function.output.primitive
+    data_type, bitwidth, num_dimensions = data.data_type, data.data[0], data.type
+
+    # Use a fixed point literal for zero when the data type is fixed point.
+    zero = '0.0' if data_type in ('ufix', 'fix') else '0'
+    let_zero = f'let zero: {data_type}<{bitwidth}> = {zero};'
+
+    # Loop body indices, one per dimension, named alphabetically from `i`: `[i][j]...`.
+    indices = ''.join(
+        f'[{chr(ord(CHARACTER_I) + offset)}]'
+        for offset in range(num_dimensions)
+    )
+
+    body = f"""if ({data.name}{indices} > zero) {{ {res.name}{indices} := {data.name}{indices}; }} 
+               else {{ {res.name}{indices} := zero; }}"""
+    program_body = '\n'.join((let_zero, PPDahliaLoop(function, body, num_dimensions)))
+    return LowerDahliaProgramToFuTIL(function, program_body)
+
+
+# TODO(cgyurgyik): Similar to ReLU, this requires signed operands.
+def negative(function):
+    """https://tvm.apache.org/docs/api/python/relay/index.html#tvm.relay.negative"""
+    op, res = function.inputs[0].primitive, function.output.primitive
+    num_dimensions, data_type = op.type, op.data_type
+
+    # Loop body indices, one per dimension, named alphabetically from `i`: `[i][j]...`.
+    indices = ''.join(
+        f'[{chr(ord(CHARACTER_I) + offset)}]'
+        for offset in range(num_dimensions)
+    )
+
+    # Negation is written as a subtraction from zero, with a literal matching the data type.
+    zero = '0.0' if data_type in ('ufix', 'fix') else '0'
+    program_body = PPDahliaLoop(function, f"""{res.name}{indices} := {zero} - {op.name}{indices};""", num_dimensions)
+    return LowerDahliaProgramToFuTIL(function, program_body)
+
+
+def sqrt(function):
+    """https://tvm.apache.org/docs/api/python/relay/index.html#tvm.relay.sqrt"""
+    op, res = function.inputs[0].primitive, function.output.primitive
+    bitwidth, num_dimensions, data_type = op.data[0], op.type, op.data_type
+    # Dahlia import that declares the `sqrt` function called in the loop body below.
+    include_sqrt = f"""import "fxp_sqrt.h" {{ def sqrt(value: {data_type}<{bitwidth}>): {data_type}<{bitwidth}>; }}"""
+
+    # Loop body indices, one per dimension, named alphabetically from `i`: `[i][j]...`.
+    indices = ''.join(
+        f'[{chr(ord(CHARACTER_I) + offset)}]'
+        for offset in range(num_dimensions)
+    )
+
+    program_body = PPDahliaLoop(function, f"""{res.name}{indices} := sqrt({op.name}{indices});""", num_dimensions)
+    return LowerDahliaProgramToFuTIL(function, program_body, include_sqrt)
+
+
+def expand_dims(function):
+    """https://tvm.apache.org/docs/api/python/relay/index.html#tvm.relay.expand_dims"""
+    axis, num_newaxis = function.attributes.get_int("axis"), function.attributes.get_int("num_newaxis")
+    data, res = function.inputs[0].primitive, function.output.primitive
+    num_dimensions = data.type
+    # A negative `axis` counts from the end; `-1` inserts the new axes after the last dimension.
+    if axis < 0: axis += num_dimensions + 1
+
+    # Each new axis has size 1 and is always indexed with `[0]`. With `axis == 0`,
+    # the new axes come before every dimension of `data`.
+    res_indices, data_indices = '[0]' * num_newaxis if axis == 0 else '', ''
+    variable_name = CHARACTER_I
+    for i in range(num_dimensions):
+        index = f'[{variable_name}]'
+        data_indices += index
+        res_indices += index + ('[0]' * num_newaxis if axis == i + 1 else '')
+        variable_name = next_character(variable_name)
+    program_body = PPDahliaLoop(function, f'{res.name}{res_indices} := {data.name}{data_indices}', num_dimensions, data)
+    return LowerDahliaProgramToFuTIL(function, program_body)
+
+
+def batch_matmul(function):
+    """Computes `res = op1 * transpose(op2)` per batch. https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.batch_matmul"""
+    op1, op2, res = function.inputs[0].primitive, function.inputs[1].primitive, function.output.primitive
+    bitwidth, M1_size0, M1_size1, M1_size2 = op1.data[0], op1.data[1], op1.data[2], op1.data[3]
+    M1_index_size0, M1_index_size1, M1_index_size2 = op1.data[4], op1.data[5], op1.data[6]  # Loop variable bitwidths.
+    M2_size0, M2_size1, M2_size2 = op2.data[1], op2.data[2], op2.data[3]
+    M2_index_size0, M2_index_size1, M2_index_size2 = op2.data[4], op2.data[5], op2.data[6]  # Loop variable bitwidths.
+    # 1. Get transpose of second operand (the last two dimensions are swapped, per batch).
+    # 2. Create temporary value `t`. Then, t = op1 * transpose(op2).
+    # 3. Copy temporary value to return value.*
+    #    * This third step may not be necessary, but trying to conduct the matrix multiply
+    #      directly with the return value declared resulted in incorrect outputs.
+    program_body = f"""
+    let transpose_{op2.name}: {op2.data_type}<{bitwidth}>[{M2_size0}][{M2_size2}][{M2_size1}];
+    let temporary_{res.name}: {res.data_type}<{bitwidth}>[{M1_size0}][{M1_size1}][{M2_size1}];
+    for (let batch: ubit<{M1_index_size0}> = 0..{M1_size0}) {{
+      for (let i: ubit<{M2_index_size1}> = 0..{M2_size1}) {{
+        for (let j: ubit<{M2_index_size2}> = 0..{M2_size2}) {{
+          transpose_{op2.name}[batch][j][i] := {op2.name}[batch][i][j];
+        }}
+      }}
+    }} 
+
+    for (let batch: ubit<{M1_index_size0}> = 0..{M1_size0}) {{
+      for (let i: ubit<{M1_index_size1}> = 0..{M1_size1}) {{
+        for (let j: ubit<{M2_index_size1}> = 0..{M2_size1}) {{
+          for (let k: ubit<{M2_index_size2}> = 0..{M2_size2}) {{
+            let product = {op1.name}[batch][i][k] * transpose_{op2.name}[batch][k][j];
+          }} combine {{
+            temporary_{res.name}[batch][i][j] += product;
+          }}
+        }}
+      }}
+    }}
+
+    for (let batch: ubit<{M1_index_size0}> = 0..{M1_size0}) {{
+      for (let i: ubit<{M1_index_size1}> = 0..{M1_size1}) {{
+        for (let j: ubit<{M2_index_size1}> = 0..{M2_size1}) {{
+          {res.name}[batch][i][j] := temporary_{res.name}[batch][i][j];
+        }}
+      }}
+    }} 
+    """
+    return LowerDahliaProgramToFuTIL(function, program_body)
+
+
+# TODO(cgyurgyik): Similar to batch_matmul, this requires a temporary memory to store the output
+# of the matrix multiply. Otherwise, the values aren't computed properly. Look deeper into this.
+def dense(function):
+    """Computes `res = op1 * transpose(op2)`. https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.dense"""
+    op1, op2, res = function.inputs[0].primitive, function.inputs[1].primitive, function.output.primitive
+    bitwidth, M1_size0, M1_size1 = op1.data[0], op1.data[1], op1.data[2]
+    M1_index_size0, M1_index_size1 = op1.data[3], op1.data[4]  # Bitwidths of the loop variables indexing `op1`.
+    M2_size0, M2_size1, M2_index_size0, M2_index_size1 = op2.data[1], op2.data[2], op2.data[3], op2.data[4]
+    program = f"""
+    let transpose_{op2.name}: {op2.data_type}<{bitwidth}>[{M2_size1}][{M2_size0}];
+    let temporary_{res.name}: {res.data_type}<{bitwidth}>[{M1_size0}][{M2_size0}];
+    for (let i: ubit<{M2_index_size0}> = 0..{M2_size0}) {{
+      for (let j: ubit<{M2_index_size1}> = 0..{M2_size1}) {{
+        transpose_{op2.name}[j][i] := {op2.name}[i][j];
+      }}
+    }} 
+
+    for (let i: ubit<{M1_index_size0}> = 0..{M1_size0}) {{
+      for (let j: ubit<{M2_index_size0}> = 0..{M2_size0}) {{
+        for (let k: ubit<{M1_index_size1}> = 0..{M1_size1}) {{
+          let product = {op1.name}[i][k] * transpose_{op2.name}[k][j];
+        }} combine {{
+          temporary_{res.name}[i][j] += product;
+        }}
+      }}
+    }}
+
+    for (let i: ubit<{M1_index_size0}> = 0..{M1_size0}) {{
+      for (let j: ubit<{M2_index_size0}> = 0..{M2_size0}) {{
+        {res.name}[i][j] := temporary_{res.name}[i][j];
+      }}
+    }}
+    """
+    return LowerDahliaProgramToFuTIL(function, program)
+
+
+# TODO(cgyurgyik): Currently, only supports a small subset (namely those used in our VGG net and MLP net examples).
+def softmax(function):
+    """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.softmax"""
+    assert False, "Unimplemented."  # Everything below is a draft and is currently unreachable.
+    op, res = function.inputs[0].primitive, function.output.primitive
+    axis = function.attributes.get_int("axis")
+    data_type = op.data_type
+    assert op.type == PrimitiveType.Memory2D, f'nn.softmax with primitive type Memory{op.type}D is not supported.'
+    assert axis in (-1, 1), f'nn.softmax with axis = {axis} is not supported.'
+    bitwidth, size0, size1, index_size0, index_size1 = op.data[0], op.data[1], op.data[2], op.data[3], op.data[4]
+
+    import_exp = f"""import "std_exp.h" {{ def exp(x: {data_type}<{bitwidth}>): {data_type}<{bitwidth}>; }}"""
+    zero = '0.0' if data_type in ('ufix', 'fix') else '0'
+    program_body = f"""
+    for (let i: ubit<{index_size0}> = 0..{size0}) {{
+      let {op.name}_expsum: {data_type}<{bitwidth}> = {zero};
+      for (let j: ubit<{index_size1}> = 0..{size1}) {{ 
+        {op.name}_expsum += exp({op.name}[i][j]); 
+      }}
+      for (let k: ubit<{index_size1}> = 0..{size1}) {{ 
+        {res.name}[i][k] := exp({op.name}[i][k]); 
+        ---
+        {res.name}[i][k] := {res.name}[i][k] / {op.name}_expsum;
+      }}
+    }}
+    """
+    return LowerDahliaProgramToFuTIL(function, program_body, import_exp)
+
+
+def max_pool2d(function):
+    """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.max_pool2d"""
+    data, res = function.inputs[0].primitive, function.output.primitive
+
+    strides = function.attributes.get_int_tuple("strides")
+    pool_size = function.attributes.get_int_tuple("pool_size")
+    layout = function.attributes.get_str("layout")
+    ceil_mode = function.attributes.get_int("ceil_mode")
+    assert layout == 'NCHW', f"Layout \'{layout}\' is not currently supported for nn.max_pool2d; please use `NCHW`"
+    assert not ceil_mode, "`ceil_mode` is not currently supported for nn.max_pool2d"
+    bitwidth, data_type = data.data[0], data.data_type
+    size0, size1, size2, size3 = res.data[1], res.data[2], res.data[3], res.data[4]  # Loop bounds come from `res`.
+
+    program_body = f"""
+    for (let b: ubit<32> = 0..{size0}) {{
+      for (let c: ubit<32> = 0..{size1}) {{
+        for (let y: ubit<32> = 0..{size2}) {{
+          for (let x: ubit<32> = 0..{size3}) {{
+            let stride_y: ubit<32> = y * {strides[0]}/*strides[0]*/;
+            let stride_x: ubit<32> = x * {strides[1]}/*strides[1]*/;
+            
+            let max: {data_type}<{bitwidth}> = {data.name}[b][c][stride_y][stride_x];
+            for (let m: ubit<32> = 0..{pool_size[0]}/*pool_size[0]*/) {{
+              for (let n: ubit<32> = 0..{pool_size[1]}/*pool_size[1]*/) {{
+                let pool_y: ubit<32> = stride_y + m;
+                let pool_x: ubit<32> = stride_x + n;
+                let current: {data_type}<{bitwidth}> = {data.name}[b][c][pool_y][pool_x];
+                if (current > max) {{ max := current; }} 
+              }}
+            }}
+            {res.name}[b][c][y][x] := max;
+          }} 
+        }} 
+      }} 
+    }} 
+    """
+    return LowerDahliaProgramToFuTIL(function, program_body)
+
+
+# Only supports a small subset of the `conv2d` function. For example,
+# dilation and grouped convolution are not supported.
+def conv2d(function):
+    """https://tvm.apache.org/docs/api/python/relay/nn.html#tvm.relay.nn.conv2d"""
+    data, weight, res = function.inputs[0].primitive, function.inputs[1].primitive, function.output.primitive
+    # `strides` and `kernel_size` are (height, width) pairs: `y`/`dy` walk the height, `x`/`dx` the width.
+    strides = function.attributes.get_int_tuple("strides")
+    kernel_size = function.attributes.get_int_tuple("kernel_size")
+    channels = function.attributes.get_int("channels")
+    bitwidth, data_type = data.data[0], data.data_type
+    size0, size1, size2, size3 = res.data[1], res.data[2], res.data[3], res.data[4]
+    # NOTE(review): `k` indexes dimension 1 of `data`, yet is bounded by the `channels` attribute - confirm.
+    zero = '0.0' if data_type == 'ufix' or data_type == 'fix' else '0'
+    program_body = f"""
+    for (let b: ubit<32> = 0..{size0}) {{
+      for (let c: ubit<32> = 0..{size1}) {{
+        for (let y: ubit<32> = 0..{size2}) {{
+          for (let x: ubit<32> = 0..{size3}) {{
+            let sum: {data_type}<{bitwidth}> = {zero};
+            
+            for (let k: ubit<32> = 0..{channels}) {{
+              for (let dy: ubit<32> = 0..{kernel_size[0]}/*kernel_size[0]*/) {{
+                for (let dx: ubit<32> = 0..{kernel_size[1]}/*kernel_size[1]*/) {{
+                  let kernel_y: ubit<32> = (/*strides[0]*/{strides[0]} * y) + dy;
+                  let kernel_x: ubit<32> = (/*strides[1]*/{strides[1]} * x) + dx;     
+                }} combine {{ sum += {data.name}[b][k][kernel_y][kernel_x] * {weight.name}[c][k][dy][dx]; }}
+              }}
+            }}
+            {res.name}[b][c][y][x] := sum;
+          }} 
+        }} 
+      }} 
+    }} 
+    """
+    return LowerDahliaProgramToFuTIL(function, program_body)
+
+
+# Mapping from Relay function names to their respective Dahlia lowering.
+RelayFunctionCalls = {'nn.dense': dense, 'nn.batch_flatten': batch_flatten, 'nn.batch_matmul': batch_matmul,
+                      'nn.bias_add': bias_add, 'nn.relu': relu, 'nn.softmax': softmax, 'nn.max_pool2d': max_pool2d,
+                      'nn.conv2d': conv2d, 'negative': negative, 'expand_dims': expand_dims, 'sqrt': sqrt}
+
+# Mapping from Relay binary calls to the respective Dahlia operator.
+BuiltInBinaryOps = {'add': '+', 'divide': '/', 'multiply': '*', 'subtract': '-'}
+
+
+def GetRelayFunctionCall(function_name) -> tuple:
+    """
+    Returns a `(function, name, op)` tuple for the Relay call `function_name`: the Dahlia lowering function, the
+    name to use for it, and the Dahlia binary operator (None unless it is a built-in binary op).
+    Fails with an assertion if the function call isn't supported.
+    """
+    assert function_name in BuiltInBinaryOps or function_name in RelayFunctionCalls, \
+        f'{function_name} is not supported for lowering from Relay IR to FuTIL.'
+
+    if function_name in BuiltInBinaryOps:
+        # Every built-in binary op shares the `broadcast` lowering and keeps its Relay name.
+        return broadcast, function_name, BuiltInBinaryOps[function_name]
+
+    # Other calls are named after their lowering function, e.g. `nn.dense` -> `dense`.
+    function = RelayFunctionCalls[function_name]
+    return function, function.__name__, None
diff --git a/frontends/relay-futil/example.py b/frontends/relay-futil/example.py
index 0a986e9453..078e90f248 100644
--- a/frontends/relay-futil/example.py
+++ b/frontends/relay-futil/example.py
@@ -16,6 +16,7 @@ def tensor_subtract():
     return relay.Function([x, y], relay.subtract(x, y))
 
 
+# Trying to read in a function that uses `expand_dims` with relay.fromtext() leads to some peculiar errors.
 def expand_dims():
     x = relay.var('x', shape=[512], dtype='int32')
     return relay.Function([x], relay.expand_dims(x, axis=1, num_newaxis=2))
@@ -49,6 +50,22 @@ def dense():
     return relay.Function([x, y], relay.nn.dense(x, y, units=10))
 
 
+def softmax():
+    x = relay.var('x', shape=[1, 10], dtype='float32')
+    return relay.Function([x], relay.nn.softmax(x))
+
+
+def max_pool2d():
+    data = relay.var('data', shape=[2, 2, 4, 4], dtype='int32')
+    return relay.Function([data], relay.nn.max_pool2d(data, padding=[0, 0, 0, 0], strides=[2, 2], pool_size=[2, 2]))
+
+
+def conv2d():
+    d = relay.var('data', shape=[5, 512, 14, 14], dtype='int32')
+    w = relay.var('weight', shape=[512, 512, 3, 3], dtype='int32')
+    return relay.Function([d, w], relay.nn.conv2d(d, w, padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3]))
+
+
 def mlp_net():
     """The MLP test from Relay."""
     from tvm.relay.testing import mlp
@@ -58,11 +75,12 @@ def mlp_net():
 def vgg_net():
     """The VGG test from Relay."""
     from tvm.relay.testing import vgg
-    return vgg.get_net(batch_size=1, image_shape=(3, 224, 224), num_classes=10, dtype='int32', num_layers=11,
+    return vgg.get_net(batch_size=5, image_shape=(3, 224, 224), num_classes=10, dtype='int32', num_layers=13,
                        batch_norm=True)
 
 
-ALL_FUNCS = [add, tensor_subtract, expand_dims, batch_flatten, batch_matmul, bias_add, relu, dense, mlp_net, vgg_net]
+ALL_FUNCS = [add, tensor_subtract, expand_dims, batch_flatten, batch_matmul,
+             bias_add, relu, dense, softmax, conv2d, max_pool2d, mlp_net, vgg_net]
 FUNC_NAMES = list(map(lambda x: x.__name__, ALL_FUNCS))
 
 
diff --git a/frontends/relay-futil/futil_ast.py b/frontends/relay-futil/futil_ast.py
index 03f2e8fa0e..e1194a826c 100644
--- a/frontends/relay-futil/futil_ast.py
+++ b/frontends/relay-futil/futil_ast.py
@@ -4,6 +4,7 @@
 from types import FunctionType
 from enum import Enum, IntEnum
 
+
 # Note: The integer value N for Memory with dimension N is used; these should remain unchanged.
 class PrimitiveType(IntEnum):
     Memory1D = 1
@@ -110,65 +111,43 @@ class FComponent:
     Represents a FuTIL component.
     '''
     name: str
-    cells: List[Cell]  # Instantiated sub-components.
-    wires: List[FConnection]  # Wire connections between components.
+    wires = []  # Wire connections between components.
+    cells = {}  # Instantiated sub-components. This is a mapping from {`dahlia_name`, FCell}.
     controls: FControl = None  # Control statement for this component.
     signature: FSignature = None  # Input and output ports.
 
-    def contains_primitive(self, name: str):
-        '''
-        Determines whether this component contains a primitive with the given name.
-        '''
-        # TODO(cgyurgyik): Rethink data structure here.
-        for cell in self.cells:
-            if not cell.is_primitive(): continue
-            if cell.primitive.name == name: return True
-        return False
-
     def add_cell(self, subcomponent: Cell):
         '''
         Appends a subcomponent to this component's list of FuTIL cells.
         '''
-        if not subcomponent.is_primitive():
-            self.cells.append(subcomponent)
-            return
-        if self.contains_primitive(subcomponent.primitive.name): return
-        self.cells.append(subcomponent)
+        if subcomponent == None: return
+        if subcomponent.is_primitive():
+            self.cells[subcomponent.primitive.name] = subcomponent
+        elif subcomponent.is_relay_function():
+            self.cells[subcomponent.relay_function.name] = subcomponent
 
 
 @dataclass
-class DahliaDeclaration:
-    decl_name: str
+class RelayFunctionCall:
+    """
+    Represents a Relay function call. This will eventually be translated to Dahlia and subsequently lowered to FuTIL.
+    """
+    name: str
     component_name: str
-    op: str = None
+    op: str = None  # Binary operation associated with the Relay function call, if it exists.
+    attributes: tvm.ir.Attrs = None  # Attributes associated with the Relay function call, e.g. `axis`, `padding`.
+    lowering_function: FunctionType = None  # The function used to convert the Dahlia representation to FuTIL.
     inputs: List[Cell] = None
     output: Cell = None
-    attributes: tvm.ir.Attrs = None
-    function: FunctionType = None
-    program: str = None
-
-    def invoke(self):
-        self.program = self.function(self)
-
-
-@dataclass
-class FDeclaration:
-    '''
-    Represents a FuTIL declaration.
-    '''
-    name: str
-    component: FComponent = None
 
 
 @dataclass
 class FCell(Cell):
     dahlia_name: str = None
     primitive: FPrimitive = None
-    declaration: FDeclaration = None
-    dahlia_declaration: DahliaDeclaration = None
+    relay_function: RelayFunctionCall = None
 
+    # TODO(cgyurgyik): Is there a better way to do this, such as std::variant in C++?
     def is_primitive(self): return self.primitive != None
 
-    def is_declaration(self): return self.declaration != None
-
-    def is_dahlia_declaration(self): return self.dahlia_declaration != None
+    def is_relay_function(self): return self.relay_function != None
diff --git a/frontends/relay-futil/pretty_print.py b/frontends/relay-futil/pretty_print.py
index 7a57e6e6b7..2ce59f4139 100644
--- a/frontends/relay-futil/pretty_print.py
+++ b/frontends/relay-futil/pretty_print.py
@@ -2,14 +2,14 @@
 import textwrap
 
 
-def mk_block(decl, contents, indent=2):
+def pp_block(decl, contents, indent=2):
     """Format a block like this:
         decl {
           contents
         }
     where `decl` is one line but contents can be multiple lines.
     """
-    return decl + ' {\n' + textwrap.indent(contents, indent * ' ') + '\n}'
+    return ''.join((decl, ' {\n', textwrap.indent(contents, indent * ' '), '\n}'))
 
 
 def pp_component_signature(component: FComponent):
@@ -39,7 +39,7 @@ def pp_connections(component: FConnection):
             wires = []
             for wire in connection.group.wires:
                 wires.append(pp_wire(wire))
-            connections.append(mk_block(f'group {connection.group.name}', '\n'.join(wires)))
+            connections.append(pp_block(f'group {connection.group.name}', '\n'.join(wires)))
     return connections
 
 
@@ -49,24 +49,57 @@ def pp_control(component: FComponent):
         groups = []
         for group_name in control.stmts:
             groups.append(f'{group_name};')
-        ctrls.append(mk_block(control.name, '\n'.join(groups)))
+        ctrls.append(pp_block(control.name, '\n'.join(groups)))
     return ctrls
 
 
-def pp_component(component: FComponent):
+def pp_lowered_dahlia_components(component: FComponent):
+    """Returns the FuTIL lowering of every Relay function cell in `component`, joined by newlines."""
+    relay_calls = (
+        cell.relay_function for cell in component.cells.values()
+        if cell is not None and cell.is_relay_function()
+    )
+    return '\n'.join(relay_call.lowering_function(relay_call) for relay_call in relay_calls)
+
+
+def pp_lowered_relay_function(component: FComponent):
+    """
+    Pretty prints the main program. This consists of the following:
+    1. Relay functions lowered from Dahlia -> FuTIL.
+    2. The `main` component.
+
+    Example:
+    ------------------------------------
+    Input
+    ```
+      fn (%x: int32, %y: int32) { let %z = add(%x, %y); %z }
+    ```
+    ------------------------------------
+    Output
+    ```
+      component add(...) -> (...) { ... }
+
+      component main() -> () {
+        ...
+        control { run_add; }
+      }
+    ```
+    """
+    relay_function_components = pp_lowered_dahlia_components(component)
+
     subcomponents = []
-    for cell in component.cells:
-        if cell == None:
-            continue
+    for cell in component.cells.values():
+        if cell == None: continue
         subcomponents.append(pp_cell(cell))
-    cells = mk_block("cells", '\n'.join(subcomponents))
+    cells = pp_block("cells", '\n'.join(subcomponents))
     inputs, outputs = pp_component_signature(component)
-    wires = mk_block("wires", '\n'.join(pp_connections(component)))
-
-    controls = "" if component.controls == None else '\n'.join(pp_control(component))
-    control = mk_block("control", controls)
+    wires = pp_block("wires", '\n'.join(pp_connections(component)))
 
-    return mk_block(f'component {component.name} ({inputs}) -> ({outputs})', '\n'.join([cells, wires, control]))
+    controls = '\n'.join(pp_control(component))
+    control = pp_block("control", controls)
+    main_component = pp_block(f'component {component.name} ({inputs}) -> ({outputs})',
+                              '\n'.join([cells, wires, control]))
+    return '\n'.join((relay_function_components, main_component))
 
 
 def pp_cell(cell: FCell):
@@ -100,59 +133,5 @@ def pp_cell(cell: FCell):
         if cell.primitive.type == PrimitiveType.BinOp:
             op = data[1]
             return f'{cell.primitive.name} = prim std_{op}({bitwidth});'
-        assert False, f'FCell pretty print unimplemented for {cell} with name {cell.primitive.name}'
-    elif cell.is_declaration():
-        return f'{cell.declaration.name} = {cell.declaration.component.name};'
-    elif cell.is_dahlia_declaration():
-        return f'{cell.dahlia_declaration.decl_name} = {cell.dahlia_declaration.component_name};'
-
-
-# Dahlia Pretty Printing.
-
-def next_character(ch, dir=1):
-    """
-    Returns the next character after 'ch'.
-    If dir is positive, then will return 'ch' + 1. Otherwise, it will return 'ch' - 1.
-    """
-    return chr(ord(ch) + dir) if dir > 0 else chr(ord(ch) - 1)
-
-
-def pp_dahlia_memory_declarations(declaration_list):
-    declarations = []
-    for decl in declaration_list:
-        decl_string = f'decl {decl.name}: {decl.data_type}<{decl.data[0]}>'
-        for i in range(0, decl.type): decl_string += f'[{decl.data[i + 1]}]'
-        declarations.append(f'{decl_string};')
-    return '\n'.join(declarations)
-
-
-def pp_dahlia_loop(data, body):
-    """
-    Returns an iteration over data with `body` as the work done within the nested loop(s).
-    Many tensor functions share the same control flow: (1) Iterate over `data`, and (2) do some work in body.
-    For example, if `data` is a 2D primitive of size (M, N) and body == `X;`, then this will return:
-
-    ```
-    for (let i: ubit<X> = 0..M) {
-      for (let j: ubit<Y> = 0..N) {
-        X;
-      }
-    }
-    ```
-    """
-    variable_name = chr(ord('i'))
-    num_dimensions = data.type
-
-    program = []
-    SPACING = ''
-    for i in range(0, num_dimensions):
-        size, index_size = data.data[i + 1], data.data[i + num_dimensions + 1]
-        program.append(f'{SPACING}for (let {variable_name}: ubit<{index_size}> = 0..{size}) {{')
-        variable_name = next_character(variable_name)
-        SPACING += '  '
-    program.append(f'{SPACING}{body}')
-
-    for i in range(0, num_dimensions):
-        SPACING = SPACING[:-2]
-        program.append(f'{SPACING}}}')
-    return '\n'.join(program)
+    if cell.is_relay_function(): return f'{cell.relay_function.name} = {cell.relay_function.component_name};'
+    assert False, f'FCell pretty print unimplemented for {cell} with name {cell.primitive.name}'
diff --git a/frontends/relay-futil/tests/add.expect b/frontends/relay-futil/tests/add.expect
index f239d18b42..8c08e35f31 100644
--- a/frontends/relay-futil/tests/add.expect
+++ b/frontends/relay-futil/tests/add.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component add(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, y0_read_data: 32, y0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, x0_addr0: 1, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, y0_addr0: 1, y0_write_data: 32, y0_write_en: 1, y0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
+component add0(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, y0_read_data: 32, y0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, x0_addr0: 1, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, y0_addr0: 1, y0_write_data: 32, y0_write_en: 1, y0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
   cells {
     add0 = prim std_add(32);
     add1 = prim std_add(1);
@@ -74,25 +74,25 @@ component main () -> () {
     z = prim std_mem_d1(32, 1, 1);
     x = prim std_mem_d1(32, 1, 1);
     y = prim std_mem_d1(32, 1, 1);
-    add0 = add;
+    comp_add0 = add0;
   }
   wires {
-    group run_add {
-      x.addr0 = add0.x0_addr0;
-      add0.x0_read_data = x.read_data;
-      y.addr0 = add0.y0_addr0;
-      add0.y0_read_data = y.read_data;
-      z.addr0 = add0.z0_addr0;
-      z.write_data = add0.z0_write_data;
-      z.write_en = add0.z0_write_en;
-      add0.z0_done = z.done;
-      add0.go = 1'd1;
-      run_add[done] = add0.done ? 1'd1;
+    group run_add0 {
+      x.addr0 = comp_add0.x0_addr0;
+      comp_add0.x0_read_data = x.read_data;
+      y.addr0 = comp_add0.y0_addr0;
+      comp_add0.y0_read_data = y.read_data;
+      z.addr0 = comp_add0.z0_addr0;
+      z.write_data = comp_add0.z0_write_data;
+      z.write_en = comp_add0.z0_write_en;
+      comp_add0.z0_done = z.done;
+      comp_add0.go = 1'd1;
+      run_add0[done] = comp_add0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_add;
+      run_add0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/batch_flatten.expect b/frontends/relay-futil/tests/batch_flatten.expect
index 6927e4ad85..c1d01a7bae 100644
--- a/frontends/relay-futil/tests/batch_flatten.expect
+++ b/frontends/relay-futil/tests/batch_flatten.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component batch_flatten(go: 1, clk: 1, x0_0_0_read_data: 32, x0_0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1) -> (done: 1, x0_0_0_addr0: 1, x0_0_0_addr1: 2, x0_0_0_addr2: 2, x0_0_0_write_data: 32, x0_0_0_write_en: 1, x0_0_0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 3, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1) {
+component batch_flatten0(go: 1, clk: 1, x0_0_0_read_data: 32, x0_0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1) -> (done: 1, x0_0_0_addr0: 1, x0_0_0_addr1: 2, x0_0_0_addr2: 2, x0_0_0_write_data: 32, x0_0_0_write_en: 1, x0_0_0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 3, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1) {
   cells {
     add0 = prim std_add(3);
     add1 = prim std_add(2);
@@ -139,26 +139,26 @@ component main () -> () {
   cells {
     x1 = prim std_mem_d2(32, 1, 4, 1, 3);
     x = prim std_mem_d3(32, 1, 2, 2, 1, 2, 2);
-    batch_flatten0 = batch_flatten;
+    comp_batch_flatten0 = batch_flatten0;
   }
   wires {
-    group run_batch_flatten {
-      x.addr0 = batch_flatten0.x0_0_0_addr0;
-      batch_flatten0.x0_0_0_read_data = x.read_data;
-      x.addr1 = batch_flatten0.x0_0_0_addr1;
-      x.addr2 = batch_flatten0.x0_0_0_addr2;
-      x1.addr0 = batch_flatten0.x10_0_addr0;
-      x1.addr1 = batch_flatten0.x10_0_addr1;
-      x1.write_data = batch_flatten0.x10_0_write_data;
-      x1.write_en = batch_flatten0.x10_0_write_en;
-      batch_flatten0.x10_0_done = x1.done;
-      batch_flatten0.go = 1'd1;
-      run_batch_flatten[done] = batch_flatten0.done ? 1'd1;
+    group run_batch_flatten0 {
+      x.addr0 = comp_batch_flatten0.x0_0_0_addr0;
+      comp_batch_flatten0.x0_0_0_read_data = x.read_data;
+      x.addr1 = comp_batch_flatten0.x0_0_0_addr1;
+      x.addr2 = comp_batch_flatten0.x0_0_0_addr2;
+      x1.addr0 = comp_batch_flatten0.x10_0_addr0;
+      x1.addr1 = comp_batch_flatten0.x10_0_addr1;
+      x1.write_data = comp_batch_flatten0.x10_0_write_data;
+      x1.write_en = comp_batch_flatten0.x10_0_write_en;
+      comp_batch_flatten0.x10_0_done = x1.done;
+      comp_batch_flatten0.go = 1'd1;
+      run_batch_flatten0[done] = comp_batch_flatten0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_batch_flatten;
+      run_batch_flatten0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/batch_matmul.expect b/frontends/relay-futil/tests/batch_matmul.expect
index 0bf73d4754..93a95d5712 100644
--- a/frontends/relay-futil/tests/batch_matmul.expect
+++ b/frontends/relay-futil/tests/batch_matmul.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component batch_matmul(go: 1, clk: 1, a0_0_0_read_data: 32, a0_0_0_done: 1, b0_0_0_read_data: 32, b0_0_0_done: 1, x0_0_0_read_data: 32, x0_0_0_done: 1) -> (done: 1, a0_0_0_addr0: 3, a0_0_0_addr1: 3, a0_0_0_addr2: 3, a0_0_0_write_data: 32, a0_0_0_write_en: 1, a0_0_0_clk: 1, b0_0_0_addr0: 3, b0_0_0_addr1: 3, b0_0_0_addr2: 3, b0_0_0_write_data: 32, b0_0_0_write_en: 1, b0_0_0_clk: 1, x0_0_0_addr0: 3, x0_0_0_addr1: 3, x0_0_0_addr2: 3, x0_0_0_write_data: 32, x0_0_0_write_en: 1, x0_0_0_clk: 1) {
+component batch_matmul0(go: 1, clk: 1, a0_0_0_read_data: 32, a0_0_0_done: 1, b0_0_0_read_data: 32, b0_0_0_done: 1, x0_0_0_read_data: 32, x0_0_0_done: 1) -> (done: 1, a0_0_0_addr0: 3, a0_0_0_addr1: 3, a0_0_0_addr2: 3, a0_0_0_write_data: 32, a0_0_0_write_en: 1, a0_0_0_clk: 1, b0_0_0_addr0: 3, b0_0_0_addr1: 3, b0_0_0_addr2: 3, b0_0_0_write_data: 32, b0_0_0_write_en: 1, b0_0_0_clk: 1, x0_0_0_addr0: 3, x0_0_0_addr1: 3, x0_0_0_addr2: 3, x0_0_0_write_data: 32, x0_0_0_write_en: 1, x0_0_0_clk: 1) {
   cells {
     a_read0_0 = prim std_reg(32);
     add0 = prim std_add(3);
@@ -402,31 +402,31 @@ component main () -> () {
     x = prim std_mem_d3(32, 4, 7, 7, 3, 3, 3);
     a = prim std_mem_d3(32, 4, 7, 5, 3, 3, 3);
     b = prim std_mem_d3(32, 4, 7, 5, 3, 3, 3);
-    batch_matmul0 = batch_matmul;
+    comp_batch_matmul0 = batch_matmul0;
   }
   wires {
-    group run_batch_matmul {
-      a.addr0 = batch_matmul0.a0_0_0_addr0;
-      batch_matmul0.a0_0_0_read_data = a.read_data;
-      a.addr1 = batch_matmul0.a0_0_0_addr1;
-      a.addr2 = batch_matmul0.a0_0_0_addr2;
-      b.addr0 = batch_matmul0.b0_0_0_addr0;
-      batch_matmul0.b0_0_0_read_data = b.read_data;
-      b.addr1 = batch_matmul0.b0_0_0_addr1;
-      b.addr2 = batch_matmul0.b0_0_0_addr2;
-      x.addr0 = batch_matmul0.x0_0_0_addr0;
-      x.addr1 = batch_matmul0.x0_0_0_addr1;
-      x.addr2 = batch_matmul0.x0_0_0_addr2;
-      x.write_data = batch_matmul0.x0_0_0_write_data;
-      x.write_en = batch_matmul0.x0_0_0_write_en;
-      batch_matmul0.x0_0_0_done = x.done;
-      batch_matmul0.go = 1'd1;
-      run_batch_matmul[done] = batch_matmul0.done ? 1'd1;
+    group run_batch_matmul0 {
+      a.addr0 = comp_batch_matmul0.a0_0_0_addr0;
+      comp_batch_matmul0.a0_0_0_read_data = a.read_data;
+      a.addr1 = comp_batch_matmul0.a0_0_0_addr1;
+      a.addr2 = comp_batch_matmul0.a0_0_0_addr2;
+      b.addr0 = comp_batch_matmul0.b0_0_0_addr0;
+      comp_batch_matmul0.b0_0_0_read_data = b.read_data;
+      b.addr1 = comp_batch_matmul0.b0_0_0_addr1;
+      b.addr2 = comp_batch_matmul0.b0_0_0_addr2;
+      x.addr0 = comp_batch_matmul0.x0_0_0_addr0;
+      x.addr1 = comp_batch_matmul0.x0_0_0_addr1;
+      x.addr2 = comp_batch_matmul0.x0_0_0_addr2;
+      x.write_data = comp_batch_matmul0.x0_0_0_write_data;
+      x.write_en = comp_batch_matmul0.x0_0_0_write_en;
+      comp_batch_matmul0.x0_0_0_done = x.done;
+      comp_batch_matmul0.go = 1'd1;
+      run_batch_matmul0[done] = comp_batch_matmul0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_batch_matmul;
+      run_batch_matmul0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/bias_add.expect b/frontends/relay-futil/tests/bias_add.expect
index 620da35d44..18ba0a8d0e 100644
--- a/frontends/relay-futil/tests/bias_add.expect
+++ b/frontends/relay-futil/tests/bias_add.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component bias_add(go: 1, clk: 1, bias0_read_data: 32, bias0_done: 1, x0_0_0_0_read_data: 32, x0_0_0_0_done: 1, x10_0_0_0_read_data: 32, x10_0_0_0_done: 1) -> (done: 1, bias0_addr0: 7, bias0_write_data: 32, bias0_write_en: 1, bias0_clk: 1, x0_0_0_0_addr0: 1, x0_0_0_0_addr1: 7, x0_0_0_0_addr2: 10, x0_0_0_0_addr3: 9, x0_0_0_0_write_data: 32, x0_0_0_0_write_en: 1, x0_0_0_0_clk: 1, x10_0_0_0_addr0: 1, x10_0_0_0_addr1: 7, x10_0_0_0_addr2: 10, x10_0_0_0_addr3: 9, x10_0_0_0_write_data: 32, x10_0_0_0_write_en: 1, x10_0_0_0_clk: 1) {
+component bias_add0(go: 1, clk: 1, bias0_read_data: 32, bias0_done: 1, x0_0_0_0_read_data: 32, x0_0_0_0_done: 1, x10_0_0_0_read_data: 32, x10_0_0_0_done: 1) -> (done: 1, bias0_addr0: 7, bias0_write_data: 32, bias0_write_en: 1, bias0_clk: 1, x0_0_0_0_addr0: 1, x0_0_0_0_addr1: 7, x0_0_0_0_addr2: 10, x0_0_0_0_addr3: 9, x0_0_0_0_write_data: 32, x0_0_0_0_write_en: 1, x0_0_0_0_clk: 1, x10_0_0_0_addr0: 1, x10_0_0_0_addr1: 7, x10_0_0_0_addr2: 10, x10_0_0_0_addr3: 9, x10_0_0_0_write_data: 32, x10_0_0_0_write_en: 1, x10_0_0_0_clk: 1) {
   cells {
     add0 = prim fixed_p_std_add(32, 16, 16);
     add1 = prim std_add(9);
@@ -167,27 +167,31 @@ component main () -> () {
     x1 = prim std_mem_d4(32, 1, 64, 512, 256, 1, 7, 10, 9);
     x = prim std_mem_d4(32, 1, 64, 512, 256, 1, 7, 10, 9);
     bias = prim std_mem_d1(32, 64, 7);
-    bias_add0 = bias_add;
+    comp_bias_add0 = bias_add0;
   }
   wires {
-    group run_bias_add {
-      x.addr0 = bias_add0.x0_0_0_0_addr0;
-      bias_add0.x0_0_0_0_read_data = x.read_data;
-      x.addr1 = bias_add0.x0_0_0_0_addr1;
-      x.addr2 = bias_add0.x0_0_0_0_addr2;
-      bias.addr0 = bias_add0.bias0_addr0;
-      bias_add0.bias0_read_data = bias.read_data;
-      x1.addr0 = bias_add0.x10_0_0_0_addr0;
-      x1.write_data = bias_add0.x10_0_0_0_write_data;
-      x1.write_en = bias_add0.x10_0_0_0_write_en;
-      bias_add0.x10_0_0_0_done = x1.done;
-      bias_add0.go = 1'd1;
-      run_bias_add[done] = bias_add0.done ? 1'd1;
+    group run_bias_add0 {
+      x.addr0 = comp_bias_add0.x0_0_0_0_addr0;
+      comp_bias_add0.x0_0_0_0_read_data = x.read_data;
+      x.addr1 = comp_bias_add0.x0_0_0_0_addr1;
+      x.addr2 = comp_bias_add0.x0_0_0_0_addr2;
+      x.addr3 = comp_bias_add0.x0_0_0_0_addr3;
+      bias.addr0 = comp_bias_add0.bias0_addr0;
+      comp_bias_add0.bias0_read_data = bias.read_data;
+      x1.addr0 = comp_bias_add0.x10_0_0_0_addr0;
+      x1.addr1 = comp_bias_add0.x10_0_0_0_addr1;
+      x1.addr2 = comp_bias_add0.x10_0_0_0_addr2;
+      x1.addr3 = comp_bias_add0.x10_0_0_0_addr3;
+      x1.write_data = comp_bias_add0.x10_0_0_0_write_data;
+      x1.write_en = comp_bias_add0.x10_0_0_0_write_en;
+      comp_bias_add0.x10_0_0_0_done = x1.done;
+      comp_bias_add0.go = 1'd1;
+      run_bias_add0[done] = comp_bias_add0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_bias_add;
+      run_bias_add0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/broadcast.expect b/frontends/relay-futil/tests/broadcast.expect
index 84f5962b54..5eb74f0ca1 100644
--- a/frontends/relay-futil/tests/broadcast.expect
+++ b/frontends/relay-futil/tests/broadcast.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component add(go: 1, clk: 1, x10_0_read_data: 32, x10_0_done: 1, x20_0_0_read_data: 32, x20_0_0_done: 1, x30_0_0_read_data: 32, x30_0_0_done: 1) -> (done: 1, x10_0_addr0: 2, x10_0_addr1: 2, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1, x20_0_0_addr0: 2, x20_0_0_addr1: 1, x20_0_0_addr2: 1, x20_0_0_write_data: 32, x20_0_0_write_en: 1, x20_0_0_clk: 1, x30_0_0_addr0: 2, x30_0_0_addr1: 2, x30_0_0_addr2: 2, x30_0_0_write_data: 32, x30_0_0_write_en: 1, x30_0_0_clk: 1) {
+component add0(go: 1, clk: 1, x10_0_read_data: 32, x10_0_done: 1, x20_0_0_read_data: 32, x20_0_0_done: 1, x30_0_0_read_data: 32, x30_0_0_done: 1) -> (done: 1, x10_0_addr0: 2, x10_0_addr1: 2, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1, x20_0_0_addr0: 2, x20_0_0_addr1: 1, x20_0_0_addr2: 1, x20_0_0_write_data: 32, x20_0_0_write_en: 1, x20_0_0_clk: 1, x30_0_0_addr0: 2, x30_0_0_addr1: 2, x30_0_0_addr2: 2, x30_0_0_write_data: 32, x30_0_0_write_en: 1, x30_0_0_clk: 1) {
   cells {
     add0 = prim std_add(32);
     add1 = prim std_add(2);
@@ -139,30 +139,30 @@ component main () -> () {
     x3 = prim std_mem_d3(32, 2, 2, 2, 2, 2, 2);
     x1 = prim std_mem_d2(32, 2, 2, 2, 2);
     x2 = prim std_mem_d3(32, 2, 1, 1, 2, 1, 1);
-    add0 = add;
+    comp_add0 = add0;
   }
   wires {
-    group run_add {
-      x1.addr0 = add0.x10_0_addr0;
-      add0.x10_0_read_data = x1.read_data;
-      x1.addr1 = add0.x10_0_addr1;
-      x2.addr0 = add0.x20_0_0_addr0;
-      add0.x20_0_0_read_data = x2.read_data;
-      x2.addr1 = add0.x20_0_0_addr1;
-      x2.addr2 = add0.x20_0_0_addr2;
-      x3.addr0 = add0.x30_0_0_addr0;
-      x3.addr1 = add0.x30_0_0_addr1;
-      x3.addr2 = add0.x30_0_0_addr2;
-      x3.write_data = add0.x30_0_0_write_data;
-      x3.write_en = add0.x30_0_0_write_en;
-      add0.x30_0_0_done = x3.done;
-      add0.go = 1'd1;
-      run_add[done] = add0.done ? 1'd1;
+    group run_add0 {
+      x1.addr0 = comp_add0.x10_0_addr0;
+      comp_add0.x10_0_read_data = x1.read_data;
+      x1.addr1 = comp_add0.x10_0_addr1;
+      x2.addr0 = comp_add0.x20_0_0_addr0;
+      comp_add0.x20_0_0_read_data = x2.read_data;
+      x2.addr1 = comp_add0.x20_0_0_addr1;
+      x2.addr2 = comp_add0.x20_0_0_addr2;
+      x3.addr0 = comp_add0.x30_0_0_addr0;
+      x3.addr1 = comp_add0.x30_0_0_addr1;
+      x3.addr2 = comp_add0.x30_0_0_addr2;
+      x3.write_data = comp_add0.x30_0_0_write_data;
+      x3.write_en = comp_add0.x30_0_0_write_en;
+      comp_add0.x30_0_0_done = x3.done;
+      comp_add0.go = 1'd1;
+      run_add0[done] = comp_add0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_add;
+      run_add0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/conv2d.expect b/frontends/relay-futil/tests/conv2d.expect
new file mode 100644
index 0000000000..aa5ca04744
--- /dev/null
+++ b/frontends/relay-futil/tests/conv2d.expect
@@ -0,0 +1,395 @@
+import "primitives/std.lib";
+
+component conv2d0(go: 1, clk: 1, data0_0_0_0_read_data: 32, data0_0_0_0_done: 1, weight0_0_0_0_read_data: 32, weight0_0_0_0_done: 1, x0_0_0_0_read_data: 32, x0_0_0_0_done: 1) -> (done: 1, data0_0_0_0_addr0: 3, data0_0_0_0_addr1: 10, data0_0_0_0_addr2: 4, data0_0_0_0_addr3: 4, data0_0_0_0_write_data: 32, data0_0_0_0_write_en: 1, data0_0_0_0_clk: 1, weight0_0_0_0_addr0: 10, weight0_0_0_0_addr1: 10, weight0_0_0_0_addr2: 2, weight0_0_0_0_addr3: 2, weight0_0_0_0_write_data: 32, weight0_0_0_0_write_en: 1, weight0_0_0_0_clk: 1, x0_0_0_0_addr0: 3, x0_0_0_0_addr1: 10, x0_0_0_0_addr2: 4, x0_0_0_0_addr3: 4, x0_0_0_0_write_data: 32, x0_0_0_0_write_en: 1, x0_0_0_0_clk: 1) {
+  cells {
+    add0 = prim std_add(32);
+    add1 = prim std_add(32);
+    add2 = prim fixed_p_std_add(32, 16, 16);
+    add3 = prim std_add(32);
+    add4 = prim std_add(32);
+    add5 = prim std_add(32);
+    add6 = prim std_add(32);
+    add7 = prim std_add(32);
+    add8 = prim std_add(32);
+    add9 = prim std_add(32);
+    b0 = prim std_reg(32);
+    bin_read0_0 = prim std_reg(32);
+    bin_read1_0 = prim std_reg(32);
+    bin_read2_0 = prim std_reg(32);
+    c0 = prim std_reg(32);
+    const0 = prim std_const(32, 0);
+    const1 = prim std_const(32, 4);
+    const10 = prim std_const(32, 0);
+    const11 = prim std_const(32, 2);
+    const12 = prim std_const(32, 0);
+    const13 = prim std_const(32, 2);
+    const14 = prim std_const(32, 1);
+    const15 = prim std_const(32, 1);
+    const16 = prim std_const(32, 1);
+    const17 = prim std_const(32, 1);
+    const18 = prim std_const(32, 1);
+    const19 = prim std_const(32, 1);
+    const2 = prim std_const(32, 0);
+    const20 = prim std_const(32, 1);
+    const21 = prim std_const(32, 1);
+    const22 = prim std_const(32, 1);
+    const3 = prim std_const(32, 511);
+    const4 = prim std_const(32, 0);
+    const5 = prim std_const(32, 13);
+    const6 = prim std_const(32, 0);
+    const7 = prim std_const(32, 13);
+    const8 = prim std_const(32, 0);
+    const9 = prim std_const(32, 511);
+    data_read0_0 = prim std_reg(32);
+    dx0 = prim std_reg(32);
+    dy0 = prim std_reg(32);
+    fpconst0 = prim fixed_p_std_const(32, 16, 16, 0, 0);
+    k0 = prim std_reg(32);
+    kernel_x_0 = prim std_reg(32);
+    kernel_y_0 = prim std_reg(32);
+    le0 = prim std_le(32);
+    le1 = prim std_le(32);
+    le2 = prim std_le(32);
+    le3 = prim std_le(32);
+    le4 = prim std_le(32);
+    le5 = prim std_le(32);
+    le6 = prim std_le(32);
+    mult_pipe0 = prim std_mult_pipe(32);
+    mult_pipe1 = prim std_mult_pipe(32);
+    mult_pipe2 = prim std_mult_pipe(32);
+    slice0 = prim std_slice(32, 3);
+    slice1 = prim std_slice(32, 10);
+    slice10 = prim std_slice(32, 10);
+    slice11 = prim std_slice(32, 4);
+    slice12 = prim std_slice(32, 4);
+    slice2 = prim std_slice(32, 4);
+    slice3 = prim std_slice(32, 4);
+    slice4 = prim std_slice(32, 10);
+    slice5 = prim std_slice(32, 10);
+    slice6 = prim std_slice(32, 2);
+    slice7 = prim std_slice(32, 2);
+    slice8 = prim std_slice(32, 32);
+    slice9 = prim std_slice(32, 3);
+    sum_0 = prim std_reg(32);
+    weight_read0_0 = prim std_reg(32);
+    x0 = prim std_reg(32);
+    y0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = b0.out;
+      le0.right = const1.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = c0.out;
+      le1.right = const3.out;
+    }
+    group cond2<"static"=0> {
+      cond2[done] = 1'd1;
+      le2.left = y0.out;
+      le2.right = const5.out;
+    }
+    group cond3<"static"=0> {
+      cond3[done] = 1'd1;
+      le3.left = x0.out;
+      le3.right = const7.out;
+    }
+    group cond4<"static"=0> {
+      cond4[done] = 1'd1;
+      le4.left = k0.out;
+      le4.right = const9.out;
+    }
+    group cond5<"static"=0> {
+      cond5[done] = 1'd1;
+      le5.left = dy0.out;
+      le5.right = const11.out;
+    }
+    group cond6<"static"=0> {
+      cond6[done] = 1'd1;
+      le6.left = dx0.out;
+      le6.right = const13.out;
+    }
+    group let0<"static"=1> {
+      b0.in = const0.out;
+      b0.write_en = 1'd1;
+      let0[done] = b0.done;
+    }
+    group let1<"static"=1> {
+      c0.in = const2.out;
+      c0.write_en = 1'd1;
+      let1[done] = c0.done;
+    }
+    group let10<"static"=4> {
+      bin_read1_0.in = mult_pipe1.out;
+      bin_read1_0.write_en = mult_pipe1.done;
+      let10[done] = bin_read1_0.done;
+      mult_pipe1.left = const15.out;
+      mult_pipe1.right = x0.out;
+      mult_pipe1.go = !mult_pipe1.done ? 1'd1;
+    }
+    group let11<"static"=1> {
+      kernel_x_0.in = add1.out;
+      kernel_x_0.write_en = 1'd1;
+      let11[done] = kernel_x_0.done;
+      add1.left = bin_read1_0.out;
+      add1.right = dx0.out;
+    }
+    group let12<"static"=1> {
+      bin_read2_0.in = slice8.out;
+      bin_read2_0.write_en = 1'd1;
+      let12[done] = bin_read2_0.done;
+      slice8.in = mult_pipe2.out;
+      mult_pipe2.left = data_read0_0.out;
+      mult_pipe2.right = weight_read0_0.out;
+      mult_pipe2.go = !mult_pipe2.done ? 1'd1;
+    }
+    group let2<"static"=1> {
+      y0.in = const4.out;
+      y0.write_en = 1'd1;
+      let2[done] = y0.done;
+    }
+    group let3<"static"=1> {
+      x0.in = const6.out;
+      x0.write_en = 1'd1;
+      let3[done] = x0.done;
+    }
+    group let4<"static"=1> {
+      sum_0.in = fpconst0.out;
+      sum_0.write_en = 1'd1;
+      let4[done] = sum_0.done;
+    }
+    group let5<"static"=1> {
+      k0.in = const8.out;
+      k0.write_en = 1'd1;
+      let5[done] = k0.done;
+    }
+    group let6<"static"=1> {
+      dy0.in = const10.out;
+      dy0.write_en = 1'd1;
+      let6[done] = dy0.done;
+    }
+    group let7<"static"=1> {
+      dx0.in = const12.out;
+      dx0.write_en = 1'd1;
+      let7[done] = dx0.done;
+    }
+    group let8<"static"=4> {
+      bin_read0_0.in = mult_pipe0.out;
+      bin_read0_0.write_en = mult_pipe0.done;
+      let8[done] = bin_read0_0.done;
+      mult_pipe0.left = const14.out;
+      mult_pipe0.right = y0.out;
+      mult_pipe0.go = !mult_pipe0.done ? 1'd1;
+    }
+    group let9<"static"=1> {
+      kernel_y_0.in = add0.out;
+      kernel_y_0.write_en = 1'd1;
+      let9[done] = kernel_y_0.done;
+      add0.left = bin_read0_0.out;
+      add0.right = dy0.out;
+    }
+    group upd0<"static"=1> {
+      data_read0_0.write_en = 1'd1;
+      data0_0_0_0_addr3 = slice3.out;
+      slice3.in = kernel_x_0.out;
+      data0_0_0_0_addr2 = slice2.out;
+      slice2.in = kernel_y_0.out;
+      data0_0_0_0_addr1 = slice1.out;
+      slice1.in = k0.out;
+      data0_0_0_0_addr0 = slice0.out;
+      slice0.in = b0.out;
+      data_read0_0.in = 1'd1 ? data0_0_0_0_read_data;
+      upd0[done] = data_read0_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      weight_read0_0.write_en = 1'd1;
+      weight0_0_0_0_addr3 = slice7.out;
+      slice7.in = dx0.out;
+      weight0_0_0_0_addr2 = slice6.out;
+      slice6.in = dy0.out;
+      weight0_0_0_0_addr1 = slice5.out;
+      slice5.in = k0.out;
+      weight0_0_0_0_addr0 = slice4.out;
+      slice4.in = c0.out;
+      weight_read0_0.in = 1'd1 ? weight0_0_0_0_read_data;
+      upd1[done] = weight_read0_0.done ? 1'd1;
+    }
+    group upd10<"static"=1> {
+      b0.write_en = 1'd1;
+      add9.left = b0.out;
+      add9.right = const22.out;
+      b0.in = 1'd1 ? add9.out;
+      upd10[done] = b0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      sum_0.write_en = 1'd1;
+      add2.left = sum_0.out;
+      add2.right = bin_read2_0.out;
+      sum_0.in = 1'd1 ? add2.out;
+      upd2[done] = sum_0.done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      dx0.write_en = 1'd1;
+      add3.left = dx0.out;
+      add3.right = const16.out;
+      dx0.in = 1'd1 ? add3.out;
+      upd3[done] = dx0.done ? 1'd1;
+    }
+    group upd4<"static"=1> {
+      dy0.write_en = 1'd1;
+      add4.left = dy0.out;
+      add4.right = const17.out;
+      dy0.in = 1'd1 ? add4.out;
+      upd4[done] = dy0.done ? 1'd1;
+    }
+    group upd5<"static"=1> {
+      k0.write_en = 1'd1;
+      add5.left = k0.out;
+      add5.right = const18.out;
+      k0.in = 1'd1 ? add5.out;
+      upd5[done] = k0.done ? 1'd1;
+    }
+    group upd6<"static"=1> {
+      x0_0_0_0_addr3 = slice12.out;
+      slice12.in = x0.out;
+      x0_0_0_0_addr2 = slice11.out;
+      slice11.in = y0.out;
+      x0_0_0_0_addr1 = slice10.out;
+      slice10.in = c0.out;
+      x0_0_0_0_addr0 = slice9.out;
+      slice9.in = b0.out;
+      x0_0_0_0_write_en = 1'd1;
+      x0_0_0_0_write_data = 1'd1 ? sum_0.out;
+      upd6[done] = x0_0_0_0_done ? 1'd1;
+    }
+    group upd7<"static"=1> {
+      x0.write_en = 1'd1;
+      add6.left = x0.out;
+      add6.right = const19.out;
+      x0.in = 1'd1 ? add6.out;
+      upd7[done] = x0.done ? 1'd1;
+    }
+    group upd8<"static"=1> {
+      y0.write_en = 1'd1;
+      add7.left = y0.out;
+      add7.right = const20.out;
+      y0.in = 1'd1 ? add7.out;
+      upd8[done] = y0.done ? 1'd1;
+    }
+    group upd9<"static"=1> {
+      c0.write_en = 1'd1;
+      add8.left = c0.out;
+      add8.right = const21.out;
+      c0.in = 1'd1 ? add8.out;
+      upd9[done] = c0.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      while le0.out with cond0 {
+        seq {
+          let1;
+          while le1.out with cond1 {
+            seq {
+              let2;
+              while le2.out with cond2 {
+                seq {
+                  let3;
+                  while le3.out with cond3 {
+                    seq {
+                      par {
+                        let4;
+                        seq {
+                          let5;
+                          while le4.out with cond4 {
+                            seq {
+                              let6;
+                              while le5.out with cond5 {
+                                seq {
+                                  let7;
+                                  while le6.out with cond6 {
+                                    seq {
+                                      par {
+                                        seq {
+                                          let8;
+                                          let9;
+                                        }
+                                        seq {
+                                          let10;
+                                          let11;
+                                        }
+                                      }
+                                      par {
+                                        upd0;
+                                        upd1;
+                                      }
+                                      let12;
+                                      upd2;
+                                      upd3;
+                                    }
+                                  }
+                                  upd4;
+                                }
+                              }
+                              upd5;
+                            }
+                          }
+                        }
+                      }
+                      upd6;
+                      upd7;
+                    }
+                  }
+                  upd8;
+                }
+              }
+              upd9;
+            }
+          }
+          upd10;
+        }
+      }
+    }
+  }
+}
+
+component main () -> () {
+  cells {
+    x = prim std_mem_d4(32, 5, 512, 14, 14, 3, 10, 4, 4);
+    data = prim std_mem_d4(32, 5, 512, 14, 14, 3, 10, 4, 4);
+    weight = prim std_mem_d4(32, 512, 512, 3, 3, 10, 10, 2, 2);
+    comp_conv2d0 = conv2d0;
+  }
+  wires {
+    group run_conv2d0 {
+      data.addr0 = comp_conv2d0.data0_0_0_0_addr0;
+      comp_conv2d0.data0_0_0_0_read_data = data.read_data;
+      data.addr1 = comp_conv2d0.data0_0_0_0_addr1;
+      data.addr2 = comp_conv2d0.data0_0_0_0_addr2;
+      data.addr3 = comp_conv2d0.data0_0_0_0_addr3;
+      weight.addr0 = comp_conv2d0.weight0_0_0_0_addr0;
+      comp_conv2d0.weight0_0_0_0_read_data = weight.read_data;
+      weight.addr1 = comp_conv2d0.weight0_0_0_0_addr1;
+      weight.addr2 = comp_conv2d0.weight0_0_0_0_addr2;
+      weight.addr3 = comp_conv2d0.weight0_0_0_0_addr3;
+      x.addr0 = comp_conv2d0.x0_0_0_0_addr0;
+      x.addr1 = comp_conv2d0.x0_0_0_0_addr1;
+      x.addr2 = comp_conv2d0.x0_0_0_0_addr2;
+      x.addr3 = comp_conv2d0.x0_0_0_0_addr3;
+      x.write_data = comp_conv2d0.x0_0_0_0_write_data;
+      x.write_en = comp_conv2d0.x0_0_0_0_write_en;
+      comp_conv2d0.x0_0_0_0_done = x.done;
+      comp_conv2d0.go = 1'd1;
+      run_conv2d0[done] = comp_conv2d0.done ? 1'd1;
+    }
+  }
+  control {
+    seq {
+      run_conv2d0;
+    }
+  }
+}
diff --git a/frontends/relay-futil/tests/conv2d.relay b/frontends/relay-futil/tests/conv2d.relay
new file mode 100644
index 0000000000..e759bab61a
--- /dev/null
+++ b/frontends/relay-futil/tests/conv2d.relay
@@ -0,0 +1,6 @@
+v0.0.4
+fn (%data: Tensor[(5, 512, 14, 14), float32], %weight: Tensor[(512, 512, 3, 3), float32]) -> Tensor[(5, 512, 14, 14), float32] {
+  let %x: Tensor[(5, 512, 14, 14), float32] = nn.conv2d(%data, %weight, padding=[1, 1, 1, 1], channels=512, kernel_size=[3, 3]) /* ty=Tensor[(5, 512, 14, 14), float32] */;
+  %x
+}
+
diff --git a/frontends/relay-futil/tests/data/conv2d.expect b/frontends/relay-futil/tests/data/conv2d.expect
new file mode 100644
index 0000000000..2f8cb5e0be
--- /dev/null
+++ b/frontends/relay-futil/tests/data/conv2d.expect
@@ -0,0 +1,120 @@
+{
+  "data": [
+    [
+      [
+        [
+          1,
+          1
+        ],
+        [
+          4,
+          1
+        ]
+      ],
+      [
+        [
+          1,
+          1
+        ],
+        [
+          1,
+          1
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          1,
+          1
+        ],
+        [
+          1,
+          1
+        ]
+      ],
+      [
+        [
+          1,
+          1
+        ],
+        [
+          1,
+          1
+        ]
+      ]
+    ]
+  ],
+  "weight": [
+    [
+      [
+        [
+          2,
+          1
+        ],
+        [
+          1,
+          1
+        ]
+      ],
+      [
+        [
+          1,
+          1
+        ],
+        [
+          1,
+          1
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          1,
+          1
+        ],
+        [
+          1,
+          1
+        ]
+      ],
+      [
+        [
+          1,
+          3
+        ],
+        [
+          1,
+          4
+        ]
+      ]
+    ]
+  ],
+  "x": [
+    [
+      [
+        [
+          12
+        ]
+      ],
+      [
+        [
+          16
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          9
+        ]
+      ],
+      [
+        [
+          13
+        ]
+      ]
+    ]
+  ]
+}
diff --git a/frontends/relay-futil/tests/data/conv2d.relay b/frontends/relay-futil/tests/data/conv2d.relay
new file mode 100644
index 0000000000..168e53e418
--- /dev/null
+++ b/frontends/relay-futil/tests/data/conv2d.relay
@@ -0,0 +1,5 @@
+v0.0.4
+fn (%data: Tensor[(2, 2, 2, 2), int32], %weight: Tensor[(2, 2, 2, 2), int32]) {
+  let %x = nn.conv2d(%data, %weight, channels=2, kernel_size=[2,2]);
+  %x
+}
diff --git a/frontends/relay-futil/tests/data/conv2d.relay.data b/frontends/relay-futil/tests/data/conv2d.relay.data
new file mode 100644
index 0000000000..81591e0997
--- /dev/null
+++ b/frontends/relay-futil/tests/data/conv2d.relay.data
@@ -0,0 +1,14 @@
+{
+  "data": {
+    "data": [ [[[1,1], [4,1]], [[1,1], [1,1]]], [[[1,1], [1,1]], [[1,1], [1,1]]] ],
+    "bitwidth": 32
+  },
+  "weight": {
+    "data": [ [[[2,1], [1,1]], [[1,1], [1,1]]], [[[1,1], [1,1]], [[1,3], [1,4]]] ],
+    "bitwidth": 32
+  },
+  "x": {
+    "data": [ [[[0]], [[0]]], [[[0]], [[0]]] ],
+    "bitwidth": 32
+  }
+}
\ No newline at end of file
diff --git a/frontends/relay-futil/tests/data/max_pool2d.expect b/frontends/relay-futil/tests/data/max_pool2d.expect
new file mode 100644
index 0000000000..2e4f5739ae
--- /dev/null
+++ b/frontends/relay-futil/tests/data/max_pool2d.expect
@@ -0,0 +1,158 @@
+{
+  "data": [
+    [
+      [
+        [
+          10,
+          20,
+          100,
+          101
+        ],
+        [
+          30,
+          40,
+          102,
+          103
+        ],
+        [
+          20,
+          30,
+          100,
+          103
+        ],
+        [
+          10,
+          40,
+          103,
+          100
+        ]
+      ],
+      [
+        [
+          20,
+          0,
+          70,
+          25
+        ],
+        [
+          1,
+          2,
+          13,
+          4
+        ],
+        [
+          1,
+          2,
+          5,
+          6
+        ],
+        [
+          3,
+          4,
+          7,
+          8
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          11,
+          21,
+          109,
+          10
+        ],
+        [
+          31,
+          41,
+          0,
+          14
+        ],
+        [
+          19,
+          42,
+          1,
+          103
+        ],
+        [
+          1,
+          18,
+          10,
+          101
+        ]
+      ],
+      [
+        [
+          1,
+          2,
+          4,
+          3
+        ],
+        [
+          3,
+          4,
+          2,
+          1
+        ],
+        [
+          4,
+          2,
+          2,
+          4
+        ],
+        [
+          1,
+          3,
+          3,
+          1
+        ]
+      ]
+    ]
+  ],
+  "result": [
+    [
+      [
+        [
+          40,
+          103
+        ],
+        [
+          40,
+          103
+        ]
+      ],
+      [
+        [
+          20,
+          70
+        ],
+        [
+          4,
+          8
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          41,
+          109
+        ],
+        [
+          42,
+          103
+        ]
+      ],
+      [
+        [
+          4,
+          4
+        ],
+        [
+          4,
+          4
+        ]
+      ]
+    ]
+  ]
+}
diff --git a/frontends/relay-futil/tests/data/max_pool2d.relay b/frontends/relay-futil/tests/data/max_pool2d.relay
new file mode 100644
index 0000000000..e1ba79d351
--- /dev/null
+++ b/frontends/relay-futil/tests/data/max_pool2d.relay
@@ -0,0 +1,6 @@
+v0.0.4
+fn (%data: Tensor[(2, 2, 4, 4), int32]) {
+  let %result: Tensor[(2, 2, 2, 2), int32] = nn.max_pool2d(%data, pool_size=[2, 2], strides=[2, 2], padding=[0,0,0,0]);
+  %result
+}
+
diff --git a/frontends/relay-futil/tests/data/max_pool2d.relay.data b/frontends/relay-futil/tests/data/max_pool2d.relay.data
new file mode 100644
index 0000000000..517b34c9cc
--- /dev/null
+++ b/frontends/relay-futil/tests/data/max_pool2d.relay.data
@@ -0,0 +1,43 @@
+{
+  "data": {
+    "data": [
+             [
+              [
+               [10,20,  100,101],
+               [30,40,  102,103],
+
+               [20,30,  100,103],
+               [10,40,  103,100]
+              ],
+              [
+               [20,0,     70,25],
+               [1, 2,      13,4],
+
+               [1,2,        5,6],
+               [3,4,        7,8]
+              ]
+             ],
+             [
+              [
+               [11,21,   109,10],
+               [31,41,     0,14],
+
+               [19,42,    1,103],
+               [1,18,    10,101]
+              ],
+              [
+               [1,2,        4,3],
+               [3,4,        2,1],
+
+               [4,2,        2,4],
+               [1,3,        3,1]
+              ]
+             ]
+            ],
+    "bitwidth": 32
+  },
+  "result": {
+    "data": [ [[[0,0], [0,0]], [[0,0], [0,0]]], [[[0,0], [0,0]], [[0,0], [0,0]]] ],
+    "bitwidth": 32
+  }
+}
\ No newline at end of file
diff --git a/frontends/relay-futil/tests/data/softmax.expect b/frontends/relay-futil/tests/data/softmax.expect
new file mode 100644
index 0000000000..1073dc7c6c
--- /dev/null
+++ b/frontends/relay-futil/tests/data/softmax.expect
@@ -0,0 +1,14 @@
+{
+  "x": [
+    [
+      4,
+      16
+    ]
+  ],
+  "x1": [
+    [
+      0,
+      0
+    ]
+  ]
+}
diff --git a/frontends/relay-futil/tests/data/softmax.relay b/frontends/relay-futil/tests/data/softmax.relay
new file mode 100644
index 0000000000..858ae52126
--- /dev/null
+++ b/frontends/relay-futil/tests/data/softmax.relay
@@ -0,0 +1,6 @@
+v0.0.4
+fn (%x: Tensor[(1, 2), float32]) {
+  let %x1: Tensor[(1, 2), float32] = nn.softmax(%x);
+  %x1
+}
+
diff --git a/frontends/relay-futil/tests/data/softmax.relay.data b/frontends/relay-futil/tests/data/softmax.relay.data
new file mode 100644
index 0000000000..f0d81e4e55
--- /dev/null
+++ b/frontends/relay-futil/tests/data/softmax.relay.data
@@ -0,0 +1,10 @@
+{
+  "x": {
+    "data": [[4, 16]],
+    "bitwidth": 32
+  },
+  "x1": {
+    "data": [[0, 0]],
+    "bitwidth": 32
+  }
+}
\ No newline at end of file
diff --git a/frontends/relay-futil/tests/data/tensor4d_multiply.expect b/frontends/relay-futil/tests/data/tensor4d_multiply.expect
new file mode 100644
index 0000000000..bd548739d2
--- /dev/null
+++ b/frontends/relay-futil/tests/data/tensor4d_multiply.expect
@@ -0,0 +1,344 @@
+{
+  "x": [
+    [
+      [
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ]
+      ]
+    ]
+  ],
+  "x1": [
+    [
+      [
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ],
+        [
+          1,
+          2,
+          3,
+          4
+        ]
+      ]
+    ]
+  ],
+  "x2": [
+    [
+      [
+        [
+          1,
+          4,
+          9,
+          16
+        ],
+        [
+          1,
+          4,
+          9,
+          16
+        ],
+        [
+          1,
+          4,
+          9,
+          16
+        ],
+        [
+          1,
+          4,
+          9,
+          16
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          1,
+          4,
+          9,
+          16
+        ],
+        [
+          1,
+          4,
+          9,
+          16
+        ],
+        [
+          1,
+          4,
+          9,
+          16
+        ],
+        [
+          1,
+          4,
+          9,
+          16
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          1,
+          4,
+          9,
+          16
+        ],
+        [
+          1,
+          4,
+          9,
+          16
+        ],
+        [
+          1,
+          4,
+          9,
+          16
+        ],
+        [
+          1,
+          4,
+          9,
+          16
+        ]
+      ]
+    ],
+    [
+      [
+        [
+          1,
+          4,
+          9,
+          16
+        ],
+        [
+          1,
+          4,
+          9,
+          16
+        ],
+        [
+          1,
+          4,
+          9,
+          16
+        ],
+        [
+          1,
+          4,
+          9,
+          16
+        ]
+      ]
+    ]
+  ]
+}
diff --git a/frontends/relay-futil/tests/data/tensor4d_multiply.relay b/frontends/relay-futil/tests/data/tensor4d_multiply.relay
new file mode 100644
index 0000000000..197d3c9564
--- /dev/null
+++ b/frontends/relay-futil/tests/data/tensor4d_multiply.relay
@@ -0,0 +1,5 @@
+v0.0.4
+fn (%x: Tensor[(2, 2, 4, 4), int32], %x1: Tensor[(2, 2, 4, 4), int32]) {
+  let %x2: Tensor[(2, 2, 4, 4), int32] = multiply(%x, %x1);
+  %x2
+}
diff --git a/frontends/relay-futil/tests/data/tensor4d_multiply.relay.data b/frontends/relay-futil/tests/data/tensor4d_multiply.relay.data
new file mode 100644
index 0000000000..6cdaa8c7a7
--- /dev/null
+++ b/frontends/relay-futil/tests/data/tensor4d_multiply.relay.data
@@ -0,0 +1,23 @@
+{
+  "x": {
+    "data": [
+            [[[1,2,3,4], [1,2,3,4], [1,2,3,4], [1,2,3,4]]], [[[1,2,3,4], [1,2,3,4], [1,2,3,4], [1,2,3,4]]],
+            [[[1,2,3,4], [1,2,3,4], [1,2,3,4], [1,2,3,4]]], [[[1,2,3,4], [1,2,3,4], [1,2,3,4], [1,2,3,4]]]
+            ],
+    "bitwidth": 32
+  },
+  "x1": {
+    "data": [
+            [[[1,2,3,4], [1,2,3,4], [1,2,3,4], [1,2,3,4]]], [[[1,2,3,4], [1,2,3,4], [1,2,3,4], [1,2,3,4]]],
+            [[[1,2,3,4], [1,2,3,4], [1,2,3,4], [1,2,3,4]]], [[[1,2,3,4], [1,2,3,4], [1,2,3,4], [1,2,3,4]]]
+            ],
+    "bitwidth": 32
+  },
+  "x2": {
+    "data": [
+            [[[0,0,0,0], [0,0,0,0], [0,0,0,0], [0,0,0,0]]], [[[0,0,0,0], [0,0,0,0], [0,0,0,0], [0,0,0,0]]],
+            [[[0,0,0,0], [0,0,0,0], [0,0,0,0], [0,0,0,0]]], [[[0,0,0,0], [0,0,0,0], [0,0,0,0], [0,0,0,0]]]
+            ],
+    "bitwidth": 32
+  }
+}
\ No newline at end of file
diff --git a/frontends/relay-futil/tests/dense.expect b/frontends/relay-futil/tests/dense.expect
index a0d5ead2b1..9ca0f57adb 100644
--- a/frontends/relay-futil/tests/dense.expect
+++ b/frontends/relay-futil/tests/dense.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component dense(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1, y0_0_read_data: 32, y0_0_done: 1) -> (done: 1, x0_0_addr0: 1, x0_0_addr1: 13, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 4, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1, y0_0_addr0: 4, y0_0_addr1: 13, y0_0_write_data: 32, y0_0_write_en: 1, y0_0_clk: 1) {
+component dense0(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1, y0_0_read_data: 32, y0_0_done: 1) -> (done: 1, x0_0_addr0: 1, x0_0_addr1: 13, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 4, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1, y0_0_addr0: 4, y0_0_addr1: 13, y0_0_write_data: 32, y0_0_write_en: 1, y0_0_clk: 1) {
   cells {
     add0 = prim std_add(13);
     add1 = prim std_add(4);
@@ -307,28 +307,28 @@ component main () -> () {
     x1 = prim std_mem_d2(32, 1, 10, 1, 4);
     x = prim std_mem_d2(32, 1, 4096, 1, 13);
     y = prim std_mem_d2(32, 10, 4096, 4, 13);
-    dense0 = dense;
+    comp_dense0 = dense0;
   }
   wires {
-    group run_dense {
-      x.addr0 = dense0.x0_0_addr0;
-      dense0.x0_0_read_data = x.read_data;
-      x.addr1 = dense0.x0_0_addr1;
-      y.addr0 = dense0.y0_0_addr0;
-      dense0.y0_0_read_data = y.read_data;
-      y.addr1 = dense0.y0_0_addr1;
-      x1.addr0 = dense0.x10_0_addr0;
-      x1.addr1 = dense0.x10_0_addr1;
-      x1.write_data = dense0.x10_0_write_data;
-      x1.write_en = dense0.x10_0_write_en;
-      dense0.x10_0_done = x1.done;
-      dense0.go = 1'd1;
-      run_dense[done] = dense0.done ? 1'd1;
+    group run_dense0 {
+      x.addr0 = comp_dense0.x0_0_addr0;
+      comp_dense0.x0_0_read_data = x.read_data;
+      x.addr1 = comp_dense0.x0_0_addr1;
+      y.addr0 = comp_dense0.y0_0_addr0;
+      comp_dense0.y0_0_read_data = y.read_data;
+      y.addr1 = comp_dense0.y0_0_addr1;
+      x1.addr0 = comp_dense0.x10_0_addr0;
+      x1.addr1 = comp_dense0.x10_0_addr1;
+      x1.write_data = comp_dense0.x10_0_write_data;
+      x1.write_en = comp_dense0.x10_0_write_en;
+      comp_dense0.x10_0_done = x1.done;
+      comp_dense0.go = 1'd1;
+      run_dense0[done] = comp_dense0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_dense;
+      run_dense0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/fixed_point_add.expect b/frontends/relay-futil/tests/fixed_point_add.expect
index aa8240b4cf..9c4910177e 100644
--- a/frontends/relay-futil/tests/fixed_point_add.expect
+++ b/frontends/relay-futil/tests/fixed_point_add.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component add(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, y0_read_data: 32, y0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, x0_addr0: 1, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, y0_addr0: 1, y0_write_data: 32, y0_write_en: 1, y0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
+component add0(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, y0_read_data: 32, y0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, x0_addr0: 1, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, y0_addr0: 1, y0_write_data: 32, y0_write_en: 1, y0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
   cells {
     add0 = prim fixed_p_std_add(32, 16, 16);
     add1 = prim std_add(1);
@@ -74,25 +74,25 @@ component main () -> () {
     z = prim std_mem_d1(32, 1, 1);
     x = prim std_mem_d1(32, 1, 1);
     y = prim std_mem_d1(32, 1, 1);
-    add0 = add;
+    comp_add0 = add0;
   }
   wires {
-    group run_add {
-      x.addr0 = add0.x0_addr0;
-      add0.x0_read_data = x.read_data;
-      y.addr0 = add0.y0_addr0;
-      add0.y0_read_data = y.read_data;
-      z.addr0 = add0.z0_addr0;
-      z.write_data = add0.z0_write_data;
-      z.write_en = add0.z0_write_en;
-      add0.z0_done = z.done;
-      add0.go = 1'd1;
-      run_add[done] = add0.done ? 1'd1;
+    group run_add0 {
+      x.addr0 = comp_add0.x0_addr0;
+      comp_add0.x0_read_data = x.read_data;
+      y.addr0 = comp_add0.y0_addr0;
+      comp_add0.y0_read_data = y.read_data;
+      z.addr0 = comp_add0.z0_addr0;
+      z.write_data = comp_add0.z0_write_data;
+      z.write_en = comp_add0.z0_write_en;
+      comp_add0.z0_done = z.done;
+      comp_add0.go = 1'd1;
+      run_add0[done] = comp_add0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_add;
+      run_add0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/let1.expect b/frontends/relay-futil/tests/let1.expect
index cf228003ae..e59cbebcd9 100644
--- a/frontends/relay-futil/tests/let1.expect
+++ b/frontends/relay-futil/tests/let1.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component multiply(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, b0_read_data: 32, b0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, b0_addr0: 1, b0_write_data: 32, b0_write_en: 1, b0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
+component multiply0(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, b0_read_data: 32, b0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, b0_addr0: 1, b0_write_data: 32, b0_write_en: 1, b0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
   cells {
     a_read0_0 = prim std_reg(32);
     add0 = prim std_add(1);
@@ -82,25 +82,25 @@ component main () -> () {
     z = prim std_mem_d1(32, 1, 1);
     a = prim std_mem_d1(32, 1, 1);
     b = prim std_mem_d1(32, 1, 1);
-    multiply0 = multiply;
+    comp_multiply0 = multiply0;
   }
   wires {
-    group run_multiply {
-      a.addr0 = multiply0.a0_addr0;
-      multiply0.a0_read_data = a.read_data;
-      b.addr0 = multiply0.b0_addr0;
-      multiply0.b0_read_data = b.read_data;
-      z.addr0 = multiply0.z0_addr0;
-      z.write_data = multiply0.z0_write_data;
-      z.write_en = multiply0.z0_write_en;
-      multiply0.z0_done = z.done;
-      multiply0.go = 1'd1;
-      run_multiply[done] = multiply0.done ? 1'd1;
+    group run_multiply0 {
+      a.addr0 = comp_multiply0.a0_addr0;
+      comp_multiply0.a0_read_data = a.read_data;
+      b.addr0 = comp_multiply0.b0_addr0;
+      comp_multiply0.b0_read_data = b.read_data;
+      z.addr0 = comp_multiply0.z0_addr0;
+      z.write_data = comp_multiply0.z0_write_data;
+      z.write_en = comp_multiply0.z0_write_en;
+      comp_multiply0.z0_done = z.done;
+      comp_multiply0.go = 1'd1;
+      run_multiply0[done] = comp_multiply0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_multiply;
+      run_multiply0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/let2.expect b/frontends/relay-futil/tests/let2.expect
index b9a9bfd9ec..451a17e8df 100644
--- a/frontends/relay-futil/tests/let2.expect
+++ b/frontends/relay-futil/tests/let2.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component add(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32, d0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1, d0_addr0: 1, d0_write_data: 32, d0_write_en: 1, d0_clk: 1) {
+component add0(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32, d0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1, d0_addr0: 1, d0_write_data: 32, d0_write_en: 1, d0_clk: 1) {
   cells {
     a_read0_0 = prim std_reg(32);
     add0 = prim std_add(32);
@@ -68,7 +68,8 @@ component add(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32, c0_
     }
   }
 }
-component multiply(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, b0_read_data: 32, b0_done: 1, c0_read_data: 32, c0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, b0_addr0: 1, b0_write_data: 32, b0_write_en: 1, b0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1) {
+
+component multiply0(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, b0_read_data: 32, b0_done: 1, c0_read_data: 32, c0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, b0_addr0: 1, b0_write_data: 32, b0_write_en: 1, b0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1) {
   cells {
     a_read0_0 = prim std_reg(32);
     add0 = prim std_add(1);
@@ -150,40 +151,40 @@ component main () -> () {
     d = prim std_mem_d1(32, 1, 1);
     c = prim std_mem_d1(32, 1, 1);
     a = prim std_mem_d1(32, 1, 1);
-    add0 = add;
+    comp_add0 = add0;
     b = prim std_mem_d1(32, 1, 1);
-    multiply0 = multiply;
+    comp_multiply0 = multiply0;
   }
   wires {
-    group run_multiply {
-      a.addr0 = multiply0.a0_addr0;
-      multiply0.a0_read_data = a.read_data;
-      b.addr0 = multiply0.b0_addr0;
-      multiply0.b0_read_data = b.read_data;
-      c.addr0 = multiply0.c0_addr0;
-      c.write_data = multiply0.c0_write_data;
-      c.write_en = multiply0.c0_write_en;
-      multiply0.c0_done = c.done;
-      multiply0.go = 1'd1;
-      run_multiply[done] = multiply0.done ? 1'd1;
-    }
-    group run_add {
-      c.addr0 = add0.c0_addr0;
-      add0.c0_read_data = c.read_data;
-      a.addr0 = add0.a0_addr0;
-      add0.a0_read_data = a.read_data;
-      d.addr0 = add0.d0_addr0;
-      d.write_data = add0.d0_write_data;
-      d.write_en = add0.d0_write_en;
-      add0.d0_done = d.done;
-      add0.go = 1'd1;
-      run_add[done] = add0.done ? 1'd1;
+    group run_multiply0 {
+      a.addr0 = comp_multiply0.a0_addr0;
+      comp_multiply0.a0_read_data = a.read_data;
+      b.addr0 = comp_multiply0.b0_addr0;
+      comp_multiply0.b0_read_data = b.read_data;
+      c.addr0 = comp_multiply0.c0_addr0;
+      c.write_data = comp_multiply0.c0_write_data;
+      c.write_en = comp_multiply0.c0_write_en;
+      comp_multiply0.c0_done = c.done;
+      comp_multiply0.go = 1'd1;
+      run_multiply0[done] = comp_multiply0.done ? 1'd1;
+    }
+    group run_add0 {
+      c.addr0 = comp_add0.c0_addr0;
+      comp_add0.c0_read_data = c.read_data;
+      a.addr0 = comp_add0.a0_addr0;
+      comp_add0.a0_read_data = a.read_data;
+      d.addr0 = comp_add0.d0_addr0;
+      d.write_data = comp_add0.d0_write_data;
+      d.write_en = comp_add0.d0_write_en;
+      comp_add0.d0_done = d.done;
+      comp_add0.go = 1'd1;
+      run_add0[done] = comp_add0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_multiply;
-      run_add;
+      run_multiply0;
+      run_add0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/let3.expect b/frontends/relay-futil/tests/let3.expect
index 11b79b4180..222268b304 100644
--- a/frontends/relay-futil/tests/let3.expect
+++ b/frontends/relay-futil/tests/let3.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component multiply(go: 1, clk: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32, d0_done: 1, e0_read_data: 32, e0_done: 1) -> (done: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1, d0_addr0: 1, d0_write_data: 32, d0_write_en: 1, d0_clk: 1, e0_addr0: 1, e0_write_data: 32, e0_write_en: 1, e0_clk: 1) {
+component multiply0(go: 1, clk: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32, d0_done: 1, e0_read_data: 32, e0_done: 1) -> (done: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1, d0_addr0: 1, d0_write_data: 32, d0_write_en: 1, d0_clk: 1, e0_addr0: 1, e0_write_data: 32, e0_write_en: 1, e0_clk: 1) {
   cells {
     add0 = prim std_add(1);
     bin_read0_0 = prim std_reg(32);
@@ -76,18 +76,18 @@ component multiply(go: 1, clk: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32
     }
   }
 }
-component divide(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32, d0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1, d0_addr0: 1, d0_write_data: 32, d0_write_en: 1, d0_clk: 1) {
+
+component subtract1(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32, c0_done: 1, d0_read_data: 32, d0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1, d0_addr0: 1, d0_write_data: 32, d0_write_en: 1, d0_clk: 1) {
   cells {
     a_read0_0 = prim std_reg(32);
     add0 = prim std_add(1);
-    bin_read0_0 = prim std_reg(32);
     c_read0_0 = prim std_reg(32);
     const0 = prim std_const(1, 0);
     const1 = prim std_const(1, 0);
     const2 = prim std_const(1, 1);
-    div_pipe0 = prim std_div_pipe(32);
     i0 = prim std_reg(1);
     le0 = prim std_le(1);
+    sub0 = prim std_sub(32);
   }
   wires {
     group cond0<"static"=0> {
@@ -100,14 +100,6 @@ component divide(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32,
       i0.write_en = 1'd1;
       let0[done] = i0.done;
     }
-    group let1<> {
-      bin_read0_0.in = div_pipe0.out;
-      bin_read0_0.write_en = div_pipe0.done;
-      let1[done] = bin_read0_0.done;
-      div_pipe0.left = c_read0_0.out;
-      div_pipe0.right = a_read0_0.out;
-      div_pipe0.go = !div_pipe0.done ? 1'd1;
-    }
     group upd0<"static"=1> {
       c_read0_0.write_en = 1'd1;
       c0_addr0 = i0.out;
@@ -123,7 +115,9 @@ component divide(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32,
     group upd2<"static"=1> {
       d0_addr0 = i0.out;
       d0_write_en = 1'd1;
-      d0_write_data = 1'd1 ? bin_read0_0.out;
+      sub0.left = c_read0_0.out;
+      sub0.right = a_read0_0.out;
+      d0_write_data = 1'd1 ? sub0.out;
       upd2[done] = d0_done ? 1'd1;
     }
     group upd3<"static"=1> {
@@ -144,7 +138,6 @@ component divide(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32,
             upd0;
             upd1;
           }
-          let1;
           upd2;
           upd3;
         }
@@ -152,7 +145,8 @@ component divide(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, c0_read_data: 32,
     }
   }
 }
-component subtract(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, b0_read_data: 32, b0_done: 1, c0_read_data: 32, c0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, b0_addr0: 1, b0_write_data: 32, b0_write_en: 1, b0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1) {
+
+component subtract0(go: 1, clk: 1, a0_read_data: 32, a0_done: 1, b0_read_data: 32, b0_done: 1, c0_read_data: 32, c0_done: 1) -> (done: 1, a0_addr0: 1, a0_write_data: 32, a0_write_en: 1, a0_clk: 1, b0_addr0: 1, b0_write_data: 32, b0_write_en: 1, b0_clk: 1, c0_addr0: 1, c0_write_data: 32, c0_write_en: 1, c0_clk: 1) {
   cells {
     a_read0_0 = prim std_reg(32);
     add0 = prim std_add(1);
@@ -226,55 +220,55 @@ component main () -> () {
     e = prim std_mem_d1(32, 1, 1);
     c = prim std_mem_d1(32, 1, 1);
     d = prim std_mem_d1(32, 1, 1);
-    multiply0 = multiply;
+    comp_multiply0 = multiply0;
     a = prim std_mem_d1(32, 1, 1);
-    divide0 = divide;
+    comp_subtract1 = subtract1;
     b = prim std_mem_d1(32, 1, 1);
-    subtract0 = subtract;
+    comp_subtract0 = subtract0;
   }
   wires {
-    group run_subtract {
-      a.addr0 = subtract0.a0_addr0;
-      subtract0.a0_read_data = a.read_data;
-      b.addr0 = subtract0.b0_addr0;
-      subtract0.b0_read_data = b.read_data;
-      c.addr0 = subtract0.c0_addr0;
-      c.write_data = subtract0.c0_write_data;
-      c.write_en = subtract0.c0_write_en;
-      subtract0.c0_done = c.done;
-      subtract0.go = 1'd1;
-      run_subtract[done] = subtract0.done ? 1'd1;
+    group run_subtract0 {
+      a.addr0 = comp_subtract0.a0_addr0;
+      comp_subtract0.a0_read_data = a.read_data;
+      b.addr0 = comp_subtract0.b0_addr0;
+      comp_subtract0.b0_read_data = b.read_data;
+      c.addr0 = comp_subtract0.c0_addr0;
+      c.write_data = comp_subtract0.c0_write_data;
+      c.write_en = comp_subtract0.c0_write_en;
+      comp_subtract0.c0_done = c.done;
+      comp_subtract0.go = 1'd1;
+      run_subtract0[done] = comp_subtract0.done ? 1'd1;
     }
-    group run_divide {
-      c.addr0 = divide0.c0_addr0;
-      divide0.c0_read_data = c.read_data;
-      a.addr0 = divide0.a0_addr0;
-      divide0.a0_read_data = a.read_data;
-      d.addr0 = divide0.d0_addr0;
-      d.write_data = divide0.d0_write_data;
-      d.write_en = divide0.d0_write_en;
-      divide0.d0_done = d.done;
-      divide0.go = 1'd1;
-      run_divide[done] = divide0.done ? 1'd1;
+    group run_subtract1 {
+      c.addr0 = comp_subtract1.c0_addr0;
+      comp_subtract1.c0_read_data = c.read_data;
+      a.addr0 = comp_subtract1.a0_addr0;
+      comp_subtract1.a0_read_data = a.read_data;
+      d.addr0 = comp_subtract1.d0_addr0;
+      d.write_data = comp_subtract1.d0_write_data;
+      d.write_en = comp_subtract1.d0_write_en;
+      comp_subtract1.d0_done = d.done;
+      comp_subtract1.go = 1'd1;
+      run_subtract1[done] = comp_subtract1.done ? 1'd1;
     }
-    group run_multiply {
-      c.addr0 = multiply0.c0_addr0;
-      multiply0.c0_read_data = c.read_data;
-      d.addr0 = multiply0.d0_addr0;
-      multiply0.d0_read_data = d.read_data;
-      e.addr0 = multiply0.e0_addr0;
-      e.write_data = multiply0.e0_write_data;
-      e.write_en = multiply0.e0_write_en;
-      multiply0.e0_done = e.done;
-      multiply0.go = 1'd1;
-      run_multiply[done] = multiply0.done ? 1'd1;
+    group run_multiply0 {
+      c.addr0 = comp_multiply0.c0_addr0;
+      comp_multiply0.c0_read_data = c.read_data;
+      d.addr0 = comp_multiply0.d0_addr0;
+      comp_multiply0.d0_read_data = d.read_data;
+      e.addr0 = comp_multiply0.e0_addr0;
+      e.write_data = comp_multiply0.e0_write_data;
+      e.write_en = comp_multiply0.e0_write_en;
+      comp_multiply0.e0_done = e.done;
+      comp_multiply0.go = 1'd1;
+      run_multiply0[done] = comp_multiply0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_subtract;
-      run_divide;
-      run_multiply;
+      run_subtract0;
+      run_subtract1;
+      run_multiply0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/let3.relay b/frontends/relay-futil/tests/let3.relay
index 50aa9a8064..725e75ab94 100644
--- a/frontends/relay-futil/tests/let3.relay
+++ b/frontends/relay-futil/tests/let3.relay
@@ -1,7 +1,7 @@
 v0.0.4
 fn (%a: int32, %b: int32) {
   let %c = subtract(%a, %b);
-  let %d = divide(%c, %a);
+  let %d = subtract(%c, %a);
   let %e = multiply(%c, %d);
   %e
 }
diff --git a/frontends/relay-futil/tests/max_pool2d.expect b/frontends/relay-futil/tests/max_pool2d.expect
new file mode 100644
index 0000000000..ee3eb04a8b
--- /dev/null
+++ b/frontends/relay-futil/tests/max_pool2d.expect
@@ -0,0 +1,354 @@
+import "primitives/std.lib";
+
+component max_pool2d0(go: 1, clk: 1, data0_0_0_0_read_data: 32, data0_0_0_0_done: 1, result0_0_0_0_read_data: 32, result0_0_0_0_done: 1) -> (done: 1, data0_0_0_0_addr0: 2, data0_0_0_0_addr1: 2, data0_0_0_0_addr2: 3, data0_0_0_0_addr3: 3, data0_0_0_0_write_data: 32, data0_0_0_0_write_en: 1, data0_0_0_0_clk: 1, result0_0_0_0_addr0: 2, result0_0_0_0_addr1: 2, result0_0_0_0_addr2: 2, result0_0_0_0_addr3: 2, result0_0_0_0_write_data: 32, result0_0_0_0_write_en: 1, result0_0_0_0_clk: 1) {
+  cells {
+    add0 = prim std_add(32);
+    add1 = prim std_add(32);
+    add2 = prim std_add(32);
+    add3 = prim std_add(32);
+    add4 = prim std_add(32);
+    add5 = prim std_add(32);
+    add6 = prim std_add(32);
+    add7 = prim std_add(32);
+    b0 = prim std_reg(32);
+    bin_read0_0 = prim std_reg(32);
+    bin_read1_0 = prim std_reg(32);
+    c0 = prim std_reg(32);
+    const0 = prim std_const(32, 0);
+    const1 = prim std_const(32, 1);
+    const10 = prim std_const(32, 0);
+    const11 = prim std_const(32, 1);
+    const12 = prim std_const(32, 0);
+    const13 = prim std_const(32, 1);
+    const14 = prim std_const(32, 1);
+    const15 = prim std_const(32, 1);
+    const16 = prim std_const(32, 1);
+    const17 = prim std_const(32, 1);
+    const18 = prim std_const(32, 1);
+    const19 = prim std_const(32, 1);
+    const2 = prim std_const(32, 0);
+    const3 = prim std_const(32, 1);
+    const4 = prim std_const(32, 0);
+    const5 = prim std_const(32, 1);
+    const6 = prim std_const(32, 0);
+    const7 = prim std_const(32, 1);
+    const8 = prim std_const(32, 2);
+    const9 = prim std_const(32, 2);
+    current_0 = prim std_reg(32);
+    gt0 = prim std_gt(32);
+    le0 = prim std_le(32);
+    le1 = prim std_le(32);
+    le2 = prim std_le(32);
+    le3 = prim std_le(32);
+    le4 = prim std_le(32);
+    le5 = prim std_le(32);
+    m0 = prim std_reg(32);
+    max_0 = prim std_reg(32);
+    mult_pipe0 = prim std_mult_pipe(32);
+    mult_pipe1 = prim std_mult_pipe(32);
+    n0 = prim std_reg(32);
+    pool_x_0 = prim std_reg(32);
+    pool_y_0 = prim std_reg(32);
+    slice0 = prim std_slice(32, 2);
+    slice1 = prim std_slice(32, 2);
+    slice10 = prim std_slice(32, 2);
+    slice11 = prim std_slice(32, 2);
+    slice2 = prim std_slice(32, 3);
+    slice3 = prim std_slice(32, 3);
+    slice4 = prim std_slice(32, 2);
+    slice5 = prim std_slice(32, 2);
+    slice6 = prim std_slice(32, 3);
+    slice7 = prim std_slice(32, 3);
+    slice8 = prim std_slice(32, 2);
+    slice9 = prim std_slice(32, 2);
+    stride_x_0 = prim std_reg(32);
+    stride_y_0 = prim std_reg(32);
+    x0 = prim std_reg(32);
+    y0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = b0.out;
+      le0.right = const1.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = c0.out;
+      le1.right = const3.out;
+    }
+    group cond2<"static"=0> {
+      cond2[done] = 1'd1;
+      le2.left = y0.out;
+      le2.right = const5.out;
+    }
+    group cond3<"static"=0> {
+      cond3[done] = 1'd1;
+      le3.left = x0.out;
+      le3.right = const7.out;
+    }
+    group cond4<"static"=0> {
+      cond4[done] = 1'd1;
+      le4.left = m0.out;
+      le4.right = const11.out;
+    }
+    group cond5<"static"=0> {
+      cond5[done] = 1'd1;
+      le5.left = n0.out;
+      le5.right = const13.out;
+    }
+    group cond6<"static"=0> {
+      cond6[done] = 1'd1;
+      gt0.left = current_0.out;
+      gt0.right = max_0.out;
+    }
+    group let0<"static"=1> {
+      b0.in = const0.out;
+      b0.write_en = 1'd1;
+      let0[done] = b0.done;
+    }
+    group let1<"static"=1> {
+      c0.in = const2.out;
+      c0.write_en = 1'd1;
+      let1[done] = c0.done;
+    }
+    group let10<"static"=1> {
+      pool_y_0.in = add0.out;
+      pool_y_0.write_en = 1'd1;
+      let10[done] = pool_y_0.done;
+      add0.left = stride_y_0.out;
+      add0.right = m0.out;
+    }
+    group let11<"static"=1> {
+      pool_x_0.in = add1.out;
+      pool_x_0.write_en = 1'd1;
+      let11[done] = pool_x_0.done;
+      add1.left = stride_x_0.out;
+      add1.right = n0.out;
+    }
+    group let2<"static"=1> {
+      y0.in = const4.out;
+      y0.write_en = 1'd1;
+      let2[done] = y0.done;
+    }
+    group let3<"static"=1> {
+      x0.in = const6.out;
+      x0.write_en = 1'd1;
+      let3[done] = x0.done;
+    }
+    group let4<"static"=4> {
+      bin_read0_0.in = mult_pipe0.out;
+      bin_read0_0.write_en = mult_pipe0.done;
+      let4[done] = bin_read0_0.done;
+      mult_pipe0.left = y0.out;
+      mult_pipe0.right = const8.out;
+      mult_pipe0.go = !mult_pipe0.done ? 1'd1;
+    }
+    group let5<"static"=1> {
+      stride_y_0.in = bin_read0_0.out;
+      stride_y_0.write_en = 1'd1;
+      let5[done] = stride_y_0.done;
+    }
+    group let6<"static"=4> {
+      bin_read1_0.in = mult_pipe1.out;
+      bin_read1_0.write_en = mult_pipe1.done;
+      let6[done] = bin_read1_0.done;
+      mult_pipe1.left = x0.out;
+      mult_pipe1.right = const9.out;
+      mult_pipe1.go = !mult_pipe1.done ? 1'd1;
+    }
+    group let7<"static"=1> {
+      stride_x_0.in = bin_read1_0.out;
+      stride_x_0.write_en = 1'd1;
+      let7[done] = stride_x_0.done;
+    }
+    group let8<"static"=1> {
+      m0.in = const10.out;
+      m0.write_en = 1'd1;
+      let8[done] = m0.done;
+    }
+    group let9<"static"=1> {
+      n0.in = const12.out;
+      n0.write_en = 1'd1;
+      let9[done] = n0.done;
+    }
+    group upd0<"static"=1> {
+      max_0.write_en = 1'd1;
+      data0_0_0_0_addr3 = slice3.out;
+      slice3.in = stride_x_0.out;
+      data0_0_0_0_addr2 = slice2.out;
+      slice2.in = stride_y_0.out;
+      data0_0_0_0_addr1 = slice1.out;
+      slice1.in = c0.out;
+      data0_0_0_0_addr0 = slice0.out;
+      slice0.in = b0.out;
+      max_0.in = 1'd1 ? data0_0_0_0_read_data;
+      upd0[done] = max_0.done ? 1'd1;
+    }
+    group upd1<"static"=1> {
+      current_0.write_en = 1'd1;
+      data0_0_0_0_addr3 = slice7.out;
+      slice7.in = pool_x_0.out;
+      data0_0_0_0_addr2 = slice6.out;
+      slice6.in = pool_y_0.out;
+      data0_0_0_0_addr1 = slice5.out;
+      slice5.in = c0.out;
+      data0_0_0_0_addr0 = slice4.out;
+      slice4.in = b0.out;
+      current_0.in = 1'd1 ? data0_0_0_0_read_data;
+      upd1[done] = current_0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      max_0.write_en = 1'd1;
+      max_0.in = 1'd1 ? current_0.out;
+      upd2[done] = max_0.done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      n0.write_en = 1'd1;
+      add2.left = n0.out;
+      add2.right = const14.out;
+      n0.in = 1'd1 ? add2.out;
+      upd3[done] = n0.done ? 1'd1;
+    }
+    group upd4<"static"=1> {
+      m0.write_en = 1'd1;
+      add3.left = m0.out;
+      add3.right = const15.out;
+      m0.in = 1'd1 ? add3.out;
+      upd4[done] = m0.done ? 1'd1;
+    }
+    group upd5<"static"=1> {
+      result0_0_0_0_addr3 = slice11.out;
+      slice11.in = x0.out;
+      result0_0_0_0_addr2 = slice10.out;
+      slice10.in = y0.out;
+      result0_0_0_0_addr1 = slice9.out;
+      slice9.in = c0.out;
+      result0_0_0_0_addr0 = slice8.out;
+      slice8.in = b0.out;
+      result0_0_0_0_write_en = 1'd1;
+      result0_0_0_0_write_data = 1'd1 ? max_0.out;
+      upd5[done] = result0_0_0_0_done ? 1'd1;
+    }
+    group upd6<"static"=1> {
+      x0.write_en = 1'd1;
+      add4.left = x0.out;
+      add4.right = const16.out;
+      x0.in = 1'd1 ? add4.out;
+      upd6[done] = x0.done ? 1'd1;
+    }
+    group upd7<"static"=1> {
+      y0.write_en = 1'd1;
+      add5.left = y0.out;
+      add5.right = const17.out;
+      y0.in = 1'd1 ? add5.out;
+      upd7[done] = y0.done ? 1'd1;
+    }
+    group upd8<"static"=1> {
+      c0.write_en = 1'd1;
+      add6.left = c0.out;
+      add6.right = const18.out;
+      c0.in = 1'd1 ? add6.out;
+      upd8[done] = c0.done ? 1'd1;
+    }
+    group upd9<"static"=1> {
+      b0.write_en = 1'd1;
+      add7.left = b0.out;
+      add7.right = const19.out;
+      b0.in = 1'd1 ? add7.out;
+      upd9[done] = b0.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      while le0.out with cond0 {
+        seq {
+          let1;
+          while le1.out with cond1 {
+            seq {
+              let2;
+              while le2.out with cond2 {
+                seq {
+                  let3;
+                  while le3.out with cond3 {
+                    seq {
+                      par {
+                        seq {
+                          let4;
+                          let5;
+                        }
+                        seq {
+                          let6;
+                          let7;
+                        }
+                      }
+                      upd0;
+                      let8;
+                      while le4.out with cond4 {
+                        seq {
+                          let9;
+                          while le5.out with cond5 {
+                            seq {
+                              par {
+                                let10;
+                                let11;
+                              }
+                              upd1;
+                              if gt0.out with cond6 {
+                                upd2;
+                              }
+                              upd3;
+                            }
+                          }
+                          upd4;
+                        }
+                      }
+                      upd5;
+                      upd6;
+                    }
+                  }
+                  upd7;
+                }
+              }
+              upd8;
+            }
+          }
+          upd9;
+        }
+      }
+    }
+  }
+}
+
+component main () -> () {
+  cells {
+    result = prim std_mem_d4(32, 2, 2, 2, 2, 2, 2, 2, 2);
+    data = prim std_mem_d4(32, 2, 2, 4, 4, 2, 2, 3, 3);
+    comp_max_pool2d0 = max_pool2d0;
+  }
+  wires {
+    group run_max_pool2d0 {
+      data.addr0 = comp_max_pool2d0.data0_0_0_0_addr0;
+      comp_max_pool2d0.data0_0_0_0_read_data = data.read_data;
+      data.addr1 = comp_max_pool2d0.data0_0_0_0_addr1;
+      data.addr2 = comp_max_pool2d0.data0_0_0_0_addr2;
+      data.addr3 = comp_max_pool2d0.data0_0_0_0_addr3;
+      result.addr0 = comp_max_pool2d0.result0_0_0_0_addr0;
+      result.addr1 = comp_max_pool2d0.result0_0_0_0_addr1;
+      result.addr2 = comp_max_pool2d0.result0_0_0_0_addr2;
+      result.addr3 = comp_max_pool2d0.result0_0_0_0_addr3;
+      result.write_data = comp_max_pool2d0.result0_0_0_0_write_data;
+      result.write_en = comp_max_pool2d0.result0_0_0_0_write_en;
+      comp_max_pool2d0.result0_0_0_0_done = result.done;
+      comp_max_pool2d0.go = 1'd1;
+      run_max_pool2d0[done] = comp_max_pool2d0.done ? 1'd1;
+    }
+  }
+  control {
+    seq {
+      run_max_pool2d0;
+    }
+  }
+}
diff --git a/frontends/relay-futil/tests/max_pool2d.relay b/frontends/relay-futil/tests/max_pool2d.relay
new file mode 100644
index 0000000000..e1ba79d351
--- /dev/null
+++ b/frontends/relay-futil/tests/max_pool2d.relay
@@ -0,0 +1,6 @@
+v0.0.4
+fn (%data: Tensor[(2, 2, 4, 4), int32]) {
+  let %result: Tensor[(2, 2, 2, 2), int32] = nn.max_pool2d(%data, pool_size=[2, 2], strides=[2, 2], padding=[0,0,0,0]);
+  %result
+}
+
diff --git a/frontends/relay-futil/tests/mlp_net.expect b/frontends/relay-futil/tests/mlp_net.expect
deleted file mode 100644
index 812a0381fd..0000000000
--- a/frontends/relay-futil/tests/mlp_net.expect
+++ /dev/null
@@ -1,1791 +0,0 @@
-import "primitives/std.lib";
-
-component bias_add2(go: 1, clk: 1, fc3_bias0_read_data: 32, fc3_bias0_done: 1, x70_0_read_data: 32, x70_0_done: 1, x80_0_read_data: 32, x80_0_done: 1) -> (done: 1, fc3_bias0_addr0: 4, fc3_bias0_write_data: 32, fc3_bias0_write_en: 1, fc3_bias0_clk: 1, x70_0_addr0: 1, x70_0_addr1: 4, x70_0_write_data: 32, x70_0_write_en: 1, x70_0_clk: 1, x80_0_addr0: 1, x80_0_addr1: 4, x80_0_write_data: 32, x80_0_write_en: 1, x80_0_clk: 1) {
-  cells {
-    add0 = prim fixed_p_std_add(32, 16, 16);
-    add1 = prim std_add(4);
-    add2 = prim std_add(1);
-    const0 = prim std_const(1, 0);
-    const1 = prim std_const(1, 0);
-    const2 = prim std_const(4, 0);
-    const3 = prim std_const(4, 9);
-    const4 = prim std_const(4, 1);
-    const5 = prim std_const(1, 1);
-    fc3_bias_read0_0 = prim std_reg(32);
-    i0 = prim std_reg(1);
-    j0 = prim std_reg(4);
-    le0 = prim std_le(1);
-    le1 = prim std_le(4);
-    x7_read0_0 = prim std_reg(32);
-  }
-  wires {
-    group cond0<"static"=0> {
-      cond0[done] = 1'd1;
-      le0.left = i0.out;
-      le0.right = const1.out;
-    }
-    group cond1<"static"=0> {
-      cond1[done] = 1'd1;
-      le1.left = j0.out;
-      le1.right = const3.out;
-    }
-    group let0<"static"=1> {
-      i0.in = const0.out;
-      i0.write_en = 1'd1;
-      let0[done] = i0.done;
-    }
-    group let1<"static"=1> {
-      j0.in = const2.out;
-      j0.write_en = 1'd1;
-      let1[done] = j0.done;
-    }
-    group upd0<"static"=1> {
-      x7_read0_0.write_en = 1'd1;
-      x70_0_addr1 = j0.out;
-      x70_0_addr0 = i0.out;
-      x7_read0_0.in = 1'd1 ? x70_0_read_data;
-      upd0[done] = x7_read0_0.done ? 1'd1;
-    }
-    group upd1<"static"=1> {
-      fc3_bias_read0_0.write_en = 1'd1;
-      fc3_bias0_addr0 = j0.out;
-      fc3_bias_read0_0.in = 1'd1 ? fc3_bias0_read_data;
-      upd1[done] = fc3_bias_read0_0.done ? 1'd1;
-    }
-    group upd2<"static"=1> {
-      x80_0_addr1 = j0.out;
-      x80_0_addr0 = i0.out;
-      x80_0_write_en = 1'd1;
-      add0.left = x7_read0_0.out;
-      add0.right = fc3_bias_read0_0.out;
-      x80_0_write_data = 1'd1 ? add0.out;
-      upd2[done] = x80_0_done ? 1'd1;
-    }
-    group upd3<"static"=1> {
-      j0.write_en = 1'd1;
-      add1.left = j0.out;
-      add1.right = const4.out;
-      j0.in = 1'd1 ? add1.out;
-      upd3[done] = j0.done ? 1'd1;
-    }
-    group upd4<"static"=1> {
-      i0.write_en = 1'd1;
-      add2.left = i0.out;
-      add2.right = const5.out;
-      i0.in = 1'd1 ? add2.out;
-      upd4[done] = i0.done ? 1'd1;
-    }
-  }
-
-  control {
-    seq {
-      let0;
-      while le0.out with cond0 {
-        seq {
-          let1;
-          while le1.out with cond1 {
-            seq {
-              par {
-                upd0;
-                upd1;
-              }
-              upd2;
-              upd3;
-            }
-          }
-          upd4;
-        }
-      }
-    }
-  }
-}
-component dense2(go: 1, clk: 1, fc3_weight0_0_read_data: 32, fc3_weight0_0_done: 1, x60_0_read_data: 32, x60_0_done: 1, x70_0_read_data: 32, x70_0_done: 1) -> (done: 1, fc3_weight0_0_addr0: 4, fc3_weight0_0_addr1: 7, fc3_weight0_0_write_data: 32, fc3_weight0_0_write_en: 1, fc3_weight0_0_clk: 1, x60_0_addr0: 1, x60_0_addr1: 7, x60_0_write_data: 32, x60_0_write_en: 1, x60_0_clk: 1, x70_0_addr0: 1, x70_0_addr1: 4, x70_0_write_data: 32, x70_0_write_en: 1, x70_0_clk: 1) {
-  cells {
-    add0 = prim std_add(7);
-    add1 = prim std_add(4);
-    add2 = prim fixed_p_std_add(32, 16, 16);
-    add3 = prim std_add(7);
-    add4 = prim std_add(4);
-    add5 = prim std_add(1);
-    add6 = prim std_add(4);
-    add7 = prim std_add(1);
-    bin_read0_0 = prim std_reg(32);
-    const0 = prim std_const(4, 0);
-    const1 = prim std_const(4, 9);
-    const10 = prim std_const(7, 0);
-    const11 = prim std_const(7, 63);
-    const12 = prim std_const(7, 1);
-    const13 = prim std_const(4, 1);
-    const14 = prim std_const(1, 1);
-    const15 = prim std_const(1, 0);
-    const16 = prim std_const(1, 0);
-    const17 = prim std_const(4, 0);
-    const18 = prim std_const(4, 9);
-    const19 = prim std_const(4, 1);
-    const2 = prim std_const(7, 0);
-    const20 = prim std_const(1, 1);
-    const3 = prim std_const(7, 63);
-    const4 = prim std_const(7, 1);
-    const5 = prim std_const(4, 1);
-    const6 = prim std_const(1, 0);
-    const7 = prim std_const(1, 0);
-    const8 = prim std_const(4, 0);
-    const9 = prim std_const(4, 9);
-    fc3_weight_read0_0 = prim std_reg(32);
-    i0 = prim std_reg(4);
-    i1 = prim std_reg(1);
-    i2 = prim std_reg(1);
-    j0 = prim std_reg(7);
-    j1 = prim std_reg(4);
-    j2 = prim std_reg(4);
-    k0 = prim std_reg(7);
-    le0 = prim std_le(4);
-    le1 = prim std_le(7);
-    le2 = prim std_le(1);
-    le3 = prim std_le(4);
-    le4 = prim std_le(7);
-    le5 = prim std_le(1);
-    le6 = prim std_le(4);
-    mult_pipe0 = prim std_mult_pipe(32);
-    product_0 = prim std_reg(32);
-    slice0 = prim std_slice(32, 32);
-    slice1 = prim std_slice(32, 32);
-    temporary_x70_0 = prim std_mem_d2(32, 1, 10, 1, 4);
-    temporary_x7_read0_0 = prim std_reg(32);
-    transpose_fc3_weight0_0 = prim std_mem_d2(32, 64, 10, 7, 4);
-    transpose_fc3_weight_read0_0 = prim std_reg(32);
-    x6_read0_0 = prim std_reg(32);
-  }
-  wires {
-    group cond0<"static"=0> {
-      cond0[done] = 1'd1;
-      le0.left = i0.out;
-      le0.right = const1.out;
-    }
-    group cond1<"static"=0> {
-      cond1[done] = 1'd1;
-      le1.left = j0.out;
-      le1.right = const3.out;
-    }
-    group cond2<"static"=0> {
-      cond2[done] = 1'd1;
-      le2.left = i1.out;
-      le2.right = const7.out;
-    }
-    group cond3<"static"=0> {
-      cond3[done] = 1'd1;
-      le3.left = j1.out;
-      le3.right = const9.out;
-    }
-    group cond4<"static"=0> {
-      cond4[done] = 1'd1;
-      le4.left = k0.out;
-      le4.right = const11.out;
-    }
-    group cond5<"static"=0> {
-      cond5[done] = 1'd1;
-      le5.left = i2.out;
-      le5.right = const16.out;
-    }
-    group cond6<"static"=0> {
-      cond6[done] = 1'd1;
-      le6.left = j2.out;
-      le6.right = const18.out;
-    }
-    group let0<"static"=1> {
-      i0.in = const0.out;
-      i0.write_en = 1'd1;
-      let0[done] = i0.done;
-    }
-    group let1<"static"=1> {
-      j0.in = const2.out;
-      j0.write_en = 1'd1;
-      let1[done] = j0.done;
-    }
-    group let2<"static"=1> {
-      i1.in = const6.out;
-      i1.write_en = 1'd1;
-      let2[done] = i1.done;
-    }
-    group let3<"static"=1> {
-      j1.in = const8.out;
-      j1.write_en = 1'd1;
-      let3[done] = j1.done;
-    }
-    group let4<"static"=1> {
-      k0.in = const10.out;
-      k0.write_en = 1'd1;
-      let4[done] = k0.done;
-    }
-    group let5<"static"=1> {
-      bin_read0_0.in = slice0.out;
-      bin_read0_0.write_en = 1'd1;
-      let5[done] = bin_read0_0.done;
-      slice0.in = mult_pipe0.out;
-      mult_pipe0.left = x6_read0_0.out;
-      mult_pipe0.right = transpose_fc3_weight_read0_0.out;
-      mult_pipe0.go = !mult_pipe0.done ? 1'd1;
-    }
-    group let6<"static"=1> {
-      product_0.in = slice1.out;
-      product_0.write_en = 1'd1;
-      let6[done] = product_0.done;
-      slice1.in = bin_read0_0.out;
-    }
-    group let7<"static"=1> {
-      i2.in = const15.out;
-      i2.write_en = 1'd1;
-      let7[done] = i2.done;
-    }
-    group let8<"static"=1> {
-      j2.in = const17.out;
-      j2.write_en = 1'd1;
-      let8[done] = j2.done;
-    }
-    group upd0<"static"=1> {
-      fc3_weight_read0_0.write_en = 1'd1;
-      fc3_weight0_0_addr1 = j0.out;
-      fc3_weight0_0_addr0 = i0.out;
-      fc3_weight_read0_0.in = 1'd1 ? fc3_weight0_0_read_data;
-      upd0[done] = fc3_weight_read0_0.done ? 1'd1;
-    }
-    group upd1<"static"=1> {
-      transpose_fc3_weight0_0.addr1 = i0.out;
-      transpose_fc3_weight0_0.addr0 = j0.out;
-      transpose_fc3_weight0_0.write_en = 1'd1;
-      transpose_fc3_weight0_0.write_data = 1'd1 ? fc3_weight_read0_0.out;
-      upd1[done] = transpose_fc3_weight0_0.done ? 1'd1;
-    }
-    group upd10<"static"=1> {
-      temporary_x7_read0_0.write_en = 1'd1;
-      temporary_x70_0.addr1 = j2.out;
-      temporary_x70_0.addr0 = i2.out;
-      temporary_x7_read0_0.in = 1'd1 ? temporary_x70_0.read_data;
-      upd10[done] = temporary_x7_read0_0.done ? 1'd1;
-    }
-    group upd11<"static"=1> {
-      x70_0_addr1 = j2.out;
-      x70_0_addr0 = i2.out;
-      x70_0_write_en = 1'd1;
-      x70_0_write_data = 1'd1 ? temporary_x7_read0_0.out;
-      upd11[done] = x70_0_done ? 1'd1;
-    }
-    group upd12<"static"=1> {
-      j2.write_en = 1'd1;
-      add6.left = j2.out;
-      add6.right = const19.out;
-      j2.in = 1'd1 ? add6.out;
-      upd12[done] = j2.done ? 1'd1;
-    }
-    group upd13<"static"=1> {
-      i2.write_en = 1'd1;
-      add7.left = i2.out;
-      add7.right = const20.out;
-      i2.in = 1'd1 ? add7.out;
-      upd13[done] = i2.done ? 1'd1;
-    }
-    group upd2<"static"=1> {
-      j0.write_en = 1'd1;
-      add0.left = j0.out;
-      add0.right = const4.out;
-      j0.in = 1'd1 ? add0.out;
-      upd2[done] = j0.done ? 1'd1;
-    }
-    group upd3<"static"=1> {
-      i0.write_en = 1'd1;
-      add1.left = i0.out;
-      add1.right = const5.out;
-      i0.in = 1'd1 ? add1.out;
-      upd3[done] = i0.done ? 1'd1;
-    }
-    group upd4<"static"=1> {
-      x6_read0_0.write_en = 1'd1;
-      x60_0_addr1 = k0.out;
-      x60_0_addr0 = i1.out;
-      x6_read0_0.in = 1'd1 ? x60_0_read_data;
-      upd4[done] = x6_read0_0.done ? 1'd1;
-    }
-    group upd5<"static"=1> {
-      transpose_fc3_weight_read0_0.write_en = 1'd1;
-      transpose_fc3_weight0_0.addr1 = j1.out;
-      transpose_fc3_weight0_0.addr0 = k0.out;
-      transpose_fc3_weight_read0_0.in = 1'd1 ? transpose_fc3_weight0_0.read_data;
-      upd5[done] = transpose_fc3_weight_read0_0.done ? 1'd1;
-    }
-    group upd6<"static"=1> {
-      temporary_x70_0.addr1 = j1.out;
-      temporary_x70_0.addr0 = i1.out;
-      temporary_x70_0.write_en = 1'd1;
-      add2.left = temporary_x70_0.read_data;
-      add2.right = product_0.out;
-      temporary_x70_0.addr1 = j1.out;
-      temporary_x70_0.addr0 = i1.out;
-      temporary_x70_0.write_data = 1'd1 ? add2.out;
-      upd6[done] = temporary_x70_0.done ? 1'd1;
-    }
-    group upd7<"static"=1> {
-      k0.write_en = 1'd1;
-      add3.left = k0.out;
-      add3.right = const12.out;
-      k0.in = 1'd1 ? add3.out;
-      upd7[done] = k0.done ? 1'd1;
-    }
-    group upd8<"static"=1> {
-      j1.write_en = 1'd1;
-      add4.left = j1.out;
-      add4.right = const13.out;
-      j1.in = 1'd1 ? add4.out;
-      upd8[done] = j1.done ? 1'd1;
-    }
-    group upd9<"static"=1> {
-      i1.write_en = 1'd1;
-      add5.left = i1.out;
-      add5.right = const14.out;
-      i1.in = 1'd1 ? add5.out;
-      upd9[done] = i1.done ? 1'd1;
-    }
-  }
-
-  control {
-    seq {
-      let0;
-      while le0.out with cond0 {
-        seq {
-          let1;
-          while le1.out with cond1 {
-            seq {
-              upd0;
-              upd1;
-              upd2;
-            }
-          }
-          upd3;
-        }
-      }
-      let2;
-      while le2.out with cond2 {
-        seq {
-          let3;
-          while le3.out with cond3 {
-            seq {
-              let4;
-              while le4.out with cond4 {
-                seq {
-                  par {
-                    upd4;
-                    upd5;
-                  }
-                  let5;
-                  let6;
-                  upd6;
-                  upd7;
-                }
-              }
-              upd8;
-            }
-          }
-          upd9;
-        }
-      }
-      let7;
-      while le5.out with cond5 {
-        seq {
-          let8;
-          while le6.out with cond6 {
-            seq {
-              upd10;
-              upd11;
-              upd12;
-            }
-          }
-          upd13;
-        }
-      }
-    }
-  }
-}
-component relu1(go: 1, clk: 1, x50_0_read_data: 32, x50_0_done: 1, x60_0_read_data: 32, x60_0_done: 1) -> (done: 1, x50_0_addr0: 1, x50_0_addr1: 7, x50_0_write_data: 32, x50_0_write_en: 1, x50_0_clk: 1, x60_0_addr0: 1, x60_0_addr1: 7, x60_0_write_data: 32, x60_0_write_en: 1, x60_0_clk: 1) {
-  cells {
-    add0 = prim std_add(7);
-    add1 = prim std_add(1);
-    const0 = prim std_const(1, 0);
-    const1 = prim std_const(1, 0);
-    const2 = prim std_const(7, 0);
-    const3 = prim std_const(7, 63);
-    const4 = prim std_const(7, 1);
-    const5 = prim std_const(1, 1);
-    fpconst0 = prim fixed_p_std_const(32, 16, 16, 0, 0);
-    gt0 = prim fixed_p_std_gt(32, 16, 16);
-    i0 = prim std_reg(1);
-    j0 = prim std_reg(7);
-    le0 = prim std_le(1);
-    le1 = prim std_le(7);
-    x5_read0_0 = prim std_reg(32);
-    x5_read1_0 = prim std_reg(32);
-    zero_0 = prim std_reg(32);
-  }
-  wires {
-    group cond0<"static"=0> {
-      cond0[done] = 1'd1;
-      le0.left = i0.out;
-      le0.right = const1.out;
-    }
-    group cond1<"static"=0> {
-      cond1[done] = 1'd1;
-      le1.left = j0.out;
-      le1.right = const3.out;
-    }
-    group cond2<"static"=0> {
-      cond2[done] = 1'd1;
-      gt0.left = x5_read0_0.out;
-      gt0.right = zero_0.out;
-    }
-    group let0<"static"=1> {
-      zero_0.in = fpconst0.out;
-      zero_0.write_en = 1'd1;
-      let0[done] = zero_0.done;
-    }
-    group let1<"static"=1> {
-      i0.in = const0.out;
-      i0.write_en = 1'd1;
-      let1[done] = i0.done;
-    }
-    group let2<"static"=1> {
-      j0.in = const2.out;
-      j0.write_en = 1'd1;
-      let2[done] = j0.done;
-    }
-    group upd0<"static"=1> {
-      x5_read0_0.write_en = 1'd1;
-      x50_0_addr1 = j0.out;
-      x50_0_addr0 = i0.out;
-      x5_read0_0.in = 1'd1 ? x50_0_read_data;
-      upd0[done] = x5_read0_0.done ? 1'd1;
-    }
-    group upd1<"static"=1> {
-      x5_read1_0.write_en = 1'd1;
-      x50_0_addr1 = j0.out;
-      x50_0_addr0 = i0.out;
-      x5_read1_0.in = 1'd1 ? x50_0_read_data;
-      upd1[done] = x5_read1_0.done ? 1'd1;
-    }
-    group upd2<"static"=1> {
-      x60_0_addr1 = j0.out;
-      x60_0_addr0 = i0.out;
-      x60_0_write_en = 1'd1;
-      x60_0_write_data = 1'd1 ? x5_read1_0.out;
-      upd2[done] = x60_0_done ? 1'd1;
-    }
-    group upd3<"static"=1> {
-      x60_0_addr1 = j0.out;
-      x60_0_addr0 = i0.out;
-      x60_0_write_en = 1'd1;
-      x60_0_write_data = 1'd1 ? zero_0.out;
-      upd3[done] = x60_0_done ? 1'd1;
-    }
-    group upd4<"static"=1> {
-      j0.write_en = 1'd1;
-      add0.left = j0.out;
-      add0.right = const4.out;
-      j0.in = 1'd1 ? add0.out;
-      upd4[done] = j0.done ? 1'd1;
-    }
-    group upd5<"static"=1> {
-      i0.write_en = 1'd1;
-      add1.left = i0.out;
-      add1.right = const5.out;
-      i0.in = 1'd1 ? add1.out;
-      upd5[done] = i0.done ? 1'd1;
-    }
-  }
-
-  control {
-    seq {
-      let0;
-      let1;
-      while le0.out with cond0 {
-        seq {
-          let2;
-          while le1.out with cond1 {
-            seq {
-              upd0;
-              if gt0.out with cond2 {
-                seq {
-                  upd1;
-                  upd2;
-                }
-              } else {
-                upd3;
-              }
-              upd4;
-            }
-          }
-          upd5;
-        }
-      }
-    }
-  }
-}
-component bias_add1(go: 1, clk: 1, fc2_bias0_read_data: 32, fc2_bias0_done: 1, x40_0_read_data: 32, x40_0_done: 1, x50_0_read_data: 32, x50_0_done: 1) -> (done: 1, fc2_bias0_addr0: 7, fc2_bias0_write_data: 32, fc2_bias0_write_en: 1, fc2_bias0_clk: 1, x40_0_addr0: 1, x40_0_addr1: 7, x40_0_write_data: 32, x40_0_write_en: 1, x40_0_clk: 1, x50_0_addr0: 1, x50_0_addr1: 7, x50_0_write_data: 32, x50_0_write_en: 1, x50_0_clk: 1) {
-  cells {
-    add0 = prim fixed_p_std_add(32, 16, 16);
-    add1 = prim std_add(7);
-    add2 = prim std_add(1);
-    const0 = prim std_const(1, 0);
-    const1 = prim std_const(1, 0);
-    const2 = prim std_const(7, 0);
-    const3 = prim std_const(7, 63);
-    const4 = prim std_const(7, 1);
-    const5 = prim std_const(1, 1);
-    fc2_bias_read0_0 = prim std_reg(32);
-    i0 = prim std_reg(1);
-    j0 = prim std_reg(7);
-    le0 = prim std_le(1);
-    le1 = prim std_le(7);
-    x4_read0_0 = prim std_reg(32);
-  }
-  wires {
-    group cond0<"static"=0> {
-      cond0[done] = 1'd1;
-      le0.left = i0.out;
-      le0.right = const1.out;
-    }
-    group cond1<"static"=0> {
-      cond1[done] = 1'd1;
-      le1.left = j0.out;
-      le1.right = const3.out;
-    }
-    group let0<"static"=1> {
-      i0.in = const0.out;
-      i0.write_en = 1'd1;
-      let0[done] = i0.done;
-    }
-    group let1<"static"=1> {
-      j0.in = const2.out;
-      j0.write_en = 1'd1;
-      let1[done] = j0.done;
-    }
-    group upd0<"static"=1> {
-      x4_read0_0.write_en = 1'd1;
-      x40_0_addr1 = j0.out;
-      x40_0_addr0 = i0.out;
-      x4_read0_0.in = 1'd1 ? x40_0_read_data;
-      upd0[done] = x4_read0_0.done ? 1'd1;
-    }
-    group upd1<"static"=1> {
-      fc2_bias_read0_0.write_en = 1'd1;
-      fc2_bias0_addr0 = j0.out;
-      fc2_bias_read0_0.in = 1'd1 ? fc2_bias0_read_data;
-      upd1[done] = fc2_bias_read0_0.done ? 1'd1;
-    }
-    group upd2<"static"=1> {
-      x50_0_addr1 = j0.out;
-      x50_0_addr0 = i0.out;
-      x50_0_write_en = 1'd1;
-      add0.left = x4_read0_0.out;
-      add0.right = fc2_bias_read0_0.out;
-      x50_0_write_data = 1'd1 ? add0.out;
-      upd2[done] = x50_0_done ? 1'd1;
-    }
-    group upd3<"static"=1> {
-      j0.write_en = 1'd1;
-      add1.left = j0.out;
-      add1.right = const4.out;
-      j0.in = 1'd1 ? add1.out;
-      upd3[done] = j0.done ? 1'd1;
-    }
-    group upd4<"static"=1> {
-      i0.write_en = 1'd1;
-      add2.left = i0.out;
-      add2.right = const5.out;
-      i0.in = 1'd1 ? add2.out;
-      upd4[done] = i0.done ? 1'd1;
-    }
-  }
-
-  control {
-    seq {
-      let0;
-      while le0.out with cond0 {
-        seq {
-          let1;
-          while le1.out with cond1 {
-            seq {
-              par {
-                upd0;
-                upd1;
-              }
-              upd2;
-              upd3;
-            }
-          }
-          upd4;
-        }
-      }
-    }
-  }
-}
-component dense1(go: 1, clk: 1, fc2_weight0_0_read_data: 32, fc2_weight0_0_done: 1, x30_0_read_data: 32, x30_0_done: 1, x40_0_read_data: 32, x40_0_done: 1) -> (done: 1, fc2_weight0_0_addr0: 7, fc2_weight0_0_addr1: 8, fc2_weight0_0_write_data: 32, fc2_weight0_0_write_en: 1, fc2_weight0_0_clk: 1, x30_0_addr0: 1, x30_0_addr1: 8, x30_0_write_data: 32, x30_0_write_en: 1, x30_0_clk: 1, x40_0_addr0: 1, x40_0_addr1: 7, x40_0_write_data: 32, x40_0_write_en: 1, x40_0_clk: 1) {
-  cells {
-    add0 = prim std_add(8);
-    add1 = prim std_add(7);
-    add2 = prim fixed_p_std_add(32, 16, 16);
-    add3 = prim std_add(8);
-    add4 = prim std_add(7);
-    add5 = prim std_add(1);
-    add6 = prim std_add(7);
-    add7 = prim std_add(1);
-    bin_read0_0 = prim std_reg(32);
-    const0 = prim std_const(7, 0);
-    const1 = prim std_const(7, 63);
-    const10 = prim std_const(8, 0);
-    const11 = prim std_const(8, 127);
-    const12 = prim std_const(8, 1);
-    const13 = prim std_const(7, 1);
-    const14 = prim std_const(1, 1);
-    const15 = prim std_const(1, 0);
-    const16 = prim std_const(1, 0);
-    const17 = prim std_const(7, 0);
-    const18 = prim std_const(7, 63);
-    const19 = prim std_const(7, 1);
-    const2 = prim std_const(8, 0);
-    const20 = prim std_const(1, 1);
-    const3 = prim std_const(8, 127);
-    const4 = prim std_const(8, 1);
-    const5 = prim std_const(7, 1);
-    const6 = prim std_const(1, 0);
-    const7 = prim std_const(1, 0);
-    const8 = prim std_const(7, 0);
-    const9 = prim std_const(7, 63);
-    fc2_weight_read0_0 = prim std_reg(32);
-    i0 = prim std_reg(7);
-    i1 = prim std_reg(1);
-    i2 = prim std_reg(1);
-    j0 = prim std_reg(8);
-    j1 = prim std_reg(7);
-    j2 = prim std_reg(7);
-    k0 = prim std_reg(8);
-    le0 = prim std_le(7);
-    le1 = prim std_le(8);
-    le2 = prim std_le(1);
-    le3 = prim std_le(7);
-    le4 = prim std_le(8);
-    le5 = prim std_le(1);
-    le6 = prim std_le(7);
-    mult_pipe0 = prim std_mult_pipe(32);
-    product_0 = prim std_reg(32);
-    slice0 = prim std_slice(32, 32);
-    slice1 = prim std_slice(32, 32);
-    temporary_x40_0 = prim std_mem_d2(32, 1, 64, 1, 7);
-    temporary_x4_read0_0 = prim std_reg(32);
-    transpose_fc2_weight0_0 = prim std_mem_d2(32, 128, 64, 8, 7);
-    transpose_fc2_weight_read0_0 = prim std_reg(32);
-    x3_read0_0 = prim std_reg(32);
-  }
-  wires {
-    group cond0<"static"=0> {
-      cond0[done] = 1'd1;
-      le0.left = i0.out;
-      le0.right = const1.out;
-    }
-    group cond1<"static"=0> {
-      cond1[done] = 1'd1;
-      le1.left = j0.out;
-      le1.right = const3.out;
-    }
-    group cond2<"static"=0> {
-      cond2[done] = 1'd1;
-      le2.left = i1.out;
-      le2.right = const7.out;
-    }
-    group cond3<"static"=0> {
-      cond3[done] = 1'd1;
-      le3.left = j1.out;
-      le3.right = const9.out;
-    }
-    group cond4<"static"=0> {
-      cond4[done] = 1'd1;
-      le4.left = k0.out;
-      le4.right = const11.out;
-    }
-    group cond5<"static"=0> {
-      cond5[done] = 1'd1;
-      le5.left = i2.out;
-      le5.right = const16.out;
-    }
-    group cond6<"static"=0> {
-      cond6[done] = 1'd1;
-      le6.left = j2.out;
-      le6.right = const18.out;
-    }
-    group let0<"static"=1> {
-      i0.in = const0.out;
-      i0.write_en = 1'd1;
-      let0[done] = i0.done;
-    }
-    group let1<"static"=1> {
-      j0.in = const2.out;
-      j0.write_en = 1'd1;
-      let1[done] = j0.done;
-    }
-    group let2<"static"=1> {
-      i1.in = const6.out;
-      i1.write_en = 1'd1;
-      let2[done] = i1.done;
-    }
-    group let3<"static"=1> {
-      j1.in = const8.out;
-      j1.write_en = 1'd1;
-      let3[done] = j1.done;
-    }
-    group let4<"static"=1> {
-      k0.in = const10.out;
-      k0.write_en = 1'd1;
-      let4[done] = k0.done;
-    }
-    group let5<"static"=1> {
-      bin_read0_0.in = slice0.out;
-      bin_read0_0.write_en = 1'd1;
-      let5[done] = bin_read0_0.done;
-      slice0.in = mult_pipe0.out;
-      mult_pipe0.left = x3_read0_0.out;
-      mult_pipe0.right = transpose_fc2_weight_read0_0.out;
-      mult_pipe0.go = !mult_pipe0.done ? 1'd1;
-    }
-    group let6<"static"=1> {
-      product_0.in = slice1.out;
-      product_0.write_en = 1'd1;
-      let6[done] = product_0.done;
-      slice1.in = bin_read0_0.out;
-    }
-    group let7<"static"=1> {
-      i2.in = const15.out;
-      i2.write_en = 1'd1;
-      let7[done] = i2.done;
-    }
-    group let8<"static"=1> {
-      j2.in = const17.out;
-      j2.write_en = 1'd1;
-      let8[done] = j2.done;
-    }
-    group upd0<"static"=1> {
-      fc2_weight_read0_0.write_en = 1'd1;
-      fc2_weight0_0_addr1 = j0.out;
-      fc2_weight0_0_addr0 = i0.out;
-      fc2_weight_read0_0.in = 1'd1 ? fc2_weight0_0_read_data;
-      upd0[done] = fc2_weight_read0_0.done ? 1'd1;
-    }
-    group upd1<"static"=1> {
-      transpose_fc2_weight0_0.addr1 = i0.out;
-      transpose_fc2_weight0_0.addr0 = j0.out;
-      transpose_fc2_weight0_0.write_en = 1'd1;
-      transpose_fc2_weight0_0.write_data = 1'd1 ? fc2_weight_read0_0.out;
-      upd1[done] = transpose_fc2_weight0_0.done ? 1'd1;
-    }
-    group upd10<"static"=1> {
-      temporary_x4_read0_0.write_en = 1'd1;
-      temporary_x40_0.addr1 = j2.out;
-      temporary_x40_0.addr0 = i2.out;
-      temporary_x4_read0_0.in = 1'd1 ? temporary_x40_0.read_data;
-      upd10[done] = temporary_x4_read0_0.done ? 1'd1;
-    }
-    group upd11<"static"=1> {
-      x40_0_addr1 = j2.out;
-      x40_0_addr0 = i2.out;
-      x40_0_write_en = 1'd1;
-      x40_0_write_data = 1'd1 ? temporary_x4_read0_0.out;
-      upd11[done] = x40_0_done ? 1'd1;
-    }
-    group upd12<"static"=1> {
-      j2.write_en = 1'd1;
-      add6.left = j2.out;
-      add6.right = const19.out;
-      j2.in = 1'd1 ? add6.out;
-      upd12[done] = j2.done ? 1'd1;
-    }
-    group upd13<"static"=1> {
-      i2.write_en = 1'd1;
-      add7.left = i2.out;
-      add7.right = const20.out;
-      i2.in = 1'd1 ? add7.out;
-      upd13[done] = i2.done ? 1'd1;
-    }
-    group upd2<"static"=1> {
-      j0.write_en = 1'd1;
-      add0.left = j0.out;
-      add0.right = const4.out;
-      j0.in = 1'd1 ? add0.out;
-      upd2[done] = j0.done ? 1'd1;
-    }
-    group upd3<"static"=1> {
-      i0.write_en = 1'd1;
-      add1.left = i0.out;
-      add1.right = const5.out;
-      i0.in = 1'd1 ? add1.out;
-      upd3[done] = i0.done ? 1'd1;
-    }
-    group upd4<"static"=1> {
-      x3_read0_0.write_en = 1'd1;
-      x30_0_addr1 = k0.out;
-      x30_0_addr0 = i1.out;
-      x3_read0_0.in = 1'd1 ? x30_0_read_data;
-      upd4[done] = x3_read0_0.done ? 1'd1;
-    }
-    group upd5<"static"=1> {
-      transpose_fc2_weight_read0_0.write_en = 1'd1;
-      transpose_fc2_weight0_0.addr1 = j1.out;
-      transpose_fc2_weight0_0.addr0 = k0.out;
-      transpose_fc2_weight_read0_0.in = 1'd1 ? transpose_fc2_weight0_0.read_data;
-      upd5[done] = transpose_fc2_weight_read0_0.done ? 1'd1;
-    }
-    group upd6<"static"=1> {
-      temporary_x40_0.addr1 = j1.out;
-      temporary_x40_0.addr0 = i1.out;
-      temporary_x40_0.write_en = 1'd1;
-      add2.left = temporary_x40_0.read_data;
-      add2.right = product_0.out;
-      temporary_x40_0.addr1 = j1.out;
-      temporary_x40_0.addr0 = i1.out;
-      temporary_x40_0.write_data = 1'd1 ? add2.out;
-      upd6[done] = temporary_x40_0.done ? 1'd1;
-    }
-    group upd7<"static"=1> {
-      k0.write_en = 1'd1;
-      add3.left = k0.out;
-      add3.right = const12.out;
-      k0.in = 1'd1 ? add3.out;
-      upd7[done] = k0.done ? 1'd1;
-    }
-    group upd8<"static"=1> {
-      j1.write_en = 1'd1;
-      add4.left = j1.out;
-      add4.right = const13.out;
-      j1.in = 1'd1 ? add4.out;
-      upd8[done] = j1.done ? 1'd1;
-    }
-    group upd9<"static"=1> {
-      i1.write_en = 1'd1;
-      add5.left = i1.out;
-      add5.right = const14.out;
-      i1.in = 1'd1 ? add5.out;
-      upd9[done] = i1.done ? 1'd1;
-    }
-  }
-
-  control {
-    seq {
-      let0;
-      while le0.out with cond0 {
-        seq {
-          let1;
-          while le1.out with cond1 {
-            seq {
-              upd0;
-              upd1;
-              upd2;
-            }
-          }
-          upd3;
-        }
-      }
-      let2;
-      while le2.out with cond2 {
-        seq {
-          let3;
-          while le3.out with cond3 {
-            seq {
-              let4;
-              while le4.out with cond4 {
-                seq {
-                  par {
-                    upd4;
-                    upd5;
-                  }
-                  let5;
-                  let6;
-                  upd6;
-                  upd7;
-                }
-              }
-              upd8;
-            }
-          }
-          upd9;
-        }
-      }
-      let7;
-      while le5.out with cond5 {
-        seq {
-          let8;
-          while le6.out with cond6 {
-            seq {
-              upd10;
-              upd11;
-              upd12;
-            }
-          }
-          upd13;
-        }
-      }
-    }
-  }
-}
-component relu(go: 1, clk: 1, x20_0_read_data: 32, x20_0_done: 1, x30_0_read_data: 32, x30_0_done: 1) -> (done: 1, x20_0_addr0: 1, x20_0_addr1: 8, x20_0_write_data: 32, x20_0_write_en: 1, x20_0_clk: 1, x30_0_addr0: 1, x30_0_addr1: 8, x30_0_write_data: 32, x30_0_write_en: 1, x30_0_clk: 1) {
-  cells {
-    add0 = prim std_add(8);
-    add1 = prim std_add(1);
-    const0 = prim std_const(1, 0);
-    const1 = prim std_const(1, 0);
-    const2 = prim std_const(8, 0);
-    const3 = prim std_const(8, 127);
-    const4 = prim std_const(8, 1);
-    const5 = prim std_const(1, 1);
-    fpconst0 = prim fixed_p_std_const(32, 16, 16, 0, 0);
-    gt0 = prim fixed_p_std_gt(32, 16, 16);
-    i0 = prim std_reg(1);
-    j0 = prim std_reg(8);
-    le0 = prim std_le(1);
-    le1 = prim std_le(8);
-    x2_read0_0 = prim std_reg(32);
-    x2_read1_0 = prim std_reg(32);
-    zero_0 = prim std_reg(32);
-  }
-  wires {
-    group cond0<"static"=0> {
-      cond0[done] = 1'd1;
-      le0.left = i0.out;
-      le0.right = const1.out;
-    }
-    group cond1<"static"=0> {
-      cond1[done] = 1'd1;
-      le1.left = j0.out;
-      le1.right = const3.out;
-    }
-    group cond2<"static"=0> {
-      cond2[done] = 1'd1;
-      gt0.left = x2_read0_0.out;
-      gt0.right = zero_0.out;
-    }
-    group let0<"static"=1> {
-      zero_0.in = fpconst0.out;
-      zero_0.write_en = 1'd1;
-      let0[done] = zero_0.done;
-    }
-    group let1<"static"=1> {
-      i0.in = const0.out;
-      i0.write_en = 1'd1;
-      let1[done] = i0.done;
-    }
-    group let2<"static"=1> {
-      j0.in = const2.out;
-      j0.write_en = 1'd1;
-      let2[done] = j0.done;
-    }
-    group upd0<"static"=1> {
-      x2_read0_0.write_en = 1'd1;
-      x20_0_addr1 = j0.out;
-      x20_0_addr0 = i0.out;
-      x2_read0_0.in = 1'd1 ? x20_0_read_data;
-      upd0[done] = x2_read0_0.done ? 1'd1;
-    }
-    group upd1<"static"=1> {
-      x2_read1_0.write_en = 1'd1;
-      x20_0_addr1 = j0.out;
-      x20_0_addr0 = i0.out;
-      x2_read1_0.in = 1'd1 ? x20_0_read_data;
-      upd1[done] = x2_read1_0.done ? 1'd1;
-    }
-    group upd2<"static"=1> {
-      x30_0_addr1 = j0.out;
-      x30_0_addr0 = i0.out;
-      x30_0_write_en = 1'd1;
-      x30_0_write_data = 1'd1 ? x2_read1_0.out;
-      upd2[done] = x30_0_done ? 1'd1;
-    }
-    group upd3<"static"=1> {
-      x30_0_addr1 = j0.out;
-      x30_0_addr0 = i0.out;
-      x30_0_write_en = 1'd1;
-      x30_0_write_data = 1'd1 ? zero_0.out;
-      upd3[done] = x30_0_done ? 1'd1;
-    }
-    group upd4<"static"=1> {
-      j0.write_en = 1'd1;
-      add0.left = j0.out;
-      add0.right = const4.out;
-      j0.in = 1'd1 ? add0.out;
-      upd4[done] = j0.done ? 1'd1;
-    }
-    group upd5<"static"=1> {
-      i0.write_en = 1'd1;
-      add1.left = i0.out;
-      add1.right = const5.out;
-      i0.in = 1'd1 ? add1.out;
-      upd5[done] = i0.done ? 1'd1;
-    }
-  }
-
-  control {
-    seq {
-      let0;
-      let1;
-      while le0.out with cond0 {
-        seq {
-          let2;
-          while le1.out with cond1 {
-            seq {
-              upd0;
-              if gt0.out with cond2 {
-                seq {
-                  upd1;
-                  upd2;
-                }
-              } else {
-                upd3;
-              }
-              upd4;
-            }
-          }
-          upd5;
-        }
-      }
-    }
-  }
-}
-component bias_add(go: 1, clk: 1, fc1_bias0_read_data: 32, fc1_bias0_done: 1, x10_0_read_data: 32, x10_0_done: 1, x20_0_read_data: 32, x20_0_done: 1) -> (done: 1, fc1_bias0_addr0: 8, fc1_bias0_write_data: 32, fc1_bias0_write_en: 1, fc1_bias0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 8, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1, x20_0_addr0: 1, x20_0_addr1: 8, x20_0_write_data: 32, x20_0_write_en: 1, x20_0_clk: 1) {
-  cells {
-    add0 = prim fixed_p_std_add(32, 16, 16);
-    add1 = prim std_add(8);
-    add2 = prim std_add(1);
-    const0 = prim std_const(1, 0);
-    const1 = prim std_const(1, 0);
-    const2 = prim std_const(8, 0);
-    const3 = prim std_const(8, 127);
-    const4 = prim std_const(8, 1);
-    const5 = prim std_const(1, 1);
-    fc1_bias_read0_0 = prim std_reg(32);
-    i0 = prim std_reg(1);
-    j0 = prim std_reg(8);
-    le0 = prim std_le(1);
-    le1 = prim std_le(8);
-    x1_read0_0 = prim std_reg(32);
-  }
-  wires {
-    group cond0<"static"=0> {
-      cond0[done] = 1'd1;
-      le0.left = i0.out;
-      le0.right = const1.out;
-    }
-    group cond1<"static"=0> {
-      cond1[done] = 1'd1;
-      le1.left = j0.out;
-      le1.right = const3.out;
-    }
-    group let0<"static"=1> {
-      i0.in = const0.out;
-      i0.write_en = 1'd1;
-      let0[done] = i0.done;
-    }
-    group let1<"static"=1> {
-      j0.in = const2.out;
-      j0.write_en = 1'd1;
-      let1[done] = j0.done;
-    }
-    group upd0<"static"=1> {
-      x1_read0_0.write_en = 1'd1;
-      x10_0_addr1 = j0.out;
-      x10_0_addr0 = i0.out;
-      x1_read0_0.in = 1'd1 ? x10_0_read_data;
-      upd0[done] = x1_read0_0.done ? 1'd1;
-    }
-    group upd1<"static"=1> {
-      fc1_bias_read0_0.write_en = 1'd1;
-      fc1_bias0_addr0 = j0.out;
-      fc1_bias_read0_0.in = 1'd1 ? fc1_bias0_read_data;
-      upd1[done] = fc1_bias_read0_0.done ? 1'd1;
-    }
-    group upd2<"static"=1> {
-      x20_0_addr1 = j0.out;
-      x20_0_addr0 = i0.out;
-      x20_0_write_en = 1'd1;
-      add0.left = x1_read0_0.out;
-      add0.right = fc1_bias_read0_0.out;
-      x20_0_write_data = 1'd1 ? add0.out;
-      upd2[done] = x20_0_done ? 1'd1;
-    }
-    group upd3<"static"=1> {
-      j0.write_en = 1'd1;
-      add1.left = j0.out;
-      add1.right = const4.out;
-      j0.in = 1'd1 ? add1.out;
-      upd3[done] = j0.done ? 1'd1;
-    }
-    group upd4<"static"=1> {
-      i0.write_en = 1'd1;
-      add2.left = i0.out;
-      add2.right = const5.out;
-      i0.in = 1'd1 ? add2.out;
-      upd4[done] = i0.done ? 1'd1;
-    }
-  }
-
-  control {
-    seq {
-      let0;
-      while le0.out with cond0 {
-        seq {
-          let1;
-          while le1.out with cond1 {
-            seq {
-              par {
-                upd0;
-                upd1;
-              }
-              upd2;
-              upd3;
-            }
-          }
-          upd4;
-        }
-      }
-    }
-  }
-}
-component dense(go: 1, clk: 1, fc1_weight0_0_read_data: 32, fc1_weight0_0_done: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1) -> (done: 1, fc1_weight0_0_addr0: 8, fc1_weight0_0_addr1: 10, fc1_weight0_0_write_data: 32, fc1_weight0_0_write_en: 1, fc1_weight0_0_clk: 1, x0_0_addr0: 1, x0_0_addr1: 10, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 8, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1) {
-  cells {
-    add0 = prim std_add(10);
-    add1 = prim std_add(8);
-    add2 = prim fixed_p_std_add(32, 16, 16);
-    add3 = prim std_add(10);
-    add4 = prim std_add(8);
-    add5 = prim std_add(1);
-    add6 = prim std_add(8);
-    add7 = prim std_add(1);
-    bin_read0_0 = prim std_reg(32);
-    const0 = prim std_const(8, 0);
-    const1 = prim std_const(8, 127);
-    const10 = prim std_const(10, 0);
-    const11 = prim std_const(10, 783);
-    const12 = prim std_const(10, 1);
-    const13 = prim std_const(8, 1);
-    const14 = prim std_const(1, 1);
-    const15 = prim std_const(1, 0);
-    const16 = prim std_const(1, 0);
-    const17 = prim std_const(8, 0);
-    const18 = prim std_const(8, 127);
-    const19 = prim std_const(8, 1);
-    const2 = prim std_const(10, 0);
-    const20 = prim std_const(1, 1);
-    const3 = prim std_const(10, 783);
-    const4 = prim std_const(10, 1);
-    const5 = prim std_const(8, 1);
-    const6 = prim std_const(1, 0);
-    const7 = prim std_const(1, 0);
-    const8 = prim std_const(8, 0);
-    const9 = prim std_const(8, 127);
-    fc1_weight_read0_0 = prim std_reg(32);
-    i0 = prim std_reg(8);
-    i1 = prim std_reg(1);
-    i2 = prim std_reg(1);
-    j0 = prim std_reg(10);
-    j1 = prim std_reg(8);
-    j2 = prim std_reg(8);
-    k0 = prim std_reg(10);
-    le0 = prim std_le(8);
-    le1 = prim std_le(10);
-    le2 = prim std_le(1);
-    le3 = prim std_le(8);
-    le4 = prim std_le(10);
-    le5 = prim std_le(1);
-    le6 = prim std_le(8);
-    mult_pipe0 = prim std_mult_pipe(32);
-    product_0 = prim std_reg(32);
-    slice0 = prim std_slice(32, 32);
-    slice1 = prim std_slice(32, 32);
-    temporary_x10_0 = prim std_mem_d2(32, 1, 128, 1, 8);
-    temporary_x1_read0_0 = prim std_reg(32);
-    transpose_fc1_weight0_0 = prim std_mem_d2(32, 784, 128, 10, 8);
-    transpose_fc1_weight_read0_0 = prim std_reg(32);
-    x_read0_0 = prim std_reg(32);
-  }
-  wires {
-    group cond0<"static"=0> {
-      cond0[done] = 1'd1;
-      le0.left = i0.out;
-      le0.right = const1.out;
-    }
-    group cond1<"static"=0> {
-      cond1[done] = 1'd1;
-      le1.left = j0.out;
-      le1.right = const3.out;
-    }
-    group cond2<"static"=0> {
-      cond2[done] = 1'd1;
-      le2.left = i1.out;
-      le2.right = const7.out;
-    }
-    group cond3<"static"=0> {
-      cond3[done] = 1'd1;
-      le3.left = j1.out;
-      le3.right = const9.out;
-    }
-    group cond4<"static"=0> {
-      cond4[done] = 1'd1;
-      le4.left = k0.out;
-      le4.right = const11.out;
-    }
-    group cond5<"static"=0> {
-      cond5[done] = 1'd1;
-      le5.left = i2.out;
-      le5.right = const16.out;
-    }
-    group cond6<"static"=0> {
-      cond6[done] = 1'd1;
-      le6.left = j2.out;
-      le6.right = const18.out;
-    }
-    group let0<"static"=1> {
-      i0.in = const0.out;
-      i0.write_en = 1'd1;
-      let0[done] = i0.done;
-    }
-    group let1<"static"=1> {
-      j0.in = const2.out;
-      j0.write_en = 1'd1;
-      let1[done] = j0.done;
-    }
-    group let2<"static"=1> {
-      i1.in = const6.out;
-      i1.write_en = 1'd1;
-      let2[done] = i1.done;
-    }
-    group let3<"static"=1> {
-      j1.in = const8.out;
-      j1.write_en = 1'd1;
-      let3[done] = j1.done;
-    }
-    group let4<"static"=1> {
-      k0.in = const10.out;
-      k0.write_en = 1'd1;
-      let4[done] = k0.done;
-    }
-    group let5<"static"=1> {
-      bin_read0_0.in = slice0.out;
-      bin_read0_0.write_en = 1'd1;
-      let5[done] = bin_read0_0.done;
-      slice0.in = mult_pipe0.out;
-      mult_pipe0.left = x_read0_0.out;
-      mult_pipe0.right = transpose_fc1_weight_read0_0.out;
-      mult_pipe0.go = !mult_pipe0.done ? 1'd1;
-    }
-    group let6<"static"=1> {
-      product_0.in = slice1.out;
-      product_0.write_en = 1'd1;
-      let6[done] = product_0.done;
-      slice1.in = bin_read0_0.out;
-    }
-    group let7<"static"=1> {
-      i2.in = const15.out;
-      i2.write_en = 1'd1;
-      let7[done] = i2.done;
-    }
-    group let8<"static"=1> {
-      j2.in = const17.out;
-      j2.write_en = 1'd1;
-      let8[done] = j2.done;
-    }
-    group upd0<"static"=1> {
-      fc1_weight_read0_0.write_en = 1'd1;
-      fc1_weight0_0_addr1 = j0.out;
-      fc1_weight0_0_addr0 = i0.out;
-      fc1_weight_read0_0.in = 1'd1 ? fc1_weight0_0_read_data;
-      upd0[done] = fc1_weight_read0_0.done ? 1'd1;
-    }
-    group upd1<"static"=1> {
-      transpose_fc1_weight0_0.addr1 = i0.out;
-      transpose_fc1_weight0_0.addr0 = j0.out;
-      transpose_fc1_weight0_0.write_en = 1'd1;
-      transpose_fc1_weight0_0.write_data = 1'd1 ? fc1_weight_read0_0.out;
-      upd1[done] = transpose_fc1_weight0_0.done ? 1'd1;
-    }
-    group upd10<"static"=1> {
-      temporary_x1_read0_0.write_en = 1'd1;
-      temporary_x10_0.addr1 = j2.out;
-      temporary_x10_0.addr0 = i2.out;
-      temporary_x1_read0_0.in = 1'd1 ? temporary_x10_0.read_data;
-      upd10[done] = temporary_x1_read0_0.done ? 1'd1;
-    }
-    group upd11<"static"=1> {
-      x10_0_addr1 = j2.out;
-      x10_0_addr0 = i2.out;
-      x10_0_write_en = 1'd1;
-      x10_0_write_data = 1'd1 ? temporary_x1_read0_0.out;
-      upd11[done] = x10_0_done ? 1'd1;
-    }
-    group upd12<"static"=1> {
-      j2.write_en = 1'd1;
-      add6.left = j2.out;
-      add6.right = const19.out;
-      j2.in = 1'd1 ? add6.out;
-      upd12[done] = j2.done ? 1'd1;
-    }
-    group upd13<"static"=1> {
-      i2.write_en = 1'd1;
-      add7.left = i2.out;
-      add7.right = const20.out;
-      i2.in = 1'd1 ? add7.out;
-      upd13[done] = i2.done ? 1'd1;
-    }
-    group upd2<"static"=1> {
-      j0.write_en = 1'd1;
-      add0.left = j0.out;
-      add0.right = const4.out;
-      j0.in = 1'd1 ? add0.out;
-      upd2[done] = j0.done ? 1'd1;
-    }
-    group upd3<"static"=1> {
-      i0.write_en = 1'd1;
-      add1.left = i0.out;
-      add1.right = const5.out;
-      i0.in = 1'd1 ? add1.out;
-      upd3[done] = i0.done ? 1'd1;
-    }
-    group upd4<"static"=1> {
-      x_read0_0.write_en = 1'd1;
-      x0_0_addr1 = k0.out;
-      x0_0_addr0 = i1.out;
-      x_read0_0.in = 1'd1 ? x0_0_read_data;
-      upd4[done] = x_read0_0.done ? 1'd1;
-    }
-    group upd5<"static"=1> {
-      transpose_fc1_weight_read0_0.write_en = 1'd1;
-      transpose_fc1_weight0_0.addr1 = j1.out;
-      transpose_fc1_weight0_0.addr0 = k0.out;
-      transpose_fc1_weight_read0_0.in = 1'd1 ? transpose_fc1_weight0_0.read_data;
-      upd5[done] = transpose_fc1_weight_read0_0.done ? 1'd1;
-    }
-    group upd6<"static"=1> {
-      temporary_x10_0.addr1 = j1.out;
-      temporary_x10_0.addr0 = i1.out;
-      temporary_x10_0.write_en = 1'd1;
-      add2.left = temporary_x10_0.read_data;
-      add2.right = product_0.out;
-      temporary_x10_0.addr1 = j1.out;
-      temporary_x10_0.addr0 = i1.out;
-      temporary_x10_0.write_data = 1'd1 ? add2.out;
-      upd6[done] = temporary_x10_0.done ? 1'd1;
-    }
-    group upd7<"static"=1> {
-      k0.write_en = 1'd1;
-      add3.left = k0.out;
-      add3.right = const12.out;
-      k0.in = 1'd1 ? add3.out;
-      upd7[done] = k0.done ? 1'd1;
-    }
-    group upd8<"static"=1> {
-      j1.write_en = 1'd1;
-      add4.left = j1.out;
-      add4.right = const13.out;
-      j1.in = 1'd1 ? add4.out;
-      upd8[done] = j1.done ? 1'd1;
-    }
-    group upd9<"static"=1> {
-      i1.write_en = 1'd1;
-      add5.left = i1.out;
-      add5.right = const14.out;
-      i1.in = 1'd1 ? add5.out;
-      upd9[done] = i1.done ? 1'd1;
-    }
-  }
-
-  control {
-    seq {
-      let0;
-      while le0.out with cond0 {
-        seq {
-          let1;
-          while le1.out with cond1 {
-            seq {
-              upd0;
-              upd1;
-              upd2;
-            }
-          }
-          upd3;
-        }
-      }
-      let2;
-      while le2.out with cond2 {
-        seq {
-          let3;
-          while le3.out with cond3 {
-            seq {
-              let4;
-              while le4.out with cond4 {
-                seq {
-                  par {
-                    upd4;
-                    upd5;
-                  }
-                  let5;
-                  let6;
-                  upd6;
-                  upd7;
-                }
-              }
-              upd8;
-            }
-          }
-          upd9;
-        }
-      }
-      let7;
-      while le5.out with cond5 {
-        seq {
-          let8;
-          while le6.out with cond6 {
-            seq {
-              upd10;
-              upd11;
-              upd12;
-            }
-          }
-          upd13;
-        }
-      }
-    }
-  }
-}
-component batch_flatten(go: 1, clk: 1, data0_0_0_0_read_data: 32, data0_0_0_0_done: 1, x0_0_read_data: 32, x0_0_done: 1) -> (done: 1, data0_0_0_0_addr0: 1, data0_0_0_0_addr1: 1, data0_0_0_0_addr2: 5, data0_0_0_0_addr3: 5, data0_0_0_0_write_data: 32, data0_0_0_0_write_en: 1, data0_0_0_0_clk: 1, x0_0_addr0: 1, x0_0_addr1: 10, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1) {
-  cells {
-    add0 = prim std_add(10);
-    add1 = prim std_add(5);
-    add2 = prim std_add(5);
-    add3 = prim std_add(1);
-    add4 = prim std_add(1);
-    const0 = prim std_const(10, 0);
-    const1 = prim std_const(1, 0);
-    const10 = prim std_const(5, 1);
-    const11 = prim std_const(5, 1);
-    const12 = prim std_const(1, 1);
-    const13 = prim std_const(1, 1);
-    const2 = prim std_const(1, 0);
-    const3 = prim std_const(1, 0);
-    const4 = prim std_const(1, 0);
-    const5 = prim std_const(5, 0);
-    const6 = prim std_const(5, 27);
-    const7 = prim std_const(5, 0);
-    const8 = prim std_const(5, 27);
-    const9 = prim std_const(10, 1);
-    data_read0_0 = prim std_reg(32);
-    i0 = prim std_reg(1);
-    j0 = prim std_reg(1);
-    k0 = prim std_reg(5);
-    l0 = prim std_reg(5);
-    le0 = prim std_le(1);
-    le1 = prim std_le(1);
-    le2 = prim std_le(5);
-    le3 = prim std_le(5);
-    m_0 = prim std_reg(10);
-  }
-  wires {
-    group cond0<"static"=0> {
-      cond0[done] = 1'd1;
-      le0.left = i0.out;
-      le0.right = const2.out;
-    }
-    group cond1<"static"=0> {
-      cond1[done] = 1'd1;
-      le1.left = j0.out;
-      le1.right = const4.out;
-    }
-    group cond2<"static"=0> {
-      cond2[done] = 1'd1;
-      le2.left = k0.out;
-      le2.right = const6.out;
-    }
-    group cond3<"static"=0> {
-      cond3[done] = 1'd1;
-      le3.left = l0.out;
-      le3.right = const8.out;
-    }
-    group let0<"static"=1> {
-      m_0.in = const0.out;
-      m_0.write_en = 1'd1;
-      let0[done] = m_0.done;
-    }
-    group let1<"static"=1> {
-      i0.in = const1.out;
-      i0.write_en = 1'd1;
-      let1[done] = i0.done;
-    }
-    group let2<"static"=1> {
-      j0.in = const3.out;
-      j0.write_en = 1'd1;
-      let2[done] = j0.done;
-    }
-    group let3<"static"=1> {
-      k0.in = const5.out;
-      k0.write_en = 1'd1;
-      let3[done] = k0.done;
-    }
-    group let4<"static"=1> {
-      l0.in = const7.out;
-      l0.write_en = 1'd1;
-      let4[done] = l0.done;
-    }
-    group upd0<"static"=1> {
-      data_read0_0.write_en = 1'd1;
-      data0_0_0_0_addr3 = l0.out;
-      data0_0_0_0_addr2 = k0.out;
-      data0_0_0_0_addr1 = j0.out;
-      data0_0_0_0_addr0 = i0.out;
-      data_read0_0.in = 1'd1 ? data0_0_0_0_read_data;
-      upd0[done] = data_read0_0.done ? 1'd1;
-    }
-    group upd1<"static"=1> {
-      x0_0_addr1 = m_0.out;
-      x0_0_addr0 = i0.out;
-      x0_0_write_en = 1'd1;
-      x0_0_write_data = 1'd1 ? data_read0_0.out;
-      upd1[done] = x0_0_done ? 1'd1;
-    }
-    group upd2<"static"=1> {
-      m_0.write_en = 1'd1;
-      add0.left = m_0.out;
-      add0.right = const9.out;
-      m_0.in = 1'd1 ? add0.out;
-      upd2[done] = m_0.done ? 1'd1;
-    }
-    group upd3<"static"=1> {
-      l0.write_en = 1'd1;
-      add1.left = l0.out;
-      add1.right = const10.out;
-      l0.in = 1'd1 ? add1.out;
-      upd3[done] = l0.done ? 1'd1;
-    }
-    group upd4<"static"=1> {
-      k0.write_en = 1'd1;
-      add2.left = k0.out;
-      add2.right = const11.out;
-      k0.in = 1'd1 ? add2.out;
-      upd4[done] = k0.done ? 1'd1;
-    }
-    group upd5<"static"=1> {
-      j0.write_en = 1'd1;
-      add3.left = j0.out;
-      add3.right = const12.out;
-      j0.in = 1'd1 ? add3.out;
-      upd5[done] = j0.done ? 1'd1;
-    }
-    group upd6<"static"=1> {
-      i0.write_en = 1'd1;
-      add4.left = i0.out;
-      add4.right = const13.out;
-      i0.in = 1'd1 ? add4.out;
-      upd6[done] = i0.done ? 1'd1;
-    }
-  }
-
-  control {
-    seq {
-      let0;
-      let1;
-      while le0.out with cond0 {
-        seq {
-          let2;
-          while le1.out with cond1 {
-            seq {
-              let3;
-              while le2.out with cond2 {
-                seq {
-                  let4;
-                  while le3.out with cond3 {
-                    seq {
-                      upd0;
-                      upd1;
-                      upd2;
-                      upd3;
-                    }
-                  }
-                  upd4;
-                }
-              }
-              upd5;
-            }
-          }
-          upd6;
-        }
-      }
-    }
-  }
-}
-
-component main () -> () {
-  cells {
-    x8 = prim std_mem_d2(32, 1, 10, 1, 4);
-    x7 = prim std_mem_d2(32, 1, 10, 1, 4);
-    fc3_bias = prim std_mem_d1(32, 10, 4);
-    bias_add2 = bias_add2;
-    x6 = prim std_mem_d2(32, 1, 64, 1, 7);
-    fc3_weight = prim std_mem_d2(32, 10, 64, 4, 7);
-    dense2 = dense2;
-    x5 = prim std_mem_d2(32, 1, 64, 1, 7);
-    relu1 = relu1;
-    x4 = prim std_mem_d2(32, 1, 64, 1, 7);
-    fc2_bias = prim std_mem_d1(32, 64, 7);
-    bias_add1 = bias_add1;
-    x3 = prim std_mem_d2(32, 1, 128, 1, 8);
-    fc2_weight = prim std_mem_d2(32, 64, 128, 7, 8);
-    dense1 = dense1;
-    x2 = prim std_mem_d2(32, 1, 128, 1, 8);
-    relu0 = relu;
-    x1 = prim std_mem_d2(32, 1, 128, 1, 8);
-    fc1_bias = prim std_mem_d1(32, 128, 8);
-    bias_add0 = bias_add;
-    x = prim std_mem_d2(32, 1, 784, 1, 10);
-    fc1_weight = prim std_mem_d2(32, 128, 784, 8, 10);
-    dense0 = dense;
-    data = prim std_mem_d4(32, 1, 1, 28, 28, 1, 1, 5, 5);
-    batch_flatten0 = batch_flatten;
-  }
-  wires {
-    group run_batch_flatten {
-      data.addr0 = batch_flatten0.data0_0_0_0_addr0;
-      batch_flatten0.data0_0_0_0_read_data = data.read_data;
-      data.addr1 = batch_flatten0.data0_0_0_0_addr1;
-      data.addr2 = batch_flatten0.data0_0_0_0_addr2;
-      x.addr0 = batch_flatten0.x0_0_addr0;
-      x.addr1 = batch_flatten0.x0_0_addr1;
-      x.write_data = batch_flatten0.x0_0_write_data;
-      x.write_en = batch_flatten0.x0_0_write_en;
-      batch_flatten0.x0_0_done = x.done;
-      batch_flatten0.go = 1'd1;
-      run_batch_flatten[done] = batch_flatten0.done ? 1'd1;
-    }
-    group run_dense {
-      x.addr0 = dense0.x0_0_addr0;
-      dense0.x0_0_read_data = x.read_data;
-      x.addr1 = dense0.x0_0_addr1;
-      fc1_weight.addr0 = dense0.fc1_weight0_0_addr0;
-      dense0.fc1_weight0_0_read_data = fc1_weight.read_data;
-      fc1_weight.addr1 = dense0.fc1_weight0_0_addr1;
-      x1.addr0 = dense0.x10_0_addr0;
-      x1.addr1 = dense0.x10_0_addr1;
-      x1.write_data = dense0.x10_0_write_data;
-      x1.write_en = dense0.x10_0_write_en;
-      dense0.x10_0_done = x1.done;
-      dense0.go = 1'd1;
-      run_dense[done] = dense0.done ? 1'd1;
-    }
-    group run_bias_add {
-      x1.addr0 = bias_add0.x10_0_addr0;
-      bias_add0.x10_0_read_data = x1.read_data;
-      x1.addr1 = bias_add0.x10_0_addr1;
-      fc1_bias.addr0 = bias_add0.fc1_bias0_addr0;
-      bias_add0.fc1_bias0_read_data = fc1_bias.read_data;
-      x2.addr0 = bias_add0.x20_0_addr0;
-      x2.addr1 = bias_add0.x20_0_addr1;
-      x2.write_data = bias_add0.x20_0_write_data;
-      x2.write_en = bias_add0.x20_0_write_en;
-      bias_add0.x20_0_done = x2.done;
-      bias_add0.go = 1'd1;
-      run_bias_add[done] = bias_add0.done ? 1'd1;
-    }
-    group run_relu {
-      x2.addr0 = relu0.x20_0_addr0;
-      relu0.x20_0_read_data = x2.read_data;
-      x2.addr1 = relu0.x20_0_addr1;
-      x3.addr0 = relu0.x30_0_addr0;
-      x3.addr1 = relu0.x30_0_addr1;
-      x3.write_data = relu0.x30_0_write_data;
-      x3.write_en = relu0.x30_0_write_en;
-      relu0.x30_0_done = x3.done;
-      relu0.go = 1'd1;
-      run_relu[done] = relu0.done ? 1'd1;
-    }
-    group run_dense1 {
-      x3.addr0 = dense1.x30_0_addr0;
-      dense1.x30_0_read_data = x3.read_data;
-      x3.addr1 = dense1.x30_0_addr1;
-      fc2_weight.addr0 = dense1.fc2_weight0_0_addr0;
-      dense1.fc2_weight0_0_read_data = fc2_weight.read_data;
-      fc2_weight.addr1 = dense1.fc2_weight0_0_addr1;
-      x4.addr0 = dense1.x40_0_addr0;
-      x4.addr1 = dense1.x40_0_addr1;
-      x4.write_data = dense1.x40_0_write_data;
-      x4.write_en = dense1.x40_0_write_en;
-      dense1.x40_0_done = x4.done;
-      dense1.go = 1'd1;
-      run_dense1[done] = dense1.done ? 1'd1;
-    }
-    group run_bias_add1 {
-      x4.addr0 = bias_add1.x40_0_addr0;
-      bias_add1.x40_0_read_data = x4.read_data;
-      x4.addr1 = bias_add1.x40_0_addr1;
-      fc2_bias.addr0 = bias_add1.fc2_bias0_addr0;
-      bias_add1.fc2_bias0_read_data = fc2_bias.read_data;
-      x5.addr0 = bias_add1.x50_0_addr0;
-      x5.addr1 = bias_add1.x50_0_addr1;
-      x5.write_data = bias_add1.x50_0_write_data;
-      x5.write_en = bias_add1.x50_0_write_en;
-      bias_add1.x50_0_done = x5.done;
-      bias_add1.go = 1'd1;
-      run_bias_add1[done] = bias_add1.done ? 1'd1;
-    }
-    group run_relu1 {
-      x5.addr0 = relu1.x50_0_addr0;
-      relu1.x50_0_read_data = x5.read_data;
-      x5.addr1 = relu1.x50_0_addr1;
-      x6.addr0 = relu1.x60_0_addr0;
-      x6.addr1 = relu1.x60_0_addr1;
-      x6.write_data = relu1.x60_0_write_data;
-      x6.write_en = relu1.x60_0_write_en;
-      relu1.x60_0_done = x6.done;
-      relu1.go = 1'd1;
-      run_relu1[done] = relu1.done ? 1'd1;
-    }
-    group run_dense2 {
-      x6.addr0 = dense2.x60_0_addr0;
-      dense2.x60_0_read_data = x6.read_data;
-      x6.addr1 = dense2.x60_0_addr1;
-      fc3_weight.addr0 = dense2.fc3_weight0_0_addr0;
-      dense2.fc3_weight0_0_read_data = fc3_weight.read_data;
-      fc3_weight.addr1 = dense2.fc3_weight0_0_addr1;
-      x7.addr0 = dense2.x70_0_addr0;
-      x7.addr1 = dense2.x70_0_addr1;
-      x7.write_data = dense2.x70_0_write_data;
-      x7.write_en = dense2.x70_0_write_en;
-      dense2.x70_0_done = x7.done;
-      dense2.go = 1'd1;
-      run_dense2[done] = dense2.done ? 1'd1;
-    }
-    group run_bias_add2 {
-      x7.addr0 = bias_add2.x70_0_addr0;
-      bias_add2.x70_0_read_data = x7.read_data;
-      x7.addr1 = bias_add2.x70_0_addr1;
-      fc3_bias.addr0 = bias_add2.fc3_bias0_addr0;
-      bias_add2.fc3_bias0_read_data = fc3_bias.read_data;
-      x8.addr0 = bias_add2.x80_0_addr0;
-      x8.addr1 = bias_add2.x80_0_addr1;
-      x8.write_data = bias_add2.x80_0_write_data;
-      x8.write_en = bias_add2.x80_0_write_en;
-      bias_add2.x80_0_done = x8.done;
-      bias_add2.go = 1'd1;
-      run_bias_add2[done] = bias_add2.done ? 1'd1;
-    }
-  }
-  control {
-    seq {
-      run_batch_flatten;
-      run_dense;
-      run_bias_add;
-      run_relu;
-      run_dense1;
-      run_bias_add1;
-      run_relu1;
-      run_dense2;
-      run_bias_add2;
-    }
-  }
-}
diff --git a/frontends/relay-futil/tests/mlp_net.relay b/frontends/relay-futil/tests/mlp_net.relay
deleted file mode 100644
index 8943360100..0000000000
--- a/frontends/relay-futil/tests/mlp_net.relay
+++ /dev/null
@@ -1,17 +0,0 @@
-v0.0.4
-fn (%data: Tensor[(1, 1, 28, 28), float32], %fc1_weight: Tensor[(128, 784), float32], %fc1_bias: Tensor[(128), float32],
-    %fc2_weight: Tensor[(64, 128), float32], %fc2_bias: Tensor[(64), float32], %fc3_weight: Tensor[(10, 64), float32],
-    %fc3_bias: Tensor[(10), float32]) -> Tensor[(1, 10), float32] {
-  let %x: Tensor[(1, 784), float32] = nn.batch_flatten(%data) /* ty=Tensor[(1, 784), float32] */;
-  let %x1: Tensor[(1, 128), float32] = nn.dense(%x, %fc1_weight, units=128) /* ty=Tensor[(1, 128), float32] */;
-  let %x2: Tensor[(1, 128), float32] = nn.bias_add(%x1, %fc1_bias, axis=-1) /* ty=Tensor[(1, 128), float32] */;
-  let %x3: Tensor[(1, 128), float32] = nn.relu(%x2) /* ty=Tensor[(1, 128), float32] */;
-  let %x4: Tensor[(1, 64), float32] = nn.dense(%x3, %fc2_weight, units=64) /* ty=Tensor[(1, 64), float32] */;
-  let %x5: Tensor[(1, 64), float32] = nn.bias_add(%x4, %fc2_bias, axis=-1) /* ty=Tensor[(1, 64), float32] */;
-  let %x6: Tensor[(1, 64), float32] = nn.relu(%x5) /* ty=Tensor[(1, 64), float32] */;
-  let %x7: Tensor[(1, 10), float32] = nn.dense(%x6, %fc3_weight, units=10) /* ty=Tensor[(1, 10), float32] */;
-  let %x8: Tensor[(1, 10), float32] = nn.bias_add(%x7, %fc3_bias, axis=-1) /* ty=Tensor[(1, 10), float32] */;
-  %x8
-  // let %x9: Tensor[(1, 10), float32] = nn.softmax(%x8) /* ty=Tensor[(1, 10), float32] */;
-  // %x9
-}
diff --git a/frontends/relay-futil/tests/relu.expect b/frontends/relay-futil/tests/relu.expect
index 74b5646d9b..66f3dd53e8 100644
--- a/frontends/relay-futil/tests/relu.expect
+++ b/frontends/relay-futil/tests/relu.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component relu(go: 1, clk: 1, x0_0_0_0_read_data: 32, x0_0_0_0_done: 1, x10_0_0_0_read_data: 32, x10_0_0_0_done: 1) -> (done: 1, x0_0_0_0_addr0: 2, x0_0_0_0_addr1: 3, x0_0_0_0_addr2: 4, x0_0_0_0_addr3: 6, x0_0_0_0_write_data: 32, x0_0_0_0_write_en: 1, x0_0_0_0_clk: 1, x10_0_0_0_addr0: 2, x10_0_0_0_addr1: 3, x10_0_0_0_addr2: 4, x10_0_0_0_addr3: 6, x10_0_0_0_write_data: 32, x10_0_0_0_write_en: 1, x10_0_0_0_clk: 1) {
+component relu0(go: 1, clk: 1, x0_0_0_0_read_data: 32, x0_0_0_0_done: 1, x10_0_0_0_read_data: 32, x10_0_0_0_done: 1) -> (done: 1, x0_0_0_0_addr0: 2, x0_0_0_0_addr1: 3, x0_0_0_0_addr2: 4, x0_0_0_0_addr3: 6, x0_0_0_0_write_data: 32, x0_0_0_0_write_en: 1, x0_0_0_0_clk: 1, x10_0_0_0_addr0: 2, x10_0_0_0_addr1: 3, x10_0_0_0_addr2: 4, x10_0_0_0_addr3: 6, x10_0_0_0_write_data: 32, x10_0_0_0_write_en: 1, x10_0_0_0_clk: 1) {
   cells {
     add0 = prim std_add(6);
     add1 = prim std_add(4);
@@ -193,25 +193,29 @@ component main () -> () {
   cells {
     x1 = prim std_mem_d4(32, 2, 4, 8, 32, 2, 3, 4, 6);
     x = prim std_mem_d4(32, 2, 4, 8, 32, 2, 3, 4, 6);
-    relu0 = relu;
+    comp_relu0 = relu0;
   }
   wires {
-    group run_relu {
-      x.addr0 = relu0.x0_0_0_0_addr0;
-      relu0.x0_0_0_0_read_data = x.read_data;
-      x.addr1 = relu0.x0_0_0_0_addr1;
-      x.addr2 = relu0.x0_0_0_0_addr2;
-      x1.addr0 = relu0.x10_0_0_0_addr0;
-      x1.write_data = relu0.x10_0_0_0_write_data;
-      x1.write_en = relu0.x10_0_0_0_write_en;
-      relu0.x10_0_0_0_done = x1.done;
-      relu0.go = 1'd1;
-      run_relu[done] = relu0.done ? 1'd1;
+    group run_relu0 {
+      x.addr0 = comp_relu0.x0_0_0_0_addr0;
+      comp_relu0.x0_0_0_0_read_data = x.read_data;
+      x.addr1 = comp_relu0.x0_0_0_0_addr1;
+      x.addr2 = comp_relu0.x0_0_0_0_addr2;
+      x.addr3 = comp_relu0.x0_0_0_0_addr3;
+      x1.addr0 = comp_relu0.x10_0_0_0_addr0;
+      x1.addr1 = comp_relu0.x10_0_0_0_addr1;
+      x1.addr2 = comp_relu0.x10_0_0_0_addr2;
+      x1.addr3 = comp_relu0.x10_0_0_0_addr3;
+      x1.write_data = comp_relu0.x10_0_0_0_write_data;
+      x1.write_en = comp_relu0.x10_0_0_0_write_en;
+      comp_relu0.x10_0_0_0_done = x1.done;
+      comp_relu0.go = 1'd1;
+      run_relu0[done] = comp_relu0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_relu;
+      run_relu0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/softmax.expect b/frontends/relay-futil/tests/softmax.expect
new file mode 100644
index 0000000000..128b2ebc0d
--- /dev/null
+++ b/frontends/relay-futil/tests/softmax.expect
@@ -0,0 +1,210 @@
+import "primitives/std.lib";
+
+component softmax0(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1) -> (done: 1, x0_0_addr0: 1, x0_0_addr1: 4, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 1, x10_0_addr1: 4, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1) {
+  cells {
+    add0 = prim fixed_p_std_add(32, 16, 16);
+    add1 = prim std_add(4);
+    add2 = prim std_add(4);
+    add3 = prim std_add(1);
+    bin_read0_0 = prim std_reg(32);
+    const0 = prim std_const(1, 0);
+    const1 = prim std_const(1, 0);
+    const2 = prim std_const(4, 0);
+    const3 = prim std_const(4, 9);
+    const4 = prim std_const(4, 1);
+    const5 = prim std_const(4, 0);
+    const6 = prim std_const(4, 9);
+    const7 = prim std_const(4, 1);
+    const8 = prim std_const(1, 1);
+    div_pipe0 = prim std_div_pipe(32);
+    exp0 = prim std_exp();
+    exp1 = prim std_exp();
+    fpconst0 = prim fixed_p_std_const(32, 16, 16, 0, 0);
+    i0 = prim std_reg(1);
+    j0 = prim std_reg(4);
+    k0 = prim std_reg(4);
+    le0 = prim std_le(1);
+    le1 = prim std_le(4);
+    le2 = prim std_le(4);
+    slice0 = prim std_slice(32, 32);
+    x1_read0_0 = prim std_reg(32);
+    x_expsum_0 = prim std_reg(32);
+    x_read0_0 = prim std_reg(32);
+    x_read1_0 = prim std_reg(32);
+  }
+  wires {
+    group cond0<"static"=0> {
+      cond0[done] = 1'd1;
+      le0.left = i0.out;
+      le0.right = const1.out;
+    }
+    group cond1<"static"=0> {
+      cond1[done] = 1'd1;
+      le1.left = j0.out;
+      le1.right = const3.out;
+    }
+    group cond2<"static"=0> {
+      cond2[done] = 1'd1;
+      le2.left = k0.out;
+      le2.right = const6.out;
+    }
+    group let0<"static"=1> {
+      i0.in = const0.out;
+      i0.write_en = 1'd1;
+      let0[done] = i0.done;
+    }
+    group let1<"static"=1> {
+      x_expsum_0.in = fpconst0.out;
+      x_expsum_0.write_en = 1'd1;
+      let1[done] = x_expsum_0.done;
+    }
+    group let2<"static"=1> {
+      j0.in = const2.out;
+      j0.write_en = 1'd1;
+      let2[done] = j0.done;
+    }
+    group let3<"static"=1> {
+      k0.in = const5.out;
+      k0.write_en = 1'd1;
+      let3[done] = k0.done;
+    }
+    group let4<"static"=1> {
+      bin_read0_0.in = slice0.out;
+      bin_read0_0.write_en = 1'd1;
+      let4[done] = bin_read0_0.done;
+      slice0.in = div_pipe0.out;
+      div_pipe0.left = x1_read0_0.out;
+      div_pipe0.right = x_expsum_0.out;
+      div_pipe0.go = !div_pipe0.done ? 1'd1;
+    }
+    group upd0<"static"=1> {
+      x_read0_0.write_en = 1'd1;
+      x0_0_addr1 = j0.out;
+      x0_0_addr0 = i0.out;
+      x_read0_0.in = 1'd1 ? x0_0_read_data;
+      upd0[done] = x_read0_0.done ? 1'd1;
+    }
+    group upd1 {
+      x_expsum_0.write_en = 1'd1;
+      add0.left = x_expsum_0.out;
+      add0.right = exp0.out;
+      exp0.exponent = x_read0_0.out;
+      exp0.go = !exp0.done ? 1'd1;
+      x_expsum_0.in = 1'd1 ? add0.out;
+      upd1[done] = x_expsum_0.done ? 1'd1;
+    }
+    group upd2<"static"=1> {
+      j0.write_en = 1'd1;
+      add1.left = j0.out;
+      add1.right = const4.out;
+      j0.in = 1'd1 ? add1.out;
+      upd2[done] = j0.done ? 1'd1;
+    }
+    group upd3<"static"=1> {
+      x_read1_0.write_en = 1'd1;
+      x0_0_addr1 = k0.out;
+      x0_0_addr0 = i0.out;
+      x_read1_0.in = 1'd1 ? x0_0_read_data;
+      upd3[done] = x_read1_0.done ? 1'd1;
+    }
+    group upd4 {
+      x10_0_addr1 = k0.out;
+      x10_0_addr0 = i0.out;
+      x10_0_write_en = exp1.done;
+      exp1.exponent = x_read1_0.out;
+      exp1.go = !exp1.done ? 1'd1;
+      x10_0_write_data = exp1.done ? exp1.out;
+      upd4[done] = x10_0_done ? 1'd1;
+    }
+    group upd5<"static"=1> {
+      x1_read0_0.write_en = 1'd1;
+      x10_0_addr1 = k0.out;
+      x10_0_addr0 = i0.out;
+      x1_read0_0.in = 1'd1 ? x10_0_read_data;
+      upd5[done] = x1_read0_0.done ? 1'd1;
+    }
+    group upd6<"static"=1> {
+      x10_0_addr1 = k0.out;
+      x10_0_addr0 = i0.out;
+      x10_0_write_en = 1'd1;
+      x10_0_write_data = 1'd1 ? bin_read0_0.out;
+      upd6[done] = x10_0_done ? 1'd1;
+    }
+    group upd7<"static"=1> {
+      k0.write_en = 1'd1;
+      add2.left = k0.out;
+      add2.right = const7.out;
+      k0.in = 1'd1 ? add2.out;
+      upd7[done] = k0.done ? 1'd1;
+    }
+    group upd8<"static"=1> {
+      i0.write_en = 1'd1;
+      add3.left = i0.out;
+      add3.right = const8.out;
+      i0.in = 1'd1 ? add3.out;
+      upd8[done] = i0.done ? 1'd1;
+    }
+  }
+
+  control {
+    seq {
+      let0;
+      while le0.out with cond0 {
+        seq {
+          par {
+            let1;
+            seq {
+              let2;
+              while le1.out with cond1 {
+                seq {
+                  upd0;
+                  upd1;
+                  upd2;
+                }
+              }
+            }
+          }
+          let3;
+          while le2.out with cond2 {
+            seq {
+              upd3;
+              upd4;
+              upd5;
+              let4;
+              upd6;
+              upd7;
+            }
+          }
+          upd8;
+        }
+      }
+    }
+  }
+}
+
+component main () -> () {
+  cells {
+    x1 = prim std_mem_d2(32, 1, 10, 1, 4);
+    x = prim std_mem_d2(32, 1, 10, 1, 4);
+    comp_softmax0 = softmax0;
+  }
+  wires {
+    group run_softmax0 {
+      x.addr0 = comp_softmax0.x0_0_addr0;
+      comp_softmax0.x0_0_read_data = x.read_data;
+      x.addr1 = comp_softmax0.x0_0_addr1;
+      x1.addr0 = comp_softmax0.x10_0_addr0;
+      x1.addr1 = comp_softmax0.x10_0_addr1;
+      x1.write_data = comp_softmax0.x10_0_write_data;
+      x1.write_en = comp_softmax0.x10_0_write_en;
+      comp_softmax0.x10_0_done = x1.done;
+      comp_softmax0.go = 1'd1;
+      run_softmax0[done] = comp_softmax0.done ? 1'd1;
+    }
+  }
+  control {
+    seq {
+      run_softmax0;
+    }
+  }
+}
diff --git a/frontends/relay-futil/tests/softmax.relay b/frontends/relay-futil/tests/softmax.relay
new file mode 100644
index 0000000000..df46a20d70
--- /dev/null
+++ b/frontends/relay-futil/tests/softmax.relay
@@ -0,0 +1,6 @@
+v0.0.4
+fn (%x: Tensor[(1, 10), float32]) {
+  let %x1: Tensor[(1, 10), float32] = nn.softmax(%x);
+  %x1
+}
+
diff --git a/frontends/relay-futil/tests/sqrt.expect b/frontends/relay-futil/tests/sqrt.expect
index edb40c6259..ab67351192 100644
--- a/frontends/relay-futil/tests/sqrt.expect
+++ b/frontends/relay-futil/tests/sqrt.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component sqrt(go: 1, clk: 1, x0_0_0_0_read_data: 32, x0_0_0_0_done: 1, x10_0_0_0_read_data: 32, x10_0_0_0_done: 1) -> (done: 1, x0_0_0_0_addr0: 4, x0_0_0_0_addr1: 4, x0_0_0_0_addr2: 5, x0_0_0_0_addr3: 7, x0_0_0_0_write_data: 32, x0_0_0_0_write_en: 1, x0_0_0_0_clk: 1, x10_0_0_0_addr0: 4, x10_0_0_0_addr1: 4, x10_0_0_0_addr2: 5, x10_0_0_0_addr3: 7, x10_0_0_0_write_data: 32, x10_0_0_0_write_en: 1, x10_0_0_0_clk: 1) {
+component sqrt0(go: 1, clk: 1, x0_0_0_0_read_data: 32, x0_0_0_0_done: 1, x10_0_0_0_read_data: 32, x10_0_0_0_done: 1) -> (done: 1, x0_0_0_0_addr0: 4, x0_0_0_0_addr1: 4, x0_0_0_0_addr2: 5, x0_0_0_0_addr3: 7, x0_0_0_0_write_data: 32, x0_0_0_0_write_en: 1, x0_0_0_0_clk: 1, x10_0_0_0_addr0: 4, x10_0_0_0_addr1: 4, x10_0_0_0_addr2: 5, x10_0_0_0_addr3: 7, x10_0_0_0_write_data: 32, x10_0_0_0_write_en: 1, x10_0_0_0_clk: 1) {
   cells {
     add0 = prim std_add(7);
     add1 = prim std_add(5);
@@ -156,25 +156,29 @@ component main () -> () {
   cells {
     x1 = prim std_mem_d4(32, 8, 8, 16, 64, 4, 4, 5, 7);
     x = prim std_mem_d4(32, 8, 8, 16, 64, 4, 4, 5, 7);
-    sqrt0 = sqrt;
+    comp_sqrt0 = sqrt0;
   }
   wires {
-    group run_sqrt {
-      x.addr0 = sqrt0.x0_0_0_0_addr0;
-      sqrt0.x0_0_0_0_read_data = x.read_data;
-      x.addr1 = sqrt0.x0_0_0_0_addr1;
-      x.addr2 = sqrt0.x0_0_0_0_addr2;
-      x1.addr0 = sqrt0.x10_0_0_0_addr0;
-      x1.write_data = sqrt0.x10_0_0_0_write_data;
-      x1.write_en = sqrt0.x10_0_0_0_write_en;
-      sqrt0.x10_0_0_0_done = x1.done;
-      sqrt0.go = 1'd1;
-      run_sqrt[done] = sqrt0.done ? 1'd1;
+    group run_sqrt0 {
+      x.addr0 = comp_sqrt0.x0_0_0_0_addr0;
+      comp_sqrt0.x0_0_0_0_read_data = x.read_data;
+      x.addr1 = comp_sqrt0.x0_0_0_0_addr1;
+      x.addr2 = comp_sqrt0.x0_0_0_0_addr2;
+      x.addr3 = comp_sqrt0.x0_0_0_0_addr3;
+      x1.addr0 = comp_sqrt0.x10_0_0_0_addr0;
+      x1.addr1 = comp_sqrt0.x10_0_0_0_addr1;
+      x1.addr2 = comp_sqrt0.x10_0_0_0_addr2;
+      x1.addr3 = comp_sqrt0.x10_0_0_0_addr3;
+      x1.write_data = comp_sqrt0.x10_0_0_0_write_data;
+      x1.write_en = comp_sqrt0.x10_0_0_0_write_en;
+      comp_sqrt0.x10_0_0_0_done = x1.done;
+      comp_sqrt0.go = 1'd1;
+      run_sqrt0[done] = comp_sqrt0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_sqrt;
+      run_sqrt0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/sub.expect b/frontends/relay-futil/tests/sub.expect
index c74af4fb2e..7b3f88385d 100644
--- a/frontends/relay-futil/tests/sub.expect
+++ b/frontends/relay-futil/tests/sub.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component subtract(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, y0_read_data: 32, y0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, x0_addr0: 1, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, y0_addr0: 1, y0_write_data: 32, y0_write_en: 1, y0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
+component subtract0(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, y0_read_data: 32, y0_done: 1, z0_read_data: 32, z0_done: 1) -> (done: 1, x0_addr0: 1, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, y0_addr0: 1, y0_write_data: 32, y0_write_en: 1, y0_clk: 1, z0_addr0: 1, z0_write_data: 32, z0_write_en: 1, z0_clk: 1) {
   cells {
     add0 = prim std_add(1);
     const0 = prim std_const(1, 0);
@@ -74,25 +74,25 @@ component main () -> () {
     z = prim std_mem_d1(32, 1, 1);
     x = prim std_mem_d1(32, 1, 1);
     y = prim std_mem_d1(32, 1, 1);
-    subtract0 = subtract;
+    comp_subtract0 = subtract0;
   }
   wires {
-    group run_subtract {
-      x.addr0 = subtract0.x0_addr0;
-      subtract0.x0_read_data = x.read_data;
-      y.addr0 = subtract0.y0_addr0;
-      subtract0.y0_read_data = y.read_data;
-      z.addr0 = subtract0.z0_addr0;
-      z.write_data = subtract0.z0_write_data;
-      z.write_en = subtract0.z0_write_en;
-      subtract0.z0_done = z.done;
-      subtract0.go = 1'd1;
-      run_subtract[done] = subtract0.done ? 1'd1;
+    group run_subtract0 {
+      x.addr0 = comp_subtract0.x0_addr0;
+      comp_subtract0.x0_read_data = x.read_data;
+      y.addr0 = comp_subtract0.y0_addr0;
+      comp_subtract0.y0_read_data = y.read_data;
+      z.addr0 = comp_subtract0.z0_addr0;
+      z.write_data = comp_subtract0.z0_write_data;
+      z.write_en = comp_subtract0.z0_write_en;
+      comp_subtract0.z0_done = z.done;
+      comp_subtract0.go = 1'd1;
+      run_subtract0[done] = comp_subtract0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_subtract;
+      run_subtract0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/tensor1d_mult.expect b/frontends/relay-futil/tests/tensor1d_mult.expect
index dac0e76d85..a2b7d5d802 100644
--- a/frontends/relay-futil/tests/tensor1d_mult.expect
+++ b/frontends/relay-futil/tests/tensor1d_mult.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component multiply(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, x10_read_data: 32, x10_done: 1, y0_read_data: 32, y0_done: 1) -> (done: 1, x0_addr0: 3, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, x10_addr0: 3, x10_write_data: 32, x10_write_en: 1, x10_clk: 1, y0_addr0: 3, y0_write_data: 32, y0_write_en: 1, y0_clk: 1) {
+component multiply0(go: 1, clk: 1, x0_read_data: 32, x0_done: 1, x10_read_data: 32, x10_done: 1, y0_read_data: 32, y0_done: 1) -> (done: 1, x0_addr0: 3, x0_write_data: 32, x0_write_en: 1, x0_clk: 1, x10_addr0: 3, x10_write_data: 32, x10_write_en: 1, x10_clk: 1, y0_addr0: 3, y0_write_data: 32, y0_write_en: 1, y0_clk: 1) {
   cells {
     add0 = prim std_add(3);
     bin_read0_0 = prim std_reg(32);
@@ -82,25 +82,25 @@ component main () -> () {
     x1 = prim std_mem_d1(32, 4, 3);
     x = prim std_mem_d1(32, 4, 3);
     y = prim std_mem_d1(32, 4, 3);
-    multiply0 = multiply;
+    comp_multiply0 = multiply0;
   }
   wires {
-    group run_multiply {
-      x.addr0 = multiply0.x0_addr0;
-      multiply0.x0_read_data = x.read_data;
-      y.addr0 = multiply0.y0_addr0;
-      multiply0.y0_read_data = y.read_data;
-      x1.addr0 = multiply0.x10_addr0;
-      x1.write_data = multiply0.x10_write_data;
-      x1.write_en = multiply0.x10_write_en;
-      multiply0.x10_done = x1.done;
-      multiply0.go = 1'd1;
-      run_multiply[done] = multiply0.done ? 1'd1;
+    group run_multiply0 {
+      x.addr0 = comp_multiply0.x0_addr0;
+      comp_multiply0.x0_read_data = x.read_data;
+      y.addr0 = comp_multiply0.y0_addr0;
+      comp_multiply0.y0_read_data = y.read_data;
+      x1.addr0 = comp_multiply0.x10_addr0;
+      x1.write_data = comp_multiply0.x10_write_data;
+      x1.write_en = comp_multiply0.x10_write_en;
+      comp_multiply0.x10_done = x1.done;
+      comp_multiply0.go = 1'd1;
+      run_multiply0[done] = comp_multiply0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_multiply;
+      run_multiply0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/tensor2d_add.expect b/frontends/relay-futil/tests/tensor2d_add.expect
index d289badb27..c6a409b5af 100644
--- a/frontends/relay-futil/tests/tensor2d_add.expect
+++ b/frontends/relay-futil/tests/tensor2d_add.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component add(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1, y0_0_read_data: 32, y0_0_done: 1) -> (done: 1, x0_0_addr0: 2, x0_0_addr1: 3, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 2, x10_0_addr1: 3, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1, y0_0_addr0: 2, y0_0_addr1: 3, y0_0_write_data: 32, y0_0_write_en: 1, y0_0_clk: 1) {
+component add0(go: 1, clk: 1, x0_0_read_data: 32, x0_0_done: 1, x10_0_read_data: 32, x10_0_done: 1, y0_0_read_data: 32, y0_0_done: 1) -> (done: 1, x0_0_addr0: 2, x0_0_addr1: 3, x0_0_write_data: 32, x0_0_write_en: 1, x0_0_clk: 1, x10_0_addr0: 2, x10_0_addr1: 3, x10_0_write_data: 32, x10_0_write_en: 1, x10_0_clk: 1, y0_0_addr0: 2, y0_0_addr1: 3, y0_0_write_data: 32, y0_0_write_en: 1, y0_0_clk: 1) {
   cells {
     add0 = prim std_add(32);
     add1 = prim std_add(3);
@@ -106,28 +106,28 @@ component main () -> () {
     x1 = prim std_mem_d2(32, 2, 4, 2, 3);
     x = prim std_mem_d2(32, 2, 4, 2, 3);
     y = prim std_mem_d2(32, 2, 4, 2, 3);
-    add0 = add;
+    comp_add0 = add0;
   }
   wires {
-    group run_add {
-      x.addr0 = add0.x0_0_addr0;
-      add0.x0_0_read_data = x.read_data;
-      x.addr1 = add0.x0_0_addr1;
-      y.addr0 = add0.y0_0_addr0;
-      add0.y0_0_read_data = y.read_data;
-      y.addr1 = add0.y0_0_addr1;
-      x1.addr0 = add0.x10_0_addr0;
-      x1.addr1 = add0.x10_0_addr1;
-      x1.write_data = add0.x10_0_write_data;
-      x1.write_en = add0.x10_0_write_en;
-      add0.x10_0_done = x1.done;
-      add0.go = 1'd1;
-      run_add[done] = add0.done ? 1'd1;
+    group run_add0 {
+      x.addr0 = comp_add0.x0_0_addr0;
+      comp_add0.x0_0_read_data = x.read_data;
+      x.addr1 = comp_add0.x0_0_addr1;
+      y.addr0 = comp_add0.y0_0_addr0;
+      comp_add0.y0_0_read_data = y.read_data;
+      y.addr1 = comp_add0.y0_0_addr1;
+      x1.addr0 = comp_add0.x10_0_addr0;
+      x1.addr1 = comp_add0.x10_0_addr1;
+      x1.write_data = comp_add0.x10_0_write_data;
+      x1.write_en = comp_add0.x10_0_write_en;
+      comp_add0.x10_0_done = x1.done;
+      comp_add0.go = 1'd1;
+      run_add0[done] = comp_add0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_add;
+      run_add0;
     }
   }
 }
diff --git a/frontends/relay-futil/tests/tensor3d_divide.expect b/frontends/relay-futil/tests/tensor3d_divide.expect
index 5058296dd8..0476ac946a 100644
--- a/frontends/relay-futil/tests/tensor3d_divide.expect
+++ b/frontends/relay-futil/tests/tensor3d_divide.expect
@@ -1,6 +1,6 @@
 import "primitives/std.lib";
 
-component divide(go: 1, clk: 1, x0_0_0_read_data: 32, x0_0_0_done: 1, x10_0_0_read_data: 32, x10_0_0_done: 1, y0_0_0_read_data: 32, y0_0_0_done: 1) -> (done: 1, x0_0_0_addr0: 3, x0_0_0_addr1: 3, x0_0_0_addr2: 3, x0_0_0_write_data: 32, x0_0_0_write_en: 1, x0_0_0_clk: 1, x10_0_0_addr0: 3, x10_0_0_addr1: 3, x10_0_0_addr2: 3, x10_0_0_write_data: 32, x10_0_0_write_en: 1, x10_0_0_clk: 1, y0_0_0_addr0: 3, y0_0_0_addr1: 3, y0_0_0_addr2: 3, y0_0_0_write_data: 32, y0_0_0_write_en: 1, y0_0_0_clk: 1) {
+component divide0(go: 1, clk: 1, x0_0_0_read_data: 32, x0_0_0_done: 1, x10_0_0_read_data: 32, x10_0_0_done: 1, y0_0_0_read_data: 32, y0_0_0_done: 1) -> (done: 1, x0_0_0_addr0: 3, x0_0_0_addr1: 3, x0_0_0_addr2: 3, x0_0_0_write_data: 32, x0_0_0_write_en: 1, x0_0_0_clk: 1, x10_0_0_addr0: 3, x10_0_0_addr1: 3, x10_0_0_addr2: 3, x10_0_0_write_data: 32, x10_0_0_write_en: 1, x10_0_0_clk: 1, y0_0_0_addr0: 3, y0_0_0_addr1: 3, y0_0_0_addr2: 3, y0_0_0_write_data: 32, y0_0_0_write_en: 1, y0_0_0_clk: 1) {
   cells {
     add0 = prim std_add(3);
     add1 = prim std_add(3);
@@ -56,7 +56,7 @@ component divide(go: 1, clk: 1, x0_0_0_read_data: 32, x0_0_0_done: 1, x10_0_0_re
       k0.write_en = 1'd1;
       let2[done] = k0.done;
     }
-    group let3<> {
+    group let3 {
       bin_read0_0.in = div_pipe0.out;
       bin_read0_0.write_en = div_pipe0.done;
       let3[done] = bin_read0_0.done;
@@ -146,31 +146,31 @@ component main () -> () {
     x1 = prim std_mem_d3(32, 4, 5, 6, 3, 3, 3);
     x = prim std_mem_d3(32, 4, 5, 6, 3, 3, 3);
     y = prim std_mem_d3(32, 4, 5, 6, 3, 3, 3);
-    divide0 = divide;
+    comp_divide0 = divide0;
   }
   wires {
-    group run_divide {
-      x.addr0 = divide0.x0_0_0_addr0;
-      divide0.x0_0_0_read_data = x.read_data;
-      x.addr1 = divide0.x0_0_0_addr1;
-      x.addr2 = divide0.x0_0_0_addr2;
-      y.addr0 = divide0.y0_0_0_addr0;
-      divide0.y0_0_0_read_data = y.read_data;
-      y.addr1 = divide0.y0_0_0_addr1;
-      y.addr2 = divide0.y0_0_0_addr2;
-      x1.addr0 = divide0.x10_0_0_addr0;
-      x1.addr1 = divide0.x10_0_0_addr1;
-      x1.addr2 = divide0.x10_0_0_addr2;
-      x1.write_data = divide0.x10_0_0_write_data;
-      x1.write_en = divide0.x10_0_0_write_en;
-      divide0.x10_0_0_done = x1.done;
-      divide0.go = 1'd1;
-      run_divide[done] = divide0.done ? 1'd1;
+    group run_divide0 {
+      x.addr0 = comp_divide0.x0_0_0_addr0;
+      comp_divide0.x0_0_0_read_data = x.read_data;
+      x.addr1 = comp_divide0.x0_0_0_addr1;
+      x.addr2 = comp_divide0.x0_0_0_addr2;
+      y.addr0 = comp_divide0.y0_0_0_addr0;
+      comp_divide0.y0_0_0_read_data = y.read_data;
+      y.addr1 = comp_divide0.y0_0_0_addr1;
+      y.addr2 = comp_divide0.y0_0_0_addr2;
+      x1.addr0 = comp_divide0.x10_0_0_addr0;
+      x1.addr1 = comp_divide0.x10_0_0_addr1;
+      x1.addr2 = comp_divide0.x10_0_0_addr2;
+      x1.write_data = comp_divide0.x10_0_0_write_data;
+      x1.write_en = comp_divide0.x10_0_0_write_en;
+      comp_divide0.x10_0_0_done = x1.done;
+      comp_divide0.go = 1'd1;
+      run_divide0[done] = comp_divide0.done ? 1'd1;
     }
   }
   control {
     seq {
-      run_divide;
+      run_divide0;
     }
   }
 }
diff --git a/frontends/relay-futil/utilities.py b/frontends/relay-futil/utilities.py
index aab5fd2a72..58fbb1ebc1 100644
--- a/frontends/relay-futil/utilities.py
+++ b/frontends/relay-futil/utilities.py
@@ -6,6 +6,11 @@
 NumDimensionsToPrimitive = {1: PrimitiveType.Memory1D, 2: PrimitiveType.Memory2D,
                             3: PrimitiveType.Memory3D, 4: PrimitiveType.Memory4D}
 
+# Mapping between primitive type and associated Dahlia name extension.
+# E.g. A 2D memory primitive named `A` will be lowered to `A0_0`.
+DahliaNameExtension = {PrimitiveType.Memory1D: '0', PrimitiveType.Memory2D: '0_0',
+                       PrimitiveType.Memory3D: '0_0_0', PrimitiveType.Memory4D: '0_0_0_0'}
+
 
 def flatten(l):
     '''
@@ -26,8 +31,9 @@ def get_dahlia_data_type(relay_type):
     Gets the Dahlia data type from the given Relay type.
     NOTE: Currently, Dahlia does not support signed types for arrays.
     '''
-    if 'int' in relay_type: return 'ubit'
-    if 'float' in relay_type: return 'ufix'
+    dtype = relay_type.dtype
+    if 'int' in dtype: return 'ubit'
+    if 'float' in dtype: return 'ufix'
     assert False, f'{relay_type} is not supported.'
 
 
@@ -37,12 +43,12 @@ def get_bitwidth(relay_type):
     If the relay_type is floating point of size N, returns a fixed point of size <N, N/2>.
     This lowers to a fixed point cell with `int_width` of size N/2, and a `fract_width` of size N/2.
     '''
-    type = str(relay_type)
-    length = len(type)
-    if 'int' in type: return type[3:length]
-    if 'float' in type:
-        width = int(type[5:length])
-        return f'{width}, {int(width / 2)}'
+    dtype = relay_type.dtype
+    length = len(dtype)
+    if 'int' in dtype: return dtype[3:length]
+    if 'float' in dtype:
+        width = dtype[5:length]
+        return f'{width}, {int(width) // 2}'
     assert False, f'{relay_type} is not supported.'
 
 
@@ -55,64 +61,62 @@ def get_memory_parameters(type):
 
     We then parse this to determine the corresponding FuTIL and Dahlia types.
     '''
-    t = str(type)
-    data_type = get_dahlia_data_type(t)
-    if t[0:3] == 'int' or t[0:5] == 'float':
+    typ = str(type)
+    data_type = get_dahlia_data_type(type)
+
+    if typ[0:3] == 'int' or typ[0:5] == 'float':
+        # Currently, we are treating scalar values as 1D Memory primitives.
         return [get_bitwidth(type), 1, 1], PrimitiveType.Memory1D, data_type
-    assert t[0:6] == 'Tensor', f'{type} is not currently supported.'
-    string_type = t[t.find(")") + 3:t.find("]")]
-    string_dimensions = t[t.find("(") + 1:t.find(")")]
+    assert typ[0:6] == 'Tensor', f'{type} is not currently supported.'
 
-    tensor_dimensions = list(map(int, string_dimensions.split(',')))
-    data, num_dimensions = [get_bitwidth(string_type)], len(tensor_dimensions)
+    tensor_dimensions = type.concrete_shape
+    data, num_dimensions = [get_bitwidth(type)], len(tensor_dimensions)
     assert num_dimensions in NumDimensionsToPrimitive, f'{num_dimensions} dimensions is not supported.'
     for dimension in tensor_dimensions: data.append(dimension)  # Size.
     for dimension in tensor_dimensions: data.append(int(math.log2(dimension) + 1))  # Index size.
     return data, NumDimensionsToPrimitive[num_dimensions], data_type
 
 
-def build_main_controls(c: FComponent):
+def build_main_controls(component: FComponent):
     '''
-    Builds the wires and control for the `main` component.
-    This is done by creating a group run_* with its respective
-    wiring for each Dahlia declaration, and adding it to the
-    control.
+    Builds the wires and control for the `main` component. For each cell that is a Relay function call,
+    appends a `run_*` group to `component.wires`, then sets `component.controls` to a `Seq` of all groups.
     '''
-    dahlia_declarations = []
-    for cell in reversed(c.cells):
-        if not cell.is_dahlia_declaration(): continue
-        dahlia_declarations.append(cell.dahlia_declaration)
-
-    for declaration in dahlia_declarations:
-        inputs = declaration.inputs
+    for cell in reversed(component.cells.values()):
+        if not cell.is_relay_function(): continue
+        function = cell.relay_function
+        inputs, output = function.inputs, function.output
         wires = []
-        group_name = f'run_{declaration.component_name}'
+        group_name = f'run_{function.component_name}'
         for input in flatten(inputs):
             prim = input.primitive
-            wires.append(FWire(f'{prim.name}.addr0', f'{declaration.decl_name}.{input.dahlia_name}_addr0'))
+            wires.append(FWire(f'{prim.name}.addr0', f'{function.name}.{input.dahlia_name}_addr0'))
             wires.append(
-                FWire(f'{declaration.decl_name}.{input.dahlia_name}_read_data', f'{prim.name}.read_data'))
+                FWire(f'{function.name}.{input.dahlia_name}_read_data', f'{prim.name}.read_data'))
             if prim.type == PrimitiveType.Memory1D: continue
-            wires.append(FWire(f'{prim.name}.addr1', f'{declaration.decl_name}.{input.dahlia_name}_addr1'))
+            wires.append(FWire(f'{prim.name}.addr1', f'{function.name}.{input.dahlia_name}_addr1'))
             if prim.type == PrimitiveType.Memory2D: continue
-            wires.append(FWire(f'{prim.name}.addr2', f'{declaration.decl_name}.{input.dahlia_name}_addr2'))
-
-        output = declaration.output
-        wires.append(FWire(f'{output.primitive.name}.addr0', f'{declaration.decl_name}.{output.dahlia_name}_addr0'))
-        if output.primitive.type == PrimitiveType.Memory2D or output.primitive.type == PrimitiveType.Memory3D:
-            wires.append(FWire(f'{output.primitive.name}.addr1', f'{declaration.decl_name}.{output.dahlia_name}_addr1'))
-        if output.primitive.type == PrimitiveType.Memory3D:
-            wires.append(FWire(f'{output.primitive.name}.addr2', f'{declaration.decl_name}.{output.dahlia_name}_addr2'))
-
-        wires.append(
-            FWire(f'{output.primitive.name}.write_data', f'{declaration.decl_name}.{output.dahlia_name}_write_data'))
-        wires.append(
-            FWire(f'{output.primitive.name}.write_en', f'{declaration.decl_name}.{output.dahlia_name}_write_en'))
-        wires.append(FWire(f'{declaration.decl_name}.{output.dahlia_name}_done', f'{output.primitive.name}.done'))
-        wires.append(FWire(f'{declaration.decl_name}.go', "1'd1"))
-        wires.append(FWire(f'{group_name}[done]', f"{declaration.decl_name}.done ? 1'd1"))
-        c.wires.append(FConnection(group=FGroup(name=group_name, wires=wires, attributes=[])))
-
-    # Ensures that only group names make it into the controls of a component.
-    connections = list(filter(lambda w: w.is_group(), c.wires))
-    c.controls = [Seq(stmts=list(map(lambda w: w.group.name, connections)))]
+            wires.append(FWire(f'{prim.name}.addr2', f'{function.name}.{input.dahlia_name}_addr2'))
+            if prim.type == PrimitiveType.Memory3D: continue
+            wires.append(FWire(f'{prim.name}.addr3', f'{function.name}.{input.dahlia_name}_addr3'))
+
+        output_type, output_name = output.primitive.type, output.primitive.name
+        # A memory exposes one address port per dimension. Any type other than a
+        # 1D/2D/3D memory falls through to all four ports, exactly as before.
+        address_ports = {PrimitiveType.Memory1D: 1, PrimitiveType.Memory2D: 2,
+                         PrimitiveType.Memory3D: 3}.get(output_type, 4)
+        for i in range(address_ports):
+            wires.append(
+                FWire(f'{output_name}.addr{i}', f'{function.name}.{output.dahlia_name}_addr{i}')
+            )
+
+        wires.append(FWire(f'{output_name}.write_data', f'{function.name}.{output.dahlia_name}_write_data'))
+        wires.append(FWire(f'{output_name}.write_en', f'{function.name}.{output.dahlia_name}_write_en'))
+        wires.append(FWire(f'{function.name}.{output.dahlia_name}_done', f'{output_name}.done'))
+        wires.append(FWire(f'{function.name}.go', "1'd1"))
+        wires.append(FWire(f'{group_name}[done]', f"{function.name}.done ? 1'd1"))
+        component.wires.append(FConnection(group=FGroup(name=group_name, wires=wires, attributes=[])))
+
+    # Ensures that only group names make it into the controls of a FuTIL component.
+    connections = list(filter(lambda w: w.is_group(), component.wires))
+    component.controls = [Seq(stmts=list(map(lambda w: w.group.name, connections)))]
diff --git a/fud/fud/main.py b/fud/fud/main.py
index b82e40ef6f..34d7d93ef1 100644
--- a/fud/fud/main.py
+++ b/fud/fud/main.py
@@ -44,6 +44,10 @@ def register_stages(registry, cfg):
             cfg, 'futil-noinline', '-b futil -d hole-inliner',
             'Compile FuTIL to FuTIL to remove all control and inline groups'
         ))
+    registry.register(
+        futil.FutilStage(cfg, 'futil-externalize', '-b futil -p externalize',
+                         'Compile FuTIL to FuTIL to externalize all external memory primitives'
+        ))
 
     # Verilator
     registry.register(
diff --git a/primitives/std.lib b/primitives/std.lib
index ebac340226..8cc38c3c24 100644
--- a/primitives/std.lib
+++ b/primitives/std.lib
@@ -709,6 +709,28 @@ primitive std_le<"share"=1>[width](left: width, right: width) -> (out: 1) {
   }
 }
 
+primitive std_exp(exponent: 32, go: 1, clk: 1) -> (out: 32, done: 1) {
+   verilog {
+     module std_exp
+       (input  logic [31:0]  exponent,
+        input  logic        go,
+        input  logic        clk,
+        output logic [31:0] out,
+        output logic        done);
+        always_ff @(posedge clk) begin  // out and done are registered on the rising clock edge
+          if (go) begin
+            /* verilator lint_off REALCVT */
+            out <= 2.718281 ** exponent;  // nonblocking: clocked outputs must not race with readers
+            done <= 1;  // result is valid while done is high
+          end else begin
+            out <= 0;  // outputs are cleared whenever go is low
+            done <= 0;
+          end
+        end
+     endmodule
+  }
+}
+
 primitive std_sqrt(in: 32, go: 1, clk: 1) -> (out: 32, done: 1) {
   verilog {
     module std_sqrt
@@ -822,7 +844,7 @@ primitive fixed_p_std_const[width, int_width, fract_width, value1, value2] () ->
         module fixed_p_std_const
             #(parameter width=32,
             parameter int_width = 8,
-            parameter fract_width= 24,
+            parameter fract_width = 24,
             parameter value1 = 0,
             parameter value2 = 0)
 
@@ -1490,4 +1512,4 @@ primitive sfixed_p_std_add_dbit[width1, width2 , int_width1, fract_width1, int_w
         assign out = {whole_int, whole_fract};
       endmodule
     }
-}
+}
\ No newline at end of file