From fc08f821b9e94aadff80616fa97d76bef5f16b1b Mon Sep 17 00:00:00 2001 From: erman-gurses Date: Mon, 21 Oct 2024 00:36:02 -0500 Subject: [PATCH 01/16] Add integration for the conv2d (CPU) test suite. Signed-off-by: erman-gurses --- linalg_ops/CMakeLists.txt | 21 + linalg_ops/convolution/CMakeLists.txt | 112 +++ .../convolution/generate_e2e_conv2d_tests.py | 693 ++++++++++++++++ .../convolution/generate_test_mlir_files.sh | 105 +++ .../f16_f16_f16/conv2d_f16_f16_f16_large.mlir | 10 + .../conv2d_f16_f16_f16_large_calls.mlir | 112 +++ .../conv2d_f16_f16_f16_medium.mlir | 15 + .../conv2d_f16_f16_f16_medium_calls.mlir | 163 ++++ .../f16_f16_f16/conv2d_f16_f16_f16_small.mlir | 15 + .../conv2d_f16_f16_f16_small_calls.mlir | 163 ++++ .../conv2d_winograd_f16_f16_f16_large.mlir | 10 + ...nv2d_winograd_f16_f16_f16_large_calls.mlir | 112 +++ .../conv2d_winograd_f16_f16_f16_medium.mlir | 15 + ...v2d_winograd_f16_f16_f16_medium_calls.mlir | 163 ++++ .../conv2d_winograd_f16_f16_f16_small.mlir | 15 + ...nv2d_winograd_f16_f16_f16_small_calls.mlir | 163 ++++ .../f32_f32_f32/conv2d_f32_f32_f32_large.mlir | 10 + .../conv2d_f32_f32_f32_large_calls.mlir | 112 +++ .../conv2d_f32_f32_f32_medium.mlir | 15 + .../conv2d_f32_f32_f32_medium_calls.mlir | 163 ++++ .../f32_f32_f32/conv2d_f32_f32_f32_small.mlir | 15 + .../conv2d_f32_f32_f32_small_calls.mlir | 163 ++++ .../conv2d_winograd_f32_f32_f32_large.mlir | 10 + ...nv2d_winograd_f32_f32_f32_large_calls.mlir | 112 +++ .../conv2d_winograd_f32_f32_f32_medium.mlir | 15 + ...v2d_winograd_f32_f32_f32_medium_calls.mlir | 163 ++++ .../conv2d_winograd_f32_f32_f32_small.mlir | 15 + ...nv2d_winograd_f32_f32_f32_small_calls.mlir | 163 ++++ linalg_ops/iree-e2e-conv2d-test.cc | 775 ++++++++++++++++++ linalg_ops/test_utils.c | 6 +- linalg_ops/test_utils.h | 1 + 31 files changed, 3621 insertions(+), 4 deletions(-) create mode 100644 linalg_ops/convolution/CMakeLists.txt create mode 100644 linalg_ops/convolution/generate_e2e_conv2d_tests.py create mode 100755 linalg_ops/convolution/generate_test_mlir_files.sh create mode 100644 linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_large.mlir create mode 100644 linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_large_calls.mlir create mode 100644 linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_medium.mlir create mode 100644 linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_medium_calls.mlir create mode 100644 linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_small.mlir create mode 100644 linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_small_calls.mlir create mode 100644 linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_large.mlir create mode 100644 linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_large_calls.mlir create mode 100644 linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_medium.mlir create mode 100644 linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_medium_calls.mlir create mode 100644 linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_small.mlir create mode 100644 linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_small_calls.mlir create mode 100644 linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_large.mlir create mode 100644 linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_large_calls.mlir create mode 100644 
linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_medium.mlir create mode 100644 linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_medium_calls.mlir create mode 100644 linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_small.mlir create mode 100644 linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_small_calls.mlir create mode 100644 linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_large.mlir create mode 100644 linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_large_calls.mlir create mode 100644 linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_medium.mlir create mode 100644 linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_medium_calls.mlir create mode 100644 linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_small.mlir create mode 100644 linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_small_calls.mlir create mode 100644 linalg_ops/iree-e2e-conv2d-test.cc diff --git a/linalg_ops/CMakeLists.txt b/linalg_ops/CMakeLists.txt index 08cf318..c6a14aa 100644 --- a/linalg_ops/CMakeLists.txt +++ b/linalg_ops/CMakeLists.txt @@ -114,6 +114,26 @@ iree_cc_binary( iree::vm::cc ) +iree_cc_binary( + NAME + iree-e2e-conv2d-test + SRCS + "iree-e2e-conv2d-test.cc" + DEPS + ::test_utils + iree::base + iree::base::internal + iree::base::internal::cpu + iree::base::internal::flags + iree::base::internal::path + iree::hal + iree::modules::hal + iree::tooling::context_util + iree::tooling::device_util + iree::vm + iree::vm::cc +) + #------------------------------------------------------------------------------- # Tests #------------------------------------------------------------------------------- @@ -123,3 +143,4 @@ include(iree_test_suites_native_test) include(iree_test_suites_runner_test) add_subdirectory(matmul) +add_subdirectory(convolution) \ No newline at end of file diff --git a/linalg_ops/convolution/CMakeLists.txt b/linalg_ops/convolution/CMakeLists.txt new file mode 100644 index 0000000..e9a57b5 --- /dev/null +++ b/linalg_ops/convolution/CMakeLists.txt @@ -0,0 +1,112 @@ +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# TODO(scotttodd): add filtering here, in the helper functions, or in ctest to +# choose which tests to compile and run + +set(_SIZES) +list(APPEND _SIZES "large") +list(APPEND _SIZES "medium") +list(APPEND _SIZES "small") + +############################################################################### +# +# CPU - llvm-cpu on local-task, default flags. 
+# +############################################################################### + + +set(_DTYPES) +list(APPEND _DTYPES "f16_f16_f16") +list(APPEND _DTYPES "f32_f32_f32") + +foreach(_DTYPE IN LISTS _DTYPES) + foreach(_SIZE IN LISTS _SIZES) + iree_test_suites_runner_test( + NAME + conv2d_${_DTYPE}_${_SIZE} + TESTS_SRC + "generated/${_DTYPE}/conv2d_${_DTYPE}_${_SIZE}.mlir" + CALLS_SRC + "generated/${_DTYPE}/conv2d_${_DTYPE}_${_SIZE}_calls.mlir" + TEST_RUNNER + iree-test-suites_iree-e2e-conv2d-test + TARGET_BACKEND + "llvm-cpu" + DRIVER + "local-task" + COMPILER_FLAGS + RUNNER_FLAGS + LABELS + "hostonly" + "local" + ) + endforeach() +endforeach() + +foreach(_DTYPE IN LISTS _DTYPES) + foreach(_SIZE IN LISTS _SIZES) + iree_test_suites_runner_test( + NAME + conv2d_winograd_${_DTYPE}_${_SIZE} + TESTS_SRC + "generated/${_DTYPE}/conv2d_${_DTYPE}_${_SIZE}.mlir" + CALLS_SRC + "generated/${_DTYPE}/conv2d_${_DTYPE}_${_SIZE}_calls.mlir" + TEST_RUNNER + iree-test-suites_iree-e2e-conv2d-test + TARGET_BACKEND + "llvm-cpu" + DRIVER + "local-task" + COMPILER_FLAGS + "--iree-preprocessing-pass-pipeline=builtin.module\(func.func\(iree-linalg-ext-convert-conv2d-to-winograd{replace-all-convs=true}\)\)" + RUNNER_FLAGS + LABELS + "hostonly" + "local" + TARGET_CPU_FEATURES_VARIANTS + "default" + ) + endforeach() +endforeach() + +############################################################################### +# +# GPU - ROCm/HIP, default flags. +# +############################################################################### + +# if(IREE_HIP_TEST_TARGET_CHIP) + +# set(_DTYPES) +# list(APPEND _DTYPES "f16_f16_f16") +# list(APPEND _DTYPES "f32_f32_f32") + +# foreach(_DTYPE IN LISTS _DTYPES) +# foreach(_SIZE IN LISTS _SIZES) +# iree_test_suites_runner_test( +# NAME +# matmul_hip_${_DTYPE}_${_SIZE} +# TESTS_SRC +# "generated/${_DTYPE}/matmul_${_DTYPE}_${_SIZE}.mlir" +# CALLS_SRC +# "generated/${_DTYPE}/matmul_${_DTYPE}_${_SIZE}_calls.mlir" +# TEST_RUNNER +# iree-test-suites_iree-e2e-matmul-test +# TARGET_BACKEND +# "rocm" +# DRIVER +# "hip" +# COMPILER_FLAGS +# "--iree-hip-target=${IREE_HIP_TEST_TARGET_CHIP}" +# RUNNER_FLAGS +# LABELS +# ) +# endforeach() +# endforeach() + +# endif() \ No newline at end of file diff --git a/linalg_ops/convolution/generate_e2e_conv2d_tests.py b/linalg_ops/convolution/generate_e2e_conv2d_tests.py new file mode 100644 index 0000000..a492eb4 --- /dev/null +++ b/linalg_ops/convolution/generate_e2e_conv2d_tests.py @@ -0,0 +1,693 @@ +#!/usr/bin/env python3 +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +"""Generator for e2e conv2d tests. +""" + +from typing import Optional +import argparse +import enum +import dataclasses +import typing +import math +import itertools +import re + +# Data type of kernel entries. The string values must match MLIR data types. +@enum.unique +class KernelElemTypeId(enum.Enum): + NONE = "" + I8 = "i8" + F32 = "f32" + F16 = "f16" + + +# Data type of input entries. The string values must match MLIR data types. +@enum.unique +class InputElemTypeId(enum.Enum): + NONE = "" + I8 = "i8" + F32 = "f32" + F16 = "f16" + + +# Data type of accumulator entries. The string values must match MLIR data types. +@enum.unique +class AccElemTypeId(enum.Enum): + NONE = "" + I32 = "i32" + F32 = "f32" + F16 = "f16" + +# Enumerates the collections of shapes that we can generate tests for.
+# The values are the accepted values for the --shapes= flag. +@enum.unique +class ShapesId(enum.Enum): + SMALL = "small" + MEDIUM = "medium" + LARGE = "large" + + +# Enumerates ways to construct MLIR tensor types. +# TODO: Enable dynamic dimensions once the tests start passing. +@enum.unique +class Dynamicity(enum.Enum): + DYNAMIC = "dynamic" # Use '?' everywhere. Example: tensor<?x?xf32>. + STATIC = "static" # Use fixed values everywhere. Example: tensor<4x6xf32>. + MIXED = "mixed" # Randomly mix '?' and values. Example: tensor<?x4xf32>. + + +# TODO: Add more input layouts as needed. The layout determines the dimension order of the input and kernel tensors. +@enum.unique +class InputLayout(enum.Enum): + NCHW = "nchw" + NHWC = "nhwc" + + +# TODO: Add more kernel layouts as needed. +@enum.unique +class KernelLayout(enum.Enum): + FCHW = "fchw" + HWCF = "hwcf" + + +# Describes the shape of a conv2d test case in the usual convention: +# the input is {n}x{c}x{h}x{w}, the kernel is {f}x{c}x{kh}x{kw}, the accumulator/result is +# {n}x{f}x{oh}x{ow}. +# The extra `accumulate` boolean tells whether the conv2d is accumulating into +# an existing accumulator (C += conv(A, B)) or just overwriting the result +# (C = conv(A, B)). +@dataclasses.dataclass +class TestShape: + n: int + c: int + h: int + w: int + kh: int + kw: int + f: int + accumulate: bool + + +# Attributes for the linalg.conv2d operation. +@dataclasses.dataclass +class ConvAttrs: + STRIDE: typing.Tuple[int, int] = (1, 1) + DILATION: typing.Tuple[int, int] = (1, 1) + + +# Returns the list of TestShape's to use for the collection of shapes +# identified by shapes_id. +def get_test_shapes(shapes_id: ShapesId): + # Notes: + # 1. Be conservative in adding more shapes, as that can increase both the + # build and execution latency of tests. The build latency is nearly the + # same for all shapes, while execution latency grows linearly with + # n*f*ow*oh*kh*kw. + + if shapes_id == ShapesId.SMALL: + return [ + TestShape(n=1, c=1, h=1, w=1, kh=1, kw=1, f=1, accumulate=True), + TestShape(n=1, c=1, h=16, w=16, kh=2, kw=2, f=1, accumulate=True), + TestShape(n=2, c=2, h=32, w=32, kh=3, kw=3, f=2, accumulate=True), + ] + if shapes_id == ShapesId.MEDIUM: + return [ + TestShape(n=2, c=2, h=32, w=32, kh=3, kw=3, f=2, accumulate=True), + TestShape(n=2, c=2, h=32, w=32, kh=3, kw=3, f=64, accumulate=True), + TestShape(n=2, c=32, h=32, w=32, kh=3, kw=3, f=64, accumulate=True), + ] + if shapes_id == ShapesId.LARGE: + return [ + TestShape(n=2, c=4, h=128, w=128, kh=3, kw=3, f=8, accumulate=True), + TestShape(n=2, c=3, h=128, w=128, kh=3, kw=3, f=12, accumulate=True), + ] + + raise ValueError(shapes_id) + + +# A shape dimension value, i.e. a size value that could appear in a MLIR type +# such as 'tensor<?x?xf32>'. None means a dynamic size, similar to '?' in MLIR. +@dataclasses.dataclass +class DimSize: + value: typing.Optional[int] + + +# Generates a compile-time MLIR size value, i.e. either a fixed positive integer +# or None (which maps to MLIR '?') depending on dynamicity. +def shape_dim(x: int, dynamicity: Dynamicity): + if dynamicity == Dynamicity.DYNAMIC: + return DimSize(None) + elif dynamicity == Dynamicity.STATIC: + return DimSize(x) + else: + raise ValueError(dynamicity) + + +# Stringification used for generating MLIR types, e.g. tensor<?x?xf32>. +def int_or_question_mark(s: DimSize): + return s.value or "?" + + +# Stringification used for generating alphanumeric identifiers, e.g. +# func.func @somefunction_DYNxDYNxf32, where we can't use "?" characters.
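+# (Note: with dynamicity=STATIC every DimSize carries its integer and
+# stringifies as e.g. "4" in both helpers; with DYNAMIC it carries None and
+# stringifies as "?" in types via int_or_question_mark above, and as "DYN" in
+# identifiers via int_or_DYN below.)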
+def int_or_DYN(s: DimSize): + return s.value or "DYN" + + +# Determines the shape of input and kernel tensors. +@dataclasses.dataclass +class TestInputTensorShapes: + n: DimSize + c: DimSize + h: DimSize + w: DimSize + kh: DimSize + kw: DimSize + f: DimSize + + +# Helper for generate_function. Generates TestInputTensorShapes, i.e. +# converts the runtime shape dimensions in TestShape, for the given dynamicity, into +# the set of shapes to be used in a test function's input tensors. +def generate_shapes(shape: TestShape, dynamicity: Dynamicity): + n = shape_dim(shape.n, dynamicity) + c = shape_dim(shape.c, dynamicity) + h = shape_dim(shape.h, dynamicity) + w = shape_dim(shape.w, dynamicity) + kh = shape_dim(shape.kh, dynamicity) + kw = shape_dim(shape.kw, dynamicity) + f = shape_dim(shape.f, dynamicity) + shapes = TestInputTensorShapes( + n=n, + c=c, + h=h, + w=w, + kh=kh, + kw=kw, + f=f, + ) + return shapes + + +# Helper to calculate the output spatial size based on the input size, kernel size, +# dilation and stride: floor((i - 1 - (k - 1) * d) / s) + 1. +def calc_out_shape(i_shape: int, k_shape: int, dilation_val: int, stride_val: int): + x = (k_shape - 1) * (dilation_val - 1) + x = i_shape - k_shape - x + return math.floor(x / stride_val) + 1 + + +# Helper to return input, kernel and output shapes based on the layouts, TestShape and ConvAttrs. +def get_tensor_shape( + shapes: TestShape, + kernel_layout: KernelLayout, + input_layout: InputLayout, + conv_attr: ConvAttrs, +): + n = shapes.n + c = shapes.c + h = shapes.h + w = shapes.w + kh = shapes.kh + kw = shapes.kw + f = shapes.f + + # Extract input dimensions + input_height, input_width = h, w + + # Extract kernel dimensions + kernel_height, kernel_width = kh, kw + + # Get the dilation and stride + dilation = conv_attr.DILATION + stride = conv_attr.STRIDE + + # Calculate output height. + oh = calc_out_shape(input_height, kernel_height, dilation[0], stride[0]) + # Calculate output width. + ow = calc_out_shape(input_width, kernel_width, dilation[1], stride[1]) + + input_tensor_shape, kernel_tensor_shape, output_tensor_shape = [], [], [] + + if input_layout == InputLayout.NCHW: + input_tensor_shape = [n, c, h, w] + output_tensor_shape = [n, f, oh, ow] + elif input_layout == InputLayout.NHWC: + input_tensor_shape = [n, h, w, c] + output_tensor_shape = [n, oh, ow, f] + else: + raise ValueError(input_layout) + + if kernel_layout == KernelLayout.FCHW: + kernel_tensor_shape = [f, c, kh, kw] + elif kernel_layout == KernelLayout.HWCF: + kernel_tensor_shape = [kh, kw, c, f] + else: + raise ValueError(kernel_layout) + + return input_tensor_shape, kernel_tensor_shape, output_tensor_shape + + +# Helper for generate_function. +# Generates a name for a test function in the generated MLIR code. +def generate_function_name( + input_type: InputElemTypeId, + kernel_type: KernelElemTypeId, + output_type: AccElemTypeId, + shapes: TestInputTensorShapes, + accumulate: bool, +): + input_t = input_type.value + kernel_t = kernel_type.value + acc_t = output_type.value + n = int_or_DYN(shapes.n) + c = int_or_DYN(shapes.c) + h = int_or_DYN(shapes.h) + w = int_or_DYN(shapes.w) + kh = int_or_DYN(shapes.kh) + kw = int_or_DYN(shapes.kw) + f = int_or_DYN(shapes.f) + + conv2d_kind = "conv2d_accumulate" if accumulate else "conv2d" + return ( + f"{conv2d_kind}_{n}_{c}_{h}_{w}_times_" + + f"{kh}_{kw}_{f}_dtype_{input_t}_{kernel_t}_{acc_t}" + ) + + +# Represents a generated test function.
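As a sanity check on calc_out_shape and get_tensor_shape, here is a minimal standalone sketch (plain Python, separate from the patch; the out_size helper name is ours) that reproduces the unit-stride, unit-dilation output sizes appearing in the generated .mlir files further down:

    import math

    def out_size(i, k, dilation=1, stride=1):
        # Same arithmetic as calc_out_shape: floor((i - 1 - (k - 1) * dilation) / stride) + 1.
        x = (k - 1) * (dilation - 1)
        return math.floor((i - k - x) / stride) + 1

    # LARGE shape TestShape(n=2, c=4, h=128, w=128, kh=3, kw=3, f=8) in NCHW/FCHW:
    # input [2, 4, 128, 128], kernel [8, 4, 3, 3], output [2, 8, 126, 126].
    assert out_size(128, 3) == 126  # matches tensor<2x8x126x126xf16> below
    # SMALL shape TestShape(n=1, c=1, h=16, w=16, kh=2, kw=2, f=1):
    assert out_size(16, 2) == 15    # matches tensor<1x1x15x15xf16> below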
+@dataclasses.dataclass +class MLIRFunction: + name: str + signature: str + import_declaration: str + definition: str + + +# Generates a test function in the generated MLIR code. +# The generated function will take the same arguments as linalg.conv2d variants +# and will just call linalg.conv2d variants with them, returning its result. +def generate_function( + input_type: InputElemTypeId, + input_layout: InputLayout, + kernel_type: KernelElemTypeId, + kernel_layout: KernelLayout, + acc_type: AccElemTypeId, + conv2d_attr: ConvAttrs, + shape: TestShape, + dynamicity: Dynamicity, +): + shapes = generate_shapes(shape, dynamicity) + func_name = generate_function_name( + input_type, + kernel_type, + acc_type, + shapes, + shape.accumulate, + ) + + input_shape, kernel_shape, output_shape = get_tensor_shape( + shape, kernel_layout, input_layout, conv2d_attr + ) + input_tensor_type = f"tensor<{input_shape[0]}x{input_shape[1]}x{input_shape[2]}x{input_shape[3]}x{input_type.value}>" + kernel_tensor_type = f"tensor<{kernel_shape[0]}x{kernel_shape[1]}x{kernel_shape[2]}x{kernel_shape[3]}x{kernel_type.value}>" + + acc_tensor_type = f"tensor<{output_shape[0]}x{output_shape[1]}x{output_shape[2]}x{output_shape[3]}x{acc_type.value}>" + + op_name = None + if input_layout == InputLayout.NCHW: + if kernel_layout == KernelLayout.FCHW: + op_name = "linalg.conv_2d_nchw_fchw" + if kernel_layout == KernelLayout.HWCF: + op_name = "linalg.conv_2d_nchw_hwcf" + elif input_layout == InputLayout.NHWC: + if kernel_layout == KernelLayout.HWCF: + op_name = "linalg.conv_2d_nhwc_hwcf" + + if op_name is None: + raise ValueError("Invalid combination of input_layout and kernel_layout") + + conv_attr = f"{{dilations = dense<{list(conv2d_attr.DILATION)}> : tensor<2xi64>, strides = dense<{list(conv2d_attr.STRIDE)}> : tensor<2xi64>}}" + + # Compilation info is optional; prints empty string by default. + func_definition = "" + + signature = f"({input_tensor_type}, {kernel_tensor_type}, {acc_tensor_type}) -> {acc_tensor_type}" + import_declaration = f"func.func private @module.{func_name}(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view" + func_definition = func_definition + ( + f"func.func @{func_name}(%lhs: {input_tensor_type}, %rhs: {kernel_tensor_type}, %acc: {acc_tensor_type}) -> {acc_tensor_type} {{\n" + f" %result = {op_name} {conv_attr} ins(%lhs, %rhs: {input_tensor_type}, {kernel_tensor_type}) outs(%acc: {acc_tensor_type}) -> {acc_tensor_type}\n" + f" return %result: {acc_tensor_type}\n" + f"}}\n" + ) + + return MLIRFunction( + name=func_name, + signature=signature, + import_declaration=import_declaration, + definition=func_definition, + ) + + +# Represents a call to a generated test function. +@dataclasses.dataclass +class TestCall: + function: MLIRFunction + op: str + + +# Enumerates ways to initialize tensor buffer contents. +@enum.unique +class TensorGenerator(enum.Enum): + ZERO = "zero" # Fill with zeros + RANDOM = "random" # Fill with (deterministic) pseudorandom values. + + +# Intentionally fixed seed! We want full reproducibility here, both across runs +# and across machines. +# Intentionally not shared with local_pseudorandom_state to limit the ways +# in which shuffling testcases changes which random values are generated. +pseudorandom_generator_seed = 1 + + +# Generate a 4d tensor function argument of the given size as `%name`. 
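+# (For example, name="input" with tensor_shape=[2, 4, 128, 128] and f16 emits:
+#    %input_dim0 = arith.constant 2 : i64
+#    ...
+#    %input_element_type = hal.element_type<f16> : i32
+#    %input_seed = arith.constant 2 : i32
+#    %input = call @conv2d_test.generate_random_tensor(...)
+#  as seen at the top of conv2d_f16_f16_f16_large_calls.mlir below.)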
+def generate_random_4d_tensor( + name: str, + tensor_shape: list, + element_type: typing.Union[InputElemTypeId, KernelElemTypeId], +): + global pseudorandom_generator_seed + pseudorandom_generator_seed = pseudorandom_generator_seed + 1 + return ( + f" %{name}_dim0 = arith.constant {tensor_shape[0]} : i64\n" + f" %{name}_dim1 = arith.constant {tensor_shape[1]} : i64\n" + f" %{name}_dim2 = arith.constant {tensor_shape[2]} : i64\n" + f" %{name}_dim3 = arith.constant {tensor_shape[3]} : i64\n" + f" %{name}_element_type = hal.element_type<{element_type.value}> : i32\n" + f" %{name}_seed = arith.constant {pseudorandom_generator_seed} : i32\n" + f" %{name} = call @conv2d_test.generate_random_tensor(%device, %{name}_dim0, %{name}_dim1, %{name}_dim2, %{name}_dim3, %{name}_element_type, %{name}_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view\n" + ) + + +call_id = 0 + + +def generate_call( + function: MLIRFunction, + input_type: InputElemTypeId, + input_layout: InputLayout, + kernel_type: KernelElemTypeId, + kernel_layout: KernelLayout, + conv2d_attr: ConvAttrs, + acc_type: AccElemTypeId, + shape: TestShape, +): + global call_id + func_name = f"{function.name}_{shape.n}_{shape.c}_{shape.h}_{shape.w}_{shape.f}_{shape.kh}_{shape.kw}" + if shape.accumulate: + func_name = f"{func_name}_acc" + func_name = f"{func_name}_{call_id}" + call_id = call_id + 1 + + # Layout code of the output tensor, used when checking correctness. + layout = -1 + + if input_layout == InputLayout.NCHW: + if kernel_layout == KernelLayout.FCHW or kernel_layout == KernelLayout.HWCF: + layout = 0 # for output tensor NxFxOHxOW + else: + raise ValueError(kernel_layout) + elif input_layout == InputLayout.NHWC: + if kernel_layout == KernelLayout.HWCF: + layout = 1 # for output tensor NxOHxOWxF + else: + raise ValueError(kernel_layout) + else: + raise ValueError(input_layout) + + description = f"Conv2d shape (NxCxHxWxFxKHxKW): {shape.n}x{shape.c}x{shape.h}x{shape.w}x{shape.f}x{shape.kh}x{shape.kw}" + op = ( + f"func.func @{func_name}() attributes {{\n" + f' iree.reflection = {{description = "{description}"}}\n' + "} {\n" + " %device_index = arith.constant 0 : index\n" + " %device = hal.devices.get %device_index : !hal.device\n" + ) + + inp_shape, kernel_shape, out_shape = get_tensor_shape( + shape, + kernel_layout, + input_layout, + conv2d_attr, + ) + + op = op + generate_random_4d_tensor("input", inp_shape, input_type) + op = op + generate_random_4d_tensor("kernel", kernel_shape, kernel_type) + if shape.accumulate: + op = op + generate_random_4d_tensor("acc", out_shape, acc_type) + # TODO(#16168): there's a bug with in-place input->output aliasing and + # we work around it here by passing in a unique copy.
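+ # (How the workaround works: generate_random_4d_tensor bumps the
+ # module-level seed on every call, so decrementing it once here makes the
+ # %acc_copy tensor below reuse the seed of %acc above. The two buffers then
+ # hold identical pseudorandom contents: the callee may clobber %acc_copy in
+ # place while the checker still reads the untouched %acc.)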
+ global pseudorandom_generator_seed + pseudorandom_generator_seed = pseudorandom_generator_seed - 1 + op = op + generate_random_4d_tensor("acc_copy", out_shape, acc_type) + op = op + ( + f" %result = call @module.{function.name}(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view\n" + ) + else: + op = op + ( + f" %acc = util.null : !hal.buffer_view\n" + f" %result = call @module.{function.name}(%input, %kernel) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view\n" + ) + + op = op + ( + f" %n = arith.constant {shape.n} : i64\n" + f" %c = arith.constant {shape.c} : i64\n" + f" %h = arith.constant {shape.h} : i64\n" + f" %w = arith.constant {shape.w} : i64\n" + f" %f = arith.constant {shape.f} : i64\n" + f" %kh = arith.constant {shape.kh} : i64\n" + f" %kw = arith.constant {shape.kw} : i64\n" + f" %layout = arith.constant {layout} : i64\n" + f" %sh = arith.constant {conv2d_attr.STRIDE[0]} : i64\n" + f" %sw = arith.constant {conv2d_attr.STRIDE[1]} : i64\n" + f" %dh = arith.constant {conv2d_attr.DILATION[0]} : i64\n" + f" %dw = arith.constant {conv2d_attr.DILATION[1]} : i64\n" + f" call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()\n") + + op = op + " return\n" + op = op + "}\n" + + return TestCall(function=function, op=op) + + +# Generates all output files' contents as strings. +def generate( + input_elem_type: InputElemTypeId, + input_layout: InputLayout, + kernel_elem_type: KernelElemTypeId, + kernel_layout: KernelLayout, + conv2d_attr: ConvAttrs, + acc_type: AccElemTypeId, + shapes_id: ShapesId, +): + functions = {} + calls = [] + + for shape in get_test_shapes(shapes_id): + for dynamicity in [Dynamicity.STATIC]: + function = generate_function( + input_elem_type, + input_layout, + kernel_elem_type, + kernel_layout, + acc_type, + conv2d_attr, + shape, + dynamicity, + ) + # Different testcases may differ only by runtime parameters but + # share the same code. For example, dynamic-shapes testcases + # share the same code involving tensor<?x?xf32> even though the runtime + # values in the trace are different. That's why we append conditionally + # to calls, but unconditionally to function_definitions. + if function.name not in functions: + functions[function.name] = function + calls.append( + generate_call( + function, + input_elem_type, + input_layout, + kernel_elem_type, + kernel_layout, + conv2d_attr, + acc_type, + shape, + ) + ) + + return (functions, calls) + + +def parse_arguments(): + parser = argparse.ArgumentParser(description="Generator of e2e conv2d tests") + parser.add_argument( + "--output_conv2d_mlir", + type=str, + help="Path of output .mlir file containing the generated conv2d functions", + required=True, + ) + parser.add_argument( + "--output_calls_mlir", + type=str, + help="Path of output .mlir file containing the calls", + required=True, + ) + parser.add_argument( + "--input_type", + type=str, + choices=["i8", "f32", "f16"], + help="Numeric type of input tensors", + required=True, + ) + parser.add_argument( + "--input_layout", + type=str, + default="nchw", + choices=["nchw", "nhwc"], + help="Layout of the input tensor.
nchw is the default.", + required=False, + ) + parser.add_argument( + "--kernel_type", + type=str, + choices=["i8", "f32", "f16"], + help="Numeric type of kernel tensors", + required=True, + ) + parser.add_argument( + "--kernel_layout", + type=str, + default="fchw", + choices=["fchw", "hwcf"], + help="Layout of the kernel tensor. fchw is the default.", + required=False, + ) + parser.add_argument( + "--acc_type", + type=str, + choices=["i32", "f32", "f16"], + help="Numeric type of accumulator and result tensors", + default="", + required=False, + ) + parser.add_argument( + "--shapes", + type=str, + choices=[s.value for s in ShapesId], + help="Collection of tensor shapes to test", + required=True, + ) + parser.add_argument( + "--dilation", + type=str, + default="1,1", + help="The dilation factor for the convolution operation. Comma-separated, e.g. 1,1", + required=False, + ) + parser.add_argument( + "--stride", + type=str, + default="1,1", + help="The stride factor for the convolution operation. Comma-separated, e.g. 1,1", + required=False, + ) + parser.add_argument( + "--requirements", + type=str, + help="Target requirements for this module. Comma-separated. As in -iree-llvmcpu-target-cpu-features. If the target device does not meet all of the requirements, the test will be skipped.", + required=False, + ) + return parser.parse_args() + + +def write_code_file(functions, filename): + with open(filename, "w") as file: + for function in functions.values(): + file.write(function.definition + "\n") + + +def write_calls_file(functions, calls, filename, requirements): + # Module-level reflection information used to control the test tool. + reflection = "" + if requirements: + reflection = ( + "iree.reflection = {" + 'target_features = "' + + ",".join([req.lstrip("+") for req in requirements.split(",")]) + + '"' + "}" + ) + module_definition = ( + f"builtin.module @calls attributes {{\n" f" {reflection}\n" f"}} {{\n\n" + ) + + # Declare the custom module that generates arguments. + module_definition = module_definition + ( + "func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view\n" + "func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)\n" + "\n" + + ) + + # Declare the functions that will be called. + for function in functions.values(): + module_definition = module_definition + function.import_declaration + "\n" + module_definition = module_definition + "\n" + + # Emit the test cases for each call.
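+    # (For reference: passing --requirements=+avx512f,+fma produces a module
+    # header like
+    #   builtin.module @calls attributes {
+    #     iree.reflection = {target_features = "avx512f,fma"}
+    #   } { ... }
+    # and the test tool skips the whole module when the device lacks any of
+    # the listed features. With no requirements the attribute is left empty,
+    # as in the generated *_calls.mlir files below.)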
+ for call in calls: + module_definition = module_definition + call.op + "\n" + + module_definition = module_definition + "\n}\n" + + with open(filename, "w") as file: + file.write(module_definition) + + +def main(args): + input_type = InputElemTypeId(args.input_type) + input_layout = InputLayout(args.input_layout) + kernel_type = KernelElemTypeId(args.kernel_type) + kernel_layout = KernelLayout(args.kernel_layout) + acc_type = AccElemTypeId(args.acc_type) + shapes_id = ShapesId(args.shapes) + conv2d_attr = ConvAttrs( + tuple(map(int, args.stride.split(","))), + tuple(map(int, args.dilation.split(","))), + ) + + (functions, calls) = generate( + input_type, + input_layout, + kernel_type, + kernel_layout, + conv2d_attr, + acc_type, + shapes_id, + ) + + write_code_file(functions, args.output_conv2d_mlir) + write_calls_file( + functions, + calls, + args.output_calls_mlir, + args.requirements, + ) + + +if __name__ == "__main__": + main(parse_arguments()) \ No newline at end of file diff --git a/linalg_ops/convolution/generate_test_mlir_files.sh b/linalg_ops/convolution/generate_test_mlir_files.sh new file mode 100755 index 0000000..35c4a7a --- /dev/null +++ b/linalg_ops/convolution/generate_test_mlir_files.sh @@ -0,0 +1,105 @@ +#!/bin/bash + +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# This script runs generate_e2e_conv2d_tests for all argument combinations that +# we are interested in testing. +# +# The output is a 'generated' folder with contents like this: +# linalg_ops/ +# convolution/ +# generated/ +# f16_f16_f16/ +# conv2d_f16_f16_f16_large_calls.mlir +# conv2d_f16_f16_f16_large.mlir +# conv2d_f16_f16_f16_medium_calls.mlir +# conv2d_f16_f16_f16_medium.mlir +# conv2d_f16_f16_f16_small_calls.mlir +# conv2d_f16_f16_f16_small.mlir +# conv2d_winograd_f16_f16_f16_large_calls.mlir +# conv2d_winograd_f16_f16_f16_large.mlir +# conv2d_winograd_f16_f16_f16_medium_calls.mlir +# conv2d_winograd_f16_f16_f16_medium.mlir +# conv2d_winograd_f16_f16_f16_small_calls.mlir +# conv2d_winograd_f16_f16_f16_small.mlir +# f32_f32_f32/ +# conv2d_f32_f32_f32_large_calls.mlir +# conv2d_f32_f32_f32_large.mlir +# conv2d_f32_f32_f32_medium_calls.mlir +# conv2d_f32_f32_f32_medium.mlir +# conv2d_f32_f32_f32_small_calls.mlir +# conv2d_f32_f32_f32_small.mlir +# conv2d_winograd_f32_f32_f32_large_calls.mlir +# conv2d_winograd_f32_f32_f32_large.mlir +# conv2d_winograd_f32_f32_f32_medium_calls.mlir +# conv2d_winograd_f32_f32_f32_medium.mlir +# conv2d_winograd_f32_f32_f32_small_calls.mlir +# conv2d_winograd_f32_f32_f32_small.mlir +# ... +# ... +# Usage: +# generate_test_mlir_files.sh + +set -euo pipefail + +this_dir="$(cd $(dirname $0) && pwd)" +generated_dir_root="${this_dir}/generated" + +# Reset generated directory. 
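+# (One concrete invocation performed by the loops below, for f16/small:
+#   python generate_e2e_conv2d_tests.py \
+#     --output_conv2d_mlir=generated/f16_f16_f16/conv2d_f16_f16_f16_small.mlir \
+#     --output_calls_mlir=generated/f16_f16_f16/conv2d_f16_f16_f16_small_calls.mlir \
+#     --input_type=f16 --kernel_type=f16 --acc_type=f16 --shapes=small)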
+rm -rf ${generated_dir_root?} +mkdir -p ${generated_dir_root?} + +shapes=( + "small" + "medium" + "large" +) + +# input_type;kernel_type;acc_type +type_combinations=( + "f16;f16;f16" + "f32;f32;f32" +) + +for type_combination in ${type_combinations[@]}; do + IFS=";" read -r -a types <<< "${type_combination}" + input_type="${types[0]}" + kernel_type="${types[1]}" + acc_type="${types[2]}" + + type_name="${input_type}_${kernel_type}_${acc_type}" + type_combination_dir="${generated_dir_root}/${type_name}" + mkdir -p ${type_combination_dir} + + for shape in ${shapes[@]}; do + echo "Generating conv2d test files for ${type_name}_${shape}" + + name="conv2d_${type_name}_${shape}" + python ${this_dir}/generate_e2e_conv2d_tests.py \ + --output_conv2d_mlir=${type_combination_dir}/${name}.mlir \ + --output_calls_mlir=${type_combination_dir}/${name}_calls.mlir \ + --input_type=${input_type} \ + --kernel_type=${kernel_type} \ + --acc_type=${acc_type} \ + --shapes=${shape} + + name="conv2d_winograd_${type_name}_${shape}" + python ${this_dir}/generate_e2e_conv2d_tests.py \ + --output_conv2d_mlir=${type_combination_dir}/${name}.mlir \ + --output_calls_mlir=${type_combination_dir}/${name}_calls.mlir \ + --input_type=${input_type} \ + --kernel_type=${kernel_type} \ + --acc_type=${acc_type} \ + --shapes=${shape} + done +done \ No newline at end of file diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_large.mlir b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_large.mlir new file mode 100644 index 0000000..51c8a1e --- /dev/null +++ b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_large.mlir @@ -0,0 +1,10 @@ +func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16(%lhs: tensor<2x4x128x128xf16>, %rhs: tensor<8x4x3x3xf16>, %acc: tensor<2x8x126x126xf16>) -> tensor<2x8x126x126xf16> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x4x128x128xf16>, tensor<8x4x3x3xf16>) outs(%acc: tensor<2x8x126x126xf16>) -> tensor<2x8x126x126xf16> + return %result: tensor<2x8x126x126xf16> +} + +func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16(%lhs: tensor<2x3x128x128xf16>, %rhs: tensor<12x3x3x3xf16>, %acc: tensor<2x12x126x126xf16>) -> tensor<2x12x126x126xf16> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x3x128x128xf16>, tensor<12x3x3x3xf16>) outs(%acc: tensor<2x12x126x126xf16>) -> tensor<2x12x126x126xf16> + return %result: tensor<2x12x126x126xf16> +} + diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_large_calls.mlir b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_large_calls.mlir new file mode 100644 index 0000000..7dfb92f --- /dev/null +++ b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_large_calls.mlir @@ -0,0 +1,112 @@ +builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: 
!hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16_2_4_128_128_8_3_3_acc_0() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x4x128x128x8x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 4 : i64 + %input_dim2 = arith.constant 128 : i64 + %input_dim3 = arith.constant 128 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 2 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 8 : i64 + %kernel_dim1 = arith.constant 4 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_dim2 = arith.constant 126 : i64 + %acc_dim3 = arith.constant 126 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_dim2 = arith.constant 126 : i64 + %acc_copy_dim3 = arith.constant 126 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 4 : i64 + %h = arith.constant 128 : i64 + %w = arith.constant 128 : i64 + %f = arith.constant 8 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func 
@conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16_2_3_128_128_12_3_3_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x3x128x128x12x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 3 : i64 + %input_dim2 = arith.constant 128 : i64 + %input_dim3 = arith.constant 128 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 5 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 12 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 6 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 12 : i64 + %acc_dim2 = arith.constant 126 : i64 + %acc_dim3 = arith.constant 126 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 12 : i64 + %acc_copy_dim2 = arith.constant 126 : i64 + %acc_copy_dim3 = arith.constant 126 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 3 : i64 + %h = arith.constant 128 : i64 + %w = arith.constant 128 : i64 + %f = arith.constant 12 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + + +} diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_medium.mlir b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_medium.mlir new file mode 100644 index 0000000..a2564aa --- /dev/null +++ b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_medium.mlir @@ -0,0 +1,15 @@ +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%lhs: tensor<2x2x32x32xf16>, %rhs: tensor<2x2x3x3xf16>, %acc: tensor<2x2x30x30xf16>) -> tensor<2x2x30x30xf16> { + %result = 
linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf16>, tensor<2x2x3x3xf16>) outs(%acc: tensor<2x2x30x30xf16>) -> tensor<2x2x30x30xf16> + return %result: tensor<2x2x30x30xf16> +} + +func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16(%lhs: tensor<2x2x32x32xf16>, %rhs: tensor<64x2x3x3xf16>, %acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf16>, tensor<64x2x3x3xf16>) outs(%acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16> + return %result: tensor<2x64x30x30xf16> +} + +func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f16(%lhs: tensor<2x32x32x32xf16>, %rhs: tensor<64x32x3x3xf16>, %acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x32xf16>, tensor<64x32x3x3xf16>) outs(%acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16> + return %result: tensor<2x64x30x30xf16> +} + diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_medium_calls.mlir b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_medium_calls.mlir new file mode 100644 index 0000000..c6e86d7 --- /dev/null +++ b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_medium_calls.mlir @@ -0,0 +1,163 @@ +builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16_2_2_32_32_2_3_3_acc_0() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 2 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 32 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 2 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 2 : i64 + %kernel_dim1 = arith.constant 2 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 
: i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 30 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 30 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 2 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16_2_2_32_32_64_3_3_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x64x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 2 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 32 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 5 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 64 : i64 + %kernel_dim1 = arith.constant 2 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 6 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 64 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 30 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = 
arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 64 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 30 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 64 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f16_2_32_32_32_64_3_3_acc_2() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x32x32x32x64x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 32 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 32 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 8 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 64 : i64 + %kernel_dim1 = arith.constant 32 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 9 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 64 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 30 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 10 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 64 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 30 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 10 : i32 + %acc_copy = call 
@conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 32 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 64 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + + +} diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_small.mlir b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_small.mlir new file mode 100644 index 0000000..ddbe425 --- /dev/null +++ b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_small.mlir @@ -0,0 +1,15 @@ +func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16(%lhs: tensor<1x1x1x1xf16>, %rhs: tensor<1x1x1x1xf16>, %acc: tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16>) outs(%acc: tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16> + return %result: tensor<1x1x1x1xf16> +} + +func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16(%lhs: tensor<1x1x16x16xf16>, %rhs: tensor<1x1x2x2xf16>, %acc: tensor<1x1x15x15xf16>) -> tensor<1x1x15x15xf16> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x16x16xf16>, tensor<1x1x2x2xf16>) outs(%acc: tensor<1x1x15x15xf16>) -> tensor<1x1x15x15xf16> + return %result: tensor<1x1x15x15xf16> +} + +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%lhs: tensor<2x2x32x32xf16>, %rhs: tensor<2x2x3x3xf16>, %acc: tensor<2x2x30x30xf16>) -> tensor<2x2x30x30xf16> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf16>, tensor<2x2x3x3xf16>) outs(%acc: tensor<2x2x30x30xf16>) -> tensor<2x2x30x30xf16> + return %result: tensor<2x2x30x30xf16> +} + diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_small_calls.mlir b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_small_calls.mlir new file mode 100644 index 0000000..872c618 --- /dev/null +++ b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_small_calls.mlir @@ -0,0 +1,163 @@ +builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, 
%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)
+
+func.func private @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+
+func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16_1_1_1_1_1_1_1_acc_0() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x1x1x1x1x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 1 : i64
+  %input_dim1 = arith.constant 1 : i64
+  %input_dim2 = arith.constant 1 : i64
+  %input_dim3 = arith.constant 1 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 1 : i64
+  %kernel_dim1 = arith.constant 1 : i64
+  %kernel_dim2 = arith.constant 1 : i64
+  %kernel_dim3 = arith.constant 1 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 1 : i64
+  %acc_dim1 = arith.constant 1 : i64
+  %acc_dim2 = arith.constant 1 : i64
+  %acc_dim3 = arith.constant 1 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 1 : i64
+  %acc_copy_dim1 = arith.constant 1 : i64
+  %acc_copy_dim2 = arith.constant 1 : i64
+  %acc_copy_dim3 = arith.constant 1 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 4 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 1 : i64
+  %c = arith.constant 1 : i64
+  %h = arith.constant 1 : i64
+  %w = arith.constant 1 : i64
+  %f = arith.constant 1 : i64
+  %kh = arith.constant 1 : i64
+  %kw = arith.constant 1 : i64
+  %layout = arith.constant 0 : i64
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16_1_1_16_16_1_2_2_acc_1() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x16x16x1x2x2"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 1 : i64
+  %input_dim1 = arith.constant 1 : i64
+  %input_dim2 = arith.constant 16 : i64
+  %input_dim3 = arith.constant 16 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 1 : i64
+  %kernel_dim1 = arith.constant 1 : i64
+  %kernel_dim2 = arith.constant 2 : i64
+  %kernel_dim3 = arith.constant 2 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 1 : i64
+  %acc_dim1 = arith.constant 1 : i64
+  %acc_dim2 = arith.constant 15 : i64
+  %acc_dim3 = arith.constant 15 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 1 : i64
+  %acc_copy_dim1 = arith.constant 1 : i64
+  %acc_copy_dim2 = arith.constant 15 : i64
+  %acc_copy_dim3 = arith.constant 15 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 7 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 1 : i64
+  %c = arith.constant 1 : i64
+  %h = arith.constant 16 : i64
+  %w = arith.constant 16 : i64
+  %f = arith.constant 1 : i64
+  %kh = arith.constant 2 : i64
+  %kw = arith.constant 2 : i64
+  %layout = arith.constant 0 : i64
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16_2_2_32_32_2_3_3_acc_2() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 2 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 32 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 8 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 2 : i64
+  %kernel_dim1 = arith.constant 2 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 9 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 2 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 30 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 10 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 2 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 30 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 10 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 2 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+
+}
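A note on the accumulator shapes in the calls above: every test case uses stride 1 and dilation 1, so the output (and hence %acc) spatial size is simply the input size minus the kernel size plus one, e.g. a 16x16 input with a 2x2 kernel yields 15x15, and a 32x32 input with a 3x3 kernel yields 30x30. A minimal Python sketch of that arithmetic (illustrative only, not part of the patch; the generated shapes are presumably computed this way by generate_e2e_conv2d_tests.py):

    # Output spatial size for an unpadded ("VALID") conv2d; matches the
    # N x F x (H - KH + 1) x (W - KW + 1) accumulator shapes in these tests.
    def conv2d_out_size(in_size: int, kernel_size: int,
                        stride: int = 1, dilation: int = 1) -> int:
        effective_kernel = dilation * (kernel_size - 1) + 1
        return (in_size - effective_kernel) // stride + 1

    assert conv2d_out_size(16, 2) == 15  # 1x1x16x16 input, 2x2 kernel
    assert conv2d_out_size(32, 3) == 30  # 2x2x32x32 input, 3x3 kernel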
diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_large.mlir b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_large.mlir
new file mode 100644
index 0000000..51c8a1e
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_large.mlir
@@ -0,0 +1,10 @@
+func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16(%lhs: tensor<2x4x128x128xf16>, %rhs: tensor<8x4x3x3xf16>, %acc: tensor<2x8x126x126xf16>) -> tensor<2x8x126x126xf16> {
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x4x128x128xf16>, tensor<8x4x3x3xf16>) outs(%acc: tensor<2x8x126x126xf16>) -> tensor<2x8x126x126xf16>
+  return %result: tensor<2x8x126x126xf16>
+}
+
+func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16(%lhs: tensor<2x3x128x128xf16>, %rhs: tensor<12x3x3x3xf16>, %acc: tensor<2x12x126x126xf16>) -> tensor<2x12x126x126xf16> {
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x3x128x128xf16>, tensor<12x3x3x3xf16>) outs(%acc: tensor<2x12x126x126xf16>) -> tensor<2x12x126x126xf16>
+  return %result: tensor<2x12x126x126xf16>
+}
+
diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_large_calls.mlir b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_large_calls.mlir
new file mode 100644
index 0000000..7dfb92f
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_large_calls.mlir
@@ -0,0 +1,112 @@
+builtin.module @calls attributes {
+
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)
+
+func.func private @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+
+func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16_2_4_128_128_8_3_3_acc_0() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x4x128x128x8x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 4 : i64
+  %input_dim2 = arith.constant 128 : i64
+  %input_dim3 = arith.constant 128 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 8 : i64
+  %kernel_dim1 = arith.constant 4 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 8 : i64
+  %acc_dim2 = arith.constant 126 : i64
+  %acc_dim3 = arith.constant 126 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 8 : i64
+  %acc_copy_dim2 = arith.constant 126 : i64
+  %acc_copy_dim3 = arith.constant 126 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 4 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 2 : i64
+  %c = arith.constant 4 : i64
+  %h = arith.constant 128 : i64
+  %w = arith.constant 128 : i64
+  %f = arith.constant 8 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16_2_3_128_128_12_3_3_acc_1() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x3x128x128x12x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 3 : i64
+  %input_dim2 = arith.constant 128 : i64
+  %input_dim3 = arith.constant 128 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 12 : i64
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 12 : i64
+  %acc_dim2 = arith.constant 126 : i64
+  %acc_dim3 = arith.constant 126 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 12 : i64
+  %acc_copy_dim2 = arith.constant 126 : i64
+  %acc_copy_dim3 = arith.constant 126 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 7 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 2 : i64
+  %c = arith.constant 3 : i64
+  %h = arith.constant 128 : i64
+  %w = arith.constant 128 : i64
+  %f = arith.constant 12 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+
+}
diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_medium.mlir b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_medium.mlir
new file mode 100644
index 0000000..a2564aa
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_medium.mlir
@@ -0,0 +1,15 @@
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%lhs: tensor<2x2x32x32xf16>, %rhs: tensor<2x2x3x3xf16>, %acc: tensor<2x2x30x30xf16>) -> tensor<2x2x30x30xf16> {
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf16>, tensor<2x2x3x3xf16>) outs(%acc: tensor<2x2x30x30xf16>) -> tensor<2x2x30x30xf16>
+  return %result: tensor<2x2x30x30xf16>
+}
+
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16(%lhs: tensor<2x2x32x32xf16>, %rhs: tensor<64x2x3x3xf16>, %acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16> {
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf16>, tensor<64x2x3x3xf16>) outs(%acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16>
+  return %result: tensor<2x64x30x30xf16>
+}
+
+func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f16(%lhs: tensor<2x32x32x32xf16>, %rhs: tensor<64x32x3x3xf16>, %acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16> {
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x32xf16>, tensor<64x32x3x3xf16>) outs(%acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16>
+  return %result: tensor<2x64x30x30xf16>
+}
+
diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_medium_calls.mlir b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_medium_calls.mlir
new file mode 100644
index 0000000..c6e86d7
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_medium_calls.mlir
@@ -0,0 +1,163 @@
+builtin.module @calls attributes {
+
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)
+
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16_2_2_32_32_2_3_3_acc_0() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 2 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 32 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 2 : i64
+  %kernel_dim1 = arith.constant 2 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 2 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 30 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 2 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 30 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 4 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 2 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16_2_2_32_32_64_3_3_acc_1() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x64x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 2 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 32 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 64 : i64
+  %kernel_dim1 = arith.constant 2 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 64 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 30 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 64 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 30 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 7 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 64 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f16_2_32_32_32_64_3_3_acc_2() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x32x32x32x64x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 32 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 32 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 8 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 64 : i64
+  %kernel_dim1 = arith.constant 32 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 9 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 64 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 30 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 10 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 64 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 30 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 10 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 2 : i64
+  %c = arith.constant 32 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 64 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+
+}
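Each call function above materializes the accumulator twice from the same seed (%acc and %acc_copy): the compiled conv2d consumes %acc_copy, while the untouched %acc is passed to check_conv2d_results along with the inputs and the actual result. The check in iree-e2e-conv2d-test.cc recomputes the expected value on the host; a rough NumPy sketch of the accumulate semantics being verified (illustrative only, not the actual C++ implementation):

    import numpy as np

    def reference_conv2d_accumulate(x, w, acc):
        # x: [N, C, H, W], w: [F, C, KH, KW], acc: [N, F, OH, OW];
        # stride and dilation are 1, as in all of the tests above.
        N, C, H, W = x.shape
        F, _, KH, KW = w.shape
        OH, OW = H - KH + 1, W - KW + 1
        out = acc.astype(np.float64).copy()  # widen for a stable reference
        for oh in range(OH):
            for ow in range(OW):
                window = x[:, :, oh:oh + KH, ow:ow + KW]  # [N, C, KH, KW]
                out[:, :, oh, ow] += np.einsum("nchw,fchw->nf", window, w)
        return out

A test passes when the device result matches this kind of reference, up to whatever tolerance the harness applies for the element type.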
diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_small.mlir b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_small.mlir
new file mode 100644
index 0000000..ddbe425
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_small.mlir
@@ -0,0 +1,15 @@
+func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16(%lhs: tensor<1x1x1x1xf16>, %rhs: tensor<1x1x1x1xf16>, %acc: tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16> {
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16>) outs(%acc: tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
+  return %result: tensor<1x1x1x1xf16>
+}
+
+func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16(%lhs: tensor<1x1x16x16xf16>, %rhs: tensor<1x1x2x2xf16>, %acc: tensor<1x1x15x15xf16>) -> tensor<1x1x15x15xf16> {
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x16x16xf16>, tensor<1x1x2x2xf16>) outs(%acc: tensor<1x1x15x15xf16>) -> tensor<1x1x15x15xf16>
+  return %result: tensor<1x1x15x15xf16>
+}
+
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%lhs: tensor<2x2x32x32xf16>, %rhs: tensor<2x2x3x3xf16>, %acc: tensor<2x2x30x30xf16>) -> tensor<2x2x30x30xf16> {
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf16>, tensor<2x2x3x3xf16>) outs(%acc: tensor<2x2x30x30xf16>) -> tensor<2x2x30x30xf16>
+  return %result: tensor<2x2x30x30xf16>
+}
+
diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_small_calls.mlir b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_small_calls.mlir
new file mode 100644
index 0000000..872c618
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_small_calls.mlir
@@ -0,0 +1,163 @@
+builtin.module @calls attributes {
+
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)
+
+func.func private @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+
+func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16_1_1_1_1_1_1_1_acc_0() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x1x1x1x1x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 1 : i64
+  %input_dim1 = arith.constant 1 : i64
+  %input_dim2 = arith.constant 1 : i64
+  %input_dim3 = arith.constant 1 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 1 : i64
+  %kernel_dim1 = arith.constant 1 : i64
+  %kernel_dim2 = arith.constant 1 : i64
+  %kernel_dim3 = arith.constant 1 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 1 : i64
+  %acc_dim1 = arith.constant 1 : i64
+  %acc_dim2 = arith.constant 1 : i64
+  %acc_dim3 = arith.constant 1 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 1 : i64
+  %acc_copy_dim1 = arith.constant 1 : i64
+  %acc_copy_dim2 = arith.constant 1 : i64
+  %acc_copy_dim3 = arith.constant 1 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 4 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 1 : i64
+  %c = arith.constant 1 : i64
+  %h = arith.constant 1 : i64
+  %w = arith.constant 1 : i64
+  %f = arith.constant 1 : i64
+  %kh = arith.constant 1 : i64
+  %kw = arith.constant 1 : i64
+  %layout = arith.constant 0 : i64
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16_1_1_16_16_1_2_2_acc_1() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x16x16x1x2x2"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 1 : i64
+  %input_dim1 = arith.constant 1 : i64
+  %input_dim2 = arith.constant 16 : i64
+  %input_dim3 = arith.constant 16 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 1 : i64
+  %kernel_dim1 = arith.constant 1 : i64
+  %kernel_dim2 = arith.constant 2 : i64
+  %kernel_dim3 = arith.constant 2 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 1 : i64
+  %acc_dim1 = arith.constant 1 : i64
+  %acc_dim2 = arith.constant 15 : i64
+  %acc_dim3 = arith.constant 15 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 1 : i64
+  %acc_copy_dim1 = arith.constant 1 : i64
+  %acc_copy_dim2 = arith.constant 15 : i64
+  %acc_copy_dim3 = arith.constant 15 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 7 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 1 : i64
+  %c = arith.constant 1 : i64
+  %h = arith.constant 16 : i64
+  %w = arith.constant 16 : i64
+  %f = arith.constant 1 : i64
+  %kh = arith.constant 2 : i64
+  %kw = arith.constant 2 : i64
+  %layout = arith.constant 0 : i64
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16_2_2_32_32_2_3_3_acc_2() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 2 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 32 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 8 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 2 : i64
+  %kernel_dim1 = arith.constant 2 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 9 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 2 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 30 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 10 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 2 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 30 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 10 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 2 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+
+}
diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_large.mlir b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_large.mlir
new file mode 100644
index 0000000..a47185c
--- /dev/null
+++ b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_large.mlir
@@ -0,0 +1,10 @@
+func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32(%lhs: tensor<2x4x128x128xf32>, %rhs: tensor<8x4x3x3xf32>, %acc: tensor<2x8x126x126xf32>) -> tensor<2x8x126x126xf32> {
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x4x128x128xf32>, tensor<8x4x3x3xf32>) outs(%acc: tensor<2x8x126x126xf32>) -> tensor<2x8x126x126xf32>
+  return %result: tensor<2x8x126x126xf32>
+}
+
+func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32(%lhs: tensor<2x3x128x128xf32>, %rhs: tensor<12x3x3x3xf32>, %acc: tensor<2x12x126x126xf32>) -> tensor<2x12x126x126xf32> {
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x3x128x128xf32>, tensor<12x3x3x3xf32>) outs(%acc: tensor<2x12x126x126xf32>) -> tensor<2x12x126x126xf32>
+  return %result: tensor<2x12x126x126xf32>
+}
+
diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_large_calls.mlir b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_large_calls.mlir
new file mode 100644
index 0000000..cdd2788
--- /dev/null
+++ b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_large_calls.mlir
@@ -0,0 +1,112 @@
+builtin.module @calls attributes {
+
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)
+
+func.func private @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+
+func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32_2_4_128_128_8_3_3_acc_0() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x4x128x128x8x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 4 : i64
+  %input_dim2 = arith.constant 128 : i64
+  %input_dim3 = arith.constant 128 : i64
+  %input_element_type = hal.element_type<f32> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 8 : i64
+  %kernel_dim1 = arith.constant 4 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f32> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 8 : i64
+  %acc_dim2 = arith.constant 126 : i64
+  %acc_dim3 = arith.constant 126 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 8 : i64
+  %acc_copy_dim2 = arith.constant 126 : i64
+  %acc_copy_dim3 = arith.constant 126 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 4 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 2 : i64
+  %c = arith.constant 4 : i64
+  %h = arith.constant 128 : i64
+  %w = arith.constant 128 : i64
+  %f = arith.constant 8 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32_2_3_128_128_12_3_3_acc_1() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x3x128x128x12x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 3 : i64
+  %input_dim2 = arith.constant 128 : i64
+  %input_dim3 = arith.constant 128 : i64
+  %input_element_type = hal.element_type<f32> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 12 : i64
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f32> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 12 : i64
+  %acc_dim2 = arith.constant 126 : i64
+  %acc_dim3 = arith.constant 126 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 12 : i64
+  %acc_copy_dim2 = arith.constant 126 : i64
+  %acc_copy_dim3 = arith.constant 126 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 7 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 2 : i64
+  %c = arith.constant 3 : i64
+  %h = arith.constant 128 : i64
+  %w = arith.constant 128 : i64
+  %f = arith.constant 12 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+
+
+}
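The random tensors are fully determined by their i32 seeds, and the seeds in each *_calls.mlir module simply count upward: the first test case uses 2/3/4 for input/kernel/accumulator, the second 5/6/7, the third 8/9/10, with %acc_copy always reusing the accumulator seed. A small sketch of that pattern (illustrative only, inferred from the generated modules above):

    # Seed schedule visible in the generated *_calls.mlir modules: three
    # consecutive seeds per test case, starting at 2; acc_copy repeats acc.
    def seeds_for_case(case_index: int, first_seed: int = 2) -> dict:
        base = first_seed + 3 * case_index
        return {"input": base, "kernel": base + 1,
                "acc": base + 2, "acc_copy": base + 2}

    assert seeds_for_case(0) == {"input": 2, "kernel": 3, "acc": 4, "acc_copy": 4}
    assert seeds_for_case(2) == {"input": 8, "kernel": 9, "acc": 10, "acc_copy": 10}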
tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> + return %result: tensor<2x64x30x30xf32> +} + diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_medium_calls.mlir b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_medium_calls.mlir new file mode 100644 index 0000000..3537bc9 --- /dev/null +++ b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_medium_calls.mlir @@ -0,0 +1,163 @@ +builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32_2_2_32_32_2_3_3_acc_0() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 2 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 32 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 2 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 2 : i64 + %kernel_dim1 = arith.constant 2 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 30 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 30 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, 
%acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 2 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32_2_2_32_32_64_3_3_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x64x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 2 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 32 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 5 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 64 : i64 + %kernel_dim1 = arith.constant 2 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 6 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 64 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 30 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 64 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 30 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 64 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = 
arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f32_f32_f32_2_32_32_32_64_3_3_acc_2() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x32x32x32x64x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 32 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 32 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 8 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 64 : i64 + %kernel_dim1 = arith.constant 32 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 9 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 64 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 30 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 10 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 64 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 30 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 10 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 32 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 64 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + + +} diff --git 
a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_small.mlir b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_small.mlir new file mode 100644 index 0000000..9ecd2bd --- /dev/null +++ b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_small.mlir @@ -0,0 +1,15 @@ +func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32(%lhs: tensor<1x1x1x1xf32>, %rhs: tensor<1x1x1x1xf32>, %acc: tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%acc: tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> + return %result: tensor<1x1x1x1xf32> +} + +func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32(%lhs: tensor<1x1x16x16xf32>, %rhs: tensor<1x1x2x2xf32>, %acc: tensor<1x1x15x15xf32>) -> tensor<1x1x15x15xf32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x16x16xf32>, tensor<1x1x2x2xf32>) outs(%acc: tensor<1x1x15x15xf32>) -> tensor<1x1x15x15xf32> + return %result: tensor<1x1x15x15xf32> +} + +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%lhs: tensor<2x2x32x32xf32>, %rhs: tensor<2x2x3x3xf32>, %acc: tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf32>, tensor<2x2x3x3xf32>) outs(%acc: tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32> + return %result: tensor<2x2x30x30xf32> +} + diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_small_calls.mlir b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_small_calls.mlir new file mode 100644 index 0000000..092bd67 --- /dev/null +++ b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_small_calls.mlir @@ -0,0 +1,163 @@ +builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32_1_1_1_1_1_1_1_acc_0() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x1x1x1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 1 : i64 + %input_dim1 = arith.constant 1 : i64 + %input_dim2 = arith.constant 1 : i64 + %input_dim3 = arith.constant 1 : i64 + 
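+  // Note: generate_random_tensor is deterministic in its i32 seed, so equal
+  // seeds yield identical tensors. %acc and %acc_copy below share one seed
+  // for exactly that reason: the conv call consumes %acc_copy while %acc
+  // stays pristine for the reference check at the end of the test.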
%input_element_type = hal.element_type : i32 + %input_seed = arith.constant 2 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 1 : i64 + %kernel_dim1 = arith.constant 1 : i64 + %kernel_dim2 = arith.constant 1 : i64 + %kernel_dim3 = arith.constant 1 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_dim2 = arith.constant 1 : i64 + %acc_dim3 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_dim2 = arith.constant 1 : i64 + %acc_copy_dim3 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 1 : i64 + %c = arith.constant 1 : i64 + %h = arith.constant 1 : i64 + %w = arith.constant 1 : i64 + %f = arith.constant 1 : i64 + %kh = arith.constant 1 : i64 + %kw = arith.constant 1 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32_1_1_16_16_1_2_2_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x16x16x1x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 1 : i64 + %input_dim1 = arith.constant 1 : i64 + %input_dim2 = arith.constant 16 : i64 + %input_dim3 = arith.constant 16 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 5 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 1 : i64 + %kernel_dim1 = arith.constant 1 : i64 + %kernel_dim2 = arith.constant 2 : i64 + %kernel_dim3 = arith.constant 2 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 6 : 
i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_dim2 = arith.constant 15 : i64 + %acc_dim3 = arith.constant 15 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_dim2 = arith.constant 15 : i64 + %acc_copy_dim3 = arith.constant 15 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 1 : i64 + %c = arith.constant 1 : i64 + %h = arith.constant 16 : i64 + %w = arith.constant 16 : i64 + %f = arith.constant 1 : i64 + %kh = arith.constant 2 : i64 + %kw = arith.constant 2 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32_2_2_32_32_2_3_3_acc_2() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 2 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 32 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 8 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 2 : i64 + %kernel_dim1 = arith.constant 2 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 9 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 30 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 10 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, 
%acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 30 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 10 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 2 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + + +} diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_large.mlir b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_large.mlir new file mode 100644 index 0000000..a47185c --- /dev/null +++ b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_large.mlir @@ -0,0 +1,10 @@ +func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32(%lhs: tensor<2x4x128x128xf32>, %rhs: tensor<8x4x3x3xf32>, %acc: tensor<2x8x126x126xf32>) -> tensor<2x8x126x126xf32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x4x128x128xf32>, tensor<8x4x3x3xf32>) outs(%acc: tensor<2x8x126x126xf32>) -> tensor<2x8x126x126xf32> + return %result: tensor<2x8x126x126xf32> +} + +func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32(%lhs: tensor<2x3x128x128xf32>, %rhs: tensor<12x3x3x3xf32>, %acc: tensor<2x12x126x126xf32>) -> tensor<2x12x126x126xf32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x3x128x128xf32>, tensor<12x3x3x3xf32>) outs(%acc: tensor<2x12x126x126xf32>) -> tensor<2x12x126x126xf32> + return %result: tensor<2x12x126x126xf32> +} + diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_large_calls.mlir b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_large_calls.mlir new file mode 100644 index 0000000..cdd2788 --- /dev/null +++ b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_large_calls.mlir @@ -0,0 +1,112 @@ +builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, 
%layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32_2_4_128_128_8_3_3_acc_0() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x4x128x128x8x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 4 : i64 + %input_dim2 = arith.constant 128 : i64 + %input_dim3 = arith.constant 128 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 2 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 8 : i64 + %kernel_dim1 = arith.constant 4 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_dim2 = arith.constant 126 : i64 + %acc_dim3 = arith.constant 126 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_dim2 = arith.constant 126 : i64 + %acc_copy_dim3 = arith.constant 126 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 4 : i64 + %h = arith.constant 128 : i64 + %w = arith.constant 128 : i64 + %f = arith.constant 8 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () 
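+  // The check above recomputes the convolution on the host with the scalar
+  // reference implementation, using %input, %kernel, and the pristine %acc,
+  // and compares it element-wise against the device-computed %result.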
+ return +} + +func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32_2_3_128_128_12_3_3_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x3x128x128x12x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 3 : i64 + %input_dim2 = arith.constant 128 : i64 + %input_dim3 = arith.constant 128 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 5 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 12 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 6 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 12 : i64 + %acc_dim2 = arith.constant 126 : i64 + %acc_dim3 = arith.constant 126 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 12 : i64 + %acc_copy_dim2 = arith.constant 126 : i64 + %acc_copy_dim3 = arith.constant 126 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 3 : i64 + %h = arith.constant 128 : i64 + %w = arith.constant 128 : i64 + %f = arith.constant 12 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + + +} diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_medium.mlir b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_medium.mlir new file mode 100644 index 0000000..e0a0376 --- /dev/null +++ b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_medium.mlir @@ -0,0 +1,15 @@ +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%lhs: tensor<2x2x32x32xf32>, %rhs: tensor<2x2x3x3xf32>, %acc: 
tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf32>, tensor<2x2x3x3xf32>) outs(%acc: tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32> + return %result: tensor<2x2x30x30xf32> +} + +func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32(%lhs: tensor<2x2x32x32xf32>, %rhs: tensor<64x2x3x3xf32>, %acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf32>, tensor<64x2x3x3xf32>) outs(%acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> + return %result: tensor<2x64x30x30xf32> +} + +func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f32_f32_f32(%lhs: tensor<2x32x32x32xf32>, %rhs: tensor<64x32x3x3xf32>, %acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x32xf32>, tensor<64x32x3x3xf32>) outs(%acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> + return %result: tensor<2x64x30x30xf32> +} + diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_medium_calls.mlir b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_medium_calls.mlir new file mode 100644 index 0000000..3537bc9 --- /dev/null +++ b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_medium_calls.mlir @@ -0,0 +1,163 @@ +builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32_2_2_32_32_2_3_3_acc_0() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 2 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 32 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 2 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 2 : i64 + %kernel_dim1 = 
arith.constant 2 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 30 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 30 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 2 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32_2_2_32_32_64_3_3_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x64x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 2 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 32 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 5 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 64 : i64 + %kernel_dim1 = arith.constant 2 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 6 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 64 : i64 + %acc_dim2 = arith.constant 30 : i64 + 
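+  // The accumulator dims match the conv output shape. With stride and
+  // dilation 1: OH = (H - (KH - 1) * DH - 1) / SH + 1
+  //               = (32 - 2 - 1) / 1 + 1 = 30.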
%acc_dim3 = arith.constant 30 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 64 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 30 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 64 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f32_f32_f32_2_32_32_32_64_3_3_acc_2() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x32x32x32x64x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 32 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 32 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 8 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 64 : i64 + %kernel_dim1 = arith.constant 32 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 9 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 64 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 30 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 10 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 64 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 30 : i64 + %acc_copy_element_type = 
hal.element_type : i32 + %acc_copy_seed = arith.constant 10 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 32 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 64 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + + +} diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_small.mlir b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_small.mlir new file mode 100644 index 0000000..9ecd2bd --- /dev/null +++ b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_small.mlir @@ -0,0 +1,15 @@ +func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32(%lhs: tensor<1x1x1x1xf32>, %rhs: tensor<1x1x1x1xf32>, %acc: tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%acc: tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> + return %result: tensor<1x1x1x1xf32> +} + +func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32(%lhs: tensor<1x1x16x16xf32>, %rhs: tensor<1x1x2x2xf32>, %acc: tensor<1x1x15x15xf32>) -> tensor<1x1x15x15xf32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x16x16xf32>, tensor<1x1x2x2xf32>) outs(%acc: tensor<1x1x15x15xf32>) -> tensor<1x1x15x15xf32> + return %result: tensor<1x1x15x15xf32> +} + +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%lhs: tensor<2x2x32x32xf32>, %rhs: tensor<2x2x3x3xf32>, %acc: tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf32>, tensor<2x2x3x3xf32>) outs(%acc: tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32> + return %result: tensor<2x2x30x30xf32> +} + diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_small_calls.mlir b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_small_calls.mlir new file mode 100644 index 0000000..092bd67 --- /dev/null +++ b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_small_calls.mlir @@ -0,0 +1,163 @@ +builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private 
@conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) + +func.func private @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32_1_1_1_1_1_1_1_acc_0() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x1x1x1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 1 : i64 + %input_dim1 = arith.constant 1 : i64 + %input_dim2 = arith.constant 1 : i64 + %input_dim3 = arith.constant 1 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 2 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 1 : i64 + %kernel_dim1 = arith.constant 1 : i64 + %kernel_dim2 = arith.constant 1 : i64 + %kernel_dim3 = arith.constant 1 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_dim2 = arith.constant 1 : i64 + %acc_dim3 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_dim2 = arith.constant 1 : i64 + %acc_copy_dim3 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 1 : i64 + %c = arith.constant 1 : i64 + %h = arith.constant 1 : i64 + %w = arith.constant 1 : i64 + %f = arith.constant 1 : i64 + %kh = arith.constant 1 : i64 + %kw = arith.constant 1 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call 
@conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32_1_1_16_16_1_2_2_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x16x16x1x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 1 : i64 + %input_dim1 = arith.constant 1 : i64 + %input_dim2 = arith.constant 16 : i64 + %input_dim3 = arith.constant 16 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 5 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 1 : i64 + %kernel_dim1 = arith.constant 1 : i64 + %kernel_dim2 = arith.constant 2 : i64 + %kernel_dim3 = arith.constant 2 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 6 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_dim2 = arith.constant 15 : i64 + %acc_dim3 = arith.constant 15 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_dim2 = arith.constant 15 : i64 + %acc_copy_dim3 = arith.constant 15 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 1 : i64 + %c = arith.constant 1 : i64 + %h = arith.constant 16 : i64 + %w = arith.constant 16 : i64 + %f = arith.constant 1 : i64 + %kh = arith.constant 2 : i64 + %kw = arith.constant 2 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32_2_2_32_32_2_3_3_acc_2() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} +} { + %device_index = arith.constant 0 
: index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 2 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 32 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 8 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 2 : i64 + %kernel_dim1 = arith.constant 2 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 9 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 30 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 10 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 30 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 10 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 2 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} + + +} diff --git a/linalg_ops/iree-e2e-conv2d-test.cc b/linalg_ops/iree-e2e-conv2d-test.cc new file mode 100644 index 0000000..686fb4e --- /dev/null +++ b/linalg_ops/iree-e2e-conv2d-test.cc @@ -0,0 +1,775 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/flags.h"
+#include "iree/base/internal/math.h"
+#include "iree/hal/api.h"
+#include "iree/modules/hal/module.h"
+#include "iree/tooling/context_util.h"
+#include "iree/tooling/device_util.h"
+#include "iree/vm/api.h"
+#include "iree/vm/native_module_cc.h"
+#include "tools/testing/e2e/test_utils.h"
+
+//===----------------------------------------------------------------------===//
+// Reference conv2d (NCHW-FCHW) and (NHWC-HWCF)
+//===----------------------------------------------------------------------===//
+
+// Converts 4D indices in row-major order to a 1D index: the strides for
+// (n, c, h, w) are (channels * height * width, height * width, width, 1).
+static int convert_to_1d_index(iree_hal_dim_t channels, iree_hal_dim_t height,
+                               iree_hal_dim_t width, iree_hal_dim_t n,
+                               iree_hal_dim_t c, iree_hal_dim_t h,
+                               iree_hal_dim_t w) {
+  return n * (channels * height * width) + c * (height * width) + h * width +
+         w;
+}
+
+// [f16 <= f16 * f16 + f16]
+static void reference_conv2d_f16_f16_f16_f16(
+    iree_hal_dim_t n_size, iree_hal_dim_t c_size, iree_hal_dim_t h_size,
+    iree_hal_dim_t w_size, iree_hal_dim_t f_size, iree_hal_dim_t kh_size,
+    iree_hal_dim_t kw_size, iree_hal_dim_t layout, iree_hal_dim_t sh_size,
+    iree_hal_dim_t sw_size, iree_hal_dim_t dh_size, iree_hal_dim_t dw_size,
+    iree_hal_dim_t oh_size, iree_hal_dim_t ow_size, const uint16_t* input_data,
+    const uint16_t* kernel_data, const uint16_t* acc_data,
+    uint16_t* result_data, iree_hal_dim_t n, iree_hal_dim_t oc,
+    iree_hal_dim_t oh, iree_hal_dim_t ow) {
+  if (layout == 0) {
+    // The layout of output tensor is NxfxOHxOW
+    iree_hal_dim_t out_idx =
+        convert_to_1d_index(f_size, oh_size, ow_size, n, oc, oh, ow);
+
+    float acc = acc_data ? iree_math_f16_to_f32(acc_data[out_idx]) : 0.f;
+
+    for (iree_hal_dim_t ic = 0; ic < c_size; ++ic) {
+      for (iree_hal_dim_t kh = 0; kh < kh_size; ++kh) {
+        for (iree_hal_dim_t kw = 0; kw < kw_size; ++kw) {
+          iree_hal_dim_t inp_idx = convert_to_1d_index(
+              c_size, h_size, w_size, n, ic, (oh * sh_size + kh * dh_size),
+              (ow * sw_size + kw * dw_size));
+          iree_hal_dim_t krnl_idx =
+              convert_to_1d_index(c_size, kh_size, kw_size, oc, ic, kh, kw);
+
+          acc += iree_math_f16_to_f32(input_data[inp_idx]) *
+                 iree_math_f16_to_f32(kernel_data[krnl_idx]);
+        }
+      }
+      result_data[out_idx] = iree_math_f32_to_f16(acc);
+    }
+  } else if (layout == 1) {
+    // The layout of output tensor is NxOHxOWxf
+    iree_hal_dim_t out_idx =
+        convert_to_1d_index(oh_size, ow_size, f_size, n, oh, ow, oc);
+
+    float acc = acc_data ?
iree_math_f16_to_f32(acc_data[out_idx]) : 0.f; + + for (iree_hal_dim_t kh = 0; kh < kh_size; ++kh) { + for (iree_hal_dim_t kw = 0; kw < kw_size; ++kw) { + for (iree_hal_dim_t ic = 0; ic < c_size; ++ic) { + iree_hal_dim_t inp_idx = convert_to_1d_index( + h_size, w_size, c_size, n, (oh * sh_size + kh * dh_size), + (ow * sw_size + kw * dw_size), ic); + iree_hal_dim_t krnl_idx = + convert_to_1d_index(kw_size, c_size, f_size, kh, kw, ic, oc); + + acc += iree_math_f16_to_f32(input_data[inp_idx]) * + iree_math_f16_to_f32(kernel_data[krnl_idx]); + } + } + } + result_data[out_idx] = iree_math_f32_to_f16(acc); + } +} + +// [f32 <= f16 * f16 + f32] +static void reference_conv2d_f16_f16_f32_f32( + iree_hal_dim_t n_size, iree_hal_dim_t c_size, iree_hal_dim_t h_size, + iree_hal_dim_t w_size, iree_hal_dim_t f_size, iree_hal_dim_t kh_size, + iree_hal_dim_t kw_size, iree_hal_dim_t layout, iree_hal_dim_t sh_size, + iree_hal_dim_t sw_size, iree_hal_dim_t dh_size, iree_hal_dim_t dw_size, + iree_hal_dim_t oh_size, iree_hal_dim_t ow_size, const uint16_t* input_data, + const uint16_t* kernel_data, const float* acc_data, float* result_data, + iree_hal_dim_t n, iree_hal_dim_t oc, iree_hal_dim_t oh, iree_hal_dim_t ow) { + if (layout == 0) { + // The layout of output tensor is NxfxOHxOW + iree_hal_dim_t out_idx = + convert_to_1d_index(f_size, oh_size, ow_size, n, oc, oh, ow); + + float acc = acc_data ? acc_data[out_idx] : 0.f; + + for (iree_hal_dim_t ic = 0; ic < c_size; ++ic) { + for (iree_hal_dim_t kh = 0; kh < kh_size; ++kh) { + for (iree_hal_dim_t kw = 0; kw < kw_size; ++kw) { + iree_hal_dim_t inp_idx = convert_to_1d_index( + c_size, h_size, w_size, n, ic, (oh * sh_size + kh * dh_size), + (ow * sw_size + kw * dw_size)); + iree_hal_dim_t krnl_idx = + convert_to_1d_index(c_size, kh_size, kw_size, oc, ic, kh, kw); + + acc += iree_math_f16_to_f32(input_data[inp_idx]) * + iree_math_f16_to_f32(kernel_data[krnl_idx]); + } + } + result_data[out_idx] = acc; + } + } else if (layout == 1) { + // The layout of output tensor is NxOHxOWxf + iree_hal_dim_t out_idx = + convert_to_1d_index(oh_size, ow_size, f_size, n, oh, ow, oc); + + float acc = acc_data ? acc_data[out_idx] : 0.f; + + for (iree_hal_dim_t kh = 0; kh < kh_size; ++kh) { + for (iree_hal_dim_t kw = 0; kw < kw_size; ++kw) { + for (iree_hal_dim_t ic = 0; ic < c_size; ++ic) { + iree_hal_dim_t inp_idx = convert_to_1d_index( + h_size, w_size, c_size, n, (oh * sh_size + kh * dh_size), + (ow * sw_size + kw * dw_size), ic); + iree_hal_dim_t krnl_idx = + convert_to_1d_index(kw_size, c_size, f_size, kh, kw, ic, oc); + + acc += iree_math_f16_to_f32(input_data[inp_idx]) * + iree_math_f16_to_f32(kernel_data[krnl_idx]); + } + } + } + result_data[out_idx] = acc; + } +} + +// [i32 <= i8 * i8 + i32] +static void reference_conv2d_i8_i8_i32_i32( + iree_hal_dim_t n_size, iree_hal_dim_t c_size, iree_hal_dim_t h_size, + iree_hal_dim_t w_size, iree_hal_dim_t f_size, iree_hal_dim_t kh_size, + iree_hal_dim_t kw_size, iree_hal_dim_t layout, iree_hal_dim_t sh_size, + iree_hal_dim_t sw_size, iree_hal_dim_t dh_size, iree_hal_dim_t dw_size, + iree_hal_dim_t oh_size, iree_hal_dim_t ow_size, const int8_t* input_data, + const int8_t* kernel_data, const int32_t* acc_data, int32_t* result_data, + iree_hal_dim_t n, iree_hal_dim_t oc, iree_hal_dim_t oh, iree_hal_dim_t ow) { + if (layout == 0) { + // The layout of output tensor is NxfxOHxOW + iree_hal_dim_t out_idx = + convert_to_1d_index(f_size, oh_size, ow_size, n, oc, oh, ow); + + int32_t acc = acc_data ? 
acc_data[out_idx] : 0; + + for (iree_hal_dim_t ic = 0; ic < c_size; ++ic) { + for (iree_hal_dim_t kh = 0; kh < kh_size; ++kh) { + for (iree_hal_dim_t kw = 0; kw < kw_size; ++kw) { + iree_hal_dim_t inp_idx = convert_to_1d_index( + c_size, h_size, w_size, n, ic, (oh * sh_size + kh * dh_size), + (ow * sw_size + kw * dw_size)); + iree_hal_dim_t krnl_idx = + convert_to_1d_index(c_size, kh_size, kw_size, oc, ic, kh, kw); + + int8_t lhs_value = input_data[inp_idx]; + int8_t rhs_value = kernel_data[krnl_idx]; + acc += (int32_t)lhs_value * (int32_t)rhs_value; + } + } + result_data[out_idx] = acc; + } + } else if (layout == 1) { + // The layout of output tensor is NxOHxOWxf + iree_hal_dim_t out_idx = + convert_to_1d_index(oh_size, ow_size, f_size, n, oh, ow, oc); + + int32_t acc = acc_data ? acc_data[out_idx] : 0; + + for (iree_hal_dim_t kh = 0; kh < kh_size; ++kh) { + for (iree_hal_dim_t kw = 0; kw < kw_size; ++kw) { + for (iree_hal_dim_t ic = 0; ic < c_size; ++ic) { + iree_hal_dim_t inp_idx = convert_to_1d_index( + h_size, w_size, c_size, n, (oh * sh_size + kh * dh_size), + (ow * sw_size + kw * dw_size), ic); + iree_hal_dim_t krnl_idx = + convert_to_1d_index(kw_size, c_size, f_size, kh, kw, ic, oc); + + int8_t lhs_value = input_data[inp_idx]; + int8_t rhs_value = kernel_data[krnl_idx]; + acc += (int32_t)lhs_value * (int32_t)rhs_value; + } + } + } + result_data[out_idx] = acc; + } +} + +static void reference_conv2d_f32_f32_f32_f32( + iree_hal_dim_t n_size, iree_hal_dim_t c_size, iree_hal_dim_t h_size, + iree_hal_dim_t w_size, iree_hal_dim_t f_size, iree_hal_dim_t kh_size, + iree_hal_dim_t kw_size, iree_hal_dim_t layout, iree_hal_dim_t sh_size, + iree_hal_dim_t sw_size, iree_hal_dim_t dh_size, iree_hal_dim_t dw_size, + iree_hal_dim_t oh_size, iree_hal_dim_t ow_size, const float* input_data, + const float* kernel_data, const float* acc_data, float* result_data, + iree_hal_dim_t n, iree_hal_dim_t oc, iree_hal_dim_t oh, iree_hal_dim_t ow) { + if (layout == 0) { + // The layout of output tensor is NxfxOHxOW + iree_hal_dim_t out_idx = + convert_to_1d_index(f_size, oh_size, ow_size, n, oc, oh, ow); + + float acc = acc_data ? acc_data[out_idx] : 0; + + for (iree_hal_dim_t ic = 0; ic < c_size; ++ic) { + for (iree_hal_dim_t kh = 0; kh < kh_size; ++kh) { + for (iree_hal_dim_t kw = 0; kw < kw_size; ++kw) { + iree_hal_dim_t inp_idx = convert_to_1d_index( + c_size, h_size, w_size, n, ic, (oh * sh_size + kh * dh_size), + (ow * sw_size + kw * dw_size)); + iree_hal_dim_t krnl_idx = + convert_to_1d_index(c_size, kh_size, kw_size, oc, ic, kh, kw); + + acc += input_data[inp_idx] * kernel_data[krnl_idx]; + } + } + result_data[out_idx] = acc; + } + } else if (layout == 1) { + // The layout of output tensor is NxOHxOWxf + iree_hal_dim_t out_idx = + convert_to_1d_index(oh_size, ow_size, f_size, n, oh, ow, oc); + + float acc = acc_data ? acc_data[out_idx] : 0; + + for (iree_hal_dim_t kh = 0; kh < kh_size; ++kh) { + for (iree_hal_dim_t kw = 0; kw < kw_size; ++kw) { + for (iree_hal_dim_t ic = 0; ic < c_size; ++ic) { + iree_hal_dim_t inp_idx = convert_to_1d_index( + h_size, w_size, c_size, n, (oh * sh_size + kh * dh_size), + (ow * sw_size + kw * dw_size), ic); + iree_hal_dim_t krnl_idx = + convert_to_1d_index(kw_size, c_size, f_size, kh, kw, ic, oc); + acc += input_data[inp_idx] * kernel_data[krnl_idx]; + } + } + } + result_data[out_idx] = acc; + } +} + +// Helper for reference_conv2d. 
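+// Dispatches on the (input, kernel, acc) element-type triple and computes a
+// single output element at (n, oc, oh, ow) with the matching scalar routine
+// above.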
+static iree_status_t reference_conv2d_element(
+    iree_hal_dim_t n_size, iree_hal_dim_t c_size, iree_hal_dim_t h_size,
+    iree_hal_dim_t w_size, iree_hal_dim_t f_size, iree_hal_dim_t kh_size,
+    iree_hal_dim_t kw_size, iree_hal_dim_t layout, iree_hal_dim_t sh_size,
+    iree_hal_dim_t sw_size, iree_hal_dim_t dh_size, iree_hal_dim_t dw_size,
+    iree_hal_dim_t oh_size, iree_hal_dim_t ow_size,
+    iree_hal_element_type_t input_type, iree_hal_element_type_t kernel_type,
+    iree_hal_element_type_t acc_type, void* input_data, void* kernel_data,
+    void* acc_data, void* result_data, iree_hal_dim_t n, iree_hal_dim_t oc,
+    iree_hal_dim_t oh, iree_hal_dim_t ow) {
+  if (input_type == IREE_HAL_ELEMENT_TYPE_FLOAT_32 &&
+      kernel_type == IREE_HAL_ELEMENT_TYPE_FLOAT_32 &&
+      acc_type == IREE_HAL_ELEMENT_TYPE_FLOAT_32) {
+    reference_conv2d_f32_f32_f32_f32(
+        n_size, c_size, h_size, w_size, f_size, kh_size, kw_size, layout,
+        sh_size, sw_size, dh_size, dw_size, oh_size, ow_size,
+        (const float*)input_data, (const float*)kernel_data,
+        (const float*)acc_data, (float*)result_data, n, oc, oh, ow);
+  } else if (input_type == IREE_HAL_ELEMENT_TYPE_FLOAT_16 &&
+             kernel_type == IREE_HAL_ELEMENT_TYPE_FLOAT_16 &&
+             acc_type == IREE_HAL_ELEMENT_TYPE_FLOAT_16) {
+    reference_conv2d_f16_f16_f16_f16(
+        n_size, c_size, h_size, w_size, f_size, kh_size, kw_size, layout,
+        sh_size, sw_size, dh_size, dw_size, oh_size, ow_size,
+        (const uint16_t*)input_data, (const uint16_t*)kernel_data,
+        (const uint16_t*)acc_data, (uint16_t*)result_data, n, oc, oh, ow);
+  } else if (input_type == IREE_HAL_ELEMENT_TYPE_FLOAT_16 &&
+             kernel_type == IREE_HAL_ELEMENT_TYPE_FLOAT_16 &&
+             acc_type == IREE_HAL_ELEMENT_TYPE_FLOAT_32) {
+    reference_conv2d_f16_f16_f32_f32(
+        n_size, c_size, h_size, w_size, f_size, kh_size, kw_size, layout,
+        sh_size, sw_size, dh_size, dw_size, oh_size, ow_size,
+        (const uint16_t*)input_data, (const uint16_t*)kernel_data,
+        (const float*)acc_data, (float*)result_data, n, oc, oh, ow);
+  } else if (input_type == IREE_HAL_ELEMENT_TYPE_INT_8 &&
+             kernel_type == IREE_HAL_ELEMENT_TYPE_INT_8 &&
+             acc_type == IREE_HAL_ELEMENT_TYPE_INT_32) {
+    reference_conv2d_i8_i8_i32_i32(
+        n_size, c_size, h_size, w_size, f_size, kh_size, kw_size, layout,
+        sh_size, sw_size, dh_size, dw_size, oh_size, ow_size,
+        (const int8_t*)input_data, (const int8_t*)kernel_data,
+        (const int32_t*)acc_data, (int32_t*)result_data, n, oc, oh, ow);
+  } else {
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "unhandled combination of element types in conv2d");
+  }
+  return iree_ok_status();
+}
+
+// Calculates the output spatial size from the input size, kernel size,
+// stride, and dilation (no padding):
+//   out = floor((i - (k - 1) * d - 1) / s) + 1
+// e.g. i = 32, k = 3, s = 1, d = 1 gives (32 - 2 - 1) / 1 + 1 = 30.
+static iree_hal_dim_t out_shape_calc(iree_hal_dim_t i_shape,
+                                     iree_hal_dim_t k_shape,
+                                     iree_hal_dim_t stride,
+                                     iree_hal_dim_t dilation) {
+  iree_hal_dim_t x = (k_shape - 1) * (dilation - 1);
+  x = i_shape - k_shape - x;
+  // Integer division floors already, so no explicit floor() is needed.
+  return x / stride + 1;
+}
+
+// Reference conv2d implementation (NCHW-FCHW or NHWC-HWCF, selected by
+// layout), used to compare conv2d results against.
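+// The output-element loops follow each layout's memory order: layout 0 (NCHW)
+// iterates (n, f, oh, ow) and layout 1 (NHWC) iterates (n, oh, ow, f); any
+// other layout value fails with IREE_STATUS_INVALID_ARGUMENT.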
+static iree_status_t reference_conv2d( + iree_hal_dim_t n_size, iree_hal_dim_t c_size, iree_hal_dim_t h_size, + iree_hal_dim_t w_size, iree_hal_dim_t f_size, iree_hal_dim_t kh_size, + iree_hal_dim_t kw_size, iree_hal_dim_t layout, iree_hal_dim_t sh_size, + iree_hal_dim_t sw_size, iree_hal_dim_t dh_size, iree_hal_dim_t dw_size, + iree_hal_element_type_t input_type, iree_hal_element_type_t kernel_type, + iree_hal_element_type_t acc_type, iree_byte_span_t input_contents, + iree_byte_span_t kernel_contents, iree_byte_span_t acc_contents, + iree_byte_span_t result_contents, int compute_every) { + IREE_TRACE_ZONE_BEGIN(z0); + IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, n_size); + IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, c_size); + IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, h_size); + IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, w_size); + + iree_hal_dim_t oh_size = out_shape_calc(h_size, kh_size, sh_size, dh_size); + iree_hal_dim_t ow_size = out_shape_calc(w_size, kw_size, sw_size, dw_size); + + if (layout == 0) { + for (iree_hal_dim_t n = 0; n < n_size; ++n) { + for (iree_hal_dim_t oc = 0; oc < f_size; ++oc) { + for (iree_hal_dim_t oh = 0; oh < oh_size; ++oh) { + for (iree_hal_dim_t ow = 0; ow < ow_size; ++ow) { + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, + reference_conv2d_element( + n_size, c_size, h_size, w_size, f_size, kh_size, kw_size, + layout, sh_size, sw_size, dh_size, dw_size, oh_size, + ow_size, input_type, kernel_type, acc_type, + input_contents.data, kernel_contents.data, + acc_contents.data, result_contents.data, n, oc, oh, ow)); + } + } + } + } + } else if (layout == 1) { + for (iree_hal_dim_t n = 0; n < n_size; ++n) { + for (iree_hal_dim_t oh = 0; oh < oh_size; ++oh) { + for (iree_hal_dim_t ow = 0; ow < ow_size; ++ow) { + for (iree_hal_dim_t oc = 0; oc < f_size; ++oc) { + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, + reference_conv2d_element( + n_size, c_size, h_size, w_size, f_size, kh_size, kw_size, + layout, sh_size, sw_size, dh_size, dw_size, oh_size, + ow_size, input_type, kernel_type, acc_type, + input_contents.data, kernel_contents.data, + acc_contents.data, result_contents.data, n, oc, oh, ow)); + } + } + } + } + } else { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "unhandled conv2d layout"); + } + + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +//===----------------------------------------------------------------------===// +// Conv2d comparison/logging +//===----------------------------------------------------------------------===// + +typedef struct { + iree_allocator_t host_allocator; + iree_hal_dim_t n; // batch dim + iree_hal_dim_t c; // input channels + iree_hal_dim_t h; // input height + iree_hal_dim_t w; // input width + iree_hal_dim_t f; // output channels + iree_hal_dim_t kh; // kernel height + iree_hal_dim_t kw; // kernel width + iree_hal_dim_t layout; // conv layout, 0 : nchwxfchw (default); 1: nhwcxhwcf + iree_hal_dim_t sh; // stride along height dim + iree_hal_dim_t sw; // stride along width dim + iree_hal_dim_t dh; // dilation along height dim + iree_hal_dim_t dw; // dilation along width dim + iree_hal_element_type_t input_type; + iree_hal_element_type_t kernel_type; + iree_hal_element_type_t acc_type; + iree_hal_element_type_t result_type; + iree_byte_span_t input_contents; + iree_byte_span_t kernel_contents; + iree_byte_span_t acc_contents; + iree_byte_span_t actual_contents; + iree_byte_span_t expected_contents; +} conv2d_results_t; + +static void conv2d_results_deinitialize(conv2d_results_t* results); + +static iree_status_t conv2d_results_initialize( + 
iree_hal_device_t* device, iree_hal_dim_t n_size, iree_hal_dim_t c_size, + iree_hal_dim_t h_size, iree_hal_dim_t w_size, iree_hal_dim_t f_size, + iree_hal_dim_t kh_size, iree_hal_dim_t kw_size, iree_hal_dim_t layout, + iree_hal_dim_t sh_size, iree_hal_dim_t sw_size, iree_hal_dim_t dh_size, + iree_hal_dim_t dw_size, iree_hal_buffer_view_t* input, + iree_hal_buffer_view_t* kernel, iree_hal_buffer_view_t* acc, + iree_hal_buffer_view_t* result, iree_allocator_t host_allocator, + conv2d_results_t* out_results) { + IREE_TRACE_ZONE_BEGIN(z0); + + memset(out_results, 0, sizeof(*out_results)); + out_results->host_allocator = host_allocator; + + out_results->n = n_size; + out_results->c = c_size; + out_results->h = h_size; + out_results->w = w_size; + out_results->f = f_size; + out_results->kh = kh_size; + out_results->kw = kw_size; + out_results->layout = layout; + out_results->sh = sh_size; + out_results->sw = sw_size; + out_results->dh = dh_size; + out_results->dw = dw_size; + + out_results->input_type = iree_hal_buffer_view_element_type(input); + out_results->kernel_type = iree_hal_buffer_view_element_type(kernel); + out_results->acc_type = iree_hal_buffer_view_element_type(acc); + out_results->result_type = iree_hal_buffer_view_element_type(result); + + iree_hal_buffer_t* input_buffer = iree_hal_buffer_view_buffer(input); + iree_hal_buffer_t* kernel_buffer = iree_hal_buffer_view_buffer(kernel); + iree_hal_buffer_t* acc_buffer = acc ? iree_hal_buffer_view_buffer(acc) : NULL; + iree_hal_buffer_t* result_buffer = iree_hal_buffer_view_buffer(result); + + iree_status_t status = iree_ok_status(); + + if (iree_status_is_ok(status)) { + out_results->input_contents.data_length = + iree_hal_buffer_byte_length(input_buffer); + status = iree_allocator_malloc(host_allocator, + out_results->input_contents.data_length, + (void**)&out_results->input_contents.data); + } + if (iree_status_is_ok(status)) { + status = iree_hal_device_transfer_d2h( + device, input_buffer, 0, out_results->input_contents.data, + out_results->input_contents.data_length, + IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()); + } + + if (iree_status_is_ok(status)) { + out_results->kernel_contents.data_length = + iree_hal_buffer_byte_length(kernel_buffer); + status = iree_allocator_malloc(host_allocator, + out_results->kernel_contents.data_length, + (void**)&out_results->kernel_contents.data); + } + if (iree_status_is_ok(status)) { + status = iree_hal_device_transfer_d2h( + device, kernel_buffer, 0, out_results->kernel_contents.data, + out_results->kernel_contents.data_length, + IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()); + } + + if (acc_buffer) { + if (iree_status_is_ok(status)) { + out_results->acc_contents.data_length = + iree_hal_buffer_byte_length(acc_buffer); + status = iree_allocator_malloc(host_allocator, + out_results->acc_contents.data_length, + (void**)&out_results->acc_contents.data); + } + if (iree_status_is_ok(status)) { + status = iree_hal_device_transfer_d2h( + device, acc_buffer, 0, out_results->acc_contents.data, + out_results->acc_contents.data_length, + IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()); + } + } + + if (iree_status_is_ok(status)) { + out_results->actual_contents.data_length = + iree_hal_buffer_byte_length(result_buffer); + status = iree_allocator_malloc(host_allocator, + out_results->actual_contents.data_length, + (void**)&out_results->actual_contents.data); + } + if (iree_status_is_ok(status)) { + status = iree_hal_device_transfer_d2h( + device, 
result_buffer, 0, out_results->actual_contents.data,
+        out_results->actual_contents.data_length,
+        IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout());
+  }
+
+  if (iree_status_is_ok(status)) {
+    out_results->expected_contents.data_length =
+        iree_hal_buffer_byte_length(result_buffer);
+    status = iree_allocator_malloc(
+        host_allocator, out_results->expected_contents.data_length,
+        (void**)&out_results->expected_contents.data);
+  }
+
+  if (!iree_status_is_ok(status)) {
+    conv2d_results_deinitialize(out_results);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+static void conv2d_results_deinitialize(conv2d_results_t* results) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_allocator_free(results->host_allocator, results->input_contents.data);
+  iree_allocator_free(results->host_allocator, results->kernel_contents.data);
+  if (!iree_byte_span_is_empty(results->acc_contents)) {
+    iree_allocator_free(results->host_allocator, results->acc_contents.data);
+  }
+  iree_allocator_free(results->host_allocator, results->actual_contents.data);
+  iree_allocator_free(results->host_allocator,
+                      results->expected_contents.data);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Helper for check_conv2d_results: the actual interesting part once we've
+// obtained and validated the {n, f, oh, ow}_size values. On error, the first
+// index at which the actual and expected values disagree is reported. TODO:
+// Add detailed logging to |file|.
+static iree_status_t check_conv2d_results_impl(FILE* file,
+                                               const conv2d_results_t* results,
+                                               int check_every) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0,
+      reference_conv2d(
+          results->n, results->c, results->h, results->w, results->f,
+          results->kh, results->kw, results->layout, results->sh, results->sw,
+          results->dh, results->dw, results->input_type, results->kernel_type,
+          results->acc_type, results->input_contents,
+          results->kernel_contents, results->acc_contents,
+          results->expected_contents, check_every));
+
+  int count = 0;
+
+  iree_hal_dim_t oh_size =
+      out_shape_calc(results->h, results->kh, results->sh, results->dh);
+  iree_hal_dim_t ow_size =
+      out_shape_calc(results->w, results->kw, results->sw, results->dw);
+
+  for (iree_hal_dim_t n = 0; n < results->n; ++n) {
+    for (iree_hal_dim_t oc = 0; oc < results->f; ++oc) {
+      for (iree_hal_dim_t oh = 0; oh < oh_size; ++oh) {
+        for (iree_hal_dim_t ow = 0; ow < ow_size; ++ow) {
+          if (++count < check_every) continue;
+          count = 0;
+          iree_hal_dim_t idx = convert_to_1d_index(results->f, oh_size,
+                                                   ow_size, n, oc, oh, ow);
+          iree_test_utils_e2e_value_t actual_value =
+              iree_test_utils_read_buffer_element(
+                  idx, results->result_type, results->actual_contents.data);
+          iree_test_utils_e2e_value_t expected_value =
+              iree_test_utils_read_buffer_element(
+                  idx, results->result_type, results->expected_contents.data);
+          if (!iree_test_utils_result_elements_agree(actual_value,
+                                                     expected_value)) {
+            fprintf(
+                file,
+                "\n\nerror: the actual and expected result tensors disagree "
+                "at n %" PRIdim ", oc %" PRIdim ", oh %" PRIdim ", ow %" PRIdim
+                ".\n\n",
+                n, oc, oh, ow);
+            IREE_TRACE_ZONE_END(z0);
+            return iree_make_status(IREE_STATUS_ABORTED);
+          }
+        }
+      }
+    }
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
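check_conv2d_results_impl above subsamples comparisons through its check_every parameter: the ++count/continue pattern visits only every check_every-th output element. A standalone demo of that skip pattern (values are illustrative; the conv2d test currently passes 1, so every element is checked):

#include <stdio.h>

int main(void) {
  int check_every = 4;  // hypothetical subsampling stride
  int count = 0;
  for (int idx = 0; idx < 16; ++idx) {
    if (++count < check_every) continue;  // skip all but every 4th element
    count = 0;
    printf("comparing element %d\n", idx);  // prints idx = 3, 7, 11, 15
  }
  return 0;
}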
+// Given an actual conv2d's inputs and output (all host-local), uses a
+// reference conv2d implementation on the same inputs to check if the output
+// is correct. On error, reports the first index at which the actual and
+// expected values disagree. TODO: Add detailed logging to |file|.
+static iree_status_t check_conv2d_results(FILE* file,
+                                          const conv2d_results_t* results) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  // TODO: Increase the check_every param to reduce the number of comparisons.
+  int check_every = 1;
+  iree_status_t status = check_conv2d_results_impl(file, results, check_every);
+  if (!iree_status_is_ok(status) && check_every > 1) {
+    // If we got a failure with check_every > 1, that didn't log a useful
+    // numerical summary, as most of the reference tensor entries hadn't been
+    // computed. Rerun now with check_every = 1 to get that numerical logging.
+    iree_status_ignore(status);
+    status = check_conv2d_results_impl(file, results, 1);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+//===----------------------------------------------------------------------===//
+// `conv2d_test` custom module
+//===----------------------------------------------------------------------===//
+// This uses the C++ wrapper to keep things simple. Though easier to use, it
+// adds overhead/code-size bloat that doesn't matter in a test like this. A C
+// module builder API that removes the boilerplate is TBD, so everything in
+// this file besides this module is written in C so that it can be swapped
+// back to pure C in the future.
+
+namespace iree {
+
+class Conv2dTestModuleState final {
+ public:
+  explicit Conv2dTestModuleState(iree_allocator_t host_allocator)
+      : host_allocator_(host_allocator) {}
+  ~Conv2dTestModuleState() = default;
+
+  // Fills the destination span with pseudorandom values of the given
+  // |element_type|. The given |seed| is passed to the pseudorandom generator.
+  // The pseudorandom values are reproducible both across runs and across
+  // machines.
+  StatusOr<vm::ref<iree_hal_buffer_view_t>> GenerateRandom4dTensor(
+      const vm::ref<iree_hal_device_t> device, int64_t dim0, int64_t dim1,
+      int64_t dim2, int64_t dim3, iree_hal_element_type_t element_type,
+      int32_t seed) {
+    iree_hal_dim_t dims[4] = {
+        (iree_hal_dim_t)dim0,
+        (iree_hal_dim_t)dim1,
+        (iree_hal_dim_t)dim2,
+        (iree_hal_dim_t)dim3,
+    };
+    iree_hal_buffer_params_t buffer_params = {0};
+    buffer_params.usage = IREE_HAL_BUFFER_USAGE_DEFAULT;
+    buffer_params.access = IREE_HAL_MEMORY_ACCESS_ALL;
+    buffer_params.type = IREE_HAL_MEMORY_TYPE_OPTIMAL_FOR_DEVICE;
+    vm::ref<iree_hal_buffer_view_t> result_view;
+    struct callback_state_t {
+      iree_hal_element_type_t element_type;
+      int32_t seed;
+    } callback_state = {
+        element_type,
+        seed,
+    };
+    IREE_RETURN_IF_ERROR(iree_hal_buffer_view_generate_buffer(
+        device.get(), iree_hal_device_allocator(device.get()),
+        IREE_ARRAYSIZE(dims), dims, element_type,
+        IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, buffer_params,
+        +[](iree_hal_buffer_mapping_t* mapping, void* user_data) {
+          callback_state_t callback_state = *(callback_state_t*)user_data;
+          iree_byte_span_t span = mapping->contents;
+          // Generate "uniform" integer-valued numbers in the range
+          // [min, max].
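+          // Worked example with hypothetical bounds (the real ones come from
+          // iree_test_utils_get_min_max_for_element_type below, and
+          // iree_test_utils_pseudorandom_range is assumed to yield values in
+          // [0, range)): if min = -8 and max = 7, then
+          // range = (7 - (-8) + 1) / 4 = 4, so every generated value lands
+          // in [min, min + range), i.e. {-8, -7, -6, -5}.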
+          int32_t min = 0;
+          int32_t max = 0;
+          iree_test_utils_get_min_max_for_element_type(
+              callback_state.element_type, &min, &max);
+          // Use only a quarter of the representable test range to keep
+          // numerical behavior stable.
+          uint32_t range = (max - min + 1) / 4;
+          iree_host_size_t element_byte_count =
+              iree_hal_element_dense_byte_count(callback_state.element_type);
+          uint8_t* data_end = span.data + span.data_length;
+          uint32_t state = callback_state.seed;
+          for (uint8_t* data = span.data; data < data_end;
+               data += element_byte_count) {
+            int32_t value =
+                (int32_t)iree_test_utils_pseudorandom_range(&state, range) +
+                min;
+            iree_test_utils_write_element(callback_state.element_type, value,
+                                          data);
+          }
+          return iree_ok_status();
+        },
+        &callback_state, &result_view));
+    return std::move(result_view);
+  }
+
+  Status CheckConv2dResults(
+      const vm::ref<iree_hal_device_t> device, int64_t n, int64_t c, int64_t h,
+      int64_t w, int64_t f, int64_t kh, int64_t kw, int64_t layout, int64_t sh,
+      int64_t sw, int64_t dh, int64_t dw,
+      const vm::ref<iree_hal_buffer_view_t> input,
+      const vm::ref<iree_hal_buffer_view_t> kernel,
+      const vm::ref<iree_hal_buffer_view_t> acc,
+      const vm::ref<iree_hal_buffer_view_t> actual_result) {
+    conv2d_results_t results = {};
+    IREE_RETURN_IF_ERROR(conv2d_results_initialize(
+        device.get(), (iree_hal_dim_t)n, (iree_hal_dim_t)c, (iree_hal_dim_t)h,
+        (iree_hal_dim_t)w, (iree_hal_dim_t)f, (iree_hal_dim_t)kh,
+        (iree_hal_dim_t)kw, (iree_hal_dim_t)layout, (iree_hal_dim_t)sh,
+        (iree_hal_dim_t)sw, (iree_hal_dim_t)dh, (iree_hal_dim_t)dw,
+        input.get(), kernel.get(), acc.get(), actual_result.get(),
+        host_allocator_, &results));
+    iree_status_t status = check_conv2d_results(stderr, &results);
+    conv2d_results_deinitialize(&results);
+    return status;
+  }
+
+ private:
+  iree_allocator_t host_allocator_;
+};
+
+static const vm::NativeFunction<Conv2dTestModuleState>
+    kConv2dTestModuleFunctions[] = {
+        vm::MakeNativeFunction("generate_random_tensor",
+                               &Conv2dTestModuleState::GenerateRandom4dTensor),
+        vm::MakeNativeFunction("check_conv2d_results",
+                               &Conv2dTestModuleState::CheckConv2dResults),
+};
+
+struct Conv2dTestModule final
+    : public vm::NativeModule<Conv2dTestModuleState> {
+  using vm::NativeModule<Conv2dTestModuleState>::NativeModule;
+  StatusOr<std::unique_ptr<Conv2dTestModuleState>> CreateState(
+      iree_allocator_t host_allocator) override {
+    return std::make_unique<Conv2dTestModuleState>(host_allocator);
+  }
+};
+
+}  // namespace iree
+
+static iree_status_t conv2d_test_module_create(iree_vm_instance_t* instance,
+                                               iree_allocator_t host_allocator,
+                                               iree_vm_module_t** out_module) {
+  IREE_ASSERT_ARGUMENT(out_module);
+  *out_module = NULL;
+  auto module = std::make_unique<iree::Conv2dTestModule>(
+      "conv2d_test", /*version=*/0, instance, host_allocator,
+      iree::span<const iree::vm::NativeFunction<iree::Conv2dTestModuleState>>(
+          iree::kConv2dTestModuleFunctions));
+  *out_module = module.release()->interface();
+  return iree_ok_status();
+}
+
+int main(int argc, char** argv) {
+  IREE_TRACE_APP_ENTER();
+
+  iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_DEFAULT, &argc, &argv);
+  if (argc != 1) {
+    fprintf(stderr, "use --module= flags to specify the modules to run\n");
+    IREE_TRACE_APP_EXIT(EXIT_FAILURE);
+    return EXIT_FAILURE;
+  }
+
+  iree_status_t status = iree_test_utils_load_and_run_e2e_tests(
+      iree_allocator_system(), conv2d_test_module_create);
+  int exit_code = EXIT_SUCCESS;
+  if (!iree_status_is_ok(status)) {
+    iree_status_fprint(stderr, status);
+    bool is_unavailable = iree_status_is_unavailable(status);
+    iree_status_free(status);
+    exit_code = is_unavailable ?
EXIT_SUCCESS : EXIT_FAILURE; + } + + IREE_TRACE_APP_EXIT(exit_code); + return exit_code; +} \ No newline at end of file diff --git a/linalg_ops/test_utils.c b/linalg_ops/test_utils.c index 8b8aecd..659ea2c 100644 --- a/linalg_ops/test_utils.c +++ b/linalg_ops/test_utils.c @@ -193,10 +193,8 @@ bool iree_test_utils_result_elements_agree(iree_test_utils_e2e_value_t expected, // expected values. Inexact results are only permitted when the // `require_exact_results` flag is set to `false`. case IREE_TEST_UTILS_VALUE_TYPE_F16: - if (actual.f16_u16 == expected.f16_u16) return true; - if (iree_test_utils_max_elements_to_check()) return false; - return fabsf(iree_math_f16_to_f32(actual.f16_u16) - - iree_math_f16_to_f32(expected.f16_u16)) < + if (actual.f16 == expected.f16) return true; + return fabsf((actual.f16) - (expected.f16)) < acceptable_fp_delta; case IREE_TEST_UTILS_VALUE_TYPE_BF16: if (actual.bf16_u16 == expected.bf16_u16) return true; diff --git a/linalg_ops/test_utils.h b/linalg_ops/test_utils.h index 626097b..f86986b 100644 --- a/linalg_ops/test_utils.h +++ b/linalg_ops/test_utils.h @@ -62,6 +62,7 @@ typedef struct iree_test_utils_value_t { int16_t i16; int32_t i32; int64_t i64; + float f16; float f32; uint16_t f16_u16; uint16_t bf16_u16; From 89f8ac623ef7c01262d1223cdb07ab41b5e684c8 Mon Sep 17 00:00:00 2001 From: erman-gurses Date: Mon, 21 Oct 2024 00:51:35 -0500 Subject: [PATCH 02/16] Add formatting Signed-off-by: erman-gurses --- linalg_ops/convolution/CMakeLists.txt | 39 +-------------------------- 1 file changed, 1 insertion(+), 38 deletions(-) diff --git a/linalg_ops/convolution/CMakeLists.txt b/linalg_ops/convolution/CMakeLists.txt index e9a57b5..db18238 100644 --- a/linalg_ops/convolution/CMakeLists.txt +++ b/linalg_ops/convolution/CMakeLists.txt @@ -72,41 +72,4 @@ foreach(_DTYPE IN LISTS _DTYPES) "default" ) endforeach() -endforeach() - -############################################################################### -# -# GPU - ROCm/HIP, default flags. 
-# -############################################################################### - -# if(IREE_HIP_TEST_TARGET_CHIP) - -# set(_DTYPES) -# list(APPEND _DTYPES "f16_f16_f16") -# list(APPEND _DTYPES "f32_f32_f32") - -# foreach(_DTYPE IN LISTS _DTYPES) -# foreach(_SIZE IN LISTS _SIZES) -# iree_test_suites_runner_test( -# NAME -# matmul_hip_${_DTYPE}_${_SIZE} -# TESTS_SRC -# "generated/${_DTYPE}/matmul_${_DTYPE}_${_SIZE}.mlir" -# CALLS_SRC -# "generated/${_DTYPE}/matmul_${_DTYPE}_${_SIZE}_calls.mlir" -# TEST_RUNNER -# iree-test-suites_iree-e2e-matmul-test -# TARGET_BACKEND -# "rocm" -# DRIVER -# "hip" -# COMPILER_FLAGS -# "--iree-hip-target=${IREE_HIP_TEST_TARGET_CHIP}" -# RUNNER_FLAGS -# LABELS -# ) -# endforeach() -# endforeach() - -# endif() \ No newline at end of file +endforeach() \ No newline at end of file From 5323f0a39a63ad251317ce7088dd792819324802 Mon Sep 17 00:00:00 2001 From: erman-gurses Date: Mon, 21 Oct 2024 01:29:17 -0500 Subject: [PATCH 03/16] Add formatting Signed-off-by: erman-gurses --- linalg_ops/convolution/CMakeLists.txt | 2 +- linalg_ops/convolution/generate_e2e_conv2d_tests.py | 2 +- linalg_ops/convolution/generate_test_mlir_files.sh | 6 ------ linalg_ops/iree-e2e-conv2d-test.cc | 4 ++-- 4 files changed, 4 insertions(+), 10 deletions(-) diff --git a/linalg_ops/convolution/CMakeLists.txt b/linalg_ops/convolution/CMakeLists.txt index db18238..5b5f32e 100644 --- a/linalg_ops/convolution/CMakeLists.txt +++ b/linalg_ops/convolution/CMakeLists.txt @@ -72,4 +72,4 @@ foreach(_DTYPE IN LISTS _DTYPES) "default" ) endforeach() -endforeach() \ No newline at end of file +endforeach() diff --git a/linalg_ops/convolution/generate_e2e_conv2d_tests.py b/linalg_ops/convolution/generate_e2e_conv2d_tests.py index a492eb4..fdb6037 100644 --- a/linalg_ops/convolution/generate_e2e_conv2d_tests.py +++ b/linalg_ops/convolution/generate_e2e_conv2d_tests.py @@ -690,4 +690,4 @@ def main(args): if __name__ == "__main__": - main(parse_arguments()) \ No newline at end of file + main(parse_arguments()) diff --git a/linalg_ops/convolution/generate_test_mlir_files.sh b/linalg_ops/convolution/generate_test_mlir_files.sh index 35c4a7a..e742257 100755 --- a/linalg_ops/convolution/generate_test_mlir_files.sh +++ b/linalg_ops/convolution/generate_test_mlir_files.sh @@ -97,9 +97,3 @@ for type_combination in ${type_combinations[@]}; do --shapes=${shape} done done - -# input_type;kernel_type;acc_type -type_combinations=( - "f16;f16;f16" - "f32;f32;f32" -) \ No newline at end of file diff --git a/linalg_ops/iree-e2e-conv2d-test.cc b/linalg_ops/iree-e2e-conv2d-test.cc index 686fb4e..6bc2565 100644 --- a/linalg_ops/iree-e2e-conv2d-test.cc +++ b/linalg_ops/iree-e2e-conv2d-test.cc @@ -18,7 +18,7 @@ #include "iree/tooling/device_util.h" #include "iree/vm/api.h" #include "iree/vm/native_module_cc.h" -#include "tools/testing/e2e/test_utils.h" +#include "test_utils.h" //===----------------------------------------------------------------------===// // Reference conv2d (NCHW-FCHW) and (NHWC-HWCF) @@ -772,4 +772,4 @@ int main(int argc, char** argv) { IREE_TRACE_APP_EXIT(exit_code); return exit_code; -} \ No newline at end of file +} From 02407acd03722f24e10019c5398ac1ea5f93e6cd Mon Sep 17 00:00:00 2001 From: erman-gurses Date: Mon, 21 Oct 2024 01:34:36 -0500 Subject: [PATCH 04/16] Add formatting Signed-off-by: erman-gurses --- linalg_ops/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linalg_ops/CMakeLists.txt b/linalg_ops/CMakeLists.txt index c6a14aa..015b318 100644 --- 
a/linalg_ops/CMakeLists.txt +++ b/linalg_ops/CMakeLists.txt @@ -143,4 +143,4 @@ include(iree_test_suites_native_test) include(iree_test_suites_runner_test) add_subdirectory(matmul) -add_subdirectory(convolution) \ No newline at end of file +add_subdirectory(convolution) From f3573407618db561d3674348b5aff03878cf7834 Mon Sep 17 00:00:00 2001 From: erman-gurses Date: Mon, 21 Oct 2024 16:35:12 -0500 Subject: [PATCH 05/16] Remove winograd specific .mlir code generation Signed-off-by: erman-gurses --- .../convolution/generate_test_mlir_files.sh | 9 - .../conv2d_winograd_f16_f16_f16_large.mlir | 10 -- ...nv2d_winograd_f16_f16_f16_large_calls.mlir | 112 ------------ .../conv2d_winograd_f16_f16_f16_medium.mlir | 15 -- ...v2d_winograd_f16_f16_f16_medium_calls.mlir | 163 ------------------ .../conv2d_winograd_f16_f16_f16_small.mlir | 15 -- ...nv2d_winograd_f16_f16_f16_small_calls.mlir | 163 ------------------ .../conv2d_winograd_f32_f32_f32_large.mlir | 10 -- ...nv2d_winograd_f32_f32_f32_large_calls.mlir | 112 ------------ .../conv2d_winograd_f32_f32_f32_medium.mlir | 15 -- ...v2d_winograd_f32_f32_f32_medium_calls.mlir | 163 ------------------ .../conv2d_winograd_f32_f32_f32_small.mlir | 15 -- ...nv2d_winograd_f32_f32_f32_small_calls.mlir | 163 ------------------ 13 files changed, 965 deletions(-) delete mode 100644 linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_large.mlir delete mode 100644 linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_large_calls.mlir delete mode 100644 linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_medium.mlir delete mode 100644 linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_medium_calls.mlir delete mode 100644 linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_small.mlir delete mode 100644 linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_small_calls.mlir delete mode 100644 linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_large.mlir delete mode 100644 linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_large_calls.mlir delete mode 100644 linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_medium.mlir delete mode 100644 linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_medium_calls.mlir delete mode 100644 linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_small.mlir delete mode 100644 linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_small_calls.mlir diff --git a/linalg_ops/convolution/generate_test_mlir_files.sh b/linalg_ops/convolution/generate_test_mlir_files.sh index e742257..dc3581a 100755 --- a/linalg_ops/convolution/generate_test_mlir_files.sh +++ b/linalg_ops/convolution/generate_test_mlir_files.sh @@ -86,14 +86,5 @@ for type_combination in ${type_combinations[@]}; do --kernel_type=${kernel_type} \ --acc_type=${acc_type} \ --shapes=${shape} - - name="conv2d_winograd_${type_name}_${shape}" - python ${this_dir}/generate_e2e_conv2d_tests.py \ - --output_conv2d_mlir=${type_combination_dir}/${name}.mlir \ - --output_calls_mlir=${type_combination_dir}/${name}_calls.mlir \ - --input_type=${input_type} \ - --kernel_type=${kernel_type} \ - --acc_type=${acc_type} \ - --shapes=${shape} done done diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_large.mlir b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_large.mlir deleted file mode 100644 index 
51c8a1e..0000000 --- a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_large.mlir +++ /dev/null @@ -1,10 +0,0 @@ -func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16(%lhs: tensor<2x4x128x128xf16>, %rhs: tensor<8x4x3x3xf16>, %acc: tensor<2x8x126x126xf16>) -> tensor<2x8x126x126xf16> { - %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x4x128x128xf16>, tensor<8x4x3x3xf16>) outs(%acc: tensor<2x8x126x126xf16>) -> tensor<2x8x126x126xf16> - return %result: tensor<2x8x126x126xf16> -} - -func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16(%lhs: tensor<2x3x128x128xf16>, %rhs: tensor<12x3x3x3xf16>, %acc: tensor<2x12x126x126xf16>) -> tensor<2x12x126x126xf16> { - %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x3x128x128xf16>, tensor<12x3x3x3xf16>) outs(%acc: tensor<2x12x126x126xf16>) -> tensor<2x12x126x126xf16> - return %result: tensor<2x12x126x126xf16> -} - diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_large_calls.mlir b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_large_calls.mlir deleted file mode 100644 index 7dfb92f..0000000 --- a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_large_calls.mlir +++ /dev/null @@ -1,112 +0,0 @@ -builtin.module @calls attributes { - -} { - -func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view -func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) - -func.func private @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view -func.func private @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view - -func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16_2_4_128_128_8_3_3_acc_0() attributes { - iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x4x128x128x8x3x3"} -} { - %device_index = arith.constant 0 : index - %device = hal.devices.get %device_index : !hal.device - %input_dim0 = arith.constant 2 : i64 - %input_dim1 = arith.constant 4 : i64 - %input_dim2 = arith.constant 128 : i64 - %input_dim3 = arith.constant 128 : i64 - %input_element_type = hal.element_type : i32 - %input_seed = arith.constant 2 : i32 - %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %kernel_dim0 = arith.constant 8 : i64 - %kernel_dim1 = arith.constant 4 : i64 - %kernel_dim2 = arith.constant 3 : i64 - %kernel_dim3 = arith.constant 3 : i64 - %kernel_element_type = hal.element_type : i32 - %kernel_seed = arith.constant 3 : i32 - %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : 
(!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_dim0 = arith.constant 2 : i64 - %acc_dim1 = arith.constant 8 : i64 - %acc_dim2 = arith.constant 126 : i64 - %acc_dim3 = arith.constant 126 : i64 - %acc_element_type = hal.element_type : i32 - %acc_seed = arith.constant 4 : i32 - %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_copy_dim0 = arith.constant 2 : i64 - %acc_copy_dim1 = arith.constant 8 : i64 - %acc_copy_dim2 = arith.constant 126 : i64 - %acc_copy_dim3 = arith.constant 126 : i64 - %acc_copy_element_type = hal.element_type : i32 - %acc_copy_seed = arith.constant 4 : i32 - %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %result = call @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view - %n = arith.constant 2 : i64 - %c = arith.constant 4 : i64 - %h = arith.constant 128 : i64 - %w = arith.constant 128 : i64 - %f = arith.constant 8 : i64 - %kh = arith.constant 3 : i64 - %kw = arith.constant 3 : i64 - %layout = arith.constant 0 : i64 - %sh = arith.constant 1 : i64 - %sw = arith.constant 1 : i64 - %dh = arith.constant 1 : i64 - %dw = arith.constant 1 : i64 - call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () - return -} - -func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16_2_3_128_128_12_3_3_acc_1() attributes { - iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x3x128x128x12x3x3"} -} { - %device_index = arith.constant 0 : index - %device = hal.devices.get %device_index : !hal.device - %input_dim0 = arith.constant 2 : i64 - %input_dim1 = arith.constant 3 : i64 - %input_dim2 = arith.constant 128 : i64 - %input_dim3 = arith.constant 128 : i64 - %input_element_type = hal.element_type : i32 - %input_seed = arith.constant 5 : i32 - %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %kernel_dim0 = arith.constant 12 : i64 - %kernel_dim1 = arith.constant 3 : i64 - %kernel_dim2 = arith.constant 3 : i64 - %kernel_dim3 = arith.constant 3 : i64 - %kernel_element_type = hal.element_type : i32 - %kernel_seed = arith.constant 6 : i32 - %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_dim0 = arith.constant 2 : i64 - %acc_dim1 = arith.constant 12 : i64 - %acc_dim2 = arith.constant 126 : i64 - %acc_dim3 = arith.constant 126 : i64 - %acc_element_type = hal.element_type : i32 - %acc_seed = arith.constant 7 : i32 - %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_copy_dim0 
= arith.constant 2 : i64 - %acc_copy_dim1 = arith.constant 12 : i64 - %acc_copy_dim2 = arith.constant 126 : i64 - %acc_copy_dim3 = arith.constant 126 : i64 - %acc_copy_element_type = hal.element_type : i32 - %acc_copy_seed = arith.constant 7 : i32 - %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %result = call @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view - %n = arith.constant 2 : i64 - %c = arith.constant 3 : i64 - %h = arith.constant 128 : i64 - %w = arith.constant 128 : i64 - %f = arith.constant 12 : i64 - %kh = arith.constant 3 : i64 - %kw = arith.constant 3 : i64 - %layout = arith.constant 0 : i64 - %sh = arith.constant 1 : i64 - %sw = arith.constant 1 : i64 - %dh = arith.constant 1 : i64 - %dw = arith.constant 1 : i64 - call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () - return -} - - -} diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_medium.mlir b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_medium.mlir deleted file mode 100644 index a2564aa..0000000 --- a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_medium.mlir +++ /dev/null @@ -1,15 +0,0 @@ -func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%lhs: tensor<2x2x32x32xf16>, %rhs: tensor<2x2x3x3xf16>, %acc: tensor<2x2x30x30xf16>) -> tensor<2x2x30x30xf16> { - %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf16>, tensor<2x2x3x3xf16>) outs(%acc: tensor<2x2x30x30xf16>) -> tensor<2x2x30x30xf16> - return %result: tensor<2x2x30x30xf16> -} - -func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16(%lhs: tensor<2x2x32x32xf16>, %rhs: tensor<64x2x3x3xf16>, %acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16> { - %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf16>, tensor<64x2x3x3xf16>) outs(%acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16> - return %result: tensor<2x64x30x30xf16> -} - -func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f16(%lhs: tensor<2x32x32x32xf16>, %rhs: tensor<64x32x3x3xf16>, %acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16> { - %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x32xf16>, tensor<64x32x3x3xf16>) outs(%acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16> - return %result: tensor<2x64x30x30xf16> -} - diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_medium_calls.mlir b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_medium_calls.mlir deleted file mode 100644 index c6e86d7..0000000 --- a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_medium_calls.mlir +++ /dev/null @@ -1,163 +0,0 @@ -builtin.module @calls attributes { - 
-} { - -func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view -func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) - -func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view -func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view -func.func private @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view - -func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16_2_2_32_32_2_3_3_acc_0() attributes { - iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} -} { - %device_index = arith.constant 0 : index - %device = hal.devices.get %device_index : !hal.device - %input_dim0 = arith.constant 2 : i64 - %input_dim1 = arith.constant 2 : i64 - %input_dim2 = arith.constant 32 : i64 - %input_dim3 = arith.constant 32 : i64 - %input_element_type = hal.element_type : i32 - %input_seed = arith.constant 2 : i32 - %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %kernel_dim0 = arith.constant 2 : i64 - %kernel_dim1 = arith.constant 2 : i64 - %kernel_dim2 = arith.constant 3 : i64 - %kernel_dim3 = arith.constant 3 : i64 - %kernel_element_type = hal.element_type : i32 - %kernel_seed = arith.constant 3 : i32 - %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_dim0 = arith.constant 2 : i64 - %acc_dim1 = arith.constant 2 : i64 - %acc_dim2 = arith.constant 30 : i64 - %acc_dim3 = arith.constant 30 : i64 - %acc_element_type = hal.element_type : i32 - %acc_seed = arith.constant 4 : i32 - %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_copy_dim0 = arith.constant 2 : i64 - %acc_copy_dim1 = arith.constant 2 : i64 - %acc_copy_dim2 = arith.constant 30 : i64 - %acc_copy_dim3 = arith.constant 30 : i64 - %acc_copy_element_type = hal.element_type : i32 - %acc_copy_seed = arith.constant 4 : i32 - %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view - %n = arith.constant 2 : i64 - %c = arith.constant 2 : i64 - %h = arith.constant 32 : i64 - %w = arith.constant 32 : i64 - %f = arith.constant 2 : i64 - %kh = arith.constant 3 : i64 
- %kw = arith.constant 3 : i64 - %layout = arith.constant 0 : i64 - %sh = arith.constant 1 : i64 - %sw = arith.constant 1 : i64 - %dh = arith.constant 1 : i64 - %dw = arith.constant 1 : i64 - call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () - return -} - -func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16_2_2_32_32_64_3_3_acc_1() attributes { - iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x64x3x3"} -} { - %device_index = arith.constant 0 : index - %device = hal.devices.get %device_index : !hal.device - %input_dim0 = arith.constant 2 : i64 - %input_dim1 = arith.constant 2 : i64 - %input_dim2 = arith.constant 32 : i64 - %input_dim3 = arith.constant 32 : i64 - %input_element_type = hal.element_type : i32 - %input_seed = arith.constant 5 : i32 - %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %kernel_dim0 = arith.constant 64 : i64 - %kernel_dim1 = arith.constant 2 : i64 - %kernel_dim2 = arith.constant 3 : i64 - %kernel_dim3 = arith.constant 3 : i64 - %kernel_element_type = hal.element_type : i32 - %kernel_seed = arith.constant 6 : i32 - %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_dim0 = arith.constant 2 : i64 - %acc_dim1 = arith.constant 64 : i64 - %acc_dim2 = arith.constant 30 : i64 - %acc_dim3 = arith.constant 30 : i64 - %acc_element_type = hal.element_type : i32 - %acc_seed = arith.constant 7 : i32 - %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_copy_dim0 = arith.constant 2 : i64 - %acc_copy_dim1 = arith.constant 64 : i64 - %acc_copy_dim2 = arith.constant 30 : i64 - %acc_copy_dim3 = arith.constant 30 : i64 - %acc_copy_element_type = hal.element_type : i32 - %acc_copy_seed = arith.constant 7 : i32 - %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view - %n = arith.constant 2 : i64 - %c = arith.constant 2 : i64 - %h = arith.constant 32 : i64 - %w = arith.constant 32 : i64 - %f = arith.constant 64 : i64 - %kh = arith.constant 3 : i64 - %kw = arith.constant 3 : i64 - %layout = arith.constant 0 : i64 - %sh = arith.constant 1 : i64 - %sw = arith.constant 1 : i64 - %dh = arith.constant 1 : i64 - %dw = arith.constant 1 : i64 - call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () - return -} - -func.func 
@conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f16_2_32_32_32_64_3_3_acc_2() attributes { - iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x32x32x32x64x3x3"} -} { - %device_index = arith.constant 0 : index - %device = hal.devices.get %device_index : !hal.device - %input_dim0 = arith.constant 2 : i64 - %input_dim1 = arith.constant 32 : i64 - %input_dim2 = arith.constant 32 : i64 - %input_dim3 = arith.constant 32 : i64 - %input_element_type = hal.element_type : i32 - %input_seed = arith.constant 8 : i32 - %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %kernel_dim0 = arith.constant 64 : i64 - %kernel_dim1 = arith.constant 32 : i64 - %kernel_dim2 = arith.constant 3 : i64 - %kernel_dim3 = arith.constant 3 : i64 - %kernel_element_type = hal.element_type : i32 - %kernel_seed = arith.constant 9 : i32 - %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_dim0 = arith.constant 2 : i64 - %acc_dim1 = arith.constant 64 : i64 - %acc_dim2 = arith.constant 30 : i64 - %acc_dim3 = arith.constant 30 : i64 - %acc_element_type = hal.element_type : i32 - %acc_seed = arith.constant 10 : i32 - %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_copy_dim0 = arith.constant 2 : i64 - %acc_copy_dim1 = arith.constant 64 : i64 - %acc_copy_dim2 = arith.constant 30 : i64 - %acc_copy_dim3 = arith.constant 30 : i64 - %acc_copy_element_type = hal.element_type : i32 - %acc_copy_seed = arith.constant 10 : i32 - %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %result = call @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view - %n = arith.constant 2 : i64 - %c = arith.constant 32 : i64 - %h = arith.constant 32 : i64 - %w = arith.constant 32 : i64 - %f = arith.constant 64 : i64 - %kh = arith.constant 3 : i64 - %kw = arith.constant 3 : i64 - %layout = arith.constant 0 : i64 - %sh = arith.constant 1 : i64 - %sw = arith.constant 1 : i64 - %dh = arith.constant 1 : i64 - %dw = arith.constant 1 : i64 - call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () - return -} - - -} diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_small.mlir b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_small.mlir deleted file mode 100644 index ddbe425..0000000 --- a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_small.mlir +++ /dev/null @@ -1,15 +0,0 @@ -func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16(%lhs: tensor<1x1x1x1xf16>, %rhs: tensor<1x1x1x1xf16>, %acc: tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16> 
{ - %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16>) outs(%acc: tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16> - return %result: tensor<1x1x1x1xf16> -} - -func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16(%lhs: tensor<1x1x16x16xf16>, %rhs: tensor<1x1x2x2xf16>, %acc: tensor<1x1x15x15xf16>) -> tensor<1x1x15x15xf16> { - %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x16x16xf16>, tensor<1x1x2x2xf16>) outs(%acc: tensor<1x1x15x15xf16>) -> tensor<1x1x15x15xf16> - return %result: tensor<1x1x15x15xf16> -} - -func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%lhs: tensor<2x2x32x32xf16>, %rhs: tensor<2x2x3x3xf16>, %acc: tensor<2x2x30x30xf16>) -> tensor<2x2x30x30xf16> { - %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf16>, tensor<2x2x3x3xf16>) outs(%acc: tensor<2x2x30x30xf16>) -> tensor<2x2x30x30xf16> - return %result: tensor<2x2x30x30xf16> -} - diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_small_calls.mlir b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_small_calls.mlir deleted file mode 100644 index 872c618..0000000 --- a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_winograd_f16_f16_f16_small_calls.mlir +++ /dev/null @@ -1,163 +0,0 @@ -builtin.module @calls attributes { - -} { - -func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view -func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) - -func.func private @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view -func.func private @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view -func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view - -func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16_1_1_1_1_1_1_1_acc_0() attributes { - iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x1x1x1x1x1"} -} { - %device_index = arith.constant 0 : index - %device = hal.devices.get %device_index : !hal.device - %input_dim0 = arith.constant 1 : i64 - %input_dim1 = arith.constant 1 : i64 - %input_dim2 = arith.constant 1 : i64 - %input_dim3 = arith.constant 1 : i64 - %input_element_type = hal.element_type : i32 - %input_seed = arith.constant 2 : i32 - %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %kernel_dim0 = arith.constant 1 : i64 - %kernel_dim1 = arith.constant 1 : i64 - %kernel_dim2 = arith.constant 1 : i64 - %kernel_dim3 = arith.constant 1 
: i64 - %kernel_element_type = hal.element_type : i32 - %kernel_seed = arith.constant 3 : i32 - %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_dim0 = arith.constant 1 : i64 - %acc_dim1 = arith.constant 1 : i64 - %acc_dim2 = arith.constant 1 : i64 - %acc_dim3 = arith.constant 1 : i64 - %acc_element_type = hal.element_type : i32 - %acc_seed = arith.constant 4 : i32 - %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_copy_dim0 = arith.constant 1 : i64 - %acc_copy_dim1 = arith.constant 1 : i64 - %acc_copy_dim2 = arith.constant 1 : i64 - %acc_copy_dim3 = arith.constant 1 : i64 - %acc_copy_element_type = hal.element_type : i32 - %acc_copy_seed = arith.constant 4 : i32 - %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %result = call @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view - %n = arith.constant 1 : i64 - %c = arith.constant 1 : i64 - %h = arith.constant 1 : i64 - %w = arith.constant 1 : i64 - %f = arith.constant 1 : i64 - %kh = arith.constant 1 : i64 - %kw = arith.constant 1 : i64 - %layout = arith.constant 0 : i64 - %sh = arith.constant 1 : i64 - %sw = arith.constant 1 : i64 - %dh = arith.constant 1 : i64 - %dw = arith.constant 1 : i64 - call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () - return -} - -func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16_1_1_16_16_1_2_2_acc_1() attributes { - iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x16x16x1x2x2"} -} { - %device_index = arith.constant 0 : index - %device = hal.devices.get %device_index : !hal.device - %input_dim0 = arith.constant 1 : i64 - %input_dim1 = arith.constant 1 : i64 - %input_dim2 = arith.constant 16 : i64 - %input_dim3 = arith.constant 16 : i64 - %input_element_type = hal.element_type : i32 - %input_seed = arith.constant 5 : i32 - %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %kernel_dim0 = arith.constant 1 : i64 - %kernel_dim1 = arith.constant 1 : i64 - %kernel_dim2 = arith.constant 2 : i64 - %kernel_dim3 = arith.constant 2 : i64 - %kernel_element_type = hal.element_type : i32 - %kernel_seed = arith.constant 6 : i32 - %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_dim0 = arith.constant 1 : i64 - %acc_dim1 = arith.constant 1 : i64 - %acc_dim2 = arith.constant 15 : i64 - %acc_dim3 = arith.constant 15 : i64 - %acc_element_type = hal.element_type : i32 - %acc_seed = arith.constant 7 : 
i32 - %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_copy_dim0 = arith.constant 1 : i64 - %acc_copy_dim1 = arith.constant 1 : i64 - %acc_copy_dim2 = arith.constant 15 : i64 - %acc_copy_dim3 = arith.constant 15 : i64 - %acc_copy_element_type = hal.element_type : i32 - %acc_copy_seed = arith.constant 7 : i32 - %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %result = call @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view - %n = arith.constant 1 : i64 - %c = arith.constant 1 : i64 - %h = arith.constant 16 : i64 - %w = arith.constant 16 : i64 - %f = arith.constant 1 : i64 - %kh = arith.constant 2 : i64 - %kw = arith.constant 2 : i64 - %layout = arith.constant 0 : i64 - %sh = arith.constant 1 : i64 - %sw = arith.constant 1 : i64 - %dh = arith.constant 1 : i64 - %dw = arith.constant 1 : i64 - call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () - return -} - -func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16_2_2_32_32_2_3_3_acc_2() attributes { - iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} -} { - %device_index = arith.constant 0 : index - %device = hal.devices.get %device_index : !hal.device - %input_dim0 = arith.constant 2 : i64 - %input_dim1 = arith.constant 2 : i64 - %input_dim2 = arith.constant 32 : i64 - %input_dim3 = arith.constant 32 : i64 - %input_element_type = hal.element_type : i32 - %input_seed = arith.constant 8 : i32 - %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %kernel_dim0 = arith.constant 2 : i64 - %kernel_dim1 = arith.constant 2 : i64 - %kernel_dim2 = arith.constant 3 : i64 - %kernel_dim3 = arith.constant 3 : i64 - %kernel_element_type = hal.element_type : i32 - %kernel_seed = arith.constant 9 : i32 - %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_dim0 = arith.constant 2 : i64 - %acc_dim1 = arith.constant 2 : i64 - %acc_dim2 = arith.constant 30 : i64 - %acc_dim3 = arith.constant 30 : i64 - %acc_element_type = hal.element_type : i32 - %acc_seed = arith.constant 10 : i32 - %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_copy_dim0 = arith.constant 2 : i64 - %acc_copy_dim1 = arith.constant 2 : i64 - %acc_copy_dim2 = arith.constant 30 : i64 - %acc_copy_dim3 = arith.constant 30 : i64 - %acc_copy_element_type = hal.element_type : i32 - %acc_copy_seed = arith.constant 10 : i32 - %acc_copy = call @conv2d_test.generate_random_tensor(%device, 
%acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view - %n = arith.constant 2 : i64 - %c = arith.constant 2 : i64 - %h = arith.constant 32 : i64 - %w = arith.constant 32 : i64 - %f = arith.constant 2 : i64 - %kh = arith.constant 3 : i64 - %kw = arith.constant 3 : i64 - %layout = arith.constant 0 : i64 - %sh = arith.constant 1 : i64 - %sw = arith.constant 1 : i64 - %dh = arith.constant 1 : i64 - %dw = arith.constant 1 : i64 - call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () - return -} - - -} diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_large.mlir b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_large.mlir deleted file mode 100644 index a47185c..0000000 --- a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_large.mlir +++ /dev/null @@ -1,10 +0,0 @@ -func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32(%lhs: tensor<2x4x128x128xf32>, %rhs: tensor<8x4x3x3xf32>, %acc: tensor<2x8x126x126xf32>) -> tensor<2x8x126x126xf32> { - %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x4x128x128xf32>, tensor<8x4x3x3xf32>) outs(%acc: tensor<2x8x126x126xf32>) -> tensor<2x8x126x126xf32> - return %result: tensor<2x8x126x126xf32> -} - -func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32(%lhs: tensor<2x3x128x128xf32>, %rhs: tensor<12x3x3x3xf32>, %acc: tensor<2x12x126x126xf32>) -> tensor<2x12x126x126xf32> { - %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x3x128x128xf32>, tensor<12x3x3x3xf32>) outs(%acc: tensor<2x12x126x126xf32>) -> tensor<2x12x126x126xf32> - return %result: tensor<2x12x126x126xf32> -} - diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_large_calls.mlir b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_large_calls.mlir deleted file mode 100644 index cdd2788..0000000 --- a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_large_calls.mlir +++ /dev/null @@ -1,112 +0,0 @@ -builtin.module @calls attributes { - -} { - -func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view -func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) - -func.func private @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view -func.func private @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32(%input: 
!hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view - -func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32_2_4_128_128_8_3_3_acc_0() attributes { - iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x4x128x128x8x3x3"} -} { - %device_index = arith.constant 0 : index - %device = hal.devices.get %device_index : !hal.device - %input_dim0 = arith.constant 2 : i64 - %input_dim1 = arith.constant 4 : i64 - %input_dim2 = arith.constant 128 : i64 - %input_dim3 = arith.constant 128 : i64 - %input_element_type = hal.element_type : i32 - %input_seed = arith.constant 2 : i32 - %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %kernel_dim0 = arith.constant 8 : i64 - %kernel_dim1 = arith.constant 4 : i64 - %kernel_dim2 = arith.constant 3 : i64 - %kernel_dim3 = arith.constant 3 : i64 - %kernel_element_type = hal.element_type : i32 - %kernel_seed = arith.constant 3 : i32 - %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_dim0 = arith.constant 2 : i64 - %acc_dim1 = arith.constant 8 : i64 - %acc_dim2 = arith.constant 126 : i64 - %acc_dim3 = arith.constant 126 : i64 - %acc_element_type = hal.element_type : i32 - %acc_seed = arith.constant 4 : i32 - %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_copy_dim0 = arith.constant 2 : i64 - %acc_copy_dim1 = arith.constant 8 : i64 - %acc_copy_dim2 = arith.constant 126 : i64 - %acc_copy_dim3 = arith.constant 126 : i64 - %acc_copy_element_type = hal.element_type : i32 - %acc_copy_seed = arith.constant 4 : i32 - %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %result = call @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view - %n = arith.constant 2 : i64 - %c = arith.constant 4 : i64 - %h = arith.constant 128 : i64 - %w = arith.constant 128 : i64 - %f = arith.constant 8 : i64 - %kh = arith.constant 3 : i64 - %kw = arith.constant 3 : i64 - %layout = arith.constant 0 : i64 - %sh = arith.constant 1 : i64 - %sw = arith.constant 1 : i64 - %dh = arith.constant 1 : i64 - %dw = arith.constant 1 : i64 - call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () - return -} - -func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32_2_3_128_128_12_3_3_acc_1() attributes { - iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x3x128x128x12x3x3"} -} { - %device_index = arith.constant 0 : index - %device = hal.devices.get %device_index : !hal.device - %input_dim0 = arith.constant 2 : i64 - %input_dim1 = arith.constant 3 : i64 - %input_dim2 = arith.constant 128 : i64 - 
%input_dim3 = arith.constant 128 : i64 - %input_element_type = hal.element_type : i32 - %input_seed = arith.constant 5 : i32 - %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %kernel_dim0 = arith.constant 12 : i64 - %kernel_dim1 = arith.constant 3 : i64 - %kernel_dim2 = arith.constant 3 : i64 - %kernel_dim3 = arith.constant 3 : i64 - %kernel_element_type = hal.element_type : i32 - %kernel_seed = arith.constant 6 : i32 - %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_dim0 = arith.constant 2 : i64 - %acc_dim1 = arith.constant 12 : i64 - %acc_dim2 = arith.constant 126 : i64 - %acc_dim3 = arith.constant 126 : i64 - %acc_element_type = hal.element_type : i32 - %acc_seed = arith.constant 7 : i32 - %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_copy_dim0 = arith.constant 2 : i64 - %acc_copy_dim1 = arith.constant 12 : i64 - %acc_copy_dim2 = arith.constant 126 : i64 - %acc_copy_dim3 = arith.constant 126 : i64 - %acc_copy_element_type = hal.element_type : i32 - %acc_copy_seed = arith.constant 7 : i32 - %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %result = call @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view - %n = arith.constant 2 : i64 - %c = arith.constant 3 : i64 - %h = arith.constant 128 : i64 - %w = arith.constant 128 : i64 - %f = arith.constant 12 : i64 - %kh = arith.constant 3 : i64 - %kw = arith.constant 3 : i64 - %layout = arith.constant 0 : i64 - %sh = arith.constant 1 : i64 - %sw = arith.constant 1 : i64 - %dh = arith.constant 1 : i64 - %dw = arith.constant 1 : i64 - call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () - return -} - - -} diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_medium.mlir b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_medium.mlir deleted file mode 100644 index e0a0376..0000000 --- a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_medium.mlir +++ /dev/null @@ -1,15 +0,0 @@ -func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%lhs: tensor<2x2x32x32xf32>, %rhs: tensor<2x2x3x3xf32>, %acc: tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32> { - %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf32>, tensor<2x2x3x3xf32>) outs(%acc: tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32> - return %result: tensor<2x2x30x30xf32> -} - -func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32(%lhs: tensor<2x2x32x32xf32>, %rhs: 
tensor<64x2x3x3xf32>, %acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> { - %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf32>, tensor<64x2x3x3xf32>) outs(%acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> - return %result: tensor<2x64x30x30xf32> -} - -func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f32_f32_f32(%lhs: tensor<2x32x32x32xf32>, %rhs: tensor<64x32x3x3xf32>, %acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> { - %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x32xf32>, tensor<64x32x3x3xf32>) outs(%acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> - return %result: tensor<2x64x30x30xf32> -} - diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_medium_calls.mlir b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_medium_calls.mlir deleted file mode 100644 index 3537bc9..0000000 --- a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_medium_calls.mlir +++ /dev/null @@ -1,163 +0,0 @@ -builtin.module @calls attributes { - -} { - -func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view -func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) - -func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view -func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view -func.func private @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view - -func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32_2_2_32_32_2_3_3_acc_0() attributes { - iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} -} { - %device_index = arith.constant 0 : index - %device = hal.devices.get %device_index : !hal.device - %input_dim0 = arith.constant 2 : i64 - %input_dim1 = arith.constant 2 : i64 - %input_dim2 = arith.constant 32 : i64 - %input_dim3 = arith.constant 32 : i64 - %input_element_type = hal.element_type : i32 - %input_seed = arith.constant 2 : i32 - %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %kernel_dim0 = arith.constant 2 : i64 - %kernel_dim1 = arith.constant 2 : i64 - %kernel_dim2 = arith.constant 3 : i64 - %kernel_dim3 = arith.constant 3 : i64 - %kernel_element_type = hal.element_type : i32 - %kernel_seed = arith.constant 3 : i32 - %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_dim0 = arith.constant 2 : 
i64 - %acc_dim1 = arith.constant 2 : i64 - %acc_dim2 = arith.constant 30 : i64 - %acc_dim3 = arith.constant 30 : i64 - %acc_element_type = hal.element_type : i32 - %acc_seed = arith.constant 4 : i32 - %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_copy_dim0 = arith.constant 2 : i64 - %acc_copy_dim1 = arith.constant 2 : i64 - %acc_copy_dim2 = arith.constant 30 : i64 - %acc_copy_dim3 = arith.constant 30 : i64 - %acc_copy_element_type = hal.element_type : i32 - %acc_copy_seed = arith.constant 4 : i32 - %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view - %n = arith.constant 2 : i64 - %c = arith.constant 2 : i64 - %h = arith.constant 32 : i64 - %w = arith.constant 32 : i64 - %f = arith.constant 2 : i64 - %kh = arith.constant 3 : i64 - %kw = arith.constant 3 : i64 - %layout = arith.constant 0 : i64 - %sh = arith.constant 1 : i64 - %sw = arith.constant 1 : i64 - %dh = arith.constant 1 : i64 - %dw = arith.constant 1 : i64 - call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () - return -} - -func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32_2_2_32_32_64_3_3_acc_1() attributes { - iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x64x3x3"} -} { - %device_index = arith.constant 0 : index - %device = hal.devices.get %device_index : !hal.device - %input_dim0 = arith.constant 2 : i64 - %input_dim1 = arith.constant 2 : i64 - %input_dim2 = arith.constant 32 : i64 - %input_dim3 = arith.constant 32 : i64 - %input_element_type = hal.element_type : i32 - %input_seed = arith.constant 5 : i32 - %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %kernel_dim0 = arith.constant 64 : i64 - %kernel_dim1 = arith.constant 2 : i64 - %kernel_dim2 = arith.constant 3 : i64 - %kernel_dim3 = arith.constant 3 : i64 - %kernel_element_type = hal.element_type : i32 - %kernel_seed = arith.constant 6 : i32 - %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_dim0 = arith.constant 2 : i64 - %acc_dim1 = arith.constant 64 : i64 - %acc_dim2 = arith.constant 30 : i64 - %acc_dim3 = arith.constant 30 : i64 - %acc_element_type = hal.element_type : i32 - %acc_seed = arith.constant 7 : i32 - %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_copy_dim0 = arith.constant 2 : i64 - %acc_copy_dim1 = arith.constant 64 : i64 - %acc_copy_dim2 = arith.constant 30 : i64 - 
%acc_copy_dim3 = arith.constant 30 : i64 - %acc_copy_element_type = hal.element_type : i32 - %acc_copy_seed = arith.constant 7 : i32 - %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view - %n = arith.constant 2 : i64 - %c = arith.constant 2 : i64 - %h = arith.constant 32 : i64 - %w = arith.constant 32 : i64 - %f = arith.constant 64 : i64 - %kh = arith.constant 3 : i64 - %kw = arith.constant 3 : i64 - %layout = arith.constant 0 : i64 - %sh = arith.constant 1 : i64 - %sw = arith.constant 1 : i64 - %dh = arith.constant 1 : i64 - %dw = arith.constant 1 : i64 - call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () - return -} - -func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f32_f32_f32_2_32_32_32_64_3_3_acc_2() attributes { - iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x32x32x32x64x3x3"} -} { - %device_index = arith.constant 0 : index - %device = hal.devices.get %device_index : !hal.device - %input_dim0 = arith.constant 2 : i64 - %input_dim1 = arith.constant 32 : i64 - %input_dim2 = arith.constant 32 : i64 - %input_dim3 = arith.constant 32 : i64 - %input_element_type = hal.element_type : i32 - %input_seed = arith.constant 8 : i32 - %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %kernel_dim0 = arith.constant 64 : i64 - %kernel_dim1 = arith.constant 32 : i64 - %kernel_dim2 = arith.constant 3 : i64 - %kernel_dim3 = arith.constant 3 : i64 - %kernel_element_type = hal.element_type : i32 - %kernel_seed = arith.constant 9 : i32 - %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_dim0 = arith.constant 2 : i64 - %acc_dim1 = arith.constant 64 : i64 - %acc_dim2 = arith.constant 30 : i64 - %acc_dim3 = arith.constant 30 : i64 - %acc_element_type = hal.element_type : i32 - %acc_seed = arith.constant 10 : i32 - %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_copy_dim0 = arith.constant 2 : i64 - %acc_copy_dim1 = arith.constant 64 : i64 - %acc_copy_dim2 = arith.constant 30 : i64 - %acc_copy_dim3 = arith.constant 30 : i64 - %acc_copy_element_type = hal.element_type : i32 - %acc_copy_seed = arith.constant 10 : i32 - %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %result = call @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, 
!hal.buffer_view) -> !hal.buffer_view - %n = arith.constant 2 : i64 - %c = arith.constant 32 : i64 - %h = arith.constant 32 : i64 - %w = arith.constant 32 : i64 - %f = arith.constant 64 : i64 - %kh = arith.constant 3 : i64 - %kw = arith.constant 3 : i64 - %layout = arith.constant 0 : i64 - %sh = arith.constant 1 : i64 - %sw = arith.constant 1 : i64 - %dh = arith.constant 1 : i64 - %dw = arith.constant 1 : i64 - call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () - return -} - - -} diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_small.mlir b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_small.mlir deleted file mode 100644 index 9ecd2bd..0000000 --- a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_small.mlir +++ /dev/null @@ -1,15 +0,0 @@ -func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32(%lhs: tensor<1x1x1x1xf32>, %rhs: tensor<1x1x1x1xf32>, %acc: tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> { - %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%acc: tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> - return %result: tensor<1x1x1x1xf32> -} - -func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32(%lhs: tensor<1x1x16x16xf32>, %rhs: tensor<1x1x2x2xf32>, %acc: tensor<1x1x15x15xf32>) -> tensor<1x1x15x15xf32> { - %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x16x16xf32>, tensor<1x1x2x2xf32>) outs(%acc: tensor<1x1x15x15xf32>) -> tensor<1x1x15x15xf32> - return %result: tensor<1x1x15x15xf32> -} - -func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%lhs: tensor<2x2x32x32xf32>, %rhs: tensor<2x2x3x3xf32>, %acc: tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32> { - %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf32>, tensor<2x2x3x3xf32>) outs(%acc: tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32> - return %result: tensor<2x2x30x30xf32> -} - diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_small_calls.mlir b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_small_calls.mlir deleted file mode 100644 index 092bd67..0000000 --- a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_winograd_f32_f32_f32_small_calls.mlir +++ /dev/null @@ -1,163 +0,0 @@ -builtin.module @calls attributes { - -} { - -func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view -func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) - -func.func private @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view -func.func 
private @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view -func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view - -func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32_1_1_1_1_1_1_1_acc_0() attributes { - iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x1x1x1x1x1"} -} { - %device_index = arith.constant 0 : index - %device = hal.devices.get %device_index : !hal.device - %input_dim0 = arith.constant 1 : i64 - %input_dim1 = arith.constant 1 : i64 - %input_dim2 = arith.constant 1 : i64 - %input_dim3 = arith.constant 1 : i64 - %input_element_type = hal.element_type : i32 - %input_seed = arith.constant 2 : i32 - %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %kernel_dim0 = arith.constant 1 : i64 - %kernel_dim1 = arith.constant 1 : i64 - %kernel_dim2 = arith.constant 1 : i64 - %kernel_dim3 = arith.constant 1 : i64 - %kernel_element_type = hal.element_type : i32 - %kernel_seed = arith.constant 3 : i32 - %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_dim0 = arith.constant 1 : i64 - %acc_dim1 = arith.constant 1 : i64 - %acc_dim2 = arith.constant 1 : i64 - %acc_dim3 = arith.constant 1 : i64 - %acc_element_type = hal.element_type : i32 - %acc_seed = arith.constant 4 : i32 - %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_copy_dim0 = arith.constant 1 : i64 - %acc_copy_dim1 = arith.constant 1 : i64 - %acc_copy_dim2 = arith.constant 1 : i64 - %acc_copy_dim3 = arith.constant 1 : i64 - %acc_copy_element_type = hal.element_type : i32 - %acc_copy_seed = arith.constant 4 : i32 - %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %result = call @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view - %n = arith.constant 1 : i64 - %c = arith.constant 1 : i64 - %h = arith.constant 1 : i64 - %w = arith.constant 1 : i64 - %f = arith.constant 1 : i64 - %kh = arith.constant 1 : i64 - %kw = arith.constant 1 : i64 - %layout = arith.constant 0 : i64 - %sh = arith.constant 1 : i64 - %sw = arith.constant 1 : i64 - %dh = arith.constant 1 : i64 - %dw = arith.constant 1 : i64 - call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () - return -} - -func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32_1_1_16_16_1_2_2_acc_1() attributes { - iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x16x16x1x2x2"} -} 
{ - %device_index = arith.constant 0 : index - %device = hal.devices.get %device_index : !hal.device - %input_dim0 = arith.constant 1 : i64 - %input_dim1 = arith.constant 1 : i64 - %input_dim2 = arith.constant 16 : i64 - %input_dim3 = arith.constant 16 : i64 - %input_element_type = hal.element_type : i32 - %input_seed = arith.constant 5 : i32 - %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %kernel_dim0 = arith.constant 1 : i64 - %kernel_dim1 = arith.constant 1 : i64 - %kernel_dim2 = arith.constant 2 : i64 - %kernel_dim3 = arith.constant 2 : i64 - %kernel_element_type = hal.element_type : i32 - %kernel_seed = arith.constant 6 : i32 - %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_dim0 = arith.constant 1 : i64 - %acc_dim1 = arith.constant 1 : i64 - %acc_dim2 = arith.constant 15 : i64 - %acc_dim3 = arith.constant 15 : i64 - %acc_element_type = hal.element_type : i32 - %acc_seed = arith.constant 7 : i32 - %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_copy_dim0 = arith.constant 1 : i64 - %acc_copy_dim1 = arith.constant 1 : i64 - %acc_copy_dim2 = arith.constant 15 : i64 - %acc_copy_dim3 = arith.constant 15 : i64 - %acc_copy_element_type = hal.element_type : i32 - %acc_copy_seed = arith.constant 7 : i32 - %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %result = call @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view - %n = arith.constant 1 : i64 - %c = arith.constant 1 : i64 - %h = arith.constant 16 : i64 - %w = arith.constant 16 : i64 - %f = arith.constant 1 : i64 - %kh = arith.constant 2 : i64 - %kw = arith.constant 2 : i64 - %layout = arith.constant 0 : i64 - %sh = arith.constant 1 : i64 - %sw = arith.constant 1 : i64 - %dh = arith.constant 1 : i64 - %dw = arith.constant 1 : i64 - call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () - return -} - -func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32_2_2_32_32_2_3_3_acc_2() attributes { - iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} -} { - %device_index = arith.constant 0 : index - %device = hal.devices.get %device_index : !hal.device - %input_dim0 = arith.constant 2 : i64 - %input_dim1 = arith.constant 2 : i64 - %input_dim2 = arith.constant 32 : i64 - %input_dim3 = arith.constant 32 : i64 - %input_element_type = hal.element_type : i32 - %input_seed = arith.constant 8 : i32 - %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, 
i32) -> !hal.buffer_view - %kernel_dim0 = arith.constant 2 : i64 - %kernel_dim1 = arith.constant 2 : i64 - %kernel_dim2 = arith.constant 3 : i64 - %kernel_dim3 = arith.constant 3 : i64 - %kernel_element_type = hal.element_type : i32 - %kernel_seed = arith.constant 9 : i32 - %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_dim0 = arith.constant 2 : i64 - %acc_dim1 = arith.constant 2 : i64 - %acc_dim2 = arith.constant 30 : i64 - %acc_dim3 = arith.constant 30 : i64 - %acc_element_type = hal.element_type : i32 - %acc_seed = arith.constant 10 : i32 - %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %acc_copy_dim0 = arith.constant 2 : i64 - %acc_copy_dim1 = arith.constant 2 : i64 - %acc_copy_dim2 = arith.constant 30 : i64 - %acc_copy_dim3 = arith.constant 30 : i64 - %acc_copy_element_type = hal.element_type : i32 - %acc_copy_seed = arith.constant 10 : i32 - %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view - %n = arith.constant 2 : i64 - %c = arith.constant 2 : i64 - %h = arith.constant 32 : i64 - %w = arith.constant 32 : i64 - %f = arith.constant 2 : i64 - %kh = arith.constant 3 : i64 - %kw = arith.constant 3 : i64 - %layout = arith.constant 0 : i64 - %sh = arith.constant 1 : i64 - %sw = arith.constant 1 : i64 - %dh = arith.constant 1 : i64 - %dw = arith.constant 1 : i64 - call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () - return -} - - -} From 66a927c3656abedbb757a8b17f62596b5dc59ade Mon Sep 17 00:00:00 2001 From: erman-gurses Date: Mon, 21 Oct 2024 16:38:59 -0500 Subject: [PATCH 06/16] Remove unnecessary comments Signed-off-by: erman-gurses --- linalg_ops/convolution/generate_test_mlir_files.sh | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/linalg_ops/convolution/generate_test_mlir_files.sh b/linalg_ops/convolution/generate_test_mlir_files.sh index dc3581a..9137999 100755 --- a/linalg_ops/convolution/generate_test_mlir_files.sh +++ b/linalg_ops/convolution/generate_test_mlir_files.sh @@ -20,12 +20,6 @@ # conv2d_f16_f16_f16_medium.mlir # conv2d_f16_f16_f16_small_calls.mlir # conv2d_f16_f16_f16_small.mlir -# conv2d_winograd_f16_f16_f16_large_calls.mlir -# conv2d_winograd_f16_f16_f16_large.mlir -# conv2d_winograd_f16_f16_f16_medium_calls.mlir -# conv2d_winograd_f16_f16_f16_medium.mlir -# conv2d_winograd_f16_f16_f16_small_calls.mlir -# conv2d_winograd_f16_f16_f16_small.mlir # f32_f32_f32/ # conv2d_f32_f32_f32_large_calls.mlir # conv2d_f32_f32_f32_large.mlir @@ -33,12 +27,6 @@ # conv2d_f32_f32_f32_medium.mlir # conv2d_f32_f32_f32_small_calls.mlir # conv2d_f32_f32_f32_small.mlir -# conv2d_winograd_f32_f32_f32_large_calls.mlir -# 
conv2d_winograd_f32_f32_f32_large.mlir -# conv2d_winograd_f32_f32_f32_medium_calls.mlir -# conv2d_winograd_f32_f32_f32_medium.mlir -# conv2d_winograd_f32_f32_f32_small_calls.mlir -# conv2d_winograd_f32_f32_f32_small.mlir # ... # ... # Usage: From 8d63db70535ac402371bf3af391cab23a27d7e9c Mon Sep 17 00:00:00 2001 From: erman-gurses Date: Wed, 23 Oct 2024 17:16:49 -0500 Subject: [PATCH 07/16] Addressing multiple comments Signed-off-by: erman-gurses --- .../convolution/generate_e2e_conv2d_tests.py | 32 ++-- .../f16_f16_f16/conv2d_f16_f16_f16_large.mlir | 2 - .../conv2d_f16_f16_f16_large_calls.mlir | 4 - .../conv2d_f16_f16_f16_medium.mlir | 3 - .../conv2d_f16_f16_f16_medium_calls.mlir | 5 - .../f16_f16_f16/conv2d_f16_f16_f16_small.mlir | 3 - .../conv2d_f16_f16_f16_small_calls.mlir | 5 - .../f32_f32_f32/conv2d_f32_f32_f32_large.mlir | 2 - .../conv2d_f32_f32_f32_large_calls.mlir | 4 - .../conv2d_f32_f32_f32_medium.mlir | 3 - .../conv2d_f32_f32_f32_medium_calls.mlir | 5 - .../f32_f32_f32/conv2d_f32_f32_f32_small.mlir | 3 - .../conv2d_f32_f32_f32_small_calls.mlir | 5 - linalg_ops/iree-e2e-conv2d-test.cc | 166 +++++++++--------- 14 files changed, 104 insertions(+), 138 deletions(-) diff --git a/linalg_ops/convolution/generate_e2e_conv2d_tests.py b/linalg_ops/convolution/generate_e2e_conv2d_tests.py index fdb6037..491b48b 100644 --- a/linalg_ops/convolution/generate_e2e_conv2d_tests.py +++ b/linalg_ops/convolution/generate_e2e_conv2d_tests.py @@ -16,13 +16,14 @@ import itertools import re + # Data type of kernel entries. The string values must match MLIR data types. @enum.unique class KernelElemTypeId(enum.Enum): NONE = "" I8 = "i8" - F32 = "f32" F16 = "f16" + F32 = "f32" # Data type of input entries. The string values must match MLIR data types. @@ -30,8 +31,8 @@ class KernelElemTypeId(enum.Enum): class InputElemTypeId(enum.Enum): NONE = "" I8 = "i8" - F32 = "f32" F16 = "f16" + F32 = "f32" # Data type of input entries. The string values must match MLIR data types. @@ -42,6 +43,7 @@ class AccElemTypeId(enum.Enum): F32 = "f32" F16 = "f16" + # Enumerates of the collections of shapes that we can generate tests for. # The values are the accepted values for the --shapes= flag. 
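# The accumulator shapes used throughout the generated tests follow the
# standard VALID-padding (no-padding) convolution size arithmetic: a 16x16
# input with a 2x2 kernel yields 15x15, 32x32 with a 3x3 kernel yields 30x30,
# and 128x128 with a 3x3 kernel yields 126x126. A minimal sketch of that
# calculation follows; the helper name is illustrative and not part of this
# generator:
def conv2d_out_dim(in_dim: int, kernel_dim: int, stride: int = 1, dilation: int = 1) -> int:
    # Effective kernel extent grows with dilation; stride divides what remains.
    effective_kernel = dilation * (kernel_dim - 1) + 1
    return (in_dim - effective_kernel) // stride + 1

assert conv2d_out_dim(16, 2) == 15    # small:  1x1x16x16  -> 1x1x15x15
assert conv2d_out_dim(32, 3) == 30    # medium: 2x2x32x32  -> 2x2x30x30
assert conv2d_out_dim(128, 3) == 126  # large:  2x4x128x128 -> 2x8x126x126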
@enum.unique @@ -116,9 +118,9 @@ def get_test_shapes(shapes_id: ShapesId): ] if shapes_id == ShapesId.MEDIUM: return [ - TestShape(n=2, c=2, h=32, w=32, kh=3, kw=3, f=2, accumulate=True), + TestShape(n=2, c=2, h=32, w=32, kh=3, kw=3, f=2, accumulate=True), TestShape(n=2, c=2, h=32, w=32, kh=3, kw=3, f=64, accumulate=True), - TestShape(n=2, h=32, w=32, c=32, kh=3, kw=3, f=64, accumulate=True), + TestShape(n=2, c=32, h=32, w=32, kh=3, kw=3, f=64, accumulate=True), ] if shapes_id == ShapesId.LARGE: return [ @@ -342,7 +344,7 @@ def generate_function( f"func.func @{func_name}(%lhs: {input_tensor_type}, %rhs: {kernel_tensor_type}, %acc: {acc_tensor_type}) -> {acc_tensor_type} {{\n" f" %result = {op_name} {conv_attr} ins(%lhs, %rhs: {input_tensor_type}, {kernel_tensor_type}) outs(%acc: {acc_tensor_type}) -> {acc_tensor_type}\n" f" return %result: {acc_tensor_type}\n" - f"}}\n" + f"}}" ) return MLIRFunction( @@ -428,7 +430,7 @@ def generate_call( raise ValueError(kernel_layout) else: raise ValueError(InputLayout) - + description = f"Conv2d shape (NxCxHxWxFxKHxKW): {shape.n}x{shape.c}x{shape.h}x{shape.w}x{shape.f}x{shape.kh}x{shape.kw}" op = ( f"func.func @{func_name}() attributes {{\n" @@ -476,7 +478,8 @@ def generate_call( f" %sw = arith.constant {conv2d_attr.STRIDE[1]} : i64\n" f" %dh = arith.constant {conv2d_attr.DILATION[0]} : i64\n" f" %dw = arith.constant {conv2d_attr.DILATION[1]} : i64\n" - f" call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()\n") + f" call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()\n" + ) op = op + " return\n" op = op + "}\n" @@ -621,7 +624,12 @@ def write_code_file(functions, filename): def write_calls_file(functions, calls, filename, requirements): + # TODO(scotttodd): write "GENERATED BY" comment to the top of the file + # Module-level reflection information used to control the test tool. + # TODO(scotttodd): drop this and whatever logic in the test tool used it + # multiple backends should be able to use the same input IR, so the + # input IR shouldn't need things like CPU features in it reflection = "" if requirements: reflection = ( @@ -638,9 +646,7 @@ def write_calls_file(functions, calls, filename, requirements): # Declare the custom module that generates arguments. 
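# As the generated *_calls.mlir files show, each test case draws consecutive
# pseudorandom seeds: the input, kernel, and accumulator each advance the seed
# by one, and acc_copy reuses the accumulator's seed so the two buffers start
# out identical (the copy is passed to the conv2d function under test, leaving
# the original acc untouched for check_conv2d_results). A sketch of that
# progression, assuming a per-file counter that starts at 2 (labels here are
# illustrative):
seed = 2
for test_case in ["small_0", "small_1", "small_2"]:
    input_seed = seed         # 2, 5, 8, ...
    kernel_seed = seed + 1    # 3, 6, 9, ...
    acc_seed = seed + 2       # 4, 7, 10, ...
    acc_copy_seed = acc_seed  # same seed => same data as acc by construction
    seed += 3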
module_definition = module_definition + ( "func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view\n" - "func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)\n" - "\n" - + "func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)\n" ) # Declare the functions that will be called. @@ -650,9 +656,9 @@ def write_calls_file(functions, calls, filename, requirements): # Emit the test cases for each call. for call in calls: - module_definition = module_definition + call.op + "\n" + module_definition = module_definition + call.op + "" - module_definition = module_definition + "\n}\n" + module_definition = module_definition + "}\n" with open(filename, "w") as file: file.write(module_definition) @@ -663,7 +669,7 @@ def main(args): input_layout = InputLayout(args.input_layout) kernel_type = KernelElemTypeId(args.kernel_type) kernel_layout = KernelLayout(args.kernel_layout) - acc_type = AccElemTypeId(args.acc_type) + acc_type = AccElemTypeId(args.acc_type) shapes_id = ShapesId(args.shapes) conv2d_attr = ConvAttrs( tuple(map(int, args.stride.split(","))), diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_large.mlir b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_large.mlir index 51c8a1e..ca13bae 100644 --- a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_large.mlir +++ b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_large.mlir @@ -2,9 +2,7 @@ func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16(%lhs: ten %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x4x128x128xf16>, tensor<8x4x3x3xf16>) outs(%acc: tensor<2x8x126x126xf16>) -> tensor<2x8x126x126xf16> return %result: tensor<2x8x126x126xf16> } - func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16(%lhs: tensor<2x3x128x128xf16>, %rhs: tensor<12x3x3x3xf16>, %acc: tensor<2x12x126x126xf16>) -> tensor<2x12x126x126xf16> { %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x3x128x128xf16>, tensor<12x3x3x3xf16>) outs(%acc: tensor<2x12x126x126xf16>) -> tensor<2x12x126x126xf16> return %result: tensor<2x12x126x126xf16> } - diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_large_calls.mlir b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_large_calls.mlir index 7dfb92f..ee31f04 100644 --- a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_large_calls.mlir +++ b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_large_calls.mlir @@ -4,7 +4,6 @@ builtin.module @calls attributes { func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view func.func private 
@conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) - func.func private @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view func.func private @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view @@ -57,7 +56,6 @@ func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16_2_4_128_1 call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () return } - func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16_2_3_128_128_12_3_3_acc_1() attributes { iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x3x128x128x12x3x3"} } { @@ -107,6 +105,4 @@ func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16_2_3_128_ call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () return } - - } diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_medium.mlir b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_medium.mlir index a2564aa..caba912 100644 --- a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_medium.mlir +++ b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_medium.mlir @@ -2,14 +2,11 @@ func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%lhs: tenso %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf16>, tensor<2x2x3x3xf16>) outs(%acc: tensor<2x2x30x30xf16>) -> tensor<2x2x30x30xf16> return %result: tensor<2x2x30x30xf16> } - func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16(%lhs: tensor<2x2x32x32xf16>, %rhs: tensor<64x2x3x3xf16>, %acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16> { %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf16>, tensor<64x2x3x3xf16>) outs(%acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16> return %result: tensor<2x64x30x30xf16> } - func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f16(%lhs: tensor<2x32x32x32xf16>, %rhs: tensor<64x32x3x3xf16>, %acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16> { %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x32xf16>, tensor<64x32x3x3xf16>) outs(%acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16> return %result: tensor<2x64x30x30xf16> } - diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_medium_calls.mlir b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_medium_calls.mlir index c6e86d7..60860a5 
100644 --- a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_medium_calls.mlir +++ b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_medium_calls.mlir @@ -4,7 +4,6 @@ builtin.module @calls attributes { func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) - func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view func.func private @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view @@ -58,7 +57,6 @@ func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16_2_2_32_32_2 call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () return } - func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16_2_2_32_32_64_3_3_acc_1() attributes { iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x64x3x3"} } { @@ -108,7 +106,6 @@ func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16_2_2_32_32_ call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () return } - func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f16_2_32_32_32_64_3_3_acc_2() attributes { iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x32x32x32x64x3x3"} } { @@ -158,6 +155,4 @@ func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f16_2_32_32_3 call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () return } - - } diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_small.mlir b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_small.mlir index ddbe425..66fe7fd 100644 --- a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_small.mlir +++ b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_small.mlir @@ -2,14 +2,11 @@ func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16(%lhs: tensor< %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16>) outs(%acc: tensor<1x1x1x1xf16>) -> 
tensor<1x1x1x1xf16> return %result: tensor<1x1x1x1xf16> } - func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16(%lhs: tensor<1x1x16x16xf16>, %rhs: tensor<1x1x2x2xf16>, %acc: tensor<1x1x15x15xf16>) -> tensor<1x1x15x15xf16> { %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x16x16xf16>, tensor<1x1x2x2xf16>) outs(%acc: tensor<1x1x15x15xf16>) -> tensor<1x1x15x15xf16> return %result: tensor<1x1x15x15xf16> } - func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%lhs: tensor<2x2x32x32xf16>, %rhs: tensor<2x2x3x3xf16>, %acc: tensor<2x2x30x30xf16>) -> tensor<2x2x30x30xf16> { %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf16>, tensor<2x2x3x3xf16>) outs(%acc: tensor<2x2x30x30xf16>) -> tensor<2x2x30x30xf16> return %result: tensor<2x2x30x30xf16> } - diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_small_calls.mlir b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_small_calls.mlir index 872c618..98438c6 100644 --- a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_small_calls.mlir +++ b/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_small_calls.mlir @@ -4,7 +4,6 @@ builtin.module @calls attributes { func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) - func.func private @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view func.func private @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view @@ -58,7 +57,6 @@ func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16_1_1_1_1_1_1_1 call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () return } - func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16_1_1_16_16_1_2_2_acc_1() attributes { iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x16x16x1x2x2"} } { @@ -108,7 +106,6 @@ func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16_1_1_16_16_1 call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () return } - func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16_2_2_32_32_2_3_3_acc_2() attributes { iree.reflection 
= {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} } { @@ -158,6 +155,4 @@ func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16_2_2_32_32_2 call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () return } - - } diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_large.mlir b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_large.mlir index a47185c..1714e5b 100644 --- a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_large.mlir +++ b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_large.mlir @@ -2,9 +2,7 @@ func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32(%lhs: ten %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x4x128x128xf32>, tensor<8x4x3x3xf32>) outs(%acc: tensor<2x8x126x126xf32>) -> tensor<2x8x126x126xf32> return %result: tensor<2x8x126x126xf32> } - func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32(%lhs: tensor<2x3x128x128xf32>, %rhs: tensor<12x3x3x3xf32>, %acc: tensor<2x12x126x126xf32>) -> tensor<2x12x126x126xf32> { %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x3x128x128xf32>, tensor<12x3x3x3xf32>) outs(%acc: tensor<2x12x126x126xf32>) -> tensor<2x12x126x126xf32> return %result: tensor<2x12x126x126xf32> } - diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_large_calls.mlir b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_large_calls.mlir index cdd2788..ce81bc5 100644 --- a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_large_calls.mlir +++ b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_large_calls.mlir @@ -4,7 +4,6 @@ builtin.module @calls attributes { func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) - func.func private @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view func.func private @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view @@ -57,7 +56,6 @@ func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32_2_4_128_1 call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () return } - func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32_2_3_128_128_12_3_3_acc_1() attributes { iree.reflection = {description = "Conv2d shape 
(NxCxHxWxFxKHxKW): 2x3x128x128x12x3x3"} } { @@ -107,6 +105,4 @@ func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32_2_3_128_ call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () return } - - } diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_medium.mlir b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_medium.mlir index e0a0376..97ff810 100644 --- a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_medium.mlir +++ b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_medium.mlir @@ -2,14 +2,11 @@ func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%lhs: tenso %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf32>, tensor<2x2x3x3xf32>) outs(%acc: tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32> return %result: tensor<2x2x30x30xf32> } - func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32(%lhs: tensor<2x2x32x32xf32>, %rhs: tensor<64x2x3x3xf32>, %acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> { %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf32>, tensor<64x2x3x3xf32>) outs(%acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> return %result: tensor<2x64x30x30xf32> } - func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f32_f32_f32(%lhs: tensor<2x32x32x32xf32>, %rhs: tensor<64x32x3x3xf32>, %acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> { %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x32xf32>, tensor<64x32x3x3xf32>) outs(%acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> return %result: tensor<2x64x30x30xf32> } - diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_medium_calls.mlir b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_medium_calls.mlir index 3537bc9..3a2f05c 100644 --- a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_medium_calls.mlir +++ b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_medium_calls.mlir @@ -4,7 +4,6 @@ builtin.module @calls attributes { func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) - func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view func.func private @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, 
%acc: !hal.buffer_view) -> !hal.buffer_view @@ -58,7 +57,6 @@ func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32_2_2_32_32_2 call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () return } - func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32_2_2_32_32_64_3_3_acc_1() attributes { iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x64x3x3"} } { @@ -108,7 +106,6 @@ func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32_2_2_32_32_ call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () return } - func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f32_f32_f32_2_32_32_32_64_3_3_acc_2() attributes { iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x32x32x32x64x3x3"} } { @@ -158,6 +155,4 @@ func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f32_f32_f32_2_32_32_3 call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () return } - - } diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_small.mlir b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_small.mlir index 9ecd2bd..a4a08ad 100644 --- a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_small.mlir +++ b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_small.mlir @@ -2,14 +2,11 @@ func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32(%lhs: tensor< %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%acc: tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> return %result: tensor<1x1x1x1xf32> } - func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32(%lhs: tensor<1x1x16x16xf32>, %rhs: tensor<1x1x2x2xf32>, %acc: tensor<1x1x15x15xf32>) -> tensor<1x1x15x15xf32> { %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x16x16xf32>, tensor<1x1x2x2xf32>) outs(%acc: tensor<1x1x15x15xf32>) -> tensor<1x1x15x15xf32> return %result: tensor<1x1x15x15xf32> } - func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%lhs: tensor<2x2x32x32xf32>, %rhs: tensor<2x2x3x3xf32>, %acc: tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32> { %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf32>, tensor<2x2x3x3xf32>) outs(%acc: tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32> return %result: tensor<2x2x30x30xf32> } - diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_small_calls.mlir b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_small_calls.mlir index 092bd67..9f01130 100644 --- 
a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_small_calls.mlir +++ b/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_small_calls.mlir @@ -4,7 +4,6 @@ builtin.module @calls attributes { func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) - func.func private @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view func.func private @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view @@ -58,7 +57,6 @@ func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32_1_1_1_1_1_1_1 call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () return } - func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32_1_1_16_16_1_2_2_acc_1() attributes { iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x16x16x1x2x2"} } { @@ -108,7 +106,6 @@ func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32_1_1_16_16_1 call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () return } - func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32_2_2_32_32_2_3_3_acc_2() attributes { iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} } { @@ -158,6 +155,4 @@ func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32_2_2_32_32_2 call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () return } - - } diff --git a/linalg_ops/iree-e2e-conv2d-test.cc b/linalg_ops/iree-e2e-conv2d-test.cc index 6bc2565..2d1e986 100644 --- a/linalg_ops/iree-e2e-conv2d-test.cc +++ b/linalg_ops/iree-e2e-conv2d-test.cc @@ -38,12 +38,11 @@ static void reference_conv2d_f16_f16_f16_f16( iree_hal_dim_t w_size, iree_hal_dim_t f_size, iree_hal_dim_t kh_size, iree_hal_dim_t kw_size, iree_hal_dim_t layout, iree_hal_dim_t sh_size, iree_hal_dim_t sw_size, iree_hal_dim_t dh_size, iree_hal_dim_t dw_size, - iree_hal_dim_t oh_size, iree_hal_dim_t ow_size, const uint16_t* input_data, - const uint16_t* kernel_data, const uint16_t* acc_data, - uint16_t* result_data, iree_hal_dim_t n, iree_hal_dim_t oc, + iree_hal_dim_t oh_size, iree_hal_dim_t 
ow_size, const uint16_t *input_data, + const uint16_t *kernel_data, const uint16_t *acc_data, + uint16_t *result_data, iree_hal_dim_t n, iree_hal_dim_t oc, iree_hal_dim_t oh, iree_hal_dim_t ow) { if (layout == 0) { - //printf("layout == 0\n"); // The layout of output tensor is NxfxOHxOW iree_hal_dim_t out_idx = convert_to_1d_index(f_size, oh_size, ow_size, n, oc, oh, ow); @@ -66,7 +65,6 @@ static void reference_conv2d_f16_f16_f16_f16( result_data[out_idx] = iree_math_f32_to_f16(acc); } } else if (layout == 1) { - //printf("layout == 1\n"); // The layout of output tensor is NxOHxOWxf iree_hal_dim_t out_idx = convert_to_1d_index(oh_size, ow_size, f_size, n, oh, ow, oc); @@ -97,8 +95,8 @@ static void reference_conv2d_f16_f16_f32_f32( iree_hal_dim_t w_size, iree_hal_dim_t f_size, iree_hal_dim_t kh_size, iree_hal_dim_t kw_size, iree_hal_dim_t layout, iree_hal_dim_t sh_size, iree_hal_dim_t sw_size, iree_hal_dim_t dh_size, iree_hal_dim_t dw_size, - iree_hal_dim_t oh_size, iree_hal_dim_t ow_size, const uint16_t* input_data, - const uint16_t* kernel_data, const float* acc_data, float* result_data, + iree_hal_dim_t oh_size, iree_hal_dim_t ow_size, const uint16_t *input_data, + const uint16_t *kernel_data, const float *acc_data, float *result_data, iree_hal_dim_t n, iree_hal_dim_t oc, iree_hal_dim_t oh, iree_hal_dim_t ow) { if (layout == 0) { // The layout of output tensor is NxfxOHxOW @@ -153,8 +151,8 @@ static void reference_conv2d_i8_i8_i32_i32( iree_hal_dim_t w_size, iree_hal_dim_t f_size, iree_hal_dim_t kh_size, iree_hal_dim_t kw_size, iree_hal_dim_t layout, iree_hal_dim_t sh_size, iree_hal_dim_t sw_size, iree_hal_dim_t dh_size, iree_hal_dim_t dw_size, - iree_hal_dim_t oh_size, iree_hal_dim_t ow_size, const int8_t* input_data, - const int8_t* kernel_data, const int32_t* acc_data, int32_t* result_data, + iree_hal_dim_t oh_size, iree_hal_dim_t ow_size, const int8_t *input_data, + const int8_t *kernel_data, const int32_t *acc_data, int32_t *result_data, iree_hal_dim_t n, iree_hal_dim_t oc, iree_hal_dim_t oh, iree_hal_dim_t ow) { if (layout == 0) { // The layout of output tensor is NxfxOHxOW @@ -210,8 +208,8 @@ static void reference_conv2d_f32_f32_f32_f32( iree_hal_dim_t w_size, iree_hal_dim_t f_size, iree_hal_dim_t kh_size, iree_hal_dim_t kw_size, iree_hal_dim_t layout, iree_hal_dim_t sh_size, iree_hal_dim_t sw_size, iree_hal_dim_t dh_size, iree_hal_dim_t dw_size, - iree_hal_dim_t oh_size, iree_hal_dim_t ow_size, const float* input_data, - const float* kernel_data, const float* acc_data, float* result_data, + iree_hal_dim_t oh_size, iree_hal_dim_t ow_size, const float *input_data, + const float *kernel_data, const float *acc_data, float *result_data, iree_hal_dim_t n, iree_hal_dim_t oc, iree_hal_dim_t oh, iree_hal_dim_t ow) { if (layout == 0) { // The layout of output tensor is NxfxOHxOW @@ -265,8 +263,8 @@ static iree_status_t reference_conv2d_element( iree_hal_dim_t sw_size, iree_hal_dim_t dh_size, iree_hal_dim_t dw_size, iree_hal_dim_t oh_size, iree_hal_dim_t ow_size, iree_hal_element_type_t input_type, iree_hal_element_type_t kernel_type, - iree_hal_element_type_t acc_type, void* input_data, void* kernel_data, - void* acc_data, void* result_data, iree_hal_dim_t n, iree_hal_dim_t oc, + iree_hal_element_type_t acc_type, void *input_data, void *kernel_data, + void *acc_data, void *result_data, iree_hal_dim_t n, iree_hal_dim_t oc, iree_hal_dim_t oh, iree_hal_dim_t ow) { if (input_type == IREE_HAL_ELEMENT_TYPE_FLOAT_32 && kernel_type == IREE_HAL_ELEMENT_TYPE_FLOAT_32 && @@ -274,35 +272,38 @@ static 
iree_status_t reference_conv2d_element( reference_conv2d_f32_f32_f32_f32( n_size, c_size, h_size, w_size, f_size, kh_size, kw_size, layout, sh_size, sw_size, dh_size, dw_size, oh_size, ow_size, - (const float*)input_data, (const float*)kernel_data, - (const float*)acc_data, (float*)result_data, n, oc, oh, ow); + (const float *)input_data, (const float *)kernel_data, + (const float *)acc_data, (float *)result_data, n, oc, oh, ow); } else if (input_type == IREE_HAL_ELEMENT_TYPE_FLOAT_16 && kernel_type == IREE_HAL_ELEMENT_TYPE_FLOAT_16 && acc_type == IREE_HAL_ELEMENT_TYPE_FLOAT_16) { reference_conv2d_f16_f16_f16_f16( n_size, c_size, h_size, w_size, f_size, kh_size, kw_size, layout, sh_size, sw_size, dh_size, dw_size, oh_size, ow_size, - (const uint16_t*)input_data, (const uint16_t*)kernel_data, - (const uint16_t*)acc_data, (uint16_t*)result_data, n, oc, oh, ow); + (const uint16_t *)input_data, (const uint16_t *)kernel_data, + (const uint16_t *)acc_data, (uint16_t *)result_data, n, oc, oh, ow); } else if (input_type == IREE_HAL_ELEMENT_TYPE_FLOAT_16 && kernel_type == IREE_HAL_ELEMENT_TYPE_FLOAT_16 && acc_type == IREE_HAL_ELEMENT_TYPE_FLOAT_32) { reference_conv2d_f16_f16_f32_f32( n_size, c_size, h_size, w_size, f_size, kh_size, kw_size, layout, sh_size, sw_size, dh_size, dw_size, oh_size, ow_size, - (const uint16_t*)input_data, (const uint16_t*)kernel_data, - (const float*)acc_data, (float*)result_data, n, oc, oh, ow); + (const uint16_t *)input_data, (const uint16_t *)kernel_data, + (const float *)acc_data, (float *)result_data, n, oc, oh, ow); } else if (input_type == IREE_HAL_ELEMENT_TYPE_INT_8 && kernel_type == IREE_HAL_ELEMENT_TYPE_INT_8 && acc_type == IREE_HAL_ELEMENT_TYPE_INT_32) { reference_conv2d_i8_i8_i32_i32( n_size, c_size, h_size, w_size, f_size, kh_size, kw_size, layout, sh_size, sw_size, dh_size, dw_size, oh_size, ow_size, - (const int8_t*)input_data, (const int8_t*)kernel_data, - (const int32_t*)acc_data, (int32_t*)result_data, n, oc, oh, ow); + (const int8_t *)input_data, (const int8_t *)kernel_data, + (const int32_t *)acc_data, (int32_t *)result_data, n, oc, oh, ow); } else { - return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, - "unhandled combination of element types in conv2d"); + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "unhandled combination of element types in conv2d input_type: %d," + " kernel_type: %d, acc_type: %d", + input_type, kernel_type, acc_type); } return iree_ok_status(); } @@ -386,18 +387,18 @@ static iree_status_t reference_conv2d( typedef struct { iree_allocator_t host_allocator; - iree_hal_dim_t n; // batch dim - iree_hal_dim_t c; // input channels - iree_hal_dim_t h; // input height - iree_hal_dim_t w; // input width - iree_hal_dim_t f; // output channels - iree_hal_dim_t kh; // kernel height - iree_hal_dim_t kw; // kernel width - iree_hal_dim_t layout; // conv layout, 0 : nchwxfchw (default); 1: nhwcxhwcf - iree_hal_dim_t sh; // stride along height dim - iree_hal_dim_t sw; // stride along width dim - iree_hal_dim_t dh; // dilation along height dim - iree_hal_dim_t dw; // dilation along width dim + iree_hal_dim_t n; // batch dim + iree_hal_dim_t c; // input channels + iree_hal_dim_t h; // input height + iree_hal_dim_t w; // input width + iree_hal_dim_t f; // output channels + iree_hal_dim_t kh; // kernel height + iree_hal_dim_t kw; // kernel width + iree_hal_dim_t layout; // conv layout, 0 : nchwxfchw (default); 1: nhwcxhwcf + iree_hal_dim_t sh; // stride along height dim + iree_hal_dim_t sw; // stride along width dim + iree_hal_dim_t 
dh; // dilation along height dim + iree_hal_dim_t dw; // dilation along width dim iree_hal_element_type_t input_type; iree_hal_element_type_t kernel_type; iree_hal_element_type_t acc_type; @@ -409,17 +410,17 @@ typedef struct { iree_byte_span_t expected_contents; } conv2d_results_t; -static void conv2d_results_deinitialize(conv2d_results_t* results); +static void conv2d_results_deinitialize(conv2d_results_t *results); static iree_status_t conv2d_results_initialize( - iree_hal_device_t* device, iree_hal_dim_t n_size, iree_hal_dim_t c_size, + iree_hal_device_t *device, iree_hal_dim_t n_size, iree_hal_dim_t c_size, iree_hal_dim_t h_size, iree_hal_dim_t w_size, iree_hal_dim_t f_size, iree_hal_dim_t kh_size, iree_hal_dim_t kw_size, iree_hal_dim_t layout, iree_hal_dim_t sh_size, iree_hal_dim_t sw_size, iree_hal_dim_t dh_size, - iree_hal_dim_t dw_size, iree_hal_buffer_view_t* input, - iree_hal_buffer_view_t* kernel, iree_hal_buffer_view_t* acc, - iree_hal_buffer_view_t* result, iree_allocator_t host_allocator, - conv2d_results_t* out_results) { + iree_hal_dim_t dw_size, iree_hal_buffer_view_t *input, + iree_hal_buffer_view_t *kernel, iree_hal_buffer_view_t *acc, + iree_hal_buffer_view_t *result, iree_allocator_t host_allocator, + conv2d_results_t *out_results) { IREE_TRACE_ZONE_BEGIN(z0); memset(out_results, 0, sizeof(*out_results)); @@ -443,10 +444,10 @@ static iree_status_t conv2d_results_initialize( out_results->acc_type = iree_hal_buffer_view_element_type(acc); out_results->result_type = iree_hal_buffer_view_element_type(result); - iree_hal_buffer_t* input_buffer = iree_hal_buffer_view_buffer(input); - iree_hal_buffer_t* kernel_buffer = iree_hal_buffer_view_buffer(kernel); - iree_hal_buffer_t* acc_buffer = acc ? iree_hal_buffer_view_buffer(acc) : NULL; - iree_hal_buffer_t* result_buffer = iree_hal_buffer_view_buffer(result); + iree_hal_buffer_t *input_buffer = iree_hal_buffer_view_buffer(input); + iree_hal_buffer_t *kernel_buffer = iree_hal_buffer_view_buffer(kernel); + iree_hal_buffer_t *acc_buffer = acc ? 
iree_hal_buffer_view_buffer(acc) : NULL; + iree_hal_buffer_t *result_buffer = iree_hal_buffer_view_buffer(result); iree_status_t status = iree_ok_status(); @@ -455,7 +456,7 @@ static iree_status_t conv2d_results_initialize( iree_hal_buffer_byte_length(input_buffer); status = iree_allocator_malloc(host_allocator, out_results->input_contents.data_length, - (void**)&out_results->input_contents.data); + (void **)&out_results->input_contents.data); } if (iree_status_is_ok(status)) { status = iree_hal_device_transfer_d2h( @@ -469,7 +470,7 @@ static iree_status_t conv2d_results_initialize( iree_hal_buffer_byte_length(kernel_buffer); status = iree_allocator_malloc(host_allocator, out_results->kernel_contents.data_length, - (void**)&out_results->kernel_contents.data); + (void **)&out_results->kernel_contents.data); } if (iree_status_is_ok(status)) { status = iree_hal_device_transfer_d2h( @@ -484,7 +485,7 @@ static iree_status_t conv2d_results_initialize( iree_hal_buffer_byte_length(acc_buffer); status = iree_allocator_malloc(host_allocator, out_results->acc_contents.data_length, - (void**)&out_results->acc_contents.data); + (void **)&out_results->acc_contents.data); } if (iree_status_is_ok(status)) { status = iree_hal_device_transfer_d2h( @@ -499,7 +500,7 @@ static iree_status_t conv2d_results_initialize( iree_hal_buffer_byte_length(result_buffer); status = iree_allocator_malloc(host_allocator, out_results->actual_contents.data_length, - (void**)&out_results->actual_contents.data); + (void **)&out_results->actual_contents.data); } if (iree_status_is_ok(status)) { status = iree_hal_device_transfer_d2h( @@ -513,7 +514,7 @@ static iree_status_t conv2d_results_initialize( iree_hal_buffer_byte_length(result_buffer); status = iree_allocator_malloc( host_allocator, out_results->expected_contents.data_length, - (void**)&out_results->expected_contents.data); + (void **)&out_results->expected_contents.data); } if (!iree_status_is_ok(status)) { @@ -523,7 +524,7 @@ static iree_status_t conv2d_results_initialize( return status; } -static void conv2d_results_deinitialize(conv2d_results_t* results) { +static void conv2d_results_deinitialize(conv2d_results_t *results) { IREE_TRACE_ZONE_BEGIN(z0); iree_allocator_free(results->host_allocator, results->input_contents.data); @@ -541,8 +542,8 @@ static void conv2d_results_deinitialize(conv2d_results_t* results) { // obtained and validated the {n, f, oh, ow}_size values. On error, the first // index is returned where the actual and expected value doesn't match. TODO: // Add detailed logging to |file|. 
-static iree_status_t check_conv2d_results_impl(FILE* file, - const conv2d_results_t* results, +static iree_status_t check_conv2d_results_impl(FILE *file, + const conv2d_results_t *results, int check_every) { IREE_TRACE_ZONE_BEGIN(z0); @@ -566,7 +567,8 @@ static iree_status_t check_conv2d_results_impl(FILE* file, for (iree_hal_dim_t oc = 0; oc < results->f; ++oc) { for (iree_hal_dim_t oh = 0; oh < oh_size; ++oh) { for (iree_hal_dim_t ow = 0; ow < ow_size; ++ow) { - if (++count < check_every) continue; + if (++count < check_every) + continue; count = 0; iree_hal_dim_t idx = convert_to_1d_index(results->f, oh_size, ow_size, n, oc, oh, ow); @@ -578,7 +580,8 @@ static iree_status_t check_conv2d_results_impl(FILE* file, idx, results->result_type, results->expected_contents.data); if (!iree_test_utils_result_elements_agree(actual_value, expected_value)) { - printf("HERE: actual_value: %f, actual_value: %f\n", actual_value.f32, expected_value.f32); + printf("HERE: actual_value: %f, actual_value: %f\n", + actual_value.f32, expected_value.f32); fprintf( file, "\n\nerror: the actual and expected result tensors disagree " @@ -601,8 +604,8 @@ static iree_status_t check_conv2d_results_impl(FILE* file, // reference conv2d implementation on the same inputs to check if the output // is correct. On error, the first index is returned where the actual and // expected value doesn't match. TODO: Add detailed logging to |file|. -static iree_status_t check_conv2d_results(FILE* file, - const conv2d_results_t* results) { +static iree_status_t check_conv2d_results(FILE *file, + const conv2d_results_t *results) { IREE_TRACE_ZONE_BEGIN(z0); // TODO: Increase the check every param to reduce the number of comparisons. int check_every = 1; @@ -630,7 +633,7 @@ static iree_status_t check_conv2d_results(FILE* file, namespace iree { class Conv2dTestModuleState final { - public: +public: explicit Conv2dTestModuleState(iree_allocator_t host_allocator) : host_allocator_(host_allocator) {} ~Conv2dTestModuleState() = default; @@ -639,10 +642,10 @@ class Conv2dTestModuleState final { // |element_type|. The given |seed| is passed to the pseudorandom generator. // The pseudorandom values are reproducible both across runs and across // machines. - StatusOr> GenerateRandom4dTensor( - const vm::ref device, int64_t dim0, int64_t dim1, - int64_t dim2, int64_t dim3, iree_hal_element_type_t element_type, - int32_t seed) { + StatusOr> + GenerateRandom4dTensor(const vm::ref device, int64_t dim0, + int64_t dim1, int64_t dim2, int64_t dim3, + iree_hal_element_type_t element_type, int32_t seed) { iree_hal_dim_t dims[4] = { (iree_hal_dim_t)dim0, (iree_hal_dim_t)dim1, @@ -665,8 +668,8 @@ class Conv2dTestModuleState final { device.get(), iree_hal_device_allocator(device.get()), IREE_ARRAYSIZE(dims), dims, element_type, IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, buffer_params, - +[](iree_hal_buffer_mapping_t* mapping, void* user_data) { - callback_state_t callback_state = *(callback_state_t*)user_data; + +[](iree_hal_buffer_mapping_t *mapping, void *user_data) { + callback_state_t callback_state = *(callback_state_t *)user_data; iree_byte_span_t span = mapping->contents; // Generate "uniform" integer-valued numbers in the range [min, // max]. 
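For reference while reading the reference_conv2d_* kernels above: each variant flattens a 4-D coordinate through convert_to_1d_index before touching the host buffers (e.g. convert_to_1d_index(f_size, oh_size, ow_size, n, oc, oh, ow) for the NxFxOHxOW output layout). A minimal sketch of the row-major flattening this helper appears to perform; its actual definition lives elsewhere in the test utilities, so treat the signature and semantics shown here as assumptions:

def convert_to_1d_index(d1: int, d2: int, d3: int,
                        i0: int, i1: int, i2: int, i3: int) -> int:
    # Row-major flattening of index (i0, i1, i2, i3) into a buffer whose
    # trailing dimensions are d1 x d2 x d3 (the leading extent is implicit).
    # Assumption: this mirrors the convert_to_1d_index helper used by the
    # reference_conv2d_* kernels; its definition is not part of this patch.
    return ((i0 * d1 + i1) * d2 + i2) * d3 + i3

# Example for the layout == 0 (NxFxOHxOW) output of a 2x8x126x126 result:
assert convert_to_1d_index(8, 126, 126, 0, 0, 0, 5) == 5
assert convert_to_1d_index(8, 126, 126, 0, 1, 0, 0) == 126 * 126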
@@ -678,9 +681,9 @@ class Conv2dTestModuleState final { uint32_t range = (max - min + 1) / 4; iree_host_size_t element_byte_count = iree_hal_element_dense_byte_count(callback_state.element_type); - uint8_t* data_end = span.data + span.data_length; + uint8_t *data_end = span.data + span.data_length; uint32_t state = callback_state.seed; - for (uint8_t* data = span.data; data < data_end; + for (uint8_t *data = span.data; data < data_end; data += element_byte_count) { int32_t value = (int32_t)iree_test_utils_pseudorandom_range(&state, range) + @@ -694,14 +697,15 @@ class Conv2dTestModuleState final { return std::move(result_view); } - Status CheckConv2dResults( - const vm::ref device, int64_t n, int64_t c, int64_t h, - int64_t w, int64_t f, int64_t kh, int64_t kw, int64_t layout, int64_t sh, - int64_t sw, int64_t dh, int64_t dw, - const vm::ref input, - const vm::ref kernel, - const vm::ref acc, - const vm::ref actual_result) { + Status + CheckConv2dResults(const vm::ref device, int64_t n, + int64_t c, int64_t h, int64_t w, int64_t f, int64_t kh, + int64_t kw, int64_t layout, int64_t sh, int64_t sw, + int64_t dh, int64_t dw, + const vm::ref input, + const vm::ref kernel, + const vm::ref acc, + const vm::ref actual_result) { conv2d_results_t results = {}; IREE_RETURN_IF_ERROR(conv2d_results_initialize( device.get(), (iree_hal_dim_t)n, (iree_hal_dim_t)c, (iree_hal_dim_t)h, @@ -715,7 +719,7 @@ class Conv2dTestModuleState final { return status; } - private: +private: iree_allocator_t host_allocator_; }; @@ -729,17 +733,17 @@ static const vm::NativeFunction struct Conv2dTestModule final : public vm::NativeModule { using vm::NativeModule::NativeModule; - StatusOr> CreateState( - iree_allocator_t host_allocator) override { + StatusOr> + CreateState(iree_allocator_t host_allocator) override { return std::make_unique(host_allocator); } }; -} // namespace iree +} // namespace iree -static iree_status_t conv2d_test_module_create(iree_vm_instance_t* instance, +static iree_status_t conv2d_test_module_create(iree_vm_instance_t *instance, iree_allocator_t host_allocator, - iree_vm_module_t** out_module) { + iree_vm_module_t **out_module) { IREE_ASSERT_ARGUMENT(out_module); *out_module = NULL; auto module = std::make_unique( @@ -750,7 +754,7 @@ static iree_status_t conv2d_test_module_create(iree_vm_instance_t* instance, return iree_ok_status(); } -int main(int argc, char** argv) { +int main(int argc, char **argv) { IREE_TRACE_APP_ENTER(); iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_DEFAULT, &argc, &argv); From 5d518b3e2591434451adf2c84ec7a6fb66f89c9b Mon Sep 17 00:00:00 2001 From: erman-gurses Date: Wed, 23 Oct 2024 17:20:39 -0500 Subject: [PATCH 08/16] Data type increasing order Signed-off-by: erman-gurses --- linalg_ops/convolution/generate_e2e_conv2d_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linalg_ops/convolution/generate_e2e_conv2d_tests.py b/linalg_ops/convolution/generate_e2e_conv2d_tests.py index 491b48b..92e3c1a 100644 --- a/linalg_ops/convolution/generate_e2e_conv2d_tests.py +++ b/linalg_ops/convolution/generate_e2e_conv2d_tests.py @@ -39,9 +39,9 @@ class InputElemTypeId(enum.Enum): @enum.unique class AccElemTypeId(enum.Enum): NONE = "" + F16 = "f16" I32 = "i32" F32 = "f32" - F16 = "f16" # Enumerates of the collections of shapes that we can generate tests for. 
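A note on the generated shapes above (16x16 input with a 2x2 kernel yields 15x15, 32x32 with 3x3 yields 30x30, 128x128 with 3x3 yields 126x126): all of them follow the standard unpadded ("valid") convolution output-size formula. A minimal sketch, assuming stride and dilation of 1 as these tests set them:

def conv2d_output_size(in_size: int, k_size: int,
                       stride: int = 1, dilation: int = 1) -> int:
    # Unpadded convolution: the effective kernel extent is
    # (k_size - 1) * dilation + 1, and the window advances by `stride`.
    effective_k = (k_size - 1) * dilation + 1
    return (in_size - effective_k) // stride + 1

# Shapes used by the generated tests (stride = dilation = 1 throughout):
assert conv2d_output_size(16, 2) == 15     # small:  1x1x16x16   * 2x2
assert conv2d_output_size(32, 3) == 30     # medium: 2x2x32x32   * 3x3
assert conv2d_output_size(128, 3) == 126   # large:  2x4x128x128 * 3x3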
From 46a462bbb209c5f73cb510d8721214e556d251fd Mon Sep 17 00:00:00 2001 From: erman-gurses Date: Wed, 23 Oct 2024 20:28:23 -0500 Subject: [PATCH 09/16] Stop requiring exact results Signed-off-by: erman-gurses --- linalg_ops/iree-e2e-conv2d-test.cc | 2 -- linalg_ops/test_utils.c | 8 +++++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/linalg_ops/iree-e2e-conv2d-test.cc b/linalg_ops/iree-e2e-conv2d-test.cc index 2d1e986..a8d2391 100644 --- a/linalg_ops/iree-e2e-conv2d-test.cc +++ b/linalg_ops/iree-e2e-conv2d-test.cc @@ -580,8 +580,6 @@ static iree_status_t check_conv2d_results_impl(FILE *file, idx, results->result_type, results->expected_contents.data); if (!iree_test_utils_result_elements_agree(actual_value, expected_value)) { - printf("HERE: actual_value: %f, actual_value: %f\n", - actual_value.f32, expected_value.f32); fprintf( file, "\n\nerror: the actual and expected result tensors disagree " diff --git a/linalg_ops/test_utils.c b/linalg_ops/test_utils.c index 659ea2c..5f5ce16 100644 --- a/linalg_ops/test_utils.c +++ b/linalg_ops/test_utils.c @@ -22,7 +22,7 @@ #include "iree/tooling/device_util.h" #include "iree/vm/api.h" -IREE_FLAG(bool, require_exact_results, true, +IREE_FLAG(bool, require_exact_results, false, "Requires floating point result elements to match exactly."); bool iree_test_utils_require_exact_results(void) { @@ -194,8 +194,10 @@ bool iree_test_utils_result_elements_agree(iree_test_utils_e2e_value_t expected, // `require_exact_results` flag is set to `false`. case IREE_TEST_UTILS_VALUE_TYPE_F16: if (actual.f16 == expected.f16) return true; - return fabsf((actual.f16) - (expected.f16)) < - acceptable_fp_delta; + if (iree_test_utils_require_exact_results()) return false; + return fabsf(iree_math_f16_to_f32(actual.f16) - + iree_math_f16_to_f32(expected.f16)) < + acceptable_fp_delta; case IREE_TEST_UTILS_VALUE_TYPE_BF16: if (actual.bf16_u16 == expected.bf16_u16) return true; if (iree_test_utils_require_exact_results()) return false; From cd86b13146a94d2f3db5c2f2561c656e81a5b516 Mon Sep 17 00:00:00 2001 From: erman-gurses Date: Wed, 23 Oct 2024 20:36:46 -0500 Subject: [PATCH 10/16] Add test generation for GPU Signed-off-by: erman-gurses --- linalg_ops/convolution/CMakeLists.txt | 74 ++++++++++++ .../convolution/generate_test_mlir_files.sh | 35 ++++++ .../conv2d_f16_nchw_f16_fchw_f32_large.mlir | 8 ++ ...v2d_f16_nchw_f16_fchw_f32_large_calls.mlir | 108 ++++++++++++++++++ .../conv2d_f16_nhwc_f16_hwcf_f32_large.mlir | 8 ++ ...v2d_f16_nhwc_f16_hwcf_f32_large_calls.mlir | 108 ++++++++++++++++++ .../conv2d_i8_nhwc_i8_hwcf_i32_large.mlir | 8 ++ ...onv2d_i8_nhwc_i8_hwcf_i32_large_calls.mlir | 108 ++++++++++++++++++ 8 files changed, 457 insertions(+) create mode 100644 linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_large.mlir create mode 100644 linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_large_calls.mlir create mode 100644 linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_large.mlir create mode 100644 linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_large_calls.mlir create mode 100644 linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_large.mlir create mode 100644 linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_large_calls.mlir diff --git a/linalg_ops/convolution/CMakeLists.txt b/linalg_ops/convolution/CMakeLists.txt index 5b5f32e..cdcf6bc 100644 ---
a/linalg_ops/convolution/CMakeLists.txt +++ b/linalg_ops/convolution/CMakeLists.txt @@ -73,3 +73,77 @@ foreach(_DTYPE IN LISTS _DTYPES) ) endforeach() endforeach() + +############################################################################### +# +# GPU - ROCm/HIP, default flags. +# +############################################################################### + +# To distinguish between CDNA(gfx9) and RDNA3(gfx11) +if(IREE_HIP_TEST_TARGET_CHIP MATCHES "^gfx9") + +set(_SIZES) +list(APPEND _SIZES "large") + +set(_DTYPES_AND_LAYOUTS) +list(APPEND _DTYPES_AND_LAYOUTS "f16_nhwc_f16_hwcf_f32") +list(APPEND _DTYPES_AND_LAYOUTS "f16_nchw_f16_fchw_f32") +list(APPEND _DTYPES_AND_LAYOUTS "i8_nhwc_i8_hwcf_i32") + + +foreach(_DTYPE_AND_LAYOUT IN LISTS _DTYPES_AND_LAYOUTS) + foreach(_SIZE IN LISTS _SIZES) + iree_test_suites_runner_test( + NAME + conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE} + TESTS_SRC + "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}.mlir" + CALLS_SRC + "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}_calls.mlir" + TEST_RUNNER + iree-test-suites_iree-e2e-conv2d-test + TARGET_BACKEND + "rocm" + DRIVER + "hip" + COMPILER_FLAGS + "--iree-hip-target=${IREE_HIP_TEST_TARGET_CHIP}" + RUNNER_FLAGS + LABELS + ) + endforeach() +endforeach() + +elseif(IREE_HIP_TEST_TARGET_CHIP MATCHES "^gfx11") + +set(_SIZES) +list(APPEND _SIZES "large") + +set(_DTYPES_AND_LAYOUTS) +list(APPEND _DTYPES_AND_LAYOUTS "f16_nhwc_f16_hwcf_f32") + +foreach(_DTYPE_AND_LAYOUT IN LISTS _DTYPES_AND_LAYOUTS) + foreach(_SIZE IN LISTS _SIZES) + iree_test_suites_runner_test( + NAME + conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE} + TESTS_SRC + "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}.mlir" + CALLS_SRC + "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}_calls.mlir" + TEST_RUNNER + iree-test-suites_iree-e2e-conv2d-test + TARGET_BACKEND + "rocm" + DRIVER + "hip" + COMPILER_FLAGS + "--iree-hip-target=${IREE_HIP_TEST_TARGET_CHIP}" + RUNNER_FLAGS + LABELS + ) + endforeach() +endforeach() + +endif() diff --git a/linalg_ops/convolution/generate_test_mlir_files.sh b/linalg_ops/convolution/generate_test_mlir_files.sh index 9137999..f195ec4 100755 --- a/linalg_ops/convolution/generate_test_mlir_files.sh +++ b/linalg_ops/convolution/generate_test_mlir_files.sh @@ -76,3 +76,38 @@ for type_combination in ${type_combinations[@]}; do --shapes=${shape} done done + +shapes=( + "large" +) +# input_type;input_layout;kernel_type;kernel_layout;acc_type +type_and_layout_combinations=( + "f16;nhwc;f16;hwcf;f32" + "f16;nchw;f16;fchw;f32" + "i8;nhwc;i8;hwcf;i32" +) +for type_and_layout_combination in ${type_and_layout_combinations[@]}; do + IFS=";" read -r -a combination <<< "${type_and_layout_combination}" + input_type="${combination[0]}" + input_layout="${combination[1]}" + kernel_type="${combination[2]}" + kernel_layout="${combination[3]}" + acc_type="${combination[4]}" + type_layout_name="${input_type}_${input_layout}_${kernel_type}_${kernel_layout}_${acc_type}" + #layout_name="${input_layout}_${kernel_layout}" + type_combination_dir="${generated_dir_root}/${type_layout_name}" + mkdir -p ${type_combination_dir} + for shape in ${shapes[@]}; do + echo "Generating conv2d test files for ${type_layout_name}_${shape}" + name="conv2d_${type_layout_name}_${shape}" + python ${this_dir}/generate_e2e_conv2d_tests.py \ + --output_conv2d_mlir=${type_combination_dir}/${name}.mlir \ + --output_calls_mlir=${type_combination_dir}/${name}_calls.mlir \ + --input_type=${input_type} \ + 
--input_layout=${input_layout} \ + --kernel_type=${kernel_type} \ + --kernel_layout=${kernel_layout} \ + --acc_type=${acc_type} \ + --shapes=${shape} + done +done diff --git a/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_large.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_large.mlir new file mode 100644 index 0000000..21afe9d --- /dev/null +++ b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_large.mlir @@ -0,0 +1,8 @@ +func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f32(%lhs: tensor<2x4x128x128xf16>, %rhs: tensor<8x4x3x3xf16>, %acc: tensor<2x8x126x126xf32>) -> tensor<2x8x126x126xf32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x4x128x128xf16>, tensor<8x4x3x3xf16>) outs(%acc: tensor<2x8x126x126xf32>) -> tensor<2x8x126x126xf32> + return %result: tensor<2x8x126x126xf32> +} +func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f32(%lhs: tensor<2x3x128x128xf16>, %rhs: tensor<12x3x3x3xf16>, %acc: tensor<2x12x126x126xf32>) -> tensor<2x12x126x126xf32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x3x128x128xf16>, tensor<12x3x3x3xf16>) outs(%acc: tensor<2x12x126x126xf32>) -> tensor<2x12x126x126xf32> + return %result: tensor<2x12x126x126xf32> +} diff --git a/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_large_calls.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_large_calls.mlir new file mode 100644 index 0000000..34fdff2 --- /dev/null +++ b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_large_calls.mlir @@ -0,0 +1,108 @@ +builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) +func.func private @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f32_2_4_128_128_8_3_3_acc_0() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x4x128x128x8x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 4 : i64 + %input_dim2 = arith.constant 128 : i64 + %input_dim3 = arith.constant 128 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 2 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> 
!hal.buffer_view + %kernel_dim0 = arith.constant 8 : i64 + %kernel_dim1 = arith.constant 4 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_dim2 = arith.constant 126 : i64 + %acc_dim3 = arith.constant 126 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_dim2 = arith.constant 126 : i64 + %acc_copy_dim3 = arith.constant 126 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 4 : i64 + %h = arith.constant 128 : i64 + %w = arith.constant 128 : i64 + %f = arith.constant 8 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f32_2_3_128_128_12_3_3_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x3x128x128x12x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 3 : i64 + %input_dim2 = arith.constant 128 : i64 + %input_dim3 = arith.constant 128 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 5 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 12 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 6 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : 
i64 + %acc_dim1 = arith.constant 12 : i64 + %acc_dim2 = arith.constant 126 : i64 + %acc_dim3 = arith.constant 126 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 12 : i64 + %acc_copy_dim2 = arith.constant 126 : i64 + %acc_copy_dim3 = arith.constant 126 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 3 : i64 + %h = arith.constant 128 : i64 + %w = arith.constant 128 : i64 + %f = arith.constant 12 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +} diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_large.mlir b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_large.mlir new file mode 100644 index 0000000..2a7b2f2 --- /dev/null +++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_large.mlir @@ -0,0 +1,8 @@ +func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f32(%lhs: tensor<2x128x128x4xf16>, %rhs: tensor<3x3x4x8xf16>, %acc: tensor<2x126x126x8xf32>) -> tensor<2x126x126x8xf32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x128x128x4xf16>, tensor<3x3x4x8xf16>) outs(%acc: tensor<2x126x126x8xf32>) -> tensor<2x126x126x8xf32> + return %result: tensor<2x126x126x8xf32> +} +func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f32(%lhs: tensor<2x128x128x3xf16>, %rhs: tensor<3x3x3x12xf16>, %acc: tensor<2x126x126x12xf32>) -> tensor<2x126x126x12xf32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x128x128x3xf16>, tensor<3x3x3x12xf16>) outs(%acc: tensor<2x126x126x12xf32>) -> tensor<2x126x126x12xf32> + return %result: tensor<2x126x126x12xf32> +} diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_large_calls.mlir b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_large_calls.mlir new file mode 100644 index 0000000..cd7d928 --- /dev/null +++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_large_calls.mlir @@ -0,0 +1,108 @@ +builtin.module @calls 
attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) +func.func private @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f32_2_4_128_128_8_3_3_acc_0() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x4x128x128x8x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 128 : i64 + %input_dim2 = arith.constant 128 : i64 + %input_dim3 = arith.constant 4 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 2 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 3 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 4 : i64 + %kernel_dim3 = arith.constant 8 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 126 : i64 + %acc_dim2 = arith.constant 126 : i64 + %acc_dim3 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 126 : i64 + %acc_copy_dim2 = arith.constant 126 : i64 + %acc_copy_dim3 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 4 : i64 + %h = arith.constant 128 : i64 + %w = arith.constant 128 : i64 + %f = arith.constant 8 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 
1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f32_2_3_128_128_12_3_3_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x3x128x128x12x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 128 : i64 + %input_dim2 = arith.constant 128 : i64 + %input_dim3 = arith.constant 3 : i64 + %input_element_type = hal.element_type<f16> : i32 + %input_seed = arith.constant 5 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 3 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 12 : i64 + %kernel_element_type = hal.element_type<f16> : i32 + %kernel_seed = arith.constant 6 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 126 : i64 + %acc_dim2 = arith.constant 126 : i64 + %acc_dim3 = arith.constant 12 : i64 + %acc_element_type = hal.element_type<f32> : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 126 : i64 + %acc_copy_dim2 = arith.constant 126 : i64 + %acc_copy_dim3 = arith.constant 12 : i64 + %acc_copy_element_type = hal.element_type<f32> : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 3 : i64 + %h = arith.constant 128 : i64 + %w = arith.constant 128 : i64 + %f = arith.constant 12 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +} diff --git a/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_large.mlir
b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_large.mlir new file mode 100644 index 0000000..99911df --- /dev/null +++ b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_large.mlir @@ -0,0 +1,8 @@ +func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_i8_i8_i32(%lhs: tensor<2x128x128x4xi8>, %rhs: tensor<3x3x4x8xi8>, %acc: tensor<2x126x126x8xi32>) -> tensor<2x126x126x8xi32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x128x128x4xi8>, tensor<3x3x4x8xi8>) outs(%acc: tensor<2x126x126x8xi32>) -> tensor<2x126x126x8xi32> + return %result: tensor<2x126x126x8xi32> +} +func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_i8_i8_i32(%lhs: tensor<2x128x128x3xi8>, %rhs: tensor<3x3x3x12xi8>, %acc: tensor<2x126x126x12xi32>) -> tensor<2x126x126x12xi32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x128x128x3xi8>, tensor<3x3x3x12xi8>) outs(%acc: tensor<2x126x126x12xi32>) -> tensor<2x126x126x12xi32> + return %result: tensor<2x126x126x12xi32> +} diff --git a/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_large_calls.mlir b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_large_calls.mlir new file mode 100644 index 0000000..a863eca --- /dev/null +++ b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_large_calls.mlir @@ -0,0 +1,108 @@ +builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) +func.func private @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_i8_i8_i32_2_4_128_128_8_3_3_acc_0() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x4x128x128x8x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 128 : i64 + %input_dim2 = arith.constant 128 : i64 + %input_dim3 = arith.constant 4 : i64 + %input_element_type = hal.element_type<i8> : i32 + %input_seed = arith.constant 2 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 3 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 4 : i64 + %kernel_dim3 = arith.constant 8 : i64 + %kernel_element_type = hal.element_type<i8> : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call
@conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 126 : i64 + %acc_dim2 = arith.constant 126 : i64 + %acc_dim3 = arith.constant 8 : i64 + %acc_element_type = hal.element_type<i32> : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 126 : i64 + %acc_copy_dim2 = arith.constant 126 : i64 + %acc_copy_dim3 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type<i32> : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 4 : i64 + %h = arith.constant 128 : i64 + %w = arith.constant 128 : i64 + %f = arith.constant 8 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_i8_i8_i32_2_3_128_128_12_3_3_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x3x128x128x12x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 128 : i64 + %input_dim2 = arith.constant 128 : i64 + %input_dim3 = arith.constant 3 : i64 + %input_element_type = hal.element_type<i8> : i32 + %input_seed = arith.constant 5 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 3 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 12 : i64 + %kernel_element_type = hal.element_type<i8> : i32 + %kernel_seed = arith.constant 6 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 126 : i64 + %acc_dim2 = arith.constant 126 : i64 + %acc_dim3 = arith.constant 12 : i64 + %acc_element_type = hal.element_type<i32> : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1,
%acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 126 : i64 + %acc_copy_dim2 = arith.constant 126 : i64 + %acc_copy_dim3 = arith.constant 12 : i64 + %acc_copy_element_type = hal.element_type<i32> : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 3 : i64 + %h = arith.constant 128 : i64 + %w = arith.constant 128 : i64 + %f = arith.constant 12 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +} From 858881346e4852ebd55b666a2b6d1e6dae7316a7 Mon Sep 17 00:00:00 2001 From: erman-gurses Date: Wed, 23 Oct 2024 20:49:33 -0500 Subject: [PATCH 11/16] Formatting Signed-off-by: erman-gurses --- linalg_ops/test_utils.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/linalg_ops/test_utils.c b/linalg_ops/test_utils.c index 5f5ce16..05065b9 100644 --- a/linalg_ops/test_utils.c +++ b/linalg_ops/test_utils.c @@ -195,8 +195,8 @@ bool iree_test_utils_result_elements_agree(iree_test_utils_e2e_value_t expected, case IREE_TEST_UTILS_VALUE_TYPE_F16: if (actual.f16 == expected.f16) return true; if (iree_test_utils_require_exact_results()) return false; - return fabsf(iree_math_f16_to_f32(actual.f16) - - iree_math_f16_to_f32(expected.f16)) < + return fabsf(iree_math_f16_to_f32(actual.f16) - + iree_math_f16_to_f32(expected.f16)) < acceptable_fp_delta; case IREE_TEST_UTILS_VALUE_TYPE_BF16: if (actual.bf16_u16 == expected.bf16_u16) return true; From 427f4064ccd31fc764cd1d80fde72bc55696fccc Mon Sep 17 00:00:00 2001 From: erman-gurses Date: Thu, 24 Oct 2024 00:35:09 -0500 Subject: [PATCH 12/16] Remove old code Signed-off-by: erman-gurses --- linalg_ops/convolution/generate_test_mlir_files.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/linalg_ops/convolution/generate_test_mlir_files.sh b/linalg_ops/convolution/generate_test_mlir_files.sh index f195ec4..292ddf5 100755 --- a/linalg_ops/convolution/generate_test_mlir_files.sh +++ b/linalg_ops/convolution/generate_test_mlir_files.sh @@ -94,7 +94,6 @@ for type_and_layout_combination in ${type_and_layout_combinations[@]}; do kernel_layout="${combination[3]}" acc_type="${combination[4]}" type_layout_name="${input_type}_${input_layout}_${kernel_type}_${kernel_layout}_${acc_type}" - #layout_name="${input_layout}_${kernel_layout}" type_combination_dir="${generated_dir_root}/${type_layout_name}" mkdir -p ${type_combination_dir} for shape in ${shapes[@]}; do From 5295314e5db69ec26f623b0da592a758636ef289 Mon Sep 17 00:00:00 2001 From: erman-gurses Date: Thu, 24 Oct 2024 16:43:32
-0500 Subject: [PATCH 13/16] Add medium and small problem sizes for GPU targets Signed-off-by: erman-gurses --- linalg_ops/convolution/CMakeLists.txt | 6 - .../convolution/generate_test_mlir_files.sh | 3 - .../conv2d_f16_nchw_f16_fchw_f32_medium.mlir | 12 ++ ...2d_f16_nchw_f16_fchw_f32_medium_calls.mlir | 158 ++++++++++++++++++ .../conv2d_f16_nchw_f16_fchw_f32_small.mlir | 12 ++ ...v2d_f16_nchw_f16_fchw_f32_small_calls.mlir | 158 ++++++++++++++++++ .../conv2d_f16_nhwc_f16_hwcf_f32_medium.mlir | 12 ++ ...2d_f16_nhwc_f16_hwcf_f32_medium_calls.mlir | 158 ++++++++++++++++++ .../conv2d_f16_nhwc_f16_hwcf_f32_small.mlir | 12 ++ ...v2d_f16_nhwc_f16_hwcf_f32_small_calls.mlir | 158 ++++++++++++++++++ .../conv2d_i8_nhwc_i8_hwcf_i32_medium.mlir | 12 ++ ...nv2d_i8_nhwc_i8_hwcf_i32_medium_calls.mlir | 158 ++++++++++++++++++ .../conv2d_i8_nhwc_i8_hwcf_i32_small.mlir | 12 ++ ...onv2d_i8_nhwc_i8_hwcf_i32_small_calls.mlir | 158 ++++++++++++++++++ 14 files changed, 1020 insertions(+), 9 deletions(-) create mode 100644 linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_medium.mlir create mode 100644 linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_medium_calls.mlir create mode 100644 linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_small.mlir create mode 100644 linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_small_calls.mlir create mode 100644 linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_medium.mlir create mode 100644 linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_medium_calls.mlir create mode 100644 linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_small.mlir create mode 100644 linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_small_calls.mlir create mode 100644 linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_medium.mlir create mode 100644 linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_medium_calls.mlir create mode 100644 linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_small.mlir create mode 100644 linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_small_calls.mlir diff --git a/linalg_ops/convolution/CMakeLists.txt b/linalg_ops/convolution/CMakeLists.txt index cdcf6bc..ed8c033 100644 --- a/linalg_ops/convolution/CMakeLists.txt +++ b/linalg_ops/convolution/CMakeLists.txt @@ -83,9 +83,6 @@ endforeach() # To distinguish between CDNA(gfx9) and RDNA3(gfx11) if(IREE_HIP_TEST_TARGET_CHIP MATCHES "^gfx9") -set(_SIZES) -list(APPEND _SIZES "large") - set(_DTYPES_AND_LAYOUTS) list(APPEND _DTYPES_AND_LAYOUTS "f16_nhwc_f16_hwcf_f32") list(APPEND _DTYPES_AND_LAYOUTS "f16_nchw_f16_fchw_f32") @@ -117,9 +114,6 @@ endforeach() elseif(IREE_HIP_TEST_TARGET_CHIP MATCHES "^gfx11") -set(_SIZES) -list(APPEND _SIZES "large") - set(_DTYPES_AND_LAYOUTS) list(APPEND _DTYPES_AND_LAYOUTS "f16_nhwc_f16_hwcf_f32") diff --git a/linalg_ops/convolution/generate_test_mlir_files.sh b/linalg_ops/convolution/generate_test_mlir_files.sh index 292ddf5..1fabd77 100755 --- a/linalg_ops/convolution/generate_test_mlir_files.sh +++ b/linalg_ops/convolution/generate_test_mlir_files.sh @@ -77,9 +77,6 @@ for type_combination in ${type_combinations[@]}; do done done -shapes=( - "large" -) # input_type;input_layout;kernel_type;kernel_layout;acc_type 
type_and_layout_combinations=( "f16;nhwc;f16;hwcf;f32" diff --git a/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_medium.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_medium.mlir new file mode 100644 index 0000000..cd7d6a4 --- /dev/null +++ b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_medium.mlir @@ -0,0 +1,12 @@ +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%lhs: tensor<2x2x32x32xf16>, %rhs: tensor<2x2x3x3xf16>, %acc: tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf16>, tensor<2x2x3x3xf16>) outs(%acc: tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32> + return %result: tensor<2x2x30x30xf32> +} +func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f32(%lhs: tensor<2x2x32x32xf16>, %rhs: tensor<64x2x3x3xf16>, %acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf16>, tensor<64x2x3x3xf16>) outs(%acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> + return %result: tensor<2x64x30x30xf32> +} +func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f32(%lhs: tensor<2x32x32x32xf16>, %rhs: tensor<64x32x3x3xf16>, %acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x32xf16>, tensor<64x32x3x3xf16>) outs(%acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> + return %result: tensor<2x64x30x30xf32> +} diff --git a/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_medium_calls.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_medium_calls.mlir new file mode 100644 index 0000000..451175c --- /dev/null +++ b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_medium_calls.mlir @@ -0,0 +1,158 @@ +builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32_2_2_32_32_2_3_3_acc_0() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} +} { + %device_index = 
arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 2 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 32 : i64 + %input_element_type = hal.element_type<f16> : i32 + %input_seed = arith.constant 2 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 2 : i64 + %kernel_dim1 = arith.constant 2 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type<f16> : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 30 : i64 + %acc_element_type = hal.element_type<f32> : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 30 : i64 + %acc_copy_element_type = hal.element_type<f32> : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 2 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f32_2_2_32_32_64_3_3_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x64x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 2 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 32 : i64 + %input_element_type = hal.element_type<f16> : i32 + %input_seed = arith.constant 5 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) ->
!hal.buffer_view + %kernel_dim0 = arith.constant 64 : i64 + %kernel_dim1 = arith.constant 2 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type<f16> : i32 + %kernel_seed = arith.constant 6 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 64 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 30 : i64 + %acc_element_type = hal.element_type<f32> : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 64 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 30 : i64 + %acc_copy_element_type = hal.element_type<f32> : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 64 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f32_2_32_32_32_64_3_3_acc_2() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x32x32x32x64x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 32 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 32 : i64 + %input_element_type = hal.element_type<f16> : i32 + %input_seed = arith.constant 8 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 64 : i64 + %kernel_dim1 = arith.constant 32 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type<f16> : i32 + %kernel_seed = arith.constant 9 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 +
%acc_dim1 = arith.constant 64 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 30 : i64 + %acc_element_type = hal.element_type<f32> : i32 + %acc_seed = arith.constant 10 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 64 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 30 : i64 + %acc_copy_element_type = hal.element_type<f32> : i32 + %acc_copy_seed = arith.constant 10 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 32 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 64 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +} diff --git a/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_small.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_small.mlir new file mode 100644 index 0000000..f2d0ea0 --- /dev/null +++ b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_small.mlir @@ -0,0 +1,12 @@ +func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f32(%lhs: tensor<1x1x1x1xf16>, %rhs: tensor<1x1x1x1xf16>, %acc: tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16>) outs(%acc: tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> + return %result: tensor<1x1x1x1xf32> +} +func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f32(%lhs: tensor<1x1x16x16xf16>, %rhs: tensor<1x1x2x2xf16>, %acc: tensor<1x1x15x15xf32>) -> tensor<1x1x15x15xf32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x16x16xf16>, tensor<1x1x2x2xf16>) outs(%acc: tensor<1x1x15x15xf32>) -> tensor<1x1x15x15xf32> + return %result: tensor<1x1x15x15xf32> +} +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%lhs: tensor<2x2x32x32xf16>, %rhs: tensor<2x2x3x3xf16>, %acc: tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf16>, tensor<2x2x3x3xf16>) outs(%acc: tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32> + return %result: tensor<2x2x30x30xf32> +} diff --git
a/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_small_calls.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_small_calls.mlir new file mode 100644 index 0000000..5c160c6 --- /dev/null +++ b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_small_calls.mlir @@ -0,0 +1,158 @@ +builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) +func.func private @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f32_1_1_1_1_1_1_1_acc_0() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x1x1x1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 1 : i64 + %input_dim1 = arith.constant 1 : i64 + %input_dim2 = arith.constant 1 : i64 + %input_dim3 = arith.constant 1 : i64 + %input_element_type = hal.element_type<f16> : i32 + %input_seed = arith.constant 2 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 1 : i64 + %kernel_dim1 = arith.constant 1 : i64 + %kernel_dim2 = arith.constant 1 : i64 + %kernel_dim3 = arith.constant 1 : i64 + %kernel_element_type = hal.element_type<f16> : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_dim2 = arith.constant 1 : i64 + %acc_dim3 = arith.constant 1 : i64 + %acc_element_type = hal.element_type<f32> : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_dim2 = arith.constant 1 : i64 + %acc_copy_dim3 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type<f32> : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64,
i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 1 : i64 + %c = arith.constant 1 : i64 + %h = arith.constant 1 : i64 + %w = arith.constant 1 : i64 + %f = arith.constant 1 : i64 + %kh = arith.constant 1 : i64 + %kw = arith.constant 1 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f32_1_1_16_16_1_2_2_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x16x16x1x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 1 : i64 + %input_dim1 = arith.constant 1 : i64 + %input_dim2 = arith.constant 16 : i64 + %input_dim3 = arith.constant 16 : i64 + %input_element_type = hal.element_type<f16> : i32 + %input_seed = arith.constant 5 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 1 : i64 + %kernel_dim1 = arith.constant 1 : i64 + %kernel_dim2 = arith.constant 2 : i64 + %kernel_dim3 = arith.constant 2 : i64 + %kernel_element_type = hal.element_type<f16> : i32 + %kernel_seed = arith.constant 6 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_dim2 = arith.constant 15 : i64 + %acc_dim3 = arith.constant 15 : i64 + %acc_element_type = hal.element_type<f32> : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_dim2 = arith.constant 15 : i64 + %acc_copy_dim3 = arith.constant 15 : i64 + %acc_copy_element_type = hal.element_type<f32> : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 1 : i64 + %c = arith.constant 1 : i64 + %h = arith.constant 16 : i64 + %w = arith.constant 16 : i64 + %f = arith.constant 1 : i64 + %kh = arith.constant 2 : i64 + %kw = arith.constant 2 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 +
%dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32_2_2_32_32_2_3_3_acc_2() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 2 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 32 : i64 + %input_element_type = hal.element_type<f16> : i32 + %input_seed = arith.constant 8 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 2 : i64 + %kernel_dim1 = arith.constant 2 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type<f16> : i32 + %kernel_seed = arith.constant 9 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 30 : i64 + %acc_element_type = hal.element_type<f32> : i32 + %acc_seed = arith.constant 10 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 30 : i64 + %acc_copy_element_type = hal.element_type<f32> : i32 + %acc_copy_seed = arith.constant 10 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 2 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +} diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_medium.mlir
b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_medium.mlir new file mode 100644 index 0000000..c77e99c --- /dev/null +++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_medium.mlir @@ -0,0 +1,12 @@ +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%lhs: tensor<2x32x32x2xf16>, %rhs: tensor<3x3x2x2xf16>, %acc: tensor<2x30x30x2xf32>) -> tensor<2x30x30x2xf32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x2xf16>, tensor<3x3x2x2xf16>) outs(%acc: tensor<2x30x30x2xf32>) -> tensor<2x30x30x2xf32> + return %result: tensor<2x30x30x2xf32> +} +func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f32(%lhs: tensor<2x32x32x2xf16>, %rhs: tensor<3x3x2x64xf16>, %acc: tensor<2x30x30x64xf32>) -> tensor<2x30x30x64xf32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x2xf16>, tensor<3x3x2x64xf16>) outs(%acc: tensor<2x30x30x64xf32>) -> tensor<2x30x30x64xf32> + return %result: tensor<2x30x30x64xf32> +} +func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f32(%lhs: tensor<2x32x32x32xf16>, %rhs: tensor<3x3x32x64xf16>, %acc: tensor<2x30x30x64xf32>) -> tensor<2x30x30x64xf32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x32xf16>, tensor<3x3x32x64xf16>) outs(%acc: tensor<2x30x30x64xf32>) -> tensor<2x30x30x64xf32> + return %result: tensor<2x30x30x64xf32> +} diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_medium_calls.mlir b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_medium_calls.mlir new file mode 100644 index 0000000..0c7afed --- /dev/null +++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_medium_calls.mlir @@ -0,0 +1,158 @@ +builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32_2_2_32_32_2_3_3_acc_0() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 32 : i64 + 
%input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 2 : i64 + %input_element_type = hal.element_type<f16> : i32 + %input_seed = arith.constant 2 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 3 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 2 : i64 + %kernel_dim3 = arith.constant 2 : i64 + %kernel_element_type = hal.element_type<f16> : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 30 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 2 : i64 + %acc_element_type = hal.element_type<f32> : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 30 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type<f32> : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 2 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f32_2_2_32_32_64_3_3_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x64x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 32 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 2 : i64 + %input_element_type = hal.element_type<f16> : i32 + %input_seed = arith.constant 5 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 3 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 2 : i64 + %kernel_dim3 = arith.constant 64 :
i64 + %kernel_element_type = hal.element_type<f16> : i32 + %kernel_seed = arith.constant 6 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 30 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 64 : i64 + %acc_element_type = hal.element_type<f32> : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 30 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 64 : i64 + %acc_copy_element_type = hal.element_type<f32> : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 64 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f32_2_32_32_32_64_3_3_acc_2() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x32x32x32x64x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 32 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 32 : i64 + %input_element_type = hal.element_type<f16> : i32 + %input_seed = arith.constant 8 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 3 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 32 : i64 + %kernel_dim3 = arith.constant 64 : i64 + %kernel_element_type = hal.element_type<f16> : i32 + %kernel_seed = arith.constant 9 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 30 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 64 : i64 + %acc_element_type = hal.element_type<f32> : i32 + %acc_seed =
arith.constant 10 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 30 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 64 : i64 + %acc_copy_element_type = hal.element_type<f32> : i32 + %acc_copy_seed = arith.constant 10 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 32 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 64 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +} diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_small.mlir b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_small.mlir new file mode 100644 index 0000000..59e9504 --- /dev/null +++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_small.mlir @@ -0,0 +1,12 @@ +func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f32(%lhs: tensor<1x1x1x1xf16>, %rhs: tensor<1x1x1x1xf16>, %acc: tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16>) outs(%acc: tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> + return %result: tensor<1x1x1x1xf32> +} +func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f32(%lhs: tensor<1x16x16x1xf16>, %rhs: tensor<2x2x1x1xf16>, %acc: tensor<1x15x15x1xf32>) -> tensor<1x15x15x1xf32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x16x16x1xf16>, tensor<2x2x1x1xf16>) outs(%acc: tensor<1x15x15x1xf32>) -> tensor<1x15x15x1xf32> + return %result: tensor<1x15x15x1xf32> +} +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%lhs: tensor<2x32x32x2xf16>, %rhs: tensor<3x3x2x2xf16>, %acc: tensor<2x30x30x2xf32>) -> tensor<2x30x30x2xf32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x2xf16>, tensor<3x3x2x2xf16>) outs(%acc: tensor<2x30x30x2xf32>) -> tensor<2x30x30x2xf32> + return %result: tensor<2x30x30x2xf32> +} diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_small_calls.mlir
b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_small_calls.mlir new file mode 100644 index 0000000..6a9ab15 --- /dev/null +++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_small_calls.mlir @@ -0,0 +1,158 @@ +builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) +func.func private @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f32_1_1_1_1_1_1_1_acc_0() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x1x1x1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 1 : i64 + %input_dim1 = arith.constant 1 : i64 + %input_dim2 = arith.constant 1 : i64 + %input_dim3 = arith.constant 1 : i64 + %input_element_type = hal.element_type<f16> : i32 + %input_seed = arith.constant 2 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 1 : i64 + %kernel_dim1 = arith.constant 1 : i64 + %kernel_dim2 = arith.constant 1 : i64 + %kernel_dim3 = arith.constant 1 : i64 + %kernel_element_type = hal.element_type<f16> : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_dim2 = arith.constant 1 : i64 + %acc_dim3 = arith.constant 1 : i64 + %acc_element_type = hal.element_type<f32> : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_dim2 = arith.constant 1 : i64 + %acc_copy_dim3 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type<f32> : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call
@module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 1 : i64 + %c = arith.constant 1 : i64 + %h = arith.constant 1 : i64 + %w = arith.constant 1 : i64 + %f = arith.constant 1 : i64 + %kh = arith.constant 1 : i64 + %kw = arith.constant 1 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f32_1_1_16_16_1_2_2_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x16x16x1x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 1 : i64 + %input_dim1 = arith.constant 16 : i64 + %input_dim2 = arith.constant 16 : i64 + %input_dim3 = arith.constant 1 : i64 + %input_element_type = hal.element_type<f16> : i32 + %input_seed = arith.constant 5 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 2 : i64 + %kernel_dim1 = arith.constant 2 : i64 + %kernel_dim2 = arith.constant 1 : i64 + %kernel_dim3 = arith.constant 1 : i64 + %kernel_element_type = hal.element_type<f16> : i32 + %kernel_seed = arith.constant 6 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 15 : i64 + %acc_dim2 = arith.constant 15 : i64 + %acc_dim3 = arith.constant 1 : i64 + %acc_element_type = hal.element_type<f32> : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 15 : i64 + %acc_copy_dim2 = arith.constant 15 : i64 + %acc_copy_dim3 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type<f32> : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 1 : i64 + %c = arith.constant 1 : i64 + %h = arith.constant 16 : i64 + %w = arith.constant 16 : i64 + %f = arith.constant 1 : i64 + %kh = arith.constant 2 : i64 + %kw = arith.constant 2 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 :
i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32_2_2_32_32_2_3_3_acc_2() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 32 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 2 : i64 + %input_element_type = hal.element_type<f16> : i32 + %input_seed = arith.constant 8 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 3 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 2 : i64 + %kernel_dim3 = arith.constant 2 : i64 + %kernel_element_type = hal.element_type<f16> : i32 + %kernel_seed = arith.constant 9 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 30 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 2 : i64 + %acc_element_type = hal.element_type<f32> : i32 + %acc_seed = arith.constant 10 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 30 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type<f32> : i32 + %acc_copy_seed = arith.constant 10 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 2 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +} diff --git a/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_medium.mlir b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_medium.mlir new file mode 100644
index 0000000..5b8985b --- /dev/null +++ b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_medium.mlir @@ -0,0 +1,12 @@ +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%lhs: tensor<2x32x32x2xi8>, %rhs: tensor<3x3x2x2xi8>, %acc: tensor<2x30x30x2xi32>) -> tensor<2x30x30x2xi32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x2xi8>, tensor<3x3x2x2xi8>) outs(%acc: tensor<2x30x30x2xi32>) -> tensor<2x30x30x2xi32> + return %result: tensor<2x30x30x2xi32> +} +func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_i8_i8_i32(%lhs: tensor<2x32x32x2xi8>, %rhs: tensor<3x3x2x64xi8>, %acc: tensor<2x30x30x64xi32>) -> tensor<2x30x30x64xi32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x2xi8>, tensor<3x3x2x64xi8>) outs(%acc: tensor<2x30x30x64xi32>) -> tensor<2x30x30x64xi32> + return %result: tensor<2x30x30x64xi32> +} +func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_i8_i8_i32(%lhs: tensor<2x32x32x32xi8>, %rhs: tensor<3x3x32x64xi8>, %acc: tensor<2x30x30x64xi32>) -> tensor<2x30x30x64xi32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x32xi8>, tensor<3x3x32x64xi8>) outs(%acc: tensor<2x30x30x64xi32>) -> tensor<2x30x30x64xi32> + return %result: tensor<2x30x30x64xi32> +} diff --git a/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_medium_calls.mlir b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_medium_calls.mlir new file mode 100644 index 0000000..e4c2495 --- /dev/null +++ b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_medium_calls.mlir @@ -0,0 +1,158 @@ +builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32_2_2_32_32_2_3_3_acc_0() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 32 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 2 : i64 + %input_element_type = hal.element_type<i8> : i32 + %input_seed = arith.constant 2 : i32 +
%input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 3 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 2 : i64 + %kernel_dim3 = arith.constant 2 : i64 + %kernel_element_type = hal.element_type<i8> : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 30 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 2 : i64 + %acc_element_type = hal.element_type<i32> : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 30 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type<i32> : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 2 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_i8_i8_i32_2_2_32_32_64_3_3_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x64x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 32 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 2 : i64 + %input_element_type = hal.element_type<i8> : i32 + %input_seed = arith.constant 5 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 3 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 2 : i64 + %kernel_dim3 = arith.constant 64 : i64 + %kernel_element_type = hal.element_type<i8> : i32 + %kernel_seed = arith.constant 6 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0,
%kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 30 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 64 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 30 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 64 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 64 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_i8_i8_i32_2_32_32_32_64_3_3_acc_2() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x32x32x32x64x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 32 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 32 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 8 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 3 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 32 : i64 + %kernel_dim3 = arith.constant 64 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 9 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 30 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 64 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 10 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, 
i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 30 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 64 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 10 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 32 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 64 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +} diff --git a/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_small.mlir b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_small.mlir new file mode 100644 index 0000000..5d52f93 --- /dev/null +++ b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_small.mlir @@ -0,0 +1,12 @@ +func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_i8_i8_i32(%lhs: tensor<1x1x1x1xi8>, %rhs: tensor<1x1x1x1xi8>, %acc: tensor<1x1x1x1xi32>) -> tensor<1x1x1x1xi32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x1x1xi8>, tensor<1x1x1x1xi8>) outs(%acc: tensor<1x1x1x1xi32>) -> tensor<1x1x1x1xi32> + return %result: tensor<1x1x1x1xi32> +} +func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_i8_i8_i32(%lhs: tensor<1x16x16x1xi8>, %rhs: tensor<2x2x1x1xi8>, %acc: tensor<1x15x15x1xi32>) -> tensor<1x15x15x1xi32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x16x16x1xi8>, tensor<2x2x1x1xi8>) outs(%acc: tensor<1x15x15x1xi32>) -> tensor<1x15x15x1xi32> + return %result: tensor<1x15x15x1xi32> +} +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%lhs: tensor<2x32x32x2xi8>, %rhs: tensor<3x3x2x2xi8>, %acc: tensor<2x30x30x2xi32>) -> tensor<2x30x30x2xi32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x2xi8>, tensor<3x3x2x2xi8>) outs(%acc: tensor<2x30x30x2xi32>) -> tensor<2x30x30x2xi32> + return %result: tensor<2x30x30x2xi32> +} diff --git a/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_small_calls.mlir b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_small_calls.mlir new file mode 100644 index 0000000..da9b803 --- /dev/null +++ b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_small_calls.mlir @@ -0,0 +1,158 @@ 
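Each test function in these calls files generates %acc and %acc_copy from the same seed, passes the bit-identical %acc_copy to the compiled module, and hands the untouched %acc to check_conv2d_results so the runner can accumulate its reference on pristine data. A naive NumPy sketch of what that check conceptually recomputes for the NHWC x HWCF layout (assumed linalg.conv_2d_nhwc_hwcf semantics; the actual reference is the C++ in iree-e2e-conv2d-test.cc, and this helper name is hypothetical):

    import numpy as np

    # Stride-1, dilation-1, pad-0 NHWC x HWCF convolution accumulated on top
    # of the original acc tensor, widened to int64 for the i8 -> i32 case.
    def reference_conv2d_nhwc_hwcf(lhs, rhs, acc):
        n, h, w, c = lhs.shape
        kh, kw, _, f = rhs.shape
        oh, ow = h - kh + 1, w - kw + 1
        out = acc.astype(np.int64).copy()
        rhs64 = rhs.astype(np.int64)
        for b in range(n):
            for y in range(oh):
                for x in range(ow):
                    window = lhs[b, y:y + kh, x:x + kw, :].astype(np.int64)
                    # Contract the KHxKWxC window against each of the F filters.
                    out[b, y, x, :] += np.tensordot(window, rhs64, axes=3)
        return out

    out = reference_conv2d_nhwc_hwcf(np.zeros((1, 16, 16, 1), np.int8),
                                     np.zeros((2, 2, 1, 1), np.int8),
                                     np.zeros((1, 15, 15, 1), np.int32))
    assert out.shape == (1, 15, 15, 1)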
+builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) +func.func private @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_i8_i8_i32_1_1_1_1_1_1_1_acc_0() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x1x1x1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 1 : i64 + %input_dim1 = arith.constant 1 : i64 + %input_dim2 = arith.constant 1 : i64 + %input_dim3 = arith.constant 1 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 2 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 1 : i64 + %kernel_dim1 = arith.constant 1 : i64 + %kernel_dim2 = arith.constant 1 : i64 + %kernel_dim3 = arith.constant 1 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_dim2 = arith.constant 1 : i64 + %acc_dim3 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_dim2 = arith.constant 1 : i64 + %acc_copy_dim3 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 1 : i64 + %c = arith.constant 1 : i64 + %h = arith.constant 1 : i64 + %w = arith.constant 1 : i64 + %f = arith.constant 1 : i64 + %kh = arith.constant 1 : 
i64 + %kw = arith.constant 1 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_i8_i8_i32_1_1_16_16_1_2_2_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x16x16x1x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 1 : i64 + %input_dim1 = arith.constant 16 : i64 + %input_dim2 = arith.constant 16 : i64 + %input_dim3 = arith.constant 1 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 5 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 2 : i64 + %kernel_dim1 = arith.constant 2 : i64 + %kernel_dim2 = arith.constant 1 : i64 + %kernel_dim3 = arith.constant 1 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 6 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 15 : i64 + %acc_dim2 = arith.constant 15 : i64 + %acc_dim3 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 15 : i64 + %acc_copy_dim2 = arith.constant 15 : i64 + %acc_copy_dim3 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 1 : i64 + %c = arith.constant 1 : i64 + %h = arith.constant 16 : i64 + %w = arith.constant 16 : i64 + %f = arith.constant 1 : i64 + %kh = arith.constant 2 : i64 + %kw = arith.constant 2 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func 
@conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32_2_2_32_32_2_3_3_acc_2() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 32 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 2 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 8 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 3 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 2 : i64 + %kernel_dim3 = arith.constant 2 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 9 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 30 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 10 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 30 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 10 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 2 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +} From bb4257b34e4943f0d7bc1798b8fce2136d2c86f9 Mon Sep 17 00:00:00 2001 From: erman-gurses Date: Tue, 29 Oct 2024 17:07:22 -0500 Subject: [PATCH 14/16] Address multiple comments Signed-off-by: erman-gurses --- linalg_ops/convolution/CMakeLists.txt | 208 ++++++++++++++++-- .../convolution/generate_e2e_conv2d_tests.py | 2 +- .../convolution/generate_test_mlir_files.sh | 66 ++---- .../conv2d_f16_nchw_f16_fchw_f16_large.mlir} | 0 ...2d_f16_nchw_f16_fchw_f16_large_calls.mlir} | 0 .../conv2d_f16_nchw_f16_fchw_f16_medium.mlir} | 4 +- 
...d_f16_nchw_f16_fchw_f16_medium_calls.mlir} | 14 +- .../conv2d_f16_nchw_f16_fchw_f16_small.mlir} | 0 ...2d_f16_nchw_f16_fchw_f16_small_calls.mlir} | 0 .../conv2d_f16_nchw_f16_fchw_f32_medium.mlir | 4 +- ...2d_f16_nchw_f16_fchw_f32_medium_calls.mlir | 14 +- .../conv2d_f16_nhwc_f16_hwcf_f16_large.mlir | 8 + ...v2d_f16_nhwc_f16_hwcf_f16_large_calls.mlir | 108 +++++++++ .../conv2d_f16_nhwc_f16_hwcf_f16_medium.mlir | 12 + ...2d_f16_nhwc_f16_hwcf_f16_medium_calls.mlir | 158 +++++++++++++ .../conv2d_f16_nhwc_f16_hwcf_f16_small.mlir | 12 + ...v2d_f16_nhwc_f16_hwcf_f16_small_calls.mlir | 158 +++++++++++++ .../conv2d_f16_nhwc_f16_hwcf_f32_medium.mlir | 4 +- ...2d_f16_nhwc_f16_hwcf_f32_medium_calls.mlir | 14 +- .../conv2d_f32_nchw_f32_fchw_f32_large.mlir} | 0 ...2d_f32_nchw_f32_fchw_f32_large_calls.mlir} | 0 .../conv2d_f32_nchw_f32_fchw_f32_medium.mlir} | 4 +- ...d_f32_nchw_f32_fchw_f32_medium_calls.mlir} | 14 +- .../conv2d_f32_nchw_f32_fchw_f32_small.mlir} | 0 ...2d_f32_nchw_f32_fchw_f32_small_calls.mlir} | 0 .../conv2d_f32_nhwc_f32_hwcf_f32_large.mlir | 8 + ...v2d_f32_nhwc_f32_hwcf_f32_large_calls.mlir | 108 +++++++++ .../conv2d_f32_nhwc_f32_hwcf_f32_medium.mlir | 12 + ...2d_f32_nhwc_f32_hwcf_f32_medium_calls.mlir | 158 +++++++++++++ .../conv2d_f32_nhwc_f32_hwcf_f32_small.mlir | 12 + ...v2d_f32_nhwc_f32_hwcf_f32_small_calls.mlir | 158 +++++++++++++ .../conv2d_i8_nchw_i8_fchw_i32_large.mlir | 8 + ...onv2d_i8_nchw_i8_fchw_i32_large_calls.mlir | 108 +++++++++ .../conv2d_i8_nchw_i8_fchw_i32_medium.mlir | 12 + ...nv2d_i8_nchw_i8_fchw_i32_medium_calls.mlir | 158 +++++++++++++ .../conv2d_i8_nchw_i8_fchw_i32_small.mlir | 12 + ...onv2d_i8_nchw_i8_fchw_i32_small_calls.mlir | 158 +++++++++++++ .../conv2d_i8_nhwc_i8_hwcf_i32_medium.mlir | 4 +- ...nv2d_i8_nhwc_i8_hwcf_i32_medium_calls.mlir | 14 +- linalg_ops/test_utils.c | 8 +- linalg_ops/test_utils.h | 1 - 41 files changed, 1627 insertions(+), 116 deletions(-) rename linalg_ops/convolution/generated/{f16_f16_f16/conv2d_f16_f16_f16_large.mlir => f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_large.mlir} (100%) rename linalg_ops/convolution/generated/{f16_f16_f16/conv2d_f16_f16_f16_large_calls.mlir => f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_large_calls.mlir} (100%) rename linalg_ops/convolution/generated/{f16_f16_f16/conv2d_f16_f16_f16_medium.mlir => f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_medium.mlir} (86%) rename linalg_ops/convolution/generated/{f16_f16_f16/conv2d_f16_f16_f16_medium_calls.mlir => f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_medium_calls.mlir} (96%) rename linalg_ops/convolution/generated/{f16_f16_f16/conv2d_f16_f16_f16_small.mlir => f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_small.mlir} (100%) rename linalg_ops/convolution/generated/{f16_f16_f16/conv2d_f16_f16_f16_small_calls.mlir => f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_small_calls.mlir} (100%) create mode 100644 linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_large.mlir create mode 100644 linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_large_calls.mlir create mode 100644 linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_medium.mlir create mode 100644 linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_medium_calls.mlir create mode 100644 linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_small.mlir create mode 100644 
linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_small_calls.mlir rename linalg_ops/convolution/generated/{f32_f32_f32/conv2d_f32_f32_f32_large.mlir => f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_large.mlir} (100%) rename linalg_ops/convolution/generated/{f32_f32_f32/conv2d_f32_f32_f32_large_calls.mlir => f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_large_calls.mlir} (100%) rename linalg_ops/convolution/generated/{f32_f32_f32/conv2d_f32_f32_f32_medium.mlir => f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_medium.mlir} (86%) rename linalg_ops/convolution/generated/{f32_f32_f32/conv2d_f32_f32_f32_medium_calls.mlir => f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_medium_calls.mlir} (96%) rename linalg_ops/convolution/generated/{f32_f32_f32/conv2d_f32_f32_f32_small.mlir => f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_small.mlir} (100%) rename linalg_ops/convolution/generated/{f32_f32_f32/conv2d_f32_f32_f32_small_calls.mlir => f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_small_calls.mlir} (100%) create mode 100644 linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_large.mlir create mode 100644 linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_large_calls.mlir create mode 100644 linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_medium.mlir create mode 100644 linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_medium_calls.mlir create mode 100644 linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_small.mlir create mode 100644 linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_small_calls.mlir create mode 100644 linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_large.mlir create mode 100644 linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_large_calls.mlir create mode 100644 linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_medium.mlir create mode 100644 linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_medium_calls.mlir create mode 100644 linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_small.mlir create mode 100644 linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_small_calls.mlir diff --git a/linalg_ops/convolution/CMakeLists.txt b/linalg_ops/convolution/CMakeLists.txt index ed8c033..73bf3cf 100644 --- a/linalg_ops/convolution/CMakeLists.txt +++ b/linalg_ops/convolution/CMakeLists.txt @@ -12,26 +12,30 @@ list(APPEND _SIZES "large") list(APPEND _SIZES "medium") list(APPEND _SIZES "small") + +set(_DTYPES_AND_LAYOUTS) +list(APPEND _DTYPES_AND_LAYOUTS "f16_nhwc_f16_hwcf_f16") +list(APPEND _DTYPES_AND_LAYOUTS "f16_nchw_f16_fchw_f16") +list(APPEND _DTYPES_AND_LAYOUTS "f16_nhwc_f16_hwcf_f32") +list(APPEND _DTYPES_AND_LAYOUTS "f16_nchw_f16_fchw_f32") +list(APPEND _DTYPES_AND_LAYOUTS "f32_nhwc_f32_hwcf_f32") +list(APPEND _DTYPES_AND_LAYOUTS "f32_nchw_f32_fchw_f32") + ############################################################################### # # CPU - llvm-cpu on local-task, default flags. 
# ############################################################################### - -set(_DTYPES) -list(APPEND _DTYPES "f16_f16_f16") -list(APPEND _DTYPES "f32_f32_f32") - -foreach(_DTYPE IN LISTS _DTYPES) +foreach(_DTYPE_AND_LAYOUT IN LISTS _DTYPES_AND_LAYOUTS) foreach(_SIZE IN LISTS _SIZES) iree_test_suites_runner_test( NAME - conv2d_${_DTYPE}_${_SIZE} + conv2d_llvm-cpu_local-task_${_DTYPE_AND_LAYOUT}_${_SIZE} TESTS_SRC - "generated/${_DTYPE}/conv2d_${_DTYPE}_${_SIZE}.mlir" + "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}.mlir" CALLS_SRC - "generated/${_DTYPE}/conv2d_${_DTYPE}_${_SIZE}_calls.mlir" + "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}_calls.mlir" TEST_RUNNER iree-test-suites_iree-e2e-conv2d-test TARGET_BACKEND @@ -47,15 +51,21 @@ foreach(_DTYPE IN LISTS _DTYPES) endforeach() endforeach() -foreach(_DTYPE IN LISTS _DTYPES) +############################################################################### +# +# CPU - Winograd llvm-cpu on local-task, default flags. +# +############################################################################### + +foreach(_DTYPE_AND_LAYOUT IN LISTS _DTYPES_AND_LAYOUTS) foreach(_SIZE IN LISTS _SIZES) iree_test_suites_runner_test( NAME - conv2d_winograd_${_DTYPE}_${_SIZE} + conv2d_winograd_llvm-cpu_local-task_${_DTYPE_AND_LAYOUT}_${_SIZE} TESTS_SRC - "generated/${_DTYPE}/conv2d_${_DTYPE}_${_SIZE}.mlir" + "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}.mlir" CALLS_SRC - "generated/${_DTYPE}/conv2d_${_DTYPE}_${_SIZE}_calls.mlir" + "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}_calls.mlir" TEST_RUNNER iree-test-suites_iree-e2e-conv2d-test TARGET_BACKEND @@ -76,24 +86,47 @@ endforeach() ############################################################################### # -# GPU - ROCm/HIP, default flags. +# GPU - ROCm/HIP, CDNA(gfx9). # ############################################################################### # To distinguish between CDNA(gfx9) and RDNA3(gfx11) if(IREE_HIP_TEST_TARGET_CHIP MATCHES "^gfx9") -set(_DTYPES_AND_LAYOUTS) -list(APPEND _DTYPES_AND_LAYOUTS "f16_nhwc_f16_hwcf_f32") -list(APPEND _DTYPES_AND_LAYOUTS "f16_nchw_f16_fchw_f32") -list(APPEND _DTYPES_AND_LAYOUTS "i8_nhwc_i8_hwcf_i32") +foreach(_DTYPE_AND_LAYOUT IN LISTS _DTYPES_AND_LAYOUTS) + foreach(_SIZE IN LISTS _SIZES) + iree_test_suites_runner_test( + NAME + conv2d_rocm_hip_${_DTYPE_AND_LAYOUT}_${_SIZE} + TESTS_SRC + "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}.mlir" + CALLS_SRC + "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}_calls.mlir" + TEST_RUNNER + iree-test-suites_iree-e2e-conv2d-test + TARGET_BACKEND + "rocm" + DRIVER + "hip" + COMPILER_FLAGS + "--iree-hip-target=${IREE_HIP_TEST_TARGET_CHIP}" + RUNNER_FLAGS + LABELS + ) + endforeach() +endforeach() +############################################################################### +# +# Winograd GPU - ROCm/HIP, CDNA(gfx9). 
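The nested foreach loops in this CMakeLists.txt expand a full (dtype/layout) x (size) matrix, one iree_test_suites_runner_test per cell, with the backend, driver, and optional winograd variant baked into the test name. A throwaway Python sketch of the CPU slice of that matrix (the lists mirror _DTYPES_AND_LAYOUTS and _SIZES above; not part of the build):

    dtypes_and_layouts = [
        "f16_nhwc_f16_hwcf_f16", "f16_nchw_f16_fchw_f16",
        "f16_nhwc_f16_hwcf_f32", "f16_nchw_f16_fchw_f32",
        "f32_nhwc_f32_hwcf_f32", "f32_nchw_f32_fchw_f32",
    ]
    sizes = ["large", "medium", "small"]
    for variant in ("", "winograd_"):  # plain suite, then the Winograd suite
        for dtype_and_layout in dtypes_and_layouts:
            for size in sizes:
                print(f"conv2d_{variant}llvm-cpu_local-task_{dtype_and_layout}_{size}")
    # 2 variants x 6 dtype/layout combinations x 3 sizes = 36 CPU tests.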
+# +############################################################################### foreach(_DTYPE_AND_LAYOUT IN LISTS _DTYPES_AND_LAYOUTS) + foreach(_SIZE IN LISTS _SIZES) + iree_test_suites_runner_test( + NAME - conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE} + conv2d_winograd_rocm_hip_${_DTYPE_AND_LAYOUT}_${_SIZE} TESTS_SRC "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}.mlir" CALLS_SRC @@ -105,6 +138,7 @@ foreach(_DTYPE_AND_LAYOUT IN LISTS _DTYPES_AND_LAYOUTS) DRIVER "hip" COMPILER_FLAGS + "--iree-preprocessing-pass-pipeline=builtin.module\(func.func\(iree-linalg-ext-convert-conv2d-to-winograd{replace-all-convs=true}\)\)" "--iree-hip-target=${IREE_HIP_TEST_TARGET_CHIP}" RUNNER_FLAGS LABELS @@ -112,16 +146,148 @@ endforeach() endforeach() +############################################################################### +# +# GPU - ROCm/HIP, RDNA3(gfx11). +# +############################################################################### + elseif(IREE_HIP_TEST_TARGET_CHIP MATCHES "^gfx11") +foreach(_DTYPE_AND_LAYOUT IN LISTS _DTYPES_AND_LAYOUTS) + foreach(_SIZE IN LISTS _SIZES) + iree_test_suites_runner_test( + NAME + conv2d_rocm_hip_${_DTYPE_AND_LAYOUT}_${_SIZE} + TESTS_SRC + "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}.mlir" + CALLS_SRC + "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}_calls.mlir" + TEST_RUNNER + iree-test-suites_iree-e2e-conv2d-test + TARGET_BACKEND + "rocm" + DRIVER + "hip" + COMPILER_FLAGS + "--iree-hip-target=${IREE_HIP_TEST_TARGET_CHIP}" + RUNNER_FLAGS + LABELS + ) + endforeach() +endforeach() + +############################################################################### +# +# Winograd GPU - ROCm/HIP, RDNA3(gfx11). +# +############################################################################### + +foreach(_DTYPE_AND_LAYOUT IN LISTS _DTYPES_AND_LAYOUTS) + foreach(_SIZE IN LISTS _SIZES) + iree_test_suites_runner_test( + NAME + conv2d_winograd_rocm_hip_${_DTYPE_AND_LAYOUT}_${_SIZE} + TESTS_SRC + "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}.mlir" + CALLS_SRC + "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}_calls.mlir" + TEST_RUNNER + iree-test-suites_iree-e2e-conv2d-test + TARGET_BACKEND + "rocm" + DRIVER + "hip" + COMPILER_FLAGS + "--iree-preprocessing-pass-pipeline=builtin.module\(func.func\(iree-linalg-ext-convert-conv2d-to-winograd{replace-all-convs=true}\)\)" + "--iree-hip-target=${IREE_HIP_TEST_TARGET_CHIP}" + RUNNER_FLAGS + LABELS + ) + endforeach() +endforeach() + +endif() + +# CPU and GPU tests for the "i8_nhwc_i8_hwcf_i32" and "i8_nchw_i8_fchw_i32" combinations without Winograd set(_DTYPES_AND_LAYOUTS) -list(APPEND _DTYPES_AND_LAYOUTS "f16_nhwc_f16_hwcf_f32") +list(APPEND _DTYPES_AND_LAYOUTS "i8_nhwc_i8_hwcf_i32") +list(APPEND _DTYPES_AND_LAYOUTS "i8_nchw_i8_fchw_i32") + +############################################################################### +# +# CPU - llvm-cpu on local-task, default flags.
+# +############################################################################### + +foreach(_DTYPE_AND_LAYOUT IN LISTS _DTYPES_AND_LAYOUTS) + foreach(_SIZE IN LISTS _SIZES) + iree_test_suites_runner_test( + NAME + conv2d_llvm-cpu_local-task_${_DTYPE_AND_LAYOUT}_${_SIZE} + TESTS_SRC + "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}.mlir" + CALLS_SRC + "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}_calls.mlir" + TEST_RUNNER + iree-test-suites_iree-e2e-conv2d-test + TARGET_BACKEND + "llvm-cpu" + DRIVER + "local-task" + COMPILER_FLAGS + RUNNER_FLAGS + LABELS + "hostonly" + "local" + ) + endforeach() +endforeach() + +############################################################################### +# +# GPU - ROCm/HIP, CDNA(gfx9). +# +############################################################################### + +# To distinguish between CDNA(gfx9) and RDNA3(gfx11) +if(IREE_HIP_TEST_TARGET_CHIP MATCHES "^gfx9") + +foreach(_DTYPE_AND_LAYOUT IN LISTS _DTYPES_AND_LAYOUTS) + foreach(_SIZE IN LISTS _SIZES) + iree_test_suites_runner_test( + NAME + conv2d_rocm_hip_${_DTYPE_AND_LAYOUT}_${_SIZE} + TESTS_SRC + "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}.mlir" + CALLS_SRC + "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}_calls.mlir" + TEST_RUNNER + iree-test-suites_iree-e2e-conv2d-test + TARGET_BACKEND + "rocm" + DRIVER + "hip" + COMPILER_FLAGS + "--iree-hip-target=${IREE_HIP_TEST_TARGET_CHIP}" + RUNNER_FLAGS + LABELS + ) + endforeach() +endforeach() + +############################################################################### +# +# GPU - ROCm/HIP, RDNA3(gfx11). +# +############################################################################### +elseif(IREE_HIP_TEST_TARGET_CHIP MATCHES "^gfx11") foreach(_DTYPE_AND_LAYOUT IN LISTS _DTYPES_AND_LAYOUTS) foreach(_SIZE IN LISTS _SIZES) iree_test_suites_runner_test( NAME - conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE} + conv2d_rocm_hip_${_DTYPE_AND_LAYOUT}_${_SIZE} TESTS_SRC "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}.mlir" CALLS_SRC diff --git a/linalg_ops/convolution/generate_e2e_conv2d_tests.py b/linalg_ops/convolution/generate_e2e_conv2d_tests.py index 92e3c1a..1a1d984 100644 --- a/linalg_ops/convolution/generate_e2e_conv2d_tests.py +++ b/linalg_ops/convolution/generate_e2e_conv2d_tests.py @@ -120,7 +120,7 @@ def get_test_shapes(shapes_id: ShapesId): return [ TestShape(n=2, c=2, h=32, w=32, kh=3, kw=3, f=2, accumulate=True), TestShape(n=2, c=2, h=32, w=32, kh=3, kw=3, f=64, accumulate=True), - TestShape(n=2, c=32, h=32, w=32, kh=3, kw=3, f=64, accumulate=True), + TestShape(n=2, c=16, h=32, w=32, kh=3, kw=3, f=64, accumulate=True), ] if shapes_id == ShapesId.LARGE: return [ diff --git a/linalg_ops/convolution/generate_test_mlir_files.sh b/linalg_ops/convolution/generate_test_mlir_files.sh index 1fabd77..69aea7d 100755 --- a/linalg_ops/convolution/generate_test_mlir_files.sh +++ b/linalg_ops/convolution/generate_test_mlir_files.sh @@ -13,20 +13,20 @@ # linalg_ops/ # convolution/ # generated/ -# f16_f16_f16/ -# conv2d_f16_f16_f16_large_calls.mlir -# conv2d_f16_f16_f16_large.mlir -# conv2d_f16_f16_f16_medium_calls.mlir -# conv2d_f16_f16_f16_medium.mlir -# conv2d_f16_f16_f16_small_calls.mlir -# conv2d_f16_f16_f16_small.mlir -# f32_f32_f32/ -# conv2d_f32_f32_f32_large_calls.mlir -# conv2d_f32_f32_f32_large.mlir -# conv2d_f32_f32_f32_medium_calls.mlir -# conv2d_f32_f32_f32_medium.mlir -# conv2d_f32_f32_f32_small_calls.mlir -#
conv2d_f32_f32_f32_small.mlir +# f16_nchw_f16_fchw_f16/ +# conv2d_f16_nchw_f16_fchw_f16_large_calls.mlir +# conv2d_f16_nchw_f16_fchw_f16_large.mlir +# conv2d_f16_nchw_f16_fchw_f16_medium_calls.mlir +# conv2d_f16_nchw_f16_fchw_f16_medium.mlir +# conv2d_f16_nchw_f16_fchw_f16_small_calls.mlir +# conv2d_f16_nchw_f16_fchw_f16_small.mlir +# f16_nchw_f16_fchw_f32/ +# conv2d_f16_nchw_f16_fchw_f32_large_calls.mlir +# conv2d_f16_nchw_f16_fchw_f32_large.mlir +# conv2d_f16_nchw_f16_fchw_f32_medium_calls.mlir +# conv2d_f16_nchw_f16_fchw_f32_medium.mlir +# conv2d_f16_nchw_f16_fchw_f32_small_calls.mlir +# conv2d_f16_nchw_f16_fchw_f32_small.mlir # ... # ... # Usage: @@ -47,42 +47,18 @@ shapes=( "large" ) -# input_type;kernel_type;acc_type -type_combinations=( - "f16;f16;f16" - "f32;f32;f32" -) - -for type_combination in ${type_combinations[@]}; do - IFS=";" read -r -a types <<< "${type_combination}" - input_type="${types[0]}" - kernel_type="${types[1]}" - acc_type="${types[2]}" - - type_name="${input_type}_${kernel_type}_${acc_type}" - type_combination_dir="${generated_dir_root}/${type_name}" - mkdir -p ${type_combination_dir} - - for shape in ${shapes[@]}; do - echo "Generating conv2d test files for ${type_name}_${shape}" - - name="conv2d_${type_name}_${shape}" - python ${this_dir}/generate_e2e_conv2d_tests.py \ - --output_conv2d_mlir=${type_combination_dir}/${name}.mlir \ - --output_calls_mlir=${type_combination_dir}/${name}_calls.mlir \ - --input_type=${input_type} \ - --kernel_type=${kernel_type} \ - --acc_type=${acc_type} \ - --shapes=${shape} - done -done - # input_type;input_layout;kernel_type;kernel_layout;acc_type type_and_layout_combinations=( + "f16;nhwc;f16;hwcf;f16" + "f16;nchw;f16;fchw;f16" "f16;nhwc;f16;hwcf;f32" "f16;nchw;f16;fchw;f32" + "f32;nhwc;f32;hwcf;f32" + "f32;nchw;f32;fchw;f32" "i8;nhwc;i8;hwcf;i32" + "i8;nchw;i8;fchw;i32" ) + for type_and_layout_combination in ${type_and_layout_combinations[@]}; do IFS=";" read -r -a combination <<< "${type_and_layout_combination}" input_type="${combination[0]}" @@ -90,9 +66,11 @@ for type_and_layout_combination in ${type_and_layout_combinations[@]}; do kernel_type="${combination[2]}" kernel_layout="${combination[3]}" acc_type="${combination[4]}" + type_layout_name="${input_type}_${input_layout}_${kernel_type}_${kernel_layout}_${acc_type}" type_combination_dir="${generated_dir_root}/${type_layout_name}" mkdir -p ${type_combination_dir} + for shape in ${shapes[@]}; do echo "Generating conv2d test files for ${type_layout_name}_${shape}" name="conv2d_${type_layout_name}_${shape}" diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_large.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_large.mlir similarity index 100% rename from linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_large.mlir rename to linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_large.mlir diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_large_calls.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_large_calls.mlir similarity index 100% rename from linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_large_calls.mlir rename to linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_large_calls.mlir diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_medium.mlir 
b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_medium.mlir similarity index 86% rename from linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_medium.mlir rename to linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_medium.mlir index caba912..b630d29 100644 --- a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_medium.mlir +++ b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_medium.mlir @@ -6,7 +6,7 @@ func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16(%lhs: tens %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf16>, tensor<64x2x3x3xf16>) outs(%acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16> return %result: tensor<2x64x30x30xf16> } -func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f16(%lhs: tensor<2x32x32x32xf16>, %rhs: tensor<64x32x3x3xf16>, %acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16> { - %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x32xf16>, tensor<64x32x3x3xf16>) outs(%acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16> +func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f16(%lhs: tensor<2x16x32x32xf16>, %rhs: tensor<64x16x3x3xf16>, %acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x16x32x32xf16>, tensor<64x16x3x3xf16>) outs(%acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16> return %result: tensor<2x64x30x30xf16> } diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_medium_calls.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_medium_calls.mlir similarity index 96% rename from linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_medium_calls.mlir rename to linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_medium_calls.mlir index 60860a5..1298b5b 100644 --- a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_medium_calls.mlir +++ b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_medium_calls.mlir @@ -6,7 +6,7 @@ func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view -func.func private @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, 
%acc: !hal.buffer_view) -> !hal.buffer_view func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16_2_2_32_32_2_3_3_acc_0() attributes { iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} @@ -106,20 +106,20 @@ func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16_2_2_32_32_ call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () return } -func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f16_2_32_32_32_64_3_3_acc_2() attributes { - iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x32x32x32x64x3x3"} +func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f16_2_16_32_32_64_3_3_acc_2() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x16x32x32x64x3x3"} } { %device_index = arith.constant 0 : index %device = hal.devices.get %device_index : !hal.device %input_dim0 = arith.constant 2 : i64 - %input_dim1 = arith.constant 32 : i64 + %input_dim1 = arith.constant 16 : i64 %input_dim2 = arith.constant 32 : i64 %input_dim3 = arith.constant 32 : i64 %input_element_type = hal.element_type : i32 %input_seed = arith.constant 8 : i32 %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view %kernel_dim0 = arith.constant 64 : i64 - %kernel_dim1 = arith.constant 32 : i64 + %kernel_dim1 = arith.constant 16 : i64 %kernel_dim2 = arith.constant 3 : i64 %kernel_dim3 = arith.constant 3 : i64 %kernel_element_type = hal.element_type : i32 @@ -139,9 +139,9 @@ func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f16_2_32_32_3 %acc_copy_element_type = hal.element_type : i32 %acc_copy_seed = arith.constant 10 : i32 %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %result = call @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view %n = arith.constant 2 : i64 - %c = arith.constant 32 : i64 + %c = arith.constant 16 : i64 %h = arith.constant 32 : i64 %w = arith.constant 32 : i64 %f = arith.constant 64 : i64 diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_small.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_small.mlir similarity index 100% rename from linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_small.mlir rename to linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_small.mlir diff --git a/linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_small_calls.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_small_calls.mlir similarity index 100% rename from linalg_ops/convolution/generated/f16_f16_f16/conv2d_f16_f16_f16_small_calls.mlir 
rename to linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_small_calls.mlir diff --git a/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_medium.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_medium.mlir index cd7d6a4..0f9d9df 100644 --- a/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_medium.mlir +++ b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_medium.mlir @@ -6,7 +6,7 @@ func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f32(%lhs: tens %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf16>, tensor<64x2x3x3xf16>) outs(%acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> return %result: tensor<2x64x30x30xf32> } -func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f32(%lhs: tensor<2x32x32x32xf16>, %rhs: tensor<64x32x3x3xf16>, %acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> { - %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x32xf16>, tensor<64x32x3x3xf16>) outs(%acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> +func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f32(%lhs: tensor<2x16x32x32xf16>, %rhs: tensor<64x16x3x3xf16>, %acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x16x32x32xf16>, tensor<64x16x3x3xf16>) outs(%acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> return %result: tensor<2x64x30x30xf32> } diff --git a/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_medium_calls.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_medium_calls.mlir index 451175c..f8798e9 100644 --- a/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_medium_calls.mlir +++ b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_medium_calls.mlir @@ -6,7 +6,7 @@ func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view -func.func private @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32_2_2_32_32_2_3_3_acc_0() attributes { iree.reflection 
= {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} @@ -106,20 +106,20 @@ func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f32_2_2_32_32_ call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () return } -func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f32_2_32_32_32_64_3_3_acc_2() attributes { - iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x32x32x32x64x3x3"} +func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f32_2_16_32_32_64_3_3_acc_2() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x16x32x32x64x3x3"} } { %device_index = arith.constant 0 : index %device = hal.devices.get %device_index : !hal.device %input_dim0 = arith.constant 2 : i64 - %input_dim1 = arith.constant 32 : i64 + %input_dim1 = arith.constant 16 : i64 %input_dim2 = arith.constant 32 : i64 %input_dim3 = arith.constant 32 : i64 %input_element_type = hal.element_type : i32 %input_seed = arith.constant 8 : i32 %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view %kernel_dim0 = arith.constant 64 : i64 - %kernel_dim1 = arith.constant 32 : i64 + %kernel_dim1 = arith.constant 16 : i64 %kernel_dim2 = arith.constant 3 : i64 %kernel_dim3 = arith.constant 3 : i64 %kernel_element_type = hal.element_type : i32 @@ -139,9 +139,9 @@ func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f32_2_32_32_3 %acc_copy_element_type = hal.element_type : i32 %acc_copy_seed = arith.constant 10 : i32 %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %result = call @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view %n = arith.constant 2 : i64 - %c = arith.constant 32 : i64 + %c = arith.constant 16 : i64 %h = arith.constant 32 : i64 %w = arith.constant 32 : i64 %f = arith.constant 64 : i64 diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_large.mlir b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_large.mlir new file mode 100644 index 0000000..17eb9e8 --- /dev/null +++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_large.mlir @@ -0,0 +1,8 @@ +func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16(%lhs: tensor<2x128x128x4xf16>, %rhs: tensor<3x3x4x8xf16>, %acc: tensor<2x126x126x8xf16>) -> tensor<2x126x126x8xf16> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x128x128x4xf16>, tensor<3x3x4x8xf16>) outs(%acc: tensor<2x126x126x8xf16>) -> tensor<2x126x126x8xf16> + return %result: tensor<2x126x126x8xf16> +} +func.func 
@conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16(%lhs: tensor<2x128x128x3xf16>, %rhs: tensor<3x3x3x12xf16>, %acc: tensor<2x126x126x12xf16>) -> tensor<2x126x126x12xf16> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x128x128x3xf16>, tensor<3x3x3x12xf16>) outs(%acc: tensor<2x126x126x12xf16>) -> tensor<2x126x126x12xf16> + return %result: tensor<2x126x126x12xf16> +} diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_large_calls.mlir b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_large_calls.mlir new file mode 100644 index 0000000..b07a2e5 --- /dev/null +++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_large_calls.mlir @@ -0,0 +1,108 @@ +builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) +func.func private @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16_2_4_128_128_8_3_3_acc_0() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x4x128x128x8x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 128 : i64 + %input_dim2 = arith.constant 128 : i64 + %input_dim3 = arith.constant 4 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 2 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 3 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 4 : i64 + %kernel_dim3 = arith.constant 8 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 126 : i64 + %acc_dim2 = arith.constant 126 : i64 + %acc_dim3 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 126 : i64 + %acc_copy_dim2 = arith.constant 
126 : i64 + %acc_copy_dim3 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 4 : i64 + %h = arith.constant 128 : i64 + %w = arith.constant 128 : i64 + %f = arith.constant 8 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16_2_3_128_128_12_3_3_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x3x128x128x12x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 128 : i64 + %input_dim2 = arith.constant 128 : i64 + %input_dim3 = arith.constant 3 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 5 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 3 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 12 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 6 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 126 : i64 + %acc_dim2 = arith.constant 126 : i64 + %acc_dim3 = arith.constant 12 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 126 : i64 + %acc_copy_dim2 = arith.constant 126 : i64 + %acc_copy_dim3 = arith.constant 12 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, 
!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 3 : i64 + %h = arith.constant 128 : i64 + %w = arith.constant 128 : i64 + %f = arith.constant 12 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +} diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_medium.mlir b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_medium.mlir new file mode 100644 index 0000000..addb8a2 --- /dev/null +++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_medium.mlir @@ -0,0 +1,12 @@ +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%lhs: tensor<2x32x32x2xf16>, %rhs: tensor<3x3x2x2xf16>, %acc: tensor<2x30x30x2xf16>) -> tensor<2x30x30x2xf16> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x2xf16>, tensor<3x3x2x2xf16>) outs(%acc: tensor<2x30x30x2xf16>) -> tensor<2x30x30x2xf16> + return %result: tensor<2x30x30x2xf16> +} +func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16(%lhs: tensor<2x32x32x2xf16>, %rhs: tensor<3x3x2x64xf16>, %acc: tensor<2x30x30x64xf16>) -> tensor<2x30x30x64xf16> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x2xf16>, tensor<3x3x2x64xf16>) outs(%acc: tensor<2x30x30x64xf16>) -> tensor<2x30x30x64xf16> + return %result: tensor<2x30x30x64xf16> +} +func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f16(%lhs: tensor<2x32x32x16xf16>, %rhs: tensor<3x3x16x64xf16>, %acc: tensor<2x30x30x64xf16>) -> tensor<2x30x30x64xf16> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x16xf16>, tensor<3x3x16x64xf16>) outs(%acc: tensor<2x30x30x64xf16>) -> tensor<2x30x30x64xf16> + return %result: tensor<2x30x30x64xf16> +} diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_medium_calls.mlir b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_medium_calls.mlir new file mode 100644 index 0000000..17ee9c1 --- /dev/null +++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_medium_calls.mlir @@ -0,0 +1,158 @@ +builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) +func.func private 
@module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16_2_2_32_32_2_3_3_acc_0() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 32 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 2 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 2 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 3 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 2 : i64 + %kernel_dim3 = arith.constant 2 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 30 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 30 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 2 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} 
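+// Note: the summary comments here are editorial annotations, not output of
+// generate_e2e_conv2d_tests.py. Each call function in this module follows one
+// pattern: operands come from @conv2d_test.generate_random_tensor with fixed
+// seeds, and %acc / %acc_copy reuse the same seed so the checker can compare
+// against an untouched accumulator even if the callee consumes its copy.
+// The trailing scalars restate the shape (N, C, H, W, F, KH, KW), the layout
+// tag, strides, and dilations for @conv2d_test.check_conv2d_results. With no
+// padding and unit stride/dilation the output extent is OH = H - KH + 1
+// (here 32 - 3 + 1 = 30).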
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16_2_2_32_32_64_3_3_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x64x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 32 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 2 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 5 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 3 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 2 : i64 + %kernel_dim3 = arith.constant 64 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 6 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 30 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 64 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 30 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 64 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 64 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f16_2_16_32_32_64_3_3_acc_2() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x16x32x32x64x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 32 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 16 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = 
arith.constant 8 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 3 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 16 : i64 + %kernel_dim3 = arith.constant 64 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 9 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 30 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 64 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 10 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 30 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 64 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 10 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 16 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 64 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +} diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_small.mlir b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_small.mlir new file mode 100644 index 0000000..b3bf59e --- /dev/null +++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_small.mlir @@ -0,0 +1,12 @@ +func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16(%lhs: tensor<1x1x1x1xf16>, %rhs: tensor<1x1x1x1xf16>, %acc: tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16>) outs(%acc: tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16> + return %result: tensor<1x1x1x1xf16> +} +func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16(%lhs: tensor<1x16x16x1xf16>, %rhs: tensor<2x2x1x1xf16>, %acc: tensor<1x15x15x1xf16>) -> tensor<1x15x15x1xf16> { + %result = 
linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x16x16x1xf16>, tensor<2x2x1x1xf16>) outs(%acc: tensor<1x15x15x1xf16>) -> tensor<1x15x15x1xf16> + return %result: tensor<1x15x15x1xf16> +} +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%lhs: tensor<2x32x32x2xf16>, %rhs: tensor<3x3x2x2xf16>, %acc: tensor<2x30x30x2xf16>) -> tensor<2x30x30x2xf16> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x2xf16>, tensor<3x3x2x2xf16>) outs(%acc: tensor<2x30x30x2xf16>) -> tensor<2x30x30x2xf16> + return %result: tensor<2x30x30x2xf16> +} diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_small_calls.mlir b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_small_calls.mlir new file mode 100644 index 0000000..f50584a --- /dev/null +++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_small_calls.mlir @@ -0,0 +1,158 @@ +builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) +func.func private @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16_1_1_1_1_1_1_1_acc_0() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x1x1x1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 1 : i64 + %input_dim1 = arith.constant 1 : i64 + %input_dim2 = arith.constant 1 : i64 + %input_dim3 = arith.constant 1 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 2 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 1 : i64 + %kernel_dim1 = arith.constant 1 : i64 + %kernel_dim2 = arith.constant 1 : i64 + %kernel_dim3 = arith.constant 1 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_dim2 = arith.constant 1 : i64 + %acc_dim3 = 
arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_dim2 = arith.constant 1 : i64 + %acc_copy_dim3 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 1 : i64 + %c = arith.constant 1 : i64 + %h = arith.constant 1 : i64 + %w = arith.constant 1 : i64 + %f = arith.constant 1 : i64 + %kh = arith.constant 1 : i64 + %kw = arith.constant 1 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16_1_1_16_16_1_2_2_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x16x16x1x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 1 : i64 + %input_dim1 = arith.constant 16 : i64 + %input_dim2 = arith.constant 16 : i64 + %input_dim3 = arith.constant 1 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 5 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 2 : i64 + %kernel_dim1 = arith.constant 2 : i64 + %kernel_dim2 = arith.constant 1 : i64 + %kernel_dim3 = arith.constant 1 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 6 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 15 : i64 + %acc_dim2 = arith.constant 15 : i64 + %acc_dim3 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 15 : i64 + %acc_copy_dim2 = arith.constant 15 : i64 + %acc_copy_dim3 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = 
arith.constant 7 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 1 : i64 + %c = arith.constant 1 : i64 + %h = arith.constant 16 : i64 + %w = arith.constant 16 : i64 + %f = arith.constant 1 : i64 + %kh = arith.constant 2 : i64 + %kw = arith.constant 2 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16_2_2_32_32_2_3_3_acc_2() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 32 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 2 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 8 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 3 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 2 : i64 + %kernel_dim3 = arith.constant 2 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 9 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 30 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 10 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 30 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 10 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 
+ %w = arith.constant 32 : i64 + %f = arith.constant 2 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +} diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_medium.mlir b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_medium.mlir index c77e99c..d140187 100644 --- a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_medium.mlir +++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_medium.mlir @@ -6,7 +6,7 @@ func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f32(%lhs: tens %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x2xf16>, tensor<3x3x2x64xf16>) outs(%acc: tensor<2x30x30x64xf32>) -> tensor<2x30x30x64xf32> return %result: tensor<2x30x30x64xf32> } -func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f32(%lhs: tensor<2x32x32x32xf16>, %rhs: tensor<3x3x32x64xf16>, %acc: tensor<2x30x30x64xf32>) -> tensor<2x30x30x64xf32> { - %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x32xf16>, tensor<3x3x32x64xf16>) outs(%acc: tensor<2x30x30x64xf32>) -> tensor<2x30x30x64xf32> +func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f32(%lhs: tensor<2x32x32x16xf16>, %rhs: tensor<3x3x16x64xf16>, %acc: tensor<2x30x30x64xf32>) -> tensor<2x30x30x64xf32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x16xf16>, tensor<3x3x16x64xf16>) outs(%acc: tensor<2x30x30x64xf32>) -> tensor<2x30x30x64xf32> return %result: tensor<2x30x30x64xf32> } diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_medium_calls.mlir b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_medium_calls.mlir index 0c7afed..548e7ad 100644 --- a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_medium_calls.mlir +++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_medium_calls.mlir @@ -6,7 +6,7 @@ func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view 
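+// The @module.* symbol names encode the test shape and dtypes; they must stay
+// in sync with the function definitions in conv2d_f16_nhwc_f16_hwcf_f32_medium.mlir,
+// so the 2_32_32_32 -> 2_16_32_32 rename below mirrors the change to that file above.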
-func.func private @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32_2_2_32_32_2_3_3_acc_0() attributes { iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} @@ -106,21 +106,21 @@ func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f32_2_2_32_32_ call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () return } -func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f32_2_32_32_32_64_3_3_acc_2() attributes { - iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x32x32x32x64x3x3"} +func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f32_2_16_32_32_64_3_3_acc_2() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x16x32x32x64x3x3"} } { %device_index = arith.constant 0 : index %device = hal.devices.get %device_index : !hal.device %input_dim0 = arith.constant 2 : i64 %input_dim1 = arith.constant 32 : i64 %input_dim2 = arith.constant 32 : i64 - %input_dim3 = arith.constant 32 : i64 + %input_dim3 = arith.constant 16 : i64 %input_element_type = hal.element_type : i32 %input_seed = arith.constant 8 : i32 %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view %kernel_dim0 = arith.constant 3 : i64 %kernel_dim1 = arith.constant 3 : i64 - %kernel_dim2 = arith.constant 32 : i64 + %kernel_dim2 = arith.constant 16 : i64 %kernel_dim3 = arith.constant 64 : i64 %kernel_element_type = hal.element_type : i32 %kernel_seed = arith.constant 9 : i32 @@ -139,9 +139,9 @@ func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f32_2_32_32_3 %acc_copy_element_type = hal.element_type : i32 %acc_copy_seed = arith.constant 10 : i32 %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %result = call @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view %n = arith.constant 2 : i64 - %c = arith.constant 32 : i64 + %c = arith.constant 16 : i64 %h = arith.constant 32 : i64 %w = arith.constant 32 : i64 %f = arith.constant 64 : i64 diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_large.mlir b/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_large.mlir similarity index 100% rename from linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_large.mlir rename to 
linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_large.mlir diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_large_calls.mlir b/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_large_calls.mlir similarity index 100% rename from linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_large_calls.mlir rename to linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_large_calls.mlir diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_medium.mlir b/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_medium.mlir similarity index 86% rename from linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_medium.mlir rename to linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_medium.mlir index 97ff810..d074f1f 100644 --- a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_medium.mlir +++ b/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_medium.mlir @@ -6,7 +6,7 @@ func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32(%lhs: tens %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf32>, tensor<64x2x3x3xf32>) outs(%acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> return %result: tensor<2x64x30x30xf32> } -func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f32_f32_f32(%lhs: tensor<2x32x32x32xf32>, %rhs: tensor<64x32x3x3xf32>, %acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> { - %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x32xf32>, tensor<64x32x3x3xf32>) outs(%acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> +func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f32_f32_f32(%lhs: tensor<2x16x32x32xf32>, %rhs: tensor<64x16x3x3xf32>, %acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x16x32x32xf32>, tensor<64x16x3x3xf32>) outs(%acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> return %result: tensor<2x64x30x30xf32> } diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_medium_calls.mlir b/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_medium_calls.mlir similarity index 96% rename from linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_medium_calls.mlir rename to linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_medium_calls.mlir index 3a2f05c..092a825 100644 --- a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_medium_calls.mlir +++ b/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_medium_calls.mlir @@ -6,7 +6,7 @@ func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) func.func private 
@module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view -func.func private @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32_2_2_32_32_2_3_3_acc_0() attributes { iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} @@ -106,20 +106,20 @@ func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32_2_2_32_32_ call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () return } -func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f32_f32_f32_2_32_32_32_64_3_3_acc_2() attributes { - iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x32x32x32x64x3x3"} +func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f32_f32_f32_2_16_32_32_64_3_3_acc_2() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x16x32x32x64x3x3"} } { %device_index = arith.constant 0 : index %device = hal.devices.get %device_index : !hal.device %input_dim0 = arith.constant 2 : i64 - %input_dim1 = arith.constant 32 : i64 + %input_dim1 = arith.constant 16 : i64 %input_dim2 = arith.constant 32 : i64 %input_dim3 = arith.constant 32 : i64 %input_element_type = hal.element_type : i32 %input_seed = arith.constant 8 : i32 %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view %kernel_dim0 = arith.constant 64 : i64 - %kernel_dim1 = arith.constant 32 : i64 + %kernel_dim1 = arith.constant 16 : i64 %kernel_dim2 = arith.constant 3 : i64 %kernel_dim3 = arith.constant 3 : i64 %kernel_element_type = hal.element_type : i32 @@ -139,9 +139,9 @@ func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f32_f32_f32_2_32_32_3 %acc_copy_element_type = hal.element_type : i32 %acc_copy_seed = arith.constant 10 : i32 %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %result = call @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view %n = arith.constant 2 : i64 - %c = arith.constant 32 : i64 + %c = arith.constant 16 : i64 %h = arith.constant 32 : i64 %w = arith.constant 32 : i64 %f = arith.constant 64 : i64 diff --git 
a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_small.mlir b/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_small.mlir similarity index 100% rename from linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_small.mlir rename to linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_small.mlir diff --git a/linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_small_calls.mlir b/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_small_calls.mlir similarity index 100% rename from linalg_ops/convolution/generated/f32_f32_f32/conv2d_f32_f32_f32_small_calls.mlir rename to linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_small_calls.mlir diff --git a/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_large.mlir b/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_large.mlir new file mode 100644 index 0000000..0cdae51 --- /dev/null +++ b/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_large.mlir @@ -0,0 +1,8 @@ +func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32(%lhs: tensor<2x128x128x4xf32>, %rhs: tensor<3x3x4x8xf32>, %acc: tensor<2x126x126x8xf32>) -> tensor<2x126x126x8xf32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x128x128x4xf32>, tensor<3x3x4x8xf32>) outs(%acc: tensor<2x126x126x8xf32>) -> tensor<2x126x126x8xf32> + return %result: tensor<2x126x126x8xf32> +} +func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32(%lhs: tensor<2x128x128x3xf32>, %rhs: tensor<3x3x3x12xf32>, %acc: tensor<2x126x126x12xf32>) -> tensor<2x126x126x12xf32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x128x128x3xf32>, tensor<3x3x3x12xf32>) outs(%acc: tensor<2x126x126x12xf32>) -> tensor<2x126x126x12xf32> + return %result: tensor<2x126x126x12xf32> +} diff --git a/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_large_calls.mlir b/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_large_calls.mlir new file mode 100644 index 0000000..854a307 --- /dev/null +++ b/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_large_calls.mlir @@ -0,0 +1,108 @@ +builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) +func.func private @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32_2_4_128_128_8_3_3_acc_0() 
attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x4x128x128x8x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 128 : i64 + %input_dim2 = arith.constant 128 : i64 + %input_dim3 = arith.constant 4 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 2 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 3 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 4 : i64 + %kernel_dim3 = arith.constant 8 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 126 : i64 + %acc_dim2 = arith.constant 126 : i64 + %acc_dim3 = arith.constant 8 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 126 : i64 + %acc_copy_dim2 = arith.constant 126 : i64 + %acc_copy_dim3 = arith.constant 8 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 4 : i64 + %h = arith.constant 128 : i64 + %w = arith.constant 128 : i64 + %f = arith.constant 8 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32_2_3_128_128_12_3_3_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x3x128x128x12x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 128 : i64 + %input_dim2 = arith.constant 128 : i64 + %input_dim3 = arith.constant 3 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 5 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, 
%input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 3 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 12 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 6 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 126 : i64 + %acc_dim2 = arith.constant 126 : i64 + %acc_dim3 = arith.constant 12 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 126 : i64 + %acc_copy_dim2 = arith.constant 126 : i64 + %acc_copy_dim3 = arith.constant 12 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 3 : i64 + %h = arith.constant 128 : i64 + %w = arith.constant 128 : i64 + %f = arith.constant 12 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +} diff --git a/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_medium.mlir b/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_medium.mlir new file mode 100644 index 0000000..393c487 --- /dev/null +++ b/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_medium.mlir @@ -0,0 +1,12 @@ +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%lhs: tensor<2x32x32x2xf32>, %rhs: tensor<3x3x2x2xf32>, %acc: tensor<2x30x30x2xf32>) -> tensor<2x30x30x2xf32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x2xf32>, tensor<3x3x2x2xf32>) outs(%acc: tensor<2x30x30x2xf32>) -> tensor<2x30x30x2xf32> + return %result: tensor<2x30x30x2xf32> +} +func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32(%lhs: tensor<2x32x32x2xf32>, %rhs: tensor<3x3x2x64xf32>, %acc: tensor<2x30x30x64xf32>) -> tensor<2x30x30x64xf32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, 
strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x2xf32>, tensor<3x3x2x64xf32>) outs(%acc: tensor<2x30x30x64xf32>) -> tensor<2x30x30x64xf32> + return %result: tensor<2x30x30x64xf32> +} +func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f32_f32_f32(%lhs: tensor<2x32x32x16xf32>, %rhs: tensor<3x3x16x64xf32>, %acc: tensor<2x30x30x64xf32>) -> tensor<2x30x30x64xf32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x16xf32>, tensor<3x3x16x64xf32>) outs(%acc: tensor<2x30x30x64xf32>) -> tensor<2x30x30x64xf32> + return %result: tensor<2x30x30x64xf32> +} diff --git a/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_medium_calls.mlir b/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_medium_calls.mlir new file mode 100644 index 0000000..5043f0d --- /dev/null +++ b/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_medium_calls.mlir @@ -0,0 +1,158 @@ +builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32_2_2_32_32_2_3_3_acc_0() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 32 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 2 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 2 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 3 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 2 : i64 + %kernel_dim3 = arith.constant 2 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 30 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 2 : i64 + %acc_element_type = 
hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 30 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 2 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32_2_2_32_32_64_3_3_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x64x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 32 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 2 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 5 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 3 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 2 : i64 + %kernel_dim3 = arith.constant 64 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 6 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 30 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 64 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 30 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 64 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + 
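+  // Note: %acc and %acc_copy are generated from the same seed, so they hold
+  // identical contents. The conv2d under test may overwrite the copy it is
+  // handed, so the untouched %acc is what check_conv2d_results receives as
+  // the reference accumulator.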
%acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 64 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f32_f32_f32_2_16_32_32_64_3_3_acc_2() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x16x32x32x64x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 32 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 16 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 8 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 3 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 16 : i64 + %kernel_dim3 = arith.constant 64 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 9 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 30 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 64 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 10 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 30 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 64 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 10 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 16 : i64 + %h = arith.constant 32 : i64 + %w = 
arith.constant 32 : i64 + %f = arith.constant 64 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +} diff --git a/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_small.mlir b/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_small.mlir new file mode 100644 index 0000000..ea9d92c --- /dev/null +++ b/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_small.mlir @@ -0,0 +1,12 @@ +func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32(%lhs: tensor<1x1x1x1xf32>, %rhs: tensor<1x1x1x1xf32>, %acc: tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%acc: tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> + return %result: tensor<1x1x1x1xf32> +} +func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32(%lhs: tensor<1x16x16x1xf32>, %rhs: tensor<2x2x1x1xf32>, %acc: tensor<1x15x15x1xf32>) -> tensor<1x15x15x1xf32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x16x16x1xf32>, tensor<2x2x1x1xf32>) outs(%acc: tensor<1x15x15x1xf32>) -> tensor<1x15x15x1xf32> + return %result: tensor<1x15x15x1xf32> +} +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%lhs: tensor<2x32x32x2xf32>, %rhs: tensor<3x3x2x2xf32>, %acc: tensor<2x30x30x2xf32>) -> tensor<2x30x30x2xf32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x2xf32>, tensor<3x3x2x2xf32>) outs(%acc: tensor<2x30x30x2xf32>) -> tensor<2x30x30x2xf32> + return %result: tensor<2x30x30x2xf32> +} diff --git a/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_small_calls.mlir b/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_small_calls.mlir new file mode 100644 index 0000000..b25c720 --- /dev/null +++ b/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_small_calls.mlir @@ -0,0 +1,158 @@ +builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) +func.func private @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32(%input: 
!hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32_1_1_1_1_1_1_1_acc_0() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x1x1x1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 1 : i64 + %input_dim1 = arith.constant 1 : i64 + %input_dim2 = arith.constant 1 : i64 + %input_dim3 = arith.constant 1 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 2 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 1 : i64 + %kernel_dim1 = arith.constant 1 : i64 + %kernel_dim2 = arith.constant 1 : i64 + %kernel_dim3 = arith.constant 1 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_dim2 = arith.constant 1 : i64 + %acc_dim3 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_dim2 = arith.constant 1 : i64 + %acc_copy_dim3 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 1 : i64 + %c = arith.constant 1 : i64 + %h = arith.constant 1 : i64 + %w = arith.constant 1 : i64 + %f = arith.constant 1 : i64 + %kh = arith.constant 1 : i64 + %kw = arith.constant 1 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32_1_1_16_16_1_2_2_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x16x16x1x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get 
%device_index : !hal.device + %input_dim0 = arith.constant 1 : i64 + %input_dim1 = arith.constant 16 : i64 + %input_dim2 = arith.constant 16 : i64 + %input_dim3 = arith.constant 1 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 5 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 2 : i64 + %kernel_dim1 = arith.constant 2 : i64 + %kernel_dim2 = arith.constant 1 : i64 + %kernel_dim3 = arith.constant 1 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 6 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 15 : i64 + %acc_dim2 = arith.constant 15 : i64 + %acc_dim3 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 15 : i64 + %acc_copy_dim2 = arith.constant 15 : i64 + %acc_copy_dim3 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 1 : i64 + %c = arith.constant 1 : i64 + %h = arith.constant 16 : i64 + %w = arith.constant 16 : i64 + %f = arith.constant 1 : i64 + %kh = arith.constant 2 : i64 + %kw = arith.constant 2 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32_2_2_32_32_2_3_3_acc_2() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 32 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 2 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 8 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 3 : i64 + 
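+  // Seeds advance monotonically across the generated cases (2/3/4, 5/6/7,
+  // and 8/9/10 here), so every input, kernel, and accumulator tensor in a
+  // suite gets distinct pseudo-random contents.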
%kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 2 : i64 + %kernel_dim3 = arith.constant 2 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 9 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 30 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 2 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 10 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 30 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 2 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 10 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 2 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 1 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +} diff --git a/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_large.mlir b/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_large.mlir new file mode 100644 index 0000000..706848a --- /dev/null +++ b/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_large.mlir @@ -0,0 +1,8 @@ +func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_i8_i8_i32(%lhs: tensor<2x4x128x128xi8>, %rhs: tensor<8x4x3x3xi8>, %acc: tensor<2x8x126x126xi32>) -> tensor<2x8x126x126xi32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x4x128x128xi8>, tensor<8x4x3x3xi8>) outs(%acc: tensor<2x8x126x126xi32>) -> tensor<2x8x126x126xi32> + return %result: tensor<2x8x126x126xi32> +} +func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_i8_i8_i32(%lhs: tensor<2x3x128x128xi8>, %rhs: tensor<12x3x3x3xi8>, %acc: tensor<2x12x126x126xi32>) -> tensor<2x12x126x126xi32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x3x128x128xi8>, tensor<12x3x3x3xi8>) outs(%acc: tensor<2x12x126x126xi32>) -> tensor<2x12x126x126xi32> + return %result: tensor<2x12x126x126xi32> +} 
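Every accumulator shape in these generated files follows the unpadded convolution output-size rule. A minimal sketch of that arithmetic, for reference when reading the dimensions above (the helper name is illustrative, not part of the generator or test runner):

// Output spatial size of an unpadded ("VALID") 2-D convolution dimension.
static int conv_out_dim(int in_dim, int k_dim, int stride, int dilation) {
  int effective_k = (k_dim - 1) * dilation + 1;
  return (in_dim - effective_k) / stride + 1;
}
// With stride = dilation = 1: 128 -> 126 for the 3x3 "large" cases,
// 32 -> 30 for "medium", and 16 -> 15 for the 2x2 "small" cases.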
diff --git a/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_large_calls.mlir b/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_large_calls.mlir new file mode 100644 index 0000000..af106a6 --- /dev/null +++ b/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_large_calls.mlir @@ -0,0 +1,108 @@ +builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) +func.func private @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_i8_i8_i32_2_4_128_128_8_3_3_acc_0() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x4x128x128x8x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 4 : i64 + %input_dim2 = arith.constant 128 : i64 + %input_dim3 = arith.constant 128 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 2 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 8 : i64 + %kernel_dim1 = arith.constant 4 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 8 : i64 + %acc_dim2 = arith.constant 126 : i64 + %acc_dim3 = arith.constant 126 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 8 : i64 + %acc_copy_dim2 = arith.constant 126 : i64 + %acc_copy_dim3 = arith.constant 126 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : 
(!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 4 : i64 + %h = arith.constant 128 : i64 + %w = arith.constant 128 : i64 + %f = arith.constant 8 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_i8_i8_i32_2_3_128_128_12_3_3_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x3x128x128x12x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 3 : i64 + %input_dim2 = arith.constant 128 : i64 + %input_dim3 = arith.constant 128 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 5 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 12 : i64 + %kernel_dim1 = arith.constant 3 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 6 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 12 : i64 + %acc_dim2 = arith.constant 126 : i64 + %acc_dim3 = arith.constant 126 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 12 : i64 + %acc_copy_dim2 = arith.constant 126 : i64 + %acc_copy_dim3 = arith.constant 126 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 3 : i64 + %h = arith.constant 128 : i64 + %w = arith.constant 128 : i64 + %f = arith.constant 12 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, 
%f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +} diff --git a/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_medium.mlir b/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_medium.mlir new file mode 100644 index 0000000..780c670 --- /dev/null +++ b/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_medium.mlir @@ -0,0 +1,12 @@ +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%lhs: tensor<2x2x32x32xi8>, %rhs: tensor<2x2x3x3xi8>, %acc: tensor<2x2x30x30xi32>) -> tensor<2x2x30x30xi32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xi8>, tensor<2x2x3x3xi8>) outs(%acc: tensor<2x2x30x30xi32>) -> tensor<2x2x30x30xi32> + return %result: tensor<2x2x30x30xi32> +} +func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_i8_i8_i32(%lhs: tensor<2x2x32x32xi8>, %rhs: tensor<64x2x3x3xi8>, %acc: tensor<2x64x30x30xi32>) -> tensor<2x64x30x30xi32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xi8>, tensor<64x2x3x3xi8>) outs(%acc: tensor<2x64x30x30xi32>) -> tensor<2x64x30x30xi32> + return %result: tensor<2x64x30x30xi32> +} +func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_i8_i8_i32(%lhs: tensor<2x16x32x32xi8>, %rhs: tensor<64x16x3x3xi8>, %acc: tensor<2x64x30x30xi32>) -> tensor<2x64x30x30xi32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x16x32x32xi8>, tensor<64x16x3x3xi8>) outs(%acc: tensor<2x64x30x30xi32>) -> tensor<2x64x30x30xi32> + return %result: tensor<2x64x30x30xi32> +} diff --git a/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_medium_calls.mlir b/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_medium_calls.mlir new file mode 100644 index 0000000..8c4dc85 --- /dev/null +++ b/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_medium_calls.mlir @@ -0,0 +1,158 @@ +builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func 
@conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32_2_2_32_32_2_3_3_acc_0() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 2 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 32 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 2 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 2 : i64 + %kernel_dim1 = arith.constant 2 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 30 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 30 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 2 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_i8_i8_i32_2_2_32_32_64_3_3_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x64x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 2 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 32 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 5 : i32 + %input = 
call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 64 : i64 + %kernel_dim1 = arith.constant 2 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 6 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 64 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 30 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 64 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 30 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 64 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_i8_i8_i32_2_16_32_32_64_3_3_acc_2() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x16x32x32x64x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 16 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 32 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 8 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 64 : i64 + %kernel_dim1 = arith.constant 16 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 9 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, 
%kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 64 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 30 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 10 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 64 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 30 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 10 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 16 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 64 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +} diff --git a/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_small.mlir b/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_small.mlir new file mode 100644 index 0000000..8acc310 --- /dev/null +++ b/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_small.mlir @@ -0,0 +1,12 @@ +func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_i8_i8_i32(%lhs: tensor<1x1x1x1xi8>, %rhs: tensor<1x1x1x1xi8>, %acc: tensor<1x1x1x1xi32>) -> tensor<1x1x1x1xi32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x1x1xi8>, tensor<1x1x1x1xi8>) outs(%acc: tensor<1x1x1x1xi32>) -> tensor<1x1x1x1xi32> + return %result: tensor<1x1x1x1xi32> +} +func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_i8_i8_i32(%lhs: tensor<1x1x16x16xi8>, %rhs: tensor<1x1x2x2xi8>, %acc: tensor<1x1x15x15xi32>) -> tensor<1x1x15x15xi32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x16x16xi8>, tensor<1x1x2x2xi8>) outs(%acc: tensor<1x1x15x15xi32>) -> tensor<1x1x15x15xi32> + return %result: tensor<1x1x15x15xi32> +} +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%lhs: tensor<2x2x32x32xi8>, %rhs: tensor<2x2x3x3xi8>, %acc: tensor<2x2x30x30xi32>) -> tensor<2x2x30x30xi32> { + %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: 
tensor<2x2x32x32xi8>, tensor<2x2x3x3xi8>) outs(%acc: tensor<2x2x30x30xi32>) -> tensor<2x2x30x30xi32> + return %result: tensor<2x2x30x30xi32> +} diff --git a/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_small_calls.mlir b/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_small_calls.mlir new file mode 100644 index 0000000..6f68fe9 --- /dev/null +++ b/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_small_calls.mlir @@ -0,0 +1,158 @@ +builtin.module @calls attributes { + +} { + +func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view +func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) +func.func private @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view + +func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_i8_i8_i32_1_1_1_1_1_1_1_acc_0() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x1x1x1x1x1"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 1 : i64 + %input_dim1 = arith.constant 1 : i64 + %input_dim2 = arith.constant 1 : i64 + %input_dim3 = arith.constant 1 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 2 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 1 : i64 + %kernel_dim1 = arith.constant 1 : i64 + %kernel_dim2 = arith.constant 1 : i64 + %kernel_dim3 = arith.constant 1 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 3 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_dim2 = arith.constant 1 : i64 + %acc_dim3 = arith.constant 1 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 4 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_dim2 = arith.constant 1 : i64 + %acc_copy_dim3 = arith.constant 1 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 4 : i32 + %acc_copy = call 
@conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 1 : i64 + %c = arith.constant 1 : i64 + %h = arith.constant 1 : i64 + %w = arith.constant 1 : i64 + %f = arith.constant 1 : i64 + %kh = arith.constant 1 : i64 + %kw = arith.constant 1 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_i8_i8_i32_1_1_16_16_1_2_2_acc_1() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x16x16x1x2x2"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 1 : i64 + %input_dim1 = arith.constant 1 : i64 + %input_dim2 = arith.constant 16 : i64 + %input_dim3 = arith.constant 16 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 5 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 1 : i64 + %kernel_dim1 = arith.constant 1 : i64 + %kernel_dim2 = arith.constant 2 : i64 + %kernel_dim3 = arith.constant 2 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 6 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 1 : i64 + %acc_dim1 = arith.constant 1 : i64 + %acc_dim2 = arith.constant 15 : i64 + %acc_dim3 = arith.constant 15 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 7 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 1 : i64 + %acc_copy_dim1 = arith.constant 1 : i64 + %acc_copy_dim2 = arith.constant 15 : i64 + %acc_copy_dim3 = arith.constant 15 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 7 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 1 : i64 + %c = arith.constant 1 : i64 + %h = arith.constant 16 : i64 + %w = arith.constant 16 : i64 + %f = arith.constant 
1 : i64 + %kh = arith.constant 2 : i64 + %kw = arith.constant 2 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32_2_2_32_32_2_3_3_acc_2() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} +} { + %device_index = arith.constant 0 : index + %device = hal.devices.get %device_index : !hal.device + %input_dim0 = arith.constant 2 : i64 + %input_dim1 = arith.constant 2 : i64 + %input_dim2 = arith.constant 32 : i64 + %input_dim3 = arith.constant 32 : i64 + %input_element_type = hal.element_type : i32 + %input_seed = arith.constant 8 : i32 + %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %kernel_dim0 = arith.constant 2 : i64 + %kernel_dim1 = arith.constant 2 : i64 + %kernel_dim2 = arith.constant 3 : i64 + %kernel_dim3 = arith.constant 3 : i64 + %kernel_element_type = hal.element_type : i32 + %kernel_seed = arith.constant 9 : i32 + %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_dim0 = arith.constant 2 : i64 + %acc_dim1 = arith.constant 2 : i64 + %acc_dim2 = arith.constant 30 : i64 + %acc_dim3 = arith.constant 30 : i64 + %acc_element_type = hal.element_type : i32 + %acc_seed = arith.constant 10 : i32 + %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %acc_copy_dim0 = arith.constant 2 : i64 + %acc_copy_dim1 = arith.constant 2 : i64 + %acc_copy_dim2 = arith.constant 30 : i64 + %acc_copy_dim3 = arith.constant 30 : i64 + %acc_copy_element_type = hal.element_type : i32 + %acc_copy_seed = arith.constant 10 : i32 + %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %n = arith.constant 2 : i64 + %c = arith.constant 2 : i64 + %h = arith.constant 32 : i64 + %w = arith.constant 32 : i64 + %f = arith.constant 2 : i64 + %kh = arith.constant 3 : i64 + %kw = arith.constant 3 : i64 + %layout = arith.constant 0 : i64 + %sh = arith.constant 1 : i64 + %sw = arith.constant 1 : i64 + %dh = arith.constant 1 : i64 + %dw = arith.constant 1 : i64 + call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () + return +} +} 
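Across the generated suites, the %layout operand passed to conv2d_test.check_conv2d_results tracks each case's tensor layout: the nchw/fchw files above pass 0 while the nhwc/hwcf files pass 1. A sketch of that mapping as observed in the generated calls (the enum name is illustrative, not the test runner's actual identifier):

// Layout codes as they appear in the generated *_calls.mlir files.
enum conv2d_test_layout {
  CONV2D_TEST_LAYOUT_NCHW_FCHW = 0,  // input NxCxHxW, kernel FxCxKHxKW
  CONV2D_TEST_LAYOUT_NHWC_HWCF = 1,  // input NxHxWxC, kernel KHxKWxCxF
};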
diff --git a/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_medium.mlir b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_medium.mlir index 5b8985b..e64bc66 100644 --- a/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_medium.mlir +++ b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_medium.mlir @@ -6,7 +6,7 @@ func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_i8_i8_i32(%lhs: tensor %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x2xi8>, tensor<3x3x2x64xi8>) outs(%acc: tensor<2x30x30x64xi32>) -> tensor<2x30x30x64xi32> return %result: tensor<2x30x30x64xi32> } -func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_i8_i8_i32(%lhs: tensor<2x32x32x32xi8>, %rhs: tensor<3x3x32x64xi8>, %acc: tensor<2x30x30x64xi32>) -> tensor<2x30x30x64xi32> { - %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x32xi8>, tensor<3x3x32x64xi8>) outs(%acc: tensor<2x30x30x64xi32>) -> tensor<2x30x30x64xi32> +func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_i8_i8_i32(%lhs: tensor<2x32x32x16xi8>, %rhs: tensor<3x3x16x64xi8>, %acc: tensor<2x30x30x64xi32>) -> tensor<2x30x30x64xi32> { + %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x16xi8>, tensor<3x3x16x64xi8>) outs(%acc: tensor<2x30x30x64xi32>) -> tensor<2x30x30x64xi32> return %result: tensor<2x30x30x64xi32> } diff --git a/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_medium_calls.mlir b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_medium_calls.mlir index e4c2495..ea12edb 100644 --- a/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_medium_calls.mlir +++ b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_medium_calls.mlir @@ -6,7 +6,7 @@ func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view -func.func private @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view +func.func private @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32_2_2_32_32_2_3_3_acc_0() attributes { iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"} @@ -106,21 +106,21 @@ func.func 
@conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_i8_i8_i32_2_2_32_32_64 call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () return } -func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_i8_i8_i32_2_32_32_32_64_3_3_acc_2() attributes { - iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x32x32x32x64x3x3"} +func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_i8_i8_i32_2_16_32_32_64_3_3_acc_2() attributes { + iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x16x32x32x64x3x3"} } { %device_index = arith.constant 0 : index %device = hal.devices.get %device_index : !hal.device %input_dim0 = arith.constant 2 : i64 %input_dim1 = arith.constant 32 : i64 %input_dim2 = arith.constant 32 : i64 - %input_dim3 = arith.constant 32 : i64 + %input_dim3 = arith.constant 16 : i64 %input_element_type = hal.element_type : i32 %input_seed = arith.constant 8 : i32 %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view %kernel_dim0 = arith.constant 3 : i64 %kernel_dim1 = arith.constant 3 : i64 - %kernel_dim2 = arith.constant 32 : i64 + %kernel_dim2 = arith.constant 16 : i64 %kernel_dim3 = arith.constant 64 : i64 %kernel_element_type = hal.element_type : i32 %kernel_seed = arith.constant 9 : i32 @@ -139,9 +139,9 @@ func.func @conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_i8_i8_i32_2_32_32_32_ %acc_copy_element_type = hal.element_type : i32 %acc_copy_seed = arith.constant 10 : i32 %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view - %result = call @module.conv2d_accumulate_2_32_32_32_times_3_3_64_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view + %result = call @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view %n = arith.constant 2 : i64 - %c = arith.constant 32 : i64 + %c = arith.constant 16 : i64 %h = arith.constant 32 : i64 %w = arith.constant 32 : i64 %f = arith.constant 64 : i64 diff --git a/linalg_ops/test_utils.c b/linalg_ops/test_utils.c index 05065b9..e86702b 100644 --- a/linalg_ops/test_utils.c +++ b/linalg_ops/test_utils.c @@ -22,7 +22,7 @@ #include "iree/tooling/device_util.h" #include "iree/vm/api.h" -IREE_FLAG(bool, require_exact_results, false, +IREE_FLAG(bool, require_exact_results, true, "Requires floating point result elements to match exactly."); bool iree_test_utils_require_exact_results(void) { @@ -193,10 +193,10 @@ bool iree_test_utils_result_elements_agree(iree_test_utils_e2e_value_t expected, // expected values. Inexact results are only permitted when the // `require_exact_results` flag is set to `false`. 
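  // (After this patch series the flag defaults to true, so the tolerance-based
  // comparison below must be opted into with require_exact_results=false.)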
case IREE_TEST_UTILS_VALUE_TYPE_F16: - if (actual.f16 == expected.f16) return true; + if (actual.f16_u16 == expected.f16_u16) return true; if (iree_test_utils_require_exact_results()) return false; - return fabsf(iree_math_f16_to_f32(actual.f16) - - iree_math_f16_to_f32(expected.f16)) < + return fabsf(iree_math_f16_to_f32(actual.f16_u16) - + iree_math_f16_to_f32(expected.f16_u16)) < acceptable_fp_delta; case IREE_TEST_UTILS_VALUE_TYPE_BF16: if (actual.bf16_u16 == expected.bf16_u16) return true; diff --git a/linalg_ops/test_utils.h b/linalg_ops/test_utils.h index f86986b..626097b 100644 --- a/linalg_ops/test_utils.h +++ b/linalg_ops/test_utils.h @@ -62,7 +62,6 @@ typedef struct iree_test_utils_value_t { int16_t i16; int32_t i32; int64_t i64; - float f16; float f32; uint16_t f16_u16; uint16_t bf16_u16; From d00fedd328382bcb00b057e2f57b8391817b0594 Mon Sep 17 00:00:00 2001 From: erman-gurses Date: Wed, 30 Oct 2024 12:18:37 -0500 Subject: [PATCH 15/16] Remove outdated comments Signed-off-by: erman-gurses --- linalg_ops/convolution/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linalg_ops/convolution/CMakeLists.txt b/linalg_ops/convolution/CMakeLists.txt index 73bf3cf..460cb2b 100644 --- a/linalg_ops/convolution/CMakeLists.txt +++ b/linalg_ops/convolution/CMakeLists.txt @@ -209,7 +209,7 @@ endforeach() endif() -# CPU and GPU tests for "i8_nhwc_i8_hwcf_i32" combination without Winograd +# CPU and GPU tests without Winograd set(_DTYPES_AND_LAYOUTS) list(APPEND _DTYPES_AND_LAYOUTS "i8_nhwc_i8_hwcf_i32") list(APPEND _DTYPES_AND_LAYOUTS "i8_nchw_i8_fchw_i32") From cf73da6af11f5646926f23a4c9a8a5703dde171a Mon Sep 17 00:00:00 2001 From: erman-gurses Date: Wed, 30 Oct 2024 12:29:44 -0500 Subject: [PATCH 16/16] Formatting Signed-off-by: erman-gurses --- linalg_ops/test_utils.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/linalg_ops/test_utils.c b/linalg_ops/test_utils.c index e86702b..9762861 100644 --- a/linalg_ops/test_utils.c +++ b/linalg_ops/test_utils.c @@ -196,8 +196,8 @@ bool iree_test_utils_result_elements_agree(iree_test_utils_e2e_value_t expected, if (actual.f16_u16 == expected.f16_u16) return true; if (iree_test_utils_require_exact_results()) return false; return fabsf(iree_math_f16_to_f32(actual.f16_u16) - - iree_math_f16_to_f32(expected.f16_u16)) < - acceptable_fp_delta; + iree_math_f16_to_f32(expected.f16_u16)) < + acceptable_fp_delta; case IREE_TEST_UTILS_VALUE_TYPE_BF16: if (actual.bf16_u16 == expected.bf16_u16) return true; if (iree_test_utils_require_exact_results()) return false;
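Taken together, the test_utils changes above route half-precision values through the utilities as raw bit patterns. A condensed sketch of the resulting f16 comparison path (f16_elements_agree is an illustrative stand-in for the F16 case of iree_test_utils_result_elements_agree; the iree_math_f16_to_f32 declaration mirrors the IREE helper used in the diff):

#include <math.h>
#include <stdbool.h>
#include <stdint.h>

// Mirrors IREE's half-to-float conversion helper referenced in test_utils.c.
float iree_math_f16_to_f32(uint16_t f16_value);

static bool f16_elements_agree(uint16_t actual_u16, uint16_t expected_u16,
                               bool require_exact, float acceptable_fp_delta) {
  // Exact bit-pattern match first: cheap, and exact across all payloads.
  if (actual_u16 == expected_u16) return true;
  // With require_exact_results=true (the new default) nothing else passes.
  if (require_exact) return false;
  // Otherwise compare in f32 with an absolute tolerance.
  return fabsf(iree_math_f16_to_f32(actual_u16) -
               iree_math_f16_to_f32(expected_u16)) < acceptable_fp_delta;
}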