diff --git a/linalg_ops/CMakeLists.txt b/linalg_ops/CMakeLists.txt
index 08cf318..015b318 100644
--- a/linalg_ops/CMakeLists.txt
+++ b/linalg_ops/CMakeLists.txt
@@ -114,6 +114,26 @@ iree_cc_binary(
     iree::vm::cc
 )
 
+iree_cc_binary(
+  NAME
+    iree-e2e-conv2d-test
+  SRCS
+    "iree-e2e-conv2d-test.cc"
+  DEPS
+    ::test_utils
+    iree::base
+    iree::base::internal
+    iree::base::internal::cpu
+    iree::base::internal::flags
+    iree::base::internal::path
+    iree::hal
+    iree::modules::hal
+    iree::tooling::context_util
+    iree::tooling::device_util
+    iree::vm
+    iree::vm::cc
+)
+
 #-------------------------------------------------------------------------------
 # Tests
 #-------------------------------------------------------------------------------
@@ -123,3 +143,4 @@ include(iree_test_suites_native_test)
 include(iree_test_suites_runner_test)
 
 add_subdirectory(matmul)
+add_subdirectory(convolution)
diff --git a/linalg_ops/convolution/CMakeLists.txt b/linalg_ops/convolution/CMakeLists.txt
new file mode 100644
index 0000000..460cb2b
--- /dev/null
+++ b/linalg_ops/convolution/CMakeLists.txt
@@ -0,0 +1,309 @@
+# Copyright 2024 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# TODO(scotttodd): add filtering here, in the helper functions, or in ctest to
+#                  choose which tests to compile and run
+
+set(_SIZES)
+list(APPEND _SIZES "large")
+list(APPEND _SIZES "medium")
+list(APPEND _SIZES "small")
+
+
+set(_DTYPES_AND_LAYOUTS)
+list(APPEND _DTYPES_AND_LAYOUTS "f16_nhwc_f16_hwcf_f16")
+list(APPEND _DTYPES_AND_LAYOUTS "f16_nchw_f16_fchw_f16")
+list(APPEND _DTYPES_AND_LAYOUTS "f16_nhwc_f16_hwcf_f32")
+list(APPEND _DTYPES_AND_LAYOUTS "f16_nchw_f16_fchw_f32")
+list(APPEND _DTYPES_AND_LAYOUTS "f32_nhwc_f32_hwcf_f32")
+list(APPEND _DTYPES_AND_LAYOUTS "f32_nchw_f32_fchw_f32")
+
+###############################################################################
+#
+# CPU - llvm-cpu on local-task, default flags.
+#
+###############################################################################
+
+foreach(_DTYPE_AND_LAYOUT IN LISTS _DTYPES_AND_LAYOUTS)
+  foreach(_SIZE IN LISTS _SIZES)
+    iree_test_suites_runner_test(
+      NAME
+        conv2d_llvm-cpu_local-task_${_DTYPE_AND_LAYOUT}_${_SIZE}
+      TESTS_SRC
+        "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}.mlir"
+      CALLS_SRC
+        "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}_calls.mlir"
+      TEST_RUNNER
+        iree-test-suites_iree-e2e-conv2d-test
+      TARGET_BACKEND
+        "llvm-cpu"
+      DRIVER
+        "local-task"
+      COMPILER_FLAGS
+      RUNNER_FLAGS
+      LABELS
+         "hostonly"
+         "local"
+    )
+  endforeach()
+endforeach()
+
+###############################################################################
+#
+# CPU - Winograd llvm-cpu on local-task, default flags.
+#
+###############################################################################
+
+foreach(_DTYPE_AND_LAYOUT IN LISTS _DTYPES_AND_LAYOUTS)
+  foreach(_SIZE IN LISTS _SIZES)
+    iree_test_suites_runner_test(
+      NAME
+        conv2d_winograd_llvm-cpu_local-task_${_DTYPE_AND_LAYOUT}_${_SIZE}
+      TESTS_SRC
+        "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}.mlir"
+      CALLS_SRC
+        "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}_calls.mlir"
+      TEST_RUNNER
+        iree-test-suites_iree-e2e-conv2d-test
+      TARGET_BACKEND
+        "llvm-cpu"
+      DRIVER
+        "local-task"
+      COMPILER_FLAGS
+        "--iree-preprocessing-pass-pipeline=builtin.module\(func.func\(iree-linalg-ext-convert-conv2d-to-winograd{replace-all-convs=true}\)\)"
+      RUNNER_FLAGS
+      LABELS
+         "hostonly"
+         "local"
+      TARGET_CPU_FEATURES_VARIANTS
+         "default"
+    )
+  endforeach()
+endforeach()
+
+###############################################################################
+#
+# GPU - ROCm/HIP, CDNA(gfx9).
+#
+###############################################################################
+
+# To distinguish between CDNA(gfx9) and RDNA3(gfx11)
+if(IREE_HIP_TEST_TARGET_CHIP MATCHES "^gfx9")
+
+foreach(_DTYPE_AND_LAYOUT IN LISTS _DTYPES_AND_LAYOUTS)
+  foreach(_SIZE IN LISTS _SIZES)
+    iree_test_suites_runner_test(
+      NAME
+        conv2d_rocm_hip_${_DTYPE_AND_LAYOUT}_${_SIZE}
+      TESTS_SRC
+        "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}.mlir"
+      CALLS_SRC
+        "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}_calls.mlir"
+      TEST_RUNNER
+        iree-test-suites_iree-e2e-conv2d-test
+      TARGET_BACKEND
+        "rocm"
+      DRIVER
+        "hip"
+      COMPILER_FLAGS
+        "--iree-hip-target=${IREE_HIP_TEST_TARGET_CHIP}"
+      RUNNER_FLAGS
+      LABELS
+    )
+  endforeach()
+endforeach()
+
+###############################################################################
+#
+# Winograd GPU - ROCm/HIP, CDNA(gfx9).
+#
+###############################################################################
+
+foreach(_DTYPE_AND_LAYOUT IN LISTS _DTYPES_AND_LAYOUTS)
+  foreach(_SIZE IN LISTS _SIZES)
+    iree_test_suites_runner_test(
+      NAME
+        conv2d_winograd_rocm_hip_${_DTYPE_AND_LAYOUT}_${_SIZE}
+      TESTS_SRC
+        "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}.mlir"
+      CALLS_SRC
+        "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}_calls.mlir"
+      TEST_RUNNER
+        iree-test-suites_iree-e2e-conv2d-test
+      TARGET_BACKEND
+        "rocm"
+      DRIVER
+        "hip"
+      COMPILER_FLAGS
+        "--iree-preprocessing-pass-pipeline=builtin.module\(func.func\(iree-linalg-ext-convert-conv2d-to-winograd{replace-all-convs=true}\)\)"
+        "--iree-hip-target=${IREE_HIP_TEST_TARGET_CHIP}"
+      RUNNER_FLAGS
+      LABELS
+    )
+  endforeach()
+endforeach()
+
+###############################################################################
+#
+# GPU - ROCm/HIP, RDNA3(gfx11)
+#
+###############################################################################
+
+elseif(IREE_HIP_TEST_TARGET_CHIP MATCHES "^gfx11")
+
+foreach(_DTYPE_AND_LAYOUT IN LISTS _DTYPES_AND_LAYOUTS)
+  foreach(_SIZE IN LISTS _SIZES)
+    iree_test_suites_runner_test(
+      NAME
+        conv2d_rocm_hip_${_DTYPE_AND_LAYOUT}_${_SIZE}
+      TESTS_SRC
+        "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}.mlir"
+      CALLS_SRC
+        "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}_calls.mlir"
+      TEST_RUNNER
+        iree-test-suites_iree-e2e-conv2d-test
+      TARGET_BACKEND
+        "rocm"
+      DRIVER
+        "hip"
+      COMPILER_FLAGS
+        "--iree-hip-target=${IREE_HIP_TEST_TARGET_CHIP}"
+      RUNNER_FLAGS
+      LABELS
+    )
+  endforeach()
+endforeach()
+
+###############################################################################
+#
+# Winograd GPU - ROCm/HIP, RDNA3(gfx11).
+#
+###############################################################################
+
+foreach(_DTYPE_AND_LAYOUT IN LISTS _DTYPES_AND_LAYOUTS)
+  foreach(_SIZE IN LISTS _SIZES)
+    iree_test_suites_runner_test(
+      NAME
+        conv2d_winograd_rocm_hip_${_DTYPE_AND_LAYOUT}_${_SIZE}
+      TESTS_SRC
+        "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}.mlir"
+      CALLS_SRC
+        "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}_calls.mlir"
+      TEST_RUNNER
+        iree-test-suites_iree-e2e-conv2d-test
+      TARGET_BACKEND
+        "rocm"
+      DRIVER
+        "hip"
+      COMPILER_FLAGS
+        "--iree-preprocessing-pass-pipeline=builtin.module\(func.func\(iree-linalg-ext-convert-conv2d-to-winograd{replace-all-convs=true}\)\)"
+        "--iree-hip-target=${IREE_HIP_TEST_TARGET_CHIP}"
+      RUNNER_FLAGS
+      LABELS
+    )
+  endforeach()
+endforeach()
+
+endif()
+
+# CPU and GPU tests without Winograd (integer element types).
+set(_DTYPES_AND_LAYOUTS)
+list(APPEND _DTYPES_AND_LAYOUTS "i8_nhwc_i8_hwcf_i32")
+list(APPEND _DTYPES_AND_LAYOUTS "i8_nchw_i8_fchw_i32")
+
+###############################################################################
+#
+# CPU - llvm-cpu on local-task, default flags.
+#
+###############################################################################
+
+foreach(_DTYPE_AND_LAYOUT IN LISTS _DTYPES_AND_LAYOUTS)
+  foreach(_SIZE IN LISTS _SIZES)
+    iree_test_suites_runner_test(
+      NAME
+        conv2d_llvm-cpu_local-task_${_DTYPE_AND_LAYOUT}_${_SIZE}
+      TESTS_SRC
+        "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}.mlir"
+      CALLS_SRC
+        "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}_calls.mlir"
+      TEST_RUNNER
+        iree-test-suites_iree-e2e-conv2d-test
+      TARGET_BACKEND
+        "llvm-cpu"
+      DRIVER
+        "local-task"
+      COMPILER_FLAGS
+      RUNNER_FLAGS
+      LABELS
+         "hostonly"
+         "local"
+    )
+  endforeach()
+endforeach()
+
+###############################################################################
+#
+# GPU - ROCm/HIP, CDNA(gfx9).
+#
+###############################################################################
+
+# To distinguish between CDNA(gfx9) and RDNA3(gfx11)
+if(IREE_HIP_TEST_TARGET_CHIP MATCHES "^gfx9")
+
+foreach(_DTYPE_AND_LAYOUT IN LISTS _DTYPES_AND_LAYOUTS)
+  foreach(_SIZE IN LISTS _SIZES)
+    iree_test_suites_runner_test(
+      NAME
+        conv2d_rocm_hip_${_DTYPE_AND_LAYOUT}_${_SIZE}
+      TESTS_SRC
+        "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}.mlir"
+      CALLS_SRC
+        "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}_calls.mlir"
+      TEST_RUNNER
+        iree-test-suites_iree-e2e-conv2d-test
+      TARGET_BACKEND
+        "rocm"
+      DRIVER
+        "hip"
+      COMPILER_FLAGS
+        "--iree-hip-target=${IREE_HIP_TEST_TARGET_CHIP}"
+      RUNNER_FLAGS
+      LABELS
+    )
+  endforeach()
+endforeach()
+
+###############################################################################
+#
+# GPU - ROCm/HIP, RDNA3(gfx11)
+#
+###############################################################################
+elseif(IREE_HIP_TEST_TARGET_CHIP MATCHES "^gfx11")
+
+foreach(_DTYPE_AND_LAYOUT IN LISTS _DTYPES_AND_LAYOUTS)
+  foreach(_SIZE IN LISTS _SIZES)
+    iree_test_suites_runner_test(
+      NAME
+        conv2d_rocm_hip_${_DTYPE_AND_LAYOUT}_${_SIZE}
+      TESTS_SRC
+        "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}.mlir"
+      CALLS_SRC
+        "generated/${_DTYPE_AND_LAYOUT}/conv2d_${_DTYPE_AND_LAYOUT}_${_SIZE}_calls.mlir"
+      TEST_RUNNER
+        iree-test-suites_iree-e2e-conv2d-test
+      TARGET_BACKEND
+        "rocm"
+      DRIVER
+        "hip"
+      COMPILER_FLAGS
+        "--iree-hip-target=${IREE_HIP_TEST_TARGET_CHIP}"
+      RUNNER_FLAGS
+      LABELS
+    )
+  endforeach()
+endforeach()
+
+endif()
diff --git a/linalg_ops/convolution/generate_e2e_conv2d_tests.py b/linalg_ops/convolution/generate_e2e_conv2d_tests.py
new file mode 100644
index 0000000..1a1d984
--- /dev/null
+++ b/linalg_ops/convolution/generate_e2e_conv2d_tests.py
@@ -0,0 +1,699 @@
+#!/usr/bin/env python3
+# Copyright 2024 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Generator for e2e conv2d tests.
+"""
+
+from typing import Optional
+import argparse
+import enum
+import dataclasses
+import typing
+import math
+import itertools
+import re
+
+
+# Data type of kernel entries. The string values must match MLIR data types.
+@enum.unique
+class KernelElemTypeId(enum.Enum):
+    NONE = ""
+    I8 = "i8"
+    F16 = "f16"
+    F32 = "f32"
+
+
+# Data type of input entries. The string values must match MLIR data types.
+@enum.unique
+class InputElemTypeId(enum.Enum):
+    NONE = ""
+    I8 = "i8"
+    F16 = "f16"
+    F32 = "f32"
+
+
+# Data type of accumulator (result) entries. The string values must match MLIR data types.
+@enum.unique
+class AccElemTypeId(enum.Enum):
+    NONE = ""
+    F16 = "f16"
+    I32 = "i32"
+    F32 = "f32"
+
+
+# Enumerates of the collections of shapes that we can generate tests for.
+# The values are the accepted values for the --shapes= flag.
+@enum.unique
+class ShapesId(enum.Enum):
+    SMALL = "small"
+    MEDIUM = "medium"
+    LARGE = "large"
+
+
+# Enumerates ways to construct MLIR tensor types.
+# TODO: Enable dynamic dimensions once the tests start passing.
+@enum.unique
+class Dynamicity(enum.Enum):
+    DYNAMIC = "dynamic"  # Use '?' everywhere. Example: tensor<?x?xf32>.
+    STATIC = "static"  # Use fixed values everywhere. Example: tensor<4x6xf32>.
+    MIXED = "mixed"  # Randomly mix '?' and values. Example: tensor<?x4xf32>.
+
+
+# TODO: Add more input layouts as needed. The layout determines the dim of input and kernel.
+@enum.unique
+class InputLayout(enum.Enum):
+    NCHW = "nchw"
+    NHWC = "nhwc"
+
+
+# TODO: Add more kernel layouts as needed.
+@enum.unique
+class KernelLayout(enum.Enum):
+    FCHW = "fchw"
+    HWCF = "hwcf"
+
+
+# Describes the shape of a tensor conv2d in the usual convention:
+# the input is {n}x{c}x{h}x{w}, the kernel is {f}x{c}x{kh}x{kw}, the accumulator/result is
+# {n}x{f}x{oh}x{ow}.
+# The extra `accumulate` boolean tells whether the conv2d is accumulating into
+# an existing accumulator (acc += conv2d(input, kernel)) or just overwriting
+# the result (acc = conv2d(input, kernel)).
+@dataclasses.dataclass
+class TestShape:
+    n: int
+    c: int
+    h: int
+    w: int
+    kh: int
+    kw: int
+    f: int
+    accumulate: bool
+
+
+# Attributes for the linalg.conv2d operation.
+@dataclasses.dataclass
+class ConvAttrs:
+    STRIDE: typing.Tuple[int, int] = (1, 1)
+    DILATION: typing.Tuple[int, int] = (1, 1)
+
+
+# Returns the list of TestShape's to use for the collection of shapes
+# identified by shapes_id.
+def get_test_shapes(shapes_id: ShapesId):
+    # Notes:
+    # 1. Be conservative in adding more shapes, as that can increase both the
+    #    build and execution latency of tests. The build latency is nearly the
+    #    same for all shapes, while execution latency grows linearly with
+    #    n*f*ow*oh*kh*kw.
+
+    if shapes_id == ShapesId.SMALL:
+        return [
+            TestShape(n=1, c=1, h=1, w=1, kh=1, kw=1, f=1, accumulate=True),
+            TestShape(n=1, c=1, h=16, w=16, kh=2, kw=2, f=1, accumulate=True),
+            TestShape(n=2, c=2, h=32, w=32, kh=3, kw=3, f=2, accumulate=True),
+        ]
+    if shapes_id == ShapesId.MEDIUM:
+        return [
+            TestShape(n=2, c=2, h=32, w=32, kh=3, kw=3, f=2, accumulate=True),
+            TestShape(n=2, c=2, h=32, w=32, kh=3, kw=3, f=64, accumulate=True),
+            TestShape(n=2, c=16, h=32, w=32, kh=3, kw=3, f=64, accumulate=True),
+        ]
+    if shapes_id == ShapesId.LARGE:
+        return [
+            TestShape(n=2, c=4, h=128, w=128, kh=3, kw=3, f=8, accumulate=True),
+            TestShape(n=2, c=3, h=128, w=128, kh=3, kw=3, f=12, accumulate=True),
+        ]
+
+    raise ValueError(shapes_id)
+
+
+# A shape dimension value, i.e. a size value that could appear in a MLIR type
+# such as 'tensor<?x4xf32>'. None means a dynamic size, similar to '?' in MLIR.
+@dataclasses.dataclass
+class DimSize:
+    value: typing.Optional[int]
+
+
+# Wraps a runtime dimension into the DimSize used in generated MLIR types:
+# STATIC keeps the integer, DYNAMIC drops it (None, which maps to MLIR '?').
+def shape_dim(x: int, dynamicity: Dynamicity):
+    # Any other dynamicity (e.g. Dynamicity.MIXED) falls through to the error.
+    if dynamicity == Dynamicity.STATIC:
+        return DimSize(x)
+    if dynamicity == Dynamicity.DYNAMIC:
+        return DimSize(None)
+    raise ValueError(dynamicity)
+
+
+# Stringification for MLIR types, e.g. tensor<?x?xf32>; only None maps to "?".
+def int_or_question_mark(s: DimSize):
+    return "?" if s.value is None else s.value
+
+
+# Stringification used for generating alphanumeric identifiers, e.g.
+# func.func @somefunction_DYNxDYNxf32 ("?" is not allowed); only None maps to DYN.
+def int_or_DYN(s: DimSize):
+    return "DYN" if s.value is None else s.value
+
+
+# Determines the shape of input and kernel tensors.
+@dataclasses.dataclass
+class TestInputTensorShapes:
+    n: DimSize
+    c: DimSize
+    h: DimSize
+    w: DimSize
+    kh: DimSize
+    kw: DimSize
+    f: DimSize
+
+
+# Helper for generate_function. Converts the runtime dimensions of a TestShape
+# into the TestInputTensorShapes used for a test function's input tensors.
+# Every dimension goes through shape_dim with the same dynamicity.
+def generate_shapes(shape: TestShape, dynamicity: Dynamicity):
+    def as_dim(extent: int):
+        return shape_dim(extent, dynamicity)
+
+    # Converted in the same order as the fields of TestInputTensorShapes.
+    n, c = as_dim(shape.n), as_dim(shape.c)
+    h, w = as_dim(shape.h), as_dim(shape.w)
+    kh, kw = as_dim(shape.kh), as_dim(shape.kw)
+    f = as_dim(shape.f)
+    return TestInputTensorShapes(
+        n=n,
+        c=c,
+        h=h,
+        w=w,
+        kh=kh,
+        kw=kw,
+        f=f,
+    )
+
+
+# Output extent of one spatial dimension of a convolution without padding:
+# floor((i_shape - dilation_val * (k_shape - 1) - 1) / stride_val) + 1.
+def calc_out_shape(i_shape: int, k_shape: int, dilation_val: int, stride_val: int):
+    # Span covered by the dilated kernel; floor division keeps the math in ints.
+    effective_k = dilation_val * (k_shape - 1) + 1
+    return (i_shape - effective_k) // stride_val + 1
+
+
+# Helper to return input, kernel and output shapes based on the layouts and ConvAttrs.
+def get_tensor_shape(
+    shapes: TestShape,
+    kernel_layout: KernelLayout,
+    input_layout: InputLayout,
+    conv_attr: ConvAttrs,
+):
+    n = shapes.n
+    c = shapes.c
+    h = shapes.h
+    w = shapes.w
+    kh = shapes.kh
+    kw = shapes.kw
+    f = shapes.f
+
+    # Extract input dimensions
+    input_height, input_width = h, w
+
+    # Extract kernel dimensions
+    kernel_height, kernel_width = kh, kw
+
+    # Get the dilation and stride
+    dilation = conv_attr.DILATION
+    stride = conv_attr.STRIDE
+
+    # Calculate output height.
+    oh = calc_out_shape(input_height, kernel_height, dilation[0], stride[0])
+    # Calculate output width.
+    ow = calc_out_shape(input_width, kernel_width, dilation[1], stride[1])
+
+    input_tensor_shape, kernel_tensor_shape, output_tensor_shape = [], [], []
+
+    if input_layout == InputLayout.NCHW:
+        input_tensor_shape = [n, c, h, w]
+        output_tensor_shape = [n, f, oh, ow]
+    elif input_layout == InputLayout.NHWC:
+        input_tensor_shape = [n, h, w, c]
+        output_tensor_shape = [n, oh, ow, f]
+    else:
+        raise ValueError(input_layout)
+
+    if kernel_layout == KernelLayout.FCHW:
+        kernel_tensor_shape = [f, c, kh, kw]
+    elif kernel_layout == KernelLayout.HWCF:
+        kernel_tensor_shape = [kh, kw, c, f]
+    else:
+        raise ValueError(kernel_layout)
+
+    return input_tensor_shape, kernel_tensor_shape, output_tensor_shape
+
+
+# Helper for generate_function.
+# Generates a name for a test function in the generated MLIR code.
+def generate_function_name(
+    input_type: InputElemTypeId,
+    kernel_type: KernelElemTypeId,
+    output_type: AccElemTypeId,
+    shapes: TestInputTensorShapes,
+    accumulate: bool,
+):
+    input_t = input_type.value
+    kernel_t = kernel_type.value
+    acc_t = output_type.value
+    n = int_or_DYN(shapes.n)
+    c = int_or_DYN(shapes.c)
+    h = int_or_DYN(shapes.h)
+    w = int_or_DYN(shapes.w)
+    kh = int_or_DYN(shapes.kh)
+    kw = int_or_DYN(shapes.kw)
+    f = int_or_DYN(shapes.f)
+
+    conv2d_kind = "conv2d_accumulate" if accumulate else "conv2d"
+    return (
+        f"{conv2d_kind}_{n}_{c}_{h}_{w}_times_"
+        + f"{kh}_{kw}_{f}_dtype_{input_t}_{kernel_t}_{acc_t}"
+    )
+
+
+# Represents a generated test function.
+@dataclasses.dataclass
+class MLIRFunction:
+    name: str
+    signature: str
+    import_declaration: str
+    definition: str
+
+
+# Generates a test function in the generated MLIR code.
+# The generated function will take the same arguments as linalg.conv2d variants
+# and will just call linalg.conv2d variants with them, returning its result.
+def generate_function(
+    input_type: InputElemTypeId,
+    input_layout: InputLayout,
+    kernel_type: KernelElemTypeId,
+    kernel_layout: KernelLayout,
+    acc_type: AccElemTypeId,
+    conv2d_attr: ConvAttrs,
+    shape: TestShape,
+    dynamicity: Dynamicity,
+):
+    shapes = generate_shapes(shape, dynamicity)
+    func_name = generate_function_name(
+        input_type,
+        kernel_type,
+        acc_type,
+        shapes,
+        shape.accumulate,
+    )
+
+    input_shape, kernel_shape, output_shape = get_tensor_shape(
+        shape, kernel_layout, input_layout, conv2d_attr
+    )
+    input_tensor_type = f"tensor<{input_shape[0]}x{input_shape[1]}x{input_shape[2]}x{input_shape[3]}x{input_type.value}>"
+    kernel_tensor_type = f"tensor<{kernel_shape[0]}x{kernel_shape[1]}x{kernel_shape[2]}x{kernel_shape[3]}x{kernel_type.value}>"
+
+    acc_tensor_type = f"tensor<{output_shape[0]}x{output_shape[1]}x{output_shape[2]}x{output_shape[3]}x{acc_type.value}>"
+
+    op_name = None
+    if input_layout == InputLayout.NCHW:
+        if kernel_layout == KernelLayout.FCHW:
+            op_name = "linalg.conv_2d_nchw_fchw"
+        if kernel_layout == KernelLayout.HWCF:
+            op_name = "linalg.conv_2d_nchw_hwcf"
+    elif input_layout == InputLayout.NHWC:
+        if kernel_layout == KernelLayout.HWCF:
+            op_name = "linalg.conv_2d_nhwc_hwcf"
+
+    if op_name is None:
+        raise ValueError("Invalid combination of input_layout and kernel_layout")
+
+    conv_attr = f"{{dilations = dense<{list(conv2d_attr.DILATION)}> : tensor<2xi64>, strides = dense<{list(conv2d_attr.STRIDE)}> : tensor<2xi64>}}"
+
+    # Compilation info is optional; prints empty string by default.
+    func_definition = ""
+
+    signature = f"({input_tensor_type}, {kernel_tensor_type}, {acc_tensor_type}) -> {acc_tensor_type}"
+    import_declaration = f"func.func private @module.{func_name}(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view"
+    func_definition = func_definition + (
+        f"func.func @{func_name}(%lhs: {input_tensor_type}, %rhs: {kernel_tensor_type}, %acc: {acc_tensor_type}) -> {acc_tensor_type} {{\n"
+        f"  %result = {op_name} {conv_attr} ins(%lhs, %rhs: {input_tensor_type}, {kernel_tensor_type}) outs(%acc: {acc_tensor_type}) -> {acc_tensor_type}\n"
+        f"  return %result: {acc_tensor_type}\n"
+        f"}}"
+    )
+
+    return MLIRFunction(
+        name=func_name,
+        signature=signature,
+        import_declaration=import_declaration,
+        definition=func_definition,
+    )
+
+
+# Represents a call to a generated test function.
+@dataclasses.dataclass
+class TestCall:
+    function: MLIRFunction
+    op: str
+
+
+# Enumerates ways to initialize tensor buffer contents.
+@enum.unique
+class TensorGenerator(enum.Enum):
+    ZERO = "zero"  # Fill with zeros
+    RANDOM = "random"  # Fill with (deterministic) pseudorandom values.
+
+
+# Intentionally fixed seed! We want full reproducibility here, both across runs
+# and across machines.
+# Intentionally not shared with local_pseudorandom_state to limit the ways
+# in which shuffling testcases changes which random values are generated.
+pseudorandom_generator_seed = 1
+
+
+# Generate a 4d tensor function argument of the given size as `%name`.
+def generate_random_4d_tensor(
+    name: str,
+    tensor_shape: list,
+    element_type: typing.Union[InputElemTypeId, KernelElemTypeId],
+):
+    global pseudorandom_generator_seed
+    pseudorandom_generator_seed = pseudorandom_generator_seed + 1
+    return (
+        f"  %{name}_dim0 = arith.constant {tensor_shape[0]} : i64\n"
+        f"  %{name}_dim1 = arith.constant {tensor_shape[1]} : i64\n"
+        f"  %{name}_dim2 = arith.constant {tensor_shape[2]} : i64\n"
+        f"  %{name}_dim3 = arith.constant {tensor_shape[3]} : i64\n"
+        f"  %{name}_element_type = hal.element_type<{element_type.value}> : i32\n"
+        f"  %{name}_seed = arith.constant {pseudorandom_generator_seed} : i32\n"
+        f"  %{name} = call @conv2d_test.generate_random_tensor(%device, %{name}_dim0, %{name}_dim1, %{name}_dim2, %{name}_dim3, %{name}_element_type, %{name}_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view\n"
+    )
+
+
+call_id = 0
+
+
+def generate_call(
+    function: MLIRFunction,
+    input_type: InputElemTypeId,
+    input_layout: InputLayout,
+    kernel_type: KernelElemTypeId,
+    kernel_layout: KernelLayout,
+    conv2d_attr: ConvAttrs,
+    acc_type: AccElemTypeId,
+    shape: TestShape,
+):
+    """Builds the TestCall whose MLIR op creates the inputs, calls `function` and checks the result."""
+    global call_id
+    func_name = f"{function.name}_{shape.n}_{shape.c}_{shape.h}_{shape.w}_{shape.f}_{shape.kh}_{shape.kw}"
+    if shape.accumulate:
+        func_name = f"{func_name}_acc"
+    func_name = f"{func_name}_{call_id}"
+    call_id = call_id + 1
+
+    # Output tensor layout code passed to check_conv2d_results (0 or 1, see below).
+    layout = -1
+
+    if input_layout == InputLayout.NCHW:
+        if kernel_layout == KernelLayout.FCHW or kernel_layout == KernelLayout.HWCF:
+            layout = 0  # for output tensor NxFxOHxOW
+        else:
+            raise ValueError(kernel_layout)
+    elif input_layout == InputLayout.NHWC:
+        if kernel_layout == KernelLayout.HWCF:
+            layout = 1  # for output tensor NxOHxOWxF
+        else:
+            raise ValueError(kernel_layout)
+    else:
+        raise ValueError(input_layout)
+
+    description = f"Conv2d shape (NxCxHxWxFxKHxKW): {shape.n}x{shape.c}x{shape.h}x{shape.w}x{shape.f}x{shape.kh}x{shape.kw}"
+    op = (
+        f"func.func @{func_name}() attributes {{\n"
+        f'  iree.reflection = {{description = "{description}"}}\n'
+        "} {\n"
+        "  %device_index = arith.constant 0 : index\n"
+        "  %device = hal.devices.get %device_index : !hal.device\n"
+    )
+
+    inp_shape, kernel_shape, out_shape = get_tensor_shape(
+        shape,
+        kernel_layout,
+        input_layout,
+        conv2d_attr,
+    )
+
+    op = op + generate_random_4d_tensor("input", inp_shape, input_type)
+    op = op + generate_random_4d_tensor("kernel", kernel_shape, kernel_type)
+    if shape.accumulate:
+        op = op + generate_random_4d_tensor("acc", out_shape, acc_type)
+        # TODO(#16168): work around an in-place input->output aliasing bug by passing
+        # a unique copy; rewinding the seed gives acc_copy the same seed as acc.
+        global pseudorandom_generator_seed
+        pseudorandom_generator_seed = pseudorandom_generator_seed - 1
+        op = op + generate_random_4d_tensor("acc_copy", out_shape, acc_type)
+        op = op + (
+            f"  %result = call @module.{function.name}(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view\n"
+        )
+    else:
+        op = op + (
+            f"  %acc = util.null : !hal.buffer_view\n"
+            f"  %result = call @module.{function.name}(%input, %kernel) : (!hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view\n"
+        )
+
+    op = op + (
+        f"  %n = arith.constant {shape.n} : i64\n"
+        f"  %c = arith.constant {shape.c} : i64\n"
+        f"  %h = arith.constant {shape.h} : i64\n"
+        f"  %w = arith.constant {shape.w} : i64\n"
+        f"  %f = arith.constant {shape.f} : i64\n"
+        f"  %kh = arith.constant {shape.kh} : i64\n"
+        f"  %kw = arith.constant {shape.kw} : i64\n"
+        f"  %layout = arith.constant {layout} : i64\n"
+        f"  %sh = arith.constant {conv2d_attr.STRIDE[0]} : i64\n"
+        f"  %sw = arith.constant {conv2d_attr.STRIDE[1]} : i64\n"
+        f"  %dh = arith.constant {conv2d_attr.DILATION[0]} : i64\n"
+        f"  %dw = arith.constant {conv2d_attr.DILATION[1]} : i64\n"
+        f"  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()\n"
+    )
+    op = op + "  return\n"
+    op = op + "}\n"
+
+    return TestCall(function=function, op=op)
+
+
+# Generates all output files' contents as strings.
+def generate(
+    input_elem_type: InputElemTypeId,
+    input_layout: InputLayout,
+    kernel_elem_type: KernelElemTypeId,
+    kernel_layout: KernelLayout,
+    conv2d_attr: ConvAttrs,
+    acc_type: AccElemTypeId,
+    shapes_id: ShapesId,
+):
+    functions = {}
+    calls = []
+
+    for shape in get_test_shapes(shapes_id):
+        for dynamicity in [Dynamicity.STATIC]:
+            function = generate_function(
+                input_elem_type,
+                input_layout,
+                kernel_elem_type,
+                kernel_layout,
+                acc_type,
+                conv2d_attr,
+                shape,
+                dynamicity,
+            )
+            # Different testcases may differ only by runtime parameters but
+            # share the same code. For example, dynamic-shapes testcases
+            # share the same code involving tensor<?x?xf32> even though the runtime
+            # values in the trace are different. That's why we append unconditionally
+            # to calls, but only conditionally to functions.
+            if function.name not in functions:
+                functions[function.name] = function
+            calls.append(
+                generate_call(
+                    function,
+                    input_elem_type,
+                    input_layout,
+                    kernel_elem_type,
+                    kernel_layout,
+                    conv2d_attr,
+                    acc_type,
+                    shape,
+                )
+            )
+
+    return (functions, calls)
+
+
+def parse_arguments():
+    """Parses the generator's command-line flags.
+
+    Output paths, input/kernel element types and --shapes are required. Layouts
+    default to nchw/fchw, --stride and --dilation to "1,1", and --acc_type to ""
+    (the value of AccElemTypeId.NONE). --requirements is optional.
+    """
+    parser = argparse.ArgumentParser(description="Generator of e2e conv2d tests")
+    parser.add_argument(
+        "--output_conv2d_mlir",
+        type=str,
+        help="Path of output .mlir file containing the generated conv2d functions",
+        required=True,
+    )
+    parser.add_argument(
+        "--output_calls_mlir",
+        type=str,
+        help="Path of output .mlir file containing the calls",
+        required=True,
+    )
+    parser.add_argument(
+        "--input_type",
+        type=str,
+        choices=["i8", "f32", "f16"],
+        help="Numeric type of input tensors",
+        required=True,
+    )
+    parser.add_argument(
+        "--input_layout",
+        type=str,
+        default="nchw",
+        choices=["nchw", "nhwc"],
+        help="Layout of the input tensor (default: nchw).",
+    )
+    parser.add_argument(
+        "--kernel_type",
+        type=str,
+        choices=["i8", "f32", "f16"],
+        help="Numeric type of kernel tensors",
+        required=True,
+    )
+    parser.add_argument(
+        "--kernel_layout",
+        type=str,
+        default="fchw",
+        choices=["fchw", "hwcf"],
+        help="Layout of the kernel tensor (default: fchw).",
+    )
+    parser.add_argument(
+        "--acc_type",
+        type=str,
+        choices=["i32", "f32", "f16"],
+        help="Numeric type of accumulator/result tensors",
+        default="",
+    )
+    parser.add_argument(
+        "--shapes",
+        type=str,
+        choices=[s.value for s in ShapesId],
+        help="Collection of tensor shapes to test",
+        required=True,
+    )
+    parser.add_argument(
+        "--dilation",
+        type=str,
+        default="1,1",
+        help="The dilation factor for the convolution operation. Comma-separated. As in 1,1",
+    )
+    parser.add_argument(
+        "--stride",
+        type=str,
+        default="1,1",
+        help="The stride factor for the convolution operation. Comma-separated. As in 1,1",
+    )
+    parser.add_argument(
+        "--requirements",
+        type=str,
+        help="Target requirements for this module. Comma-separated. As in -iree-llvmcpu-target-cpu-features. If the target device does not meet all of the requirements, the test will be skipped.",
+    )
+    return parser.parse_args()
+
+
+def write_code_file(functions, filename):
+    with open(filename, "w") as file:
+        for function in functions.values():
+            file.write(function.definition + "\n")
+
+
+def write_calls_file(functions, calls, filename, requirements):
+    # TODO(scotttodd): write "GENERATED BY" comment to the top of the file
+
+    # Module-level reflection information used to control the test tool.
+    # TODO(scotttodd): drop this and whatever logic in the test tool used it
+    #     multiple backends should be able to use the same input IR, so the
+    #     input IR shouldn't need things like CPU features in it
+    reflection = ""
+    if requirements:
+        reflection = (
+            "iree.reflection = {"
+            'target_features = "'
+            + ",".join([req.lstrip("+") for req in requirements.split(",")])
+            + '"'
+            "}"
+        )
+    module_definition = (
+        f"builtin.module @calls attributes {{\n" f"  {reflection}\n" f"}} {{\n\n"
+    )
+
+    # Declare the custom module that generates arguments.
+    module_definition = module_definition + (
+        "func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view\n"
+        "func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)\n"
+    )
+
+    # Declare the functions that will be called.
+    for function in functions.values():
+        module_definition = module_definition + function.import_declaration + "\n"
+    module_definition = module_definition + "\n"
+
+    # Emit the test cases for each call.
+    for call in calls:
+        module_definition = module_definition + call.op + ""
+
+    module_definition = module_definition + "}\n"
+
+    with open(filename, "w") as file:
+        file.write(module_definition)
+
+
+def main(args):
+    input_type = InputElemTypeId(args.input_type)
+    input_layout = InputLayout(args.input_layout)
+    kernel_type = KernelElemTypeId(args.kernel_type)
+    kernel_layout = KernelLayout(args.kernel_layout)
+    acc_type = AccElemTypeId(args.acc_type)
+    shapes_id = ShapesId(args.shapes)
+    conv2d_attr = ConvAttrs(
+        tuple(map(int, args.stride.split(","))),
+        tuple(map(int, args.dilation.split(","))),
+    )
+
+    (functions, calls) = generate(
+        input_type,
+        input_layout,
+        kernel_type,
+        kernel_layout,
+        conv2d_attr,
+        acc_type,
+        shapes_id,
+    )
+
+    write_code_file(functions, args.output_conv2d_mlir)
+    write_calls_file(
+        functions,
+        calls,
+        args.output_calls_mlir,
+        args.requirements,
+    )
+
+
+if __name__ == "__main__":
+    main(parse_arguments())
diff --git a/linalg_ops/convolution/generate_test_mlir_files.sh b/linalg_ops/convolution/generate_test_mlir_files.sh
new file mode 100755
index 0000000..69aea7d
--- /dev/null
+++ b/linalg_ops/convolution/generate_test_mlir_files.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+
+# Copyright 2024 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# This script runs generate_e2e_conv2d_tests for all argument combinations that
+# we are interested in testing.
+#
+# The output is a 'generated' folder with contents like this:
+#   linalg_ops/
+#     convolution/
+#       generated/
+#         f16_nchw_f16_fchw_f16/
+#           conv2d_f16_nchw_f16_fchw_f16_large_calls.mlir
+#           conv2d_f16_nchw_f16_fchw_f16_large.mlir
+#           conv2d_f16_nchw_f16_fchw_f16_medium_calls.mlir
+#           conv2d_f16_nchw_f16_fchw_f16_medium.mlir
+#           conv2d_f16_nchw_f16_fchw_f16_small_calls.mlir
+#           conv2d_f16_nchw_f16_fchw_f16_small.mlir
+#         f16_nchw_f16_fchw_f32/
+#           conv2d_f16_nchw_f16_fchw_f32_large_calls.mlir
+#           conv2d_f16_nchw_f16_fchw_f32_large.mlir
+#           conv2d_f16_nchw_f16_fchw_f32_medium_calls.mlir
+#           conv2d_f16_nchw_f16_fchw_f32_medium.mlir
+#           conv2d_f16_nchw_f16_fchw_f32_small_calls.mlir
+#           conv2d_f16_nchw_f16_fchw_f32_small.mlir
+#         ...
+#           ...
+# Usage:
+#   generate_test_mlir_files.sh
+
+set -euo pipefail
+
+this_dir="$(cd $(dirname $0) && pwd)"
+generated_dir_root="${this_dir}/generated"
+
+# Reset generated directory.
+rm -rf ${generated_dir_root?}
+mkdir -p ${generated_dir_root?}
+
+shapes=(
+  "small"
+  "medium"
+  "large"
+)
+
+# input_type;input_layout;kernel_type;kernel_layout;acc_type
+type_and_layout_combinations=(
+  "f16;nhwc;f16;hwcf;f16"
+  "f16;nchw;f16;fchw;f16"
+  "f16;nhwc;f16;hwcf;f32"
+  "f16;nchw;f16;fchw;f32"
+  "f32;nhwc;f32;hwcf;f32"
+  "f32;nchw;f32;fchw;f32"
+  "i8;nhwc;i8;hwcf;i32"
+  "i8;nchw;i8;fchw;i32"
+)
+
+for type_and_layout_combination in ${type_and_layout_combinations[@]}; do
+  IFS=";" read -r -a combination <<< "${type_and_layout_combination}"
+  input_type="${combination[0]}"
+  input_layout="${combination[1]}"
+  kernel_type="${combination[2]}"
+  kernel_layout="${combination[3]}"
+  acc_type="${combination[4]}"
+
+  type_layout_name="${input_type}_${input_layout}_${kernel_type}_${kernel_layout}_${acc_type}"
+  type_combination_dir="${generated_dir_root}/${type_layout_name}"
+  mkdir -p ${type_combination_dir}
+  
+  for shape in ${shapes[@]}; do
+    echo "Generating conv2d test files for ${type_layout_name}_${shape}"
+    name="conv2d_${type_layout_name}_${shape}"
+    python ${this_dir}/generate_e2e_conv2d_tests.py \
+      --output_conv2d_mlir=${type_combination_dir}/${name}.mlir \
+      --output_calls_mlir=${type_combination_dir}/${name}_calls.mlir \
+      --input_type=${input_type} \
+      --input_layout=${input_layout} \
+      --kernel_type=${kernel_type} \
+      --kernel_layout=${kernel_layout} \
+      --acc_type=${acc_type} \
+      --shapes=${shape}
+  done
+done
diff --git a/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_large.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_large.mlir
new file mode 100644
index 0000000..ca13bae
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_large.mlir
@@ -0,0 +1,8 @@
+func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16(%lhs: tensor<2x4x128x128xf16>, %rhs: tensor<8x4x3x3xf16>, %acc: tensor<2x8x126x126xf16>) -> tensor<2x8x126x126xf16> {
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x4x128x128xf16>, tensor<8x4x3x3xf16>) outs(%acc: tensor<2x8x126x126xf16>) -> tensor<2x8x126x126xf16>
+  return %result: tensor<2x8x126x126xf16>  // Spatial size 126 = 128 - 3 + 1 with unit stride and dilation.
+}
+func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16(%lhs: tensor<2x3x128x128xf16>, %rhs: tensor<12x3x3x3xf16>, %acc: tensor<2x12x126x126xf16>) -> tensor<2x12x126x126xf16> {
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x3x128x128xf16>, tensor<12x3x3x3xf16>) outs(%acc: tensor<2x12x126x126xf16>) -> tensor<2x12x126x126xf16>
+  return %result: tensor<2x12x126x126xf16>  // Spatial size 126 = 128 - 3 + 1 with unit stride and dilation.
+}
diff --git a/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_large_calls.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_large_calls.mlir
new file mode 100644
index 0000000..ee31f04
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_large_calls.mlir
@@ -0,0 +1,108 @@
+builtin.module @calls attributes {  // Test driver: import declarations followed by one test function per conv2d shape.
+  
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)
+func.func private @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+
+func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16_2_4_128_128_8_3_3_acc_0() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x4x128x128x8x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 4 : i64
+  %input_dim2 = arith.constant 128 : i64
+  %input_dim3 = arith.constant 128 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 8 : i64
+  %kernel_dim1 = arith.constant 4 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 8 : i64
+  %acc_dim2 = arith.constant 126 : i64
+  %acc_dim3 = arith.constant 126 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 8 : i64
+  %acc_copy_dim2 = arith.constant 126 : i64
+  %acc_copy_dim3 = arith.constant 126 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 4 : i32  // Same dims, element type and seed as %acc; the kernel gets %acc_copy, the checker gets %acc.
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 2 : i64
+  %c = arith.constant 4 : i64
+  %h = arith.constant 128 : i64
+  %w = arith.constant 128 : i64
+  %f = arith.constant 8 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64  // NOTE(review): layout id 0 accompanies the nchw/fchw kernels here; confirm the encoding against the checker.
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16_2_3_128_128_12_3_3_acc_1() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x3x128x128x12x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 3 : i64
+  %input_dim2 = arith.constant 128 : i64
+  %input_dim3 = arith.constant 128 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 12 : i64
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 12 : i64
+  %acc_dim2 = arith.constant 126 : i64
+  %acc_dim3 = arith.constant 126 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 12 : i64
+  %acc_copy_dim2 = arith.constant 126 : i64
+  %acc_copy_dim3 = arith.constant 126 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 7 : i32  // Same dims, element type and seed as %acc; the kernel gets %acc_copy, the checker gets %acc.
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 2 : i64
+  %c = arith.constant 3 : i64
+  %h = arith.constant 128 : i64
+  %w = arith.constant 128 : i64
+  %f = arith.constant 12 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64  // NOTE(review): layout id 0 accompanies the nchw/fchw kernels here; confirm the encoding against the checker.
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+}
diff --git a/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_medium.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_medium.mlir
new file mode 100644
index 0000000..b630d29
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_medium.mlir
@@ -0,0 +1,12 @@
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%lhs: tensor<2x2x32x32xf16>, %rhs: tensor<2x2x3x3xf16>, %acc: tensor<2x2x30x30xf16>) -> tensor<2x2x30x30xf16> {
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf16>, tensor<2x2x3x3xf16>) outs(%acc: tensor<2x2x30x30xf16>) -> tensor<2x2x30x30xf16>
+  return %result: tensor<2x2x30x30xf16>  // Spatial size 30 = 32 - 3 + 1 with unit stride and dilation.
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16(%lhs: tensor<2x2x32x32xf16>, %rhs: tensor<64x2x3x3xf16>, %acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16> {
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf16>, tensor<64x2x3x3xf16>) outs(%acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16>
+  return %result: tensor<2x64x30x30xf16>  // Spatial size 30 = 32 - 3 + 1 with unit stride and dilation.
+}
+func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f16(%lhs: tensor<2x16x32x32xf16>, %rhs: tensor<64x16x3x3xf16>, %acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16> {
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x16x32x32xf16>, tensor<64x16x3x3xf16>) outs(%acc: tensor<2x64x30x30xf16>) -> tensor<2x64x30x30xf16>
+  return %result: tensor<2x64x30x30xf16>  // Spatial size 30 = 32 - 3 + 1 with unit stride and dilation.
+}
diff --git a/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_medium_calls.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_medium_calls.mlir
new file mode 100644
index 0000000..1298b5b
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_medium_calls.mlir
@@ -0,0 +1,158 @@
+builtin.module @calls attributes {  // Test driver: import declarations followed by one test function per conv2d shape.
+  
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16_2_2_32_32_2_3_3_acc_0() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 2 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 32 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 2 : i64
+  %kernel_dim1 = arith.constant 2 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 2 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 30 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 2 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 30 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 4 : i32  // Same dims, element type and seed as %acc; the kernel gets %acc_copy, the checker gets %acc.
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 2 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64  // NOTE(review): layout id 0 accompanies the nchw/fchw kernels here; confirm the encoding against the checker.
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16_2_2_32_32_64_3_3_acc_1() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x64x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 2 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 32 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 64 : i64
+  %kernel_dim1 = arith.constant 2 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 64 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 30 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 64 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 30 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 7 : i32  // Same dims, element type and seed as %acc; the kernel gets %acc_copy, the checker gets %acc.
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 64 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64  // NOTE(review): layout id 0 accompanies the nchw/fchw kernels here; confirm the encoding against the checker.
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f16_2_16_32_32_64_3_3_acc_2() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x16x32x32x64x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 16 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 32 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 8 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 64 : i64
+  %kernel_dim1 = arith.constant 16 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 9 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 64 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 30 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 10 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 64 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 30 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 10 : i32  // Same dims, element type and seed as %acc; the kernel gets %acc_copy, the checker gets %acc.
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 2 : i64
+  %c = arith.constant 16 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 64 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64  // NOTE(review): layout id 0 accompanies the nchw/fchw kernels here; confirm the encoding against the checker.
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+}
diff --git a/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_small.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_small.mlir
new file mode 100644
index 0000000..66fe7fd
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_small.mlir
@@ -0,0 +1,12 @@
+func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16(%lhs: tensor<1x1x1x1xf16>, %rhs: tensor<1x1x1x1xf16>, %acc: tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16> {
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16>) outs(%acc: tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>
+  return %result: tensor<1x1x1x1xf16>  // Degenerate case: every dimension of input, kernel and output is 1.
+}
+func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16(%lhs: tensor<1x1x16x16xf16>, %rhs: tensor<1x1x2x2xf16>, %acc: tensor<1x1x15x15xf16>) -> tensor<1x1x15x15xf16> {
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x16x16xf16>, tensor<1x1x2x2xf16>) outs(%acc: tensor<1x1x15x15xf16>) -> tensor<1x1x15x15xf16>
+  return %result: tensor<1x1x15x15xf16>  // Spatial size 15 = 16 - 2 + 1 with unit stride and dilation.
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%lhs: tensor<2x2x32x32xf16>, %rhs: tensor<2x2x3x3xf16>, %acc: tensor<2x2x30x30xf16>) -> tensor<2x2x30x30xf16> {
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf16>, tensor<2x2x3x3xf16>) outs(%acc: tensor<2x2x30x30xf16>) -> tensor<2x2x30x30xf16>
+  return %result: tensor<2x2x30x30xf16>  // Spatial size 30 = 32 - 3 + 1 with unit stride and dilation.
+}
diff --git a/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_small_calls.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_small_calls.mlir
new file mode 100644
index 0000000..98438c6
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f16/conv2d_f16_nchw_f16_fchw_f16_small_calls.mlir
@@ -0,0 +1,158 @@
+builtin.module @calls attributes {
+  
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view  // External helper (no body here): takes a device, four dimensions, an element type and a seed; returns a buffer view.
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)  // External checker (no body here, no results). NOTE(review): sh/sw and dh/dw are presumably strides and dilations - confirm against the conv2d_test module.
+func.func private @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // External conv2d entry point called by the acc_0 case below.
+func.func private @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // External conv2d entry point called by the acc_1 case below.
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // External conv2d entry point called by the acc_2 case below.
+
+func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16_1_1_1_1_1_1_1_acc_0() attributes {  // Test case: generates the operands, runs the conv2d function, then passes operands and result to the checker.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x1x1x1x1x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device  // device at index 0
+  %input_dim0 = arith.constant 1 : i64
+  %input_dim1 = arith.constant 1 : i64
+  %input_dim2 = arith.constant 1 : i64
+  %input_dim3 = arith.constant 1 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 1 : i64
+  %kernel_dim1 = arith.constant 1 : i64
+  %kernel_dim2 = arith.constant 1 : i64
+  %kernel_dim3 = arith.constant 1 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 1 : i64
+  %acc_dim1 = arith.constant 1 : i64
+  %acc_dim2 = arith.constant 1 : i64
+  %acc_dim3 = arith.constant 1 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // this accumulator is only handed to the checker
+  %acc_copy_dim0 = arith.constant 1 : i64
+  %acc_copy_dim1 = arith.constant 1 : i64
+  %acc_copy_dim2 = arith.constant 1 : i64
+  %acc_copy_dim3 = arith.constant 1 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 4 : i32  // same seed value as %acc_seed
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // second accumulator built from the same dims/type/seed values; this is the one given to the conv2d call
+  %result = call @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // run the conv2d function on %acc_copy, leaving %acc untouched by this call
+  %n = arith.constant 1 : i64
+  %c = arith.constant 1 : i64
+  %h = arith.constant 1 : i64
+  %w = arith.constant 1 : i64
+  %f = arith.constant 1 : i64
+  %kh = arith.constant 1 : i64
+  %kw = arith.constant 1 : i64
+  %layout = arith.constant 0 : i64  // NOTE(review): layout code interpreted by conv2d_test; the meaning of 0 is not visible here - confirm
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()  // checker receives %acc (not %acc_copy) together with the actual %result
+  return
+}
+func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16_1_1_16_16_1_2_2_acc_1() attributes {  // Test case: generates the operands, runs the conv2d function, then passes operands and result to the checker.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x16x16x1x2x2"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device  // device at index 0
+  %input_dim0 = arith.constant 1 : i64
+  %input_dim1 = arith.constant 1 : i64
+  %input_dim2 = arith.constant 16 : i64
+  %input_dim3 = arith.constant 16 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 1 : i64
+  %kernel_dim1 = arith.constant 1 : i64
+  %kernel_dim2 = arith.constant 2 : i64
+  %kernel_dim3 = arith.constant 2 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 1 : i64
+  %acc_dim1 = arith.constant 1 : i64
+  %acc_dim2 = arith.constant 15 : i64
+  %acc_dim3 = arith.constant 15 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // this accumulator is only handed to the checker
+  %acc_copy_dim0 = arith.constant 1 : i64
+  %acc_copy_dim1 = arith.constant 1 : i64
+  %acc_copy_dim2 = arith.constant 15 : i64
+  %acc_copy_dim3 = arith.constant 15 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 7 : i32  // same seed value as %acc_seed
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // second accumulator built from the same dims/type/seed values; this is the one given to the conv2d call
+  %result = call @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // run the conv2d function on %acc_copy, leaving %acc untouched by this call
+  %n = arith.constant 1 : i64
+  %c = arith.constant 1 : i64
+  %h = arith.constant 16 : i64
+  %w = arith.constant 16 : i64
+  %f = arith.constant 1 : i64
+  %kh = arith.constant 2 : i64
+  %kw = arith.constant 2 : i64
+  %layout = arith.constant 0 : i64  // NOTE(review): layout code interpreted by conv2d_test; the meaning of 0 is not visible here - confirm
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()  // checker receives %acc (not %acc_copy) together with the actual %result
+  return
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16_2_2_32_32_2_3_3_acc_2() attributes {  // Test case: generates the operands, runs the conv2d function, then passes operands and result to the checker.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device  // device at index 0
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 2 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 32 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 8 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 2 : i64
+  %kernel_dim1 = arith.constant 2 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 9 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 2 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 30 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 10 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // this accumulator is only handed to the checker
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 2 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 30 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 10 : i32  // same seed value as %acc_seed
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // second accumulator built from the same dims/type/seed values; this is the one given to the conv2d call
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // run the conv2d function on %acc_copy, leaving %acc untouched by this call
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 2 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64  // NOTE(review): layout code interpreted by conv2d_test; the meaning of 0 is not visible here - confirm
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()  // checker receives %acc (not %acc_copy) together with the actual %result
+  return
+}
+}
diff --git a/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_large.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_large.mlir
new file mode 100644
index 0000000..21afe9d
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_large.mlir
@@ -0,0 +1,8 @@
+func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f32(%lhs: tensor<2x4x128x128xf16>, %rhs: tensor<8x4x3x3xf16>, %acc: tensor<2x8x126x126xf32>) -> tensor<2x8x126x126xf32> {  // 2-D convolution kernel with f16 operands and f32 output: 2x4x128x128 image operand, 8x4x3x3 filter, 2x8x126x126 initial output / result.
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x4x128x128xf16>, tensor<8x4x3x3xf16>) outs(%acc: tensor<2x8x126x126xf32>) -> tensor<2x8x126x126xf32>  // NCHW image / FCHW filter variant with strides and dilations of 1; %acc is the outs operand
+  return %result: tensor<2x8x126x126xf32>
+}
+func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f32(%lhs: tensor<2x3x128x128xf16>, %rhs: tensor<12x3x3x3xf16>, %acc: tensor<2x12x126x126xf32>) -> tensor<2x12x126x126xf32> {  // 2-D convolution kernel with f16 operands and f32 output: 2x3x128x128 image operand, 12x3x3x3 filter, 2x12x126x126 initial output / result.
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x3x128x128xf16>, tensor<12x3x3x3xf16>) outs(%acc: tensor<2x12x126x126xf32>) -> tensor<2x12x126x126xf32>  // NCHW image / FCHW filter variant with strides and dilations of 1; %acc is the outs operand
+  return %result: tensor<2x12x126x126xf32>
+}
diff --git a/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_large_calls.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_large_calls.mlir
new file mode 100644
index 0000000..34fdff2
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_large_calls.mlir
@@ -0,0 +1,108 @@
+builtin.module @calls attributes {
+  
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view  // External helper (no body here): takes a device, four dimensions, an element type and a seed; returns a buffer view.
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)  // External checker (no body here, no results). NOTE(review): sh/sw and dh/dw are presumably strides and dilations - confirm against the conv2d_test module.
+func.func private @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // External conv2d entry point called by the acc_0 case below.
+func.func private @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // External conv2d entry point called by the acc_1 case below.
+
+func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f32_2_4_128_128_8_3_3_acc_0() attributes {  // Test case: generates the operands, runs the conv2d function, then passes operands and result to the checker.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x4x128x128x8x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device  // device at index 0
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 4 : i64
+  %input_dim2 = arith.constant 128 : i64
+  %input_dim3 = arith.constant 128 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 8 : i64
+  %kernel_dim1 = arith.constant 4 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 8 : i64
+  %acc_dim2 = arith.constant 126 : i64
+  %acc_dim3 = arith.constant 126 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // this accumulator is only handed to the checker
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 8 : i64
+  %acc_copy_dim2 = arith.constant 126 : i64
+  %acc_copy_dim3 = arith.constant 126 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 4 : i32  // same seed value as %acc_seed
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // second accumulator built from the same dims/type/seed values; this is the one given to the conv2d call
+  %result = call @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // run the conv2d function on %acc_copy, leaving %acc untouched by this call
+  %n = arith.constant 2 : i64
+  %c = arith.constant 4 : i64
+  %h = arith.constant 128 : i64
+  %w = arith.constant 128 : i64
+  %f = arith.constant 8 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64  // NOTE(review): layout code interpreted by conv2d_test; the meaning of 0 is not visible here - confirm
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()  // checker receives %acc (not %acc_copy) together with the actual %result
+  return
+}
+func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f32_2_3_128_128_12_3_3_acc_1() attributes {  // Test case: generates the operands, runs the conv2d function, then passes operands and result to the checker.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x3x128x128x12x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device  // device at index 0
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 3 : i64
+  %input_dim2 = arith.constant 128 : i64
+  %input_dim3 = arith.constant 128 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 12 : i64
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 12 : i64
+  %acc_dim2 = arith.constant 126 : i64
+  %acc_dim3 = arith.constant 126 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // this accumulator is only handed to the checker
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 12 : i64
+  %acc_copy_dim2 = arith.constant 126 : i64
+  %acc_copy_dim3 = arith.constant 126 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 7 : i32  // same seed value as %acc_seed
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // second accumulator built from the same dims/type/seed values; this is the one given to the conv2d call
+  %result = call @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // run the conv2d function on %acc_copy, leaving %acc untouched by this call
+  %n = arith.constant 2 : i64
+  %c = arith.constant 3 : i64
+  %h = arith.constant 128 : i64
+  %w = arith.constant 128 : i64
+  %f = arith.constant 12 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64  // NOTE(review): layout code interpreted by conv2d_test; the meaning of 0 is not visible here - confirm
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()  // checker receives %acc (not %acc_copy) together with the actual %result
+  return
+}
+}
diff --git a/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_medium.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_medium.mlir
new file mode 100644
index 0000000..0f9d9df
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_medium.mlir
@@ -0,0 +1,12 @@
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%lhs: tensor<2x2x32x32xf16>, %rhs: tensor<2x2x3x3xf16>, %acc: tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32> {  // 2-D convolution kernel with f16 operands and f32 output: 2x2x32x32 image operand, 2x2x3x3 filter, 2x2x30x30 initial output / result.
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf16>, tensor<2x2x3x3xf16>) outs(%acc: tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32>  // NCHW image / FCHW filter variant with strides and dilations of 1; %acc is the outs operand
+  return %result: tensor<2x2x30x30xf32>
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f32(%lhs: tensor<2x2x32x32xf16>, %rhs: tensor<64x2x3x3xf16>, %acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> {  // 2-D convolution kernel with f16 operands and f32 output: 2x2x32x32 image operand, 64x2x3x3 filter, 2x64x30x30 initial output / result.
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf16>, tensor<64x2x3x3xf16>) outs(%acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32>  // NCHW image / FCHW filter variant with strides and dilations of 1; %acc is the outs operand
+  return %result: tensor<2x64x30x30xf32>
+}
+func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f32(%lhs: tensor<2x16x32x32xf16>, %rhs: tensor<64x16x3x3xf16>, %acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> {  // 2-D convolution kernel with f16 operands and f32 output: 2x16x32x32 image operand, 64x16x3x3 filter, 2x64x30x30 initial output / result.
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x16x32x32xf16>, tensor<64x16x3x3xf16>) outs(%acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32>  // NCHW image / FCHW filter variant with strides and dilations of 1; %acc is the outs operand
+  return %result: tensor<2x64x30x30xf32>
+}
diff --git a/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_medium_calls.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_medium_calls.mlir
new file mode 100644
index 0000000..f8798e9
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_medium_calls.mlir
@@ -0,0 +1,158 @@
+builtin.module @calls attributes {
+  
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view  // External helper (no body here): takes a device, four dimensions, an element type and a seed; returns a buffer view.
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)  // External checker (no body here, no results). NOTE(review): sh/sw and dh/dw are presumably strides and dilations - confirm against the conv2d_test module.
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // External conv2d entry point called by the acc_0 case below.
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // External conv2d entry point called by the acc_1 case below.
+func.func private @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // External conv2d entry point called by the acc_2 case below.
+
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32_2_2_32_32_2_3_3_acc_0() attributes {  // Test case: generates the operands, runs the conv2d function, then passes operands and result to the checker.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device  // device at index 0
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 2 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 32 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 2 : i64
+  %kernel_dim1 = arith.constant 2 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 2 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 30 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // this accumulator is only handed to the checker
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 2 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 30 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 4 : i32  // same seed value as %acc_seed
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // second accumulator built from the same dims/type/seed values; this is the one given to the conv2d call
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // run the conv2d function on %acc_copy, leaving %acc untouched by this call
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 2 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64  // NOTE(review): layout code interpreted by conv2d_test; the meaning of 0 is not visible here - confirm
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()  // checker receives %acc (not %acc_copy) together with the actual %result
+  return
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f32_2_2_32_32_64_3_3_acc_1() attributes {  // Test case: generates the operands, runs the conv2d function, then passes operands and result to the checker.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x64x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device  // device at index 0
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 2 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 32 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 64 : i64
+  %kernel_dim1 = arith.constant 2 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 64 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 30 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // this accumulator is only handed to the checker
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 64 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 30 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 7 : i32  // same seed value as %acc_seed
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // second accumulator built from the same dims/type/seed values; this is the one given to the conv2d call
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // run the conv2d function on %acc_copy, leaving %acc untouched by this call
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 64 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64  // NOTE(review): layout code interpreted by conv2d_test; the meaning of 0 is not visible here - confirm
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()  // checker receives %acc (not %acc_copy) together with the actual %result
+  return
+}
+func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f32_2_16_32_32_64_3_3_acc_2() attributes {  // Test case: generates the operands, runs the conv2d function, then passes operands and result to the checker.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x16x32x32x64x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device  // device at index 0
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 16 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 32 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 8 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 64 : i64
+  %kernel_dim1 = arith.constant 16 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 9 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 64 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 30 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 10 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // this accumulator is only handed to the checker
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 64 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 30 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 10 : i32  // same seed value as %acc_seed
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // second accumulator built from the same dims/type/seed values; this is the one given to the conv2d call
+  %result = call @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // run the conv2d function on %acc_copy, leaving %acc untouched by this call
+  %n = arith.constant 2 : i64
+  %c = arith.constant 16 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 64 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64  // NOTE(review): layout code interpreted by conv2d_test; the meaning of 0 is not visible here - confirm
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()  // checker receives %acc (not %acc_copy) together with the actual %result
+  return
+}
+}
diff --git a/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_small.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_small.mlir
new file mode 100644
index 0000000..f2d0ea0
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_small.mlir
@@ -0,0 +1,12 @@
+func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f32(%lhs: tensor<1x1x1x1xf16>, %rhs: tensor<1x1x1x1xf16>, %acc: tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> {  // f16 operands, f32 accumulator and result
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16>) outs(%acc: tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>  // unit strides and dilations; %acc is the outs operand
+  return %result: tensor<1x1x1x1xf32>
+}
+func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f32(%lhs: tensor<1x1x16x16xf16>, %rhs: tensor<1x1x2x2xf16>, %acc: tensor<1x1x15x15xf32>) -> tensor<1x1x15x15xf32> {  // 1x1x16x16 f16 input, 1x1x2x2 f16 filter, 1x1x15x15 f32 accumulator
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x16x16xf16>, tensor<1x1x2x2xf16>) outs(%acc: tensor<1x1x15x15xf32>) -> tensor<1x1x15x15xf32>  // unit strides and dilations; %acc is the outs operand
+  return %result: tensor<1x1x15x15xf32>
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%lhs: tensor<2x2x32x32xf16>, %rhs: tensor<2x2x3x3xf16>, %acc: tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32> {  // 2x2x32x32 f16 input, 2x2x3x3 f16 filter, 2x2x30x30 f32 accumulator
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf16>, tensor<2x2x3x3xf16>) outs(%acc: tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32>  // unit strides and dilations; %acc is the outs operand
+  return %result: tensor<2x2x30x30xf32>
+}
diff --git a/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_small_calls.mlir b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_small_calls.mlir
new file mode 100644
index 0000000..5c160c6
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_nchw_f16_fchw_f32/conv2d_f16_nchw_f16_fchw_f32_small_calls.mlir
@@ -0,0 +1,158 @@
+builtin.module @calls attributes {
+  
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view  // body-less declaration; presumably provided by the test runner (TODO confirm)
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)  // body-less declaration with no results
+func.func private @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // declaration of the function under test, taking and returning buffer views
+func.func private @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+
+func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f32_1_1_1_1_1_1_1_acc_0() attributes {  // test entry point: generate operands, call the module function, check the result
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x1x1x1x1x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device  // device at index 0
+  %input_dim0 = arith.constant 1 : i64
+  %input_dim1 = arith.constant 1 : i64
+  %input_dim2 = arith.constant 1 : i64
+  %input_dim3 = arith.constant 1 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // f16 input, dims 1x1x1x1, seed 2
+  %kernel_dim0 = arith.constant 1 : i64
+  %kernel_dim1 = arith.constant 1 : i64
+  %kernel_dim2 = arith.constant 1 : i64
+  %kernel_dim3 = arith.constant 1 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // f16 kernel, dims 1x1x1x1, seed 3
+  %acc_dim0 = arith.constant 1 : i64
+  %acc_dim1 = arith.constant 1 : i64
+  %acc_dim2 = arith.constant 1 : i64
+  %acc_dim3 = arith.constant 1 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // f32 accumulator, dims 1x1x1x1, seed 4
+  %acc_copy_dim0 = arith.constant 1 : i64
+  %acc_copy_dim1 = arith.constant 1 : i64
+  %acc_copy_dim2 = arith.constant 1 : i64
+  %acc_copy_dim3 = arith.constant 1 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 4 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // same dims, element type and seed as %acc; presumably identical contents (TODO confirm generator is deterministic per seed)
+  %result = call @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // the module is given %acc_copy, not %acc
+  %n = arith.constant 1 : i64
+  %c = arith.constant 1 : i64
+  %h = arith.constant 1 : i64
+  %w = arith.constant 1 : i64
+  %f = arith.constant 1 : i64
+  %kh = arith.constant 1 : i64
+  %kw = arith.constant 1 : i64
+  %layout = arith.constant 0 : i64  // layout selector for the checker; 0 in this nchw/fchw file (meaning is defined by the harness, TODO confirm)
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()  // checker receives the sizes, layout, strides, dilations, %input, %kernel, %acc and the module result
+  return
+}
+func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f32_1_1_16_16_1_2_2_acc_1() attributes {  // test entry point: generate operands, call the module function, check the result
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x16x16x1x2x2"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device  // device at index 0
+  %input_dim0 = arith.constant 1 : i64
+  %input_dim1 = arith.constant 1 : i64
+  %input_dim2 = arith.constant 16 : i64
+  %input_dim3 = arith.constant 16 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // f16 input, dims 1x1x16x16, seed 5
+  %kernel_dim0 = arith.constant 1 : i64
+  %kernel_dim1 = arith.constant 1 : i64
+  %kernel_dim2 = arith.constant 2 : i64
+  %kernel_dim3 = arith.constant 2 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // f16 kernel, dims 1x1x2x2, seed 6
+  %acc_dim0 = arith.constant 1 : i64
+  %acc_dim1 = arith.constant 1 : i64
+  %acc_dim2 = arith.constant 15 : i64
+  %acc_dim3 = arith.constant 15 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // f32 accumulator, dims 1x1x15x15, seed 7
+  %acc_copy_dim0 = arith.constant 1 : i64
+  %acc_copy_dim1 = arith.constant 1 : i64
+  %acc_copy_dim2 = arith.constant 15 : i64
+  %acc_copy_dim3 = arith.constant 15 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 7 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // same dims, element type and seed as %acc; presumably identical contents (TODO confirm generator is deterministic per seed)
+  %result = call @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // the module is given %acc_copy, not %acc
+  %n = arith.constant 1 : i64
+  %c = arith.constant 1 : i64
+  %h = arith.constant 16 : i64
+  %w = arith.constant 16 : i64
+  %f = arith.constant 1 : i64
+  %kh = arith.constant 2 : i64
+  %kw = arith.constant 2 : i64
+  %layout = arith.constant 0 : i64  // layout selector for the checker; 0 in this nchw/fchw file (meaning is defined by the harness, TODO confirm)
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()  // checker receives the sizes, layout, strides, dilations, %input, %kernel, %acc and the module result
+  return
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32_2_2_32_32_2_3_3_acc_2() attributes {  // test entry point: generate operands, call the module function, check the result
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device  // device at index 0
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 2 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 32 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 8 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // f16 input, dims 2x2x32x32, seed 8
+  %kernel_dim0 = arith.constant 2 : i64
+  %kernel_dim1 = arith.constant 2 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 9 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // f16 kernel, dims 2x2x3x3, seed 9
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 2 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 30 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 10 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // f32 accumulator, dims 2x2x30x30, seed 10
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 2 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 30 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 10 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // same dims, element type and seed as %acc; presumably identical contents (TODO confirm generator is deterministic per seed)
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // the module is given %acc_copy, not %acc
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 2 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64  // layout selector for the checker; 0 in this nchw/fchw file (meaning is defined by the harness, TODO confirm)
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()  // checker receives the sizes, layout, strides, dilations, %input, %kernel, %acc and the module result
+  return
+}
+}
diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_large.mlir b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_large.mlir
new file mode 100644
index 0000000..17eb9e8
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_large.mlir
@@ -0,0 +1,8 @@
+func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16(%lhs: tensor<2x128x128x4xf16>, %rhs: tensor<3x3x4x8xf16>, %acc: tensor<2x126x126x8xf16>) -> tensor<2x126x126x8xf16> {  // 2x128x128x4 f16 input, 3x3x4x8 f16 filter, 2x126x126x8 f16 accumulator
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x128x128x4xf16>, tensor<3x3x4x8xf16>) outs(%acc: tensor<2x126x126x8xf16>) -> tensor<2x126x126x8xf16>  // unit strides and dilations; %acc is the outs operand
+  return %result: tensor<2x126x126x8xf16>
+}
+func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16(%lhs: tensor<2x128x128x3xf16>, %rhs: tensor<3x3x3x12xf16>, %acc: tensor<2x126x126x12xf16>) -> tensor<2x126x126x12xf16> {  // 2x128x128x3 f16 input, 3x3x3x12 f16 filter, 2x126x126x12 f16 accumulator
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x128x128x3xf16>, tensor<3x3x3x12xf16>) outs(%acc: tensor<2x126x126x12xf16>) -> tensor<2x126x126x12xf16>  // unit strides and dilations; %acc is the outs operand
+  return %result: tensor<2x126x126x12xf16>
+}
diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_large_calls.mlir b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_large_calls.mlir
new file mode 100644
index 0000000..b07a2e5
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_large_calls.mlir
@@ -0,0 +1,108 @@
+builtin.module @calls attributes {
+  
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view  // body-less declaration; presumably provided by the test runner (TODO confirm)
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)  // body-less declaration with no results
+func.func private @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // declaration of the function under test, taking and returning buffer views
+func.func private @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+
+func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16_2_4_128_128_8_3_3_acc_0() attributes {  // test entry point: generate operands, call the module function, check the result
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x4x128x128x8x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device  // device at index 0
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 128 : i64
+  %input_dim2 = arith.constant 128 : i64
+  %input_dim3 = arith.constant 4 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // f16 input, dims 2x128x128x4 (channel count last), seed 2
+  %kernel_dim0 = arith.constant 3 : i64
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 4 : i64
+  %kernel_dim3 = arith.constant 8 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // f16 kernel, dims 3x3x4x8, seed 3
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 126 : i64
+  %acc_dim2 = arith.constant 126 : i64
+  %acc_dim3 = arith.constant 8 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // f16 accumulator, dims 2x126x126x8, seed 4
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 126 : i64
+  %acc_copy_dim2 = arith.constant 126 : i64
+  %acc_copy_dim3 = arith.constant 8 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 4 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // same dims, element type and seed as %acc; presumably identical contents (TODO confirm generator is deterministic per seed)
+  %result = call @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // the module is given %acc_copy, not %acc
+  %n = arith.constant 2 : i64
+  %c = arith.constant 4 : i64
+  %h = arith.constant 128 : i64
+  %w = arith.constant 128 : i64
+  %f = arith.constant 8 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 1 : i64  // layout selector for the checker; 1 in this nhwc/hwcf file (meaning is defined by the harness, TODO confirm)
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()  // checker receives the sizes, layout, strides, dilations, %input, %kernel, %acc and the module result
+  return
+}
+func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16_2_3_128_128_12_3_3_acc_1() attributes {  // test entry point: generate operands, call the module function, check the result
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x3x128x128x12x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device  // device at index 0
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 128 : i64
+  %input_dim2 = arith.constant 128 : i64
+  %input_dim3 = arith.constant 3 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // f16 input, dims 2x128x128x3 (channel count last), seed 5
+  %kernel_dim0 = arith.constant 3 : i64
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 12 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // f16 kernel, dims 3x3x3x12, seed 6
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 126 : i64
+  %acc_dim2 = arith.constant 126 : i64
+  %acc_dim3 = arith.constant 12 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // f16 accumulator, dims 2x126x126x12, seed 7
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 126 : i64
+  %acc_copy_dim2 = arith.constant 126 : i64
+  %acc_copy_dim3 = arith.constant 12 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 7 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // same dims, element type and seed as %acc; presumably identical contents (TODO confirm generator is deterministic per seed)
+  %result = call @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // the module is given %acc_copy, not %acc
+  %n = arith.constant 2 : i64
+  %c = arith.constant 3 : i64
+  %h = arith.constant 128 : i64
+  %w = arith.constant 128 : i64
+  %f = arith.constant 12 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 1 : i64  // layout selector for the checker; 1 in this nhwc/hwcf file (meaning is defined by the harness, TODO confirm)
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()  // checker receives the sizes, layout, strides, dilations, %input, %kernel, %acc and the module result
+  return
+}
+}
diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_medium.mlir b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_medium.mlir
new file mode 100644
index 0000000..addb8a2
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_medium.mlir
@@ -0,0 +1,12 @@
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%lhs: tensor<2x32x32x2xf16>, %rhs: tensor<3x3x2x2xf16>, %acc: tensor<2x30x30x2xf16>) -> tensor<2x30x30x2xf16> {  // 2x32x32x2 f16 input, 3x3x2x2 f16 filter, 2x30x30x2 f16 accumulator
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x2xf16>, tensor<3x3x2x2xf16>) outs(%acc: tensor<2x30x30x2xf16>) -> tensor<2x30x30x2xf16>  // unit strides and dilations; %acc is the outs operand
+  return %result: tensor<2x30x30x2xf16>
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16(%lhs: tensor<2x32x32x2xf16>, %rhs: tensor<3x3x2x64xf16>, %acc: tensor<2x30x30x64xf16>) -> tensor<2x30x30x64xf16> {  // 2x32x32x2 f16 input, 3x3x2x64 f16 filter, 2x30x30x64 f16 accumulator
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x2xf16>, tensor<3x3x2x64xf16>) outs(%acc: tensor<2x30x30x64xf16>) -> tensor<2x30x30x64xf16>  // unit strides and dilations; %acc is the outs operand
+  return %result: tensor<2x30x30x64xf16>
+}
+func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f16(%lhs: tensor<2x32x32x16xf16>, %rhs: tensor<3x3x16x64xf16>, %acc: tensor<2x30x30x64xf16>) -> tensor<2x30x30x64xf16> {  // 2x32x32x16 f16 input, 3x3x16x64 f16 filter, 2x30x30x64 f16 accumulator
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x16xf16>, tensor<3x3x16x64xf16>) outs(%acc: tensor<2x30x30x64xf16>) -> tensor<2x30x30x64xf16>  // unit strides and dilations; %acc is the outs operand
+  return %result: tensor<2x30x30x64xf16>
+}
diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_medium_calls.mlir b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_medium_calls.mlir
new file mode 100644
index 0000000..17ee9c1
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_medium_calls.mlir
@@ -0,0 +1,158 @@
+builtin.module @calls attributes {
+  
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view  // body-less declaration; presumably provided by the test runner (TODO confirm)
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)  // body-less declaration with no results
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // declaration of the function under test, taking and returning buffer views
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16_2_2_32_32_2_3_3_acc_0() attributes {  // test entry point: generate operands, call the module function, check the result
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device  // device at index 0
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 32 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 2 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // f16 input, dims 2x32x32x2 (channel count last), seed 2
+  %kernel_dim0 = arith.constant 3 : i64
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 2 : i64
+  %kernel_dim3 = arith.constant 2 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // f16 kernel, dims 3x3x2x2, seed 3
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 30 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 2 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // f16 accumulator, dims 2x30x30x2, seed 4
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 30 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 2 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 4 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // same dims, element type and seed as %acc; presumably identical contents (TODO confirm generator is deterministic per seed)
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // the module is given %acc_copy, not %acc
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 2 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 1 : i64  // layout selector for the checker; 1 in this nhwc/hwcf file (meaning is defined by the harness, TODO confirm)
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()  // checker receives the sizes, layout, strides, dilations, %input, %kernel, %acc and the module result
+  return
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16_2_2_32_32_64_3_3_acc_1() attributes {  // test entry point: generate operands, call the module function, check the result
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x64x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device  // device at index 0
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 32 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 2 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // f16 input, dims 2x32x32x2 (channel count last), seed 5
+  %kernel_dim0 = arith.constant 3 : i64
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 2 : i64
+  %kernel_dim3 = arith.constant 64 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // f16 kernel, dims 3x3x2x64, seed 6
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 30 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 64 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // f16 accumulator, dims 2x30x30x64, seed 7
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 30 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 64 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 7 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // same dims, element type and seed as %acc; presumably identical contents (TODO confirm generator is deterministic per seed)
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // the module is given %acc_copy, not %acc
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 64 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 1 : i64  // layout selector for the checker; 1 in this nhwc/hwcf file (meaning is defined by the harness, TODO confirm)
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()  // checker receives the sizes, layout, strides, dilations, %input, %kernel, %acc and the module result
+  return
+}
+func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f16_2_16_32_32_64_3_3_acc_2() attributes {  // test entry point: generate operands, call the module function, check the result
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x16x32x32x64x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device  // device at index 0
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 32 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 16 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 8 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // f16 input, dims 2x32x32x16 (channel count last), seed 8
+  %kernel_dim0 = arith.constant 3 : i64
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 16 : i64
+  %kernel_dim3 = arith.constant 64 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 9 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // f16 kernel, dims 3x3x16x64, seed 9
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 30 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 64 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 10 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // f16 accumulator, dims 2x30x30x64, seed 10
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 30 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 64 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 10 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view  // same dims, element type and seed as %acc; presumably identical contents (TODO confirm generator is deterministic per seed)
+  %result = call @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // the module is given %acc_copy, not %acc
+  %n = arith.constant 2 : i64
+  %c = arith.constant 16 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 64 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 1 : i64  // layout selector for the checker; 1 in this nhwc/hwcf file (meaning is defined by the harness, TODO confirm)
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()  // checker receives the sizes, layout, strides, dilations, %input, %kernel, %acc and the module result
+  return
+}
+}
diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_small.mlir b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_small.mlir
new file mode 100644
index 0000000..b3bf59e
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_small.mlir
@@ -0,0 +1,12 @@
+func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16(%lhs: tensor<1x1x1x1xf16>, %rhs: tensor<1x1x1x1xf16>, %acc: tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16> {  // wraps a single linalg.conv_2d_nhwc_hwcf; all operands and the result are f16
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16>) outs(%acc: tensor<1x1x1x1xf16>) -> tensor<1x1x1x1xf16>  // strides and dilations are all 1; %acc is the outs operand and has the result type
+  return %result: tensor<1x1x1x1xf16>
+}
+func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16(%lhs: tensor<1x16x16x1xf16>, %rhs: tensor<2x2x1x1xf16>, %acc: tensor<1x15x15x1xf16>) -> tensor<1x15x15x1xf16> {  // wraps a single linalg.conv_2d_nhwc_hwcf; all operands and the result are f16
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x16x16x1xf16>, tensor<2x2x1x1xf16>) outs(%acc: tensor<1x15x15x1xf16>) -> tensor<1x15x15x1xf16>  // strides and dilations are all 1; output extent 15 = 16 - 2 + 1
+  return %result: tensor<1x15x15x1xf16>
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%lhs: tensor<2x32x32x2xf16>, %rhs: tensor<3x3x2x2xf16>, %acc: tensor<2x30x30x2xf16>) -> tensor<2x30x30x2xf16> {  // wraps a single linalg.conv_2d_nhwc_hwcf; all operands and the result are f16
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x2xf16>, tensor<3x3x2x2xf16>) outs(%acc: tensor<2x30x30x2xf16>) -> tensor<2x30x30x2xf16>  // strides and dilations are all 1; output extent 30 = 32 - 3 + 1
+  return %result: tensor<2x30x30x2xf16>
+}
diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_small_calls.mlir b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_small_calls.mlir
new file mode 100644
index 0000000..f50584a
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f16/conv2d_f16_nhwc_f16_hwcf_f16_small_calls.mlir
@@ -0,0 +1,158 @@
+builtin.module @calls attributes {
+  
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view  // bodiless declaration: device, four dims, element type and seed in, one buffer view out
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)  // bodiless declaration with no result
+func.func private @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // bodiless declaration; called by the _acc_0 test case
+func.func private @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // bodiless declaration; called by the _acc_1 test case
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // bodiless declaration; called by the _acc_2 test case
+
+func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16_1_1_1_1_1_1_1_acc_0() attributes {  // test case: takes no arguments and returns nothing
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x1x1x1x1x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device  // passed to the tensor generators and to the result check
+  %input_dim0 = arith.constant 1 : i64
+  %input_dim1 = arith.constant 1 : i64
+  %input_dim2 = arith.constant 1 : i64
+  %input_dim3 = arith.constant 1 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 1 : i64
+  %kernel_dim1 = arith.constant 1 : i64
+  %kernel_dim2 = arith.constant 1 : i64
+  %kernel_dim3 = arith.constant 1 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 1 : i64
+  %acc_dim1 = arith.constant 1 : i64
+  %acc_dim2 = arith.constant 1 : i64
+  %acc_dim3 = arith.constant 1 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 1 : i64
+  %acc_copy_dim1 = arith.constant 1 : i64
+  %acc_copy_dim2 = arith.constant 1 : i64
+  %acc_copy_dim3 = arith.constant 1 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 4 : i32  // same seed, dims and element type as %acc; presumably yields identical contents - confirm in generate_random_tensor
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // the module receives %acc_copy; %acc itself is only passed to the check call
+  %n = arith.constant 1 : i64
+  %c = arith.constant 1 : i64
+  %h = arith.constant 1 : i64
+  %w = arith.constant 1 : i64
+  %f = arith.constant 1 : i64
+  %kh = arith.constant 1 : i64
+  %kw = arith.constant 1 : i64
+  %layout = arith.constant 1 : i64  // NOTE(review): layout code is interpreted by check_conv2d_results; its meaning is not visible here
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16_1_1_16_16_1_2_2_acc_1() attributes {  // test case: takes no arguments and returns nothing
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x16x16x1x2x2"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device  // passed to the tensor generators and to the result check
+  %input_dim0 = arith.constant 1 : i64
+  %input_dim1 = arith.constant 16 : i64
+  %input_dim2 = arith.constant 16 : i64
+  %input_dim3 = arith.constant 1 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 2 : i64
+  %kernel_dim1 = arith.constant 2 : i64
+  %kernel_dim2 = arith.constant 1 : i64
+  %kernel_dim3 = arith.constant 1 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 1 : i64
+  %acc_dim1 = arith.constant 15 : i64  // 15 = 16 - 2 + 1 for the %h/%kh values below
+  %acc_dim2 = arith.constant 15 : i64
+  %acc_dim3 = arith.constant 1 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 1 : i64
+  %acc_copy_dim1 = arith.constant 15 : i64
+  %acc_copy_dim2 = arith.constant 15 : i64
+  %acc_copy_dim3 = arith.constant 1 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 7 : i32  // same seed, dims and element type as %acc; presumably yields identical contents - confirm in generate_random_tensor
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // the module receives %acc_copy; %acc itself is only passed to the check call
+  %n = arith.constant 1 : i64
+  %c = arith.constant 1 : i64
+  %h = arith.constant 16 : i64
+  %w = arith.constant 16 : i64
+  %f = arith.constant 1 : i64
+  %kh = arith.constant 2 : i64
+  %kw = arith.constant 2 : i64
+  %layout = arith.constant 1 : i64  // NOTE(review): layout code is interpreted by check_conv2d_results; its meaning is not visible here
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16_2_2_32_32_2_3_3_acc_2() attributes {  // test case: takes no arguments and returns nothing
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device  // passed to the tensor generators and to the result check
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 32 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 2 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 8 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 3 : i64
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 2 : i64
+  %kernel_dim3 = arith.constant 2 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 9 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 30 : i64  // 30 = 32 - 3 + 1 for the %h/%kh values below
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 2 : i64
+  %acc_element_type = hal.element_type<f16> : i32
+  %acc_seed = arith.constant 10 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 30 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 2 : i64
+  %acc_copy_element_type = hal.element_type<f16> : i32
+  %acc_copy_seed = arith.constant 10 : i32  // same seed, dims and element type as %acc; presumably yields identical contents - confirm in generate_random_tensor
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f16(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // the module receives %acc_copy; %acc itself is only passed to the check call
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 2 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 1 : i64  // NOTE(review): layout code is interpreted by check_conv2d_results; its meaning is not visible here
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+}
diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_large.mlir b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_large.mlir
new file mode 100644
index 0000000..2a7b2f2
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_large.mlir
@@ -0,0 +1,8 @@
+func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f32(%lhs: tensor<2x128x128x4xf16>, %rhs: tensor<3x3x4x8xf16>, %acc: tensor<2x126x126x8xf32>) -> tensor<2x126x126x8xf32> {  // wraps a single linalg.conv_2d_nhwc_hwcf; f16 inputs, f32 accumulator and result
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x128x128x4xf16>, tensor<3x3x4x8xf16>) outs(%acc: tensor<2x126x126x8xf32>) -> tensor<2x126x126x8xf32>  // strides and dilations are all 1; output extent 126 = 128 - 3 + 1
+  return %result: tensor<2x126x126x8xf32>
+}
+func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f32(%lhs: tensor<2x128x128x3xf16>, %rhs: tensor<3x3x3x12xf16>, %acc: tensor<2x126x126x12xf32>) -> tensor<2x126x126x12xf32> {  // wraps a single linalg.conv_2d_nhwc_hwcf; f16 inputs, f32 accumulator and result
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x128x128x3xf16>, tensor<3x3x3x12xf16>) outs(%acc: tensor<2x126x126x12xf32>) -> tensor<2x126x126x12xf32>  // strides and dilations are all 1; output extent 126 = 128 - 3 + 1
+  return %result: tensor<2x126x126x12xf32>
+}
diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_large_calls.mlir b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_large_calls.mlir
new file mode 100644
index 0000000..cd7d928
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_large_calls.mlir
@@ -0,0 +1,108 @@
+builtin.module @calls attributes {
+  
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view  // bodiless declaration: device, four dims, element type and seed in, one buffer view out
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)  // bodiless declaration with no result
+func.func private @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // bodiless declaration; called by the _acc_0 test case
+func.func private @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // bodiless declaration; called by the _acc_1 test case
+
+func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f32_2_4_128_128_8_3_3_acc_0() attributes {  // test case: takes no arguments and returns nothing
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x4x128x128x8x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device  // passed to the tensor generators and to the result check
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 128 : i64
+  %input_dim2 = arith.constant 128 : i64
+  %input_dim3 = arith.constant 4 : i64  // equals %c below: input dims are given in n, h, w, c order
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 3 : i64
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 4 : i64
+  %kernel_dim3 = arith.constant 8 : i64  // equals %f below: kernel dims are given in kh, kw, c, f order
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 126 : i64  // 126 = 128 - 3 + 1 for the %h/%kh values below
+  %acc_dim2 = arith.constant 126 : i64
+  %acc_dim3 = arith.constant 8 : i64
+  %acc_element_type = hal.element_type<f32> : i32  // accumulator is f32 while input and kernel are f16
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 126 : i64
+  %acc_copy_dim2 = arith.constant 126 : i64
+  %acc_copy_dim3 = arith.constant 8 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 4 : i32  // same seed, dims and element type as %acc; presumably yields identical contents - confirm in generate_random_tensor
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // the module receives %acc_copy; %acc itself is only passed to the check call
+  %n = arith.constant 2 : i64
+  %c = arith.constant 4 : i64
+  %h = arith.constant 128 : i64
+  %w = arith.constant 128 : i64
+  %f = arith.constant 8 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 1 : i64  // NOTE(review): layout code is interpreted by check_conv2d_results; its meaning is not visible here
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f32_2_3_128_128_12_3_3_acc_1() attributes {  // test case: takes no arguments and returns nothing
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x3x128x128x12x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device  // passed to the tensor generators and to the result check
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 128 : i64
+  %input_dim2 = arith.constant 128 : i64
+  %input_dim3 = arith.constant 3 : i64  // equals %c below: input dims are given in n, h, w, c order
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 3 : i64
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 12 : i64  // equals %f below: the last kernel dim is the filter count
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 126 : i64  // 126 = 128 - 3 + 1 for the %h/%kh values below
+  %acc_dim2 = arith.constant 126 : i64
+  %acc_dim3 = arith.constant 12 : i64
+  %acc_element_type = hal.element_type<f32> : i32  // accumulator is f32 while input and kernel are f16
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 126 : i64
+  %acc_copy_dim2 = arith.constant 126 : i64
+  %acc_copy_dim3 = arith.constant 12 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 7 : i32  // same seed, dims and element type as %acc; presumably yields identical contents - confirm in generate_random_tensor
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // the module receives %acc_copy; %acc itself is only passed to the check call
+  %n = arith.constant 2 : i64
+  %c = arith.constant 3 : i64
+  %h = arith.constant 128 : i64
+  %w = arith.constant 128 : i64
+  %f = arith.constant 12 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 1 : i64  // NOTE(review): layout code is interpreted by check_conv2d_results; its meaning is not visible here
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+}
diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_medium.mlir b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_medium.mlir
new file mode 100644
index 0000000..d140187
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_medium.mlir
@@ -0,0 +1,12 @@
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%lhs: tensor<2x32x32x2xf16>, %rhs: tensor<3x3x2x2xf16>, %acc: tensor<2x30x30x2xf32>) -> tensor<2x30x30x2xf32> {  // wraps a single linalg.conv_2d_nhwc_hwcf; f16 inputs, f32 accumulator and result
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x2xf16>, tensor<3x3x2x2xf16>) outs(%acc: tensor<2x30x30x2xf32>) -> tensor<2x30x30x2xf32>  // strides and dilations are all 1; output extent 30 = 32 - 3 + 1
+  return %result: tensor<2x30x30x2xf32>
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f32(%lhs: tensor<2x32x32x2xf16>, %rhs: tensor<3x3x2x64xf16>, %acc: tensor<2x30x30x64xf32>) -> tensor<2x30x30x64xf32> {  // wraps a single linalg.conv_2d_nhwc_hwcf; f16 inputs, f32 accumulator and result
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x2xf16>, tensor<3x3x2x64xf16>) outs(%acc: tensor<2x30x30x64xf32>) -> tensor<2x30x30x64xf32>  // strides and dilations are all 1; output extent 30 = 32 - 3 + 1
+  return %result: tensor<2x30x30x64xf32>
+}
+func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f32(%lhs: tensor<2x32x32x16xf16>, %rhs: tensor<3x3x16x64xf16>, %acc: tensor<2x30x30x64xf32>) -> tensor<2x30x30x64xf32> {  // wraps a single linalg.conv_2d_nhwc_hwcf; f16 inputs, f32 accumulator and result
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x16xf16>, tensor<3x3x16x64xf16>) outs(%acc: tensor<2x30x30x64xf32>) -> tensor<2x30x30x64xf32>  // strides and dilations are all 1; output extent 30 = 32 - 3 + 1
+  return %result: tensor<2x30x30x64xf32>
+}
diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_medium_calls.mlir b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_medium_calls.mlir
new file mode 100644
index 0000000..548e7ad
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_medium_calls.mlir
@@ -0,0 +1,158 @@
+builtin.module @calls attributes {
+  
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view  // bodiless declaration: device, four dims, element type and seed in, one buffer view out
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)  // bodiless declaration with no result
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // bodiless declaration; called by the _acc_0 test case
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // bodiless declaration; called by the _acc_1 test case
+func.func private @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // bodiless declaration; called by the _acc_2 test case
+
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32_2_2_32_32_2_3_3_acc_0() attributes {  // test case: takes no arguments and returns nothing
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device  // passed to the tensor generators and to the result check
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 32 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 2 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 3 : i64
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 2 : i64
+  %kernel_dim3 = arith.constant 2 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 30 : i64  // 30 = 32 - 3 + 1 for the %h/%kh values below
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 2 : i64
+  %acc_element_type = hal.element_type<f32> : i32  // accumulator is f32 while input and kernel are f16
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 30 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 2 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 4 : i32  // same seed, dims and element type as %acc; presumably yields identical contents - confirm in generate_random_tensor
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // the module receives %acc_copy; %acc itself is only passed to the check call
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 2 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 1 : i64  // NOTE(review): layout code is interpreted by check_conv2d_results; its meaning is not visible here
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f32_2_2_32_32_64_3_3_acc_1() attributes {  // test case: takes no arguments and returns nothing
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x64x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device  // passed to the tensor generators and to the result check
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 32 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 2 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 3 : i64
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 2 : i64
+  %kernel_dim3 = arith.constant 64 : i64  // equals %f below: the last kernel dim is the filter count
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 30 : i64  // 30 = 32 - 3 + 1 for the %h/%kh values below
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 64 : i64
+  %acc_element_type = hal.element_type<f32> : i32  // accumulator is f32 while input and kernel are f16
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 30 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 64 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 7 : i32  // same seed, dims and element type as %acc; presumably yields identical contents - confirm in generate_random_tensor
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // the module receives %acc_copy; %acc itself is only passed to the check call
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 64 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 1 : i64  // NOTE(review): layout code is interpreted by check_conv2d_results; its meaning is not visible here
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f32_2_16_32_32_64_3_3_acc_2() attributes {  // test case: takes no arguments and returns nothing
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x16x32x32x64x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device  // passed to the tensor generators and to the result check
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 32 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 16 : i64  // equals %c below: input dims are given in n, h, w, c order
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 8 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 3 : i64
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 16 : i64
+  %kernel_dim3 = arith.constant 64 : i64  // equals %f below: kernel dims are given in kh, kw, c, f order
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 9 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 30 : i64  // 30 = 32 - 3 + 1 for the %h/%kh values below
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 64 : i64
+  %acc_element_type = hal.element_type<f32> : i32  // accumulator is f32 while input and kernel are f16
+  %acc_seed = arith.constant 10 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 30 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 64 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 10 : i32  // same seed, dims and element type as %acc; presumably yields identical contents - confirm in generate_random_tensor
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // the module receives %acc_copy; %acc itself is only passed to the check call
+  %n = arith.constant 2 : i64
+  %c = arith.constant 16 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 64 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 1 : i64  // NOTE(review): layout code is interpreted by check_conv2d_results; its meaning is not visible here
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+}
diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_small.mlir b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_small.mlir
new file mode 100644
index 0000000..59e9504
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_small.mlir
@@ -0,0 +1,12 @@
+func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f32(%lhs: tensor<1x1x1x1xf16>, %rhs: tensor<1x1x1x1xf16>, %acc: tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> {
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x1x1xf16>, tensor<1x1x1x1xf16>) outs(%acc: tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
+  return %result: tensor<1x1x1x1xf32>
+}
+func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f32(%lhs: tensor<1x16x16x1xf16>, %rhs: tensor<2x2x1x1xf16>, %acc: tensor<1x15x15x1xf32>) -> tensor<1x15x15x1xf32> {
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x16x16x1xf16>, tensor<2x2x1x1xf16>) outs(%acc: tensor<1x15x15x1xf32>) -> tensor<1x15x15x1xf32>
+  return %result: tensor<1x15x15x1xf32>
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%lhs: tensor<2x32x32x2xf16>, %rhs: tensor<3x3x2x2xf16>, %acc: tensor<2x30x30x2xf32>) -> tensor<2x30x30x2xf32> {
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x2xf16>, tensor<3x3x2x2xf16>) outs(%acc: tensor<2x30x30x2xf32>) -> tensor<2x30x30x2xf32>
+  return %result: tensor<2x30x30x2xf32>
+}
diff --git a/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_small_calls.mlir b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_small_calls.mlir
new file mode 100644
index 0000000..6a9ab15
--- /dev/null
+++ b/linalg_ops/convolution/generated/f16_nhwc_f16_hwcf_f32/conv2d_f16_nhwc_f16_hwcf_f32_small_calls.mlir
@@ -0,0 +1,158 @@
+builtin.module @calls attributes {
+  
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)
+func.func private @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+
+func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f32_1_1_1_1_1_1_1_acc_0() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x1x1x1x1x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 1 : i64
+  %input_dim1 = arith.constant 1 : i64
+  %input_dim2 = arith.constant 1 : i64
+  %input_dim3 = arith.constant 1 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 1 : i64
+  %kernel_dim1 = arith.constant 1 : i64
+  %kernel_dim2 = arith.constant 1 : i64
+  %kernel_dim3 = arith.constant 1 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 1 : i64
+  %acc_dim1 = arith.constant 1 : i64
+  %acc_dim2 = arith.constant 1 : i64
+  %acc_dim3 = arith.constant 1 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 1 : i64
+  %acc_copy_dim1 = arith.constant 1 : i64
+  %acc_copy_dim2 = arith.constant 1 : i64
+  %acc_copy_dim3 = arith.constant 1 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 4 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 1 : i64
+  %c = arith.constant 1 : i64
+  %h = arith.constant 1 : i64
+  %w = arith.constant 1 : i64
+  %f = arith.constant 1 : i64
+  %kh = arith.constant 1 : i64
+  %kw = arith.constant 1 : i64
+  %layout = arith.constant 1 : i64
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f32_1_1_16_16_1_2_2_acc_1() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x16x16x1x2x2"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 1 : i64
+  %input_dim1 = arith.constant 16 : i64
+  %input_dim2 = arith.constant 16 : i64
+  %input_dim3 = arith.constant 1 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 2 : i64
+  %kernel_dim1 = arith.constant 2 : i64
+  %kernel_dim2 = arith.constant 1 : i64
+  %kernel_dim3 = arith.constant 1 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 1 : i64
+  %acc_dim1 = arith.constant 15 : i64
+  %acc_dim2 = arith.constant 15 : i64
+  %acc_dim3 = arith.constant 1 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 1 : i64
+  %acc_copy_dim1 = arith.constant 15 : i64
+  %acc_copy_dim2 = arith.constant 15 : i64
+  %acc_copy_dim3 = arith.constant 1 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 7 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 1 : i64
+  %c = arith.constant 1 : i64
+  %h = arith.constant 16 : i64
+  %w = arith.constant 16 : i64
+  %f = arith.constant 1 : i64
+  %kh = arith.constant 2 : i64
+  %kw = arith.constant 2 : i64
+  %layout = arith.constant 1 : i64
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32_2_2_32_32_2_3_3_acc_2() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 32 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 2 : i64
+  %input_element_type = hal.element_type<f16> : i32
+  %input_seed = arith.constant 8 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 3 : i64
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 2 : i64
+  %kernel_dim3 = arith.constant 2 : i64
+  %kernel_element_type = hal.element_type<f16> : i32
+  %kernel_seed = arith.constant 9 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 30 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 2 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 10 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 30 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 2 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 10 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f16_f16_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 2 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 1 : i64
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+}
diff --git a/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_large.mlir b/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_large.mlir
new file mode 100644
index 0000000..1714e5b
--- /dev/null
+++ b/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_large.mlir
@@ -0,0 +1,8 @@
+func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32(%lhs: tensor<2x4x128x128xf32>, %rhs: tensor<8x4x3x3xf32>, %acc: tensor<2x8x126x126xf32>) -> tensor<2x8x126x126xf32> {
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x4x128x128xf32>, tensor<8x4x3x3xf32>) outs(%acc: tensor<2x8x126x126xf32>) -> tensor<2x8x126x126xf32>
+  return %result: tensor<2x8x126x126xf32>
+}
+func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32(%lhs: tensor<2x3x128x128xf32>, %rhs: tensor<12x3x3x3xf32>, %acc: tensor<2x12x126x126xf32>) -> tensor<2x12x126x126xf32> {
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x3x128x128xf32>, tensor<12x3x3x3xf32>) outs(%acc: tensor<2x12x126x126xf32>) -> tensor<2x12x126x126xf32>
+  return %result: tensor<2x12x126x126xf32>
+}
diff --git a/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_large_calls.mlir b/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_large_calls.mlir
new file mode 100644
index 0000000..ce81bc5
--- /dev/null
+++ b/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_large_calls.mlir
@@ -0,0 +1,108 @@
+builtin.module @calls attributes {
+  
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)
+func.func private @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+
+func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32_2_4_128_128_8_3_3_acc_0() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x4x128x128x8x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 4 : i64
+  %input_dim2 = arith.constant 128 : i64
+  %input_dim3 = arith.constant 128 : i64
+  %input_element_type = hal.element_type<f32> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 8 : i64
+  %kernel_dim1 = arith.constant 4 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f32> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 8 : i64
+  %acc_dim2 = arith.constant 126 : i64
+  %acc_dim3 = arith.constant 126 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 8 : i64
+  %acc_copy_dim2 = arith.constant 126 : i64
+  %acc_copy_dim3 = arith.constant 126 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 4 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 2 : i64
+  %c = arith.constant 4 : i64
+  %h = arith.constant 128 : i64
+  %w = arith.constant 128 : i64
+  %f = arith.constant 8 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32_2_3_128_128_12_3_3_acc_1() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x3x128x128x12x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 3 : i64
+  %input_dim2 = arith.constant 128 : i64
+  %input_dim3 = arith.constant 128 : i64
+  %input_element_type = hal.element_type<f32> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 12 : i64
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f32> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 12 : i64
+  %acc_dim2 = arith.constant 126 : i64
+  %acc_dim3 = arith.constant 126 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 12 : i64
+  %acc_copy_dim2 = arith.constant 126 : i64
+  %acc_copy_dim3 = arith.constant 126 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 7 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 2 : i64
+  %c = arith.constant 3 : i64
+  %h = arith.constant 128 : i64
+  %w = arith.constant 128 : i64
+  %f = arith.constant 12 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+}
diff --git a/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_medium.mlir b/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_medium.mlir
new file mode 100644
index 0000000..d074f1f
--- /dev/null
+++ b/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_medium.mlir
@@ -0,0 +1,12 @@
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%lhs: tensor<2x2x32x32xf32>, %rhs: tensor<2x2x3x3xf32>, %acc: tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32> {
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf32>, tensor<2x2x3x3xf32>) outs(%acc: tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32>
+  return %result: tensor<2x2x30x30xf32>
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32(%lhs: tensor<2x2x32x32xf32>, %rhs: tensor<64x2x3x3xf32>, %acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> {
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf32>, tensor<64x2x3x3xf32>) outs(%acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32>
+  return %result: tensor<2x64x30x30xf32>
+}
+func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f32_f32_f32(%lhs: tensor<2x16x32x32xf32>, %rhs: tensor<64x16x3x3xf32>, %acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32> {
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x16x32x32xf32>, tensor<64x16x3x3xf32>) outs(%acc: tensor<2x64x30x30xf32>) -> tensor<2x64x30x30xf32>
+  return %result: tensor<2x64x30x30xf32>
+}
diff --git a/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_medium_calls.mlir b/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_medium_calls.mlir
new file mode 100644
index 0000000..092a825
--- /dev/null
+++ b/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_medium_calls.mlir
@@ -0,0 +1,158 @@
+builtin.module @calls attributes {
+  
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32_2_2_32_32_2_3_3_acc_0() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 2 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 32 : i64
+  %input_element_type = hal.element_type<f32> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 2 : i64
+  %kernel_dim1 = arith.constant 2 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f32> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 2 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 30 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 2 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 30 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 4 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 2 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32_2_2_32_32_64_3_3_acc_1() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x64x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 2 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 32 : i64
+  %input_element_type = hal.element_type<f32> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 64 : i64
+  %kernel_dim1 = arith.constant 2 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f32> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 64 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 30 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 64 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 30 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 7 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 64 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f32_f32_f32_2_16_32_32_64_3_3_acc_2() attributes {
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x16x32x32x64x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64
+  %input_dim1 = arith.constant 16 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 32 : i64
+  %input_element_type = hal.element_type<f32> : i32
+  %input_seed = arith.constant 8 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 64 : i64
+  %kernel_dim1 = arith.constant 16 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f32> : i32
+  %kernel_seed = arith.constant 9 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64
+  %acc_dim1 = arith.constant 64 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 30 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 10 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 64 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 30 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 10 : i32
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view
+  %n = arith.constant 2 : i64
+  %c = arith.constant 16 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 64 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64
+  %sh = arith.constant 1 : i64
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+}
diff --git a/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_small.mlir b/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_small.mlir
new file mode 100644
index 0000000..a4a08ad
--- /dev/null
+++ b/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_small.mlir
@@ -0,0 +1,12 @@
+func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32(%lhs: tensor<1x1x1x1xf32>, %rhs: tensor<1x1x1x1xf32>, %acc: tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> {  // Smallest case: input, kernel and accumulator are all 1x1x1x1 f32.
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%acc: tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>  // Unit strides/dilations; %acc is the outs operand of the conv.
+  return %result: tensor<1x1x1x1xf32>
+}
+func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32(%lhs: tensor<1x1x16x16xf32>, %rhs: tensor<1x1x2x2xf32>, %acc: tensor<1x1x15x15xf32>) -> tensor<1x1x15x15xf32> {  // 16x16 input with a 2x2 kernel; output spatial size is 16 - 2 + 1 = 15.
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x16x16xf32>, tensor<1x1x2x2xf32>) outs(%acc: tensor<1x1x15x15xf32>) -> tensor<1x1x15x15xf32>  // Unit strides/dilations; %acc is the outs operand of the conv.
+  return %result: tensor<1x1x15x15xf32>
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%lhs: tensor<2x2x32x32xf32>, %rhs: tensor<2x2x3x3xf32>, %acc: tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32> {  // 32x32 input with a 3x3 kernel; output spatial size is 32 - 3 + 1 = 30.
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xf32>, tensor<2x2x3x3xf32>) outs(%acc: tensor<2x2x30x30xf32>) -> tensor<2x2x30x30xf32>  // Unit strides/dilations; %acc is the outs operand of the conv.
+  return %result: tensor<2x2x30x30xf32>
+}
diff --git a/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_small_calls.mlir b/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_small_calls.mlir
new file mode 100644
index 0000000..9f01130
--- /dev/null
+++ b/linalg_ops/convolution/generated/f32_nchw_f32_fchw_f32/conv2d_f32_nchw_f32_fchw_f32_small_calls.mlir
@@ -0,0 +1,158 @@
+builtin.module @calls attributes {  // Driver module: one public func per conv2d test case, calling the external functions declared below.
+  
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view  // External declaration (no body here): four dims, an element type and a seed in, a buffer view out.
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)  // External declaration (no body here): conv parameters plus input/kernel/acc and the actual result.
+func.func private @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // Presumably the same-named conv funcs from the sibling conv2d_*_small.mlir, exposed on buffer views — confirm.
+func.func private @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+
+func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32_1_1_1_1_1_1_1_acc_0() attributes {  // Test case 0: N=1, C=1, H=W=1, F=1, KH=KW=1.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x1x1x1x1x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 1 : i64  // Input dims 1x1x1x1 (N, C, H, W order in this nchw file).
+  %input_dim1 = arith.constant 1 : i64
+  %input_dim2 = arith.constant 1 : i64
+  %input_dim3 = arith.constant 1 : i64
+  %input_element_type = hal.element_type<f32> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 1 : i64  // Kernel dims 1x1x1x1 (F, C, KH, KW order in this fchw file).
+  %kernel_dim1 = arith.constant 1 : i64
+  %kernel_dim2 = arith.constant 1 : i64
+  %kernel_dim3 = arith.constant 1 : i64
+  %kernel_element_type = hal.element_type<f32> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 1 : i64  // Accumulator dims 1x1x1x1, matching the conv result type.
+  %acc_dim1 = arith.constant 1 : i64
+  %acc_dim2 = arith.constant 1 : i64
+  %acc_dim3 = arith.constant 1 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 1 : i64  // %acc_copy is built with the same dims, element type and seed as %acc.
+  %acc_copy_dim1 = arith.constant 1 : i64
+  %acc_copy_dim2 = arith.constant 1 : i64
+  %acc_copy_dim3 = arith.constant 1 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 4 : i32  // Same seed as %acc_seed; presumably yields identical contents — confirm the generator is deterministic per seed.
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // The module call receives %acc_copy, not %acc.
+  %n = arith.constant 1 : i64
+  %c = arith.constant 1 : i64
+  %h = arith.constant 1 : i64
+  %w = arith.constant 1 : i64
+  %f = arith.constant 1 : i64
+  %kh = arith.constant 1 : i64
+  %kw = arith.constant 1 : i64
+  %layout = arith.constant 0 : i64  // Layout code handed to the checker; every case in this nchw_fchw file passes 0.
+  %sh = arith.constant 1 : i64  // sh/sw/dh/dw are all 1 — presumably strides and dilations, matching the [1, 1] attributes on the linalg op; confirm against the checker.
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()  // The checker receives %acc (the one not passed to the module call) together with %result.
+  return
+}
+func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32_1_1_16_16_1_2_2_acc_1() attributes {  // Test case 1: N=1, C=1, H=W=16, F=1, KH=KW=2.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x16x16x1x2x2"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 1 : i64  // Input dims 1x1x16x16 (N, C, H, W).
+  %input_dim1 = arith.constant 1 : i64
+  %input_dim2 = arith.constant 16 : i64
+  %input_dim3 = arith.constant 16 : i64
+  %input_element_type = hal.element_type<f32> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 1 : i64  // Kernel dims 1x1x2x2 (F, C, KH, KW).
+  %kernel_dim1 = arith.constant 1 : i64
+  %kernel_dim2 = arith.constant 2 : i64
+  %kernel_dim3 = arith.constant 2 : i64
+  %kernel_element_type = hal.element_type<f32> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 1 : i64  // Accumulator dims 1x1x15x15; 15 = 16 - 2 + 1.
+  %acc_dim1 = arith.constant 1 : i64
+  %acc_dim2 = arith.constant 15 : i64
+  %acc_dim3 = arith.constant 15 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 1 : i64  // %acc_copy is built with the same dims, element type and seed as %acc.
+  %acc_copy_dim1 = arith.constant 1 : i64
+  %acc_copy_dim2 = arith.constant 15 : i64
+  %acc_copy_dim3 = arith.constant 15 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 7 : i32  // Same seed as %acc_seed; presumably yields identical contents — confirm the generator is deterministic per seed.
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // The module call receives %acc_copy, not %acc.
+  %n = arith.constant 1 : i64
+  %c = arith.constant 1 : i64
+  %h = arith.constant 16 : i64
+  %w = arith.constant 16 : i64
+  %f = arith.constant 1 : i64
+  %kh = arith.constant 2 : i64
+  %kw = arith.constant 2 : i64
+  %layout = arith.constant 0 : i64  // Layout code handed to the checker; every case in this nchw_fchw file passes 0.
+  %sh = arith.constant 1 : i64  // sh/sw/dh/dw are all 1 — presumably strides and dilations, matching the [1, 1] attributes on the linalg op; confirm against the checker.
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()  // The checker receives %acc (the one not passed to the module call) together with %result.
+  return
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32_2_2_32_32_2_3_3_acc_2() attributes {  // Test case 2: N=2, C=2, H=W=32, F=2, KH=KW=3.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64  // Input dims 2x2x32x32 (N, C, H, W).
+  %input_dim1 = arith.constant 2 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 32 : i64
+  %input_element_type = hal.element_type<f32> : i32
+  %input_seed = arith.constant 8 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 2 : i64  // Kernel dims 2x2x3x3 (F, C, KH, KW).
+  %kernel_dim1 = arith.constant 2 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<f32> : i32
+  %kernel_seed = arith.constant 9 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64  // Accumulator dims 2x2x30x30; 30 = 32 - 3 + 1.
+  %acc_dim1 = arith.constant 2 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 30 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 10 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64  // %acc_copy is built with the same dims, element type and seed as %acc.
+  %acc_copy_dim1 = arith.constant 2 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 30 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 10 : i32  // Same seed as %acc_seed; presumably yields identical contents — confirm the generator is deterministic per seed.
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // The module call receives %acc_copy, not %acc.
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 2 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64  // Layout code handed to the checker; every case in this nchw_fchw file passes 0.
+  %sh = arith.constant 1 : i64  // sh/sw/dh/dw are all 1 — presumably strides and dilations, matching the [1, 1] attributes on the linalg op; confirm against the checker.
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()  // The checker receives %acc (the one not passed to the module call) together with %result.
+  return
+}
+}
diff --git a/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_large.mlir b/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_large.mlir
new file mode 100644
index 0000000..0cdae51
--- /dev/null
+++ b/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_large.mlir
@@ -0,0 +1,8 @@
+func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32(%lhs: tensor<2x128x128x4xf32>, %rhs: tensor<3x3x4x8xf32>, %acc: tensor<2x126x126x8xf32>) -> tensor<2x126x126x8xf32> {  // 128x128 input with 4 channels, 3x3 kernel with 8 filters; output spatial size is 128 - 3 + 1 = 126.
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x128x128x4xf32>, tensor<3x3x4x8xf32>) outs(%acc: tensor<2x126x126x8xf32>) -> tensor<2x126x126x8xf32>  // Unit strides/dilations; %acc is the outs operand of the conv.
+  return %result: tensor<2x126x126x8xf32>
+}
+func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32(%lhs: tensor<2x128x128x3xf32>, %rhs: tensor<3x3x3x12xf32>, %acc: tensor<2x126x126x12xf32>) -> tensor<2x126x126x12xf32> {  // 128x128 input with 3 channels, 3x3 kernel with 12 filters; output spatial size is 128 - 3 + 1 = 126.
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x128x128x3xf32>, tensor<3x3x3x12xf32>) outs(%acc: tensor<2x126x126x12xf32>) -> tensor<2x126x126x12xf32>  // Unit strides/dilations; %acc is the outs operand of the conv.
+  return %result: tensor<2x126x126x12xf32>
+}
diff --git a/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_large_calls.mlir b/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_large_calls.mlir
new file mode 100644
index 0000000..854a307
--- /dev/null
+++ b/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_large_calls.mlir
@@ -0,0 +1,108 @@
+builtin.module @calls attributes {  // Driver module: one public func per conv2d test case, calling the external functions declared below.
+  
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view  // External declaration (no body here): four dims, an element type and a seed in, a buffer view out.
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)  // External declaration (no body here): conv parameters plus input/kernel/acc and the actual result.
+func.func private @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // Presumably the same-named conv funcs from the sibling conv2d_*_large.mlir, exposed on buffer views — confirm.
+func.func private @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+
+func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32_2_4_128_128_8_3_3_acc_0() attributes {  // Test case 0: N=2, C=4, H=W=128, F=8, KH=KW=3.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x4x128x128x8x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64  // Input dims 2x128x128x4 (N, H, W, C order in this nhwc file).
+  %input_dim1 = arith.constant 128 : i64
+  %input_dim2 = arith.constant 128 : i64
+  %input_dim3 = arith.constant 4 : i64
+  %input_element_type = hal.element_type<f32> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 3 : i64  // Kernel dims 3x3x4x8 (KH, KW, C, F order in this hwcf file).
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 4 : i64
+  %kernel_dim3 = arith.constant 8 : i64
+  %kernel_element_type = hal.element_type<f32> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64  // Accumulator dims 2x126x126x8; 126 = 128 - 3 + 1.
+  %acc_dim1 = arith.constant 126 : i64
+  %acc_dim2 = arith.constant 126 : i64
+  %acc_dim3 = arith.constant 8 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64  // %acc_copy is built with the same dims, element type and seed as %acc.
+  %acc_copy_dim1 = arith.constant 126 : i64
+  %acc_copy_dim2 = arith.constant 126 : i64
+  %acc_copy_dim3 = arith.constant 8 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 4 : i32  // Same seed as %acc_seed; presumably yields identical contents — confirm the generator is deterministic per seed.
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // The module call receives %acc_copy, not %acc.
+  %n = arith.constant 2 : i64
+  %c = arith.constant 4 : i64
+  %h = arith.constant 128 : i64
+  %w = arith.constant 128 : i64
+  %f = arith.constant 8 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 1 : i64  // Layout code handed to the checker; every case in this nhwc_hwcf file passes 1.
+  %sh = arith.constant 1 : i64  // sh/sw/dh/dw are all 1 — presumably strides and dilations, matching the [1, 1] attributes on the linalg op; confirm against the checker.
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()  // The checker receives %acc (the one not passed to the module call) together with %result.
+  return
+}
+func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32_2_3_128_128_12_3_3_acc_1() attributes {  // Test case 1: N=2, C=3, H=W=128, F=12, KH=KW=3.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x3x128x128x12x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64  // Input dims 2x128x128x3 (N, H, W, C).
+  %input_dim1 = arith.constant 128 : i64
+  %input_dim2 = arith.constant 128 : i64
+  %input_dim3 = arith.constant 3 : i64
+  %input_element_type = hal.element_type<f32> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 3 : i64  // Kernel dims 3x3x3x12 (KH, KW, C, F).
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 12 : i64
+  %kernel_element_type = hal.element_type<f32> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64  // Accumulator dims 2x126x126x12; 126 = 128 - 3 + 1.
+  %acc_dim1 = arith.constant 126 : i64
+  %acc_dim2 = arith.constant 126 : i64
+  %acc_dim3 = arith.constant 12 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64  // %acc_copy is built with the same dims, element type and seed as %acc.
+  %acc_copy_dim1 = arith.constant 126 : i64
+  %acc_copy_dim2 = arith.constant 126 : i64
+  %acc_copy_dim3 = arith.constant 12 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 7 : i32  // Same seed as %acc_seed; presumably yields identical contents — confirm the generator is deterministic per seed.
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // The module call receives %acc_copy, not %acc.
+  %n = arith.constant 2 : i64
+  %c = arith.constant 3 : i64
+  %h = arith.constant 128 : i64
+  %w = arith.constant 128 : i64
+  %f = arith.constant 12 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 1 : i64  // Layout code handed to the checker; every case in this nhwc_hwcf file passes 1.
+  %sh = arith.constant 1 : i64  // sh/sw/dh/dw are all 1 — presumably strides and dilations, matching the [1, 1] attributes on the linalg op; confirm against the checker.
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()  // The checker receives %acc (the one not passed to the module call) together with %result.
+  return
+}
+}
diff --git a/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_medium.mlir b/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_medium.mlir
new file mode 100644
index 0000000..393c487
--- /dev/null
+++ b/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_medium.mlir
@@ -0,0 +1,12 @@
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%lhs: tensor<2x32x32x2xf32>, %rhs: tensor<3x3x2x2xf32>, %acc: tensor<2x30x30x2xf32>) -> tensor<2x30x30x2xf32> {  // 32x32 input with 2 channels, 3x3 kernel with 2 filters; output spatial size is 32 - 3 + 1 = 30.
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x2xf32>, tensor<3x3x2x2xf32>) outs(%acc: tensor<2x30x30x2xf32>) -> tensor<2x30x30x2xf32>  // Unit strides/dilations; %acc is the outs operand of the conv.
+  return %result: tensor<2x30x30x2xf32>
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32(%lhs: tensor<2x32x32x2xf32>, %rhs: tensor<3x3x2x64xf32>, %acc: tensor<2x30x30x64xf32>) -> tensor<2x30x30x64xf32> {  // 32x32 input with 2 channels, 3x3 kernel with 64 filters; output spatial size is 32 - 3 + 1 = 30.
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x2xf32>, tensor<3x3x2x64xf32>) outs(%acc: tensor<2x30x30x64xf32>) -> tensor<2x30x30x64xf32>  // Unit strides/dilations; %acc is the outs operand of the conv.
+  return %result: tensor<2x30x30x64xf32>
+}
+func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f32_f32_f32(%lhs: tensor<2x32x32x16xf32>, %rhs: tensor<3x3x16x64xf32>, %acc: tensor<2x30x30x64xf32>) -> tensor<2x30x30x64xf32> {  // 32x32 input with 16 channels, 3x3 kernel with 64 filters; output spatial size is 32 - 3 + 1 = 30.
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x16xf32>, tensor<3x3x16x64xf32>) outs(%acc: tensor<2x30x30x64xf32>) -> tensor<2x30x30x64xf32>  // Unit strides/dilations; %acc is the outs operand of the conv.
+  return %result: tensor<2x30x30x64xf32>
+}
diff --git a/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_medium_calls.mlir b/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_medium_calls.mlir
new file mode 100644
index 0000000..5043f0d
--- /dev/null
+++ b/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_medium_calls.mlir
@@ -0,0 +1,158 @@
+builtin.module @calls attributes {  // Driver module: one public func per conv2d test case, calling the external functions declared below.
+  
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view  // External declaration (no body here): four dims, an element type and a seed in, a buffer view out.
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)  // External declaration (no body here): conv parameters plus input/kernel/acc and the actual result.
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // Presumably the same-named conv funcs from the sibling conv2d_*_medium.mlir, exposed on buffer views — confirm.
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+func.func private @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view
+
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32_2_2_32_32_2_3_3_acc_0() attributes {  // Test case 0: N=2, C=2, H=W=32, F=2, KH=KW=3.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64  // Input dims 2x32x32x2 (N, H, W, C order in this nhwc file).
+  %input_dim1 = arith.constant 32 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 2 : i64
+  %input_element_type = hal.element_type<f32> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 3 : i64  // Kernel dims 3x3x2x2 (KH, KW, C, F order in this hwcf file).
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 2 : i64
+  %kernel_dim3 = arith.constant 2 : i64
+  %kernel_element_type = hal.element_type<f32> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64  // Accumulator dims 2x30x30x2; 30 = 32 - 3 + 1.
+  %acc_dim1 = arith.constant 30 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 2 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64  // %acc_copy is built with the same dims, element type and seed as %acc.
+  %acc_copy_dim1 = arith.constant 30 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 2 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 4 : i32  // Same seed as %acc_seed; presumably yields identical contents — confirm the generator is deterministic per seed.
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // The module call receives %acc_copy, not %acc.
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 2 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 1 : i64  // Layout code handed to the checker; every case in this nhwc_hwcf file passes 1.
+  %sh = arith.constant 1 : i64  // sh/sw/dh/dw are all 1 — presumably strides and dilations, matching the [1, 1] attributes on the linalg op; confirm against the checker.
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()  // The checker receives %acc (the one not passed to the module call) together with %result.
+  return
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32_2_2_32_32_64_3_3_acc_1() attributes {  // Test case 1: N=2, C=2, H=W=32, F=64, KH=KW=3.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x64x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64  // Input dims 2x32x32x2 (N, H, W, C).
+  %input_dim1 = arith.constant 32 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 2 : i64
+  %input_element_type = hal.element_type<f32> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 3 : i64  // Kernel dims 3x3x2x64 (KH, KW, C, F).
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 2 : i64
+  %kernel_dim3 = arith.constant 64 : i64
+  %kernel_element_type = hal.element_type<f32> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64  // Accumulator dims 2x30x30x64; 30 = 32 - 3 + 1.
+  %acc_dim1 = arith.constant 30 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 64 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64  // %acc_copy is built with the same dims, element type and seed as %acc.
+  %acc_copy_dim1 = arith.constant 30 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 64 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 7 : i32  // Same seed as %acc_seed; presumably yields identical contents — confirm the generator is deterministic per seed.
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // The module call receives %acc_copy, not %acc.
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 64 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 1 : i64  // Layout code handed to the checker; every case in this nhwc_hwcf file passes 1.
+  %sh = arith.constant 1 : i64  // sh/sw/dh/dw are all 1 — presumably strides and dilations, matching the [1, 1] attributes on the linalg op; confirm against the checker.
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()  // The checker receives %acc (the one not passed to the module call) together with %result.
+  return
+}
+func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f32_f32_f32_2_16_32_32_64_3_3_acc_2() attributes {  // Test case 2: N=2, C=16, H=W=32, F=64, KH=KW=3.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x16x32x32x64x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64  // Input dims 2x32x32x16 (N, H, W, C).
+  %input_dim1 = arith.constant 32 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 16 : i64
+  %input_element_type = hal.element_type<f32> : i32
+  %input_seed = arith.constant 8 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 3 : i64  // Kernel dims 3x3x16x64 (KH, KW, C, F).
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 16 : i64
+  %kernel_dim3 = arith.constant 64 : i64
+  %kernel_element_type = hal.element_type<f32> : i32
+  %kernel_seed = arith.constant 9 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64  // Accumulator dims 2x30x30x64; 30 = 32 - 3 + 1.
+  %acc_dim1 = arith.constant 30 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 64 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 10 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64  // %acc_copy is built with the same dims, element type and seed as %acc.
+  %acc_copy_dim1 = arith.constant 30 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 64 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 10 : i32  // Same seed as %acc_seed; presumably yields identical contents — confirm the generator is deterministic per seed.
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // The module call receives %acc_copy, not %acc.
+  %n = arith.constant 2 : i64
+  %c = arith.constant 16 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 64 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 1 : i64  // Layout code handed to the checker; every case in this nhwc_hwcf file passes 1.
+  %sh = arith.constant 1 : i64  // sh/sw/dh/dw are all 1 — presumably strides and dilations, matching the [1, 1] attributes on the linalg op; confirm against the checker.
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()  // The checker receives %acc (the one not passed to the module call) together with %result.
+  return
+}
+}
diff --git a/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_small.mlir b/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_small.mlir
new file mode 100644
index 0000000..ea9d92c
--- /dev/null
+++ b/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_small.mlir
@@ -0,0 +1,12 @@
+func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32(%lhs: tensor<1x1x1x1xf32>, %rhs: tensor<1x1x1x1xf32>, %acc: tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> {  // NHWC input (%lhs) x HWCF kernel (%rhs), with %acc as the outs operand
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) outs(%acc: tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32>
+  return %result: tensor<1x1x1x1xf32>  // output H/W: 1 = 1 - 1 + 1 (unit stride and dilation)
+}
+func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32(%lhs: tensor<1x16x16x1xf32>, %rhs: tensor<2x2x1x1xf32>, %acc: tensor<1x15x15x1xf32>) -> tensor<1x15x15x1xf32> {  // NHWC input (%lhs) x HWCF kernel (%rhs), with %acc as the outs operand
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x16x16x1xf32>, tensor<2x2x1x1xf32>) outs(%acc: tensor<1x15x15x1xf32>) -> tensor<1x15x15x1xf32>
+  return %result: tensor<1x15x15x1xf32>  // output H/W: 15 = 16 - 2 + 1 (unit stride and dilation)
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%lhs: tensor<2x32x32x2xf32>, %rhs: tensor<3x3x2x2xf32>, %acc: tensor<2x30x30x2xf32>) -> tensor<2x30x30x2xf32> {  // NHWC input (%lhs) x HWCF kernel (%rhs), with %acc as the outs operand
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x2xf32>, tensor<3x3x2x2xf32>) outs(%acc: tensor<2x30x30x2xf32>) -> tensor<2x30x30x2xf32>
+  return %result: tensor<2x30x30x2xf32>  // output H/W: 30 = 32 - 3 + 1 (unit stride and dilation)
+}
diff --git a/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_small_calls.mlir b/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_small_calls.mlir
new file mode 100644
index 0000000..b25c720
--- /dev/null
+++ b/linalg_ops/convolution/generated/f32_nhwc_f32_hwcf_f32/conv2d_f32_nhwc_f32_hwcf_f32_small_calls.mlir
@@ -0,0 +1,158 @@
+builtin.module @calls attributes {
+  
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view  // declaration only (no body in this file): four dims, element type and seed in, buffer view out
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)  // declaration only: conv parameters, the three operands and the actual result; returns nothing
+func.func private @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // declaration of a conv called by the test cases below; body is not in this file
+func.func private @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // declaration of a conv called by the test cases below; body is not in this file
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // declaration of a conv called by the test cases below; body is not in this file
+
+func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32_1_1_1_1_1_1_1_acc_0() attributes {  // Test case: generate operands, run the module conv, hand everything to the checker.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x1x1x1x1x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 1 : i64  // input shape, N x H x W x C
+  %input_dim1 = arith.constant 1 : i64
+  %input_dim2 = arith.constant 1 : i64
+  %input_dim3 = arith.constant 1 : i64
+  %input_element_type = hal.element_type<f32> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 1 : i64  // kernel shape, KH x KW x C x F
+  %kernel_dim1 = arith.constant 1 : i64
+  %kernel_dim2 = arith.constant 1 : i64
+  %kernel_dim3 = arith.constant 1 : i64
+  %kernel_element_type = hal.element_type<f32> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 1 : i64  // accumulator/output shape, N x OH x OW x F
+  %acc_dim1 = arith.constant 1 : i64
+  %acc_dim2 = arith.constant 1 : i64
+  %acc_dim3 = arith.constant 1 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 1 : i64
+  %acc_copy_dim1 = arith.constant 1 : i64
+  %acc_copy_dim2 = arith.constant 1 : i64
+  %acc_copy_dim3 = arith.constant 1 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 4 : i32  // same seed and shape as %acc; presumably yields identical contents - TODO confirm the generator is deterministic
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // %acc_copy, not %acc, goes to the conv; %acc itself is passed to the checker below
+  %n = arith.constant 1 : i64
+  %c = arith.constant 1 : i64
+  %h = arith.constant 1 : i64
+  %w = arith.constant 1 : i64
+  %f = arith.constant 1 : i64
+  %kh = arith.constant 1 : i64
+  %kw = arith.constant 1 : i64
+  %layout = arith.constant 1 : i64  // layout code for the checker; 1 in this nhwc/hwcf suite
+  %sh = arith.constant 1 : i64  // presumably stride (sh, sw) and dilation (dh, dw) - confirm against the checker
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32_1_1_16_16_1_2_2_acc_1() attributes {  // Test case: generate operands, run the module conv, hand everything to the checker.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x16x16x1x2x2"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 1 : i64  // input shape, N x H x W x C
+  %input_dim1 = arith.constant 16 : i64
+  %input_dim2 = arith.constant 16 : i64
+  %input_dim3 = arith.constant 1 : i64
+  %input_element_type = hal.element_type<f32> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 2 : i64  // kernel shape, KH x KW x C x F
+  %kernel_dim1 = arith.constant 2 : i64
+  %kernel_dim2 = arith.constant 1 : i64
+  %kernel_dim3 = arith.constant 1 : i64
+  %kernel_element_type = hal.element_type<f32> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 1 : i64  // accumulator/output shape, N x OH x OW x F
+  %acc_dim1 = arith.constant 15 : i64
+  %acc_dim2 = arith.constant 15 : i64
+  %acc_dim3 = arith.constant 1 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 1 : i64
+  %acc_copy_dim1 = arith.constant 15 : i64
+  %acc_copy_dim2 = arith.constant 15 : i64
+  %acc_copy_dim3 = arith.constant 1 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 7 : i32  // same seed and shape as %acc; presumably yields identical contents - TODO confirm the generator is deterministic
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // %acc_copy, not %acc, goes to the conv; %acc itself is passed to the checker below
+  %n = arith.constant 1 : i64
+  %c = arith.constant 1 : i64
+  %h = arith.constant 16 : i64
+  %w = arith.constant 16 : i64
+  %f = arith.constant 1 : i64
+  %kh = arith.constant 2 : i64
+  %kw = arith.constant 2 : i64
+  %layout = arith.constant 1 : i64  // layout code for the checker; 1 in this nhwc/hwcf suite
+  %sh = arith.constant 1 : i64  // presumably stride (sh, sw) and dilation (dh, dw) - confirm against the checker
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32_2_2_32_32_2_3_3_acc_2() attributes {  // Test case: generate operands, run the module conv, hand everything to the checker.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64  // input shape, N x H x W x C
+  %input_dim1 = arith.constant 32 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 2 : i64
+  %input_element_type = hal.element_type<f32> : i32
+  %input_seed = arith.constant 8 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 3 : i64  // kernel shape, KH x KW x C x F
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 2 : i64
+  %kernel_dim3 = arith.constant 2 : i64
+  %kernel_element_type = hal.element_type<f32> : i32
+  %kernel_seed = arith.constant 9 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64  // accumulator/output shape, N x OH x OW x F
+  %acc_dim1 = arith.constant 30 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 2 : i64
+  %acc_element_type = hal.element_type<f32> : i32
+  %acc_seed = arith.constant 10 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 30 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 2 : i64
+  %acc_copy_element_type = hal.element_type<f32> : i32
+  %acc_copy_seed = arith.constant 10 : i32  // same seed and shape as %acc; presumably yields identical contents - TODO confirm the generator is deterministic
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_f32_f32_f32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // %acc_copy, not %acc, goes to the conv; %acc itself is passed to the checker below
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 2 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 1 : i64  // layout code for the checker; 1 in this nhwc/hwcf suite
+  %sh = arith.constant 1 : i64  // presumably stride (sh, sw) and dilation (dh, dw) - confirm against the checker
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+}
diff --git a/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_large.mlir b/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_large.mlir
new file mode 100644
index 0000000..706848a
--- /dev/null
+++ b/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_large.mlir
@@ -0,0 +1,8 @@
+func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_i8_i8_i32(%lhs: tensor<2x4x128x128xi8>, %rhs: tensor<8x4x3x3xi8>, %acc: tensor<2x8x126x126xi32>) -> tensor<2x8x126x126xi32> {  // NCHW i8 input (%lhs) x FCHW i8 kernel (%rhs), with the i32 %acc as the outs operand
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x4x128x128xi8>, tensor<8x4x3x3xi8>) outs(%acc: tensor<2x8x126x126xi32>) -> tensor<2x8x126x126xi32>
+  return %result: tensor<2x8x126x126xi32>  // output H/W: 126 = 128 - 3 + 1 (unit stride and dilation)
+}
+func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_i8_i8_i32(%lhs: tensor<2x3x128x128xi8>, %rhs: tensor<12x3x3x3xi8>, %acc: tensor<2x12x126x126xi32>) -> tensor<2x12x126x126xi32> {  // NCHW i8 input (%lhs) x FCHW i8 kernel (%rhs), with the i32 %acc as the outs operand
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x3x128x128xi8>, tensor<12x3x3x3xi8>) outs(%acc: tensor<2x12x126x126xi32>) -> tensor<2x12x126x126xi32>
+  return %result: tensor<2x12x126x126xi32>  // output H/W: 126 = 128 - 3 + 1 (unit stride and dilation)
+}
diff --git a/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_large_calls.mlir b/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_large_calls.mlir
new file mode 100644
index 0000000..af106a6
--- /dev/null
+++ b/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_large_calls.mlir
@@ -0,0 +1,108 @@
+builtin.module @calls attributes {
+  
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view  // declaration only (no body in this file): four dims, element type and seed in, buffer view out
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)  // declaration only: conv parameters, the three operands and the actual result; returns nothing
+func.func private @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // declaration of a conv called by the test cases below; body is not in this file
+func.func private @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // declaration of a conv called by the test cases below; body is not in this file
+
+func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_i8_i8_i32_2_4_128_128_8_3_3_acc_0() attributes {  // Test case: generate operands, run the module conv, hand everything to the checker.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x4x128x128x8x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64  // input shape, N x C x H x W
+  %input_dim1 = arith.constant 4 : i64
+  %input_dim2 = arith.constant 128 : i64
+  %input_dim3 = arith.constant 128 : i64
+  %input_element_type = hal.element_type<i8> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 8 : i64  // kernel shape, F x C x KH x KW
+  %kernel_dim1 = arith.constant 4 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<i8> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64  // accumulator/output shape, N x F x OH x OW
+  %acc_dim1 = arith.constant 8 : i64
+  %acc_dim2 = arith.constant 126 : i64
+  %acc_dim3 = arith.constant 126 : i64
+  %acc_element_type = hal.element_type<i32> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 8 : i64
+  %acc_copy_dim2 = arith.constant 126 : i64
+  %acc_copy_dim3 = arith.constant 126 : i64
+  %acc_copy_element_type = hal.element_type<i32> : i32
+  %acc_copy_seed = arith.constant 4 : i32  // same seed and shape as %acc; presumably yields identical contents - TODO confirm the generator is deterministic
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // %acc_copy, not %acc, goes to the conv; %acc itself is passed to the checker below
+  %n = arith.constant 2 : i64
+  %c = arith.constant 4 : i64
+  %h = arith.constant 128 : i64
+  %w = arith.constant 128 : i64
+  %f = arith.constant 8 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64  // layout code for the checker; 0 in this nchw/fchw suite
+  %sh = arith.constant 1 : i64  // presumably stride (sh, sw) and dilation (dh, dw) - confirm against the checker
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_i8_i8_i32_2_3_128_128_12_3_3_acc_1() attributes {  // Test case: generate operands, run the module conv, hand everything to the checker.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x3x128x128x12x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64  // input shape, N x C x H x W
+  %input_dim1 = arith.constant 3 : i64
+  %input_dim2 = arith.constant 128 : i64
+  %input_dim3 = arith.constant 128 : i64
+  %input_element_type = hal.element_type<i8> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 12 : i64  // kernel shape, F x C x KH x KW
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<i8> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64  // accumulator/output shape, N x F x OH x OW
+  %acc_dim1 = arith.constant 12 : i64
+  %acc_dim2 = arith.constant 126 : i64
+  %acc_dim3 = arith.constant 126 : i64
+  %acc_element_type = hal.element_type<i32> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 12 : i64
+  %acc_copy_dim2 = arith.constant 126 : i64
+  %acc_copy_dim3 = arith.constant 126 : i64
+  %acc_copy_element_type = hal.element_type<i32> : i32
+  %acc_copy_seed = arith.constant 7 : i32  // same seed and shape as %acc; presumably yields identical contents - TODO confirm the generator is deterministic
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // %acc_copy, not %acc, goes to the conv; %acc itself is passed to the checker below
+  %n = arith.constant 2 : i64
+  %c = arith.constant 3 : i64
+  %h = arith.constant 128 : i64
+  %w = arith.constant 128 : i64
+  %f = arith.constant 12 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64  // layout code for the checker; 0 in this nchw/fchw suite
+  %sh = arith.constant 1 : i64  // presumably stride (sh, sw) and dilation (dh, dw) - confirm against the checker
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+}
diff --git a/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_medium.mlir b/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_medium.mlir
new file mode 100644
index 0000000..780c670
--- /dev/null
+++ b/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_medium.mlir
@@ -0,0 +1,12 @@
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%lhs: tensor<2x2x32x32xi8>, %rhs: tensor<2x2x3x3xi8>, %acc: tensor<2x2x30x30xi32>) -> tensor<2x2x30x30xi32> {  // NCHW i8 input (%lhs) x FCHW i8 kernel (%rhs), with the i32 %acc as the outs operand
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xi8>, tensor<2x2x3x3xi8>) outs(%acc: tensor<2x2x30x30xi32>) -> tensor<2x2x30x30xi32>
+  return %result: tensor<2x2x30x30xi32>  // output H/W: 30 = 32 - 3 + 1 (unit stride and dilation)
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_i8_i8_i32(%lhs: tensor<2x2x32x32xi8>, %rhs: tensor<64x2x3x3xi8>, %acc: tensor<2x64x30x30xi32>) -> tensor<2x64x30x30xi32> {  // NCHW i8 input (%lhs) x FCHW i8 kernel (%rhs), with the i32 %acc as the outs operand
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x2x32x32xi8>, tensor<64x2x3x3xi8>) outs(%acc: tensor<2x64x30x30xi32>) -> tensor<2x64x30x30xi32>
+  return %result: tensor<2x64x30x30xi32>  // output H/W: 30 = 32 - 3 + 1 (unit stride and dilation)
+}
+func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_i8_i8_i32(%lhs: tensor<2x16x32x32xi8>, %rhs: tensor<64x16x3x3xi8>, %acc: tensor<2x64x30x30xi32>) -> tensor<2x64x30x30xi32> {  // NCHW i8 input (%lhs) x FCHW i8 kernel (%rhs), with the i32 %acc as the outs operand
+  %result = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x16x32x32xi8>, tensor<64x16x3x3xi8>) outs(%acc: tensor<2x64x30x30xi32>) -> tensor<2x64x30x30xi32>
+  return %result: tensor<2x64x30x30xi32>  // output H/W: 30 = 32 - 3 + 1 (unit stride and dilation)
+}
diff --git a/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_medium_calls.mlir b/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_medium_calls.mlir
new file mode 100644
index 0000000..8c4dc85
--- /dev/null
+++ b/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_medium_calls.mlir
@@ -0,0 +1,158 @@
+builtin.module @calls attributes {
+  
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view  // declaration only (no body in this file): four dims, element type and seed in, buffer view out
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)  // declaration only: conv parameters, the three operands and the actual result; returns nothing
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // declaration of a conv called by the test cases below; body is not in this file
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // declaration of a conv called by the test cases below; body is not in this file
+func.func private @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // declaration of a conv called by the test cases below; body is not in this file
+
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32_2_2_32_32_2_3_3_acc_0() attributes {  // Test case: generate operands, run the module conv, hand everything to the checker.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64  // input shape, N x C x H x W
+  %input_dim1 = arith.constant 2 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 32 : i64
+  %input_element_type = hal.element_type<i8> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 2 : i64  // kernel shape, F x C x KH x KW
+  %kernel_dim1 = arith.constant 2 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<i8> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64  // accumulator/output shape, N x F x OH x OW
+  %acc_dim1 = arith.constant 2 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 30 : i64
+  %acc_element_type = hal.element_type<i32> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 2 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 30 : i64
+  %acc_copy_element_type = hal.element_type<i32> : i32
+  %acc_copy_seed = arith.constant 4 : i32  // same seed and shape as %acc; presumably yields identical contents - TODO confirm the generator is deterministic
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // %acc_copy, not %acc, goes to the conv; %acc itself is passed to the checker below
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 2 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64  // layout code for the checker; 0 in this nchw/fchw suite
+  %sh = arith.constant 1 : i64  // presumably stride (sh, sw) and dilation (dh, dw) - confirm against the checker
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_i8_i8_i32_2_2_32_32_64_3_3_acc_1() attributes {  // Test case: generate operands, run the module conv, hand everything to the checker.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x64x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64  // input shape, N x C x H x W
+  %input_dim1 = arith.constant 2 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 32 : i64
+  %input_element_type = hal.element_type<i8> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 64 : i64  // kernel shape, F x C x KH x KW
+  %kernel_dim1 = arith.constant 2 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<i8> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64  // accumulator/output shape, N x F x OH x OW
+  %acc_dim1 = arith.constant 64 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 30 : i64
+  %acc_element_type = hal.element_type<i32> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 64 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 30 : i64
+  %acc_copy_element_type = hal.element_type<i32> : i32
+  %acc_copy_seed = arith.constant 7 : i32  // same seed and shape as %acc; presumably yields identical contents - TODO confirm the generator is deterministic
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // %acc_copy, not %acc, goes to the conv; %acc itself is passed to the checker below
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 64 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64  // layout code for the checker; 0 in this nchw/fchw suite
+  %sh = arith.constant 1 : i64  // presumably stride (sh, sw) and dilation (dh, dw) - confirm against the checker
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_i8_i8_i32_2_16_32_32_64_3_3_acc_2() attributes {  // Test case: generate operands, run the module conv, hand everything to the checker.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x16x32x32x64x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64  // input shape, N x C x H x W
+  %input_dim1 = arith.constant 16 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 32 : i64
+  %input_element_type = hal.element_type<i8> : i32
+  %input_seed = arith.constant 8 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 64 : i64  // kernel shape, F x C x KH x KW
+  %kernel_dim1 = arith.constant 16 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<i8> : i32
+  %kernel_seed = arith.constant 9 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64  // accumulator/output shape, N x F x OH x OW
+  %acc_dim1 = arith.constant 64 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 30 : i64
+  %acc_element_type = hal.element_type<i32> : i32
+  %acc_seed = arith.constant 10 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 64 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 30 : i64
+  %acc_copy_element_type = hal.element_type<i32> : i32
+  %acc_copy_seed = arith.constant 10 : i32  // same seed and shape as %acc; presumably yields identical contents - TODO confirm the generator is deterministic
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // %acc_copy, not %acc, goes to the conv; %acc itself is passed to the checker below
+  %n = arith.constant 2 : i64
+  %c = arith.constant 16 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 64 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64  // layout code for the checker; 0 in this nchw/fchw suite
+  %sh = arith.constant 1 : i64  // presumably stride (sh, sw) and dilation (dh, dw) - confirm against the checker
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+}
diff --git a/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_small.mlir b/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_small.mlir
new file mode 100644
index 0000000..8acc310
--- /dev/null
+++ b/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_small.mlir
@@ -0,0 +1,12 @@
+func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_i8_i8_i32(%input: tensor<1x1x1x1xi8>, %kernel: tensor<1x1x1x1xi8>, %init: tensor<1x1x1x1xi32>) -> tensor<1x1x1x1xi32> { // nchw/fchw conv, i8 operands, i32 output seeded from %init
+  %out = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%input, %kernel : tensor<1x1x1x1xi8>, tensor<1x1x1x1xi8>) outs(%init : tensor<1x1x1x1xi32>) -> tensor<1x1x1x1xi32>
+  return %out : tensor<1x1x1x1xi32>
+}
+func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_i8_i8_i32(%input: tensor<1x1x16x16xi8>, %kernel: tensor<1x1x2x2xi8>, %init: tensor<1x1x15x15xi32>) -> tensor<1x1x15x15xi32> { // nchw/fchw conv, 2x2 window over 16x16 -> 15x15, i32 output seeded from %init
+  %out = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%input, %kernel : tensor<1x1x16x16xi8>, tensor<1x1x2x2xi8>) outs(%init : tensor<1x1x15x15xi32>) -> tensor<1x1x15x15xi32>
+  return %out : tensor<1x1x15x15xi32>
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%input: tensor<2x2x32x32xi8>, %kernel: tensor<2x2x3x3xi8>, %init: tensor<2x2x30x30xi32>) -> tensor<2x2x30x30xi32> { // nchw/fchw conv, 3x3 window over 32x32 -> 30x30, i32 output seeded from %init
+  %out = linalg.conv_2d_nchw_fchw {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%input, %kernel : tensor<2x2x32x32xi8>, tensor<2x2x3x3xi8>) outs(%init : tensor<2x2x30x30xi32>) -> tensor<2x2x30x30xi32>
+  return %out : tensor<2x2x30x30xi32>
+}
diff --git a/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_small_calls.mlir b/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_small_calls.mlir
new file mode 100644
index 0000000..6f68fe9
--- /dev/null
+++ b/linalg_ops/convolution/generated/i8_nchw_i8_fchw_i32/conv2d_i8_nchw_i8_fchw_i32_small_calls.mlir
@@ -0,0 +1,158 @@
+builtin.module @calls attributes {
+  
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view // external (bodiless) helper: device + 4 dims + element type + seed -> buffer view
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) // external checker; declared without results
+func.func private @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view // conv under test; name matches the function in the sibling conv2d_i8_nchw_i8_fchw_i32_small.mlir
+func.func private @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view // conv under test (16x16 input, 2x2 kernel case)
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view // conv under test (32x32 input, 3x3 kernel case)
+
+func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_i8_i8_i32_1_1_1_1_1_1_1_acc_0() attributes { // Test case: builds random operands, runs the module conv, then calls the result checker.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x1x1x1x1x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device // device 0: used for every generated tensor and passed to the checker
+  %input_dim0 = arith.constant 1 : i64 // input dims match the first operand type of this conv in the sibling conv2d_*.mlir
+  %input_dim1 = arith.constant 1 : i64
+  %input_dim2 = arith.constant 1 : i64
+  %input_dim3 = arith.constant 1 : i64
+  %input_element_type = hal.element_type<i8> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 1 : i64 // kernel dims match the second operand type of this conv
+  %kernel_dim1 = arith.constant 1 : i64
+  %kernel_dim2 = arith.constant 1 : i64
+  %kernel_dim3 = arith.constant 1 : i64
+  %kernel_element_type = hal.element_type<i8> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 1 : i64 // accumulator dims match the conv's result type
+  %acc_dim1 = arith.constant 1 : i64
+  %acc_dim2 = arith.constant 1 : i64
+  %acc_dim3 = arith.constant 1 : i64
+  %acc_element_type = hal.element_type<i32> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 1 : i64
+  %acc_copy_dim1 = arith.constant 1 : i64
+  %acc_copy_dim2 = arith.constant 1 : i64
+  %acc_copy_dim3 = arith.constant 1 : i64
+  %acc_copy_element_type = hal.element_type<i32> : i32
+  %acc_copy_seed = arith.constant 4 : i32 // same dims and seed as %acc; presumably produces an identical tensor (TODO confirm)
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view // the conv gets %acc_copy, so %acc itself is never an argument of the module call
+  %n = arith.constant 1 : i64
+  %c = arith.constant 1 : i64
+  %h = arith.constant 1 : i64
+  %w = arith.constant 1 : i64
+  %f = arith.constant 1 : i64
+  %kh = arith.constant 1 : i64
+  %kw = arith.constant 1 : i64
+  %layout = arith.constant 0 : i64 // 0 in the nchw/fchw call files of this patch, 1 in the nhwc/hwcf ones
+  %sh = arith.constant 1 : i64 // sh/sw/dh/dw are all 1; presumably strides/dilations matching the [1, 1] op attributes (TODO confirm)
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () // checker receives the shape params, both operands, the separately generated %acc and the module result
+  return
+}
+func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_i8_i8_i32_1_1_16_16_1_2_2_acc_1() attributes { // Test case: builds random operands, runs the module conv, then calls the result checker.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x16x16x1x2x2"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device // device 0: used for every generated tensor and passed to the checker
+  %input_dim0 = arith.constant 1 : i64 // input dims match the first operand type of this conv in the sibling conv2d_*.mlir
+  %input_dim1 = arith.constant 1 : i64
+  %input_dim2 = arith.constant 16 : i64
+  %input_dim3 = arith.constant 16 : i64
+  %input_element_type = hal.element_type<i8> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 1 : i64 // kernel dims match the second operand type of this conv
+  %kernel_dim1 = arith.constant 1 : i64
+  %kernel_dim2 = arith.constant 2 : i64
+  %kernel_dim3 = arith.constant 2 : i64
+  %kernel_element_type = hal.element_type<i8> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 1 : i64 // accumulator dims match the conv's result type
+  %acc_dim1 = arith.constant 1 : i64
+  %acc_dim2 = arith.constant 15 : i64
+  %acc_dim3 = arith.constant 15 : i64
+  %acc_element_type = hal.element_type<i32> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 1 : i64
+  %acc_copy_dim1 = arith.constant 1 : i64
+  %acc_copy_dim2 = arith.constant 15 : i64
+  %acc_copy_dim3 = arith.constant 15 : i64
+  %acc_copy_element_type = hal.element_type<i32> : i32
+  %acc_copy_seed = arith.constant 7 : i32 // same dims and seed as %acc; presumably produces an identical tensor (TODO confirm)
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view // the conv gets %acc_copy, so %acc itself is never an argument of the module call
+  %n = arith.constant 1 : i64
+  %c = arith.constant 1 : i64
+  %h = arith.constant 16 : i64
+  %w = arith.constant 16 : i64
+  %f = arith.constant 1 : i64
+  %kh = arith.constant 2 : i64
+  %kw = arith.constant 2 : i64
+  %layout = arith.constant 0 : i64 // 0 in the nchw/fchw call files of this patch, 1 in the nhwc/hwcf ones
+  %sh = arith.constant 1 : i64 // sh/sw/dh/dw are all 1; presumably strides/dilations matching the [1, 1] op attributes (TODO confirm)
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () // checker receives the shape params, both operands, the separately generated %acc and the module result
+  return
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32_2_2_32_32_2_3_3_acc_2() attributes { // Test case: builds random operands, runs the module conv, then calls the result checker.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device // device 0: used for every generated tensor and passed to the checker
+  %input_dim0 = arith.constant 2 : i64 // input dims match the first operand type of this conv in the sibling conv2d_*.mlir
+  %input_dim1 = arith.constant 2 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 32 : i64
+  %input_element_type = hal.element_type<i8> : i32
+  %input_seed = arith.constant 8 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 2 : i64 // kernel dims match the second operand type of this conv
+  %kernel_dim1 = arith.constant 2 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 3 : i64
+  %kernel_element_type = hal.element_type<i8> : i32
+  %kernel_seed = arith.constant 9 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64 // accumulator dims match the conv's result type
+  %acc_dim1 = arith.constant 2 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 30 : i64
+  %acc_element_type = hal.element_type<i32> : i32
+  %acc_seed = arith.constant 10 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 2 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 30 : i64
+  %acc_copy_element_type = hal.element_type<i32> : i32
+  %acc_copy_seed = arith.constant 10 : i32 // same dims and seed as %acc; presumably produces an identical tensor (TODO confirm)
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view // the conv gets %acc_copy, so %acc itself is never an argument of the module call
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 2 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 0 : i64 // 0 in the nchw/fchw call files of this patch, 1 in the nhwc/hwcf ones
+  %sh = arith.constant 1 : i64 // sh/sw/dh/dw are all 1; presumably strides/dilations matching the [1, 1] op attributes (TODO confirm)
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () // checker receives the shape params, both operands, the separately generated %acc and the module result
+  return
+}
+}
diff --git a/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_large.mlir b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_large.mlir
new file mode 100644
index 0000000..99911df
--- /dev/null
+++ b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_large.mlir
@@ -0,0 +1,8 @@
+func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_i8_i8_i32(%input: tensor<2x128x128x4xi8>, %kernel: tensor<3x3x4x8xi8>, %init: tensor<2x126x126x8xi32>) -> tensor<2x126x126x8xi32> { // nhwc/hwcf conv, 3x3 window over 128x128 -> 126x126, i32 output seeded from %init
+  %out = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%input, %kernel : tensor<2x128x128x4xi8>, tensor<3x3x4x8xi8>) outs(%init : tensor<2x126x126x8xi32>) -> tensor<2x126x126x8xi32>
+  return %out : tensor<2x126x126x8xi32>
+}
+func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_i8_i8_i32(%input: tensor<2x128x128x3xi8>, %kernel: tensor<3x3x3x12xi8>, %init: tensor<2x126x126x12xi32>) -> tensor<2x126x126x12xi32> { // nhwc/hwcf conv, 3 input channels -> 12 filters, i32 output seeded from %init
+  %out = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%input, %kernel : tensor<2x128x128x3xi8>, tensor<3x3x3x12xi8>) outs(%init : tensor<2x126x126x12xi32>) -> tensor<2x126x126x12xi32>
+  return %out : tensor<2x126x126x12xi32>
+}
diff --git a/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_large_calls.mlir b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_large_calls.mlir
new file mode 100644
index 0000000..a863eca
--- /dev/null
+++ b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_large_calls.mlir
@@ -0,0 +1,108 @@
+builtin.module @calls attributes {
+  
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view // external (bodiless) helper: device + 4 dims + element type + seed -> buffer view
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) // external checker; declared without results
+func.func private @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view // conv under test; name matches the function in the sibling conv2d_i8_nhwc_i8_hwcf_i32_large.mlir
+func.func private @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view // conv under test (3 channels, 12 filters case)
+
+func.func @conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_i8_i8_i32_2_4_128_128_8_3_3_acc_0() attributes { // Test case: builds random operands, runs the module conv, then calls the result checker.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x4x128x128x8x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device // device 0: used for every generated tensor and passed to the checker
+  %input_dim0 = arith.constant 2 : i64 // input dims are in N, H, W, C order here (the channel count 4 comes last)
+  %input_dim1 = arith.constant 128 : i64
+  %input_dim2 = arith.constant 128 : i64
+  %input_dim3 = arith.constant 4 : i64
+  %input_element_type = hal.element_type<i8> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 3 : i64 // kernel dims are in KH, KW, C, F order here
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 4 : i64
+  %kernel_dim3 = arith.constant 8 : i64
+  %kernel_element_type = hal.element_type<i8> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64 // accumulator dims match the conv's result type
+  %acc_dim1 = arith.constant 126 : i64
+  %acc_dim2 = arith.constant 126 : i64
+  %acc_dim3 = arith.constant 8 : i64
+  %acc_element_type = hal.element_type<i32> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 126 : i64
+  %acc_copy_dim2 = arith.constant 126 : i64
+  %acc_copy_dim3 = arith.constant 8 : i64
+  %acc_copy_element_type = hal.element_type<i32> : i32
+  %acc_copy_seed = arith.constant 4 : i32 // same dims and seed as %acc; presumably produces an identical tensor (TODO confirm)
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_4_128_128_times_3_3_8_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view // the conv gets %acc_copy, so %acc itself is never an argument of the module call
+  %n = arith.constant 2 : i64
+  %c = arith.constant 4 : i64
+  %h = arith.constant 128 : i64
+  %w = arith.constant 128 : i64
+  %f = arith.constant 8 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 1 : i64 // 1 in the nhwc/hwcf call files of this patch, 0 in the nchw/fchw ones
+  %sh = arith.constant 1 : i64 // sh/sw/dh/dw are all 1; presumably strides/dilations matching the [1, 1] op attributes (TODO confirm)
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () // checker receives the shape params, both operands, the separately generated %acc and the module result
+  return
+}
+func.func @conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_i8_i8_i32_2_3_128_128_12_3_3_acc_1() attributes { // Test case: builds random operands, runs the module conv, then calls the result checker.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x3x128x128x12x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device // device 0: used for every generated tensor and passed to the checker
+  %input_dim0 = arith.constant 2 : i64 // input dims are in N, H, W, C order here (the channel count 3 comes last)
+  %input_dim1 = arith.constant 128 : i64
+  %input_dim2 = arith.constant 128 : i64
+  %input_dim3 = arith.constant 3 : i64
+  %input_element_type = hal.element_type<i8> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 3 : i64 // kernel dims are in KH, KW, C, F order here (F = 12 comes last)
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 3 : i64
+  %kernel_dim3 = arith.constant 12 : i64
+  %kernel_element_type = hal.element_type<i8> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64 // accumulator dims match the conv's result type
+  %acc_dim1 = arith.constant 126 : i64
+  %acc_dim2 = arith.constant 126 : i64
+  %acc_dim3 = arith.constant 12 : i64
+  %acc_element_type = hal.element_type<i32> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 126 : i64
+  %acc_copy_dim2 = arith.constant 126 : i64
+  %acc_copy_dim3 = arith.constant 12 : i64
+  %acc_copy_element_type = hal.element_type<i32> : i32
+  %acc_copy_seed = arith.constant 7 : i32 // same dims and seed as %acc; presumably produces an identical tensor (TODO confirm)
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_3_128_128_times_3_3_12_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view // the conv gets %acc_copy, so %acc itself is never an argument of the module call
+  %n = arith.constant 2 : i64
+  %c = arith.constant 3 : i64
+  %h = arith.constant 128 : i64
+  %w = arith.constant 128 : i64
+  %f = arith.constant 12 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 1 : i64 // 1 in the nhwc/hwcf call files of this patch, 0 in the nchw/fchw ones
+  %sh = arith.constant 1 : i64 // sh/sw/dh/dw are all 1; presumably strides/dilations matching the [1, 1] op attributes (TODO confirm)
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () // checker receives the shape params, both operands, the separately generated %acc and the module result
+  return
+}
+}
diff --git a/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_medium.mlir b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_medium.mlir
new file mode 100644
index 0000000..e64bc66
--- /dev/null
+++ b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_medium.mlir
@@ -0,0 +1,12 @@
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%input: tensor<2x32x32x2xi8>, %kernel: tensor<3x3x2x2xi8>, %init: tensor<2x30x30x2xi32>) -> tensor<2x30x30x2xi32> { // nhwc/hwcf conv, 3x3 window over 32x32 -> 30x30, i32 output seeded from %init
+  %out = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%input, %kernel : tensor<2x32x32x2xi8>, tensor<3x3x2x2xi8>) outs(%init : tensor<2x30x30x2xi32>) -> tensor<2x30x30x2xi32>
+  return %out : tensor<2x30x30x2xi32>
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_i8_i8_i32(%input: tensor<2x32x32x2xi8>, %kernel: tensor<3x3x2x64xi8>, %init: tensor<2x30x30x64xi32>) -> tensor<2x30x30x64xi32> { // nhwc/hwcf conv, 2 input channels -> 64 filters, i32 output seeded from %init
+  %out = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%input, %kernel : tensor<2x32x32x2xi8>, tensor<3x3x2x64xi8>) outs(%init : tensor<2x30x30x64xi32>) -> tensor<2x30x30x64xi32>
+  return %out : tensor<2x30x30x64xi32>
+}
+func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_i8_i8_i32(%input: tensor<2x32x32x16xi8>, %kernel: tensor<3x3x16x64xi8>, %init: tensor<2x30x30x64xi32>) -> tensor<2x30x30x64xi32> { // nhwc/hwcf conv, 16 input channels -> 64 filters, i32 output seeded from %init
+  %out = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%input, %kernel : tensor<2x32x32x16xi8>, tensor<3x3x16x64xi8>) outs(%init : tensor<2x30x30x64xi32>) -> tensor<2x30x30x64xi32>
+  return %out : tensor<2x30x30x64xi32>
+}
diff --git a/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_medium_calls.mlir b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_medium_calls.mlir
new file mode 100644
index 0000000..ea12edb
--- /dev/null
+++ b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_medium_calls.mlir
@@ -0,0 +1,158 @@
+builtin.module @calls attributes {
+  
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view // external (bodiless) helper: device + 4 dims + element type + seed -> buffer view
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view) // external checker; declared without results
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view // conv under test; name matches the function in the sibling conv2d_i8_nhwc_i8_hwcf_i32_medium.mlir
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view // conv under test (2 channels, 64 filters case)
+func.func private @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view // conv under test (16 channels, 64 filters case)
+
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32_2_2_32_32_2_3_3_acc_0() attributes { // Test case: builds random operands, runs the module conv, then calls the result checker.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device // device 0: used for every generated tensor and passed to the checker
+  %input_dim0 = arith.constant 2 : i64 // input dims match the first operand type of this conv in the sibling conv2d_*.mlir
+  %input_dim1 = arith.constant 32 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 2 : i64
+  %input_element_type = hal.element_type<i8> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 3 : i64 // kernel dims match the second operand type of this conv
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 2 : i64
+  %kernel_dim3 = arith.constant 2 : i64
+  %kernel_element_type = hal.element_type<i8> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64 // accumulator dims match the conv's result type
+  %acc_dim1 = arith.constant 30 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 2 : i64
+  %acc_element_type = hal.element_type<i32> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 30 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 2 : i64
+  %acc_copy_element_type = hal.element_type<i32> : i32
+  %acc_copy_seed = arith.constant 4 : i32 // same dims and seed as %acc; presumably produces an identical tensor (TODO confirm)
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view // the conv gets %acc_copy, so %acc itself is never an argument of the module call
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 2 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 1 : i64 // 1 in the nhwc/hwcf call files of this patch, 0 in the nchw/fchw ones
+  %sh = arith.constant 1 : i64 // sh/sw/dh/dw are all 1; presumably strides/dilations matching the [1, 1] op attributes (TODO confirm)
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () // checker receives the shape params, both operands, the separately generated %acc and the module result
+  return
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_i8_i8_i32_2_2_32_32_64_3_3_acc_1() attributes { // Test case: builds random operands, runs the module conv, then calls the result checker.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x64x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device // device 0: used for every generated tensor and passed to the checker
+  %input_dim0 = arith.constant 2 : i64 // input dims match the first operand type of this conv in the sibling conv2d_*.mlir
+  %input_dim1 = arith.constant 32 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 2 : i64
+  %input_element_type = hal.element_type<i8> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 3 : i64 // kernel dims are in KH, KW, C, F order here (F = 64 comes last)
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 2 : i64
+  %kernel_dim3 = arith.constant 64 : i64
+  %kernel_element_type = hal.element_type<i8> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64 // accumulator dims match the conv's result type
+  %acc_dim1 = arith.constant 30 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 64 : i64
+  %acc_element_type = hal.element_type<i32> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 30 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 64 : i64
+  %acc_copy_element_type = hal.element_type<i32> : i32
+  %acc_copy_seed = arith.constant 7 : i32 // same dims and seed as %acc; presumably produces an identical tensor (TODO confirm)
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_64_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view // the conv gets %acc_copy, so %acc itself is never an argument of the module call
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 64 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 1 : i64 // 1 in the nhwc/hwcf call files of this patch, 0 in the nchw/fchw ones
+  %sh = arith.constant 1 : i64 // sh/sw/dh/dw are all 1; presumably strides/dilations matching the [1, 1] op attributes (TODO confirm)
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () // checker receives the shape params, both operands, the separately generated %acc and the module result
+  return
+}
+func.func @conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_i8_i8_i32_2_16_32_32_64_3_3_acc_2() attributes { // Test case: builds random operands, runs the module conv, then calls the result checker.
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x16x32x32x64x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device // device 0: used for every generated tensor and passed to the checker
+  %input_dim0 = arith.constant 2 : i64 // input dims are in N, H, W, C order here (the channel count 16 comes last)
+  %input_dim1 = arith.constant 32 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 16 : i64
+  %input_element_type = hal.element_type<i8> : i32
+  %input_seed = arith.constant 8 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 3 : i64 // kernel dims are in KH, KW, C, F order here
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 16 : i64
+  %kernel_dim3 = arith.constant 64 : i64
+  %kernel_element_type = hal.element_type<i8> : i32
+  %kernel_seed = arith.constant 9 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64 // accumulator dims match the conv's result type
+  %acc_dim1 = arith.constant 30 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 64 : i64
+  %acc_element_type = hal.element_type<i32> : i32
+  %acc_seed = arith.constant 10 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64
+  %acc_copy_dim1 = arith.constant 30 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 64 : i64
+  %acc_copy_element_type = hal.element_type<i32> : i32
+  %acc_copy_seed = arith.constant 10 : i32 // same dims and seed as %acc; presumably produces an identical tensor (TODO confirm)
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_16_32_32_times_3_3_64_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view // the conv gets %acc_copy, so %acc itself is never an argument of the module call
+  %n = arith.constant 2 : i64
+  %c = arith.constant 16 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 64 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 1 : i64 // 1 in the nhwc/hwcf call files of this patch, 0 in the nchw/fchw ones
+  %sh = arith.constant 1 : i64 // sh/sw/dh/dw are all 1; presumably strides/dilations matching the [1, 1] op attributes (TODO confirm)
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> () // checker receives the shape params, both operands, the separately generated %acc and the module result
+  return
+}
+}
diff --git a/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_small.mlir b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_small.mlir
new file mode 100644
index 0000000..5d52f93
--- /dev/null
+++ b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_small.mlir
@@ -0,0 +1,12 @@
+func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_i8_i8_i32(%lhs: tensor<1x1x1x1xi8>, %rhs: tensor<1x1x1x1xi8>, %acc: tensor<1x1x1x1xi32>) -> tensor<1x1x1x1xi32> {  // NHWC input 1x1x1x1, HWCF kernel 1x1x1x1 -> 1x1x1x1 output
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x1x1x1xi8>, tensor<1x1x1x1xi8>) outs(%acc: tensor<1x1x1x1xi32>) -> tensor<1x1x1x1xi32>  // unit strides and dilations; %acc is the outs operand
+  return %result: tensor<1x1x1x1xi32>
+}
+func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_i8_i8_i32(%lhs: tensor<1x16x16x1xi8>, %rhs: tensor<2x2x1x1xi8>, %acc: tensor<1x15x15x1xi32>) -> tensor<1x15x15x1xi32> {  // NHWC input 1x16x16x1, HWCF kernel 2x2x1x1 -> 1x15x15x1 output (16 - 2 + 1 = 15)
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<1x16x16x1xi8>, tensor<2x2x1x1xi8>) outs(%acc: tensor<1x15x15x1xi32>) -> tensor<1x15x15x1xi32>  // unit strides and dilations; %acc is the outs operand
+  return %result: tensor<1x15x15x1xi32>
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%lhs: tensor<2x32x32x2xi8>, %rhs: tensor<3x3x2x2xi8>, %acc: tensor<2x30x30x2xi32>) -> tensor<2x30x30x2xi32> {  // NHWC input 2x32x32x2, HWCF kernel 3x3x2x2 -> 2x30x30x2 output (32 - 3 + 1 = 30)
+  %result = linalg.conv_2d_nhwc_hwcf {dilations = dense<[1, 1]> : tensor<2xi64>, strides = dense<[1, 1]> : tensor<2xi64>} ins(%lhs, %rhs: tensor<2x32x32x2xi8>, tensor<3x3x2x2xi8>) outs(%acc: tensor<2x30x30x2xi32>) -> tensor<2x30x30x2xi32>  // unit strides and dilations; %acc is the outs operand
+  return %result: tensor<2x30x30x2xi32>
+}
diff --git a/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_small_calls.mlir b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_small_calls.mlir
new file mode 100644
index 0000000..da9b803
--- /dev/null
+++ b/linalg_ops/convolution/generated/i8_nhwc_i8_hwcf_i32/conv2d_i8_nhwc_i8_hwcf_i32_small_calls.mlir
@@ -0,0 +1,158 @@
+builtin.module @calls attributes {
+  
+} {
+
+func.func private @conv2d_test.generate_random_tensor(%device: !hal.device, %dim0: i64, %dim1: i64, %dim2: i64, %dim3: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view  // declaration only; takes four dims, an element type and a seed
+func.func private @conv2d_test.check_conv2d_results(%device: !hal.device, %n: i64, %c: i64, %h: i64, %w: i64, %f:i64, %kh:i64, %kw:i64, %layout:i64, %sh:i64, %sw:i64, %dh:i64, %dw:i64, %input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)  // declaration only; takes the conv2d shape, layout, strides, dilations and the four buffer views
+func.func private @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // declaration only; presumably the compiled conv2d function of the same name - confirm
+func.func private @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // declaration only; presumably the compiled conv2d function of the same name - confirm
+func.func private @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%input: !hal.buffer_view, %kernel: !hal.buffer_view, %acc: !hal.buffer_view) -> !hal.buffer_view  // declaration only; presumably the compiled conv2d function of the same name - confirm
+
+func.func @conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_i8_i8_i32_1_1_1_1_1_1_1_acc_0() attributes {  // test case 0: generates operands, runs the conv, hands everything to the checker
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x1x1x1x1x1"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 1 : i64  // input dims 0..3 are N, H, W, C
+  %input_dim1 = arith.constant 1 : i64
+  %input_dim2 = arith.constant 1 : i64
+  %input_dim3 = arith.constant 1 : i64
+  %input_element_type = hal.element_type<i8> : i32
+  %input_seed = arith.constant 2 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 1 : i64  // kernel dims 0..3 are KH, KW, C, F
+  %kernel_dim1 = arith.constant 1 : i64
+  %kernel_dim2 = arith.constant 1 : i64
+  %kernel_dim3 = arith.constant 1 : i64
+  %kernel_element_type = hal.element_type<i8> : i32
+  %kernel_seed = arith.constant 3 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 1 : i64  // acc dims 0..3 are N, OH, OW, F
+  %acc_dim1 = arith.constant 1 : i64
+  %acc_dim2 = arith.constant 1 : i64
+  %acc_dim3 = arith.constant 1 : i64
+  %acc_element_type = hal.element_type<i32> : i32
+  %acc_seed = arith.constant 4 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 1 : i64  // %acc_copy uses the same shape, element type and seed as %acc
+  %acc_copy_dim1 = arith.constant 1 : i64
+  %acc_copy_dim2 = arith.constant 1 : i64
+  %acc_copy_dim3 = arith.constant 1 : i64
+  %acc_copy_element_type = hal.element_type<i32> : i32
+  %acc_copy_seed = arith.constant 4 : i32  // same value as %acc_seed; presumably yields identical contents - confirm
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_1_1_1_1_times_1_1_1_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // the conv receives %acc_copy; %acc itself is passed to the checker below
+  %n = arith.constant 1 : i64
+  %c = arith.constant 1 : i64
+  %h = arith.constant 1 : i64
+  %w = arith.constant 1 : i64
+  %f = arith.constant 1 : i64
+  %kh = arith.constant 1 : i64
+  %kw = arith.constant 1 : i64
+  %layout = arith.constant 1 : i64  // 1 selects nhwcxhwcf (0 is nchwxfchw)
+  %sh = arith.constant 1 : i64  // strides (sh, sw) and dilations (dh, dw) are all 1
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+func.func @conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_i8_i8_i32_1_1_16_16_1_2_2_acc_1() attributes {  // test case 1: generates operands, runs the conv, hands everything to the checker
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 1x1x16x16x1x2x2"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 1 : i64  // input dims 0..3 are N, H, W, C
+  %input_dim1 = arith.constant 16 : i64
+  %input_dim2 = arith.constant 16 : i64
+  %input_dim3 = arith.constant 1 : i64
+  %input_element_type = hal.element_type<i8> : i32
+  %input_seed = arith.constant 5 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 2 : i64  // kernel dims 0..3 are KH, KW, C, F
+  %kernel_dim1 = arith.constant 2 : i64
+  %kernel_dim2 = arith.constant 1 : i64
+  %kernel_dim3 = arith.constant 1 : i64
+  %kernel_element_type = hal.element_type<i8> : i32
+  %kernel_seed = arith.constant 6 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 1 : i64  // acc dims 0..3 are N, OH, OW, F (OH = OW = 16 - 2 + 1 = 15)
+  %acc_dim1 = arith.constant 15 : i64
+  %acc_dim2 = arith.constant 15 : i64
+  %acc_dim3 = arith.constant 1 : i64
+  %acc_element_type = hal.element_type<i32> : i32
+  %acc_seed = arith.constant 7 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 1 : i64  // %acc_copy uses the same shape, element type and seed as %acc
+  %acc_copy_dim1 = arith.constant 15 : i64
+  %acc_copy_dim2 = arith.constant 15 : i64
+  %acc_copy_dim3 = arith.constant 1 : i64
+  %acc_copy_element_type = hal.element_type<i32> : i32
+  %acc_copy_seed = arith.constant 7 : i32  // same value as %acc_seed; presumably yields identical contents - confirm
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_1_1_16_16_times_2_2_1_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // the conv receives %acc_copy; %acc itself is passed to the checker below
+  %n = arith.constant 1 : i64
+  %c = arith.constant 1 : i64
+  %h = arith.constant 16 : i64
+  %w = arith.constant 16 : i64
+  %f = arith.constant 1 : i64
+  %kh = arith.constant 2 : i64
+  %kw = arith.constant 2 : i64
+  %layout = arith.constant 1 : i64  // 1 selects nhwcxhwcf (0 is nchwxfchw)
+  %sh = arith.constant 1 : i64  // strides (sh, sw) and dilations (dh, dw) are all 1
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+func.func @conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32_2_2_32_32_2_3_3_acc_2() attributes {  // test case 2: generates operands, runs the conv, hands everything to the checker
+  iree.reflection = {description = "Conv2d shape (NxCxHxWxFxKHxKW): 2x2x32x32x2x3x3"}
+} {
+  %device_index = arith.constant 0 : index
+  %device = hal.devices.get %device_index : !hal.device
+  %input_dim0 = arith.constant 2 : i64  // input dims 0..3 are N, H, W, C
+  %input_dim1 = arith.constant 32 : i64
+  %input_dim2 = arith.constant 32 : i64
+  %input_dim3 = arith.constant 2 : i64
+  %input_element_type = hal.element_type<i8> : i32
+  %input_seed = arith.constant 8 : i32
+  %input = call @conv2d_test.generate_random_tensor(%device, %input_dim0, %input_dim1, %input_dim2, %input_dim3, %input_element_type, %input_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %kernel_dim0 = arith.constant 3 : i64  // kernel dims 0..3 are KH, KW, C, F
+  %kernel_dim1 = arith.constant 3 : i64
+  %kernel_dim2 = arith.constant 2 : i64
+  %kernel_dim3 = arith.constant 2 : i64
+  %kernel_element_type = hal.element_type<i8> : i32
+  %kernel_seed = arith.constant 9 : i32
+  %kernel = call @conv2d_test.generate_random_tensor(%device, %kernel_dim0, %kernel_dim1, %kernel_dim2, %kernel_dim3, %kernel_element_type, %kernel_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_dim0 = arith.constant 2 : i64  // acc dims 0..3 are N, OH, OW, F (OH = OW = 32 - 3 + 1 = 30)
+  %acc_dim1 = arith.constant 30 : i64
+  %acc_dim2 = arith.constant 30 : i64
+  %acc_dim3 = arith.constant 2 : i64
+  %acc_element_type = hal.element_type<i32> : i32
+  %acc_seed = arith.constant 10 : i32
+  %acc = call @conv2d_test.generate_random_tensor(%device, %acc_dim0, %acc_dim1, %acc_dim2, %acc_dim3, %acc_element_type, %acc_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %acc_copy_dim0 = arith.constant 2 : i64  // %acc_copy uses the same shape, element type and seed as %acc
+  %acc_copy_dim1 = arith.constant 30 : i64
+  %acc_copy_dim2 = arith.constant 30 : i64
+  %acc_copy_dim3 = arith.constant 2 : i64
+  %acc_copy_element_type = hal.element_type<i32> : i32
+  %acc_copy_seed = arith.constant 10 : i32  // same value as %acc_seed; presumably yields identical contents - confirm
+  %acc_copy = call @conv2d_test.generate_random_tensor(%device, %acc_copy_dim0, %acc_copy_dim1, %acc_copy_dim2, %acc_copy_dim3, %acc_copy_element_type, %acc_copy_seed) : (!hal.device, i64, i64, i64, i64, i32, i32) -> !hal.buffer_view
+  %result = call @module.conv2d_accumulate_2_2_32_32_times_3_3_2_dtype_i8_i8_i32(%input, %kernel, %acc_copy) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> !hal.buffer_view  // the conv receives %acc_copy; %acc itself is passed to the checker below
+  %n = arith.constant 2 : i64
+  %c = arith.constant 2 : i64
+  %h = arith.constant 32 : i64
+  %w = arith.constant 32 : i64
+  %f = arith.constant 2 : i64
+  %kh = arith.constant 3 : i64
+  %kw = arith.constant 3 : i64
+  %layout = arith.constant 1 : i64  // 1 selects nhwcxhwcf (0 is nchwxfchw)
+  %sh = arith.constant 1 : i64  // strides (sh, sw) and dilations (dh, dw) are all 1
+  %sw = arith.constant 1 : i64
+  %dh = arith.constant 1 : i64
+  %dw = arith.constant 1 : i64
+  call @conv2d_test.check_conv2d_results(%device, %n, %c, %h, %w, %f, %kh, %kw, %layout, %sh, %sw, %dh, %dw, %input, %kernel, %acc, %result) : (!hal.device, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view) -> ()
+  return
+}
+}
diff --git a/linalg_ops/iree-e2e-conv2d-test.cc b/linalg_ops/iree-e2e-conv2d-test.cc
new file mode 100644
index 0000000..a8d2391
--- /dev/null
+++ b/linalg_ops/iree-e2e-conv2d-test.cc
@@ -0,0 +1,777 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "iree/base/api.h"
+#include "iree/base/internal/flags.h"
+#include "iree/base/internal/math.h"
+#include "iree/hal/api.h"
+#include "iree/modules/hal/module.h"
+#include "iree/tooling/context_util.h"
+#include "iree/tooling/device_util.h"
+#include "iree/vm/api.h"
+#include "iree/vm/native_module_cc.h"
+#include "test_utils.h"
+
+//===----------------------------------------------------------------------===//
+// Reference conv2d (NCHW-FCHW) and (NHWC-HWCF)
+//===----------------------------------------------------------------------===//
+
+// Flattens the 4-D row-major index (n, c, h, w), given the extents of dims
+// 1..3, into a linear offset; the iree_hal_dim_t result avoids int narrowing.
+static iree_hal_dim_t convert_to_1d_index(
+    iree_hal_dim_t channels, iree_hal_dim_t height, iree_hal_dim_t width,
+    iree_hal_dim_t n, iree_hal_dim_t c, iree_hal_dim_t h, iree_hal_dim_t w) {
+  return ((n * channels + c) * height + h) * width + w;  // Horner form
+}
+
+// [f16 <= f16 * f16 + f16] Computes the single output element (n, oc, oh, ow)
+// of an unpadded conv2d; sums in f32 and converts to f16 once at the end.
+// layout 0: NCHW input, FCHW kernel. layout 1: NHWC input, HWCF kernel.
+static void reference_conv2d_f16_f16_f16_f16(
+    iree_hal_dim_t n_size, iree_hal_dim_t c_size, iree_hal_dim_t h_size,
+    iree_hal_dim_t w_size, iree_hal_dim_t f_size, iree_hal_dim_t kh_size,
+    iree_hal_dim_t kw_size, iree_hal_dim_t layout, iree_hal_dim_t sh_size,
+    iree_hal_dim_t sw_size, iree_hal_dim_t dh_size, iree_hal_dim_t dw_size,
+    iree_hal_dim_t oh_size, iree_hal_dim_t ow_size, const uint16_t *input_data,
+    const uint16_t *kernel_data, const uint16_t *acc_data,
+    uint16_t *result_data, iree_hal_dim_t n, iree_hal_dim_t oc,
+    iree_hal_dim_t oh, iree_hal_dim_t ow) {
+  if (layout == 0) {
+    // The layout of output tensor is NxfxOHxOW
+    iree_hal_dim_t out_idx =
+        convert_to_1d_index(f_size, oh_size, ow_size, n, oc, oh, ow);
+    float acc = acc_data ? iree_math_f16_to_f32(acc_data[out_idx]) : 0.f;
+
+    for (iree_hal_dim_t ic = 0; ic < c_size; ++ic) {
+      for (iree_hal_dim_t kh = 0; kh < kh_size; ++kh) {
+        for (iree_hal_dim_t kw = 0; kw < kw_size; ++kw) {
+          iree_hal_dim_t inp_idx = convert_to_1d_index(
+              c_size, h_size, w_size, n, ic, (oh * sh_size + kh * dh_size),
+              (ow * sw_size + kw * dw_size));
+          iree_hal_dim_t krnl_idx =
+              convert_to_1d_index(c_size, kh_size, kw_size, oc, ic, kh, kw);
+          acc += iree_math_f16_to_f32(input_data[inp_idx]) *
+                 iree_math_f16_to_f32(kernel_data[krnl_idx]);
+        }
+      }
+    }
+    // Store once after the whole reduction (this also covers c_size == 0),
+    // matching the layout 1 branch below.
+    result_data[out_idx] = iree_math_f32_to_f16(acc);
+  } else if (layout == 1) {
+    // The layout of output tensor is NxOHxOWxf
+    iree_hal_dim_t out_idx =
+        convert_to_1d_index(oh_size, ow_size, f_size, n, oh, ow, oc);
+    float acc = acc_data ? iree_math_f16_to_f32(acc_data[out_idx]) : 0.f;
+
+    for (iree_hal_dim_t kh = 0; kh < kh_size; ++kh) {
+      for (iree_hal_dim_t kw = 0; kw < kw_size; ++kw) {
+        for (iree_hal_dim_t ic = 0; ic < c_size; ++ic) {
+          iree_hal_dim_t inp_idx = convert_to_1d_index(
+              h_size, w_size, c_size, n, (oh * sh_size + kh * dh_size),
+              (ow * sw_size + kw * dw_size), ic);
+          iree_hal_dim_t krnl_idx =
+              convert_to_1d_index(kw_size, c_size, f_size, kh, kw, ic, oc);
+          acc += iree_math_f16_to_f32(input_data[inp_idx]) *
+                 iree_math_f16_to_f32(kernel_data[krnl_idx]);
+        }
+      }
+    }
+    result_data[out_idx] = iree_math_f32_to_f16(acc);
+  }
+}
+
+// [f32 <= f16 * f16 + f32] Computes the single output element (n, oc, oh, ow)
+// of an unpadded conv2d; f16 operands are widened to f32 before multiplying.
+// layout 0: NCHW input, FCHW kernel. layout 1: NHWC input, HWCF kernel.
+static void reference_conv2d_f16_f16_f32_f32(
+    iree_hal_dim_t n_size, iree_hal_dim_t c_size, iree_hal_dim_t h_size,
+    iree_hal_dim_t w_size, iree_hal_dim_t f_size, iree_hal_dim_t kh_size,
+    iree_hal_dim_t kw_size, iree_hal_dim_t layout, iree_hal_dim_t sh_size,
+    iree_hal_dim_t sw_size, iree_hal_dim_t dh_size, iree_hal_dim_t dw_size,
+    iree_hal_dim_t oh_size, iree_hal_dim_t ow_size, const uint16_t *input_data,
+    const uint16_t *kernel_data, const float *acc_data, float *result_data,
+    iree_hal_dim_t n, iree_hal_dim_t oc, iree_hal_dim_t oh, iree_hal_dim_t ow) {
+  if (layout == 0) {
+    // The layout of output tensor is NxfxOHxOW
+    iree_hal_dim_t out_idx =
+        convert_to_1d_index(f_size, oh_size, ow_size, n, oc, oh, ow);
+    float acc = acc_data ? acc_data[out_idx] : 0.f;
+
+    for (iree_hal_dim_t ic = 0; ic < c_size; ++ic) {
+      for (iree_hal_dim_t kh = 0; kh < kh_size; ++kh) {
+        for (iree_hal_dim_t kw = 0; kw < kw_size; ++kw) {
+          iree_hal_dim_t inp_idx = convert_to_1d_index(
+              c_size, h_size, w_size, n, ic, (oh * sh_size + kh * dh_size),
+              (ow * sw_size + kw * dw_size));
+          iree_hal_dim_t krnl_idx =
+              convert_to_1d_index(c_size, kh_size, kw_size, oc, ic, kh, kw);
+          acc += iree_math_f16_to_f32(input_data[inp_idx]) *
+                 iree_math_f16_to_f32(kernel_data[krnl_idx]);
+        }
+      }
+    }
+    // Store once after the whole reduction (this also covers c_size == 0),
+    // matching the layout 1 branch below.
+    result_data[out_idx] = acc;
+  } else if (layout == 1) {
+    // The layout of output tensor is NxOHxOWxf
+    iree_hal_dim_t out_idx =
+        convert_to_1d_index(oh_size, ow_size, f_size, n, oh, ow, oc);
+    float acc = acc_data ? acc_data[out_idx] : 0.f;
+
+    for (iree_hal_dim_t kh = 0; kh < kh_size; ++kh) {
+      for (iree_hal_dim_t kw = 0; kw < kw_size; ++kw) {
+        for (iree_hal_dim_t ic = 0; ic < c_size; ++ic) {
+          iree_hal_dim_t inp_idx = convert_to_1d_index(
+              h_size, w_size, c_size, n, (oh * sh_size + kh * dh_size),
+              (ow * sw_size + kw * dw_size), ic);
+          iree_hal_dim_t krnl_idx =
+              convert_to_1d_index(kw_size, c_size, f_size, kh, kw, ic, oc);
+          acc += iree_math_f16_to_f32(input_data[inp_idx]) *
+                 iree_math_f16_to_f32(kernel_data[krnl_idx]);
+        }
+      }
+    }
+    result_data[out_idx] = acc;
+  }
+}
+
+// [i32 <= i8 * i8 + i32] Computes the single output element (n, oc, oh, ow)
+// of an unpadded conv2d; i8 operands are widened to i32 before multiplying.
+// layout 0: NCHW input, FCHW kernel. layout 1: NHWC input, HWCF kernel.
+static void reference_conv2d_i8_i8_i32_i32(
+    iree_hal_dim_t n_size, iree_hal_dim_t c_size, iree_hal_dim_t h_size,
+    iree_hal_dim_t w_size, iree_hal_dim_t f_size, iree_hal_dim_t kh_size,
+    iree_hal_dim_t kw_size, iree_hal_dim_t layout, iree_hal_dim_t sh_size,
+    iree_hal_dim_t sw_size, iree_hal_dim_t dh_size, iree_hal_dim_t dw_size,
+    iree_hal_dim_t oh_size, iree_hal_dim_t ow_size, const int8_t *input_data,
+    const int8_t *kernel_data, const int32_t *acc_data, int32_t *result_data,
+    iree_hal_dim_t n, iree_hal_dim_t oc, iree_hal_dim_t oh, iree_hal_dim_t ow) {
+  if (layout == 0) {
+    // The layout of output tensor is NxfxOHxOW
+    iree_hal_dim_t out_idx =
+        convert_to_1d_index(f_size, oh_size, ow_size, n, oc, oh, ow);
+    int32_t acc = acc_data ? acc_data[out_idx] : 0;
+
+    for (iree_hal_dim_t ic = 0; ic < c_size; ++ic) {
+      for (iree_hal_dim_t kh = 0; kh < kh_size; ++kh) {
+        for (iree_hal_dim_t kw = 0; kw < kw_size; ++kw) {
+          iree_hal_dim_t inp_idx = convert_to_1d_index(
+              c_size, h_size, w_size, n, ic, (oh * sh_size + kh * dh_size),
+              (ow * sw_size + kw * dw_size));
+          iree_hal_dim_t krnl_idx =
+              convert_to_1d_index(c_size, kh_size, kw_size, oc, ic, kh, kw);
+          int8_t lhs_value = input_data[inp_idx];
+          int8_t rhs_value = kernel_data[krnl_idx];
+          acc += (int32_t)lhs_value * (int32_t)rhs_value;
+        }
+      }
+    }
+    // Store once after the whole reduction (this also covers c_size == 0),
+    // matching the layout 1 branch below.
+    result_data[out_idx] = acc;
+  } else if (layout == 1) {
+    // The layout of output tensor is NxOHxOWxf
+    iree_hal_dim_t out_idx =
+        convert_to_1d_index(oh_size, ow_size, f_size, n, oh, ow, oc);
+    int32_t acc = acc_data ? acc_data[out_idx] : 0;
+
+    for (iree_hal_dim_t kh = 0; kh < kh_size; ++kh) {
+      for (iree_hal_dim_t kw = 0; kw < kw_size; ++kw) {
+        for (iree_hal_dim_t ic = 0; ic < c_size; ++ic) {
+          iree_hal_dim_t inp_idx = convert_to_1d_index(
+              h_size, w_size, c_size, n, (oh * sh_size + kh * dh_size),
+              (ow * sw_size + kw * dw_size), ic);
+          iree_hal_dim_t krnl_idx =
+              convert_to_1d_index(kw_size, c_size, f_size, kh, kw, ic, oc);
+          int8_t lhs_value = input_data[inp_idx];
+          int8_t rhs_value = kernel_data[krnl_idx];
+          acc += (int32_t)lhs_value * (int32_t)rhs_value;
+        }
+      }
+    }
+    result_data[out_idx] = acc;
+  }
+}
+
+// [f32 <= f32 * f32 + f32] Computes the single output element (n, oc, oh, ow)
+// of an unpadded conv2d, starting from acc_data[out] (or 0 if acc_data is NULL).
+// layout 0: NCHW input, FCHW kernel. layout 1: NHWC input, HWCF kernel.
+static void reference_conv2d_f32_f32_f32_f32(
+    iree_hal_dim_t n_size, iree_hal_dim_t c_size, iree_hal_dim_t h_size,
+    iree_hal_dim_t w_size, iree_hal_dim_t f_size, iree_hal_dim_t kh_size,
+    iree_hal_dim_t kw_size, iree_hal_dim_t layout, iree_hal_dim_t sh_size,
+    iree_hal_dim_t sw_size, iree_hal_dim_t dh_size, iree_hal_dim_t dw_size,
+    iree_hal_dim_t oh_size, iree_hal_dim_t ow_size, const float *input_data,
+    const float *kernel_data, const float *acc_data, float *result_data,
+    iree_hal_dim_t n, iree_hal_dim_t oc, iree_hal_dim_t oh, iree_hal_dim_t ow) {
+  if (layout == 0) {
+    // The layout of output tensor is NxfxOHxOW
+    iree_hal_dim_t out_idx =
+        convert_to_1d_index(f_size, oh_size, ow_size, n, oc, oh, ow);
+    float acc = acc_data ? acc_data[out_idx] : 0;
+    for (iree_hal_dim_t ic = 0; ic < c_size; ++ic) {
+      for (iree_hal_dim_t kh = 0; kh < kh_size; ++kh) {
+        for (iree_hal_dim_t kw = 0; kw < kw_size; ++kw) {
+          iree_hal_dim_t inp_idx = convert_to_1d_index(
+              c_size, h_size, w_size, n, ic, (oh * sh_size + kh * dh_size),
+              (ow * sw_size + kw * dw_size));
+          iree_hal_dim_t krnl_idx =
+              convert_to_1d_index(c_size, kh_size, kw_size, oc, ic, kh, kw);
+          acc += input_data[inp_idx] * kernel_data[krnl_idx];
+        }
+      }
+    }
+    // Store once after the whole reduction (this also covers c_size == 0),
+    // matching the layout 1 branch below.
+    result_data[out_idx] = acc;
+  } else if (layout == 1) {
+    // The layout of output tensor is NxOHxOWxf
+    iree_hal_dim_t out_idx =
+        convert_to_1d_index(oh_size, ow_size, f_size, n, oh, ow, oc);
+    float acc = acc_data ? acc_data[out_idx] : 0;
+    for (iree_hal_dim_t kh = 0; kh < kh_size; ++kh) {
+      for (iree_hal_dim_t kw = 0; kw < kw_size; ++kw) {
+        for (iree_hal_dim_t ic = 0; ic < c_size; ++ic) {
+          iree_hal_dim_t inp_idx = convert_to_1d_index(
+              h_size, w_size, c_size, n, (oh * sh_size + kh * dh_size),
+              (ow * sw_size + kw * dw_size), ic);
+          iree_hal_dim_t krnl_idx =
+              convert_to_1d_index(kw_size, c_size, f_size, kh, kw, ic, oc);
+          acc += input_data[inp_idx] * kernel_data[krnl_idx];
+        }
+      }
+    }
+    result_data[out_idx] = acc;
+  }
+}
+
+// Computes the single output element at (n, oc, oh, ow) by forwarding to the
+// typed reference kernel that matches (input_type, kernel_type, acc_type).
+// Handled combinations: i8*i8+i32, f16*f16+f32, f16*f16+f16, f32*f32+f32;
+// anything else returns IREE_STATUS_INVALID_ARGUMENT.
+static iree_status_t reference_conv2d_element(
+    iree_hal_dim_t n_size, iree_hal_dim_t c_size, iree_hal_dim_t h_size,
+    iree_hal_dim_t w_size, iree_hal_dim_t f_size, iree_hal_dim_t kh_size,
+    iree_hal_dim_t kw_size, iree_hal_dim_t layout, iree_hal_dim_t sh_size,
+    iree_hal_dim_t sw_size, iree_hal_dim_t dh_size, iree_hal_dim_t dw_size,
+    iree_hal_dim_t oh_size, iree_hal_dim_t ow_size,
+    iree_hal_element_type_t input_type, iree_hal_element_type_t kernel_type,
+    iree_hal_element_type_t acc_type, void *input_data, void *kernel_data,
+    void *acc_data, void *result_data, iree_hal_dim_t n, iree_hal_dim_t oc,
+    iree_hal_dim_t oh, iree_hal_dim_t ow) {
+  const iree_hal_element_type_t kF32 = IREE_HAL_ELEMENT_TYPE_FLOAT_32;
+  const iree_hal_element_type_t kF16 = IREE_HAL_ELEMENT_TYPE_FLOAT_16;
+  const iree_hal_element_type_t kI8 = IREE_HAL_ELEMENT_TYPE_INT_8;
+  const iree_hal_element_type_t kI32 = IREE_HAL_ELEMENT_TYPE_INT_32;
+  iree_status_t status = iree_ok_status();
+  if (input_type == kI8 && kernel_type == kI8 && acc_type == kI32) {
+    reference_conv2d_i8_i8_i32_i32(
+        n_size, c_size, h_size, w_size, f_size, kh_size, kw_size, layout,
+        sh_size, sw_size, dh_size, dw_size, oh_size, ow_size,
+        (const int8_t *)input_data, (const int8_t *)kernel_data,
+        (const int32_t *)acc_data, (int32_t *)result_data, n, oc, oh, ow);
+  } else if (input_type == kF16 && kernel_type == kF16 && acc_type == kF32) {
+    reference_conv2d_f16_f16_f32_f32(
+        n_size, c_size, h_size, w_size, f_size, kh_size, kw_size, layout,
+        sh_size, sw_size, dh_size, dw_size, oh_size, ow_size,
+        (const uint16_t *)input_data, (const uint16_t *)kernel_data,
+        (const float *)acc_data, (float *)result_data, n, oc, oh, ow);
+  } else if (input_type == kF16 && kernel_type == kF16 && acc_type == kF16) {
+    reference_conv2d_f16_f16_f16_f16(
+        n_size, c_size, h_size, w_size, f_size, kh_size, kw_size, layout,
+        sh_size, sw_size, dh_size, dw_size, oh_size, ow_size,
+        (const uint16_t *)input_data, (const uint16_t *)kernel_data,
+        (const uint16_t *)acc_data, (uint16_t *)result_data, n, oc, oh, ow);
+  } else if (input_type == kF32 && kernel_type == kF32 && acc_type == kF32) {
+    reference_conv2d_f32_f32_f32_f32(
+        n_size, c_size, h_size, w_size, f_size, kh_size, kw_size, layout,
+        sh_size, sw_size, dh_size, dw_size, oh_size, ow_size,
+        (const float *)input_data, (const float *)kernel_data,
+        (const float *)acc_data, (float *)result_data, n, oc, oh, ow);
+  } else {
+    status = iree_make_status(
+        IREE_STATUS_INVALID_ARGUMENT,
+        "unhandled combination of element types in conv2d input_type: %d,"
+        " kernel_type: %d, acc_type: %d",
+        input_type, kernel_type, acc_type);
+  }
+  return status;
+}
+
+// Output extent along one axis of an unpadded conv: (i - d*(k-1) - 1) / s + 1.
+// Yields 0 when the dilated kernel is larger than the input extent.
+static iree_hal_dim_t out_shape_calc(iree_hal_dim_t i_shape,
+                                     iree_hal_dim_t k_shape,
+                                     iree_hal_dim_t stride,
+                                     iree_hal_dim_t dilation) {
+  const iree_hal_dim_t span = dilation * (k_shape - 1) + 1;  // dilated kernel
+  return i_shape < span ? 0 : (i_shape - span) / stride + 1;
+}
+
+// Reference conv2d implementation, used to compare conv2d results against.
+// layout 0 is NCHW input / FCHW kernel, layout 1 is NHWC input / HWCF kernel;
+// other layouts return INVALID_ARGUMENT. No padding. |compute_every| is unused.
+static iree_status_t reference_conv2d(
+    iree_hal_dim_t n_size, iree_hal_dim_t c_size, iree_hal_dim_t h_size,
+    iree_hal_dim_t w_size, iree_hal_dim_t f_size, iree_hal_dim_t kh_size,
+    iree_hal_dim_t kw_size, iree_hal_dim_t layout, iree_hal_dim_t sh_size,
+    iree_hal_dim_t sw_size, iree_hal_dim_t dh_size, iree_hal_dim_t dw_size,
+    iree_hal_element_type_t input_type, iree_hal_element_type_t kernel_type,
+    iree_hal_element_type_t acc_type, iree_byte_span_t input_contents,
+    iree_byte_span_t kernel_contents, iree_byte_span_t acc_contents,
+    iree_byte_span_t result_contents, int compute_every) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, n_size);
+  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, c_size);
+  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, h_size);
+  IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, w_size);
+  iree_hal_dim_t oh_size = out_shape_calc(h_size, kh_size, sh_size, dh_size);
+  iree_hal_dim_t ow_size = out_shape_calc(w_size, kw_size, sw_size, dw_size);
+  if (layout == 0) {
+    for (iree_hal_dim_t n = 0; n < n_size; ++n) {
+      for (iree_hal_dim_t oc = 0; oc < f_size; ++oc) {
+        for (iree_hal_dim_t oh = 0; oh < oh_size; ++oh) {
+          for (iree_hal_dim_t ow = 0; ow < ow_size; ++ow) {
+            IREE_RETURN_AND_END_ZONE_IF_ERROR(
+                z0,
+                reference_conv2d_element(
+                    n_size, c_size, h_size, w_size, f_size, kh_size, kw_size,
+                    layout, sh_size, sw_size, dh_size, dw_size, oh_size,
+                    ow_size, input_type, kernel_type, acc_type,
+                    input_contents.data, kernel_contents.data,
+                    acc_contents.data, result_contents.data, n, oc, oh, ow));
+          }
+        }
+      }
+    }
+  } else if (layout == 1) {
+    for (iree_hal_dim_t n = 0; n < n_size; ++n) {
+      for (iree_hal_dim_t oh = 0; oh < oh_size; ++oh) {
+        for (iree_hal_dim_t ow = 0; ow < ow_size; ++ow) {
+          for (iree_hal_dim_t oc = 0; oc < f_size; ++oc) {
+            IREE_RETURN_AND_END_ZONE_IF_ERROR(
+                z0,
+                reference_conv2d_element(
+                    n_size, c_size, h_size, w_size, f_size, kh_size, kw_size,
+                    layout, sh_size, sw_size, dh_size, dw_size, oh_size,
+                    ow_size, input_type, kernel_type, acc_type,
+                    input_contents.data, kernel_contents.data,
+                    acc_contents.data, result_contents.data, n, oc, oh, ow));
+          }
+        }
+      }
+    }
+  } else {
+    // Close the trace zone on this early-exit path too.
+    IREE_TRACE_ZONE_END(z0);
+    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
+                            "unhandled conv2d layout");
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+//===----------------------------------------------------------------------===//
+// Conv2d comparison/logging
+//===----------------------------------------------------------------------===//
+
+typedef struct {  // host-side snapshot of one conv2d test case
+  iree_allocator_t host_allocator;  // allocator used for the *_contents data
+  iree_hal_dim_t n;      // batch dim
+  iree_hal_dim_t c;      // input channels
+  iree_hal_dim_t h;      // input height
+  iree_hal_dim_t w;      // input width
+  iree_hal_dim_t f;      // output channels
+  iree_hal_dim_t kh;     // kernel height
+  iree_hal_dim_t kw;     // kernel width
+  iree_hal_dim_t layout; // conv layout, 0 : nchwxfchw (default); 1: nhwcxhwcf
+  iree_hal_dim_t sh;     // stride along height dim
+  iree_hal_dim_t sw;     // stride along width dim
+  iree_hal_dim_t dh;     // dilation along height dim
+  iree_hal_dim_t dw;     // dilation along width dim
+  iree_hal_element_type_t input_type;   // element type of the input view
+  iree_hal_element_type_t kernel_type;  // element type of the kernel view
+  iree_hal_element_type_t acc_type;     // element type of the acc view
+  iree_hal_element_type_t result_type;  // element type of the result view
+  iree_byte_span_t input_contents;     // host copy of the input buffer
+  iree_byte_span_t kernel_contents;    // host copy of the kernel buffer
+  iree_byte_span_t acc_contents;       // host copy of acc; zeroed if no acc
+  iree_byte_span_t actual_contents;    // presumably device output - confirm
+  iree_byte_span_t expected_contents;  // presumably reference output - confirm
+} conv2d_results_t;
+
+// Releases the host allocations owned by |results|. Declared ahead of its
+// definition because conv2d_results_initialize calls it on its failure path.
+static void conv2d_results_deinitialize(conv2d_results_t *results);
+
+// Captures everything needed to validate one conv2d invocation on the host.
+//
+// Records the problem sizes, |layout|, strides and dilations in
+// |out_results|, then copies the contents of the |input|, |kernel|, optional
+// |acc| and |result| buffer views from |device| into memory obtained from
+// |host_allocator|. Storage for the expected (reference) result is allocated
+// with the same byte length as |result| but is not populated here.
+//
+// |acc| may be NULL. In that case acc_contents stays empty and acc_type falls
+// back to the element type of |result|.
+//
+// On success the caller owns |out_results| and must release it with
+// conv2d_results_deinitialize. On failure every partial allocation has
+// already been released here, so the caller must not deinitialize it again.
+static iree_status_t conv2d_results_initialize(
+    iree_hal_device_t *device, iree_hal_dim_t n_size, iree_hal_dim_t c_size,
+    iree_hal_dim_t h_size, iree_hal_dim_t w_size, iree_hal_dim_t f_size,
+    iree_hal_dim_t kh_size, iree_hal_dim_t kw_size, iree_hal_dim_t layout,
+    iree_hal_dim_t sh_size, iree_hal_dim_t sw_size, iree_hal_dim_t dh_size,
+    iree_hal_dim_t dw_size, iree_hal_buffer_view_t *input,
+    iree_hal_buffer_view_t *kernel, iree_hal_buffer_view_t *acc,
+    iree_hal_buffer_view_t *result, iree_allocator_t host_allocator,
+    conv2d_results_t *out_results) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  // Start from an all-zero struct so that the failure path can hand any
+  // not-yet-allocated (NULL) spans to conv2d_results_deinitialize.
+  memset(out_results, 0, sizeof(*out_results));
+  out_results->host_allocator = host_allocator;
+
+  out_results->n = n_size;
+  out_results->c = c_size;
+  out_results->h = h_size;
+  out_results->w = w_size;
+  out_results->f = f_size;
+  out_results->kh = kh_size;
+  out_results->kw = kw_size;
+  out_results->layout = layout;
+  out_results->sh = sh_size;
+  out_results->sw = sw_size;
+  out_results->dh = dh_size;
+  out_results->dw = dw_size;
+
+  out_results->input_type = iree_hal_buffer_view_element_type(input);
+  out_results->kernel_type = iree_hal_buffer_view_element_type(kernel);
+  out_results->result_type = iree_hal_buffer_view_element_type(result);
+  // |acc| is optional (see the NULL checks below): fall back to the result
+  // element type instead of querying a NULL buffer view.
+  out_results->acc_type = acc ? iree_hal_buffer_view_element_type(acc)
+                              : out_results->result_type;
+
+  iree_hal_buffer_t *input_buffer = iree_hal_buffer_view_buffer(input);
+  iree_hal_buffer_t *kernel_buffer = iree_hal_buffer_view_buffer(kernel);
+  iree_hal_buffer_t *acc_buffer = acc ? iree_hal_buffer_view_buffer(acc) : NULL;
+  iree_hal_buffer_t *result_buffer = iree_hal_buffer_view_buffer(result);
+
+  // Each step below only runs while |status| is still OK, so the first
+  // failure short-circuits the remaining allocations and transfers.
+  iree_status_t status = iree_ok_status();
+
+  // Input tensor: allocate host storage, then copy device -> host.
+  if (iree_status_is_ok(status)) {
+    out_results->input_contents.data_length =
+        iree_hal_buffer_byte_length(input_buffer);
+    status = iree_allocator_malloc(host_allocator,
+                                   out_results->input_contents.data_length,
+                                   (void **)&out_results->input_contents.data);
+  }
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_device_transfer_d2h(
+        device, input_buffer, 0, out_results->input_contents.data,
+        out_results->input_contents.data_length,
+        IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout());
+  }
+
+  // Kernel (filter) tensor.
+  if (iree_status_is_ok(status)) {
+    out_results->kernel_contents.data_length =
+        iree_hal_buffer_byte_length(kernel_buffer);
+    status = iree_allocator_malloc(host_allocator,
+                                   out_results->kernel_contents.data_length,
+                                   (void **)&out_results->kernel_contents.data);
+  }
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_device_transfer_d2h(
+        device, kernel_buffer, 0, out_results->kernel_contents.data,
+        out_results->kernel_contents.data_length,
+        IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout());
+  }
+
+  // Accumulator tensor, only when one was provided.
+  if (acc_buffer) {
+    if (iree_status_is_ok(status)) {
+      out_results->acc_contents.data_length =
+          iree_hal_buffer_byte_length(acc_buffer);
+      status = iree_allocator_malloc(host_allocator,
+                                     out_results->acc_contents.data_length,
+                                     (void **)&out_results->acc_contents.data);
+    }
+    if (iree_status_is_ok(status)) {
+      status = iree_hal_device_transfer_d2h(
+          device, acc_buffer, 0, out_results->acc_contents.data,
+          out_results->acc_contents.data_length,
+          IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout());
+    }
+  }
+
+  // Actual result produced by the device.
+  if (iree_status_is_ok(status)) {
+    out_results->actual_contents.data_length =
+        iree_hal_buffer_byte_length(result_buffer);
+    status = iree_allocator_malloc(host_allocator,
+                                   out_results->actual_contents.data_length,
+                                   (void **)&out_results->actual_contents.data);
+  }
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_device_transfer_d2h(
+        device, result_buffer, 0, out_results->actual_contents.data,
+        out_results->actual_contents.data_length,
+        IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout());
+  }
+
+  // Expected result: same byte length as the actual result; allocated only,
+  // no transfer is performed for it here.
+  if (iree_status_is_ok(status)) {
+    out_results->expected_contents.data_length =
+        iree_hal_buffer_byte_length(result_buffer);
+    status = iree_allocator_malloc(
+        host_allocator, out_results->expected_contents.data_length,
+        (void **)&out_results->expected_contents.data);
+  }
+
+  // Release whatever was allocated before the failing step.
+  if (!iree_status_is_ok(status)) {
+    conv2d_results_deinitialize(out_results);
+  }
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+// Frees every host-side tensor copy held by |results| through the allocator
+// stored in the struct. The struct itself is neither freed nor reset.
+static void conv2d_results_deinitialize(conv2d_results_t *results) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  iree_allocator_t allocator = results->host_allocator;
+
+  iree_allocator_free(allocator, results->input_contents.data);
+  iree_allocator_free(allocator, results->kernel_contents.data);
+
+  // The accumulator copy is released only when its span is non-empty.
+  const bool has_acc_contents = !iree_byte_span_is_empty(results->acc_contents);
+  if (has_acc_contents) {
+    iree_allocator_free(allocator, results->acc_contents.data);
+  }
+
+  iree_allocator_free(allocator, results->actual_contents.data);
+  iree_allocator_free(allocator, results->expected_contents.data);
+
+  IREE_TRACE_ZONE_END(z0);
+}
+
+// Helper for check_conv2d_results. Invokes reference_conv2d on the captured
+// input/kernel/acc contents with |results->expected_contents| as the span it
+// is given for the expected values, then walks the output in (n, oc, oh, ow)
+// order and compares every |check_every|-th element of the actual contents
+// against the expected contents. On the first disagreement the offending
+// coordinates are printed to |file| and an ABORTED status is returned.
+// TODO: Add detailed logging to |file|.
+static iree_status_t check_conv2d_results_impl(FILE *file,
+                                               const conv2d_results_t *results,
+                                               int check_every) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+
+  IREE_RETURN_AND_END_ZONE_IF_ERROR(
+      z0,
+      reference_conv2d(
+          results->n, results->c, results->h, results->w, results->f,
+          results->kh, results->kw, results->layout, results->sh, results->sw,
+          results->dh, results->dw, results->input_type, results->kernel_type,
+          results->acc_type, results->input_contents, results->kernel_contents,
+          results->acc_contents, results->expected_contents, check_every));
+
+  // Output spatial extents derived from input size, kernel size, stride and
+  // dilation along each axis.
+  iree_hal_dim_t out_height =
+      out_shape_calc(results->h, results->kh, results->sh, results->dh);
+  iree_hal_dim_t out_width =
+      out_shape_calc(results->w, results->kw, results->sw, results->dw);
+
+  // Number of elements visited since the last comparison.
+  int since_last_check = 0;
+
+  for (iree_hal_dim_t batch = 0; batch < results->n; ++batch) {
+    for (iree_hal_dim_t out_channel = 0; out_channel < results->f;
+         ++out_channel) {
+      for (iree_hal_dim_t out_row = 0; out_row < out_height; ++out_row) {
+        for (iree_hal_dim_t out_col = 0; out_col < out_width; ++out_col) {
+          ++since_last_check;
+          if (since_last_check >= check_every) {
+            since_last_check = 0;
+
+            iree_hal_dim_t flat_index =
+                convert_to_1d_index(results->f, out_height, out_width, batch,
+                                    out_channel, out_row, out_col);
+            iree_test_utils_e2e_value_t actual_value =
+                iree_test_utils_read_buffer_element(
+                    flat_index, results->result_type,
+                    results->actual_contents.data);
+            iree_test_utils_e2e_value_t expected_value =
+                iree_test_utils_read_buffer_element(
+                    flat_index, results->result_type,
+                    results->expected_contents.data);
+
+            if (iree_test_utils_result_elements_agree(actual_value,
+                                                      expected_value)) {
+              continue;
+            }
+
+            // First mismatch: report where it happened and stop.
+            fprintf(
+                file,
+                "\n\nerror: the actual and expected result tensors disagree "
+                "at n %" PRIdim ", oc %" PRIdim ", oh %" PRIdim ", ow %" PRIdim
+                ".\n\n",
+                batch, out_channel, out_row, out_col);
+            IREE_TRACE_ZONE_END(z0);
+            return iree_make_status(IREE_STATUS_ABORTED);
+          }
+        }
+      }
+    }
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return iree_ok_status();
+}
+
+// Validates a captured conv2d (all data host-local in |results|) against the
+// reference implementation by delegating to check_conv2d_results_impl.
+// Returns the status of that check; mismatch details are written to |file|.
+// TODO: Add detailed logging to |file|.
+static iree_status_t check_conv2d_results(FILE *file,
+                                          const conv2d_results_t *results) {
+  IREE_TRACE_ZONE_BEGIN(z0);
+  // TODO: Increase the check every param to reduce the number of comparisons.
+  int check_every = 1;
+  iree_status_t status = check_conv2d_results_impl(file, results, check_every);
+
+  // A failure found while sampling (check_every > 1) did not log a useful
+  // numerical summary, since most reference entries were never computed.
+  // In that case discard the status and repeat the check on every element.
+  const bool sampled_run_failed =
+      check_every > 1 && !iree_status_is_ok(status);
+  if (sampled_run_failed) {
+    iree_status_ignore(status);
+    status = check_conv2d_results_impl(file, results, 1);
+  }
+
+  IREE_TRACE_ZONE_END(z0);
+  return status;
+}
+
+//===----------------------------------------------------------------------===//
+// `conv2d_test` custom module
+//===----------------------------------------------------------------------===//
+// This uses the C++ wrapper to keep things simple. Though easier to use, it
+// has additional overhead/code-size bloat, which doesn't matter in a test like
+// this. A C module builder API that removes the boilerplate is still TBD, so
+// everything in this file except this module is written in C; that lets us
+// swap the module back to pure C in the future.
+
+namespace iree {
+
+// Per-context state of the `conv2d_test` module. Holds the host allocator used
+// for the temporary host copies made while checking results.
+class Conv2dTestModuleState final {
+public:
+  explicit Conv2dTestModuleState(iree_allocator_t host_allocator)
+      : host_allocator_(host_allocator) {}
+  ~Conv2dTestModuleState() = default;
+
+  // Creates a 4-d buffer view of shape [dim0, dim1, dim2, dim3] and
+  // |element_type| on |device| and fills it with pseudorandom values. |seed|
+  // initializes the pseudorandom generator; the values are reproducible both
+  // across runs and across machines.
+  StatusOr<vm::ref<iree_hal_buffer_view_t>>
+  GenerateRandom4dTensor(const vm::ref<iree_hal_device_t> device, int64_t dim0,
+                         int64_t dim1, int64_t dim2, int64_t dim3,
+                         iree_hal_element_type_t element_type, int32_t seed) {
+    iree_hal_dim_t shape[4] = {
+        (iree_hal_dim_t)dim0,
+        (iree_hal_dim_t)dim1,
+        (iree_hal_dim_t)dim2,
+        (iree_hal_dim_t)dim3,
+    };
+
+    iree_hal_buffer_params_t params = {0};
+    params.type = IREE_HAL_MEMORY_TYPE_OPTIMAL_FOR_DEVICE;
+    params.access = IREE_HAL_MEMORY_ACCESS_ALL;
+    params.usage = IREE_HAL_BUFFER_USAGE_DEFAULT;
+
+    // Inputs forwarded to the capture-less fill callback via |user_data|.
+    struct fill_args_t {
+      iree_hal_element_type_t element_type;
+      int32_t seed;
+    } fill_args = {
+        element_type,
+        seed,
+    };
+
+    vm::ref<iree_hal_buffer_view_t> tensor_view;
+    IREE_RETURN_IF_ERROR(iree_hal_buffer_view_generate_buffer(
+        device.get(), iree_hal_device_allocator(device.get()),
+        IREE_ARRAYSIZE(shape), shape, element_type,
+        IREE_HAL_ENCODING_TYPE_DENSE_ROW_MAJOR, params,
+        +[](iree_hal_buffer_mapping_t *mapping, void *user_data) {
+          fill_args_t args = *(fill_args_t *)user_data;
+          iree_byte_span_t contents = mapping->contents;
+
+          // Integer-valued samples are drawn as |lowest| plus a pseudorandom
+          // offset below |sample_range|.
+          int32_t lowest = 0;
+          int32_t highest = 0;
+          iree_test_utils_get_min_max_for_element_type(args.element_type,
+                                                       &lowest, &highest);
+          // divided by 4 to make numerical behavior more stable
+          uint32_t sample_range = (highest - lowest + 1) / 4;
+
+          iree_host_size_t stride =
+              iree_hal_element_dense_byte_count(args.element_type);
+          uint32_t prng_state = args.seed;
+
+          // Write one element per |stride| bytes until the span is full.
+          uint8_t *cursor = contents.data;
+          uint8_t *limit = contents.data + contents.data_length;
+          while (cursor < limit) {
+            int32_t sample = (int32_t)iree_test_utils_pseudorandom_range(
+                                 &prng_state, sample_range) +
+                             lowest;
+            iree_test_utils_write_element(args.element_type, sample, cursor);
+            cursor += stride;
+          }
+          return iree_ok_status();
+        },
+        &fill_args, &tensor_view));
+    return std::move(tensor_view);
+  }
+
+  // Copies |input|, |kernel|, |acc| and |actual_result| from |device| to the
+  // host and checks |actual_result| against the reference conv2d for the given
+  // sizes, |layout|, strides (sh, sw) and dilations (dh, dw). Mismatches are
+  // reported on stderr. Returns the status of the check, or the error from
+  // capturing the buffers.
+  Status
+  CheckConv2dResults(const vm::ref<iree_hal_device_t> device, int64_t n,
+                     int64_t c, int64_t h, int64_t w, int64_t f, int64_t kh,
+                     int64_t kw, int64_t layout, int64_t sh, int64_t sw,
+                     int64_t dh, int64_t dw,
+                     const vm::ref<iree_hal_buffer_view_t> input,
+                     const vm::ref<iree_hal_buffer_view_t> kernel,
+                     const vm::ref<iree_hal_buffer_view_t> acc,
+                     const vm::ref<iree_hal_buffer_view_t> actual_result) {
+    conv2d_results_t captured = {};
+    IREE_RETURN_IF_ERROR(conv2d_results_initialize(
+        device.get(), (iree_hal_dim_t)n, (iree_hal_dim_t)c, (iree_hal_dim_t)h,
+        (iree_hal_dim_t)w, (iree_hal_dim_t)f, (iree_hal_dim_t)kh,
+        (iree_hal_dim_t)kw, (iree_hal_dim_t)layout, (iree_hal_dim_t)sh,
+        (iree_hal_dim_t)sw, (iree_hal_dim_t)dh, (iree_hal_dim_t)dw, input.get(),
+        kernel.get(), acc.get(), actual_result.get(), host_allocator_,
+        &captured));
+    // The host copies are released whether or not the check passed.
+    iree_status_t check_status = check_conv2d_results(stderr, &captured);
+    conv2d_results_deinitialize(&captured);
+    return check_status;
+  }
+
+private:
+  iree_allocator_t host_allocator_;
+};
+
+// Function table of the `conv2d_test` module: pairs each exported function
+// name with the Conv2dTestModuleState method that implements it.
+static const vm::NativeFunction<Conv2dTestModuleState>
+    kConv2dTestModuleFunctions[] = {
+        // Implemented by GenerateRandom4dTensor.
+        vm::MakeNativeFunction("generate_random_tensor",
+                               &Conv2dTestModuleState::GenerateRandom4dTensor),
+        // Implemented by CheckConv2dResults.
+        vm::MakeNativeFunction("check_conv2d_results",
+                               &Conv2dTestModuleState::CheckConv2dResults),
+};
+
+// Native module wrapper for `conv2d_test`; inherits the NativeModule
+// constructors and supplies the per-context state factory.
+struct Conv2dTestModule final : public vm::NativeModule<Conv2dTestModuleState> {
+  using vm::NativeModule<Conv2dTestModuleState>::NativeModule;
+
+  // Builds a fresh Conv2dTestModuleState bound to |host_allocator|.
+  StatusOr<std::unique_ptr<Conv2dTestModuleState>>
+  CreateState(iree_allocator_t host_allocator) override {
+    auto state = std::make_unique<Conv2dTestModuleState>(host_allocator);
+    return std::move(state);
+  }
+};
+
+} // namespace iree
+
+// Instantiates the `conv2d_test` native module (version 0) for |instance| and
+// stores its VM module interface in |out_module|. The C++ module object is
+// released from its unique_ptr before the interface is handed out, so it is
+// not destroyed when this function returns.
+static iree_status_t conv2d_test_module_create(iree_vm_instance_t *instance,
+                                               iree_allocator_t host_allocator,
+                                               iree_vm_module_t **out_module) {
+  IREE_ASSERT_ARGUMENT(out_module);
+  *out_module = NULL;
+
+  using FunctionSpan =
+      iree::span<const iree::vm::NativeFunction<iree::Conv2dTestModuleState>>;
+  std::unique_ptr<iree::Conv2dTestModule> owned_module =
+      std::make_unique<iree::Conv2dTestModule>(
+          "conv2d_test", /*version=*/0, instance, host_allocator,
+          FunctionSpan(iree::kConv2dTestModuleFunctions));
+
+  iree::Conv2dTestModule *raw_module = owned_module.release();
+  *out_module = raw_module->interface();
+  return iree_ok_status();
+}
+
+// Entry point: parses flags, then loads and runs the e2e tests with the
+// `conv2d_test` module factory. Any positional argument left after flag
+// parsing is an error. A failing status is printed to stderr; an UNAVAILABLE
+// status still yields EXIT_SUCCESS, every other failure yields EXIT_FAILURE.
+int main(int argc, char **argv) {
+  IREE_TRACE_APP_ENTER();
+
+  iree_flags_parse_checked(IREE_FLAGS_PARSE_MODE_DEFAULT, &argc, &argv);
+  if (argc != 1) {
+    fprintf(stderr, "use --module= flags to specify the modules to run\n");
+    IREE_TRACE_APP_EXIT(EXIT_FAILURE);
+    return EXIT_FAILURE;
+  }
+
+  int exit_code = EXIT_SUCCESS;
+  iree_status_t run_status = iree_test_utils_load_and_run_e2e_tests(
+      iree_allocator_system(), conv2d_test_module_create);
+  if (!iree_status_is_ok(run_status)) {
+    iree_status_fprint(stderr, run_status);
+    // Query the status code before the status is freed.
+    if (!iree_status_is_unavailable(run_status)) {
+      exit_code = EXIT_FAILURE;
+    }
+    iree_status_free(run_status);
+  }
+
+  IREE_TRACE_APP_EXIT(exit_code);
+  return exit_code;
+}
diff --git a/linalg_ops/test_utils.c b/linalg_ops/test_utils.c
index 8b8aecd..9762861 100644
--- a/linalg_ops/test_utils.c
+++ b/linalg_ops/test_utils.c
@@ -194,7 +194,7 @@ bool iree_test_utils_result_elements_agree(iree_test_utils_e2e_value_t expected,
     // `require_exact_results` flag is set to `false`.
     case IREE_TEST_UTILS_VALUE_TYPE_F16:
       if (actual.f16_u16 == expected.f16_u16) return true;
-      if (iree_test_utils_max_elements_to_check()) return false;
+      if (iree_test_utils_require_exact_results()) return false;
       return fabsf(iree_math_f16_to_f32(actual.f16_u16) -
                    iree_math_f16_to_f32(expected.f16_u16)) <
              acceptable_fp_delta;