From b9fdcce3ee0aaacfb385bd9e510a7ec810388aa9 Mon Sep 17 00:00:00 2001 From: Quinn Dawkins Date: Thu, 15 Feb 2024 19:23:21 -0500 Subject: [PATCH] [Codegen] Re-Enable transform dialect configuration strategy round 2 (#16427) This time it just drops all transform dialect usage outside of transform library file path + entry point name. This reduces code complexity in `MaterializeUserConfigs`. Also cleans up some of the transform dialect tests to stop lit testing at the same time. We might want to consider dropping some of them as they aren't being maintained (the only thing they verify at the moment is that the transform scripts are valid for CUDA). --- .../Codegen/Common/MaterializeUserConfigs.cpp | 168 ++++++++++-------- .../src/iree/compiler/Codegen/Common/Passes.h | 3 +- .../TransformDialectInterpreterPass.cpp | 14 +- .../LLVMCPU/LLVMCPULowerExecutableTarget.cpp | 7 +- .../iree/compiler/Codegen/LLVMCPU/Passes.cpp | 5 +- .../iree/compiler/Codegen/LLVMCPU/Passes.h | 3 +- .../LLVMGPU/LLVMGPULowerExecutableTarget.cpp | 7 +- .../iree/compiler/Codegen/LLVMGPU/Passes.cpp | 5 +- .../iree/compiler/Codegen/LLVMGPU/Passes.h | 2 +- .../compiler/Codegen/LLVMGPU/test/BUILD.bazel | 5 +- .../Codegen/LLVMGPU/test/CMakeLists.txt | 1 + .../LLVMGPU/test/linalg_transform.mlir | 4 +- ...nsform_dialect_codegen_bufferize_spec.mlir | 5 + ...m_dialect_codegen_foreach_to_gpu_spec.mlir | 5 + ...transform_dialect_vector_to_nvgpu_mma.mlir | 0 .../iree/compiler/Codegen/SPIRV/Passes.cpp | 10 +- .../src/iree/compiler/Codegen/SPIRV/Passes.h | 3 +- .../SPIRV/SPIRVLowerExecutableTargetPass.cpp | 7 +- samples/transform_dialect/example_module.mlir | 25 +-- .../transform_dialect/transform_library.mlir | 103 ++++++++++- tests/e2e/linalg_transform/BUILD.bazel | 33 ---- tests/e2e/linalg_transform/CMakeLists.txt | 30 ---- .../linalg_transform/linalg_transform.mlir | 45 ----- .../transform_dialect_codegen_spec.mlir | 5 - .../transform_dialect_dispatch_spec.mlir | 7 - tests/transform_dialect/cpu/attention.mlir | 3 +- .../cpu/attention_codegen_spec.mlir | 5 + tests/transform_dialect/cpu/matmul.mlir | 15 +- .../cpu/matmul_codegen_default_spec.mlir | 5 + .../cpu/matmul_library_call.mlir | 6 +- tests/transform_dialect/cuda/BUILD.bazel | 1 - tests/transform_dialect/cuda/CMakeLists.txt | 2 - .../cuda/double_mma_layout_analysis.mlir | 3 +- ...uble_mma_layout_analysis_codegen_spec.mlir | 5 + .../cuda/eltwise_reduction.mlir | 60 ------- .../cuda/eltwise_reduction_codegen_spec.mlir | 103 ----------- .../cuda/eltwise_reduction_eltwise.mlir | 63 ------- ...ltwise_reduction_eltwise_codegen_spec.mlir | 111 ------------ ...elemwise_layout_analysis_codegen_spec.mlir | 115 ++++++------ .../cuda/mma_reduction_layout_analysis.mlir | 3 +- ...eduction_layout_analysis_codegen_spec.mlir | 4 + ...ma_using_layout_analysis_codegen_spec.mlir | 5 + tests/transform_dialect/cuda/reduction.mlir | 46 +---- .../cuda/reduction_codegen_spec.mlir | 5 + .../cuda/reduction_eltwise.mlir | 48 +---- .../cuda/reduction_eltwise_codegen_spec.mlir | 5 + .../transform_dialect/cuda/reduction_v2.mlir | 47 +---- .../cuda/reduction_v2_codegen_spec.mlir | 5 + .../cuda/reduction_v2_uneven.mlir | 42 +---- tests/transform_dialect/cuda/softmax.mlir | 25 +-- .../cuda/softmax_codegen_spec.mlir | 5 + .../cuda/softmax_partial.mlir | 21 +-- .../cuda/softmax_partial_codegen_spec.mlir | 5 + tests/transform_dialect/cuda/softmax_v2.mlir | 21 +-- .../cuda/softmax_v2_codegen_spec.mlir | 5 + 55 files changed, 393 insertions(+), 898 deletions(-) rename tests/transform_dialect/cuda/mma.mlir => 
compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_to_nvgpu_mma.mlir (100%) delete mode 100644 tests/e2e/linalg_transform/BUILD.bazel delete mode 100644 tests/e2e/linalg_transform/CMakeLists.txt delete mode 100644 tests/e2e/linalg_transform/linalg_transform.mlir delete mode 100644 tests/e2e/linalg_transform/transform_dialect_codegen_spec.mlir delete mode 100644 tests/e2e/linalg_transform/transform_dialect_dispatch_spec.mlir delete mode 100644 tests/transform_dialect/cuda/eltwise_reduction_codegen_spec.mlir delete mode 100644 tests/transform_dialect/cuda/eltwise_reduction_eltwise_codegen_spec.mlir diff --git a/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp b/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp index 038d3faafc08..1a918fe1c72e 100644 --- a/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp @@ -25,25 +25,47 @@ namespace mlir::iree_compiler { -llvm::cl::opt clCodegenTransformDialectStrategyName( - "iree-codegen-use-transform-dialect-strategy", - llvm::cl::desc( - "Broadcasts the given transform dialect strategy specification to all " - "dispatches. The specification is a symbol reference to load from a" - "library of transform specs (@library_call)"), - llvm::cl::init("")); - llvm::cl::opt clCodegenTransformDialectLibraryFileName( "iree-codegen-transform-dialect-library", llvm::cl::desc( "File path to a module containing a library of transform dialect" - "strategies"), + "strategies. Can be suffixed with the name of a transform sequence" + "within the library to run as preprocessing per executable variant." + "This is specified as @. If not specified," + "this will default to `__kernel_config`."), llvm::cl::init("")); namespace { static const char kTranslationInfoAttrName[] = "translation_info"; +enum StrategyRunResult { + Success = 0, + NotFound = 1, + Failed = 2, +}; + +static StrategyRunResult +runTransformConfigurationStrategy(Operation *payloadRoot, + StringRef entryPointName, + ModuleOp &transformLibrary) { + /// If we have a symbol, verify the existence of the symbol within the + /// transform library. + Operation *entryPoint = transform::detail::findTransformEntryPoint( + payloadRoot, transformLibrary, entryPointName); + if (!entryPoint) { + return StrategyRunResult::NotFound; + } + + transform::TransformOptions options; + if (failed(transform::applyTransformNamedSequence( + payloadRoot, entryPoint, transformLibrary, + options.enableExpensiveChecks(true)))) { + return StrategyRunResult::Failed; + } + return StrategyRunResult::Success; +} + struct MaterializeUserConfigsPass : public MaterializeUserConfigsBase { void getDependentDialects(DialectRegistry ®istry) const override { @@ -57,42 +79,73 @@ struct MaterializeUserConfigsPass getAllEntryPoints(moduleOp); MLIRContext *context = moduleOp.getContext(); + // Parse the file path and kernel config strategy from flags. There are + // two possible usage flows for transform dialect libraries. + // 1. Use `__kernel_config` to match and annotate variants with the + // strategy to use. This could either be a transform dialect strategy + // or any other IREE codegen pipeline. + // + // 2. Use the configuration strategy to do codegen directly. 
At the end of + // the strategy, the variant needs to be annotated with + // "translation_info" = #iree_codegen.translation_info + SmallVector parts; + llvm::SplitString(llvm::StringRef(clCodegenTransformDialectLibraryFileName), + parts, "@"); + if (parts.size() > 2) { + variantOp.emitError() + << "Invalid transform library path and sequence name " + << clCodegenTransformDialectLibraryFileName; + return signalPassFailure(); + } + bool hasTransformLibrary = !parts.empty(); + + std::string libraryFileName; + if (hasTransformLibrary) { + if (parts[0].empty()) { + variantOp.emitError() << "Cannot specify an empty library path"; + return signalPassFailure(); + } + libraryFileName = parts[0]; + } + + std::string entrySequenceName; + // Check if the user specified a custom entry point name. + if (parts.size() == 2) { + if (parts[1].empty()) { + variantOp.emitError() << "Cannot specify an empty sequence name"; + return signalPassFailure(); + } + entrySequenceName = parts[1]; + } else { + entrySequenceName = "__kernel_config"; + } + LDBG("MaterializeUserConfigsPass on variant: " << variantOp); std::optional transformLibrary = std::nullopt; - if (!clCodegenTransformDialectLibraryFileName.empty()) { + if (hasTransformLibrary) { auto dialect = context->getOrLoadDialect(); - auto maybeTransformLibrary = dialect->getOrLoadTransformLibraryModule( - clCodegenTransformDialectLibraryFileName); + auto maybeTransformLibrary = + dialect->getOrLoadTransformLibraryModule(libraryFileName); if (failed(maybeTransformLibrary)) { - variantOp.emitError() << "failed to load transform library module: " - << clCodegenTransformDialectLibraryFileName; + variantOp.emitError() + << "failed to load transform library module: " << libraryFileName; return signalPassFailure(); } transformLibrary = *maybeTransformLibrary; - LDBG("--found transform library @" - << clCodegenTransformDialectLibraryFileName); - } + LDBG("--found transform library @" << libraryFileName); - IREE::Codegen::DispatchLoweringPassPipeline tdPipeline = - IREE::Codegen::DispatchLoweringPassPipeline::TransformDialectCodegen; - std::optional clTranslationInfo; - // Here we always set the pipeline strategy to transform dialect if the - // flag is non-empty to ensure we pick the right lowering pipeline in the - // event a strategy symbol is defined. - if (!clCodegenTransformDialectLibraryFileName.empty() || - !clCodegenTransformDialectStrategyName.empty()) { - StringRef strategyName = - (clCodegenTransformDialectStrategyName.empty()) - ? StringRef( - transform::TransformDialect::kTransformEntryPointSymbolName) - : clCodegenTransformDialectStrategyName; - clTranslationInfo = IREE::Codegen::TranslationInfoAttr::get( - context, tdPipeline, - /*codegenSpec=*/ - SymbolRefAttr::get(context, llvm::StringRef(strategyName)), - /*configuration=*/DictionaryAttr()); - LDBG("--clTranslationInfo: " << clTranslationInfo); + auto runResult = runTransformConfigurationStrategy( + variantOp, entrySequenceName, *transformLibrary); + if (runResult == StrategyRunResult::NotFound) { + variantOp.emitError() << "transform kernel config strategy `" + << entrySequenceName << " not found"; + return signalPassFailure(); + } else if (runResult == StrategyRunResult::Failed) { + variantOp.emitError() << "transform kernel config strategy `" + << entrySequenceName << "` failed to apply"; + return signalPassFailure(); + } } LDBG("--start iterating over: " @@ -106,6 +159,11 @@ struct MaterializeUserConfigsPass continue; } + /// Nothing to do if the export already has a config. 
+ if (getTranslationInfo(exportOp)) { + continue; + } + /// First, apply all user configs. auto res = funcOp.walk([&](Operation *op) { if (auto compilationInfo = getCompilationInfo(op)) { @@ -120,48 +178,14 @@ struct MaterializeUserConfigsPass moduleOp.emitOpError("error in setting user configuration"); return signalPassFailure(); } - - /// Let user configs take priority over the global strategy flag. - if (IREE::Codegen::TranslationInfoAttr exportedTranslationInfo = - getTranslationInfo(exportOp)) { - if (translationInfo) { - /// Currently codegen is rooted on the variant, meaning every entry - /// must go through the same codegen pipeline. For multi-targeting we - /// will want to have multiple functions per variant, as well as - /// multiple exports per variant, meaning eventually the nesting of - /// the translation pipeline will need to change to the function, or - /// we'll need another level of module op nesting. - if (exportedTranslationInfo != translationInfo.value()) { - moduleOp.emitOpError( - "unhandled compilation of entry point functions with different " - "translation info"); - return signalPassFailure(); - } - } else { - translationInfo = exportedTranslationInfo; - } - } else { - if (translationInfo && translationInfo != clTranslationInfo) { - moduleOp.emitOpError( - "unhandled compilation of entry point functions with translation " - "info optionality"); - return signalPassFailure(); - } - if (clTranslationInfo) { - translationInfo = clTranslationInfo; - if (failed(setTranslationInfo(funcOp, translationInfo.value()))) { - moduleOp.emitOpError("failed to set command line translation info"); - return signalPassFailure(); - } - } - } } LDBG("--guaranteed unique translationInfo: " << translationInfo); /// We only need to resolve symbols for transform dialect based strategies. if (!translationInfo || translationInfo.value().getDispatchLoweringPassPipeline() != - tdPipeline) { + IREE::Codegen::DispatchLoweringPassPipeline:: + TransformDialectCodegen) { return; } diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.h b/compiler/src/iree/compiler/Codegen/Common/Passes.h index e67f46e72ae3..48cfad4f0b09 100644 --- a/compiler/src/iree/compiler/Codegen/Common/Passes.h +++ b/compiler/src/iree/compiler/Codegen/Common/Passes.h @@ -267,7 +267,8 @@ createTileAndDistributeToWorkgroupsPass( /// Create an IREE-specific Transform dialect interpreter pass with all /// registrations necessary for IREE. -std::unique_ptr createTransformDialectInterpreterPass(); +std::unique_ptr +createTransformDialectInterpreterPass(StringRef transformSequenceName = ""); /// Pass to propagate type to avoid generating load/stores of illegal types. std::unique_ptr> diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformDialectInterpreterPass.cpp b/compiler/src/iree/compiler/Codegen/Common/TransformDialectInterpreterPass.cpp index 4852485b8c9d..a5b4ca276fff 100644 --- a/compiler/src/iree/compiler/Codegen/Common/TransformDialectInterpreterPass.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/TransformDialectInterpreterPass.cpp @@ -72,13 +72,19 @@ class TransformDialectInterpreterPass namespace mlir::iree_compiler { -extern llvm::cl::opt clCodegenTransformDialectStrategyName; extern llvm::cl::opt clCodegenTransformDialectLibraryFileName; /// Create a Transform dialect interpreter pass. 
-std::unique_ptr createTransformDialectInterpreterPass() { +std::unique_ptr +createTransformDialectInterpreterPass(StringRef transformSequenceName) { + StringRef libraryPath = ""; + SmallVector parts; + llvm::SplitString(llvm::StringRef(clCodegenTransformDialectLibraryFileName), + parts, "@"); + if (!parts.empty()) { + libraryPath = parts[0]; + } return std::make_unique( - clCodegenTransformDialectLibraryFileName, - clCodegenTransformDialectStrategyName); + libraryPath, transformSequenceName); } } // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp index 009305cebdb1..9a9cd79c4c1f 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp @@ -186,9 +186,12 @@ void LLVMCPULowerExecutableTargetPass::runOnOperation() { break; } // Transform-dialect pipelines. - case IREE::Codegen::DispatchLoweringPassPipeline::TransformDialectCodegen: - addTransformDialectPasses(pipeline); + case IREE::Codegen::DispatchLoweringPassPipeline::TransformDialectCodegen: { + SymbolRefAttr codegenSpec = translationInfo.value().getCodegenSpec(); + addTransformDialectPasses( + pipeline, codegenSpec ? codegenSpec.getLeafReference() : StringRef("")); break; + } default: moduleOp.emitOpError("Unsupported pipeline on CPU target."); return signalPassFailure(); diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp index 2f05877c3e59..a467ee2c216d 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp @@ -582,10 +582,11 @@ void addCPUDefaultPassPipeline(OpPassManager &passManager) { addCPUBufferizePasses(nestedModulePM); } -void addTransformDialectPasses(OpPassManager &passManager) { +void addTransformDialectPasses(OpPassManager &passManager, + StringRef entryPoint) { // Give control to the transform dialect. passManager.addPass( - mlir::iree_compiler::createTransformDialectInterpreterPass()); + mlir::iree_compiler::createTransformDialectInterpreterPass(entryPoint)); // Dropping the schedule is needed: // 1. if we want to embed the transform in the module: we should drop the // schedule once applied. diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h index 91a1cf6d3890..d9b7efa2f5e2 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.h @@ -169,7 +169,8 @@ void addTensorToVectorsPassPipeline(OpPassManager &passManager, bool lowerToVectors = true); /// Transform dialect-based common. -void addTransformDialectPasses(OpPassManager &passManager); +void addTransformDialectPasses(OpPassManager &passManager, + StringRef entryPoint); // Populates the passes needed to do tiling, decomposing, and vectorizing the // convolution ops. 
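A minimal library accepted by the consolidated flag, --iree-codegen-transform-dialect-library=<file path>[@<entry sequence>] (defaulting to __kernel_config when no @<entry sequence> suffix is given), might look like the sketch below. It is assembled from the spec patterns used elsewhere in this patch; the <None> pipeline parameter of the translation_info constant and the readonly argument attribute are illustrative assumptions, and a real configuration library would typically annotate a concrete pipeline (or a TransformDialectCodegen spec) instead.

module attributes { transform.with_named_sequence } {
  // Default configuration entry point, run once per hal.executable.variant
  // when the library flag is set without an explicit @<entry sequence>.
  transform.named_sequence @__kernel_config(%variant_op: !transform.any_op {transform.readonly}) {
    // Annotate every export with a translation_info attribute so that the
    // backend kernel configuration passes treat the variant as already
    // configured and let the user config pass through unperturbed.
    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    %info = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
    transform.annotate %exports "translation_info" = %info : !transform.any_op, !transform.any_param
    transform.yield
  }
}

Passing --iree-codegen-transform-dialect-library=path/to/library.mlir alone runs @__kernel_config; suffixing @some_sequence selects a different entry point, mirroring the parsing added to MaterializeUserConfigs.cpp above.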
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp index a89102cb5192..084d1ccbee9e 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp @@ -128,9 +128,12 @@ void LLVMGPULowerExecutableTargetPass::runOnOperation() { addGPUPackUnPackPasses(pipeline); break; // Transform-dialect pipelines. - case IREE::Codegen::DispatchLoweringPassPipeline::TransformDialectCodegen: - addGPUTransformDialectPasses(pipeline); + case IREE::Codegen::DispatchLoweringPassPipeline::TransformDialectCodegen: { + SymbolRefAttr codegenSpec = translationInfo.value().getCodegenSpec(); + addGPUTransformDialectPasses( + pipeline, codegenSpec ? codegenSpec.getLeafReference() : StringRef("")); break; + } // no pipeline specified, nothing to do. case IREE::Codegen::DispatchLoweringPassPipeline::None: return; diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp index 77c08850324a..c76d7fcca05a 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp @@ -740,9 +740,10 @@ static void addLowerToLLVMGPUPasses(OpPassManager &pm, bool forROCDL) { extern llvm::cl::opt clGPUCodegenTransformDialectDebugPayloadTag; extern llvm::cl::opt clGPUCodegenTransformDialectDebugTransformTag; -void addGPUTransformDialectPasses(OpPassManager &passManager) { +void addGPUTransformDialectPasses(OpPassManager &passManager, + StringRef entryPoint) { passManager.addPass( - mlir::iree_compiler::createTransformDialectInterpreterPass()); + mlir::iree_compiler::createTransformDialectInterpreterPass(entryPoint)); // Dropping the schedule is needed: // 1. if we want to embed the transform in the module: we should drop the diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h index 40dce21771d0..d1bed4be9d68 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h @@ -40,7 +40,7 @@ void addGPUPackUnPackPasses(OpPassManager &pm); void addGPUSimpleDistributePassPipeline(OpPassManager &pm); /// Transform dialect-based path. -void addGPUTransformDialectPasses(OpPassManager &pm); +void addGPUTransformDialectPasses(OpPassManager &pm, StringRef entryPoint); /// Lowering transpose using shared memory. 
void addGPUTransposePassPipeline(OpPassManager &pm); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel index 69b56cb70f00..d50b8e57b9b2 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel @@ -52,12 +52,13 @@ iree_lit_test_suite( "pack_shared_memory_alloc.mlir", "tensor_pad.mlir", "tensorcore_vectorization.mlir", - "transform_dialect_hoist_allocs.mlir", - "transform_dialect_vector_distribution.mlir", "transform_dialect_bufferize.mlir", "transform_dialect_eliminate_gpu_barriers.mlir", + "transform_dialect_hoist_allocs.mlir", "transform_dialect_pack_shared_memory_alloc.mlir", "transform_dialect_promote_operands.mlir", + "transform_dialect_vector_distribution.mlir", + "transform_dialect_vector_to_nvgpu_mma.mlir", "transform_distribute_forall.mlir", "transform_gpu_pipelining.mlir", "transform_vector_to_mma.mlir", diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt index c6e911346d5c..7cbffbcca38e 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt @@ -54,6 +54,7 @@ iree_lit_test_suite( "transform_dialect_pack_shared_memory_alloc.mlir" "transform_dialect_promote_operands.mlir" "transform_dialect_vector_distribution.mlir" + "transform_dialect_vector_to_nvgpu_mma.mlir" "transform_distribute_forall.mlir" "transform_gpu_pipelining.mlir" "transform_vector_to_mma.mlir" diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir index 7b119937f973..2a59b3594bd6 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir @@ -1,11 +1,11 @@ // RUN: iree-opt %s --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-codegen-llvmgpu-configuration-pipeline, iree-llvmgpu-lower-executable-target)))" \ // RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_bufferize_spec.mlir | \ +// RUN: --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_bufferize_spec.mlir@__transform_main | \ // RUN: FileCheck %s // RUN: iree-opt %s --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-codegen-llvmgpu-configuration-pipeline, iree-llvmgpu-lower-executable-target)))" \ // RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_foreach_to_gpu_spec.mlir | \ +// RUN: --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_foreach_to_gpu_spec.mlir@__transform_main | \ // RUN: FileCheck %s --check-prefix=FOREACH-TO-GPU #device_target_cuda = #hal.device.target<"cuda", {executable_targets = [#hal.executable.target<"cuda", "cuda-nvptx-fb", {target_arch = "sm_60"}>]}> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_bufferize_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_bufferize_spec.mlir index 472b508bb49b..3a47a94bb817 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_bufferize_spec.mlir +++ 
b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_bufferize_spec.mlir @@ -4,6 +4,11 @@ module attributes { transform.with_named_sequence } { transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> () %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> !transform.any_op %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + + // Annotate the exported function as already translated. + %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param + transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param transform.yield } } // module diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir index d8a1a99572bd..b15fe9812f95 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_codegen_foreach_to_gpu_spec.mlir @@ -40,6 +40,11 @@ module attributes { transform.with_named_sequence } { } : !transform.any_op transform.iree.apply_licm %memref_func : !transform.any_op transform.apply_cse to %memref_func : !transform.any_op + + // Annotate the exported function as already translated. + %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param + transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param transform.yield } } // module diff --git a/tests/transform_dialect/cuda/mma.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_to_nvgpu_mma.mlir similarity index 100% rename from tests/transform_dialect/cuda/mma.mlir rename to compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_to_nvgpu_mma.mlir diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp index a0b043fd7e8e..7018f13e3b8c 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp @@ -264,9 +264,10 @@ static void addSPIRVLoweringPasses(OpPassManager &pm, bool enableFastMath) { spirvPM.addPass(spirv::createSPIRVUpdateVCEPass()); } -void addSPIRVTransformDialectPasses(OpPassManager &passManager) { +void addSPIRVTransformDialectPasses(OpPassManager &passManager, + StringRef entryPoint) { passManager.addPass( - mlir::iree_compiler::createTransformDialectInterpreterPass()); + mlir::iree_compiler::createTransformDialectInterpreterPass(entryPoint)); // Dropping the schedule is needed: // 1. 
if we want to embed the transform in the module: we should drop the @@ -647,8 +648,9 @@ void addSPIRVSubgroupReducePassPipeline(OpPassManager &pm) { nestedModulePM.addPass(createCSEPass()); } -void addSPIRVTransformDialectPassPipeline(OpPassManager &pm) { - addSPIRVTransformDialectPasses(pm); +void addSPIRVTransformDialectPassPipeline(OpPassManager &pm, + StringRef entryPoint) { + addSPIRVTransformDialectPasses(pm, entryPoint); // Run GenericVectorization pass additionally to convert vectors into forms // needed for SPIR-V. diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/Passes.h b/compiler/src/iree/compiler/Codegen/SPIRV/Passes.h index ac91fe23ecc6..e9e0d40579c5 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/Passes.h +++ b/compiler/src/iree/compiler/Codegen/SPIRV/Passes.h @@ -44,7 +44,8 @@ void addSPIRVMatmulPromoteVectorizePassPipeline(OpPassManager &pm, void addSPIRVSubgroupReducePassPipeline(OpPassManager &pm); /// Pass pipeline to lower IREE HAL executables via transform dialect schedules. -void addSPIRVTransformDialectPassPipeline(OpPassManager &pm); +void addSPIRVTransformDialectPassPipeline(OpPassManager &pm, + StringRef entryPoint); /// Pass pipeline to lower winograd ops. This pipeline follows the /// SPIRVBaseVectorize pipeline with the following exception: diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVLowerExecutableTargetPass.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVLowerExecutableTargetPass.cpp index 721d9026a959..ded7a8ae9069 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVLowerExecutableTargetPass.cpp +++ b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVLowerExecutableTargetPass.cpp @@ -113,9 +113,12 @@ void SPIRVLowerExecutableTargetPass::runOnOperation() { case CodeGenPipeline::SPIRVWinogradVectorize: addSPIRVWinogradVectorizePassPipeline(pipeline); break; - case CodeGenPipeline::TransformDialectCodegen: - addSPIRVTransformDialectPassPipeline(pipeline); + case CodeGenPipeline::TransformDialectCodegen: { + SymbolRefAttr codegenSpec = translationInfo.value().getCodegenSpec(); + addSPIRVTransformDialectPassPipeline( + pipeline, codegenSpec ? codegenSpec.getLeafReference() : StringRef("")); break; + } // No pipeline specified, nothing to do. case CodeGenPipeline::None: return; diff --git a/samples/transform_dialect/example_module.mlir b/samples/transform_dialect/example_module.mlir index c5eab4696a76..1e4ac4e8ed4d 100644 --- a/samples/transform_dialect/example_module.mlir +++ b/samples/transform_dialect/example_module.mlir @@ -107,28 +107,29 @@ module attributes {hal.device.targets = [#hal.device.target<"vulkan", {executabl } /// We test first with threading off so that the printers are legible. 
-// R-UN: iree-compile %s --iree-hal-target-backends=vulkan \ -// R-UN: --iree-codegen-use-transform-dialect-strategy=transform_main \ -// R-UN: --iree-codegen-transform-dialect-library=%p/transform_library.mlir \ -// R-UN: --compile-from=executable-sources \ -// R-UN: --compile-to=executable-targets \ -// R-UN: --mlir-disable-threading | \ -// R-UN: FileCheck %s --check-prefixes=CODEGEN-PRINTER +// RUN: iree-compile %s --iree-hal-target-backends=vulkan \ +// RUN: --iree-codegen-transform-dialect-library=%p/transform_library.mlir@kernel_config \ +// RUN: --compile-from=executable-sources \ +// RUN: --compile-to=executable-targets \ +// RUN: --mlir-disable-threading | \ +// RUN: FileCheck %s --check-prefixes=CODEGEN-PRINTER -// CODEGEN-PRINTER: IR printer: Setting matmul strategy to default top-level -// CODEGEN-PRINTER: translation_info = #iree_codegen.translation_info // CODEGEN-PRINTER: IR printer: Setting reduce strategy to base vectorize top-level // CODEGEN-PRINTER: translation_info = #iree_codegen.translation_info, workgroup_size = [16 : index, 1 : index, 1 : index] /// Then test with threading to make sure it runs // RUN: iree-compile %s --iree-hal-target-backends=vulkan \ -// RUN: --iree-codegen-use-transform-dialect-strategy=@transform_main \ -// RUN: --iree-codegen-transform-dialect-library=%p/transform_library.mlir \ +// RUN: --iree-codegen-transform-dialect-library=%p/transform_library.mlir@kernel_config \ // RUN: --compile-from=executable-sources \ // RUN: --compile-to=executable-targets \ // RUN: --mlir-disable-threading | \ // RUN: FileCheck %s --check-prefixes=CODEGEN +// CODEGEN: Ran custom_transform_strategy // CODEGEN: spirv.func @example_module_dispatch_0_generic_80_f32 -// CODEGEN: spirv.func @example_module_dispatch_1_matmul_16x16x5_f32 +// CODEGEN: hal.executable private @example_module_dispatch_1 +// CODEGEN: #iree_codegen.translation_info +// CODEGEN: spirv.func @example_module_dispatch_1_matmul_16x16x5_f32 // CODEGEN: spirv.func @example_module_dispatch_2_generic_16x16_f32 diff --git a/samples/transform_dialect/transform_library.mlir b/samples/transform_dialect/transform_library.mlir index 3bb75ad6a39f..8b17af73ee13 100644 --- a/samples/transform_dialect/transform_library.mlir +++ b/samples/transform_dialect/transform_library.mlir @@ -1,13 +1,76 @@ module attributes { transform.with_named_sequence } { - // Print and send it down normal IREE codegen. - transform.named_sequence @custom_matmul(%matmul: !transform.any_op {transform.consumed}) { - %1 = transform.structured.generalize %matmul : (!transform.any_op) -> !transform.any_op - transform.print {name = "Setting matmul strategy to default"} + // Example of a custom matmul strategy. The target matmul is annotated with + // the name of this strategy down below before strategy selection, overriding + // default IREE codegen. + transform.named_sequence @custom_transform_strategy( + %variant_op: !transform.any_op {transform.consumed}) { + // Step 1. Re-match the matmul + // =========================================================================== + %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op + + // Step 2. 
Tile to grid + // =========================================================================== + %grid_reduction, %forall_grid = + transform.structured.tile_using_forall %matmul tile_sizes [16, 16] ( mapping = [#gpu.block, #gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> () + + // Step 3. Vectorize + // =========================================================================== + %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface + transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices + transform.apply_patterns.vector.cast_away_vector_leading_one_dim + } : !transform.any_op + %func_1 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op + + // Step 4. Bufferize + // =========================================================================== + transform.apply_patterns to %func_1 { + transform.apply_patterns.iree.fold_fill_into_pad + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.scf.for_loop_canonicalization + } : !transform.any_op + transform.apply_patterns to %func_1 { + transform.apply_patterns.tensor.reassociative_reshape_folding + transform.apply_patterns.canonicalization + } : !transform.any_op + transform.apply_cse to %func_1 : !transform.any_op + transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> () + transform.apply_patterns to %func_1 { + transform.apply_patterns.linalg.erase_unnecessary_inputs + } : !transform.any_op + %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op) + %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + + // Step 6. Post-bufferization vector distribution + // =========================================================================== + %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> () + transform.iree.map_nested_forall_to_gpu_threads %func_7 + workgroup_dims = [4, 8, 1] : (!transform.any_op) -> () + + // Step 7. Do layout analysis and lower to mma + // =========================================================================== + %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op) + transform.print {name = "Ran custom_transform_strategy"} + transform.yield + } + + // Send it down a custom transform dialect pipeline. 
+ transform.named_sequence @custom_matmul(%matmul: !transform.any_op {transform.readonly}) { + %variant_op = transform.get_parent_op %matmul {op_name = "hal.executable.variant"} : (!transform.any_op) -> !transform.any_op + %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op : (!transform.any_op) -> !transform.any_op + %subgroup_reduce = transform.param.constant #iree_codegen.translation_info -> !transform.any_param + transform.annotate %exports "translation_info" = %subgroup_reduce : !transform.any_op, !transform.any_param + transform.print {name = "Setting matmul strategy to custom_transform_strategy"} transform.yield } - // Send it down subgroup reduce. - transform.named_sequence @use_subgroup_reduce(%reduce: !transform.any_op {transform.readonly}) { + // Send it down subgroup reduce with a custom tiling configuration. + transform.named_sequence @use_base_vectorize(%reduce: !transform.any_op {transform.readonly}) { %variant_op = transform.get_parent_op %reduce {op_name = "hal.executable.variant"} : (!transform.any_op) -> !transform.any_op %lowering_config = transform.param.constant #iree_codegen.lowering_config -> !transform.any_param transform.annotate %reduce "lowering_config" = %lowering_config : !transform.any_op, !transform.any_param @@ -42,10 +105,34 @@ module attributes { transform.with_named_sequence } { transform.yield %matched : !transform.any_op } - transform.named_sequence @transform_main(%variant_op: !transform.any_op {transform.consumed}) { + // An example of a custom transform dialect based kernel config. Note that + // because of the way `transform.foreach_match` works, the callback cannot + // manipulate IR beyond the op *given* to the matcher, as foreach_match will + // attempt to keep walking the IR even after a successful match. The expected + // flow for a strategy like this is as follows: + // + // Author an entry point like this (@kernel_config) that walks the IR and + // attempts to annotate the dispatch with the codegen strategy to use, i.e. + // transform.foreach_match in %variant_op + // @matcher_0 -> @annotator_0, + // @matcher_1 -> @annotator_1, + // ... + // + // the annotators should attach an #iree_codegen.translation_info attribute + // to the `hal.executable.export` ops within the variant as well as any + // relevant op specific tile sizes (and other important attributes like + // workgroup_size and subgroup_size, if relevant). This will then get handed + // off to backend specific kernel config, which will let these user configs + // pass through unperturbed. + // + // To couple this with a transform dialect based codegen strategy, the target + // codegen strategy can be included inline with this library and relevant ops + // can be annotated with `TransformDialectCodegen` as the lowering pipeline, + // with a reference to the strategy to use (see an example above). + transform.named_sequence @kernel_config(%variant_op: !transform.any_op {transform.consumed}) { transform.foreach_match in %variant_op @match_matmul -> @custom_matmul, - @match_reduce -> @use_subgroup_reduce + @match_reduce -> @use_base_vectorize : (!transform.any_op) -> (!transform.any_op) transform.yield } diff --git a/tests/e2e/linalg_transform/BUILD.bazel b/tests/e2e/linalg_transform/BUILD.bazel deleted file mode 100644 index 9b5ffa4767e3..000000000000 --- a/tests/e2e/linalg_transform/BUILD.bazel +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright 2022 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. 
-# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite") - -package( - features = ["layering_check"], - licenses = ["notice"], # Apache 2.0 -) - -iree_lit_test_suite( - name = "check_linalg_transform", - srcs = ["linalg_transform.mlir"], - cfg = "//tests:lit.cfg.py", - # transform_dialect_xxx_spec are MLIR files that specify a transformation, - # they need to be included as data. - data = [ - "//tests/e2e/linalg_transform:transform_dialect_codegen_spec.mlir", - "//tests/e2e/linalg_transform:transform_dialect_dispatch_spec.mlir", - ], - tags = [ - "hostonly", - ], - tools = [ - "//tools:iree-opt", - "//tools:iree-run-mlir", - "@llvm-project//lld", - "@llvm-project//llvm:FileCheck", - ], -) diff --git a/tests/e2e/linalg_transform/CMakeLists.txt b/tests/e2e/linalg_transform/CMakeLists.txt deleted file mode 100644 index 38c6124399a9..000000000000 --- a/tests/e2e/linalg_transform/CMakeLists.txt +++ /dev/null @@ -1,30 +0,0 @@ -################################################################################ -# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from # -# tests/e2e/linalg_transform/BUILD.bazel # -# # -# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary # -# CMake-only content. # -# # -# To disable autogeneration for this file entirely, delete this header. # -################################################################################ - -iree_add_all_subdirs() - -iree_lit_test_suite( - NAME - check_linalg_transform - SRCS - "linalg_transform.mlir" - TOOLS - ${IREE_LLD_TARGET} - FileCheck - iree-opt - iree-run-mlir - DATA - iree::tests::e2e::linalg_transform::transform_dialect_codegen_spec.mlir - iree::tests::e2e::linalg_transform::transform_dialect_dispatch_spec.mlir - LABELS - "hostonly" -) - -### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ### diff --git a/tests/e2e/linalg_transform/linalg_transform.mlir b/tests/e2e/linalg_transform/linalg_transform.mlir deleted file mode 100644 index 796ec223e468..000000000000 --- a/tests/e2e/linalg_transform/linalg_transform.mlir +++ /dev/null @@ -1,45 +0,0 @@ -// R-UN: iree-run-mlir --Xcompiler,iree-hal-target-backends=llvm-cpu \ -/// Specify the dispatch region formation with the transform dialect. -// R-UN: --iree-flow-dispatch-use-transform-dialect=%p/transform_dialect_dispatch_spec.mlir \ -/// Specify the codegen strategy with the transform dialect. 
-// R-UN: --iree-codegen-use-transform-dialect-strategy=%p/transform_dialect_codegen_spec.mlir \ -// R-UN: %s | FileCheck %s - - -// RUN: iree-opt %s \ -// RUN: --iree-abi-transformation-pipeline \ -// RUN: --iree-flow-transformation-pipeline \ -// RUN: --iree-flow-dispatch-use-transform-dialect=%p/transform_dialect_dispatch_spec.mlir - -func.func @matmul_static() -> tensor<5x5xf32> { - %res = flow.tensor.constant dense<[ - [0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0]]> : tensor<5x5xf32> -> tensor<5x5xf32> - %lhs = flow.tensor.constant dense<[ - [15.0, 14.0, 13.0], - [12.0, 11.0, 10.0], - [09.0, 08.0, 07.0], - [06.0, 05.0, 04.0], - [03.0, 02.0, 01.0]]> : tensor<5x3xf32> -> tensor<5x3xf32> - %rhs = flow.tensor.constant dense<[ - [15.0, 14.0, 13.0, 12.0, 11.0], - [10.0, 09.0, 08.0, 07.0, 06.0], - [05.0, 04.0, 03.0, 02.0, 01.0]]> : tensor<3x5xf32> -> tensor<3x5xf32> - - %matmul = linalg.matmul - ins(%lhs, %rhs : tensor<5x3xf32>, tensor<3x5xf32>) - outs(%res : tensor<5x5xf32>) -> tensor<5x5xf32> - %matmul_res = util.optimization_barrier %matmul : tensor<5x5xf32> - - return %matmul_res : tensor<5x5xf32> -} - -// CHECK: 5x5xf32= -// CHECK-SAME: [430 388 346 304 262] -// CHECK-SAME: [340 307 274 241 208] -// CHECK-SAME: [250 226 202 178 154] -// CHECK-SAME: [160 145 130 115 100] -// CHECK-SAME: [70 64 58 52 46] diff --git a/tests/e2e/linalg_transform/transform_dialect_codegen_spec.mlir b/tests/e2e/linalg_transform/transform_dialect_codegen_spec.mlir deleted file mode 100644 index c95b85c3a987..000000000000 --- a/tests/e2e/linalg_transform/transform_dialect_codegen_spec.mlir +++ /dev/null @@ -1,5 +0,0 @@ -transform.sequence failures(propagate) { -^bb1(%variant_op: !transform.any_op): - %variant_op_2 = transform.iree.bufferize %variant_op - %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_2 : (!transform.any_op) -> !transform.any_op -} diff --git a/tests/e2e/linalg_transform/transform_dialect_dispatch_spec.mlir b/tests/e2e/linalg_transform/transform_dialect_dispatch_spec.mlir deleted file mode 100644 index 53fbec94150d..000000000000 --- a/tests/e2e/linalg_transform/transform_dialect_dispatch_spec.mlir +++ /dev/null @@ -1,7 +0,0 @@ -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %tiled_op, %foreach_op = transform.structured.tile_using_forall %0 num_threads [13, 33] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %dispatch_op = transform.iree.forall_to_flow %foreach_op : (!transform.any_op) -> !transform.any_op -} diff --git a/tests/transform_dialect/cpu/attention.mlir b/tests/transform_dialect/cpu/attention.mlir index 9dd587f0d271..b103ba62900a 100644 --- a/tests/transform_dialect/cpu/attention.mlir +++ b/tests/transform_dialect/cpu/attention.mlir @@ -9,8 +9,7 @@ func.func @attention() -> tensor<1x4x4xf32> { } // RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \ -// RUN: --iree-codegen-transform-dialect-library=%p/attention_codegen_spec.mlir \ -// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \ +// RUN: --iree-codegen-transform-dialect-library=%p/attention_codegen_spec.mlir@codegen | \ // RUN: iree-run-module --module=- --function=attention | \ // RUN: FileCheck %s --check-prefixes=EXEC diff --git a/tests/transform_dialect/cpu/attention_codegen_spec.mlir 
b/tests/transform_dialect/cpu/attention_codegen_spec.mlir index 5f22de1a8f7d..ab64721a2a12 100644 --- a/tests/transform_dialect/cpu/attention_codegen_spec.mlir +++ b/tests/transform_dialect/cpu/attention_codegen_spec.mlir @@ -63,6 +63,11 @@ module attributes { transform.with_named_sequence } { } : !transform.any_op transform.apply_cse to %func_8 : !transform.any_op transform.memref.erase_dead_alloc_and_stores %func_8 : (!transform.any_op) -> () + + // Annotate the exported function as already translated. + %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param + transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param transform.yield } // codegen diff --git a/tests/transform_dialect/cpu/matmul.mlir b/tests/transform_dialect/cpu/matmul.mlir index 246b71200277..39a2074f6043 100644 --- a/tests/transform_dialect/cpu/matmul.mlir +++ b/tests/transform_dialect/cpu/matmul.mlir @@ -11,20 +11,7 @@ func.func @matmul_static( // RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \ // RUN: --iree-opt-data-tiling=false \ -// RUN: --compile-to=executable-configurations | \ -// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs,iree-llvmcpu-lower-executable-target)))' \ -// RUN: --iree-codegen-transform-dialect-library=%p/matmul_codegen_default_spec.mlir \ -// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \ -// RUN: FileCheck %s --check-prefixes=CODEGEN-DEFAULT - -// CODEGEN-DEFAULT: hal.executable.export public @matmul_static_dispatch_0_matmul_3x3x5 -// CODEGEN-DEFAULT-DAG: %[[C1:.+]] = arith.constant 1 : index -// CODEGEN-DEFAULT-DAG: %[[C2:.+]] = arith.constant 2 : index -// CODEGEN-DEFAULT: hal.return %[[C2]], %[[C1]], %[[C1]] - -// RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \ -// RUN: --iree-opt-data-tiling=false \ -// RUN: --iree-codegen-use-transform-dialect-strategy=%p/matmul_codegen_default_spec.mlir | \ +// RUN: --iree-codegen-transform-dialect-library=%p/matmul_codegen_default_spec.mlir@codegen | \ // RUN: iree-run-module --module=- --function=matmul_static \ // RUN: --input="3x5xf32=1" \ // RUN: --input="5x3xf32=2" \ diff --git a/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir b/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir index 34eaa2fae252..1fa31050e506 100644 --- a/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir +++ b/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir @@ -22,6 +22,11 @@ module attributes { transform.with_named_sequence } { // ========================================================= %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> () + + // Annotate the exported function as already translated. 
+ %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param + transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param transform.yield } } // module diff --git a/tests/transform_dialect/cpu/matmul_library_call.mlir b/tests/transform_dialect/cpu/matmul_library_call.mlir index e2f066ad5419..ea93fbb37841 100644 --- a/tests/transform_dialect/cpu/matmul_library_call.mlir +++ b/tests/transform_dialect/cpu/matmul_library_call.mlir @@ -14,8 +14,7 @@ module { // RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \ // RUN: --iree-opt-data-tiling=false \ -// RUN: --iree-codegen-use-transform-dialect-strategy=custom_matmul \ -// RUN: --iree-codegen-transform-dialect-library=%p/transform_library.mlir \ +// RUN: --iree-codegen-transform-dialect-library=%p/transform_library.mlir@custom_matmul \ // RUN: --compile-to=executable-targets | \ // RUN: FileCheck %s --check-prefixes=CODEGEN-DEFAULT @@ -26,8 +25,7 @@ module { // RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \ // RUN: --iree-opt-data-tiling=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/transform_library.mlir \ -// RUN: --iree-codegen-use-transform-dialect-strategy=custom_matmul | \ +// RUN: --iree-codegen-transform-dialect-library=%p/transform_library.mlir@custom_matmul | \ // RUN: iree-run-module --module=- --function=matmul_static \ // RUN: --input="3x5xf32=1" \ // RUN: --input="5x3xf32=2" \ diff --git a/tests/transform_dialect/cuda/BUILD.bazel b/tests/transform_dialect/cuda/BUILD.bazel index b8bf5079ca26..a35e0dbe609f 100644 --- a/tests/transform_dialect/cuda/BUILD.bazel +++ b/tests/transform_dialect/cuda/BUILD.bazel @@ -28,7 +28,6 @@ endif() iree_lit_test_suite( name = "lit", srcs = [ - "mma.mlir", # TODO(#15892): reductions have flakes and need to be triaged. 
# "reduction.mlir", # "reduction_eltwise.mlir", diff --git a/tests/transform_dialect/cuda/CMakeLists.txt b/tests/transform_dialect/cuda/CMakeLists.txt index 347630b22d77..7534bc8bfd46 100644 --- a/tests/transform_dialect/cuda/CMakeLists.txt +++ b/tests/transform_dialect/cuda/CMakeLists.txt @@ -21,8 +21,6 @@ endif() iree_lit_test_suite( NAME lit - SRCS - "mma.mlir" TOOLS FileCheck iree-compile diff --git a/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir b/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir index cb53367d07f6..9293fd4ff0ab 100644 --- a/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir +++ b/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir @@ -15,8 +15,7 @@ func.func @double_matmul(%lhs : tensor<16x16xf16>, %rhs : tensor<16x16xf16>, %se // RUN: --iree-hal-cuda-llvm-target-arch=sm_80 \ // RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ // RUN: --iree-flow-dispatch-use-transform-dialect=%p/double_mma_layout_analysis_dispatch_spec.mlir \ -// RUN: --iree-codegen-transform-dialect-library=%p/double_mma_layout_analysis_codegen_spec.mlir \ -// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \ +// RUN: --iree-codegen-transform-dialect-library=%p/double_mma_layout_analysis_codegen_spec.mlir@codegen | \ // RUN: iree-run-module --module=- --function=double_matmul --device=cuda \ // RUN: --input="16x16xf16=[[0.0999755859375,0.2249755859375,0.07501220703125,0.0,0.07501220703125,0.2249755859375,0.175048828125,0.07501220703125,0.175048828125,0.07501220703125,0.024993896484375,0.1500244140625,0.1500244140625,0.2249755859375,0.199951171875,0.1500244140625],[0.1500244140625,0.199951171875,0.0999755859375,0.07501220703125,0.1500244140625,0.2249755859375,0.024993896484375,0.0999755859375,0.0999755859375,0.024993896484375,0.2249755859375,0.2249755859375,0.2249755859375,0.0,0.024993896484375,0.04998779296875],[0.07501220703125,0.0,0.125,0.125,0.04998779296875,0.2249755859375,0.024993896484375,0.199951171875,0.199951171875,0.07501220703125,0.1500244140625,0.2249755859375,0.024993896484375,0.175048828125,0.07501220703125,0.125],[0.04998779296875,0.024993896484375,0.0,0.2249755859375,0.07501220703125,0.024993896484375,0.024993896484375,0.0,0.07501220703125,0.1500244140625,0.1500244140625,0.175048828125,0.2249755859375,0.1500244140625,0.07501220703125,0.0999755859375],[0.125,0.0,0.199951171875,0.04998779296875,0.199951171875,0.04998779296875,0.175048828125,0.125,0.0,0.0,0.199951171875,0.024993896484375,0.2249755859375,0.1500244140625,0.024993896484375,0.0],[0.04998779296875,0.2249755859375,0.0999755859375,0.07501220703125,0.2249755859375,0.07501220703125,0.2249755859375,0.07501220703125,0.2249755859375,0.199951171875,0.125,0.07501220703125,0.04998779296875,0.199951171875,0.125,0.1500244140625],[0.1500244140625,0.125,0.175048828125,0.04998779296875,0.125,0.1500244140625,0.1500244140625,0.125,0.0999755859375,0.0,0.199951171875,0.024993896484375,0.175048828125,0.199951171875,0.125,0.0999755859375],[0.0999755859375,0.199951171875,0.0999755859375,0.0999755859375,0.2249755859375,0.0,0.175048828125,0.0999755859375,0.125,0.07501220703125,0.07501220703125,0.175048828125,0.07501220703125,0.0,0.2249755859375,0.2249755859375],[0.07501220703125,0.024993896484375,0.199951171875,0.024993896484375,0.175048828125,0.199951171875,0.0999755859375,0.024993896484375,0.0,0.0999755859375,0.0,0.0999755859375,0.2249755859375,0.175048828125,0.0,0.0],[0.024993896484375,0.0999755859375,0.2249755859375,0.2249755859375,0.125,0.2249755859375,0.04998779296875,0
.04998779296875,0.04998779296875,0.024993896484375,0.0999755859375,0.2249755859375,0.024993896484375,0.024993896484375,0.0,0.07501220703125],[0.0,0.1500244140625,0.175048828125,0.1500244140625,0.2249755859375,0.024993896484375,0.1500244140625,0.0999755859375,0.024993896484375,0.0,0.125,0.04998779296875,0.125,0.199951171875,0.024993896484375,0.199951171875],[0.024993896484375,0.04998779296875,0.199951171875,0.0,0.07501220703125,0.199951171875,0.2249755859375,0.04998779296875,0.175048828125,0.0,0.199951171875,0.199951171875,0.1500244140625,0.199951171875,0.125,0.199951171875],[0.1500244140625,0.125,0.04998779296875,0.0999755859375,0.04998779296875,0.175048828125,0.04998779296875,0.0999755859375,0.2249755859375,0.199951171875,0.125,0.1500244140625,0.0999755859375,0.07501220703125,0.07501220703125,0.0999755859375],[0.0,0.04998779296875,0.125,0.024993896484375,0.04998779296875,0.199951171875,0.04998779296875,0.0999755859375,0.199951171875,0.07501220703125,0.1500244140625,0.125,0.199951171875,0.199951171875,0.0,0.125],[0.024993896484375,0.07501220703125,0.0,0.199951171875,0.024993896484375,0.024993896484375,0.024993896484375,0.175048828125,0.04998779296875,0.04998779296875,0.04998779296875,0.07501220703125,0.07501220703125,0.1500244140625,0.175048828125,0.199951171875],[0.0,0.125,0.0,0.07501220703125,0.125,0.125,0.07501220703125,0.1500244140625,0.04998779296875,0.04998779296875,0.125,0.125,0.2249755859375,0.0999755859375,0.07501220703125,0.07501220703125]]" \ // RUN: --input="16x16xf16=[[0.175048828125,0.07501220703125,0.199951171875,0.0,0.175048828125,0.125,0.199951171875,0.04998779296875,0.0999755859375,0.175048828125,0.07501220703125,0.04998779296875,0.125,0.125,0.07501220703125,0.2249755859375],[0.024993896484375,0.199951171875,0.0,0.1500244140625,0.175048828125,0.0999755859375,0.175048828125,0.1500244140625,0.2249755859375,0.07501220703125,0.199951171875,0.0999755859375,0.0999755859375,0.2249755859375,0.0999755859375,0.0999755859375],[0.2249755859375,0.2249755859375,0.125,0.175048828125,0.0,0.07501220703125,0.04998779296875,0.0,0.199951171875,0.1500244140625,0.024993896484375,0.2249755859375,0.024993896484375,0.1500244140625,0.2249755859375,0.199951171875],[0.1500244140625,0.125,0.024993896484375,0.07501220703125,0.125,0.125,0.07501220703125,0.1500244140625,0.04998779296875,0.175048828125,0.125,0.175048828125,0.175048828125,0.07501220703125,0.024993896484375,0.125],[0.2249755859375,0.125,0.2249755859375,0.1500244140625,0.0,0.0,0.1500244140625,0.125,0.024993896484375,0.125,0.0,0.024993896484375,0.175048828125,0.175048828125,0.024993896484375,0.125],[0.2249755859375,0.024993896484375,0.04998779296875,0.0,0.0,0.1500244140625,0.07501220703125,0.2249755859375,0.1500244140625,0.024993896484375,0.0,0.0999755859375,0.125,0.1500244140625,0.2249755859375,0.0],[0.125,0.0999755859375,0.0,0.0999755859375,0.199951171875,0.125,0.175048828125,0.175048828125,0.1500244140625,0.2249755859375,0.04998779296875,0.125,0.1500244140625,0.0,0.0,0.0999755859375],[0.125,0.07501220703125,0.175048828125,0.1500244140625,0.175048828125,0.0,0.04998779296875,0.125,0.125,0.024993896484375,0.0999755859375,0.175048828125,0.024993896484375,0.0,0.024993896484375,0.0],[0.2249755859375,0.024993896484375,0.0999755859375,0.04998779296875,0.125,0.07501220703125,0.0999755859375,0.024993896484375,0.125,0.125,0.125,0.024993896484375,0.125,0.04998779296875,0.0999755859375,0.07501220703125],[0.0999755859375,0.175048828125,0.199951171875,0.0999755859375,0.175048828125,0.07501220703125,0.024993896484375,0.125,0.07501220703125,0.0,0.125,0.075
01220703125,0.07501220703125,0.0,0.199951171875,0.175048828125],[0.07501220703125,0.0999755859375,0.175048828125,0.07501220703125,0.125,0.1500244140625,0.0,0.0999755859375,0.2249755859375,0.199951171875,0.04998779296875,0.0,0.0,0.1500244140625,0.199951171875,0.2249755859375],[0.024993896484375,0.2249755859375,0.04998779296875,0.1500244140625,0.2249755859375,0.2249755859375,0.175048828125,0.0999755859375,0.024993896484375,0.199951171875,0.125,0.199951171875,0.175048828125,0.2249755859375,0.175048828125,0.0999755859375],[0.125,0.0999755859375,0.04998779296875,0.125,0.199951171875,0.07501220703125,0.199951171875,0.0,0.024993896484375,0.04998779296875,0.0,0.04998779296875,0.04998779296875,0.199951171875,0.1500244140625,0.0999755859375],[0.199951171875,0.0,0.125,0.04998779296875,0.07501220703125,0.175048828125,0.0999755859375,0.175048828125,0.024993896484375,0.07501220703125,0.0,0.1500244140625,0.07501220703125,0.024993896484375,0.07501220703125,0.175048828125],[0.1500244140625,0.125,0.0999755859375,0.175048828125,0.04998779296875,0.0,0.04998779296875,0.1500244140625,0.024993896484375,0.125,0.125,0.175048828125,0.125,0.0999755859375,0.175048828125,0.1500244140625],[0.07501220703125,0.199951171875,0.024993896484375,0.0999755859375,0.175048828125,0.07501220703125,0.1500244140625,0.04998779296875,0.0,0.024993896484375,0.07501220703125,0.07501220703125,0.1500244140625,0.04998779296875,0.2249755859375,0.1500244140625]]" \ diff --git a/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir index 5157fb74838e..02a1d92172e8 100644 --- a/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir +++ b/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir @@ -68,6 +68,11 @@ module attributes { transform.with_named_sequence } { %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op) + // Annotate the exported function as already translated. 
+ %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param + transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param + transform.yield } } // module diff --git a/tests/transform_dialect/cuda/eltwise_reduction.mlir b/tests/transform_dialect/cuda/eltwise_reduction.mlir index d276879f1d27..eabf1a0cf739 100644 --- a/tests/transform_dialect/cuda/eltwise_reduction.mlir +++ b/tests/transform_dialect/cuda/eltwise_reduction.mlir @@ -31,69 +31,9 @@ func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) { return %6 : !out_tensor_t } -// RUN: iree-opt %s --iree-hal-target-backends=cuda \ -// RUN: --iree-abi-transformation-pipeline \ -// RUN: --iree-flow-transformation-pipeline \ -// RUN: --iree-stream-transformation-pipeline \ -// RUN: --iree-hal-configuration-pipeline | \ -// RUN: FileCheck %s --check-prefix=DISPATCH - -// RUN: iree-opt %s --iree-hal-target-backends=cuda \ -// RUN: --iree-abi-transformation-pipeline \ -// RUN: --iree-flow-transformation-pipeline \ -// RUN: --iree-stream-transformation-pipeline \ -// RUN: --iree-hal-configuration-pipeline | \ -// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' -// RUN: --iree-codegen-use-transform-dialect-strategy=%p/%S_codegen_spec.mlir | \ -// RUN: FileCheck %s - // RUN: iree-compile %s --iree-hal-target-backends=cuda | \ // RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\ // RUN: FileCheck %s --check-prefix=EXEC -// Check that both generics ended up in the same region. -// DISPATCH: hal.executable.variant -// DISPATCH: linalg.fill -// DISPATCH-NOT: hal.executable.variant -// DISPATCH: linalg.generic -// DISPATCH-NOT: hal.executable.variant -// DISPATCH: linalg.generic - -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[F0:.*]] = arith.constant dense<0.000000e+00> : vector -// CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index -// CHECK-DAG: %[[SHMEM_ALLOC:.*]] = memref.alloc() {alignment = 128 : i64} : memref<1x2xf32, 3> -// CHECK-DAG: %[[TIDX:.]] = gpu.thread_id x -// CHECK-DAG: %[[TIDY:.]] = gpu.thread_id y -// CHECK-DAG: %[[TIDZ:.]] = gpu.thread_id z - -// CHECK: %[[SHMEM_VIEW_EXPANDED:.*]] = memref.subview %[[SHMEM_ALLOC]][%[[TIDZ]], %[[TIDY]]]{{.*}}to memref - -// Distributed reduction: everyone loads, does the elementwise then 5 xor + addf expected -// CHECK: vector.transfer_read %{{.*}}[%[[TIDZ]], %[[TIDY]], %[[TIDX]]] -// CHECK: arith.addf -// CHECK: arith.addf -// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf - -// CHECK: %[[RES:.*]] = arith.addf %{{.*}} - -// CHECK: %[[RES_VEC:.*]] = vector.broadcast %[[RES]] : f32 to vector -// CHECK: %[[CONDXIS0:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index -// CHECK: scf.if %[[CONDXIS0]] -// CHECK: vector.transfer_write %[[RES_VEC]], %[[SHMEM_VIEW_EXPANDED]][] -// CHECK: gpu.barrier - -// Last part is not distributed atm and is only ran by threadIdx.x == 0 and threadIdx.y == 0. -// CHECK: %[[CONDYIS0:.*]] = arith.cmpi ult, %[[TIDY]], %[[C1]] : index -// TODO: cond eq 0 and cond ult 1 do not CSE atm. 
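Each retained codegen spec in this patch gains the same three-op epilogue shown in the hunk above: it matches the hal.executable.export ops in the bufferized variant, builds a #iree_codegen.translation_info parameter, and attaches it as the "translation_info" attribute so the later lowering passes can treat the export as already translated. A minimal sketch of that epilogue, assuming the None translation pipeline (the exact attribute payload may differ from spec to spec):

  %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3
    : (!transform.any_op) -> !transform.any_op
  // Assumption: the None pipeline marks the export as needing no further translation.
  %info = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
  transform.annotate %exports "translation_info" = %info : !transform.any_op, !transform.any_param
  transform.yield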
-// CHECK: %[[CONXANDYARE0:.*]] = arith.andi %{{.*}}, %[[CONDYIS0]] : i1 -// CHECK: scf.if %[[CONXANDYARE0]] { -// CHECK: vector.transfer_read -// CHECK: vector.reduction -// CHECK: vector.transfer_write -// CHECK: gpu.barrier -// CHECK: memref.dealloc %[[SHMEM_ALLOC]] : memref<1x2xf32, 3> - // EXEC: result[0]: hal.buffer_view // EXEC-NEXT: 8xf32=256 256 256 256 256 256 256 256 diff --git a/tests/transform_dialect/cuda/eltwise_reduction_codegen_spec.mlir b/tests/transform_dialect/cuda/eltwise_reduction_codegen_spec.mlir deleted file mode 100644 index 2f03915b83bc..000000000000 --- a/tests/transform_dialect/cuda/eltwise_reduction_codegen_spec.mlir +++ /dev/null @@ -1,103 +0,0 @@ -// RUN: iree-opt %s - -transform.sequence failures(propagate) { -^bb1(%variant_op: !transform.any_op): - %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op - - // Step 1. Split the reduction to get meatier (size(red) / 2)-way parallelism. - // =========================================================================== - %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %eltwise, %reduction = transform.split_handle %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %init_or_alloc_op, %more_parallel_fill_op, %more_parallel_op, %combiner_op = - transform.structured.split_reduction %reduction - { split_factor = 2, insert_split_dimension = 1 } - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) - - // Step 2. First level of tiling + fusion parallelizes to blocks. - // =========================================================================== - %grid_combiner_op, %forall_grid = - transform.structured.tile_using_forall %combiner_op tile_sizes [1] - ( mapping = [#gpu.block] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 2.1: Cannot fuse across the "expand_shape" produced by reduction - // splitting above, so we need to bubble that up via patterns and rematch - // the entire structure. - // TODO: bubbling should be a proper transform op, at which point we will be - // able to preserve the handles. - // =========================================================================== - %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func { - transform.apply_patterns.iree.bubble_expand - } : !transform.any_op - %fills = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %fill_2, %more_parallel_fill_2 = transform.split_handle %fills - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %generics = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %expanded_eltwise, %more_parallel_2, %combiner_2 = - transform.split_handle %generics : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - %forall_grid_2 = transform.structured.match ops{["scf.forall"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %not_combiner = transform.merge_handles %fill_2, %more_parallel_fill_2, %more_parallel_2, %expanded_eltwise : !transform.any_op - transform.structured.fuse_into_containing_op %not_combiner into %forall_grid_2 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 3. 
Second level of tiling + fusion parallelizes to threads. Also - // fuse in the leading elementwise. - // =========================================================================== - %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op : (!transform.any_op) -> !transform.any_op - %forall_block_combiner_op, %block_combiner_op = - transform.structured.tile_using_forall %combiner_2 tile_sizes [1] - ( mapping = [#gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %fill_1d into %forall_block_combiner_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op : (!transform.any_op) -> !transform.any_op - %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} in %variant_op : (!transform.any_op) -> !transform.any_op - %grid_eltwise_op = transform.structured.match ops{["linalg.generic"]} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} in %variant_op : (!transform.any_op) -> !transform.any_op - %forall_block_more_parallel_op, %block_more_parallel_op = - transform.structured.tile_using_forall %grid_more_parallel_op tile_sizes [1, 1] - ( mapping = [#gpu.thread, #gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %fill_2d into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %grid_eltwise_op into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 4. Rank-reduce and vectorize. - // =========================================================================== - %func_1 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func_1 { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %func_3 = transform.structured.vectorize_children_and_apply_patterns %func_1 : (!transform.any_op) -> !transform.any_op - - // Step 5. Bufferize and drop HAL decriptor from memref ops. - // =========================================================================== - transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> () - %variant_op_2 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op - %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_2 : (!transform.any_op) -> !transform.any_op - - // Step 6. Post-bufferization mapping to blocks and threads. - // =========================================================================== - %func_4 = transform.structured.match ops{["func.func"]} in %variant_op_2 : (!transform.any_op) -> !transform.any_op - transform.iree.forall_to_workgroup %func_4 : (!transform.any_op) -> () - transform.iree.map_nested_forall_to_gpu_threads %func_4 workgroup_dims = [32, 2, 1] : (!transform.any_op) -> () - - // Step 7. 
Post-bufferization vector distribution with rank-reduction. - // =========================================================================== - transform.apply_patterns to %func_4 { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_2 : (!transform.any_op) -> !transform.any_op - // Don't complain about unsupported if (threadIdx.x == 0 && threadIdx.y == 0) - // at this point. - transform.sequence %variant_op_2 : !transform.any_op failures(suppress) { - ^bb0(%arg0: !transform.any_op): - transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } - : (!transform.any_op) -> !transform.any_op - } - transform.iree.vector.warp_distribute %func_4 : (!transform.any_op) -> () -} diff --git a/tests/transform_dialect/cuda/eltwise_reduction_eltwise.mlir b/tests/transform_dialect/cuda/eltwise_reduction_eltwise.mlir index 3ad0c9674577..70aa3322dfd3 100644 --- a/tests/transform_dialect/cuda/eltwise_reduction_eltwise.mlir +++ b/tests/transform_dialect/cuda/eltwise_reduction_eltwise.mlir @@ -43,72 +43,9 @@ func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) { return %8 : !out_tensor_t } -// RUN: iree-opt %s --iree-hal-target-backends=cuda \ -// RUN: --iree-abi-transformation-pipeline \ -// RUN: --iree-flow-transformation-pipeline \ -// RUN: --iree-stream-transformation-pipeline \ -// RUN: --iree-hal-configuration-pipeline | \ -// RUN: FileCheck %s --check-prefix=DISPATCH - -// RUN: iree-opt %s --iree-hal-target-backends=cuda \ -// RUN: --iree-abi-transformation-pipeline \ -// RUN: --iree-flow-transformation-pipeline \ -// RUN: --iree-stream-transformation-pipeline \ -// RUN: --iree-hal-configuration-pipeline | \ -// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' -// RUN: --iree-codegen-use-transform-dialect-strategy=%p/%S_codegen_spec.mlir | \ -// RUN: FileCheck %s - // RUN: iree-compile %s --iree-hal-target-backends=cuda | \ // RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\ // RUN: FileCheck %s --check-prefix=EXEC -// Check that all generics ended up in the same region. 
-// DISPATCH: hal.executable.variant -// DISPATCH: linalg.fill -// DISPATCH-NOT: hal.executable.variant -// DISPATCH: linalg.generic -// DISPATCH-NOT: hal.executable.variant -// DISPATCH: linalg.generic -// DISPATCH-NOT: hal.executable.variant -// DISPATCH: linalg.generic - -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[F0:.*]] = arith.constant dense<0.000000e+00> : vector -// CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index -// CHECK-DAG: %[[SHMEM_ALLOC:.*]] = memref.alloc() {alignment = 128 : i64} : memref<1x2xf32, 3> -// CHECK-DAG: %[[TIDX:.]] = gpu.thread_id x -// CHECK-DAG: %[[TIDY:.]] = gpu.thread_id y -// CHECK-DAG: %[[TIDZ:.]] = gpu.thread_id z - -// CHECK: %[[SHMEM_VIEW_EXPANDED:.*]] = memref.subview %[[SHMEM_ALLOC]][%[[TIDZ]], %[[TIDY]]]{{.*}}to memref - -// Distributed reduction: everyone loads, does the elementwise then 5 xor + addf expected -// CHECK: vector.transfer_read %{{.*}}[%[[TIDZ]], %[[TIDY]], %[[TIDX]]] -// CHECK: arith.addf -// CHECK: arith.addf -// CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf - -// CHECK: %[[RES:.*]] = arith.addf %{{.*}} - -// CHECK: %[[RES_VEC:.*]] = vector.broadcast %[[RES]] : f32 to vector -// CHECK: %[[CONDXIS0:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index -// CHECK: scf.if %[[CONDXIS0]] -// CHECK: vector.transfer_write %[[RES_VEC]], %[[SHMEM_VIEW_EXPANDED]][] -// CHECK: gpu.barrier - -// Last part is not distributed atm and is only ran by threadIdx.x == 0 and threadIdx.y == 0. -// CHECK: %[[CONDYIS0:.*]] = arith.cmpi ult, %[[TIDY]], %[[C1]] : index -// TODO: cond eq 0 and cond ult 1 do not CSE atm. -// CHECK: %[[CONXANDYARE0:.*]] = arith.andi %{{.*}}, %[[CONDYIS0]] : i1 -// CHECK: scf.if %[[CONXANDYARE0]] { -// CHECK: vector.transfer_read -// CHECK: vector.reduction -// CHECK: math.sqrt -// CHECK: vector.transfer_write -// CHECK: gpu.barrier -// CHECK: memref.dealloc %[[SHMEM_ALLOC]] : memref<1x2xf32, 3> - // EXEC: result[0]: hal.buffer_view // EXEC-NEXT: 8xf32=16 16 16 16 16 16 16 16 diff --git a/tests/transform_dialect/cuda/eltwise_reduction_eltwise_codegen_spec.mlir b/tests/transform_dialect/cuda/eltwise_reduction_eltwise_codegen_spec.mlir deleted file mode 100644 index 0c55aa3d5a99..000000000000 --- a/tests/transform_dialect/cuda/eltwise_reduction_eltwise_codegen_spec.mlir +++ /dev/null @@ -1,111 +0,0 @@ -// RUN: iree-opt %s - -transform.sequence failures(propagate) { -^bb1(%variant_op: !transform.any_op): - %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op - - // Step 1. Split the reduction to get meatier (size(red) / 2)-way parallelism. - // =========================================================================== - %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %leading_eltwise, %reduction, %trailing_eltwise = transform.split_handle %0 - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - %init_or_alloc_op, %more_parallel_fill_op, %more_parallel_op, %combiner_op = - transform.structured.split_reduction %reduction - { split_factor = 2, insert_split_dimension = 1 } - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) - - // Step 2. First level of tiling + fusion parallelizes to blocks. Tile the - // trailing elementwise the same way we want to tile the reduction. 
- // =========================================================================== - %trailing_eltwise_grid_op, %grid_loop = - transform.structured.tile_using_forall %trailing_eltwise tile_sizes [1] - ( mapping = [#gpu.block] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 2.1: Cannot fuse across the "expand_shape" produced by reduction - // splitting above, so we need to bubble that up via patterns and rematch - // the entire structure. - // TODO: bubbling should be a proper transform op, at which point we will be - // able to preserve the handles. - // =========================================================================== - %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func { - transform.apply_patterns.iree.bubble_expand - } : !transform.any_op - %fills = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %fill_2, %more_parallel_fill_2 = transform.split_handle %fill - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %generics = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %expanded_eltwise, %more_parallel_2, %combiner_2, %trailing_eltwise_2 = - transform.split_handle %generics - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) - %forall_grid_2 = transform.structured.match ops{["scf.forall"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %not_trailing = transform.merge_handles %fill_2, %more_parallel_fill_2, - %more_parallel_2, %expanded_eltwise, %combiner_2 : !transform.any_op - transform.structured.fuse_into_containing_op %not_trailing into %forall_grid_2 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 3. Second level of tiling + fusion parallelizes to threads. Also - // fuse in the leading and trailing elementwise. 
- // =========================================================================== - %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op : (!transform.any_op) -> !transform.any_op - %block_trailing_eltwise_op, %forall_trailing_eltwise_op = - transform.structured.tile_using_forall %trailing_eltwise_2 tile_sizes [1] - ( mapping = [#gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %block_combiner_op = transform.structured.match ops{["linalg.generic"]} - attributes {iterator_types = [#linalg.iterator_type, #linalg.iterator_type]} in %variant_op : (!transform.any_op) -> !transform.any_op - %fill_and_reduction = transform.merge_handles %fill_1d, %block_combiner_op : !transform.any_op - transform.structured.fuse_into_containing_op %fill_and_reduction into %forall_trailing_eltwise_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op : (!transform.any_op) -> !transform.any_op - %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} in %variant_op : (!transform.any_op) -> !transform.any_op - %grid_eltwise_op = transform.structured.match ops{["linalg.generic"]} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} in %variant_op : (!transform.any_op) -> !transform.any_op - %block_more_parallel_op, %forall_block_more_parallel_op = - transform.structured.tile_using_forall %grid_more_parallel_op tile_sizes [1, 1] - ( mapping = [#gpu.thread, #gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %fill_2d into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %grid_eltwise_op into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 4. Rank-reduce and vectorize. - // =========================================================================== - %func_1 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func_1 { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %func_2 = transform.structured.vectorize_children_and_apply_patterns %func_1 : (!transform.any_op) -> !transform.any_op - - // Step 5. Bufferize and drop HAL decriptor from memref ops. - // =========================================================================== - transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> () - %variant_op_2 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op - %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_2 : (!transform.any_op) -> !transform.any_op - - // Step 6. Post-bufferization mapping to blocks and threads. 
- // =========================================================================== - %func_3 = transform.structured.match ops{["func.func"]} in %variant_op_2 : (!transform.any_op) -> !transform.any_op - transform.iree.forall_to_workgroup %func_3 : (!transform.any_op) -> () - transform.iree.map_nested_forall_to_gpu_threads %func_3 workgroup_dims = [32, 2, 1] : (!transform.any_op) -> () - - // Step 7. Post-bufferization vector distribution with rank-reduction. - // =========================================================================== - transform.apply_patterns to %func_3 { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.memref.fold_memref_alias_ops - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_2 : (!transform.any_op) -> !transform.any_op - // Don't complain about unsupported if (threadIdx.x == 0 && threadIdx.y == 0) - // at this point. - transform.sequence %variant_op_2 : !transform.any_op failures(suppress) { - ^bb0(%arg0: !transform.any_op): - transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } - : (!transform.any_op) -> !transform.any_op - } - transform.iree.vector.warp_distribute %func_3 : (!transform.any_op) -> () -} diff --git a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir index c60c261dba5a..d1362d0f96e0 100644 --- a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir +++ b/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir @@ -3,66 +3,71 @@ module attributes { transform.with_named_sequence } { transform.named_sequence @__transform_main( %variant_op: !transform.any_op {transform.consumed}) { - // Step 1. Find the fill, matmul and generic ops - // =========================================================================== - %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %matmul = transform.structured.match ops{["linalg.generic"]} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} - in %variant_op : (!transform.any_op) -> !transform.any_op - %generic = transform.structured.match ops{["linalg.generic"]} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type]} - in %variant_op : (!transform.any_op) -> !transform.any_op + // Step 1. Find the fill, matmul and generic ops + // =========================================================================== + %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op + %matmul = transform.structured.match ops{["linalg.generic"]} + attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} + in %variant_op : (!transform.any_op) -> !transform.any_op + %generic = transform.structured.match ops{["linalg.generic"]} + attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type]} + in %variant_op : (!transform.any_op) -> !transform.any_op - // Step 2. 
Tile the generic and fuse the fill and matmul - // =========================================================================== - %grid_reduction, %forall_grid = - transform.structured.tile_using_forall %generic tile_sizes [16] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> () + // Step 2. Tile the generic and fuse the fill and matmul + // =========================================================================== + %grid_reduction, %forall_grid = + transform.structured.tile_using_forall %generic tile_sizes [16] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> () - transform.structured.fuse_into_containing_op %matmul into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.structured.fuse_into_containing_op %matmul into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - // Step 3. Vectorize - // =========================================================================== - %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op + // Step 3. Vectorize + // =========================================================================== + %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface + transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices + transform.apply_patterns.vector.cast_away_vector_leading_one_dim + } : !transform.any_op + %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op - // Step 4. 
Bufferize - // =========================================================================== - transform.apply_patterns to %func_3 { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - } : !transform.any_op - transform.apply_patterns to %func_3 { - transform.apply_patterns.tensor.reassociative_reshape_folding - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %func_3 : !transform.any_op - transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> () - transform.apply_patterns to %func_3 { - transform.apply_patterns.linalg.erase_unnecessary_inputs - } : !transform.any_op - %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op) - %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + // Step 4. Bufferize + // =========================================================================== + transform.apply_patterns to %func_3 { + transform.apply_patterns.iree.fold_fill_into_pad + transform.apply_patterns.linalg.tiling_canonicalization + transform.apply_patterns.scf.for_loop_canonicalization + } : !transform.any_op + transform.apply_patterns to %func_3 { + transform.apply_patterns.tensor.reassociative_reshape_folding + transform.apply_patterns.canonicalization + } : !transform.any_op + transform.apply_cse to %func_3 : !transform.any_op + transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> () + transform.apply_patterns to %func_3 { + transform.apply_patterns.linalg.erase_unnecessary_inputs + } : !transform.any_op + %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op) + %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - // Step 6. Post-bufferization vector distribution - // =========================================================================== - %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> () - transform.iree.map_nested_forall_to_gpu_threads %func_7 - workgroup_dims = [4, 8, 1] : (!transform.any_op) -> () + // Step 6. Post-bufferization vector distribution + // =========================================================================== + %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> () + transform.iree.map_nested_forall_to_gpu_threads %func_7 + workgroup_dims = [4, 8, 1] : (!transform.any_op) -> () - // Step 7. Do layout analysis and lower to mma - // =========================================================================== - %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op) - transform.yield - } + // Step 7. 
Do layout analysis and lower to mma + // =========================================================================== + %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op) + + // Annotate the exported function as already translated. + %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param + transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param + transform.yield + } } // module diff --git a/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir b/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir index a99b19d03d7a..f01f07cc2d59 100644 --- a/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir +++ b/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir @@ -27,8 +27,7 @@ func.func @matmul_reduction(%lhs : tensor<16x16xf16>, %rhs : tensor<16x16xf16>) // RUN: --iree-hal-cuda-llvm-target-arch=sm_80 \ // RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ // RUN: --iree-flow-dispatch-use-transform-dialect=%p/mma_reduction_layout_analysis_dispatch_spec.mlir \ -// RUN: --iree-codegen-transform-dialect-library=%p/mma_reduction_layout_analysis_codegen_spec.mlir \ -// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \ +// RUN: --iree-codegen-transform-dialect-library=%p/mma_reduction_layout_analysis_codegen_spec.mlir@codegen | \ // RUN: iree-run-module --module=- --function=matmul_reduction --device=cuda \ // RUN: --input="16x16xf16=[[3.0,2.0,2.5,4.5,1.5,4.0,2.0,2.5,4.0,4.0,1.5,0.5,2.0,3.0,0.5,2.0],[2.5,2.5,0.5,3.5,0.0,2.5,3.5,1.0,0.5,0.0,3.0,4.5,0.5,0.5,0.0,3.5],[4.5,3.0,4.0,2.5,1.0,0.5,0.0,4.5,0.0,2.5,3.5,0.0,2.0,4.5,1.5,4.5],[0.0,2.0,1.5,0.0,2.0,1.5,3.0,2.0,2.0,4.0,4.0,2.5,0.0,3.0,2.0,0.5],[0.5,3.5,3.0,2.5,0.0,2.5,3.0,3.0,4.5,2.0,2.0,1.0,2.0,1.0,3.5,2.0],[0.0,4.5,2.0,4.0,2.5,2.5,1.5,1.5,1.5,3.0,3.0,0.0,2.5,0.5,2.0,2.0],[3.5,4.0,3.5,1.5,2.0,0.5,1.0,2.5,4.0,3.5,0.0,3.0,0.0,1.5,4.5,0.0],[4.5,3.5,1.0,4.5,0.5,0.0,1.5,4.5,1.5,3.5,3.0,2.5,0.0,0.5,0.0,4.0],[2.0,3.0,0.5,2.0,1.5,0.5,2.0,2.5,2.5,4.0,2.0,4.5,4.0,0.0,2.0,3.0],[2.5,4.0,4.0,3.0,2.0,2.0,4.5,0.5,4.5,1.0,2.0,0.0,4.5,1.0,3.0,0.5],[4.0,1.5,3.5,3.0,2.5,4.5,1.0,3.5,3.0,2.5,2.5,2.0,2.0,4.5,1.5,2.5],[3.0,3.0,0.0,2.5,1.0,3.0,0.0,1.5,1.5,2.5,0.5,1.0,3.0,3.5,1.5,1.5],[0.0,4.5,0.5,1.5,0.5,4.0,3.5,4.0,4.0,0.0,0.5,1.0,4.5,1.5,0.0,3.5],[2.5,2.0,2.5,1.5,3.0,0.0,2.0,1.0,2.5,4.0,0.0,4.0,4.0,1.5,3.0,2.5],[3.0,0.0,4.0,4.0,2.0,0.5,1.0,3.5,4.0,2.5,4.0,4.5,0.0,3.0,1.5,2.5],[0.5,0.5,2.5,4.0,1.0,2.5,0.5,4.5,2.0,3.0,1.5,4.5,1.5,4.5,0.5,1.5]]" \ // RUN: 
--input="16x16xf16=[[3.5,3.0,4.5,3.0,3.0,0.0,2.0,2.5,2.0,0.0,4.5,2.5,0.5,0.0,4.0,3.5],[0.0,0.5,2.0,4.5,0.0,4.0,1.5,3.5,0.5,2.5,3.5,1.5,3.5,4.5,4.0,3.0],[3.0,3.5,2.5,1.5,1.5,1.5,0.5,4.5,0.0,3.5,4.0,0.0,0.0,2.0,0.5,1.0],[1.5,4.0,3.5,3.5,0.0,0.0,0.0,2.0,3.0,1.5,0.0,3.0,0.0,2.5,2.0,3.0],[3.5,4.0,2.5,1.5,3.0,2.0,3.0,4.5,1.5,3.0,2.0,3.5,2.5,4.5,0.5,3.5],[0.0,0.0,0.0,0.5,1.0,2.5,1.5,1.0,2.5,1.5,0.0,1.5,1.5,2.0,4.5,2.5],[4.0,1.5,3.0,2.5,2.5,3.5,2.0,4.0,1.5,2.5,0.5,4.0,1.0,4.5,3.5,0.0],[1.0,2.0,4.0,4.5,4.5,3.5,0.0,1.0,4.5,3.5,2.0,3.0,0.5,4.0,3.5,1.5],[1.0,0.0,2.5,4.5,0.0,2.0,0.0,2.5,3.0,4.0,2.5,0.5,3.5,0.0,3.5,1.0],[0.0,3.5,4.0,0.0,0.0,4.5,1.0,3.5,1.5,3.0,2.0,1.0,0.5,0.5,2.0,0.0],[1.5,0.0,4.5,2.0,4.5,4.5,3.5,3.0,2.5,4.5,0.5,0.5,0.0,4.5,0.0,4.0],[4.5,3.5,4.0,4.0,1.5,4.0,1.0,4.0,2.5,0.5,4.5,3.5,3.5,0.5,4.5,3.0],[0.0,3.0,2.5,1.0,1.5,2.0,1.0,1.5,4.0,2.5,3.5,1.0,3.5,2.5,3.5,4.5],[1.5,4.5,2.0,2.0,2.0,0.5,4.0,2.0,4.0,3.5,4.0,1.0,1.5,2.5,1.0,0.0],[0.0,0.0,1.0,2.5,3.5,2.5,4.0,0.0,2.0,2.0,4.5,0.5,1.0,3.5,3.0,2.5],[2.0,2.0,0.5,2.0,4.5,2.5,3.0,1.5,4.5,2.0,3.5,3.0,1.0,2.0,1.5,2.0]]" |\ diff --git a/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir index d72accb0499e..a7c32a4cfe79 100644 --- a/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir +++ b/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir @@ -66,6 +66,10 @@ module attributes { transform.with_named_sequence } { %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op) + // Annotate the exported function as already translated. + %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param + transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param transform.yield } } // module diff --git a/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir index 33bfe44dbb53..83e249618bc2 100644 --- a/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir +++ b/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir @@ -70,6 +70,11 @@ module attributes { transform.with_named_sequence } { // =========================================================================== %func_10 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op %func_11 = transform.iree.layout_analysis_and_distribution %func_10 : (!transform.any_op) -> (!transform.any_op) + + // Annotate the exported function as already translated. 
+ %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param + transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param transform.yield } } // module diff --git a/tests/transform_dialect/cuda/reduction.mlir b/tests/transform_dialect/cuda/reduction.mlir index 2642b0303d12..d506a1740cf1 100644 --- a/tests/transform_dialect/cuda/reduction.mlir +++ b/tests/transform_dialect/cuda/reduction.mlir @@ -18,24 +18,9 @@ func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) { return %2 : !out_tensor_t } -// RUN: iree-opt %s --iree-hal-target-backends=cuda \ -// RUN: --iree-abi-transformation-pipeline \ -// RUN: --iree-flow-transformation-pipeline \ -// RUN: --iree-stream-transformation-pipeline \ -// RUN: --iree-hal-configuration-pipeline | \ -// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/reduction_codegen_spec.mlir \ -// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \ -// RUN: FileCheck %s --check-prefix=CHECK - // RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-opt-const-expr-hoisting=false --iree-opt-const-eval=false \ -/// Constant JIT'ing must be disabled because the transform-dialect debug -/// flags leak to the JIT session, which doesn't know what to do with them. // RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/reduction_codegen_spec.mlir \ -// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \ +// RUN: --iree-codegen-transform-dialect-library=%p/reduction_codegen_spec.mlir@codegen | \ // RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\ // RUN: FileCheck %s --check-prefix=EXEC @@ -44,34 +29,5 @@ func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) { // RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\ // RUN: FileCheck %s --check-prefix=EXEC - // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index - // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index - // CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index - // CHECK-DAG: %[[SHMEM_ALLOC:.*]] = memref.alloc() {alignment = 64 : i64} : memref<1x2xf32, #gpu.address_space> - // CHECK-DAG: %[[TIDX:.]] = gpu.thread_id x - // CHECK-DAG: %[[TIDY:.]] = gpu.thread_id y - // CHECK-DAG: %[[CONDXIS0:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index - - // Distributed reduction: everyone loads then 5 xor + addf expected - // CHECK: vector.transfer_read %{{.*}}[%[[workgroup_id_x]], %[[TIDY]], %[[TIDX]]] - // CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf - - // CHECK: %[[RES:.*]] = arith.addf %{{.*}} - - // CHECK: %[[RES_VEC:.*]] = vector.broadcast %[[RES]] : f32 to vector - // CHECK: scf.if %[[CONDXIS0]] - // CHECK: vector.transfer_write %[[RES_VEC]], %[[SHMEM_ALLOC]][%[[C0]], %[[TIDY]]] - // CHECK: gpu.barrier - - // Last part is not distributed atm and is only ran by threadIdx.x == 0 and threadIdx.y == 0. - // CHECK: %[[CONDYIS0:.*]] = arith.cmpi ult, %[[TIDY]], %[[C1]] : index - // TODO: cond eq 0 and cond ult 1 do not CSE atm. 
- // CHECK: %[[CONXANDYARE0:.*]] = arith.andi %{{.*}}, %[[CONDYIS0]] : i1 - // CHECK: scf.if %[[CONXANDYARE0]] { - // CHECK: vector.transfer_read - // CHECK: vector.reduction - // CHECK: vector.transfer_write - // CHECK: gpu.barrier - // EXEC: result[0]: hal.buffer_view // EXEC-NEXT: 8xf32=64 64 64 64 64 64 64 64 diff --git a/tests/transform_dialect/cuda/reduction_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_codegen_spec.mlir index 364f1a6bef48..c8b46b3756a6 100644 --- a/tests/transform_dialect/cuda/reduction_codegen_spec.mlir +++ b/tests/transform_dialect/cuda/reduction_codegen_spec.mlir @@ -115,6 +115,11 @@ module attributes { transform.with_named_sequence } { transform.iree.apply_licm %func_op_3 : !transform.any_op transform.apply_cse to %func_op_3 : !transform.any_op + // Annotate the exported function as already translated. + %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param + transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param + transform.yield } } // module diff --git a/tests/transform_dialect/cuda/reduction_eltwise.mlir b/tests/transform_dialect/cuda/reduction_eltwise.mlir index a266998dc1b6..0bd49b5d4298 100644 --- a/tests/transform_dialect/cuda/reduction_eltwise.mlir +++ b/tests/transform_dialect/cuda/reduction_eltwise.mlir @@ -29,59 +29,17 @@ func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) { return %7 : !out_tensor_t } -// RUN: iree-opt %s --iree-hal-target-backends=cuda \ -// RUN: --iree-abi-transformation-pipeline \ -// RUN: --iree-flow-transformation-pipeline \ -// RUN: --iree-stream-transformation-pipeline \ -// RUN: --iree-hal-configuration-pipeline | \ -// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/reduction_eltwise_codegen_spec.mlir \ -// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \ -// RUN: FileCheck %s --check-prefix=CHECK - // RUN: iree-compile %s --iree-hal-target-backends=cuda \ // RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/reduction_eltwise_codegen_spec.mlir \ -// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \ +// RUN: --iree-codegen-transform-dialect-library=%p/reduction_eltwise_codegen_spec.mlir@codegen | \ // RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\ // RUN: FileCheck %s --check-prefix=EXEC -/// Note: the current --iree-codegen-llvmgpu-enable-transform-dialect-jit only works for exactly this reduction atm. +/// Note: the current --iree-codegen-llvmgpu-enable-transform-dialect-jit +/// only works for exactly this reduction atm. 
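Across these tests the two-flag invocation (--iree-codegen-transform-dialect-library=<file> plus --iree-codegen-use-transform-dialect-strategy=<entry>) collapses into a single flag whose value is the library path with the entry-point name appended after "@". An illustrative RUN line in that style, using placeholder names for the spec file and entry point:

  // Illustrative only: my_codegen_spec.mlir and @codegen are placeholders.
  // RUN: iree-compile %s --iree-hal-target-backends=cuda \
  // RUN:   --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
  // RUN:   --iree-codegen-transform-dialect-library=%p/my_codegen_spec.mlir@codegen | \
  // RUN:   iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1"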
// RUN: iree-compile %s --iree-hal-target-backends=cuda | \ // RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\ // RUN: FileCheck %s --check-prefix=EXEC - // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index - // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index - // CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index - // CHECK-DAG: %[[SHMEM_ALLOC:.*]] = memref.alloc() {alignment = 64 : i64} : memref<1x2xf32, #gpu.address_space> - // CHECK-DAG: %[[TIDX:.]] = gpu.thread_id x - // CHECK-DAG: %[[TIDY:.]] = gpu.thread_id y - // CHECK-DAG: %[[CONDXIS0:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index - - // Distributed reduction: everyone loads then 5 xor + addf expected - // CHECK: vector.transfer_read %{{.*}}[%[[workgroup_id_x]], %[[TIDY]], %[[TIDX]]] - // CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf - - // CHECK: %[[RES:.*]] = arith.addf %{{.*}} - - // CHECK: %[[RES_VEC:.*]] = vector.broadcast %[[RES]] : f32 to vector - // CHECK: scf.if %[[CONDXIS0]] - // CHECK: vector.transfer_write %[[RES_VEC]], %[[SHMEM_ALLOC]][%[[C0]], %[[TIDY]]] - // CHECK: gpu.barrier - - // Last part is not distributed atm and is only ran by threadIdx.x == 0 and threadIdx.y == 0. - // It should contain the fused elementwise operation. - // CHECK: %[[CONDYIS0:.*]] = arith.cmpi ult, %[[TIDY]], %[[C1]] : index - // TODO: cond eq 0 and cond ult 1 do not CSE atm. - // CHECK: %[[CONXANDYARE0:.*]] = arith.andi %{{.*}}, %[[CONDYIS0]] : i1 - // CHECK: scf.if %[[CONXANDYARE0]] { - // CHECK: vector.transfer_read - // CHECK: vector.reduction - // CHECK: math.sqrt - // CHECK: vector.transfer_write - // CHECK: gpu.barrier - // EXEC: result[0]: hal.buffer_view // EXEC-NEXT: 8xf32=8 8 8 8 8 8 8 8 diff --git a/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir index aad683dbbfa3..42a584803f84 100644 --- a/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir +++ b/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir @@ -154,6 +154,11 @@ module attributes { transform.with_named_sequence } { transform.iree.apply_licm %func_op_3 : !transform.any_op transform.apply_cse to %func_op_3 : !transform.any_op + // Annotate the exported function as already translated. 
+ %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param + transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param + transform.yield } } // module diff --git a/tests/transform_dialect/cuda/reduction_v2.mlir b/tests/transform_dialect/cuda/reduction_v2.mlir index 6ff64421d684..6d367d218397 100644 --- a/tests/transform_dialect/cuda/reduction_v2.mlir +++ b/tests/transform_dialect/cuda/reduction_v2.mlir @@ -18,21 +18,9 @@ func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) { return %2 : !out_tensor_t } -// RUN: iree-opt %s --iree-hal-target-backends=cuda \ -// RUN: --iree-abi-transformation-pipeline \ -// RUN: --iree-flow-transformation-pipeline \ -// RUN: --iree-stream-transformation-pipeline \ -// RUN: --iree-hal-configuration-pipeline | \ -// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir \ -// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \ -// RUN: FileCheck %s --check-prefix=CHECK - // RUN: iree-compile %s --iree-hal-target-backends=cuda \ // RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir \ -// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \ +// RUN: --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir@codegen | \ // RUN: iree-run-module --module=- --function=reduce --device=cuda --input="33x1024xf32=1" |\ // RUN: FileCheck %s --check-prefix=EXEC @@ -40,39 +28,6 @@ func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) { // RUN: iree-run-module --module=- --function=reduce --device=cuda --input="33x1024xf32=1" |\ // RUN: FileCheck %s --check-prefix=EXEC - - // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index - // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index - // CHECK-DAG: %[[F0:.*]] = arith.constant dense<0.000000e+00> : vector<4xf32> - // CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index - // CHECK-DAG: %[[SHMEM_ALLOC:.*]] = memref.alloc() {alignment = 64 : i64} : memref<1x128xf32, #gpu.address_space> - - // CHECK: %[[TIDX:.]] = gpu.thread_id x - // CHECK: %[[IDX_0:.*]] = affine.apply{{.*}}()[%[[TIDX]]] - // CHECK: gpu.barrier - // TODO: Properly poduce/CSE IDX_1 vs IDX_0 - // CHECK: %[[IDX_1:.*]] = affine.apply{{.*}}(%[[TIDX]]) - // Local per-thread scf.for-based reduction. 
- // CHECK: scf.for - // CHECK: vector.transfer_read - // CHECK: vector.transfer_read %[[SHMEM_ALLOC]][%[[C0]], %[[IDX_1]]] - // CHECK: arith.addf %{{.*}}, %{{.*}} : vector<4xf32> - // CHECK: vector.transfer_write %{{.*}}, %[[SHMEM_ALLOC]][%[[C0]], %[[IDX_1]]] - // TODO: remote unnecessary barrier within the loop - // CHECK: gpu.barrier - - // Distributed reduction: everyone loads then 5 xor + addf expected - // CHECK: vector.transfer_read %{{.*}}[%[[C0]], %[[IDX_0]]] - // CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf - - // CHECK: %[[RES:.*]] = arith.addf %{{.*}} - - // CHECK: %[[RES_VEC:.*]] = vector.broadcast %[[RES]] : f32 to vector - // CHECK: %[[CONDXIS0:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index - // CHECK: scf.if %[[CONDXIS0]] - // CHECK: vector.transfer_write %[[RES_VEC]] - // CHECK: gpu.barrier - // only checking the first 6 of 33 // EXEC: result[0]: hal.buffer_view // EXEC-NEXT: 33xf32=1024 1024 1024 1024 1024 1024 diff --git a/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir index 4fd18f0bd50e..bb9ecbfc6429 100644 --- a/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir +++ b/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir @@ -103,6 +103,11 @@ module attributes { transform.with_named_sequence } { transform.iree.apply_licm %func_7 : !transform.any_op transform.apply_cse to %func_7 : !transform.any_op + // Annotate the exported function as already translated. + %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param + transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param + transform.yield } } // module diff --git a/tests/transform_dialect/cuda/reduction_v2_uneven.mlir b/tests/transform_dialect/cuda/reduction_v2_uneven.mlir index 29b2d48bb7ac..66b88cdf2232 100644 --- a/tests/transform_dialect/cuda/reduction_v2_uneven.mlir +++ b/tests/transform_dialect/cuda/reduction_v2_uneven.mlir @@ -18,52 +18,12 @@ func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) { return %2 : !out_tensor_t } -// RUN: iree-opt %s --iree-hal-target-backends=cuda \ -// RUN: --iree-abi-transformation-pipeline \ -// RUN: --iree-flow-transformation-pipeline \ -// RUN: --iree-stream-transformation-pipeline \ -// RUN: --iree-hal-configuration-pipeline | \ -// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir \ -// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \ -// RUN: FileCheck %s --check-prefix=CHECK - // RUN: iree-compile %s --iree-hal-target-backends=cuda \ // RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir \ -// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \ +// RUN: --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir@codegen | \ // RUN: iree-run-module --module=- --function=reduce --device=cuda --input="33x34567xf32=1" |\ // RUN: FileCheck %s --check-prefix=EXEC - // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index - // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : 
index - // CHECK-DAG: %[[F0:.*]] = arith.constant dense<0.000000e+00> : vector<4xf32> - // CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index - // CHECK-DAG: %[[SHMEM_ALLOC:.*]] = memref.alloc() {alignment = 64 : i64} : memref<1x128xf32, #gpu.address_space> - - // CHECK: %[[TIDX:.]] = gpu.thread_id x - // CHECK: %[[IDX:.*]] = affine.apply{{.*}}%[[TIDX]] - // CHECK: gpu.barrier - // Local per-thread scf.for-based reduction. - // CHECK: scf.for - // CHECK-NOT: memref.alloc - // CHECK: linalg.generic - // TODO: remote unnecessary barrier within the loop - // CHECK: gpu.barrier - - // Distributed reduction: everyone loads then 5 xor + addf expected - // CHECK: vector.transfer_read %{{.*}}[%[[C0]], %[[IDX]]] - // CHECK-COUNT-5: gpu.shuffle xor{{.*}}{{[[:space:]].*}}{{.*}} arith.addf - - // CHECK: %[[RES:.*]] = arith.addf %{{.*}} - - // CHECK: %[[RES_VEC:.*]] = vector.broadcast %[[RES]] : f32 to vector - // CHECK: %[[CONDXIS0:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index - // CHECK: scf.if %[[CONDXIS0]] - // CHECK: vector.transfer_write %[[RES_VEC]] - // CHECK: gpu.barrier - // only checking the first 6 of 33 // EXEC: result[0]: hal.buffer_view // EXEC-NEXT: 33xf32=34567 34567 34567 34567 34567 34567 diff --git a/tests/transform_dialect/cuda/softmax.mlir b/tests/transform_dialect/cuda/softmax.mlir index 27464dba42fd..91bbc07b7a28 100644 --- a/tests/transform_dialect/cuda/softmax.mlir +++ b/tests/transform_dialect/cuda/softmax.mlir @@ -1,24 +1,7 @@ - -// RUN: iree-opt %s --iree-hal-target-backends=cuda \ -// RUN: --iree-abi-transformation-pipeline \ -// RUN: --iree-flow-transformation-pipeline \ -// RUN: --iree-flow-dispatch-use-transform-dialect=%p/softmax_dispatch_spec.mlir \ -// RUN: --iree-stream-transformation-pipeline \ -// RUN: --iree-hal-configuration-pipeline | \ -// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \ -// RUN: --iree-codegen-transform-dialect-library=%p/softmax_codegen_spec.mlir \ -// RUN: --iree-codegen-use-transform-dialect-strategy=codegen \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false | \ -// RUN: FileCheck %s --check-prefix=CHECK-SHUFFLE - -/// Constant JIT'ing must be disabled because the transform-dialect debug -/// flags leak to the JIT session, which doesn't know what to do with them. // RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-opt-const-expr-hoisting=false --iree-opt-const-eval=false \ // RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ // RUN: --iree-flow-dispatch-use-transform-dialect=%p/softmax_dispatch_spec.mlir \ -// RUN: --iree-codegen-transform-dialect-library=%p/softmax_codegen_spec.mlir \ -// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \ +// RUN: --iree-codegen-transform-dialect-library=%p/softmax_codegen_spec.mlir@codegen | \ // RUN: iree-run-module --module=- --function=softmax --device=cuda | \ // RUN: FileCheck %s @@ -27,12 +10,6 @@ !in_tensor_t = tensor<16x128x128xf32> !out_tensor_t = tensor<16x128x128xf32> -// Compilation checks that shuffles are produced. -// CHECK-SHUFFLE: vector.reduction -// CHECK-SHUFFLE-COUNT-5: gpu.shuffle xor -// CHECK-SHUFFLE: vector.reduction -// CHECK-SHUFFLE-COUNT-5: gpu.shuffle xor - // Execution only checks that @softmax runs. 
// CHECK: EXEC @softmax // CHECK: 16x128x128xf32=[ diff --git a/tests/transform_dialect/cuda/softmax_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_codegen_spec.mlir index 95a0374c8913..86f49091536d 100644 --- a/tests/transform_dialect/cuda/softmax_codegen_spec.mlir +++ b/tests/transform_dialect/cuda/softmax_codegen_spec.mlir @@ -109,6 +109,11 @@ module attributes { transform.with_named_sequence } { %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op transform.iree.vector.warp_distribute %end_func : (!transform.any_op) -> () + // Annotate the exported function as already translated. + %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op + %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param + transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param + transform.yield } } // module diff --git a/tests/transform_dialect/cuda/softmax_partial.mlir b/tests/transform_dialect/cuda/softmax_partial.mlir index 91032cbd40fd..018ad8c42ac8 100644 --- a/tests/transform_dialect/cuda/softmax_partial.mlir +++ b/tests/transform_dialect/cuda/softmax_partial.mlir @@ -1,31 +1,12 @@ - -// RUN: iree-opt %s --iree-hal-target-backends=cuda \ -// RUN: --iree-abi-transformation-pipeline \ -// RUN: --iree-flow-transformation-pipeline \ -// RUN: --iree-stream-transformation-pipeline \ -// RUN: --iree-hal-configuration-pipeline | \ -// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \ -// RUN: --iree-codegen-transform-dialect-library=%p/softmax_partial_codegen_spec.mlir \ -// RUN: --iree-codegen-use-transform-dialect-strategy=codegen \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false | \ -// RUN: FileCheck %s --check-prefix=CHECK-SHUFFLE - // RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-opt-const-expr-hoisting=false --iree-opt-const-eval=false \ -/// Constant JIT'ing must be disabled because the transform-dialect debug -/// flags leak to the JIT session, which doesn't know what to do with them. // RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/softmax_partial_codegen_spec.mlir \ -// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \ +// RUN: --iree-codegen-transform-dialect-library=%p/softmax_partial_codegen_spec.mlir@codegen | \ // RUN: iree-run-module --module=- --function=softmax_partial --device=cuda | \ // RUN: FileCheck %s !tmp_tensor_t = tensor<16x128xf32> !out_tensor_t = tensor<16x128x128xf32> -// Compilation checks that shuffles are produced. -// CHECK-SHUFFLE: gpu.shuffle xor - // Execution only checks that @softmax_partial runs. 
 // CHECK: EXEC @softmax_partial
 // CHECK: 16x128x128xf32=[
diff --git a/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir
index d62558fb302d..65ea847af92c 100644
--- a/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir
@@ -92,6 +92,11 @@ module attributes { transform.with_named_sequence } {
       : (!transform.any_op) -> !transform.any_op
     transform.iree.vector.warp_distribute %end_func : (!transform.any_op) -> ()
 
+    // Annotate the exported function as already translated.
+    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
+    transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
+
     transform.yield
   }
 } // module
diff --git a/tests/transform_dialect/cuda/softmax_v2.mlir b/tests/transform_dialect/cuda/softmax_v2.mlir
index 07e3c28b7cbf..5ef3a90347b9 100644
--- a/tests/transform_dialect/cuda/softmax_v2.mlir
+++ b/tests/transform_dialect/cuda/softmax_v2.mlir
@@ -1,23 +1,7 @@
-// RUN: iree-opt %s --iree-hal-target-backends=cuda \
-// RUN: --iree-abi-transformation-pipeline \
-// RUN: --iree-flow-transformation-pipeline \
-// RUN: --iree-flow-fuse-multi-use \
-// RUN: --iree-stream-transformation-pipeline \
-// RUN: --iree-hal-configuration-pipeline | \
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs, iree-llvmgpu-lower-executable-target)))' \
-// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN: --iree-codegen-transform-dialect-library=%p/softmax_v2_codegen_spec.mlir \
-// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \
-// RUN: FileCheck %s --check-prefix=CHECK-SHUFFLE
-
 // RUN: iree-compile %s --iree-hal-target-backends=cuda \
-// RUN: --iree-opt-const-expr-hoisting=false --iree-opt-const-eval=false \
-/// Constant JIT'ing must be disabled because the transform-dialect debug
-/// flags leak to the JIT session, which doesn't know what to do with them.
 // RUN: --iree-flow-fuse-multi-use \
 // RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN: --iree-codegen-transform-dialect-library=%p/softmax_v2_codegen_spec.mlir \
-// RUN: --iree-codegen-use-transform-dialect-strategy=codegen | \
+// RUN: --iree-codegen-transform-dialect-library=%p/softmax_v2_codegen_spec.mlir@codegen | \
 // RUN: iree-run-module --module=- --function=softmax --device=cuda | \
 // RUN: FileCheck %s
 
@@ -25,9 +9,6 @@
 !in_tensor_t = tensor<16x128x128xf32>
 !out_tensor_t = tensor<16x128x128xf32>
 
-// Compilation checks that shuffles are produced.
-// CHECK-SHUFFLE: gpu.shuffle xor
-
 // Execution only checks that @softmax runs.
 // CHECK: EXEC @softmax
 // CHECK: 16x128x128xf32=[
diff --git a/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir
index 773c93893520..c73cbe9e18d2 100644
--- a/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir
+++ b/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir
@@ -138,6 +138,11 @@ module attributes { transform.with_named_sequence } {
     transform.iree.apply_licm %func_op_3 : !transform.any_op
     transform.apply_cse to %func_op_3 : !transform.any_op
 
+    // Annotate the exported function as already translated.
+    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
+    %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
+    transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
+
     transform.yield
   }
 } // module
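For readers following the RUN-line changes above: the `@codegen` suffix on `--iree-codegen-transform-dialect-library` names the transform entry point inside the spec file. A minimal sketch of a library consumed this way is shown below; it is illustrative only (the strategy body is elided and the `%variant_op` argument name is a placeholder), not a quote of any spec in this patch.

module attributes { transform.with_named_sequence } {
  // Entry point selected by `--iree-codegen-transform-dialect-library=<library file>@codegen`.
  transform.named_sequence @codegen(%variant_op: !transform.any_op {transform.readonly}) {
    // ... tiling / vectorization / distribution strategy applied to %variant_op (elided) ...

    // Mark the exported function as already translated so no default
    // lowering pipeline is applied on top of this strategy.
    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
    transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
    transform.yield
  }
} // module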