diff --git a/compiler/plugins/target/MetalSPIRV/test/smoketest.mlir b/compiler/plugins/target/MetalSPIRV/test/smoketest.mlir
index d32ac8ef561f7..7a7af7c6e2cfe 100644
--- a/compiler/plugins/target/MetalSPIRV/test/smoketest.mlir
+++ b/compiler/plugins/target/MetalSPIRV/test/smoketest.mlir
@@ -6,7 +6,8 @@ module attributes {
       #hal.executable.target<"metal-spirv", "metal-msl-fb", {
         iree.gpu.target = #iree_gpu.target>
+          max_workgroup_sizes = [128, 128, 64], max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
       }>
     ]> : !hal.device
   ]
diff --git a/compiler/plugins/target/ROCM/test/target_device_features.mlir b/compiler/plugins/target/ROCM/test/target_device_features.mlir
index 15240f92ce33c..cffccce6dfd72 100644
--- a/compiler/plugins/target/ROCM/test/target_device_features.mlir
+++ b/compiler/plugins/target/ROCM/test/target_device_features.mlir
@@ -8,7 +8,8 @@
 // GFX942-SAME: subgroup = shuffle|arithmetic, dot = dp4xi8toi32,
 // GFX942-SAME: mma = [, , , , , ],
 // GFX942-SAME: subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024],
-// GFX942-SAME: max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>,
+// GFX942-SAME: max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
+// GFX942-SAME: max_workgroup_counts = [2147483647, 2147483647, 2147483647]>,
 // GFX942-SAME: chip = >
 // GFX940: target = #iree_gpu.target>
+          max_workgroup_sizes = [128, 128, 64], max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
       }>
     ]> : !hal.device
   ]
diff --git a/compiler/plugins/target/WebGPUSPIRV/test/smoketest.mlir b/compiler/plugins/target/WebGPUSPIRV/test/smoketest.mlir
index 69c5ceba58ba3..e0985808d5eeb 100644
--- a/compiler/plugins/target/WebGPUSPIRV/test/smoketest.mlir
+++ b/compiler/plugins/target/WebGPUSPIRV/test/smoketest.mlir
@@ -7,7 +7,8 @@ module attributes {
       #hal.executable.target<"webgpu-spirv", "webgpu-wgsl-fb", {
         iree.gpu.target = #iree_gpu.target>
+          max_workgroup_sizes = [128, 128, 64], max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
       }>
     ]> : !hal.device
   ]
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
index e421f4ea8e340..4c5b2d685b474 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
@@ -289,6 +289,8 @@ def IREEGPU_TargetWgpAttr : AttrDef {
     "uint32_t":$max_thread_count_per_workgroup,
     // The maximal number of shared memory bytes we can allocate per workgroup.
     "uint32_t":$max_workgroup_memory_bytes,
+    // The maximum number of workgroups per X/Y/Z dimension in a dispatch.
+ "DenseI32ArrayAttr":$max_workgroup_counts, // An optional extra dict // This field allows to inject more features/limits not supported in the diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/target_attrs.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/target_attrs.mlir index baa47b2be12ed..e8611005d71fb 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/target_attrs.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/target_attrs.mlir @@ -11,7 +11,8 @@ func.func @test_target_wgp() attributes { // CHECK-SAME: subgroup_size_choices = [32, 64], // CHECK-SAME: max_workgroup_sizes = [1024, 1024, 1024], // CHECK-SAME: max_thread_count_per_workgroup = 1024, - // CHECK-SAME: max_workgroup_memory_bytes = 65536> + // CHECK-SAME: max_workgroup_memory_bytes = 65536, + // CHECK-SAME: max_workgroup_counts = [2147483647, 2147483647, 2147483647]> wgp = #iree_gpu.target_wgp< compute = fp16|fp32|int8, storage = b16|b32, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, @@ -19,7 +20,8 @@ func.func @test_target_wgp() attributes { subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, - max_workgroup_memory_bytes = 65536 + max_workgroup_memory_bytes = 65536, + max_workgroup_counts = [2147483647, 2147483647, 2147483647] > } { return } @@ -37,7 +39,8 @@ func.func @test_target_wgp_none() attributes { subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, - max_workgroup_memory_bytes = 65536 + max_workgroup_memory_bytes = 65536, + max_workgroup_counts = [2147483647, 2147483647, 2147483647] > } { return } @@ -67,7 +70,8 @@ func.func @test_target() attributes { subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, - max_workgroup_memory_bytes = 65536>, + max_workgroup_memory_bytes = 65536, + max_workgroup_counts = [2147483647, 2147483647, 2147483647]>, chip = > } { return } diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp index 2f3d254b65877..520c26c047897 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp @@ -45,6 +45,7 @@ struct WgpDetails { std::array maxWorkgroupSizes; uint32_t maxThreadSize; uint32_t maxWorkgroupMemoryBytes; + std::array maxWorkgroupCounts; }; // Chip level feature/limit details @@ -106,7 +107,9 @@ TargetAttr createTargetAttr(const TargetDetails &details, StringRef arch, MMAOpsArrayAttr::get(context, mmaAttrs), DenseI32ArrayAttr::get(context, subgroupSizes), DenseI32ArrayAttr::get(context, wgp->maxWorkgroupSizes), - wgp->maxThreadSize, wgp->maxWorkgroupMemoryBytes, DictionaryAttr{}); + wgp->maxThreadSize, wgp->maxWorkgroupMemoryBytes, + DenseI32ArrayAttr::get(context, wgp->maxWorkgroupCounts), + DictionaryAttr{}); TargetChipAttr targetChip; if (details.chip) @@ -118,6 +121,10 @@ TargetAttr createTargetAttr(const TargetDetails &details, StringRef arch, //===----------------------------------------------------------------------===// // Known AMD target details +// +// Note: the max workgroup size is given as signed int32 max because MLIR's +// `index` is signed and the workgroup ID is sign-extended, not zero-extended, +// to 64-bits. 
 //===----------------------------------------------------------------------===//

 const WgpDetails *getCDNA3WgpDetails() {
       MMAIntrinsic::MFMA_I8_16x16x32_I32,
       MMAIntrinsic::MFMA_I8_32x32x16_I32,
   };
-  static const WgpDetails cdna3Wgp = {
-      allComputeBits, allStorageBits, allSubgroupOps,
-      allDotProductOps, ARRAY_SIZE(cdna3MMAOps), cdna3MMAOps,
-      {64, 64}, {1024, 1024, 1024}, 1024,
-      64 * 1024};
+  static const WgpDetails cdna3Wgp = {allComputeBits,
+                                      allStorageBits,
+                                      allSubgroupOps,
+                                      allDotProductOps,
+                                      ARRAY_SIZE(cdna3MMAOps),
+                                      cdna3MMAOps,
+                                      {64, 64},
+                                      {1024, 1024, 1024},
+                                      1024,
+                                      64 * 1024,
+                                      {0x7fffffff, 0x7fffffff, 0x7fffffff}};
   return &cdna3Wgp;
 }
@@ -142,11 +155,17 @@ const WgpDetails *getCDNA2WgpDetails() {
       MMAIntrinsic::MFMA_F16_16x16x16_F32,
       MMAIntrinsic::MFMA_F16_32x32x8_F32,
   };
-  static const WgpDetails cdna2Wgp = {
-      allComputeBits, allStorageBits, allSubgroupOps,
-      allDotProductOps, ARRAY_SIZE(cdna2MMAOps), cdna2MMAOps,
-      {64, 64}, {1024, 1024, 1024}, 1024,
-      64 * 1024};
+  static const WgpDetails cdna2Wgp = {allComputeBits,
+                                      allStorageBits,
+                                      allSubgroupOps,
+                                      allDotProductOps,
+                                      ARRAY_SIZE(cdna2MMAOps),
+                                      cdna2MMAOps,
+                                      {64, 64},
+                                      {1024, 1024, 1024},
+                                      1024,
+                                      64 * 1024,
+                                      {0x7fffffff, 0x7fffffff, 0x7fffffff}};
   return &cdna2Wgp;
 }
@@ -155,11 +174,17 @@ const WgpDetails *getCDNA1WgpDetails() {
       MMAIntrinsic::MFMA_F16_16x16x16_F32,
       MMAIntrinsic::MFMA_F16_32x32x8_F32,
   };
-  static const WgpDetails cdna1Wgp = {
-      allComputeBits, allStorageBits, allSubgroupOps,
-      allDotProductOps, ARRAY_SIZE(cdna1MMAOps), cdna1MMAOps,
-      {64, 64}, {1024, 1024, 1024}, 1024,
-      64 * 1024};
+  static const WgpDetails cdna1Wgp = {allComputeBits,
+                                      allStorageBits,
+                                      allSubgroupOps,
+                                      allDotProductOps,
+                                      ARRAY_SIZE(cdna1MMAOps),
+                                      cdna1MMAOps,
+                                      {64, 64},
+                                      {1024, 1024, 1024},
+                                      1024,
+                                      64 * 1024,
+                                      {0x7fffffff, 0x7fffffff, 0x7fffffff}};
   return &cdna1Wgp;
 }
@@ -168,27 +193,39 @@ const WgpDetails *getRDNA3WgpDetails() {
       MMAIntrinsic::WMMA_F16_16x16x16_F32,
       MMAIntrinsic::WMMA_F16_16x16x16_F16,
   };
-  static const WgpDetails rdna3Wgp = {
-      allComputeBits, allStorageBits, allSubgroupOps,
-      allDotProductOps, ARRAY_SIZE(rdna3MMAOps), rdna3MMAOps,
-      {32, 64}, {1024, 1024, 1024}, 1024,
-      64 * 1024};
+  static const WgpDetails rdna3Wgp = {allComputeBits,
+                                      allStorageBits,
+                                      allSubgroupOps,
+                                      allDotProductOps,
+                                      ARRAY_SIZE(rdna3MMAOps),
+                                      rdna3MMAOps,
+                                      {32, 64},
+                                      {1024, 1024, 1024},
+                                      1024,
+                                      64 * 1024,
+                                      {0x7fffffff, 0x7fffffff, 0x7fffffff}};
   return &rdna3Wgp;
 }

 const WgpDetails *getRDNA2WgpDetails() {
   static const WgpDetails rdna2Wgp = {
-      allComputeBits, allStorageBits, allSubgroupOps, allDotProductOps,
-      /*mmaCount=*/0, /*mmaOps=*/nullptr, {32, 64}, {1024, 1024, 1024},
-      1024, 64 * 1024};
+      allComputeBits, allStorageBits,
+      allSubgroupOps, allDotProductOps,
+      /*mmaCount=*/0,
+      /*mmaOps=*/nullptr, {32, 64},
+      {1024, 1024, 1024}, 1024,
+      64 * 1024, {0x7fffffff, 0x7fffffff, 0x7fffffff}};
   return &rdna2Wgp;
 }

 const WgpDetails *getRDNA1WgpDetails() {
   static const WgpDetails rdna1Wgp = {
-      allComputeBits, allStorageBits, allSubgroupOps, DotProductOps::None,
-      /*mmaCount=*/0, /*mmaOps=*/nullptr, {32, 64}, {1024, 1024, 1024},
-      1024, 64 * 1024};
+      allComputeBits, allStorageBits,
+      allSubgroupOps, DotProductOps::None,
+      /*mmaCount=*/0,
+      /*mmaOps=*/nullptr, {32, 64},
+      {1024, 1024, 1024}, 1024,
+      64 * 1024, {0x7fffffff, 0x7fffffff, 0x7fffffff}};
   return &rdna1Wgp;
 }
@@ -281,7 +318,9 @@ std::optional getAppleTargetDetails() {
     static const WgpDetails wgp = {
        computeBitwdiths, allStorageBits, allSubgroupOps, allDotProductOps,
        /*mmaCount=*/0, /*mmaOps=*/nullptr, {32, 32},
-       {1024, 1024, 1024}, 1024, 32 * 1024};
+       {1024, 1024, 1024}, 1024, 32 * 1024,
+       // Note: These values have not been checked and may be higher
+       {0xffff, 0xffff, 0xffff}};
   // clang-format on

   return TargetDetails{&wgp, nullptr};
 }
@@ -302,7 +341,9 @@ const WgpDetails *getValhallWgpDetails() {
   static const WgpDetails valhallWgp = {
       computeBitwdiths, allStorageBits, allSubgroupOps, allDotProductOps,
       /*mmaCount=*/0, /*mmaOps=*/nullptr, {16, 16}, {512, 512, 512},
-      512, 32 * 1024};
+      512, 32 * 1024,
+      // Note: These values have not been checked and may be higher
+      {0xffff, 0xffff, 0xffff}};
   // clang-format on
   return &valhallWgp;
 }
@@ -358,11 +399,17 @@ const WgpDetails *getAmpereWgpDetails() {
       MMAIntrinsic::WMMA_F16_16x16x16_F32,
       MMAIntrinsic::WMMA_F16_16x16x16_F16,
   };
-  static const WgpDetails ampereWgp = {
-      allComputeBits, allStorageBits, allSubgroupOps,
-      allDotProductOps, ARRAY_SIZE(mmaOps), mmaOps,
-      {32, 32}, {1024, 1024, 1024}, 1024,
-      163 * 1024};
+  static const WgpDetails ampereWgp = {allComputeBits,
+                                       allStorageBits,
+                                       allSubgroupOps,
+                                       allDotProductOps,
+                                       ARRAY_SIZE(mmaOps),
+                                       mmaOps,
+                                       {32, 32},
+                                       {1024, 1024, 1024},
+                                       1024,
+                                       163 * 1024,
+                                       {0x7fffffff, 0xffff, 0xffff}};
   return &ampereWgp;
 }
@@ -371,11 +418,17 @@ const WgpDetails *getTuringWgpDetails() {
       MMAIntrinsic::WMMA_F16_16x16x16_F32,
       MMAIntrinsic::WMMA_F16_16x16x16_F16,
   };
-  static const WgpDetails turingWgp = {
-      allComputeBits, allStorageBits, allSubgroupOps,
-      allDotProductOps, ARRAY_SIZE(mmaOps), mmaOps,
-      {32, 32}, {1024, 1024, 1024}, 1024,
-      64 * 1024};
+  static const WgpDetails turingWgp = {allComputeBits,
+                                       allStorageBits,
+                                       allSubgroupOps,
+                                       allDotProductOps,
+                                       ARRAY_SIZE(mmaOps),
+                                       mmaOps,
+                                       {32, 32},
+                                       {1024, 1024, 1024},
+                                       1024,
+                                       64 * 1024,
+                                       {0x7fffffff, 0xffff, 0xffff}};
   return &turingWgp;
 }
@@ -388,7 +441,8 @@ const WgpDetails *getVoltaWgpDetails() {
   static const WgpDetails voltaWgp = {
       allComputeBits, allStorageBits, allSubgroupOps, DotProductOps::None,
       ARRAY_SIZE(mmaOps), mmaOps, {32, 32}, {1024, 1024, 1024},
-      1024, 96 * 1024};
+      1024, 96 * 1024,
+      {0x7fffffff, 0xffff, 0xffff}};
   // clang-format on
   return &voltaWgp;
 }
@@ -398,7 +452,8 @@ const WgpDetails *getPascalWgpDetails() {
   static const WgpDetails pascalWgp = {
       allComputeBits, allStorageBits, allSubgroupOps, DotProductOps::None,
       0, nullptr, // Pascal does not have tensor core support.
-      {32, 32}, {1024, 1024, 1024}, 1024, 48 * 1024};
+      {32, 32}, {1024, 1024, 1024}, 1024, 48 * 1024,
+      {0x7fffffff, 0xffff, 0xffff}};
   // clang-format on
   return &pascalWgp;
 }
@@ -479,7 +534,9 @@ const WgpDetails *getAdrenoWgpDetails() {
       computeBitwdiths, storageBitwidths, allSubgroupOps,
       allDotProductOps, /*mmaCount=*/0, /*mmaOps=*/nullptr,
       {64, 64}, {1024, 1024, 1024}, 1024,
-      32 * 1024};
+      32 * 1024,
+      // Note: These values have not been checked and may be higher
+      {0xffff, 0xffff, 0xffff}};
   // clang-format on
   return &adrenoWgp;
 }
@@ -545,7 +602,8 @@ const WgpDetails *getAndroidBaseline2022WgpDetails() {
       computeBitwdiths, storageBitwidths, SubgroupOps::None,
       DotProductOps::None, /*mmaCount=*/0, /*mmaOps=*/nullptr,
       {64, 64}, {128, 128, 64}, 128,
-      16 * 1024};
+      16 * 1024,
+      {0xffff, 0xffff, 0xffff}};
   // clang-format on
   return &androidWgp;
 }
@@ -645,7 +703,8 @@ TargetAttr getWebGPUTargetDetails(MLIRContext *context) {
       computeBitwdiths, storageBitwidths, SubgroupOps::None,
       DotProductOps::None, /*mmaCount=*/0, /*mmaOps=*/nullptr,
       {32, 32}, {128, 128, 64}, 128,
-      16 * 1024};
+      16 * 1024,
+      {0xffff, 0xffff, 0xffff}};
   // clang-format on
   return createTargetAttr(
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir
index 08ef0e89d2cc3..6b0ae9ccbbe59 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir
@@ -93,7 +93,8 @@ func.func @conv_nhwc() {
         subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [],
         subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024],
-        max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>
+        max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
+        max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>
 #executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #target}>
 func.func @matmul_256x256x256() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
   %cst = arith.constant 0.000000e+00 : f32
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir
index e02c07d52c3c1..13f86b7514692 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir
@@ -10,7 +10,8 @@
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #map = affine_map<(d0, d1) -> (d0, d1)>
 func.func @copy_as_generic() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
@@ -44,7 +45,8 @@ func.func @copy_as_generic() attributes {hal.executable.target = #executable_tar
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 func.func @copy() attributes {hal.executable.target = #executable_target_vulkan_
@@ -81,7 +83,8 @@ func.func @copy() attributes {hal.executable.target = #executable_target_vulkan_
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 func.func @avg_pool() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
   %c0 = arith.constant 0 : index
@@ -118,7 +121,8 @@ func.func @avg_pool() attributes {hal.executable.target = #executable_target_vul
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 func.func @avg_pool() attributes {hal.executable.target = #executable_target_vul
@@ -162,7 +166,8 @@ func.func @avg_pool() attributes {hal.executable.target = #executable_target_vul
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 func.func @max_pool() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
   %cst = arith.constant 0xFF800000 : f32
@@ -203,7 +208,8 @@ func.func @max_pool() attributes {hal.executable.target = #executable_target_vul
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #map = affine_map<(d0, d1) -> (d0, d1)>
 #map1 = affine_map<(d0, d1) -> (d1)>
@@ -244,7 +250,8 @@ func.func @elementwise() attributes {hal.executable.target = #executable_target_
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
 func.func @dwconv_elementwise() attributes {hal.executable.target = #executable_
@@ -292,7 +299,8 @@ func.func @dwconv_elementwise() attributes {hal.executable.target = #executable_
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #map = affine_map<(d0, d1, d2) -> (d2, d0, d1)>
 #map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
@@ -332,7 +340,8 @@ func.func @outermost_reduction() attributes {hal.executable.target = #executable
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #map = affine_map<(d0, d1) -> (d0, d1)>
 #map1 = affine_map<(d0, d1) -> (d0)>
@@ -381,7 +390,8 @@ func.func @innermost_reduction() attributes {hal.executable.target = #executable
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #map = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)>
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
@@ -418,7 +428,8 @@ func.func @four_dim_elementwise() attributes {hal.executable.target = #executabl
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #map = affine_map<(d0, d1) -> (d0, d1)>
 #map1 = affine_map<(d0, d1) -> (d0)>
@@ -465,7 +476,8 @@ func.func @odd_reduction_dimension_size_501() attributes {hal.executable.target
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #map = affine_map<(d0, d1) -> (d0, d1)>
 #map1 = affine_map<(d0, d1) -> (d0)>
@@ -512,7 +524,8 @@ func.func @odd_reduction_dimension_size_2809() attributes {hal.executable.target
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #map = affine_map<(d0, d1, d2, d3) -> ()>
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir
index 9ff2c67156bf8..9370b6c429fa9 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir
@@ -13,7 +13,8 @@
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 func.func @batch_matmul_1x3x32() attributes {hal.executable.target = #executable
@@ -55,7 +56,8 @@ func.func @batch_matmul_1x3x32() attributes {hal.executable.target = #executable
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 func.func @matmul_64x16xi8() attributes {hal.executable.target = #executable_tar
@@ -96,7 +98,8 @@ func.func @matmul_64x16xi8() attributes {hal.executable.target = #executable_tar
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 func.func @matmul_64x16xi64() attributes {hal.executable.target = #executable_ta
@@ -137,7 +140,8 @@ func.func @matmul_64x16xi64() attributes {hal.executable.target = #executable_ta
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #map = affine_map<(d0, d1) -> (d1)>
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
@@ -189,7 +193,8 @@ func.func @matmul_400x273() attributes {hal.executable.target = #executable_targ
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #map = affine_map<(d0, d1) -> (d1)>
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
@@ -243,7 +248,8 @@ func.func @matmul_25x546() attributes {hal.executable.target = #executable_targe
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #map = affine_map<(d0, d1) -> (d0, d1)>
 func.func @matmul_pointwise_256x1024() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_reduction.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_reduction.mlir
index 09c4c36f12457..0f31e2af78dc2 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_reduction.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_reduction.mlir
@@ -10,7 +10,8 @@
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 512, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #map = affine_map<(d0, d1) -> (d0, d1)>
 #map1 = affine_map<(d0, d1) -> (d0)>
@@ -50,7 +51,8 @@ func.func @subgroup_reduce_f32() attributes {hal.executable.target = #executable
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 #map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
@@ -97,7 +99,8 @@ func.func @subgroup_reduce_f16() attributes {hal.executable.target = #executable
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #map = affine_map<(d0, d1) -> (d0, d1)>
 #map1 = affine_map<(d0, d1) -> (d0)>
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/convert_gpu_target.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/convert_gpu_target.mlir
index d8d3770c18485..d5141497aeb35 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/convert_gpu_target.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/convert_gpu_target.mlir
@@ -4,7 +4,8 @@ hal.executable @dispatch {
   hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb", {
     iree.gpu.target = #iree_gpu.target, ],
-    subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>}>) {
+    subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
+    max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>}>) {
   hal.executable.export public @dispatch ordinal(0) layout(#hal.pipeline.layout]>]>) {
   ^bb0(%arg0: !hal.device):
     %x, %y, %z = flow.dispatch.workgroup_count_from_slice
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/emulate_i64.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/emulate_i64.mlir
index a7a2f9d854699..eb1c28116fbf7 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/emulate_i64.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/emulate_i64.mlir
@@ -13,7 +13,8 @@
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 func.func @buffer_types() attributes {hal.executable.target = #executable_target
@@ -50,7 +51,8 @@ func.func @buffer_types() attributes {hal.executable.target = #executable_target
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 func.func @emulate_1d_vector() attributes {hal.executable.target = #executable_t
@@ -103,7 +105,8 @@ func.func @emulate_1d_vector() attributes {hal.executable.target = #executable_t
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 func.func @no_emulation() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
   %c0 = arith.constant 0 : index
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir
index bb7668854bd60..978bf7ad72efb 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir
@@ -14,7 +14,8 @@
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #translation = #iree_codegen.translation_info
 #compilation = #iree_codegen.compilation_info
@@ -42,7 +43,8 @@ func.func @illegal() attributes {hal.executable.target = #executable_target_vulk
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #translation = #iree_codegen.translation_info
 #compilation = #iree_codegen.compilation_info
@@ -70,7 +72,8 @@ func.func @illegal() attributes {hal.executable.target = #executable_target_vulk
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #translation = #iree_codegen.translation_info
 #compilation = #iree_codegen.compilation_info
@@ -98,7 +101,8 @@ func.func @illegal() attributes {hal.executable.target = #executable_target_vulk
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #translation = #iree_codegen.translation_info
 #compilation = #iree_codegen.compilation_info
@@ -126,7 +130,8 @@ func.func @illegal() attributes {hal.executable.target = #executable_target_vulk
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #translation = #iree_codegen.translation_info
 #compilation = #iree_codegen.compilation_info
@@ -154,7 +159,8 @@ func.func @illegal() attributes {hal.executable.target = #executable_target_vulk
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #translation = #iree_codegen.translation_info
 #compilation = #iree_codegen.compilation_info
@@ -182,7 +188,8 @@ func.func @illegal() attributes {hal.executable.target = #executable_target_vulk
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #translation = #iree_codegen.translation_info
 #compilation = #iree_codegen.compilation_info
@@ -210,7 +217,8 @@ func.func @illegal() attributes {hal.executable.target = #executable_target_vulk
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #translation = #iree_codegen.translation_info
 #compilation = #iree_codegen.compilation_info
@@ -239,7 +247,8 @@ func.func @illegal() attributes {hal.executable.target = #executable_target_vulk
         compute = fp32|fp16|int32, storage = b32|b16,
         subgroup = none, dot = none, mma = [, ],
         subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024],
-        max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>
+        max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
+        max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #translation = #iree_codegen.translation_info
 #compilation = #iree_codegen.compilation_info
@@ -274,7 +283,8 @@ func.func @matmul_tensor() attributes {hal.executable.target = #executable_targe
         compute = fp32|fp16|int32, storage = b32|b16,
         subgroup = none, dot = none, mma = [, ],
         subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024],
-        max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>
+        max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
+        max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #translation = #iree_codegen.translation_info
 #compilation = #iree_codegen.compilation_info
@@ -309,7 +319,8 @@ func.func @matmul_tensor() attributes {hal.executable.target = #executable_targe
         compute = fp32|fp16|int32, storage = b32|b16,
         subgroup = none, dot = none, mma = [, ],
         subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024],
-        max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>
+        max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
+        max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #translation = #iree_codegen.translation_info
 #compilation = #iree_codegen.compilation_info
@@ -344,7 +355,8 @@ func.func @matmul_tensor() attributes {hal.executable.target = #executable_targe
         compute = fp32|fp16|int32, storage = b32|b16,
         subgroup = none, dot = none, mma = [, ],
         subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024],
-        max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>
+        max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
+        max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #translation = #iree_codegen.translation_info
 #compilation = #iree_codegen.compilation_info
@@ -379,7 +391,8 @@ func.func @matmul_tensor() attributes {hal.executable.target = #executable_targe
         compute = fp32|fp16|int32, storage = b32|b16,
         subgroup = none, dot = none, mma = [, ],
         subgroup_size_choices = [32, 64], max_workgroup_sizes = [1024, 1024, 1024],
-        max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>
+        max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
+        max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #translation = #iree_codegen.translation_info
 #compilation = #iree_codegen.compilation_info
@@ -413,7 +426,8 @@ func.func @matmul_tensor() attributes {hal.executable.target = #executable_targe
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #map = affine_map<()[s0] -> (s0 * 4)>
 #map1 = affine_map<()[s0] -> (s0 * 16)>
@@ -472,7 +486,8 @@ func.func @illegal() attributes {hal.executable.target = #executable_target_vulk
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #map = affine_map<()[s0] -> (s0 * 4)>
 #map1 = affine_map<()[s0] -> (s0 * 16)>
@@ -531,7 +546,8 @@ func.func @illegal() attributes {hal.executable.target = #executable_target_vulk
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #map = affine_map<()[s0] -> (s0 * 4)>
 #map1 = affine_map<()[s0] -> (s0 * 16)>
@@ -590,7 +606,8 @@ func.func @illegal() attributes {hal.executable.target = #executable_target_vulk
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #translation = #iree_codegen.translation_info
 #compilation = #iree_codegen.compilation_info
@@ -618,7 +635,8 @@ func.func @illegal() attributes {hal.executable.target = #executable_target_vulk
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #translation = #iree_codegen.translation_info
 #compilation = #iree_codegen.compilation_info
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_reduction.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_reduction.mlir
index 243a361dc81b2..6d4d16315827a 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_reduction.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_reduction.mlir
@@ -12,7 +12,8 @@
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 65536,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #map = affine_map<(d0, d1) -> (d0, d1)>
 #map1 = affine_map<(d0, d1) -> (d0)>
@@ -100,7 +101,8 @@ func.func @warp_reduction_dispatch() attributes {hal.executable.target = #execut
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 65536,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 #map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
@@ -183,7 +185,8 @@ func.func @warp_reduction_dispatch() attributes {hal.executable.target = #execut
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 func.func @softmax() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
   %c0 = arith.constant 0 : index
@@ -290,7 +293,8 @@ func.func @softmax() attributes {hal.executable.target = #executable_target_vulk
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>
 func.func @dynamic_softmax() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
   %c32_i64 = arith.constant 32 : i64
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/map_memref_storage_class.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/map_memref_storage_class.mlir
index 9c622d8e2b302..a2507c3798d35 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/map_memref_storage_class.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/map_memref_storage_class.mlir
@@ -6,7 +6,7 @@
     compute = fp32|int32, storage = b32, subgroup = shuffle|arithmetic, dot = none, mma = [],
     subgroup_size_choices = [64], max_workgroup_sizes = [128, 128, 64], max_thread_count_per_workgroup = 128,
-    max_workgroup_memory_bytes = 16384>>}>
+    max_workgroup_memory_bytes = 16384, max_workgroup_counts = [65535, 65535, 65535]>>}>
 func.func @vulkan_client_api() attributes {hal.executable.target = #target} {
   %0 = "dialect.memref_producer"() : () -> (memref>)
@@ -45,7 +45,7 @@ func.func @vulkan_client_api() attributes {hal.executable.target = #target} {
    compute = fp32|int32, storage = b32, subgroup = shuffle|arithmetic, dot = none, mma = [],
     subgroup_size_choices = [64], max_workgroup_sizes = [128, 128, 64], max_thread_count_per_workgroup = 128,
-    max_workgroup_memory_bytes = 16384>>}>
+    max_workgroup_memory_bytes = 16384, max_workgroup_counts = [65535, 65535, 65535]>>}>
 func.func @opencl_client_api() attributes {hal.executable.target = #target} {
   %0 = "dialect.memref_producer"() : () -> (memref>)
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matvec.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matvec.mlir
index aa5d7cb5ea784..aed1a325aaee8 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matvec.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matvec.mlir
@@ -16,7 +16,8 @@ hal.executable @i4_dequant_unit_matmul_f16 {
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>) {
     hal.executable.export @i4_dequant_unit_matmul_f16 layout(#pipeline_layout) {
     ^bb0(%arg0: !hal.device):
@@ -125,7 +126,8 @@ hal.executable @i4_dequant_matvec_f16_subgroup_64 {
         iree.gpu.target = #iree_gpu.target>
+          max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
+          max_workgroup_counts = [65535, 65535, 65535]>>
 }>) {
     hal.executable.export @i4_dequant_matvec_f16_subgroup_64 layout(#pipeline_layout) {
     ^bb0(%arg0: !hal.device):
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_conv.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_conv.mlir
index 105012f62bd07..50aa57e57b66f 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_conv.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_conv.mlir
@@ -30,7 +30,8 @@ func.func @nwc_conv_1d_dot_prod(%input: tensor<1x7x3xi8>, %filter: tensor<1x3x4x
     iree.gpu.target = #iree_gpu.target>
+      max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
+      max_workgroup_counts = [65535, 65535, 65535]>>
 } {
   %c0 = arith.constant 0 : i32
   %i0 = arith.constant 0 : index
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir
index c2da6a555217b..03644e7c9beca 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_matmul.mlir
@@ -271,7 +271,8 @@ func.func @matmul_4x4x4_i8_to_i32_dot_prod(%lhs: tensor<4x4xi8>, %rhs : tensor<4
     iree.gpu.target = #iree_gpu.target>
+      max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
+      max_workgroup_counts = [65535, 65535, 65535]>>
 } {
   %c0 = arith.constant 0 : i32
   %i0 = arith.constant 0 : index
@@ -330,7 +331,8 @@ func.func @matmul_4x16x4_i8_to_i32_dot_prod(%lhs: tensor<4x16xi8>, %rhs : tensor
     iree.gpu.target = #iree_gpu.target>
+      max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
+      max_workgroup_counts = [65535, 65535, 65535]>>
 } {
   %c0 = arith.constant 0 : i32
   %i0 = arith.constant 0 : index
diff --git a/compiler/src/iree/compiler/Preprocessing/Common/test/pad_to_intrinsics_mfma.mlir b/compiler/src/iree/compiler/Preprocessing/Common/test/pad_to_intrinsics_mfma.mlir
index 7d9da4586c212..4b7b0fce76e26 100644
--- a/compiler/src/iree/compiler/Preprocessing/Common/test/pad_to_intrinsics_mfma.mlir
+++ b/compiler/src/iree/compiler/Preprocessing/Common/test/pad_to_intrinsics_mfma.mlir
@@ -69,7 +69,8 @@ func.func @main1(%arg0: tensor<2x130x130x320xf16>, %arg1: tensor<3x3x320x4xf16>,
         subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [],
         subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024],
-        max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>>
+        max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536,
+        max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>
 #rocm_executable_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #target, ukernels = "none"}>
"none"}> // CHECK-LABEL: func.func @main2( diff --git a/samples/custom_dispatch/vulkan/shaders/example.mlir b/samples/custom_dispatch/vulkan/shaders/example.mlir index 69843aa1691e6..d9cb5e1d07b42 100644 --- a/samples/custom_dispatch/vulkan/shaders/example.mlir +++ b/samples/custom_dispatch/vulkan/shaders/example.mlir @@ -19,7 +19,8 @@ compute = fp32|int32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [64, 64], max_workgroup_sizes = [128, 128, 64], max_thread_count_per_workgroup = 128, - max_workgroup_memory_bytes = 16384> + max_workgroup_memory_bytes = 16384, + max_workgroup_counts = [65535, 65535, 65535]> > }> diff --git a/samples/custom_dispatch/vulkan/shaders/example_inline.mlir b/samples/custom_dispatch/vulkan/shaders/example_inline.mlir index 41576518089f7..2882134a05671 100644 --- a/samples/custom_dispatch/vulkan/shaders/example_inline.mlir +++ b/samples/custom_dispatch/vulkan/shaders/example_inline.mlir @@ -19,7 +19,8 @@ compute = fp32|int32, storage = b32, subgroup = none, dot = none, mma = [], subgroup_size_choices = [64, 64], max_workgroup_sizes = [128, 128, 64], max_thread_count_per_workgroup = 128, - max_workgroup_memory_bytes = 16384> + max_workgroup_memory_bytes = 16384, + max_workgroup_counts = [65535, 65535, 65535]> > }> diff --git a/samples/custom_dispatch/vulkan/shaders/example_transform.mlir b/samples/custom_dispatch/vulkan/shaders/example_transform.mlir index 4bea02d2210ee..08c40a16c927b 100644 --- a/samples/custom_dispatch/vulkan/shaders/example_transform.mlir +++ b/samples/custom_dispatch/vulkan/shaders/example_transform.mlir @@ -23,7 +23,8 @@ compute = fp32|int32, storage = b32, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [64, 64], max_workgroup_sizes = [128, 128, 64], max_thread_count_per_workgroup = 128, - max_workgroup_memory_bytes = 16384> + max_workgroup_memory_bytes = 16384, + max_workgroup_counts = [65535, 65535, 65535]> > }> diff --git a/samples/custom_dispatch/vulkan/shaders/example_transform_spec.mlir b/samples/custom_dispatch/vulkan/shaders/example_transform_spec.mlir index 5bcdafe7fba1e..8e232069fa153 100644 --- a/samples/custom_dispatch/vulkan/shaders/example_transform_spec.mlir +++ b/samples/custom_dispatch/vulkan/shaders/example_transform_spec.mlir @@ -12,7 +12,8 @@ compute = fp32|int32, storage = b32, subgroup = shuffle|arithmetic, dot = none, mma = [], subgroup_size_choices = [64, 64], max_workgroup_sizes = [128, 128, 64], max_thread_count_per_workgroup = 128, - max_workgroup_memory_bytes = 16384> + max_workgroup_memory_bytes = 16384, + max_workgroup_counts = [65535, 65535, 65535]> > }> diff --git a/samples/transform_dialect/example_module.mlir b/samples/transform_dialect/example_module.mlir index 723bbbf70895c..2fb3498a1b3e1 100644 --- a/samples/transform_dialect/example_module.mlir +++ b/samples/transform_dialect/example_module.mlir @@ -27,7 +27,7 @@ #target = #iree_gpu.target> + max_workgroup_sizes = [128, 128, 64], max_thread_count_per_workgroup = 128, max_workgroup_memory_bytes = 16384, max_workgroup_counts = [65535, 65535, 65535]>> #pipeline_layout_0 = #hal.pipeline.layout, <1, storage_buffer>]>]> #pipeline_layout_1 = #hal.pipeline.layout, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>