From edc5a556d63ed9a8b4324ba751ff79e82c973e8e Mon Sep 17 00:00:00 2001 From: Stuart Carnie Date: Fri, 29 Nov 2024 15:44:26 +1100 Subject: [PATCH] Metal: Support Apple4 GPUs (2017 era iOS devices) Closes #99682 --- drivers/metal/metal_device_properties.h | 25 +- drivers/metal/metal_device_properties.mm | 1 + drivers/metal/metal_objects.h | 21 +- drivers/metal/metal_objects.mm | 449 +++++++++-- drivers/metal/rendering_device_driver_metal.h | 6 +- .../metal/rendering_device_driver_metal.mm | 151 +++- thirdparty/README.md | 2 +- thirdparty/spirv-cross/spirv.hpp | 12 +- thirdparty/spirv-cross/spirv_common.hpp | 10 +- thirdparty/spirv-cross/spirv_cross.cpp | 16 +- .../spirv-cross/spirv_cross_parsed_ir.cpp | 7 +- thirdparty/spirv-cross/spirv_glsl.cpp | 116 ++- thirdparty/spirv-cross/spirv_msl.cpp | 747 ++++++++++++++++-- thirdparty/spirv-cross/spirv_msl.hpp | 16 +- thirdparty/spirv-cross/spirv_reflect.cpp | 2 + 15 files changed, 1351 insertions(+), 230 deletions(-) diff --git a/drivers/metal/metal_device_properties.h b/drivers/metal/metal_device_properties.h index 7467e8ceb49a..8d4cdf2f700a 100644 --- a/drivers/metal/metal_device_properties.h +++ b/drivers/metal/metal_device_properties.h @@ -71,18 +71,19 @@ typedef NS_OPTIONS(NSUInteger, SampleCount) { }; struct API_AVAILABLE(macos(11.0), ios(14.0)) MetalFeatures { - uint32_t mslVersion; - MTLGPUFamily highestFamily; - MTLLanguageVersion mslVersionEnum; - SampleCount supportedSampleCounts; - long hostMemoryPageSize; - bool layeredRendering; - bool multisampleLayeredRendering; - bool quadPermute; /**< If true, quadgroup permutation functions (vote, ballot, shuffle) are supported in shaders. */ - bool simdPermute; /**< If true, SIMD-group permutation functions (vote, ballot, shuffle) are supported in shaders. */ - bool simdReduction; /**< If true, SIMD-group reduction functions (arithmetic) are supported in shaders. */ - bool tessellationShader; /**< If true, tessellation shaders are supported. */ - bool imageCubeArray; /**< If true, image cube arrays are supported. */ + uint32_t mslVersion = 0; + MTLGPUFamily highestFamily = MTLGPUFamilyApple6; + MTLLanguageVersion mslVersionEnum = MTLLanguageVersion1_2; + SampleCount supportedSampleCounts = SampleCount1; + long hostMemoryPageSize = 0; + bool layeredRendering = false; + bool multisampleLayeredRendering = false; + bool quadPermute = false; /**< If true, quadgroup permutation functions (vote, ballot, shuffle) are supported in shaders. */ + bool simdPermute = false; /**< If true, SIMD-group permutation functions (vote, ballot, shuffle) are supported in shaders. */ + bool simdReduction = false; /**< If true, SIMD-group reduction functions (arithmetic) are supported in shaders. */ + bool tessellationShader = false; /**< If true, tessellation shaders are supported. */ + bool imageCubeArray = false; /**< If true, image cube arrays are supported. 
*/ + MTLArgumentBuffersTier argument_buffers_tier = MTLArgumentBuffersTier1; }; struct MetalLimits { diff --git a/drivers/metal/metal_device_properties.mm b/drivers/metal/metal_device_properties.mm index 857fa8c66ee1..2c3f99dd50c2 100644 --- a/drivers/metal/metal_device_properties.mm +++ b/drivers/metal/metal_device_properties.mm @@ -98,6 +98,7 @@ features.quadPermute = [p_device supportsFamily:MTLGPUFamilyApple4]; features.simdPermute = [p_device supportsFamily:MTLGPUFamilyApple6]; features.simdReduction = [p_device supportsFamily:MTLGPUFamilyApple7]; + features.argument_buffers_tier = p_device.argumentBuffersSupport; MTLCompileOptions *opts = [MTLCompileOptions new]; features.mslVersionEnum = opts.languageVersion; // By default, Metal uses the most recent language version. diff --git a/drivers/metal/metal_objects.h b/drivers/metal/metal_objects.h index 1870221b22c0..4cb929a9081f 100644 --- a/drivers/metal/metal_objects.h +++ b/drivers/metal/metal_objects.h @@ -696,11 +696,12 @@ class API_AVAILABLE(macos(11.0), ios(14.0)) MDShader { public: CharString name; Vector sets; + bool uses_argument_buffers = true; virtual void encode_push_constant_data(VectorView p_data, MDCommandBuffer *p_cb) = 0; - MDShader(CharString p_name, Vector p_sets) : - name(p_name), sets(p_sets) {} + MDShader(CharString p_name, Vector p_sets, bool p_uses_argument_buffers) : + name(p_name), sets(p_sets), uses_argument_buffers(p_uses_argument_buffers) {} virtual ~MDShader() = default; }; @@ -719,7 +720,7 @@ class API_AVAILABLE(macos(11.0), ios(14.0)) MDComputeShader final : public MDSha void encode_push_constant_data(VectorView p_data, MDCommandBuffer *p_cb) final; - MDComputeShader(CharString p_name, Vector p_sets, MDLibrary *p_kernel); + MDComputeShader(CharString p_name, Vector p_sets, bool p_uses_argument_buffers, MDLibrary *p_kernel); }; class API_AVAILABLE(macos(11.0), ios(14.0)) MDRenderShader final : public MDShader { @@ -746,8 +747,9 @@ class API_AVAILABLE(macos(11.0), ios(14.0)) MDRenderShader final : public MDShad void encode_push_constant_data(VectorView p_data, MDCommandBuffer *p_cb) final; MDRenderShader(CharString p_name, - bool p_needs_view_mask_buffer, Vector p_sets, + bool p_needs_view_mask_buffer, + bool p_uses_argument_buffers, MDLibrary *p_vert, MDLibrary *p_frag); }; @@ -783,12 +785,21 @@ struct BoundUniformSet { }; class API_AVAILABLE(macos(11.0), ios(14.0)) MDUniformSet { +private: + void bind_uniforms_argument_buffers(MDShader *p_shader, MDCommandBuffer::RenderState &p_state); + void bind_uniforms_direct(MDShader *p_shader, MDCommandBuffer::RenderState &p_state); + void bind_uniforms_argument_buffers(MDShader *p_shader, MDCommandBuffer::ComputeState &p_state); + void bind_uniforms_direct(MDShader *p_shader, MDCommandBuffer::ComputeState &p_state); + public: uint32_t index; LocalVector uniforms; HashMap bound_uniforms; - BoundUniformSet &boundUniformSetForShader(MDShader *p_shader, id p_device); + void bind_uniforms(MDShader *p_shader, MDCommandBuffer::RenderState &p_state); + void bind_uniforms(MDShader *p_shader, MDCommandBuffer::ComputeState &p_state); + + BoundUniformSet &bound_uniform_set(MDShader *p_shader, id p_device, ResourceUsageMap &p_resource_usage); }; class API_AVAILABLE(macos(11.0), ios(14.0)) MDPipeline { diff --git a/drivers/metal/metal_objects.mm b/drivers/metal/metal_objects.mm index a3a2f75fef60..b44c496e4f6f 100644 --- a/drivers/metal/metal_objects.mm +++ b/drivers/metal/metal_objects.mm @@ -249,7 +249,7 @@ const MDSubpass &subpass = render.get_subpass(); uint32_t vertex_count 
= p_rects.size() * 6 * subpass.view_count; - simd::float4 vertices[vertex_count]; + simd::float4 *vertices = ALLOCA_ARRAY(simd::float4, vertex_count); simd::float4 clear_colors[ClearAttKey::ATTACHMENT_COUNT]; Size2i size = render.frameBuffer->size; @@ -362,7 +362,7 @@ if (render.dirty.has_flag(RenderState::DIRTY_SCISSOR) && !render.scissors.is_empty()) { size_t len = render.scissors.size(); - MTLScissorRect rects[len]; + MTLScissorRect *rects = ALLOCA_ARRAY(MTLScissorRect, len); for (size_t i = 0; i < len; i++) { rects[i] = render.clip_to_render_area(render.scissors[i]); } @@ -466,9 +466,7 @@ uint64_t set_uniforms = render.uniform_set_mask; render.uniform_set_mask = 0; - id enc = render.encoder; MDRenderShader *shader = render.pipeline->shader; - id device = enc.device; while (set_uniforms != 0) { // Find the index of the next set bit. @@ -479,25 +477,7 @@ if (set == nullptr || set->index >= (uint32_t)shader->sets.size()) { continue; } - UniformSet const &set_info = shader->sets[set->index]; - - BoundUniformSet &bus = set->boundUniformSetForShader(shader, device); - bus.merge_into(render.resource_usage); - - // Set the buffer for the vertex stage. - { - uint32_t const *offset = set_info.offsets.getptr(RDD::SHADER_STAGE_VERTEX); - if (offset) { - [enc setVertexBuffer:bus.buffer offset:*offset atIndex:set->index]; - } - } - // Set the buffer for the fragment stage. - { - uint32_t const *offset = set_info.offsets.getptr(RDD::SHADER_STAGE_FRAGMENT); - if (offset) { - [enc setFragmentBuffer:bus.buffer offset:*offset atIndex:set->index]; - } - } + set->bind_uniforms(shader, render); } } @@ -968,54 +948,21 @@ void MDCommandBuffer::compute_bind_uniform_set(RDD::UniformSetID p_uniform_set, RDD::ShaderID p_shader, uint32_t p_set_index) { DEV_ASSERT(type == MDCommandBufferStateType::Compute); - id enc = compute.encoder; - id device = enc.device; - MDShader *shader = (MDShader *)(p_shader.id); - UniformSet const &set_info = shader->sets[p_set_index]; - MDUniformSet *set = (MDUniformSet *)(p_uniform_set.id); - BoundUniformSet &bus = set->boundUniformSetForShader(shader, device); - bus.merge_into(compute.resource_usage); - - uint32_t const *offset = set_info.offsets.getptr(RDD::SHADER_STAGE_COMPUTE); - if (offset) { - [enc setBuffer:bus.buffer offset:*offset atIndex:p_set_index]; - } + set->bind_uniforms(shader, compute); } void MDCommandBuffer::compute_bind_uniform_sets(VectorView p_uniform_sets, RDD::ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) { DEV_ASSERT(type == MDCommandBufferStateType::Compute); - id enc = compute.encoder; - id device = enc.device; - MDShader *shader = (MDShader *)(p_shader.id); - thread_local LocalVector<__unsafe_unretained id> buffers; - thread_local LocalVector offsets; - - buffers.resize(p_set_count); - offsets.resize(p_set_count); - + // TODO(sgc): Bind multiple buffers using [encoder setBuffers:offsets:withRange:] for (size_t i = 0u; i < p_set_count; ++i) { - UniformSet const &set_info = shader->sets[p_first_set_index + i]; - MDUniformSet *set = (MDUniformSet *)(p_uniform_sets[i].id); - BoundUniformSet &bus = set->boundUniformSetForShader(shader, device); - bus.merge_into(compute.resource_usage); - - uint32_t const *offset = set_info.offsets.getptr(RDD::SHADER_STAGE_COMPUTE); - if (offset) { - buffers[i] = bus.buffer; - offsets[i] = *offset; - } else { - buffers[i] = nullptr; - offsets[i] = 0u; - } + set->bind_uniforms(shader, compute); } - - [enc setBuffers:buffers.ptr() offsets:offsets.ptr() withRange:NSMakeRange(p_first_set_index, p_set_count)]; } 
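Context for the bind_uniforms() split above: with argument buffers the driver conceptually encodes an entire descriptor set into one MTLBuffer and binds it with a single call per stage, while the direct path added for Tier 1 hardware must issue one encoder call per resource. A minimal sketch of the two models, assuming caller-provided objects; the function names are illustrative, not the driver's API:

#import <Metal/Metal.h>

// Direct binding (argument buffers disabled): one encoder call per resource,
// each consuming an individual texture/sampler/buffer slot.
static void bind_direct(id<MTLRenderCommandEncoder> enc,
		id<MTLTexture> tex, id<MTLSamplerState> samp) {
	[enc setVertexTexture:tex atIndex:0];
	[enc setVertexSamplerState:samp atIndex:0];
}

// Argument buffers: the whole set is encoded once into a single MTLBuffer and
// bound with one call; indirectly referenced resources must be made resident.
static void bind_argument_buffer(id<MTLDevice> device, id<MTLFunction> func,
		id<MTLRenderCommandEncoder> enc, id<MTLTexture> tex, id<MTLSamplerState> samp) {
	id<MTLArgumentEncoder> arg_enc = [func newArgumentEncoderWithBufferIndex:0];
	id<MTLBuffer> arg_buf = [device newBufferWithLength:arg_enc.encodedLength
			options:MTLResourceStorageModeShared];
	[arg_enc setArgumentBuffer:arg_buf offset:0];
	[arg_enc setTexture:tex atIndex:0];
	[arg_enc setSamplerState:samp atIndex:1];
	[enc useResource:tex usage:MTLResourceUsageRead stages:MTLRenderStageVertex];
	[enc setVertexBuffer:arg_buf offset:0 atIndex:0];
}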
void MDCommandBuffer::compute_dispatch(uint32_t p_x_groups, uint32_t p_y_groups, uint32_t p_z_groups) { @@ -1052,8 +999,11 @@ type = MDCommandBufferStateType::None; } -MDComputeShader::MDComputeShader(CharString p_name, Vector p_sets, MDLibrary *p_kernel) : - MDShader(p_name, p_sets), kernel(p_kernel) { +MDComputeShader::MDComputeShader(CharString p_name, + Vector p_sets, + bool p_uses_argument_buffers, + MDLibrary *p_kernel) : + MDShader(p_name, p_sets, p_uses_argument_buffers), kernel(p_kernel) { } void MDComputeShader::encode_push_constant_data(VectorView p_data, MDCommandBuffer *p_cb) { @@ -1071,15 +1021,19 @@ } MDRenderShader::MDRenderShader(CharString p_name, - bool p_needs_view_mask_buffer, Vector p_sets, + bool p_needs_view_mask_buffer, + bool p_uses_argument_buffers, MDLibrary *_Nonnull p_vert, MDLibrary *_Nonnull p_frag) : - MDShader(p_name, p_sets), needs_view_mask_buffer(p_needs_view_mask_buffer), vert(p_vert), frag(p_frag) { + MDShader(p_name, p_sets, p_uses_argument_buffers), + needs_view_mask_buffer(p_needs_view_mask_buffer), + vert(p_vert), + frag(p_frag) { } void MDRenderShader::encode_push_constant_data(VectorView p_data, MDCommandBuffer *p_cb) { DEV_ASSERT(p_cb->type == MDCommandBufferStateType::Render); - id enc = p_cb->render.encoder; + id __unsafe_unretained enc = p_cb->render.encoder; void const *ptr = p_data.ptr(); size_t length = p_data.size() * sizeof(uint32_t); @@ -1093,9 +1047,373 @@ } } -BoundUniformSet &MDUniformSet::boundUniformSetForShader(MDShader *p_shader, id p_device) { +void MDUniformSet::bind_uniforms_argument_buffers(MDShader *p_shader, MDCommandBuffer::RenderState &p_state) { + DEV_ASSERT(p_shader->uses_argument_buffers); + DEV_ASSERT(p_state.encoder != nil); + + UniformSet const &set_info = p_shader->sets[index]; + + id __unsafe_unretained enc = p_state.encoder; + id __unsafe_unretained device = enc.device; + + BoundUniformSet &bus = bound_uniform_set(p_shader, device, p_state.resource_usage); + + // Set the buffer for the vertex stage. + { + uint32_t const *offset = set_info.offsets.getptr(RDD::SHADER_STAGE_VERTEX); + if (offset) { + [enc setVertexBuffer:bus.buffer offset:*offset atIndex:index]; + } + } + // Set the buffer for the fragment stage. + { + uint32_t const *offset = set_info.offsets.getptr(RDD::SHADER_STAGE_FRAGMENT); + if (offset) { + [enc setFragmentBuffer:bus.buffer offset:*offset atIndex:index]; + } + } +} + +void MDUniformSet::bind_uniforms_direct(MDShader *p_shader, MDCommandBuffer::RenderState &p_state) { + DEV_ASSERT(!p_shader->uses_argument_buffers); + DEV_ASSERT(p_state.encoder != nil); + + id __unsafe_unretained enc = p_state.encoder; + + UniformSet const &set = p_shader->sets[index]; + + for (uint32_t i = 0; i < uniforms.size(); i++) { + RDD::BoundUniform const &uniform = uniforms[i]; + UniformInfo ui = set.uniforms[i]; + + static const RDC::ShaderStage stage_usages[2] = { RDC::ShaderStage::SHADER_STAGE_VERTEX, RDC::ShaderStage::SHADER_STAGE_FRAGMENT }; + for (const RDC::ShaderStage stage : stage_usages) { + ShaderStageUsage const stage_usage = ShaderStageUsage(1 << stage); + + BindingInfo *bi = ui.bindings.getptr(stage); + if (bi == nullptr) { + // No binding for this stage. + continue; + } + + if ((ui.active_stages & stage_usage) == 0) { + // Not active for this state, so don't bind anything. 
+ continue; + } + + switch (uniform.type) { + case RDD::UNIFORM_TYPE_SAMPLER: { + size_t count = uniform.ids.size(); + id __unsafe_unretained *objects = ALLOCA_ARRAY(id __unsafe_unretained, count); + for (size_t j = 0; j < count; j += 1) { + objects[j] = rid::get(uniform.ids[j].id); + } + if (stage == RDD::SHADER_STAGE_VERTEX) { + [enc setVertexSamplerStates:objects withRange:NSMakeRange(bi->index, count)]; + } else { + [enc setFragmentSamplerStates:objects withRange:NSMakeRange(bi->index, count)]; + } + } break; + case RDD::UNIFORM_TYPE_SAMPLER_WITH_TEXTURE: { + size_t count = uniform.ids.size() / 2; + id __unsafe_unretained *textures = ALLOCA_ARRAY(id __unsafe_unretained, count); + id __unsafe_unretained *samplers = ALLOCA_ARRAY(id __unsafe_unretained, count); + for (uint32_t j = 0; j < count; j += 1) { + id sampler = rid::get(uniform.ids[j * 2 + 0]); + id texture = rid::get(uniform.ids[j * 2 + 1]); + samplers[j] = sampler; + textures[j] = texture; + } + BindingInfo *sbi = ui.bindings_secondary.getptr(stage); + if (sbi) { + if (stage == RDD::SHADER_STAGE_VERTEX) { + [enc setVertexSamplerStates:samplers withRange:NSMakeRange(sbi->index, count)]; + } else { + [enc setFragmentSamplerStates:samplers withRange:NSMakeRange(sbi->index, count)]; + } + } + if (stage == RDD::SHADER_STAGE_VERTEX) { + [enc setVertexTextures:textures withRange:NSMakeRange(bi->index, count)]; + } else { + [enc setFragmentTextures:textures withRange:NSMakeRange(bi->index, count)]; + } + } break; + case RDD::UNIFORM_TYPE_TEXTURE: { + size_t count = uniform.ids.size(); + if (count == 1) { + id obj = rid::get(uniform.ids[0]); + if (stage == RDD::SHADER_STAGE_VERTEX) { + [enc setVertexTexture:obj atIndex:bi->index]; + } else { + [enc setFragmentTexture:obj atIndex:bi->index]; + } + } else { + id __unsafe_unretained *objects = ALLOCA_ARRAY(id __unsafe_unretained, count); + for (size_t j = 0; j < count; j += 1) { + id obj = rid::get(uniform.ids[j]); + objects[j] = obj; + } + if (stage == RDD::SHADER_STAGE_VERTEX) { + [enc setVertexTextures:objects withRange:NSMakeRange(bi->index, count)]; + } else { + [enc setFragmentTextures:objects withRange:NSMakeRange(bi->index, count)]; + } + } + } break; + case RDD::UNIFORM_TYPE_IMAGE: { + size_t count = uniform.ids.size(); + if (count == 1) { + id obj = rid::get(uniform.ids[0]); + if (stage == RDD::SHADER_STAGE_VERTEX) { + [enc setVertexTexture:obj atIndex:bi->index]; + } else { + [enc setFragmentTexture:obj atIndex:bi->index]; + } + + BindingInfo *sbi = ui.bindings_secondary.getptr(stage); + if (sbi) { + id tex = obj.parentTexture ? 
obj.parentTexture : obj; + id buf = tex.buffer; + if (buf) { + if (stage == RDD::SHADER_STAGE_VERTEX) { + [enc setVertexBuffer:buf offset:tex.bufferOffset atIndex:sbi->index]; + } else { + [enc setFragmentBuffer:buf offset:tex.bufferOffset atIndex:sbi->index]; + } + } + } + } else { + id __unsafe_unretained *objects = ALLOCA_ARRAY(id __unsafe_unretained, count); + for (size_t j = 0; j < count; j += 1) { + id obj = rid::get(uniform.ids[j]); + objects[j] = obj; + } + if (stage == RDD::SHADER_STAGE_VERTEX) { + [enc setVertexTextures:objects withRange:NSMakeRange(bi->index, count)]; + } else { + [enc setFragmentTextures:objects withRange:NSMakeRange(bi->index, count)]; + } + } + } break; + case RDD::UNIFORM_TYPE_TEXTURE_BUFFER: { + ERR_PRINT("not implemented: UNIFORM_TYPE_TEXTURE_BUFFER"); + } break; + case RDD::UNIFORM_TYPE_SAMPLER_WITH_TEXTURE_BUFFER: { + ERR_PRINT("not implemented: UNIFORM_TYPE_SAMPLER_WITH_TEXTURE_BUFFER"); + } break; + case RDD::UNIFORM_TYPE_IMAGE_BUFFER: { + CRASH_NOW_MSG("not implemented: UNIFORM_TYPE_IMAGE_BUFFER"); + } break; + case RDD::UNIFORM_TYPE_UNIFORM_BUFFER: { + id buffer = rid::get(uniform.ids[0]); + if (stage == RDD::SHADER_STAGE_VERTEX) { + [enc setVertexBuffer:buffer offset:0 atIndex:bi->index]; + } else { + [enc setFragmentBuffer:buffer offset:0 atIndex:bi->index]; + } + } break; + case RDD::UNIFORM_TYPE_STORAGE_BUFFER: { + id buffer = rid::get(uniform.ids[0]); + if (stage == RDD::SHADER_STAGE_VERTEX) { + [enc setVertexBuffer:buffer offset:0 atIndex:bi->index]; + } else { + [enc setFragmentBuffer:buffer offset:0 atIndex:bi->index]; + } + } break; + case RDD::UNIFORM_TYPE_INPUT_ATTACHMENT: { + size_t count = uniform.ids.size(); + if (count == 1) { + id obj = rid::get(uniform.ids[0]); + if (stage == RDD::SHADER_STAGE_VERTEX) { + [enc setVertexTexture:obj atIndex:bi->index]; + } else { + [enc setFragmentTexture:obj atIndex:bi->index]; + } + } else { + id __unsafe_unretained *objects = ALLOCA_ARRAY(id __unsafe_unretained, count); + for (size_t j = 0; j < count; j += 1) { + id obj = rid::get(uniform.ids[j]); + objects[j] = obj; + } + + if (stage == RDD::SHADER_STAGE_VERTEX) { + [enc setVertexTextures:objects withRange:NSMakeRange(bi->index, count)]; + } else { + [enc setFragmentTextures:objects withRange:NSMakeRange(bi->index, count)]; + } + } + } break; + default: { + DEV_ASSERT(false); + } + } + } + } +} + +void MDUniformSet::bind_uniforms(MDShader *p_shader, MDCommandBuffer::RenderState &p_state) { + if (p_shader->uses_argument_buffers) { + bind_uniforms_argument_buffers(p_shader, p_state); + } else { + bind_uniforms_direct(p_shader, p_state); + } +} + +void MDUniformSet::bind_uniforms_argument_buffers(MDShader *p_shader, MDCommandBuffer::ComputeState &p_state) { + DEV_ASSERT(p_shader->uses_argument_buffers); + DEV_ASSERT(p_state.encoder != nil); + + UniformSet const &set_info = p_shader->sets[index]; + + id enc = p_state.encoder; + id device = enc.device; + + BoundUniformSet &bus = bound_uniform_set(p_shader, device, p_state.resource_usage); + + uint32_t const *offset = set_info.offsets.getptr(RDD::SHADER_STAGE_COMPUTE); + if (offset) { + [enc setBuffer:bus.buffer offset:*offset atIndex:index]; + } +} + +void MDUniformSet::bind_uniforms_direct(MDShader *p_shader, MDCommandBuffer::ComputeState &p_state) { + DEV_ASSERT(!p_shader->uses_argument_buffers); + DEV_ASSERT(p_state.encoder != nil); + + id __unsafe_unretained enc = p_state.encoder; + + UniformSet const &set = p_shader->sets[index]; + + for (uint32_t i = 0; i < uniforms.size(); i++) { + 
RDD::BoundUniform const &uniform = uniforms[i]; + UniformInfo ui = set.uniforms[i]; + + const RDC::ShaderStage stage = RDC::ShaderStage::SHADER_STAGE_COMPUTE; + const ShaderStageUsage stage_usage = ShaderStageUsage(1 << stage); + + BindingInfo *bi = ui.bindings.getptr(stage); + if (bi == nullptr) { + // No binding for this stage. + continue; + } + + if ((ui.active_stages & stage_usage) == 0) { + // Not active for this state, so don't bind anything. + continue; + } + + switch (uniform.type) { + case RDD::UNIFORM_TYPE_SAMPLER: { + size_t count = uniform.ids.size(); + id __unsafe_unretained *objects = ALLOCA_ARRAY(id __unsafe_unretained, count); + for (size_t j = 0; j < count; j += 1) { + objects[j] = rid::get(uniform.ids[j].id); + } + [enc setSamplerStates:objects withRange:NSMakeRange(bi->index, count)]; + } break; + case RDD::UNIFORM_TYPE_SAMPLER_WITH_TEXTURE: { + size_t count = uniform.ids.size() / 2; + id __unsafe_unretained *textures = ALLOCA_ARRAY(id __unsafe_unretained, count); + id __unsafe_unretained *samplers = ALLOCA_ARRAY(id __unsafe_unretained, count); + for (uint32_t j = 0; j < count; j += 1) { + id sampler = rid::get(uniform.ids[j * 2 + 0]); + id texture = rid::get(uniform.ids[j * 2 + 1]); + samplers[j] = sampler; + textures[j] = texture; + } + BindingInfo *sbi = ui.bindings_secondary.getptr(stage); + if (sbi) { + [enc setSamplerStates:samplers withRange:NSMakeRange(sbi->index, count)]; + } + [enc setTextures:textures withRange:NSMakeRange(bi->index, count)]; + } break; + case RDD::UNIFORM_TYPE_TEXTURE: { + size_t count = uniform.ids.size(); + if (count == 1) { + id obj = rid::get(uniform.ids[0]); + [enc setTexture:obj atIndex:bi->index]; + } else { + id __unsafe_unretained *objects = ALLOCA_ARRAY(id __unsafe_unretained, count); + for (size_t j = 0; j < count; j += 1) { + id obj = rid::get(uniform.ids[j]); + objects[j] = obj; + } + [enc setTextures:objects withRange:NSMakeRange(bi->index, count)]; + } + } break; + case RDD::UNIFORM_TYPE_IMAGE: { + size_t count = uniform.ids.size(); + if (count == 1) { + id obj = rid::get(uniform.ids[0]); + [enc setTexture:obj atIndex:bi->index]; + + BindingInfo *sbi = ui.bindings_secondary.getptr(stage); + if (sbi) { + id tex = obj.parentTexture ? 
obj.parentTexture : obj; + id buf = tex.buffer; + if (buf) { + [enc setBuffer:buf offset:tex.bufferOffset atIndex:sbi->index]; + } + } + } else { + id __unsafe_unretained *objects = ALLOCA_ARRAY(id __unsafe_unretained, count); + for (size_t j = 0; j < count; j += 1) { + id obj = rid::get(uniform.ids[j]); + objects[j] = obj; + } + [enc setTextures:objects withRange:NSMakeRange(bi->index, count)]; + } + } break; + case RDD::UNIFORM_TYPE_TEXTURE_BUFFER: { + ERR_PRINT("not implemented: UNIFORM_TYPE_TEXTURE_BUFFER"); + } break; + case RDD::UNIFORM_TYPE_SAMPLER_WITH_TEXTURE_BUFFER: { + ERR_PRINT("not implemented: UNIFORM_TYPE_SAMPLER_WITH_TEXTURE_BUFFER"); + } break; + case RDD::UNIFORM_TYPE_IMAGE_BUFFER: { + CRASH_NOW_MSG("not implemented: UNIFORM_TYPE_IMAGE_BUFFER"); + } break; + case RDD::UNIFORM_TYPE_UNIFORM_BUFFER: { + id buffer = rid::get(uniform.ids[0]); + [enc setBuffer:buffer offset:0 atIndex:bi->index]; + } break; + case RDD::UNIFORM_TYPE_STORAGE_BUFFER: { + id buffer = rid::get(uniform.ids[0]); + [enc setBuffer:buffer offset:0 atIndex:bi->index]; + } break; + case RDD::UNIFORM_TYPE_INPUT_ATTACHMENT: { + size_t count = uniform.ids.size(); + if (count == 1) { + id obj = rid::get(uniform.ids[0]); + [enc setTexture:obj atIndex:bi->index]; + } else { + id __unsafe_unretained *objects = ALLOCA_ARRAY(id __unsafe_unretained, count); + for (size_t j = 0; j < count; j += 1) { + id obj = rid::get(uniform.ids[j]); + objects[j] = obj; + } + [enc setTextures:objects withRange:NSMakeRange(bi->index, count)]; + } + } break; + default: { + DEV_ASSERT(false); + } + } + } +} + +void MDUniformSet::bind_uniforms(MDShader *p_shader, MDCommandBuffer::ComputeState &p_state) { + if (p_shader->uses_argument_buffers) { + bind_uniforms_argument_buffers(p_shader, p_state); + } else { + bind_uniforms_direct(p_shader, p_state); + } +} + +BoundUniformSet &MDUniformSet::bound_uniform_set(MDShader *p_shader, id p_device, ResourceUsageMap &p_resource_usage) { BoundUniformSet *sus = bound_uniforms.getptr(p_shader); if (sus != nullptr) { + sus->merge_into(p_resource_usage); return *sus; } @@ -1261,6 +1579,7 @@ BoundUniformSet bs = { .buffer = enc_buffer, .usage_to_resources = usage_to_resources }; bound_uniforms.insert(p_shader, bs); + bs.merge_into(p_resource_usage); return bound_uniforms.get(p_shader); } diff --git a/drivers/metal/rendering_device_driver_metal.h b/drivers/metal/rendering_device_driver_metal.h index 52eb0f79300d..0fff49da4106 100644 --- a/drivers/metal/rendering_device_driver_metal.h +++ b/drivers/metal/rendering_device_driver_metal.h @@ -61,7 +61,7 @@ class API_AVAILABLE(macos(11.0), ios(14.0)) RenderingDeviceDriverMetal : public uint32_t version_major = 2; uint32_t version_minor = 0; - MetalDeviceProperties *metal_device_properties = nullptr; + MetalDeviceProperties *device_properties = nullptr; PixelFormats *pixel_formats = nullptr; std::unique_ptr resource_cache; @@ -431,10 +431,10 @@ class API_AVAILABLE(macos(11.0), ios(14.0)) RenderingDeviceDriverMetal : public id get_device() const { return device; } PixelFormats &get_pixel_formats() const { return *pixel_formats; } MDResourceCache &get_resource_cache() const { return *resource_cache; } - MetalDeviceProperties const &get_device_properties() const { return *metal_device_properties; } + MetalDeviceProperties const &get_device_properties() const { return *device_properties; } _FORCE_INLINE_ uint32_t get_metal_buffer_index_for_vertex_attribute_binding(uint32_t p_binding) { - return (metal_device_properties->limits.maxPerStageBufferCount - 1) - 
p_binding; + return (device_properties->limits.maxPerStageBufferCount - 1) - p_binding; } size_t get_texel_buffer_alignment_for_format(RDD::DataFormat p_format) const; diff --git a/drivers/metal/rendering_device_driver_metal.mm b/drivers/metal/rendering_device_driver_metal.mm index dea02f699775..0c05f0e114a4 100644 --- a/drivers/metal/rendering_device_driver_metal.mm +++ b/drivers/metal/rendering_device_driver_metal.mm @@ -218,7 +218,7 @@ _FORCE_INLINE_ MTLSize mipmapLevelSizeFromSize(MTLSize p_size, NSUInteger p_leve // desc.compressionType = MTLTextureCompressionTypeLossy; if (p_format.samples > TEXTURE_SAMPLES_1) { - SampleCount supported = (*metal_device_properties).find_nearest_supported_sample_count(p_format.samples); + SampleCount supported = (*device_properties).find_nearest_supported_sample_count(p_format.samples); if (supported > SampleCount1) { bool ok = p_format.texture_type == TEXTURE_TYPE_2D || p_format.texture_type == TEXTURE_TYPE_2D_ARRAY; @@ -277,7 +277,7 @@ _FORCE_INLINE_ MTLSize mipmapLevelSizeFromSize(MTLSize p_size, NSUInteger p_leve // Usage. MTLResourceOptions options = 0; - const bool supports_memoryless = (*metal_device_properties).features.highestFamily >= MTLGPUFamilyApple2 && (*metal_device_properties).features.highestFamily < MTLGPUFamilyMac1; + const bool supports_memoryless = (*device_properties).features.highestFamily >= MTLGPUFamilyApple2 && (*device_properties).features.highestFamily < MTLGPUFamilyMac1; if (supports_memoryless && p_format.usage_bits & TEXTURE_USAGE_TRANSIENT_BIT) { options = MTLResourceStorageModeMemoryless | MTLResourceHazardTrackingModeTracked; desc.storageMode = MTLStorageModeMemoryless; @@ -1058,7 +1058,7 @@ static const API_AVAILABLE(macos(11.0), ios(14.0)) MTLSamplerBorderColor SAMPLER #pragma mark - Shader -const uint32_t SHADER_BINARY_VERSION = 3; +const uint32_t SHADER_BINARY_VERSION = 4; // region Serialization @@ -1503,6 +1503,9 @@ void deserialize(BufReader &p_reader) { p_reader.read(index); p_reader.read(uniforms); } + UniformSetData() = default; + UniformSetData(uint32_t p_index) : + index(p_index) {} }; struct PushConstantData { @@ -1536,6 +1539,11 @@ void deserialize(BufReader &p_reader) { }; struct API_AVAILABLE(macos(11.0), ios(14.0)) ShaderBinaryData { + enum Flags : uint32_t { + NONE = 0, + NEEDS_VIEW_MASK_BUFFER = 1 << 0, + USES_ARGUMENT_BUFFERS = 1 << 1, + }; CharString shader_name; // The Metal language version specified when compiling SPIR-V to MSL. // Format is major * 10000 + minor * 100 + patch. 
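To make the packing concrete (a worked example, not code from the patch), MSL 3.1.0 is stored as 30100, and unpacking mirrors the get_msl_version() accessor in the following hunk, which keeps the major version in the upper 16 bits of MTLLanguageVersion:

#import <Metal/Metal.h>

// Round-trip of the packed msl_version field; values are illustrative.
static MTLLanguageVersion unpack_msl_version(uint32_t msl_version) {
	uint32_t major = msl_version / 10000; // 30100 -> 3
	uint32_t minor = (msl_version / 100) % 100; // 30100 -> 1
	// (3 << 0x10) + 1 == MTLLanguageVersion3_1.
	return MTLLanguageVersion((major << 0x10) + minor);
}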
@@ -1543,8 +1551,7 @@ struct API_AVAILABLE(macos(11.0), ios(14.0)) ShaderBinaryData { uint32_t vertex_input_mask = UINT32_MAX; uint32_t fragment_output_mask = UINT32_MAX; uint32_t spirv_specialization_constants_ids_mask = UINT32_MAX; - uint32_t is_compute = UINT32_MAX; - uint32_t needs_view_mask_buffer = UINT32_MAX; + uint32_t flags = NONE; ComputeSize compute_local_size; PushConstantData push_constant; LocalVector stages; @@ -1557,15 +1564,44 @@ MTLLanguageVersion get_msl_version() const { return MTLLanguageVersion((major << 0x10) + minor); } + bool is_compute() const { + return std::any_of(stages.begin(), stages.end(), [](ShaderStageData const &e) { + return e.stage == RD::ShaderStage::SHADER_STAGE_COMPUTE; + }); + } + + bool needs_view_mask_buffer() const { + return flags & NEEDS_VIEW_MASK_BUFFER; + } + + void set_needs_view_mask_buffer(bool p_value) { + if (p_value) { + flags |= NEEDS_VIEW_MASK_BUFFER; + } else { + flags &= ~NEEDS_VIEW_MASK_BUFFER; + } + } + + bool uses_argument_buffers() const { + return flags & USES_ARGUMENT_BUFFERS; + } + + void set_uses_argument_buffers(bool p_value) { + if (p_value) { + flags |= USES_ARGUMENT_BUFFERS; + } else { + flags &= ~USES_ARGUMENT_BUFFERS; + } + } + size_t serialize_size() const { size_t size = 0; size += sizeof(uint32_t) + shader_name.length(); // shader_name - size += sizeof(uint32_t); // msl_version - size += sizeof(uint32_t); // vertex_input_mask - size += sizeof(uint32_t); // fragment_output_mask - size += sizeof(uint32_t); // spirv_specialization_constants_ids_mask - size += sizeof(uint32_t); // is_compute - size += sizeof(uint32_t); // needs_view_mask_buffer + size += sizeof(msl_version); // msl_version + size += sizeof(vertex_input_mask); // vertex_input_mask + size += sizeof(fragment_output_mask); // fragment_output_mask + size += sizeof(spirv_specialization_constants_ids_mask); // spirv_specialization_constants_ids_mask + size += sizeof(flags); // flags size += compute_local_size.serialize_size(); // compute_local_size size += push_constant.serialize_size(); // push_constant size += sizeof(uint32_t); // stages.size() @@ -1589,8 +1625,7 @@ void serialize(BufWriter &p_writer) const { p_writer.write(vertex_input_mask); p_writer.write(fragment_output_mask); p_writer.write(spirv_specialization_constants_ids_mask); - p_writer.write(is_compute); - p_writer.write(needs_view_mask_buffer); + p_writer.write(flags); p_writer.write(compute_local_size); p_writer.write(push_constant); p_writer.write(VectorView(stages)); @@ -1604,8 +1639,7 @@ void deserialize(BufReader &p_reader) { p_reader.read(vertex_input_mask); p_reader.read(fragment_output_mask); p_reader.read(spirv_specialization_constants_ids_mask); - p_reader.read(is_compute); - p_reader.read(needs_view_mask_buffer); + p_reader.read(flags); p_reader.read(compute_local_size); p_reader.read(push_constant); p_reader.read(stages); @@ -1952,14 +1986,13 @@ void deserialize(BufReader &p_reader) { .y = spirv_data.compute_local_size[1], .z = spirv_data.compute_local_size[2], }; - bin_data.is_compute = spirv_data.is_compute; bin_data.push_constant.size = spirv_data.push_constant_size; bin_data.push_constant.stages = (ShaderStageUsage)(uint8_t)spirv_data.push_constant_stages; - bin_data.needs_view_mask_buffer = shader_meta.has_multiview ? 
1 : 0; + bin_data.set_needs_view_mask_buffer(shader_meta.has_multiview); for (uint32_t i = 0; i < spirv_data.uniform_sets.size(); i++) { const ::Vector &spirv_set = spirv_data.uniform_sets[i]; - UniformSetData set{ .index = i }; + UniformSetData set(i); for (const ShaderUniform &spirv_uniform : spirv_set) { UniformData binding{}; binding.type = spirv_uniform.type; @@ -1999,10 +2032,25 @@ void deserialize(BufReader &p_reader) { #endif #if TARGET_OS_IOS - msl_options.ios_use_simdgroup_functions = (*metal_device_properties).features.simdPermute; + msl_options.ios_use_simdgroup_functions = (*device_properties).features.simdPermute; #endif - msl_options.argument_buffers = true; + bool disable_argument_buffers = false; + if (String v = OS::get_singleton()->get_environment(U"GODOT_DISABLE_ARGUMENT_BUFFERS"); v == U"1") { + disable_argument_buffers = true; + } + + if (device_properties->features.argument_buffers_tier >= MTLArgumentBuffersTier2 && !disable_argument_buffers) { + msl_options.argument_buffers_tier = CompilerMSL::Options::ArgumentBuffersTier::Tier2; + msl_options.argument_buffers = true; + bin_data.set_uses_argument_buffers(true); + } else { + msl_options.argument_buffers_tier = CompilerMSL::Options::ArgumentBuffersTier::Tier1; + // tier 1 argument buffers don't support writable textures, so we disable them completely + msl_options.argument_buffers = false; + bin_data.set_uses_argument_buffers(false); + } + msl_options.force_active_argument_buffer_resources = true; // Same as MoltenVK when using argument buffers. // msl_options.pad_argument_buffer_resources = true; // Same as MoltenVK when using argument buffers. msl_options.texture_buffer_native = true; // Enable texture buffer support. @@ -2088,8 +2136,9 @@ void deserialize(BufReader &p_reader) { return res; }; - auto descriptor_bindings = [&compiler, &active, &uniform_sets, stage, &get_decoration](SmallVector &resources, Writable writable) { - for (Resource const &res : resources) { + auto descriptor_bindings = [&compiler, &active, &uniform_sets, stage, &get_decoration](SmallVector &p_resources, Writable p_writable) { + for (Resource const &res : p_resources) { + auto name = compiler.get_name(res.id); uint32_t dset = get_decoration(res.id, spv::DecorationDescriptorSet); uint32_t dbin = get_decoration(res.id, spv::DecorationBinding); UniformData *found = nullptr; @@ -2195,7 +2244,7 @@ void deserialize(BufReader &p_reader) { } // Update writable. - if (writable == Writable::Maybe) { + if (p_writable == Writable::Maybe) { if (basetype == BT::Struct) { Bitset flags = compiler.get_buffer_block_flags(res.id); if (!flags.get(spv::DecorationNonWritable)) { @@ -2384,6 +2433,11 @@ void deserialize(BufReader &p_reader) { ERR_FAIL_V_MSG(ShaderID(), "Unexpected end of buffer"); } + // We need to regenerate the shader if the cache is moved to an incompatible device. 
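+ // For example (illustrative scenario): a cache baked on a Tier 2 device, such as an Apple silicon Mac, then loaded on a Tier 1 device, such as an A11 iPhone.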
+ ERR_FAIL_COND_V_MSG(device_properties->features.argument_buffers_tier < MTLArgumentBuffersTier2 && binary_data.uses_argument_buffers(),
+ ShaderID(),
+ "Shader was generated with argument buffers, but the device does not support them.");
+
 MTLCompileOptions *options = [MTLCompileOptions new];
 options.languageVersion = binary_data.get_msl_version();
 HashMap libraries;
@@ -2505,8 +2559,12 @@ void deserialize(BufReader &p_reader) {
 }
 MDShader *shader = nullptr;
- if (binary_data.is_compute) {
- MDComputeShader *cs = new MDComputeShader(binary_data.shader_name, uniform_sets, libraries[ShaderStage::SHADER_STAGE_COMPUTE]);
+ if (binary_data.is_compute()) {
+ MDComputeShader *cs = new MDComputeShader(
+ binary_data.shader_name,
+ uniform_sets,
+ binary_data.uses_argument_buffers(),
+ libraries[ShaderStage::SHADER_STAGE_COMPUTE]);
 uint32_t *binding = binary_data.push_constant.msl_binding.getptr(SHADER_STAGE_COMPUTE);
 if (binding) {
@@ -2520,7 +2578,13 @@ void deserialize(BufReader &p_reader) {
 #endif
 shader = cs;
 } else {
- MDRenderShader *rs = new MDRenderShader(binary_data.shader_name, (bool)binary_data.needs_view_mask_buffer, uniform_sets, libraries[ShaderStage::SHADER_STAGE_VERTEX], libraries[ShaderStage::SHADER_STAGE_FRAGMENT]);
+ MDRenderShader *rs = new MDRenderShader(
+ binary_data.shader_name,
+ uniform_sets,
+ binary_data.needs_view_mask_buffer(),
+ binary_data.uses_argument_buffers(),
+ libraries[ShaderStage::SHADER_STAGE_VERTEX],
+ libraries[ShaderStage::SHADER_STAGE_FRAGMENT]);
 uint32_t *vert_binding = binary_data.push_constant.msl_binding.getptr(SHADER_STAGE_VERTEX);
 if (vert_binding) {
@@ -2547,7 +2611,7 @@ void deserialize(BufReader &p_reader) {
 r_shader_desc.vertex_input_mask = binary_data.vertex_input_mask;
 r_shader_desc.fragment_output_mask = binary_data.fragment_output_mask;
- r_shader_desc.is_compute = binary_data.is_compute;
+ r_shader_desc.is_compute = binary_data.is_compute();
 r_shader_desc.compute_local_size[0] = binary_data.compute_local_size.x;
 r_shader_desc.compute_local_size[1] = binary_data.compute_local_size.y;
 r_shader_desc.compute_local_size[2] = binary_data.compute_local_size.z;
@@ -2572,7 +2636,7 @@ void deserialize(BufReader &p_reader) {
 RDD::UniformSetID RenderingDeviceDriverMetal::uniform_set_create(VectorView p_uniforms, ShaderID p_shader, uint32_t p_set_index, int p_linear_pool_index) {
 //p_linear_pool_index = -1; // TODO:? Linear pools not implemented or not supported by API backend.
- MDUniformSet *set = new MDUniformSet();
+ MDUniformSet *set = memnew(MDUniformSet);
 Vector bound_uniforms;
 bound_uniforms.resize(p_uniforms.size());
 for (uint32_t i = 0; i < p_uniforms.size(); i += 1) {
@@ -2586,7 +2650,7 @@ void deserialize(BufReader &p_reader) {
 void RenderingDeviceDriverMetal::uniform_set_free(UniformSetID p_uniform_set) {
 MDUniformSet *obj = (MDUniformSet *)p_uniform_set.id;
- delete obj;
+ memdelete(obj);
 }
 void RenderingDeviceDriverMetal::command_uniform_set_prepare_for_use(CommandBufferID p_cmd_buffer, UniformSetID p_uniform_set, ShaderID p_shader, uint32_t p_set_index) {
@@ -2800,7 +2864,7 @@ static inline MTLSize clampMTLSize(MTLSize p_size, MTLOrigin p_origin, MTLSize p
 uint32_t layerCnt = p_subresources.layer_count;
 uint32_t layerEnd = layerStart + layerCnt;
- MetalFeatures const &features = (*metal_device_properties).features;
+ MetalFeatures const &features = (*device_properties).features;
 // Iterate across mipmap levels and layers, and perform an empty render to clear each.
 for (uint32_t mipLvl = mipLvlStart; mipLvl < mipLvlEnd; mipLvl++) {
@@ -3057,7 +3121,7 @@ bool isArrayTexture(MTLTextureType p_type) {
 MTLPixelFormat format = pf.getMTLPixelFormat(a.format);
 mda.format = format;
 if (a.samples > TEXTURE_SAMPLES_1) {
- mda.samples = (*metal_device_properties).find_nearest_supported_sample_count(a.samples);
+ mda.samples = (*device_properties).find_nearest_supported_sample_count(a.samples);
 }
 mda.loadAction = LOAD_ACTIONS[a.load_op];
 mda.storeAction = STORE_ACTIONS[a.store_op];
@@ -3436,7 +3500,7 @@ bool isArrayTexture(MTLTextureType p_type) {
 }
 if (p_multisample_state.sample_count > TEXTURE_SAMPLES_1) {
- pipeline->sample_count = (*metal_device_properties).find_nearest_supported_sample_count(p_multisample_state.sample_count);
+ pipeline->sample_count = (*device_properties).find_nearest_supported_sample_count(p_multisample_state.sample_count);
 }
 desc.rasterSampleCount = static_cast(pipeline->sample_count);
 desc.alphaToCoverageEnabled = p_multisample_state.enable_alpha_to_coverage;
@@ -3815,7 +3879,7 @@ bool isArrayTexture(MTLTextureType p_type) {
 }
 uint64_t RenderingDeviceDriverMetal::limit_get(Limit p_limit) {
- MetalDeviceProperties const &props = (*metal_device_properties);
+ MetalDeviceProperties const &props = (*device_properties);
 MetalLimits const &limits = props.limits;
 #if defined(DEV_ENABLED)
@@ -3911,11 +3975,13 @@ bool isArrayTexture(MTLTextureType p_type) {
 case LIMIT_SUBGROUP_MAX_SIZE:
 return limits.maxSubgroupSize;
 case LIMIT_SUBGROUP_IN_SHADERS:
- return (int64_t)limits.subgroupSupportedShaderStages;
+ return (uint64_t)limits.subgroupSupportedShaderStages;
 case LIMIT_SUBGROUP_OPERATIONS:
- return (int64_t)limits.subgroupSupportedOperations;
+ return (uint64_t)limits.subgroupSupportedOperations;
 UNKNOWN(LIMIT_VRS_TEXEL_WIDTH);
 UNKNOWN(LIMIT_VRS_TEXEL_HEIGHT);
+ UNKNOWN(LIMIT_VRS_MAX_FRAGMENT_WIDTH);
+ UNKNOWN(LIMIT_VRS_MAX_FRAGMENT_HEIGHT);
 default:
 ERR_FAIL_V(0);
 }
@@ -4042,11 +4108,11 @@ bool isArrayTexture(MTLTextureType p_type) {
 // Set the pipeline cache ID based on the Metal version.
 pipeline_cache_id = "metal-driver-" + get_api_version();
- metal_device_properties = memnew(MetalDeviceProperties(device));
+ device_properties = memnew(MetalDeviceProperties(device));
 pixel_formats = memnew(PixelFormats(device));
- if (metal_device_properties->features.layeredRendering) {
+ if (device_properties->features.layeredRendering) {
 multiview_capabilities.is_supported = true;
- multiview_capabilities.max_view_count = metal_device_properties->limits.maxViewports;
+ multiview_capabilities.max_view_count = device_properties->limits.maxViewports;
 // NOTE: I'm not sure what the limit is as I don't see it referenced anywhere
 multiview_capabilities.max_instance_count = UINT32_MAX;
@@ -4057,11 +4123,10 @@ bool isArrayTexture(MTLTextureType p_type) {
 print_verbose("- Metal multiview not supported");
 }
- // Check required features and abort if any of them is missing.
- if (!metal_device_properties->features.imageCubeArray) {
- // NOTE: Apple A11 (Apple4) GPUs support image cube arrays, which are devices from 2017 and newer.
- String error_string = vformat("Your Apple GPU does not support the following features which are required to use Metal-based renderers in Godot:\n\n");
- if (!metal_device_properties->features.imageCubeArray) {
+ // The Metal renderer requires the Apple4 family. These are 2017-era A11 chips and newer.
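+ // For reference (per Apple's Metal feature set tables): Apple4 = A11 (2017), Apple5 = A12, Apple6 = A13, Apple7 = A14/M1.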
+ if (device_properties->features.highestFamily < MTLGPUFamilyApple4) { + String error_string = vformat("Your Apple GPU does not support the following features, which are required to use Metal-based renderers in Godot:\n\n"); + if (!device_properties->features.imageCubeArray) { error_string += "- No support for image cube arrays.\n"; } diff --git a/thirdparty/README.md b/thirdparty/README.md index 57701e94ee38..df0743afa682 100644 --- a/thirdparty/README.md +++ b/thirdparty/README.md @@ -881,7 +881,7 @@ proposed by these libraries and better integrate them with Godot. ## spirv-cross - Upstream: https://github.com/KhronosGroup/SPIRV-Cross -- Version: vulkan-sdk-1.3.290.0 (5d127b917f080c6f052553c47170ec0ba702e54f, 2024) +- Version: git (6173e24b31f09a0c3217103a130e74c4ddec14a6, 2024) - License: Apache 2.0 Files extracted from upstream source: diff --git a/thirdparty/spirv-cross/spirv.hpp b/thirdparty/spirv-cross/spirv.hpp index f2ee9096bdd5..5047b9b30230 100644 --- a/thirdparty/spirv-cross/spirv.hpp +++ b/thirdparty/spirv-cross/spirv.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2014-2020 The Khronos Group Inc. +// Copyright (c) 2014-2024 The Khronos Group Inc. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and/or associated documentation files (the "Materials"), @@ -507,6 +507,7 @@ enum Decoration { DecorationNoUnsignedWrap = 4470, DecorationWeightTextureQCOM = 4487, DecorationBlockMatchTextureQCOM = 4488, + DecorationBlockMatchSamplerQCOM = 4499, DecorationExplicitInterpAMD = 4999, DecorationOverrideCoverageNV = 5248, DecorationPassthroughNV = 5250, @@ -992,6 +993,7 @@ enum Capability { CapabilityTextureSampleWeightedQCOM = 4484, CapabilityTextureBoxFilterQCOM = 4485, CapabilityTextureBlockMatchQCOM = 4486, + CapabilityTextureBlockMatch2QCOM = 4498, CapabilityFloat16ImageAMD = 5008, CapabilityImageGatherBiasLodAMD = 5009, CapabilityFragmentMaskAMD = 5010, @@ -1601,6 +1603,10 @@ enum Op { OpImageBoxFilterQCOM = 4481, OpImageBlockMatchSSDQCOM = 4482, OpImageBlockMatchSADQCOM = 4483, + OpImageBlockMatchWindowSSDQCOM = 4500, + OpImageBlockMatchWindowSADQCOM = 4501, + OpImageBlockMatchGatherSSDQCOM = 4502, + OpImageBlockMatchGatherSADQCOM = 4503, OpGroupIAddNonUniformAMD = 5000, OpGroupFAddNonUniformAMD = 5001, OpGroupFMinNonUniformAMD = 5002, @@ -2280,6 +2286,10 @@ inline void HasResultAndType(Op opcode, bool *hasResult, bool *hasResultType) { case OpImageBoxFilterQCOM: *hasResult = true; *hasResultType = true; break; case OpImageBlockMatchSSDQCOM: *hasResult = true; *hasResultType = true; break; case OpImageBlockMatchSADQCOM: *hasResult = true; *hasResultType = true; break; + case OpImageBlockMatchWindowSSDQCOM: *hasResult = true; *hasResultType = true; break; + case OpImageBlockMatchWindowSADQCOM: *hasResult = true; *hasResultType = true; break; + case OpImageBlockMatchGatherSSDQCOM: *hasResult = true; *hasResultType = true; break; + case OpImageBlockMatchGatherSADQCOM: *hasResult = true; *hasResultType = true; break; case OpGroupIAddNonUniformAMD: *hasResult = true; *hasResultType = true; break; case OpGroupFAddNonUniformAMD: *hasResult = true; *hasResultType = true; break; case OpGroupFMinNonUniformAMD: *hasResult = true; *hasResultType = true; break; diff --git a/thirdparty/spirv-cross/spirv_common.hpp b/thirdparty/spirv-cross/spirv_common.hpp index 93b266977097..b70536d9ecc0 100644 --- a/thirdparty/spirv-cross/spirv_common.hpp +++ b/thirdparty/spirv-cross/spirv_common.hpp @@ -578,7 +578,9 @@ struct SPIRType : IVariant // Keep 
internal types at the end. ControlPointArray, Interpolant, - Char + Char, + // MSL specific type, that is used by 'object'(analog of 'task' from glsl) shader. + MeshGridProperties }; // Scalar/vector/matrix support. @@ -746,6 +748,10 @@ struct SPIRExpression : IVariant // A list of expressions which this expression depends on. SmallVector expression_dependencies; + // Similar as expression dependencies, but does not stop the tracking for force-temporary variables. + // We need to know the full chain from store back to any SSA variable. + SmallVector invariance_dependencies; + // By reading this expression, we implicitly read these expressions as well. // Used by access chain Store and Load since we read multiple expressions in this case. SmallVector implied_read_expressions; @@ -1598,6 +1604,8 @@ struct AccessChainMeta bool flattened_struct = false; bool relaxed_precision = false; bool access_meshlet_position_y = false; + bool chain_is_builtin = false; + spv::BuiltIn builtin = {}; }; enum ExtendedDecorations diff --git a/thirdparty/spirv-cross/spirv_cross.cpp b/thirdparty/spirv-cross/spirv_cross.cpp index 8c3e7d381202..3492f0b3ed9b 100644 --- a/thirdparty/spirv-cross/spirv_cross.cpp +++ b/thirdparty/spirv-cross/spirv_cross.cpp @@ -1850,6 +1850,11 @@ const SmallVector &Compiler::get_case_list(const SPIRBlock &blo const auto &type = get(constant->constant_type); width = type.width; } + else if (const auto *op = maybe_get(block.condition)) + { + const auto &type = get(op->basetype); + width = type.width; + } else if (const auto *var = maybe_get(block.condition)) { const auto &type = get(var->basetype); @@ -2564,6 +2569,15 @@ void Compiler::add_active_interface_variable(uint32_t var_id) void Compiler::inherit_expression_dependencies(uint32_t dst, uint32_t source_expression) { + auto *ptr_e = maybe_get(dst); + + if (is_position_invariant() && ptr_e && maybe_get(source_expression)) + { + auto &deps = ptr_e->invariance_dependencies; + if (std::find(deps.begin(), deps.end(), source_expression) == deps.end()) + deps.push_back(source_expression); + } + // Don't inherit any expression dependencies if the expression in dst // is not a forwarded temporary. if (forwarded_temporaries.find(dst) == end(forwarded_temporaries) || @@ -2572,7 +2586,7 @@ void Compiler::inherit_expression_dependencies(uint32_t dst, uint32_t source_exp return; } - auto &e = get(dst); + auto &e = *ptr_e; auto *phi = maybe_get(source_expression); if (phi && phi->phi_variable) { diff --git a/thirdparty/spirv-cross/spirv_cross_parsed_ir.cpp b/thirdparty/spirv-cross/spirv_cross_parsed_ir.cpp index 3072cd8abb05..b05afeb3f57e 100644 --- a/thirdparty/spirv-cross/spirv_cross_parsed_ir.cpp +++ b/thirdparty/spirv-cross/spirv_cross_parsed_ir.cpp @@ -564,7 +564,8 @@ Bitset ParsedIR::get_buffer_block_type_flags(const SPIRType &type) const Bitset ParsedIR::get_buffer_block_flags(const SPIRVariable &var) const { auto &type = get(var.basetype); - assert(type.basetype == SPIRType::Struct); + if (type.basetype != SPIRType::Struct) + SPIRV_CROSS_THROW("Cannot get buffer block flags for non-buffer variable."); // Some flags like non-writable, non-readable are actually found // as member decorations. 
If all members have a decoration set, propagate @@ -927,6 +928,8 @@ void ParsedIR::reset_all_of_type(Types type) void ParsedIR::add_typed_id(Types type, ID id) { + assert(id < ids.size()); + if (loop_iteration_depth_hard != 0) SPIRV_CROSS_THROW("Cannot add typed ID while looping over it."); @@ -1029,6 +1032,8 @@ ParsedIR::LoopLock &ParsedIR::LoopLock::operator=(LoopLock &&other) SPIRV_CROSS_ void ParsedIR::make_constant_null(uint32_t id, uint32_t type, bool add_to_typed_id_set) { + assert(id < ids.size()); + auto &constant_type = get(type); if (constant_type.pointer) diff --git a/thirdparty/spirv-cross/spirv_glsl.cpp b/thirdparty/spirv-cross/spirv_glsl.cpp index fad1132e82a7..6c1d5208b980 100644 --- a/thirdparty/spirv-cross/spirv_glsl.cpp +++ b/thirdparty/spirv-cross/spirv_glsl.cpp @@ -2764,6 +2764,8 @@ void CompilerGLSL::emit_interface_block(const SPIRVariable &var) block_qualifier = "patch "; else if (has_decoration(var.self, DecorationPerPrimitiveEXT)) block_qualifier = "perprimitiveEXT "; + else if (has_decoration(var.self, DecorationPerVertexKHR)) + block_qualifier = "pervertexEXT "; else block_qualifier = ""; @@ -3691,11 +3693,11 @@ void CompilerGLSL::emit_resources() auto &type = this->get(undef.basetype); // OpUndef can be void for some reason ... if (type.basetype == SPIRType::Void) - return; + continue; // This will break. It is bogus and should not be legal. if (type_is_top_level_block(type)) - return; + continue; string initializer; if (options.force_zero_initialized_variables && type_can_zero_initialize(type)) @@ -6436,7 +6438,7 @@ string CompilerGLSL::constant_expression_vector(const SPIRConstant &c, uint32_t if (splat) { res += convert_to_string(c.scalar(vector, 0)); - if (is_legacy()) + if (is_legacy() && !has_extension("GL_EXT_gpu_shader4")) { // Fake unsigned constant literals with signed ones if possible. // Things like array sizes, etc, tend to be unsigned even though they could just as easily be signed. @@ -6455,7 +6457,7 @@ string CompilerGLSL::constant_expression_vector(const SPIRConstant &c, uint32_t else { res += convert_to_string(c.scalar(vector, i)); - if (is_legacy()) + if (is_legacy() && !has_extension("GL_EXT_gpu_shader4")) { // Fake unsigned constant literals with signed ones if possible. // Things like array sizes, etc, tend to be unsigned even though they could just as easily be signed. 
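The effect of the GL_EXT_gpu_shader4 check above, sketched in isolation (hypothetical emitter, not the spirv-cross API): legacy GLSL targets have no unsigned literal suffix, so an unsigned constant must be emitted as a plain signed literal unless the extension restores unsigned types.

#include <cstdint>
#include <string>

static std::string emit_uint_literal(uint32_t value, bool legacy, bool has_gpu_shader4) {
	// GLSL before 1.30 and ESSL 1.0 reject the 'u' suffix; GL_EXT_gpu_shader4
	// reintroduces unsigned types, so the suffix can be kept when it is present.
	if (legacy && !has_gpu_shader4)
		return std::to_string(value); // e.g. "3": a signed literal stands in.
	return std::to_string(value) + "u"; // e.g. "3u".
}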
@@ -10208,6 +10210,8 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice bool pending_array_enclose = false; bool dimension_flatten = false; bool access_meshlet_position_y = false; + bool chain_is_builtin = false; + spv::BuiltIn chained_builtin = {}; if (auto *base_expr = maybe_get(base)) { @@ -10365,6 +10369,9 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice auto builtin = ir.meta[base].decoration.builtin_type; bool mesh_shader = get_execution_model() == ExecutionModelMeshEXT; + chain_is_builtin = true; + chained_builtin = builtin; + switch (builtin) { case BuiltInCullDistance: @@ -10500,6 +10507,9 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice { access_meshlet_position_y = true; } + + chain_is_builtin = true; + chained_builtin = builtin; } else { @@ -10719,6 +10729,8 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice meta->storage_physical_type = physical_type; meta->relaxed_precision = relaxed_precision; meta->access_meshlet_position_y = access_meshlet_position_y; + meta->chain_is_builtin = chain_is_builtin; + meta->builtin = chained_builtin; } return expr; @@ -11764,13 +11776,13 @@ void CompilerGLSL::disallow_forwarding_in_expression_chain(const SPIRExpression // Allow trivially forwarded expressions like OpLoad or trivial shuffles, // these will be marked as having suppressed usage tracking. // Our only concern is to make sure arithmetic operations are done in similar ways. - if (expression_is_forwarded(expr.self) && !expression_suppresses_usage_tracking(expr.self) && - forced_invariant_temporaries.count(expr.self) == 0) + if (forced_invariant_temporaries.count(expr.self) == 0) { - force_temporary_and_recompile(expr.self); + if (!expression_suppresses_usage_tracking(expr.self)) + force_temporary_and_recompile(expr.self); forced_invariant_temporaries.insert(expr.self); - for (auto &dependent : expr.expression_dependencies) + for (auto &dependent : expr.invariance_dependencies) disallow_forwarding_in_expression_chain(get(dependent)); } } @@ -12334,6 +12346,8 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) flattened_structs[ops[1]] = true; if (meta.relaxed_precision && backend.requires_relaxed_precision_analysis) set_decoration(ops[1], DecorationRelaxedPrecision); + if (meta.chain_is_builtin) + set_decoration(ops[1], DecorationBuiltIn, meta.builtin); // If we have some expression dependencies in our access chain, this access chain is technically a forwarded // temporary which could be subject to invalidation. @@ -13227,13 +13241,24 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) uint32_t op0 = ops[2]; uint32_t op1 = ops[3]; - // Needs special handling. + auto &out_type = get(result_type); + bool forward = should_forward(op0) && should_forward(op1); - auto expr = join(to_enclosed_expression(op0), " - ", to_enclosed_expression(op1), " * ", "(", - to_enclosed_expression(op0), " / ", to_enclosed_expression(op1), ")"); + string cast_op0, cast_op1; + auto expected_type = binary_op_bitcast_helper(cast_op0, cast_op1, int_type, op0, op1, false); + + // Needs special handling. 
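+ // The a - b * (a / b) expansion is computed on operands cast to the opcode's expected integer type (cast_op0/cast_op1), then bitcast back when the result type differs; see the out_type check below.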
+ auto expr = join(cast_op0, " - ", cast_op1, " * ", "(", cast_op0, " / ", cast_op1, ")"); if (implicit_integer_promotion) + { expr = join(type_to_glsl(get(result_type)), '(', expr, ')'); + } + else if (out_type.basetype != int_type) + { + expected_type.basetype = int_type; + expr = join(bitcast_glsl_op(out_type, expected_type), '(', expr, ')'); + } emit_op(result_type, result_id, expr, forward); inherit_expression_dependencies(result_id, op0); @@ -14481,6 +14506,50 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) break; } + case OpImageBlockMatchWindowSSDQCOM: + case OpImageBlockMatchWindowSADQCOM: + case OpImageBlockMatchGatherSSDQCOM: + case OpImageBlockMatchGatherSADQCOM: + { + require_extension_internal("GL_QCOM_image_processing2"); + uint32_t result_type_id = ops[0]; + uint32_t id = ops[1]; + string expr; + switch (opcode) + { + case OpImageBlockMatchWindowSSDQCOM: + expr = "textureBlockMatchWindowSSDQCOM"; + break; + case OpImageBlockMatchWindowSADQCOM: + expr = "textureBlockMatchWindowSADQCOM"; + break; + case OpImageBlockMatchGatherSSDQCOM: + expr = "textureBlockMatchGatherSSDQCOM"; + break; + case OpImageBlockMatchGatherSADQCOM: + expr = "textureBlockMatchGatherSADQCOM"; + break; + default: + SPIRV_CROSS_THROW("Invalid opcode for QCOM_image_processing2."); + } + expr += "("; + + bool forward = false; + expr += to_expression(ops[2]); + expr += ", " + to_expression(ops[3]); + + expr += ", " + to_non_uniform_aware_expression(ops[4]); + expr += ", " + to_expression(ops[5]); + expr += ", " + to_expression(ops[6]); + + expr += ")"; + emit_op(result_type_id, id, expr, forward); + + inherit_expression_dependencies(id, ops[3]); + inherit_expression_dependencies(id, ops[5]); + break; + } + // Compute case OpControlBarrier: case OpMemoryBarrier: @@ -15622,7 +15691,16 @@ string CompilerGLSL::argument_decl(const SPIRFunction::Parameter &arg) if (type.pointer) { - if (arg.write_count && arg.read_count) + // If we're passing around block types to function, we really mean reference in a pointer sense, + // but DXC does not like inout for mesh blocks, so workaround that. out is technically not correct, + // but it works in practice due to legalization. It's ... not great, but you gotta do what you gotta do. + // GLSL will never hit this case since it's not valid. 
+ if (type.storage == StorageClassOutput && get_execution_model() == ExecutionModelMeshEXT && + has_decoration(type.self, DecorationBlock) && is_builtin_type(type) && arg.write_count) + { + direction = "out "; + } + else if (arg.write_count && arg.read_count) direction = "inout "; else if (arg.write_count) direction = "out "; @@ -15899,7 +15977,7 @@ string CompilerGLSL::image_type_glsl(const SPIRType &type, uint32_t id, bool /*m case DimBuffer: if (options.es && options.version < 320) require_extension_internal("GL_EXT_texture_buffer"); - else if (!options.es && options.version < 300) + else if (!options.es && options.version < 140) require_extension_internal("GL_EXT_texture_buffer_object"); res += "Buffer"; break; @@ -16442,6 +16520,8 @@ void CompilerGLSL::emit_function(SPIRFunction &func, const Bitset &return_flags) { auto &var = get(v); var.deferred_declaration = false; + if (var.storage == StorageClassTaskPayloadWorkgroupEXT) + continue; if (variable_decl_is_remapped_storage(var, StorageClassWorkgroup)) { @@ -17608,7 +17688,7 @@ void CompilerGLSL::emit_block_chain(SPIRBlock &block) if (!collapsed_switch) { - if (block_like_switch || is_legacy_es()) + if (block_like_switch || is_legacy()) { // ESSL 1.0 is not guaranteed to support do/while. if (is_legacy_es()) @@ -17638,7 +17718,7 @@ void CompilerGLSL::emit_block_chain(SPIRBlock &block) // Default case. if (!block_like_switch) { - if (is_legacy_es()) + if (is_legacy()) statement("else"); else statement("default:"); @@ -17646,7 +17726,7 @@ void CompilerGLSL::emit_block_chain(SPIRBlock &block) } else { - if (is_legacy_es()) + if (is_legacy()) { statement((i ? "else " : ""), "if (", to_legacy_case_label(block.condition, literals, label_suffix), ")"); @@ -17698,7 +17778,7 @@ void CompilerGLSL::emit_block_chain(SPIRBlock &block) if (block.default_block == block.next_block) { - if (is_legacy_es()) + if (is_legacy()) statement("else"); else statement("default:"); @@ -17712,7 +17792,7 @@ void CompilerGLSL::emit_block_chain(SPIRBlock &block) if (!collapsed_switch) { - if (block_like_switch && !is_legacy_es()) + if ((block_like_switch || is_legacy()) && !is_legacy_es()) end_scope_decl("while(false)"); else end_scope(); diff --git a/thirdparty/spirv-cross/spirv_msl.cpp b/thirdparty/spirv-cross/spirv_msl.cpp index 383ce688e981..642fcfa59a39 100644 --- a/thirdparty/spirv-cross/spirv_msl.cpp +++ b/thirdparty/spirv-cross/spirv_msl.cpp @@ -202,6 +202,9 @@ uint32_t CompilerMSL::get_resource_array_size(const SPIRType &type, uint32_t id) { uint32_t array_size = to_array_size_literal(type); + if (id == 0) + return array_size; + // If we have argument buffers, we need to honor the ABI by using the correct array size // from the layout. Only use shader declared size if we're not using argument buffers. 
uint32_t desc_set = get_decoration(id, DecorationDescriptorSet); @@ -269,7 +272,7 @@ void CompilerMSL::build_implicit_builtins() (active_input_builtins.get(BuiltInVertexId) || active_input_builtins.get(BuiltInVertexIndex) || active_input_builtins.get(BuiltInBaseVertex) || active_input_builtins.get(BuiltInInstanceId) || active_input_builtins.get(BuiltInInstanceIndex) || active_input_builtins.get(BuiltInBaseInstance)); - bool need_local_invocation_index = msl_options.emulate_subgroups && active_input_builtins.get(BuiltInSubgroupId); + bool need_local_invocation_index = (msl_options.emulate_subgroups && active_input_builtins.get(BuiltInSubgroupId)) || is_mesh_shader(); bool need_workgroup_size = msl_options.emulate_subgroups && active_input_builtins.get(BuiltInNumSubgroups); bool force_frag_depth_passthrough = get_execution_model() == ExecutionModelFragment && !uses_explicit_early_fragment_test() && need_subpass_input && @@ -278,7 +281,7 @@ void CompilerMSL::build_implicit_builtins() if (need_subpass_input || need_sample_pos || need_subgroup_mask || need_vertex_params || need_tesc_params || need_tese_params || need_multiview || need_dispatch_base || need_vertex_base_params || need_grid_params || needs_sample_id || needs_subgroup_invocation_id || needs_subgroup_size || needs_helper_invocation || - has_additional_fixed_sample_mask() || need_local_invocation_index || need_workgroup_size || force_frag_depth_passthrough) + has_additional_fixed_sample_mask() || need_local_invocation_index || need_workgroup_size || force_frag_depth_passthrough || is_mesh_shader()) { bool has_frag_coord = false; bool has_sample_id = false; @@ -325,6 +328,13 @@ void CompilerMSL::build_implicit_builtins() } } + if (builtin == BuiltInPrimitivePointIndicesEXT || + builtin == BuiltInPrimitiveLineIndicesEXT || + builtin == BuiltInPrimitiveTriangleIndicesEXT) + { + builtin_mesh_primitive_indices_id = var.self; + } + if (var.storage != StorageClassInput) return; @@ -1057,6 +1067,53 @@ void CompilerMSL::build_implicit_builtins() set_decoration(var_id, DecorationBuiltIn, BuiltInPosition); mark_implicit_builtin(StorageClassOutput, BuiltInPosition, var_id); } + + if (is_mesh_shader()) + { + uint32_t offset = ir.increase_bound_by(2); + uint32_t type_ptr_id = offset; + uint32_t var_id = offset + 1; + + // Create variable to store meshlet size. 
+		uint32_t type_id = build_extended_vector_type(get_uint_type_id(), 2);
+		SPIRType uint_type_ptr = get<SPIRType>(type_id);
+		uint_type_ptr.op = OpTypePointer;
+		uint_type_ptr.pointer = true;
+		uint_type_ptr.pointer_depth++;
+		uint_type_ptr.parent_type = type_id;
+		uint_type_ptr.storage = StorageClassWorkgroup;
+
+		auto &ptr_type = set<SPIRType>(type_ptr_id, uint_type_ptr);
+		ptr_type.self = type_id;
+		set<SPIRVariable>(var_id, type_ptr_id, StorageClassWorkgroup);
+		set_name(var_id, "spvMeshSizes");
+		builtin_mesh_sizes_id = var_id;
+	}
+
+	if (get_execution_model() == spv::ExecutionModelTaskEXT)
+	{
+		uint32_t offset = ir.increase_bound_by(3);
+		uint32_t type_id = offset;
+		uint32_t type_ptr_id = offset + 1;
+		uint32_t var_id = offset + 2;
+
+		SPIRType mesh_grid_type { OpTypeStruct };
+		mesh_grid_type.basetype = SPIRType::MeshGridProperties;
+		set<SPIRType>(type_id, mesh_grid_type);
+
+		SPIRType mesh_grid_type_ptr = mesh_grid_type;
+		mesh_grid_type_ptr.op = spv::OpTypePointer;
+		mesh_grid_type_ptr.pointer = true;
+		mesh_grid_type_ptr.pointer_depth++;
+		mesh_grid_type_ptr.parent_type = type_id;
+		mesh_grid_type_ptr.storage = StorageClassOutput;
+
+		auto &ptr_in_type = set<SPIRType>(type_ptr_id, mesh_grid_type_ptr);
+		ptr_in_type.self = type_id;
+		set<SPIRVariable>(var_id, type_ptr_id, StorageClassOutput);
+		set_name(var_id, "spvMgp");
+		builtin_task_grid_id = var_id;
+	}
 }
 
 // Checks if the specified builtin variable (e.g. gl_InstanceIndex) is marked as active.
@@ -1509,6 +1566,10 @@ void CompilerMSL::emit_entry_point_declarations()
 			statement(CompilerGLSL::variable_decl(var), ";");
 			var.deferred_declaration = false;
 		}
+
+	// Holds SetMeshOutputsEXT information. Threadgroup since first thread wins.
+	if (processing_entry_point && is_mesh_shader())
+		statement("threadgroup uint2 spvMeshSizes;");
 }
 
 string CompilerMSL::compile()
@@ -1544,6 +1605,8 @@ string CompilerMSL::compile()
 	backend.native_pointers = true;
 	backend.nonuniform_qualifier = "";
 	backend.support_small_type_sampling_result = true;
+	backend.force_merged_mesh_block = false;
+	backend.force_gl_in_out_block = get_execution_model() == ExecutionModelMeshEXT;
 	backend.supports_empty_struct = true;
 	backend.support_64bit_switch = true;
 	backend.boolean_in_struct_remapped_type = SPIRType::Short;
@@ -1559,6 +1622,9 @@ string CompilerMSL::compile()
 	capture_output_to_buffer = msl_options.capture_output_to_buffer;
 	is_rasterization_disabled = msl_options.disable_rasterization || capture_output_to_buffer;
 
+	if (is_mesh_shader() && !get_entry_point().flags.get(ExecutionModeOutputPoints))
+		msl_options.enable_point_size_builtin = false;
+
 	// Initialize array here rather than constructor, MSVC 2013 workaround.
for (auto &id : next_metal_resource_ids) id = 0; @@ -1566,6 +1632,11 @@ string CompilerMSL::compile() fixup_anonymous_struct_names(); fixup_type_alias(); replace_illegal_names(); + if (get_execution_model() == ExecutionModelMeshEXT) + { + // Emit proxy entry-point for the sake of copy-pass + emit_mesh_entry_point(); + } sync_entry_point_aliases_and_names(); build_function_control_flow_graphs_and_analyze(); @@ -1576,8 +1647,7 @@ string CompilerMSL::compile() preprocess_op_codes(); build_implicit_builtins(); - if (needs_manual_helper_invocation_updates() && - (active_input_builtins.get(BuiltInHelperInvocation) || needs_helper_invocation)) + if (needs_manual_helper_invocation_updates() && needs_helper_invocation) { string builtin_helper_invocation = builtin_to_glsl(BuiltInHelperInvocation, StorageClassInput); string discard_expr = join(builtin_helper_invocation, " = true, discard_fragment()"); @@ -1618,9 +1688,17 @@ string CompilerMSL::compile() // Create structs to hold input, output and uniform variables. // Do output first to ensure out. is declared at top of entry function. qual_pos_var_name = ""; - stage_out_var_id = add_interface_block(StorageClassOutput); - patch_stage_out_var_id = add_interface_block(StorageClassOutput, true); - stage_in_var_id = add_interface_block(StorageClassInput); + if (is_mesh_shader()) + { + fixup_implicit_builtin_block_names(get_execution_model()); + } + else + { + stage_out_var_id = add_interface_block(StorageClassOutput); + patch_stage_out_var_id = add_interface_block(StorageClassOutput, true); + stage_in_var_id = add_interface_block(StorageClassInput); + } + if (is_tese_shader()) patch_stage_in_var_id = add_interface_block(StorageClassInput, true); @@ -1629,6 +1707,12 @@ string CompilerMSL::compile() if (is_tessellation_shader()) stage_in_ptr_var_id = add_interface_block_pointer(stage_in_var_id, StorageClassInput); + if (is_mesh_shader()) + { + mesh_out_per_vertex = add_meshlet_block(false); + mesh_out_per_primitive = add_meshlet_block(true); + } + // Metal vertex functions that define no output must disable rasterization and return void. 
if (!stage_out_var_id) is_rasterization_disabled = true; @@ -1721,7 +1805,7 @@ void CompilerMSL::preprocess_op_codes() (is_sample_rate() && (active_input_builtins.get(BuiltInFragCoord) || (need_subpass_input_ms && !msl_options.use_framebuffer_fetch_subpasses)))) needs_sample_id = true; - if (preproc.needs_helper_invocation) + if (preproc.needs_helper_invocation || active_input_builtins.get(BuiltInHelperInvocation)) needs_helper_invocation = true; // OpKill is removed by the parser, so we need to identify those by inspecting @@ -1763,12 +1847,18 @@ void CompilerMSL::localize_global_variables() { uint32_t v_id = *iter; auto &var = get(v_id); - if (var.storage == StorageClassPrivate || var.storage == StorageClassWorkgroup) + if (var.storage == StorageClassPrivate || var.storage == StorageClassWorkgroup || + var.storage == StorageClassTaskPayloadWorkgroupEXT) { if (!variable_is_lut(var)) entry_func.add_local_variable(v_id); iter = global_variables.erase(iter); } + else if (var.storage == StorageClassOutput && is_mesh_shader()) + { + entry_func.add_local_variable(v_id); + iter = global_variables.erase(iter); + } else iter++; } @@ -2058,8 +2148,7 @@ void CompilerMSL::extract_global_variables_from_function(uint32_t func_id, std:: } case OpDemoteToHelperInvocation: - if (needs_manual_helper_invocation_updates() && - (active_input_builtins.get(BuiltInHelperInvocation) || needs_helper_invocation)) + if (needs_manual_helper_invocation_updates() && needs_helper_invocation) added_arg_ids.insert(builtin_helper_invocation_id); break; @@ -2107,18 +2196,30 @@ void CompilerMSL::extract_global_variables_from_function(uint32_t func_id, std:: break; } + case OpSetMeshOutputsEXT: + { + if (builtin_local_invocation_index_id != 0) + added_arg_ids.insert(builtin_local_invocation_index_id); + if (builtin_mesh_sizes_id != 0) + added_arg_ids.insert(builtin_mesh_sizes_id); + break; + } + default: break; } if (needs_manual_helper_invocation_updates() && b.terminator == SPIRBlock::Kill && - (active_input_builtins.get(BuiltInHelperInvocation) || needs_helper_invocation)) + needs_helper_invocation) added_arg_ids.insert(builtin_helper_invocation_id); // TODO: Add all other operations which can affect memory. // We should consider a more unified system here to reduce boiler-plate. // This kind of analysis is done in several places ... 
} + + if (b.terminator == SPIRBlock::EmitMeshTasks && builtin_task_grid_id != 0) + added_arg_ids.insert(builtin_task_grid_id); } function_global_vars[func_id] = added_arg_ids; @@ -2208,6 +2309,17 @@ void CompilerMSL::extract_global_variables_from_function(uint32_t func_id, std:: if (is_tese_shader() && msl_options.raw_buffer_tese_input && var.storage == StorageClassInput) set_decoration(next_id, DecorationNonWritable); } + else if (is_builtin && is_mesh_shader()) + { + uint32_t next_id = ir.increase_bound_by(1); + func.add_parameter(type_id, next_id, true); + auto &v = set(next_id, type_id, StorageClassFunction, 0, arg_id); + v.storage = StorageClassWorkgroup; + + // Ensure the existing variable has a valid name and the new variable has all the same meta info + set_name(arg_id, ensure_valid_name(to_name(arg_id), "v")); + ir.meta[next_id] = ir.meta[arg_id]; + } else if (is_builtin && has_decoration(p_type->self, DecorationBlock)) { // Get the pointee type @@ -4492,6 +4604,42 @@ uint32_t CompilerMSL::add_interface_block_pointer(uint32_t ib_var_id, StorageCla return ib_ptr_var_id; } +uint32_t CompilerMSL::add_meshlet_block(bool per_primitive) +{ + // Accumulate the variables that should appear in the interface struct. + SmallVector vars; + + ir.for_each_typed_id([&](uint32_t, SPIRVariable &var) { + if (var.storage != StorageClassOutput || var.self == builtin_mesh_primitive_indices_id) + return; + if (is_per_primitive_variable(var) != per_primitive) + return; + vars.push_back(&var); + }); + + if (vars.empty()) + return 0; + + uint32_t next_id = ir.increase_bound_by(1); + auto &type = set(next_id, SPIRType(OpTypeStruct)); + type.basetype = SPIRType::Struct; + + InterfaceBlockMeta meta; + for (auto *p_var : vars) + { + meta.strip_array = true; + meta.allow_local_declaration = false; + add_variable_to_interface_block(StorageClassOutput, "", type, *p_var, meta); + } + + if (per_primitive) + set_name(type.self, "spvPerPrimitive"); + else + set_name(type.self, "spvPerVertex"); + + return next_id; +} + // Ensure that the type is compatible with the builtin. // If it is, simply return the given type ID. // Otherwise, create a new type, and return it's ID. @@ -5484,6 +5632,19 @@ void CompilerMSL::emit_custom_templates() begin_scope(); statement("return elements[pos];"); end_scope(); + if (get_execution_model() == spv::ExecutionModelMeshEXT || + get_execution_model() == spv::ExecutionModelTaskEXT) + { + statement(""); + statement("object_data T& operator [] (size_t pos) object_data"); + begin_scope(); + statement("return elements[pos];"); + end_scope(); + statement("constexpr const object_data T& operator [] (size_t pos) const object_data"); + begin_scope(); + statement("return elements[pos];"); + end_scope(); + } end_scope_decl(); statement(""); break; @@ -7601,6 +7762,28 @@ void CompilerMSL::emit_custom_functions() statement(""); break; + case SPVFuncImplMulExtended: + // Compiler may hit an internal error with mulhi, but doesn't when encapsulated for some reason. 
+		statement("template<typename T, typename U, typename V>");
+		statement("[[clang::optnone]] T spvMulExtended(V l, V r)");
+		begin_scope();
+		statement("return T{U(l * r), U(mulhi(l, r))};");
+		end_scope();
+		statement("");
+		break;
+
+	case SPVFuncImplSetMeshOutputsEXT:
+		statement("void spvSetMeshOutputsEXT(uint gl_LocalInvocationIndex, threadgroup uint2& spvMeshSizes, uint vertexCount, uint primitiveCount)");
+		begin_scope();
+		statement("if (gl_LocalInvocationIndex == 0)");
+		begin_scope();
+		statement("spvMeshSizes.x = vertexCount;");
+		statement("spvMeshSizes.y = primitiveCount;");
+		end_scope();
+		end_scope();
+		statement("");
+		break;
+
 	default:
 		break;
 	}
@@ -7702,6 +7885,23 @@ void CompilerMSL::emit_resources()
 	emit_interface_block(patch_stage_out_var_id);
 	emit_interface_block(stage_in_var_id);
 	emit_interface_block(patch_stage_in_var_id);
+
+	if (get_execution_model() == ExecutionModelMeshEXT)
+	{
+		auto &execution = get_entry_point();
+		const char *topology = "";
+		if (execution.flags.get(ExecutionModeOutputTrianglesEXT))
+			topology = "topology::triangle";
+		else if (execution.flags.get(ExecutionModeOutputLinesEXT))
+			topology = "topology::line";
+		else if (execution.flags.get(ExecutionModeOutputPoints))
+			topology = "topology::point";
+
+		const char *per_primitive = mesh_out_per_primitive ? "spvPerPrimitive" : "void";
+		statement("using spvMesh_t = mesh<", "spvPerVertex, ", per_primitive, ", ", execution.output_vertices, ", ",
+		          execution.output_primitives, ", ", topology, ">;");
+		statement("");
+	}
 }
 
 // Emit declarations for the specialization Metal function constants
@@ -7725,7 +7925,7 @@ void CompilerMSL::emit_specialization_constants_and_structs()
 		mark_scalar_layout_structs(type);
 	});
 
-	bool builtin_block_type_is_required = false;
+	bool builtin_block_type_is_required = is_mesh_shader();
 	// Very special case. If gl_PerVertex is initialized as an array (tessellation)
 	// we have to potentially emit the gl_PerVertex struct type so that we can emit a constant LUT.
 	ir.for_each_typed_id<SPIRConstant>([&](uint32_t, SPIRConstant &c) {
@@ -9552,13 +9752,13 @@ void CompilerMSL::emit_instruction(const Instruction &instruction)
 		uint32_t op0 = ops[2];
 		uint32_t op1 = ops[3];
 		auto &type = get<SPIRType>(result_type);
+		auto &op_type = get<SPIRType>(type.member_types[0]);
 		auto input_type = opcode == OpSMulExtended ? int_type : uint_type;
 		string cast_op0, cast_op1;
 
 		binary_op_bitcast_helper(cast_op0, cast_op1, input_type, op0, op1, false);
-		emit_uninitialized_temporary_expression(result_type, result_id);
-		statement(to_expression(result_id), ".", to_member_name(type, 0), " = ", cast_op0, " * ", cast_op1, ";");
-		statement(to_expression(result_id), ".", to_member_name(type, 1), " = mulhi(", cast_op0, ", ", cast_op1, ");");
+		auto expr = join("spvMulExtended<", type_to_glsl(type), ", ", type_to_glsl(op_type), ">(", cast_op0, ", ", cast_op1, ")");
+		emit_op(result_type, result_id, expr, true);
 		break;
 	}
 
@@ -9917,6 +10117,14 @@ void CompilerMSL::emit_instruction(const Instruction &instruction)
 		break;
 	}
 
+	case OpSetMeshOutputsEXT:
+	{
+		flush_variable_declaration(builtin_mesh_primitive_indices_id);
+		add_spv_func_and_recompile(SPVFuncImplSetMeshOutputsEXT);
+		statement("spvSetMeshOutputsEXT(gl_LocalInvocationIndex, spvMeshSizes, ", to_unpacked_expression(ops[0]), ", ", to_unpacked_expression(ops[1]), ");");
+		break;
+	}
+
 	default:
 		CompilerGLSL::emit_instruction(instruction);
 		break;
@@ -9958,8 +10166,13 @@ void CompilerMSL::emit_texture_op(const Instruction &i, bool sparse)
 
 void CompilerMSL::emit_barrier(uint32_t id_exe_scope, uint32_t id_mem_scope, uint32_t id_mem_sem)
 {
-	if (get_execution_model() != ExecutionModelGLCompute && !is_tesc_shader())
+	auto model = get_execution_model();
+
+	if (model != ExecutionModelGLCompute && model != ExecutionModelTaskEXT &&
+	    model != ExecutionModelMeshEXT && !is_tesc_shader())
+	{
 		return;
+	}
 
 	uint32_t exe_scope = id_exe_scope ? evaluate_constant_u32(id_exe_scope) : uint32_t(ScopeInvocation);
 	uint32_t mem_scope = id_mem_scope ? evaluate_constant_u32(id_mem_scope) : uint32_t(ScopeInvocation);
@@ -10305,6 +10518,13 @@ void CompilerMSL::emit_atomic_func_op(uint32_t result_type, uint32_t result_id,
 	{
 		auto obj_expression = to_expression(obj);
 		auto split_index = obj_expression.find_first_of('@');
+		bool needs_reinterpret = opcode == OpAtomicUMax || opcode == OpAtomicUMin || opcode == OpAtomicSMax || opcode == OpAtomicSMin;
+		needs_reinterpret &= type.basetype != expected_type;
+		SPIRVariable *backing_var = nullptr;
+
+		// Request the texture-cast helper eagerly, so it is already registered once we are no longer in "force recompile later" mode.
+		if (needs_reinterpret && (backing_var = maybe_get_backing_variable(obj)))
+			add_spv_func_and_recompile(SPVFuncImplTextureCast);
 
 		// Will only be false if we're in "force recompile later" mode.
 		if (split_index != string::npos)
 		{
@@ -10315,27 +10535,21 @@ void CompilerMSL::emit_atomic_func_op(uint32_t result_type, uint32_t result_id,
 			// Handle problem cases with sign where we need signed min/max on a uint image for example.
 			// It seems to work to cast the texture type itself, even if it is probably wildly outside of spec,
 			// but SPIR-V requires this to work.
-			if ((opcode == OpAtomicUMax || opcode == OpAtomicUMin ||
-			     opcode == OpAtomicSMax || opcode == OpAtomicSMin) &&
-			    type.basetype != expected_type)
+			if (needs_reinterpret && backing_var)
 			{
-				auto *backing_var = maybe_get_backing_variable(obj);
-				if (backing_var)
-				{
-					add_spv_func_and_recompile(SPVFuncImplTextureCast);
+				assert(spv_function_implementations.count(SPVFuncImplTextureCast) && "Should have been added above");
 
-					const auto *backing_type = &get<SPIRType>(backing_var->basetype);
-					while (backing_type->op != OpTypeImage)
-						backing_type = &get<SPIRType>(backing_type->parent_type);
+				const auto *backing_type = &get<SPIRType>(backing_var->basetype);
+				while (backing_type->op != OpTypeImage)
+					backing_type = &get<SPIRType>(backing_type->parent_type);
 
-					auto img_type = *backing_type;
-					auto tmp_type = type;
-					tmp_type.basetype = expected_type;
-					img_type.image.type = ir.increase_bound_by(1);
-					set<SPIRType>(img_type.image.type, tmp_type);
+				auto img_type = *backing_type;
+				auto tmp_type = type;
+				tmp_type.basetype = expected_type;
+				img_type.image.type = ir.increase_bound_by(1);
+				set<SPIRType>(img_type.image.type, tmp_type);
 
-					image_expr = join("spvTextureCast<", type_to_glsl(img_type, obj), ">(", image_expr, ")");
-				}
+				image_expr = join("spvTextureCast<", type_to_glsl(img_type, obj), ">(", image_expr, ")");
 			}
 
 			exp += join(image_expr, ".", op, "(");
@@ -10999,6 +11213,21 @@ void CompilerMSL::emit_function_prototype(SPIRFunction &func, const Bitset &)
 			if (ir.ids[initializer].get_type() == TypeNone || ir.ids[initializer].get_type() == TypeExpression)
 				set<SPIRExpression>(ed_var.initializer, "{}", ed_var.basetype, true);
 		}
+
+		// Add the `taskPayloadSharedEXT` variable to the entry-point arguments.
+		for (auto &v : func.local_variables)
+		{
+			auto &var = get<SPIRVariable>(v);
+			if (var.storage != StorageClassTaskPayloadWorkgroupEXT)
+				continue;
+
+			add_local_variable_name(v);
+			SPIRFunction::Parameter arg = {};
+			arg.id = v;
+			arg.type = var.basetype;
+			arg.alias_global_variable = true;
+			decl += join(", ", argument_decl(arg), " [[payload]]");
+		}
 	}
 
 	for (auto &arg : func.arguments)
@@ -11316,7 +11545,7 @@ string CompilerMSL::to_function_args(const TextureFunctionArguments &args, bool
 	if (args.has_array_offsets)
 	{
 		forward = forward && should_forward(args.offset);
-		farg_str += ", " + to_expression(args.offset);
+		farg_str += ", " + to_unpacked_expression(args.offset);
 	}
 
 	// Const offsets gather or swizzled gather puts the component before the other args.
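As a concrete illustration of the two mesh-output pieces above (the SPVFuncImplSetMeshOutputsEXT helper and the OpSetMeshOutputsEXT call site), the generated MSL should come out roughly as follows. The helper body and the call shape mirror the statement() calls in this patch; the surrounding context and the literal counts are placeholders, not output captured from this change:

    // Emitted once per shader when SPVFuncImplSetMeshOutputsEXT is requested.
    void spvSetMeshOutputsEXT(uint gl_LocalInvocationIndex, threadgroup uint2& spvMeshSizes,
                              uint vertexCount, uint primitiveCount)
    {
        // First thread wins; spvMeshSizes lives in threadgroup memory.
        if (gl_LocalInvocationIndex == 0)
        {
            spvMeshSizes.x = vertexCount;
            spvMeshSizes.y = primitiveCount;
        }
    }

    // Emitted at the OpSetMeshOutputsEXT call site (counts here are illustrative):
    spvSetMeshOutputsEXT(gl_LocalInvocationIndex, spvMeshSizes, 3u, 1u);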
@@ -11329,7 +11558,7 @@ string CompilerMSL::to_function_args(const TextureFunctionArguments &args, bool // Texture coordinates forward = forward && should_forward(args.coord); - auto coord_expr = to_enclosed_expression(args.coord); + auto coord_expr = to_enclosed_unpacked_expression(args.coord); auto &coord_type = expression_type(args.coord); bool coord_is_fp = type_is_floating_point(coord_type); bool is_cube_fetch = false; @@ -11453,14 +11682,14 @@ string CompilerMSL::to_function_args(const TextureFunctionArguments &args, bool if (type.basetype != SPIRType::UInt) tex_coords += join(" + uint2(", bitcast_expression(SPIRType::UInt, args.offset), ", 0)"); else - tex_coords += join(" + uint2(", to_enclosed_expression(args.offset), ", 0)"); + tex_coords += join(" + uint2(", to_enclosed_unpacked_expression(args.offset), ", 0)"); } else { if (type.basetype != SPIRType::UInt) tex_coords += " + " + bitcast_expression(SPIRType::UInt, args.offset); else - tex_coords += " + " + to_enclosed_expression(args.offset); + tex_coords += " + " + to_enclosed_unpacked_expression(args.offset); } } @@ -11547,10 +11776,10 @@ string CompilerMSL::to_function_args(const TextureFunctionArguments &args, bool string dref_expr; if (args.base.is_proj) - dref_expr = join(to_enclosed_expression(args.dref), " / ", + dref_expr = join(to_enclosed_unpacked_expression(args.dref), " / ", to_extract_component_expression(args.coord, alt_coord_component)); else - dref_expr = to_expression(args.dref); + dref_expr = to_unpacked_expression(args.dref); if (sampling_type_needs_f32_conversion(dref_type)) dref_expr = convert_to_f32(dref_expr, 1); @@ -11601,7 +11830,7 @@ string CompilerMSL::to_function_args(const TextureFunctionArguments &args, bool if (bias && (imgtype.image.dim != Dim1D || msl_options.texture_1D_as_2D)) { forward = forward && should_forward(bias); - farg_str += ", bias(" + to_expression(bias) + ")"; + farg_str += ", bias(" + to_unpacked_expression(bias) + ")"; } // Metal does not support LOD for 1D textures. 
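The to_expression() to to_unpacked_expression() substitutions in these sampling hunks are about physical-type remapping: when an argument such as a coordinate, bias, or LOD is stored in a packed or otherwise remapped member, the expression must first be converted back to its logical type before it is handed to the MSL sampling intrinsics. A minimal sketch of the intent, with all names and types invented for illustration:

    struct VertexOut { packed_float3 coord; float lodBias; };
    // to_expression(args.coord) would yield the raw packed member (in.coord);
    // the unpacked form re-wraps it in the logical vector type first:
    float4 c = tex.sample(smp, float3(in.coord), bias(in.lodBias));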
@@ -11610,7 +11839,7 @@ string CompilerMSL::to_function_args(const TextureFunctionArguments &args, bool forward = forward && should_forward(lod); if (args.base.is_fetch) { - farg_str += ", " + to_expression(lod); + farg_str += ", " + to_unpacked_expression(lod); } else if (msl_options.sample_dref_lod_array_as_grad && args.dref && imgtype.image.arrayed) { @@ -11667,12 +11896,12 @@ string CompilerMSL::to_function_args(const TextureFunctionArguments &args, bool extent = "float3(1.0)"; break; } - farg_str += join(", ", grad_opt, "(", grad_coord, "exp2(", to_expression(lod), " - 0.5) / ", extent, - ", exp2(", to_expression(lod), " - 0.5) / ", extent, ")"); + farg_str += join(", ", grad_opt, "(", grad_coord, "exp2(", to_unpacked_expression(lod), " - 0.5) / ", extent, + ", exp2(", to_unpacked_expression(lod), " - 0.5) / ", extent, ")"); } else { - farg_str += ", level(" + to_expression(lod) + ")"; + farg_str += ", level(" + to_unpacked_expression(lod) + ")"; } } else if (args.base.is_fetch && !lod && (imgtype.image.dim != Dim1D || msl_options.texture_1D_as_2D) && @@ -11718,7 +11947,7 @@ string CompilerMSL::to_function_args(const TextureFunctionArguments &args, bool grad_opt = "unsupported_gradient_dimension"; break; } - farg_str += join(", ", grad_opt, "(", grad_coord, to_expression(grad_x), ", ", to_expression(grad_y), ")"); + farg_str += join(", ", grad_opt, "(", grad_coord, to_unpacked_expression(grad_x), ", ", to_unpacked_expression(grad_y), ")"); } if (args.min_lod) @@ -11727,7 +11956,7 @@ string CompilerMSL::to_function_args(const TextureFunctionArguments &args, bool SPIRV_CROSS_THROW("min_lod_clamp() is only supported in MSL 2.2+ and up."); forward = forward && should_forward(args.min_lod); - farg_str += ", min_lod_clamp(" + to_expression(args.min_lod) + ")"; + farg_str += ", min_lod_clamp(" + to_unpacked_expression(args.min_lod) + ")"; } // Add offsets @@ -11736,7 +11965,7 @@ string CompilerMSL::to_function_args(const TextureFunctionArguments &args, bool if (args.offset && !args.base.is_fetch && !args.has_array_offsets) { forward = forward && should_forward(args.offset); - offset_expr = to_expression(args.offset); + offset_expr = to_unpacked_expression(args.offset); offset_type = &expression_type(args.offset); } @@ -11802,7 +12031,7 @@ string CompilerMSL::to_function_args(const TextureFunctionArguments &args, bool { forward = forward && should_forward(args.sample); farg_str += ", "; - farg_str += to_expression(args.sample); + farg_str += to_unpacked_expression(args.sample); } *p_forward = forward; @@ -12454,12 +12683,50 @@ string CompilerMSL::to_struct_member(const SPIRType &type, uint32_t member_type_ ((stage_out_var_id && get_stage_out_struct_type().self == type.self && variable_storage_requires_stage_io(StorageClassOutput)) || (stage_in_var_id && get_stage_in_struct_type().self == type.self && - variable_storage_requires_stage_io(StorageClassInput))); + variable_storage_requires_stage_io(StorageClassInput))) || + is_mesh_shader(); if (is_ib_in_out && is_member_builtin(type, index, &builtin)) is_using_builtin_array = true; array_type = type_to_array_glsl(physical_type, orig_id); } + if (is_mesh_shader()) + { + BuiltIn builtin = BuiltInMax; + if (is_member_builtin(type, index, &builtin)) + { + if (builtin == BuiltInPrimitiveShadingRateKHR) + { + // not supported in metal 3.0 + is_using_builtin_array = false; + return ""; + } + + SPIRType metallic_type = *declared_type; + if (builtin == BuiltInCullPrimitiveEXT) + metallic_type.basetype = SPIRType::Boolean; + else if (builtin == 
BuiltInPrimitiveId || builtin == BuiltInLayer || builtin == BuiltInViewportIndex) + metallic_type.basetype = SPIRType::UInt; + + is_using_builtin_array = true; + std::string result; + if (has_member_decoration(type.self, orig_id, DecorationBuiltIn)) + { + // avoid '_RESERVED_IDENTIFIER_FIXUP_' in variable name + result = join(type_to_glsl(metallic_type, orig_id, false), " ", qualifier, + builtin_to_glsl(builtin, StorageClassOutput), member_attribute_qualifier(type, index), + array_type, ";"); + } + else + { + result = join(type_to_glsl(metallic_type, orig_id, false), " ", qualifier, + to_member_name(type, index), member_attribute_qualifier(type, index), array_type, ";"); + } + is_using_builtin_array = false; + return result; + } + } + if (orig_id) { auto *data_type = declared_type; @@ -12513,6 +12780,16 @@ void CompilerMSL::emit_struct_member(const SPIRType &type, uint32_t member_type_ statement("char _m", index, "_pad", "[", pad_len, "];"); } + BuiltIn builtin = BuiltInMax; + if (is_mesh_shader() && is_member_builtin(type, index, &builtin)) + { + if (!has_active_builtin(builtin, StorageClassOutput) && !has_active_builtin(builtin, StorageClassInput)) + { + // Do not emit unused builtins in mesh-output blocks + return; + } + } + // Handle HLSL-style 0-based vertex/instance index. builtin_declaration = true; statement(to_struct_member(type, member_type_id, index, qualifier)); @@ -12586,9 +12863,11 @@ string CompilerMSL::member_attribute_qualifier(const SPIRType &type, uint32_t in return string(" [[attribute(") + convert_to_string(locn) + ")]]"; } - // Vertex and tessellation evaluation function outputs - if (((execution.model == ExecutionModelVertex && !msl_options.vertex_for_tessellation) || is_tese_shader()) && - type.storage == StorageClassOutput) + bool use_semantic_stage_output = is_mesh_shader() || is_tese_shader() || + (execution.model == ExecutionModelVertex && !msl_options.vertex_for_tessellation); + + // Vertex, mesh and tessellation evaluation function outputs + if ((type.storage == StorageClassOutput || is_mesh_shader()) && use_semantic_stage_output) { if (is_builtin) { @@ -12607,6 +12886,9 @@ string CompilerMSL::member_attribute_qualifier(const SPIRType &type, uint32_t in /* fallthrough */ case BuiltInPosition: case BuiltInLayer: + case BuiltInCullPrimitiveEXT: + case BuiltInPrimitiveShadingRateKHR: + case BuiltInPrimitiveId: return string(" [[") + builtin_qualifier(builtin) + "]]" + (mbr_type.array.empty() ? "" : " "); case BuiltInClipDistance: @@ -12760,17 +13042,10 @@ string CompilerMSL::member_attribute_qualifier(const SPIRType &type, uint32_t in else quals = member_location_attribute_qualifier(type, index); - if (builtin == BuiltInBaryCoordKHR || builtin == BuiltInBaryCoordNoPerspKHR) + if (builtin == BuiltInBaryCoordKHR && has_member_decoration(type.self, index, DecorationNoPerspective)) { - if (has_member_decoration(type.self, index, DecorationFlat) || - has_member_decoration(type.self, index, DecorationCentroid) || - has_member_decoration(type.self, index, DecorationSample) || - has_member_decoration(type.self, index, DecorationNoPerspective)) - { - // NoPerspective is baked into the builtin type. - SPIRV_CROSS_THROW( - "Flat, Centroid, Sample, NoPerspective decorations are not supported for BaryCoord inputs."); - } + // NoPerspective is baked into the builtin type. 
+ SPIRV_CROSS_THROW("NoPerspective decorations are not supported for BaryCoord inputs."); } // Don't bother decorating integers with the 'flat' attribute; it's @@ -12788,6 +13063,10 @@ string CompilerMSL::member_attribute_qualifier(const SPIRType &type, uint32_t in { if (!quals.empty()) quals += ", "; + + if (builtin == BuiltInBaryCoordNoPerspKHR || builtin == BuiltInBaryCoordKHR) + SPIRV_CROSS_THROW("Centroid interpolation not supported for barycentrics in MSL."); + if (has_member_decoration(type.self, index, DecorationNoPerspective)) quals += "centroid_no_perspective"; else @@ -12797,17 +13076,27 @@ string CompilerMSL::member_attribute_qualifier(const SPIRType &type, uint32_t in { if (!quals.empty()) quals += ", "; + + if (builtin == BuiltInBaryCoordNoPerspKHR || builtin == BuiltInBaryCoordKHR) + SPIRV_CROSS_THROW("Sample interpolation not supported for barycentrics in MSL."); + if (has_member_decoration(type.self, index, DecorationNoPerspective)) quals += "sample_no_perspective"; else quals += "sample_perspective"; } - else if (has_member_decoration(type.self, index, DecorationNoPerspective)) + else if (has_member_decoration(type.self, index, DecorationNoPerspective) || builtin == BuiltInBaryCoordNoPerspKHR) { if (!quals.empty()) quals += ", "; quals += "center_no_perspective"; } + else if (builtin == BuiltInBaryCoordKHR) + { + if (!quals.empty()) + quals += ", "; + quals += "center_perspective"; + } } if (!quals.empty()) @@ -13070,6 +13359,12 @@ string CompilerMSL::func_type_decl(SPIRType &type) case ExecutionModelKernel: entry_type = "kernel"; break; + case ExecutionModelMeshEXT: + entry_type = "[[mesh]]"; + break; + case ExecutionModelTaskEXT: + entry_type = "[[object]]"; + break; default: entry_type = "unknown"; break; @@ -13088,6 +13383,11 @@ bool CompilerMSL::is_tese_shader() const return get_execution_model() == ExecutionModelTessellationEvaluation; } +bool CompilerMSL::is_mesh_shader() const +{ + return get_execution_model() == spv::ExecutionModelMeshEXT; +} + bool CompilerMSL::uses_explicit_early_fragment_test() { auto &ep_flags = get_entry_point().flags; @@ -13203,6 +13503,16 @@ string CompilerMSL::get_type_address_space(const SPIRType &type, uint32_t id, bo if (!addr_space) addr_space = "device"; } + + if (is_mesh_shader()) + addr_space = "threadgroup"; + break; + + case StorageClassTaskPayloadWorkgroupEXT: + if (is_mesh_shader()) + addr_space = "const object_data"; + else + addr_space = "object_data"; break; default: @@ -13215,7 +13525,10 @@ string CompilerMSL::get_type_address_space(const SPIRType &type, uint32_t id, bo addr_space = type.pointer || (argument && type.basetype == SPIRType::ControlPointArray) ? "thread" : ""; } - return join(decoration_flags_signal_volatile(flags) ? 
"volatile " : "", addr_space); + if (decoration_flags_signal_volatile(flags) && 0 != strcmp(addr_space, "thread")) + return join("volatile ", addr_space); + else + return addr_space; } const char *CompilerMSL::to_restrict(uint32_t id, bool space) @@ -13601,6 +13914,20 @@ void CompilerMSL::entry_point_args_builtin(string &ep_args) " [[buffer(", convert_to_string(msl_options.shader_input_buffer_index), ")]]"); } } + + if (is_mesh_shader()) + { + if (!ep_args.empty()) + ep_args += ", "; + ep_args += join("spvMesh_t spvMesh"); + } + + if (get_execution_model() == ExecutionModelTaskEXT) + { + if (!ep_args.empty()) + ep_args += ", "; + ep_args += join("mesh_grid_properties spvMgp"); + } } string CompilerMSL::entry_point_args_argument_buffer(bool append_comma) @@ -13872,6 +14199,7 @@ void CompilerMSL::entry_point_args_discrete_descriptors(string &ep_args) } else { + add_spv_func_and_recompile(SPVFuncImplVariableDescriptor); ep_args += "const device spvDescriptor<" + get_argument_address_space(var) + " " + type_to_glsl(type) + "*>* "; } @@ -14038,6 +14366,14 @@ void CompilerMSL::fix_up_shader_inputs_outputs() }); } + if (is_mesh_shader()) + { + // If shader doesn't call SetMeshOutputsEXT, nothing should be rendered. + // No need to barrier after this, because only thread 0 writes to this later. + entry_func.fixup_hooks_in.push_back([this]() { statement("if (gl_LocalInvocationIndex == 0) spvMeshSizes.y = 0u;"); }); + entry_func.fixup_hooks_out.push_back([this]() { emit_mesh_outputs(); }); + } + // Look for sampled images and buffer. Add hooks to set up the swizzle constants or array lengths. ir.for_each_typed_id([&](uint32_t, SPIRVariable &var) { auto &type = get_variable_data_type(var); @@ -14838,7 +15174,8 @@ string CompilerMSL::argument_decl(const SPIRFunction::Parameter &arg) if (var.basevariable && (var.basevariable == stage_in_ptr_var_id || var.basevariable == stage_out_ptr_var_id)) decl = join(cv_qualifier, type_to_glsl(type, arg.id)); - else if (builtin) + else if (builtin && builtin_type != spv::BuiltInPrimitiveTriangleIndicesEXT && + builtin_type != spv::BuiltInPrimitiveLineIndicesEXT && builtin_type != spv::BuiltInPrimitivePointIndicesEXT) { // Only use templated array for Clip/Cull distance when feasible. 
// In other scenarios, we need need to override array length for tess levels (if used as outputs), @@ -15252,6 +15589,7 @@ const std::unordered_set &CompilerMSL::get_illegal_func_names() "fmin3", "fmax3", "divide", + "fmod", "median3", "VARIABLE_TRACEPOINT", "STATIC_DATA_TRACEPOINT", @@ -15474,6 +15812,9 @@ string CompilerMSL::to_qualifiers_glsl(uint32_t id) auto *var = maybe_get(id); auto &type = expression_type(id); + if (type.storage == StorageClassTaskPayloadWorkgroupEXT) + quals += "object_data "; + if (type.storage == StorageClassWorkgroup || (var && variable_decl_is_remapped_storage(*var, StorageClassWorkgroup))) quals += "threadgroup "; @@ -15658,6 +15999,8 @@ string CompilerMSL::type_to_glsl(const SPIRType &type, uint32_t id, bool member) break; case SPIRType::RayQuery: return "raytracing::intersection_query"; + case SPIRType::MeshGridProperties: + return "mesh_grid_properties"; default: return "unknown_type"; @@ -15772,6 +16115,9 @@ bool CompilerMSL::variable_decl_is_remapped_storage(const SPIRVariable &variable return true; } + if (is_mesh_shader()) + return variable.storage == StorageClassOutput; + return variable.storage == StorageClassOutput && is_tesc_shader() && is_stage_output_variable_masked(variable); } else if (storage == StorageClassStorageBuffer) @@ -16541,6 +16887,8 @@ string CompilerMSL::builtin_to_glsl(BuiltIn builtin, StorageClass storage) case BuiltInLayer: if (is_tesc_shader()) break; + if (is_mesh_shader()) + break; if (storage != StorageClassInput && current_function && (current_function->self == ir.default_entry_point) && !is_stage_output_builtin_masked(builtin)) return stage_out_var_name + "." + CompilerGLSL::builtin_to_glsl(builtin, storage); @@ -16598,6 +16946,9 @@ string CompilerMSL::builtin_to_glsl(BuiltIn builtin, StorageClass storage) // In SPIR-V 1.6 with Volatile HelperInvocation, we cannot emit a fixup early. return "simd_is_helper_thread()"; + case BuiltInPrimitiveId: + return "gl_PrimitiveID"; + default: break; } @@ -16631,6 +16982,8 @@ string CompilerMSL::builtin_qualifier(BuiltIn builtin) // Vertex function out case BuiltInClipDistance: return "clip_distance"; + case BuiltInCullDistance: + return "cull_distance"; case BuiltInPointSize: return "point_size"; case BuiltInPosition: @@ -16678,6 +17031,8 @@ string CompilerMSL::builtin_qualifier(BuiltIn builtin) else if (msl_options.is_macos() && !msl_options.supports_msl_version(2, 2)) SPIRV_CROSS_THROW("PrimitiveId on macOS requires MSL 2.2."); return "primitive_id"; + case ExecutionModelMeshEXT: + return "primitive_id"; default: SPIRV_CROSS_THROW("PrimitiveId is not supported in this execution model."); } @@ -16707,7 +17062,7 @@ string CompilerMSL::builtin_qualifier(BuiltIn builtin) // Shouldn't be reached. SPIRV_CROSS_THROW("Sample position is retrieved by a function in MSL."); case BuiltInViewIndex: - if (execution.model != ExecutionModelFragment) + if (execution.model != ExecutionModelFragment && execution.model != ExecutionModelMeshEXT) SPIRV_CROSS_THROW("ViewIndex is handled specially outside fragment shaders."); // The ViewIndex was implicitly used in the prior stages to set the render_target_array_index, // so we can get it from there. 
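Combining the pieces so far (the spvMesh_t alias from emit_resources(), the spvMesh entry-point argument, the threadgroup spvMeshSizes declaration, and the fixup hooks), a generated [[mesh]] entry point is expected to look roughly like the sketch below. The vertex/primitive limits, the spvPerVertex/spvPerPrimitive contents, and the builtin attribute spelling are assumptions for illustration only:

    using spvMesh_t = mesh<spvPerVertex, spvPerPrimitive, 64, 126, topology::triangle>;

    [[mesh]] void main0(spvMesh_t spvMesh,
                        uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]])
    {
        threadgroup uint2 spvMeshSizes;
        // Fixup hook: if the shader never calls SetMeshOutputsEXT, nothing is rendered.
        if (gl_LocalInvocationIndex == 0)
            spvMeshSizes.y = 0u;
        // ... user code, spvSetMeshOutputsEXT(...), barrier, then the copy-out pass ...
    }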
@@ -16805,18 +17160,15 @@ string CompilerMSL::builtin_qualifier(BuiltIn builtin) SPIRV_CROSS_THROW("Subgroup ballot masks are handled specially in MSL."); case BuiltInBaryCoordKHR: - if (msl_options.is_ios() && !msl_options.supports_msl_version(2, 3)) - SPIRV_CROSS_THROW("Barycentrics are only supported in MSL 2.3 and above on iOS."); - else if (!msl_options.supports_msl_version(2, 2)) - SPIRV_CROSS_THROW("Barycentrics are only supported in MSL 2.2 and above on macOS."); - return "barycentric_coord, center_perspective"; - case BuiltInBaryCoordNoPerspKHR: if (msl_options.is_ios() && !msl_options.supports_msl_version(2, 3)) SPIRV_CROSS_THROW("Barycentrics are only supported in MSL 2.3 and above on iOS."); else if (!msl_options.supports_msl_version(2, 2)) SPIRV_CROSS_THROW("Barycentrics are only supported in MSL 2.2 and above on macOS."); - return "barycentric_coord, center_no_perspective"; + return "barycentric_coord"; + + case BuiltInCullPrimitiveEXT: + return "primitive_culled"; default: return "unsupported-built-in"; @@ -16934,6 +17286,13 @@ string CompilerMSL::builtin_type_decl(BuiltIn builtin, uint32_t id) case BuiltInDeviceIndex: return "int"; + case BuiltInPrimitivePointIndicesEXT: + return "uint"; + case BuiltInPrimitiveLineIndicesEXT: + return "uint2"; + case BuiltInPrimitiveTriangleIndicesEXT: + return "uint3"; + default: return "unsupported-built-in-type"; } @@ -17704,6 +18063,10 @@ CompilerMSL::SPVFuncImpl CompilerMSL::OpCodePreprocessor::get_spv_func_impl(Op o case OpSUDotAccSat: return SPVFuncImplReduceAdd; + case OpSMulExtended: + case OpUMulExtended: + return SPVFuncImplMulExtended; + default: break; } @@ -18441,6 +18804,7 @@ void CompilerMSL::analyze_argument_buffers() uint32_t member_index = 0; uint32_t next_arg_buff_index = 0; + uint32_t prev_was_scalar_on_array_offset = 0; for (auto &resource : resources) { auto &var = *resource.var; @@ -18453,7 +18817,9 @@ void CompilerMSL::analyze_argument_buffers() // member_index and next_arg_buff_index are incremented when padding members are added. if (msl_options.pad_argument_buffer_resources && resource.plane == 0 && resource.overlapping_var_id == 0) { - auto rez_bind = get_argument_buffer_resource(desc_set, next_arg_buff_index); + auto rez_bind = get_argument_buffer_resource(desc_set, next_arg_buff_index - prev_was_scalar_on_array_offset); + rez_bind.count -= prev_was_scalar_on_array_offset; + while (resource.index > next_arg_buff_index) { switch (rez_bind.basetype) @@ -18492,12 +18858,19 @@ void CompilerMSL::analyze_argument_buffers() // After padding, retrieve the resource again. It will either be more padding, or the actual resource. rez_bind = get_argument_buffer_resource(desc_set, next_arg_buff_index); + prev_was_scalar_on_array_offset = 0; } + uint32_t count = rez_bind.count; + + // If the current resource is an array in the descriptor, but is a scalar + // in the shader, only the first element will be consumed. The next pass + // will add a padding member to consume the remaining array elements. + if (count > 1 && type.array.empty()) + count = prev_was_scalar_on_array_offset = 1; + // Adjust the number of slots consumed by current member itself. - // Use the count value from the app, instead of the shader, in case the - // shader is only accessing part, or even one element, of the array. 
-		next_arg_buff_index += resource.plane_count * rez_bind.count;
+		next_arg_buff_index += resource.plane_count * count;
 		}
 
 		string mbr_name = ensure_valid_name(resource.name, "m");
@@ -18788,6 +19161,224 @@ void CompilerMSL::emit_block_hints(const SPIRBlock &)
 {
 }
 
+void CompilerMSL::emit_mesh_entry_point()
+{
+	auto &ep = get_entry_point();
+	auto &f = get<SPIRFunction>(ir.default_entry_point);
+
+	const uint32_t func_id = ir.increase_bound_by(3);
+	const uint32_t block_id = func_id + 1;
+	const uint32_t ret_id = func_id + 2;
+	auto &wrapped_main = set<SPIRFunction>(func_id, f.return_type, f.function_type);
+
+	wrapped_main.blocks.push_back(block_id);
+	wrapped_main.entry_block = block_id;
+
+	auto &wrapped_entry = set<SPIRBlock>(block_id);
+	wrapped_entry.terminator = SPIRBlock::Return;
+
+	// Push call to original 'main'
+	Instruction ix = {};
+	ix.op = OpFunctionCall;
+	ix.offset = uint32_t(ir.spirv.size());
+	ix.length = 3;
+
+	ir.spirv.push_back(f.return_type);
+	ir.spirv.push_back(ret_id);
+	ir.spirv.push_back(ep.self);
+
+	wrapped_entry.ops.push_back(ix);
+
+	// Replace the entry point with the new one.
+	SPIREntryPoint proxy_ep = ep;
+	proxy_ep.self = func_id;
+	ir.entry_points.insert(std::make_pair(func_id, proxy_ep));
+	ir.meta[func_id] = ir.meta[ir.default_entry_point];
+	ir.meta[ir.default_entry_point].decoration.alias.clear();
+
+	ir.default_entry_point = func_id;
+}
+
+void CompilerMSL::emit_mesh_outputs()
+{
+	auto &mode = get_entry_point();
+
+	// Predefined thread count, or zero if a specialization constant is in use.
+	uint32_t num_invocations = 0;
+	if (mode.workgroup_size.id_x == 0 && mode.workgroup_size.id_y == 0 && mode.workgroup_size.id_z == 0)
+		num_invocations = mode.workgroup_size.x * mode.workgroup_size.y * mode.workgroup_size.z;
+
+	statement("threadgroup_barrier(mem_flags::mem_threadgroup);");
+	statement("if (spvMeshSizes.y == 0)");
+	begin_scope();
+	statement("return;");
+	end_scope();
+	statement("spvMesh.set_primitive_count(spvMeshSizes.y);");
+
+	statement("const uint spvThreadCount [[maybe_unused]] = (gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z);");
+
+	if (mesh_out_per_vertex != 0)
+	{
+		auto &type_vert = get<SPIRType>(mesh_out_per_vertex);
+
+		if (num_invocations < mode.output_vertices)
+		{
+			statement("for (uint spvVI = gl_LocalInvocationIndex; spvVI < spvMeshSizes.x; spvVI += spvThreadCount)");
+		}
+		else
+		{
+			statement("const uint spvVI = gl_LocalInvocationIndex;");
+			statement("if (gl_LocalInvocationIndex < spvMeshSizes.x)");
+		}
+
+		begin_scope();
+
+		statement("spvPerVertex spvV = {};");
+		for (uint32_t index = 0; index < uint32_t(type_vert.member_types.size()); ++index)
+		{
+			uint32_t orig_var = get_extended_member_decoration(type_vert.self, index, SPIRVCrossDecorationInterfaceOrigID);
+			uint32_t orig_id = get_extended_member_decoration(type_vert.self, index, SPIRVCrossDecorationInterfaceMemberIndex);
+
+			// Clip/cull distances are special-cased.
+			if (orig_var == 0 && orig_id == (~0u))
+				continue;
+
+			auto &orig = get<SPIRVariable>(orig_var);
+			auto &orig_type = get<SPIRType>(orig.basetype);
+
+			// FIXME: Need to deal with complex composite IO types. These may need extra unroll, etc.
+
+			BuiltIn builtin = BuiltInMax;
+			std::string access;
+			if (orig_type.basetype == SPIRType::Struct)
+			{
+				if (has_member_decoration(orig_type.self, orig_id, DecorationBuiltIn))
+					builtin = BuiltIn(get_member_decoration(orig_type.self, orig_id, DecorationBuiltIn));
+
+				switch (builtin)
+				{
+				case BuiltInPosition:
+				case BuiltInPointSize:
+				case BuiltInClipDistance:
+				case BuiltInCullDistance:
+					access = "." + builtin_to_glsl(builtin, StorageClassOutput);
+					break;
+				default:
+					access = "." + to_member_name(orig_type, orig_id);
+					break;
+				}
+
+				if (has_member_decoration(type_vert.self, index, DecorationIndex))
+				{
+					// Declare the Clip/CullDistance as [[user(clip/cullN)]].
+					const uint32_t orig_index = get_member_decoration(type_vert.self, index, DecorationIndex);
+					access += "[" + to_string(orig_index) + "]";
+					statement("spvV.", builtin_to_glsl(builtin, StorageClassOutput), "[", orig_index, "] = ", to_name(orig_var), "[spvVI]", access, ";");
+				}
+			}
+
+			statement("spvV.", to_member_name(type_vert, index), " = ", to_name(orig_var), "[spvVI]", access, ";");
+			if (options.vertex.flip_vert_y && builtin == BuiltInPosition)
+			{
+				statement("spvV.", to_member_name(type_vert, index), ".y = -(", "spvV.",
+				          to_member_name(type_vert, index), ".y);", " // Invert Y-axis for Metal");
+			}
+		}
+		statement("spvMesh.set_vertex(spvVI, spvV);");
+		end_scope();
+	}
+
+	if (mesh_out_per_primitive != 0 || builtin_mesh_primitive_indices_id != 0)
+	{
+		if (num_invocations < mode.output_primitives)
+		{
+			statement("for (uint spvPI = gl_LocalInvocationIndex; spvPI < spvMeshSizes.y; spvPI += spvThreadCount)");
+		}
+		else
+		{
+			statement("const uint spvPI = gl_LocalInvocationIndex;");
+			statement("if (gl_LocalInvocationIndex < spvMeshSizes.y)");
+		}
+
+		// FIXME: Need to deal with complex composite IO types. These may need extra unroll, etc.
+
+		begin_scope();
+
+		if (builtin_mesh_primitive_indices_id != 0)
+		{
+			if (mode.flags.get(ExecutionModeOutputTrianglesEXT))
+			{
+				statement("spvMesh.set_index(spvPI * 3u + 0u, gl_PrimitiveTriangleIndicesEXT[spvPI].x);");
+				statement("spvMesh.set_index(spvPI * 3u + 1u, gl_PrimitiveTriangleIndicesEXT[spvPI].y);");
+				statement("spvMesh.set_index(spvPI * 3u + 2u, gl_PrimitiveTriangleIndicesEXT[spvPI].z);");
+			}
+			else if (mode.flags.get(ExecutionModeOutputLinesEXT))
+			{
+				statement("spvMesh.set_index(spvPI * 2u + 0u, gl_PrimitiveLineIndicesEXT[spvPI].x);");
+				statement("spvMesh.set_index(spvPI * 2u + 1u, gl_PrimitiveLineIndicesEXT[spvPI].y);");
+			}
+			else
+			{
+				statement("spvMesh.set_index(spvPI, gl_PrimitivePointIndicesEXT[spvPI]);");
+			}
+		}
+
+		if (mesh_out_per_primitive != 0)
+		{
+			auto &type_prim = get<SPIRType>(mesh_out_per_primitive);
+			statement("spvPerPrimitive spvP = {};");
+			for (uint32_t index = 0; index < uint32_t(type_prim.member_types.size()); ++index)
+			{
+				uint32_t orig_var =
+				    get_extended_member_decoration(type_prim.self, index, SPIRVCrossDecorationInterfaceOrigID);
+				uint32_t orig_id =
+				    get_extended_member_decoration(type_prim.self, index, SPIRVCrossDecorationInterfaceMemberIndex);
+				auto &orig = get<SPIRVariable>(orig_var);
+				auto &orig_type = get<SPIRType>(orig.basetype);
+
+				BuiltIn builtin = BuiltInMax;
+				std::string access;
+				if (orig_type.basetype == SPIRType::Struct)
+				{
+					if (has_member_decoration(orig_type.self, orig_id, DecorationBuiltIn))
+						builtin = BuiltIn(get_member_decoration(orig_type.self, orig_id, DecorationBuiltIn));
+
+					switch (builtin)
+					{
+					case BuiltInPrimitiveId:
+					case BuiltInLayer:
+					case BuiltInViewportIndex:
+					case BuiltInCullPrimitiveEXT:
+					case BuiltInPrimitiveShadingRateKHR:
+						access = "." + builtin_to_glsl(builtin, StorageClassOutput);
+						break;
+					default:
+						access = "." + to_member_name(orig_type, orig_id);
+					}
+				}
+				statement("spvP.", to_member_name(type_prim, index), " = ", to_name(orig_var), "[spvPI]", access, ";");
+			}
+			statement("spvMesh.set_primitive(spvPI, spvP);");
+		}
+
+		end_scope();
+	}
+}
+
+void CompilerMSL::emit_mesh_tasks(SPIRBlock &block)
+{
+	// GLSL: Once this instruction is called, the workgroup must be terminated immediately, and the mesh shaders are launched.
+	// TODO: Find a reliable and clean way of terminating the shader.
+	flush_variable_declaration(builtin_task_grid_id);
+	statement("spvMgp.set_threadgroups_per_grid(uint3(", to_unpacked_expression(block.mesh.groups[0]), ", ",
+	          to_unpacked_expression(block.mesh.groups[1]), ", ", to_unpacked_expression(block.mesh.groups[2]), "));");
+	// This is correct if EmitMeshTasks is called in the entry function for the shader.
+	// Only viable solutions would be:
+	// - Caller ensures the SPIR-V is inlined, then this always holds true.
+	// - Pass down a "should terminate" bool to leaf functions and chain return (horrible and disgusting, let's not).
+	statement("return;");
+}
+
 string CompilerMSL::additional_fixed_sample_mask_str() const
 {
 	char print_buffer[32];
diff --git a/thirdparty/spirv-cross/spirv_msl.hpp b/thirdparty/spirv-cross/spirv_msl.hpp
index 2d970c0da5b8..4aaad01a8921 100644
--- a/thirdparty/spirv-cross/spirv_msl.hpp
+++ b/thirdparty/spirv-cross/spirv_msl.hpp
@@ -838,7 +838,9 @@ class CompilerMSL : public CompilerGLSL
 		SPVFuncImplPaddedStd140,
 		SPVFuncImplReduceAdd,
 		SPVFuncImplImageFence,
-		SPVFuncImplTextureCast
+		SPVFuncImplTextureCast,
+		SPVFuncImplMulExtended,
+		SPVFuncImplSetMeshOutputsEXT,
 	};
 
 	// If the underlying resource has been used for comparison then duplicate loads of that resource must be too
@@ -867,6 +869,9 @@ class CompilerMSL : public CompilerGLSL
 	std::string type_to_glsl(const SPIRType &type, uint32_t id, bool member);
 	std::string type_to_glsl(const SPIRType &type, uint32_t id = 0) override;
 	void emit_block_hints(const SPIRBlock &block) override;
+	void emit_mesh_entry_point();
+	void emit_mesh_outputs();
+	void emit_mesh_tasks(SPIRBlock &block) override;
 
 	// Allow Metal to use the array template to make arrays a value type
 	std::string type_to_array_glsl(const SPIRType &type, uint32_t variable_id) override;
@@ -918,6 +923,7 @@ class CompilerMSL : public CompilerGLSL
 
 	bool is_tesc_shader() const;
 	bool is_tese_shader() const;
+	bool is_mesh_shader() const;
 
 	void preprocess_op_codes();
 	void localize_global_variables();
@@ -932,6 +938,7 @@ class CompilerMSL : public CompilerGLSL
 	                                           std::unordered_set<uint32_t> &processed_func_ids);
 	uint32_t add_interface_block(spv::StorageClass storage, bool patch = false);
 	uint32_t add_interface_block_pointer(uint32_t ib_var_id, spv::StorageClass storage);
+	uint32_t add_meshlet_block(bool per_primitive);
 
 	struct InterfaceBlockMeta
 	{
@@ -1103,12 +1110,17 @@ class CompilerMSL : public CompilerGLSL
 	uint32_t builtin_stage_input_size_id = 0;
 	uint32_t builtin_local_invocation_index_id = 0;
 	uint32_t builtin_workgroup_size_id = 0;
+	uint32_t builtin_mesh_primitive_indices_id = 0;
+	uint32_t builtin_mesh_sizes_id = 0;
+	uint32_t builtin_task_grid_id = 0;
 	uint32_t builtin_frag_depth_id = 0;
 	uint32_t swizzle_buffer_id = 0;
 	uint32_t buffer_size_buffer_id = 0;
 	uint32_t view_mask_buffer_id = 0;
 	uint32_t dynamic_offsets_buffer_id = 0;
 	uint32_t uint_type_id = 0;
+	uint32_t shared_uint_type_id = 0;
+	uint32_t meshlet_type_id = 0;
 	uint32_t argument_buffer_padding_buffer_type_id = 0;
 	uint32_t argument_buffer_padding_image_type_id = 0;
 	uint32_t argument_buffer_padding_sampler_type_id = 0;
@@ -1173,6 +1185,8 @@ class CompilerMSL : public CompilerGLSL
 	VariableID stage_out_ptr_var_id = 0;
 	VariableID tess_level_inner_var_id = 0;
 	VariableID tess_level_outer_var_id = 0;
+	VariableID mesh_out_per_vertex = 0;
+	VariableID mesh_out_per_primitive = 0;
 	VariableID stage_out_masked_builtin_type_id = 0;
 
 	// Handle HLSL-style 0-based vertex/instance index.
diff --git a/thirdparty/spirv-cross/spirv_reflect.cpp b/thirdparty/spirv-cross/spirv_reflect.cpp
index 633983bd30de..552d671a649b 100644
--- a/thirdparty/spirv-cross/spirv_reflect.cpp
+++ b/thirdparty/spirv-cross/spirv_reflect.cpp
@@ -637,6 +637,8 @@ void CompilerReflection::emit_resources(const char *tag, const SmallVector<Resource> &resources)
 			json_stream->emit_json_key_value("WeightTextureQCOM", get_decoration(res.id, DecorationWeightTextureQCOM));
 		if (mask.get(DecorationBlockMatchTextureQCOM))
 			json_stream->emit_json_key_value("BlockMatchTextureQCOM", get_decoration(res.id, DecorationBlockMatchTextureQCOM));
+		if (mask.get(DecorationBlockMatchSamplerQCOM))
+			json_stream->emit_json_key_value("BlockMatchSamplerQCOM", get_decoration(res.id, DecorationBlockMatchSamplerQCOM));
 
 		// For images, the type itself adds a layout qualifer.
 		// Only emit the format for storage images.
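For the task stage, the counterpart assembled from the pieces in this patch (the [[object]] qualifier from func_type_decl(), the mesh_grid_properties argument, the [[payload]] parameter from emit_function_prototype(), and the EmitMeshTasks lowering) might look roughly like this; the payload layout and grid size are invented for illustration:

    struct TaskPayload { uint meshletCount; }; // hypothetical taskPayloadSharedEXT block

    [[object]] void main0(mesh_grid_properties spvMgp,
                          object_data TaskPayload& payload [[payload]],
                          uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]])
    {
        payload.meshletCount = 1u;
        // EmitMeshTasks: launch the mesh grid, then terminate the workgroup immediately.
        spvMgp.set_threadgroups_per_grid(uint3(32u, 1u, 1u));
        return;
    }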