diff --git a/meson_options.txt b/meson_options.txt
index c95b65415a93..ac9ab64e7a06 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -177,13 +177,21 @@ option(
 )
 
 option(
-  'gallium-windows-dll-name',
+  'gallium-wgl-dll-name',
   type : 'string',
   value : 'libgallium_wgl',
-  description : 'name of gallium megadriver DLL built for Windows. ' +
+  description : 'name of gallium wgl target DLL built for Windows. ' +
                 'defaults to libgallium_wgl.dll to match DRI',
 )
 
+option(
+  'gallium-d3d10-dll-name',
+  type : 'string',
+  value : 'libgallium_d3d10',
+  description : 'name of gallium d3d10 target DLL built for Windows. ' +
+                'defaults to libgallium_d3d10.dll to match DRI',
+)
+
 option(
   'opencl-spirv',
   type : 'boolean',
diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp
index 49508e961379..1500984a1213 100644
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@@ -468,6 +468,35 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
       out.push_back(encoding);
       break;
    }
+   case Format::VOPD: {
+      VOPD_instruction& vopd = instr->vopd();
+      uint32_t encoding = (0b110010 << 26);
+      encoding |= reg(ctx, instr->operands[0]);
+      if (instr->opcode != aco_opcode::v_dual_mov_b32)
+         encoding |= reg(ctx, instr->operands[1], 8) << 9;
+      encoding |= (uint32_t)ctx.opcode[(int)vopd.opy] << 17;
+      encoding |= opcode << 22;
+      out.push_back(encoding);
+
+      unsigned opy_start = instr->opcode == aco_opcode::v_dual_mov_b32 ? 1 : 2;
+      switch (instr->opcode) {
+      case aco_opcode::v_dual_fmac_f32:
+      case aco_opcode::v_dual_fmaak_f32:
+      case aco_opcode::v_dual_fmamk_f32:
+      case aco_opcode::v_dual_cndmask_b32:
+      case aco_opcode::v_dual_dot2acc_f32_f16:
+      case aco_opcode::v_dual_dot2acc_f32_bf16: opy_start = 3; break;
+      default: break;
+      }
+
+      encoding = reg(ctx, instr->operands[opy_start]);
+      if (vopd.opy != aco_opcode::v_dual_mov_b32)
+         encoding |= reg(ctx, instr->operands[opy_start + 1], 8) << 9;
+      encoding |= (reg(ctx, instr->definitions[1], 8) >> 1) << 17;
+      encoding |= reg(ctx, instr->definitions[0], 8) << 24;
+      out.push_back(encoding);
+      break;
+   }
    case Format::DS: {
       DS_instruction& ds = instr->ds();
       uint32_t encoding = (0b110110 << 26);
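Note: GFX11's VOPD format packs two VALU operations (OPX and OPY) into one 64-bit
encoding so that they dual-issue in wave32. As a reading aid for the assembler hunk
above, here is a rough sketch of the field layout it emits. The helper-style names
and flat variable list are invented for illustration, and opx_op/opy_op are the
hardware opcode numbers (via ctx.opcode[]), not aco_opcode values:

    /* First dword: OPX sources, both opcodes, VOPD format tag. */
    uint32_t w0 = 0b110010u << 26;
    w0 |= srcx0;              /* OPX src0: 9-bit VGPR/SGPR/inline-constant field */
    w0 |= vsrcx1 << 9;        /* OPX vsrc1 (8-bit VGPR); absent for v_dual_mov_b32 */
    w0 |= opy_op << 17;       /* OPY hardware opcode */
    w0 |= opx_op << 22;       /* OPX hardware opcode */

    /* Second dword: OPY sources and both destinations. */
    uint32_t w1 = srcy0;
    w1 |= vsrcy1 << 9;
    w1 |= (vdsty >> 1) << 17; /* OPY vdst without its LSB: the two halves must
                               * write registers of opposite parity, so the bit
                               * is implied by OPX's destination. */
    w1 |= vdstx << 24;        /* OPX vdst, full 8 bits */

The opy_start logic exists because the OPX operand count varies: v_dual_mov_b32
contributes one operand, most pairs two, and the fmac/fmaak/fmamk/cndmask/dot2acc
variants three.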
diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py
index e4518a0f9c58..f5106a298472 100644
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -578,6 +578,7 @@ class Builder {
    ("vopc_sdwa", [Format.VOPC, Format.SDWA], 'SDWA_instruction', itertools.product([1, 2], [2])),
    ("vop3", [Format.VOP3], 'VALU_instruction', [(1, 3), (1, 2), (1, 1), (2, 2)]),
    ("vop3p", [Format.VOP3P], 'VALU_instruction', [(1, 2), (1, 3)]),
+   ("vopd", [Format.VOPD], 'VOPD_instruction', [(2, 2), (2, 3), (2, 4), (2, 5), (2, 6)]),
    ("vinterp_inreg", [Format.VINTERP_INREG], 'VINTERP_inreg_instruction', [(1, 3)]),
    ("vintrp", [Format.VINTRP], 'VINTRP_instruction', [(1, 2), (1, 3)]),
    ("vop1_dpp", [Format.VOP1, Format.DPP16], 'DPP16_instruction', [(1, 1)]),
diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp
index b405fbd82cbe..f6bc400b444e 100644
--- a/src/amd/compiler/aco_interface.cpp
+++ b/src/amd/compiler/aco_interface.cpp
@@ -59,6 +59,7 @@ static const std::array<aco_compiler_statistic_info, aco_num_statistics> statist
    ret[aco_statistic_salu] = aco_compiler_statistic_info{"SALU", "Number of SALU instructions"};
    ret[aco_statistic_vmem] = aco_compiler_statistic_info{"VMEM", "Number of VMEM instructions"};
    ret[aco_statistic_smem] = aco_compiler_statistic_info{"SMEM", "Number of SMEM instructions"};
+   ret[aco_statistic_vopd] = aco_compiler_statistic_info{"VOPD", "Number of VOPD instructions"};
    return ret;
 }();
 
@@ -199,6 +200,9 @@ aco_postprocess_shader(const struct aco_compiler_options* options,
       aco::lower_to_hw_instr(program.get());
       validate(program.get());
 
+      if (!options->optimisations_disabled && !(aco::debug_flags & aco::DEBUG_NO_SCHED_VOPD))
+         aco::schedule_vopd(program.get());
+
       /* Schedule hardware instructions for ILP */
       if (!options->optimisations_disabled && !(aco::debug_flags & aco::DEBUG_NO_SCHED_ILP))
         aco::schedule_ilp(program.get());
diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp
index ea49a10da5c7..1a54f7de2e49 100644
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -45,8 +45,9 @@ static const struct debug_control aco_debug_options[] = {
    {"force-waitdeps", DEBUG_FORCE_WAITDEPS},
    {"novn", DEBUG_NO_VN},
    {"noopt", DEBUG_NO_OPT},
-   {"nosched", DEBUG_NO_SCHED | DEBUG_NO_SCHED_ILP},
+   {"nosched", DEBUG_NO_SCHED | DEBUG_NO_SCHED_ILP | DEBUG_NO_SCHED_VOPD},
    {"nosched-ilp", DEBUG_NO_SCHED_ILP},
+   {"nosched-vopd", DEBUG_NO_SCHED_VOPD},
    {"perfinfo", DEBUG_PERF_INFO},
    {"liveinfo", DEBUG_LIVE_INFO},
    {NULL, 0}};
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index cd4ceb221ce8..3863cf11b720 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -58,6 +58,7 @@ enum {
    DEBUG_FORCE_WAITDEPS = 0x200,
    DEBUG_NO_VALIDATE_IR = 0x400,
    DEBUG_NO_SCHED_ILP = 0x800,
+   DEBUG_NO_SCHED_VOPD = 0x1000,
 };
 
 enum storage_class : uint8_t {
@@ -957,6 +958,7 @@ struct Pseudo_reduction_instruction;
 struct VALU_instruction;
 struct VINTERP_inreg_instruction;
 struct VINTRP_instruction;
+struct VOPD_instruction;
 struct DPP16_instruction;
 struct DPP8_instruction;
 struct SDWA_instruction;
@@ -1210,6 +1212,17 @@ struct Instruction {
       return *(VINTERP_inreg_instruction*)this;
    }
    constexpr bool isVINTERP_INREG() const noexcept { return format == Format::VINTERP_INREG; }
+   VOPD_instruction& vopd() noexcept
+   {
+      assert(isVOPD());
+      return *(VOPD_instruction*)this;
+   }
+   const VOPD_instruction& vopd() const noexcept
+   {
+      assert(isVOPD());
+      return *(VOPD_instruction*)this;
+   }
+   constexpr bool isVOPD() const noexcept { return format == Format::VOPD; }
    constexpr bool isVOP1() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP1; }
    constexpr bool isVOP2() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP2; }
    constexpr bool isVOPC() const noexcept { return (uint16_t)format & (uint16_t)Format::VOPC; }
@@ -1278,7 +1291,8 @@ struct Instruction {
    }
    constexpr bool isVALU() const noexcept
    {
-      return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P() || isVINTERP_INREG();
+      return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P() || isVINTERP_INREG() ||
+             isVOPD();
    }
 
    constexpr bool isSALU() const noexcept
@@ -1368,6 +1382,12 @@ struct VINTERP_inreg_instruction : public VALU_instruction {
 static_assert(sizeof(VINTERP_inreg_instruction) == sizeof(VALU_instruction) + 4,
               "Unexpected padding");
 
+struct VOPD_instruction : public VALU_instruction {
+   aco_opcode opy;
+   uint16_t padding;
+};
+static_assert(sizeof(VOPD_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");
+
 /**
  * Data Parallel Primitives Format:
  * This format can be used for VOP1, VOP2 or VOPC instructions.
@@ -2209,6 +2229,7 @@ void ssa_elimination(Program* program);
 void lower_to_hw_instr(Program* program);
 void schedule_program(Program* program, live& live_vars);
 void schedule_ilp(Program* program);
+void schedule_vopd(Program* program);
 void spill(Program* program, live& live_vars);
 void insert_wait_states(Program* program);
 bool dealloc_vgprs(Program* program);
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index 1cf23b5e061b..4a512113c3f4 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -89,6 +89,7 @@ class Format(IntEnum):
    VINTRP = auto()
    # Vector ALU Formats
    VINTERP_INREG = auto()
+   VOPD = auto()
    VOP1 = 1 << 7
    VOP2 = 1 << 8
    VOPC = 1 << 9
@@ -186,6 +187,8 @@ def get_builder_fields(self):
       elif self == Format.VOP3P:
          return [('uint8_t', 'opsel_lo', None),
                  ('uint8_t', 'opsel_hi', None)]
+      elif self == Format.VOPD:
+         return [('aco_opcode', 'opy', None)]
       elif self == Format.VINTERP_INREG:
          return [('unsigned', 'wait_exp', 7),
                  ('uint8_t', 'opsel', 0)]
@@ -1272,6 +1275,29 @@ def default_class(opcodes, cls):
    opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOP3, cls, in_mod, out_mod,
           definitions = defs, operands = ops)
 
+VOPD = {
+   (0x00, "v_dual_fmac_f32"),
+   (0x01, "v_dual_fmaak_f32"),
+   (0x02, "v_dual_fmamk_f32"),
+   (0x03, "v_dual_mul_f32"),
+   (0x04, "v_dual_add_f32"),
+   (0x05, "v_dual_sub_f32"),
+   (0x06, "v_dual_subrev_f32"),
+   (0x07, "v_dual_mul_dx9_zero_f32"),
+   (0x08, "v_dual_mov_b32"),
+   (0x09, "v_dual_cndmask_b32"),
+   (0x0a, "v_dual_max_f32"),
+   (0x0b, "v_dual_min_f32"),
+   (0x0c, "v_dual_dot2acc_f32_f16"),
+   (0x0d, "v_dual_dot2acc_f32_bf16"),
+   (0x10, "v_dual_add_nc_u32"),
+   (0x11, "v_dual_lshlrev_b32"),
+   (0x12, "v_dual_and_b32"),
+}
+for gfx11, name in VOPD:
+   opcode(name, -1, -1, -1, gfx11, format = Format.VOPD, cls = InstrClass.Valu32)
+
+
 # DS instructions: 3 inputs (1 addr, 2 data), 1 output
 DS = {
    (0x00, 0x00, 0x00, 0x00, 0x00, 0x00, "ds_add_u32"),
diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp
index a0ed4ebae89a..5a7ae9d94b93 100644
--- a/src/amd/compiler/aco_print_ir.cpp
+++ b/src/amd/compiler/aco_print_ir.cpp
@@ -443,6 +443,12 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
       fprintf(output, " attr%d.%c", vintrp.attribute, "xyzw"[vintrp.component]);
       break;
    }
+   case Format::VOPD: {
+      const VOPD_instruction& vopd = instr->vopd();
+      // TODO: beautify
+      fprintf(output, " %s", instr_info.name[(int)vopd.opy]);
+      break;
+   }
    case Format::DS: {
       const DS_instruction& ds = instr->ds();
       if (ds.offset0)
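Note: in the IR, a dual-issue pair is represented as a single VOPD_instruction: the
regular opcode field holds OPX, the trailing opy field holds OPY, the operands of
both halves are concatenated (OPX's first), and definitions 0/1 belong to OPX/OPY
respectively. A minimal construction sketch under those assumptions, mirroring what
the new Builder vopd() overloads and the scheduler below do (register choices are
purely illustrative):

    /* v_dual_mov_b32 v0, v2 :: v_dual_add_f32 v1, v3, v4 */
    VOPD_instruction* instr = create_instruction<VOPD_instruction>(
       aco_opcode::v_dual_mov_b32, Format::VOPD, /*operands*/ 3, /*definitions*/ 2);
    instr->opy = aco_opcode::v_dual_add_f32;
    instr->definitions[0] = Definition(PhysReg(256 + 0), v1); /* OPX dst: v0 (even) */
    instr->definitions[1] = Definition(PhysReg(256 + 1), v1); /* OPY dst: v1 (odd)  */
    instr->operands[0] = Operand(PhysReg(256 + 2), v1);       /* OPX src0: v2 */
    instr->operands[1] = Operand(PhysReg(256 + 3), v1);       /* OPY src0: v3 */
    instr->operands[2] = Operand(PhysReg(256 + 4), v1);       /* OPY src1: v4 */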
diff --git a/src/amd/compiler/aco_scheduler_ilp.cpp b/src/amd/compiler/aco_scheduler_ilp.cpp
index 007386cef7b6..c6a288fe5ef8 100644
--- a/src/amd/compiler/aco_scheduler_ilp.cpp
+++ b/src/amd/compiler/aco_scheduler_ilp.cpp
@@ -27,6 +27,16 @@ constexpr unsigned num_nodes = 16;
 using mask_t = uint16_t;
 static_assert(std::numeric_limits<mask_t>::digits >= num_nodes);
 
+struct VOPDInfo {
+   VOPDInfo() : is_opy_only(0), is_dst_odd(0), src_banks(0), has_literal(0) {}
+   uint16_t is_opy_only : 1;
+   uint16_t is_dst_odd : 1;
+   uint16_t src_banks : 10; /* 0-3: src0, 4-7: src1, 8-9: src2 */
+   uint16_t has_literal : 1;
+   aco_opcode op = aco_opcode::num_opcodes;
+   uint32_t literal = 0;
+};
+
 struct InstrInfo {
    Instruction* instr;
    int32_t priority;
@@ -46,12 +56,21 @@ struct RegisterInfo {
 
 struct SchedILPContext {
    Program* program;
+   bool is_vopd = false;
    InstrInfo nodes[num_nodes];
    RegisterInfo regs[512];
    mask_t non_reorder_mask = 0; /* bitmask of instruction nodes which should not be reordered. */
    mask_t active_mask = 0;      /* bitmask of valid instruction nodes. */
    uint8_t next_non_reorderable = UINT8_MAX; /* index of next node which should not be reordered. */
    uint8_t last_non_reorderable = UINT8_MAX; /* index of last node which should not be reordered. */
+
+   /* VOPD scheduler: */
+   VOPDInfo vopd[num_nodes];
+   VOPDInfo prev_vopd_info;
+   InstrInfo prev_info;
+
+   mask_t vopd_odd_mask = 0;
+   mask_t vopd_even_mask = 0;
 };
 
 /**
@@ -98,6 +117,117 @@ can_reorder(const Instruction* const instr)
    return true;
 }
 
+VOPDInfo
+get_vopd_info(const Instruction* instr)
+{
+   if (instr->format != Format::VOP1 && instr->format != Format::VOP2)
+      return VOPDInfo();
+
+   VOPDInfo info;
+   switch (instr->opcode) {
+   case aco_opcode::v_fmac_f32: info.op = aco_opcode::v_dual_fmac_f32; break;
+   case aco_opcode::v_fmaak_f32: info.op = aco_opcode::v_dual_fmaak_f32; break;
+   case aco_opcode::v_fmamk_f32: info.op = aco_opcode::v_dual_fmamk_f32; break;
+   case aco_opcode::v_mul_f32: info.op = aco_opcode::v_dual_mul_f32; break;
+   case aco_opcode::v_add_f32: info.op = aco_opcode::v_dual_add_f32; break;
+   case aco_opcode::v_sub_f32: info.op = aco_opcode::v_dual_sub_f32; break;
+   case aco_opcode::v_subrev_f32: info.op = aco_opcode::v_dual_subrev_f32; break;
+   case aco_opcode::v_mul_legacy_f32: info.op = aco_opcode::v_dual_mul_dx9_zero_f32; break;
+   case aco_opcode::v_mov_b32: info.op = aco_opcode::v_dual_mov_b32; break;
+   case aco_opcode::v_cndmask_b32: info.op = aco_opcode::v_dual_cndmask_b32; break;
+   case aco_opcode::v_max_f32: info.op = aco_opcode::v_dual_max_f32; break;
+   case aco_opcode::v_min_f32: info.op = aco_opcode::v_dual_min_f32; break;
+   case aco_opcode::v_dot2c_f32_f16: info.op = aco_opcode::v_dual_dot2acc_f32_f16; break;
+   case aco_opcode::v_add_u32:
+      info.op = aco_opcode::v_dual_add_nc_u32;
+      info.is_opy_only = true;
+      break;
+   case aco_opcode::v_lshlrev_b32:
+      info.op = aco_opcode::v_dual_lshlrev_b32;
+      info.is_opy_only = true;
+      break;
+   case aco_opcode::v_and_b32:
+      info.op = aco_opcode::v_dual_and_b32;
+      info.is_opy_only = true;
+      break;
+   default: return VOPDInfo();
+   }
+
+   /* Each instruction may use at most one SGPR. */
+   if (instr->opcode == aco_opcode::v_cndmask_b32 && instr->operands[0].isOfType(RegType::sgpr))
+      return VOPDInfo();
+
+   info.is_dst_odd = instr->definitions[0].physReg().reg() & 0x1;
+
+   static const unsigned bank_mask[3] = {0x3, 0x3, 0x1};
+   bool has_sgpr = false;
+   for (unsigned i = 0; i < instr->operands.size(); i++) {
+      unsigned port = (instr->opcode == aco_opcode::v_fmamk_f32 && i == 1) ? 2 : i;
+      if (instr->operands[i].isOfType(RegType::vgpr))
+         info.src_banks |= 1 << (port * 4 + (instr->operands[i].physReg().reg() & bank_mask[port]));
+
+      /* Check all operands because of fmaak/fmamk. */
+      if (instr->operands[i].isLiteral()) {
+         assert(!info.has_literal || info.literal == instr->operands[i].constantValue());
+         info.has_literal = true;
+         info.literal = instr->operands[i].constantValue();
+      }
+
+      /* Check all operands because of cndmask. */
+      has_sgpr |= !instr->operands[i].isConstant() && instr->operands[i].isOfType(RegType::sgpr);
+   }
+
+   /* An instruction can't use both a literal and an SGPR. */
+   if (has_sgpr && info.has_literal)
+      return VOPDInfo();
+
+   return info;
+}
+
+bool
+can_use_vopd(const SchedILPContext& ctx, unsigned idx)
+{
+   VOPDInfo cur_vopd = ctx.vopd[idx];
+   Instruction* first = ctx.nodes[idx].instr;
+   Instruction* second = ctx.prev_info.instr;
+
+   if (!second)
+      return false;
+
+   if (ctx.prev_vopd_info.op == aco_opcode::num_opcodes || cur_vopd.op == aco_opcode::num_opcodes)
+      return false;
+
+   if ((ctx.prev_vopd_info.src_banks & cur_vopd.src_banks) ||
+       (ctx.prev_vopd_info.is_opy_only & cur_vopd.is_opy_only) ||
+       (ctx.prev_vopd_info.is_dst_odd == cur_vopd.is_dst_odd)) {
+      return false;
+   }
+
+   /* Both can use a literal, but it must be the same literal. */
+   if (ctx.prev_vopd_info.has_literal && cur_vopd.has_literal &&
+       ctx.prev_vopd_info.literal != cur_vopd.literal)
+      return false;
+
+   assert(first->definitions.size() == 1);
+   assert(first->definitions[0].size() == 1);
+   assert(second->definitions.size() == 1);
+   assert(second->definitions[0].size() == 1);
+
+   /* Check for WaW dependency. */
+   if (first->definitions[0].physReg() == second->definitions[0].physReg())
+      return false;
+
+   /* Check for RaW dependency. */
+   for (Operand op : second->operands) {
+      assert(op.size() == 1);
+      if (first->definitions[0].physReg() == op.physReg())
+         return false;
+   }
+
+   /* WaR dependencies are not a concern. */
+   return true;
+}
+
 unsigned
 get_latency(const Instruction* const instr)
 {
@@ -138,6 +268,16 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
    bool reorder = can_reorder(instr);
    ctx.active_mask |= mask;
 
+   if (ctx.is_vopd) {
+      VOPDInfo vopd = get_vopd_info(entry.instr);
+
+      ctx.vopd[idx] = vopd;
+      ctx.vopd_odd_mask &= ~mask;
+      ctx.vopd_odd_mask |= vopd.is_dst_odd ? mask : 0;
+      ctx.vopd_even_mask &= ~mask;
+      ctx.vopd_even_mask |= vopd.is_dst_odd || vopd.op == aco_opcode::num_opcodes ? 0 : mask;
+   }
+
    for (const Operand& op : instr->operands) {
       assert(op.isFixed());
       unsigned reg = op.physReg();
@@ -206,8 +346,10 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
          reg_info.has_direct_dependency = 1;
          reg_info.direct_dependency = idx;
 
-         /* Add latency information for the next register read. */
-         reg_info.latency = get_latency(instr);
+         if (!ctx.is_vopd) {
+            /* Add latency information for the next register read. */
+            reg_info.latency = get_latency(instr);
+         }
       }
    }
 
@@ -225,7 +367,7 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
 
    /* Just don't reorder these at all. */
    if (!is_memory_instr(instr) || instr->definitions.empty() ||
-       get_sync_info(instr).semantics & semantic_volatile) {
+       get_sync_info(instr).semantics & semantic_volatile || ctx.is_vopd) {
       /* Add all previous instructions as dependencies. */
       entry.dependency_mask = ctx.active_mask;
    }
@@ -343,7 +485,7 @@ collect_clause_dependencies(const SchedILPContext& ctx, const uint8_t next, mask
  * Returns the index of the next instruction to be selected.
  */
 unsigned
-select_instruction(const SchedILPContext& ctx)
+select_instruction_ilp(const SchedILPContext& ctx)
 {
    mask_t mask = ctx.active_mask;
 
@@ -377,6 +519,145 @@ select_instruction(const SchedILPContext& ctx)
    return idx;
 }
 
+bool
+compare_nodes_vopd(const SchedILPContext& ctx, int num_vopd_odd_minus_even, bool* use_vopd,
+                   unsigned current, unsigned candidate)
+{
+   if (can_use_vopd(ctx, candidate)) {
+      /* If we can form a VOPD instruction, always prefer to do so. */
+      if (!*use_vopd) {
+         *use_vopd = true;
+         return true;
+      }
+   } else {
+      if (*use_vopd)
+         return false;
+
+      /* Neither current nor candidate can form a VOPD instruction with the previously scheduled
+       * instruction. */
+      VOPDInfo current_vopd = ctx.vopd[current];
+      VOPDInfo candidate_vopd = ctx.vopd[candidate];
+
+      /* Delay scheduling VOPD-capable instructions in case an opportunity appears later. */
+      bool current_vopd_capable = current_vopd.op != aco_opcode::num_opcodes;
+      bool candidate_vopd_capable = candidate_vopd.op != aco_opcode::num_opcodes;
+      if (current_vopd_capable != candidate_vopd_capable)
+         return !candidate_vopd_capable;
+
+      /* If we have to select from VOPD-capable instructions, prefer maintaining a balance of
+       * odd/even instructions, in case selecting this instruction fails to make a pair.
+       */
+      if (current_vopd_capable && num_vopd_odd_minus_even != 0) {
+         assert(candidate_vopd_capable);
+         bool prefer_vopd_dst_odd = num_vopd_odd_minus_even > 0;
+         if (current_vopd.is_dst_odd != candidate_vopd.is_dst_odd)
+            return prefer_vopd_dst_odd ? candidate_vopd.is_dst_odd : !candidate_vopd.is_dst_odd;
+      }
+   }
+
+   return ctx.nodes[candidate].priority > ctx.nodes[current].priority;
+}
+
+unsigned
+select_instruction_vopd(const SchedILPContext& ctx, bool* use_vopd)
+{
+   *use_vopd = false;
+
+   mask_t mask = ctx.active_mask;
+   if (ctx.next_non_reorderable != UINT8_MAX)
+      mask = ctx.nodes[ctx.next_non_reorderable].dependency_mask;
+
+   if (mask == 0)
+      return ctx.next_non_reorderable;
+
+   int num_vopd_odd_minus_even =
+      (int)util_bitcount(ctx.vopd_odd_mask & mask) - (int)util_bitcount(ctx.vopd_even_mask & mask);
+
+   unsigned cur = -1u;
+   u_foreach_bit (i, mask) {
+      const InstrInfo& candidate = ctx.nodes[i];
+
+      /* Check if the candidate has pending dependencies. */
+      if (candidate.dependency_mask)
+         continue;
+
+      if (cur == -1u) {
+         cur = i;
+         *use_vopd = can_use_vopd(ctx, i);
+      } else if (compare_nodes_vopd(ctx, num_vopd_odd_minus_even, use_vopd, cur, i)) {
+         cur = i;
+      }
+   }
+
+   assert(cur != -1u);
+   return cur;
+}
+
+Instruction*
+create_vopd_instruction(const SchedILPContext& ctx, unsigned idx)
+{
+   Instruction* x = ctx.prev_info.instr;
+   Instruction* y = ctx.nodes[idx].instr;
+   aco_opcode opx = ctx.prev_vopd_info.op;
+   aco_opcode opy = ctx.vopd[idx].op;
+   if (ctx.prev_vopd_info.is_opy_only) {
+      std::swap(x, y);
+      std::swap(opx, opy);
+   }
+
+   VOPD_instruction* instr = create_instruction<VOPD_instruction>(
+      opx, Format::VOPD, x->operands.size() + y->operands.size(), 2);
+   instr->opy = opy;
+   instr->definitions[0] = x->definitions[0];
+   instr->definitions[1] = y->definitions[0];
+   std::copy(x->operands.begin(), x->operands.end(), instr->operands.begin());
+   std::copy(y->operands.begin(), y->operands.end(),
+             std::next(instr->operands.begin(), x->operands.size()));
+
+   return instr;
+}
+
+template <typename It>
+void
+do_schedule(SchedILPContext& ctx, It& insert_it, It& remove_it, It instructions_begin,
+            It instructions_end)
+{
+   for (unsigned i = 0; i < num_nodes; i++) {
+      if (remove_it == instructions_end)
+         break;
+
+      add_entry(ctx, (remove_it++)->get(), i);
+   }
+
+   ctx.prev_info.instr = NULL;
+   bool use_vopd = false;
+
+   while (ctx.active_mask) {
+      unsigned next_idx =
+         ctx.is_vopd ? select_instruction_vopd(ctx, &use_vopd) : select_instruction_ilp(ctx);
+      Instruction* next_instr = ctx.nodes[next_idx].instr;
+
+      if (use_vopd) {
+         std::prev(insert_it)->reset(create_vopd_instruction(ctx, next_idx));
+         ctx.prev_info.instr = NULL;
+      } else {
+         (insert_it++)->reset(next_instr);
+         ctx.prev_info = ctx.nodes[next_idx];
+         ctx.prev_vopd_info = ctx.vopd[next_idx];
+      }
+
+      remove_entry(ctx, next_instr, next_idx);
+      ctx.nodes[next_idx].instr = NULL;
+
+      if (remove_it != instructions_end) {
+         add_entry(ctx, (remove_it++)->get(), next_idx);
+      } else if (ctx.last_non_reorderable != UINT8_MAX) {
+         ctx.nodes[ctx.last_non_reorderable].potential_clause = false;
+         ctx.last_non_reorderable = UINT8_MAX;
+      }
+   }
+}
+
 } // namespace
 
 void
@@ -386,29 +667,26 @@ schedule_ilp(Program* program)
 
    for (Block& block : program->blocks) {
       auto it = block.instructions.begin();
-      for (unsigned i = 0; i < num_nodes; i++) {
-         if (it == block.instructions.end())
-            break;
+      auto insert_it = block.instructions.begin();
+      do_schedule(ctx, insert_it, it, block.instructions.begin(), block.instructions.end());
+      block.instructions.resize(insert_it - block.instructions.begin());
+   }
+}
 
-         add_entry(ctx, (it++)->get(), i);
-      }
+void
+schedule_vopd(Program* program)
+{
+   if (program->gfx_level < GFX11 || program->wave_size != 32)
+      return;
 
-      auto insert_it = block.instructions.begin();
-      while (insert_it != block.instructions.end()) {
-         unsigned next_idx = select_instruction(ctx);
-         Instruction* next_instr = ctx.nodes[next_idx].instr;
-         remove_entry(ctx, next_instr, next_idx);
-         (insert_it++)->reset(next_instr);
-         ctx.nodes[next_idx].instr = NULL;
+   SchedILPContext ctx = {program};
+   ctx.is_vopd = true;
 
-         if (it != block.instructions.end()) {
-            add_entry(ctx, (it++)->get(), next_idx);
-         } else if (ctx.last_non_reorderable != UINT8_MAX) {
-            ctx.nodes[ctx.last_non_reorderable].potential_clause = false;
-            ctx.last_non_reorderable = UINT8_MAX;
-         }
-      }
-      assert(it == block.instructions.end());
+   for (Block& block : program->blocks) {
+      auto it = block.instructions.rbegin();
+      auto insert_it = block.instructions.rbegin();
+      do_schedule(ctx, insert_it, it, block.instructions.rbegin(), block.instructions.rend());
+      block.instructions.erase(block.instructions.begin(), insert_it.base());
    }
 }
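Note: the pairing rules in get_vopd_info()/can_use_vopd() above encode the GFX11 VOPD
restrictions: the two halves must not read the same VGPR bank (src0 and src1 each
select one of four banks, src2 one of two), at most one half may use an OPY-only
opcode, the destinations must have opposite register parity, a shared literal must be
the same literal, and the usual WaW/RaW checks apply since both halves notionally
issue together. A self-contained sketch of just the bank rule, using the same 10-bit
mask layout as VOPDInfo::src_banks:

    #include <cstdint>

    /* bit layout: [3:0] src0 bank, [7:4] src1 bank, [9:8] src2 bank */
    static uint16_t src_bank_bit(unsigned port, unsigned vgpr)
    {
       static const unsigned bank_mask[3] = {0x3, 0x3, 0x1};
       return uint16_t(1u << (port * 4 + (vgpr & bank_mask[port])));
    }

    static bool banks_conflict(uint16_t opx_banks, uint16_t opy_banks)
    {
       return (opx_banks & opy_banks) != 0;
    }

    /* e.g. OPX reading v4 as src0 and OPY reading v8 as src0 both hit
     * src0 bank 0, so the pair is rejected, exactly as in can_use_vopd(). */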
diff --git a/src/amd/compiler/aco_shader_info.h b/src/amd/compiler/aco_shader_info.h
index e6ff2c8a5f7d..bfe8071dee2d 100644
--- a/src/amd/compiler/aco_shader_info.h
+++ b/src/amd/compiler/aco_shader_info.h
@@ -228,6 +228,7 @@ enum aco_statistic {
    aco_statistic_salu,
    aco_statistic_vmem,
    aco_statistic_smem,
+   aco_statistic_vopd,
    aco_num_statistics
 };
 
diff --git a/src/amd/compiler/aco_statistics.cpp b/src/amd/compiler/aco_statistics.cpp
index e454c2de93be..5eb202b8ba7e 100644
--- a/src/amd/compiler/aco_statistics.cpp
+++ b/src/amd/compiler/aco_statistics.cpp
@@ -540,6 +540,8 @@ collect_preasm_stats(Program* program)
       if (instr->isSALU() && !instr->isSOPP() &&
           instr_info.classes[(int)instr->opcode] != instr_class::waitcnt)
          program->statistics[aco_statistic_salu]++;
+      if (instr->isVOPD())
+         program->statistics[aco_statistic_vopd]++;
 
       if ((instr->isVMEM() || instr->isScratch() || instr->isGlobal()) &&
           !instr->operands.empty()) {
diff --git a/src/amd/compiler/tests/test_assembler.cpp b/src/amd/compiler/tests/test_assembler.cpp
index a7106e98686a..8c92e669a649 100644
--- a/src/amd/compiler/tests/test_assembler.cpp
+++ b/src/amd/compiler/tests/test_assembler.cpp
@@ -1055,3 +1055,62 @@ BEGIN_TEST(assembler.vop3_dpp)
 
    finish_assembler_test();
 END_TEST
+
+BEGIN_TEST(assembler.vopd)
+   if (!setup_cs(NULL, GFX11))
+      return;
+
+   Definition dst_v0 = bld.def(v1);
+   dst_v0.setFixed(PhysReg(256));
+
+   Definition dst_v1 = bld.def(v1);
+   dst_v1.setFixed(PhysReg(256 + 1));
+
+   Operand op_v0(bld.tmp(v1));
+   op_v0.setFixed(PhysReg(256 + 0));
+
+   Operand op_v1(bld.tmp(v1));
+   op_v1.setFixed(PhysReg(256 + 1));
+
+   Operand op_v2(bld.tmp(v1));
+   op_v2.setFixed(PhysReg(256 + 2));
+
+   Operand op_v3(bld.tmp(v1));
+   op_v3.setFixed(PhysReg(256 + 3));
+
+   Operand op_s0(bld.tmp(s1));
+   op_s0.setFixed(PhysReg(0));
+
+   Operand op_vcc(bld.tmp(s1));
+   op_vcc.setFixed(vcc);
+
+   //>> BB0:
+   //! v_dual_mov_b32 v0, v0 :: v_dual_mov_b32 v1, v1 ; ca100100 00000101
+   bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, op_v0, op_v1, aco_opcode::v_dual_mov_b32);
+
+   //! v_dual_mov_b32 v0, 0x60 :: v_dual_mov_b32 v1, s0 ; ca1000ff 00000000 00000060
+   bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, Operand::c32(96), op_s0,
+            aco_opcode::v_dual_mov_b32);
+
+   //! v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0x60 ; ca100000 000000ff 00000060
+   bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, op_s0, Operand::c32(96),
+            aco_opcode::v_dual_mov_b32);
+
+   //! v_dual_mul_f32 v0, v0, v1 :: v_dual_mov_b32 v1, v2 ; c8d00300 00000102
+   bld.vopd(aco_opcode::v_dual_mul_f32, dst_v0, dst_v1, op_v0, op_v1, op_v2,
+            aco_opcode::v_dual_mov_b32);
+
+   //! v_dual_fmac_f32 v0, v1, v2 :: v_dual_mov_b32 v1, v3 ; c8100501 00000103
+   bld.vopd(aco_opcode::v_dual_fmac_f32, dst_v0, dst_v1, op_v1, op_v2, op_v0, op_v3,
+            aco_opcode::v_dual_mov_b32);
+
+   //! v_dual_mov_b32 v0, v0 :: v_dual_and_b32 v1, v1, v2 ; ca240100 00000501
+   bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, op_v0, op_v1, op_v2,
+            aco_opcode::v_dual_and_b32);
+
+   //! v_dual_cndmask_b32 v0, v0, v1 :: v_dual_cndmask_b32 v1, v2, v3 ; ca520300 00000702
+   bld.vopd(aco_opcode::v_dual_cndmask_b32, dst_v0, dst_v1, op_v0, op_v1, op_vcc, op_v2, op_v3,
+            op_vcc, aco_opcode::v_dual_cndmask_b32);
+
+   finish_assembler_test();
+END_TEST
diff --git a/src/amd/vulkan/bvh/update.comp b/src/amd/vulkan/bvh/update.comp
index 905f807ebe66..c3c740238f22 100644
--- a/src/amd/vulkan/bvh/update.comp
+++ b/src/amd/vulkan/bvh/update.comp
@@ -74,17 +74,9 @@ void main() {
    bool is_active;
    if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) {
       is_active = build_triangle(bounds, dst_ptr, args.geom_data, gl_GlobalInvocationID.x);
-   } else if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR) {
-      VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset);
-      is_active = build_aabb(bounds, src_ptr, dst_ptr, args.geom_data.geometry_id, gl_GlobalInvocationID.x);
    } else {
       VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset);
-      /* arrayOfPointers */
-      if (args.geom_data.stride == 8) {
-         src_ptr = DEREF(REF(VOID_REF)(src_ptr));
-      }
-
-      is_active = build_instance(bounds, src_ptr, dst_ptr, gl_GlobalInvocationID.x);
+      is_active = build_aabb(bounds, src_ptr, dst_ptr, args.geom_data.geometry_id, gl_GlobalInvocationID.x);
    }
 
    if (!is_active)
@@ -110,10 +102,15 @@ void main() {
                gl_StorageSemanticsBuffer,
                gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
 
-      radv_bvh_box32_node node = DEREF(REF(radv_bvh_box32_node)OFFSET(src_bvh, offset));
+      REF(radv_bvh_box32_node) src_node = REF(radv_bvh_box32_node)OFFSET(src_bvh, offset);
+      REF(radv_bvh_box32_node) dst_node = REF(radv_bvh_box32_node)OFFSET(dst_bvh, offset);
+
+      uint32_t children[4];
+      for (uint32_t i = 0; i < 4; ++i)
+         children[i] = DEREF(src_node).children[i];
+
       uint32_t valid_child_count = 0;
       for (uint32_t i = 0; i < 4; ++valid_child_count, ++i)
-         if (node.children[i] == RADV_BVH_INVALID_NODE)
+         if (children[i] == RADV_BVH_INVALID_NODE)
            break;
 
      /* Check if all children have been processed. As this is an atomic the last path coming from
@@ -127,33 +124,37 @@ void main() {
      if (ready_child_count != valid_child_count - 1)
         break;
 
+      for (uint32_t i = 0; i < 4; ++i)
+         DEREF(dst_node).children[i] = children[i];
+
      for (uint32_t i = 0; i < valid_child_count; ++i) {
-        uint32_t child_offset = id_to_offset(node.children[i]);
+        uint32_t child_offset = id_to_offset(children[i]);
+
+        radv_aabb child_bounds;
        if (child_offset == dst_offset)
-           node.coords[i] = bounds;
+           child_bounds = bounds;
        else if (child_offset >= internal_nodes_offset) {
-           radv_aabb child_bounds = radv_aabb(vec3(INFINITY), vec3(-INFINITY));
-           radv_bvh_box32_node child_node = DEREF(REF(radv_bvh_box32_node)OFFSET(dst_bvh, child_offset));
+           child_bounds = radv_aabb(vec3(INFINITY), vec3(-INFINITY));
+           REF(radv_bvh_box32_node) child_node = REF(radv_bvh_box32_node)OFFSET(dst_bvh, child_offset);
           for (uint32_t j = 0; j < 4; ++j) {
-              if (child_node.children[j] == RADV_BVH_INVALID_NODE)
+              if (DEREF(child_node).children[j] == RADV_BVH_INVALID_NODE)
                 break;
-              child_bounds.min = min(child_bounds.min, child_node.coords[j].min);
-              child_bounds.max = max(child_bounds.max, child_node.coords[j].max);
+              child_bounds.min = min(child_bounds.min, DEREF(child_node).coords[j].min);
+              child_bounds.max = max(child_bounds.max, DEREF(child_node).coords[j].max);
           }
-           node.coords[i] = child_bounds;
        } else {
           uint32_t child_index = (child_offset - first_leaf_offset) / leaf_node_size;
-           node.coords[i] = DEREF(INDEX(radv_aabb, args.leaf_bounds, child_index));
+           child_bounds = DEREF(INDEX(radv_aabb, args.leaf_bounds, child_index));
        }
-     }
 
-     DEREF(REF(radv_bvh_box32_node)OFFSET(dst_bvh, offset)) = node;
+        DEREF(dst_node).coords[i] = child_bounds;
+     }
 
      if (parent_id == RADV_BVH_ROOT_NODE) {
        radv_aabb root_bounds = radv_aabb(vec3(INFINITY), vec3(-INFINITY));
        for (uint32_t i = 0; i < valid_child_count; ++i) {
-           root_bounds.min = min(root_bounds.min, node.coords[i].min);
-           root_bounds.max = max(root_bounds.max, node.coords[i].max);
+           radv_aabb bounds = DEREF(dst_node).coords[i];
+           root_bounds.min = min(root_bounds.min, bounds.min);
+           root_bounds.max = max(root_bounds.max, bounds.max);
        }
 
        DEREF(args.dst).aabb = root_bounds;
      }
diff --git a/src/amd/vulkan/radv_image.c b/src/amd/vulkan/radv_image.c
index 4921d2d7ea56..29a0ff599b34 100644
--- a/src/amd/vulkan/radv_image.c
+++ b/src/amd/vulkan/radv_image.c
@@ -1075,6 +1075,11 @@ radv_image_create_layout(struct radv_device *device, struct radv_image_create_in
       radv_video_get_profile_alignments(device->physical_device, profile_list, &width_align, &height_align);
       image_info.width = align(image_info.width, width_align);
       image_info.height = align(image_info.height, height_align);
+
+      if (radv_has_uvd(device->physical_device) && image->vk.usage & VK_IMAGE_USAGE_VIDEO_DECODE_DPB_BIT_KHR) {
+         /* UVD and kernel demand a full DPB allocation. */
+         image_info.array_size = MIN2(16, image_info.array_size);
+      }
    }
 
    unsigned plane_count = radv_get_internal_plane_count(device->physical_device, image->vk.format);
diff --git a/src/amd/vulkan/radv_physical_device.c b/src/amd/vulkan/radv_physical_device.c
index 588e2750b8ee..b07e0c9e5e86 100644
--- a/src/amd/vulkan/radv_physical_device.c
+++ b/src/amd/vulkan/radv_physical_device.c
@@ -1345,7 +1345,8 @@ radv_get_physical_device_properties(struct radv_physical_device *pdevice)
    p->subgroupSupportedOperations =
       VK_SUBGROUP_FEATURE_BASIC_BIT | VK_SUBGROUP_FEATURE_VOTE_BIT | VK_SUBGROUP_FEATURE_ARITHMETIC_BIT |
       VK_SUBGROUP_FEATURE_BALLOT_BIT | VK_SUBGROUP_FEATURE_CLUSTERED_BIT | VK_SUBGROUP_FEATURE_QUAD_BIT |
-      VK_SUBGROUP_FEATURE_SHUFFLE_BIT | VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT;
+      VK_SUBGROUP_FEATURE_SHUFFLE_BIT | VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT |
+      VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR | VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT_KHR;
    p->subgroupQuadOperationsInAllStages = true;
    p->pointClippingBehavior = VK_POINT_CLIPPING_BEHAVIOR_ALL_CLIP_PLANES;
 
@@ -1698,7 +1699,7 @@ radv_get_physical_device_properties(struct radv_physical_device *pdevice)
    /* VK_NV_device_generated_commands */
    p->maxIndirectCommandsStreamCount = 1;
    p->maxIndirectCommandsStreamStride = UINT32_MAX;
-   p->maxIndirectCommandsTokenCount = UINT32_MAX;
+   p->maxIndirectCommandsTokenCount = 512;
    p->maxIndirectCommandsTokenOffset = UINT16_MAX;
    p->minIndirectCommandsBufferOffsetAlignment = 4;
    p->minSequencesCountBufferOffsetAlignment = 4;
@@ -2050,13 +2051,13 @@ radv_physical_device_try_create(struct radv_instance *instance, drmDevicePtr drm
    if ((device->instance->debug_flags & RADV_DEBUG_INFO))
       ac_print_gpu_info(&device->rad_info, stdout);
 
+   radv_init_physical_device_decoder(device);
+
    radv_physical_device_init_queue_table(device);
 
    /* We don't check the error code, but later check if it is initialized. */
    ac_init_perfcounters(&device->rad_info, false, false, &device->ac_perfcounters);
 
-   radv_init_physical_device_decoder(device);
-
    /* The WSI is structured as a layer on top of the driver, so this has
    * to be the last part of initialization (at least until we get other
    * semi-layers).
diff --git a/src/amd/vulkan/radv_queue.c b/src/amd/vulkan/radv_queue.c
index 0a2430a53176..0120c7aa1d87 100644
--- a/src/amd/vulkan/radv_queue.c
+++ b/src/amd/vulkan/radv_queue.c
@@ -1647,7 +1647,8 @@ radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submit *submi
          queue->device->ws->cs_unchain(cmd_buffer->cs);
          if (!chainable || !queue->device->ws->cs_chain(chainable, cmd_buffer->cs, queue->state.uses_shadow_regs)) {
            /* don't submit empty command buffers to the kernel. */
-           if (radv_queue_ring(queue) != AMD_IP_VCN_ENC || cmd_buffer->cs->cdw != 0)
+           if ((radv_queue_ring(queue) != AMD_IP_VCN_ENC && radv_queue_ring(queue) != AMD_IP_UVD) ||
+               cmd_buffer->cs->cdw != 0)
              cs_array[num_submitted_cs++] = cmd_buffer->cs;
          }
diff --git a/src/amd/vulkan/radv_video.c b/src/amd/vulkan/radv_video.c
index 96851b9d771b..8637c502bad4 100644
--- a/src/amd/vulkan/radv_video.c
+++ b/src/amd/vulkan/radv_video.c
@@ -1724,10 +1724,12 @@ radv_uvd_cmd_reset(struct radv_cmd_buffer *cmd_buffer)
    if (vid->sessionctx.mem)
       send_cmd(cmd_buffer, RDECODE_CMD_SESSION_CONTEXT_BUFFER, vid->sessionctx.mem->bo, vid->sessionctx.offset);
    send_cmd(cmd_buffer, RDECODE_CMD_MSG_BUFFER, cmd_buffer->upload.upload_bo, out_offset);
+
    /* pad out the IB to the 16 dword boundary - otherwise the fw seems to be unhappy */
-   radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 8);
-   for (unsigned i = 0; i < 8; i++)
-      radeon_emit(cmd_buffer->cs, 0x81ff);
+   int padsize = vid->sessionctx.mem ? 4 : 6;
+   radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, padsize);
+   for (unsigned i = 0; i < padsize; i++)
+      radeon_emit(cmd_buffer->cs, PKT2_NOP_PAD);
 }
 
 VKAPI_ATTR void VKAPI_CALL
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
index 4fd9bb7053d8..233ede17ba8d 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
@@ -457,7 +457,17 @@ radv_amdgpu_cs_finalize(struct radeon_cmdbuf *_cs)
       *cs->ib_size_ptr |= cs->base.cdw;
    } else {
       /* Pad the CS with NOP packets. */
-      if (ip_type != AMDGPU_HW_IP_VCN_ENC) {
+      bool pad = true;
+
+      /* Don't pad on VCN encode/unified queues, which have no NOP packets. */
+      if (ip_type == AMDGPU_HW_IP_VCN_ENC)
+         pad = false;
+
+      /* Don't add padding to 0-length UVD IBs due to a kernel limitation. */
+      if (ip_type == AMDGPU_HW_IP_UVD && cs->base.cdw == 0)
+         pad = false;
+
+      if (pad) {
         while (!cs->base.cdw || (cs->base.cdw & ib_pad_dw_mask))
            radeon_emit_unchecked(&cs->base, nop_packet);
      }
diff --git a/src/broadcom/ci/broadcom-rpi4-fails.txt b/src/broadcom/ci/broadcom-rpi4-fails.txt
index 2332f8c2f324..6eda8515a8d7 100644
--- a/src/broadcom/ci/broadcom-rpi4-fails.txt
+++ b/src/broadcom/ci/broadcom-rpi4-fails.txt
@@ -508,3 +508,5 @@ KHR-GL31.texture_size_promotion.functional,Fail
 
 # uprev Piglit in Mesa
 spec@glsl-1.40@uniform_buffer@two-stages,Fail
+# Couldn't reproduce locally
+spec@oes_packed_depth_stencil@depth_stencil texture gles2,Fail
diff --git a/src/freedreno/ir3/ir3_lower_subgroups.c b/src/freedreno/ir3/ir3_lower_subgroups.c
index 91b99b7df65a..d95d7fcb7a80 100644
--- a/src/freedreno/ir3/ir3_lower_subgroups.c
+++ b/src/freedreno/ir3/ir3_lower_subgroups.c
@@ -344,6 +344,9 @@ lower_instr(struct ir3 *ir, struct ir3_block **block, struct ir3_instruction *in
       struct ir3_block *store = ir3_block_create(ir);
       list_add(&store->node, &body->node);
 
+      body->reconvergence_point = true;
+      after_block->reconvergence_point = true;
+
       link_blocks(before_block, body, 0);
       link_blocks(body, store, 0);
 
diff --git a/src/gallium/drivers/crocus/crocus_blorp.c b/src/gallium/drivers/crocus/crocus_blorp.c
index 3d20eb6eb2fe..c4a4ef037260 100644
--- a/src/gallium/drivers/crocus/crocus_blorp.c
+++ b/src/gallium/drivers/crocus/crocus_blorp.c
@@ -261,20 +261,31 @@ blorp_get_l3_config(struct blorp_batch *blorp_batch)
    struct crocus_batch *batch = blorp_batch->driver_batch;
    return batch->screen->l3_config_3d;
 }
-#else /* GFX_VER < 7 */
+#endif
+
+static void
+blorp_pre_emit_urb_config(struct blorp_batch *blorp_batch,
+                          struct intel_urb_config *urb_cfg)
+{
+   /* Dummy. */
+}
+
 static void
 blorp_emit_urb_config(struct blorp_batch *blorp_batch,
-                      unsigned vs_entry_size,
-                      UNUSED unsigned sf_entry_size)
+                      struct intel_urb_config *urb_cfg)
 {
+#if GFX_VER < 7
    struct crocus_batch *batch = blorp_batch->driver_batch;
 
 #if GFX_VER <= 5
-   batch->screen->vtbl.calculate_urb_fence(batch, 0, vs_entry_size, sf_entry_size);
+   batch->screen->vtbl.calculate_urb_fence(batch, 0,
+                                           urb_cfg->size[MESA_SHADER_VERTEX],
+                                           urb_cfg->size[MESA_SHADER_FRAGMENT]);
 #else
-   genX(crocus_upload_urb)(batch, vs_entry_size, false, vs_entry_size);
+   genX(crocus_upload_urb)(batch, urb_cfg->size[MESA_SHADER_VERTEX], false,
+                           urb_cfg->size[MESA_SHADER_VERTEX]);
 #endif
-}
 #endif
+}
 
 static void
 crocus_blorp_exec(struct blorp_batch *blorp_batch,
diff --git a/src/gallium/drivers/crocus/crocus_state.c b/src/gallium/drivers/crocus/crocus_state.c
index 2781470fb9c4..8385a03b8348 100644
--- a/src/gallium/drivers/crocus/crocus_state.c
+++ b/src/gallium/drivers/crocus/crocus_state.c
@@ -6058,49 +6058,46 @@ crocus_upload_dirty_render_state(struct crocus_context *ice,
       const struct intel_device_info *devinfo = &batch->screen->devinfo;
       bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL;
       bool tess_present = ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL;
-      unsigned entry_size[4];
+      struct intel_urb_config urb_cfg;
 
       for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
         if (!ice->shaders.prog[i]) {
-           entry_size[i] = 1;
+           urb_cfg.size[i] = 1;
        } else {
           struct brw_vue_prog_data *vue_prog_data =
             (void *) ice->shaders.prog[i]->prog_data;
-           entry_size[i] = vue_prog_data->urb_entry_size;
+           urb_cfg.size[i] = vue_prog_data->urb_entry_size;
        }
-        assert(entry_size[i] != 0);
+        assert(urb_cfg.size[i] != 0);
      }
 
      /* If we're just switching between programs with the same URB requirements,
       * skip the rest of the logic. */
      bool no_change = false;
-      if (ice->urb.vsize == entry_size[MESA_SHADER_VERTEX] &&
+      if (ice->urb.vsize == urb_cfg.size[MESA_SHADER_VERTEX] &&
          ice->urb.gs_present == gs_present &&
-          ice->urb.gsize == entry_size[MESA_SHADER_GEOMETRY] &&
+          ice->urb.gsize == urb_cfg.size[MESA_SHADER_GEOMETRY] &&
          ice->urb.tess_present == tess_present &&
-          ice->urb.hsize == entry_size[MESA_SHADER_TESS_CTRL] &&
-          ice->urb.dsize == entry_size[MESA_SHADER_TESS_EVAL]) {
+          ice->urb.hsize == urb_cfg.size[MESA_SHADER_TESS_CTRL] &&
+          ice->urb.dsize == urb_cfg.size[MESA_SHADER_TESS_EVAL]) {
         no_change = true;
      }
 
      if (!no_change) {
-         ice->urb.vsize = entry_size[MESA_SHADER_VERTEX];
+         ice->urb.vsize = urb_cfg.size[MESA_SHADER_VERTEX];
         ice->urb.gs_present = gs_present;
-         ice->urb.gsize = entry_size[MESA_SHADER_GEOMETRY];
+         ice->urb.gsize = urb_cfg.size[MESA_SHADER_GEOMETRY];
         ice->urb.tess_present = tess_present;
-         ice->urb.hsize = entry_size[MESA_SHADER_TESS_CTRL];
-         ice->urb.dsize = entry_size[MESA_SHADER_TESS_EVAL];
+         ice->urb.hsize = urb_cfg.size[MESA_SHADER_TESS_CTRL];
+         ice->urb.dsize = urb_cfg.size[MESA_SHADER_TESS_EVAL];
 
-         unsigned entries[4];
-         unsigned start[4];
         bool constrained;
         intel_get_urb_config(devinfo,
                              batch->screen->l3_config_3d,
                              tess_present, gs_present,
-                              entry_size,
-                              entries, start, NULL, &constrained);
+                              &urb_cfg, NULL, &constrained);
 
 #if GFX_VER == 7
         if (devinfo->platform == INTEL_PLATFORM_IVB)
@@ -6109,9 +6106,9 @@ crocus_upload_dirty_render_state(struct crocus_context *ice,
         for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
            crocus_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
               urb._3DCommandSubOpcode += i;
-               urb.VSURBStartingAddress = start[i];
-               urb.VSURBEntryAllocationSize = entry_size[i] - 1;
-               urb.VSNumberofURBEntries = entries[i];
+               urb.VSURBStartingAddress = urb_cfg.start[i];
+               urb.VSURBEntryAllocationSize = urb_cfg.size[i] - 1;
+               urb.VSNumberofURBEntries = urb_cfg.entries[i];
            }
         }
      }
diff --git a/src/gallium/drivers/iris/iris_blorp.c b/src/gallium/drivers/iris/iris_blorp.c
index 1f716b35658c..2869d624ebd1 100644
--- a/src/gallium/drivers/iris/iris_blorp.c
+++ b/src/gallium/drivers/iris/iris_blorp.c
@@ -274,6 +274,13 @@ blorp_flush_range(UNUSED struct blorp_batch *blorp_batch,
    */
 }
 
+static void
+blorp_pre_emit_urb_config(struct blorp_batch *blorp_batch,
+                          struct intel_urb_config *urb_cfg)
+{
+   genX(urb_workaround)(blorp_batch->driver_batch, urb_cfg);
+}
+
 static const struct intel_l3_config *
 blorp_get_l3_config(struct blorp_batch *blorp_batch)
 {
@@ -410,8 +417,8 @@ iris_blorp_exec_render(struct blorp_batch *blorp_batch,
    ice->state.dirty |= ~skip_bits;
    ice->state.stage_dirty |= ~skip_stage_bits;
 
-   for (int i = 0; i < ARRAY_SIZE(ice->shaders.urb.size); i++)
-      ice->shaders.urb.size[i] = 0;
+   for (int i = 0; i < ARRAY_SIZE(ice->shaders.urb.cfg.size); i++)
+      ice->shaders.urb.cfg.size[i] = 0;
 
    if (params->src.enabled)
       iris_bo_bump_seqno(params->src.addr.buffer, batch->next_seqno,
diff --git a/src/gallium/drivers/iris/iris_context.h b/src/gallium/drivers/iris/iris_context.h
index 74461c18eaca..17b9a9596474 100644
--- a/src/gallium/drivers/iris/iris_context.h
+++ b/src/gallium/drivers/iris/iris_context.h
@@ -714,12 +714,13 @@ struct iris_context {
       struct iris_compiled_shader *prog[MESA_SHADER_STAGES];
       struct iris_compiled_shader *last_vue_shader;
       struct {
-         unsigned size[4];
-         unsigned entries[4];
-         unsigned start[4];
+         struct intel_urb_config cfg;
         bool constrained;
      } urb;
 
+      /** Last URB configuration emitted by the driver. */
+      struct intel_urb_config last_urb;
+
      /** Uploader for shader assembly from the driver thread */
      struct u_upload_mgr *uploader_driver;
      /** Uploader for shader assembly from the threaded context */
@@ -1180,21 +1181,6 @@ iris_execute_indirect_draw_supported(const struct iris_context *ice,
 #ifdef genX
 #  include "iris_genx_protos.h"
 #else
-#  define genX(x) gfx4_##x
-#  include "iris_genx_protos.h"
-#  undef genX
-#  define genX(x) gfx5_##x
-#  include "iris_genx_protos.h"
-#  undef genX
-#  define genX(x) gfx6_##x
-#  include "iris_genx_protos.h"
-#  undef genX
-#  define genX(x) gfx7_##x
-#  include "iris_genx_protos.h"
-#  undef genX
-#  define genX(x) gfx75_##x
-#  include "iris_genx_protos.h"
-#  undef genX
 #  define genX(x) gfx8_##x
 #  include "iris_genx_protos.h"
 #  undef genX
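Note: the crocus and iris hunks above are a mechanical migration from three parallel
arrays (size/entries/start, indexed by VS..GS stage) to a single struct passed
through intel_get_urb_config() and the new genX(urb_workaround)(). The shape of the
struct, as these callers use it, is approximately the following; the authoritative
definition lives in Intel's common code, so treat this as a reading aid rather than
the real declaration:

    struct intel_urb_config {
       unsigned size[4];    /* per-stage URB entry size (programmed as size - 1) */
       unsigned entries[4]; /* per-stage number of URB entries */
       unsigned start[4];   /* per-stage URB starting address */
    };

Keeping a second copy (iris_context::shaders.last_urb) lets Wa_16014912113 re-emit
the previous configuration before switching, which is what genX(urb_workaround)()
in iris_state.c below does.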
diff --git a/src/gallium/drivers/iris/iris_draw.c b/src/gallium/drivers/iris/iris_draw.c
index bc897ba0f7d5..597a18c5c0e7 100644
--- a/src/gallium/drivers/iris/iris_draw.c
+++ b/src/gallium/drivers/iris/iris_draw.c
@@ -37,7 +37,6 @@
 #include "util/u_transfer.h"
 #include "util/u_upload_mgr.h"
 #include "intel/compiler/brw_compiler.h"
-#include "intel/compiler/brw_eu_defines.h"
 #include "compiler/shader_info.h"
 #include "iris_context.h"
 #include "iris_defines.h"
diff --git a/src/gallium/drivers/iris/iris_genx_protos.h b/src/gallium/drivers/iris/iris_genx_protos.h
index 44c5d427ce65..20e9815bda79 100644
--- a/src/gallium/drivers/iris/iris_genx_protos.h
+++ b/src/gallium/drivers/iris/iris_genx_protos.h
@@ -48,6 +48,8 @@ void genX(emit_3dprimitive_was)(struct iris_batch *batch,
                                 const struct pipe_draw_indirect_info *indirect,
                                 uint32_t primitive_topology,
                                 uint32_t vertex_count);
+void genX(urb_workaround)(struct iris_batch *batch,
+                          const struct intel_urb_config *urb_cfg);
 
 static inline void
 genX(maybe_emit_breakpoint)(struct iris_batch *batch,
diff --git a/src/gallium/drivers/iris/iris_program.c b/src/gallium/drivers/iris/iris_program.c
index 600d95fa6b81..c48df4783151 100644
--- a/src/gallium/drivers/iris/iris_program.c
+++ b/src/gallium/drivers/iris/iris_program.c
@@ -436,7 +436,6 @@ static void
 iris_setup_uniforms(ASSERTED const struct intel_device_info *devinfo,
                     void *mem_ctx,
                     nir_shader *nir,
-                    struct brw_stage_prog_data *prog_data,
                     unsigned kernel_input_size,
                     enum brw_param_builtin **out_system_values,
                     unsigned *out_num_system_values,
@@ -1145,7 +1144,7 @@ check_urb_size(struct iris_context *ice,
                unsigned needed_size,
                gl_shader_stage stage)
 {
-   unsigned last_allocated_size = ice->shaders.urb.size[stage];
+   unsigned last_allocated_size = ice->shaders.urb.cfg.size[stage];
 
    /* If the last URB allocation wasn't large enough for our needs,
    * flag it as needing to be reconfigured.  Otherwise, we can use
@@ -1315,7 +1314,7 @@ iris_compile_vs(struct iris_screen *screen,
 
    prog_data->use_alt_mode = nir->info.use_legacy_math_rules;
 
-   iris_setup_uniforms(devinfo, mem_ctx, nir, prog_data, 0, &system_values,
+   iris_setup_uniforms(devinfo, mem_ctx, nir, 0, &system_values,
                        &num_system_values, &num_cbufs);
 
    struct iris_binding_table bt;
@@ -1500,7 +1499,7 @@ iris_compile_tcs(struct iris_screen *screen,
       source_hash = *(uint32_t*)nir->info.source_sha1;
    }
 
-   iris_setup_uniforms(devinfo, mem_ctx, nir, prog_data, 0, &system_values,
+   iris_setup_uniforms(devinfo, mem_ctx, nir, 0, &system_values,
                        &num_system_values, &num_cbufs);
    iris_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0,
                             num_system_values, num_cbufs);
@@ -1657,7 +1656,7 @@ iris_compile_tes(struct iris_screen *screen,
       nir_shader_gather_info(nir, impl);
    }
 
-   iris_setup_uniforms(devinfo, mem_ctx, nir, prog_data, 0, &system_values,
+   iris_setup_uniforms(devinfo, mem_ctx, nir, 0, &system_values,
                        &num_system_values, &num_cbufs);
 
    struct iris_binding_table bt;
@@ -1800,7 +1799,7 @@ iris_compile_gs(struct iris_screen *screen,
       nir_shader_gather_info(nir, impl);
    }
 
-   iris_setup_uniforms(devinfo, mem_ctx, nir, prog_data, 0, &system_values,
+   iris_setup_uniforms(devinfo, mem_ctx, nir, 0, &system_values,
                        &num_system_values, &num_cbufs);
 
    struct iris_binding_table bt;
@@ -1930,7 +1929,7 @@ iris_compile_fs(struct iris_screen *screen,
 
    prog_data->use_alt_mode = nir->info.use_legacy_math_rules;
 
-   iris_setup_uniforms(devinfo, mem_ctx, nir, prog_data, 0, &system_values,
+   iris_setup_uniforms(devinfo, mem_ctx, nir, 0, &system_values,
                        &num_system_values, &num_cbufs);
 
    /* Lower output variables to load_output intrinsics before setting up
@@ -2226,8 +2225,7 @@ iris_compile_cs(struct iris_screen *screen,
 
    NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics, devinfo, cs_prog_data);
 
-   iris_setup_uniforms(devinfo, mem_ctx, nir, prog_data,
-                       ish->kernel_input_size,
+   iris_setup_uniforms(devinfo, mem_ctx, nir, ish->kernel_input_size,
                        &system_values, &num_system_values, &num_cbufs);
 
    struct iris_binding_table bt;
diff --git a/src/gallium/drivers/iris/iris_program_cache.c b/src/gallium/drivers/iris/iris_program_cache.c
index 481f575b5115..ef02586774a6 100644
--- a/src/gallium/drivers/iris/iris_program_cache.c
+++ b/src/gallium/drivers/iris/iris_program_cache.c
@@ -38,10 +38,7 @@
 #include "util/u_upload_mgr.h"
 #include "compiler/nir/nir.h"
 #include "compiler/nir/nir_builder.h"
-#include "intel/common/intel_disasm.h"
 #include "intel/compiler/brw_compiler.h"
-#include "intel/compiler/brw_eu.h"
-#include "intel/compiler/brw_nir.h"
 #include "iris_context.h"
 #include "iris_resource.h"
 
diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c
index 6e7069058fb4..294acfc7f413 100644
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@@ -6819,31 +6819,31 @@ iris_upload_dirty_render_state(struct iris_context *ice,
    if (dirty & IRIS_DIRTY_URB) {
      for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
        if (!ice->shaders.prog[i]) {
-           ice->shaders.urb.size[i] = 1;
+           ice->shaders.urb.cfg.size[i] = 1;
        } else {
          struct brw_vue_prog_data *vue_prog_data =
            (void *) ice->shaders.prog[i]->prog_data;
-           ice->shaders.urb.size[i] = vue_prog_data->urb_entry_size;
+           ice->shaders.urb.cfg.size[i] = vue_prog_data->urb_entry_size;
        }
-        assert(ice->shaders.urb.size[i] != 0);
+        assert(ice->shaders.urb.cfg.size[i] != 0);
      }
 
      intel_get_urb_config(screen->devinfo, screen->l3_config_3d,
                          ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL,
                          ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL,
-                          ice->shaders.urb.size,
-                          ice->shaders.urb.entries,
-                          ice->shaders.urb.start,
+                          &ice->shaders.urb.cfg,
                          &ice->state.urb_deref_block_size,
                          &ice->shaders.urb.constrained);
 
+      genX(urb_workaround)(batch, &ice->shaders.urb.cfg);
+
      for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
        iris_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
          urb._3DCommandSubOpcode += i;
-           urb.VSURBStartingAddress = ice->shaders.urb.start[i];
-           urb.VSURBEntryAllocationSize = ice->shaders.urb.size[i] - 1;
-           urb.VSNumberofURBEntries = ice->shaders.urb.entries[i];
+           urb.VSURBStartingAddress = ice->shaders.urb.cfg.start[i];
+           urb.VSURBEntryAllocationSize = ice->shaders.urb.cfg.size[i] - 1;
+           urb.VSNumberofURBEntries = ice->shaders.urb.cfg.entries[i];
        }
      }
    }
@@ -8137,6 +8137,35 @@ genX(emit_3dprimitive_was)(struct iris_batch *batch,
 #endif
 }
 
+void
+genX(urb_workaround)(struct iris_batch *batch,
+                     const struct intel_urb_config *urb_cfg)
+{
+#if INTEL_NEEDS_WA_16014912113
+   if (intel_urb_setup_changed(urb_cfg, &batch->ice->shaders.last_urb,
+                               MESA_SHADER_TESS_EVAL) &&
+       batch->ice->shaders.last_urb.size[0] != 0) {
+      for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
+         iris_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
+            urb._3DCommandSubOpcode += i;
+            urb.VSURBStartingAddress =
+               batch->ice->shaders.last_urb.start[i];
+            urb.VSURBEntryAllocationSize =
+               batch->ice->shaders.last_urb.size[i] - 1;
+            urb.VSNumberofURBEntries = i == 0 ? 256 : 0;
+         }
+      }
+      iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
+         pc.HDCPipelineFlushEnable = true;
+      }
+   }
+#endif
+
+   /* Update current urb config. */
+   memcpy(&batch->ice->shaders.last_urb, &batch->ice->shaders.urb.cfg,
+          sizeof(struct intel_urb_config));
+}
+
 static void
 iris_upload_render_state(struct iris_context *ice,
                          struct iris_batch *batch,
diff --git a/src/gallium/drivers/panfrost/pan_shader.c b/src/gallium/drivers/panfrost/pan_shader.c
index 6493b85dd78b..87104f19cb7d 100644
--- a/src/gallium/drivers/panfrost/pan_shader.c
+++ b/src/gallium/drivers/panfrost/pan_shader.c
@@ -405,7 +405,7 @@ panfrost_create_shader_state(struct pipe_context *pctx,
 
    if (nir->info.stage == MESA_SHADER_FRAGMENT &&
        nir->info.outputs_written & BITFIELD_BIT(FRAG_RESULT_COLOR)) {
-      NIR_PASS_V(nir, nir_lower_fragcolor, 8);
+      NIR_PASS_V(nir, nir_lower_fragcolor, nir->info.fs.color_is_dual_source ? 1 : 8);
       so->fragcolor_lowered = true;
    }
 
diff --git a/src/gallium/drivers/virgl/meson.build b/src/gallium/drivers/virgl/meson.build
index 0bb26c67c7c6..6f94c53f5feb 100644
--- a/src/gallium/drivers/virgl/meson.build
+++ b/src/gallium/drivers/virgl/meson.build
@@ -41,9 +41,15 @@ libvirgl = static_library(
   dependencies : [dep_libdrm, idep_mesautil, idep_xmlconfig, idep_nir],
 )
 
+virgl_deps = [libvirgl]
+if not with_platform_windows
+  virgl_deps += libvirgldrm
+  virgl_deps += libvirglvtest
+endif
+
 driver_virgl = declare_dependency(
   compile_args : '-DGALLIUM_VIRGL',
-  link_with : [libvirgl, libvirgldrm, libvirglvtest],
+  link_with : virgl_deps,
 )
 
 if with_tests
diff --git a/src/gallium/drivers/virgl/tests/virgl_staging_mgr_test.cpp b/src/gallium/drivers/virgl/tests/virgl_staging_mgr_test.cpp
index 34dfe8356443..057445a77186 100644
--- a/src/gallium/drivers/virgl/tests/virgl_staging_mgr_test.cpp
+++ b/src/gallium/drivers/virgl/tests/virgl_staging_mgr_test.cpp
@@ -161,7 +161,7 @@ TEST_P(VirglStagingMgrWithAlignment,
    struct virgl_hw_res *out_resource[num_resources] = {0};
    unsigned expected_offset = 0;
    unsigned out_offset;
-   void *map_ptr;
+   uint8_t *map_ptr;
    bool alloc_succeeded;
 
    for (unsigned i = 0; i < num_resources; ++i) {
@@ -197,7 +197,7 @@ TEST_F(VirglStagingMgr,
 {
    struct virgl_hw_res *out_resource[2] = {0};
    unsigned out_offset;
-   void *map_ptr;
+   uint8_t *map_ptr;
    bool alloc_succeeded;
 
    alloc_succeeded =
@@ -229,7 +229,7 @@ TEST_F(VirglStagingMgr,
 {
    struct virgl_hw_res *out_resource[2] = {0};
    unsigned out_offset;
-   void *map_ptr;
+   uint8_t *map_ptr;
    bool alloc_succeeded;
 
    alloc_succeeded =
@@ -261,7 +261,7 @@ TEST_F(VirglStagingMgr,
 {
    struct virgl_hw_res *out_resource[2] = {0};
    unsigned out_offset;
-   void *map_ptr;
+   uint8_t *map_ptr;
    bool alloc_succeeded;
 
    ASSERT_LT(staging_size, 5123);
@@ -295,7 +295,7 @@ TEST_F(VirglStagingMgr, releases_resource_on_destruction)
 {
    struct virgl_hw_res *out_resource = NULL;
    unsigned out_offset;
-   void *map_ptr;
+   uint8_t *map_ptr;
    bool alloc_succeeded;
 
    alloc_succeeded =
@@ -334,7 +334,7 @@ TEST_F(VirglStagingMgr, fails_gracefully_if_resource_create_fails)
    struct virgl_screen *vs = virgl_screen(ctx->screen);
    struct virgl_hw_res *out_resource = NULL;
    unsigned out_offset;
-   void *map_ptr;
+   uint8_t *map_ptr;
    bool alloc_succeeded;
 
    vs->vws->resource_create = failing_resource_create;
@@ -359,7 +359,7 @@ TEST_F(VirglStagingMgr, fails_gracefully_if_map_fails)
    struct virgl_screen *vs = virgl_screen(ctx->screen);
    struct virgl_hw_res *out_resource = NULL;
    unsigned out_offset;
-   void *map_ptr;
+   uint8_t *map_ptr;
    bool alloc_succeeded;
 
    vs->vws->resource_map = failing_resource_map;
@@ -377,7 +377,7 @@ TEST_F(VirglStagingMgr, uses_staging_buffer_resource)
 {
    struct virgl_hw_res *out_resource = NULL;
    unsigned out_offset;
-   void *map_ptr;
+   uint8_t *map_ptr;
    bool alloc_succeeded;
 
    alloc_succeeded =
diff --git a/src/gallium/drivers/virgl/virgl_context.c b/src/gallium/drivers/virgl/virgl_context.c
index f47c469e858e..548335aac802 100644
--- a/src/gallium/drivers/virgl/virgl_context.c
+++ b/src/gallium/drivers/virgl/virgl_context.c
@@ -21,7 +21,11 @@
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
+#include
+#ifndef _WIN32
 #include <libsync.h>
+#endif
+
 #include "pipe/p_shader_tokens.h"
 
 #include "compiler/nir/nir.h"
@@ -592,7 +596,7 @@ static void virgl_hw_set_vertex_buffers(struct virgl_context *vctx)
    if (vctx->vertex_array_dirty) {
      const struct virgl_vertex_elements_state *ve = vctx->vertex_elements;
 
-      if (ve->num_bindings) {
+      if (ve && ve->num_bindings) {
        struct pipe_vertex_buffer vertex_buffers[PIPE_MAX_ATTRIBS];
        for (int i = 0; i < ve->num_bindings; ++i)
          vertex_buffers[i] = vctx->vertex_buffer[ve->binding_map[i]];
@@ -1003,7 +1007,7 @@ static void virgl_draw_vbo(struct pipe_context *ctx,
    struct virgl_context *vctx = virgl_context(ctx);
    struct virgl_screen *rs = virgl_screen(ctx->screen);
 
-   struct virgl_indexbuf ib = {};
+   struct virgl_indexbuf ib = { 0 };
    struct pipe_draw_info info = *dinfo;
 
    if (!indirect &&
@@ -1478,7 +1482,7 @@ static void *virgl_create_compute_state(struct pipe_context *ctx,
    uint32_t handle;
    const struct tgsi_token *ntt_tokens = NULL;
    const struct tgsi_token *tokens;
-   struct pipe_stream_output_info so_info = {};
+   struct pipe_stream_output_info so_info = { 0 };
    int ret;
 
    if (state->ir_type == PIPE_SHADER_IR_NIR) {
diff --git a/src/gallium/drivers/virgl/virgl_query.c b/src/gallium/drivers/virgl/virgl_query.c
index 96a62a524af2..2aeafc07ec9e 100644
--- a/src/gallium/drivers/virgl/virgl_query.c
+++ b/src/gallium/drivers/virgl/virgl_query.c
@@ -30,7 +30,13 @@
 #include "virgl_screen.h"
 
 struct virgl_query {
-   struct virgl_resource *buf;
+   enum pipe_query_type type;
+
+   union {
+      struct virgl_resource *buf;
+      struct pipe_fence_handle *fence; // PIPE_QUERY_GPU_FINISHED
+   };
+
    uint32_t handle;
    uint32_t result_size;
    uint32_t pipeline_stats;
@@ -123,6 +129,11 @@ static struct pipe_query *virgl_create_query(struct pipe_context *ctx,
    if (!query)
       return NULL;
 
+   query->type = query_type;
+
+   if (query->type == PIPE_QUERY_GPU_FINISHED)
+      return (struct pipe_query *)query;
+
    query->buf = (struct virgl_resource *)
      pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_STAGING,
                         sizeof(struct virgl_host_query_state));
@@ -159,9 +170,13 @@ static void virgl_destroy_query(struct pipe_context *ctx,
    struct virgl_context *vctx = virgl_context(ctx);
    struct virgl_query *query = virgl_query(q);
 
-   virgl_encode_delete_object(vctx, query->handle, VIRGL_OBJECT_QUERY);
+   if (query->type == PIPE_QUERY_GPU_FINISHED) {
+      ctx->screen->fence_reference(ctx->screen, &query->fence, NULL);
+   } else {
+      virgl_encode_delete_object(vctx, query->handle, VIRGL_OBJECT_QUERY);
+      pipe_resource_reference((struct pipe_resource **)&query->buf, NULL);
+   }
 
-   pipe_resource_reference((struct pipe_resource **)&query->buf, NULL);
    FREE(query);
 }
 
@@ -184,6 +199,11 @@ static bool virgl_end_query(struct pipe_context *ctx,
    struct virgl_query *query = virgl_query(q);
    struct virgl_host_query_state *host_state;
 
+   if (query->type == PIPE_QUERY_GPU_FINISHED) {
+      ctx->flush(ctx, &query->fence, PIPE_FLUSH_DEFERRED);
+      return true;
+   }
+
    host_state = vs->vws->resource_map(vs->vws, query->buf->hw_res);
    if (!host_state)
       return false;
@@ -207,6 +227,13 @@ static bool virgl_get_query_result(struct pipe_context *ctx,
 {
    struct virgl_query *query = virgl_query(q);
 
+   if (query->type == PIPE_QUERY_GPU_FINISHED) {
+      struct pipe_screen *screen = ctx->screen;
+
+      result->b = screen->fence_finish(screen, ctx, query->fence,
+                                       wait ? OS_TIMEOUT_INFINITE : 0);
+      return result->b;
+   }
+
    if (!query->ready) {
      struct virgl_screen *vs = virgl_screen(ctx->screen);
      struct virgl_context *vctx = virgl_context(ctx);
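Note: the virgl_query.c change implements PIPE_QUERY_GPU_FINISHED without a
host-side query object: end_query() does a deferred flush and keeps the resulting
fence, and get_query_result() simply polls or waits on it. From the frontend's
point of view it remains an ordinary boolean query; a minimal usage sketch against
the gallium interface (error handling omitted):

    struct pipe_query *q = pipe->create_query(pipe, PIPE_QUERY_GPU_FINISHED, 0);
    pipe->end_query(pipe, q);          /* virgl: deferred flush, fence retained */

    union pipe_query_result result;
    bool done = pipe->get_query_result(pipe, q, /*wait=*/false, &result);
    /* done && result.b => all previously submitted work has finished */
    pipe->destroy_query(pipe, q);      /* virgl: drops the fence reference */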
OS_TIMEOUT_INFINITE : 0); + return result->b; + } + if (!query->ready) { struct virgl_screen *vs = virgl_screen(ctx->screen); struct virgl_context *vctx = virgl_context(ctx); diff --git a/src/gallium/drivers/virgl/virgl_resource.c b/src/gallium/drivers/virgl/virgl_resource.c index 494af9116b70..87d610ebbfc4 100644 --- a/src/gallium/drivers/virgl/virgl_resource.c +++ b/src/gallium/drivers/virgl/virgl_resource.c @@ -350,7 +350,7 @@ virgl_staging_map(struct virgl_context *vctx, unsigned align_offset; unsigned stride; uintptr_t layer_stride; - void *map_addr; + uint8_t *map_addr; bool alloc_succeeded; assert(vctx->supports_staging); @@ -529,7 +529,7 @@ virgl_resource_transfer_map(struct pipe_context *ctx, case VIRGL_TRANSFER_MAP_HW_RES: trans->hw_res_map = vws->resource_map(vws, vres->hw_res); if (trans->hw_res_map) - map_addr = trans->hw_res_map + trans->offset; + map_addr = (uint8_t *)trans->hw_res_map + trans->offset; else map_addr = NULL; break; @@ -717,22 +717,29 @@ static struct pipe_resource *virgl_resource_from_handle(struct pipe_screen *scre uint32_t storage_size; struct virgl_screen *vs = virgl_screen(screen); - if (templ->target == PIPE_BUFFER) + if (templ && templ->target == PIPE_BUFFER) return NULL; struct virgl_resource *res = CALLOC_STRUCT(virgl_resource); - res->b = *templ; + if (templ) + res->b = *templ; res->b.screen = &vs->base; pipe_reference_init(&res->b.reference, 1); plane = winsys_stride = plane_offset = modifier = 0; res->hw_res = vs->vws->resource_create_from_handle(vs->vws, whandle, + &res->b, &plane, &winsys_stride, &plane_offset, &modifier, &res->blob_mem); + if (!res->hw_res) { + FREE(res); + return NULL; + } + /* do not use winsys returns for guest storage info of classic resource */ if (!res->blob_mem) { winsys_stride = 0; @@ -742,10 +749,6 @@ static struct pipe_resource *virgl_resource_from_handle(struct pipe_screen *scre virgl_resource_layout(&res->b, &res->metadata, plane, winsys_stride, plane_offset, modifier); - if (!res->hw_res) { - FREE(res); - return NULL; - } /* * If the overall resource is larger than a single page in size, we can diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c index d86ca5d1e8c3..6599f5f5c4cd 100644 --- a/src/gallium/drivers/virgl/virgl_screen.c +++ b/src/gallium/drivers/virgl/virgl_screen.c @@ -936,7 +936,7 @@ static void virgl_flush_frontbuffer(struct pipe_screen *screen, if (vws->flush_frontbuffer) { virgl_flush_eq(vctx, vctx, NULL); - vws->flush_frontbuffer(vws, vres->hw_res, level, layer, winsys_drawable_handle, + vws->flush_frontbuffer(vws, vctx->cbuf, vres->hw_res, level, layer, winsys_drawable_handle, sub_box); } } @@ -1054,6 +1054,10 @@ static struct disk_cache *virgl_get_disk_shader_cache (struct pipe_screen *pscre static void virgl_disk_cache_create(struct virgl_screen *screen) { + struct mesa_sha1 sha1_ctx; + _mesa_sha1_init(&sha1_ctx); + +#ifdef HAVE_DL_ITERATE_PHDR const struct build_id_note *note = build_id_find_nhdr_for_addr(virgl_disk_cache_create); assert(note); @@ -1064,9 +1068,8 @@ static void virgl_disk_cache_create(struct virgl_screen *screen) const uint8_t *id_sha1 = build_id_data(note); assert(id_sha1); - struct mesa_sha1 sha1_ctx; - _mesa_sha1_init(&sha1_ctx); _mesa_sha1_update(&sha1_ctx, id_sha1, build_id_len); +#endif /* When we switch the host the caps might change and then we might have to * apply different lowering. 
*/ diff --git a/src/gallium/drivers/virgl/virgl_staging_mgr.c b/src/gallium/drivers/virgl/virgl_staging_mgr.c index 947f343bd7f0..c73c79ec2339 100644 --- a/src/gallium/drivers/virgl/virgl_staging_mgr.c +++ b/src/gallium/drivers/virgl/virgl_staging_mgr.c @@ -96,7 +96,7 @@ virgl_staging_alloc(struct virgl_staging_mgr *staging, unsigned alignment, unsigned *out_offset, struct virgl_hw_res **outbuf, - void **ptr) + uint8_t **ptr) { struct virgl_winsys *vws = staging->vws; unsigned offset = align(staging->offset, alignment); diff --git a/src/gallium/drivers/virgl/virgl_staging_mgr.h b/src/gallium/drivers/virgl/virgl_staging_mgr.h index 7abf18713bb8..18a1cd5e5eb9 100644 --- a/src/gallium/drivers/virgl/virgl_staging_mgr.h +++ b/src/gallium/drivers/virgl/virgl_staging_mgr.h @@ -83,7 +83,7 @@ virgl_staging_alloc(struct virgl_staging_mgr *staging, unsigned alignment, unsigned *out_offset, struct virgl_hw_res **outbuf, - void **ptr); + uint8_t **ptr); #ifdef __cplusplus } // extern "C" { diff --git a/src/gallium/drivers/virgl/virgl_texture.c b/src/gallium/drivers/virgl/virgl_texture.c index cde0b21e65aa..7412d86b35e4 100644 --- a/src/gallium/drivers/virgl/virgl_texture.c +++ b/src/gallium/drivers/virgl/virgl_texture.c @@ -190,7 +190,7 @@ static void *texture_transfer_map_resolve(struct pipe_context *ctx, goto fail; if (!util_format_translate_3d(resource->format, - ptr + vtex->metadata.level_offset[level], + (uint8_t *)ptr + vtex->metadata.level_offset[level], trans->base.stride, trans->base.layer_stride, box->x, box->y, box->z, @@ -212,7 +212,7 @@ static void *texture_transfer_map_resolve(struct pipe_context *ctx, if ((usage & PIPE_MAP_WRITE) == 0) pipe_resource_reference(&trans->resolve_transfer->resource, NULL); - return ptr + trans->offset; + return (uint8_t *)ptr + trans->offset; } fail: diff --git a/src/gallium/drivers/virgl/virgl_transfer_queue.c b/src/gallium/drivers/virgl/virgl_transfer_queue.c index 2353ad5042e5..da8255377a85 100644 --- a/src/gallium/drivers/virgl/virgl_transfer_queue.c +++ b/src/gallium/drivers/virgl/virgl_transfer_queue.c @@ -372,7 +372,7 @@ virgl_transfer_queue_extend_buffer(struct virgl_transfer_queue *queue, assert(queued->base.resource->target == PIPE_BUFFER); assert(queued->hw_res_map); - memcpy(queued->hw_res_map + offset, data, size); + memcpy((uint8_t *)queued->hw_res_map + offset, data, size); u_box_union_2d(&queued->base.box, &queued->base.box, &box); queued->offset = queued->base.box.x; diff --git a/src/gallium/drivers/virgl/virgl_video.c b/src/gallium/drivers/virgl/virgl_video.c index f0afa0491a8b..29af663c600f 100644 --- a/src/gallium/drivers/virgl/virgl_video.c +++ b/src/gallium/drivers/virgl/virgl_video.c @@ -66,12 +66,12 @@ */ #include -#include #include "vl/vl_decoder.h" #include "vl/vl_video_buffer.h" #include "util/u_video.h" #include "util/u_memory.h" +#include "util/macros.h" #include "virgl_screen.h" #include "virgl_resource.h" @@ -106,7 +106,7 @@ static int fill_base_picture_desc(const struct pipe_picture_desc *desc, ITEM_SET(vbase, desc, protected_playback); ITEM_SET(vbase, desc, key_size); memcpy(vbase->decrypt_key, desc->decrypt_key, - MIN(desc->key_size, sizeof(vbase->decrypt_key))); + MIN2(desc->key_size, sizeof(vbase->decrypt_key))); return 0; } @@ -1042,7 +1042,7 @@ static void virgl_video_decode_bitstream(struct pipe_video_codec *codec, if (!ptr) return; for (i = 0, vcdc->bs_size = 0; i < num_buffers; i++) { - memcpy(ptr + vcdc->bs_size, buffers[i], sizes[i]); + memcpy((uint8_t *)ptr + vcdc->bs_size, buffers[i], sizes[i]); vcdc->bs_size += 
sizes[i]; } pipe_buffer_unmap(&vctx->base, xfer); diff --git a/src/gallium/drivers/virgl/virgl_winsys.h b/src/gallium/drivers/virgl/virgl_winsys.h index e780f5eef9b4..3d83ac728f2d 100644 --- a/src/gallium/drivers/virgl/virgl_winsys.h +++ b/src/gallium/drivers/virgl/virgl_winsys.h @@ -27,6 +27,7 @@ #include "virtio-gpu/virgl_hw.h" struct pipe_box; +struct pipe_resource; struct pipe_fence_handle; struct winsys_handle; struct virgl_hw_res; @@ -86,6 +87,7 @@ struct virgl_winsys { struct virgl_hw_res *(*resource_create_from_handle)(struct virgl_winsys *vws, struct winsys_handle *whandle, + struct pipe_resource *templ, uint32_t *plane, uint32_t *stride, uint32_t *plane_offset, @@ -133,6 +135,7 @@ struct virgl_winsys { /* for sw paths */ void (*flush_frontbuffer)(struct virgl_winsys *vws, + struct virgl_cmd_buf *cbuf, struct virgl_hw_res *res, unsigned level, unsigned layer, void *winsys_drawable_handle, @@ -184,5 +187,5 @@ static inline void virgl_ws_fill_new_caps_defaults(struct virgl_drm_caps *caps) } extern enum virgl_formats pipe_to_virgl_format(enum pipe_format format); - +extern enum pipe_format virgl_to_pipe_format(enum virgl_formats format); #endif diff --git a/src/gallium/drivers/zink/zink_compiler.c b/src/gallium/drivers/zink/zink_compiler.c index c79fb23a8343..da4d8d76b8ee 100644 --- a/src/gallium/drivers/zink/zink_compiler.c +++ b/src/gallium/drivers/zink/zink_compiler.c @@ -3564,26 +3564,39 @@ is_residency_code(nir_def *src) } } +static bool +lower_sparse_and_instr(nir_builder *b, nir_intrinsic_instr *instr, void *data) +{ + if (instr->intrinsic != nir_intrinsic_sparse_residency_code_and) + return false; + + b->cursor = nir_before_instr(&instr->instr); + nir_def *src0; + if (is_residency_code(instr->src[0].ssa)) + src0 = nir_is_sparse_texels_resident(b, 1, instr->src[0].ssa); + else + src0 = instr->src[0].ssa; + nir_def *src1; + if (is_residency_code(instr->src[1].ssa)) + src1 = nir_is_sparse_texels_resident(b, 1, instr->src[1].ssa); + else + src1 = instr->src[1].ssa; + nir_def *def = nir_iand(b, src0, src1); + nir_def_rewrite_uses_after(&instr->def, def, &instr->instr); + nir_instr_remove(&instr->instr); + return true; +} + +static bool +lower_sparse_and(nir_shader *shader) +{ + return nir_shader_intrinsics_pass(shader, lower_sparse_and_instr, + nir_metadata_dominance, NULL); +} + static bool lower_sparse_instr(nir_builder *b, nir_intrinsic_instr *instr, void *data) { - if (instr->intrinsic == nir_intrinsic_sparse_residency_code_and) { - b->cursor = nir_before_instr(&instr->instr); - nir_def *src0; - if (is_residency_code(instr->src[0].ssa)) - src0 = nir_is_sparse_texels_resident(b, 1, instr->src[0].ssa); - else - src0 = instr->src[0].ssa; - nir_def *src1; - if (is_residency_code(instr->src[1].ssa)) - src1 = nir_is_sparse_texels_resident(b, 1, instr->src[1].ssa); - else - src1 = instr->src[1].ssa; - nir_def *def = nir_iand(b, src0, src1); - nir_def_rewrite_uses_after(&instr->def, def, &instr->instr); - nir_instr_remove(&instr->instr); - return true; - } if (instr->intrinsic != nir_intrinsic_is_sparse_texels_resident) return false; @@ -4021,7 +4034,10 @@ zink_shader_compile(struct zink_screen *screen, bool can_shobj, struct zink_shad zs->can_inline = false; } else if (need_optimize) optimize_nir(nir, zs, true); - NIR_PASS_V(nir, lower_sparse); + bool has_sparse = false; + NIR_PASS(has_sparse, nir, lower_sparse); + if (has_sparse) + optimize_nir(nir, zs, false); struct zink_shader_object obj = compile_module(screen, zs, nir, can_shobj, pg); ralloc_free(nir); @@ -5472,6 +5488,7 @@ 
zink_shader_create(struct zink_screen *screen, struct nir_shader *nir) NIR_PASS_V(nir, lower_basevertex); NIR_PASS_V(nir, lower_baseinstance); + NIR_PASS_V(nir, lower_sparse_and); NIR_PASS_V(nir, split_bitfields); NIR_PASS_V(nir, nir_lower_frexp); /* TODO: Use the spirv instructions for this. */ diff --git a/src/gallium/frontends/d3d10umd/README.md b/src/gallium/frontends/d3d10umd/README.md index ebd408d4558e..36f1df28fbe7 100644 --- a/src/gallium/frontends/d3d10umd/README.md +++ b/src/gallium/frontends/d3d10umd/README.md @@ -3,4 +3,4 @@ This directory has a Gallium state tracker for D3D10 UMD DDI. It still uses TGSI, not NIR. Currently it only supports SW rasterizers. See -src/gallium/targets/d3d10sw/README.md for further details. +src/gallium/targets/d3d10umd/README.md for further details. diff --git a/src/gallium/meson.build b/src/gallium/meson.build index e2b84e192768..5014bdbd700a 100644 --- a/src/gallium/meson.build +++ b/src/gallium/meson.build @@ -167,8 +167,10 @@ else endif if with_gallium_virgl subdir('winsys/virgl/common') - subdir('winsys/virgl/drm') - subdir('winsys/virgl/vtest') + if not with_platform_windows + subdir('winsys/virgl/drm') + subdir('winsys/virgl/vtest') + endif subdir('drivers/virgl') else driver_virgl = declare_dependency() @@ -239,7 +241,7 @@ if with_gallium_st_nine endif if with_gallium_st_d3d10umd subdir('frontends/d3d10umd') - subdir('targets/d3d10sw') + subdir('targets/d3d10umd') endif if with_platform_windows if with_opengl diff --git a/src/gallium/targets/d3d10sw/README.md b/src/gallium/targets/d3d10umd/README.md similarity index 79% rename from src/gallium/targets/d3d10sw/README.md rename to src/gallium/targets/d3d10umd/README.md index 7487bb2bc6a3..0675f9589e6b 100644 --- a/src/gallium/targets/d3d10sw/README.md +++ b/src/gallium/targets/d3d10umd/README.md @@ -1,15 +1,14 @@ -The resulting d3d10sw.dll implements D3D10's software rendering interface, like -WARP. +When compiled with `gallium-drivers=llvmpipe` or `gallium-drivers=softpipe`, the resulting libgallium_d3d10.dll implements D3D10's software rendering interface, like WARP. It can be used directly from WLK 1.6 and WHCK 2.0 D3D10+ tests, via the -Src and -SWDLL options. For example: wgf11blend.exe -Debug -DoNotCatchExceptions -DXGI:1.1 -FeatureLevel:10.0 -Src:SW -SWDLL:libgallium_d3d10.dll -LogClean -LogVerbose However, as of WHCK version 2.1 this mechanism no longer works reliably. Either you use WHCK 2.0 binaries, or you must use the alternative method -cribed below (of copying d3d10sw.dll into the executable directory and rename +described below (of copying libgallium_d3d10.dll into the executable directory and renaming it such that it matches the D3D10 UMD of the test machine).
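Aside on the zink hunks above: NIR_PASS, unlike NIR_PASS_V, accumulates a progress flag from the pass it runs, which is what lets zink_shader_compile skip the extra optimize_nir round unless lower_sparse actually rewrote an intrinsic. A minimal sketch of that progress-gated pattern; lower_something is a hypothetical pass name, not from this patch:

    bool progress = false;
    NIR_PASS(progress, nir, lower_something); /* ORs the pass's return value */
    if (progress)
       optimize_nir(nir, zs, false);          /* clean up only when needed */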
@@ -17,7 +16,7 @@ Examples can be easily modified to load it too: D3D10CreateDeviceAndSwapChain(NULL, D3D10_DRIVER_TYPE_SOFTWARE, - LoadLibraryA("d3d10sw"), /* Software */ + LoadLibraryA("libgallium_d3d10"), /* Software */ Flags, D3D10_SDK_VERSION, &SwapChainDesc, @@ -26,7 +25,7 @@ Examples can be easily modified to load it too: D3D11CreateDeviceAndSwapChain(NULL, /* pAdapter */ D3D_DRIVER_TYPE_SOFTWARE, - LoadLibraryA("d3d10sw"), /* Software */ + LoadLibraryA("libgallium_d3d10"), /* Software */ Flags, FeatureLevels, sizeof FeatureLevels / sizeof FeatureLevels[0], diff --git a/src/gallium/targets/d3d10sw/d3d10_sw.def.in b/src/gallium/targets/d3d10umd/d3d10.def.in similarity index 100% rename from src/gallium/targets/d3d10sw/d3d10_sw.def.in rename to src/gallium/targets/d3d10umd/d3d10.def.in diff --git a/src/gallium/targets/d3d10sw/d3d10_gdi.c b/src/gallium/targets/d3d10umd/d3d10_gdi.c similarity index 100% rename from src/gallium/targets/d3d10sw/d3d10_gdi.c rename to src/gallium/targets/d3d10umd/d3d10_gdi.c diff --git a/src/gallium/targets/d3d10sw/meson.build b/src/gallium/targets/d3d10umd/meson.build similarity index 82% rename from src/gallium/targets/d3d10sw/meson.build rename to src/gallium/targets/d3d10umd/meson.build index 5fe2f5fa39db..570225504343 100644 --- a/src/gallium/targets/d3d10sw/meson.build +++ b/src/gallium/targets/d3d10umd/meson.build @@ -19,17 +19,18 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -d3d10_sw_def = custom_target( - 'd3d10_sw.def', - input: 'd3d10_sw.def.in', - output : 'd3d10_sw.def', +libgallium_d3d10_def = custom_target( + 'd3d10.def', + input: 'd3d10.def.in', + output : 'd3d10.def', command : gen_vs_module_defs_normal_command, ) -libd3d10sw = shared_library( - 'd3d10sw', +gallium_d3d10_name = get_option('gallium-d3d10-dll-name') +libgallium_d3d10 = shared_library( + gallium_d3d10_name, ['d3d10_gdi.c'], - vs_module_defs : d3d10_sw_def, + vs_module_defs : libgallium_d3d10_def, include_directories : [ inc_include, inc_src, inc_gallium, inc_gallium_aux, inc_d3d10umd, inc_gallium_winsys, inc_gallium_winsys_sw, inc_gallium_drivers, inc_winddk ], @@ -40,20 +41,20 @@ libd3d10sw = shared_library( dependencies : [ dep_ws2_32, idep_nir, driver_swrast ], - name_prefix : '', # otherwise mingw will create libd3d10sw.dll + name_prefix: '', install : true, ) if with_tests test( - 'd3d10sw', + 'd3d10', executable( - 'test_d3d10sw', + 'test_d3d10', files('tests/tri.cpp'), cpp_args : [cpp_msvc_compat_args], dependencies : [cpp.find_library('d3d11')], - link_with : [libd3d10sw], + link_with : [libgallium_d3d10], ), - suite : ['d3d10sw'], + suite : ['d3d10'], ) endif diff --git a/src/gallium/targets/d3d10sw/tests/tri.cpp b/src/gallium/targets/d3d10umd/tests/tri.cpp similarity index 99% rename from src/gallium/targets/d3d10sw/tests/tri.cpp rename to src/gallium/targets/d3d10umd/tests/tri.cpp index d7e95d4db725..da3008127717 100644 --- a/src/gallium/targets/d3d10sw/tests/tri.cpp +++ b/src/gallium/targets/d3d10umd/tests/tri.cpp @@ -96,7 +96,7 @@ main(int argc, char *argv[]) D3D_FEATURE_LEVEL_10_0 }; - HMODULE hSoftware = LoadLibraryA("d3d10sw.dll"); + HMODULE hSoftware = LoadLibraryA("libgallium_d3d10.dll"); if (!hSoftware) { return EXIT_FAILURE; } diff --git a/src/gallium/targets/d3d10sw/tests/tri_ps_4_0.h b/src/gallium/targets/d3d10umd/tests/tri_ps_4_0.h old mode 100755 new mode 100644 similarity index 100% rename from src/gallium/targets/d3d10sw/tests/tri_ps_4_0.h rename to src/gallium/targets/d3d10umd/tests/tri_ps_4_0.h diff 
--git a/src/gallium/targets/d3d10sw/tests/tri_vs_4_0.h b/src/gallium/targets/d3d10umd/tests/tri_vs_4_0.h old mode 100755 new mode 100644 similarity index 100% rename from src/gallium/targets/d3d10sw/tests/tri_vs_4_0.h rename to src/gallium/targets/d3d10umd/tests/tri_vs_4_0.h diff --git a/src/gallium/targets/wgl/meson.build b/src/gallium/targets/wgl/meson.build index eaca0aec86b5..d48863da4f94 100644 --- a/src/gallium/targets/wgl/meson.build +++ b/src/gallium/targets/wgl/meson.build @@ -28,7 +28,7 @@ wgl_def = custom_target( command : gen_vs_module_defs_normal_command, ) -gallium_wgl_name = get_option('gallium-windows-dll-name') +gallium_wgl_name = get_option('gallium-wgl-dll-name') libgallium_wgl = shared_library( gallium_wgl_name, ['wgl.c'], diff --git a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c index e3bbd34f00e3..03e33eb55088 100644 --- a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c +++ b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c @@ -478,6 +478,7 @@ virgl_drm_winsys_resource_get_storage_size(struct virgl_winsys *qws, static struct virgl_hw_res * virgl_drm_winsys_resource_create_handle(struct virgl_winsys *qws, struct winsys_handle *whandle, + UNUSED struct pipe_resource *templ, uint32_t *plane, uint32_t *stride, uint32_t *plane_offset, diff --git a/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c b/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c index 4dcc43f48067..fb9daaa57e6e 100644 --- a/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c +++ b/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c @@ -21,6 +21,7 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include +#include "util/macros.h" #include "util/u_surface.h" #include "util/u_memory.h" #include "util/format/u_format.h" @@ -630,6 +631,7 @@ static void virgl_fence_reference(struct virgl_winsys *vws, } static void virgl_vtest_flush_frontbuffer(struct virgl_winsys *vws, + UNUSED struct virgl_cmd_buf *cmdbuf, struct virgl_hw_res *res, unsigned level, unsigned layer, void *winsys_drawable_handle, diff --git a/src/intel/blorp/blorp_genX_exec.h b/src/intel/blorp/blorp_genX_exec.h index cd395aa33b2e..5a1b13ae29f9 100644 --- a/src/intel/blorp/blorp_genX_exec.h +++ b/src/intel/blorp/blorp_genX_exec.h @@ -113,11 +113,15 @@ blorp_get_surface_base_address(struct blorp_batch *batch); #if GFX_VER >= 7 static const struct intel_l3_config * blorp_get_l3_config(struct blorp_batch *batch); -# else +#endif + +static void +blorp_pre_emit_urb_config(struct blorp_batch *batch, + struct intel_urb_config *urb_config); + static void blorp_emit_urb_config(struct blorp_batch *batch, - unsigned vs_entry_size, unsigned sf_entry_size); -#endif + struct intel_urb_config *urb_config); static void blorp_emit_pipeline(struct blorp_batch *batch, @@ -241,14 +245,19 @@ emit_urb_config(struct blorp_batch *batch, #if GFX_VER >= 7 assert(sf_entry_size == 0); - const unsigned entry_size[4] = { vs_entry_size, 1, 1, 1 }; - unsigned entries[4], start[4]; + struct intel_urb_config urb_cfg = { + .size = { vs_entry_size, 1, 1, 1 }, + }; + bool constrained; intel_get_urb_config(batch->blorp->compiler->devinfo, blorp_get_l3_config(batch), - false, false, entry_size, - entries, start, deref_block_size, &constrained); + false, false, &urb_cfg, + deref_block_size, &constrained); + + /* Tell drivers about the config. */ + blorp_pre_emit_urb_config(batch, &urb_cfg); #if GFX_VERx10 == 70 /* From the IVB PRM Vol. 
2, Part 1, Section 3.2.1: @@ -269,9 +278,9 @@ emit_urb_config(struct blorp_batch *batch, for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) { blorp_emit(batch, GENX(3DSTATE_URB_VS), urb) { urb._3DCommandSubOpcode += i; - urb.VSURBStartingAddress = start[i]; - urb.VSURBEntryAllocationSize = entry_size[i] - 1; - urb.VSNumberofURBEntries = entries[i]; + urb.VSURBStartingAddress = urb_cfg.start[i]; + urb.VSURBEntryAllocationSize = urb_cfg.size[i] - 1; + urb.VSNumberofURBEntries = urb_cfg.entries[i]; } } @@ -283,7 +292,10 @@ emit_urb_config(struct blorp_batch *batch, } #else /* GFX_VER < 7 */ - blorp_emit_urb_config(batch, vs_entry_size, sf_entry_size); + struct intel_urb_config urb_cfg = { + .size = { vs_entry_size, 0, 0, 0, sf_entry_size, }, + }; + blorp_emit_urb_config(batch, &urb_cfg); #endif } diff --git a/src/intel/common/intel_l3_config.h b/src/intel/common/intel_l3_config.h index 8ed6c86acade..fa7a6b60de37 100644 --- a/src/intel/common/intel_l3_config.h +++ b/src/intel/common/intel_l3_config.h @@ -106,14 +106,33 @@ enum intel_urb_deref_block_size { INTEL_URB_DEREF_BLOCK_SIZE_MESH = 3, }; +struct intel_urb_config { + unsigned size[5]; + unsigned entries[5]; + unsigned start[5]; +}; + void intel_get_urb_config(const struct intel_device_info *devinfo, const struct intel_l3_config *l3_cfg, bool tess_present, bool gs_present, - const unsigned entry_size[4], - unsigned entries[4], unsigned start[4], + struct intel_urb_config *urb_cfg, enum intel_urb_deref_block_size *deref_block_size, bool *constrained); +/* Returns true if the URB setup changed for the given shader stage. */ +static inline bool +intel_urb_setup_changed(const struct intel_urb_config *a, + const struct intel_urb_config *b, + gl_shader_stage stage) +{ + if (a->size[stage] != b->size[stage] || + a->entries[stage] != b->entries[stage] || + a->start[stage] != b->start[stage]) + return true; + + return false; +} + struct intel_mesh_urb_allocation { unsigned task_entries; unsigned task_entry_size_64b; diff --git a/src/intel/common/intel_urb_config.c b/src/intel/common/intel_urb_config.c index d19645c31e5f..48ec0aef6cf0 100644 --- a/src/intel/common/intel_urb_config.c +++ b/src/intel/common/intel_urb_config.c @@ -64,8 +64,7 @@ void intel_get_urb_config(const struct intel_device_info *devinfo, const struct intel_l3_config *l3_cfg, bool tess_present, bool gs_present, - const unsigned entry_size[4], - unsigned entries[4], unsigned start[4], + struct intel_urb_config *urb_cfg, enum intel_urb_deref_block_size *deref_block_size, bool *constrained) { @@ -110,7 +109,7 @@ intel_get_urb_config(const struct intel_device_info *devinfo, */ unsigned granularity[4]; for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { - granularity[i] = (entry_size[i] < 9) ? 8 : 1; + granularity[i] = (urb_cfg->size[i] < 9) ? 8 : 1; } unsigned min_entries[4] = { @@ -148,7 +147,7 @@ intel_get_urb_config(const struct intel_device_info *devinfo, unsigned entry_size_bytes[4]; for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { - entry_size_bytes[i] = 64 * entry_size[i]; + entry_size_bytes[i] = 64 * urb_cfg->size[i]; } /* Initially, assign each stage the minimum amount of URB space it needs, @@ -208,20 +207,21 @@ intel_get_urb_config(const struct intel_device_info *devinfo, * allocated to each stage.
*/ for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { - entries[i] = chunks[i] * chunk_size_bytes / entry_size_bytes[i]; + urb_cfg->entries[i] = chunks[i] * chunk_size_bytes / entry_size_bytes[i]; /* Since we rounded up when computing wants[], this may be slightly * more than the maximum allowed amount, so correct for that. */ - entries[i] = MIN2(entries[i], devinfo->urb.max_entries[i]); + urb_cfg->entries[i] = MIN2(urb_cfg->entries[i], + devinfo->urb.max_entries[i]); /* Ensure that we program a multiple of the granularity. */ - entries[i] = ROUND_DOWN_TO(entries[i], granularity[i]); + urb_cfg->entries[i] = ROUND_DOWN_TO(urb_cfg->entries[i], granularity[i]); /* Finally, sanity check to make sure we have at least the minimum * number of entries needed for each stage. */ - assert(entries[i] >= min_entries[i]); + assert(urb_cfg->entries[i] >= min_entries[i]); } /* Lay out the URB in pipeline order: push constants, VS, HS, DS, GS. */ @@ -245,12 +245,12 @@ intel_get_urb_config(const struct intel_device_info *devinfo, int next_urb = first_urb; for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { - if (entries[i]) { - start[i] = next_urb; + if (urb_cfg->entries[i]) { + urb_cfg->start[i] = next_urb; next_urb += chunks[i]; } else { /* Put disabled stages at the beginning of the valid range */ - start[i] = first_urb; + urb_cfg->start[i] = first_urb; } } @@ -278,12 +278,12 @@ intel_get_urb_config(const struct intel_device_info *devinfo, if (gs_present) { *deref_block_size = INTEL_URB_DEREF_BLOCK_SIZE_PER_POLY; } else if (tess_present) { - if (entries[MESA_SHADER_TESS_EVAL] < 324) + if (urb_cfg->entries[MESA_SHADER_TESS_EVAL] < 324) *deref_block_size = INTEL_URB_DEREF_BLOCK_SIZE_PER_POLY; else *deref_block_size = INTEL_URB_DEREF_BLOCK_SIZE_32; } else { - if (entries[MESA_SHADER_VERTEX] < 192) + if (urb_cfg->entries[MESA_SHADER_VERTEX] < 192) *deref_block_size = INTEL_URB_DEREF_BLOCK_SIZE_PER_POLY; else *deref_block_size = INTEL_URB_DEREF_BLOCK_SIZE_32; diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 5aad102fa07a..5a4a9ffc6556 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -80,6 +80,7 @@ static const driOptionDescription anv_dri_options[] = { DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS(0) DRI_CONF_ANV_DISABLE_FCV(false) DRI_CONF_ANV_SAMPLE_MASK_OUT_OPENGL_BEHAVIOUR(false) + DRI_CONF_ANV_FORCE_FILTER_ADDR_ROUNDING(false) DRI_CONF_ANV_FP64_WORKAROUND_ENABLED(false) DRI_CONF_ANV_GENERATED_INDIRECT_THRESHOLD(4) DRI_CONF_ANV_GENERATED_INDIRECT_RING_THRESHOLD(100) @@ -958,7 +959,9 @@ get_properties_1_1(const struct anv_physical_device *pdevice, VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT | VK_SUBGROUP_FEATURE_QUAD_BIT | VK_SUBGROUP_FEATURE_ARITHMETIC_BIT | - VK_SUBGROUP_FEATURE_CLUSTERED_BIT; + VK_SUBGROUP_FEATURE_CLUSTERED_BIT | + VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR | + VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT_KHR; p->subgroupQuadOperationsInAllStages = true; p->pointClippingBehavior = VK_POINT_CLIPPING_BEHAVIOR_USER_CLIP_PLANES_ONLY; @@ -2467,6 +2470,8 @@ anv_init_dri_options(struct anv_instance *instance) driQueryOptionb(&instance->dri_options, "limit_trig_input_range"); instance->sample_mask_out_opengl_behaviour = driQueryOptionb(&instance->dri_options, "anv_sample_mask_out_opengl_behaviour"); + instance->force_filter_addr_rounding = + driQueryOptionb(&instance->dri_options, "anv_force_filter_addr_rounding"); instance->lower_depth_range_rate = driQueryOptionf(&instance->dri_options, "lower_depth_range_rate"); 
instance->no_16bit = diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h index bd3dc52f9107..a6a535334315 100644 --- a/src/intel/vulkan/anv_genX.h +++ b/src/intel/vulkan/anv_genX.h @@ -37,6 +37,7 @@ #endif struct intel_sample_positions; +struct intel_urb_config; extern const uint32_t genX(vk_to_intel_cullmode)[]; @@ -88,6 +89,9 @@ void genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer, unsigned width, unsigned height, unsigned scale); +void genX(urb_workaround)(struct anv_cmd_buffer *cmd_buffer, + const struct intel_urb_config *urb_cfg); + void genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer); void genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer); void genX(emit_pipeline_select)(struct anv_batch *batch, uint32_t pipeline, @@ -172,7 +176,8 @@ void genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch, const struct intel_l3_config *l3_config, VkShaderStageFlags active_stages, - const unsigned entry_size[4], + const struct intel_urb_config *urb_cfg_in, + struct intel_urb_config *urb_cfg_out, enum intel_urb_deref_block_size *deref_block_size); void genX(emit_sample_pattern)(struct anv_batch *batch, diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 98ebaf151c90..d06dc723882a 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1172,6 +1172,7 @@ struct anv_instance { uint8_t assume_full_subgroups; bool limit_trig_input_range; bool sample_mask_out_opengl_behaviour; + bool force_filter_addr_rounding; bool fp64_workaround_enabled; float lower_depth_range_rate; unsigned generated_indirect_threshold; @@ -3333,6 +3334,8 @@ struct anv_simple_shader { struct anv_shader_bin *kernel; /* L3 config used by the shader */ const struct intel_l3_config *l3_config; + /* Current URB config */ + const struct intel_urb_config *urb_cfg; /* Managed by the simpler shader helper*/ struct anv_state bt_state; @@ -3443,6 +3446,8 @@ struct anv_cmd_graphics_state { */ bool viewport_set; + struct intel_urb_config urb_cfg; + uint32_t n_occlusion_queries; struct anv_gfx_dynamic_state dyn_state; @@ -4275,6 +4280,9 @@ struct anv_graphics_pipeline { */ uint32_t batch_data[416]; + /* Urb setup utilized by this pipeline. */ + struct intel_urb_config urb_cfg; + /* Fully backed instructions, ready to be emitted in the anv_cmd_buffer */ struct { struct anv_gfx_state_ptr urb; diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c index a4ec021224de..11a0e3c7d929 100644 --- a/src/intel/vulkan/genX_blorp_exec.c +++ b/src/intel/vulkan/genX_blorp_exec.c @@ -255,6 +255,18 @@ blorp_flush_range(struct blorp_batch *batch, void *start, size_t size) */ } +static void +blorp_pre_emit_urb_config(struct blorp_batch *blorp_batch, + struct intel_urb_config *urb_cfg) +{ + struct anv_cmd_buffer *cmd_buffer = blorp_batch->driver_batch; + genX(urb_workaround)(cmd_buffer, urb_cfg); + + /* Update urb config. 
*/ + memcpy(&cmd_buffer->state.gfx.urb_cfg, urb_cfg, + sizeof(struct intel_urb_config)); +} + static const struct intel_l3_config * blorp_get_l3_config(struct blorp_batch *batch) { diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index f7b09ec6e86b..ad833b4ec1bd 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -395,22 +395,18 @@ transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer, 0, base_layer, layer_count, ISL_AUX_OP_AMBIGUATE); } -#if GFX_VER == 12 - /* Depth/Stencil writes by the render pipeline to D16 & S8 formats use a - * different pairing bit for the compression cache line. This means that - * there is potential for aliasing with the wrong cache if you use another - * format OR a piece of HW that does not use the same pairing. To avoid - * this, flush the tile cache as the compression data does not live in the - * color/depth cache. + /* Additional tile cache flush for MTL: + * + * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10420 + * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10530 */ - if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_HIZ_CCS && - final_needs_depth && !initial_depth_valid && - anv_image_format_is_d16_or_s8(image)) { + if (intel_device_info_is_mtl(cmd_buffer->device->info) && + image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_HIZ_CCS && + final_needs_depth && !initial_depth_valid) { anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_TILE_CACHE_FLUSH_BIT, - "D16 or S8 HIZ-CCS flush"); + "HIZ-CCS flush"); } -#endif } /* Transitions a HiZ-enabled depth buffer from one layout to another. Unless @@ -467,17 +463,15 @@ transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer, } } - /* Depth/Stencil writes by the render pipeline to D16 & S8 formats use a - * different pairing bit for the compression cache line. This means that - * there is potential for aliasing with the wrong cache if you use another - * format OR a piece of HW that does not use the same pairing. To avoid - * this, flush the tile cache as the compression data does not live in the - * color/depth cache. 
+ /* Additional tile cache flush for MTL: + * + * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10420 + * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10530 */ - if (anv_image_format_is_d16_or_s8(image)) { + if (intel_device_info_is_mtl(cmd_buffer->device->info)) { anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_TILE_CACHE_FLUSH_BIT, - "D16 or S8 HIZ-CCS flush"); + "HIZ-CCS flush"); } #endif } @@ -2956,6 +2950,7 @@ genX(CmdExecuteCommands)( container->state.current_hash_scale = 0; container->state.gfx.push_constant_stages = 0; container->state.gfx.ds_write_state = false; + memset(&container->state.gfx.urb_cfg, 0, sizeof(struct intel_urb_config)); memcpy(container->state.gfx.dyn_state.dirty, device->gfx_dirty_state, sizeof(container->state.gfx.dyn_state.dirty)); @@ -5513,6 +5508,30 @@ genX(batch_emit_fast_color_dummy_blit)(struct anv_batch *batch, #endif } +void +genX(urb_workaround)(struct anv_cmd_buffer *cmd_buffer, + const struct intel_urb_config *urb_cfg) +{ +#if INTEL_NEEDS_WA_16014912113 + const struct intel_urb_config *current = + &cmd_buffer->state.gfx.urb_cfg; + if (intel_urb_setup_changed(urb_cfg, current, MESA_SHADER_TESS_EVAL) && + current->size[0] != 0) { + for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_VS), urb) { + urb._3DCommandSubOpcode += i; + urb.VSURBStartingAddress = current->start[i]; + urb.VSURBEntryAllocationSize = current->size[i] - 1; + urb.VSNumberofURBEntries = i == 0 ? 256 : 0; + } + } + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.HDCPipelineFlushEnable = true; + } + } +#endif +} + struct anv_state genX(cmd_buffer_begin_companion_rcs_syncpoint)( struct anv_cmd_buffer *cmd_buffer) diff --git a/src/intel/vulkan/genX_cmd_draw_generated_indirect.h b/src/intel/vulkan/genX_cmd_draw_generated_indirect.h index 4eb27d262d5a..89fceb8fac5b 100644 --- a/src/intel/vulkan/genX_cmd_draw_generated_indirect.h +++ b/src/intel/vulkan/genX_cmd_draw_generated_indirect.h @@ -151,6 +151,7 @@ genX(cmd_buffer_emit_indirect_generated_draws_init)(struct anv_cmd_buffer *cmd_b .kernel = device->internal_kernels[ ANV_INTERNAL_KERNEL_GENERATED_DRAWS], .l3_config = device->internal_kernels_l3_config, + .urb_cfg = &cmd_buffer->state.gfx.urb_cfg, }; genX(emit_simple_shader_init)(state); @@ -478,6 +479,7 @@ genX(cmd_buffer_emit_indirect_generated_draws_inring)(struct anv_cmd_buffer *cmd .kernel = device->internal_kernels[ ANV_INTERNAL_KERNEL_GENERATED_DRAWS], .l3_config = device->internal_kernels_l3_config, + .urb_cfg = &cmd_buffer->state.gfx.urb_cfg, }; genX(emit_simple_shader_init)(&simple_state); diff --git a/src/intel/vulkan/genX_gfx_state.c b/src/intel/vulkan/genX_gfx_state.c index 3a66f10c26f8..db853db4b39b 100644 --- a/src/intel/vulkan/genX_gfx_state.c +++ b/src/intel/vulkan/genX_gfx_state.c @@ -1413,9 +1413,15 @@ cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer) &cmd_buffer->vk.dynamic_graphics_state; struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state; - if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_URB)) + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_URB)) { + genX(urb_workaround)(cmd_buffer, &pipeline->urb_cfg); + anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.urb); + memcpy(&gfx->urb_cfg, &pipeline->urb_cfg, + sizeof(struct intel_urb_config)); + } + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE)) anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.ms); diff --git a/src/intel/vulkan/genX_gpu_memcpy.c 
b/src/intel/vulkan/genX_gpu_memcpy.c index 70b0851850f6..21699dc19544 100644 --- a/src/intel/vulkan/genX_gpu_memcpy.c +++ b/src/intel/vulkan/genX_gpu_memcpy.c @@ -53,6 +53,8 @@ gcd_pow2_u64(uint64_t a, uint64_t b) static void emit_common_so_memcpy(struct anv_batch *batch, struct anv_device *device, + const struct intel_urb_config *urb_cfg_in, + struct intel_urb_config *urb_cfg_out, const struct intel_l3_config *l3_config) { anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) { @@ -102,9 +104,11 @@ emit_common_so_memcpy(struct anv_batch *batch, struct anv_device *device, * store the data that VF is going to pass to SOL. */ const unsigned entry_size[4] = { DIV_ROUND_UP(32, 64), 1, 1, 1 }; + memcpy(urb_cfg_out->size, &entry_size, sizeof(entry_size)); genX(emit_urb_setup)(device, batch, l3_config, - VK_SHADER_STAGE_VERTEX_BIT, entry_size, NULL); + VK_SHADER_STAGE_VERTEX_BIT, urb_cfg_in, urb_cfg_out, + NULL); #if GFX_VER >= 12 /* Disable Primitive Replication. */ @@ -258,7 +262,10 @@ genX(emit_so_memcpy_init)(struct anv_memcpy_state *state, genX(emit_l3_config)(batch, device, cfg); genX(emit_pipeline_select)(batch, _3D, device); - emit_common_so_memcpy(batch, device, cfg); + struct intel_urb_config urb_cfg_in = { 0 }; + struct intel_urb_config urb_cfg = { 0 }; + + emit_common_so_memcpy(batch, device, &urb_cfg_in, &urb_cfg, cfg); } void @@ -325,7 +332,11 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer, genX(flush_pipeline_select_3d)(cmd_buffer); + struct intel_urb_config urb_cfg; + emit_common_so_memcpy(&cmd_buffer->batch, cmd_buffer->device, + &cmd_buffer->state.gfx.urb_cfg, + &urb_cfg, cmd_buffer->state.current_l3_config); emit_so_memcpy(&cmd_buffer->batch, cmd_buffer->device, dst, src, size); @@ -334,6 +345,10 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer, 1ull << 32); #endif + /* Update urb config after memcpy. */ + memcpy(&cmd_buffer->state.gfx.urb_cfg, &urb_cfg, + sizeof(struct intel_urb_config)); + /* Flag all the instructions emitted by the memcpy. */ struct anv_gfx_dynamic_state *hw_state = &cmd_buffer->state.gfx.dyn_state; diff --git a/src/intel/vulkan/genX_init_state.c b/src/intel/vulkan/genX_init_state.c index b52023bfda43..a6bd444cf034 100644 --- a/src/intel/vulkan/genX_init_state.c +++ b/src/intel/vulkan/genX_init_state.c @@ -1154,8 +1154,12 @@ VkResult genX(CreateSampler)( const VkFilter mag_filter = plane_has_chroma ? sampler->vk.ycbcr_conversion->state.chroma_filter : pCreateInfo->magFilter; - const bool enable_min_filter_addr_rounding = min_filter != VK_FILTER_NEAREST; - const bool enable_mag_filter_addr_rounding = mag_filter != VK_FILTER_NEAREST; + const bool force_addr_rounding = + device->physical->instance->force_filter_addr_rounding; + const bool enable_min_filter_addr_rounding = + force_addr_rounding || min_filter != VK_FILTER_NEAREST; + const bool enable_mag_filter_addr_rounding = + force_addr_rounding || mag_filter != VK_FILTER_NEAREST; /* From Broadwell PRM, SAMPLER_STATE: * "Mip Mode Filter must be set to MIPFILTER_NONE for Planar YUV surfaces." 
*/ diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c index a104b032271e..87f2ec4d763d 100644 --- a/src/intel/vulkan/genX_pipeline.c +++ b/src/intel/vulkan/genX_pipeline.c @@ -371,27 +371,42 @@ void genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch, const struct intel_l3_config *l3_config, VkShaderStageFlags active_stages, - const unsigned entry_size[4], + const struct intel_urb_config *urb_cfg_in, + struct intel_urb_config *urb_cfg_out, enum intel_urb_deref_block_size *deref_block_size) { const struct intel_device_info *devinfo = device->info; - unsigned entries[4]; - unsigned start[4]; bool constrained; intel_get_urb_config(devinfo, l3_config, active_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT, active_stages & VK_SHADER_STAGE_GEOMETRY_BIT, - entry_size, entries, start, deref_block_size, + urb_cfg_out, deref_block_size, &constrained); +#if INTEL_NEEDS_WA_16014912113 + if (intel_urb_setup_changed(urb_cfg_in, urb_cfg_out, + MESA_SHADER_TESS_EVAL) && urb_cfg_in->size[0] != 0) { + for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) { + anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) { + urb._3DCommandSubOpcode += i; + urb.VSURBStartingAddress = urb_cfg_in->start[i]; + urb.VSURBEntryAllocationSize = urb_cfg_in->size[i] - 1; + urb.VSNumberofURBEntries = i == 0 ? 256 : 0; + } + } + genx_batch_emit_pipe_control(batch, device->info, _3D, + ANV_PIPE_HDC_PIPELINE_FLUSH_BIT); + } +#endif + for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) { anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) { urb._3DCommandSubOpcode += i; - urb.VSURBStartingAddress = start[i]; - urb.VSURBEntryAllocationSize = entry_size[i] - 1; - urb.VSNumberofURBEntries = entries[i]; + urb.VSURBStartingAddress = urb_cfg_out->start[i]; + urb.VSURBEntryAllocationSize = urb_cfg_out->size[i] - 1; + urb.VSNumberofURBEntries = urb_cfg_out->entries[i]; } } #if GFX_VERx10 >= 125 @@ -458,21 +473,18 @@ emit_urb_setup(struct anv_graphics_pipeline *pipeline, return; } #endif - - unsigned entry_size[4]; for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { const struct brw_vue_prog_data *prog_data = !anv_pipeline_has_stage(pipeline, i) ? NULL : (const struct brw_vue_prog_data *) pipeline->base.shaders[i]->prog_data; - entry_size[i] = prog_data ? prog_data->urb_entry_size : 1; + pipeline->urb_cfg.size[i] = prog_data ? 
prog_data->urb_entry_size : 1; } struct anv_device *device = pipeline->base.base.device; const struct intel_device_info *devinfo = device->info; - unsigned entries[4]; - unsigned start[4]; + bool constrained; intel_get_urb_config(devinfo, pipeline->base.base.l3_config, @@ -480,17 +492,18 @@ emit_urb_setup(struct anv_graphics_pipeline *pipeline, VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT, pipeline->base.base.active_stages & VK_SHADER_STAGE_GEOMETRY_BIT, - entry_size, entries, start, deref_block_size, + &pipeline->urb_cfg, deref_block_size, &constrained); for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) { anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_VS), urb) { urb._3DCommandSubOpcode += i; - urb.VSURBStartingAddress = start[i]; - urb.VSURBEntryAllocationSize = entry_size[i] - 1; - urb.VSNumberofURBEntries = entries[i]; + urb.VSURBStartingAddress = pipeline->urb_cfg.start[i]; + urb.VSURBEntryAllocationSize = pipeline->urb_cfg.size[i] - 1; + urb.VSNumberofURBEntries = pipeline->urb_cfg.entries[i]; } } + #if GFX_VERx10 >= 125 if (device->vk.enabled_extensions.EXT_mesh_shader) { anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_TASK), zero); diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c index bc5ce323ad4e..57ccd76c3371 100644 --- a/src/intel/vulkan/genX_query.c +++ b/src/intel/vulkan/genX_query.c @@ -1744,6 +1744,7 @@ copy_query_results_with_shader(struct anv_cmd_buffer *cmd_buffer, ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_COMPUTE : ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_FRAGMENT], .l3_config = device->internal_kernels_l3_config, + .urb_cfg = &cmd_buffer->state.gfx.urb_cfg, }; genX(emit_simple_shader_init)(&state); diff --git a/src/intel/vulkan/genX_simple_shader.c b/src/intel/vulkan/genX_simple_shader.c index 2776f5ef256a..98fe617cff9b 100644 --- a/src/intel/vulkan/genX_simple_shader.c +++ b/src/intel/vulkan/genX_simple_shader.c @@ -103,7 +103,9 @@ genX(emit_simpler_shader_init_fragment)(struct anv_simple_shader *state) * allocate space for the VS. Even though one isn't run, we need VUEs to * store the data that VF is going to pass to SOL. */ - const unsigned entry_size[4] = { DIV_ROUND_UP(32, 64), 1, 1, 1 }; + struct intel_urb_config urb_cfg_out = { + .size = { DIV_ROUND_UP(32, 64), 1, 1, 1 }, + }; genX(emit_l3_config)(batch, device, state->l3_config); @@ -112,7 +114,7 @@ genX(emit_simpler_shader_init_fragment)(struct anv_simple_shader *state) enum intel_urb_deref_block_size deref_block_size; genX(emit_urb_setup)(device, batch, state->l3_config, VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT, - entry_size, &deref_block_size); + state->urb_cfg, &urb_cfg_out, &deref_block_size); anv_batch_emit(batch, GENX(3DSTATE_PS_BLEND), ps_blend) { ps_blend.HasWriteableRT = true; @@ -344,6 +346,10 @@ genX(emit_simpler_shader_init_fragment)(struct anv_simple_shader *state) BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL); } + /* Update urb config after simple shader. 
*/ + memcpy(&state->cmd_buffer->state.gfx.urb_cfg, &urb_cfg_out, + sizeof(struct intel_urb_config)); + state->cmd_buffer->state.gfx.vb_dirty = BITFIELD_BIT(0); state->cmd_buffer->state.gfx.dirty |= ~(ANV_CMD_DIRTY_INDEX_BUFFER | ANV_CMD_DIRTY_XFB_ENABLE); diff --git a/src/intel/vulkan_hasvk/genX_blorp_exec.c b/src/intel/vulkan_hasvk/genX_blorp_exec.c index 34734d05c67d..0d10e5212445 100644 --- a/src/intel/vulkan_hasvk/genX_blorp_exec.c +++ b/src/intel/vulkan_hasvk/genX_blorp_exec.c @@ -250,6 +250,13 @@ blorp_flush_range(struct blorp_batch *batch, void *start, size_t size) */ } +static void +blorp_pre_emit_urb_config(struct blorp_batch *blorp_batch, + struct intel_urb_config *urb_cfg) +{ + /* Dummy. */ +} + static const struct intel_l3_config * blorp_get_l3_config(struct blorp_batch *batch) { diff --git a/src/intel/vulkan_hasvk/genX_pipeline.c b/src/intel/vulkan_hasvk/genX_pipeline.c index 789e109f2bc6..8e50f660c4cd 100644 --- a/src/intel/vulkan_hasvk/genX_pipeline.c +++ b/src/intel/vulkan_hasvk/genX_pipeline.c @@ -276,16 +276,16 @@ genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch, enum intel_urb_deref_block_size *deref_block_size) { const struct intel_device_info *devinfo = device->info; + struct intel_urb_config urb_cfg = { + .size = { entry_size[0], entry_size[1], entry_size[2], entry_size[3], }, + }; - unsigned entries[4]; - unsigned start[4]; bool constrained; intel_get_urb_config(devinfo, l3_config, active_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT, active_stages & VK_SHADER_STAGE_GEOMETRY_BIT, - entry_size, entries, start, deref_block_size, - &constrained); + &urb_cfg, deref_block_size, &constrained); #if GFX_VERx10 == 70 /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1: @@ -306,9 +306,9 @@ genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch, for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) { anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) { urb._3DCommandSubOpcode += i; - urb.VSURBStartingAddress = start[i]; - urb.VSURBEntryAllocationSize = entry_size[i] - 1; - urb.VSNumberofURBEntries = entries[i]; + urb.VSURBStartingAddress = urb_cfg.start[i]; + urb.VSURBEntryAllocationSize = urb_cfg.size[i] - 1; + urb.VSNumberofURBEntries = urb_cfg.entries[i]; } } } diff --git a/src/meson.build b/src/meson.build index 1293538b8f66..fb516e122e6d 100644 --- a/src/meson.build +++ b/src/meson.build @@ -91,7 +91,7 @@ endif if with_gallium_etnaviv subdir('etnaviv') endif -if system_has_kms_drm +if system_has_kms_drm or with_gallium_virgl subdir('virtio') endif if with_gallium_freedreno or with_freedreno_vk or with_tools.contains('freedreno') diff --git a/src/panfrost/util/pan_lower_writeout.c b/src/panfrost/util/pan_lower_writeout.c index 56b33a495b0e..eb528ce3bf3c 100644 --- a/src/panfrost/util/pan_lower_writeout.c +++ b/src/panfrost/util/pan_lower_writeout.c @@ -106,6 +106,7 @@ pan_nir_lower_zs_store(nir_shader *nir) stores[1] = intr; writeout |= PAN_WRITEOUT_S; } else if (sem.dual_source_blend_index) { + assert(!stores[2]); /* there should be only 1 source for dual blending */ stores[2] = intr; writeout |= PAN_WRITEOUT_2; } diff --git a/src/util/00-mesa-defaults.conf b/src/util/00-mesa-defaults.conf index 440e0f2ec814..521ea9321ad3 100644 --- a/src/util/00-mesa-defaults.conf +++ b/src/util/00-mesa-defaults.conf @@ -1189,6 +1189,9 @@ TODO: document the other workarounds. + +
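Aside on the Intel hunks above: struct intel_urb_config folds the previous entry_size[4]/entries[4]/start[4] triplet into one snapshotable value, so command buffers can remember the last-programmed layout and cheaply detect a reallocation. Below is a sketch of the intended consumer flow, assuming the caller tracks the previous config itself; emit_urb_example and the include path are illustrative, not from this patch:

    #include "common/intel_l3_config.h"

    static void
    emit_urb_example(const struct intel_device_info *devinfo,
                     const struct intel_l3_config *l3,
                     struct intel_urb_config *last, /* last-programmed config */
                     unsigned vs_entry_size)
    {
       struct intel_urb_config cfg = {
          .size = { vs_entry_size, 1, 1, 1 }, /* VS/HS/DS/GS, in 64B units */
       };
       enum intel_urb_deref_block_size deref;
       bool constrained;

       /* Fills cfg.entries[] and cfg.start[] from cfg.size[] and the L3 layout. */
       intel_get_urb_config(devinfo, l3, false /* tess */, false /* gs */,
                            &cfg, &deref, &constrained);

       if (intel_urb_setup_changed(last, &cfg, MESA_SHADER_TESS_EVAL)) {
          /* Wa_16014912113 path: re-emit the previous allocation with 256 VS
           * entries and an HDC flush before reprogramming, as the
           * genX(urb_workaround) / genX(emit_urb_setup) hunks above do. */
       }

       /* ... emit 3DSTATE_URB_VS..GS from cfg.start/size/entries ... */

       *last = cfg; /* snapshot for the next change detection */
    }

One struct copy replaces three array copies, which is why the anv hunks can keep the config in anv_cmd_graphics_state and reset it with a single memset when secondary command buffers execute.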