diff --git a/meson_options.txt b/meson_options.txt
index c95b65415a93..ac9ab64e7a06 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -177,13 +177,21 @@ option(
 )
 
 option(
-  'gallium-windows-dll-name',
+  'gallium-wgl-dll-name',
   type : 'string',
   value : 'libgallium_wgl',
-  description : 'name of gallium megadriver DLL built for Windows. ' +
+  description : 'name of gallium wgl target DLL built for Windows. ' +
                 'defaults to libgallium_wgl.dll to match DRI',
 )
 
+option(
+  'gallium-d3d10-dll-name',
+  type : 'string',
+  value : 'libgallium_d3d10',
+  description : 'name of gallium d3d10 target DLL built for Windows. ' +
+                'defaults to libgallium_d3d10.dll to match DRI',
+)
+
 option(
   'opencl-spirv',
   type : 'boolean',
diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp
index 49508e961379..1500984a1213 100644
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@@ -468,6 +468,35 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
       out.push_back(encoding);
       break;
    }
+   case Format::VOPD: {
+      VOPD_instruction& vopd = instr->vopd();
+      uint32_t encoding = (0b110010 << 26);
+      encoding |= reg(ctx, instr->operands[0]);
+      if (instr->opcode != aco_opcode::v_dual_mov_b32)
+         encoding |= reg(ctx, instr->operands[1], 8) << 9;
+      encoding |= (uint32_t)ctx.opcode[(int)vopd.opy] << 17;
+      encoding |= opcode << 22;
+      out.push_back(encoding);
+
+      unsigned opy_start = instr->opcode == aco_opcode::v_dual_mov_b32 ? 1 : 2;
+      switch (instr->opcode) {
+      case aco_opcode::v_dual_fmac_f32:
+      case aco_opcode::v_dual_fmaak_f32:
+      case aco_opcode::v_dual_fmamk_f32:
+      case aco_opcode::v_dual_cndmask_b32:
+      case aco_opcode::v_dual_dot2acc_f32_f16:
+      case aco_opcode::v_dual_dot2acc_f32_bf16: opy_start = 3; break;
+      default: break;
+      }
+
+      encoding = reg(ctx, instr->operands[opy_start]);
+      if (vopd.opy != aco_opcode::v_dual_mov_b32)
+         encoding |= reg(ctx, instr->operands[opy_start + 1], 8) << 9;
+      encoding |= (reg(ctx, instr->definitions[1], 8) >> 1) << 17;
+      encoding |= reg(ctx, instr->definitions[0], 8) << 24;
+      out.push_back(encoding);
+      break;
+   }
    case Format::DS: {
       DS_instruction& ds = instr->ds();
       uint32_t encoding = (0b110110 << 26);
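Note: GFX11's VOPD format packs two VALU operations (OPX and OPY) into one 64-bit
encoding so that they dual-issue in wave32. As a reading aid for the assembler hunk
above, here is a rough sketch of the field layout it emits. The helper-style names
and flat variable list are invented for illustration, and opx_op/opy_op are the
hardware opcode numbers (via ctx.opcode[]), not aco_opcode values:

    /* First dword: OPX sources, both opcodes, VOPD format tag. */
    uint32_t w0 = 0b110010u << 26;
    w0 |= srcx0;              /* OPX src0: 9-bit VGPR/SGPR/inline-constant field */
    w0 |= vsrcx1 << 9;        /* OPX vsrc1 (8-bit VGPR); absent for v_dual_mov_b32 */
    w0 |= opy_op << 17;       /* OPY hardware opcode */
    w0 |= opx_op << 22;       /* OPX hardware opcode */

    /* Second dword: OPY sources and both destinations. */
    uint32_t w1 = srcy0;
    w1 |= vsrcy1 << 9;
    w1 |= (vdsty >> 1) << 17; /* OPY vdst without its LSB: the two halves must
                               * write registers of opposite parity, so the bit
                               * is implied by OPX's destination. */
    w1 |= vdstx << 24;        /* OPX vdst, full 8 bits */

The opy_start logic exists because the OPX operand count varies: v_dual_mov_b32
contributes one operand, most pairs two, and the fmac/fmaak/fmamk/cndmask/dot2acc
variants three.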
diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py
index e4518a0f9c58..f5106a298472 100644
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -578,6 +578,7 @@ class Builder {
    ("vopc_sdwa", [Format.VOPC, Format.SDWA], 'SDWA_instruction', itertools.product([1, 2], [2])),
    ("vop3", [Format.VOP3], 'VALU_instruction', [(1, 3), (1, 2), (1, 1), (2, 2)]),
    ("vop3p", [Format.VOP3P], 'VALU_instruction', [(1, 2), (1, 3)]),
+   ("vopd", [Format.VOPD], 'VOPD_instruction', [(2, 2), (2, 3), (2, 4), (2, 5), (2, 6)]),
    ("vinterp_inreg", [Format.VINTERP_INREG], 'VINTERP_inreg_instruction', [(1, 3)]),
    ("vintrp", [Format.VINTRP], 'VINTRP_instruction', [(1, 2), (1, 3)]),
    ("vop1_dpp", [Format.VOP1, Format.DPP16], 'DPP16_instruction', [(1, 1)]),
diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp
index b405fbd82cbe..f6bc400b444e 100644
--- a/src/amd/compiler/aco_interface.cpp
+++ b/src/amd/compiler/aco_interface.cpp
@@ -59,6 +59,7 @@ static const std::array<aco_compiler_statistic_info, aco_num_statistics> statist
    ret[aco_statistic_salu] = aco_compiler_statistic_info{"SALU", "Number of SALU instructions"};
    ret[aco_statistic_vmem] = aco_compiler_statistic_info{"VMEM", "Number of VMEM instructions"};
    ret[aco_statistic_smem] = aco_compiler_statistic_info{"SMEM", "Number of SMEM instructions"};
+   ret[aco_statistic_vopd] = aco_compiler_statistic_info{"VOPD", "Number of VOPD instructions"};
    return ret;
 }();
 
@@ -199,6 +200,9 @@ aco_postprocess_shader(const struct aco_compiler_options* options,
       aco::lower_to_hw_instr(program.get());
       validate(program.get());
 
+      if (!options->optimisations_disabled && !(aco::debug_flags & aco::DEBUG_NO_SCHED_VOPD))
+         aco::schedule_vopd(program.get());
+
       /* Schedule hardware instructions for ILP */
       if (!options->optimisations_disabled && !(aco::debug_flags & aco::DEBUG_NO_SCHED_ILP))
         aco::schedule_ilp(program.get());
diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp
index ea49a10da5c7..1a54f7de2e49 100644
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -45,8 +45,9 @@ static const struct debug_control aco_debug_options[] = {
    {"force-waitdeps", DEBUG_FORCE_WAITDEPS},
    {"novn", DEBUG_NO_VN},
    {"noopt", DEBUG_NO_OPT},
-   {"nosched", DEBUG_NO_SCHED | DEBUG_NO_SCHED_ILP},
+   {"nosched", DEBUG_NO_SCHED | DEBUG_NO_SCHED_ILP | DEBUG_NO_SCHED_VOPD},
    {"nosched-ilp", DEBUG_NO_SCHED_ILP},
+   {"nosched-vopd", DEBUG_NO_SCHED_VOPD},
    {"perfinfo", DEBUG_PERF_INFO},
    {"liveinfo", DEBUG_LIVE_INFO},
    {NULL, 0}};
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index cd4ceb221ce8..3863cf11b720 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -58,6 +58,7 @@ enum {
    DEBUG_FORCE_WAITDEPS = 0x200,
    DEBUG_NO_VALIDATE_IR = 0x400,
    DEBUG_NO_SCHED_ILP = 0x800,
+   DEBUG_NO_SCHED_VOPD = 0x1000,
 };
 
 enum storage_class : uint8_t {
@@ -957,6 +958,7 @@ struct Pseudo_reduction_instruction;
 struct VALU_instruction;
 struct VINTERP_inreg_instruction;
 struct VINTRP_instruction;
+struct VOPD_instruction;
 struct DPP16_instruction;
 struct DPP8_instruction;
 struct SDWA_instruction;
@@ -1210,6 +1212,17 @@ struct Instruction {
       return *(VINTERP_inreg_instruction*)this;
    }
    constexpr bool isVINTERP_INREG() const noexcept { return format == Format::VINTERP_INREG; }
+   VOPD_instruction& vopd() noexcept
+   {
+      assert(isVOPD());
+      return *(VOPD_instruction*)this;
+   }
+   const VOPD_instruction& vopd() const noexcept
+   {
+      assert(isVOPD());
+      return *(VOPD_instruction*)this;
+   }
+   constexpr bool isVOPD() const noexcept { return format == Format::VOPD; }
    constexpr bool isVOP1() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP1; }
    constexpr bool isVOP2() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP2; }
    constexpr bool isVOPC() const noexcept { return (uint16_t)format & (uint16_t)Format::VOPC; }
@@ -1278,7 +1291,8 @@ struct Instruction {
    }
    constexpr bool isVALU() const noexcept
    {
-      return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P() || isVINTERP_INREG();
+      return isVOP1() || isVOP2() || isVOPC() || isVOP3() || isVOP3P() || isVINTERP_INREG() ||
+             isVOPD();
    }
 
    constexpr bool isSALU() const noexcept
@@ -1368,6 +1382,12 @@ struct VINTERP_inreg_instruction : public VALU_instruction {
 static_assert(sizeof(VINTERP_inreg_instruction) == sizeof(VALU_instruction) + 4,
               "Unexpected padding");
 
+struct VOPD_instruction : public VALU_instruction {
+   aco_opcode opy;
+   uint16_t padding;
+};
+static_assert(sizeof(VOPD_instruction) == sizeof(VALU_instruction) + 4, "Unexpected padding");
+
 /**
  * Data Parallel Primitives Format:
  * This format can be used for VOP1, VOP2 or VOPC instructions.
@@ -2209,6 +2229,7 @@ void ssa_elimination(Program* program);
 void lower_to_hw_instr(Program* program);
 void schedule_program(Program* program, live& live_vars);
 void schedule_ilp(Program* program);
+void schedule_vopd(Program* program);
 void spill(Program* program, live& live_vars);
 void insert_wait_states(Program* program);
 bool dealloc_vgprs(Program* program);
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index 1cf23b5e061b..4a512113c3f4 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -89,6 +89,7 @@ class Format(IntEnum):
    VINTRP = auto()
    # Vector ALU Formats
    VINTERP_INREG = auto()
+   VOPD = auto()
    VOP1 = 1 << 7
    VOP2 = 1 << 8
    VOPC = 1 << 9
@@ -186,6 +187,8 @@ def get_builder_fields(self):
       elif self == Format.VOP3P:
          return [('uint8_t', 'opsel_lo', None),
                  ('uint8_t', 'opsel_hi', None)]
+      elif self == Format.VOPD:
+         return [('aco_opcode', 'opy', None)]
       elif self == Format.VINTERP_INREG:
          return [('unsigned', 'wait_exp', 7),
                  ('uint8_t', 'opsel', 0)]
@@ -1272,6 +1275,29 @@ def default_class(opcodes, cls):
    opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOP3, cls, in_mod, out_mod,
           definitions = defs, operands = ops)
 
+VOPD = {
+   (0x00, "v_dual_fmac_f32"),
+   (0x01, "v_dual_fmaak_f32"),
+   (0x02, "v_dual_fmamk_f32"),
+   (0x03, "v_dual_mul_f32"),
+   (0x04, "v_dual_add_f32"),
+   (0x05, "v_dual_sub_f32"),
+   (0x06, "v_dual_subrev_f32"),
+   (0x07, "v_dual_mul_dx9_zero_f32"),
+   (0x08, "v_dual_mov_b32"),
+   (0x09, "v_dual_cndmask_b32"),
+   (0x0a, "v_dual_max_f32"),
+   (0x0b, "v_dual_min_f32"),
+   (0x0c, "v_dual_dot2acc_f32_f16"),
+   (0x0d, "v_dual_dot2acc_f32_bf16"),
+   (0x10, "v_dual_add_nc_u32"),
+   (0x11, "v_dual_lshlrev_b32"),
+   (0x12, "v_dual_and_b32"),
+}
+for gfx11, name in VOPD:
+   opcode(name, -1, -1, -1, gfx11, format = Format.VOPD, cls = InstrClass.Valu32)
+
+
 # DS instructions: 3 inputs (1 addr, 2 data), 1 output
 DS = {
    (0x00, 0x00, 0x00, 0x00, 0x00, 0x00, "ds_add_u32"),
diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp
index a0ed4ebae89a..5a7ae9d94b93 100644
--- a/src/amd/compiler/aco_print_ir.cpp
+++ b/src/amd/compiler/aco_print_ir.cpp
@@ -443,6 +443,12 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
       fprintf(output, " attr%d.%c", vintrp.attribute, "xyzw"[vintrp.component]);
       break;
    }
+   case Format::VOPD: {
+      const VOPD_instruction& vopd = instr->vopd();
+      // TODO: beautify
+      fprintf(output, " %s", instr_info.name[(int)vopd.opy]);
+      break;
+   }
    case Format::DS: {
       const DS_instruction& ds = instr->ds();
       if (ds.offset0)
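Note: in the IR, a dual-issue pair is represented as a single VOPD_instruction: the
regular opcode field holds OPX, the trailing opy field holds OPY, the operands of
both halves are concatenated (OPX's first), and definitions 0/1 belong to OPX/OPY
respectively. A minimal construction sketch under those assumptions, mirroring what
the new Builder vopd() overloads and the scheduler below do (register choices are
purely illustrative):

    /* v_dual_mov_b32 v0, v2 :: v_dual_add_f32 v1, v3, v4 */
    VOPD_instruction* instr = create_instruction<VOPD_instruction>(
       aco_opcode::v_dual_mov_b32, Format::VOPD, /*operands*/ 3, /*definitions*/ 2);
    instr->opy = aco_opcode::v_dual_add_f32;
    instr->definitions[0] = Definition(PhysReg(256 + 0), v1); /* OPX dst: v0 (even) */
    instr->definitions[1] = Definition(PhysReg(256 + 1), v1); /* OPY dst: v1 (odd)  */
    instr->operands[0] = Operand(PhysReg(256 + 2), v1);       /* OPX src0: v2 */
    instr->operands[1] = Operand(PhysReg(256 + 3), v1);       /* OPY src0: v3 */
    instr->operands[2] = Operand(PhysReg(256 + 4), v1);       /* OPY src1: v4 */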
diff --git a/src/amd/compiler/aco_scheduler_ilp.cpp b/src/amd/compiler/aco_scheduler_ilp.cpp
index 007386cef7b6..c6a288fe5ef8 100644
--- a/src/amd/compiler/aco_scheduler_ilp.cpp
+++ b/src/amd/compiler/aco_scheduler_ilp.cpp
@@ -27,6 +27,16 @@ constexpr unsigned num_nodes = 16;
 using mask_t = uint16_t;
 static_assert(std::numeric_limits<mask_t>::digits >= num_nodes);
 
+struct VOPDInfo {
+   VOPDInfo() : is_opy_only(0), is_dst_odd(0), src_banks(0), has_literal(0) {}
+   uint16_t is_opy_only : 1;
+   uint16_t is_dst_odd : 1;
+   uint16_t src_banks : 10; /* 0-3: src0, 4-7: src1, 8-9: src2 */
+   uint16_t has_literal : 1;
+   aco_opcode op = aco_opcode::num_opcodes;
+   uint32_t literal = 0;
+};
+
 struct InstrInfo {
    Instruction* instr;
    int32_t priority;
@@ -46,12 +56,21 @@ struct RegisterInfo {
 
 struct SchedILPContext {
    Program* program;
+   bool is_vopd = false;
    InstrInfo nodes[num_nodes];
    RegisterInfo regs[512];
    mask_t non_reorder_mask = 0; /* bitmask of instruction nodes which should not be reordered. */
    mask_t active_mask = 0;      /* bitmask of valid instruction nodes. */
    uint8_t next_non_reorderable = UINT8_MAX; /* index of next node which should not be reordered. */
    uint8_t last_non_reorderable = UINT8_MAX; /* index of last node which should not be reordered. */
+
+   /* VOPD scheduler: */
+   VOPDInfo vopd[num_nodes];
+   VOPDInfo prev_vopd_info;
+   InstrInfo prev_info;
+
+   mask_t vopd_odd_mask = 0;
+   mask_t vopd_even_mask = 0;
 };
 
 /**
@@ -98,6 +117,117 @@ can_reorder(const Instruction* const instr)
    return true;
 }
 
+VOPDInfo
+get_vopd_info(const Instruction* instr)
+{
+   if (instr->format != Format::VOP1 && instr->format != Format::VOP2)
+      return VOPDInfo();
+
+   VOPDInfo info;
+   switch (instr->opcode) {
+   case aco_opcode::v_fmac_f32: info.op = aco_opcode::v_dual_fmac_f32; break;
+   case aco_opcode::v_fmaak_f32: info.op = aco_opcode::v_dual_fmaak_f32; break;
+   case aco_opcode::v_fmamk_f32: info.op = aco_opcode::v_dual_fmamk_f32; break;
+   case aco_opcode::v_mul_f32: info.op = aco_opcode::v_dual_mul_f32; break;
+   case aco_opcode::v_add_f32: info.op = aco_opcode::v_dual_add_f32; break;
+   case aco_opcode::v_sub_f32: info.op = aco_opcode::v_dual_sub_f32; break;
+   case aco_opcode::v_subrev_f32: info.op = aco_opcode::v_dual_subrev_f32; break;
+   case aco_opcode::v_mul_legacy_f32: info.op = aco_opcode::v_dual_mul_dx9_zero_f32; break;
+   case aco_opcode::v_mov_b32: info.op = aco_opcode::v_dual_mov_b32; break;
+   case aco_opcode::v_cndmask_b32: info.op = aco_opcode::v_dual_cndmask_b32; break;
+   case aco_opcode::v_max_f32: info.op = aco_opcode::v_dual_max_f32; break;
+   case aco_opcode::v_min_f32: info.op = aco_opcode::v_dual_min_f32; break;
+   case aco_opcode::v_dot2c_f32_f16: info.op = aco_opcode::v_dual_dot2acc_f32_f16; break;
+   case aco_opcode::v_add_u32:
+      info.op = aco_opcode::v_dual_add_nc_u32;
+      info.is_opy_only = true;
+      break;
+   case aco_opcode::v_lshlrev_b32:
+      info.op = aco_opcode::v_dual_lshlrev_b32;
+      info.is_opy_only = true;
+      break;
+   case aco_opcode::v_and_b32:
+      info.op = aco_opcode::v_dual_and_b32;
+      info.is_opy_only = true;
+      break;
+   default: return VOPDInfo();
+   }
+
+   /* Each instruction may use at most one SGPR. */
+   if (instr->opcode == aco_opcode::v_cndmask_b32 && instr->operands[0].isOfType(RegType::sgpr))
+      return VOPDInfo();
+
+   info.is_dst_odd = instr->definitions[0].physReg().reg() & 0x1;
+
+   static const unsigned bank_mask[3] = {0x3, 0x3, 0x1};
+   bool has_sgpr = false;
+   for (unsigned i = 0; i < instr->operands.size(); i++) {
+      unsigned port = (instr->opcode == aco_opcode::v_fmamk_f32 && i == 1) ? 2 : i;
+      if (instr->operands[i].isOfType(RegType::vgpr))
+         info.src_banks |= 1 << (port * 4 + (instr->operands[i].physReg().reg() & bank_mask[port]));
+
+      /* Check all operands because of fmaak/fmamk. */
+      if (instr->operands[i].isLiteral()) {
+         assert(!info.has_literal || info.literal == instr->operands[i].constantValue());
+         info.has_literal = true;
+         info.literal = instr->operands[i].constantValue();
+      }
+
+      /* Check all operands because of cndmask. */
+      has_sgpr |= !instr->operands[i].isConstant() && instr->operands[i].isOfType(RegType::sgpr);
+   }
+
+   /* An instruction can't use both a literal and an SGPR. */
+   if (has_sgpr && info.has_literal)
+      return VOPDInfo();
+
+   return info;
+}
+
+bool
+can_use_vopd(const SchedILPContext& ctx, unsigned idx)
+{
+   VOPDInfo cur_vopd = ctx.vopd[idx];
+   Instruction* first = ctx.nodes[idx].instr;
+   Instruction* second = ctx.prev_info.instr;
+
+   if (!second)
+      return false;
+
+   if (ctx.prev_vopd_info.op == aco_opcode::num_opcodes || cur_vopd.op == aco_opcode::num_opcodes)
+      return false;
+
+   if ((ctx.prev_vopd_info.src_banks & cur_vopd.src_banks) ||
+       (ctx.prev_vopd_info.is_opy_only & cur_vopd.is_opy_only) ||
+       (ctx.prev_vopd_info.is_dst_odd == cur_vopd.is_dst_odd)) {
+      return false;
+   }
+
+   /* Both can use a literal, but it must be the same literal. */
+   if (ctx.prev_vopd_info.has_literal && cur_vopd.has_literal &&
+       ctx.prev_vopd_info.literal != cur_vopd.literal)
+      return false;
+
+   assert(first->definitions.size() == 1);
+   assert(first->definitions[0].size() == 1);
+   assert(second->definitions.size() == 1);
+   assert(second->definitions[0].size() == 1);
+
+   /* Check for WaW dependency. */
+   if (first->definitions[0].physReg() == second->definitions[0].physReg())
+      return false;
+
+   /* Check for RaW dependency. */
+   for (Operand op : second->operands) {
+      assert(op.size() == 1);
+      if (first->definitions[0].physReg() == op.physReg())
+         return false;
+   }
+
+   /* WaR dependencies are not a concern. */
+   return true;
+}
+
 unsigned
 get_latency(const Instruction* const instr)
 {
@@ -138,6 +268,16 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
    bool reorder = can_reorder(instr);
    ctx.active_mask |= mask;
 
+   if (ctx.is_vopd) {
+      VOPDInfo vopd = get_vopd_info(entry.instr);
+
+      ctx.vopd[idx] = vopd;
+      ctx.vopd_odd_mask &= ~mask;
+      ctx.vopd_odd_mask |= vopd.is_dst_odd ? mask : 0;
+      ctx.vopd_even_mask &= ~mask;
+      ctx.vopd_even_mask |= vopd.is_dst_odd || vopd.op == aco_opcode::num_opcodes ? 0 : mask;
+   }
+
    for (const Operand& op : instr->operands) {
       assert(op.isFixed());
       unsigned reg = op.physReg();
@@ -206,8 +346,10 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
          reg_info.has_direct_dependency = 1;
          reg_info.direct_dependency = idx;
 
-         /* Add latency information for the next register read. */
-         reg_info.latency = get_latency(instr);
+         if (!ctx.is_vopd) {
+            /* Add latency information for the next register read. */
+            reg_info.latency = get_latency(instr);
+         }
       }
    }
 
@@ -225,7 +367,7 @@ add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
 
    /* Just don't reorder these at all. */
    if (!is_memory_instr(instr) || instr->definitions.empty() ||
-       get_sync_info(instr).semantics & semantic_volatile) {
+       get_sync_info(instr).semantics & semantic_volatile || ctx.is_vopd) {
       /* Add all previous instructions as dependencies. */
       entry.dependency_mask = ctx.active_mask;
    }
@@ -343,7 +485,7 @@ collect_clause_dependencies(const SchedILPContext& ctx, const uint8_t next, mask
  * Returns the index of the next instruction to be selected.
  */
 unsigned
-select_instruction(const SchedILPContext& ctx)
+select_instruction_ilp(const SchedILPContext& ctx)
 {
    mask_t mask = ctx.active_mask;
 
@@ -377,6 +519,145 @@ select_instruction(const SchedILPContext& ctx)
    return idx;
 }
 
+bool
+compare_nodes_vopd(const SchedILPContext& ctx, int num_vopd_odd_minus_even, bool* use_vopd,
+                   unsigned current, unsigned candidate)
+{
+   if (can_use_vopd(ctx, candidate)) {
+      /* If we can form a VOPD instruction, always prefer to do so. */
+      if (!*use_vopd) {
+         *use_vopd = true;
+         return true;
+      }
+   } else {
+      if (*use_vopd)
+         return false;
+
+      /* Neither current nor candidate can form a VOPD instruction with the previously scheduled
+       * instruction. */
+      VOPDInfo current_vopd = ctx.vopd[current];
+      VOPDInfo candidate_vopd = ctx.vopd[candidate];
+
+      /* Delay scheduling VOPD-capable instructions in case an opportunity appears later. */
+      bool current_vopd_capable = current_vopd.op != aco_opcode::num_opcodes;
+      bool candidate_vopd_capable = candidate_vopd.op != aco_opcode::num_opcodes;
+      if (current_vopd_capable != candidate_vopd_capable)
+         return !candidate_vopd_capable;
+
+      /* If we have to select from VOPD-capable instructions, prefer maintaining a balance of
+       * odd/even instructions, in case selecting this instruction fails to make a pair.
+       */
+      if (current_vopd_capable && num_vopd_odd_minus_even != 0) {
+         assert(candidate_vopd_capable);
+         bool prefer_vopd_dst_odd = num_vopd_odd_minus_even > 0;
+         if (current_vopd.is_dst_odd != candidate_vopd.is_dst_odd)
+            return prefer_vopd_dst_odd ? candidate_vopd.is_dst_odd : !candidate_vopd.is_dst_odd;
+      }
+   }
+
+   return ctx.nodes[candidate].priority > ctx.nodes[current].priority;
+}
+
+unsigned
+select_instruction_vopd(const SchedILPContext& ctx, bool* use_vopd)
+{
+   *use_vopd = false;
+
+   mask_t mask = ctx.active_mask;
+   if (ctx.next_non_reorderable != UINT8_MAX)
+      mask = ctx.nodes[ctx.next_non_reorderable].dependency_mask;
+
+   if (mask == 0)
+      return ctx.next_non_reorderable;
+
+   int num_vopd_odd_minus_even =
+      (int)util_bitcount(ctx.vopd_odd_mask & mask) - (int)util_bitcount(ctx.vopd_even_mask & mask);
+
+   unsigned cur = -1u;
+   u_foreach_bit (i, mask) {
+      const InstrInfo& candidate = ctx.nodes[i];
+
+      /* Check if the candidate has pending dependencies. */
+      if (candidate.dependency_mask)
+         continue;
+
+      if (cur == -1u) {
+         cur = i;
+         *use_vopd = can_use_vopd(ctx, i);
+      } else if (compare_nodes_vopd(ctx, num_vopd_odd_minus_even, use_vopd, cur, i)) {
+         cur = i;
+      }
+   }
+
+   assert(cur != -1u);
+   return cur;
+}
+
+Instruction*
+create_vopd_instruction(const SchedILPContext& ctx, unsigned idx)
+{
+   Instruction* x = ctx.prev_info.instr;
+   Instruction* y = ctx.nodes[idx].instr;
+   aco_opcode opx = ctx.prev_vopd_info.op;
+   aco_opcode opy = ctx.vopd[idx].op;
+   if (ctx.prev_vopd_info.is_opy_only) {
+      std::swap(x, y);
+      std::swap(opx, opy);
+   }
+
+   VOPD_instruction* instr = create_instruction<VOPD_instruction>(
+      opx, Format::VOPD, x->operands.size() + y->operands.size(), 2);
+   instr->opy = opy;
+   instr->definitions[0] = x->definitions[0];
+   instr->definitions[1] = y->definitions[0];
+   std::copy(x->operands.begin(), x->operands.end(), instr->operands.begin());
+   std::copy(y->operands.begin(), y->operands.end(),
+             std::next(instr->operands.begin(), x->operands.size()));
+
+   return instr;
+}
+
+template <typename It>
+void
+do_schedule(SchedILPContext& ctx, It& insert_it, It& remove_it, It instructions_begin,
+            It instructions_end)
+{
+   for (unsigned i = 0; i < num_nodes; i++) {
+      if (remove_it == instructions_end)
+         break;
+
+      add_entry(ctx, (remove_it++)->get(), i);
+   }
+
+   ctx.prev_info.instr = NULL;
+   bool use_vopd = false;
+
+   while (ctx.active_mask) {
+      unsigned next_idx =
+         ctx.is_vopd ? select_instruction_vopd(ctx, &use_vopd) : select_instruction_ilp(ctx);
+      Instruction* next_instr = ctx.nodes[next_idx].instr;
+
+      if (use_vopd) {
+         std::prev(insert_it)->reset(create_vopd_instruction(ctx, next_idx));
+         ctx.prev_info.instr = NULL;
+      } else {
+         (insert_it++)->reset(next_instr);
+         ctx.prev_info = ctx.nodes[next_idx];
+         ctx.prev_vopd_info = ctx.vopd[next_idx];
+      }
+
+      remove_entry(ctx, next_instr, next_idx);
+      ctx.nodes[next_idx].instr = NULL;
+
+      if (remove_it != instructions_end) {
+         add_entry(ctx, (remove_it++)->get(), next_idx);
+      } else if (ctx.last_non_reorderable != UINT8_MAX) {
+         ctx.nodes[ctx.last_non_reorderable].potential_clause = false;
+         ctx.last_non_reorderable = UINT8_MAX;
+      }
+   }
+}
+
 } // namespace
 
 void
@@ -386,29 +667,26 @@ schedule_ilp(Program* program)
 
    for (Block& block : program->blocks) {
       auto it = block.instructions.begin();
-      for (unsigned i = 0; i < num_nodes; i++) {
-         if (it == block.instructions.end())
-            break;
+      auto insert_it = block.instructions.begin();
+      do_schedule(ctx, insert_it, it, block.instructions.begin(), block.instructions.end());
+      block.instructions.resize(insert_it - block.instructions.begin());
+   }
+}
 
-         add_entry(ctx, (it++)->get(), i);
-      }
+void
+schedule_vopd(Program* program)
+{
+   if (program->gfx_level < GFX11 || program->wave_size != 32)
+      return;
 
-      auto insert_it = block.instructions.begin();
-      while (insert_it != block.instructions.end()) {
-         unsigned next_idx = select_instruction(ctx);
-         Instruction* next_instr = ctx.nodes[next_idx].instr;
-         remove_entry(ctx, next_instr, next_idx);
-         (insert_it++)->reset(next_instr);
-         ctx.nodes[next_idx].instr = NULL;
+   SchedILPContext ctx = {program};
+   ctx.is_vopd = true;
 
-         if (it != block.instructions.end()) {
-            add_entry(ctx, (it++)->get(), next_idx);
-         } else if (ctx.last_non_reorderable != UINT8_MAX) {
-            ctx.nodes[ctx.last_non_reorderable].potential_clause = false;
-            ctx.last_non_reorderable = UINT8_MAX;
-         }
-      }
-      assert(it == block.instructions.end());
+   for (Block& block : program->blocks) {
+      auto it = block.instructions.rbegin();
+      auto insert_it = block.instructions.rbegin();
+      do_schedule(ctx, insert_it, it, block.instructions.rbegin(), block.instructions.rend());
+      block.instructions.erase(block.instructions.begin(), insert_it.base());
    }
 }
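Note: the pairing rules in get_vopd_info()/can_use_vopd() above encode the GFX11 VOPD
restrictions: the two halves must not read the same VGPR bank (src0 and src1 each
select one of four banks, src2 one of two), at most one half may use an OPY-only
opcode, the destinations must have opposite register parity, a shared literal must be
the same literal, and the usual WaW/RaW checks apply since both halves notionally
issue together. A self-contained sketch of just the bank rule, using the same 10-bit
mask layout as VOPDInfo::src_banks:

    #include <cstdint>

    /* bit layout: [3:0] src0 bank, [7:4] src1 bank, [9:8] src2 bank */
    static uint16_t src_bank_bit(unsigned port, unsigned vgpr)
    {
       static const unsigned bank_mask[3] = {0x3, 0x3, 0x1};
       return uint16_t(1u << (port * 4 + (vgpr & bank_mask[port])));
    }

    static bool banks_conflict(uint16_t opx_banks, uint16_t opy_banks)
    {
       return (opx_banks & opy_banks) != 0;
    }

    /* e.g. OPX reading v4 as src0 and OPY reading v8 as src0 both hit
     * src0 bank 0, so the pair is rejected, exactly as in can_use_vopd(). */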
diff --git a/src/amd/compiler/aco_shader_info.h b/src/amd/compiler/aco_shader_info.h
index e6ff2c8a5f7d..bfe8071dee2d 100644
--- a/src/amd/compiler/aco_shader_info.h
+++ b/src/amd/compiler/aco_shader_info.h
@@ -228,6 +228,7 @@ enum aco_statistic {
    aco_statistic_salu,
    aco_statistic_vmem,
    aco_statistic_smem,
+   aco_statistic_vopd,
    aco_num_statistics
 };
 
diff --git a/src/amd/compiler/aco_statistics.cpp b/src/amd/compiler/aco_statistics.cpp
index e454c2de93be..5eb202b8ba7e 100644
--- a/src/amd/compiler/aco_statistics.cpp
+++ b/src/amd/compiler/aco_statistics.cpp
@@ -540,6 +540,8 @@ collect_preasm_stats(Program* program)
       if (instr->isSALU() && !instr->isSOPP() &&
           instr_info.classes[(int)instr->opcode] != instr_class::waitcnt)
          program->statistics[aco_statistic_salu]++;
+      if (instr->isVOPD())
+         program->statistics[aco_statistic_vopd]++;
 
       if ((instr->isVMEM() || instr->isScratch() || instr->isGlobal()) &&
           !instr->operands.empty()) {
diff --git a/src/amd/compiler/tests/test_assembler.cpp b/src/amd/compiler/tests/test_assembler.cpp
index a7106e98686a..8c92e669a649 100644
--- a/src/amd/compiler/tests/test_assembler.cpp
+++ b/src/amd/compiler/tests/test_assembler.cpp
@@ -1055,3 +1055,62 @@ BEGIN_TEST(assembler.vop3_dpp)
 
    finish_assembler_test();
 END_TEST
+
+BEGIN_TEST(assembler.vopd)
+   if (!setup_cs(NULL, GFX11))
+      return;
+
+   Definition dst_v0 = bld.def(v1);
+   dst_v0.setFixed(PhysReg(256));
+
+   Definition dst_v1 = bld.def(v1);
+   dst_v1.setFixed(PhysReg(256 + 1));
+
+   Operand op_v0(bld.tmp(v1));
+   op_v0.setFixed(PhysReg(256 + 0));
+
+   Operand op_v1(bld.tmp(v1));
+   op_v1.setFixed(PhysReg(256 + 1));
+
+   Operand op_v2(bld.tmp(v1));
+   op_v2.setFixed(PhysReg(256 + 2));
+
+   Operand op_v3(bld.tmp(v1));
+   op_v3.setFixed(PhysReg(256 + 3));
+
+   Operand op_s0(bld.tmp(s1));
+   op_s0.setFixed(PhysReg(0));
+
+   Operand op_vcc(bld.tmp(s1));
+   op_vcc.setFixed(vcc);
+
+   //>> BB0:
+   //! v_dual_mov_b32 v0, v0 :: v_dual_mov_b32 v1, v1 ; ca100100 00000101
+   bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, op_v0, op_v1, aco_opcode::v_dual_mov_b32);
+
+   //! v_dual_mov_b32 v0, 0x60 :: v_dual_mov_b32 v1, s0 ; ca1000ff 00000000 00000060
+   bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, Operand::c32(96), op_s0,
+            aco_opcode::v_dual_mov_b32);
+
+   //! v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0x60 ; ca100000 000000ff 00000060
+   bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, op_s0, Operand::c32(96),
+            aco_opcode::v_dual_mov_b32);
+
+   //! v_dual_mul_f32 v0, v0, v1 :: v_dual_mov_b32 v1, v2 ; c8d00300 00000102
+   bld.vopd(aco_opcode::v_dual_mul_f32, dst_v0, dst_v1, op_v0, op_v1, op_v2,
+            aco_opcode::v_dual_mov_b32);
+
+   //! v_dual_fmac_f32 v0, v1, v2 :: v_dual_mov_b32 v1, v3 ; c8100501 00000103
+   bld.vopd(aco_opcode::v_dual_fmac_f32, dst_v0, dst_v1, op_v1, op_v2, op_v0, op_v3,
+            aco_opcode::v_dual_mov_b32);
+
+   //! v_dual_mov_b32 v0, v0 :: v_dual_and_b32 v1, v1, v2 ; ca240100 00000501
+   bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, op_v0, op_v1, op_v2,
+            aco_opcode::v_dual_and_b32);
+
+   //! v_dual_cndmask_b32 v0, v0, v1 :: v_dual_cndmask_b32 v1, v2, v3 ; ca520300 00000702
+   bld.vopd(aco_opcode::v_dual_cndmask_b32, dst_v0, dst_v1, op_v0, op_v1, op_vcc, op_v2, op_v3,
+            op_vcc, aco_opcode::v_dual_cndmask_b32);
+
+   finish_assembler_test();
+END_TEST
diff --git a/src/amd/vulkan/bvh/update.comp b/src/amd/vulkan/bvh/update.comp
index 905f807ebe66..c3c740238f22 100644
--- a/src/amd/vulkan/bvh/update.comp
+++ b/src/amd/vulkan/bvh/update.comp
@@ -74,17 +74,9 @@ void main() {
    bool is_active;
    if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) {
       is_active = build_triangle(bounds, dst_ptr, args.geom_data, gl_GlobalInvocationID.x);
-   } else if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_AABBS_KHR) {
-      VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset);
-      is_active = build_aabb(bounds, src_ptr, dst_ptr, args.geom_data.geometry_id, gl_GlobalInvocationID.x);
    } else {
       VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset);
-      /* arrayOfPointers */
-      if (args.geom_data.stride == 8) {
-         src_ptr = DEREF(REF(VOID_REF)(src_ptr));
-      }
-
-      is_active = build_instance(bounds, src_ptr, dst_ptr, gl_GlobalInvocationID.x);
+      is_active = build_aabb(bounds, src_ptr, dst_ptr, args.geom_data.geometry_id, gl_GlobalInvocationID.x);
    }
 
    if (!is_active)
@@ -110,10 +102,15 @@ void main() {
                gl_StorageSemanticsBuffer,
                gl_SemanticsAcquireRelease | gl_SemanticsMakeAvailable | gl_SemanticsMakeVisible);
 
-      radv_bvh_box32_node node = DEREF(REF(radv_bvh_box32_node)OFFSET(src_bvh, offset));
+      REF(radv_bvh_box32_node) src_node = REF(radv_bvh_box32_node)OFFSET(src_bvh, offset);
+      REF(radv_bvh_box32_node) dst_node = REF(radv_bvh_box32_node)OFFSET(dst_bvh, offset);
+
+      uint32_t children[4];
+      for (uint32_t i = 0; i < 4; ++i)
+         children[i] = DEREF(src_node).children[i];
+
       uint32_t valid_child_count = 0;
       for (uint32_t i = 0; i < 4; ++valid_child_count, ++i)
-         if (node.children[i] == RADV_BVH_INVALID_NODE)
+         if (children[i] == RADV_BVH_INVALID_NODE)
            break;
 
      /* Check if all children have been processed. As this is an atomic the last path coming from
@@ -127,33 +124,37 @@ void main() {
      if (ready_child_count != valid_child_count - 1)
         break;
 
+      for (uint32_t i = 0; i < 4; ++i)
+         DEREF(dst_node).children[i] = children[i];
+
      for (uint32_t i = 0; i < valid_child_count; ++i) {
-        uint32_t child_offset = id_to_offset(node.children[i]);
+        uint32_t child_offset = id_to_offset(children[i]);
+
+        radv_aabb child_bounds;
        if (child_offset == dst_offset)
-           node.coords[i] = bounds;
+           child_bounds = bounds;
        else if (child_offset >= internal_nodes_offset) {
-           radv_aabb child_bounds = radv_aabb(vec3(INFINITY), vec3(-INFINITY));
-           radv_bvh_box32_node child_node = DEREF(REF(radv_bvh_box32_node)OFFSET(dst_bvh, child_offset));
+           child_bounds = radv_aabb(vec3(INFINITY), vec3(-INFINITY));
+           REF(radv_bvh_box32_node) child_node = REF(radv_bvh_box32_node)OFFSET(dst_bvh, child_offset);
           for (uint32_t j = 0; j < 4; ++j) {
-              if (child_node.children[j] == RADV_BVH_INVALID_NODE)
+              if (DEREF(child_node).children[j] == RADV_BVH_INVALID_NODE)
                 break;
-              child_bounds.min = min(child_bounds.min, child_node.coords[j].min);
-              child_bounds.max = max(child_bounds.max, child_node.coords[j].max);
+              child_bounds.min = min(child_bounds.min, DEREF(child_node).coords[j].min);
+              child_bounds.max = max(child_bounds.max, DEREF(child_node).coords[j].max);
           }
-           node.coords[i] = child_bounds;
        } else {
           uint32_t child_index = (child_offset - first_leaf_offset) / leaf_node_size;
-           node.coords[i] = DEREF(INDEX(radv_aabb, args.leaf_bounds, child_index));
+           child_bounds = DEREF(INDEX(radv_aabb, args.leaf_bounds, child_index));
        }
-     }
 
-     DEREF(REF(radv_bvh_box32_node)OFFSET(dst_bvh, offset)) = node;
+        DEREF(dst_node).coords[i] = child_bounds;
+     }
 
      if (parent_id == RADV_BVH_ROOT_NODE) {
        radv_aabb root_bounds = radv_aabb(vec3(INFINITY), vec3(-INFINITY));
        for (uint32_t i = 0; i < valid_child_count; ++i) {
-           root_bounds.min = min(root_bounds.min, node.coords[i].min);
-           root_bounds.max = max(root_bounds.max, node.coords[i].max);
+           radv_aabb bounds = DEREF(dst_node).coords[i];
+           root_bounds.min = min(root_bounds.min, bounds.min);
+           root_bounds.max = max(root_bounds.max, bounds.max);
        }
 
        DEREF(args.dst).aabb = root_bounds;
      }
diff --git a/src/amd/vulkan/radv_image.c b/src/amd/vulkan/radv_image.c
index 4921d2d7ea56..29a0ff599b34 100644
--- a/src/amd/vulkan/radv_image.c
+++ b/src/amd/vulkan/radv_image.c
@@ -1075,6 +1075,11 @@ radv_image_create_layout(struct radv_device *device, struct radv_image_create_in
       radv_video_get_profile_alignments(device->physical_device, profile_list, &width_align, &height_align);
       image_info.width = align(image_info.width, width_align);
       image_info.height = align(image_info.height, height_align);
+
+      if (radv_has_uvd(device->physical_device) && image->vk.usage & VK_IMAGE_USAGE_VIDEO_DECODE_DPB_BIT_KHR) {
+         /* UVD and kernel demand a full DPB allocation. */
+         image_info.array_size = MIN2(16, image_info.array_size);
+      }
    }
 
    unsigned plane_count = radv_get_internal_plane_count(device->physical_device, image->vk.format);
diff --git a/src/amd/vulkan/radv_physical_device.c b/src/amd/vulkan/radv_physical_device.c
index 588e2750b8ee..b07e0c9e5e86 100644
--- a/src/amd/vulkan/radv_physical_device.c
+++ b/src/amd/vulkan/radv_physical_device.c
@@ -1345,7 +1345,8 @@ radv_get_physical_device_properties(struct radv_physical_device *pdevice)
    p->subgroupSupportedOperations =
       VK_SUBGROUP_FEATURE_BASIC_BIT | VK_SUBGROUP_FEATURE_VOTE_BIT | VK_SUBGROUP_FEATURE_ARITHMETIC_BIT |
       VK_SUBGROUP_FEATURE_BALLOT_BIT | VK_SUBGROUP_FEATURE_CLUSTERED_BIT | VK_SUBGROUP_FEATURE_QUAD_BIT |
-      VK_SUBGROUP_FEATURE_SHUFFLE_BIT | VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT;
+      VK_SUBGROUP_FEATURE_SHUFFLE_BIT | VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT |
+      VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR | VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT_KHR;
    p->subgroupQuadOperationsInAllStages = true;
    p->pointClippingBehavior = VK_POINT_CLIPPING_BEHAVIOR_ALL_CLIP_PLANES;
 
@@ -1698,7 +1699,7 @@ radv_get_physical_device_properties(struct radv_physical_device *pdevice)
    /* VK_NV_device_generated_commands */
    p->maxIndirectCommandsStreamCount = 1;
    p->maxIndirectCommandsStreamStride = UINT32_MAX;
-   p->maxIndirectCommandsTokenCount = UINT32_MAX;
+   p->maxIndirectCommandsTokenCount = 512;
    p->maxIndirectCommandsTokenOffset = UINT16_MAX;
    p->minIndirectCommandsBufferOffsetAlignment = 4;
    p->minSequencesCountBufferOffsetAlignment = 4;
@@ -2050,13 +2051,13 @@ radv_physical_device_try_create(struct radv_instance *instance, drmDevicePtr drm
    if ((device->instance->debug_flags & RADV_DEBUG_INFO))
       ac_print_gpu_info(&device->rad_info, stdout);
 
+   radv_init_physical_device_decoder(device);
+
    radv_physical_device_init_queue_table(device);
 
    /* We don't check the error code, but later check if it is initialized. */
    ac_init_perfcounters(&device->rad_info, false, false, &device->ac_perfcounters);
 
-   radv_init_physical_device_decoder(device);
-
    /* The WSI is structured as a layer on top of the driver, so this has
    * to be the last part of initialization (at least until we get other
    * semi-layers).
diff --git a/src/amd/vulkan/radv_queue.c b/src/amd/vulkan/radv_queue.c
index 0a2430a53176..0120c7aa1d87 100644
--- a/src/amd/vulkan/radv_queue.c
+++ b/src/amd/vulkan/radv_queue.c
@@ -1647,7 +1647,8 @@ radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submit *submi
          queue->device->ws->cs_unchain(cmd_buffer->cs);
          if (!chainable || !queue->device->ws->cs_chain(chainable, cmd_buffer->cs, queue->state.uses_shadow_regs)) {
            /* don't submit empty command buffers to the kernel. */
-           if (radv_queue_ring(queue) != AMD_IP_VCN_ENC || cmd_buffer->cs->cdw != 0)
+           if ((radv_queue_ring(queue) != AMD_IP_VCN_ENC && radv_queue_ring(queue) != AMD_IP_UVD) ||
+               cmd_buffer->cs->cdw != 0)
              cs_array[num_submitted_cs++] = cmd_buffer->cs;
          }
diff --git a/src/amd/vulkan/radv_video.c b/src/amd/vulkan/radv_video.c
index 96851b9d771b..8637c502bad4 100644
--- a/src/amd/vulkan/radv_video.c
+++ b/src/amd/vulkan/radv_video.c
@@ -1724,10 +1724,12 @@ radv_uvd_cmd_reset(struct radv_cmd_buffer *cmd_buffer)
    if (vid->sessionctx.mem)
       send_cmd(cmd_buffer, RDECODE_CMD_SESSION_CONTEXT_BUFFER, vid->sessionctx.mem->bo, vid->sessionctx.offset);
    send_cmd(cmd_buffer, RDECODE_CMD_MSG_BUFFER, cmd_buffer->upload.upload_bo, out_offset);
+
    /* pad out the IB to the 16 dword boundary - otherwise the fw seems to be unhappy */
-   radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 8);
-   for (unsigned i = 0; i < 8; i++)
-      radeon_emit(cmd_buffer->cs, 0x81ff);
+   int padsize = vid->sessionctx.mem ? 4 : 6;
+   radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, padsize);
+   for (unsigned i = 0; i < padsize; i++)
+      radeon_emit(cmd_buffer->cs, PKT2_NOP_PAD);
 }
 
 VKAPI_ATTR void VKAPI_CALL
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
index 4fd9bb7053d8..233ede17ba8d 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
@@ -457,7 +457,17 @@ radv_amdgpu_cs_finalize(struct radeon_cmdbuf *_cs)
       *cs->ib_size_ptr |= cs->base.cdw;
    } else {
       /* Pad the CS with NOP packets. */
-      if (ip_type != AMDGPU_HW_IP_VCN_ENC) {
+      bool pad = true;
+
+      /* Don't pad on VCN encode/unified queues, which have no NOP packets. */
+      if (ip_type == AMDGPU_HW_IP_VCN_ENC)
+         pad = false;
+
+      /* Don't add padding to 0-length UVD IBs due to a kernel limitation. */
+      if (ip_type == AMDGPU_HW_IP_UVD && cs->base.cdw == 0)
+         pad = false;
+
+      if (pad) {
         while (!cs->base.cdw || (cs->base.cdw & ib_pad_dw_mask))
            radeon_emit_unchecked(&cs->base, nop_packet);
      }
diff --git a/src/broadcom/ci/broadcom-rpi4-fails.txt b/src/broadcom/ci/broadcom-rpi4-fails.txt
index 2332f8c2f324..6eda8515a8d7 100644
--- a/src/broadcom/ci/broadcom-rpi4-fails.txt
+++ b/src/broadcom/ci/broadcom-rpi4-fails.txt
@@ -508,3 +508,5 @@ KHR-GL31.texture_size_promotion.functional,Fail
 
 # uprev Piglit in Mesa
 spec@glsl-1.40@uniform_buffer@two-stages,Fail
+# Couldn't reproduce locally
+spec@oes_packed_depth_stencil@depth_stencil texture gles2,Fail
diff --git a/src/freedreno/ir3/ir3_lower_subgroups.c b/src/freedreno/ir3/ir3_lower_subgroups.c
index 91b99b7df65a..d95d7fcb7a80 100644
--- a/src/freedreno/ir3/ir3_lower_subgroups.c
+++ b/src/freedreno/ir3/ir3_lower_subgroups.c
@@ -344,6 +344,9 @@ lower_instr(struct ir3 *ir, struct ir3_block **block, struct ir3_instruction *in
       struct ir3_block *store = ir3_block_create(ir);
       list_add(&store->node, &body->node);
 
+      body->reconvergence_point = true;
+      after_block->reconvergence_point = true;
+
       link_blocks(before_block, body, 0);
       link_blocks(body, store, 0);
 
diff --git a/src/gallium/drivers/crocus/crocus_blorp.c b/src/gallium/drivers/crocus/crocus_blorp.c
index 3d20eb6eb2fe..c4a4ef037260 100644
--- a/src/gallium/drivers/crocus/crocus_blorp.c
+++ b/src/gallium/drivers/crocus/crocus_blorp.c
@@ -261,20 +261,31 @@ blorp_get_l3_config(struct blorp_batch *blorp_batch)
    struct crocus_batch *batch = blorp_batch->driver_batch;
    return batch->screen->l3_config_3d;
 }
-#else /* GFX_VER < 7 */
+#endif
+
+static void
+blorp_pre_emit_urb_config(struct blorp_batch *blorp_batch,
+                          struct intel_urb_config *urb_cfg)
+{
+   /* Dummy. */
+}
+
 static void
 blorp_emit_urb_config(struct blorp_batch *blorp_batch,
-                      unsigned vs_entry_size,
-                      UNUSED unsigned sf_entry_size)
+                      struct intel_urb_config *urb_cfg)
 {
+#if GFX_VER < 7
    struct crocus_batch *batch = blorp_batch->driver_batch;
 
 #if GFX_VER <= 5
-   batch->screen->vtbl.calculate_urb_fence(batch, 0, vs_entry_size, sf_entry_size);
+   batch->screen->vtbl.calculate_urb_fence(batch, 0,
+                                           urb_cfg->size[MESA_SHADER_VERTEX],
+                                           urb_cfg->size[MESA_SHADER_FRAGMENT]);
 #else
-   genX(crocus_upload_urb)(batch, vs_entry_size, false, vs_entry_size);
+   genX(crocus_upload_urb)(batch, urb_cfg->size[MESA_SHADER_VERTEX], false,
+                           urb_cfg->size[MESA_SHADER_VERTEX]);
 #endif
-}
 #endif
+}
 
 static void
 crocus_blorp_exec(struct blorp_batch *blorp_batch,
diff --git a/src/gallium/drivers/crocus/crocus_state.c b/src/gallium/drivers/crocus/crocus_state.c
index 2781470fb9c4..8385a03b8348 100644
--- a/src/gallium/drivers/crocus/crocus_state.c
+++ b/src/gallium/drivers/crocus/crocus_state.c
@@ -6058,49 +6058,46 @@ crocus_upload_dirty_render_state(struct crocus_context *ice,
       const struct intel_device_info *devinfo = &batch->screen->devinfo;
       bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL;
       bool tess_present = ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL;
-      unsigned entry_size[4];
+      struct intel_urb_config urb_cfg;
 
       for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
         if (!ice->shaders.prog[i]) {
-           entry_size[i] = 1;
+           urb_cfg.size[i] = 1;
        } else {
           struct brw_vue_prog_data *vue_prog_data =
             (void *) ice->shaders.prog[i]->prog_data;
-           entry_size[i] = vue_prog_data->urb_entry_size;
+           urb_cfg.size[i] = vue_prog_data->urb_entry_size;
        }
-        assert(entry_size[i] != 0);
+        assert(urb_cfg.size[i] != 0);
      }
 
      /* If we're just switching between programs with the same URB requirements,
       * skip the rest of the logic. */
      bool no_change = false;
-      if (ice->urb.vsize == entry_size[MESA_SHADER_VERTEX] &&
+      if (ice->urb.vsize == urb_cfg.size[MESA_SHADER_VERTEX] &&
          ice->urb.gs_present == gs_present &&
-          ice->urb.gsize == entry_size[MESA_SHADER_GEOMETRY] &&
+          ice->urb.gsize == urb_cfg.size[MESA_SHADER_GEOMETRY] &&
          ice->urb.tess_present == tess_present &&
-          ice->urb.hsize == entry_size[MESA_SHADER_TESS_CTRL] &&
-          ice->urb.dsize == entry_size[MESA_SHADER_TESS_EVAL]) {
+          ice->urb.hsize == urb_cfg.size[MESA_SHADER_TESS_CTRL] &&
+          ice->urb.dsize == urb_cfg.size[MESA_SHADER_TESS_EVAL]) {
         no_change = true;
      }
 
      if (!no_change) {
-         ice->urb.vsize = entry_size[MESA_SHADER_VERTEX];
+         ice->urb.vsize = urb_cfg.size[MESA_SHADER_VERTEX];
         ice->urb.gs_present = gs_present;
-         ice->urb.gsize = entry_size[MESA_SHADER_GEOMETRY];
+         ice->urb.gsize = urb_cfg.size[MESA_SHADER_GEOMETRY];
         ice->urb.tess_present = tess_present;
-         ice->urb.hsize = entry_size[MESA_SHADER_TESS_CTRL];
-         ice->urb.dsize = entry_size[MESA_SHADER_TESS_EVAL];
+         ice->urb.hsize = urb_cfg.size[MESA_SHADER_TESS_CTRL];
+         ice->urb.dsize = urb_cfg.size[MESA_SHADER_TESS_EVAL];
 
-         unsigned entries[4];
-         unsigned start[4];
         bool constrained;
         intel_get_urb_config(devinfo,
                              batch->screen->l3_config_3d,
                              tess_present, gs_present,
-                              entry_size,
-                              entries, start, NULL, &constrained);
+                              &urb_cfg, NULL, &constrained);
 
 #if GFX_VER == 7
         if (devinfo->platform == INTEL_PLATFORM_IVB)
@@ -6109,9 +6106,9 @@ crocus_upload_dirty_render_state(struct crocus_context *ice,
         for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
            crocus_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
               urb._3DCommandSubOpcode += i;
-               urb.VSURBStartingAddress = start[i];
-               urb.VSURBEntryAllocationSize = entry_size[i] - 1;
-               urb.VSNumberofURBEntries = entries[i];
+               urb.VSURBStartingAddress = urb_cfg.start[i];
+               urb.VSURBEntryAllocationSize = urb_cfg.size[i] - 1;
+               urb.VSNumberofURBEntries = urb_cfg.entries[i];
            }
         }
      }
diff --git a/src/gallium/drivers/iris/iris_blorp.c b/src/gallium/drivers/iris/iris_blorp.c
index 1f716b35658c..2869d624ebd1 100644
--- a/src/gallium/drivers/iris/iris_blorp.c
+++ b/src/gallium/drivers/iris/iris_blorp.c
@@ -274,6 +274,13 @@ blorp_flush_range(UNUSED struct blorp_batch *blorp_batch,
    */
 }
 
+static void
+blorp_pre_emit_urb_config(struct blorp_batch *blorp_batch,
+                          struct intel_urb_config *urb_cfg)
+{
+   genX(urb_workaround)(blorp_batch->driver_batch, urb_cfg);
+}
+
 static const struct intel_l3_config *
 blorp_get_l3_config(struct blorp_batch *blorp_batch)
 {
@@ -410,8 +417,8 @@ iris_blorp_exec_render(struct blorp_batch *blorp_batch,
    ice->state.dirty |= ~skip_bits;
    ice->state.stage_dirty |= ~skip_stage_bits;
 
-   for (int i = 0; i < ARRAY_SIZE(ice->shaders.urb.size); i++)
-      ice->shaders.urb.size[i] = 0;
+   for (int i = 0; i < ARRAY_SIZE(ice->shaders.urb.cfg.size); i++)
+      ice->shaders.urb.cfg.size[i] = 0;
 
    if (params->src.enabled)
       iris_bo_bump_seqno(params->src.addr.buffer, batch->next_seqno,
diff --git a/src/gallium/drivers/iris/iris_context.h b/src/gallium/drivers/iris/iris_context.h
index 74461c18eaca..17b9a9596474 100644
--- a/src/gallium/drivers/iris/iris_context.h
+++ b/src/gallium/drivers/iris/iris_context.h
@@ -714,12 +714,13 @@ struct iris_context {
       struct iris_compiled_shader *prog[MESA_SHADER_STAGES];
       struct iris_compiled_shader *last_vue_shader;
       struct {
-         unsigned size[4];
-         unsigned entries[4];
-         unsigned start[4];
+         struct intel_urb_config cfg;
         bool constrained;
      } urb;
 
+      /** Last URB configuration emitted by the driver. */
+      struct intel_urb_config last_urb;
+
      /** Uploader for shader assembly from the driver thread */
      struct u_upload_mgr *uploader_driver;
      /** Uploader for shader assembly from the threaded context */
@@ -1180,21 +1181,6 @@ iris_execute_indirect_draw_supported(const struct iris_context *ice,
 #ifdef genX
 #  include "iris_genx_protos.h"
 #else
-#  define genX(x) gfx4_##x
-#  include "iris_genx_protos.h"
-#  undef genX
-#  define genX(x) gfx5_##x
-#  include "iris_genx_protos.h"
-#  undef genX
-#  define genX(x) gfx6_##x
-#  include "iris_genx_protos.h"
-#  undef genX
-#  define genX(x) gfx7_##x
-#  include "iris_genx_protos.h"
-#  undef genX
-#  define genX(x) gfx75_##x
-#  include "iris_genx_protos.h"
-#  undef genX
 #  define genX(x) gfx8_##x
 #  include "iris_genx_protos.h"
 #  undef genX
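Note: the crocus and iris hunks above are a mechanical migration from three parallel
arrays (size/entries/start, indexed by VS..GS stage) to a single struct passed
through intel_get_urb_config() and the new genX(urb_workaround)(). The shape of the
struct, as these callers use it, is approximately the following; the authoritative
definition lives in Intel's common code, so treat this as a reading aid rather than
the real declaration:

    struct intel_urb_config {
       unsigned size[4];    /* per-stage URB entry size (programmed as size - 1) */
       unsigned entries[4]; /* per-stage number of URB entries */
       unsigned start[4];   /* per-stage URB starting address */
    };

Keeping a second copy (iris_context::shaders.last_urb) lets Wa_16014912113 re-emit
the previous configuration before switching, which is what genX(urb_workaround)()
in iris_state.c below does.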
diff --git a/src/gallium/drivers/iris/iris_draw.c b/src/gallium/drivers/iris/iris_draw.c
index bc897ba0f7d5..597a18c5c0e7 100644
--- a/src/gallium/drivers/iris/iris_draw.c
+++ b/src/gallium/drivers/iris/iris_draw.c
@@ -37,7 +37,6 @@
 #include "util/u_transfer.h"
 #include "util/u_upload_mgr.h"
 #include "intel/compiler/brw_compiler.h"
-#include "intel/compiler/brw_eu_defines.h"
 #include "compiler/shader_info.h"
 #include "iris_context.h"
 #include "iris_defines.h"
diff --git a/src/gallium/drivers/iris/iris_genx_protos.h b/src/gallium/drivers/iris/iris_genx_protos.h
index 44c5d427ce65..20e9815bda79 100644
--- a/src/gallium/drivers/iris/iris_genx_protos.h
+++ b/src/gallium/drivers/iris/iris_genx_protos.h
@@ -48,6 +48,8 @@ void genX(emit_3dprimitive_was)(struct iris_batch *batch,
                                 const struct pipe_draw_indirect_info *indirect,
                                 uint32_t primitive_topology,
                                 uint32_t vertex_count);
+void genX(urb_workaround)(struct iris_batch *batch,
+                          const struct intel_urb_config *urb_cfg);
 
 static inline void
 genX(maybe_emit_breakpoint)(struct iris_batch *batch,
diff --git a/src/gallium/drivers/iris/iris_program.c b/src/gallium/drivers/iris/iris_program.c
index 600d95fa6b81..c48df4783151 100644
--- a/src/gallium/drivers/iris/iris_program.c
+++ b/src/gallium/drivers/iris/iris_program.c
@@ -436,7 +436,6 @@ static void
 iris_setup_uniforms(ASSERTED const struct intel_device_info *devinfo,
                     void *mem_ctx,
                     nir_shader *nir,
-                    struct brw_stage_prog_data *prog_data,
                     unsigned kernel_input_size,
                     enum brw_param_builtin **out_system_values,
                     unsigned *out_num_system_values,
@@ -1145,7 +1144,7 @@ check_urb_size(struct iris_context *ice,
                unsigned needed_size,
                gl_shader_stage stage)
 {
-   unsigned last_allocated_size = ice->shaders.urb.size[stage];
+   unsigned last_allocated_size = ice->shaders.urb.cfg.size[stage];
 
    /* If the last URB allocation wasn't large enough for our needs,
    * flag it as needing to be reconfigured.  Otherwise, we can use
@@ -1315,7 +1314,7 @@ iris_compile_vs(struct iris_screen *screen,
 
    prog_data->use_alt_mode = nir->info.use_legacy_math_rules;
 
-   iris_setup_uniforms(devinfo, mem_ctx, nir, prog_data, 0, &system_values,
+   iris_setup_uniforms(devinfo, mem_ctx, nir, 0, &system_values,
                        &num_system_values, &num_cbufs);
 
    struct iris_binding_table bt;
@@ -1500,7 +1499,7 @@ iris_compile_tcs(struct iris_screen *screen,
       source_hash = *(uint32_t*)nir->info.source_sha1;
    }
 
-   iris_setup_uniforms(devinfo, mem_ctx, nir, prog_data, 0, &system_values,
+   iris_setup_uniforms(devinfo, mem_ctx, nir, 0, &system_values,
                        &num_system_values, &num_cbufs);
    iris_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0,
                             num_system_values, num_cbufs);
@@ -1657,7 +1656,7 @@ iris_compile_tes(struct iris_screen *screen,
       nir_shader_gather_info(nir, impl);
    }
 
-   iris_setup_uniforms(devinfo, mem_ctx, nir, prog_data, 0, &system_values,
+   iris_setup_uniforms(devinfo, mem_ctx, nir, 0, &system_values,
                        &num_system_values, &num_cbufs);
 
    struct iris_binding_table bt;
@@ -1800,7 +1799,7 @@ iris_compile_gs(struct iris_screen *screen,
       nir_shader_gather_info(nir, impl);
    }
 
-   iris_setup_uniforms(devinfo, mem_ctx, nir, prog_data, 0, &system_values,
+   iris_setup_uniforms(devinfo, mem_ctx, nir, 0, &system_values,
                        &num_system_values, &num_cbufs);
 
    struct iris_binding_table bt;
@@ -1930,7 +1929,7 @@ iris_compile_fs(struct iris_screen *screen,
 
    prog_data->use_alt_mode = nir->info.use_legacy_math_rules;
 
-   iris_setup_uniforms(devinfo, mem_ctx, nir, prog_data, 0, &system_values,
+   iris_setup_uniforms(devinfo, mem_ctx, nir, 0, &system_values,
                        &num_system_values, &num_cbufs);
 
    /* Lower output variables to load_output intrinsics before setting up
@@ -2226,8 +2225,7 @@ iris_compile_cs(struct iris_screen *screen,
 
    NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics, devinfo, cs_prog_data);
 
-   iris_setup_uniforms(devinfo, mem_ctx, nir, prog_data,
-                       ish->kernel_input_size,
+   iris_setup_uniforms(devinfo, mem_ctx, nir, ish->kernel_input_size,
                        &system_values, &num_system_values, &num_cbufs);
 
    struct iris_binding_table bt;
diff --git a/src/gallium/drivers/iris/iris_program_cache.c b/src/gallium/drivers/iris/iris_program_cache.c
index 481f575b5115..ef02586774a6 100644
--- a/src/gallium/drivers/iris/iris_program_cache.c
+++ b/src/gallium/drivers/iris/iris_program_cache.c
@@ -38,10 +38,7 @@
 #include "util/u_upload_mgr.h"
 #include "compiler/nir/nir.h"
 #include "compiler/nir/nir_builder.h"
-#include "intel/common/intel_disasm.h"
 #include "intel/compiler/brw_compiler.h"
-#include "intel/compiler/brw_eu.h"
-#include "intel/compiler/brw_nir.h"
 #include "iris_context.h"
 #include "iris_resource.h"
 
diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c
index 6e7069058fb4..294acfc7f413 100644
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@@ -6819,31 +6819,31 @@ iris_upload_dirty_render_state(struct iris_context *ice,
    if (dirty & IRIS_DIRTY_URB) {
      for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
        if (!ice->shaders.prog[i]) {
-           ice->shaders.urb.size[i] = 1;
+           ice->shaders.urb.cfg.size[i] = 1;
        } else {
          struct brw_vue_prog_data *vue_prog_data =
            (void *) ice->shaders.prog[i]->prog_data;
-           ice->shaders.urb.size[i] = vue_prog_data->urb_entry_size;
+           ice->shaders.urb.cfg.size[i] = vue_prog_data->urb_entry_size;
        }
-        assert(ice->shaders.urb.size[i] != 0);
+        assert(ice->shaders.urb.cfg.size[i] != 0);
      }
 
      intel_get_urb_config(screen->devinfo, screen->l3_config_3d,
                          ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL,
                          ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL,
-                          ice->shaders.urb.size,
-                          ice->shaders.urb.entries,
-                          ice->shaders.urb.start,
+                          &ice->shaders.urb.cfg,
                          &ice->state.urb_deref_block_size,
                          &ice->shaders.urb.constrained);
 
+      genX(urb_workaround)(batch, &ice->shaders.urb.cfg);
+
      for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
        iris_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
          urb._3DCommandSubOpcode += i;
-           urb.VSURBStartingAddress = ice->shaders.urb.start[i];
-           urb.VSURBEntryAllocationSize = ice->shaders.urb.size[i] - 1;
-           urb.VSNumberofURBEntries = ice->shaders.urb.entries[i];
+           urb.VSURBStartingAddress = ice->shaders.urb.cfg.start[i];
+           urb.VSURBEntryAllocationSize = ice->shaders.urb.cfg.size[i] - 1;
+           urb.VSNumberofURBEntries = ice->shaders.urb.cfg.entries[i];
        }
      }
    }
@@ -8137,6 +8137,35 @@ genX(emit_3dprimitive_was)(struct iris_batch *batch,
 #endif
 }
 
+void
+genX(urb_workaround)(struct iris_batch *batch,
+                     const struct intel_urb_config *urb_cfg)
+{
+#if INTEL_NEEDS_WA_16014912113
+   if (intel_urb_setup_changed(urb_cfg, &batch->ice->shaders.last_urb,
+                               MESA_SHADER_TESS_EVAL) &&
+       batch->ice->shaders.last_urb.size[0] != 0) {
+      for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
+         iris_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
+            urb._3DCommandSubOpcode += i;
+            urb.VSURBStartingAddress =
+               batch->ice->shaders.last_urb.start[i];
+            urb.VSURBEntryAllocationSize =
+               batch->ice->shaders.last_urb.size[i] - 1;
+            urb.VSNumberofURBEntries = i == 0 ? 256 : 0;
+         }
+      }
+      iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
+         pc.HDCPipelineFlushEnable = true;
+      }
+   }
+#endif
+
+   /* Update current urb config. */
+   memcpy(&batch->ice->shaders.last_urb, &batch->ice->shaders.urb.cfg,
+          sizeof(struct intel_urb_config));
+}
+
 static void
 iris_upload_render_state(struct iris_context *ice,
                          struct iris_batch *batch,
diff --git a/src/gallium/drivers/panfrost/pan_shader.c b/src/gallium/drivers/panfrost/pan_shader.c
index 6493b85dd78b..87104f19cb7d 100644
--- a/src/gallium/drivers/panfrost/pan_shader.c
+++ b/src/gallium/drivers/panfrost/pan_shader.c
@@ -405,7 +405,7 @@ panfrost_create_shader_state(struct pipe_context *pctx,
 
    if (nir->info.stage == MESA_SHADER_FRAGMENT &&
        nir->info.outputs_written & BITFIELD_BIT(FRAG_RESULT_COLOR)) {
-      NIR_PASS_V(nir, nir_lower_fragcolor, 8);
+      NIR_PASS_V(nir, nir_lower_fragcolor, nir->info.fs.color_is_dual_source ? 1 : 8);
       so->fragcolor_lowered = true;
    }
 
diff --git a/src/gallium/drivers/virgl/meson.build b/src/gallium/drivers/virgl/meson.build
index 0bb26c67c7c6..6f94c53f5feb 100644
--- a/src/gallium/drivers/virgl/meson.build
+++ b/src/gallium/drivers/virgl/meson.build
@@ -41,9 +41,15 @@ libvirgl = static_library(
   dependencies : [dep_libdrm, idep_mesautil, idep_xmlconfig, idep_nir],
 )
 
+virgl_deps = [libvirgl]
+if not with_platform_windows
+  virgl_deps += libvirgldrm
+  virgl_deps += libvirglvtest
+endif
+
 driver_virgl = declare_dependency(
   compile_args : '-DGALLIUM_VIRGL',
-  link_with : [libvirgl, libvirgldrm, libvirglvtest],
+  link_with : virgl_deps,
 )
 
 if with_tests
diff --git a/src/gallium/drivers/virgl/tests/virgl_staging_mgr_test.cpp b/src/gallium/drivers/virgl/tests/virgl_staging_mgr_test.cpp
index 34dfe8356443..057445a77186 100644
--- a/src/gallium/drivers/virgl/tests/virgl_staging_mgr_test.cpp
+++ b/src/gallium/drivers/virgl/tests/virgl_staging_mgr_test.cpp
@@ -161,7 +161,7 @@ TEST_P(VirglStagingMgrWithAlignment,
    struct virgl_hw_res *out_resource[num_resources] = {0};
    unsigned expected_offset = 0;
    unsigned out_offset;
-   void *map_ptr;
+   uint8_t *map_ptr;
    bool alloc_succeeded;
 
    for (unsigned i = 0; i < num_resources; ++i) {
@@ -197,7 +197,7 @@ TEST_F(VirglStagingMgr,
 {
    struct virgl_hw_res *out_resource[2] = {0};
    unsigned out_offset;
-   void *map_ptr;
+   uint8_t *map_ptr;
    bool alloc_succeeded;
 
    alloc_succeeded =
@@ -229,7 +229,7 @@ TEST_F(VirglStagingMgr,
 {
    struct virgl_hw_res *out_resource[2] = {0};
    unsigned out_offset;
-   void *map_ptr;
+   uint8_t *map_ptr;
    bool alloc_succeeded;
 
    alloc_succeeded =
@@ -261,7 +261,7 @@ TEST_F(VirglStagingMgr,
 {
    struct virgl_hw_res *out_resource[2] = {0};
    unsigned out_offset;
-   void *map_ptr;
+   uint8_t *map_ptr;
    bool alloc_succeeded;
 
    ASSERT_LT(staging_size, 5123);
@@ -295,7 +295,7 @@ TEST_F(VirglStagingMgr, releases_resource_on_destruction)
 {
    struct virgl_hw_res *out_resource = NULL;
    unsigned out_offset;
-   void *map_ptr;
+   uint8_t *map_ptr;
    bool alloc_succeeded;
 
    alloc_succeeded =
@@ -334,7 +334,7 @@ TEST_F(VirglStagingMgr, fails_gracefully_if_resource_create_fails)
    struct virgl_screen *vs = virgl_screen(ctx->screen);
    struct virgl_hw_res *out_resource = NULL;
    unsigned out_offset;
-   void *map_ptr;
+   uint8_t *map_ptr;
    bool alloc_succeeded;
 
    vs->vws->resource_create = failing_resource_create;
@@ -359,7 +359,7 @@ TEST_F(VirglStagingMgr, fails_gracefully_if_map_fails)
    struct virgl_screen *vs = virgl_screen(ctx->screen);
    struct virgl_hw_res *out_resource = NULL;
    unsigned out_offset;
-   void *map_ptr;
+   uint8_t *map_ptr;
    bool alloc_succeeded;
 
    vs->vws->resource_map = failing_resource_map;
@@ -377,7 +377,7 @@ TEST_F(VirglStagingMgr, uses_staging_buffer_resource)
 {
    struct virgl_hw_res *out_resource = NULL;
    unsigned out_offset;
-   void *map_ptr;
+   uint8_t *map_ptr;
    bool alloc_succeeded;
 
    alloc_succeeded =
diff --git a/src/gallium/drivers/virgl/virgl_context.c b/src/gallium/drivers/virgl/virgl_context.c
index f47c469e858e..548335aac802 100644
--- a/src/gallium/drivers/virgl/virgl_context.c
+++ b/src/gallium/drivers/virgl/virgl_context.c
@@ -21,7 +21,11 @@
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
+#include
+#ifndef _WIN32
 #include <libsync.h>
+#endif
+
 #include "pipe/p_shader_tokens.h"
 
 #include "compiler/nir/nir.h"
@@ -592,7 +596,7 @@ static void virgl_hw_set_vertex_buffers(struct virgl_context *vctx)
    if (vctx->vertex_array_dirty) {
      const struct virgl_vertex_elements_state *ve = vctx->vertex_elements;
 
-      if (ve->num_bindings) {
+      if (ve && ve->num_bindings) {
        struct pipe_vertex_buffer vertex_buffers[PIPE_MAX_ATTRIBS];
        for (int i = 0; i < ve->num_bindings; ++i)
          vertex_buffers[i] = vctx->vertex_buffer[ve->binding_map[i]];
@@ -1003,7 +1007,7 @@ static void virgl_draw_vbo(struct pipe_context *ctx,
    struct virgl_context *vctx = virgl_context(ctx);
    struct virgl_screen *rs = virgl_screen(ctx->screen);
 
-   struct virgl_indexbuf ib = {};
+   struct virgl_indexbuf ib = { 0 };
    struct pipe_draw_info info = *dinfo;
 
    if (!indirect &&
@@ -1478,7 +1482,7 @@ static void *virgl_create_compute_state(struct pipe_context *ctx,
    uint32_t handle;
    const struct tgsi_token *ntt_tokens = NULL;
    const struct tgsi_token *tokens;
-   struct pipe_stream_output_info so_info = {};
+   struct pipe_stream_output_info so_info = { 0 };
    int ret;
 
    if (state->ir_type == PIPE_SHADER_IR_NIR) {
diff --git a/src/gallium/drivers/virgl/virgl_query.c b/src/gallium/drivers/virgl/virgl_query.c
index 96a62a524af2..2aeafc07ec9e 100644
--- a/src/gallium/drivers/virgl/virgl_query.c
+++ b/src/gallium/drivers/virgl/virgl_query.c
@@ -30,7 +30,13 @@
 #include "virgl_screen.h"
 
 struct virgl_query {
-   struct virgl_resource *buf;
+   enum pipe_query_type type;
+
+   union {
+      struct virgl_resource *buf;
+      struct pipe_fence_handle *fence; // PIPE_QUERY_GPU_FINISHED
+   };
+
    uint32_t handle;
    uint32_t result_size;
    uint32_t pipeline_stats;
@@ -123,6 +129,11 @@ static struct pipe_query *virgl_create_query(struct pipe_context *ctx,
    if (!query)
       return NULL;
 
+   query->type = query_type;
+
+   if (query->type == PIPE_QUERY_GPU_FINISHED)
+      return (struct pipe_query *)query;
+
    query->buf = (struct virgl_resource *)
      pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_STAGING,
                         sizeof(struct virgl_host_query_state));
@@ -159,9 +170,13 @@ static void virgl_destroy_query(struct pipe_context *ctx,
    struct virgl_context *vctx = virgl_context(ctx);
    struct virgl_query *query = virgl_query(q);
 
-   virgl_encode_delete_object(vctx, query->handle, VIRGL_OBJECT_QUERY);
+   if (query->type == PIPE_QUERY_GPU_FINISHED) {
+      ctx->screen->fence_reference(ctx->screen, &query->fence, NULL);
+   } else {
+      virgl_encode_delete_object(vctx, query->handle, VIRGL_OBJECT_QUERY);
+      pipe_resource_reference((struct pipe_resource **)&query->buf, NULL);
+   }
 
-   pipe_resource_reference((struct pipe_resource **)&query->buf, NULL);
    FREE(query);
 }
 
@@ -184,6 +199,11 @@ static bool virgl_end_query(struct pipe_context *ctx,
    struct virgl_query *query = virgl_query(q);
    struct virgl_host_query_state *host_state;
 
+   if (query->type == PIPE_QUERY_GPU_FINISHED) {
+      ctx->flush(ctx, &query->fence, PIPE_FLUSH_DEFERRED);
+      return true;
+   }
+
    host_state = vs->vws->resource_map(vs->vws, query->buf->hw_res);
    if (!host_state)
       return false;
@@ -207,6 +227,13 @@ static bool virgl_get_query_result(struct pipe_context *ctx,
 {
    struct virgl_query *query = virgl_query(q);
 
+   if (query->type == PIPE_QUERY_GPU_FINISHED) {
+      struct pipe_screen *screen = ctx->screen;
+
+      result->b = screen->fence_finish(screen, ctx, query->fence,
+                                       wait ? OS_TIMEOUT_INFINITE : 0);
+      return result->b;
+   }
+
    if (!query->ready) {
      struct virgl_screen *vs = virgl_screen(ctx->screen);
      struct virgl_context *vctx = virgl_context(ctx);
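Note: the virgl_query.c change implements PIPE_QUERY_GPU_FINISHED without a
host-side query object: end_query() does a deferred flush and keeps the resulting
fence, and get_query_result() simply polls or waits on it. From the frontend's
point of view it remains an ordinary boolean query; a minimal usage sketch against
the gallium interface (error handling omitted):

    struct pipe_query *q = pipe->create_query(pipe, PIPE_QUERY_GPU_FINISHED, 0);
    pipe->end_query(pipe, q);          /* virgl: deferred flush, fence retained */

    union pipe_query_result result;
    bool done = pipe->get_query_result(pipe, q, /*wait=*/false, &result);
    /* done && result.b => all previously submitted work has finished */
    pipe->destroy_query(pipe, q);      /* virgl: drops the fence reference */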
OS_TIMEOUT_INFINITE : 0); + return result->b; + } + if (!query->ready) { struct virgl_screen *vs = virgl_screen(ctx->screen); struct virgl_context *vctx = virgl_context(ctx); diff --git a/src/gallium/drivers/virgl/virgl_resource.c b/src/gallium/drivers/virgl/virgl_resource.c index 494af9116b70..87d610ebbfc4 100644 --- a/src/gallium/drivers/virgl/virgl_resource.c +++ b/src/gallium/drivers/virgl/virgl_resource.c @@ -350,7 +350,7 @@ virgl_staging_map(struct virgl_context *vctx, unsigned align_offset; unsigned stride; uintptr_t layer_stride; - void *map_addr; + uint8_t *map_addr; bool alloc_succeeded; assert(vctx->supports_staging); @@ -529,7 +529,7 @@ virgl_resource_transfer_map(struct pipe_context *ctx, case VIRGL_TRANSFER_MAP_HW_RES: trans->hw_res_map = vws->resource_map(vws, vres->hw_res); if (trans->hw_res_map) - map_addr = trans->hw_res_map + trans->offset; + map_addr = (uint8_t *)trans->hw_res_map + trans->offset; else map_addr = NULL; break; @@ -717,22 +717,29 @@ static struct pipe_resource *virgl_resource_from_handle(struct pipe_screen *scre uint32_t storage_size; struct virgl_screen *vs = virgl_screen(screen); - if (templ->target == PIPE_BUFFER) + if (templ && templ->target == PIPE_BUFFER) return NULL; struct virgl_resource *res = CALLOC_STRUCT(virgl_resource); - res->b = *templ; + if (templ) + res->b = *templ; res->b.screen = &vs->base; pipe_reference_init(&res->b.reference, 1); plane = winsys_stride = plane_offset = modifier = 0; res->hw_res = vs->vws->resource_create_from_handle(vs->vws, whandle, + &res->b, &plane, &winsys_stride, &plane_offset, &modifier, &res->blob_mem); + if (!res->hw_res) { + FREE(res); + return NULL; + } + /* do not use winsys returns for guest storage info of classic resource */ if (!res->blob_mem) { winsys_stride = 0; @@ -742,10 +749,6 @@ static struct pipe_resource *virgl_resource_from_handle(struct pipe_screen *scre virgl_resource_layout(&res->b, &res->metadata, plane, winsys_stride, plane_offset, modifier); - if (!res->hw_res) { - FREE(res); - return NULL; - } /* * If the overall resource is larger than a single page in size, we can diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c index d86ca5d1e8c3..6599f5f5c4cd 100644 --- a/src/gallium/drivers/virgl/virgl_screen.c +++ b/src/gallium/drivers/virgl/virgl_screen.c @@ -936,7 +936,7 @@ static void virgl_flush_frontbuffer(struct pipe_screen *screen, if (vws->flush_frontbuffer) { virgl_flush_eq(vctx, vctx, NULL); - vws->flush_frontbuffer(vws, vres->hw_res, level, layer, winsys_drawable_handle, + vws->flush_frontbuffer(vws, vctx->cbuf, vres->hw_res, level, layer, winsys_drawable_handle, sub_box); } } @@ -1054,6 +1054,10 @@ static struct disk_cache *virgl_get_disk_shader_cache (struct pipe_screen *pscre static void virgl_disk_cache_create(struct virgl_screen *screen) { + struct mesa_sha1 sha1_ctx; + _mesa_sha1_init(&sha1_ctx); + +#ifdef HAVE_DL_ITERATE_PHDR const struct build_id_note *note = build_id_find_nhdr_for_addr(virgl_disk_cache_create); assert(note); @@ -1064,9 +1068,8 @@ static void virgl_disk_cache_create(struct virgl_screen *screen) const uint8_t *id_sha1 = build_id_data(note); assert(id_sha1); - struct mesa_sha1 sha1_ctx; - _mesa_sha1_init(&sha1_ctx); _mesa_sha1_update(&sha1_ctx, id_sha1, build_id_len); +#endif /* When we switch the host the caps might change and then we might have to * apply different lowering. 
*/ diff --git a/src/gallium/drivers/virgl/virgl_staging_mgr.c b/src/gallium/drivers/virgl/virgl_staging_mgr.c index 947f343bd7f0..c73c79ec2339 100644 --- a/src/gallium/drivers/virgl/virgl_staging_mgr.c +++ b/src/gallium/drivers/virgl/virgl_staging_mgr.c @@ -96,7 +96,7 @@ virgl_staging_alloc(struct virgl_staging_mgr *staging, unsigned alignment, unsigned *out_offset, struct virgl_hw_res **outbuf, - void **ptr) + uint8_t **ptr) { struct virgl_winsys *vws = staging->vws; unsigned offset = align(staging->offset, alignment); diff --git a/src/gallium/drivers/virgl/virgl_staging_mgr.h b/src/gallium/drivers/virgl/virgl_staging_mgr.h index 7abf18713bb8..18a1cd5e5eb9 100644 --- a/src/gallium/drivers/virgl/virgl_staging_mgr.h +++ b/src/gallium/drivers/virgl/virgl_staging_mgr.h @@ -83,7 +83,7 @@ virgl_staging_alloc(struct virgl_staging_mgr *staging, unsigned alignment, unsigned *out_offset, struct virgl_hw_res **outbuf, - void **ptr); + uint8_t **ptr); #ifdef __cplusplus } // extern "C" { diff --git a/src/gallium/drivers/virgl/virgl_texture.c b/src/gallium/drivers/virgl/virgl_texture.c index cde0b21e65aa..7412d86b35e4 100644 --- a/src/gallium/drivers/virgl/virgl_texture.c +++ b/src/gallium/drivers/virgl/virgl_texture.c @@ -190,7 +190,7 @@ static void *texture_transfer_map_resolve(struct pipe_context *ctx, goto fail; if (!util_format_translate_3d(resource->format, - ptr + vtex->metadata.level_offset[level], + (uint8_t *)ptr + vtex->metadata.level_offset[level], trans->base.stride, trans->base.layer_stride, box->x, box->y, box->z, @@ -212,7 +212,7 @@ static void *texture_transfer_map_resolve(struct pipe_context *ctx, if ((usage & PIPE_MAP_WRITE) == 0) pipe_resource_reference(&trans->resolve_transfer->resource, NULL); - return ptr + trans->offset; + return (uint8_t *)ptr + trans->offset; } fail: diff --git a/src/gallium/drivers/virgl/virgl_transfer_queue.c b/src/gallium/drivers/virgl/virgl_transfer_queue.c index 2353ad5042e5..da8255377a85 100644 --- a/src/gallium/drivers/virgl/virgl_transfer_queue.c +++ b/src/gallium/drivers/virgl/virgl_transfer_queue.c @@ -372,7 +372,7 @@ virgl_transfer_queue_extend_buffer(struct virgl_transfer_queue *queue, assert(queued->base.resource->target == PIPE_BUFFER); assert(queued->hw_res_map); - memcpy(queued->hw_res_map + offset, data, size); + memcpy((uint8_t *)queued->hw_res_map + offset, data, size); u_box_union_2d(&queued->base.box, &queued->base.box, &box); queued->offset = queued->base.box.x; diff --git a/src/gallium/drivers/virgl/virgl_video.c b/src/gallium/drivers/virgl/virgl_video.c index f0afa0491a8b..29af663c600f 100644 --- a/src/gallium/drivers/virgl/virgl_video.c +++ b/src/gallium/drivers/virgl/virgl_video.c @@ -66,12 +66,12 @@ */ #include -#include #include "vl/vl_decoder.h" #include "vl/vl_video_buffer.h" #include "util/u_video.h" #include "util/u_memory.h" +#include "util/macros.h" #include "virgl_screen.h" #include "virgl_resource.h" @@ -106,7 +106,7 @@ static int fill_base_picture_desc(const struct pipe_picture_desc *desc, ITEM_SET(vbase, desc, protected_playback); ITEM_SET(vbase, desc, key_size); memcpy(vbase->decrypt_key, desc->decrypt_key, - MIN(desc->key_size, sizeof(vbase->decrypt_key))); + MIN2(desc->key_size, sizeof(vbase->decrypt_key))); return 0; } @@ -1042,7 +1042,7 @@ static void virgl_video_decode_bitstream(struct pipe_video_codec *codec, if (!ptr) return; for (i = 0, vcdc->bs_size = 0; i < num_buffers; i++) { - memcpy(ptr + vcdc->bs_size, buffers[i], sizes[i]); + memcpy((uint8_t *)ptr + vcdc->bs_size, buffers[i], sizes[i]); vcdc->bs_size += 
sizes[i]; } pipe_buffer_unmap(&vctx->base, xfer); diff --git a/src/gallium/drivers/virgl/virgl_winsys.h b/src/gallium/drivers/virgl/virgl_winsys.h index e780f5eef9b4..3d83ac728f2d 100644 --- a/src/gallium/drivers/virgl/virgl_winsys.h +++ b/src/gallium/drivers/virgl/virgl_winsys.h @@ -27,6 +27,7 @@ #include "virtio-gpu/virgl_hw.h" struct pipe_box; +struct pipe_resource; struct pipe_fence_handle; struct winsys_handle; struct virgl_hw_res; @@ -86,6 +87,7 @@ struct virgl_winsys { struct virgl_hw_res *(*resource_create_from_handle)(struct virgl_winsys *vws, struct winsys_handle *whandle, + struct pipe_resource *templ, uint32_t *plane, uint32_t *stride, uint32_t *plane_offset, @@ -133,6 +135,7 @@ struct virgl_winsys { /* for sw paths */ void (*flush_frontbuffer)(struct virgl_winsys *vws, + struct virgl_cmd_buf *cbuf, struct virgl_hw_res *res, unsigned level, unsigned layer, void *winsys_drawable_handle, @@ -184,5 +187,5 @@ static inline void virgl_ws_fill_new_caps_defaults(struct virgl_drm_caps *caps) } extern enum virgl_formats pipe_to_virgl_format(enum pipe_format format); - +extern enum pipe_format virgl_to_pipe_format(enum virgl_formats format); #endif diff --git a/src/gallium/drivers/zink/zink_compiler.c b/src/gallium/drivers/zink/zink_compiler.c index c79fb23a8343..da4d8d76b8ee 100644 --- a/src/gallium/drivers/zink/zink_compiler.c +++ b/src/gallium/drivers/zink/zink_compiler.c @@ -3564,26 +3564,39 @@ is_residency_code(nir_def *src) } } +static bool +lower_sparse_and_instr(nir_builder *b, nir_intrinsic_instr *instr, void *data) +{ + if (instr->intrinsic != nir_intrinsic_sparse_residency_code_and) + return false; + + b->cursor = nir_before_instr(&instr->instr); + nir_def *src0; + if (is_residency_code(instr->src[0].ssa)) + src0 = nir_is_sparse_texels_resident(b, 1, instr->src[0].ssa); + else + src0 = instr->src[0].ssa; + nir_def *src1; + if (is_residency_code(instr->src[1].ssa)) + src1 = nir_is_sparse_texels_resident(b, 1, instr->src[1].ssa); + else + src1 = instr->src[1].ssa; + nir_def *def = nir_iand(b, src0, src1); + nir_def_rewrite_uses_after(&instr->def, def, &instr->instr); + nir_instr_remove(&instr->instr); + return true; +} + +static bool +lower_sparse_and(nir_shader *shader) +{ + return nir_shader_intrinsics_pass(shader, lower_sparse_and_instr, + nir_metadata_dominance, NULL); +} + static bool lower_sparse_instr(nir_builder *b, nir_intrinsic_instr *instr, void *data) { - if (instr->intrinsic == nir_intrinsic_sparse_residency_code_and) { - b->cursor = nir_before_instr(&instr->instr); - nir_def *src0; - if (is_residency_code(instr->src[0].ssa)) - src0 = nir_is_sparse_texels_resident(b, 1, instr->src[0].ssa); - else - src0 = instr->src[0].ssa; - nir_def *src1; - if (is_residency_code(instr->src[1].ssa)) - src1 = nir_is_sparse_texels_resident(b, 1, instr->src[1].ssa); - else - src1 = instr->src[1].ssa; - nir_def *def = nir_iand(b, src0, src1); - nir_def_rewrite_uses_after(&instr->def, def, &instr->instr); - nir_instr_remove(&instr->instr); - return true; - } if (instr->intrinsic != nir_intrinsic_is_sparse_texels_resident) return false; @@ -4021,7 +4034,10 @@ zink_shader_compile(struct zink_screen *screen, bool can_shobj, struct zink_shad zs->can_inline = false; } else if (need_optimize) optimize_nir(nir, zs, true); - NIR_PASS_V(nir, lower_sparse); + bool has_sparse = false; + NIR_PASS(has_sparse, nir, lower_sparse); + if (has_sparse) + optimize_nir(nir, zs, false); struct zink_shader_object obj = compile_module(screen, zs, nir, can_shobj, pg); ralloc_free(nir); @@ -5472,6 +5488,7 @@ 
zink_shader_create(struct zink_screen *screen, struct nir_shader *nir) NIR_PASS_V(nir, lower_basevertex); NIR_PASS_V(nir, lower_baseinstance); + NIR_PASS_V(nir, lower_sparse_and); NIR_PASS_V(nir, split_bitfields); NIR_PASS_V(nir, nir_lower_frexp); /* TODO: Use the spirv instructions for this. */ diff --git a/src/gallium/frontends/d3d10umd/README.md b/src/gallium/frontends/d3d10umd/README.md index ebd408d4558e..36f1df28fbe7 100644 --- a/src/gallium/frontends/d3d10umd/README.md +++ b/src/gallium/frontends/d3d10umd/README.md @@ -3,4 +3,4 @@ This directory has a Gallium state tracker for D3D10 UMD DDI. It still uses TGSI, not NIR. Currently it only supports SW rasterizers. See -src/gallium/targets/d3d10sw/README.md for further details. +src/gallium/targets/d3d10umd/README.md for further details. diff --git a/src/gallium/meson.build b/src/gallium/meson.build index e2b84e192768..5014bdbd700a 100644 --- a/src/gallium/meson.build +++ b/src/gallium/meson.build @@ -167,8 +167,10 @@ else endif if with_gallium_virgl subdir('winsys/virgl/common') - subdir('winsys/virgl/drm') - subdir('winsys/virgl/vtest') + if not with_platform_windows + subdir('winsys/virgl/drm') + subdir('winsys/virgl/vtest') + endif subdir('drivers/virgl') else driver_virgl = declare_dependency() @@ -239,7 +241,7 @@ if with_gallium_st_nine endif if with_gallium_st_d3d10umd subdir('frontends/d3d10umd') - subdir('targets/d3d10sw') + subdir('targets/d3d10umd') endif if with_platform_windows if with_opengl diff --git a/src/gallium/targets/d3d10sw/README.md b/src/gallium/targets/d3d10umd/README.md similarity index 79% rename from src/gallium/targets/d3d10sw/README.md rename to src/gallium/targets/d3d10umd/README.md index 7487bb2bc6a3..0675f9589e6b 100644 --- a/src/gallium/targets/d3d10sw/README.md +++ b/src/gallium/targets/d3d10umd/README.md @@ -1,15 +1,14 @@ -The resulting d3d10sw.dll implements D3D10's software rendering interface, like -WARP. +When compiled with `gallium-drivers=llvmpipe` or `gallium-drivers=softpipe`, the resulting libgallium_d3d10.dll implements D3D10's software rendering interface, like WARP. It can be used directly from WLK 1.6 and WHCK 2.0 D3D10+ tests, via the -Src and -SWDLL options. For example: wgf11blend.exe -Debug -DoNotCatchExceptions -DXGI:1.1 -FeatureLevel:10.0 -Src:SW -SWDLL:libgallium_d3d10.dll -LogClean -LogVerbose However, as of WHCK version 2.1 this mechanism no longer works reliably. Either you use WHCK 2.0 binaries, or you must use the alternative method -cribed below (of copying d3d10sw.dll into the executable directory and rename +described below (of copying libgallium_d3d10.dll into the executable directory and renaming it such that it matches the D3D10 UMD of the test machine).
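Aside on the zink hunks above: NIR_PASS, unlike NIR_PASS_V, accumulates a progress flag from the pass it runs, which is what lets zink_shader_compile skip the extra optimize_nir round unless lower_sparse actually rewrote an intrinsic. A minimal sketch of that progress-gated pattern; lower_something is a hypothetical pass name, not from this patch:

    bool progress = false;
    NIR_PASS(progress, nir, lower_something); /* ORs the pass's return value */
    if (progress)
       optimize_nir(nir, zs, false);          /* clean up only when needed */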
@@ -17,7 +16,7 @@ Examples can be easily modified to load it too: D3D10CreateDeviceAndSwapChain(NULL, D3D10_DRIVER_TYPE_SOFTWARE, - LoadLibraryA("d3d10sw"), /* Software */ + LoadLibraryA("libgallium_d3d10"), /* Software */ Flags, D3D10_SDK_VERSION, &SwapChainDesc, @@ -26,7 +25,7 @@ Examples can be easily modified to load it too: D3D11CreateDeviceAndSwapChain(NULL, /* pAdapter */ D3D_DRIVER_TYPE_SOFTWARE, - LoadLibraryA("d3d10sw"), /* Software */ + LoadLibraryA("libgallium_d3d10"), /* Software */ Flags, FeatureLevels, sizeof FeatureLevels / sizeof FeatureLevels[0], diff --git a/src/gallium/targets/d3d10sw/d3d10_sw.def.in b/src/gallium/targets/d3d10umd/d3d10.def.in similarity index 100% rename from src/gallium/targets/d3d10sw/d3d10_sw.def.in rename to src/gallium/targets/d3d10umd/d3d10.def.in diff --git a/src/gallium/targets/d3d10sw/d3d10_gdi.c b/src/gallium/targets/d3d10umd/d3d10_gdi.c similarity index 100% rename from src/gallium/targets/d3d10sw/d3d10_gdi.c rename to src/gallium/targets/d3d10umd/d3d10_gdi.c diff --git a/src/gallium/targets/d3d10sw/meson.build b/src/gallium/targets/d3d10umd/meson.build similarity index 82% rename from src/gallium/targets/d3d10sw/meson.build rename to src/gallium/targets/d3d10umd/meson.build index 5fe2f5fa39db..570225504343 100644 --- a/src/gallium/targets/d3d10sw/meson.build +++ b/src/gallium/targets/d3d10umd/meson.build @@ -19,17 +19,18 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -d3d10_sw_def = custom_target( - 'd3d10_sw.def', - input: 'd3d10_sw.def.in', - output : 'd3d10_sw.def', +libgallium_d3d10_def = custom_target( + 'd3d10.def', + input: 'd3d10.def.in', + output : 'd3d10.def', command : gen_vs_module_defs_normal_command, ) -libd3d10sw = shared_library( - 'd3d10sw', +gallium_d3d10_name = get_option('gallium-d3d10-dll-name') +libgallium_d3d10 = shared_library( + gallium_d3d10_name, ['d3d10_gdi.c'], - vs_module_defs : d3d10_sw_def, + vs_module_defs : libgallium_d3d10_def, include_directories : [ inc_include, inc_src, inc_gallium, inc_gallium_aux, inc_d3d10umd, inc_gallium_winsys, inc_gallium_winsys_sw, inc_gallium_drivers, inc_winddk ], @@ -40,20 +41,20 @@ libd3d10sw = shared_library( dependencies : [ dep_ws2_32, idep_nir, driver_swrast ], - name_prefix : '', # otherwise mingw will create libd3d10sw.dll + name_prefix: '', install : true, ) if with_tests test( - 'd3d10sw', + 'd3d10', executable( - 'test_d3d10sw', + 'test_d3d10', files('tests/tri.cpp'), cpp_args : [cpp_msvc_compat_args], dependencies : [cpp.find_library('d3d11')], - link_with : [libd3d10sw], + link_with : [libgallium_d3d10], ), - suite : ['d3d10sw'], + suite : ['d3d10'], ) endif diff --git a/src/gallium/targets/d3d10sw/tests/tri.cpp b/src/gallium/targets/d3d10umd/tests/tri.cpp similarity index 99% rename from src/gallium/targets/d3d10sw/tests/tri.cpp rename to src/gallium/targets/d3d10umd/tests/tri.cpp index d7e95d4db725..da3008127717 100644 --- a/src/gallium/targets/d3d10sw/tests/tri.cpp +++ b/src/gallium/targets/d3d10umd/tests/tri.cpp @@ -96,7 +96,7 @@ main(int argc, char *argv[]) D3D_FEATURE_LEVEL_10_0 }; - HMODULE hSoftware = LoadLibraryA("d3d10sw.dll"); + HMODULE hSoftware = LoadLibraryA("libgallium_d3d10.dll"); if (!hSoftware) { return EXIT_FAILURE; } diff --git a/src/gallium/targets/d3d10sw/tests/tri_ps_4_0.h b/src/gallium/targets/d3d10umd/tests/tri_ps_4_0.h old mode 100755 new mode 100644 similarity index 100% rename from src/gallium/targets/d3d10sw/tests/tri_ps_4_0.h rename to src/gallium/targets/d3d10umd/tests/tri_ps_4_0.h diff 
--git a/src/gallium/targets/d3d10sw/tests/tri_vs_4_0.h b/src/gallium/targets/d3d10umd/tests/tri_vs_4_0.h old mode 100755 new mode 100644 similarity index 100% rename from src/gallium/targets/d3d10sw/tests/tri_vs_4_0.h rename to src/gallium/targets/d3d10umd/tests/tri_vs_4_0.h diff --git a/src/gallium/targets/wgl/meson.build b/src/gallium/targets/wgl/meson.build index eaca0aec86b5..d48863da4f94 100644 --- a/src/gallium/targets/wgl/meson.build +++ b/src/gallium/targets/wgl/meson.build @@ -28,7 +28,7 @@ wgl_def = custom_target( command : gen_vs_module_defs_normal_command, ) -gallium_wgl_name = get_option('gallium-windows-dll-name') +gallium_wgl_name = get_option('gallium-wgl-dll-name') libgallium_wgl = shared_library( gallium_wgl_name, ['wgl.c'], diff --git a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c index e3bbd34f00e3..03e33eb55088 100644 --- a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c +++ b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c @@ -478,6 +478,7 @@ virgl_drm_winsys_resource_get_storage_size(struct virgl_winsys *qws, static struct virgl_hw_res * virgl_drm_winsys_resource_create_handle(struct virgl_winsys *qws, struct winsys_handle *whandle, + UNUSED struct pipe_resource *templ, uint32_t *plane, uint32_t *stride, uint32_t *plane_offset, diff --git a/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c b/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c index 4dcc43f48067..fb9daaa57e6e 100644 --- a/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c +++ b/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c @@ -21,6 +21,7 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include +#include "util/macros.h" #include "util/u_surface.h" #include "util/u_memory.h" #include "util/format/u_format.h" @@ -630,6 +631,7 @@ static void virgl_fence_reference(struct virgl_winsys *vws, } static void virgl_vtest_flush_frontbuffer(struct virgl_winsys *vws, + UNUSED struct virgl_cmd_buf *cmdbuf, struct virgl_hw_res *res, unsigned level, unsigned layer, void *winsys_drawable_handle, diff --git a/src/intel/blorp/blorp_genX_exec.h b/src/intel/blorp/blorp_genX_exec.h index cd395aa33b2e..5a1b13ae29f9 100644 --- a/src/intel/blorp/blorp_genX_exec.h +++ b/src/intel/blorp/blorp_genX_exec.h @@ -113,11 +113,15 @@ blorp_get_surface_base_address(struct blorp_batch *batch); #if GFX_VER >= 7 static const struct intel_l3_config * blorp_get_l3_config(struct blorp_batch *batch); -# else +#endif + +static void +blorp_pre_emit_urb_config(struct blorp_batch *batch, + struct intel_urb_config *urb_config); + static void blorp_emit_urb_config(struct blorp_batch *batch, - unsigned vs_entry_size, unsigned sf_entry_size); -#endif + struct intel_urb_config *urb_config); static void blorp_emit_pipeline(struct blorp_batch *batch, @@ -241,14 +245,19 @@ emit_urb_config(struct blorp_batch *batch, #if GFX_VER >= 7 assert(sf_entry_size == 0); - const unsigned entry_size[4] = { vs_entry_size, 1, 1, 1 }; - unsigned entries[4], start[4]; + struct intel_urb_config urb_cfg = { + .size = { vs_entry_size, 1, 1, 1 }, + }; + bool constrained; intel_get_urb_config(batch->blorp->compiler->devinfo, blorp_get_l3_config(batch), - false, false, entry_size, - entries, start, deref_block_size, &constrained); + false, false, &urb_cfg, + deref_block_size, &constrained); + + /* Tell drivers about the config. */ + blorp_pre_emit_urb_config(batch, &urb_cfg); #if GFX_VERx10 == 70 /* From the IVB PRM Vol. 
2, Part 1, Section 3.2.1: @@ -269,9 +278,9 @@ emit_urb_config(struct blorp_batch *batch, for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) { blorp_emit(batch, GENX(3DSTATE_URB_VS), urb) { urb._3DCommandSubOpcode += i; - urb.VSURBStartingAddress = start[i]; - urb.VSURBEntryAllocationSize = entry_size[i] - 1; - urb.VSNumberofURBEntries = entries[i]; + urb.VSURBStartingAddress = urb_cfg.start[i]; + urb.VSURBEntryAllocationSize = urb_cfg.size[i] - 1; + urb.VSNumberofURBEntries = urb_cfg.entries[i]; } } @@ -283,7 +292,10 @@ emit_urb_config(struct blorp_batch *batch, } #else /* GFX_VER < 7 */ - blorp_emit_urb_config(batch, vs_entry_size, sf_entry_size); + struct intel_urb_config urb_cfg = { + .size = { vs_entry_size, 0, 0, 0, sf_entry_size, }, + }; + blorp_emit_urb_config(batch, &urb_cfg); #endif } diff --git a/src/intel/common/intel_l3_config.h b/src/intel/common/intel_l3_config.h index 8ed6c86acade..fa7a6b60de37 100644 --- a/src/intel/common/intel_l3_config.h +++ b/src/intel/common/intel_l3_config.h @@ -106,14 +106,33 @@ enum intel_urb_deref_block_size { INTEL_URB_DEREF_BLOCK_SIZE_MESH = 3, }; +struct intel_urb_config { + unsigned size[5]; + unsigned entries[5]; + unsigned start[5]; +}; + void intel_get_urb_config(const struct intel_device_info *devinfo, const struct intel_l3_config *l3_cfg, bool tess_present, bool gs_present, - const unsigned entry_size[4], - unsigned entries[4], unsigned start[4], + struct intel_urb_config *urb_cfg, enum intel_urb_deref_block_size *deref_block_size, bool *constrained); +/* Returns true if the URB setup changed for the given shader stage. */ +static inline bool +intel_urb_setup_changed(const struct intel_urb_config *a, + const struct intel_urb_config *b, + gl_shader_stage stage) +{ + if (a->size[stage] != b->size[stage] || + a->entries[stage] != b->entries[stage] || + a->start[stage] != b->start[stage]) + return true; + + return false; +} + struct intel_mesh_urb_allocation { unsigned task_entries; unsigned task_entry_size_64b; diff --git a/src/intel/common/intel_urb_config.c b/src/intel/common/intel_urb_config.c index d19645c31e5f..48ec0aef6cf0 100644 --- a/src/intel/common/intel_urb_config.c +++ b/src/intel/common/intel_urb_config.c @@ -64,8 +64,7 @@ void intel_get_urb_config(const struct intel_device_info *devinfo, const struct intel_l3_config *l3_cfg, bool tess_present, bool gs_present, - const unsigned entry_size[4], - unsigned entries[4], unsigned start[4], + struct intel_urb_config *urb_cfg, enum intel_urb_deref_block_size *deref_block_size, bool *constrained) { @@ -110,7 +109,7 @@ intel_get_urb_config(const struct intel_device_info *devinfo, */ unsigned granularity[4]; for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { - granularity[i] = (entry_size[i] < 9) ? 8 : 1; + granularity[i] = (urb_cfg->size[i] < 9) ? 8 : 1; } unsigned min_entries[4] = { @@ -148,7 +147,7 @@ intel_get_urb_config(const struct intel_device_info *devinfo, unsigned entry_size_bytes[4]; for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { - entry_size_bytes[i] = 64 * entry_size[i]; + entry_size_bytes[i] = 64 * urb_cfg->size[i]; } /* Initially, assign each stage the minimum amount of URB space it needs, @@ -208,20 +207,21 @@ intel_get_urb_config(const struct intel_device_info *devinfo, * allocated to each stage.
*/ for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { - entries[i] = chunks[i] * chunk_size_bytes / entry_size_bytes[i]; + urb_cfg->entries[i] = chunks[i] * chunk_size_bytes / entry_size_bytes[i]; /* Since we rounded up when computing wants[], this may be slightly * more than the maximum allowed amount, so correct for that. */ - entries[i] = MIN2(entries[i], devinfo->urb.max_entries[i]); + urb_cfg->entries[i] = MIN2(urb_cfg->entries[i], + devinfo->urb.max_entries[i]); /* Ensure that we program a multiple of the granularity. */ - entries[i] = ROUND_DOWN_TO(entries[i], granularity[i]); + urb_cfg->entries[i] = ROUND_DOWN_TO(urb_cfg->entries[i], granularity[i]); /* Finally, sanity check to make sure we have at least the minimum * number of entries needed for each stage. */ - assert(entries[i] >= min_entries[i]); + assert(urb_cfg->entries[i] >= min_entries[i]); } /* Lay out the URB in pipeline order: push constants, VS, HS, DS, GS. */ @@ -245,12 +245,12 @@ intel_get_urb_config(const struct intel_device_info *devinfo, int next_urb = first_urb; for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { - if (entries[i]) { - start[i] = next_urb; + if (urb_cfg->entries[i]) { + urb_cfg->start[i] = next_urb; next_urb += chunks[i]; } else { /* Put disabled stages at the beginning of the valid range */ - start[i] = first_urb; + urb_cfg->start[i] = first_urb; } } @@ -278,12 +278,12 @@ intel_get_urb_config(const struct intel_device_info *devinfo, if (gs_present) { *deref_block_size = INTEL_URB_DEREF_BLOCK_SIZE_PER_POLY; } else if (tess_present) { - if (entries[MESA_SHADER_TESS_EVAL] < 324) + if (urb_cfg->entries[MESA_SHADER_TESS_EVAL] < 324) *deref_block_size = INTEL_URB_DEREF_BLOCK_SIZE_PER_POLY; else *deref_block_size = INTEL_URB_DEREF_BLOCK_SIZE_32; } else { - if (entries[MESA_SHADER_VERTEX] < 192) + if (urb_cfg->entries[MESA_SHADER_VERTEX] < 192) *deref_block_size = INTEL_URB_DEREF_BLOCK_SIZE_PER_POLY; else *deref_block_size = INTEL_URB_DEREF_BLOCK_SIZE_32; diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 5aad102fa07a..5a4a9ffc6556 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -80,6 +80,7 @@ static const driOptionDescription anv_dri_options[] = { DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS(0) DRI_CONF_ANV_DISABLE_FCV(false) DRI_CONF_ANV_SAMPLE_MASK_OUT_OPENGL_BEHAVIOUR(false) + DRI_CONF_ANV_FORCE_FILTER_ADDR_ROUNDING(false) DRI_CONF_ANV_FP64_WORKAROUND_ENABLED(false) DRI_CONF_ANV_GENERATED_INDIRECT_THRESHOLD(4) DRI_CONF_ANV_GENERATED_INDIRECT_RING_THRESHOLD(100) @@ -958,7 +959,9 @@ get_properties_1_1(const struct anv_physical_device *pdevice, VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT | VK_SUBGROUP_FEATURE_QUAD_BIT | VK_SUBGROUP_FEATURE_ARITHMETIC_BIT | - VK_SUBGROUP_FEATURE_CLUSTERED_BIT; + VK_SUBGROUP_FEATURE_CLUSTERED_BIT | + VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR | + VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT_KHR; p->subgroupQuadOperationsInAllStages = true; p->pointClippingBehavior = VK_POINT_CLIPPING_BEHAVIOR_USER_CLIP_PLANES_ONLY; @@ -2467,6 +2470,8 @@ anv_init_dri_options(struct anv_instance *instance) driQueryOptionb(&instance->dri_options, "limit_trig_input_range"); instance->sample_mask_out_opengl_behaviour = driQueryOptionb(&instance->dri_options, "anv_sample_mask_out_opengl_behaviour"); + instance->force_filter_addr_rounding = + driQueryOptionb(&instance->dri_options, "anv_force_filter_addr_rounding"); instance->lower_depth_range_rate = driQueryOptionf(&instance->dri_options, "lower_depth_range_rate"); 
instance->no_16bit = diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h index bd3dc52f9107..a6a535334315 100644 --- a/src/intel/vulkan/anv_genX.h +++ b/src/intel/vulkan/anv_genX.h @@ -37,6 +37,7 @@ #endif struct intel_sample_positions; +struct intel_urb_config; extern const uint32_t genX(vk_to_intel_cullmode)[]; @@ -88,6 +89,9 @@ void genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer, unsigned width, unsigned height, unsigned scale); +void genX(urb_workaround)(struct anv_cmd_buffer *cmd_buffer, + const struct intel_urb_config *urb_cfg); + void genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer); void genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer); void genX(emit_pipeline_select)(struct anv_batch *batch, uint32_t pipeline, @@ -172,7 +176,8 @@ void genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch, const struct intel_l3_config *l3_config, VkShaderStageFlags active_stages, - const unsigned entry_size[4], + const struct intel_urb_config *urb_cfg_in, + struct intel_urb_config *urb_cfg_out, enum intel_urb_deref_block_size *deref_block_size); void genX(emit_sample_pattern)(struct anv_batch *batch, diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 98ebaf151c90..d06dc723882a 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1172,6 +1172,7 @@ struct anv_instance { uint8_t assume_full_subgroups; bool limit_trig_input_range; bool sample_mask_out_opengl_behaviour; + bool force_filter_addr_rounding; bool fp64_workaround_enabled; float lower_depth_range_rate; unsigned generated_indirect_threshold; @@ -3333,6 +3334,8 @@ struct anv_simple_shader { struct anv_shader_bin *kernel; /* L3 config used by the shader */ const struct intel_l3_config *l3_config; + /* Current URB config */ + const struct intel_urb_config *urb_cfg; /* Managed by the simpler shader helper*/ struct anv_state bt_state; @@ -3443,6 +3446,8 @@ struct anv_cmd_graphics_state { */ bool viewport_set; + struct intel_urb_config urb_cfg; + uint32_t n_occlusion_queries; struct anv_gfx_dynamic_state dyn_state; @@ -4275,6 +4280,9 @@ struct anv_graphics_pipeline { */ uint32_t batch_data[416]; + /* Urb setup utilized by this pipeline. */ + struct intel_urb_config urb_cfg; + /* Fully backed instructions, ready to be emitted in the anv_cmd_buffer */ struct { struct anv_gfx_state_ptr urb; diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c index a4ec021224de..11a0e3c7d929 100644 --- a/src/intel/vulkan/genX_blorp_exec.c +++ b/src/intel/vulkan/genX_blorp_exec.c @@ -255,6 +255,18 @@ blorp_flush_range(struct blorp_batch *batch, void *start, size_t size) */ } +static void +blorp_pre_emit_urb_config(struct blorp_batch *blorp_batch, + struct intel_urb_config *urb_cfg) +{ + struct anv_cmd_buffer *cmd_buffer = blorp_batch->driver_batch; + genX(urb_workaround)(cmd_buffer, urb_cfg); + + /* Update urb config. 
*/ + memcpy(&cmd_buffer->state.gfx.urb_cfg, urb_cfg, + sizeof(struct intel_urb_config)); +} + static const struct intel_l3_config * blorp_get_l3_config(struct blorp_batch *batch) { diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index f7b09ec6e86b..ad833b4ec1bd 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -395,22 +395,18 @@ transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer, 0, base_layer, layer_count, ISL_AUX_OP_AMBIGUATE); } -#if GFX_VER == 12 - /* Depth/Stencil writes by the render pipeline to D16 & S8 formats use a - * different pairing bit for the compression cache line. This means that - * there is potential for aliasing with the wrong cache if you use another - * format OR a piece of HW that does not use the same pairing. To avoid - * this, flush the tile cache as the compression data does not live in the - * color/depth cache. + /* Additional tile cache flush for MTL: + * + * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10420 + * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10530 */ - if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_HIZ_CCS && - final_needs_depth && !initial_depth_valid && - anv_image_format_is_d16_or_s8(image)) { + if (intel_device_info_is_mtl(cmd_buffer->device->info) && + image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_HIZ_CCS && + final_needs_depth && !initial_depth_valid) { anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_TILE_CACHE_FLUSH_BIT, - "D16 or S8 HIZ-CCS flush"); + "HIZ-CCS flush"); } -#endif } /* Transitions a HiZ-enabled depth buffer from one layout to another. Unless @@ -467,17 +463,15 @@ transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer, } } - /* Depth/Stencil writes by the render pipeline to D16 & S8 formats use a - * different pairing bit for the compression cache line. This means that - * there is potential for aliasing with the wrong cache if you use another - * format OR a piece of HW that does not use the same pairing. To avoid - * this, flush the tile cache as the compression data does not live in the - * color/depth cache. 
+ /* Additional tile cache flush for MTL: + * + * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10420 + * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10530 */ - if (anv_image_format_is_d16_or_s8(image)) { + if (intel_device_info_is_mtl(cmd_buffer->device->info)) { anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_TILE_CACHE_FLUSH_BIT, - "D16 or S8 HIZ-CCS flush"); + "HIZ-CCS flush"); } #endif } @@ -2956,6 +2950,7 @@ genX(CmdExecuteCommands)( container->state.current_hash_scale = 0; container->state.gfx.push_constant_stages = 0; container->state.gfx.ds_write_state = false; + memset(&container->state.gfx.urb_cfg, 0, sizeof(struct intel_urb_config)); memcpy(container->state.gfx.dyn_state.dirty, device->gfx_dirty_state, sizeof(container->state.gfx.dyn_state.dirty)); @@ -5513,6 +5508,30 @@ genX(batch_emit_fast_color_dummy_blit)(struct anv_batch *batch, #endif } +void +genX(urb_workaround)(struct anv_cmd_buffer *cmd_buffer, + const struct intel_urb_config *urb_cfg) +{ +#if INTEL_NEEDS_WA_16014912113 + const struct intel_urb_config *current = + &cmd_buffer->state.gfx.urb_cfg; + if (intel_urb_setup_changed(urb_cfg, current, MESA_SHADER_TESS_EVAL) && + current->size[0] != 0) { + for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) { + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_VS), urb) { + urb._3DCommandSubOpcode += i; + urb.VSURBStartingAddress = current->start[i]; + urb.VSURBEntryAllocationSize = current->size[i] - 1; + urb.VSNumberofURBEntries = i == 0 ? 256 : 0; + } + } + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.HDCPipelineFlushEnable = true; + } + } +#endif +} + struct anv_state genX(cmd_buffer_begin_companion_rcs_syncpoint)( struct anv_cmd_buffer *cmd_buffer) diff --git a/src/intel/vulkan/genX_cmd_draw_generated_indirect.h b/src/intel/vulkan/genX_cmd_draw_generated_indirect.h index 4eb27d262d5a..89fceb8fac5b 100644 --- a/src/intel/vulkan/genX_cmd_draw_generated_indirect.h +++ b/src/intel/vulkan/genX_cmd_draw_generated_indirect.h @@ -151,6 +151,7 @@ genX(cmd_buffer_emit_indirect_generated_draws_init)(struct anv_cmd_buffer *cmd_b .kernel = device->internal_kernels[ ANV_INTERNAL_KERNEL_GENERATED_DRAWS], .l3_config = device->internal_kernels_l3_config, + .urb_cfg = &cmd_buffer->state.gfx.urb_cfg, }; genX(emit_simple_shader_init)(state); @@ -478,6 +479,7 @@ genX(cmd_buffer_emit_indirect_generated_draws_inring)(struct anv_cmd_buffer *cmd .kernel = device->internal_kernels[ ANV_INTERNAL_KERNEL_GENERATED_DRAWS], .l3_config = device->internal_kernels_l3_config, + .urb_cfg = &cmd_buffer->state.gfx.urb_cfg, }; genX(emit_simple_shader_init)(&simple_state); diff --git a/src/intel/vulkan/genX_gfx_state.c b/src/intel/vulkan/genX_gfx_state.c index 3a66f10c26f8..db853db4b39b 100644 --- a/src/intel/vulkan/genX_gfx_state.c +++ b/src/intel/vulkan/genX_gfx_state.c @@ -1413,9 +1413,15 @@ cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer) &cmd_buffer->vk.dynamic_graphics_state; struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state; - if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_URB)) + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_URB)) { + genX(urb_workaround)(cmd_buffer, &pipeline->urb_cfg); + anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.urb); + memcpy(&gfx->urb_cfg, &pipeline->urb_cfg, + sizeof(struct intel_urb_config)); + } + if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE)) anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.ms); diff --git a/src/intel/vulkan/genX_gpu_memcpy.c 
b/src/intel/vulkan/genX_gpu_memcpy.c index 70b0851850f6..21699dc19544 100644 --- a/src/intel/vulkan/genX_gpu_memcpy.c +++ b/src/intel/vulkan/genX_gpu_memcpy.c @@ -53,6 +53,8 @@ gcd_pow2_u64(uint64_t a, uint64_t b) static void emit_common_so_memcpy(struct anv_batch *batch, struct anv_device *device, + const struct intel_urb_config *urb_cfg_in, + struct intel_urb_config *urb_cfg_out, const struct intel_l3_config *l3_config) { anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) { @@ -102,9 +104,11 @@ emit_common_so_memcpy(struct anv_batch *batch, struct anv_device *device, * store the data that VF is going to pass to SOL. */ const unsigned entry_size[4] = { DIV_ROUND_UP(32, 64), 1, 1, 1 }; + memcpy(urb_cfg_out->size, &entry_size, sizeof(entry_size)); genX(emit_urb_setup)(device, batch, l3_config, - VK_SHADER_STAGE_VERTEX_BIT, entry_size, NULL); + VK_SHADER_STAGE_VERTEX_BIT, urb_cfg_in, urb_cfg_out, + NULL); #if GFX_VER >= 12 /* Disable Primitive Replication. */ @@ -258,7 +262,10 @@ genX(emit_so_memcpy_init)(struct anv_memcpy_state *state, genX(emit_l3_config)(batch, device, cfg); genX(emit_pipeline_select)(batch, _3D, device); - emit_common_so_memcpy(batch, device, cfg); + struct intel_urb_config urb_cfg_in = { 0 }; + struct intel_urb_config urb_cfg = { 0 }; + + emit_common_so_memcpy(batch, device, &urb_cfg_in, &urb_cfg, cfg); } void @@ -325,7 +332,11 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer, genX(flush_pipeline_select_3d)(cmd_buffer); + struct intel_urb_config urb_cfg; + emit_common_so_memcpy(&cmd_buffer->batch, cmd_buffer->device, + &cmd_buffer->state.gfx.urb_cfg, + &urb_cfg, cmd_buffer->state.current_l3_config); emit_so_memcpy(&cmd_buffer->batch, cmd_buffer->device, dst, src, size); @@ -334,6 +345,10 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer, 1ull << 32); #endif + /* Update urb config after memcpy. */ + memcpy(&cmd_buffer->state.gfx.urb_cfg, &urb_cfg, + sizeof(struct intel_urb_config)); + /* Flag all the instructions emitted by the memcpy. */ struct anv_gfx_dynamic_state *hw_state = &cmd_buffer->state.gfx.dyn_state; diff --git a/src/intel/vulkan/genX_init_state.c b/src/intel/vulkan/genX_init_state.c index b52023bfda43..a6bd444cf034 100644 --- a/src/intel/vulkan/genX_init_state.c +++ b/src/intel/vulkan/genX_init_state.c @@ -1154,8 +1154,12 @@ VkResult genX(CreateSampler)( const VkFilter mag_filter = plane_has_chroma ? sampler->vk.ycbcr_conversion->state.chroma_filter : pCreateInfo->magFilter; - const bool enable_min_filter_addr_rounding = min_filter != VK_FILTER_NEAREST; - const bool enable_mag_filter_addr_rounding = mag_filter != VK_FILTER_NEAREST; + const bool force_addr_rounding = + device->physical->instance->force_filter_addr_rounding; + const bool enable_min_filter_addr_rounding = + force_addr_rounding || min_filter != VK_FILTER_NEAREST; + const bool enable_mag_filter_addr_rounding = + force_addr_rounding || mag_filter != VK_FILTER_NEAREST; /* From Broadwell PRM, SAMPLER_STATE: * "Mip Mode Filter must be set to MIPFILTER_NONE for Planar YUV surfaces." 
*/ diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c index a104b032271e..87f2ec4d763d 100644 --- a/src/intel/vulkan/genX_pipeline.c +++ b/src/intel/vulkan/genX_pipeline.c @@ -371,27 +371,42 @@ void genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch, const struct intel_l3_config *l3_config, VkShaderStageFlags active_stages, - const unsigned entry_size[4], + const struct intel_urb_config *urb_cfg_in, + struct intel_urb_config *urb_cfg_out, enum intel_urb_deref_block_size *deref_block_size) { const struct intel_device_info *devinfo = device->info; - unsigned entries[4]; - unsigned start[4]; bool constrained; intel_get_urb_config(devinfo, l3_config, active_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT, active_stages & VK_SHADER_STAGE_GEOMETRY_BIT, - entry_size, entries, start, deref_block_size, + urb_cfg_out, deref_block_size, &constrained); +#if INTEL_NEEDS_WA_16014912113 + if (intel_urb_setup_changed(urb_cfg_in, urb_cfg_out, + MESA_SHADER_TESS_EVAL) && urb_cfg_in->size[0] != 0) { + for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) { + anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) { + urb._3DCommandSubOpcode += i; + urb.VSURBStartingAddress = urb_cfg_in->start[i]; + urb.VSURBEntryAllocationSize = urb_cfg_in->size[i] - 1; + urb.VSNumberofURBEntries = i == 0 ? 256 : 0; + } + } + genx_batch_emit_pipe_control(batch, device->info, _3D, + ANV_PIPE_HDC_PIPELINE_FLUSH_BIT); + } +#endif + for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) { anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) { urb._3DCommandSubOpcode += i; - urb.VSURBStartingAddress = start[i]; - urb.VSURBEntryAllocationSize = entry_size[i] - 1; - urb.VSNumberofURBEntries = entries[i]; + urb.VSURBStartingAddress = urb_cfg_out->start[i]; + urb.VSURBEntryAllocationSize = urb_cfg_out->size[i] - 1; + urb.VSNumberofURBEntries = urb_cfg_out->entries[i]; } } #if GFX_VERx10 >= 125 @@ -458,21 +473,18 @@ emit_urb_setup(struct anv_graphics_pipeline *pipeline, return; } #endif - - unsigned entry_size[4]; for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { const struct brw_vue_prog_data *prog_data = !anv_pipeline_has_stage(pipeline, i) ? NULL : (const struct brw_vue_prog_data *) pipeline->base.shaders[i]->prog_data; - entry_size[i] = prog_data ? prog_data->urb_entry_size : 1; + pipeline->urb_cfg.size[i] = prog_data ? 
prog_data->urb_entry_size : 1; } struct anv_device *device = pipeline->base.base.device; const struct intel_device_info *devinfo = device->info; - unsigned entries[4]; - unsigned start[4]; + bool constrained; intel_get_urb_config(devinfo, pipeline->base.base.l3_config, @@ -480,17 +492,18 @@ emit_urb_setup(struct anv_graphics_pipeline *pipeline, VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT, pipeline->base.base.active_stages & VK_SHADER_STAGE_GEOMETRY_BIT, - entry_size, entries, start, deref_block_size, + &pipeline->urb_cfg, deref_block_size, &constrained); for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) { anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_VS), urb) { urb._3DCommandSubOpcode += i; - urb.VSURBStartingAddress = start[i]; - urb.VSURBEntryAllocationSize = entry_size[i] - 1; - urb.VSNumberofURBEntries = entries[i]; + urb.VSURBStartingAddress = pipeline->urb_cfg.start[i]; + urb.VSURBEntryAllocationSize = pipeline->urb_cfg.size[i] - 1; + urb.VSNumberofURBEntries = pipeline->urb_cfg.entries[i]; } } + #if GFX_VERx10 >= 125 if (device->vk.enabled_extensions.EXT_mesh_shader) { anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_TASK), zero); diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c index bc5ce323ad4e..57ccd76c3371 100644 --- a/src/intel/vulkan/genX_query.c +++ b/src/intel/vulkan/genX_query.c @@ -1744,6 +1744,7 @@ copy_query_results_with_shader(struct anv_cmd_buffer *cmd_buffer, ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_COMPUTE : ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_FRAGMENT], .l3_config = device->internal_kernels_l3_config, + .urb_cfg = &cmd_buffer->state.gfx.urb_cfg, }; genX(emit_simple_shader_init)(&state); diff --git a/src/intel/vulkan/genX_simple_shader.c b/src/intel/vulkan/genX_simple_shader.c index 2776f5ef256a..98fe617cff9b 100644 --- a/src/intel/vulkan/genX_simple_shader.c +++ b/src/intel/vulkan/genX_simple_shader.c @@ -103,7 +103,9 @@ genX(emit_simpler_shader_init_fragment)(struct anv_simple_shader *state) * allocate space for the VS. Even though one isn't run, we need VUEs to * store the data that VF is going to pass to SOL. */ - const unsigned entry_size[4] = { DIV_ROUND_UP(32, 64), 1, 1, 1 }; + struct intel_urb_config urb_cfg_out = { + .size = { DIV_ROUND_UP(32, 64), 1, 1, 1 }, + }; genX(emit_l3_config)(batch, device, state->l3_config); @@ -112,7 +114,7 @@ genX(emit_simpler_shader_init_fragment)(struct anv_simple_shader *state) enum intel_urb_deref_block_size deref_block_size; genX(emit_urb_setup)(device, batch, state->l3_config, VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT, - entry_size, &deref_block_size); + state->urb_cfg, &urb_cfg_out, &deref_block_size); anv_batch_emit(batch, GENX(3DSTATE_PS_BLEND), ps_blend) { ps_blend.HasWriteableRT = true; @@ -344,6 +346,10 @@ genX(emit_simpler_shader_init_fragment)(struct anv_simple_shader *state) BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL); } + /* Update urb config after simple shader. 
*/ + memcpy(&state->cmd_buffer->state.gfx.urb_cfg, &urb_cfg_out, + sizeof(struct intel_urb_config)); + state->cmd_buffer->state.gfx.vb_dirty = BITFIELD_BIT(0); state->cmd_buffer->state.gfx.dirty |= ~(ANV_CMD_DIRTY_INDEX_BUFFER | ANV_CMD_DIRTY_XFB_ENABLE); diff --git a/src/intel/vulkan_hasvk/genX_blorp_exec.c b/src/intel/vulkan_hasvk/genX_blorp_exec.c index 34734d05c67d..0d10e5212445 100644 --- a/src/intel/vulkan_hasvk/genX_blorp_exec.c +++ b/src/intel/vulkan_hasvk/genX_blorp_exec.c @@ -250,6 +250,13 @@ blorp_flush_range(struct blorp_batch *batch, void *start, size_t size) */ } +static void +blorp_pre_emit_urb_config(struct blorp_batch *blorp_batch, + struct intel_urb_config *urb_cfg) +{ + /* Dummy. */ +} + static const struct intel_l3_config * blorp_get_l3_config(struct blorp_batch *batch) { diff --git a/src/intel/vulkan_hasvk/genX_pipeline.c b/src/intel/vulkan_hasvk/genX_pipeline.c index 789e109f2bc6..8e50f660c4cd 100644 --- a/src/intel/vulkan_hasvk/genX_pipeline.c +++ b/src/intel/vulkan_hasvk/genX_pipeline.c @@ -276,16 +276,16 @@ genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch, enum intel_urb_deref_block_size *deref_block_size) { const struct intel_device_info *devinfo = device->info; + struct intel_urb_config urb_cfg = { + .size = { entry_size[0], entry_size[1], entry_size[2], entry_size[3], }, + }; - unsigned entries[4]; - unsigned start[4]; bool constrained; intel_get_urb_config(devinfo, l3_config, active_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT, active_stages & VK_SHADER_STAGE_GEOMETRY_BIT, - entry_size, entries, start, deref_block_size, - &constrained); + &urb_cfg, deref_block_size, &constrained); #if GFX_VERx10 == 70 /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1: @@ -306,9 +306,9 @@ genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch, for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) { anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) { urb._3DCommandSubOpcode += i; - urb.VSURBStartingAddress = start[i]; - urb.VSURBEntryAllocationSize = entry_size[i] - 1; - urb.VSNumberofURBEntries = entries[i]; + urb.VSURBStartingAddress = urb_cfg.start[i]; + urb.VSURBEntryAllocationSize = urb_cfg.size[i] - 1; + urb.VSNumberofURBEntries = urb_cfg.entries[i]; } } } diff --git a/src/meson.build b/src/meson.build index 1293538b8f66..fb516e122e6d 100644 --- a/src/meson.build +++ b/src/meson.build @@ -91,7 +91,7 @@ endif if with_gallium_etnaviv subdir('etnaviv') endif -if system_has_kms_drm +if system_has_kms_drm or with_gallium_virgl subdir('virtio') endif if with_gallium_freedreno or with_freedreno_vk or with_tools.contains('freedreno') diff --git a/src/panfrost/util/pan_lower_writeout.c b/src/panfrost/util/pan_lower_writeout.c index 56b33a495b0e..eb528ce3bf3c 100644 --- a/src/panfrost/util/pan_lower_writeout.c +++ b/src/panfrost/util/pan_lower_writeout.c @@ -106,6 +106,7 @@ pan_nir_lower_zs_store(nir_shader *nir) stores[1] = intr; writeout |= PAN_WRITEOUT_S; } else if (sem.dual_source_blend_index) { + assert(!stores[2]); /* there should be only 1 source for dual blending */ stores[2] = intr; writeout |= PAN_WRITEOUT_2; } diff --git a/src/util/00-mesa-defaults.conf b/src/util/00-mesa-defaults.conf index 440e0f2ec814..521ea9321ad3 100644 --- a/src/util/00-mesa-defaults.conf +++ b/src/util/00-mesa-defaults.conf @@ -1189,6 +1189,9 @@ TODO: document the other workarounds. + +
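Aside on the Intel hunks above: struct intel_urb_config folds the previous entry_size[4]/entries[4]/start[4] triplet into one snapshotable value, so command buffers can remember the last-programmed layout and cheaply detect a reallocation. Below is a sketch of the intended consumer flow, assuming the caller tracks the previous config itself; emit_urb_example and the include path are illustrative, not from this patch:

    #include "common/intel_l3_config.h"

    static void
    emit_urb_example(const struct intel_device_info *devinfo,
                     const struct intel_l3_config *l3,
                     struct intel_urb_config *last, /* last-programmed config */
                     unsigned vs_entry_size)
    {
       struct intel_urb_config cfg = {
          .size = { vs_entry_size, 1, 1, 1 }, /* VS/HS/DS/GS, in 64B units */
       };
       enum intel_urb_deref_block_size deref;
       bool constrained;

       /* Fills cfg.entries[] and cfg.start[] from cfg.size[] and the L3 layout. */
       intel_get_urb_config(devinfo, l3, false /* tess */, false /* gs */,
                            &cfg, &deref, &constrained);

       if (intel_urb_setup_changed(last, &cfg, MESA_SHADER_TESS_EVAL)) {
          /* Wa_16014912113 path: re-emit the previous allocation with 256 VS
           * entries and an HDC flush before reprogramming, as the
           * genX(urb_workaround) / genX(emit_urb_setup) hunks above do. */
       }

       /* ... emit 3DSTATE_URB_VS..GS from cfg.start/size/entries ... */

       *last = cfg; /* snapshot for the next change detection */
    }

One struct copy replaces three array copies, which is why the anv hunks can keep the config in anv_cmd_graphics_state and reset it with a single memset when secondary command buffers execute.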