diff --git a/Cargo.toml b/Cargo.toml index 4ddf0c8..5aabd23 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "luau0-src" -version = "0.8.3+luau614" +version = "0.8.4+luau616" authors = ["Aleksandr Orlenko "] edition = "2021" repository = "https://github.com/khvzak/luau-src-rs" diff --git a/luau/CodeGen/include/Luau/IrData.h b/luau/CodeGen/include/Luau/IrData.h index 1d9bbc7..79d06e5 100644 --- a/luau/CodeGen/include/Luau/IrData.h +++ b/luau/CodeGen/include/Luau/IrData.h @@ -398,6 +398,7 @@ enum class IrCmd : uint8_t // A, B: tag // C: block/vmexit/undef // In final x64 lowering, A can also be Rn + // When DebugLuauAbortingChecks flag is enabled, A can also be Rn // When undef is specified instead of a block, execution is aborted on check failure CHECK_TAG, diff --git a/luau/CodeGen/include/Luau/IrVisitUseDef.h b/luau/CodeGen/include/Luau/IrVisitUseDef.h index acff0d7..27244cb 100644 --- a/luau/CodeGen/include/Luau/IrVisitUseDef.h +++ b/luau/CodeGen/include/Luau/IrVisitUseDef.h @@ -4,6 +4,8 @@ #include "Luau/Common.h" #include "Luau/IrData.h" +LUAU_FASTFLAG(LuauCodegenRemoveDeadStores2) + namespace Luau { namespace CodeGen @@ -186,7 +188,15 @@ static void visitVmRegDefsUses(T& visitor, IrFunction& function, const IrInst& i visitor.def(inst.b); break; case IrCmd::FALLBACK_FORGPREP: - visitor.use(inst.b); + if (FFlag::LuauCodegenRemoveDeadStores2) + { + // This instruction doesn't always redefine Rn, Rn+1, Rn+2, so we have to mark it as implicit use + visitor.useRange(vmRegOp(inst.b), 3); + } + else + { + visitor.use(inst.b); + } visitor.defRange(vmRegOp(inst.b), 3); break; @@ -204,6 +214,11 @@ static void visitVmRegDefsUses(T& visitor, IrFunction& function, const IrInst& i visitor.use(inst.a); break; + // After optimizations with DebugLuauAbortingChecks enabled, CHECK_TAG Rn, tag, block instructions are generated + case IrCmd::CHECK_TAG: + visitor.maybeUse(inst.a); + break; + default: // All instructions which reference registers have to be 
handled explicitly CODEGEN_ASSERT(inst.a.kind != IrOpKind::VmReg); diff --git a/luau/CodeGen/include/Luau/OptimizeDeadStore.h b/luau/CodeGen/include/Luau/OptimizeDeadStore.h new file mode 100644 index 0000000..45395a5 --- /dev/null +++ b/luau/CodeGen/include/Luau/OptimizeDeadStore.h @@ -0,0 +1,16 @@ +// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details +#pragma once + +#include "Luau/IrData.h" + +namespace Luau +{ +namespace CodeGen +{ + +struct IrBuilder; + +void markDeadStoresInBlockChains(IrBuilder& build); + +} // namespace CodeGen +} // namespace Luau diff --git a/luau/CodeGen/src/AssemblyBuilderX64.cpp b/luau/CodeGen/src/AssemblyBuilderX64.cpp index be0f719..bed7e0e 100644 --- a/luau/CodeGen/src/AssemblyBuilderX64.cpp +++ b/luau/CodeGen/src/AssemblyBuilderX64.cpp @@ -6,8 +6,6 @@ #include #include -LUAU_FASTFLAGVARIABLE(LuauCache32BitAsmConsts, false) - namespace Luau { namespace CodeGen @@ -1041,33 +1039,24 @@ OperandX64 AssemblyBuilderX64::i64(int64_t value) OperandX64 AssemblyBuilderX64::f32(float value) { - if (FFlag::LuauCache32BitAsmConsts) - { - uint32_t as32BitKey; - static_assert(sizeof(as32BitKey) == sizeof(value), "Expecting float to be 32-bit"); - memcpy(&as32BitKey, &value, sizeof(value)); + uint32_t as32BitKey; + static_assert(sizeof(as32BitKey) == sizeof(value), "Expecting float to be 32-bit"); + memcpy(&as32BitKey, &value, sizeof(value)); - if (as32BitKey != ~0u) - { - if (int32_t* prev = constCache32.find(as32BitKey)) - return OperandX64(SizeX64::dword, noreg, 1, rip, *prev); - } + if (as32BitKey != ~0u) + { + if (int32_t* prev = constCache32.find(as32BitKey)) + return OperandX64(SizeX64::dword, noreg, 1, rip, *prev); + } - size_t pos = allocateData(4, 4); - writef32(&data[pos], value); - int32_t offset = int32_t(pos - data.size()); + size_t pos = allocateData(4, 4); + writef32(&data[pos], value); + int32_t offset = int32_t(pos - data.size()); - if (as32BitKey != ~0u) - 
constCache32[as32BitKey] = offset; + if (as32BitKey != ~0u) + constCache32[as32BitKey] = offset; - return OperandX64(SizeX64::dword, noreg, 1, rip, offset); - } - else - { - size_t pos = allocateData(4, 4); - writef32(&data[pos], value); - return OperandX64(SizeX64::dword, noreg, 1, rip, int32_t(pos - data.size())); - } + return OperandX64(SizeX64::dword, noreg, 1, rip, offset); } OperandX64 AssemblyBuilderX64::f64(double value) diff --git a/luau/CodeGen/src/CodeAllocator.cpp b/luau/CodeGen/src/CodeAllocator.cpp index ab623b4..ca667ae 100644 --- a/luau/CodeGen/src/CodeAllocator.cpp +++ b/luau/CodeGen/src/CodeAllocator.cpp @@ -27,6 +27,10 @@ const size_t kPageSize = sysconf(_SC_PAGESIZE); #endif #endif +#ifdef __APPLE__ +extern "C" void sys_icache_invalidate(void* start, size_t len); +#endif + static size_t alignToPageSize(size_t size) { return (size + kPageSize - 1) & ~(kPageSize - 1); @@ -98,7 +102,11 @@ static void makePagesExecutable(uint8_t* mem, size_t size) static void flushInstructionCache(uint8_t* mem, size_t size) { +#ifdef __APPLE__ + sys_icache_invalidate(mem, size); +#else __builtin___clear_cache((char*)mem, (char*)mem + size); +#endif } #endif diff --git a/luau/CodeGen/src/CodeGenLower.h b/luau/CodeGen/src/CodeGenLower.h index c011981..33f395f 100644 --- a/luau/CodeGen/src/CodeGenLower.h +++ b/luau/CodeGen/src/CodeGenLower.h @@ -8,6 +8,7 @@ #include "Luau/IrDump.h" #include "Luau/IrUtils.h" #include "Luau/OptimizeConstProp.h" +#include "Luau/OptimizeDeadStore.h" #include "Luau/OptimizeFinalX64.h" #include "EmitCommon.h" @@ -26,6 +27,7 @@ LUAU_FASTFLAG(DebugCodegenSkipNumbering) LUAU_FASTINT(CodegenHeuristicsInstructionLimit) LUAU_FASTINT(CodegenHeuristicsBlockLimit) LUAU_FASTINT(CodegenHeuristicsBlockInstructionLimit) +LUAU_FASTFLAG(LuauCodegenRemoveDeadStores2) namespace Luau { @@ -309,6 +311,9 @@ inline bool lowerFunction(IrBuilder& ir, AssemblyBuilder& build, ModuleHelpers& stats->blockLinearizationStats.constPropInstructionCount += 
constPropInstructionCount; } } + + if (FFlag::LuauCodegenRemoveDeadStores2) + markDeadStoresInBlockChains(ir); } std::vector sortedBlocks = getSortedBlockOrder(ir.function); diff --git a/luau/CodeGen/src/IrLoweringA64.cpp b/luau/CodeGen/src/IrLoweringA64.cpp index 6a5703d..2a29694 100644 --- a/luau/CodeGen/src/IrLoweringA64.cpp +++ b/luau/CodeGen/src/IrLoweringA64.cpp @@ -11,10 +11,9 @@ #include "lstate.h" #include "lgc.h" -LUAU_DYNAMIC_FASTFLAGVARIABLE(LuauCodeGenFixBufferLenCheckA64, false) LUAU_FASTFLAGVARIABLE(LuauCodeGenVectorA64, false) -LUAU_FASTFLAG(LuauCodegenVectorTag) +LUAU_FASTFLAG(LuauCodegenVectorTag2) namespace Luau { @@ -680,7 +679,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) { build.fadd(inst.regA64, regOp(inst.a), regOp(inst.b)); - if (!FFlag::LuauCodegenVectorTag) + if (!FFlag::LuauCodegenVectorTag2) { RegisterA64 tempw = regs.allocTemp(KindA64::w); build.mov(tempw, LUA_TVECTOR); @@ -710,7 +709,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) { build.fsub(inst.regA64, regOp(inst.a), regOp(inst.b)); - if (!FFlag::LuauCodegenVectorTag) + if (!FFlag::LuauCodegenVectorTag2) { RegisterA64 tempw = regs.allocTemp(KindA64::w); build.mov(tempw, LUA_TVECTOR); @@ -740,7 +739,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) { build.fmul(inst.regA64, regOp(inst.a), regOp(inst.b)); - if (!FFlag::LuauCodegenVectorTag) + if (!FFlag::LuauCodegenVectorTag2) { RegisterA64 tempw = regs.allocTemp(KindA64::w); build.mov(tempw, LUA_TVECTOR); @@ -770,7 +769,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) { build.fdiv(inst.regA64, regOp(inst.a), regOp(inst.b)); - if (!FFlag::LuauCodegenVectorTag) + if (!FFlag::LuauCodegenVectorTag2) { RegisterA64 tempw = regs.allocTemp(KindA64::w); build.mov(tempw, LUA_TVECTOR); @@ -800,7 +799,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) { 
build.fneg(inst.regA64, regOp(inst.a)); - if (!FFlag::LuauCodegenVectorTag) + if (!FFlag::LuauCodegenVectorTag2) { RegisterA64 tempw = regs.allocTemp(KindA64::w); build.mov(tempw, LUA_TVECTOR); @@ -1184,7 +1183,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) build.fcvt(temps, tempd); build.dup_4s(inst.regA64, castReg(KindA64::q, temps), 0); - if (!FFlag::LuauCodegenVectorTag) + if (!FFlag::LuauCodegenVectorTag2) { build.mov(tempw, LUA_TVECTOR); build.ins_4s(inst.regA64, tempw, 3); @@ -1629,11 +1628,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) RegisterA64 tempx = castReg(KindA64::x, temp); build.sub(tempx, tempx, regOp(inst.b)); // implicit uxtw build.cmp(tempx, uint16_t(accessSize)); - - if (DFFlag::LuauCodeGenFixBufferLenCheckA64) - build.b(ConditionA64::Less, target); // note: this is a signed 64-bit comparison so that out of bounds offset fails - else - build.b(ConditionA64::LessEqual, target); // note: this is a signed 64-bit comparison so that out of bounds offset fails + build.b(ConditionA64::Less, target); // note: this is a signed 64-bit comparison so that out of bounds offset fails } } else if (inst.b.kind == IrOpKind::Constant) diff --git a/luau/CodeGen/src/IrLoweringX64.cpp b/luau/CodeGen/src/IrLoweringX64.cpp index c5188dc..cfa9ba9 100644 --- a/luau/CodeGen/src/IrLoweringX64.cpp +++ b/luau/CodeGen/src/IrLoweringX64.cpp @@ -15,7 +15,9 @@ #include "lstate.h" #include "lgc.h" -LUAU_FASTFLAG(LuauCodegenVectorTag) +LUAU_FASTFLAG(LuauCodegenVectorTag2) +LUAU_FASTFLAGVARIABLE(LuauCodegenVectorOptAnd, false) +LUAU_FASTFLAGVARIABLE(LuauCodegenSmallerUnm, false) namespace Luau { @@ -541,18 +543,24 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) { inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a}); - RegisterX64 src = regOp(inst.a); - - if (inst.regX64 == src) + if (FFlag::LuauCodegenSmallerUnm) { - build.vxorpd(inst.regX64, 
inst.regX64, build.f64(-0.0)); + build.vxorpd(inst.regX64, regOp(inst.a), build.f64(-0.0)); } else { - build.vmovsd(inst.regX64, src, src); - build.vxorpd(inst.regX64, inst.regX64, build.f64(-0.0)); - } + RegisterX64 src = regOp(inst.a); + if (inst.regX64 == src) + { + build.vxorpd(inst.regX64, inst.regX64, build.f64(-0.0)); + } + else + { + build.vmovsd(inst.regX64, src, src); + build.vxorpd(inst.regX64, inst.regX64, build.f64(-0.0)); + } + } break; } case IrCmd::FLOOR_NUM: @@ -603,15 +611,28 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) { inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b}); - ScopedRegX64 tmp1{regs, SizeX64::xmmword}; - ScopedRegX64 tmp2{regs, SizeX64::xmmword}; + if (FFlag::LuauCodegenVectorOptAnd) + { + ScopedRegX64 tmp1{regs}; + ScopedRegX64 tmp2{regs}; - // Fourth component is the tag number which is interpreted as a denormal and has to be filtered out - build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp()); - build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp()); - build.vaddps(inst.regX64, tmp1.reg, tmp2.reg); + RegisterX64 tmpa = vecOp(inst.a, tmp1); + RegisterX64 tmpb = (inst.a == inst.b) ? 
tmpa : vecOp(inst.b, tmp2); - if (!FFlag::LuauCodegenVectorTag) + build.vaddps(inst.regX64, tmpa, tmpb); + } + else + { + ScopedRegX64 tmp1{regs, SizeX64::xmmword}; + ScopedRegX64 tmp2{regs, SizeX64::xmmword}; + + // Fourth component is the tag number which is interpreted as a denormal and has to be filtered out + build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp()); + build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp()); + build.vaddps(inst.regX64, tmp1.reg, tmp2.reg); + } + + if (!FFlag::LuauCodegenVectorTag2) build.vorps(inst.regX64, inst.regX64, vectorOrMaskOp()); break; } @@ -619,14 +640,28 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) { inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b}); - ScopedRegX64 tmp1{regs, SizeX64::xmmword}; - ScopedRegX64 tmp2{regs, SizeX64::xmmword}; + if (FFlag::LuauCodegenVectorOptAnd) + { + ScopedRegX64 tmp1{regs}; + ScopedRegX64 tmp2{regs}; + + RegisterX64 tmpa = vecOp(inst.a, tmp1); + RegisterX64 tmpb = (inst.a == inst.b) ? 
tmpa : vecOp(inst.b, tmp2); - // Fourth component is the tag number which is interpreted as a denormal and has to be filtered out - build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp()); - build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp()); - build.vsubps(inst.regX64, tmp1.reg, tmp2.reg); - if (!FFlag::LuauCodegenVectorTag) + build.vsubps(inst.regX64, tmpa, tmpb); + } + else + { + ScopedRegX64 tmp1{regs, SizeX64::xmmword}; + ScopedRegX64 tmp2{regs, SizeX64::xmmword}; + + // Fourth component is the tag number which is interpreted as a denormal and has to be filtered out + build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp()); + build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp()); + build.vsubps(inst.regX64, tmp1.reg, tmp2.reg); + } + + if (!FFlag::LuauCodegenVectorTag2) build.vorps(inst.regX64, inst.regX64, vectorOrMaskOp()); break; } @@ -634,14 +669,28 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) { inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b}); - ScopedRegX64 tmp1{regs, SizeX64::xmmword}; - ScopedRegX64 tmp2{regs, SizeX64::xmmword}; + if (FFlag::LuauCodegenVectorOptAnd) + { + ScopedRegX64 tmp1{regs}; + ScopedRegX64 tmp2{regs}; + + RegisterX64 tmpa = vecOp(inst.a, tmp1); + RegisterX64 tmpb = (inst.a == inst.b) ? 
tmpa : vecOp(inst.b, tmp2); - // Fourth component is the tag number which is interpreted as a denormal and has to be filtered out - build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp()); - build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp()); - build.vmulps(inst.regX64, tmp1.reg, tmp2.reg); - if (!FFlag::LuauCodegenVectorTag) + build.vmulps(inst.regX64, tmpa, tmpb); + } + else + { + ScopedRegX64 tmp1{regs, SizeX64::xmmword}; + ScopedRegX64 tmp2{regs, SizeX64::xmmword}; + + // Fourth component is the tag number which is interpreted as a denormal and has to be filtered out + build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp()); + build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp()); + build.vmulps(inst.regX64, tmp1.reg, tmp2.reg); + } + + if (!FFlag::LuauCodegenVectorTag2) build.vorps(inst.regX64, inst.regX64, vectorOrMaskOp()); break; } @@ -649,14 +698,28 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) { inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b}); - ScopedRegX64 tmp1{regs, SizeX64::xmmword}; - ScopedRegX64 tmp2{regs, SizeX64::xmmword}; + if (FFlag::LuauCodegenVectorOptAnd) + { + ScopedRegX64 tmp1{regs}; + ScopedRegX64 tmp2{regs}; + + RegisterX64 tmpa = vecOp(inst.a, tmp1); + RegisterX64 tmpb = (inst.a == inst.b) ? 
tmpa : vecOp(inst.b, tmp2); + + build.vdivps(inst.regX64, tmpa, tmpb); + } + else + { + ScopedRegX64 tmp1{regs, SizeX64::xmmword}; + ScopedRegX64 tmp2{regs, SizeX64::xmmword}; + + // Fourth component is the tag number which is interpreted as a denormal and has to be filtered out + build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp()); + build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp()); + build.vdivps(inst.regX64, tmp1.reg, tmp2.reg); + } - // Fourth component is the tag number which is interpreted as a denormal and has to be filtered out - build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp()); - build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp()); - build.vdivps(inst.regX64, tmp1.reg, tmp2.reg); - if (!FFlag::LuauCodegenVectorTag) + if (!FFlag::LuauCodegenVectorTag2) build.vpinsrd(inst.regX64, inst.regX64, build.i32(LUA_TVECTOR), 3); break; } @@ -664,19 +727,26 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) { inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a}); - RegisterX64 src = regOp(inst.a); - - if (inst.regX64 == src) + if (FFlag::LuauCodegenSmallerUnm) { - build.vxorpd(inst.regX64, inst.regX64, build.f32x4(-0.0, -0.0, -0.0, -0.0)); + build.vxorpd(inst.regX64, regOp(inst.a), build.f32x4(-0.0, -0.0, -0.0, -0.0)); } else { - build.vmovsd(inst.regX64, src, src); - build.vxorpd(inst.regX64, inst.regX64, build.f32x4(-0.0, -0.0, -0.0, -0.0)); + RegisterX64 src = regOp(inst.a); + + if (inst.regX64 == src) + { + build.vxorpd(inst.regX64, inst.regX64, build.f32x4(-0.0, -0.0, -0.0, -0.0)); + } + else + { + build.vmovsd(inst.regX64, src, src); + build.vxorpd(inst.regX64, inst.regX64, build.f32x4(-0.0, -0.0, -0.0, -0.0)); + } } - if (!FFlag::LuauCodegenVectorTag) + if (!FFlag::LuauCodegenVectorTag2) build.vpinsrd(inst.regX64, inst.regX64, build.i32(LUA_TVECTOR), 3); break; } @@ -982,7 +1052,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) 
static_assert(sizeof(asU32) == sizeof(value), "Expecting float to be 32-bit"); memcpy(&asU32, &value, sizeof(value)); - if (FFlag::LuauCodegenVectorTag) + if (FFlag::LuauCodegenVectorTag2) build.vmovaps(inst.regX64, build.u32x4(asU32, asU32, asU32, 0)); else build.vmovaps(inst.regX64, build.u32x4(asU32, asU32, asU32, LUA_TVECTOR)); @@ -992,7 +1062,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next) build.vcvtsd2ss(inst.regX64, inst.regX64, memRegDoubleOp(inst.a)); build.vpshufps(inst.regX64, inst.regX64, inst.regX64, 0b00'00'00'00); - if (!FFlag::LuauCodegenVectorTag) + if (!FFlag::LuauCodegenVectorTag2) build.vpinsrd(inst.regX64, inst.regX64, build.i32(LUA_TVECTOR), 3); } break; @@ -2234,6 +2304,24 @@ OperandX64 IrLoweringX64::bufferAddrOp(IrOp bufferOp, IrOp indexOp) return noreg; } +RegisterX64 IrLoweringX64::vecOp(IrOp op, ScopedRegX64& tmp) +{ + if (FFlag::LuauCodegenVectorOptAnd && FFlag::LuauCodegenVectorTag2) + { + IrInst source = function.instOp(op); + CODEGEN_ASSERT(source.cmd != IrCmd::SUBSTITUTE); // we don't process substitutions + + // source that comes from memory or from tag instruction has .w = TVECTOR, which is denormal + // to avoid performance degradation on some CPUs we mask this component to produce zero + // otherwise we conservatively assume the vector is a result of a well formed math op so .w is a normal number or zero + if (source.cmd != IrCmd::LOAD_TVALUE && source.cmd != IrCmd::TAG_VECTOR) + return regOp(op); + } + tmp.alloc(SizeX64::xmmword); + build.vandps(tmp.reg, regOp(op), vectorAndMaskOp()); + return tmp.reg; +} + IrConst IrLoweringX64::constOp(IrOp op) const { return function.constOp(op); @@ -2279,6 +2367,8 @@ OperandX64 IrLoweringX64::vectorAndMaskOp() OperandX64 IrLoweringX64::vectorOrMaskOp() { + CODEGEN_ASSERT(!FFlag::LuauCodegenVectorTag2); + if (vectorOrMask.base == noreg) vectorOrMask = build.u32x4(0, 0, 0, LUA_TVECTOR); diff --git a/luau/CodeGen/src/IrLoweringX64.h 
b/luau/CodeGen/src/IrLoweringX64.h index f58a5d8..7ec4079 100644 --- a/luau/CodeGen/src/IrLoweringX64.h +++ b/luau/CodeGen/src/IrLoweringX64.h @@ -51,6 +51,7 @@ struct IrLoweringX64 OperandX64 memRegTagOp(IrOp op); RegisterX64 regOp(IrOp op); OperandX64 bufferAddrOp(IrOp bufferOp, IrOp indexOp); + RegisterX64 vecOp(IrOp op, ScopedRegX64& tmp); IrConst constOp(IrOp op) const; uint8_t tagOp(IrOp op) const; diff --git a/luau/CodeGen/src/IrTranslation.cpp b/luau/CodeGen/src/IrTranslation.cpp index 686d513..995225a 100644 --- a/luau/CodeGen/src/IrTranslation.cpp +++ b/luau/CodeGen/src/IrTranslation.cpp @@ -12,8 +12,7 @@ #include "lstate.h" #include "ltm.h" -LUAU_FASTFLAGVARIABLE(LuauCodegenLuData, false) -LUAU_FASTFLAGVARIABLE(LuauCodegenVector, false) +LUAU_FASTFLAGVARIABLE(LuauCodegenVectorTag2, false) LUAU_FASTFLAGVARIABLE(LuauCodegenVectorTag, false) namespace Luau @@ -354,100 +353,97 @@ static void translateInstBinaryNumeric(IrBuilder& build, int ra, int rb, int rc, { BytecodeTypes bcTypes = build.function.getBytecodeTypesAt(pcpos); - if (FFlag::LuauCodegenVector) + // Special fast-paths for vectors, matching the cases we have in VM + if (bcTypes.a == LBC_TYPE_VECTOR && bcTypes.b == LBC_TYPE_VECTOR && (tm == TM_ADD || tm == TM_SUB || tm == TM_MUL || tm == TM_DIV)) { - // Special fast-paths for vectors, matching the cases we have in VM - if (bcTypes.a == LBC_TYPE_VECTOR && bcTypes.b == LBC_TYPE_VECTOR && (tm == TM_ADD || tm == TM_SUB || tm == TM_MUL || tm == TM_DIV)) + build.inst(IrCmd::CHECK_TAG, build.inst(IrCmd::LOAD_TAG, build.vmReg(rb)), build.constTag(LUA_TVECTOR), build.vmExit(pcpos)); + build.inst(IrCmd::CHECK_TAG, build.inst(IrCmd::LOAD_TAG, build.vmReg(rc)), build.constTag(LUA_TVECTOR), build.vmExit(pcpos)); + + IrOp vb = build.inst(IrCmd::LOAD_TVALUE, opb); + IrOp vc = build.inst(IrCmd::LOAD_TVALUE, opc); + IrOp result; + + switch (tm) { - build.inst(IrCmd::CHECK_TAG, build.inst(IrCmd::LOAD_TAG, build.vmReg(rb)), build.constTag(LUA_TVECTOR), 
build.vmExit(pcpos)); - build.inst(IrCmd::CHECK_TAG, build.inst(IrCmd::LOAD_TAG, build.vmReg(rc)), build.constTag(LUA_TVECTOR), build.vmExit(pcpos)); - - IrOp vb = build.inst(IrCmd::LOAD_TVALUE, opb); - IrOp vc = build.inst(IrCmd::LOAD_TVALUE, opc); - IrOp result; - - switch (tm) - { - case TM_ADD: - result = build.inst(IrCmd::ADD_VEC, vb, vc); - break; - case TM_SUB: - result = build.inst(IrCmd::SUB_VEC, vb, vc); - break; - case TM_MUL: - result = build.inst(IrCmd::MUL_VEC, vb, vc); - break; - case TM_DIV: - result = build.inst(IrCmd::DIV_VEC, vb, vc); - break; - default: - CODEGEN_ASSERT(!"Unknown TM op"); - } - - if (FFlag::LuauCodegenVectorTag) - result = build.inst(IrCmd::TAG_VECTOR, result); - - build.inst(IrCmd::STORE_TVALUE, build.vmReg(ra), result); - return; + case TM_ADD: + result = build.inst(IrCmd::ADD_VEC, vb, vc); + break; + case TM_SUB: + result = build.inst(IrCmd::SUB_VEC, vb, vc); + break; + case TM_MUL: + result = build.inst(IrCmd::MUL_VEC, vb, vc); + break; + case TM_DIV: + result = build.inst(IrCmd::DIV_VEC, vb, vc); + break; + default: + CODEGEN_ASSERT(!"Unknown TM op"); } - else if (bcTypes.a == LBC_TYPE_NUMBER && bcTypes.b == LBC_TYPE_VECTOR && (tm == TM_MUL || tm == TM_DIV)) + + if (FFlag::LuauCodegenVectorTag2) + result = build.inst(IrCmd::TAG_VECTOR, result); + + build.inst(IrCmd::STORE_TVALUE, build.vmReg(ra), result); + return; + } + else if (bcTypes.a == LBC_TYPE_NUMBER && bcTypes.b == LBC_TYPE_VECTOR && (tm == TM_MUL || tm == TM_DIV)) + { + if (rb != -1) + build.inst(IrCmd::CHECK_TAG, build.inst(IrCmd::LOAD_TAG, build.vmReg(rb)), build.constTag(LUA_TNUMBER), build.vmExit(pcpos)); + + build.inst(IrCmd::CHECK_TAG, build.inst(IrCmd::LOAD_TAG, build.vmReg(rc)), build.constTag(LUA_TVECTOR), build.vmExit(pcpos)); + + IrOp vb = build.inst(IrCmd::NUM_TO_VEC, loadDoubleOrConstant(build, opb)); + IrOp vc = build.inst(IrCmd::LOAD_TVALUE, opc); + IrOp result; + + switch (tm) { - if (rb != -1) - build.inst(IrCmd::CHECK_TAG, 
build.inst(IrCmd::LOAD_TAG, build.vmReg(rb)), build.constTag(LUA_TNUMBER), build.vmExit(pcpos)); - - build.inst(IrCmd::CHECK_TAG, build.inst(IrCmd::LOAD_TAG, build.vmReg(rc)), build.constTag(LUA_TVECTOR), build.vmExit(pcpos)); - - IrOp vb = build.inst(IrCmd::NUM_TO_VEC, loadDoubleOrConstant(build, opb)); - IrOp vc = build.inst(IrCmd::LOAD_TVALUE, opc); - IrOp result; - - switch (tm) - { - case TM_MUL: - result = build.inst(IrCmd::MUL_VEC, vb, vc); - break; - case TM_DIV: - result = build.inst(IrCmd::DIV_VEC, vb, vc); - break; - default: - CODEGEN_ASSERT(!"Unknown TM op"); - } - - if (FFlag::LuauCodegenVectorTag) - result = build.inst(IrCmd::TAG_VECTOR, result); - - build.inst(IrCmd::STORE_TVALUE, build.vmReg(ra), result); - return; + case TM_MUL: + result = build.inst(IrCmd::MUL_VEC, vb, vc); + break; + case TM_DIV: + result = build.inst(IrCmd::DIV_VEC, vb, vc); + break; + default: + CODEGEN_ASSERT(!"Unknown TM op"); } - else if (bcTypes.a == LBC_TYPE_VECTOR && bcTypes.b == LBC_TYPE_NUMBER && (tm == TM_MUL || tm == TM_DIV)) + + if (FFlag::LuauCodegenVectorTag2) + result = build.inst(IrCmd::TAG_VECTOR, result); + + build.inst(IrCmd::STORE_TVALUE, build.vmReg(ra), result); + return; + } + else if (bcTypes.a == LBC_TYPE_VECTOR && bcTypes.b == LBC_TYPE_NUMBER && (tm == TM_MUL || tm == TM_DIV)) + { + build.inst(IrCmd::CHECK_TAG, build.inst(IrCmd::LOAD_TAG, build.vmReg(rb)), build.constTag(LUA_TVECTOR), build.vmExit(pcpos)); + + if (rc != -1) + build.inst(IrCmd::CHECK_TAG, build.inst(IrCmd::LOAD_TAG, build.vmReg(rc)), build.constTag(LUA_TNUMBER), build.vmExit(pcpos)); + + IrOp vb = build.inst(IrCmd::LOAD_TVALUE, opb); + IrOp vc = build.inst(IrCmd::NUM_TO_VEC, loadDoubleOrConstant(build, opc)); + IrOp result; + + switch (tm) { - build.inst(IrCmd::CHECK_TAG, build.inst(IrCmd::LOAD_TAG, build.vmReg(rb)), build.constTag(LUA_TVECTOR), build.vmExit(pcpos)); - - if (rc != -1) - build.inst(IrCmd::CHECK_TAG, build.inst(IrCmd::LOAD_TAG, build.vmReg(rc)), 
build.constTag(LUA_TNUMBER), build.vmExit(pcpos)); - - IrOp vb = build.inst(IrCmd::LOAD_TVALUE, opb); - IrOp vc = build.inst(IrCmd::NUM_TO_VEC, loadDoubleOrConstant(build, opc)); - IrOp result; - - switch (tm) - { - case TM_MUL: - result = build.inst(IrCmd::MUL_VEC, vb, vc); - break; - case TM_DIV: - result = build.inst(IrCmd::DIV_VEC, vb, vc); - break; - default: - CODEGEN_ASSERT(!"Unknown TM op"); - } - - if (FFlag::LuauCodegenVectorTag) - result = build.inst(IrCmd::TAG_VECTOR, result); - - build.inst(IrCmd::STORE_TVALUE, build.vmReg(ra), result); - return; + case TM_MUL: + result = build.inst(IrCmd::MUL_VEC, vb, vc); + break; + case TM_DIV: + result = build.inst(IrCmd::DIV_VEC, vb, vc); + break; + default: + CODEGEN_ASSERT(!"Unknown TM op"); } + + if (FFlag::LuauCodegenVectorTag2) + result = build.inst(IrCmd::TAG_VECTOR, result); + + build.inst(IrCmd::STORE_TVALUE, build.vmReg(ra), result); + return; } IrOp fallback; @@ -467,30 +463,10 @@ static void translateInstBinaryNumeric(IrBuilder& build, int ra, int rb, int rc, bcTypes.b == LBC_TYPE_NUMBER ? 
build.vmExit(pcpos) : getInitializedFallback(build, fallback)); } - IrOp vb, vc; + IrOp vb = loadDoubleOrConstant(build, opb); + IrOp vc; IrOp result; - if (FFlag::LuauCodegenVector) - { - vb = loadDoubleOrConstant(build, opb); - } - else - { - if (opb.kind == IrOpKind::VmConst) - { - CODEGEN_ASSERT(build.function.proto); - TValue protok = build.function.proto->k[vmConstOp(opb)]; - - CODEGEN_ASSERT(protok.tt == LUA_TNUMBER); - - vb = build.constDouble(protok.value.n); - } - else - { - vb = build.inst(IrCmd::LOAD_DOUBLE, opb); - } - } - if (opc.kind == IrOpKind::VmConst) { CODEGEN_ASSERT(build.function.proto); @@ -600,13 +576,13 @@ void translateInstMinus(IrBuilder& build, const Instruction* pc, int pcpos) int ra = LUAU_INSN_A(*pc); int rb = LUAU_INSN_B(*pc); - if (FFlag::LuauCodegenVector && bcTypes.a == LBC_TYPE_VECTOR) + if (bcTypes.a == LBC_TYPE_VECTOR) { build.inst(IrCmd::CHECK_TAG, build.inst(IrCmd::LOAD_TAG, build.vmReg(rb)), build.constTag(LUA_TVECTOR), build.vmExit(pcpos)); IrOp vb = build.inst(IrCmd::LOAD_TVALUE, build.vmReg(rb)); IrOp va = build.inst(IrCmd::UNM_VEC, vb); - if (FFlag::LuauCodegenVectorTag) + if (FFlag::LuauCodegenVectorTag2) va = build.inst(IrCmd::TAG_VECTOR, va); build.inst(IrCmd::STORE_TVALUE, build.vmReg(ra), va); return; @@ -940,10 +916,7 @@ void translateInstForGPrepNext(IrBuilder& build, const Instruction* pc, int pcpo // setpvalue(ra + 2, reinterpret_cast(uintptr_t(0)), LU_TAG_ITERATOR); build.inst(IrCmd::STORE_POINTER, build.vmReg(ra + 2), build.constInt(0)); - - if (FFlag::LuauCodegenLuData) - build.inst(IrCmd::STORE_EXTRA, build.vmReg(ra + 2), build.constInt(LU_TAG_ITERATOR)); - + build.inst(IrCmd::STORE_EXTRA, build.vmReg(ra + 2), build.constInt(LU_TAG_ITERATOR)); build.inst(IrCmd::STORE_TAG, build.vmReg(ra + 2), build.constTag(LUA_TLIGHTUSERDATA)); build.inst(IrCmd::JUMP, target); @@ -976,10 +949,7 @@ void translateInstForGPrepInext(IrBuilder& build, const Instruction* pc, int pcp // setpvalue(ra + 2, 
reinterpret_cast(uintptr_t(0)), LU_TAG_ITERATOR); build.inst(IrCmd::STORE_POINTER, build.vmReg(ra + 2), build.constInt(0)); - - if (FFlag::LuauCodegenLuData) - build.inst(IrCmd::STORE_EXTRA, build.vmReg(ra + 2), build.constInt(LU_TAG_ITERATOR)); - + build.inst(IrCmd::STORE_EXTRA, build.vmReg(ra + 2), build.constInt(LU_TAG_ITERATOR)); build.inst(IrCmd::STORE_TAG, build.vmReg(ra + 2), build.constTag(LUA_TLIGHTUSERDATA)); build.inst(IrCmd::JUMP, target); @@ -1225,7 +1195,7 @@ void translateInstGetTableKS(IrBuilder& build, const Instruction* pc, int pcpos) IrOp tb = build.inst(IrCmd::LOAD_TAG, build.vmReg(rb)); - if (FFlag::LuauCodegenVector && bcTypes.a == LBC_TYPE_VECTOR) + if (bcTypes.a == LBC_TYPE_VECTOR) { build.inst(IrCmd::CHECK_TAG, tb, build.constTag(LUA_TVECTOR), build.vmExit(pcpos)); diff --git a/luau/CodeGen/src/OptimizeConstProp.cpp b/luau/CodeGen/src/OptimizeConstProp.cpp index 4214d01..d765b80 100644 --- a/luau/CodeGen/src/OptimizeConstProp.cpp +++ b/luau/CodeGen/src/OptimizeConstProp.cpp @@ -17,9 +17,8 @@ LUAU_FASTINTVARIABLE(LuauCodeGenMinLinearBlockPath, 3) LUAU_FASTINTVARIABLE(LuauCodeGenReuseSlotLimit, 64) LUAU_FASTFLAGVARIABLE(DebugLuauAbortingChecks, false) -LUAU_FASTFLAG(LuauCodegenVector) -LUAU_FASTFLAG(LuauCodegenVectorTag) -LUAU_DYNAMIC_FASTFLAGVARIABLE(LuauCodeGenCheckGcEffectFix, false) +LUAU_FASTFLAG(LuauCodegenVectorTag2) +LUAU_DYNAMIC_FASTFLAGVARIABLE(LuauCodeGenCoverForgprepEffect, false) namespace Luau { @@ -712,11 +711,11 @@ static void constPropInInst(ConstPropState& state, IrBuilder& build, IrFunction& uint8_t tag = state.tryGetTag(inst.b); // We know the tag of some instructions that result in TValue - if (FFlag::LuauCodegenVector && tag == 0xff) + if (tag == 0xff) { if (IrInst* arg = function.asInstOp(inst.b)) { - if (FFlag::LuauCodegenVectorTag) + if (FFlag::LuauCodegenVectorTag2) { if (arg->cmd == IrCmd::TAG_VECTOR) tag = LUA_TVECTOR; @@ -1050,11 +1049,8 @@ static void constPropInInst(ConstPropState& state, IrBuilder& build, 
IrFunction& { state.checkedGc = true; - if (DFFlag::LuauCodeGenCheckGcEffectFix) - { - // GC assist might modify table data (hash part) - state.invalidateHeapTableData(); - } + // GC assist might modify table data (hash part) + state.invalidateHeapTableData(); } break; case IrCmd::BARRIER_OBJ: @@ -1264,20 +1260,21 @@ static void constPropInInst(ConstPropState& state, IrBuilder& build, IrFunction& case IrCmd::SUB_VEC: case IrCmd::MUL_VEC: case IrCmd::DIV_VEC: - if (FFlag::LuauCodegenVectorTag) + if (FFlag::LuauCodegenVectorTag2) { if (IrInst* a = function.asInstOp(inst.a); a && a->cmd == IrCmd::TAG_VECTOR) - inst.a = a->a; + replace(function, inst.a, a->a); + if (IrInst* b = function.asInstOp(inst.b); b && b->cmd == IrCmd::TAG_VECTOR) - inst.b = b->a; + replace(function, inst.b, b->a); } break; case IrCmd::UNM_VEC: - if (FFlag::LuauCodegenVectorTag) + if (FFlag::LuauCodegenVectorTag2) { if (IrInst* a = function.asInstOp(inst.a); a && a->cmd == IrCmd::TAG_VECTOR) - inst.a = a->a; + replace(function, inst.a, a->a); } break; @@ -1409,6 +1406,9 @@ static void constPropInInst(ConstPropState& state, IrBuilder& build, IrFunction& state.invalidate(IrOp{inst.b.kind, vmRegOp(inst.b) + 0u}); state.invalidate(IrOp{inst.b.kind, vmRegOp(inst.b) + 1u}); state.invalidate(IrOp{inst.b.kind, vmRegOp(inst.b) + 2u}); + + if (DFFlag::LuauCodeGenCoverForgprepEffect) + state.invalidateUserCall(); break; } } diff --git a/luau/CodeGen/src/OptimizeDeadStore.cpp b/luau/CodeGen/src/OptimizeDeadStore.cpp new file mode 100644 index 0000000..3ea066e --- /dev/null +++ b/luau/CodeGen/src/OptimizeDeadStore.cpp @@ -0,0 +1,530 @@ +// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details +#include "Luau/OptimizeDeadStore.h" + +#include "Luau/IrBuilder.h" +#include "Luau/IrVisitUseDef.h" +#include "Luau/IrUtils.h" + +#include + +#include "lobject.h" + +LUAU_FASTFLAGVARIABLE(LuauCodegenRemoveDeadStores2, false) 
+LUAU_FASTFLAG(LuauCodegenVectorTag2) + +// TODO: optimization can be improved by knowing which registers are live in at each VM exit + +namespace Luau +{ +namespace CodeGen +{ + +// Luau value structure reminder: +// [ TValue ] +// [ Value ][ Extra ][ Tag ] +// Storing individual components will not kill any previous TValue stores +// Storing TValue will kill any full store or a component store ('extra' excluded because it's rare) + +struct StoreRegInfo +{ + // Indices of the last unused store instructions + uint32_t tagInstIdx = ~0u; + uint32_t valueInstIdx = ~0u; + uint32_t tvalueInstIdx = ~0u; + + // This register might contain a GC object + bool maybeGco = false; +}; + +struct RemoveDeadStoreState +{ + RemoveDeadStoreState(IrFunction& function) + : function(function) + { + maxReg = function.proto ? function.proto->maxstacksize : 255; + } + + void killTagStore(StoreRegInfo& regInfo) + { + if (regInfo.tagInstIdx != ~0u) + { + kill(function, function.instructions[regInfo.tagInstIdx]); + + regInfo.tagInstIdx = ~0u; + regInfo.maybeGco = false; + } + } + + void killValueStore(StoreRegInfo& regInfo) + { + if (regInfo.valueInstIdx != ~0u) + { + kill(function, function.instructions[regInfo.valueInstIdx]); + + regInfo.valueInstIdx = ~0u; + regInfo.maybeGco = false; + } + } + + void killTValueStore(StoreRegInfo& regInfo) + { + if (regInfo.tvalueInstIdx != ~0u) + { + kill(function, function.instructions[regInfo.tvalueInstIdx]); + + regInfo.tvalueInstIdx = ~0u; + regInfo.maybeGco = false; + } + } + + // When a register value is being defined, it kills previous stores + void defReg(uint8_t reg) + { + StoreRegInfo& regInfo = info[reg]; + + // Stores to captured registers are not removed since we don't track their uses outside of function + if (function.cfg.captured.regs.test(reg)) + return; + + killTagStore(regInfo); + killValueStore(regInfo); + killTValueStore(regInfo); + } + + // When a register value is being used, we forget about the last store location to not kill them 
+ void useReg(uint8_t reg) + { + info[reg] = StoreRegInfo{}; + } + + // When checking control flow, such as exit to fallback blocks: + // For VM exits, we keep all stores because we don't have information on what registers are live at the start of the VM assist + // For regular blocks, we check which registers are expected to be live at entry (if we have CFG information available) + void checkLiveIns(IrOp op) + { + if (op.kind == IrOpKind::VmExit) + { + clear(); + } + else if (op.kind == IrOpKind::Block) + { + if (op.index < function.cfg.in.size()) + { + const RegisterSet& in = function.cfg.in[op.index]; + + for (int i = 0; i <= maxReg; i++) + { + if (in.regs.test(i) || (in.varargSeq && i >= in.varargStart)) + useReg(i); + } + } + else + { + clear(); + } + } + else if (op.kind == IrOpKind::Undef) + { + // Nothing to do for a debug abort + } + else + { + CODEGEN_ASSERT(!"unexpected jump target type"); + } + } + + // When checking block terminators, any registers that are not live out can be removed by saying that a new value is being 'defined' + void checkLiveOuts(const IrBlock& block) + { + uint32_t index = function.getBlockIndex(block); + + if (index < function.cfg.out.size()) + { + const RegisterSet& out = function.cfg.out[index]; + + for (int i = 0; i <= maxReg; i++) + { + bool isOut = out.regs.test(i) || (out.varargSeq && i >= out.varargStart); + + if (!isOut) + defReg(i); + } + } + } + + // Common instruction visitor handling + void defVarargs(uint8_t varargStart) + { + for (int i = varargStart; i <= maxReg; i++) + defReg(uint8_t(i)); + } + + void useVarargs(uint8_t varargStart) + { + for (int i = varargStart; i <= maxReg; i++) + useReg(uint8_t(i)); + } + + void def(IrOp op, int offset = 0) + { + defReg(vmRegOp(op) + offset); + } + + void use(IrOp op, int offset = 0) + { + useReg(vmRegOp(op) + offset); + } + + void maybeDef(IrOp op) + { + if (op.kind == IrOpKind::VmReg) + defReg(vmRegOp(op)); + } + + void maybeUse(IrOp op) + { + if (op.kind == IrOpKind::VmReg) 
+ useReg(vmRegOp(op)); + } + + void defRange(int start, int count) + { + if (count == -1) + { + defVarargs(start); + } + else + { + for (int i = start; i < start + count; i++) + defReg(i); + } + } + + void useRange(int start, int count) + { + if (count == -1) + { + useVarargs(start); + } + else + { + for (int i = start; i < start + count; i++) + useReg(i); + } + } + + // Required for a full visitor interface + void capture(int reg) {} + + // Full clear of the tracked information + void clear() + { + for (int i = 0; i <= maxReg; i++) + info[i] = StoreRegInfo(); + + hasGcoToClear = false; + } + + // Partial clear of information about registers that might contain a GC object + // This is used by instructions that might perform a GC assist and GC needs all pointers to be pinned to stack + void flushGcoRegs() + { + for (int i = 0; i <= maxReg; i++) + { + if (info[i].maybeGco) + info[i] = StoreRegInfo(); + } + + hasGcoToClear = false; + } + + IrFunction& function; + + std::array<StoreRegInfo, 256> info; + int maxReg = 255; + + // Some of the registers contain values which might be a GC object + bool hasGcoToClear = false; +}; + +static void markDeadStoresInInst(RemoveDeadStoreState& state, IrBuilder& build, IrFunction& function, IrBlock& block, IrInst& inst, uint32_t index) +{ + switch (inst.cmd) + { + case IrCmd::STORE_TAG: + if (inst.a.kind == IrOpKind::VmReg) + { + int reg = vmRegOp(inst.a); + + if (function.cfg.captured.regs.test(reg)) + return; + + StoreRegInfo& regInfo = state.info[reg]; + + state.killTagStore(regInfo); + + uint8_t tag = function.tagOp(inst.b); + + regInfo.tagInstIdx = index; + regInfo.maybeGco = isGCO(tag); + state.hasGcoToClear |= regInfo.maybeGco; + } + break; + case IrCmd::STORE_EXTRA: + // To simplify, extra field store is preserved along with all other stores made so far + if (inst.a.kind == IrOpKind::VmReg) + { + state.useReg(vmRegOp(inst.a)); + } + break; + case IrCmd::STORE_POINTER: + if (inst.a.kind == IrOpKind::VmReg) + { + int reg = vmRegOp(inst.a); + +
if (function.cfg.captured.regs.test(reg)) + return; + + StoreRegInfo& regInfo = state.info[reg]; + + state.killValueStore(regInfo); + + regInfo.valueInstIdx = index; + regInfo.maybeGco = true; + state.hasGcoToClear = true; + } + break; + case IrCmd::STORE_DOUBLE: + case IrCmd::STORE_INT: + case IrCmd::STORE_VECTOR: + if (inst.a.kind == IrOpKind::VmReg) + { + int reg = vmRegOp(inst.a); + + if (function.cfg.captured.regs.test(reg)) + return; + + StoreRegInfo& regInfo = state.info[reg]; + + state.killValueStore(regInfo); + + regInfo.valueInstIdx = index; + } + break; + case IrCmd::STORE_TVALUE: + if (inst.a.kind == IrOpKind::VmReg) + { + int reg = vmRegOp(inst.a); + + if (function.cfg.captured.regs.test(reg)) + return; + + StoreRegInfo& regInfo = state.info[reg]; + + state.killTagStore(regInfo); + state.killValueStore(regInfo); + state.killTValueStore(regInfo); + + regInfo.tvalueInstIdx = index; + regInfo.maybeGco = true; + + // If the argument is a vector, it's not a GC object + // Note that for known boolean/number/GCO, we already optimize into STORE_SPLIT_TVALUE form + // TODO: this can be removed if TAG_VECTOR+STORE_TVALUE is replaced with STORE_SPLIT_TVALUE + if (IrInst* arg = function.asInstOp(inst.b)) + { + if (FFlag::LuauCodegenVectorTag2) + { + if (arg->cmd == IrCmd::TAG_VECTOR) + regInfo.maybeGco = false; + } + else + { + if (arg->cmd == IrCmd::ADD_VEC || arg->cmd == IrCmd::SUB_VEC || arg->cmd == IrCmd::MUL_VEC || arg->cmd == IrCmd::DIV_VEC || + arg->cmd == IrCmd::UNM_VEC) + regInfo.maybeGco = false; + } + } + + state.hasGcoToClear |= regInfo.maybeGco; + } + break; + case IrCmd::STORE_SPLIT_TVALUE: + if (inst.a.kind == IrOpKind::VmReg) + { + int reg = vmRegOp(inst.a); + + if (function.cfg.captured.regs.test(reg)) + return; + + StoreRegInfo& regInfo = state.info[reg]; + + state.killTagStore(regInfo); + state.killValueStore(regInfo); + state.killTValueStore(regInfo); + + regInfo.tvalueInstIdx = index; + regInfo.maybeGco = isGCO(function.tagOp(inst.b)); + 
state.hasGcoToClear |= regInfo.maybeGco; + } + break; + + // Guard checks can jump to a block which might be using some or all the values we stored + case IrCmd::CHECK_TAG: + // After optimizations with DebugLuauAbortingChecks enabled, CHECK_TAG might use a VM register + visitVmRegDefsUses(state, function, inst); + + state.checkLiveIns(inst.c); + break; + case IrCmd::TRY_NUM_TO_INDEX: + state.checkLiveIns(inst.b); + break; + case IrCmd::TRY_CALL_FASTGETTM: + state.checkLiveIns(inst.c); + break; + case IrCmd::CHECK_FASTCALL_RES: + state.checkLiveIns(inst.b); + break; + case IrCmd::CHECK_TRUTHY: + state.checkLiveIns(inst.c); + break; + case IrCmd::CHECK_READONLY: + state.checkLiveIns(inst.b); + break; + case IrCmd::CHECK_NO_METATABLE: + state.checkLiveIns(inst.b); + break; + case IrCmd::CHECK_SAFE_ENV: + state.checkLiveIns(inst.a); + break; + case IrCmd::CHECK_ARRAY_SIZE: + state.checkLiveIns(inst.c); + break; + case IrCmd::CHECK_SLOT_MATCH: + state.checkLiveIns(inst.c); + break; + case IrCmd::CHECK_NODE_NO_NEXT: + state.checkLiveIns(inst.b); + break; + case IrCmd::CHECK_NODE_VALUE: + state.checkLiveIns(inst.b); + break; + case IrCmd::CHECK_BUFFER_LEN: + state.checkLiveIns(inst.d); + break; + + case IrCmd::JUMP: + // Ideally, we would be able to remove stores to registers that are not live out from a block + // But during chain optimizations, we rely on data stored in the predecessor even when it's not an explicit live out + break; + case IrCmd::RETURN: + visitVmRegDefsUses(state, function, inst); + + // At the end of a function, we can kill stores to registers that are not live out + state.checkLiveOuts(block); + break; + case IrCmd::ADJUST_STACK_TO_REG: + // visitVmRegDefsUses considers adjustment as the fast call register definition point, but for dead store removal, we count the actual writes + break; + + // This group of instructions can trigger GC assist internally + // For GC to work correctly, all values containing a GCO have to be stored on stack - otherwise 
a live reference might be missed + case IrCmd::CMP_ANY: + case IrCmd::DO_ARITH: + case IrCmd::DO_LEN: + case IrCmd::GET_TABLE: + case IrCmd::SET_TABLE: + case IrCmd::GET_IMPORT: + case IrCmd::CONCAT: + case IrCmd::INTERRUPT: + case IrCmd::CHECK_GC: + case IrCmd::CALL: + case IrCmd::FORGLOOP_FALLBACK: + case IrCmd::FALLBACK_GETGLOBAL: + case IrCmd::FALLBACK_SETGLOBAL: + case IrCmd::FALLBACK_GETTABLEKS: + case IrCmd::FALLBACK_SETTABLEKS: + case IrCmd::FALLBACK_NAMECALL: + case IrCmd::FALLBACK_DUPCLOSURE: + case IrCmd::FALLBACK_FORGPREP: + if (state.hasGcoToClear) + state.flushGcoRegs(); + + visitVmRegDefsUses(state, function, inst); + break; + + default: + // Guards have to be covered explicitly + CODEGEN_ASSERT(!isNonTerminatingJump(inst.cmd)); + + visitVmRegDefsUses(state, function, inst); + break; + } +} + +static void markDeadStoresInBlock(IrBuilder& build, IrBlock& block, RemoveDeadStoreState& state) +{ + IrFunction& function = build.function; + + for (uint32_t index = block.start; index <= block.finish; index++) + { + CODEGEN_ASSERT(index < function.instructions.size()); + IrInst& inst = function.instructions[index]; + + markDeadStoresInInst(state, build, function, block, inst, index); + } +} + +static void markDeadStoresInBlockChain(IrBuilder& build, std::vector<uint8_t>& visited, IrBlock* block) +{ + IrFunction& function = build.function; + + RemoveDeadStoreState state{function}; + + while (block) + { + uint32_t blockIdx = function.getBlockIndex(*block); + CODEGEN_ASSERT(!visited[blockIdx]); + visited[blockIdx] = true; + + markDeadStoresInBlock(build, *block, state); + + IrInst& termInst = function.instructions[block->finish]; + + IrBlock* nextBlock = nullptr; + + // Unconditional jump into a block with a single user (current block) allows us to continue optimization + // with the information we have gathered so far (unless we have already visited that block earlier) + if (termInst.cmd == IrCmd::JUMP && termInst.a.kind == IrOpKind::Block) + { + IrBlock& target =
function.blockOp(termInst.a); + uint32_t targetIdx = function.getBlockIndex(target); + + if (target.useCount == 1 && !visited[targetIdx] && target.kind != IrBlockKind::Fallback) + nextBlock = &target; + } + + block = nextBlock; + } +} + +void markDeadStoresInBlockChains(IrBuilder& build) +{ + IrFunction& function = build.function; + + std::vector<uint8_t> visited(function.blocks.size(), false); + + for (IrBlock& block : function.blocks) + { + if (block.kind == IrBlockKind::Fallback || block.kind == IrBlockKind::Dead) + continue; + + if (visited[function.getBlockIndex(block)]) + continue; + + markDeadStoresInBlockChain(build, visited, &block); + } +} + +} // namespace CodeGen +} // namespace Luau diff --git a/luau/CodeGen/src/OptimizeFinalX64.cpp b/luau/CodeGen/src/OptimizeFinalX64.cpp index 911750b..b2a5f7f 100644 --- a/luau/CodeGen/src/OptimizeFinalX64.cpp +++ b/luau/CodeGen/src/OptimizeFinalX64.cpp @@ -5,8 +5,6 @@ #include -LUAU_FASTFLAGVARIABLE(LuauCodegenMathMemArgs, false) - namespace Luau { namespace CodeGen @@ -116,7 +114,7 @@ static void optimizeMemoryOperandsX64(IrFunction& function, IrBlock& block) case IrCmd::SQRT_NUM: case IrCmd::ABS_NUM: { - if (FFlag::LuauCodegenMathMemArgs && inst.a.kind == IrOpKind::Inst) + if (inst.a.kind == IrOpKind::Inst) { IrInst& arg = function.instOp(inst.a); diff --git a/luau/VM/include/luaconf.h b/luau/VM/include/luaconf.h index 910e259..05d44f8 100644 --- a/luau/VM/include/luaconf.h +++ b/luau/VM/include/luaconf.h @@ -108,7 +108,7 @@ // upper bound for number of size classes used by page allocator #ifndef LUA_SIZECLASSES -#define LUA_SIZECLASSES 32 +#define LUA_SIZECLASSES 40 #endif // available number of separate memory categories diff --git a/luau/VM/src/lmem.cpp b/luau/VM/src/lmem.cpp index 8b264ef..d52d379 100644 --- a/luau/VM/src/lmem.cpp +++ b/luau/VM/src/lmem.cpp @@ -120,9 +120,19 @@ static_assert(offsetof(Udata, data) == ABISWITCH(16, 16, 12), "size mismatch for static_assert(sizeof(Table) == ABISWITCH(48, 32, 32), "size mismatch
for table header"); static_assert(offsetof(Buffer, data) == ABISWITCH(8, 8, 8), "size mismatch for buffer header"); +LUAU_FASTFLAGVARIABLE(LuauExtendedSizeClasses, false) + const size_t kSizeClasses = LUA_SIZECLASSES; -const size_t kMaxSmallSize = 512; -const size_t kPageSize = 16 * 1024 - 24; // slightly under 16KB since that results in less fragmentation due to heap metadata +const size_t kMaxSmallSize_DEPRECATED = 512; // TODO: remove with FFlagLuauExtendedSizeClasses +const size_t kMaxSmallSize = 1024; +const size_t kLargePageThreshold = 512; // larger pages are used for objects larger than this size to fit more of them into a page + +// constant factor to reduce our page sizes by, to increase the chances that pages we allocate will +// allow external allocators to allocate them without wasting space due to rounding introduced by their heap meta data +const size_t kExternalAllocatorMetaDataReduction = 24; + +const size_t kSmallPageSize = 16 * 1024 - kExternalAllocatorMetaDataReduction; +const size_t kLargePageSize = 32 * 1024 - kExternalAllocatorMetaDataReduction; const size_t kBlockHeader = sizeof(double) > sizeof(void*) ? 
sizeof(double) : sizeof(void*); // suitable for aligning double & void* on all platforms const size_t kGCOLinkOffset = (sizeof(GCheader) + sizeof(void*) - 1) & ~(sizeof(void*) - 1); // GCO pages contain freelist links after the GC header @@ -143,6 +153,7 @@ struct SizeClassConfig // - we first allocate sizes classes in multiples of 8 // - after the first cutoff we allocate size classes in multiples of 16 // - after the second cutoff we allocate size classes in multiples of 32 + // - after the third cutoff we allocate size classes in multiples of 64 // this balances internal fragmentation vs external fragmentation for (int size = 8; size < 64; size += 8) sizeOfClass[classCount++] = size; @@ -150,7 +161,10 @@ struct SizeClassConfig for (int size = 64; size < 256; size += 16) sizeOfClass[classCount++] = size; - for (int size = 256; size <= 512; size += 32) + for (int size = 256; size < 512; size += 32) + sizeOfClass[classCount++] = size; + + for (int size = 512; size <= 1024; size += 64) sizeOfClass[classCount++] = size; LUAU_ASSERT(size_t(classCount) <= kSizeClasses); @@ -169,7 +183,8 @@ struct SizeClassConfig const SizeClassConfig kSizeClassConfig; // size class for a block of size sz; returns -1 for size=0 because empty allocations take no space -#define sizeclass(sz) (size_t((sz)-1) < kMaxSmallSize ? kSizeClassConfig.classForSize[sz] : -1) +#define sizeclass(sz) \ + (size_t((sz)-1) < (FFlag::LuauExtendedSizeClasses ? kMaxSmallSize : kMaxSmallSize_DEPRECATED) ? 
kSizeClassConfig.classForSize[sz] : -1) // metadata for a block is stored in the first pointer of the block #define metadata(block) (*(void**)(block)) @@ -245,18 +260,39 @@ static lua_Page* newpage(lua_State* L, lua_Page** gcopageset, int pageSize, int return page; } -static lua_Page* newclasspage(lua_State* L, lua_Page** freepageset, lua_Page** gcopageset, uint8_t sizeClass, bool storeMetadata) +// this is part of a cold path in newblock and newgcoblock +// it is marked as noinline to prevent it from being inlined into those functions +// if it is inlined, then the compiler may determine those functions are "too big" to be profitably inlined, which results in reduced performance +LUAU_NOINLINE static lua_Page* newclasspage(lua_State* L, lua_Page** freepageset, lua_Page** gcopageset, uint8_t sizeClass, bool storeMetadata) { - int blockSize = kSizeClassConfig.sizeOfClass[sizeClass] + (storeMetadata ? kBlockHeader : 0); - int blockCount = (kPageSize - offsetof(lua_Page, data)) / blockSize; + if (FFlag::LuauExtendedSizeClasses) + { + int sizeOfClass = kSizeClassConfig.sizeOfClass[sizeClass]; + int pageSize = sizeOfClass > int(kLargePageThreshold) ? kLargePageSize : kSmallPageSize; + int blockSize = sizeOfClass + (storeMetadata ? kBlockHeader : 0); + int blockCount = (pageSize - offsetof(lua_Page, data)) / blockSize; - lua_Page* page = newpage(L, gcopageset, kPageSize, blockSize, blockCount); + lua_Page* page = newpage(L, gcopageset, pageSize, blockSize, blockCount); - // prepend a page to page freelist (which is empty because we only ever allocate a new page when it is!) - LUAU_ASSERT(!freepageset[sizeClass]); - freepageset[sizeClass] = page; + // prepend a page to page freelist (which is empty because we only ever allocate a new page when it is!) + LUAU_ASSERT(!freepageset[sizeClass]); + freepageset[sizeClass] = page; - return page; + return page; + } + else + { + int blockSize = kSizeClassConfig.sizeOfClass[sizeClass] + (storeMetadata ? 
kBlockHeader : 0); + int blockCount = (kSmallPageSize - offsetof(lua_Page, data)) / blockSize; + + lua_Page* page = newpage(L, gcopageset, kSmallPageSize, blockSize, blockCount); + + // prepend a page to page freelist (which is empty because we only ever allocate a new page when it is!) + LUAU_ASSERT(!freepageset[sizeClass]); + freepageset[sizeClass] = page; + + return page; + } } static void freepage(lua_State* L, lua_Page** gcopageset, lua_Page* page) diff --git a/luau/VM/src/lnumprint.cpp b/luau/VM/src/lnumprint.cpp index c09b1be..763675e 100644 --- a/luau/VM/src/lnumprint.cpp +++ b/luau/VM/src/lnumprint.cpp @@ -11,8 +11,6 @@ #include #endif -LUAU_FASTFLAGVARIABLE(LuauSciNumberSkipTrailDot, false) - // This work is based on: // Raffaello Giulietti. The Schubfach way to render doubles. 2021 // https://drive.google.com/file/d/1IEeATSVnEE6TkrHlCYNY2GjaraBjOT4f/edit @@ -363,7 +361,7 @@ char* luai_num2str(char* buf, double n) char* exp = trimzero(buf + declen + 1); - if (FFlag::LuauSciNumberSkipTrailDot && exp[-1] == '.') + if (exp[-1] == '.') exp--; return printexp(exp, dot - 1); diff --git a/luau/VM/src/lobject.cpp b/luau/VM/src/lobject.cpp index 514a435..640bd96 100644 --- a/luau/VM/src/lobject.cpp +++ b/luau/VM/src/lobject.cpp @@ -48,7 +48,7 @@ int luaO_rawequalObj(const TValue* t1, const TValue* t2) case LUA_TBOOLEAN: return bvalue(t1) == bvalue(t2); // boolean true must be 1 !! case LUA_TLIGHTUSERDATA: - return pvalue(t1) == pvalue(t2) && (!FFlag::LuauTaggedLuData || lightuserdatatag(t1) == lightuserdatatag(t2)); + return pvalue(t1) == pvalue(t2) && lightuserdatatag(t1) == lightuserdatatag(t2); default: LUAU_ASSERT(iscollectable(t1)); return gcvalue(t1) == gcvalue(t2); @@ -71,7 +71,7 @@ int luaO_rawequalKey(const TKey* t1, const TValue* t2) case LUA_TBOOLEAN: return bvalue(t1) == bvalue(t2); // boolean true must be 1 !! 
case LUA_TLIGHTUSERDATA: - return pvalue(t1) == pvalue(t2) && (!FFlag::LuauTaggedLuData || lightuserdatatag(t1) == lightuserdatatag(t2)); + return pvalue(t1) == pvalue(t2) && lightuserdatatag(t1) == lightuserdatatag(t2); default: LUAU_ASSERT(iscollectable(t1)); return gcvalue(t1) == gcvalue(t2); diff --git a/luau/VM/src/lobject.h b/luau/VM/src/lobject.h index 44f2bcc..1f84e2d 100644 --- a/luau/VM/src/lobject.h +++ b/luau/VM/src/lobject.h @@ -5,8 +5,6 @@ #include "lua.h" #include "lcommon.h" -LUAU_FASTFLAG(LuauTaggedLuData) - /* ** Union of all collectible objects */ diff --git a/luau/VM/src/lstrlib.cpp b/luau/VM/src/lstrlib.cpp index 03d7cf3..85669e9 100644 --- a/luau/VM/src/lstrlib.cpp +++ b/luau/VM/src/lstrlib.cpp @@ -8,8 +8,6 @@ #include #include -LUAU_DYNAMIC_FASTFLAGVARIABLE(LuauInterruptablePatternMatch, false) - // macro to `unsign' a character #define uchar(c) ((unsigned char)(c)) @@ -432,18 +430,15 @@ static const char* match(MatchState* ms, const char* s, const char* p) if (ms->matchdepth-- == 0) luaL_error(ms->L, "pattern too complex"); - if (DFFlag::LuauInterruptablePatternMatch) - { - lua_State* L = ms->L; - void (*interrupt)(lua_State*, int) = L->global->cb.interrupt; + lua_State* L = ms->L; + void (*interrupt)(lua_State*, int) = L->global->cb.interrupt; - if (LUAU_UNLIKELY(!!interrupt)) - { - // this interrupt is not yieldable - L->nCcalls++; - interrupt(L, -1); - L->nCcalls--; - } + if (LUAU_UNLIKELY(!!interrupt)) + { + // this interrupt is not yieldable + L->nCcalls++; + interrupt(L, -1); + L->nCcalls--; } init: // using goto's to optimize tail recursion diff --git a/luau/VM/src/ltm.cpp b/luau/VM/src/ltm.cpp index 2336902..09c3d82 100644 --- a/luau/VM/src/ltm.cpp +++ b/luau/VM/src/ltm.cpp @@ -129,7 +129,7 @@ const TString* luaT_objtypenamestr(lua_State* L, const TValue* o) if (ttisstring(type)) return tsvalue(type); } - else if (FFlag::LuauTaggedLuData && ttislightuserdata(o)) + else if (ttislightuserdata(o)) { int tag = lightuserdatatag(o); diff 
--git a/luau/VM/src/lvmexecute.cpp b/luau/VM/src/lvmexecute.cpp index 2ed4819..74e30c9 100644 --- a/luau/VM/src/lvmexecute.cpp +++ b/luau/VM/src/lvmexecute.cpp @@ -133,8 +133,6 @@ // Does VM support native execution via ExecutionCallbacks? We mostly assume it does but keep the define to make it easy to quantify the cost. #define VM_HAS_NATIVE 1 -LUAU_FASTFLAGVARIABLE(LuauTaggedLuData, false) - LUAU_NOINLINE void luau_callhook(lua_State* L, lua_Hook hook, void* userdata) { ptrdiff_t base = savestack(L, L->base); @@ -1110,9 +1108,7 @@ static void luau_execute(lua_State* L) VM_NEXT(); case LUA_TLIGHTUSERDATA: - pc += (pvalue(ra) == pvalue(rb) && (!FFlag::LuauTaggedLuData || lightuserdatatag(ra) == lightuserdatatag(rb))) - ? LUAU_INSN_D(insn) - : 1; + pc += (pvalue(ra) == pvalue(rb) && lightuserdatatag(ra) == lightuserdatatag(rb)) ? LUAU_INSN_D(insn) : 1; LUAU_ASSERT(unsigned(pc - cl->l.p->code) < unsigned(cl->l.p->sizecode)); VM_NEXT(); @@ -1227,9 +1223,7 @@ static void luau_execute(lua_State* L) VM_NEXT(); case LUA_TLIGHTUSERDATA: - pc += (pvalue(ra) != pvalue(rb) || (FFlag::LuauTaggedLuData && lightuserdatatag(ra) != lightuserdatatag(rb))) - ? LUAU_INSN_D(insn) - : 1; + pc += (pvalue(ra) != pvalue(rb) || lightuserdatatag(ra) != lightuserdatatag(rb)) ? LUAU_INSN_D(insn) : 1; LUAU_ASSERT(unsigned(pc - cl->l.p->code) < unsigned(cl->l.p->sizecode)); VM_NEXT(); diff --git a/luau/VM/src/lvmutils.cpp b/luau/VM/src/lvmutils.cpp index a2186c5..4db8bba 100644 --- a/luau/VM/src/lvmutils.cpp +++ b/luau/VM/src/lvmutils.cpp @@ -288,7 +288,7 @@ int luaV_equalval(lua_State* L, const TValue* t1, const TValue* t2) case LUA_TBOOLEAN: return bvalue(t1) == bvalue(t2); // true must be 1 !! 
case LUA_TLIGHTUSERDATA: - return pvalue(t1) == pvalue(t2) && (!FFlag::LuauTaggedLuData || lightuserdatatag(t1) == lightuserdatatag(t2)); + return pvalue(t1) == pvalue(t2) && lightuserdatatag(t1) == lightuserdatatag(t2); case LUA_TUSERDATA: { tm = get_compTM(L, uvalue(t1)->metatable, uvalue(t2)->metatable, TM_EQ);