diff --git a/internal/asm/arm64/assembler.go b/internal/asm/arm64/assembler.go index 5c1beba661..316d7cce27 100644 --- a/internal/asm/arm64/assembler.go +++ b/internal/asm/arm64/assembler.go @@ -92,7 +92,7 @@ type Assembler interface { // CompileVectorRegisterToVectorRegisterWithConst is the same as CompileVectorRegisterToVectorRegister but the // additional constant can be provided. - // For example, the const can be used to specify the shift amount for USHLLIMM instruction. + // For example, the const can be used to specify the shift amount for USHLL instruction. CompileVectorRegisterToVectorRegisterWithConst(instruction asm.Instruction, srcReg, dstReg asm.Register, arrangement VectorArrangement, c asm.ConstantValue) diff --git a/internal/asm/arm64/consts.go b/internal/asm/arm64/consts.go index d3b234d1ee..6515906766 100644 --- a/internal/asm/arm64/consts.go +++ b/internal/asm/arm64/consts.go @@ -718,12 +718,12 @@ const ( VFSUBD // SSHL is the SSHL(vector,register) instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHL--Signed-Shift-Left--register--?lang=en SSHL - // SSHLLIMM is the SSHLL(vector,immediate) instruction. https://developer.arm.com/documentation/dui0801/h/A64-SIMD-Vector-Instructions/SSHLL--SSHLL2--vector- - SSHLLIMM + // SSHLL is the SSHLL(vector,immediate) instruction. https://developer.arm.com/documentation/dui0801/h/A64-SIMD-Vector-Instructions/SSHLL--SSHLL2--vector- + SSHLL // USHL is the USHL(vector,register) instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHL--Signed-Shift-Left--register--?lang=en USHL - // USHLLIMM is the USHLL(vector,immediate) instruction. https://developer.arm.com/documentation/dui0801/h/A64-SIMD-Vector-Instructions/SSHLL--SSHLL2--vector- - USHLLIMM + // USHLL is the USHLL(vector,immediate) instruction. 
https://developer.arm.com/documentation/dui0801/h/A64-SIMD-Vector-Instructions/USHLL--USHLL2--vector- + USHLL // LD1R is the LD1R instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/LD1R--Load-one-single-element-structure-and-Replicate-to-all-lanes--of-one-register-- LD1R // SMOV32 is the 32-bit variant of SMOV(vector) instruction. https://developer.arm.com/documentation/100069/0610/A64-SIMD-Vector-Instructions/SMOV--vector- @@ -856,6 +856,34 @@ const ( VUMLAL // SHLL is the SHLL instruction https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SHLL--SHLL2--Shift-Left-Long--by-element-size--?lang=en SHLL + // SADDLP is the SADDLP instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SADDLP--Signed-Add-Long-Pairwise-?lang=en + SADDLP + // UADDLP is the UADDLP instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UADDLP--Unsigned-Add-Long-Pairwise-?lang=en + UADDLP + // SSHLL2 is the SSHLL2(vector,immediate) instruction. https://developer.arm.com/documentation/dui0801/h/A64-SIMD-Vector-Instructions/SSHLL--SSHLL2--vector- + SSHLL2 + // USHLL2 is the USHLL2(vector,immediate) instruction. https://developer.arm.com/documentation/dui0801/h/A64-SIMD-Vector-Instructions/USHLL--USHLL2--vector- + USHLL2 + // SQRDMULH is the SQRDMULH(vector) instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQRDMULH--vector---Signed-saturating-Rounding-Doubling-Multiply-returning-High-half-?lang=en + SQRDMULH + // SMULL is the SMULL(vector) instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMULL--SMULL2--vector---Signed-Multiply-Long--vector--?lang=en + SMULL + // SMULL2 is the SMULL2(vector) instruction. 
https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMULL--SMULL2--vector---Signed-Multiply-Long--vector--?lang=en + SMULL2 + // UMULL is the UMULL(vector) instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMULL--UMULL2--vector---Unsigned-Multiply-Long--vector--?lang=en + UMULL + // UMULL2 is the UMULL2(vector) instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMULL--UMULL2--vector---Unsigned-Multiply-Long--vector--?lang=en + UMULL2 + // VFCVTZS is the FCVTZS(vector,integer) instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTZS--vector--integer---Floating-point-Convert-to-Signed-integer--rounding-toward-Zero--vector--?lang=en + // Note: prefixed by V to distinguish from the non-vector variant. + VFCVTZS + // VFCVTZU is the FCVTZU(vector,integer) instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTZU--vector--integer---Floating-point-Convert-to-Unsigned-integer--rounding-toward-Zero--vector--?lang=en + // Note: prefixed by V to distinguish from the non-vector variant. + VFCVTZU + // SQXTN is the SQXTN instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQXTN--SQXTN2--Signed-saturating-extract-Narrow-?lang=en + SQXTN + // UQXTN is the UQXTN instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UQXTN--UQXTN2--Unsigned-saturating-extract-Narrow-?lang=en + UQXTN // instructionEnd is always placed at the bottom of this iota definition to be used in the test. 
instructionEnd @@ -1212,10 +1240,10 @@ func InstructionName(i asm.Instruction) string { return "SSHL" case USHL: return "USHL" - case SSHLLIMM: - return "SSHLLIMM" - case USHLLIMM: - return "USHLLIMM" + case SSHLL: + return "SSHLL" + case USHLL: + return "USHLL" case LD1R: return "LD1R" case SMOV32: @@ -1326,6 +1354,32 @@ func InstructionName(i asm.Instruction) string { return "VUMLAL" case SHLL: return "SHLL" + case SSHLL2: + return "SSHLL2" + case USHLL2: + return "USHLL2" + case SQRDMULH: + return "SQRDMULH" + case SADDLP: + return "SADDLP" + case UADDLP: + return "UADDLP" + case SMULL: + return "SMULL" + case SMULL2: + return "SMULL2" + case UMULL: + return "UMULL" + case UMULL2: + return "UMULL2" + case VFCVTZS: + return "VFCVTZS" + case VFCVTZU: + return "VFCVTZU" + case SQXTN: + return "SQXTN" + case UQXTN: + return "UQXTN" } panic(fmt.Errorf("unknown instruction %d", i)) } diff --git a/internal/asm/arm64/impl.go b/internal/asm/arm64/impl.go index bf74ecf7f4..1c77f7d774 100644 --- a/internal/asm/arm64/impl.go +++ b/internal/asm/arm64/impl.go @@ -2773,18 +2773,18 @@ func (a *AssemblerImpl) EncodeStaticConstToVectorRegister(n *NodeImpl) (err erro // advancedSIMDTwoRegisterMisc holds information to encode instructions as "Advanced SIMD two-register miscellaneous" in // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en var advancedSIMDTwoRegisterMisc = map[asm.Instruction]struct { - U, opcode byte + u, opcode byte qAndSize map[VectorArrangement]qAndSize }{ // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/NOT--Bitwise-NOT--vector--?lang=en - NOT: {U: 0b1, opcode: 0b00101, + NOT: {u: 0b1, opcode: 0b00101, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement16B: {size: 0b00, q: 0b1}, VectorArrangement8B: {size: 0b00, q: 0b0}, }, }, // 
https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FNEG--vector---Floating-point-Negate--vector--?lang=en - VFNEG: {U: 0b1, opcode: 0b01111, + VFNEG: {u: 0b1, opcode: 0b01111, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement4S: {size: 0b10, q: 0b1}, VectorArrangement2S: {size: 0b10, q: 0b0}, @@ -2792,63 +2792,85 @@ var advancedSIMDTwoRegisterMisc = map[asm.Instruction]struct { }, }, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FABS--vector---Floating-point-Absolute-value--vector--?lang=en - VFABS: {U: 0, opcode: 0b01111, qAndSize: map[VectorArrangement]qAndSize{ + VFABS: {u: 0, opcode: 0b01111, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement2D: {size: 0b11, q: 0b1}, VectorArrangement4S: {size: 0b10, q: 0b1}, VectorArrangement2S: {size: 0b10, q: 0b0}, }}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FSQRT--vector---Floating-point-Square-Root--vector--?lang=en - VFSQRT: {U: 1, opcode: 0b11111, qAndSize: map[VectorArrangement]qAndSize{ + VFSQRT: {u: 1, opcode: 0b11111, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement2D: {size: 0b11, q: 0b1}, VectorArrangement4S: {size: 0b10, q: 0b1}, VectorArrangement2S: {size: 0b10, q: 0b0}, }}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FRINTM--vector---Floating-point-Round-to-Integral--toward-Minus-infinity--vector--?lang=en - VFRINTM: {U: 0, opcode: 0b11001, qAndSize: map[VectorArrangement]qAndSize{ + VFRINTM: {u: 0, opcode: 0b11001, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement2D: {size: 0b01, q: 0b1}, VectorArrangement4S: {size: 0b00, q: 0b1}, VectorArrangement2S: {size: 0b00, q: 0b0}, }}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FRINTN--vector---Floating-point-Round-to-Integral--to-nearest-with-ties-to-even--vector--?lang=en - VFRINTN: {U: 0, opcode: 0b11000, qAndSize: map[VectorArrangement]qAndSize{ + VFRINTN: 
{u: 0, opcode: 0b11000, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement2D: {size: 0b01, q: 0b1}, VectorArrangement4S: {size: 0b00, q: 0b1}, VectorArrangement2S: {size: 0b00, q: 0b0}, }}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FRINTP--vector---Floating-point-Round-to-Integral--toward-Plus-infinity--vector--?lang=en - VFRINTP: {U: 0, opcode: 0b11000, qAndSize: map[VectorArrangement]qAndSize{ + VFRINTP: {u: 0, opcode: 0b11000, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement2D: {size: 0b11, q: 0b1}, VectorArrangement4S: {size: 0b10, q: 0b1}, VectorArrangement2S: {size: 0b10, q: 0b0}, }}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FRINTZ--vector---Floating-point-Round-to-Integral--toward-Zero--vector--?lang=en - VFRINTZ: {U: 0, opcode: 0b11001, qAndSize: map[VectorArrangement]qAndSize{ + VFRINTZ: {u: 0, opcode: 0b11001, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement2D: {size: 0b11, q: 0b1}, VectorArrangement4S: {size: 0b10, q: 0b1}, VectorArrangement2S: {size: 0b10, q: 0b0}, }}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CNT--Population-Count-per-byte-?lang=en - VCNT: {U: 0b0, opcode: 0b00101, qAndSize: map[VectorArrangement]qAndSize{ + VCNT: {u: 0b0, opcode: 0b00101, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement8B: {size: 0b00, q: 0b0}, VectorArrangement16B: {size: 0b00, q: 0b1}, }}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/NEG--vector---Negate--vector--?lang=en - VNEG: {U: 0b1, opcode: 0b01011, qAndSize: defaultQAndSize}, + VNEG: {u: 0b1, opcode: 0b01011, qAndSize: defaultQAndSize}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ABS--Absolute-value--vector--?lang=en - VABS: {U: 0b0, opcode: 0b01011, qAndSize: defaultQAndSize}, + VABS: {u: 0b0, opcode: 0b01011, qAndSize: defaultQAndSize}, // 
https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/REV64--Reverse-elements-in-64-bit-doublewords--vector--?lang=en - REV64: {U: 0b0, opcode: 0b00000, qAndSize: defaultQAndSize}, + REV64: {u: 0b0, opcode: 0b00000, qAndSize: defaultQAndSize}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/XTN--XTN2--Extract-Narrow-?lang=en - XTN: {U: 0b0, opcode: 0b10010, qAndSize: map[VectorArrangement]qAndSize{ + XTN: {u: 0b0, opcode: 0b10010, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement2D: {q: 0, size: 0b10}, VectorArrangement4S: {q: 0, size: 0b01}, VectorArrangement8H: {q: 0, size: 0b00}, }}, - SHLL: {U: 0b1, opcode: 0b10011, qAndSize: map[VectorArrangement]qAndSize{ + SHLL: {u: 0b1, opcode: 0b10011, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement8B: {q: 0b00, size: 0b00}, VectorArrangement4H: {q: 0b00, size: 0b01}, VectorArrangement2S: {q: 0b00, size: 0b10}, }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMEQ--zero---Compare-bitwise-Equal-to-zero--vector--?lang=en + CMEQZERO: {u: 0b0, opcode: 0b01001, qAndSize: defaultQAndSize}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SADDLP--Signed-Add-Long-Pairwise-?lang=en + SADDLP: {u: 0b0, opcode: 0b00010, qAndSize: defaultQAndSize}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UADDLP--Unsigned-Add-Long-Pairwise-?lang=en + UADDLP: {u: 0b1, opcode: 0b00010, qAndSize: defaultQAndSize}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTZS--vector--integer---Floating-point-Convert-to-Signed-integer--rounding-toward-Zero--vector--?lang=en + VFCVTZS: {u: 0b0, opcode: 0b11011, qAndSize: map[VectorArrangement]qAndSize{ + VectorArrangement4S: {size: 0b10, q: 0b1}, + VectorArrangement2S: {size: 0b10, q: 0b0}, + VectorArrangement2D: {size: 0b11, q: 0b1}, + }}, + // 
https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTZU--vector--integer---Floating-point-Convert-to-Unsigned-integer--rounding-toward-Zero--vector--?lang=en + VFCVTZU: {u: 0b1, opcode: 0b11011, qAndSize: map[VectorArrangement]qAndSize{ + VectorArrangement4S: {size: 0b10, q: 0b1}, + VectorArrangement2S: {size: 0b10, q: 0b0}, + VectorArrangement2D: {size: 0b11, q: 0b1}, + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQXTN--SQXTN2--Signed-saturating-extract-Narrow-?lang=en + SQXTN: {u: 0b0, opcode: 0b10100, qAndSize: defaultQAndSize}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UQXTN--UQXTN2--Unsigned-saturating-extract-Narrow-?lang=en + UQXTN: {u: 0b1, opcode: 0b10100, qAndSize: defaultQAndSize}, } // advancedSIMDThreeDifferent holds information to encode instructions as "Advanced SIMD three different" in @@ -2857,11 +2879,36 @@ var advancedSIMDThreeDifferent = map[asm.Instruction]struct { u, opcode byte qAndSize map[VectorArrangement]qAndSize }{ + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMLAL--UMLAL2--vector---Unsigned-Multiply-Add-Long--vector--?lang=en VUMLAL: {u: 0b1, opcode: 0b1000, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement2S: {q: 0b0, size: 0b10}, VectorArrangement4H: {q: 0b0, size: 0b01}, VectorArrangement8B: {q: 0b0, size: 0b00}, }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMULL--SMULL2--vector---Signed-Multiply-Long--vector--?lang=en + SMULL: {u: 0b0, opcode: 0b1100, qAndSize: map[VectorArrangement]qAndSize{ + VectorArrangement8B: {q: 0b0, size: 0b00}, + VectorArrangement4H: {q: 0b0, size: 0b01}, + VectorArrangement2S: {q: 0b0, size: 0b10}, + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMULL--SMULL2--vector---Signed-Multiply-Long--vector--?lang=en + SMULL2: {u: 0b0, opcode: 0b1100, qAndSize: 
map[VectorArrangement]qAndSize{ + VectorArrangement16B: {q: 0b1, size: 0b00}, + VectorArrangement8H: {q: 0b1, size: 0b01}, + VectorArrangement4S: {q: 0b1, size: 0b10}, + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en + UMULL: {u: 0b1, opcode: 0b1100, qAndSize: map[VectorArrangement]qAndSize{ + VectorArrangement8B: {q: 0b0, size: 0b00}, + VectorArrangement4H: {q: 0b0, size: 0b01}, + VectorArrangement2S: {q: 0b0, size: 0b10}, + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en + UMULL2: {u: 0b1, opcode: 0b1100, qAndSize: map[VectorArrangement]qAndSize{ + VectorArrangement16B: {q: 0b1, size: 0b00}, + VectorArrangement8H: {q: 0b1, size: 0b01}, + VectorArrangement4S: {q: 0b1, size: 0b10}, + }}, } // advancedSIMDThreeSame holds information to encode instructions as "Advanced SIMD three same" in @@ -3034,6 +3081,12 @@ var advancedSIMDThreeSame = map[asm.Instruction]struct { VectorArrangement8B: {q: 0b0, size: 0b10}, VectorArrangement16B: {q: 0b1, size: 0b10}, }}, + SQRDMULH: {u: 0b1, opcode: 0b10110, qAndSize: map[VectorArrangement]qAndSize{ + VectorArrangement4H: {q: 0b0, size: 0b01}, + VectorArrangement8H: {q: 0b1, size: 0b01}, + VectorArrangement2S: {q: 0b0, size: 0b10}, + VectorArrangement4S: {q: 0b1, size: 0b10}, + }}, } // aAndSize is a pair of "Q" and "size" that appear in https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en @@ -3249,59 +3302,62 @@ var advancedSIMDTableLookup = map[asm.Instruction]struct { TBL2: {op: 0, op2: 0, Len: 0b01, q: map[VectorArrangement]byte{VectorArrangement16B: 0b1, VectorArrangement8B: 0b0}}, } -// advancedSIMDScalarTwoRegisterMisc holds information to encode instructions as "Advanced SIMD scalar two-register miscellaneous" in -// 
https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en -var advancedSIMDScalarTwoRegisterMisc = map[asm.Instruction]struct { - U, opcode byte - qAndSize map[VectorArrangement]qAndSize -}{ - // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMEQ--zero---Compare-bitwise-Equal-to-zero--vector--?lang=en - CMEQZERO: {U: 0b0, opcode: 0b01001, qAndSize: defaultQAndSize}, -} - // advancedSIMDShiftByImmediate holds information to encode instructions as "Advanced SIMD shift by immediate" in // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en var advancedSIMDShiftByImmediate = map[asm.Instruction]struct { - U, opcode byte - immQResolver func(shiftAmount int64, arr VectorArrangement) (immh, immb, q byte, err error) + U, opcode byte + q map[VectorArrangement]byte + immResolver func(shiftAmount int64, arr VectorArrangement) (immh, immb byte, err error) }{ // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SSHLL--SSHLL2--Signed-Shift-Left-Long--immediate-- - SSHLLIMM: {U: 0b0, opcode: 0b10100, immQResolver: immResolverForSIMDSiftLeftByImmediate}, + SSHLL: {U: 0b0, opcode: 0b10100, + q: map[VectorArrangement]byte{VectorArrangement8B: 0b0, VectorArrangement4H: 0b0, VectorArrangement2S: 0b0}, + immResolver: immResolverForSIMDSiftLeftByImmediate, + }, + // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SSHLL--SSHLL2--Signed-Shift-Left-Long--immediate-- + SSHLL2: {U: 0b0, opcode: 0b10100, + q: map[VectorArrangement]byte{VectorArrangement16B: 0b1, VectorArrangement8H: 0b1, VectorArrangement4S: 0b1}, + immResolver: immResolverForSIMDSiftLeftByImmediate, + }, // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/USHLL--USHLL2--Unsigned-Shift-Left-Long--immediate-- - USHLLIMM: {U: 0b1, opcode: 
0b10100, immQResolver: immResolverForSIMDSiftLeftByImmediate}, + USHLL: {U: 0b1, opcode: 0b10100, + q: map[VectorArrangement]byte{VectorArrangement8B: 0b0, VectorArrangement4H: 0b0, VectorArrangement2S: 0b0}, + immResolver: immResolverForSIMDSiftLeftByImmediate, + }, + // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/USHLL--USHLL2--Unsigned-Shift-Left-Long--immediate-- + USHLL2: {U: 0b1, opcode: 0b10100, + q: map[VectorArrangement]byte{VectorArrangement16B: 0b1, VectorArrangement8H: 0b1, VectorArrangement4S: 0b1}, + immResolver: immResolverForSIMDSiftLeftByImmediate, + }, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHR--Signed-Shift-Right--immediate--?lang=en - SSHR: {U: 0b0, opcode: 0b00000, immQResolver: func(shiftAmount int64, arr VectorArrangement) (immh, immb, q byte, err error) { - switch arr { - case VectorArrangement16B, VectorArrangement8B: - immh = 0b0001 - immb = 8 - byte(shiftAmount&0b111) - if arr == VectorArrangement16B { - q = 1 - } - case VectorArrangement8H, VectorArrangement4H: - v := 16 - byte(shiftAmount&0b1111) - immb = v & 0b111 - immh = 0b0010 | (v >> 3) - if arr == VectorArrangement8H { - q = 1 - } - case VectorArrangement4S, VectorArrangement2S: - v := 32 - byte(shiftAmount&0b11111) - immb = v & 0b111 - immh = 0b0100 | (v >> 3) - if arr == VectorArrangement4S { - q = 1 + SSHR: {U: 0b0, opcode: 0b00000, + q: map[VectorArrangement]byte{ + VectorArrangement16B: 0b1, VectorArrangement8H: 0b1, VectorArrangement4S: 0b1, VectorArrangement2D: 0b1, + VectorArrangement8B: 0b0, VectorArrangement4H: 0b0, VectorArrangement2S: 0b0, + }, + immResolver: func(shiftAmount int64, arr VectorArrangement) (immh, immb byte, err error) { + switch arr { + case VectorArrangement16B, VectorArrangement8B: + immh = 0b0001 + immb = 8 - byte(shiftAmount&0b111) + case VectorArrangement8H, VectorArrangement4H: + v := 16 - byte(shiftAmount&0b1111) + immb = v & 0b111 + immh = 0b0010 | (v >> 3) + case 
VectorArrangement4S, VectorArrangement2S: + v := 32 - byte(shiftAmount&0b11111) + immb = v & 0b111 + immh = 0b0100 | (v >> 3) + case VectorArrangement2D: + v := 64 - byte(shiftAmount&0b111111) + immb = v & 0b111 + immh = 0b1000 | (v >> 3) + default: + err = fmt.Errorf("unsupported arrangement %s", arr) } - case VectorArrangement2D: - v := 64 - byte(shiftAmount&0b111111) - immb = v & 0b111 - immh = 0b1000 | (v >> 3) - q = 1 - default: - err = fmt.Errorf("unsupported arrangement %s", arr) - } - return - }}, + return + }, + }, } // advancedSIMDPermute holds information to encode instructions as "Advanced SIMD permute" in @@ -3312,17 +3368,15 @@ var advancedSIMDPermute = map[asm.Instruction]struct { ZIP1: {opcode: 0b011}, } -func immResolverForSIMDSiftLeftByImmediate(shiftAmount int64, arr VectorArrangement) (immh, immb, q byte, err error) { - q = 0b0 - +func immResolverForSIMDSiftLeftByImmediate(shiftAmount int64, arr VectorArrangement) (immh, immb byte, err error) { switch arr { - case VectorArrangement8B: + case VectorArrangement16B, VectorArrangement8B: immb = byte(shiftAmount) immh = 0b0001 - case VectorArrangement4H: + case VectorArrangement8H, VectorArrangement4H: immb = byte(shiftAmount) & 0b111 immh = 0b0010 | byte(shiftAmount>>3) - case VectorArrangement2S: + case VectorArrangement4S, VectorArrangement2S: immb = byte(shiftAmount) & 0b111 immh = 0b0100 | byte(shiftAmount>>3) default: @@ -3425,7 +3479,7 @@ func (a *AssemblerImpl) EncodeVectorRegisterToVectorRegister(n *NodeImpl) (err e (srcVectorRegBits << 5) | dstVectorRegBits, twoRegMisc.opcode<<4 | 0b1<<3 | srcVectorRegBits>>3, qs.size<<6 | 0b1<<5 | twoRegMisc.opcode>>4, - qs.q<<6 | twoRegMisc.U<<5 | 0b01110, + qs.q<<6 | twoRegMisc.u<<5 | 0b01110, }) return nil } @@ -3478,25 +3532,17 @@ func (a *AssemblerImpl) EncodeVectorRegisterToVectorRegister(n *NodeImpl) (err e return } - if scalaTwoMisc, ok := advancedSIMDScalarTwoRegisterMisc[n.Instruction]; ok { - qs, ok := scalaTwoMisc.qAndSize[n.VectorArrangement] 
- if !ok { - return fmt.Errorf("unsupported vector arrangement %s for %s", n.VectorArrangement, InstructionName(n.Instruction)) - } - a.Buf.Write([]byte{ - (dstVectorRegBits << 5) | dstVectorRegBits, - 0b100110<<2 | dstVectorRegBits>>3, - qs.size<<6 | 0b1<<5, - qs.q<<6 | scalaTwoMisc.U<<5 | 0b01001110, - }) - return - } - if shiftByImmediate, ok := advancedSIMDShiftByImmediate[n.Instruction]; ok { - immh, immb, q, err := shiftByImmediate.immQResolver(n.SrcConst, n.VectorArrangement) + immh, immb, err := shiftByImmediate.immResolver(n.SrcConst, n.VectorArrangement) if err != nil { return err } + + q, ok := shiftByImmediate.q[n.VectorArrangement] + if !ok { + return fmt.Errorf("unsupported vector arrangement %s for %s", n.VectorArrangement, InstructionName(n.Instruction)) + } + a.Buf.Write([]byte{ (srcVectorRegBits << 5) | dstVectorRegBits, shiftByImmediate.opcode<<3 | 0b1<<2 | srcVectorRegBits>>3, diff --git a/internal/asm/arm64/impl_test.go b/internal/asm/arm64/impl_test.go index a7ee786670..e1b6b16abe 100644 --- a/internal/asm/arm64/impl_test.go +++ b/internal/asm/arm64/impl_test.go @@ -1030,7 +1030,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { name: "ushll v10.8h, v2.8b, #0", x1: RegV2, x2: RegV10, - inst: USHLLIMM, + inst: USHLL, exp: []byte{0x4a, 0xa4, 0x8, 0x2f}, arr: VectorArrangement8B, }, @@ -1038,7 +1038,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { name: "ushll v10.8h, v2.8b, #7", x1: RegV2, x2: RegV10, - inst: USHLLIMM, + inst: USHLL, exp: []byte{0x4a, 0xa4, 0xf, 0x2f}, arr: VectorArrangement8B, c: 7, @@ -1182,7 +1182,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { name: "sshll v10.8h, v2.8b, #7", x1: RegV2, x2: RegV10, - inst: SSHLLIMM, exp: []byte{0x4a, 0xa4, 0xf, 0xf}, + inst: SSHLL, exp: []byte{0x4a, 0xa4, 0xf, 0xf}, arr: VectorArrangement8B, c: 7, }, @@ -1190,7 +1190,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { 
name: "sshll v10.4s, v2.4h, #0", x1: RegV2, x2: RegV10, - inst: SSHLLIMM, + inst: SSHLL, exp: []byte{0x4a, 0xa4, 0x10, 0xf}, arr: VectorArrangement4H, }, @@ -1198,7 +1198,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { name: "sshll v10.4s, v2.4h, #0xf", x1: RegV2, x2: RegV10, - inst: SSHLLIMM, + inst: SSHLL, exp: []byte{0x4a, 0xa4, 0x1f, 0xf}, arr: VectorArrangement4H, c: 15, @@ -1207,7 +1207,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { name: "sshll v10.2d, v2.2s, #0", x1: RegV2, x2: RegV10, - inst: SSHLLIMM, + inst: SSHLL, exp: []byte{0x4a, 0xa4, 0x20, 0xf}, arr: VectorArrangement2S, }, @@ -1215,7 +1215,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { name: "sshll v10.2d, v2.2s, #0x1f", x1: RegV2, x2: RegV10, - inst: SSHLLIMM, + inst: SSHLL, exp: []byte{0x4a, 0xa4, 0x3f, 0xf}, arr: VectorArrangement2S, c: 31, @@ -1396,10 +1396,10 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { { x1: RegRZR, x2: RegV30, - name: "cmeq v30.2d, v30.2d, #0", + name: "cmeq v30.2d, v0.2d, #0", inst: CMEQZERO, arr: VectorArrangement2D, - exp: []byte{0xde, 0x9b, 0xe0, 0x4e}, + exp: []byte{0x1e, 0x98, 0xe0, 0x4e}, }, { name: "tbl v1.8b, {v0.16b}, v1.8b", @@ -1754,6 +1754,170 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { exp: []byte{0x3e, 0x3b, 0xb0, 0x6e}, arr: VectorArrangement4S, }, + { + x1: RegV25, + x2: RegV30, + name: "saddlp v30.2d, v25.4s", + inst: SADDLP, + exp: []byte{0x3e, 0x2b, 0xa0, 0x4e}, + arr: VectorArrangement4S, + }, + { + x1: RegV25, + x2: RegV30, + name: "saddlp v30.4s, v25.8h", + inst: SADDLP, + exp: []byte{0x3e, 0x2b, 0x60, 0x4e}, + arr: VectorArrangement8H, + }, + { + x1: RegV25, + x2: RegV30, + name: "uaddlp v30.2d, v25.4s", + inst: UADDLP, + exp: []byte{0x3e, 0x2b, 0xa0, 0x6e}, + arr: VectorArrangement4S, + }, + { + x1: RegV25, + x2: RegV30, + name: "uaddlp v30.4s, v25.8h", + inst: UADDLP, + exp: 
[]byte{0x3e, 0x2b, 0x60, 0x6e}, + arr: VectorArrangement8H, + }, + { + name: "sshll2 v10.8h, v2.16b, #7", + x1: RegV2, + x2: RegV10, + inst: SSHLL2, + exp: []byte{0x4a, 0xa4, 0xf, 0x4f}, + arr: VectorArrangement16B, + c: 7, + }, + { + name: "sshll2 v10.4s, v2.8h, #0", + x1: RegV2, + x2: RegV10, + inst: SSHLL2, + exp: []byte{0x4a, 0xa4, 0x10, 0x4f}, + arr: VectorArrangement8H, + }, + { + name: "sshll2 v10.2d, v2.4s, #0x15", + x1: RegV2, + x2: RegV10, + inst: SSHLL2, + exp: []byte{0x4a, 0xa4, 0x35, 0x4f}, + arr: VectorArrangement4S, + c: 21, + }, + { + name: "ushll2 v10.8h, v2.16b, #7", + x1: RegV2, + x2: RegV10, + inst: USHLL2, + exp: []byte{0x4a, 0xa4, 0xf, 0x6f}, + arr: VectorArrangement16B, + c: 7, + }, + { + name: "ushll2 v10.4s, v2.8h, #0", + x1: RegV2, + x2: RegV10, + inst: USHLL2, + exp: []byte{0x4a, 0xa4, 0x10, 0x6f}, + arr: VectorArrangement8H, + }, + { + name: "ushll2 v10.2d, v2.4s, #0x15", + x1: RegV2, + x2: RegV10, + inst: USHLL2, + exp: []byte{0x4a, 0xa4, 0x35, 0x6f}, + arr: VectorArrangement4S, + c: 21, + }, + { + x1: RegV25, + x2: RegV30, + name: "fcvtzs v30.4s, v25.4s", + inst: VFCVTZS, + exp: []byte{0x3e, 0xbb, 0xa1, 0x4e}, + arr: VectorArrangement4S, + }, + { + x1: RegV25, + x2: RegV30, + name: "fcvtzs v30.2s, v25.2s", + inst: VFCVTZS, + exp: []byte{0x3e, 0xbb, 0xa1, 0xe}, + arr: VectorArrangement2S, + }, + { + x1: RegV25, + x2: RegV30, + name: "fcvtzs v30.2d, v25.2d", + inst: VFCVTZS, + exp: []byte{0x3e, 0xbb, 0xe1, 0x4e}, + arr: VectorArrangement2D, + }, + { + x1: RegV25, + x2: RegV30, + name: "fcvtzu v30.4s, v25.4s", + inst: VFCVTZU, + exp: []byte{0x3e, 0xbb, 0xa1, 0x6e}, + arr: VectorArrangement4S, + }, + { + x1: RegV25, + x2: RegV30, + name: "fcvtzu v30.2s, v25.2s", + inst: VFCVTZU, + exp: []byte{0x3e, 0xbb, 0xa1, 0x2e}, + arr: VectorArrangement2S, + }, + { + x1: RegV25, + x2: RegV30, + name: "fcvtzu v30.2d, v25.2d", + inst: VFCVTZU, + exp: []byte{0x3e, 0xbb, 0xe1, 0x6e}, + arr: VectorArrangement2D, + }, + { + x1: RegV25, + x2: RegV30, + name: 
"sqxtn v30.2s, v25.2d", + inst: SQXTN, + exp: []byte{0x3e, 0x4b, 0xa1, 0xe}, + arr: VectorArrangement2S, + }, + { + x1: RegV25, + x2: RegV30, + name: "sqxtn v30.4h, v25.4s", + inst: SQXTN, + exp: []byte{0x3e, 0x4b, 0x61, 0xe}, + arr: VectorArrangement4H, + }, + { + x1: RegV25, + x2: RegV30, + name: "uqxtn v30.2s, v25.2d", + inst: UQXTN, + exp: []byte{0x3e, 0x4b, 0xa1, 0x2e}, + arr: VectorArrangement2S, + }, + { + x1: RegV25, + x2: RegV30, + name: "uqxtn v30.4h, v25.4s", + inst: UQXTN, + exp: []byte{0x3e, 0x4b, 0x61, 0x2e}, + arr: VectorArrangement4H, + }, } for _, tt := range tests { @@ -2567,6 +2731,163 @@ func TestAssemblerImpl_encodeTwoVectorRegistersToVectorRegister(t *testing.T) { }, exp: []byte{0x9e, 0x1c, 0xab, 0x2e}, }, + { + name: "sqrdmulh v30.8h, v4.8h, v11.8h", + n: &NodeImpl{ + Instruction: SQRDMULH, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement8H, + }, + exp: []byte{0x9e, 0xb4, 0x6b, 0x6e}, + }, + { + name: "sqrdmulh v30.4s, v4.4s, v11.4s", + n: &NodeImpl{ + Instruction: SQRDMULH, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement4S, + }, + exp: []byte{0x9e, 0xb4, 0xab, 0x6e}, + }, + { + name: "smull v30.8h, v4.8b, v11.8b", + n: &NodeImpl{ + Instruction: SMULL, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement8B, + }, + exp: []byte{0x9e, 0xc0, 0x2b, 0xe}, + }, + { + name: "smull v30.4s, v4.4h, v11.4h", + n: &NodeImpl{ + Instruction: SMULL, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement4H, + }, + exp: []byte{0x9e, 0xc0, 0x6b, 0xe}, + }, + { + name: "smull v30.2d, v4.2s, v11.2s", + n: &NodeImpl{ + Instruction: SMULL, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement2S, + }, + exp: []byte{0x9e, 0xc0, 0xab, 0xe}, + }, + { + name: "smull2 v30.8h, v4.16b, v11.16b", + n: &NodeImpl{ + Instruction: SMULL2, + DstReg: RegV30, + SrcReg: RegV11, + 
SrcReg2: RegV4, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0x9e, 0xc0, 0x2b, 0x4e}, + }, + { + name: "smull2 v30.4s, v4.8h, v11.8h", + n: &NodeImpl{ + Instruction: SMULL2, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement8H, + }, + exp: []byte{0x9e, 0xc0, 0x6b, 0x4e}, + }, + { + name: "smull2 v30.2d, v4.4s, v11.4s", + n: &NodeImpl{ + Instruction: SMULL2, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement4S, + }, + exp: []byte{0x9e, 0xc0, 0xab, 0x4e}, + }, + + ////////////////////// + + { + name: "umull v30.8h, v4.8b, v11.8b", + n: &NodeImpl{ + Instruction: UMULL, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement8B, + }, + exp: []byte{0x9e, 0xc0, 0x2b, 0x2e}, + }, + { + name: "umull v30.4s, v4.4h, v11.4h", + n: &NodeImpl{ + Instruction: UMULL, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement4H, + }, + exp: []byte{0x9e, 0xc0, 0x6b, 0x2e}, + }, + { + name: "umull v30.2d, v4.2s, v11.2s", + n: &NodeImpl{ + Instruction: UMULL, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement2S, + }, + exp: []byte{0x9e, 0xc0, 0xab, 0x2e}, + }, + { + name: "umull2 v30.8h, v4.16b, v11.16b", + n: &NodeImpl{ + Instruction: UMULL2, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0x9e, 0xc0, 0x2b, 0x6e}, + }, + { + name: "umull2 v30.4s, v4.8h, v11.8h", + n: &NodeImpl{ + Instruction: UMULL2, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement8H, + }, + exp: []byte{0x9e, 0xc0, 0x6b, 0x6e}, + }, + { + name: "umull2 v30.2d, v4.4s, v11.4s", + n: &NodeImpl{ + Instruction: UMULL2, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement4S, + }, + exp: []byte{0x9e, 0xc0, 0xab, 0x6e}, + }, } for _, tt := range tests { diff --git 
a/internal/engine/compiler/compiler_vec_test.go b/internal/engine/compiler/compiler_vec_test.go index e4ad21307f..c4801c5f52 100644 --- a/internal/engine/compiler/compiler_vec_test.go +++ b/internal/engine/compiler/compiler_vec_test.go @@ -5094,11 +5094,6 @@ func TestCompiler_compileV128_Pmax_Pmin(t *testing.T) { } func TestCompiler_compileV128ExtMul(t *testing.T) { - if runtime.GOARCH != "amd64" { - // TODO: implement on amd64. - t.Skip() - } - tests := []struct { name string shape wazeroir.Shape @@ -5759,11 +5754,6 @@ func TestCompiler_compileV128ExtMul(t *testing.T) { } func TestCompiler_compileV128Extend(t *testing.T) { - if runtime.GOARCH != "amd64" { - // TODO: implement on amd64. - t.Skip() - } - tests := []struct { name string shape wazeroir.Shape @@ -6237,10 +6227,6 @@ func TestCompiler_compileV128Extend(t *testing.T) { } func TestCompiler_compileV128Q15mulrSatS(t *testing.T) { - if runtime.GOARCH != "amd64" { - // TODO: implement on amd64. - t.Skip() - } tests := []struct { name string @@ -6508,10 +6494,6 @@ func TestCompiler_compileV128FloatDemote(t *testing.T) { } func TestCompiler_compileV128ExtAddPairwise(t *testing.T) { - if runtime.GOARCH != "amd64" { - // TODO: implement on amd64. - t.Skip() - } tests := []struct { name string @@ -7179,10 +7161,6 @@ func TestCompiler_compileV128Dot(t *testing.T) { } func TestCompiler_compileV128ITruncSatFromF(t *testing.T) { - if runtime.GOARCH != "amd64" { - // TODO: implement on amd64. 
- t.Skip() - } tests := []struct { name string diff --git a/internal/engine/compiler/impl_vec_arm64.go b/internal/engine/compiler/impl_vec_arm64.go index d609c129c2..e873c6fad0 100644 --- a/internal/engine/compiler/impl_vec_arm64.go +++ b/internal/engine/compiler/impl_vec_arm64.go @@ -151,7 +151,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV, arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementD, ) - c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLLIMM, result, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLL, result, result, arm64.VectorArrangement8B, arm64.VectorIndexNone, arm64.VectorIndexNone) case wazeroir.V128LoadType8x8u: offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8) @@ -161,7 +161,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV, arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementD, ) - c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLLIMM, result, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLL, result, result, arm64.VectorArrangement8B, arm64.VectorIndexNone, arm64.VectorIndexNone) case wazeroir.V128LoadType16x4s: offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8) @@ -171,7 +171,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV, arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementD, ) - c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLLIMM, result, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLL, result, result, arm64.VectorArrangement4H, arm64.VectorIndexNone, arm64.VectorIndexNone) case wazeroir.V128LoadType16x4u: offset, err := 
c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8) @@ -181,7 +181,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV, arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementD, ) - c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLLIMM, result, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLL, result, result, arm64.VectorArrangement4H, arm64.VectorIndexNone, arm64.VectorIndexNone) case wazeroir.V128LoadType32x2s: offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8) @@ -191,7 +191,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV, arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementD, ) - c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLLIMM, result, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLL, result, result, arm64.VectorArrangement2S, arm64.VectorIndexNone, arm64.VectorIndexNone) case wazeroir.V128LoadType32x2u: offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8) @@ -201,7 +201,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV, arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementD, ) - c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLLIMM, result, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLL, result, result, arm64.VectorArrangement2S, arm64.VectorIndexNone, arm64.VectorIndexNone) case wazeroir.V128LoadType8Splat: offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 1) @@ -1264,22 +1264,80 @@ func (c *arm64Compiler) compileV128Nearest(o *wazeroir.OperationV128Nearest) err // compileV128Extend implements compiler.compileV128Extend for arm64. 
func (c *arm64Compiler) compileV128Extend(o *wazeroir.OperationV128Extend) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) + var inst asm.Instruction + var arr arm64.VectorArrangement + if o.UseLow { + if o.Signed { + inst = arm64.SSHLL + } else { + inst = arm64.USHLL + } + + switch o.OriginShape { + case wazeroir.ShapeI8x16: + arr = arm64.VectorArrangement8B + case wazeroir.ShapeI16x8: + arr = arm64.VectorArrangement4H + case wazeroir.ShapeI32x4: + arr = arm64.VectorArrangement2S + } + } else { + if o.Signed { + inst = arm64.SSHLL2 + } else { + inst = arm64.USHLL2 + } + arr = defaultArrangementForShape(o.OriginShape) + } + + return c.compileV128UniOp(inst, arr) } // compileV128ExtMul implements compiler.compileV128ExtMul for arm64. func (c *arm64Compiler) compileV128ExtMul(o *wazeroir.OperationV128ExtMul) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) + var inst asm.Instruction + var arr arm64.VectorArrangement + if o.UseLow { + if o.Signed { + inst = arm64.SMULL + } else { + inst = arm64.UMULL + } + + switch o.OriginShape { + case wazeroir.ShapeI8x16: + arr = arm64.VectorArrangement8B + case wazeroir.ShapeI16x8: + arr = arm64.VectorArrangement4H + case wazeroir.ShapeI32x4: + arr = arm64.VectorArrangement2S + } + } else { + if o.Signed { + inst = arm64.SMULL2 + } else { + inst = arm64.UMULL2 + } + arr = defaultArrangementForShape(o.OriginShape) + } + + return c.compileV128x2BinOp(inst, arr) } // compileV128Q15mulrSatS implements compiler.compileV128Q15mulrSatS for arm64. 
-func (c *arm64Compiler) compileV128Q15mulrSatS(o *wazeroir.OperationV128Q15mulrSatS) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) +func (c *arm64Compiler) compileV128Q15mulrSatS(*wazeroir.OperationV128Q15mulrSatS) error { + return c.compileV128x2BinOp(arm64.SQRDMULH, arm64.VectorArrangement8H) } // compileV128ExtAddPairwise implements compiler.compileV128ExtAddPairwise for arm64. func (c *arm64Compiler) compileV128ExtAddPairwise(o *wazeroir.OperationV128ExtAddPairwise) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) + var inst asm.Instruction + if o.Signed { + inst = arm64.SADDLP + } else { + inst = arm64.UADDLP + } + return c.compileV128UniOp(inst, defaultArrangementForShape(o.OriginShape)) } // compileV128FloatPromote implements compiler.compileV128FloatPromote for arm64. @@ -1308,6 +1366,35 @@ func (c *arm64Compiler) compileV128Narrow(o *wazeroir.OperationV128Narrow) error } // compileV128ITruncSatFromF implements compiler.compileV128ITruncSatFromF for arm64. 
-func (c *arm64Compiler) compileV128ITruncSatFromF(o *wazeroir.OperationV128ITruncSatFromF) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) +func (c *arm64Compiler) compileV128ITruncSatFromF(o *wazeroir.OperationV128ITruncSatFromF) (err error) { + v := c.locationStack.popV128() + if err = c.compileEnsureOnGeneralPurposeRegister(v); err != nil { + return err + } + + var cvt asm.Instruction + if o.Signed { + cvt = arm64.VFCVTZS + } else { + cvt = arm64.VFCVTZU + } + + c.assembler.CompileVectorRegisterToVectorRegister(cvt, v.register, v.register, + defaultArrangementForShape(o.OriginShape), arm64.VectorIndexNone, arm64.VectorIndexNone, + ) + + if o.OriginShape == wazeroir.ShapeF64x2 { + var narrow asm.Instruction + if o.Signed { + narrow = arm64.SQXTN + } else { + narrow = arm64.UQXTN + } + c.assembler.CompileVectorRegisterToVectorRegister(narrow, v.register, v.register, + arm64.VectorArrangement2S, arm64.VectorIndexNone, arm64.VectorIndexNone, + ) + } + + c.pushVectorRuntimeValueLocationOnRegister(v.register) + return } diff --git a/internal/integration_test/asm/arm64_debug/golang_asm.go b/internal/integration_test/asm/arm64_debug/golang_asm.go index 03fa844d2f..05c769dda7 100644 --- a/internal/integration_test/asm/arm64_debug/golang_asm.go +++ b/internal/integration_test/asm/arm64_debug/golang_asm.go @@ -361,7 +361,7 @@ func (a *assemblerGoAsmImpl) CompileVectorRegisterToVectorRegister(instruction a func (a *assemblerGoAsmImpl) CompileVectorRegisterToVectorRegisterWithConst(instruction asm.Instruction, srcReg, dstReg asm.Register, arrangement asm_arm64.VectorArrangement, c asm.ConstantValue) { switch instruction { - case asm_arm64.USHLLIMM: + case asm_arm64.USHLL: var dstArrangement asm_arm64.VectorArrangement if arrangement == asm_arm64.VectorArrangement8B { dstArrangement = asm_arm64.VectorArrangement8H @@ -667,5 +667,5 @@ var castAsGolangAsmInstruction = [...]obj.As{ asm_arm64.VMOV: arm64.AVMOV, asm_arm64.VADD: 
arm64.AVADD, asm_arm64.VSUB: arm64.AVSUB, - asm_arm64.USHLLIMM: arm64.AVUSHLL, + asm_arm64.USHLL: arm64.AVUSHLL, } diff --git a/internal/integration_test/spectest/v2/spec_test.go b/internal/integration_test/spectest/v2/spec_test.go index 21eef59a90..8aa41a3af5 100644 --- a/internal/integration_test/spectest/v2/spec_test.go +++ b/internal/integration_test/spectest/v2/spec_test.go @@ -26,11 +26,8 @@ func TestCompiler(t *testing.T) { spectest.Run(t, testcases, compiler.NewEngine, enabledFeatures, func(jsonname string) bool { switch path.Base(jsonname) { - case "simd_f64x2_pmin_pmax.json", "simd_f32x4_pmin_pmax.json", "simd_int_to_int_extend.json", - "simd_i64x2_extmul_i32x4.json", "simd_i32x4_extmul_i16x8.json", "simd_i16x8_extmul_i8x16.json", - "simd_i16x8_q15mulr_sat_s.json", "simd_i16x8_extadd_pairwise_i8x16.json", "simd_i32x4_extadd_pairwise_i16x8.json", - "simd_i32x4_dot_i16x8.json", "simd_i32x4_trunc_sat_f32x4.json", - "simd_splat.json", "simd_load.json", "simd_i32x4_trunc_sat_f64x2.json", + case "simd_f64x2_pmin_pmax.json", "simd_f32x4_pmin_pmax.json", + "simd_i32x4_dot_i16x8.json", "simd_splat.json", "simd_load.json", "simd_conversions.json": // TODO: implement on arm64. return runtime.GOARCH == "amd64"