From 43493145e6438e02226c32042eb28d2fadd354c8 Mon Sep 17 00:00:00 2001 From: Takeshi Yoneda Date: Mon, 20 Jun 2022 16:29:48 +0900 Subject: [PATCH 1/6] Encoding Signed-off-by: Takeshi Yoneda --- internal/asm/arm64/assembler.go | 2 +- internal/asm/arm64/consts.go | 52 +++- internal/asm/arm64/impl.go | 162 ++++++----- internal/asm/arm64/impl_test.go | 259 +++++++++++++++++- internal/engine/compiler/impl_vec_arm64.go | 12 +- .../asm/arm64_debug/golang_asm.go | 4 +- 6 files changed, 399 insertions(+), 92 deletions(-) diff --git a/internal/asm/arm64/assembler.go b/internal/asm/arm64/assembler.go index 5c1beba661..316d7cce27 100644 --- a/internal/asm/arm64/assembler.go +++ b/internal/asm/arm64/assembler.go @@ -92,7 +92,7 @@ type Assembler interface { // CompileVectorRegisterToVectorRegisterWithConst is the same as CompileVectorRegisterToVectorRegister but the // additional constant can be provided. - // For example, the const can be used to specify the shift amount for USHLLIMM instruction. + // For example, the const can be used to specify the shift amount for USHLL instruction. CompileVectorRegisterToVectorRegisterWithConst(instruction asm.Instruction, srcReg, dstReg asm.Register, arrangement VectorArrangement, c asm.ConstantValue) diff --git a/internal/asm/arm64/consts.go b/internal/asm/arm64/consts.go index d3b234d1ee..40a403b441 100644 --- a/internal/asm/arm64/consts.go +++ b/internal/asm/arm64/consts.go @@ -718,12 +718,12 @@ const ( VFSUBD // SSHL is the SSHL(vector,register) instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHL--Signed-Shift-Left--register--?lang=en SSHL - // SSHLLIMM is the SSHLL(vector,immediate) instruction. https://developer.arm.com/documentation/dui0801/h/A64-SIMD-Vector-Instructions/SSHLL--SSHLL2--vector- - SSHLLIMM + // SSHLL is the SSHLL(vector,immediate) instruction. https://developer.arm.com/documentation/dui0801/h/A64-SIMD-Vector-Instructions/SSHLL--SSHLL2--vector- + SSHLL // USHL is the USHL(vector,register) instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHL--Signed-Shift-Left--register--?lang=en USHL - // USHLLIMM is the USHLL(vector,immediate) instruction. https://developer.arm.com/documentation/dui0801/h/A64-SIMD-Vector-Instructions/SSHLL--SSHLL2--vector- - USHLLIMM + // USHLL is the USHLL(vector,immediate) instruction. https://developer.arm.com/documentation/dui0801/h/A64-SIMD-Vector-Instructions/SSHLL--SSHLL2--vector- + USHLL // LD1R is the LD1R instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/LD1R--Load-one-single-element-structure-and-Replicate-to-all-lanes--of-one-register-- LD1R // SMOV32 is the 32-bit variant of SMOV(vector) instruction. https://developer.arm.com/documentation/100069/0610/A64-SIMD-Vector-Instructions/SMOV--vector- @@ -856,6 +856,24 @@ const ( VUMLAL // SHLL is the SHLL instruction https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SHLL--SHLL2--Shift-Left-Long--by-element-size--?lang=en SHLL + // SADDLP is the SADDLP instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SADDLP--Signed-Add-Long-Pairwise-?lang=en + SADDLP + // UADDLP is the UADDLP instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UADDLP--Unsigned-Add-Long-Pairwise-?lang=en + UADDLP + // SSHLL2 is the SSHLL2(vector,immediate) instruction. https://developer.arm.com/documentation/dui0801/h/A64-SIMD-Vector-Instructions/SSHLL--SSHLL2--vector- + SSHLL2 + // USHLL2 is the USHLL2(vector,immediate) instruction. https://developer.arm.com/documentation/dui0801/h/A64-SIMD-Vector-Instructions/SSHLL--SSHLL2--vector- + USHLL2 + // SQRDMULH is the SQRDMULH(vector) instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQRDMULH--vector---Signed-saturating-Rounding-Doubling-Multiply-returning-High-half-?lang=en + SQRDMULH + // SMULL is the SMULL(vector) instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMULL--SMULL2--vector---Signed-Multiply-Long--vector--?lang=en + SMULL + // SMULL2 is the SMULL2(vector) instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMULL--SMULL2--vector---Signed-Multiply-Long--vector--?lang=en + SMULL2 + // UMULL is the UMULL instruction. https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en + UMULL + // UMULL2 is the UMULL2 instruction. https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en + UMULL2 // instructionEnd is always placed at the bottom of this iota definition to be used in the test. instructionEnd @@ -1212,10 +1230,10 @@ func InstructionName(i asm.Instruction) string { return "SSHL" case USHL: return "USHL" - case SSHLLIMM: - return "SSHLLIMM" - case USHLLIMM: - return "USHLLIMM" + case SSHLL: + return "SSHLL" + case USHLL: + return "USHLL" case LD1R: return "LD1R" case SMOV32: @@ -1326,6 +1344,24 @@ func InstructionName(i asm.Instruction) string { return "VUMLAL" case SHLL: return "SHLL" + case SSHLL2: + return "SSHLL2" + case USHLL2: + return "USHLL2" + case SQRDMULH: + return "SQRDMULH" + case SADDLP: + return "SADDLP" + case UADDLP: + return "UADDLP" + case SMULL: + return "SMULL" + case SMULL2: + return "SMULL2" + case UMULL: + return "UMULL" + case UMULL2: + return "UMULL2" } panic(fmt.Errorf("unknown instruction %d", i)) } diff --git a/internal/asm/arm64/impl.go b/internal/asm/arm64/impl.go index bf74ecf7f4..166a84c626 100644 --- a/internal/asm/arm64/impl.go +++ b/internal/asm/arm64/impl.go @@ -2849,6 +2849,12 @@ var advancedSIMDTwoRegisterMisc = map[asm.Instruction]struct { VectorArrangement4H: {q: 0b00, size: 0b01}, VectorArrangement2S: {q: 0b00, size: 0b10}, }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMEQ--zero---Compare-bitwise-Equal-to-zero--vector--?lang=en + CMEQZERO: {U: 0b0, opcode: 0b01001, qAndSize: defaultQAndSize}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SADDLP--Signed-Add-Long-Pairwise-?lang=en + SADDLP: {U: 0b0, opcode: 0b00010, qAndSize: defaultQAndSize}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UADDLP--Unsigned-Add-Long-Pairwise-?lang=en + UADDLP: {U: 0b1, opcode: 0b00010, qAndSize: defaultQAndSize}, } // advancedSIMDThreeDifferent holds information to encode instructions as "Advanced SIMD three different" in @@ -2857,11 +2863,36 @@ var advancedSIMDThreeDifferent = map[asm.Instruction]struct { u, opcode byte qAndSize map[VectorArrangement]qAndSize }{ + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMLAL--UMLAL2--vector---Unsigned-Multiply-Add-Long--vector--?lang=en VUMLAL: {u: 0b1, opcode: 0b1000, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement2S: {q: 0b0, size: 0b10}, VectorArrangement4H: {q: 0b0, size: 0b01}, VectorArrangement8B: {q: 0b0, size: 0b00}, }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMULL--SMULL2--vector---Signed-Multiply-Long--vector--?lang=en + SMULL: {u: 0b0, opcode: 0b1100, qAndSize: map[VectorArrangement]qAndSize{ + VectorArrangement8B: {q: 0b0, size: 0b00}, + VectorArrangement4H: {q: 0b0, size: 0b01}, + VectorArrangement2S: {q: 0b0, size: 0b10}, + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMULL--SMULL2--vector---Signed-Multiply-Long--vector--?lang=en + SMULL2: {u: 0b0, opcode: 0b1100, qAndSize: map[VectorArrangement]qAndSize{ + VectorArrangement16B: {q: 0b1, size: 0b00}, + VectorArrangement8H: {q: 0b1, size: 0b01}, + VectorArrangement4S: {q: 0b1, size: 0b10}, + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en + UMULL: {u: 0b1, opcode: 0b1100, qAndSize: map[VectorArrangement]qAndSize{ + VectorArrangement8B: {q: 0b0, size: 0b00}, + VectorArrangement4H: {q: 0b0, size: 0b01}, + VectorArrangement2S: {q: 0b0, size: 0b10}, + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en + UMULL2: {u: 0b1, opcode: 0b1100, qAndSize: map[VectorArrangement]qAndSize{ + VectorArrangement16B: {q: 0b1, size: 0b00}, + VectorArrangement8H: {q: 0b1, size: 0b01}, + VectorArrangement4S: {q: 0b1, size: 0b10}, + }}, } // advancedSIMDThreeSame holds information to encode instructions as "Advanced SIMD three same" in @@ -3034,6 +3065,12 @@ var advancedSIMDThreeSame = map[asm.Instruction]struct { VectorArrangement8B: {q: 0b0, size: 0b10}, VectorArrangement16B: {q: 0b1, size: 0b10}, }}, + SQRDMULH: {u: 0b1, opcode: 0b10110, qAndSize: map[VectorArrangement]qAndSize{ + VectorArrangement4H: {q: 0b0, size: 0b01}, + VectorArrangement8H: {q: 0b1, size: 0b01}, + VectorArrangement2S: {q: 0b0, size: 0b10}, + VectorArrangement4S: {q: 0b1, size: 0b10}, + }}, } // aAndSize is a pair of "Q" and "size" that appear in https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en @@ -3249,59 +3286,62 @@ var advancedSIMDTableLookup = map[asm.Instruction]struct { TBL2: {op: 0, op2: 0, Len: 0b01, q: map[VectorArrangement]byte{VectorArrangement16B: 0b1, VectorArrangement8B: 0b0}}, } -// advancedSIMDScalarTwoRegisterMisc holds information to encode instructions as "Advanced SIMD scalar two-register miscellaneous" in -// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en -var advancedSIMDScalarTwoRegisterMisc = map[asm.Instruction]struct { - U, opcode byte - qAndSize map[VectorArrangement]qAndSize -}{ - // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMEQ--zero---Compare-bitwise-Equal-to-zero--vector--?lang=en - CMEQZERO: {U: 0b0, opcode: 0b01001, qAndSize: defaultQAndSize}, -} - // advancedSIMDShiftByImmediate holds information to encode instructions as "Advanced SIMD shift by immediate" in // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en var advancedSIMDShiftByImmediate = map[asm.Instruction]struct { - U, opcode byte - immQResolver func(shiftAmount int64, arr VectorArrangement) (immh, immb, q byte, err error) + U, opcode byte + q map[VectorArrangement]byte + immResolver func(shiftAmount int64, arr VectorArrangement) (immh, immb byte, err error) }{ // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SSHLL--SSHLL2--Signed-Shift-Left-Long--immediate-- - SSHLLIMM: {U: 0b0, opcode: 0b10100, immQResolver: immResolverForSIMDSiftLeftByImmediate}, + SSHLL: {U: 0b0, opcode: 0b10100, + q: map[VectorArrangement]byte{VectorArrangement8B: 0b0, VectorArrangement4H: 0b0, VectorArrangement2S: 0b0}, + immResolver: immResolverForSIMDSiftLeftByImmediate, + }, + // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SSHLL--SSHLL2--Signed-Shift-Left-Long--immediate-- + SSHLL2: {U: 0b0, opcode: 0b10100, + q: map[VectorArrangement]byte{VectorArrangement16B: 0b1, VectorArrangement8H: 0b1, VectorArrangement4S: 0b1}, + immResolver: immResolverForSIMDSiftLeftByImmediate, + }, // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/USHLL--USHLL2--Unsigned-Shift-Left-Long--immediate-- - USHLLIMM: {U: 0b1, opcode: 0b10100, immQResolver: immResolverForSIMDSiftLeftByImmediate}, + USHLL: {U: 0b1, opcode: 0b10100, + q: map[VectorArrangement]byte{VectorArrangement8B: 0b0, VectorArrangement4H: 0b0, VectorArrangement2S: 0b0}, + immResolver: immResolverForSIMDSiftLeftByImmediate, + }, + // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/USHLL--USHLL2--Unsigned-Shift-Left-Long--immediate-- + USHLL2: {U: 0b1, opcode: 0b10100, + q: map[VectorArrangement]byte{VectorArrangement16B: 0b1, VectorArrangement8H: 0b1, VectorArrangement4S: 0b1}, + immResolver: immResolverForSIMDSiftLeftByImmediate, + }, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHR--Signed-Shift-Right--immediate--?lang=en - SSHR: {U: 0b0, opcode: 0b00000, immQResolver: func(shiftAmount int64, arr VectorArrangement) (immh, immb, q byte, err error) { - switch arr { - case VectorArrangement16B, VectorArrangement8B: - immh = 0b0001 - immb = 8 - byte(shiftAmount&0b111) - if arr == VectorArrangement16B { - q = 1 - } - case VectorArrangement8H, VectorArrangement4H: - v := 16 - byte(shiftAmount&0b1111) - immb = v & 0b111 - immh = 0b0010 | (v >> 3) - if arr == VectorArrangement8H { - q = 1 - } - case VectorArrangement4S, VectorArrangement2S: - v := 32 - byte(shiftAmount&0b11111) - immb = v & 0b111 - immh = 0b0100 | (v >> 3) - if arr == VectorArrangement4S { - q = 1 + SSHR: {U: 0b0, opcode: 0b00000, + q: map[VectorArrangement]byte{ + VectorArrangement16B: 0b1, VectorArrangement8H: 0b1, VectorArrangement4S: 0b1, VectorArrangement2D: 0b1, + VectorArrangement8B: 0b0, VectorArrangement4H: 0b0, VectorArrangement2S: 0b0, + }, + immResolver: func(shiftAmount int64, arr VectorArrangement) (immh, immb byte, err error) { + switch arr { + case VectorArrangement16B, VectorArrangement8B: + immh = 0b0001 + immb = 8 - byte(shiftAmount&0b111) + case VectorArrangement8H, VectorArrangement4H: + v := 16 - byte(shiftAmount&0b1111) + immb = v & 0b111 + immh = 0b0010 | (v >> 3) + case VectorArrangement4S, VectorArrangement2S: + v := 32 - byte(shiftAmount&0b11111) + immb = v & 0b111 + immh = 0b0100 | (v >> 3) + case VectorArrangement2D: + v := 64 - byte(shiftAmount&0b111111) + immb = v & 0b111 + immh = 0b1000 | (v >> 3) + default: + err = fmt.Errorf("unsupported arrangement %s", arr) } - case VectorArrangement2D: - v := 64 - byte(shiftAmount&0b111111) - immb = v & 0b111 - immh = 0b1000 | (v >> 3) - q = 1 - default: - err = fmt.Errorf("unsupported arrangement %s", arr) - } - return - }}, + return + }, + }, } // advancedSIMDPermute holds information to encode instructions as "Advanced SIMD permute" in @@ -3312,17 +3352,15 @@ var advancedSIMDPermute = map[asm.Instruction]struct { ZIP1: {opcode: 0b011}, } -func immResolverForSIMDSiftLeftByImmediate(shiftAmount int64, arr VectorArrangement) (immh, immb, q byte, err error) { - q = 0b0 - +func immResolverForSIMDSiftLeftByImmediate(shiftAmount int64, arr VectorArrangement) (immh, immb byte, err error) { switch arr { - case VectorArrangement8B: + case VectorArrangement16B, VectorArrangement8B: immb = byte(shiftAmount) immh = 0b0001 - case VectorArrangement4H: + case VectorArrangement8H, VectorArrangement4H: immb = byte(shiftAmount) & 0b111 immh = 0b0010 | byte(shiftAmount>>3) - case VectorArrangement2S: + case VectorArrangement4S, VectorArrangement2S: immb = byte(shiftAmount) & 0b111 immh = 0b0100 | byte(shiftAmount>>3) default: @@ -3478,25 +3516,17 @@ func (a *AssemblerImpl) EncodeVectorRegisterToVectorRegister(n *NodeImpl) (err e return } - if scalaTwoMisc, ok := advancedSIMDScalarTwoRegisterMisc[n.Instruction]; ok { - qs, ok := scalaTwoMisc.qAndSize[n.VectorArrangement] - if !ok { - return fmt.Errorf("unsupported vector arrangement %s for %s", n.VectorArrangement, InstructionName(n.Instruction)) - } - a.Buf.Write([]byte{ - (dstVectorRegBits << 5) | dstVectorRegBits, - 0b100110<<2 | dstVectorRegBits>>3, - qs.size<<6 | 0b1<<5, - qs.q<<6 | scalaTwoMisc.U<<5 | 0b01001110, - }) - return - } - if shiftByImmediate, ok := advancedSIMDShiftByImmediate[n.Instruction]; ok { - immh, immb, q, err := shiftByImmediate.immQResolver(n.SrcConst, n.VectorArrangement) + immh, immb, err := shiftByImmediate.immResolver(n.SrcConst, n.VectorArrangement) if err != nil { return err } + + q, ok := shiftByImmediate.q[n.VectorArrangement] + if !ok { + return fmt.Errorf("unsupported vector arrangement %s for %s", n.VectorArrangement, InstructionName(n.Instruction)) + } + a.Buf.Write([]byte{ (srcVectorRegBits << 5) | dstVectorRegBits, shiftByImmediate.opcode<<3 | 0b1<<2 | srcVectorRegBits>>3, diff --git a/internal/asm/arm64/impl_test.go b/internal/asm/arm64/impl_test.go index a7ee786670..4de81355a7 100644 --- a/internal/asm/arm64/impl_test.go +++ b/internal/asm/arm64/impl_test.go @@ -1030,7 +1030,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { name: "ushll v10.8h, v2.8b, #0", x1: RegV2, x2: RegV10, - inst: USHLLIMM, + inst: USHLL, exp: []byte{0x4a, 0xa4, 0x8, 0x2f}, arr: VectorArrangement8B, }, @@ -1038,7 +1038,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { name: "ushll v10.8h, v2.8b, #7", x1: RegV2, x2: RegV10, - inst: USHLLIMM, + inst: USHLL, exp: []byte{0x4a, 0xa4, 0xf, 0x2f}, arr: VectorArrangement8B, c: 7, @@ -1182,7 +1182,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { name: "sshll v10.8h, v2.8b, #7", x1: RegV2, x2: RegV10, - inst: SSHLLIMM, exp: []byte{0x4a, 0xa4, 0xf, 0xf}, + inst: SSHLL, exp: []byte{0x4a, 0xa4, 0xf, 0xf}, arr: VectorArrangement8B, c: 7, }, @@ -1190,7 +1190,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { name: "sshll v10.4s, v2.4h, #0", x1: RegV2, x2: RegV10, - inst: SSHLLIMM, + inst: SSHLL, exp: []byte{0x4a, 0xa4, 0x10, 0xf}, arr: VectorArrangement4H, }, @@ -1198,7 +1198,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { name: "sshll v10.4s, v2.4h, #0xf", x1: RegV2, x2: RegV10, - inst: SSHLLIMM, + inst: SSHLL, exp: []byte{0x4a, 0xa4, 0x1f, 0xf}, arr: VectorArrangement4H, c: 15, @@ -1207,7 +1207,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { name: "sshll v10.2d, v2.2s, #0", x1: RegV2, x2: RegV10, - inst: SSHLLIMM, + inst: SSHLL, exp: []byte{0x4a, 0xa4, 0x20, 0xf}, arr: VectorArrangement2S, }, @@ -1215,7 +1215,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { name: "sshll v10.2d, v2.2s, #0x1f", x1: RegV2, x2: RegV10, - inst: SSHLLIMM, + inst: SSHLL, exp: []byte{0x4a, 0xa4, 0x3f, 0xf}, arr: VectorArrangement2S, c: 31, @@ -1396,10 +1396,10 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { { x1: RegRZR, x2: RegV30, - name: "cmeq v30.2d, v30.2d, #0", + name: "cmeq v30.2d, v0.2d, #0", inst: CMEQZERO, arr: VectorArrangement2D, - exp: []byte{0xde, 0x9b, 0xe0, 0x4e}, + exp: []byte{0x1e, 0x98, 0xe0, 0x4e}, }, { name: "tbl v1.8b, {v0.16b}, v1.8b", @@ -1754,6 +1754,90 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { exp: []byte{0x3e, 0x3b, 0xb0, 0x6e}, arr: VectorArrangement4S, }, + { + x1: RegV25, + x2: RegV30, + name: "saddlp v30.2d, v25.4s", + inst: SADDLP, + exp: []byte{0x3e, 0x2b, 0xa0, 0x4e}, + arr: VectorArrangement4S, + }, + { + x1: RegV25, + x2: RegV30, + name: "saddlp v30.4s, v25.8h", + inst: SADDLP, + exp: []byte{0x3e, 0x2b, 0x60, 0x4e}, + arr: VectorArrangement8H, + }, + { + x1: RegV25, + x2: RegV30, + name: "uaddlp v30.2d, v25.4s", + inst: UADDLP, + exp: []byte{0x3e, 0x2b, 0xa0, 0x6e}, + arr: VectorArrangement4S, + }, + { + x1: RegV25, + x2: RegV30, + name: "uaddlp v30.4s, v25.8h", + inst: UADDLP, + exp: []byte{0x3e, 0x2b, 0x60, 0x6e}, + arr: VectorArrangement8H, + }, + { + name: "sshll2 v10.8h, v2.16b, #7", + x1: RegV2, + x2: RegV10, + inst: SSHLL2, + exp: []byte{0x4a, 0xa4, 0xf, 0x4f}, + arr: VectorArrangement16B, + c: 7, + }, + { + name: "sshll2 v10.4s, v2.8h, #0", + x1: RegV2, + x2: RegV10, + inst: SSHLL2, + exp: []byte{0x4a, 0xa4, 0x10, 0x4f}, + arr: VectorArrangement8H, + }, + { + name: "sshll2 v10.2d, v2.4s, #0x15", + x1: RegV2, + x2: RegV10, + inst: SSHLL2, + exp: []byte{0x4a, 0xa4, 0x35, 0x4f}, + arr: VectorArrangement4S, + c: 21, + }, + { + name: "ushll2 v10.8h, v2.16b, #7", + x1: RegV2, + x2: RegV10, + inst: USHLL2, + exp: []byte{0x4a, 0xa4, 0xf, 0x6f}, + arr: VectorArrangement16B, + c: 7, + }, + { + name: "ushll2 v10.4s, v2.8h, #0", + x1: RegV2, + x2: RegV10, + inst: USHLL2, + exp: []byte{0x4a, 0xa4, 0x10, 0x6f}, + arr: VectorArrangement8H, + }, + { + name: "ushll2 v10.2d, v2.4s, #0x15", + x1: RegV2, + x2: RegV10, + inst: USHLL2, + exp: []byte{0x4a, 0xa4, 0x35, 0x6f}, + arr: VectorArrangement4S, + c: 21, + }, } for _, tt := range tests { @@ -2567,6 +2651,163 @@ func TestAssemblerImpl_encodeTwoVectorRegistersToVectorRegister(t *testing.T) { }, exp: []byte{0x9e, 0x1c, 0xab, 0x2e}, }, + { + name: "sqrdmulh v30.8h, v4.8h, v11.8h", + n: &NodeImpl{ + Instruction: SQRDMULH, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement8H, + }, + exp: []byte{0x9e, 0xb4, 0x6b, 0x6e}, + }, + { + name: "sqrdmulh v30.4s, v4.4s, v11.4s", + n: &NodeImpl{ + Instruction: SQRDMULH, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement4S, + }, + exp: []byte{0x9e, 0xb4, 0xab, 0x6e}, + }, + { + name: "smull v30.8h, v4.8b, v11.8b", + n: &NodeImpl{ + Instruction: SMULL, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement8B, + }, + exp: []byte{0x9e, 0xc0, 0x2b, 0xe}, + }, + { + name: "smull v30.4s, v4.4h, v11.4h", + n: &NodeImpl{ + Instruction: SMULL, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement4H, + }, + exp: []byte{0x9e, 0xc0, 0x6b, 0xe}, + }, + { + name: "smull v30.2d, v4.2s, v11.2s", + n: &NodeImpl{ + Instruction: SMULL, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement2S, + }, + exp: []byte{0x9e, 0xc0, 0xab, 0xe}, + }, + { + name: "smull2 v30.8h, v4.16b, v11.16b", + n: &NodeImpl{ + Instruction: SMULL2, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0x9e, 0xc0, 0x2b, 0x4e}, + }, + { + name: "smull2 v30.4s, v4.8h, v11.8h", + n: &NodeImpl{ + Instruction: SMULL2, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement8H, + }, + exp: []byte{0x9e, 0xc0, 0x6b, 0x4e}, + }, + { + name: "smull2 v30.2d, v4.4s, v11.4s", + n: &NodeImpl{ + Instruction: SMULL2, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement4S, + }, + exp: []byte{0x9e, 0xc0, 0xab, 0x4e}, + }, + + ////////////////////// + + { + name: "umull v30.8h, v4.8b, v11.8b", + n: &NodeImpl{ + Instruction: UMULL, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement8B, + }, + exp: []byte{0x9e, 0xc0, 0x2b, 0x2e}, + }, + { + name: "umull v30.4s, v4.4h, v11.4h", + n: &NodeImpl{ + Instruction: UMULL, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement4H, + }, + exp: []byte{0x9e, 0xc0, 0x6b, 0x2e}, + }, + { + name: "umull v30.2d, v4.2s, v11.2s", + n: &NodeImpl{ + Instruction: UMULL, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement2S, + }, + exp: []byte{0x9e, 0xc0, 0xab, 0x2e}, + }, + { + name: "umull2 v30.8h, v4.16b, v11.16b", + n: &NodeImpl{ + Instruction: UMULL2, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0x9e, 0xc0, 0x2b, 0x6e}, + }, + { + name: "umull2 v30.4s, v4.8h, v11.8h", + n: &NodeImpl{ + Instruction: UMULL2, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement8H, + }, + exp: []byte{0x9e, 0xc0, 0x6b, 0x6e}, + }, + { + name: "umull2 v30.2d, v4.4s, v11.4s", + n: &NodeImpl{ + Instruction: UMULL2, + DstReg: RegV30, + SrcReg: RegV11, + SrcReg2: RegV4, + VectorArrangement: VectorArrangement4S, + }, + exp: []byte{0x9e, 0xc0, 0xab, 0x6e}, + }, } for _, tt := range tests { diff --git a/internal/engine/compiler/impl_vec_arm64.go b/internal/engine/compiler/impl_vec_arm64.go index d609c129c2..bc7263f2e9 100644 --- a/internal/engine/compiler/impl_vec_arm64.go +++ b/internal/engine/compiler/impl_vec_arm64.go @@ -151,7 +151,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV, arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementD, ) - c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLLIMM, result, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLL, result, result, arm64.VectorArrangement8B, arm64.VectorIndexNone, arm64.VectorIndexNone) case wazeroir.V128LoadType8x8u: offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8) @@ -161,7 +161,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV, arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementD, ) - c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLLIMM, result, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLL, result, result, arm64.VectorArrangement8B, arm64.VectorIndexNone, arm64.VectorIndexNone) case wazeroir.V128LoadType16x4s: offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8) @@ -171,7 +171,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV, arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementD, ) - c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLLIMM, result, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLL, result, result, arm64.VectorArrangement4H, arm64.VectorIndexNone, arm64.VectorIndexNone) case wazeroir.V128LoadType16x4u: offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8) @@ -181,7 +181,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV, arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementD, ) - c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLLIMM, result, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLL, result, result, arm64.VectorArrangement4H, arm64.VectorIndexNone, arm64.VectorIndexNone) case wazeroir.V128LoadType32x2s: offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8) @@ -191,7 +191,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV, arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementD, ) - c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLLIMM, result, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLL, result, result, arm64.VectorArrangement2S, arm64.VectorIndexNone, arm64.VectorIndexNone) case wazeroir.V128LoadType32x2u: offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8) @@ -201,7 +201,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV, arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementD, ) - c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLLIMM, result, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLL, result, result, arm64.VectorArrangement2S, arm64.VectorIndexNone, arm64.VectorIndexNone) case wazeroir.V128LoadType8Splat: offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 1) diff --git a/internal/integration_test/asm/arm64_debug/golang_asm.go b/internal/integration_test/asm/arm64_debug/golang_asm.go index 03fa844d2f..05c769dda7 100644 --- a/internal/integration_test/asm/arm64_debug/golang_asm.go +++ b/internal/integration_test/asm/arm64_debug/golang_asm.go @@ -361,7 +361,7 @@ func (a *assemblerGoAsmImpl) CompileVectorRegisterToVectorRegister(instruction a func (a *assemblerGoAsmImpl) CompileVectorRegisterToVectorRegisterWithConst(instruction asm.Instruction, srcReg, dstReg asm.Register, arrangement asm_arm64.VectorArrangement, c asm.ConstantValue) { switch instruction { - case asm_arm64.USHLLIMM: + case asm_arm64.USHLL: var dstArrangement asm_arm64.VectorArrangement if arrangement == asm_arm64.VectorArrangement8B { dstArrangement = asm_arm64.VectorArrangement8H @@ -667,5 +667,5 @@ var castAsGolangAsmInstruction = [...]obj.As{ asm_arm64.VMOV: arm64.AVMOV, asm_arm64.VADD: arm64.AVADD, asm_arm64.VSUB: arm64.AVSUB, - asm_arm64.USHLLIMM: arm64.AVUSHLL, + asm_arm64.USHLL: arm64.AVUSHLL, } From 9fe2674432d5418d7cbdb30a476328a3cf837a99 Mon Sep 17 00:00:00 2001 From: Takeshi Yoneda Date: Mon, 20 Jun 2022 16:41:33 +0900 Subject: [PATCH 2/6] more Signed-off-by: Takeshi Yoneda --- internal/engine/compiler/compiler_vec_test.go | 18 ----- internal/engine/compiler/impl_vec_arm64.go | 76 +++++++++++++++++-- .../integration_test/spectest/v2/spec_test.go | 4 +- 3 files changed, 72 insertions(+), 26 deletions(-) diff --git a/internal/engine/compiler/compiler_vec_test.go b/internal/engine/compiler/compiler_vec_test.go index e4ad21307f..32dfb2b431 100644 --- a/internal/engine/compiler/compiler_vec_test.go +++ b/internal/engine/compiler/compiler_vec_test.go @@ -5094,11 +5094,6 @@ func TestCompiler_compileV128_Pmax_Pmin(t *testing.T) { } func TestCompiler_compileV128ExtMul(t *testing.T) { - if runtime.GOARCH != "amd64" { - // TODO: implement on amd64. - t.Skip() - } - tests := []struct { name string shape wazeroir.Shape @@ -5759,11 +5754,6 @@ func TestCompiler_compileV128ExtMul(t *testing.T) { } func TestCompiler_compileV128Extend(t *testing.T) { - if runtime.GOARCH != "amd64" { - // TODO: implement on amd64. - t.Skip() - } - tests := []struct { name string shape wazeroir.Shape @@ -6237,10 +6227,6 @@ func TestCompiler_compileV128Extend(t *testing.T) { } func TestCompiler_compileV128Q15mulrSatS(t *testing.T) { - if runtime.GOARCH != "amd64" { - // TODO: implement on amd64. - t.Skip() - } tests := []struct { name string @@ -6508,10 +6494,6 @@ func TestCompiler_compileV128FloatDemote(t *testing.T) { } func TestCompiler_compileV128ExtAddPairwise(t *testing.T) { - if runtime.GOARCH != "amd64" { - // TODO: implement on amd64. - t.Skip() - } tests := []struct { name string diff --git a/internal/engine/compiler/impl_vec_arm64.go b/internal/engine/compiler/impl_vec_arm64.go index bc7263f2e9..55cfbbee9d 100644 --- a/internal/engine/compiler/impl_vec_arm64.go +++ b/internal/engine/compiler/impl_vec_arm64.go @@ -1264,22 +1264,88 @@ func (c *arm64Compiler) compileV128Nearest(o *wazeroir.OperationV128Nearest) err // compileV128Extend implements compiler.compileV128Extend for arm64. func (c *arm64Compiler) compileV128Extend(o *wazeroir.OperationV128Extend) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) + + var inst asm.Instruction + if o.UseLow { + if o.Signed { + inst = arm64.SSHLL + } else { + inst = arm64.USHLL + } + } else { + if o.Signed { + inst = arm64.SSHLL2 + } else { + inst = arm64.USHLL2 + } + } + + var arr arm64.VectorArrangement + if o.UseLow { + switch o.OriginShape { + case wazeroir.ShapeI8x16: + arr = arm64.VectorArrangement8B + case wazeroir.ShapeI16x8: + arr = arm64.VectorArrangement4H + case wazeroir.ShapeI32x4: + arr = arm64.VectorArrangement2S + } + } else { + arr = defaultArrangementForShape(o.OriginShape) + } + + return c.compileV128UniOp(inst, arr) } // compileV128ExtMul implements compiler.compileV128ExtMul for arm64. func (c *arm64Compiler) compileV128ExtMul(o *wazeroir.OperationV128ExtMul) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) + + var inst asm.Instruction + if o.Signed { + if o.UseLow { + inst = arm64.SMULL + } else { + inst = arm64.SMULL2 + } + } else { + if o.UseLow { + inst = arm64.UMULL + } else { + inst = arm64.UMULL2 + } + } + + var arr arm64.VectorArrangement + if o.UseLow { + switch o.OriginShape { + case wazeroir.ShapeI8x16: + arr = arm64.VectorArrangement8B + case wazeroir.ShapeI16x8: + arr = arm64.VectorArrangement4H + case wazeroir.ShapeI32x4: + arr = arm64.VectorArrangement2S + } + } else { + arr = defaultArrangementForShape(o.OriginShape) + } + + return c.compileV128x2BinOp(inst, arr) } // compileV128Q15mulrSatS implements compiler.compileV128Q15mulrSatS for arm64. -func (c *arm64Compiler) compileV128Q15mulrSatS(o *wazeroir.OperationV128Q15mulrSatS) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) +func (c *arm64Compiler) compileV128Q15mulrSatS(*wazeroir.OperationV128Q15mulrSatS) error { + return c.compileV128x2BinOp(arm64.SQRDMULH, arm64.VectorArrangement8H) } // compileV128ExtAddPairwise implements compiler.compileV128ExtAddPairwise for arm64. func (c *arm64Compiler) compileV128ExtAddPairwise(o *wazeroir.OperationV128ExtAddPairwise) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) + var inst asm.Instruction + if o.Signed { + inst = arm64.SADDLP + } else { + inst = arm64.UADDLP + } + return c.compileV128UniOp(inst, defaultArrangementForShape(o.OriginShape)) } // compileV128FloatPromote implements compiler.compileV128FloatPromote for arm64. diff --git a/internal/integration_test/spectest/v2/spec_test.go b/internal/integration_test/spectest/v2/spec_test.go index 21eef59a90..1213369352 100644 --- a/internal/integration_test/spectest/v2/spec_test.go +++ b/internal/integration_test/spectest/v2/spec_test.go @@ -26,9 +26,7 @@ func TestCompiler(t *testing.T) { spectest.Run(t, testcases, compiler.NewEngine, enabledFeatures, func(jsonname string) bool { switch path.Base(jsonname) { - case "simd_f64x2_pmin_pmax.json", "simd_f32x4_pmin_pmax.json", "simd_int_to_int_extend.json", - "simd_i64x2_extmul_i32x4.json", "simd_i32x4_extmul_i16x8.json", "simd_i16x8_extmul_i8x16.json", - "simd_i16x8_q15mulr_sat_s.json", "simd_i16x8_extadd_pairwise_i8x16.json", "simd_i32x4_extadd_pairwise_i16x8.json", + case "simd_f64x2_pmin_pmax.json", "simd_f32x4_pmin_pmax.json", "simd_i32x4_dot_i16x8.json", "simd_i32x4_trunc_sat_f32x4.json", "simd_splat.json", "simd_load.json", "simd_i32x4_trunc_sat_f64x2.json", "simd_conversions.json": From 3ab899b6bef901dc067450a6ffb6a60098028c38 Mon Sep 17 00:00:00 2001 From: Takeshi Yoneda Date: Mon, 20 Jun 2022 16:48:37 +0900 Subject: [PATCH 3/6] simple Signed-off-by: Takeshi Yoneda --- internal/asm/arm64/consts.go | 17 ++++++++++ internal/engine/compiler/impl_vec_arm64.go | 36 +++++++++------------- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/internal/asm/arm64/consts.go b/internal/asm/arm64/consts.go index 40a403b441..e0bc490b3b 100644 --- a/internal/asm/arm64/consts.go +++ b/internal/asm/arm64/consts.go @@ -874,6 +874,14 @@ const ( UMULL // UMULL2 is the UMULL2 instruction. https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en UMULL2 + // FCVTZS is the FCVTZS instruction + FCVTZS + // FCVTZU is the FCVTZU instruction + FCVTZU + // SQXTN is the SQXTN instruction + SQXTN + // UQXTN is the UQXTN instruction + UQXTN // instructionEnd is always placed at the bottom of this iota definition to be used in the test. instructionEnd @@ -1362,6 +1370,15 @@ func InstructionName(i asm.Instruction) string { return "UMULL" case UMULL2: return "UMULL2" + + case FCVTZS: + return "FCVTZS" + case FCVTZU: + return "FCVTZU" + case SQXTN: + return "SQXTN" + case UQXTN: + return "UQXTN" } panic(fmt.Errorf("unknown instruction %d", i)) } diff --git a/internal/engine/compiler/impl_vec_arm64.go b/internal/engine/compiler/impl_vec_arm64.go index 55cfbbee9d..e2feff7b6b 100644 --- a/internal/engine/compiler/impl_vec_arm64.go +++ b/internal/engine/compiler/impl_vec_arm64.go @@ -1264,24 +1264,15 @@ func (c *arm64Compiler) compileV128Nearest(o *wazeroir.OperationV128Nearest) err // compileV128Extend implements compiler.compileV128Extend for arm64. func (c *arm64Compiler) compileV128Extend(o *wazeroir.OperationV128Extend) error { - var inst asm.Instruction + var arr arm64.VectorArrangement if o.UseLow { if o.Signed { inst = arm64.SSHLL } else { inst = arm64.USHLL } - } else { - if o.Signed { - inst = arm64.SSHLL2 - } else { - inst = arm64.USHLL2 - } - } - var arr arm64.VectorArrangement - if o.UseLow { switch o.OriginShape { case wazeroir.ShapeI8x16: arr = arm64.VectorArrangement8B @@ -1291,6 +1282,11 @@ func (c *arm64Compiler) compileV128Extend(o *wazeroir.OperationV128Extend) error arr = arm64.VectorArrangement2S } } else { + if o.Signed { + inst = arm64.SSHLL2 + } else { + inst = arm64.USHLL2 + } arr = defaultArrangementForShape(o.OriginShape) } @@ -1299,24 +1295,15 @@ func (c *arm64Compiler) compileV128Extend(o *wazeroir.OperationV128Extend) error // compileV128ExtMul implements compiler.compileV128ExtMul for arm64. func (c *arm64Compiler) compileV128ExtMul(o *wazeroir.OperationV128ExtMul) error { - var inst asm.Instruction - if o.Signed { - if o.UseLow { + var arr arm64.VectorArrangement + if o.UseLow { + if o.Signed { inst = arm64.SMULL } else { - inst = arm64.SMULL2 - } - } else { - if o.UseLow { inst = arm64.UMULL - } else { - inst = arm64.UMULL2 } - } - var arr arm64.VectorArrangement - if o.UseLow { switch o.OriginShape { case wazeroir.ShapeI8x16: arr = arm64.VectorArrangement8B @@ -1326,6 +1313,11 @@ func (c *arm64Compiler) compileV128ExtMul(o *wazeroir.OperationV128ExtMul) error arr = arm64.VectorArrangement2S } } else { + if o.Signed { + inst = arm64.SMULL2 + } else { + inst = arm64.UMULL2 + } arr = defaultArrangementForShape(o.OriginShape) } From f43a554aab5f8bcaf5bb0145f16de226d7f63947 Mon Sep 17 00:00:00 2001 From: Takeshi Yoneda Date: Mon, 20 Jun 2022 16:57:22 +0900 Subject: [PATCH 4/6] more Signed-off-by: Takeshi Yoneda --- internal/asm/arm64/consts.go | 19 ++++++------- internal/asm/arm64/impl.go | 12 +++++++++ internal/asm/arm64/impl_test.go | 48 +++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+), 9 deletions(-) diff --git a/internal/asm/arm64/consts.go b/internal/asm/arm64/consts.go index e0bc490b3b..3330d733ef 100644 --- a/internal/asm/arm64/consts.go +++ b/internal/asm/arm64/consts.go @@ -874,10 +874,12 @@ const ( UMULL // UMULL2 is the UMULL2 instruction. https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en UMULL2 - // FCVTZS is the FCVTZS instruction - FCVTZS - // FCVTZU is the FCVTZU instruction - FCVTZU + // VFCVTZS is the FCVTZS(vector,integer) instruction https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTZS--vector--integer---Floating-point-Convert-to-Signed-integer--rounding-toward-Zero--vector--?lang=en + // Note: prefixed by V to distinguish from the non-vector variant. + VFCVTZS + // VFCVTZU is the FCVTZU(vector,integer) instruction https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTZU--vector--integer---Floating-point-Convert-to-Unsigned-integer--rounding-toward-Zero--vector--?lang=en + // Note: prefixed by V to distinguish from the non-vector variant. + VFCVTZU // SQXTN is the SQXTN instruction SQXTN // UQXTN is the UQXTN instruction @@ -1370,11 +1372,10 @@ func InstructionName(i asm.Instruction) string { return "UMULL" case UMULL2: return "UMULL2" - - case FCVTZS: - return "FCVTZS" - case FCVTZU: - return "FCVTZU" + case VFCVTZS: + return "VFCVTZS" + case VFCVTZU: + return "VFCVTZU" case SQXTN: return "SQXTN" case UQXTN: diff --git a/internal/asm/arm64/impl.go b/internal/asm/arm64/impl.go index 166a84c626..9bd6c15a6d 100644 --- a/internal/asm/arm64/impl.go +++ b/internal/asm/arm64/impl.go @@ -2855,6 +2855,18 @@ var advancedSIMDTwoRegisterMisc = map[asm.Instruction]struct { SADDLP: {U: 0b0, opcode: 0b00010, qAndSize: defaultQAndSize}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UADDLP--Unsigned-Add-Long-Pairwise-?lang=en UADDLP: {U: 0b1, opcode: 0b00010, qAndSize: defaultQAndSize}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTZS--vector--integer---Floating-point-Convert-to-Signed-integer--rounding-toward-Zero--vector--?lang=en + VFCVTZS: {U: 0b0, opcode: 0b11011, qAndSize: map[VectorArrangement]qAndSize{ + VectorArrangement4S: {size: 0b10, q: 0b1}, + VectorArrangement2S: {size: 0b10, q: 0b0}, + VectorArrangement2D: {size: 0b11, q: 0b1}, + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTZU--vector--integer---Floating-point-Convert-to-Unsigned-integer--rounding-toward-Zero--vector--?lang=en + VFCVTZU: {U: 0b1, opcode: 0b11011, qAndSize: map[VectorArrangement]qAndSize{ + VectorArrangement4S: {size: 0b10, q: 0b1}, + VectorArrangement2S: {size: 0b10, q: 0b0}, + VectorArrangement2D: {size: 0b11, q: 0b1}, + }}, } // advancedSIMDThreeDifferent holds information to encode instructions as "Advanced SIMD three different" in diff --git a/internal/asm/arm64/impl_test.go b/internal/asm/arm64/impl_test.go index 4de81355a7..961949839c 100644 --- a/internal/asm/arm64/impl_test.go +++ b/internal/asm/arm64/impl_test.go @@ -1838,6 +1838,54 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { arr: VectorArrangement4S, c: 21, }, + { + x1: RegV25, + x2: RegV30, + name: "fcvtzs v30.4s, v25.4s", + inst: VFCVTZS, + exp: []byte{0x3e, 0xbb, 0xa1, 0x4e}, + arr: VectorArrangement4S, + }, + { + x1: RegV25, + x2: RegV30, + name: "fcvtzs v30.2s, v25.2s", + inst: VFCVTZS, + exp: []byte{0x3e, 0xbb, 0xa1, 0xe}, + arr: VectorArrangement2S, + }, + { + x1: RegV25, + x2: RegV30, + name: "fcvtzs v30.2d, v25.2d", + inst: VFCVTZS, + exp: []byte{0x3e, 0xbb, 0xe1, 0x4e}, + arr: VectorArrangement2D, + }, + { + x1: RegV25, + x2: RegV30, + name: "fcvtzu v30.4s, v25.4s", + inst: VFCVTZU, + exp: []byte{0x3e, 0xbb, 0xa1, 0x6e}, + arr: VectorArrangement4S, + }, + { + x1: RegV25, + x2: RegV30, + name: "fcvtzu v30.2s, v25.2s", + inst: VFCVTZU, + exp: []byte{0x3e, 0xbb, 0xa1, 0x2e}, + arr: VectorArrangement2S, + }, + { + x1: RegV25, + x2: RegV30, + name: "fcvtzu v30.2d, v25.2d", + inst: VFCVTZU, + exp: []byte{0x3e, 0xbb, 0xe1, 0x6e}, + arr: VectorArrangement2D, + }, } for _, tt := range tests { From c23b0d94960d142d2c025c4d5e3a8079867cfeca Mon Sep 17 00:00:00 2001 From: Takeshi Yoneda Date: Mon, 20 Jun 2022 17:04:56 +0900 Subject: [PATCH 5/6] more Signed-off-by: Takeshi Yoneda --- internal/asm/arm64/consts.go | 4 +-- internal/asm/arm64/impl.go | 46 ++++++++++++++++++--------------- internal/asm/arm64/impl_test.go | 32 +++++++++++++++++++++++ 3 files changed, 59 insertions(+), 23 deletions(-) diff --git a/internal/asm/arm64/consts.go b/internal/asm/arm64/consts.go index 3330d733ef..6515906766 100644 --- a/internal/asm/arm64/consts.go +++ b/internal/asm/arm64/consts.go @@ -880,9 +880,9 @@ const ( // VFCVTZU is the FCVTZU(vector,integer) instruction https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTZU--vector--integer---Floating-point-Convert-to-Unsigned-integer--rounding-toward-Zero--vector--?lang=en // Note: prefixed by V to distinguish from the non-vector variant. VFCVTZU - // SQXTN is the SQXTN instruction + // SQXTN is the SQXTN instruction https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQXTN--SQXTN2--Signed-saturating-extract-Narrow-?lang=en SQXTN - // UQXTN is the UQXTN instruction + // UQXTN is the UQXTN instruction https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UQXTN--UQXTN2--Unsigned-saturating-extract-Narrow-?lang=en UQXTN // instructionEnd is always placed at the bottom of this iota definition to be used in the test. diff --git a/internal/asm/arm64/impl.go b/internal/asm/arm64/impl.go index 9bd6c15a6d..1c77f7d774 100644 --- a/internal/asm/arm64/impl.go +++ b/internal/asm/arm64/impl.go @@ -2773,18 +2773,18 @@ func (a *AssemblerImpl) EncodeStaticConstToVectorRegister(n *NodeImpl) (err erro // advancedSIMDTwoRegisterMisc holds information to encode instructions as "Advanced SIMD two-register miscellaneous" in // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en var advancedSIMDTwoRegisterMisc = map[asm.Instruction]struct { - U, opcode byte + u, opcode byte qAndSize map[VectorArrangement]qAndSize }{ // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/NOT--Bitwise-NOT--vector--?lang=en - NOT: {U: 0b1, opcode: 0b00101, + NOT: {u: 0b1, opcode: 0b00101, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement16B: {size: 0b00, q: 0b1}, VectorArrangement8B: {size: 0b00, q: 0b0}, }, }, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FNEG--vector---Floating-point-Negate--vector--?lang=en - VFNEG: {U: 0b1, opcode: 0b01111, + VFNEG: {u: 0b1, opcode: 0b01111, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement4S: {size: 0b10, q: 0b1}, VectorArrangement2S: {size: 0b10, q: 0b0}, @@ -2792,81 +2792,85 @@ var advancedSIMDTwoRegisterMisc = map[asm.Instruction]struct { }, }, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FABS--vector---Floating-point-Absolute-value--vector--?lang=en - VFABS: {U: 0, opcode: 0b01111, qAndSize: map[VectorArrangement]qAndSize{ + VFABS: {u: 0, opcode: 0b01111, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement2D: {size: 0b11, q: 0b1}, VectorArrangement4S: {size: 0b10, q: 0b1}, VectorArrangement2S: {size: 0b10, q: 0b0}, }}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FSQRT--vector---Floating-point-Square-Root--vector--?lang=en - VFSQRT: {U: 1, opcode: 0b11111, qAndSize: map[VectorArrangement]qAndSize{ + VFSQRT: {u: 1, opcode: 0b11111, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement2D: {size: 0b11, q: 0b1}, VectorArrangement4S: {size: 0b10, q: 0b1}, VectorArrangement2S: {size: 0b10, q: 0b0}, }}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FRINTM--vector---Floating-point-Round-to-Integral--toward-Minus-infinity--vector--?lang=en - VFRINTM: {U: 0, opcode: 0b11001, qAndSize: map[VectorArrangement]qAndSize{ + VFRINTM: {u: 0, opcode: 0b11001, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement2D: {size: 0b01, q: 0b1}, VectorArrangement4S: {size: 0b00, q: 0b1}, VectorArrangement2S: {size: 0b00, q: 0b0}, }}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FRINTN--vector---Floating-point-Round-to-Integral--to-nearest-with-ties-to-even--vector--?lang=en - VFRINTN: {U: 0, opcode: 0b11000, qAndSize: map[VectorArrangement]qAndSize{ + VFRINTN: {u: 0, opcode: 0b11000, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement2D: {size: 0b01, q: 0b1}, VectorArrangement4S: {size: 0b00, q: 0b1}, VectorArrangement2S: {size: 0b00, q: 0b0}, }}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FRINTP--vector---Floating-point-Round-to-Integral--toward-Plus-infinity--vector--?lang=en - VFRINTP: {U: 0, opcode: 0b11000, qAndSize: map[VectorArrangement]qAndSize{ + VFRINTP: {u: 0, opcode: 0b11000, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement2D: {size: 0b11, q: 0b1}, VectorArrangement4S: {size: 0b10, q: 0b1}, VectorArrangement2S: {size: 0b10, q: 0b0}, }}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FRINTZ--vector---Floating-point-Round-to-Integral--toward-Zero--vector--?lang=en - VFRINTZ: {U: 0, opcode: 0b11001, qAndSize: map[VectorArrangement]qAndSize{ + VFRINTZ: {u: 0, opcode: 0b11001, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement2D: {size: 0b11, q: 0b1}, VectorArrangement4S: {size: 0b10, q: 0b1}, VectorArrangement2S: {size: 0b10, q: 0b0}, }}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CNT--Population-Count-per-byte-?lang=en - VCNT: {U: 0b0, opcode: 0b00101, qAndSize: map[VectorArrangement]qAndSize{ + VCNT: {u: 0b0, opcode: 0b00101, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement8B: {size: 0b00, q: 0b0}, VectorArrangement16B: {size: 0b00, q: 0b1}, }}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/NEG--vector---Negate--vector--?lang=en - VNEG: {U: 0b1, opcode: 0b01011, qAndSize: defaultQAndSize}, + VNEG: {u: 0b1, opcode: 0b01011, qAndSize: defaultQAndSize}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ABS--Absolute-value--vector--?lang=en - VABS: {U: 0b0, opcode: 0b01011, qAndSize: defaultQAndSize}, + VABS: {u: 0b0, opcode: 0b01011, qAndSize: defaultQAndSize}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/REV64--Reverse-elements-in-64-bit-doublewords--vector--?lang=en - REV64: {U: 0b0, opcode: 0b00000, qAndSize: defaultQAndSize}, + REV64: {u: 0b0, opcode: 0b00000, qAndSize: defaultQAndSize}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/XTN--XTN2--Extract-Narrow-?lang=en - XTN: {U: 0b0, opcode: 0b10010, qAndSize: map[VectorArrangement]qAndSize{ + XTN: {u: 0b0, opcode: 0b10010, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement2D: {q: 0, size: 0b10}, VectorArrangement4S: {q: 0, size: 0b01}, VectorArrangement8H: {q: 0, size: 0b00}, }}, - SHLL: {U: 0b1, opcode: 0b10011, qAndSize: map[VectorArrangement]qAndSize{ + SHLL: {u: 0b1, opcode: 0b10011, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement8B: {q: 0b00, size: 0b00}, VectorArrangement4H: {q: 0b00, size: 0b01}, VectorArrangement2S: {q: 0b00, size: 0b10}, }}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMEQ--zero---Compare-bitwise-Equal-to-zero--vector--?lang=en - CMEQZERO: {U: 0b0, opcode: 0b01001, qAndSize: defaultQAndSize}, + CMEQZERO: {u: 0b0, opcode: 0b01001, qAndSize: defaultQAndSize}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SADDLP--Signed-Add-Long-Pairwise-?lang=en - SADDLP: {U: 0b0, opcode: 0b00010, qAndSize: defaultQAndSize}, + SADDLP: {u: 0b0, opcode: 0b00010, qAndSize: defaultQAndSize}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UADDLP--Unsigned-Add-Long-Pairwise-?lang=en - UADDLP: {U: 0b1, opcode: 0b00010, qAndSize: defaultQAndSize}, + UADDLP: {u: 0b1, opcode: 0b00010, qAndSize: defaultQAndSize}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTZS--vector--integer---Floating-point-Convert-to-Signed-integer--rounding-toward-Zero--vector--?lang=en - VFCVTZS: {U: 0b0, opcode: 0b11011, qAndSize: map[VectorArrangement]qAndSize{ + VFCVTZS: {u: 0b0, opcode: 0b11011, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement4S: {size: 0b10, q: 0b1}, VectorArrangement2S: {size: 0b10, q: 0b0}, VectorArrangement2D: {size: 0b11, q: 0b1}, }}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTZU--vector--integer---Floating-point-Convert-to-Unsigned-integer--rounding-toward-Zero--vector--?lang=en - VFCVTZU: {U: 0b1, opcode: 0b11011, qAndSize: map[VectorArrangement]qAndSize{ + VFCVTZU: {u: 0b1, opcode: 0b11011, qAndSize: map[VectorArrangement]qAndSize{ VectorArrangement4S: {size: 0b10, q: 0b1}, VectorArrangement2S: {size: 0b10, q: 0b0}, VectorArrangement2D: {size: 0b11, q: 0b1}, }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQXTN--SQXTN2--Signed-saturating-extract-Narrow-?lang=en + SQXTN: {u: 0b0, opcode: 0b10100, qAndSize: defaultQAndSize}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UQXTN--UQXTN2--Unsigned-saturating-extract-Narrow-?lang=en + UQXTN: {u: 0b1, opcode: 0b10100, qAndSize: defaultQAndSize}, } // advancedSIMDThreeDifferent holds information to encode instructions as "Advanced SIMD three different" in @@ -3475,7 +3479,7 @@ func (a *AssemblerImpl) EncodeVectorRegisterToVectorRegister(n *NodeImpl) (err e (srcVectorRegBits << 5) | dstVectorRegBits, twoRegMisc.opcode<<4 | 0b1<<3 | srcVectorRegBits>>3, qs.size<<6 | 0b1<<5 | twoRegMisc.opcode>>4, - qs.q<<6 | twoRegMisc.U<<5 | 0b01110, + qs.q<<6 | twoRegMisc.u<<5 | 0b01110, }) return nil } diff --git a/internal/asm/arm64/impl_test.go b/internal/asm/arm64/impl_test.go index 961949839c..e1b6b16abe 100644 --- a/internal/asm/arm64/impl_test.go +++ b/internal/asm/arm64/impl_test.go @@ -1886,6 +1886,38 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { exp: []byte{0x3e, 0xbb, 0xe1, 0x6e}, arr: VectorArrangement2D, }, + { + x1: RegV25, + x2: RegV30, + name: "sqxtn v30.2s, v25.2d", + inst: SQXTN, + exp: []byte{0x3e, 0x4b, 0xa1, 0xe}, + arr: VectorArrangement2S, + }, + { + x1: RegV25, + x2: RegV30, + name: "sqxtn v30.4h, v25.4s", + inst: SQXTN, + exp: []byte{0x3e, 0x4b, 0x61, 0xe}, + arr: VectorArrangement4H, + }, + { + x1: RegV25, + x2: RegV30, + name: "uqxtn v30.2s, v25.2d", + inst: UQXTN, + exp: []byte{0x3e, 0x4b, 0xa1, 0x2e}, + arr: VectorArrangement2S, + }, + { + x1: RegV25, + x2: RegV30, + name: "uqxtn v30.4h, v25.4s", + inst: UQXTN, + exp: []byte{0x3e, 0x4b, 0x61, 0x2e}, + arr: VectorArrangement4H, + }, } for _, tt := range tests { From e4ddc68727379d64bafc2760286ebec406194f7b Mon Sep 17 00:00:00 2001 From: Takeshi Yoneda Date: Mon, 20 Jun 2022 17:12:30 +0900 Subject: [PATCH 6/6] more Signed-off-by: Takeshi Yoneda --- internal/engine/compiler/compiler_vec_test.go | 4 --- internal/engine/compiler/impl_vec_arm64.go | 33 +++++++++++++++++-- .../integration_test/spectest/v2/spec_test.go | 3 +- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/internal/engine/compiler/compiler_vec_test.go b/internal/engine/compiler/compiler_vec_test.go index 32dfb2b431..c4801c5f52 100644 --- a/internal/engine/compiler/compiler_vec_test.go +++ b/internal/engine/compiler/compiler_vec_test.go @@ -7161,10 +7161,6 @@ func TestCompiler_compileV128Dot(t *testing.T) { } func TestCompiler_compileV128ITruncSatFromF(t *testing.T) { - if runtime.GOARCH != "amd64" { - // TODO: implement on amd64. - t.Skip() - } tests := []struct { name string diff --git a/internal/engine/compiler/impl_vec_arm64.go b/internal/engine/compiler/impl_vec_arm64.go index e2feff7b6b..e873c6fad0 100644 --- a/internal/engine/compiler/impl_vec_arm64.go +++ b/internal/engine/compiler/impl_vec_arm64.go @@ -1366,6 +1366,35 @@ func (c *arm64Compiler) compileV128Narrow(o *wazeroir.OperationV128Narrow) error } // compileV128ITruncSatFromF implements compiler.compileV128ITruncSatFromF for arm64. -func (c *arm64Compiler) compileV128ITruncSatFromF(o *wazeroir.OperationV128ITruncSatFromF) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) +func (c *arm64Compiler) compileV128ITruncSatFromF(o *wazeroir.OperationV128ITruncSatFromF) (err error) { + v := c.locationStack.popV128() + if err = c.compileEnsureOnGeneralPurposeRegister(v); err != nil { + return err + } + + var cvt asm.Instruction + if o.Signed { + cvt = arm64.VFCVTZS + } else { + cvt = arm64.VFCVTZU + } + + c.assembler.CompileVectorRegisterToVectorRegister(cvt, v.register, v.register, + defaultArrangementForShape(o.OriginShape), arm64.VectorIndexNone, arm64.VectorIndexNone, + ) + + if o.OriginShape == wazeroir.ShapeF64x2 { + var narrow asm.Instruction + if o.Signed { + narrow = arm64.SQXTN + } else { + narrow = arm64.UQXTN + } + c.assembler.CompileVectorRegisterToVectorRegister(narrow, v.register, v.register, + arm64.VectorArrangement2S, arm64.VectorIndexNone, arm64.VectorIndexNone, + ) + } + + c.pushVectorRuntimeValueLocationOnRegister(v.register) + return } diff --git a/internal/integration_test/spectest/v2/spec_test.go b/internal/integration_test/spectest/v2/spec_test.go index 1213369352..8aa41a3af5 100644 --- a/internal/integration_test/spectest/v2/spec_test.go +++ b/internal/integration_test/spectest/v2/spec_test.go @@ -27,8 +27,7 @@ func TestCompiler(t *testing.T) { spectest.Run(t, testcases, compiler.NewEngine, enabledFeatures, func(jsonname string) bool { switch path.Base(jsonname) { case "simd_f64x2_pmin_pmax.json", "simd_f32x4_pmin_pmax.json", - "simd_i32x4_dot_i16x8.json", "simd_i32x4_trunc_sat_f32x4.json", - "simd_splat.json", "simd_load.json", "simd_i32x4_trunc_sat_f64x2.json", + "simd_i32x4_dot_i16x8.json", "simd_splat.json", "simd_load.json", "simd_conversions.json": // TODO: implement on arm64. return runtime.GOARCH == "amd64"