From 4ab2382d13f0743187a3a459adc1b2eebd7f477c Mon Sep 17 00:00:00 2001 From: Takeshi Yoneda Date: Fri, 17 Jun 2022 11:23:17 +0900 Subject: [PATCH 1/3] more Signed-off-by: Takeshi Yoneda --- Makefile | 4 +- internal/asm/arm64/assembler.go | 13 +- internal/asm/arm64/consts.go | 249 ++-- internal/asm/arm64/impl.go | 1277 +++++++++++------ internal/asm/arm64/impl_test.go | 771 +++++++++- internal/engine/compiler/compiler_vec_test.go | 33 +- internal/engine/compiler/impl_arm64.go | 92 +- internal/engine/compiler/impl_vec_arm64.go | 355 ++++- .../engine/compiler/impl_vec_arm64_test.go | 4 +- .../asm/arm64_debug/debug_assembler.go | 14 + .../asm/arm64_debug/golang_asm.go | 90 +- .../asm/arm64_debug/impl_test.go | 375 +---- .../integration_test/spectest/v2/spec_test.go | 3 +- 13 files changed, 2161 insertions(+), 1119 deletions(-) diff --git a/Makefile b/Makefile index 5341bf6f77..6ef9b752f7 100644 --- a/Makefile +++ b/Makefile @@ -117,10 +117,10 @@ spectest: @$(MAKE) spectest.v2 spectest.v1: - go test $$(go list ./... | grep $(spectest_v1_dir)) -v -timeout 120s + @go test $$(go list ./... | grep $(spectest_v1_dir)) -timeout 120s spectest.v2: - go test $$(go list ./... | grep $(spectest_v2_dir)) -v -timeout 120s + @go test $$(go list ./... | grep $(spectest_v2_dir)) -timeout 120s golangci_lint_path := $(shell go env GOPATH)/bin/golangci-lint diff --git a/internal/asm/arm64/assembler.go b/internal/asm/arm64/assembler.go index de48834c3d..f6dd8d7bd4 100644 --- a/internal/asm/arm64/assembler.go +++ b/internal/asm/arm64/assembler.go @@ -107,7 +107,7 @@ type Assembler interface { // CompileVectorRegisterToVectorRegisterWithConst is the same as CompileVectorRegisterToVectorRegister but the // additional constant can be provided. - // For example, the const can be used to specify the shift amount for USHLL instruction. + // For example, the const can be used to specify the shift amount for USHLLIMM instruction. 
CompileVectorRegisterToVectorRegisterWithConst(instruction asm.Instruction, srcReg, dstReg asm.Register, arrangement VectorArrangement, c asm.ConstantValue) @@ -115,4 +115,15 @@ type Assembler interface { // the memory and the destination is the dstReg. CompileLoadStaticConstToVectorRegister(instruction asm.Instruction, c asm.StaticConst, dstReg asm.Register, arrangement VectorArrangement) + + // CompileTwoVectorRegistersToVectorRegister adds an instruction where source are two vectors and destination is one + // vector. The vector's arrangement can be specified by `arrangement`. + CompileTwoVectorRegistersToVectorRegister(instruction asm.Instruction, srcReg, srcReg2, dstReg asm.Register, + arrangement VectorArrangement) + + // CompileTwoVectorRegistersToVectorRegisterWithConst is the same as CompileTwoVectorRegistersToVectorRegister except + // that this also accepts an additional constant. + // For example EXT instruction needs the extraction target immediate as const. + CompileTwoVectorRegistersToVectorRegisterWithConst(instruction asm.Instruction, srcReg, srcReg2, dstReg asm.Register, + arrangement VectorArrangement, c asm.ConstantValue) } diff --git a/internal/asm/arm64/consts.go b/internal/asm/arm64/consts.go index 85e5c1be35..9fcfda20df 100644 --- a/internal/asm/arm64/consts.go +++ b/internal/asm/arm64/consts.go @@ -12,7 +12,7 @@ import ( // See https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/condition-codes-1-condition-flags-and-codes const ( // CondEQ is the eq (equal) condition code - CondEQ asm.ConditionalRegisterState = asm.ConditionalRegisterStateUnset + 1 + iota + CondEQ = asm.ConditionalRegisterStateUnset + 1 + iota // CondNE is the ne (not equal) condition code CondNE // CondHS is the hs (unsigned higher or same) condition code @@ -443,7 +443,7 @@ func RegisterName(r asm.Register) string { // Arm64-specific instructions. // // Note: This only defines arm64 instructions used by wazero's compiler. 
-// Note: Naming conventions intentionally match the Go assembler: https://go.dev/doc/asm +// Note: Naming conventions partially match the Go assembler: https://go.dev/doc/asm const ( // NOP is the NOP instruction. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/NOP NOP asm.Instruction = iota @@ -459,6 +459,10 @@ const ( ADR // AND is the AND instruction. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/AND--shifted-register- AND + // ANDIMM32 is the AND(immediate) instruction in 32-bit mode https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/AND--immediate---Bitwise-AND--immediate--?lang=en + ANDIMM32 + // ANDIMM64 is the AND(immediate) instruction in 64-bit mode https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/AND--immediate---Bitwise-AND--immediate--?lang=en + ANDIMM64 // ANDW is the AND instruction, in 64-bit mode. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/AND--register- ANDW // ASR is the ASR instruction. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/ASR--register- @@ -467,32 +471,32 @@ const ( ASRW // B is the B instruction. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B B - // BEQ is the B.cond instruction with CondEQ. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BEQ - // BGE is the B.cond instruction with CondGE. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BGE - // BGT is the B.cond instruction with CondGT. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BGT - // BHI is the B.cond instruction with CondHI. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BHI - // BHS is the B.cond instruction with CondHS. 
https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BHS - // BLE is the B.cond instruction with CondLE. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BLE - // BLO is the B.cond instruction with CondLO. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BLO - // BLS is the B.cond instruction with CondLS. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BLS - // BLT is the B.cond instruction with CondLT. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BLT - // BMI is the B.cond instruction with CondMI. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BMI - // BPL is the B.cond instruction with CondPL. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BPL - // BNE is the B.cond instruction with CondNE. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BNE - // BVS is the B.cond instruction with CondVS. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BVS + // BCONDEQ is the B.cond instruction with CondEQ. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDEQ + // BCONDGE is the B.cond instruction with CondGE. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDGE + // BCONDGT is the B.cond instruction with CondGT. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDGT + // BCONDHI is the B.cond instruction with CondHI. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDHI + // BCONDHS is the B.cond instruction with CondHS. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDHS + // BCONDLE is the B.cond instruction with CondLE. 
https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDLE + // BCONDLO is the B.cond instruction with CondLO. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDLO + // BCONDLS is the B.cond instruction with CondLS. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDLS + // BCONDLT is the B.cond instruction with CondLT. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDLT + // BCONDMI is the B.cond instruction with CondMI. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDMI + // BCONDPL is the B.cond instruction with CondPL. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDPL + // BCONDNE is the B.cond instruction with CondNE. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDNE + // BCONDVS is the B.cond instruction with CondVS. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDVS // CLZ is the CLZ instruction. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/CLZ CLZ // CLZW is the CLZ instruction, in 64-bit mode. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/CLZ @@ -619,11 +623,11 @@ const ( MSUBW // MUL is the MUL instruction. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MUL MUL - // MULW is the MUL instruction, in 64-bit mode. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MUL + // MULW is the MUL instruction, in 32-bit mode. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MUL MULW // NEG is the NEG instruction. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/NEG NEG - // NEGW is the NEG instruction, in 64-bit mode. 
https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/NEG + // NEGW is the NEG instruction, in 32-bit mode. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/NEG NEGW // ORR is the ORR instruction. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/ORR--shifted-register- ORR @@ -677,20 +681,21 @@ const ( UDIV // UDIVW is the UDIV instruction, in 64-bit mode. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/UDIV UDIVW - // VBIT is the BIT instruction. https://developer.arm.com/documentation/dui0802/a/A64-Advanced-SIMD-Vector-Instructions/BIT--vector- VBIT // VCNT is the CNT instruction. https://developer.arm.com/documentation/dui0802/a/A64-Advanced-SIMD-Vector-Instructions/CNT--vector- VCNT // VMOV has different semantics depending on the types of operands: - // * MOV(vector) if the operands are vectors and indexes are not specified. https://developer.arm.com/documentation/dui0802/a/A64-Advanced-SIMD-Vector-Instructions/MOV--vector- - // * MOV(vector, element) if the operands are vectors and indexes are specified. https://developer.arm.com/documentation/dui0802/a/A64-Advanced-SIMD-Vector-Instructions/MOV--vector--element- - // * INS(vector, element) if the src is a general purpose and the dst is a vector. https://developer.arm.com/documentation/dui0802/a/A64-Advanced-SIMD-Vector-Instructions/INS--vector---general- - // * UMOV(vector) if the dst is a general purpose and the src is a vector. 
https://developer.arm.com/documentation/100069/0610/A64-SIMD-Vector-Instructions/UMOV--vector- // * LDR(SIMD&FP) if the src is memory and dst is a vector: https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/LDR--immediate--SIMD-FP---Load-SIMD-FP-Register--immediate-offset-- - // * LDR (literal, SIMD&FP) if the src is static const and dst is a vector: https://developer.arm.com/documentation/dui0801/h/A64-Floating-point-Instructions/LDR--literal--SIMD-and-FP- + // * LDR(literal, SIMD&FP) if the src is static const and dst is a vector: https://developer.arm.com/documentation/dui0801/h/A64-Floating-point-Instructions/LDR--literal--SIMD-and-FP- // * STR(SIMD&FP) if the dst is memory and src is a vector: https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/STR--immediate--SIMD-FP---Store-SIMD-FP-register--immediate-offset-- VMOV + // UMOV is the UMOV instruction https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMOV--Unsigned-Move-vector-element-to-general-purpose-register-?lang=en + UMOV + // INSGEN is the INS(general) instruction https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/INS--general---Insert-vector-element-from-general-purpose-register-?lang=en + INSGEN + // INSELEM is the INS(element) instruction https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/INS--element---Insert-vector-element-from-another-vector-element-?lang=en + INSELEM // VUADDLV is the UADDLV(vector) instruction. https://developer.arm.com/documentation/dui0802/a/A64-Advanced-SIMD-Vector-Instructions/UADDLV--vector- VUADDLV // VADD is the ADD(vector) instruction. https://developer.arm.com/documentation/dui0802/a/A64-Advanced-SIMD-Vector-Instructions/ADD--vector- @@ -705,28 +710,62 @@ const ( VFSUBS // VFSUBD is the FSUB(vector) instruction, for double precision. 
https://developer.arm.com/documentation/dui0802/a/A64-Advanced-SIMD-Vector-Instructions/FSUB--vector- VFSUBD - // SSHLL is the SSHLL(vector) instruction. https://developer.arm.com/documentation/dui0801/h/A64-SIMD-Vector-Instructions/SSHLL--SSHLL2--vector- - SSHLL - // USHLL is the USHLL(vector) instruction. https://developer.arm.com/documentation/dui0801/h/A64-SIMD-Vector-Instructions/SSHLL--SSHLL2--vector- - USHLL + // SSHL is the SSHL(vector,register) instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHL--Signed-Shift-Left--register--?lang=en + SSHL + // SSHLLIMM is the SSHLL(vector,immediate) instruction. https://developer.arm.com/documentation/dui0801/h/A64-SIMD-Vector-Instructions/SSHLL--SSHLL2--vector- + SSHLLIMM + // USHL is the USHL(vector,register) instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHL--Signed-Shift-Left--register--?lang=en + USHL + // USHLLIMM is the USHLL(vector,immediate) instruction. https://developer.arm.com/documentation/dui0801/h/A64-SIMD-Vector-Instructions/SSHLL--SSHLL2--vector- + USHLLIMM // LD1R is the LD1R instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/LD1R--Load-one-single-element-structure-and-Replicate-to-all-lanes--of-one-register-- LD1R - // SMOV is the SMOV(vector) instruction. https://developer.arm.com/documentation/100069/0610/A64-SIMD-Vector-Instructions/SMOV--vector- - SMOV - // DUP is the DUP(element) instruction. https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/DUP--element---Duplicate-vector-element-to-vector-or-scalar- - DUP + // SMOV32 is the 32-bit variant of SMOV(vector) instruction. https://developer.arm.com/documentation/100069/0610/A64-SIMD-Vector-Instructions/SMOV--vector- + SMOV32 + // DUPGEN is the DUP(general) instruction. 
https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/DUP--general---Duplicate-general-purpose-register-to-vector- + DUPGEN + // DUPELEM is the DUP(element) instruction. https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/DUP--element---Duplicate-vector-element-to-vector-or-scalar- + DUPELEM // UMAXP is the UMAXP(vector) instruction. https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Vector-Instructions/UMAXP--vector- UMAXP // UMINV is the UMINV(vector) instruction. https://developer.arm.com/documentation/100069/0610/A64-SIMD-Vector-Instructions/UMINV--vector- UMINV // CMEQ is the CMEQ(vector, register) instruction. https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Vector-Instructions/CMEQ--vector--register- CMEQ - // ADDP is the ADDP(vector) instruction. https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Vector-Instructions/ADDP--vector- + // CMEQZERO is the CMEQ(zero) instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMEQ--zero---Compare-bitwise-Equal-to-zero--vector--?lang=en + CMEQZERO + // ADDP is the ADDP(scalar) instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADDP--scalar---Add-Pair-of-elements--scalar--?lang=en ADDP + // VADDP is the ADDP(vector) instruction. https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Vector-Instructions/ADDP--vector- + // Note: prefixed by V to distinguish from the non-vector variant of ADDP(scalar). + VADDP // TBL1 is the TBL instruction whose source is one vector. https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/TBL--Table-vector-Lookup- TBL1 // TBL2 is the TBL instruction whose source is two vectors. 
https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/TBL--Table-vector-Lookup- TBL2 + // NOT is the NOT(vector) instruction https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/NOT--Bitwise-NOT--vector--?lang=en + NOT + // VAND is the AND(vector) instruction https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/AND--vector---Bitwise-AND--vector-- + // Note: prefixed by V to distinguish from the non-vector variant of AND. + VAND + // VORR is the ORR(vector) instruction https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/ORR--vector--register---Bitwise-inclusive-OR--vector--register-- + // Note: prefixed by V to distinguish from the non-vector variant of ORR. + VORR + // BSL https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/BSL--Bitwise-Select- + BSL + // BIC is the BIC(vector) instruction https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/BIC--vector--register---Bitwise-bit-Clear--vector--register-- + BIC + // VFNEG is the FNEG(vector) instruction https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/FNEG--vector---Floating-point-Negate--vector-- + // Note: prefixed by V to distinguish from the non-vector variant of FNEG. 
+ VFNEG + // ADDV is the ADDV instruction https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/ADDV--Add-across-Vector- + ADDV + // ZIP1 is the ZIP1 instruction https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ZIP1--Zip-vectors--primary--?lang=en + ZIP1 + // SSHR is the SSHR(immediate,vector) instruction https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHR--Signed-Shift-Right--immediate--?lang=en + SSHR + // EXT is the EXT instruction https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/EXT--Extract-vector-from-pair-of-vectors-?lang=en + EXT // instructionEnd is always placed at the bottom of this iota definition to be used in the test. instructionEnd @@ -742,17 +781,17 @@ const ( VectorArrangement8B // VectorArrangement16B is an arrangement of 16 bytes (128-bit vector) VectorArrangement16B - // VectorArrangement4H is an arrangement of 4 halfwords (64-bit vector) + // VectorArrangement4H is an arrangement of 4 half precisions (64-bit vector) VectorArrangement4H - // VectorArrangement8H is an arrangement of 8 halfwords (128-bit vector) + // VectorArrangement8H is an arrangement of 8 half precisions (128-bit vector) VectorArrangement8H - // VectorArrangement2S is an arrangement of 2 words (64-bit vector) + // VectorArrangement2S is an arrangement of 2 single precisions (64-bit vector) VectorArrangement2S - // VectorArrangement4S is an arrangement of 4 words (128-bit vector) + // VectorArrangement4S is an arrangement of 4 single precisions (128-bit vector) VectorArrangement4S - // VectorArrangement1D is an arrangement of 1 doubleword (64-bit vector) + // VectorArrangement1D is an arrangement of 1 double precision (64-bit vector) VectorArrangement1D - // VectorArrangement2D is an arrangement of 2 doublewords (128-bit vector) + // VectorArrangement2D is an arrangement of 2 double precisions (128-bit vector) VectorArrangement2D // Assign each vector size specifier to 
a vector arrangement ID. @@ -831,6 +870,10 @@ func InstructionName(i asm.Instruction) string { return "ADR" case AND: return "AND" + case ANDIMM32: + return "ANDIMM32" + case ANDIMM64: + return "ANDIMM64" case ANDW: return "ANDW" case ASR: @@ -839,32 +882,32 @@ func InstructionName(i asm.Instruction) string { return "ASRW" case B: return "B" - case BEQ: - return "BEQ" - case BGE: - return "BGE" - case BGT: - return "BGT" - case BHI: - return "BHI" - case BHS: - return "BHS" - case BLE: - return "BLE" - case BLO: - return "BLO" - case BLS: - return "BLS" - case BLT: - return "BLT" - case BMI: - return "BMI" - case BPL: - return "BPL" - case BNE: - return "BNE" - case BVS: - return "BVS" + case BCONDEQ: + return "BCONDEQ" + case BCONDGE: + return "BCONDGE" + case BCONDGT: + return "BCONDGT" + case BCONDHI: + return "BCONDHI" + case BCONDHS: + return "BCONDHS" + case BCONDLE: + return "BCONDLE" + case BCONDLO: + return "BCONDLO" + case BCONDLS: + return "BCONDLS" + case BCONDLT: + return "BCONDLT" + case BCONDMI: + return "BCONDMI" + case BCONDPL: + return "BCONDPL" + case BCONDNE: + return "BCONDNE" + case BCONDVS: + return "BCONDVS" case CLZ: return "CLZ" case CLZW: @@ -1057,6 +1100,12 @@ func InstructionName(i asm.Instruction) string { return "VUADDLV" case VMOV: return "VMOV" + case INSELEM: + return "INSELEM" + case UMOV: + return "UMOV" + case INSGEN: + return "INSGEN" case VADD: return "VADD" case VFADDS: @@ -1069,16 +1118,22 @@ func InstructionName(i asm.Instruction) string { return "VFSUBS" case VFSUBD: return "VFSUBD" - case SSHLL: - return "SSHLL" - case USHLL: - return "USHLL" + case SSHL: + return "SSHL" + case USHL: + return "USHL" + case SSHLLIMM: + return "SSHLLIMM" + case USHLLIMM: + return "USHLLIMM" case LD1R: return "LD1R" - case SMOV: - return "SMOV" - case DUP: - return "DUP" + case SMOV32: + return "SMOV32" + case DUPGEN: + return "DUPGEN" + case DUPELEM: + return "DUPELEM" case UMAXP: return "UMAXP" case UMINV: @@ -1087,10 +1142,34 @@ func 
InstructionName(i asm.Instruction) string { return "CMEQ" case ADDP: return "ADDP" + case VADDP: + return "VADDP" case TBL1: return "TBL1" case TBL2: return "TBL2" + case NOT: + return "NOT" + case VAND: + return "VAND" + case VORR: + return "VORR" + case BSL: + return "BSL" + case BIC: + return "BIC" + case VFNEG: + return "VFNEG" + case ADDV: + return "ADDV" + case CMEQZERO: + return "CMEQZERO" + case ZIP1: + return "ZIP1" + case SSHR: + return "SSHR" + case EXT: + return "EXT" } panic(fmt.Errorf("unknown instruction %d", i)) } diff --git a/internal/asm/arm64/impl.go b/internal/asm/arm64/impl.go index 197fcb50db..927e4f5ac5 100644 --- a/internal/asm/arm64/impl.go +++ b/internal/asm/arm64/impl.go @@ -142,6 +142,7 @@ const ( OperandTypeSIMDByte OperandTypeTwoSIMDBytes OperandTypeVectorRegister + OperandTypeTwoVectorRegisters OperandTypeStaticConst ) @@ -172,6 +173,8 @@ func (o OperandType) String() (ret string) { ret = "vector-register" case OperandTypeStaticConst: ret = "static-const" + case OperandTypeTwoVectorRegisters: + ret = "two-vector-registers" } return } @@ -180,28 +183,29 @@ func (o OperandType) String() (ret string) { type OperandTypes struct{ src, dst OperandType } var ( - OperandTypesNoneToNone = OperandTypes{OperandTypeNone, OperandTypeNone} - OperandTypesNoneToRegister = OperandTypes{OperandTypeNone, OperandTypeRegister} - OperandTypesNoneToMemory = OperandTypes{OperandTypeNone, OperandTypeMemory} - OperandTypesNoneToBranch = OperandTypes{OperandTypeNone, OperandTypeBranch} - OperandTypesRegisterToRegister = OperandTypes{OperandTypeRegister, OperandTypeRegister} - OperandTypesLeftShiftedRegisterToRegister = OperandTypes{OperandTypeLeftShiftedRegister, OperandTypeRegister} - OperandTypesTwoRegistersToRegister = OperandTypes{OperandTypeTwoRegisters, OperandTypeRegister} - OperandTypesThreeRegistersToRegister = OperandTypes{OperandTypeThreeRegisters, OperandTypeRegister} - OperandTypesTwoRegistersToNone = OperandTypes{OperandTypeTwoRegisters, 
OperandTypeNone} - OperandTypesRegisterAndConstToNone = OperandTypes{OperandTypeRegisterAndConst, OperandTypeNone} - OperandTypesRegisterToMemory = OperandTypes{OperandTypeRegister, OperandTypeMemory} - OperandTypesMemoryToRegister = OperandTypes{OperandTypeMemory, OperandTypeRegister} - OperandTypesConstToRegister = OperandTypes{OperandTypeConst, OperandTypeRegister} - OperandTypesSIMDByteToSIMDByte = OperandTypes{OperandTypeSIMDByte, OperandTypeSIMDByte} - OperandTypesSIMDByteToRegister = OperandTypes{OperandTypeSIMDByte, OperandTypeRegister} - OperandTypesTwoSIMDBytesToSIMDByteRegister = OperandTypes{OperandTypeTwoSIMDBytes, OperandTypeSIMDByte} - OperandTypesRegisterToVectorRegister = OperandTypes{OperandTypeRegister, OperandTypeVectorRegister} - OperandTypesVectorRegisterToRegister = OperandTypes{OperandTypeVectorRegister, OperandTypeRegister} - OperandTypesMemoryToVectorRegister = OperandTypes{OperandTypeMemory, OperandTypeVectorRegister} - OperandTypesVectorRegisterToMemory = OperandTypes{OperandTypeVectorRegister, OperandTypeMemory} - OperandTypesVectorRegisterToVectorRegister = OperandTypes{OperandTypeVectorRegister, OperandTypeVectorRegister} - OperandTypesStaticConstToVectorRegister = OperandTypes{OperandTypeStaticConst, OperandTypeVectorRegister} + OperandTypesNoneToNone = OperandTypes{OperandTypeNone, OperandTypeNone} + OperandTypesNoneToRegister = OperandTypes{OperandTypeNone, OperandTypeRegister} + OperandTypesNoneToMemory = OperandTypes{OperandTypeNone, OperandTypeMemory} + OperandTypesNoneToBranch = OperandTypes{OperandTypeNone, OperandTypeBranch} + OperandTypesRegisterToRegister = OperandTypes{OperandTypeRegister, OperandTypeRegister} + OperandTypesLeftShiftedRegisterToRegister = OperandTypes{OperandTypeLeftShiftedRegister, OperandTypeRegister} + OperandTypesTwoRegistersToRegister = OperandTypes{OperandTypeTwoRegisters, OperandTypeRegister} + OperandTypesThreeRegistersToRegister = OperandTypes{OperandTypeThreeRegisters, OperandTypeRegister} + 
OperandTypesTwoRegistersToNone = OperandTypes{OperandTypeTwoRegisters, OperandTypeNone} + OperandTypesRegisterAndConstToNone = OperandTypes{OperandTypeRegisterAndConst, OperandTypeNone} + OperandTypesRegisterToMemory = OperandTypes{OperandTypeRegister, OperandTypeMemory} + OperandTypesMemoryToRegister = OperandTypes{OperandTypeMemory, OperandTypeRegister} + OperandTypesConstToRegister = OperandTypes{OperandTypeConst, OperandTypeRegister} + OperandTypesSIMDByteToSIMDByte = OperandTypes{OperandTypeSIMDByte, OperandTypeSIMDByte} + OperandTypesSIMDByteToRegister = OperandTypes{OperandTypeSIMDByte, OperandTypeRegister} + OperandTypesTwoSIMDBytesToSIMDByteRegister = OperandTypes{OperandTypeTwoSIMDBytes, OperandTypeSIMDByte} + OperandTypesRegisterToVectorRegister = OperandTypes{OperandTypeRegister, OperandTypeVectorRegister} + OperandTypesVectorRegisterToRegister = OperandTypes{OperandTypeVectorRegister, OperandTypeRegister} + OperandTypesMemoryToVectorRegister = OperandTypes{OperandTypeMemory, OperandTypeVectorRegister} + OperandTypesVectorRegisterToMemory = OperandTypes{OperandTypeVectorRegister, OperandTypeMemory} + OperandTypesVectorRegisterToVectorRegister = OperandTypes{OperandTypeVectorRegister, OperandTypeVectorRegister} + OperandTypesTwoVectorRegistersToVectorRegister = OperandTypes{OperandTypeTwoVectorRegisters, OperandTypeVectorRegister} + OperandTypesStaticConstToVectorRegister = OperandTypes{OperandTypeStaticConst, OperandTypeVectorRegister} ) // String implements fmt.Stringer @@ -434,6 +438,8 @@ func (a *AssemblerImpl) EncodeNode(n *NodeImpl) (err error) { err = a.EncodeVectorRegisterToVectorRegister(n) case OperandTypesStaticConstToVectorRegister: err = a.EncodeStaticConstToVectorRegister(n) + case OperandTypesTwoVectorRegistersToVectorRegister: + err = a.encodeTwoVectorRegistersToVectorRegister(n) default: err = fmt.Errorf("encoder undefined for [%s] operand type", n.Types) } @@ -706,8 +712,7 @@ func (a *AssemblerImpl) 
CompileVectorRegisterToVectorRegisterWithConst(instructi n.VectorArrangement = arrangement } -// CompileLoadStaticConstToVectorRegister adds an instruction where the source operand is StaticConstant located in the memory -// and the destination is the dstReg. +// CompileLoadStaticConstToVectorRegister implements Assembler.CompileLoadStaticConstToVectorRegister func (a *AssemblerImpl) CompileLoadStaticConstToVectorRegister(instruction asm.Instruction, c asm.StaticConst, dstReg asm.Register, arrangement VectorArrangement) { n := a.newNode(instruction, OperandTypesStaticConstToVectorRegister) @@ -716,6 +721,27 @@ func (a *AssemblerImpl) CompileLoadStaticConstToVectorRegister(instruction asm.I n.VectorArrangement = arrangement } +// CompileTwoVectorRegistersToVectorRegister implements Assembler.CompileTwoVectorRegistersToVectorRegister. +func (a *AssemblerImpl) CompileTwoVectorRegistersToVectorRegister(instruction asm.Instruction, srcReg, srcReg2, dstReg asm.Register, + arrangement VectorArrangement) { + n := a.newNode(instruction, OperandTypesTwoVectorRegistersToVectorRegister) + n.SrcReg = srcReg + n.SrcReg2 = srcReg2 + n.DstReg = dstReg + n.VectorArrangement = arrangement +} + +// CompileTwoVectorRegistersToVectorRegisterWithConst implements Assembler.CompileTwoVectorRegistersToVectorRegisterWithConst. 
+func (a *AssemblerImpl) CompileTwoVectorRegistersToVectorRegisterWithConst(instruction asm.Instruction, + srcReg, srcReg2, dstReg asm.Register, arrangement VectorArrangement, c asm.ConstantValue) { + n := a.newNode(instruction, OperandTypesTwoVectorRegistersToVectorRegister) + n.SrcReg = srcReg + n.SrcReg2 = srcReg2 + n.SrcConst = c + n.DstReg = dstReg + n.VectorArrangement = arrangement +} + func errorEncodingUnsupported(n *NodeImpl) error { return fmt.Errorf("%s is unsupported for %s type", InstructionName(n.Instruction), n.Types) } @@ -761,7 +787,7 @@ func (a *AssemblerImpl) EncodeJumpToRegister(n *NodeImpl) (err error) { // TODO: unexport after golang-asm complete removal. func (a *AssemblerImpl) EncodeRelativeBranch(n *NodeImpl) (err error) { switch n.Instruction { - case B, BEQ, BGE, BGT, BHI, BHS, BLE, BLO, BLS, BLT, BMI, BNE, BVS, BPL: + case B, BCONDEQ, BCONDGE, BCONDGT, BCONDHI, BCONDHS, BCONDLE, BCONDLO, BCONDLS, BCONDLT, BCONDMI, BCONDNE, BCONDVS, BCONDPL: default: return errorEncodingUnsupported(n) } @@ -781,31 +807,31 @@ func (a *AssemblerImpl) EncodeRelativeBranch(n *NodeImpl) (err error) { switch n.Instruction { case B: condBits = condBitsUnconditional - case BEQ: + case BCONDEQ: condBits = 0b0000 - case BGE: + case BCONDGE: condBits = 0b1010 - case BGT: + case BCONDGT: condBits = 0b1100 - case BHI: + case BCONDHI: condBits = 0b1000 - case BHS: + case BCONDHS: condBits = 0b0010 - case BLE: + case BCONDLE: condBits = 0b1101 - case BLO: + case BCONDLO: condBits = 0b0011 - case BLS: + case BCONDLS: condBits = 0b1001 - case BLT: + case BCONDLT: condBits = 0b1011 - case BMI: + case BCONDMI: condBits = 0b0100 - case BPL: + case BCONDPL: condBits = 0b0101 - case BNE: + case BCONDNE: condBits = 0b0001 - case BVS: + case BCONDVS: condBits = 0b0110 } @@ -2010,6 +2036,74 @@ func (a *AssemblerImpl) addOrSub64BitRegisters(sfops byte, src1RegBits byte, src }) } +// See "Logical (immediate)" in +// 
https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Immediate +var logicalImmediate = map[asm.Instruction]struct { + sf, opc byte + resolver func(imm asm.ConstantValue) (imms, immr, N byte, err error) +}{ + ANDIMM32: {sf: 0b0, opc: 0b00, resolver: func(imm asm.ConstantValue) (imms, immr, N byte, err error) { + if !isBitMaskImmediate(uint64(imm)) { + err = fmt.Errorf("const %d must be valid bitmask immediate for %s", imm, InstructionName(ANDIMM64)) + return + } + immr, imms, N = bitmaskImmediate(uint64(imm), false) + return + }}, + ANDIMM64: {sf: 0b1, opc: 0b00, resolver: func(imm asm.ConstantValue) (imms, immr, N byte, err error) { + if !isBitMaskImmediate(uint64(imm)) { + err = fmt.Errorf("const %d must be valid bitmask immediate for %s", imm, InstructionName(ANDIMM64)) + return + } + immr, imms, N = bitmaskImmediate(uint64(imm), true) + return + }}, +} + +func bitmaskImmediate(c uint64, is64bit bool) (immr, imms, N byte) { + var size uint32 + switch { + case c != c>>32|c<<32: + size = 64 + case c != c>>16|c<<48: + size = 32 + c = uint64(int32(c)) + case c != c>>8|c<<56: + size = 16 + c = uint64(int16(c)) + case c != c>>4|c<<60: + size = 8 + c = uint64(int8(c)) + case c != c>>2|c<<62: + size = 4 + c = uint64(int64(c<<60) >> 60) + default: + size = 2 + c = uint64(int64(c<<62) >> 62) + } + + neg := false + if int64(c) < 0 { + c = ^c + neg = true + } + + onesSize, nonZeroPos := getOnesSequenceSize(c) + if neg { + nonZeroPos = onesSize + nonZeroPos + onesSize = size - onesSize + } + + var mode byte = 32 + if is64bit { + N, mode = 0b1, 64 + } + + immr = byte((size - nonZeroPos) & (size - 1) & uint32(mode-1)) + imms = byte((onesSize - 1) | 63&^(size<<1-1)) + return +} + // Exported for inter-op testing with golang-asm. // TODO: unexport after golang-asm complete removal. 
func (a *AssemblerImpl) EncodeConstToRegister(n *NodeImpl) (err error) { @@ -2021,6 +2115,24 @@ func (a *AssemblerImpl) EncodeConstToRegister(n *NodeImpl) (err error) { return err } + if log, ok := logicalImmediate[n.Instruction]; ok { + // See "Logical (immediate)" in + // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Immediate + imms, immr, N, err := log.resolver(c) + if err != nil { + return err + } + + a.Buf.Write([]byte{ + (dstRegBits << 5) | dstRegBits, + imms<<2 | dstRegBits>>3, + N<<6 | immr, + log.sf<<7 | log.opc<<5 | 0b10010, + }) + return nil + } + + // TODO: refactor and generalize the following like ^ logicalImmediate, etc. switch inst := n.Instruction; inst { case ADD, ADDS, SUB, SUBS: var sfops byte @@ -2595,141 +2707,6 @@ func checkArrangementIndexPair(arr VectorArrangement, index VectorIndex) (err er return } -func (a *AssemblerImpl) EncodeVectorRegisterToRegister(n *NodeImpl) (err error) { - if err = checkArrangementIndexPair(n.VectorArrangement, n.SrcVectorIndex); err != nil { - return - } - - srcVecRegBits, err := vectorRegisterBits(n.SrcReg) - if err != nil { - return err - } - - dstRegBits, err := intRegisterBits(n.DstReg) - if err != nil { - return err - } - - switch n.Instruction { - case VMOV, SMOV: - var imm4 byte // imm4 as in "Advanced SIMD copy" https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en - isSMOV := n.Instruction == SMOV - if isSMOV { - // SMOV: https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SMOV--Signed-Move-vector-element-to-general-purpose-register- - imm4 = 0b0101 - } else { - // VMOV is translated as "UMOV": https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/UMOV--Unsigned-Move-vector-element-to-general-purpose-register- - imm4 = 0b0111 - } - - var imm5 byte - var q byte - switch n.VectorArrangement { - case VectorArrangementB: - imm5 
|= 0b1 - imm5 |= byte(n.SrcVectorIndex) << 1 - case VectorArrangementH: - imm5 |= 0b10 - imm5 |= byte(n.SrcVectorIndex) << 2 - case VectorArrangementS: - if isSMOV { - return fmt.Errorf("invalid arrangement for SMOV: %s", n.VectorArrangement.String()) - } - imm5 |= 0b100 - imm5 |= byte(n.SrcVectorIndex) << 3 - case VectorArrangementD: - if isSMOV { - return fmt.Errorf("invalid arrangement for SMOV: %s", n.VectorArrangement.String()) - } - - imm5 |= 0b1000 - imm5 |= byte(n.SrcVectorIndex) << 4 - q = 0b1 - default: - return fmt.Errorf("unsupported arrangement for VMOV: %s", n.VectorArrangement) - } - a.Buf.Write([]byte{ - (srcVecRegBits << 5) | dstRegBits, - imm4<<3 | 0b100 | srcVecRegBits>>3, - imm5, - q<<6 | 0b00001110, - }) - default: - return errorEncodingUnsupported(n) - } - return -} - -func (a *AssemblerImpl) EncodeRegisterToVectorRegister(n *NodeImpl) (err error) { - srcRegBits, err := intRegisterBits(n.SrcReg) - if err != nil { - return err - } - - dstVectorRegBits, err := vectorRegisterBits(n.DstReg) - if err != nil { - return err - } - - switch n.Instruction { - case DUP: - // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/DUP--general---Duplicate-general-purpose-register-to-vector- - var imm5 byte - switch n.VectorArrangement { - case VectorArrangementB: - imm5 = 0b1 - case VectorArrangementH: - imm5 = 0b10 - case VectorArrangementS: - imm5 = 0b100 - case VectorArrangementD: - imm5 = 0b1000 - default: - return fmt.Errorf("unsupported arrangement for DUP: %s", n.VectorArrangement) - } - a.Buf.Write([]byte{ - (srcRegBits << 5) | dstVectorRegBits, - 0b11<<2 | srcRegBits>>3, - imm5, - 0b01_001110, - }) - case VMOV: - if err = checkArrangementIndexPair(n.VectorArrangement, n.DstVectorIndex); err != nil { - return - } - - // VMOV is translated as "INS(Vector, Element)" - // Description: https://developer.arm.com/documentation/dui0802/a/A64-Advanced-SIMD-Vector-Instructions/INS--vector---general- - // Encoding: 
https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/INS--general---Insert-vector-element-from-general-purpose-register-?lang=en - var imm5 byte - switch n.VectorArrangement { - case VectorArrangementB: - imm5 |= 0b1 - imm5 |= byte(n.DstVectorIndex) << 1 - case VectorArrangementH: - imm5 |= 0b10 - imm5 |= byte(n.DstVectorIndex) << 2 - case VectorArrangementS: - imm5 |= 0b100 - imm5 |= byte(n.DstVectorIndex) << 3 - case VectorArrangementD: - imm5 |= 0b1000 - imm5 |= byte(n.DstVectorIndex) << 4 - default: - return fmt.Errorf("unsupported arrangement for VMOV: %s", n.VectorArrangement) - } - a.Buf.Write([]byte{ - (srcRegBits << 5) | dstVectorRegBits, - 0b000111_00 | srcRegBits>>3, - imm5, - 0b01001110, - }) - default: - return errorEncodingUnsupported(n) - } - return -} - func (a *AssemblerImpl) EncodeMemoryToVectorRegister(n *NodeImpl) (err error) { srcBaseRegBits, err := intRegisterBits(n.SrcReg) if err != nil { @@ -2919,285 +2896,763 @@ func (a *AssemblerImpl) EncodeStaticConstToVectorRegister(n *NodeImpl) (err erro return } -func (a *AssemblerImpl) EncodeVectorRegisterToVectorRegister(n *NodeImpl) (err error) { - var srcVectorRegBits byte - if n.SrcReg != RegRZR { - srcVectorRegBits, err = vectorRegisterBits(n.SrcReg) - if err != nil { - return err +// advancedSIMDTwoRegisterMisc holds information to encode instructions as "Advanced SIMD two-register miscellaneous" in +// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +var advancedSIMDTwoRegisterMisc = map[asm.Instruction]struct { + U, Opcode byte + // TODO: extract common implementation of qAndSizeResolver.
+ qAndSizeResolver func(arrangement VectorArrangement) (Q, Size byte, err error) +}{ + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/NOT--Bitwise-NOT--vector--?lang=en + NOT: {U: 0b1, Opcode: 0b00101, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size = 0b00 + switch arrangement { + case VectorArrangement16B: + Q = 0b1 + case VectorArrangement8B: + Q = 0b0 + default: + err = fmt.Errorf("unsupported arrangement %s for %s", arrangement.String(), InstructionName(NOT)) } - } - - dstVectorRegBits, err := vectorRegisterBits(n.DstReg) - if err != nil { - return err - } + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FNEG--vector---Floating-point-Negate--vector--?lang=en + VFNEG: {U: 0b1, Opcode: 0b01111, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size = 0b00 + switch arrangement { + case VectorArrangement4S: + size, Q = 0b10, 0b1 + case VectorArrangement2S: + size, Q = 0b10, 0b0 + case VectorArrangement2D: + size, Q = 0b11, 0b1 + default: + err = fmt.Errorf("unsupported arrangement %s for %s", arrangement.String(), InstructionName(VFNEG)) + } + return + }}, +} - switch n.Instruction { - case DUP: - // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/DUP--element---Duplicate-vector-element-to-vector-or-scalar- - if n.SrcVectorIndex == VectorIndexNone { - return fmt.Errorf("source vector index must be given for %s", InstructionName(DUP)) +// advancedSIMDThreeSame holds information to encode instructions as "Advanced SIMD three same" in +// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +var advancedSIMDThreeSame = map[asm.Instruction]struct { + U, Opcode byte + qAndSizeResolver func(arrangement VectorArrangement) (Q, Size byte, err error) +}{ + //
https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/AND--vector---Bitwise-AND--vector--?lang=en + VAND: {U: 0b0, Opcode: 0b00011, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size = 0b00 + switch arrangement { + case VectorArrangement16B: + Q = 0b1 + case VectorArrangement8B: + Q = 0b0 + default: + err = fmt.Errorf("unsupported arrangement %s for %s", arrangement.String(), InstructionName(VAND)) } - var imm5 byte - switch n.VectorArrangement { - case VectorArrangementB: - imm5 |= 0b1 - imm5 |= byte(n.SrcVectorIndex) << 1 - case VectorArrangementH: - imm5 |= 0b10 - imm5 |= byte(n.SrcVectorIndex) << 2 - case VectorArrangementS: - imm5 |= 0b100 - imm5 |= byte(n.SrcVectorIndex) << 3 - case VectorArrangementD: - imm5 |= 0b1000 - imm5 |= byte(n.SrcVectorIndex) << 4 + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/BSL--Bitwise-Select-?lang=en + BSL: {U: 0b1, Opcode: 0b00011, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size = 0b01 + switch arrangement { + case VectorArrangement16B: + Q = 0b1 + case VectorArrangement8B: + Q = 0b0 default: - return fmt.Errorf("unsupported arrangement for VMOV: %d", n.VectorArrangement) + err = fmt.Errorf("unsupported arrangement %s for %s", arrangement.String(), InstructionName(BSL)) } - a.Buf.Write([]byte{ - (srcVectorRegBits << 5) | dstVectorRegBits, - 0b1<<2 | srcVectorRegBits>>3, - imm5, - 0b0100_1110, - }) - case VMOV: - if n.SrcVectorIndex != VectorIndexNone && n.DstVectorIndex != VectorIndexNone { - // This case VMOV is translated as MOV(vector, element) - // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MOV--element---Move-vector-element-to-another-vector-element--an-alias-of-INS--element-- - var imm5, imm4 byte - switch n.VectorArrangement { - case VectorArrangementB: - imm5 |= 0b1 - imm5 |= byte(n.DstVectorIndex) << 1 - imm4 = byte(n.SrcVectorIndex) -
case VectorArrangementH: - imm5 |= 0b10 - imm5 |= byte(n.DstVectorIndex) << 2 - imm4 = byte(n.SrcVectorIndex) << 1 - case VectorArrangementS: - imm5 |= 0b100 - imm5 |= byte(n.DstVectorIndex) << 3 - imm4 = byte(n.SrcVectorIndex) << 2 - case VectorArrangementD: - imm5 |= 0b1000 - imm5 |= byte(n.DstVectorIndex) << 4 - imm4 = byte(n.SrcVectorIndex) << 3 - default: - return fmt.Errorf("unsupported arrangement for VMOV: %d", n.VectorArrangement) - } - a.Buf.Write([]byte{ - (srcVectorRegBits << 5) | dstVectorRegBits, - imm4<<3 | 1<<2 | srcVectorRegBits>>3, - imm5, - 0b01101110, - }) - } else { - // This case VMOV is translated as MOV(vector) - if n.VectorArrangement != VectorArrangement16B { - return fmt.Errorf("unsupported arrangement for VMOV: %s", n.VectorArrangement) - } - // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MOV--vector---Move-vector--an-alias-of-ORR--vector--register-- - a.Buf.Write([]byte{ - (srcVectorRegBits << 5) | dstVectorRegBits, - 0b000111<<2 | srcVectorRegBits>>3, - 0b101<<5 | srcVectorRegBits, - 0b0100_1110, - }) + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/EOR--vector---Bitwise-Exclusive-OR--vector--?lang=en + EOR: {U: 0b1, Opcode: 0b00011, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size = 0b00 + switch arrangement { + case VectorArrangement16B: + Q = 0b1 + case VectorArrangement8B: + Q = 0b0 + default: + err = fmt.Errorf("unsupported arrangement %s for %s", arrangement.String(), InstructionName(EOR)) } - case VADD, VSUB: - if n.VectorArrangement == VectorArrangementNone || (n.VectorArrangement >= VectorArrangementB && n.VectorArrangement <= VectorArrangementD) || - (n.VectorArrangement == VectorArrangement1D) { - return fmt.Errorf("unsupported arrangement for VADD: %s", n.VectorArrangement) + return + }}, + //
https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ORR--vector--register---Bitwise-inclusive-OR--vector--register--?lang=en + VORR: {U: 0b0, Opcode: 0b00011, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size = 0b10 + switch arrangement { + case VectorArrangement16B: + Q = 0b1 + case VectorArrangement8B: + Q = 0b0 + default: + err = fmt.Errorf("unsupported arrangement %s for %s", arrangement.String(), InstructionName(VORR)) } - - var u byte - switch n.Instruction { - case VADD: - // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/ADD--vector---Add--vector-- - u = 0b0 - case VSUB: - // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SUB--vector---Subtract--vector-- - u = 0b1 + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/BIC--vector--register---Bitwise-bit-Clear--vector--register--?lang=en + BIC: {U: 0b0, Opcode: 0b00011, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size = 0b01 + switch arrangement { + case VectorArrangement16B: + Q = 0b1 + case VectorArrangement8B: + Q = 0b0 + default: + err = fmt.Errorf("unsupported arrangement %s for %s", arrangement.String(), InstructionName(BIC)) } - - size, q := arrangementSizeQ(n.VectorArrangement) - a.Buf.Write([]byte{ - (dstVectorRegBits << 5) | dstVectorRegBits, - 0b100001<<2 | dstVectorRegBits>>3, - size<<6 | 0b1<<5 | srcVectorRegBits, - q<<6 | u<<5 | 0b1110, - }) - case VFADDS, VFADDD, VFSUBS, VFSUBD: - var sz, b byte - switch n.Instruction { - case VFADDS: - case VFADDD: - sz = 0b1 - case VFSUBS: - b = 0b1 - case VFSUBD: - b = 0b1 - sz = 0b1 + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FADD--vector---Floating-point-Add--vector--?lang=en + VFADDS: {U: 0b0, Opcode: 0b11010, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + switch 
arrangement { + case VectorArrangement2S: + size, Q = 0b00, 0 + case VectorArrangement4S: + size, Q = 0b00, 1 + default: + err = fmt.Errorf("unsupported arrangement %s for %s", arrangement.String(), InstructionName(VFADDS)) } - - // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/FADD--vector---Floating-point-Add--vector-- - a.Buf.Write([]byte{ - (srcVectorRegBits << 5) | dstVectorRegBits, - 0b110101<<2 | srcVectorRegBits>>3, - b<<7 | sz<<6 | 0b1<<5 | dstVectorRegBits, - 0b1<<6 | 0b1110, - }) - - case SSHLL, USHLL: - // SSHLL: https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SSHLL--SSHLL2--Signed-Shift-Left-Long--immediate-- - // USHLL: https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/USHLL--USHLL2--Unsigned-Shift-Left-Long--immediate-- - var u byte - switch n.Instruction { - case SSHLL: - u = 0b0 - case USHLL: - u = 0b1 + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FADD--vector---Floating-point-Add--vector--?lang=en + VFADDD: {U: 0b0, Opcode: 0b11010, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + switch arrangement { + case VectorArrangement2D: + size, Q = 0b01, 1 + default: + err = fmt.Errorf("unsupported arrangement %s for %s", arrangement.String(), InstructionName(VFADDD)) } - - var immb, immh byte - switch n.VectorArrangement { - case VectorArrangement8B: - if n.SrcConst < 0 || n.SrcConst > 7 { - return fmt.Errorf("shift amount on %s must be between 0 and 7 for %s but was %d", - InstructionName(n.Instruction), n.VectorArrangement, n.SrcConst) - } - immb = byte(n.SrcConst) - immh = 0b0001 - case VectorArrangement4H: - if n.SrcConst < 0 || n.SrcConst > 15 { - return fmt.Errorf("shift amount on %s must be between 0 and 15 for %s but was %d", - InstructionName(n.Instruction), n.VectorArrangement, n.SrcConst) - } - immb = byte(n.SrcConst) & 0b111 - immh = 0b0010 | byte(n.SrcConst>>3) + return + 
}}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FSUB--vector---Floating-point-Subtract--vector--?lang=en + VFSUBS: {U: 0b0, Opcode: 0b11010, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + switch arrangement { case VectorArrangement2S: - if n.SrcConst < 0 || n.SrcConst > 31 { - return fmt.Errorf("shift amount on %s must be between 0 and 31 for %s but was %d", - InstructionName(n.Instruction), n.VectorArrangement, n.SrcConst) - } - immb = byte(n.SrcConst) & 0b111 - immh = 0b0100 | byte(n.SrcConst>>3) + size, Q = 0b10, 0 + case VectorArrangement4S: + size, Q = 0b10, 1 default: - return fmt.Errorf("unsupported arrangement for %s: %s", - InstructionName(n.Instruction), n.VectorArrangement) + err = fmt.Errorf("unsupported arrangement %s for %s", arrangement.String(), InstructionName(VFSUBS)) } - - a.Buf.Write([]byte{ - (srcVectorRegBits << 5) | dstVectorRegBits, - 0b101001<<2 | srcVectorRegBits>>3, - immh<<3 | immb, - u<<5 | 0b1111, - }) - case ADDP: - var opcode byte - var size, q byte - var rm, op byte - switch n.VectorArrangement { + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FSUB--vector---Floating-point-Subtract--vector--?lang=en + VFSUBD: {U: 0b0, Opcode: 0b11010, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + switch arrangement { + case VectorArrangement2D: + size, Q = 0b11, 1 + default: + err = fmt.Errorf("unsupported arrangement %s for %s", arrangement.String(), InstructionName(VFSUBD)) + } + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMAXP--Unsigned-Maximum-Pairwise-?lang=en + UMAXP: {U: 0b1, Opcode: 0b10100, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size, Q = arrangementSizeQ(arrangement) + return + }}, + // 
https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMEQ--register---Compare-bitwise-Equal--vector--?lang=en + CMEQ: {U: 0b1, Opcode: 0b10001, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size, Q = arrangementSizeQ(arrangement) + return + }}, + // https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Vector-Instructions/ADDP--vector- + VADDP: {U: 0b0, Opcode: 0b10111, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size, Q = arrangementSizeQ(arrangement) + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADD--vector---Add--vector--?lang=en + VADD: {U: 0, Opcode: 0b10000, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size, Q = arrangementSizeQ(arrangement) + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SUB--vector---Subtract--vector--?lang=en + VSUB: {U: 1, Opcode: 0b10000, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size, Q = arrangementSizeQ(arrangement) + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHL--Signed-Shift-Left--register--?lang=en + SSHL: {U: 0, Opcode: 0b01000, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size, Q = arrangementSizeQ(arrangement) + return + }}, + USHL: {U: 0b1, Opcode: 0b01000, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size, Q = arrangementSizeQ(arrangement) + return + }}, +} + +// advancedSIMDAcrossLanes holds information to encode instructions as "Advanced SIMD across lanes" in +// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +var advancedSIMDAcrossLanes = map[asm.Instruction]struct { + U, Opcode byte + // TODO: extract common 
implementation of qAndSizeResolver. + qAndSizeResolver func(arrangement VectorArrangement) (Q, Size byte, err error) +}{ + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADDV--Add-across-Vector-?lang=en + ADDV: {U: 0b0, Opcode: 0b11011, qAndSizeResolver: func(arrangement VectorArrangement) (Q, Size byte, err error) { + switch arrangement { + case VectorArrangement16B: + Size, Q = 0b00, 0b1 + case VectorArrangement8B: + Size, Q = 0b00, 0b0 + case VectorArrangement8H: + Size, Q = 0b01, 0b1 + case VectorArrangement4H: + Size, Q = 0b01, 0b0 + case VectorArrangement4S: + Size, Q = 0b10, 0b1 + default: + err = fmt.Errorf("unsupported arrangement %s for %s", arrangement.String(), InstructionName(ADDV)) + } + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMINV--Unsigned-Minimum-across-Vector-?lang=en + UMINV: {U: 0b1, Opcode: 0b11010, qAndSizeResolver: func(arrangement VectorArrangement) (Q, Size byte, err error) { + switch arrangement { + case VectorArrangement16B: + Size, Q = 0b00, 0b1 + case VectorArrangement8B: + Size, Q = 0b00, 0b0 + case VectorArrangement8H: + Size, Q = 0b01, 0b1 + case VectorArrangement4H: + Size, Q = 0b01, 0b0 + case VectorArrangement4S: + Size, Q = 0b10, 0b1 + default: + err = fmt.Errorf("unsupported arrangement %s for %s", arrangement.String(), InstructionName(UMINV)) + } + return + }}, +} + +// advancedSIMDScalarPairwise holds information to encode instructions as "Advanced SIMD scalar pairwise" in +// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +var advancedSIMDScalarPairwise = map[asm.Instruction]struct { + U, Opcode byte + sizeResolver func(arrangement VectorArrangement) (Size byte) +}{ + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADDP--scalar---Add-Pair-of-elements--scalar--?lang=en + ADDP: {U: 0b0, Opcode: 0b11011, sizeResolver: 
func(arrangement VectorArrangement) (size byte) { + size = 0b11 + return + }}, +} + +// advancedSIMDCopy holds information to encode instructions as "Advanced SIMD copy" in +// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +var advancedSIMDCopy = map[asm.Instruction]struct { + op byte + // TODO: extract common implementation of resolver. + resolver func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) +}{ + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/DUP--element---Duplicate-vector-element-to-vector-or-scalar-?lang=en + DUPELEM: {op: 0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) { + imm4 = 0b0000 + q = 0b1 + + switch arr { + case VectorArrangementB: + imm5 |= 0b1 + imm5 |= byte(srcIndex) << 1 + case VectorArrangementH: + imm5 |= 0b10 + imm5 |= byte(srcIndex) << 2 + case VectorArrangementS: + imm5 |= 0b100 + imm5 |= byte(srcIndex) << 3 case VectorArrangementD: - opcode = 0b10111_0 - size, q = 0b11, 0b1 - // ADDP (scalar) https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADDP--scalar---Add-Pair-of-elements--scalar--?lang=en - rm = 0b10001 - op = 0b1 + imm5 |= 0b1000 + imm5 |= byte(srcIndex) << 4 + default: + err = fmt.Errorf("unsupported arrangement for DUPELEM: %d", arr) + } + + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/DUP--general---Duplicate-general-purpose-register-to-vector-?lang=en + DUPGEN: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) { + imm4 = 0b0001 + switch arr { + case VectorArrangement8B: + imm5 = 0b1 + case VectorArrangement16B: + imm5 = 0b1 + q = 0b1 + case VectorArrangement4H: + imm5 = 0b10 + case VectorArrangement8H: + imm5 = 0b10 + q = 0b1 + case VectorArrangement2S: + imm5 = 
0b100 + case VectorArrangement4S: + imm5 = 0b100 + q = 0b1 + case VectorArrangement2D: + imm5 = 0b1000 + q = 0b1 default: - // ADDP (vector) https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADDP--vector---Add-Pairwise--vector--?lang=en - opcode = 0b10111_1 - size, q = arrangementSizeQ(n.VectorArrangement) - rm = dstVectorRegBits - op = 0b0 + err = fmt.Errorf("unsupported arrangement for DUPGEN: %s", arr) + } + return + }}, + // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/INS--general---Insert-vector-element-from-general-purpose-register-?lang=en + INSGEN: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) { + imm4, q = 0b0011, 0b1 + switch arr { + case VectorArrangementB: + imm5 |= 0b1 + imm5 |= byte(dstIndex) << 1 + case VectorArrangementH: + imm5 |= 0b10 + imm5 |= byte(dstIndex) << 2 + case VectorArrangementS: + imm5 |= 0b100 + imm5 |= byte(dstIndex) << 3 + case VectorArrangementD: + imm5 |= 0b1000 + imm5 |= byte(dstIndex) << 4 + default: + err = fmt.Errorf("unsupported arrangement for INSGEN: %s", arr) + } + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMOV--Unsigned-Move-vector-element-to-general-purpose-register-?lang=en + UMOV: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) { + imm4 = 0b0111 + switch arr { + case VectorArrangementB: + imm5 |= 0b1 + imm5 |= byte(srcIndex) << 1 + case VectorArrangementH: + imm5 |= 0b10 + imm5 |= byte(srcIndex) << 2 + case VectorArrangementS: + imm5 |= 0b100 + imm5 |= byte(srcIndex) << 3 + case VectorArrangementD: + imm5 |= 0b1000 + imm5 |= byte(srcIndex) << 4 + q = 0b1 + default: + err = fmt.Errorf("unsupported arrangement for UMOV: %s", arr) + } + return + }}, + // 
https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMOV--Signed-Move-vector-element-to-general-purpose-register-?lang=en + SMOV32: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) { + imm4 = 0b0101 + switch arr { + case VectorArrangementB: + imm5 |= 0b1 + imm5 |= byte(srcIndex) << 1 + case VectorArrangementH: + imm5 |= 0b10 + imm5 |= byte(srcIndex) << 2 + default: + err = fmt.Errorf("unsupported arrangement for SMOV32: %s", arr) + } + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/INS--element---Insert-vector-element-from-another-vector-element-?lang=en + INSELEM: {op: 0b1, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) { + q = 0b1 + switch arr { + case VectorArrangementB: + imm5 |= 0b1 + imm5 |= byte(dstIndex) << 1 + imm4 = byte(srcIndex) + case VectorArrangementH: + imm5 |= 0b10 + imm5 |= byte(dstIndex) << 2 + imm4 = byte(srcIndex) << 1 + case VectorArrangementS: + imm5 |= 0b100 + imm5 |= byte(dstIndex) << 3 + imm4 = byte(srcIndex) << 2 + case VectorArrangementD: + imm5 |= 0b1000 + imm5 |= byte(dstIndex) << 4 + imm4 = byte(srcIndex) << 3 + default: + err = fmt.Errorf("unsupported arrangement for INSELEM: %d", arr) + } + return + }}, +} + +// advancedSIMDTableLookup holds information to encode instructions as "Advanced SIMD table lookup" in +// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +var advancedSIMDTableLookup = map[asm.Instruction]struct { + op, op2, Len byte + qResolver func(arr VectorArrangement) (q byte) +}{ + TBL1: {op: 0, op2: 0, Len: 0b00, qResolver: func(arr VectorArrangement) (q byte) { + switch arr { + case VectorArrangement16B: + q = 0b1 + case VectorArrangement8B: + q = 0b0 + } + return + }}, + TBL2: {op: 0, op2: 0, Len: 0b01, qResolver: func(arr 
VectorArrangement) (q byte) { + switch arr { + case VectorArrangement16B: + q = 0b1 + case VectorArrangement8B: + q = 0b0 + } + return + }}, +} + +// advancedSIMDScalarTwoRegisterMisc holds information to encode instructions as "Advanced SIMD scalar two-register miscellaneous" in +// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +var advancedSIMDScalarTwoRegisterMisc = map[asm.Instruction]struct { + U, opcode byte + qAndSizeResolver func(arr VectorArrangement) (q, size byte) +}{ + CMEQZERO: {U: 0b0, opcode: 0b01001, qAndSizeResolver: func(arr VectorArrangement) (q, size byte) { + size, q = arrangementSizeQ(arr) + return + }}, +} + +// advancedSIMDShiftByImmediate holds information to encode instructions as "Advanced SIMD shift by immediate" in +// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +var advancedSIMDShiftByImmediate = map[asm.Instruction]struct { + U, opcode byte + immQResolver func(shiftAmount int64, arr VectorArrangement) (immh, immb, q byte, err error) +}{ + // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SSHLL--SSHLL2--Signed-Shift-Left-Long--immediate-- + SSHLLIMM: {U: 0b0, opcode: 0b10100, immQResolver: immResolverForSIMDSiftLeftByImmediate}, + // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/USHLL--USHLL2--Unsigned-Shift-Left-Long--immediate-- + USHLLIMM: {U: 0b1, opcode: 0b10100, immQResolver: immResolverForSIMDSiftLeftByImmediate}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHR--Signed-Shift-Right--immediate--?lang=en + SSHR: {U: 0b0, opcode: 0b00000, immQResolver: func(shiftAmount int64, arr VectorArrangement) (immh, immb, q byte, err error) { + // TODO: + switch arr { + case VectorArrangement16B, VectorArrangement8B: + immh = 0b0001 + immb = 8 - 
byte(shiftAmount&0b111) + if arr == VectorArrangement16B { + q = 1 + } + case VectorArrangement8H, VectorArrangement4H: + v := 16 - byte(shiftAmount&0b1111) + immb = v & 0b111 + immh = 0b0010 | (v >> 3) + if arr == VectorArrangement8H { + q = 1 + } + case VectorArrangement4S, VectorArrangement2S: + v := 32 - byte(shiftAmount&0b11111) + immb = v & 0b111 + immh = 0b0100 | (v >> 3) + if arr == VectorArrangement4S { + q = 1 + } + case VectorArrangement2D: + v := 64 - byte(shiftAmount&0b111111) + immb = v & 0b111 + immh = 0b1000 | (v >> 3) + q = 1 + default: + err = fmt.Errorf("unsupported arrangement %s", arr) + } + return + }}, +} + +// advancedSIMDPermute holds information to encode instructions as "Advanced SIMD permute" in +// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +var advancedSIMDPermute = map[asm.Instruction]struct { + opcode byte +}{ + ZIP1: {opcode: 0b011}, +} + +func immResolverForSIMDSiftLeftByImmediate(shiftAmount int64, arr VectorArrangement) (immh, immb, q byte, err error) { + q = 0b0 + + switch arr { + case VectorArrangement8B: + immb = byte(shiftAmount) + immh = 0b0001 + case VectorArrangement4H: + immb = byte(shiftAmount) & 0b111 + immh = 0b0010 | byte(shiftAmount>>3) + case VectorArrangement2S: + immb = byte(shiftAmount) & 0b111 + immh = 0b0100 | byte(shiftAmount>>3) + default: + err = fmt.Errorf("unsupported arrangement %s", arr) + } + return +} + +// encodeAdvancedSIMDCopy encodes instruction as "Advanced SIMD copy" in +// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func (a *AssemblerImpl) encodeAdvancedSIMDCopy(srcRegBits, dstRegBits, op, imm5, imm4, q byte) { + a.Buf.Write([]byte{ + (srcRegBits << 5) | dstRegBits, + imm4<<3 | 0b1<<2 | srcRegBits>>3, + imm5, + q<<6 | op<<5 | 0b1110, + }) +} + +// encodeAdvancedSIMDThreeSame encodes instruction
as "Advanced SIMD three same" in +// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func (a *AssemblerImpl) encodeAdvancedSIMDThreeSame(src1, src2, dst, opcode, size, q, u byte) { + a.Buf.Write([]byte{ + (src2 << 5) | dst, + opcode<<3 | 1<<2 | src2>>3, + size<<6 | 0b1<<5 | src1, + q<<6 | u<<5 | 0b1110, + }) +} + +// encodeAdvancedSIMDPermute encodes instruction as "Advanced SIMD permute" in +// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func (a *AssemblerImpl) encodeAdvancedSIMDPermute(src1, src2, dst, opcode, size, q byte) { + a.Buf.Write([]byte{ + (src2 << 5) | dst, + opcode<<4 | 0b1<<3 | src2>>3, + size<<6 | src1, + q<<6 | 0b1110, + }) +} + +func (a *AssemblerImpl) EncodeVectorRegisterToVectorRegister(n *NodeImpl) (err error) { + var srcVectorRegBits byte + if n.SrcReg != RegRZR { + srcVectorRegBits, err = vectorRegisterBits(n.SrcReg) + if err != nil { + return err + } + } + + dstVectorRegBits, err := vectorRegisterBits(n.DstReg) + if err != nil { + return err + } + + if simdCopy, ok := advancedSIMDCopy[n.Instruction]; ok { + imm5, imm4, q, err := simdCopy.resolver(n.SrcVectorIndex, n.DstVectorIndex, n.VectorArrangement) + if err != nil { + return err } + a.encodeAdvancedSIMDCopy(srcVectorRegBits, dstVectorRegBits, simdCopy.op, imm5, imm4, q) + return nil + } + + if scalarPairwise, ok := advancedSIMDScalarPairwise[n.Instruction]; ok { + // See "Advanced SIMD scalar pairwise" in + // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en + size := scalarPairwise.sizeResolver(n.VectorArrangement) a.Buf.Write([]byte{ (srcVectorRegBits << 5) | dstVectorRegBits, - opcode<<2 | srcVectorRegBits>>3, - size<<6 | 0b1<<5 | rm, - q<<6 | op<<4 | 0b01110, + scalarPairwise.Opcode<<4 | 1<<3 | 
srcVectorRegBits>>3, + size<<6 | 0b11<<4 | scalarPairwise.Opcode>>4, + 0b1<<6 | scalarPairwise.U<<5 | 0b11110, }) - case UMAXP: - // "Advanced SIMD three same" in https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en - var opcode, u byte - switch n.Instruction { - case UMAXP: - // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/UMAXP--Unsigned-Maximum-Pairwise- - opcode, u = 0b10100, 0b1 + return + } + + if twoRegMisc, ok := advancedSIMDTwoRegisterMisc[n.Instruction]; ok { + // See "Advanced SIMD two-register miscellaneous" in + // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en + q, size, err := twoRegMisc.qAndSizeResolver(n.VectorArrangement) + if err != nil { + return err } - var size, q byte = arrangementSizeQ(n.VectorArrangement) a.Buf.Write([]byte{ (srcVectorRegBits << 5) | dstVectorRegBits, - opcode<<3 | 0b1<<2 | srcVectorRegBits>>3, - size<<6 | 0b1<<5 | dstVectorRegBits, - q<<6 | u<<5 | 0b01110, + twoRegMisc.Opcode<<4 | 0b1<<3 | srcVectorRegBits>>3, + size<<6 | 0b1<<5 | twoRegMisc.Opcode>>4, + q<<6 | twoRegMisc.U<<5 | 0b01110, }) - case UMINV: - // "Advanced SIMD across lanes" in https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en - var opcode, u byte = 0b11010, 0b1 - var size, q byte = arrangementSizeQ(n.VectorArrangement) + return nil + } + + if threeSame, ok := advancedSIMDThreeSame[n.Instruction]; ok { + q, size, err := threeSame.qAndSizeResolver(n.VectorArrangement) + if err != nil { + return err + } + a.encodeAdvancedSIMDThreeSame(srcVectorRegBits, dstVectorRegBits, dstVectorRegBits, threeSame.Opcode, size, q, threeSame.U) + return nil + } + if acrossLanes, ok := advancedSIMDAcrossLanes[n.Instruction]; ok { + // See "Advanced SIMD across lanes" in + // 
https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en + q, size, err := acrossLanes.qAndSizeResolver(n.VectorArrangement) + if err != nil { + return err + } a.Buf.Write([]byte{ (srcVectorRegBits << 5) | dstVectorRegBits, - opcode<<4 | 0b1<<3 | srcVectorRegBits>>3, - size<<6 | 0b11000<<1 | opcode>>4, - q<<6 | u<<5 | 0b01110, + acrossLanes.Opcode<<4 | 0b1<<3 | srcVectorRegBits>>3, + size<<6 | 0b11000<<1 | acrossLanes.Opcode>>4, + q<<6 | acrossLanes.U<<5 | 0b01110, }) - case CMEQ: - const size byte = 0b11 - if n.SrcReg == RegRZR { - // CMEQ (zero, vector) - // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMEQ--zero---Compare-bitwise-Equal-to-zero--vector--?lang=en - a.Buf.Write([]byte{ - (dstVectorRegBits << 5) | dstVectorRegBits, - 0b100110<<2 | dstVectorRegBits>>3, - size<<6 | 0b1<<5, - 0b01001110, - }) - } else { - // CMEQ (register, vector) - // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMEQ--register---Compare-bitwise-Equal--vector--?lang=en - a.Buf.Write([]byte{ - (srcVectorRegBits << 5) | dstVectorRegBits, - 0b100011<<2 | srcVectorRegBits>>3, - size<<6 | 0b1<<5 | dstVectorRegBits, - 0b01101110, - }) + return nil + } + + if lookup, ok := advancedSIMDTableLookup[n.Instruction]; ok { + q := lookup.qResolver(n.VectorArrangement) + + a.Buf.Write([]byte{ + (srcVectorRegBits << 5) | dstVectorRegBits, + lookup.Len<<5 | lookup.op<<4 | srcVectorRegBits>>3, + lookup.op2<<6 | dstVectorRegBits, + q<<6 | 0b1110, + }) + return + } + + if scalaTwoMisc, ok := advancedSIMDScalarTwoRegisterMisc[n.Instruction]; ok { + q, size := scalaTwoMisc.qAndSizeResolver(n.VectorArrangement) + a.Buf.Write([]byte{ + (dstVectorRegBits << 5) | dstVectorRegBits, + 0b100110<<2 | dstVectorRegBits>>3, + size<<6 | 0b1<<5, + q<<6 | scalaTwoMisc.U<<5 | 0b01001110, + }) + return + } + + if shiftByImmediate, ok := 
advancedSIMDShiftByImmediate[n.Instruction]; ok { + immh, immb, q, err := shiftByImmediate.immQResolver(n.SrcConst, n.VectorArrangement) + if err != nil { + return err } + a.Buf.Write([]byte{ + (srcVectorRegBits << 5) | dstVectorRegBits, + shiftByImmediate.opcode<<3 | 0b1<<2 | srcVectorRegBits>>3, + immh<<3 | immb, + q<<6 | shiftByImmediate.U<<5 | 0b1111, + }) + return nil + } - case TBL1, TBL2: - // Interpret dstVectorRegBits as the index register (`Rm` in the doc) - // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/TBL--Table-vector-Lookup-?lang=en + if permute, ok := advancedSIMDPermute[n.Instruction]; ok { + size, q := arrangementSizeQ(n.VectorArrangement) + a.encodeAdvancedSIMDPermute(srcVectorRegBits, dstVectorRegBits, dstVectorRegBits, permute.opcode, size, q) + return + } + return errorEncodingUnsupported(n) +} - var l byte // `len` in the doc. - switch n.Instruction { - case TBL1: - l = 0b00 - case TBL2: - l = 0b01 +func (a *AssemblerImpl) encodeTwoVectorRegistersToVectorRegister(n *NodeImpl) (err error) { + var srcRegBits, srcRegBits2, dstRegBits byte + srcRegBits, err = vectorRegisterBits(n.SrcReg) + if err != nil { + return err + } + + srcRegBits2, err = vectorRegisterBits(n.SrcReg2) + if err != nil { + return err + } + + dstRegBits, err = vectorRegisterBits(n.DstReg) + if err != nil { + return err + } + + if threeSame, ok := advancedSIMDThreeSame[n.Instruction]; ok { + q, size, err := threeSame.qAndSizeResolver(n.VectorArrangement) + if err != nil { + return err } + a.encodeAdvancedSIMDThreeSame(srcRegBits, srcRegBits2, dstRegBits, threeSame.Opcode, size, q, threeSame.U) + return nil + } + + if permute, ok := advancedSIMDPermute[n.Instruction]; ok { + size, q := arrangementSizeQ(n.VectorArrangement) + a.encodeAdvancedSIMDPermute(srcRegBits, srcRegBits2, dstRegBits, permute.opcode, size, q) + return + } - var q byte + if n.Instruction == EXT { + // EXT is the only instruction in "Advanced SIMD extract", so inline the 
encoding here. + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/EXT--Extract-vector-from-pair-of-vectors-?lang=en + var q, imm4 byte switch n.VectorArrangement { case VectorArrangement16B: + imm4 = 0b1111 & byte(n.SrcConst) q = 0b1 case VectorArrangement8B: - q = 0b0 + imm4 = 0b111 & byte(n.SrcConst) + default: + return fmt.Errorf("invalid arrangement %s for EXT", n.VectorArrangement) } - a.Buf.Write([]byte{ - (srcVectorRegBits << 5) | dstVectorRegBits, - l<<5 | srcVectorRegBits>>3, - dstVectorRegBits, - q<<6 | 0b1110, + (srcRegBits2 << 5) | dstRegBits, + imm4<<3 | srcRegBits2>>3, + srcRegBits, + q<<6 | 0b101110, }) - default: - return errorEncodingUnsupported(n) + return + } + return errorEncodingUnsupported(n) +} + +func (a *AssemblerImpl) EncodeVectorRegisterToRegister(n *NodeImpl) (err error) { + if err = checkArrangementIndexPair(n.VectorArrangement, n.SrcVectorIndex); err != nil { + return + } + + srcVecRegBits, err := vectorRegisterBits(n.SrcReg) + if err != nil { + return err + } + + dstRegBits, err := intRegisterBits(n.DstReg) + if err != nil { + return err } - return nil + if simdCopy, ok := advancedSIMDCopy[n.Instruction]; ok { + imm5, imm4, q, err := simdCopy.resolver(n.SrcVectorIndex, n.DstVectorIndex, n.VectorArrangement) + if err != nil { + return err + } + a.encodeAdvancedSIMDCopy(srcVecRegBits, dstRegBits, simdCopy.op, imm5, imm4, q) + return nil + } + return errorEncodingUnsupported(n) +} + +func (a *AssemblerImpl) EncodeRegisterToVectorRegister(n *NodeImpl) (err error) { + srcRegBits, err := intRegisterBits(n.SrcReg) + if err != nil { + return err + } + + dstVectorRegBits, err := vectorRegisterBits(n.DstReg) + if err != nil { + return err + } + + if simdCopy, ok := advancedSIMDCopy[n.Instruction]; ok { + imm5, imm4, q, err := simdCopy.resolver(n.SrcVectorIndex, n.DstVectorIndex, n.VectorArrangement) + if err != nil { + return err + } + a.encodeAdvancedSIMDCopy(srcRegBits, dstVectorRegBits, simdCopy.op, imm5, imm4, q) + return nil + } + 
return errorEncodingUnsupported(n) } var zeroRegisterBits byte = 0b11111 diff --git a/internal/asm/arm64/impl_test.go b/internal/asm/arm64/impl_test.go index 26c49abea1..3d36195e3c 100644 --- a/internal/asm/arm64/impl_test.go +++ b/internal/asm/arm64/impl_test.go @@ -37,16 +37,16 @@ func TestNodeImpl_String(t *testing.T) { exp: "NOP", }, { - in: &NodeImpl{Instruction: BEQ, Types: OperandTypesNoneToRegister, DstReg: RegR1}, - exp: "BEQ R1", + in: &NodeImpl{Instruction: BCONDEQ, Types: OperandTypesNoneToRegister, DstReg: RegR1}, + exp: "BCONDEQ R1", }, { - in: &NodeImpl{Instruction: BNE, Types: OperandTypesNoneToMemory, DstReg: RegR1, DstConst: 0x1234}, - exp: "BNE [R1 + 0x1234]", + in: &NodeImpl{Instruction: BCONDNE, Types: OperandTypesNoneToMemory, DstReg: RegR1, DstConst: 0x1234}, + exp: "BCONDNE [R1 + 0x1234]", }, { - in: &NodeImpl{Instruction: BNE, Types: OperandTypesNoneToBranch, JumpTarget: &NodeImpl{Instruction: NOP}}, - exp: "BNE {NOP}", + in: &NodeImpl{Instruction: BCONDNE, Types: OperandTypesNoneToBranch, JumpTarget: &NodeImpl{Instruction: NOP}}, + exp: "BCONDNE {NOP}", }, { in: &NodeImpl{Instruction: ADD, Types: OperandTypesRegisterToRegister, SrcReg: RegV0, DstReg: RegV10}, @@ -229,9 +229,9 @@ func TestAssemblerImpl_CompileJump(t *testing.T) { func TestAssemblerImpl_CompileJumpToRegister(t *testing.T) { a := NewAssemblerImpl(RegR10) - a.CompileJumpToRegister(BNE, RegR27) + a.CompileJumpToRegister(BCONDNE, RegR27) actualNode := a.Current - require.Equal(t, BNE, actualNode.Instruction) + require.Equal(t, BCONDNE, actualNode.Instruction) require.Equal(t, RegR27, actualNode.DstReg) require.Equal(t, OperandTypeNone, actualNode.Types.src) require.Equal(t, OperandTypeRegister, actualNode.Types.dst) @@ -239,9 +239,9 @@ func TestAssemblerImpl_CompileJumpToRegister(t *testing.T) { func TestAssemblerImpl_CompileJumpToMemory(t *testing.T) { a := NewAssemblerImpl(RegR10) - a.CompileJumpToMemory(BNE, RegR27) + a.CompileJumpToMemory(BCONDNE, RegR27) actualNode := 
a.Current - require.Equal(t, BNE, actualNode.Instruction) + require.Equal(t, BCONDNE, actualNode.Instruction) require.Equal(t, RegR27, actualNode.DstReg) require.Equal(t, OperandTypeNone, actualNode.Types.src) require.Equal(t, OperandTypeMemory, actualNode.Types.dst) @@ -453,6 +453,19 @@ func Test_CompileVectorRegisterToVectorRegister(t *testing.T) { require.Equal(t, VectorIndex(2), actualNode.DstVectorIndex) } +func Test_CompileTwoVectorRegistersToVectorRegister(t *testing.T) { + a := NewAssemblerImpl(RegR10) + a.CompileTwoVectorRegistersToVectorRegister(VMOV, RegV3, RegV15, RegV10, VectorArrangement1D) + actualNode := a.Current + require.Equal(t, VMOV, actualNode.Instruction) + require.Equal(t, RegV3, actualNode.SrcReg) + require.Equal(t, RegV15, actualNode.SrcReg2) + require.Equal(t, RegV10, actualNode.DstReg) + require.Equal(t, OperandTypeTwoVectorRegisters, actualNode.Types.src) + require.Equal(t, OperandTypeVectorRegister, actualNode.Types.dst) + require.Equal(t, VectorArrangement1D, actualNode.VectorArrangement) +} + func Test_checkRegisterToRegisterType(t *testing.T) { tests := []struct { src, dst asm.Register @@ -876,70 +889,257 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { srcIndex, dstIndex VectorIndex exp []byte }{ - // These are not supported in golang-asm, so test it here instead of integration tests. 
{ + inst: ZIP1, + name: "zip1 v10.16b, v10.16b, v2.16b", + x1: RegV2, + x2: RegV10, + arr: VectorArrangement16B, + exp: []byte{0x4a, 0x39, 0x2, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + inst: ADDV, + name: "addv b10, v2.16b", + x1: RegV2, + x2: RegV10, + arr: VectorArrangement16B, + exp: []byte{0x4a, 0xb8, 0x31, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + inst: VORR, + name: "orr v10.16b, v10.16b, v2.16b", + x1: RegV2, + x2: RegV10, + arr: VectorArrangement16B, + exp: []byte{0x4a, 0x1d, 0xa2, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + inst: VORR, + name: "orr v10.8b, v10.8b, v2.8b", + x1: RegV2, + x2: RegV10, + arr: VectorArrangement8B, + exp: []byte{0x4a, 0x1d, 0xa2, 0xe, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "fadd v10.2d, v10.2d, v2.2d", x1: RegV2, x2: RegV10, inst: VFADDD, - exp: []byte{ - 0x4a, 0xd4, 0x6a, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, - }, + arr: VectorArrangement2D, + exp: []byte{0x4a, 0xd5, 0x62, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, }, { + name: "fadd v10.4s, v10.4s, v2.4s", x1: RegV2, x2: RegV10, inst: VFADDS, - exp: []byte{ - 0x4a, 0xd4, 0x2a, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, - }, + arr: VectorArrangement4S, + exp: []byte{0x4a, 0xd5, 0x22, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, }, { + name: "fsub v10.2d, v10.2d, v2.2d", x1: RegV2, x2: RegV10, inst: VFSUBD, - exp: []byte{ - 0x4a, 0xd4, 0xea, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, - }, + arr: VectorArrangement2D, + exp: []byte{0x4a, 0xd5, 0xe2, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, }, { + name: "fsub v10.4s, v10.4s, v2.4s", x1: RegV2, x2: RegV10, inst: VFSUBS, - exp: []byte{ - 0x4a, 0xd4, 0xaa, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, - }, + arr: 
VectorArrangement4S, + exp: []byte{0x4a, 0xd5, 0xa2, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "ushll v10.8h, v2.8b, #0", + x1: RegV2, + x2: RegV10, + inst: USHLLIMM, + exp: []byte{0x4a, 0xa4, 0x8, 0x2f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement8B, + }, + { + name: "ushll v10.8h, v2.8b, #7", + x1: RegV2, + x2: RegV10, + inst: USHLLIMM, + exp: []byte{0x4a, 0xa4, 0xf, 0x2f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement8B, + c: 7, + }, + { + name: "sshr v10.16b, v2.16b, #8", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0x8, 0x4f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement16B, + c: 8, + }, + { + name: "sshr v10.16b, v2.16b, #3", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0xd, 0x4f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement16B, + c: 3, + }, + { + name: "sshr v10.16b, v2.16b, #1", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0xf, 0x4f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement16B, + c: 1, + }, + { + name: "sshr v10.8b, v2.8b, #3", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0xd, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement8B, + c: 3, + }, + { + name: "sshr v10.8h, v2.8h, #0x10", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0x10, 0x4f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement8H, + c: 16, + }, + { + name: "sshr v10.8h, v2.8h, #0xf", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0x11, 0x4f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement8H, + c: 15, + }, + { + name: "sshr v10.8h, v2.8h, #3", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0x1d, 0x4f, 0x0, 0x0, 
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement8H, + c: 3, + }, + { + name: "sshr v10.4h, v2.4h, #0xf", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0x11, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement4H, + c: 15, + }, + { + name: "sshr v10.2s, v2.2s, #0x20", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0x20, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement2S, + c: 32, + }, + { + name: "sshr v10.2s, v2.2s, #0x1f", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0x21, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement2S, + c: 31, + }, + { + name: "sshr v10.2s, v2.2s, #7", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0x39, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement2S, + c: 7, }, { + name: "sshr v10.4s, v2.4s, #7", x1: RegV2, x2: RegV10, - inst: SSHLL, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0x39, 0x4f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement4S, + c: 7, + }, + { + name: "sshr v10.2d, v2.2d, #0x3f", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0x41, 0x4f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement2D, + c: 63, + }, + { + name: "sshr v10.2d, v2.2d, #0x21", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0x5f, 0x4f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement2D, + c: 33, + }, + { + name: "sshr v10.2d, v2.2d, #1", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0x7f, 0x4f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement2D, + c: 1, + }, + { + name: "sshll v10.8h, v2.8b, #0", + x1: RegV2, + x2: RegV10, + inst: SSHLLIMM, exp: []byte{ 0x4a, 0xa4, 0x8, 0xf, 0x0, 0x0, 0x0, 0x0, 
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, }, arr: VectorArrangement8B, }, { + name: "sshll v10.8h, v2.8b, #7", x1: RegV2, x2: RegV10, - inst: SSHLL, exp: []byte{ + inst: SSHLLIMM, exp: []byte{ 0x4a, 0xa4, 0xf, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, }, arr: VectorArrangement8B, c: 7, }, { + name: "sshll v10.4s, v2.4h, #0", x1: RegV2, x2: RegV10, - inst: SSHLL, + inst: SSHLLIMM, exp: []byte{ 0x4a, 0xa4, 0x10, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, }, arr: VectorArrangement4H, }, { + name: "sshll v10.4s, v2.4h, #0xf", x1: RegV2, x2: RegV10, - inst: SSHLL, + inst: SSHLLIMM, exp: []byte{ 0x4a, 0xa4, 0x1f, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, }, @@ -947,18 +1147,20 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { c: 15, }, { + name: "sshll v10.2d, v2.2s, #0", x1: RegV2, x2: RegV10, - inst: SSHLL, + inst: SSHLLIMM, exp: []byte{ 0x4a, 0xa4, 0x20, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, }, arr: VectorArrangement2S, }, { + name: "sshll v10.2d, v2.2s, #0x1f", x1: RegV2, x2: RegV10, - inst: SSHLL, + inst: SSHLLIMM, exp: []byte{ 0x4a, 0xa4, 0x3f, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, }, @@ -969,7 +1171,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { x1: RegV2, x2: RegV10, name: "ins v10.s[2], v2.s[1]", - inst: VMOV, + inst: INSELEM, exp: []byte{0x4a, 0x24, 0x14, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangementS, srcIndex: 1, @@ -979,7 +1181,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { x1: RegV2, x2: RegV10, name: "ins v10.s[0], v2.s[3]", - inst: VMOV, + inst: INSELEM, exp: []byte{0x4a, 0x64, 0x4, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangementS, srcIndex: 3, @@ -989,7 +1191,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { x1: RegV2, x2: 
RegV10, name: "ins v10.b[0], v2.b[0xf]", - inst: VMOV, + inst: INSELEM, exp: []byte{0x4a, 0x7c, 0x1, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangementB, srcIndex: 15, @@ -999,7 +1201,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { x1: RegV2, x2: RegV10, name: "ins v10.d[1], v2.d[0]", - inst: VMOV, + inst: INSELEM, exp: []byte{0x4a, 0x4, 0x18, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangementD, srcIndex: 0, @@ -1009,7 +1211,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { x1: RegV2, x2: RegV10, name: "dup v10.2d, v2.d[0]", - inst: DUP, + inst: DUPELEM, exp: []byte{0x4a, 0x4, 0x8, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangementD, srcIndex: 0, @@ -1018,7 +1220,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { x1: RegV2, x2: RegV10, name: "dup v10.2d, v2.d[1]", - inst: DUP, + inst: DUPELEM, exp: []byte{0x4a, 0x4, 0x18, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangementD, srcIndex: 1, @@ -1027,7 +1229,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { x1: RegV2, x2: RegV10, name: "dup v10.4s, v2.s[3]", - inst: DUP, + inst: DUPELEM, exp: []byte{0x4a, 0x4, 0x1c, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangementS, srcIndex: 3, @@ -1036,7 +1238,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { x1: RegV2, x2: RegV10, name: "dup v10.8h, v2.h[7]", - inst: DUP, + inst: DUPELEM, exp: []byte{0x4a, 0x4, 0x1e, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangementH, srcIndex: 7, @@ -1045,7 +1247,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { x1: RegV2, x2: RegV10, name: "dup v10.16b, v2.b[0xf]", - inst: DUP, + inst: DUPELEM, exp: []byte{0x4a, 0x4, 0x1f, 0x4e, 
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangementB, srcIndex: 15, @@ -1053,25 +1255,25 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { { x1: RegV2, x2: RegV10, - name: "umaxp v10.16b, v2.16b, v10.16b", + name: "umaxp v10.16b, v10.16b, v2.16b", inst: UMAXP, - exp: []byte{0x4a, 0xa4, 0x2a, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + exp: []byte{0x4a, 0xa5, 0x22, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangement16B, }, { x1: RegV2, x2: RegV10, - name: "umaxp v10.8h, v2.8h, v10.8h", + name: "umaxp v10.8h, v10.8h, v2.8h", inst: UMAXP, - exp: []byte{0x4a, 0xa4, 0x6a, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + exp: []byte{0x4a, 0xa5, 0x62, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangement8H, }, { x1: RegV2, x2: RegV10, - name: "umaxp v10.4s, v2.8h, v10.4s", + name: "umaxp v10.4s, v10.4s, v2.4s", inst: UMAXP, - exp: []byte{0x4a, 0xa4, 0xaa, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + exp: []byte{0x4a, 0xa5, 0xa2, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangement4S, }, { @@ -1080,30 +1282,29 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { name: "addp d11, v11.2d", inst: ADDP, exp: []byte{0x6b, 0xb9, 0xf1, 0x5e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, - arr: VectorArrangementD, }, { x1: RegV2, x2: RegV10, - name: "addp v10.16b, v2.16b, v10.16b", - inst: ADDP, - exp: []byte{0x4a, 0xbc, 0x2a, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + name: "addp v10.16b, v10.16b, v2.16b", + inst: VADDP, + exp: []byte{0x4a, 0xbd, 0x22, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangement16B, }, { x1: RegV2, x2: RegV10, - name: "addp v10.8h, v2.8h, v10.8h", - inst: ADDP, - exp: []byte{0x4a, 0xbc, 0x6a, 0x4e, 0x0, 0x0, 0x0, 
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + name: "addp v10.8h, v10.8h, v2.8h", + inst: VADDP, + exp: []byte{0x4a, 0xbd, 0x62, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangement8H, }, { x1: RegV2, x2: RegV10, - name: "addp v10.4s, v2.8h, v10.4s", - inst: ADDP, - exp: []byte{0x4a, 0xbc, 0xaa, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + name: "addp v10.4s, v10.4s, v2.4s", + inst: VADDP, + exp: []byte{0x4a, 0xbd, 0xa2, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangement4S, }, { @@ -1133,15 +1334,17 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { { x1: RegV2, x2: RegV10, - name: "cmeq v10.2d, v2.2d, v10.2d", + name: "cmeq v10.2d, v10.2d, v2.2d", + arr: VectorArrangement2D, inst: CMEQ, - exp: []byte{0x4a, 0x8c, 0xea, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + exp: []byte{0x4a, 0x8d, 0xe2, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, }, { x1: RegRZR, x2: RegV30, name: "cmeq v30.2d, v30.2d, #0", - inst: CMEQ, + inst: CMEQZERO, + arr: VectorArrangement2D, exp: []byte{0xde, 0x9b, 0xe0, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, }, { @@ -1176,6 +1379,135 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { arr: VectorArrangement16B, exp: []byte{0xe1, 0x23, 0x1, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, }, + { + x1: RegV2, + x2: RegV10, + name: "add v10.4s, v10.4s, v2.4s", + inst: VADD, + exp: []byte{0x4a, 0x85, 0xa2, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement4S, + }, + { + x1: RegV2, + x2: RegV10, + name: "add v10.2d, v10.2d, v2.2d", + inst: VADD, + exp: []byte{0x4a, 0x85, 0xe2, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement2D, + }, + { + x1: RegV2, + x2: RegV10, + name: "sub v10.8h, v10.8h, v2.8h", + inst: VSUB, + exp: 
[]byte{0x4a, 0x85, 0x62, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement8H, + }, + { + x1: RegV29, + x2: RegV30, + name: "sub v30.16b, v30.16b, v29.16b", + inst: VSUB, + exp: []byte{0xde, 0x87, 0x3d, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement16B, + }, + { + name: "bic v10.16b, v10.16b, v2.16b", + x1: RegV2, + x2: RegV10, + inst: BIC, + arr: VectorArrangement16B, + exp: []byte{0x4a, 0x1d, 0x62, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "eor v10.16b, v10.16b, v2.16b", + x1: RegV2, + x2: RegV10, + inst: EOR, + arr: VectorArrangement16B, + exp: []byte{0x4a, 0x1d, 0x22, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "bsl v10.16b, v10.16b, v2.16b", + x1: RegV2, + x2: RegV10, + inst: BSL, + arr: VectorArrangement16B, + exp: []byte{0x4a, 0x1d, 0x62, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "bsl v10.16b, v10.16b, v2.16b", + x1: RegV2, + x2: RegV10, + inst: BSL, + arr: VectorArrangement16B, + exp: []byte{0x4a, 0x1d, 0x62, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "and v10.16b, v10.16b, v2.16b", + x1: RegV2, + x2: RegV10, + inst: VAND, + arr: VectorArrangement16B, + exp: []byte{0x4a, 0x1d, 0x22, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + // mvn is an alias of NOT: https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MVN--Bitwise-NOT--vector---an-alias-of-NOT-?lang=en + name: "mvn v10.16b, v2.16b", + x1: RegV2, + x2: RegV10, + inst: NOT, + arr: VectorArrangement16B, + exp: []byte{0x4a, 0x58, 0x20, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "fneg v10.2d, v2.2d", + x1: RegV2, + x2: RegV10, + inst: VFNEG, + arr: VectorArrangement2D, + exp: []byte{0x4a, 0xf8, 0xe0, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, 
+ }, + { + name: "fneg v10.4s, v2.4s", + x1: RegV2, + x2: RegV10, + inst: VFNEG, + arr: VectorArrangement4S, + exp: []byte{0x4a, 0xf8, 0xa0, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + x1: RegV2, + x2: RegV10, + name: "sshl v10.2d, v10.2d, v2.2d", + inst: SSHL, + exp: []byte{0x4a, 0x45, 0xe2, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement2D, + }, + { + x1: RegV25, + x2: RegV30, + name: "sshl v30.4s, v30.4s, v25.4s", + inst: SSHL, + exp: []byte{0xde, 0x47, 0xb9, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement4S, + }, + { + x1: RegV2, + x2: RegV10, + name: "ushl v10.8h, v10.8h, v2.8h", + inst: USHL, + exp: []byte{0x4a, 0x45, 0x62, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement8H, + }, + { + x1: RegV25, + x2: RegV30, + name: "ushl v30.16b, v30.16b, v25.16b", + inst: USHL, + exp: []byte{0xde, 0x47, 0x39, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement16B, + }, } for _, tt := range tests { @@ -1207,10 +1539,43 @@ func TestAssemblerImpl_EncodeVectorRegisterToRegister(t *testing.T) { exp []byte }{ // These are not supported in golang-asm, so test it here instead of integration tests. 
+ { + name: "umov w10, v0.b[0xf]", + n: &NodeImpl{ + Instruction: UMOV, + SrcReg: RegV0, + DstReg: RegR10, + VectorArrangement: VectorArrangementB, + SrcVectorIndex: 15, + }, + exp: []byte{0xa, 0x3c, 0x1f, 0xe, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "mov w10, v0.s[3]", + n: &NodeImpl{ + Instruction: UMOV, + SrcReg: RegV0, + DstReg: RegR10, + VectorArrangement: VectorArrangementS, + SrcVectorIndex: 3, + }, + exp: []byte{0xa, 0x3c, 0x1c, 0xe, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "mov x5, v30.d[1]", + n: &NodeImpl{ + Instruction: UMOV, + SrcReg: RegV30, + DstReg: RegR5, + VectorArrangement: VectorArrangementD, + SrcVectorIndex: 1, + }, + exp: []byte{0xc5, 0x3f, 0x18, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, { name: "smov w10, v0.b[0xf]", n: &NodeImpl{ - Instruction: SMOV, + Instruction: SMOV32, SrcReg: RegV0, DstReg: RegR10, VectorArrangement: VectorArrangementB, @@ -1221,7 +1586,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToRegister(t *testing.T) { { name: "smov w10, v0.b[0]", n: &NodeImpl{ - Instruction: SMOV, + Instruction: SMOV32, SrcReg: RegV0, DstReg: RegR10, VectorArrangement: VectorArrangementB, @@ -1232,7 +1597,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToRegister(t *testing.T) { { name: "smov w1, v30.h[7]", n: &NodeImpl{ - Instruction: SMOV, + Instruction: SMOV32, SrcReg: RegV30, DstReg: RegR1, VectorArrangement: VectorArrangementH, @@ -1243,7 +1608,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToRegister(t *testing.T) { { name: "smov w1, v30.h[0]", n: &NodeImpl{ - Instruction: SMOV, + Instruction: SMOV32, SrcReg: RegV30, DstReg: RegR1, VectorArrangement: VectorArrangementH, @@ -1267,6 +1632,253 @@ func TestAssemblerImpl_EncodeVectorRegisterToRegister(t *testing.T) { } } +func TestAssemblerImpl_encodeTwoVectorRegistersToVectorRegister(t *testing.T) { + tests := []struct { + name string + n *NodeImpl + exp []byte + }{ + { + name: "orr 
v30.16b, v10.16b, v1.16b", + n: &NodeImpl{ + Instruction: VORR, + DstReg: RegV30, + SrcReg: RegV1, + SrcReg2: RegV10, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0x5e, 0x1d, 0xa1, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "orr v30.8b, v10.8b, v1.8b", + n: &NodeImpl{ + Instruction: VORR, + DstReg: RegV30, + SrcReg: RegV1, + SrcReg2: RegV10, + VectorArrangement: VectorArrangement8B, + }, + exp: []byte{0x5e, 0x1d, 0xa1, 0xe, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "bsl v0.8b, v15.8b, v1.8b", + n: &NodeImpl{ + Instruction: BSL, + DstReg: RegV0, + SrcReg: RegV1, + SrcReg2: RegV15, + VectorArrangement: VectorArrangement8B, + }, + exp: []byte{0xe0, 0x1d, 0x61, 0x2e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "zip1 v0.4s, v15.4s, v1.4s", + n: &NodeImpl{ + Instruction: ZIP1, + DstReg: RegV0, + SrcReg: RegV1, + SrcReg2: RegV15, + VectorArrangement: VectorArrangement4S, + }, + exp: []byte{0xe0, 0x39, 0x81, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "zip1 v0.2d, v15.2d, v1.2d", + n: &NodeImpl{ + Instruction: ZIP1, + DstReg: RegV0, + SrcReg: RegV1, + SrcReg2: RegV15, + VectorArrangement: VectorArrangement2D, + }, + exp: []byte{0xe0, 0x39, 0xc1, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "ext v0.16b, v15.16b, v1.16b, #0xf", + n: &NodeImpl{ + Instruction: EXT, + DstReg: RegV0, + SrcReg: RegV1, + SrcReg2: RegV15, + SrcConst: 0xf, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xe0, 0x79, 0x1, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "ext v0.16b, v15.16b, v1.16b, #8", + n: &NodeImpl{ + Instruction: EXT, + DstReg: RegV0, + SrcReg: RegV1, + SrcReg2: RegV15, + SrcConst: 8, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xe0, 0x41, 0x1, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, 
+ }, + { + name: "ext v0.16b, v15.16b, v1.16b, #0", + n: &NodeImpl{ + Instruction: EXT, + DstReg: RegV0, + SrcReg: RegV1, + SrcReg2: RegV15, + SrcConst: 0, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xe0, 0x1, 0x1, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "ext v0.8b, v15.8b, v1.8b, #7", + n: &NodeImpl{ + Instruction: EXT, + DstReg: RegV0, + SrcReg: RegV1, + SrcReg2: RegV15, + SrcConst: 7, + VectorArrangement: VectorArrangement8B, + }, + exp: []byte{0xe0, 0x39, 0x1, 0x2e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + } + + for _, tt := range tests { + tc := tt + t.Run(tc.name, func(t *testing.T) { + a := NewAssemblerImpl(asm.NilRegister) + err := a.encodeTwoVectorRegistersToVectorRegister(tc.n) + require.NoError(t, err) + actual, err := a.Assemble() + require.NoError(t, err) + + require.Equal(t, tc.exp, actual, hex.EncodeToString(actual)) + }) + } +} + +func TestAssemblerImpl_EncodeConstToRegister(t *testing.T) { + tests := []struct { + name string + n *NodeImpl + exp []byte + }{ + { + name: "and w30, w30, #1", + n: &NodeImpl{ + Instruction: ANDIMM32, + DstReg: RegR30, + SrcConst: 1, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xde, 0x3, 0x0, 0x12, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "and w30, w30, #7", + n: &NodeImpl{ + Instruction: ANDIMM32, + DstReg: RegR30, + SrcConst: 0x7, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xde, 0xb, 0x0, 0x12, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "and w30, w30, #0xf", + n: &NodeImpl{ + Instruction: ANDIMM32, + DstReg: RegR30, + SrcConst: 0xf, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xde, 0xf, 0x0, 0x12, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "and w30, w30, #0x1f", + n: &NodeImpl{ + Instruction: ANDIMM32, + DstReg: RegR30, + SrcConst: 0x1f, + VectorArrangement: 
VectorArrangement16B, + }, + exp: []byte{0xde, 0x13, 0x0, 0x12, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "and w30, w30, #0x3f", + n: &NodeImpl{ + Instruction: ANDIMM32, + DstReg: RegR30, + SrcConst: 0x3f, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xde, 0x17, 0x0, 0x12, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "and x30, x30, #1", + n: &NodeImpl{ + Instruction: ANDIMM64, + DstReg: RegR30, + SrcConst: 1, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xde, 0x3, 0x40, 0x92, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "and x30, x30, #7", + n: &NodeImpl{ + Instruction: ANDIMM64, + DstReg: RegR30, + SrcConst: 0x7, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xde, 0xb, 0x40, 0x92, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "and x30, x30, #0xf", + n: &NodeImpl{ + Instruction: ANDIMM64, + DstReg: RegR30, + SrcConst: 0xf, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xde, 0xf, 0x40, 0x92, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "and x30, x30, #0x1f", + n: &NodeImpl{ + Instruction: ANDIMM64, + DstReg: RegR30, + SrcConst: 0x1f, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xde, 0x13, 0x40, 0x92, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "and x30, x30, #0x3f", + n: &NodeImpl{ + Instruction: ANDIMM64, + DstReg: RegR30, + SrcConst: 0x3f, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xde, 0x17, 0x40, 0x92, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + } + + for _, tt := range tests { + tc := tt + t.Run(tc.name, func(t *testing.T) { + a := NewAssemblerImpl(asm.NilRegister) + err := a.EncodeConstToRegister(tc.n) + require.NoError(t, err) + actual, err := a.Assemble() + require.NoError(t, err) + + require.Equal(t, tc.exp, actual, 
hex.EncodeToString(actual)) + }) + } +} + func TestAssemblerImpl_EncodeRegisterToVectorRegister(t *testing.T) { tests := []struct { name string @@ -1275,42 +1887,63 @@ func TestAssemblerImpl_EncodeRegisterToVectorRegister(t *testing.T) { }{ // These are not supported in golang-asm, so test it here instead of integration tests. { - name: "dup v10.2d, x10", + name: "ins v10.d[0], x10", n: &NodeImpl{ - Instruction: DUP, + Instruction: INSGEN, + DstReg: RegV10, SrcReg: RegR10, + VectorArrangement: VectorArrangementD, + }, + exp: []byte{0x4a, 0x1d, 0x8, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "ins v10.d[1], x10", + n: &NodeImpl{ + Instruction: INSGEN, DstReg: RegV10, + SrcReg: RegR10, VectorArrangement: VectorArrangementD, + DstVectorIndex: 1, + }, + exp: []byte{0x4a, 0x1d, 0x18, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "dup v10.2d, x10", + n: &NodeImpl{ + Instruction: DUPGEN, + SrcReg: RegR10, + DstReg: RegV10, + VectorArrangement: VectorArrangement2D, }, exp: []byte{0x4a, 0xd, 0x8, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, }, { name: "dup v1.4s, w30", n: &NodeImpl{ - Instruction: DUP, + Instruction: DUPGEN, SrcReg: RegR30, DstReg: RegV1, - VectorArrangement: VectorArrangementS, + VectorArrangement: VectorArrangement4S, }, exp: []byte{0xc1, 0xf, 0x4, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, }, { name: "dup v30.8h, w1", n: &NodeImpl{ - Instruction: DUP, + Instruction: DUPGEN, SrcReg: RegR1, DstReg: RegV30, - VectorArrangement: VectorArrangementH, + VectorArrangement: VectorArrangement8H, }, exp: []byte{0x3e, 0xc, 0x2, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, }, { name: "dup v30.16b, w1", n: &NodeImpl{ - Instruction: DUP, + Instruction: DUPGEN, SrcReg: RegR1, DstReg: RegV30, - VectorArrangement: VectorArrangementB, + VectorArrangement: VectorArrangement16B, }, exp: []byte{0x3e, 0xc, 0x1, 0x4e, 0x0, 0x0, 0x0, 
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, }, diff --git a/internal/engine/compiler/compiler_vec_test.go b/internal/engine/compiler/compiler_vec_test.go index 3bd8a28108..6df5be5f7b 100644 --- a/internal/engine/compiler/compiler_vec_test.go +++ b/internal/engine/compiler/compiler_vec_test.go @@ -110,10 +110,6 @@ func TestCompiler_compileV128Add(t *testing.T) { } func TestCompiler_compileV128Sub(t *testing.T) { - if runtime.GOARCH != "amd64" { - // TODO: implement on amd64. - t.Skip() - } tests := []struct { name string @@ -1902,10 +1898,6 @@ func TestCompiler_compileV128Shuffle(t *testing.T) { } func TestCompiler_compileV128Bitmask(t *testing.T) { - if runtime.GOARCH != "amd64" { - // TODO: implement on amd64. - t.Skip() - } u16x8 := func(u1, u2, u3, u4, u5, u6, u7, u8 uint16) (ret [16]byte) { binary.LittleEndian.PutUint16(ret[0:], u1) @@ -2028,6 +2020,7 @@ func TestCompiler_compileV128Bitmask(t *testing.T) { require.NoError(t, err) // Generate and run the code under test. + code, _, _, err := compiler.compile() require.NoError(t, err) env.exec(code) @@ -2039,11 +2032,6 @@ func TestCompiler_compileV128Bitmask(t *testing.T) { } func TestCompiler_compileV128_Not(t *testing.T) { - if runtime.GOARCH != "amd64" { - // TODO: implement on amd64. - t.Skip() - } - env := newCompilerEnvironment() compiler := env.requireNewCompiler(t, newCompiler, &wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}}) @@ -2079,10 +2067,6 @@ func TestCompiler_compileV128_Not(t *testing.T) { } func TestCompiler_compileV128_And_Or_Xor_AndNot(t *testing.T) { - if runtime.GOARCH != "amd64" { - // TODO: implement on amd64. - t.Skip() - } tests := []struct { name string @@ -2315,11 +2299,6 @@ func TestCompiler_compileV128_And_Or_Xor_AndNot(t *testing.T) { } func TestCompiler_compileV128Bitselect(t *testing.T) { - if runtime.GOARCH != "amd64" { - // TODO: implement on amd64. 
- t.Skip() - } - tests := []struct { name string selector, x1, x2, exp [16]byte @@ -2414,11 +2393,6 @@ func TestCompiler_compileV128Bitselect(t *testing.T) { } func TestCompiler_compileV128Shl(t *testing.T) { - if runtime.GOARCH != "amd64" { - // TODO: implement on amd64. - t.Skip() - } - tests := []struct { name string shape wazeroir.Shape @@ -2704,11 +2678,6 @@ func TestCompiler_compileV128Shl(t *testing.T) { } func TestCompiler_compileV128Shr(t *testing.T) { - if runtime.GOARCH != "amd64" { - // TODO: implement on amd64. - t.Skip() - } - tests := []struct { name string signed bool diff --git a/internal/engine/compiler/impl_arm64.go b/internal/engine/compiler/impl_arm64.go index 91fd60cd36..614fd826a6 100644 --- a/internal/engine/compiler/impl_arm64.go +++ b/internal/engine/compiler/impl_arm64.go @@ -276,7 +276,7 @@ func (c *arm64Compiler) compileMaybeGrowValueStack() error { c.assembler.CompileTwoRegistersToNone(arm64.CMP, tmpX, tmpY) // If ceil > valueStackLen - stack base pointer, we need to grow the stack by calling builtin Go function. - brIfValueStackOK := c.assembler.CompileJump(arm64.BLS) + brIfValueStackOK := c.assembler.CompileJump(arm64.BCONDLS) if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexGrowValueStack); err != nil { return err } @@ -319,7 +319,7 @@ func (c *arm64Compiler) compileReturnFunction() error { c.assembler.CompileTwoRegistersToNone(arm64.CMP, callFramePointerReg, arm64.RegRZR) // If the values are identical, we return back to the Go code with returned status. - brIfNotEqual := c.assembler.CompileJump(arm64.BNE) + brIfNotEqual := c.assembler.CompileJump(arm64.BCONDNE) c.compileExitFromNativeCode(nativeCallStatusCodeReturned) // Otherwise, we have to jump to the caller's return address. 
@@ -663,31 +663,31 @@ func (c *arm64Compiler) compileBrIf(o *wazeroir.OperationBrIf) error { // Here we represent the conditional codes by using arm64.COND_** registers, and that means the // conditional jump can be performed if we use arm64.B**. // For example, if we have arm64.CondEQ on cond, that means we performed compileEq right before - // this compileBrIf and BrIf can be achieved by arm64.BEQ. + // this compileBrIf and BrIf can be achieved by arm64.BCONDEQ. var brInst asm.Instruction switch cond.conditionalRegister { case arm64.CondEQ: - brInst = arm64.BEQ + brInst = arm64.BCONDEQ case arm64.CondNE: - brInst = arm64.BNE + brInst = arm64.BCONDNE case arm64.CondHS: - brInst = arm64.BHS + brInst = arm64.BCONDHS case arm64.CondLO: - brInst = arm64.BLO + brInst = arm64.BCONDLO case arm64.CondMI: - brInst = arm64.BMI + brInst = arm64.BCONDMI case arm64.CondHI: - brInst = arm64.BHI + brInst = arm64.BCONDHI case arm64.CondLS: - brInst = arm64.BLS + brInst = arm64.BCONDLS case arm64.CondGE: - brInst = arm64.BGE + brInst = arm64.BCONDGE case arm64.CondLT: - brInst = arm64.BLT + brInst = arm64.BCONDLT case arm64.CondGT: - brInst = arm64.BGT + brInst = arm64.BCONDGT case arm64.CondLE: - brInst = arm64.BLE + brInst = arm64.BCONDLE default: // BUG: This means that we use the cond.conditionalRegister somewhere in this file, // but not covered in switch ^. That shouldn't happen. @@ -704,7 +704,7 @@ func (c *arm64Compiler) compileBrIf(o *wazeroir.OperationBrIf) error { // so we use CMPW (32-bit compare) here. c.assembler.CompileTwoRegistersToNone(arm64.CMPW, cond.register, arm64.RegRZR) - conditionalBR = c.assembler.CompileJump(arm64.BNE) + conditionalBR = c.assembler.CompileJump(arm64.BCONDNE) c.markRegisterUnused(cond.register) } @@ -815,7 +815,7 @@ func (c *arm64Compiler) compileBrTable(o *wazeroir.OperationBrTable) error { // Compare the length with offset. 
c.assembler.CompileTwoRegistersToNone(arm64.CMPW, tmpReg, index.register) // If the value exceeds the length, we will branch into the default target (corresponding to len(o.Targets) index). - brDefaultIndex := c.assembler.CompileJump(arm64.BLO) + brDefaultIndex := c.assembler.CompileJump(arm64.BCONDLO) c.assembler.CompileRegisterToRegister(arm64.MOVWU, tmpReg, index.register) c.assembler.SetJumpTargetOnNext(brDefaultIndex) @@ -946,7 +946,7 @@ func (c *arm64Compiler) compileCallImpl(index wasm.Index, targetFunctionAddressR ) // Compare tmp(len(ce.callFrameStack)) with callFrameStackPointerRegister(ce.callFrameStackPointer). c.assembler.CompileTwoRegistersToNone(arm64.CMP, tmp, callFrameStackPointerRegister) - brIfCallFrameStackOK := c.assembler.CompileJump(arm64.BNE) + brIfCallFrameStackOK := c.assembler.CompileJump(arm64.BCONDNE) // If these values equal, we need to grow the callFrame stack. // For call_indirect, we need to push the value back to the register. @@ -1192,7 +1192,7 @@ func (c *arm64Compiler) compileCallIndirect(o *wazeroir.OperationCallIndirect) e c.assembler.CompileTwoRegistersToNone(arm64.CMP, tmp2, offset.register) // If it exceeds len(table), we exit the execution. - brIfOffsetOK := c.assembler.CompileJump(arm64.BLO) + brIfOffsetOK := c.assembler.CompileJump(arm64.BCONDLO) c.compileExitFromNativeCode(nativeCallStatusCodeInvalidTableAccess) // Otherwise, we proceed to do function type check. @@ -1220,7 +1220,7 @@ func (c *arm64Compiler) compileCallIndirect(o *wazeroir.OperationCallIndirect) e // Check if the value of table[offset] equals zero, meaning that the target element is uninitialized. 
c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, offset.register) - brIfInitialized := c.assembler.CompileJump(arm64.BNE) + brIfInitialized := c.assembler.CompileJump(arm64.BCONDNE) c.compileExitFromNativeCode(nativeCallStatusCodeInvalidTableAccess) c.assembler.SetJumpTargetOnNext(brIfInitialized) @@ -1244,7 +1244,7 @@ func (c *arm64Compiler) compileCallIndirect(o *wazeroir.OperationCallIndirect) e // Compare these two values, and if they equal, we are ready to make function call. c.assembler.CompileTwoRegistersToNone(arm64.CMPW, tmp, tmp2) - brIfTypeMatched := c.assembler.CompileJump(arm64.BEQ) + brIfTypeMatched := c.assembler.CompileJump(arm64.BCONDEQ) c.compileExitFromNativeCode(nativeCallStatusCodeTypeMismatchOnIndirectCall) c.assembler.SetJumpTargetOnNext(brIfTypeMatched) @@ -1352,7 +1352,7 @@ func (c *arm64Compiler) compileSelect() error { // At this point, x1 is non-zero register, and x2 is either general purpose or zero register. c.assembler.CompileTwoRegistersToNone(arm64.CMPW, arm64.RegRZR, cv.register) - brIfNotZero := c.assembler.CompileJump(arm64.BNE) + brIfNotZero := c.assembler.CompileJump(arm64.BCONDNE) // If cv == 0, we move the value of x2 to the x1.register. @@ -1403,8 +1403,8 @@ func (c *arm64Compiler) compilePick(o *wazeroir.OperationPick) error { case runtimeValueTypeF64: c.assembler.CompileRegisterToRegister(arm64.FMOVD, pickTarget.register, pickedRegister) case runtimeValueTypeV128Lo: - c.assembler.CompileVectorRegisterToVectorRegister(arm64.VMOV, - pickTarget.register, pickedRegister, arm64.VectorArrangement16B, arm64.VectorIndexNone, arm64.VectorIndexNone) + c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.VORR, + pickTarget.register, pickTarget.register, pickedRegister, arm64.VectorArrangement16B) case runtimeValueTypeV128Hi: panic("BUG") // since pick target must point to the lower 64-bits of vectors. 
} @@ -1744,7 +1744,7 @@ func (c *arm64Compiler) compileIntegerDivPrecheck(is32Bit, isSigned bool, divide c.assembler.CompileTwoRegistersToNone(cmpInst, arm64.RegRZR, divisor) // If it is zero, we exit with nativeCallStatusIntegerDivisionByZero. - brIfDivisorNonZero := c.assembler.CompileJump(arm64.BNE) + brIfDivisorNonZero := c.assembler.CompileJump(arm64.BCONDNE) c.compileExitFromNativeCode(nativeCallStatusIntegerDivisionByZero) // Otherwise, we proceed. @@ -1760,7 +1760,7 @@ func (c *arm64Compiler) compileIntegerDivPrecheck(is32Bit, isSigned bool, divide c.assembler.CompileTwoRegistersToNone(cmpInst, arm64ReservedRegisterForTemporary, divisor) // If they not equal, we skip the following check. - brIfDivisorNonMinusOne := c.assembler.CompileJump(arm64.BNE) + brIfDivisorNonMinusOne := c.assembler.CompileJump(arm64.BCONDNE) // Otherwise, we further check if the dividend equals math.MinInt32 or MinInt64. c.assembler.CompileMemoryToRegister( @@ -1771,7 +1771,7 @@ func (c *arm64Compiler) compileIntegerDivPrecheck(is32Bit, isSigned bool, divide c.assembler.CompileTwoRegistersToNone(cmpInst, arm64ReservedRegisterForTemporary, dividend) // If they not equal, we are safe to execute the division. - brIfDividendNotMinInt := c.assembler.CompileJump(arm64.BNE) + brIfDividendNotMinInt := c.assembler.CompileJump(arm64.BCONDNE) // Otherwise, we raise overflow error. c.compileExitFromNativeCode(nativeCallStatusIntegerOverflow) @@ -1823,7 +1823,7 @@ func (c *arm64Compiler) compileRem(o *wazeroir.OperationRem) error { c.assembler.CompileTwoRegistersToNone(cmpInst, arm64.RegRZR, divisorReg) // If it is zero, we exit with nativeCallStatusIntegerDivisionByZero. - brIfDivisorNonZero := c.assembler.CompileJump(arm64.BNE) + brIfDivisorNonZero := c.assembler.CompileJump(arm64.BCONDNE) c.compileExitFromNativeCode(nativeCallStatusIntegerDivisionByZero) // Otherwise, we proceed. 
@@ -2256,7 +2256,7 @@ func (c *arm64Compiler) compileITruncFromF(o *wazeroir.OperationITruncFromF) err // See https://developer.arm.com/documentation/ddi0595/2020-12/AArch64-Registers/FPSR--Floating-point-Status-Register c.assembler.CompileRegisterAndConstToNone(arm64.CMP, arm64ReservedRegisterForTemporary, 1) - brOK := c.assembler.CompileJump(arm64.BNE) + brOK := c.assembler.CompileJump(arm64.BCONDNE) // If so, exit the execution with errors depending on whether or not the source value is NaN. var floatcmp asm.Instruction @@ -2268,7 +2268,7 @@ func (c *arm64Compiler) compileITruncFromF(o *wazeroir.OperationITruncFromF) err c.assembler.CompileTwoRegistersToNone(floatcmp, source.register, source.register) // VS flag is set if at least one of values for FCMP is NaN. // https://developer.arm.com/documentation/dui0801/g/Condition-Codes/Comparison-of-condition-code-meanings-in-integer-and-floating-point-code - brIfSourceNaN := c.assembler.CompileJump(arm64.BVS) + brIfSourceNaN := c.assembler.CompileJump(arm64.BCONDVS) // If the source value is not NaN, the operation was overflow. c.compileExitFromNativeCode(nativeCallStatusIntegerOverflow) @@ -2847,7 +2847,7 @@ func (c *arm64Compiler) compileMemoryAccessOffsetSetup(offsetArg uint32, targetS // Check if offsetRegister(= base+offsetArg+targetSizeInBytes) > len(memory.Buffer). c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, offsetRegister) - boundsOK := c.assembler.CompileJump(arm64.BLS) + boundsOK := c.assembler.CompileJump(arm64.BCONDLS) // If offsetRegister(= base+offsetArg+targetSizeInBytes) exceeds the memory length, // we exit the function with nativeCallStatusCodeMemoryOutOfBounds. 
@@ -3135,7 +3135,7 @@ func (c *arm64Compiler) compileInitImpl(isTable bool, index, tableIndex uint32) arm64ReservedRegisterForTemporary) c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, sourceOffset.register) - sourceBoundsOK := c.assembler.CompileJump(arm64.BLS) + sourceBoundsOK := c.assembler.CompileJump(arm64.BCONDLS) // If not, raise out of bounds memory access error. c.compileExitFromNativeCode(outOfBoundsErrorStatus) @@ -3165,7 +3165,7 @@ func (c *arm64Compiler) compileInitImpl(isTable bool, index, tableIndex uint32) } c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, destinationOffset.register) - destinationBoundsOK := c.assembler.CompileJump(arm64.BLS) + destinationBoundsOK := c.assembler.CompileJump(arm64.BCONDLS) // If not, raise out of bounds memory access error. c.compileExitFromNativeCode(outOfBoundsErrorStatus) @@ -3176,7 +3176,7 @@ func (c *arm64Compiler) compileInitImpl(isTable bool, index, tableIndex uint32) if !isZeroRegister(copySize.register) { // If the size equals zero, we can skip the entire instructions beflow. c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, copySize.register) - skipCopyJump := c.assembler.CompileJump(arm64.BEQ) + skipCopyJump := c.assembler.CompileJump(arm64.BCONDEQ) var movInst asm.Instruction var movSize int64 @@ -3231,7 +3231,7 @@ func (c *arm64Compiler) compileInitImpl(isTable bool, index, tableIndex uint32) // Decrement the size counter and if the value is still negative, continue the loop. c.assembler.CompileConstToRegister(arm64.ADDS, movSize, copySize.register) - c.assembler.CompileJump(arm64.BMI).AssignJumpTarget(beginCopyLoop) + c.assembler.CompileJump(arm64.BCONDMI).AssignJumpTarget(beginCopyLoop) c.assembler.SetJumpTargetOnNext(skipCopyJump) } @@ -3351,7 +3351,7 @@ func (c *arm64Compiler) compileCopyImpl(isTable bool, srcTableIndex, dstTableInd // Check memory len >= sourceOffset. 
c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, sourceOffset.register) - sourceBoundsOK := c.assembler.CompileJump(arm64.BLS) + sourceBoundsOK := c.assembler.CompileJump(arm64.BCONDLS) // If not, raise out of bounds memory access error. c.compileExitFromNativeCode(outOfBoundsErrorStatus) @@ -3377,7 +3377,7 @@ func (c *arm64Compiler) compileCopyImpl(isTable bool, srcTableIndex, dstTableInd } c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, destinationOffset.register) - destinationBoundsOK := c.assembler.CompileJump(arm64.BLS) + destinationBoundsOK := c.assembler.CompileJump(arm64.BCONDLS) // If not, raise out of bounds memory access error. c.compileExitFromNativeCode(outOfBoundsErrorStatus) @@ -3398,11 +3398,11 @@ func (c *arm64Compiler) compileCopyImpl(isTable bool, srcTableIndex, dstTableInd // If the size equals zero, we can skip the entire instructions beflow. if !isZeroRegister(copySize.register) { c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, copySize.register) - skipCopyJump := c.assembler.CompileJump(arm64.BEQ) + skipCopyJump := c.assembler.CompileJump(arm64.BCONDEQ) // If source offet < destination offset: for (i = size-1; i >= 0; i--) dst[i] = src[i]; c.assembler.CompileTwoRegistersToNone(arm64.CMP, sourceOffset.register, destinationOffset.register) - destLowerThanSourceJump := c.assembler.CompileJump(arm64.BLS) + destLowerThanSourceJump := c.assembler.CompileJump(arm64.BCONDLS) var endJump asm.Node { // sourceOffset -= size. @@ -3464,7 +3464,7 @@ func (c *arm64Compiler) compileCopyImpl(isTable bool, srcTableIndex, dstTableInd ) // If the value on the copySize.register is not equal zero, continue the loop. - c.assembler.CompileJump(arm64.BNE).AssignJumpTarget(beginCopyLoop) + c.assembler.CompileJump(arm64.BCONDNE).AssignJumpTarget(beginCopyLoop) // Otherwise, exit the loop. 
endJump = c.assembler.CompileJump(arm64.B) @@ -3529,7 +3529,7 @@ func (c *arm64Compiler) compileCopyImpl(isTable bool, srcTableIndex, dstTableInd // size += 1 c.assembler.CompileConstToRegister(arm64.ADDS, movSize, copySize.register) - c.assembler.CompileJump(arm64.BMI).AssignJumpTarget(beginCopyLoop) + c.assembler.CompileJump(arm64.BCONDMI).AssignJumpTarget(beginCopyLoop) } c.assembler.SetJumpTargetOnNext(skipCopyJump, endJump) } @@ -3602,7 +3602,7 @@ func (c *arm64Compiler) compileFillImpl(isTable bool, tableIndex uint32) error { // Check len >= destinationOffset. c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, destinationOffset.register) - destinationBoundsOK := c.assembler.CompileJump(arm64.BLS) + destinationBoundsOK := c.assembler.CompileJump(arm64.BCONDLS) // If not, raise the runtime error. if isTable { @@ -3616,7 +3616,7 @@ func (c *arm64Compiler) compileFillImpl(isTable bool, tableIndex uint32) error { // If the size equals zero, we can skip the entire instructions beflow. c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, fillSize.register) - skipCopyJump := c.assembler.CompileJump(arm64.BEQ) + skipCopyJump := c.assembler.CompileJump(arm64.BCONDEQ) // destinationOffset -= size. c.assembler.CompileRegisterToRegister(arm64.SUB, fillSize.register, destinationOffset.register) @@ -3664,7 +3664,7 @@ func (c *arm64Compiler) compileFillImpl(isTable bool, tableIndex uint32) error { ) // If the value on the copySizeRgister.register is not equal zero, continue the loop. - continueJump := c.assembler.CompileJump(arm64.BNE) + continueJump := c.assembler.CompileJump(arm64.BCONDNE) continueJump.AssignJumpTarget(beginCopyLoop) // Mark all of the operand registers. @@ -3774,7 +3774,7 @@ func (c *arm64Compiler) compileTableGet(o *wazeroir.OperationTableGet) error { c.assembler.CompileTwoRegistersToNone(arm64.CMP, ref, offset.register) // If it exceeds len(table), we exit the execution. 
- brIfBoundsOK := c.assembler.CompileJump(arm64.BLO) + brIfBoundsOK := c.assembler.CompileJump(arm64.BCONDLO) c.compileExitFromNativeCode(nativeCallStatusCodeInvalidTableAccess) c.assembler.SetJumpTargetOnNext(brIfBoundsOK) @@ -3835,7 +3835,7 @@ func (c *arm64Compiler) compileTableSet(o *wazeroir.OperationTableSet) error { c.assembler.CompileTwoRegistersToNone(arm64.CMP, tmp, offset.register) // If it exceeds len(table), we exit the execution. - brIfBoundsOK := c.assembler.CompileJump(arm64.BLO) + brIfBoundsOK := c.assembler.CompileJump(arm64.BCONDLO) c.compileExitFromNativeCode(nativeCallStatusCodeInvalidTableAccess) c.assembler.SetJumpTargetOnNext(brIfBoundsOK) @@ -4156,7 +4156,7 @@ func (c *arm64Compiler) compileModuleContextInitialization() error { // If the module instance address stays the same, we could skip the entire code below. c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64CallingConventionModuleInstanceAddressRegister, tmpX) - brIfModuleUnchanged := c.assembler.CompileJump(arm64.BEQ) + brIfModuleUnchanged := c.assembler.CompileJump(arm64.BCONDEQ) // Otherwise, update the moduleEngine.moduleContext.ModuleInstanceAddress. 
c.assembler.CompileRegisterToMemory(arm64.MOVD, diff --git a/internal/engine/compiler/impl_vec_arm64.go b/internal/engine/compiler/impl_vec_arm64.go index c406972ceb..44def48679 100644 --- a/internal/engine/compiler/impl_vec_arm64.go +++ b/internal/engine/compiler/impl_vec_arm64.go @@ -34,7 +34,7 @@ func (c *arm64Compiler) compileV128Const(o *wazeroir.OperationV128Const) error { c.assembler.CompileConstToRegister(arm64.MOVD, int64(o.Hi), arm64ReservedRegisterForTemporary) } // "ins Vn.D[1], intReg" - c.assembler.CompileRegisterToVectorRegister(arm64.VMOV, intReg, result, arm64.VectorArrangementD, 1) + c.assembler.CompileRegisterToVectorRegister(arm64.INSGEN, intReg, result, arm64.VectorArrangementD, 1) c.pushVectorRuntimeValueLocationOnRegister(result) return nil @@ -151,7 +151,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV, arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementD, ) - c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLL, result, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLLIMM, result, result, arm64.VectorArrangement8B, arm64.VectorIndexNone, arm64.VectorIndexNone) case wazeroir.V128LoadType8x8u: offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8) @@ -161,7 +161,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV, arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementD, ) - c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLL, result, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLLIMM, result, result, arm64.VectorArrangement8B, arm64.VectorIndexNone, arm64.VectorIndexNone) case wazeroir.V128LoadType16x4s: offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8) @@ -171,7 +171,7 @@ func (c *arm64Compiler) 
compileV128Load(o *wazeroir.OperationV128Load) (err erro c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV, arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementD, ) - c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLL, result, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLLIMM, result, result, arm64.VectorArrangement4H, arm64.VectorIndexNone, arm64.VectorIndexNone) case wazeroir.V128LoadType16x4u: offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8) @@ -181,7 +181,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV, arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementD, ) - c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLL, result, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLLIMM, result, result, arm64.VectorArrangement4H, arm64.VectorIndexNone, arm64.VectorIndexNone) case wazeroir.V128LoadType32x2s: offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8) @@ -191,7 +191,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV, arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementD, ) - c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLL, result, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLLIMM, result, result, arm64.VectorArrangement2S, arm64.VectorIndexNone, arm64.VectorIndexNone) case wazeroir.V128LoadType32x2u: offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8) @@ -201,7 +201,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV, arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementD, ) - 
c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLL, result, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLLIMM, result, result, arm64.VectorArrangement2S, arm64.VectorIndexNone, arm64.VectorIndexNone) case wazeroir.V128LoadType8Splat: offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 1) @@ -284,7 +284,7 @@ func (c *arm64Compiler) compileV128LoadLane(o *wazeroir.OperationV128LoadLane) ( } c.assembler.CompileMemoryWithRegisterOffsetToRegister(loadInst, arm64ReservedRegisterForMemory, source, source) - c.assembler.CompileRegisterToVectorRegister(arm64.VMOV, source, targetVector.register, arr, arm64.VectorIndex(o.LaneIndex)) + c.assembler.CompileRegisterToVectorRegister(arm64.INSGEN, source, targetVector.register, arr, arm64.VectorIndex(o.LaneIndex)) c.pushVectorRuntimeValueLocationOnRegister(targetVector.register) c.locationStack.markRegisterUnused(source) @@ -341,7 +341,7 @@ func (c *arm64Compiler) compileV128StoreLane(o *wazeroir.OperationV128StoreLane) return err } - c.assembler.CompileVectorRegisterToRegister(arm64.VMOV, v.register, arm64ReservedRegisterForTemporary, arr, + c.assembler.CompileVectorRegisterToRegister(arm64.UMOV, v.register, arm64ReservedRegisterForTemporary, arr, arm64.VectorIndex(o.LaneIndex)) c.assembler.CompileRegisterToMemoryWithRegisterOffset(storeInst, @@ -366,9 +366,9 @@ func (c *arm64Compiler) compileV128ExtractLane(o *wazeroir.OperationV128ExtractL } var inst asm.Instruction if o.Signed { - inst = arm64.SMOV + inst = arm64.SMOV32 } else { - inst = arm64.VMOV + inst = arm64.UMOV } c.assembler.CompileVectorRegisterToRegister(inst, v.register, result, arm64.VectorArrangementB, arm64.VectorIndex(o.LaneIndex)) @@ -382,9 +382,9 @@ func (c *arm64Compiler) compileV128ExtractLane(o *wazeroir.OperationV128ExtractL } var inst asm.Instruction if o.Signed { - inst = arm64.SMOV + inst = arm64.SMOV32 } else { - inst = arm64.VMOV + inst = arm64.UMOV } c.assembler.CompileVectorRegisterToRegister(inst, 
v.register, result, arm64.VectorArrangementH, arm64.VectorIndex(o.LaneIndex)) @@ -396,7 +396,7 @@ func (c *arm64Compiler) compileV128ExtractLane(o *wazeroir.OperationV128ExtractL if err != nil { return err } - c.assembler.CompileVectorRegisterToRegister(arm64.VMOV, v.register, result, + c.assembler.CompileVectorRegisterToRegister(arm64.UMOV, v.register, result, arm64.VectorArrangementS, arm64.VectorIndex(o.LaneIndex)) c.locationStack.markRegisterUnused(v.register) @@ -406,17 +406,17 @@ func (c *arm64Compiler) compileV128ExtractLane(o *wazeroir.OperationV128ExtractL if err != nil { return err } - c.assembler.CompileVectorRegisterToRegister(arm64.VMOV, v.register, result, + c.assembler.CompileVectorRegisterToRegister(arm64.UMOV, v.register, result, arm64.VectorArrangementD, arm64.VectorIndex(o.LaneIndex)) c.locationStack.markRegisterUnused(v.register) c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI64) case wazeroir.ShapeF32x4: - c.assembler.CompileVectorRegisterToVectorRegister(arm64.VMOV, v.register, v.register, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.INSELEM, v.register, v.register, arm64.VectorArrangementS, arm64.VectorIndex(o.LaneIndex), 0) c.pushRuntimeValueLocationOnRegister(v.register, runtimeValueTypeF32) case wazeroir.ShapeF64x2: - c.assembler.CompileVectorRegisterToVectorRegister(arm64.VMOV, v.register, v.register, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.INSELEM, v.register, v.register, arm64.VectorArrangementD, arm64.VectorIndex(o.LaneIndex), 0) c.pushRuntimeValueLocationOnRegister(v.register, runtimeValueTypeF64) } @@ -437,22 +437,22 @@ func (c *arm64Compiler) compileV128ReplaceLane(o *wazeroir.OperationV128ReplaceL switch o.Shape { case wazeroir.ShapeI8x16: - c.assembler.CompileRegisterToVectorRegister(arm64.VMOV, origin.register, vector.register, + c.assembler.CompileRegisterToVectorRegister(arm64.INSGEN, origin.register, vector.register, arm64.VectorArrangementB, arm64.VectorIndex(o.LaneIndex)) case 
wazeroir.ShapeI16x8: - c.assembler.CompileRegisterToVectorRegister(arm64.VMOV, origin.register, vector.register, + c.assembler.CompileRegisterToVectorRegister(arm64.INSGEN, origin.register, vector.register, arm64.VectorArrangementH, arm64.VectorIndex(o.LaneIndex)) case wazeroir.ShapeI32x4: - c.assembler.CompileRegisterToVectorRegister(arm64.VMOV, origin.register, vector.register, + c.assembler.CompileRegisterToVectorRegister(arm64.INSGEN, origin.register, vector.register, arm64.VectorArrangementS, arm64.VectorIndex(o.LaneIndex)) case wazeroir.ShapeI64x2: - c.assembler.CompileRegisterToVectorRegister(arm64.VMOV, origin.register, vector.register, + c.assembler.CompileRegisterToVectorRegister(arm64.INSGEN, origin.register, vector.register, arm64.VectorArrangementD, arm64.VectorIndex(o.LaneIndex)) case wazeroir.ShapeF32x4: - c.assembler.CompileVectorRegisterToVectorRegister(arm64.VMOV, origin.register, vector.register, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.INSELEM, origin.register, vector.register, arm64.VectorArrangementS, 0, arm64.VectorIndex(o.LaneIndex)) case wazeroir.ShapeF64x2: - c.assembler.CompileVectorRegisterToVectorRegister(arm64.VMOV, origin.register, vector.register, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.INSELEM, origin.register, vector.register, arm64.VectorArrangementD, 0, arm64.VectorIndex(o.LaneIndex)) } @@ -475,36 +475,36 @@ func (c *arm64Compiler) compileV128Splat(o *wazeroir.OperationV128Splat) (err er if err != nil { return } - c.assembler.CompileRegisterToVectorRegister(arm64.DUP, origin.register, result, - arm64.VectorArrangementB, arm64.VectorIndexNone) + c.assembler.CompileRegisterToVectorRegister(arm64.DUPGEN, origin.register, result, + arm64.VectorArrangement16B, arm64.VectorIndexNone) case wazeroir.ShapeI16x8: result, err = c.allocateRegister(registerTypeVector) if err != nil { return } - c.assembler.CompileRegisterToVectorRegister(arm64.DUP, origin.register, result, - arm64.VectorArrangementH, 
arm64.VectorIndexNone) + c.assembler.CompileRegisterToVectorRegister(arm64.DUPGEN, origin.register, result, + arm64.VectorArrangement8H, arm64.VectorIndexNone) case wazeroir.ShapeI32x4: result, err = c.allocateRegister(registerTypeVector) if err != nil { return } - c.assembler.CompileRegisterToVectorRegister(arm64.DUP, origin.register, result, - arm64.VectorArrangementS, arm64.VectorIndexNone) + c.assembler.CompileRegisterToVectorRegister(arm64.DUPGEN, origin.register, result, + arm64.VectorArrangement4S, arm64.VectorIndexNone) case wazeroir.ShapeI64x2: result, err = c.allocateRegister(registerTypeVector) if err != nil { return } - c.assembler.CompileRegisterToVectorRegister(arm64.DUP, origin.register, result, - arm64.VectorArrangementD, arm64.VectorIndexNone) + c.assembler.CompileRegisterToVectorRegister(arm64.DUPGEN, origin.register, result, + arm64.VectorArrangement2D, arm64.VectorIndexNone) case wazeroir.ShapeF32x4: result = origin.register - c.assembler.CompileVectorRegisterToVectorRegister(arm64.DUP, origin.register, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.DUPELEM, origin.register, result, arm64.VectorArrangementS, 0, arm64.VectorIndexNone) case wazeroir.ShapeF64x2: result = origin.register - c.assembler.CompileVectorRegisterToVectorRegister(arm64.DUP, origin.register, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.DUPELEM, origin.register, result, arm64.VectorArrangementD, 0, arm64.VectorIndexNone) } @@ -536,8 +536,8 @@ func (c *arm64Compiler) compileV128Shuffle(o *wazeroir.OperationV128Shuffle) (er c.onValueReleaseRegisterToStack(wReg) if w.onRegister() { - c.assembler.CompileVectorRegisterToVectorRegister(arm64.VMOV, w.register, wReg, - arm64.VectorArrangement16B, arm64.VectorIndexNone, arm64.VectorIndexNone) + c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.VORR, + w.register, w.register, wReg, arm64.VectorArrangement16B) // We no longer use the old register. 
c.markRegisterUnused(w.register) } else { // on stack @@ -553,8 +553,8 @@ func (c *arm64Compiler) compileV128Shuffle(o *wazeroir.OperationV128Shuffle) (er c.onValueReleaseRegisterToStack(vReg) if v.onRegister() { - c.assembler.CompileVectorRegisterToVectorRegister(arm64.VMOV, v.register, vReg, - arm64.VectorArrangement16B, arm64.VectorIndexNone, arm64.VectorIndexNone) + c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.VORR, + v.register, v.register, vReg, arm64.VectorArrangement16B) // We no longer use the old register. c.markRegisterUnused(v.register) } else { // on stack @@ -607,7 +607,7 @@ func (c *arm64Compiler) compileV128AnyTrue(*wazeroir.OperationV128AnyTrue) (err v := vector.register c.assembler.CompileVectorRegisterToVectorRegister(arm64.UMAXP, v, v, arm64.VectorArrangement16B, arm64.VectorIndexNone, arm64.VectorIndexNone) - c.assembler.CompileVectorRegisterToRegister(arm64.VMOV, v, arm64ReservedRegisterForTemporary, + c.assembler.CompileVectorRegisterToRegister(arm64.UMOV, v, arm64ReservedRegisterForTemporary, arm64.VectorArrangementD, 0) c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, arm64ReservedRegisterForTemporary) c.locationStack.pushRuntimeValueLocationOnConditionalRegister(arm64.CondNE) @@ -625,10 +625,10 @@ func (c *arm64Compiler) compileV128AllTrue(o *wazeroir.OperationV128AllTrue) (er v := vector.register if o.Shape == wazeroir.ShapeI64x2 { - c.assembler.CompileVectorRegisterToVectorRegister(arm64.CMEQ, arm64.RegRZR, v, - arm64.VectorArrangementNone, arm64.VectorIndexNone, arm64.VectorIndexNone) + c.assembler.CompileVectorRegisterToVectorRegister(arm64.CMEQZERO, arm64.RegRZR, v, + arm64.VectorArrangement2D, arm64.VectorIndexNone, arm64.VectorIndexNone) c.assembler.CompileVectorRegisterToVectorRegister(arm64.ADDP, v, v, - arm64.VectorArrangementD, arm64.VectorIndexNone, arm64.VectorIndexNone) + arm64.VectorArrangementNone, arm64.VectorIndexNone, arm64.VectorIndexNone) c.assembler.CompileTwoRegistersToNone(arm64.FCMPD, 
v, v) c.locationStack.pushRuntimeValueLocationOnConditionalRegister(arm64.CondEQ) } else { @@ -644,7 +644,7 @@ func (c *arm64Compiler) compileV128AllTrue(o *wazeroir.OperationV128AllTrue) (er c.assembler.CompileVectorRegisterToVectorRegister(arm64.UMINV, v, v, arr, arm64.VectorIndexNone, arm64.VectorIndexNone) - c.assembler.CompileVectorRegisterToRegister(arm64.VMOV, v, arm64ReservedRegisterForTemporary, + c.assembler.CompileVectorRegisterToRegister(arm64.UMOV, v, arm64ReservedRegisterForTemporary, arm64.VectorArrangementD, 0) c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, arm64ReservedRegisterForTemporary) c.locationStack.pushRuntimeValueLocationOnConditionalRegister(arm64.CondNE) @@ -653,49 +653,288 @@ func (c *arm64Compiler) compileV128AllTrue(o *wazeroir.OperationV128AllTrue) (er return } +var ( + i8x16BitmaskConst = [16]byte{ + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + } + i16x8BitmaskConst = [16]byte{ + 0x01, 0x00, 0x02, 0x00, 0x04, 0x00, 0x08, 0x00, + 0x10, 0x00, 0x20, 0x00, 0x40, 0x00, 0x80, 0x00, + } + i32x4BitmaskConst = [16]byte{ + 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + } +) + // compileV128BitMask implements compiler.compileV128BitMask for arm64. 
-func (c *arm64Compiler) compileV128BitMask(o *wazeroir.OperationV128BitMask) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) +func (c *arm64Compiler) compileV128BitMask(o *wazeroir.OperationV128BitMask) (err error) { + vector := c.locationStack.popV128() + if err = c.compileEnsureOnGeneralPurposeRegister(vector); err != nil { + return + } + + v := vector.register + + result, err := c.allocateRegister(registerTypeGeneralPurpose) + if err != nil { + return err + } + + switch o.Shape { + case wazeroir.ShapeI8x16: + vecTmp, err := c.allocateRegister(registerTypeVector) + if err != nil { + return err + } + // Right arithmetic shift on the original vector and store the result back into v. So we have: + // v[i] = 0xff if vi<0, 0 otherwise. + c.assembler.CompileVectorRegisterToVectorRegisterWithConst(arm64.SSHR, v, v, arm64.VectorArrangement16B, 7) + + // Load the bit mask into vecTmp. + c.assembler.CompileLoadStaticConstToVectorRegister(arm64.VMOV, i8x16BitmaskConst[:], vecTmp, arm64.VectorArrangementQ) + + // Lane-wise logical AND with i8x16BitmaskConst, meaning that we have + // v[i] = (1 << i) if vi<0, 0 otherwise. + // + // Below, we use the following notation: + // wi := (1 << i) if vi<0, 0 otherwise. + c.assembler.CompileVectorRegisterToVectorRegister(arm64.VAND, vecTmp, v, arm64.VectorArrangement16B, + arm64.VectorIndexNone, arm64.VectorIndexNone) + + // Swap the lower and higher 8 byte elements, and write it into vecTmp, meaning that we have + // vecTmp[i] = w(i+8) if i < 8, w(i-8) otherwise. + // + c.assembler.CompileTwoVectorRegistersToVectorRegisterWithConst(arm64.EXT, v, v, vecTmp, arm64.VectorArrangement16B, 0x8) + + // v = [w0, w8, ..., w7, w15] + c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.ZIP1, vecTmp, v, v, arm64.VectorArrangement16B) + + // v.h[0] = w0 + ...
+ w15 + c.assembler.CompileVectorRegisterToVectorRegister(arm64.ADDV, v, v, + arm64.VectorArrangement8H, arm64.VectorIndexNone, arm64.VectorIndexNone) + + // Extract the v.h[0] as the result. + c.assembler.CompileVectorRegisterToRegister(arm64.UMOV, v, result, arm64.VectorArrangementH, 0) + case wazeroir.ShapeI16x8: + vecTmp, err := c.allocateRegister(registerTypeVector) + if err != nil { + return err + } + // Right arithmetic shift on the original vector and store the result back into v. So we have: + // v[i] = 0xffff if vi<0, 0 otherwise. + c.assembler.CompileVectorRegisterToVectorRegisterWithConst(arm64.SSHR, v, v, arm64.VectorArrangement8H, 15) + + // Load the bit mask into vecTmp. + c.assembler.CompileLoadStaticConstToVectorRegister(arm64.VMOV, i16x8BitmaskConst[:], vecTmp, arm64.VectorArrangementQ) + + // Lane-wise logical AND with i16x8BitmaskConst, meaning that we have + // v[i] = (1 << i) if vi<0, 0 otherwise, + // for every lane i=0..7. + c.assembler.CompileVectorRegisterToVectorRegister(arm64.VAND, vecTmp, v, arm64.VectorArrangement16B, + arm64.VectorIndexNone, arm64.VectorIndexNone) + + c.assembler.CompileVectorRegisterToVectorRegister(arm64.ADDV, v, v, + arm64.VectorArrangement8H, arm64.VectorIndexNone, arm64.VectorIndexNone) + + c.assembler.CompileVectorRegisterToRegister(arm64.UMOV, v, result, arm64.VectorArrangementH, 0) + case wazeroir.ShapeI32x4: + vecTmp, err := c.allocateRegister(registerTypeVector) + if err != nil { + return err + } + + // Right arithmetic shift on the original vector and store the result back into v. So we have: + // v[i] = 0xffffffff if vi<0, 0 otherwise. + c.assembler.CompileVectorRegisterToVectorRegisterWithConst(arm64.SSHR, v, v, arm64.VectorArrangement4S, 32) + + // Load the bit mask into vecTmp.
+ c.assembler.CompileLoadStaticConstToVectorRegister(arm64.VMOV, i32x4BitmaskConst[:], vecTmp, arm64.VectorArrangementQ) + + // Lane-wise logical AND with i32x4BitmaskConst, meaning that we have + // v[i] = (1 << i) if vi<0, 0 otherwise, + // for every lane i=0..3. + c.assembler.CompileVectorRegisterToVectorRegister(arm64.VAND, vecTmp, v, arm64.VectorArrangement16B, + arm64.VectorIndexNone, arm64.VectorIndexNone) + + c.assembler.CompileVectorRegisterToVectorRegister(arm64.ADDV, v, v, + arm64.VectorArrangement4S, arm64.VectorIndexNone, arm64.VectorIndexNone) + + c.assembler.CompileVectorRegisterToRegister(arm64.UMOV, v, result, arm64.VectorArrangementS, 0) + case wazeroir.ShapeI64x2: + // Move the lower 64-bit int into result. + c.assembler.CompileVectorRegisterToRegister(arm64.UMOV, v, result, + arm64.VectorArrangementD, 0) + // Move the higher 64-bit int into arm64ReservedRegisterForTemporary. + c.assembler.CompileVectorRegisterToRegister(arm64.UMOV, v, arm64ReservedRegisterForTemporary, + arm64.VectorArrangementD, 1) + + // Move the sign bit into the least significant bit. + c.assembler.CompileConstToRegister(arm64.LSR, 63, result) + c.assembler.CompileConstToRegister(arm64.LSR, 63, arm64ReservedRegisterForTemporary) + + // result = (arm64ReservedRegisterForTemporary<<1) | result + c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD, + arm64ReservedRegisterForTemporary, 1, result, result) + } + + c.markRegisterUnused(v) + c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32) + return } // compileV128And implements compiler.compileV128And for arm64.
-func (c *arm64Compiler) compileV128And(o *wazeroir.OperationV128And) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) +func (c *arm64Compiler) compileV128And(*wazeroir.OperationV128And) error { + return c.compileV128x2BinOp(arm64.VAND, arm64.VectorArrangement16B) } // compileV128Not implements compiler.compileV128Not for arm64. -func (c *arm64Compiler) compileV128Not(o *wazeroir.OperationV128Not) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) +func (c *arm64Compiler) compileV128Not(*wazeroir.OperationV128Not) error { + return c.compileV128UniOp(arm64.NOT, arm64.VectorArrangement16B) } // compileV128Or implements compiler.compileV128Or for arm64. -func (c *arm64Compiler) compileV128Or(o *wazeroir.OperationV128Or) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) +func (c *arm64Compiler) compileV128Or(*wazeroir.OperationV128Or) error { + return c.compileV128x2BinOp(arm64.VORR, arm64.VectorArrangement16B) } // compileV128Xor implements compiler.compileV128Xor for arm64. -func (c *arm64Compiler) compileV128Xor(o *wazeroir.OperationV128Xor) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) +func (c *arm64Compiler) compileV128Xor(*wazeroir.OperationV128Xor) error { + return c.compileV128x2BinOp(arm64.EOR, arm64.VectorArrangement16B) } // compileV128Bitselect implements compiler.compileV128Bitselect for arm64. 
-func (c *arm64Compiler) compileV128Bitselect(o *wazeroir.OperationV128Bitselect) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) +func (c *arm64Compiler) compileV128Bitselect(*wazeroir.OperationV128Bitselect) error { + selector := c.locationStack.popV128() + if err := c.compileEnsureOnGeneralPurposeRegister(selector); err != nil { + return err + } + + x2 := c.locationStack.popV128() + if err := c.compileEnsureOnGeneralPurposeRegister(x2); err != nil { + return err + } + + x1 := c.locationStack.popV128() + if err := c.compileEnsureOnGeneralPurposeRegister(x1); err != nil { + return err + } + + c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.BSL, + x2.register, x1.register, selector.register, arm64.VectorArrangement16B) + + c.markRegisterUnused(x1.register, x2.register) + c.pushVectorRuntimeValueLocationOnRegister(selector.register) + return nil } // compileV128AndNot implements compiler.compileV128AndNot for arm64. -func (c *arm64Compiler) compileV128AndNot(o *wazeroir.OperationV128AndNot) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) +func (c *arm64Compiler) compileV128AndNot(*wazeroir.OperationV128AndNot) error { + return c.compileV128x2BinOp(arm64.BIC, arm64.VectorArrangement16B) +} + +func (c *arm64Compiler) compileV128UniOp(inst asm.Instruction, arr arm64.VectorArrangement) error { + v := c.locationStack.popV128() + if err := c.compileEnsureOnGeneralPurposeRegister(v); err != nil { + return err + } + + c.assembler.CompileVectorRegisterToVectorRegister(inst, v.register, v.register, arr, arm64.VectorIndexNone, arm64.VectorIndexNone) + + c.pushVectorRuntimeValueLocationOnRegister(v.register) + return nil +} + +func (c *arm64Compiler) compileV128x2BinOp(inst asm.Instruction, arr arm64.VectorArrangement) error { + x2 := c.locationStack.popV128() + if err := c.compileEnsureOnGeneralPurposeRegister(x2); err != nil { + return err + } + + x1 := c.locationStack.popV128() 
+ if err := c.compileEnsureOnGeneralPurposeRegister(x1); err != nil { + return err + } + + c.assembler.CompileVectorRegisterToVectorRegister(inst, x2.register, x1.register, arr, arm64.VectorIndexNone, arm64.VectorIndexNone) + + c.markRegisterUnused(x2.register) + c.pushVectorRuntimeValueLocationOnRegister(x1.register) + return nil } // compileV128Shr implements compiler.compileV128Shr for arm64. func (c *arm64Compiler) compileV128Shr(o *wazeroir.OperationV128Shr) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) + var inst asm.Instruction + if o.Signed { + inst = arm64.SSHL + } else { + inst = arm64.USHL + } + return c.compileV128ShiftImpl(o.Shape, inst, true) } // compileV128Shl implements compiler.compileV128Shl for arm64. func (c *arm64Compiler) compileV128Shl(o *wazeroir.OperationV128Shl) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) + return c.compileV128ShiftImpl(o.Shape, arm64.SSHL, false) +} + +func (c *arm64Compiler) compileV128ShiftImpl(shape wazeroir.Shape, ins asm.Instruction, rightShift bool) error { + s := c.locationStack.pop() + if s.register == arm64.RegRZR { + // If the shift amount is zero register, nothing to do here. + return nil + } + + var modulo asm.ConstantValue + var arr arm64.VectorArrangement + switch shape { + case wazeroir.ShapeI8x16: + modulo = 0x7 // modulo 8. + arr = arm64.VectorArrangement16B + case wazeroir.ShapeI16x8: + modulo = 0xf // modulo 16. + arr = arm64.VectorArrangement8H + case wazeroir.ShapeI32x4: + modulo = 0x1f // modulo 32. + arr = arm64.VectorArrangement4S + case wazeroir.ShapeI64x2: + modulo = 0x3f // modulo 64. 
+ arr = arm64.VectorArrangement2D + } + + if err := c.compileEnsureOnGeneralPurposeRegister(s); err != nil { + return err + } + + v := c.locationStack.popV128() + if err := c.compileEnsureOnGeneralPurposeRegister(v); err != nil { + return err + } + + tmp, err := c.allocateRegister(registerTypeVector) + if err != nil { + return err + } + + c.assembler.CompileConstToRegister(arm64.ANDIMM32, modulo, s.register) + + if rightShift { + // Negate the amount to make this as right shift. + c.assembler.CompileRegisterToRegister(arm64.NEG, s.register, s.register) + } + + // Copy the shift amount into a vector register as SSHL requires it to be there. + c.assembler.CompileRegisterToVectorRegister(arm64.DUPGEN, s.register, tmp, + arr, arm64.VectorIndexNone) + + c.assembler.CompileVectorRegisterToVectorRegister(ins, tmp, v.register, arr, + arm64.VectorIndexNone, arm64.VectorIndexNone) + + c.markRegisterUnused(s.register) + c.pushVectorRuntimeValueLocationOnRegister(v.register) + return nil } // compileV128Cmp implements compiler.compileV128Cmp for arm64. 
diff --git a/internal/engine/compiler/impl_vec_arm64_test.go b/internal/engine/compiler/impl_vec_arm64_test.go index 5ce04d5a3e..03d25e37d4 100644 --- a/internal/engine/compiler/impl_vec_arm64_test.go +++ b/internal/engine/compiler/impl_vec_arm64_test.go @@ -67,8 +67,8 @@ func TestArm64Compiler_V128Shuffle_ConstTable_MiddleOfFunction(t *testing.T) { func TestArm64Compiler_V128Shuffle_combinations(t *testing.T) { movValueRegisterToRegister := func(t *testing.T, c *arm64Compiler, src *runtimeValueLocation, dst asm.Register) { - c.assembler.CompileVectorRegisterToVectorRegister(arm64.VMOV, src.register, dst, - arm64.VectorArrangement16B, arm64.VectorIndexNone, arm64.VectorIndexNone) + c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.VORR, src.register, src.register, dst, + arm64.VectorArrangement16B) c.locationStack.markRegisterUnused(src.register) src.setRegister(dst) // We have to set the lower 64-bits' location as well. diff --git a/internal/integration_test/asm/arm64_debug/debug_assembler.go b/internal/integration_test/asm/arm64_debug/debug_assembler.go index 72e411f0d1..24c772e97b 100644 --- a/internal/integration_test/asm/arm64_debug/debug_assembler.go +++ b/internal/integration_test/asm/arm64_debug/debug_assembler.go @@ -309,3 +309,17 @@ func (ta *testAssembler) CompileLoadStaticConstToVectorRegister(instruction asm. ta.goasm.CompileLoadStaticConstToVectorRegister(instruction, c, dstReg, arrangement) ta.a.CompileLoadStaticConstToVectorRegister(instruction, c, dstReg, arrangement) } + +// CompileTwoVectorRegistersToVectorRegister implements the same method as documented on arm64.Assembler. 
+func (ta *testAssembler) CompileTwoVectorRegistersToVectorRegister(instruction asm.Instruction, + srcReg, srcReg2, dstReg asm.Register, arrangement arm64.VectorArrangement) { + ta.goasm.CompileTwoVectorRegistersToVectorRegister(instruction, srcReg, srcReg2, dstReg, arrangement) + ta.a.CompileTwoVectorRegistersToVectorRegister(instruction, srcReg, srcReg2, dstReg, arrangement) +} + +// CompileTwoVectorRegistersToVectorRegisterWithConst implements the same method as documented on arm64.Assembler. +func (ta *testAssembler) CompileTwoVectorRegistersToVectorRegisterWithConst(instruction asm.Instruction, + srcReg, srcReg2, dstReg asm.Register, arrangement arm64.VectorArrangement, c asm.ConstantValue) { + ta.goasm.CompileTwoVectorRegistersToVectorRegisterWithConst(instruction, srcReg, srcReg2, dstReg, arrangement, c) + ta.a.CompileTwoVectorRegistersToVectorRegisterWithConst(instruction, srcReg, srcReg2, dstReg, arrangement, c) +} diff --git a/internal/integration_test/asm/arm64_debug/golang_asm.go b/internal/integration_test/asm/arm64_debug/golang_asm.go index 34d44669de..50e63b2def 100644 --- a/internal/integration_test/asm/arm64_debug/golang_asm.go +++ b/internal/integration_test/asm/arm64_debug/golang_asm.go @@ -18,13 +18,13 @@ func newAssembler(temporaryRegister asm.Register) (*assemblerGoAsmImpl, error) { return &assemblerGoAsmImpl{GolangAsmBaseAssembler: g, temporaryRegister: temporaryRegister}, err } -// assemblerGoAsmImpl implements asm_arm64.Assembler for golang-asm library. +// assemblerGoAsmImpl implements arm64.Assembler for golang-asm library. type assemblerGoAsmImpl struct { *golang_asm.GolangAsmBaseAssembler temporaryRegister asm.Register } -// CompileConstToRegister implements the same method as documented on asm_arm64.Assembler. +// CompileConstToRegister implements the same method as documented on arm64.Assembler. 
func (a *assemblerGoAsmImpl) CompileConstToRegister(instruction asm.Instruction, constValue asm.ConstantValue, destinationReg asm.Register) asm.Node { inst := a.NewProg() inst.As = castAsGolangAsmInstruction[instruction] @@ -45,7 +45,7 @@ func (a *assemblerGoAsmImpl) CompileConstToRegister(instruction asm.Instruction, return golang_asm.NewGolangAsmNode(inst) } -// CompileMemoryToRegister implements the same method as documented on asm_arm64.Assembler. +// CompileMemoryToRegister implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileMemoryToRegister(instruction asm.Instruction, sourceBaseReg asm.Register, sourceOffsetConst asm.ConstantValue, destinationReg asm.Register) { inst := a.NewProg() inst.As = castAsGolangAsmInstruction[instruction] @@ -57,7 +57,7 @@ func (a *assemblerGoAsmImpl) CompileMemoryToRegister(instruction asm.Instruction a.AddInstruction(inst) } -// CompileMemoryWithRegisterOffsetToRegister implements the same method as documented on asm_arm64.Assembler. +// CompileMemoryWithRegisterOffsetToRegister implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileMemoryWithRegisterOffsetToRegister(instruction asm.Instruction, sourceBaseReg, sourceOffsetReg, destinationReg asm.Register) { inst := a.NewProg() inst.As = castAsGolangAsmInstruction[instruction] @@ -70,7 +70,7 @@ func (a *assemblerGoAsmImpl) CompileMemoryWithRegisterOffsetToRegister(instructi a.AddInstruction(inst) } -// CompileRegisterToMemory implements the same method as documented on asm_arm64.Assembler. +// CompileRegisterToMemory implements the same method as documented on arm64.Assembler. 
func (a *assemblerGoAsmImpl) CompileRegisterToMemory(instruction asm.Instruction, sourceReg asm.Register, destinationBaseReg asm.Register, destinationOffsetConst asm.ConstantValue) { inst := a.NewProg() inst.As = castAsGolangAsmInstruction[instruction] @@ -95,7 +95,7 @@ func (a *assemblerGoAsmImpl) CompileRegisterToMemoryWithRegisterOffset(instructi a.AddInstruction(inst) } -// CompileRegisterToRegister implements the same method as documented on asm_arm64.Assembler. +// CompileRegisterToRegister implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileRegisterToRegister(instruction asm.Instruction, from, to asm.Register) { inst := a.NewProg() inst.As = castAsGolangAsmInstruction[instruction] @@ -106,7 +106,7 @@ func (a *assemblerGoAsmImpl) CompileRegisterToRegister(instruction asm.Instructi a.AddInstruction(inst) } -// CompileTwoRegistersToRegister implements the same method as documented on asm_arm64.Assembler. +// CompileTwoRegistersToRegister implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileTwoRegistersToRegister(instruction asm.Instruction, src1, src2, destination asm.Register) { inst := a.NewProg() inst.As = castAsGolangAsmInstruction[instruction] @@ -118,7 +118,7 @@ func (a *assemblerGoAsmImpl) CompileTwoRegistersToRegister(instruction asm.Instr a.AddInstruction(inst) } -// CompileThreeRegistersToRegister implements the same method as documented on asm_arm64.Assembler. +// CompileThreeRegistersToRegister implements the same method as documented on arm64.Assembler. 
func (a *assemblerGoAsmImpl) CompileThreeRegistersToRegister(instruction asm.Instruction, src1, src2, src3, dst asm.Register) { inst := a.NewProg() inst.As = castAsGolangAsmInstruction[instruction] @@ -131,7 +131,7 @@ func (a *assemblerGoAsmImpl) CompileThreeRegistersToRegister(instruction asm.Ins a.AddInstruction(inst) } -// CompileTwoRegistersToNone implements the same method as documented on asm_arm64.Assembler. +// CompileTwoRegistersToNone implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileTwoRegistersToNone(instruction asm.Instruction, src1, src2 asm.Register) { inst := a.NewProg() inst.As = castAsGolangAsmInstruction[instruction] @@ -144,7 +144,7 @@ func (a *assemblerGoAsmImpl) CompileTwoRegistersToNone(instruction asm.Instructi a.AddInstruction(inst) } -// CompileRegisterAndConstToNone implements the same method as documented on asm_arm64.Assembler. +// CompileRegisterAndConstToNone implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileRegisterAndConstToNone(instruction asm.Instruction, src asm.Register, srcConst asm.ConstantValue) { inst := a.NewProg() inst.As = castAsGolangAsmInstruction[instruction] @@ -192,7 +192,7 @@ func (a *assemblerGoAsmImpl) CompileStandAlone(instruction asm.Instruction) asm. return golang_asm.NewGolangAsmNode(prog) } -// CompileLeftShiftedRegisterToRegister implements the same method as documented on asm_arm64.Assembler. +// CompileLeftShiftedRegisterToRegister implements the same method as documented on arm64.Assembler. 
func (a *assemblerGoAsmImpl) CompileLeftShiftedRegisterToRegister(instruction asm.Instruction, shiftedSourceReg asm.Register, shiftNum asm.ConstantValue, srcReg, destinationReg asm.Register) { inst := a.NewProg() inst.As = castAsGolangAsmInstruction[instruction] @@ -205,7 +205,7 @@ func (a *assemblerGoAsmImpl) CompileLeftShiftedRegisterToRegister(instruction as a.AddInstruction(inst) } -// CompileReadInstructionAddress implements the same method as documented on asm_arm64.Assembler. +// CompileReadInstructionAddress implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileReadInstructionAddress(destinationReg asm.Register, beforeAcquisitionTargetInstruction asm.Instruction) { // Emit ADR instruction to read the specified instruction's absolute address. // Note: we cannot emit the "ADR REG, $(target's offset from here)" due to the @@ -262,7 +262,7 @@ func (a *assemblerGoAsmImpl) CompileReadInstructionAddress(destinationReg asm.Re }) } -// CompileConditionalRegisterSet implements the same method as documented on asm_arm64.Assembler. +// CompileConditionalRegisterSet implements the same method as documented on arm64.Assembler. // // We use CSET instruction to set 1 on the register if the condition satisfies: // https://developer.arm.com/documentation/100076/0100/a64-instruction-set-reference/a64-general-instructions/cset @@ -282,7 +282,7 @@ func simdRegisterForScalarFloatRegister(freg int16) int16 { return freg + (arm64.REG_F31 - arm64.REG_F0) + 1 } -// CompileTwoSIMDBytesToSIMDByteRegister implements the same method as documented on asm_arm64.Assembler. +// CompileTwoSIMDBytesToSIMDByteRegister implements the same method as documented on arm64.Assembler. 
func (a *assemblerGoAsmImpl) CompileTwoSIMDBytesToSIMDByteRegister(instruction asm.Instruction, srcReg1, srcReg2, dstReg asm.Register) { src1FloatReg, src2FloatReg, dstFloatReg := castAsGolangAsmRegister[srcReg1], castAsGolangAsmRegister[srcReg2], castAsGolangAsmRegister[dstReg] src1VReg, src2VReg, dstVReg := simdRegisterForScalarFloatRegister(src1FloatReg), simdRegisterForScalarFloatRegister(src2FloatReg), simdRegisterForScalarFloatRegister(dstFloatReg) @@ -300,7 +300,7 @@ func (a *assemblerGoAsmImpl) CompileTwoSIMDBytesToSIMDByteRegister(instruction a } -// CompileSIMDByteToSIMDByte implements the same method as documented on asm_arm64.Assembler. +// CompileSIMDByteToSIMDByte implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileSIMDByteToSIMDByte(instruction asm.Instruction, srcReg, dstReg asm.Register) { srcFloatReg, dstFloatReg := castAsGolangAsmRegister[srcReg], castAsGolangAsmRegister[dstReg] srcVReg, dstVReg := simdRegisterForScalarFloatRegister(srcFloatReg), simdRegisterForScalarFloatRegister(dstFloatReg) @@ -316,7 +316,7 @@ func (a *assemblerGoAsmImpl) CompileSIMDByteToSIMDByte(instruction asm.Instructi a.AddInstruction(inst) } -// CompileSIMDByteToRegister implements the same method as documented on asm_arm64.Assembler. +// CompileSIMDByteToRegister implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileSIMDByteToRegister(instruction asm.Instruction, srcReg, dstReg asm.Register) { srcFloatReg, dstFlaotReg := castAsGolangAsmRegister[srcReg], castAsGolangAsmRegister[dstReg] srcVReg, dstVReg := simdRegisterForScalarFloatRegister(srcFloatReg), simdRegisterForScalarFloatRegister(dstFlaotReg) @@ -332,30 +332,42 @@ func (a *assemblerGoAsmImpl) CompileSIMDByteToRegister(instruction asm.Instructi a.AddInstruction(inst) } -// CompileMemoryToVectorRegister implements the same method as documented on asm_arm64.Assembler. 
+// CompileMemoryToVectorRegister implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileMemoryToVectorRegister( _ asm.Instruction, _ asm.Register, _ asm.ConstantValue, _ asm.Register, _ asm_arm64.VectorArrangement, ) { panic("CompileMemoryToVectorRegister is unsupported with golang-asm") } -// CompileVectorRegisterToMemory implements the same method as documented on asm_arm64.Assembler. +// CompileVectorRegisterToMemory implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileVectorRegisterToMemory(_ asm.Instruction, _, _ asm.Register, _ asm.ConstantValue, _ asm_arm64.VectorArrangement) { panic("CompileVectorRegisterToMemory is unsupported with golang-asm") } -// CompileMemoryWithRegisterOffsetToVectorRegister implements the same method as documented on asm_arm64.Assembler. +// CompileMemoryWithRegisterOffsetToVectorRegister implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileMemoryWithRegisterOffsetToVectorRegister(_ asm.Instruction, _, _ asm.Register, _ asm.Register, _ asm_arm64.VectorArrangement) { panic("CompileMemoryWithRegisterOffsetToVectorRegister is unsupported with golang-asm") } -// CompileVectorRegisterToMemoryWithRegisterOffset implements the same method as documented on asm_arm64.Assembler. +// CompileVectorRegisterToMemoryWithRegisterOffset implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileVectorRegisterToMemoryWithRegisterOffset(_ asm.Instruction, _, _, _ asm.Register, _ asm_arm64.VectorArrangement) { panic("CompileVectorRegisterToMemoryWithRegisterOffset is unsupported with golang-asm") } -// CompileRegisterToVectorRegister implements the same method as documented on asm_arm64.Assembler. +// CompileTwoVectorRegistersToVectorRegister implements the same method as documented on arm64.Assembler. 
+func (a *assemblerGoAsmImpl) CompileTwoVectorRegistersToVectorRegister(instruction asm.Instruction, + srcReg, srcReg2, dstReg asm.Register, arrangement asm_arm64.VectorArrangement) { + panic("CompileTwoVectorRegistersToVectorRegister is unsupported with golang-asm") +} + +// CompileTwoVectorRegistersToVectorRegisterWithConst implements the same method as documented on arm64.Assembler. +func (a *assemblerGoAsmImpl) CompileTwoVectorRegistersToVectorRegisterWithConst(instruction asm.Instruction, + srcReg, srcReg2, dstReg asm.Register, arrangement asm_arm64.VectorArrangement, value asm.ConstantValue) { + panic("CompileTwoVectorRegistersToVectorRegisterWithConst is unsupported with golang-asm") +} + +// CompileRegisterToVectorRegister implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileRegisterToVectorRegister(instruction asm.Instruction, srcReg, dstReg asm.Register, arrangement asm_arm64.VectorArrangement, index asm_arm64.VectorIndex) { inst := a.NewProg() @@ -369,7 +381,7 @@ func (a *assemblerGoAsmImpl) CompileRegisterToVectorRegister(instruction asm.Ins a.AddInstruction(inst) } -// CompileVectorRegisterToVectorRegister implements the same method as documented on asm_arm64.Assembler. +// CompileVectorRegisterToVectorRegister implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileVectorRegisterToVectorRegister(instruction asm.Instruction, srcReg, dstReg asm.Register, arrangement asm_arm64.VectorArrangement, srcIndex, dstIndex asm_arm64.VectorIndex) { inst := a.NewProg() inst.As = castAsGolangAsmInstruction[instruction] @@ -395,11 +407,11 @@ func (a *assemblerGoAsmImpl) CompileVectorRegisterToVectorRegister(instruction a } } -// CompileVectorRegisterToVectorRegisterWithConst implements the same method as documented on asm_arm64.Assembler. +// CompileVectorRegisterToVectorRegisterWithConst implements the same method as documented on arm64.Assembler. 
func (a *assemblerGoAsmImpl) CompileVectorRegisterToVectorRegisterWithConst(instruction asm.Instruction, srcReg, dstReg asm.Register, arrangement asm_arm64.VectorArrangement, c asm.ConstantValue) { switch instruction { - case asm_arm64.USHLL: + case asm_arm64.USHLLIMM: var dstArrangement asm_arm64.VectorArrangement if arrangement == asm_arm64.VectorArrangement8B { dstArrangement = asm_arm64.VectorArrangement8H @@ -421,7 +433,7 @@ func (a *assemblerGoAsmImpl) CompileVectorRegisterToVectorRegisterWithConst(inst } } -// CompileVectorRegisterToRegister implements the same method as documented on asm_arm64.Assembler. +// CompileVectorRegisterToRegister implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileVectorRegisterToRegister(instruction asm.Instruction, srcReg, dstReg asm.Register, arrangement asm_arm64.VectorArrangement, index asm_arm64.VectorIndex) { inst := a.NewProg() @@ -594,19 +606,19 @@ var castAsGolangAsmInstruction = [...]obj.As{ asm_arm64.ASR: arm64.AASR, asm_arm64.ASRW: arm64.AASRW, asm_arm64.B: arm64.AB, - asm_arm64.BEQ: arm64.ABEQ, - asm_arm64.BGE: arm64.ABGE, - asm_arm64.BGT: arm64.ABGT, - asm_arm64.BHI: arm64.ABHI, - asm_arm64.BHS: arm64.ABHS, - asm_arm64.BLE: arm64.ABLE, - asm_arm64.BLO: arm64.ABLO, - asm_arm64.BLS: arm64.ABLS, - asm_arm64.BLT: arm64.ABLT, - asm_arm64.BMI: arm64.ABMI, - asm_arm64.BPL: arm64.ABPL, - asm_arm64.BNE: arm64.ABNE, - asm_arm64.BVS: arm64.ABVS, + asm_arm64.BCONDEQ: arm64.ABEQ, + asm_arm64.BCONDGE: arm64.ABGE, + asm_arm64.BCONDGT: arm64.ABGT, + asm_arm64.BCONDHI: arm64.ABHI, + asm_arm64.BCONDHS: arm64.ABHS, + asm_arm64.BCONDLE: arm64.ABLE, + asm_arm64.BCONDLO: arm64.ABLO, + asm_arm64.BCONDLS: arm64.ABLS, + asm_arm64.BCONDLT: arm64.ABLT, + asm_arm64.BCONDMI: arm64.ABMI, + asm_arm64.BCONDPL: arm64.ABPL, + asm_arm64.BCONDNE: arm64.ABNE, + asm_arm64.BCONDVS: arm64.ABVS, asm_arm64.CLZ: arm64.ACLZ, asm_arm64.CLZW: arm64.ACLZW, asm_arm64.CMP: arm64.ACMP, @@ -705,5 +717,5 @@ var 
castAsGolangAsmInstruction = [...]obj.As{ asm_arm64.VMOV: arm64.AVMOV, asm_arm64.VADD: arm64.AVADD, asm_arm64.VSUB: arm64.AVSUB, - asm_arm64.USHLL: arm64.AVUSHLL, + asm_arm64.USHLLIMM: arm64.AVUSHLL, } diff --git a/internal/integration_test/asm/arm64_debug/impl_test.go b/internal/integration_test/asm/arm64_debug/impl_test.go index 4a254efeb0..bae42276b0 100644 --- a/internal/integration_test/asm/arm64_debug/impl_test.go +++ b/internal/integration_test/asm/arm64_debug/impl_test.go @@ -1,7 +1,6 @@ package arm64debug import ( - "encoding/hex" "fmt" "math" "testing" @@ -1138,9 +1137,9 @@ func TestAssemblerImpl_EncodeRelativeJump(t *testing.T) { }) for _, inst := range []asm.Instruction{ - arm64.B, arm64.BEQ, arm64.BGE, arm64.BGT, arm64.BHI, arm64.BHS, - arm64.BLE, arm64.BLO, arm64.BLS, arm64.BLT, arm64.BMI, arm64.BNE, arm64.BVS, - arm64.BPL, + arm64.B, arm64.BCONDEQ, arm64.BCONDGE, arm64.BCONDGT, arm64.BCONDHI, arm64.BCONDHS, + arm64.BCONDLE, arm64.BCONDLO, arm64.BCONDLS, arm64.BCONDLT, arm64.BCONDMI, arm64.BCONDNE, arm64.BCONDVS, + arm64.BCONDPL, } { inst := inst t.Run(arm64.InstructionName(inst), func(t *testing.T) { @@ -1276,374 +1275,6 @@ func TestAssemblerImpl_EncodeTwoSIMDBytesToSIMDByteRegister(t *testing.T) { } } -func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { - t.Run("error", func(t *testing.T) { - tests := []struct { - n *arm64.NodeImpl - expErr string - }{ - { - n: &arm64.NodeImpl{Instruction: arm64.B, - SrcReg: arm64.RegV21, - DstReg: arm64.RegV21, - Types: arm64.OperandTypesVectorRegisterToVectorRegister, - SrcVectorIndex: arm64.VectorIndexNone, - DstVectorIndex: arm64.VectorIndexNone, - }, - expErr: "B is unsupported for from:vector-register,to:vector-register type", - }, - { - n: &arm64.NodeImpl{Instruction: arm64.VMOV, - SrcReg: arm64.RegV21, - DstReg: arm64.RegV21, - Types: arm64.OperandTypesVectorRegisterToVectorRegister, - SrcVectorIndex: arm64.VectorIndexNone, - DstVectorIndex: arm64.VectorIndexNone, - }, - expErr: 
"unsupported arrangement for VMOV: none", - }, - { - n: &arm64.NodeImpl{Instruction: arm64.VADD, - SrcReg: arm64.RegV21, - DstReg: arm64.RegV21, - Types: arm64.OperandTypesVectorRegisterToVectorRegister, - SrcVectorIndex: arm64.VectorIndexNone, - DstVectorIndex: arm64.VectorIndexNone, - }, - expErr: "unsupported arrangement for VADD: none", - }, - { - n: &arm64.NodeImpl{Instruction: arm64.VADD, - SrcReg: arm64.RegV21, - DstReg: arm64.RegV21, - Types: arm64.OperandTypesVectorRegisterToVectorRegister, - VectorArrangement: arm64.VectorArrangement1D, - SrcVectorIndex: arm64.VectorIndexNone, - DstVectorIndex: arm64.VectorIndexNone, - }, - expErr: "unsupported arrangement for VADD: 1D", - }, - } - - for _, tt := range tests { - tc := tt - t.Run(tc.expErr, func(t *testing.T) { - a := arm64.NewAssemblerImpl(asm.NilRegister) - err := a.EncodeVectorRegisterToVectorRegister(tc.n) - require.EqualError(t, err, tc.expErr) - }) - } - }) - - vectorRegs := []asm.Register{arm64.RegV10, arm64.RegV2, arm64.RegV30} - tests := []struct { - name string - inst asm.Instruction - arr arm64.VectorArrangement - needConst bool - c asm.ConstantValue - srcIndex, dstIndex arm64.VectorIndex - }{ - {inst: arm64.VMOV, arr: arm64.VectorArrangement16B, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone}, - {inst: arm64.VADD, arr: arm64.VectorArrangement2D, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone}, - {inst: arm64.VADD, arr: arm64.VectorArrangement4S, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone}, - {inst: arm64.VADD, arr: arm64.VectorArrangement8H, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone}, - {inst: arm64.VADD, arr: arm64.VectorArrangement16B, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone}, - { - name: "VSUB 2d", - inst: arm64.VSUB, arr: arm64.VectorArrangement2D, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone, - }, - { - name: "VSUB 4s", - inst: arm64.VSUB, arr: 
arm64.VectorArrangement4S, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone, - }, - { - name: "VSUB 8h", - inst: arm64.VSUB, arr: arm64.VectorArrangement8H, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone, - }, - { - name: "VSUB 16b", - inst: arm64.VSUB, arr: arm64.VectorArrangement16B, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone, - }, - {inst: arm64.USHLL, arr: arm64.VectorArrangement8B, needConst: true, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone}, - {inst: arm64.USHLL, arr: arm64.VectorArrangement4H, needConst: true, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone}, - {inst: arm64.USHLL, arr: arm64.VectorArrangement2S, needConst: true, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone}, - {inst: arm64.USHLL, arr: arm64.VectorArrangement8B, needConst: true, c: 7, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone}, - {inst: arm64.USHLL, arr: arm64.VectorArrangement4H, needConst: true, c: 15, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone}, - {inst: arm64.USHLL, arr: arm64.VectorArrangement2S, needConst: true, c: 31, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone}, - } - - for _, tt := range tests { - tc := tt - t.Run(tc.name, func(t *testing.T) { - for _, src := range vectorRegs { - for _, dst := range vectorRegs { - src, dst := src, dst - t.Run(fmt.Sprintf("src=%s.%s,dst=%s.%s", - arm64.RegisterName(src), tc.arr, arm64.RegisterName(dst), tc.arr), func(t *testing.T) { - goasm := newGoasmAssembler(t, asm.NilRegister) - a := arm64.NewAssemblerImpl(asm.NilRegister) - - for _, assembler := range []arm64.Assembler{goasm, a} { - if tc.needConst { - assembler.CompileVectorRegisterToVectorRegisterWithConst(tc.inst, src, dst, tc.arr, tc.c) - } else { - assembler.CompileVectorRegisterToVectorRegister(tc.inst, src, dst, tc.arr, tc.srcIndex, tc.dstIndex) - } - } - - expected, err := goasm.Assemble() - require.NoError(t, 
err) - - actual, err := a.Assemble() - require.NoError(t, err) - require.Equal(t, expected, actual, hex.EncodeToString(expected)) - }) - } - } - }) - } -} - -func TestAssemblerImpl_EncodeRegisterToVectorRegister(t *testing.T) { - t.Run("error", func(t *testing.T) { - tests := []struct { - n *arm64.NodeImpl - exp string - }{ - { - n: &arm64.NodeImpl{ - Instruction: arm64.B, Types: arm64.OperandTypesRegisterToVectorRegister, - SrcReg: arm64.RegR0, - DstReg: arm64.RegV3, - }, - exp: "B is unsupported for from:register,to:vector-register type", - }, - { - n: &arm64.NodeImpl{Instruction: arm64.VMOV, - SrcReg: arm64.RegR0, - DstReg: arm64.RegV3, - Types: arm64.OperandTypesRegisterToVectorRegister, - DstVectorIndex: 100, VectorArrangement: arm64.VectorArrangement1D, - }, - exp: "invalid arrangement and index pair: 1D[100]", - }, - { - n: &arm64.NodeImpl{Instruction: arm64.VMOV, - Types: arm64.OperandTypesRegisterToVectorRegister, - SrcReg: arm64.RegR0, - DstReg: arm64.RegV3, - DstVectorIndex: 0, VectorArrangement: arm64.VectorArrangement1D, - }, - exp: "unsupported arrangement for VMOV: 1D", - }, - } - - for _, tt := range tests { - tc := tt - t.Run(tc.exp, func(t *testing.T) { - a := arm64.NewAssemblerImpl(asm.NilRegister) - err := a.EncodeRegisterToVectorRegister(tc.n) - require.EqualError(t, err, tc.exp) - }) - } - }) - - regs := []asm.Register{arm64.RegR0, arm64.RegR10, arm64.RegR30} - vectorRegs := []asm.Register{arm64.RegV0, arm64.RegV10, arm64.RegV30} - - tests := []struct { - inst asm.Instruction - arrangement arm64.VectorArrangement - index arm64.VectorIndex - }{ - { - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementD, - index: 0, - }, - { - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementD, - index: 1, - }, - { - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementB, - index: 0, - }, - { - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementB, - index: 5, - }, - { - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementH, - index: 1, - 
}, - { - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementH, - index: 4, - }, - } - - for _, tt := range tests { - tc := tt - t.Run(arm64.InstructionName(tc.inst), func(t *testing.T) { - for _, r := range regs { - for _, vr := range vectorRegs { - r, vr := r, vr - t.Run(fmt.Sprintf("src=%s,dst=%s.%s[%d]", - arm64.RegisterName(r), arm64.RegisterName(vr), tc.arrangement, tc.index), func(t *testing.T) { - goasm := newGoasmAssembler(t, asm.NilRegister) - a := arm64.NewAssemblerImpl(asm.NilRegister) - - for _, assembler := range []arm64.Assembler{goasm, a} { - assembler.CompileRegisterToVectorRegister(tc.inst, r, vr, tc.arrangement, tc.index) - } - - expected, err := goasm.Assemble() - require.NoError(t, err) - - actual, err := a.Assemble() - require.NoError(t, err) - require.Equal(t, expected, actual) - }) - } - } - }) - } -} - -func TestAssemblerImpl_EncodeVectorRegisterToRegister(t *testing.T) { - t.Run("error", func(t *testing.T) { - tests := []struct { - n *arm64.NodeImpl - expErr string - }{ - { - n: &arm64.NodeImpl{Instruction: arm64.B, Types: arm64.OperandTypesVectorRegisterToRegister, - SrcReg: arm64.RegV0, - DstReg: arm64.RegR3, - }, - expErr: "B is unsupported for from:vector-register,to:register type", - }, - { - n: &arm64.NodeImpl{Instruction: arm64.VMOV, - Types: arm64.OperandTypesVectorRegisterToRegister, - SrcReg: arm64.RegV0, - DstReg: arm64.RegR3, - SrcVectorIndex: 100, VectorArrangement: arm64.VectorArrangement1D, - }, - expErr: "invalid arrangement and index pair: 1D[100]", - }, - { - n: &arm64.NodeImpl{Instruction: arm64.VMOV, - Types: arm64.OperandTypesVectorRegisterToRegister, - SrcReg: arm64.RegV0, - DstReg: arm64.RegR3, - SrcVectorIndex: 0, VectorArrangement: arm64.VectorArrangement1D, - }, - expErr: "unsupported arrangement for VMOV: 1D", - }, - } - - for _, tt := range tests { - tc := tt - a := arm64.NewAssemblerImpl(asm.NilRegister) - err := a.EncodeVectorRegisterToRegister(tc.n) - require.EqualError(t, err, tc.expErr) - } - }) - - 
regs := []asm.Register{arm64.RegR0, arm64.RegR10, arm64.RegR30} - vectorRegs := []asm.Register{arm64.RegV0, arm64.RegV10, arm64.RegV30} - - tests := []struct { - name string - inst asm.Instruction - arrangement arm64.VectorArrangement - index arm64.VectorIndex - }{ - { - name: "VMOV D[0]", - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementD, - index: 0, - }, - { - name: "VMOV D[1]", - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementD, - index: 1, - }, - { - name: "VMOV B[0]", - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementB, - index: 0, - }, - { - name: "VMOV B[15]", - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementB, - index: 15, - }, - { - name: "VMOV H[1]", - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementH, - index: 1, - }, - { - name: "VMOV H[4]", - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementH, - index: 7, - }, - { - name: "VMOV S[2]", - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementS, - index: 2, - }, - { - name: "VMOV S[3]", - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementS, - index: 3, - }, - } - - for _, tt := range tests { - tc := tt - t.Run(tc.name, func(t *testing.T) { - for _, r := range regs { - for _, vr := range vectorRegs { - r, vr := r, vr - t.Run(fmt.Sprintf("dst=%s,src=%s.%s[%d]", - arm64.RegisterName(r), arm64.RegisterName(vr), tc.arrangement, tc.index), func(t *testing.T) { - goasm := newGoasmAssembler(t, asm.NilRegister) - a := arm64.NewAssemblerImpl(asm.NilRegister) - - for _, assembler := range []arm64.Assembler{goasm, a} { - assembler.CompileVectorRegisterToRegister(tc.inst, vr, r, tc.arrangement, tc.index) - } - - expected, err := goasm.Assemble() - require.NoError(t, err) - actual, err := a.Assemble() - require.NoError(t, err) - require.Equal(t, expected, actual) - }) - } - } - }) - } -} - func conditionalRegisterToState(r asm.Register) asm.ConditionalRegisterState { switch r { case arm64.RegCondEQ: diff --git 
a/internal/integration_test/spectest/v2/spec_test.go b/internal/integration_test/spectest/v2/spec_test.go index 76838f372c..f928f4b015 100644 --- a/internal/integration_test/spectest/v2/spec_test.go +++ b/internal/integration_test/spectest/v2/spec_test.go @@ -26,8 +26,7 @@ func TestCompiler(t *testing.T) { spectest.Run(t, testcases, compiler.NewEngine, enabledFeatures, func(jsonname string) bool { switch path.Base(jsonname) { - case "simd_bitwise.json", "simd_boolean.json", "simd_bit_shift.json", - "simd_i8x16_cmp.json", "simd_i16x8_cmp.json", "simd_i32x4_cmp.json", "simd_i64x2_cmp.json", + case "simd_i8x16_cmp.json", "simd_i16x8_cmp.json", "simd_i32x4_cmp.json", "simd_i64x2_cmp.json", "simd_f32x4_cmp.json", "simd_f64x2_cmp.json", "simd_f32x4_arith.json", "simd_f64x2_arith.json", "simd_i16x8_arith.json", "simd_i64x2_arith.json", "simd_i32x4_arith.json", "simd_i8x16_arith.json", "simd_i16x8_sat_arith.json", "simd_i8x16_sat_arith.json", From 489c92c014d8db08e07302ba492162cc0677529d Mon Sep 17 00:00:00 2001 From: Takeshi Yoneda Date: Fri, 17 Jun 2022 11:26:02 +0900 Subject: [PATCH 2/3] not TODO Signed-off-by: Takeshi Yoneda --- internal/asm/arm64/impl.go | 1 - 1 file changed, 1 deletion(-) diff --git a/internal/asm/arm64/impl.go b/internal/asm/arm64/impl.go index 927e4f5ac5..dea74b4603 100644 --- a/internal/asm/arm64/impl.go +++ b/internal/asm/arm64/impl.go @@ -3335,7 +3335,6 @@ var advancedSIMDShiftByImmediate = map[asm.Instruction]struct { USHLLIMM: {U: 0b1, opcode: 0b10100, immQResolver: immResolverForSIMDSiftLeftByImmediate}, // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHR--Signed-Shift-Right--immediate--?lang=en SSHR: {U: 0b0, opcode: 0b00000, immQResolver: func(shiftAmount int64, arr VectorArrangement) (immh, immb, q byte, err error) { - // TODO: switch arr { case VectorArrangement16B, VectorArrangement8B: immh = 0b0001 From bba0260a03f23c6eeef0847f14c2978f8e90b4d8 Mon Sep 17 00:00:00 2001 From: Takeshi Yoneda Date: Fri, 17 
Jun 2022 12:27:06 +0900 Subject: [PATCH 3/3] more Signed-off-by: Takeshi Yoneda --- internal/asm/arm64/consts.go | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/internal/asm/arm64/consts.go b/internal/asm/arm64/consts.go index 9fcfda20df..4538baef4c 100644 --- a/internal/asm/arm64/consts.go +++ b/internal/asm/arm64/consts.go @@ -471,32 +471,38 @@ const ( ASRW // B is the B instruction. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B B - // BCONDEQ is the B.cond instruction with CondEQ. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + + // Below are B.cond instructions. + // * https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + // * https://developer.arm.com/documentation/dui0802/a/A32-and-T32-Instructions/Condition-codes + + // BCONDEQ is the B.cond instruction with CondEQ. BCONDEQ - // BCONDGE is the B.cond instruction with CondGE. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + // BCONDGE is the B.cond instruction with CondGE. BCONDGE - // BCONDGT is the B.cond instruction with CondGT. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + // BCONDGT is the B.cond instruction with CondGT. BCONDGT - // BCONDHI is the B.cond instruction with CondHI. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + // BCONDHI is the B.cond instruction with CondHI. BCONDHI - // BCONDHS is the B.cond instruction with CondHS. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + // BCONDHS is the B.cond instruction with CondHS. BCONDHS - // BCONDLE is the B.cond instruction with CondLE. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + // BCONDLE is the B.cond instruction with CondLE. BCONDLE - // BCONDLO is the B.cond instruction with CondLO. 
https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + // BCONDLO is the B.cond instruction with CondLO. BCONDLO - // BCONDLS is the B.cond instruction with CondLS. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + // BCONDLS is the B.cond instruction with CondLS. BCONDLS - // BCONDLT is the B.cond instruction with CondLT. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + // BCONDLT is the B.cond instruction with CondLT. BCONDLT - // BCONDMI is the B.cond instruction with CondMI. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + // BCONDMI is the B.cond instruction with CondMI. BCONDMI - // BCONDPL is the B.cond instruction with CondPL. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + // BCONDPL is the B.cond instruction with CondPL. BCONDPL - // BCONDNE is the B.cond instruction with CondNE. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + // BCONDNE is the B.cond instruction with CondNE. BCONDNE - // BCONDVS is the B.cond instruction with CondVS. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + // BCONDVS is the B.cond instruction with CondVS. BCONDVS + // CLZ is the CLZ instruction. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/CLZ CLZ // CLZW is the CLZ instruction, in 64-bit mode. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/CLZ