From 4ab2382d13f0743187a3a459adc1b2eebd7f477c Mon Sep 17 00:00:00 2001
From: Takeshi Yoneda
Date: Fri, 17 Jun 2022 11:23:17 +0900
Subject: [PATCH] more

Signed-off-by: Takeshi Yoneda
---
 Makefile | 4 +-
 internal/asm/arm64/assembler.go | 13 +-
 internal/asm/arm64/consts.go | 249 ++--
 internal/asm/arm64/impl.go | 1277 +++++++++++------
 internal/asm/arm64/impl_test.go | 771 +++++++++-
 internal/engine/compiler/compiler_vec_test.go | 33 +-
 internal/engine/compiler/impl_arm64.go | 92 +-
 internal/engine/compiler/impl_vec_arm64.go | 355 ++++-
 .../engine/compiler/impl_vec_arm64_test.go | 4 +-
 .../asm/arm64_debug/debug_assembler.go | 14 +
 .../asm/arm64_debug/golang_asm.go | 90 +-
 .../asm/arm64_debug/impl_test.go | 375 +----
 .../integration_test/spectest/v2/spec_test.go | 3 +-
 13 files changed, 2161 insertions(+), 1119 deletions(-)

diff --git a/Makefile b/Makefile
index 5341bf6f77..6ef9b752f7 100644
--- a/Makefile
+++ b/Makefile
@@ -117,10 +117,10 @@ spectest:
 	@$(MAKE) spectest.v2

 spectest.v1:
-	go test $$(go list ./... | grep $(spectest_v1_dir)) -v -timeout 120s
+	@go test $$(go list ./... | grep $(spectest_v1_dir)) -timeout 120s

 spectest.v2:
-	go test $$(go list ./... | grep $(spectest_v2_dir)) -v -timeout 120s
+	@go test $$(go list ./... | grep $(spectest_v2_dir)) -timeout 120s

 golangci_lint_path := $(shell go env GOPATH)/bin/golangci-lint

diff --git a/internal/asm/arm64/assembler.go b/internal/asm/arm64/assembler.go
index de48834c3d..f6dd8d7bd4 100644
--- a/internal/asm/arm64/assembler.go
+++ b/internal/asm/arm64/assembler.go
@@ -107,7 +107,7 @@ type Assembler interface {

 	// CompileVectorRegisterToVectorRegisterWithConst is the same as CompileVectorRegisterToVectorRegister but the
 	// additional constant can be provided.
-	// For example, the const can be used to specify the shift amount for USHLL instruction.
+	// For example, the const can be used to specify the shift amount for USHLLIMM instruction.
 	CompileVectorRegisterToVectorRegisterWithConst(instruction asm.Instruction, srcReg, dstReg asm.Register,
 		arrangement VectorArrangement, c asm.ConstantValue)

@@ -115,4 +115,15 @@ type Assembler interface {
 	// the memory and the destination is the dstReg.
 	CompileLoadStaticConstToVectorRegister(instruction asm.Instruction, c asm.StaticConst, dstReg asm.Register,
 		arrangement VectorArrangement)
+
+	// CompileTwoVectorRegistersToVectorRegister adds an instruction whose sources are two vector registers and whose
+	// destination is one vector register. The vectors' arrangement can be specified via `arrangement`.
+	CompileTwoVectorRegistersToVectorRegister(instruction asm.Instruction, srcReg, srcReg2, dstReg asm.Register,
+		arrangement VectorArrangement)
+
+	// CompileTwoVectorRegistersToVectorRegisterWithConst is the same as CompileTwoVectorRegistersToVectorRegister except
+	// that this also accepts an additional constant.
+	// For example, the EXT instruction needs the extraction target immediate as the const.
+ CompileTwoVectorRegistersToVectorRegisterWithConst(instruction asm.Instruction, srcReg, srcReg2, dstReg asm.Register, + arrangement VectorArrangement, c asm.ConstantValue) } diff --git a/internal/asm/arm64/consts.go b/internal/asm/arm64/consts.go index 85e5c1be35..9fcfda20df 100644 --- a/internal/asm/arm64/consts.go +++ b/internal/asm/arm64/consts.go @@ -12,7 +12,7 @@ import ( // See https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/condition-codes-1-condition-flags-and-codes const ( // CondEQ is the eq (equal) condition code - CondEQ asm.ConditionalRegisterState = asm.ConditionalRegisterStateUnset + 1 + iota + CondEQ = asm.ConditionalRegisterStateUnset + 1 + iota // CondNE is the ne (not equal) condition code CondNE // CondHS is the hs (unsigned higher or same) condition code @@ -443,7 +443,7 @@ func RegisterName(r asm.Register) string { // Arm64-specific instructions. // // Note: This only defines arm64 instructions used by wazero's compiler. -// Note: Naming conventions intentionally match the Go assembler: https://go.dev/doc/asm +// Note: Naming conventions partially match the Go assembler: https://go.dev/doc/asm const ( // NOP is the NOP instruction. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/NOP NOP asm.Instruction = iota @@ -459,6 +459,10 @@ const ( ADR // AND is the AND instruction. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/AND--shifted-register- AND + // ANDIMM32 is the AND(immediate) instruction in 32-bit mode https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/AND--immediate---Bitwise-AND--immediate--?lang=en + ANDIMM32 + // ANDIMM64 is the AND(immediate) instruction in 64-bit mode https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/AND--immediate---Bitwise-AND--immediate--?lang=en + ANDIMM64 // ANDW is the AND instruction, in 64-bit mode. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/AND--register- ANDW // ASR is the ASR instruction. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/ASR--register- @@ -467,32 +471,32 @@ const ( ASRW // B is the B instruction. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B B - // BEQ is the B.cond instruction with CondEQ. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BEQ - // BGE is the B.cond instruction with CondGE. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BGE - // BGT is the B.cond instruction with CondGT. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BGT - // BHI is the B.cond instruction with CondHI. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BHI - // BHS is the B.cond instruction with CondHS. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BHS - // BLE is the B.cond instruction with CondLE. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BLE - // BLO is the B.cond instruction with CondLO. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BLO - // BLS is the B.cond instruction with CondLS. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BLS - // BLT is the B.cond instruction with CondLT. 
https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BLT - // BMI is the B.cond instruction with CondMI. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BMI - // BPL is the B.cond instruction with CondPL. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BPL - // BNE is the B.cond instruction with CondNE. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BNE - // BVS is the B.cond instruction with CondVS. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond - BVS + // BCONDEQ is the B.cond instruction with CondEQ. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDEQ + // BCONDGE is the B.cond instruction with CondGE. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDGE + // BCONDGT is the B.cond instruction with CondGT. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDGT + // BCONDHI is the B.cond instruction with CondHI. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDHI + // BCONDHS is the B.cond instruction with CondHS. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDHS + // BCONDLE is the B.cond instruction with CondLE. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDLE + // BCONDLO is the B.cond instruction with CondLO. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDLO + // BCONDLS is the B.cond instruction with CondLS. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDLS + // BCONDLT is the B.cond instruction with CondLT. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDLT + // BCONDMI is the B.cond instruction with CondMI. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDMI + // BCONDPL is the B.cond instruction with CondPL. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDPL + // BCONDNE is the B.cond instruction with CondNE. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDNE + // BCONDVS is the B.cond instruction with CondVS. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/B-cond + BCONDVS // CLZ is the CLZ instruction. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/CLZ CLZ // CLZW is the CLZ instruction, in 64-bit mode. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/CLZ @@ -619,11 +623,11 @@ const ( MSUBW // MUL is the MUL instruction. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MUL MUL - // MULW is the MUL instruction, in 64-bit mode. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MUL + // MULW is the MUL instruction, in 32-bit mode. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MUL MULW // NEG is the NEG instruction. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/NEG NEG - // NEGW is the NEG instruction, in 64-bit mode. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/NEG + // NEGW is the NEG instruction, in 32-bit mode. 
https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/NEG NEGW // ORR is the ORR instruction. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/ORR--shifted-register- ORR @@ -677,20 +681,21 @@ const ( UDIV // UDIVW is the UDIV instruction, in 64-bit mode. https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/UDIV UDIVW - // VBIT is the BIT instruction. https://developer.arm.com/documentation/dui0802/a/A64-Advanced-SIMD-Vector-Instructions/BIT--vector- VBIT // VCNT is the CNT instruction. https://developer.arm.com/documentation/dui0802/a/A64-Advanced-SIMD-Vector-Instructions/CNT--vector- VCNT // VMOV has different semantics depending on the types of operands: - // * MOV(vector) if the operands are vectors and indexes are not specified. https://developer.arm.com/documentation/dui0802/a/A64-Advanced-SIMD-Vector-Instructions/MOV--vector- - // * MOV(vector, element) if the operands are vectors and indexes are specified. https://developer.arm.com/documentation/dui0802/a/A64-Advanced-SIMD-Vector-Instructions/MOV--vector--element- - // * INS(vector, element) if the src is a general purpose and the dst is a vector. https://developer.arm.com/documentation/dui0802/a/A64-Advanced-SIMD-Vector-Instructions/INS--vector---general- - // * UMOV(vector) if the dst is a general purpose and the src is a vector. https://developer.arm.com/documentation/100069/0610/A64-SIMD-Vector-Instructions/UMOV--vector- // * LDR(SIMD&FP) if the src is memory and dst is a vector: https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/LDR--immediate--SIMD-FP---Load-SIMD-FP-Register--immediate-offset-- - // * LDR (literal, SIMD&FP) if the src is static const and dst is a vector: https://developer.arm.com/documentation/dui0801/h/A64-Floating-point-Instructions/LDR--literal--SIMD-and-FP- + // * LDR(literal, SIMD&FP) if the src is static const and dst is a vector: https://developer.arm.com/documentation/dui0801/h/A64-Floating-point-Instructions/LDR--literal--SIMD-and-FP- // * STR(SIMD&FP) if the dst is memory and src is a vector: https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/STR--immediate--SIMD-FP---Store-SIMD-FP-register--immediate-offset-- VMOV + // UMOV is the UMOV instruction https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMOV--Unsigned-Move-vector-element-to-general-purpose-register-?lang=en + UMOV + // INSGEN is the INS(general) instruction https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/INS--general---Insert-vector-element-from-general-purpose-register-?lang=en + INSGEN + // INSELEM is the INS(element) instruction https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/INS--element---Insert-vector-element-from-another-vector-element-?lang=en + INSELEM // VUADDLV is the UADDLV(vector) instruction. https://developer.arm.com/documentation/dui0802/a/A64-Advanced-SIMD-Vector-Instructions/UADDLV--vector- VUADDLV // VADD is the ADD(vector) instruction. https://developer.arm.com/documentation/dui0802/a/A64-Advanced-SIMD-Vector-Instructions/ADD--vector- @@ -705,28 +710,62 @@ const ( VFSUBS // VFSUBD is the FSUB(vector) instruction, for double precision. https://developer.arm.com/documentation/dui0802/a/A64-Advanced-SIMD-Vector-Instructions/FSUB--vector- VFSUBD - // SSHLL is the SSHLL(vector) instruction. 
https://developer.arm.com/documentation/dui0801/h/A64-SIMD-Vector-Instructions/SSHLL--SSHLL2--vector- - SSHLL - // USHLL is the USHLL(vector) instruction. https://developer.arm.com/documentation/dui0801/h/A64-SIMD-Vector-Instructions/SSHLL--SSHLL2--vector- - USHLL + // SSHL is the SSHL(vector,register) instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHL--Signed-Shift-Left--register--?lang=en + SSHL + // SSHLLIMM is the SSHLL(vector,immediate) instruction. https://developer.arm.com/documentation/dui0801/h/A64-SIMD-Vector-Instructions/SSHLL--SSHLL2--vector- + SSHLLIMM + // USHL is the USHL(vector,register) instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHL--Signed-Shift-Left--register--?lang=en + USHL + // USHLLIMM is the USHLL(vector,immediate) instruction. https://developer.arm.com/documentation/dui0801/h/A64-SIMD-Vector-Instructions/SSHLL--SSHLL2--vector- + USHLLIMM // LD1R is the LD1R instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/LD1R--Load-one-single-element-structure-and-Replicate-to-all-lanes--of-one-register-- LD1R - // SMOV is the SMOV(vector) instruction. https://developer.arm.com/documentation/100069/0610/A64-SIMD-Vector-Instructions/SMOV--vector- - SMOV - // DUP is the DUP(element) instruction. https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/DUP--element---Duplicate-vector-element-to-vector-or-scalar- - DUP + // SMOV32 is the 32-bit variant of SMOV(vector) instruction. https://developer.arm.com/documentation/100069/0610/A64-SIMD-Vector-Instructions/SMOV--vector- + SMOV32 + // DUPGEN is the DUP(general) instruction. https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/DUP--general---Duplicate-general-purpose-register-to-vector- + DUPGEN + // DUPELEM is the DUP(element) instruction. https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/DUP--element---Duplicate-vector-element-to-vector-or-scalar- + DUPELEM // UMAXP is the UMAXP(vector) instruction. https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Vector-Instructions/UMAXP--vector- UMAXP // UMINV is the UMINV(vector) instruction. https://developer.arm.com/documentation/100069/0610/A64-SIMD-Vector-Instructions/UMINV--vector- UMINV // CMEQ is the CMEQ(vector, register) instruction. https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Vector-Instructions/CMEQ--vector--register- CMEQ - // ADDP is the ADDP(vector) instruction. https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Vector-Instructions/ADDP--vector- + // CMEQZERO is the CMEP(zero) instruction https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMEQ--zero---Compare-bitwise-Equal-to-zero--vector--?lang=en + CMEQZERO + // ADDP is the ADDP(scalar) instruction. https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADDP--scalar---Add-Pair-of-elements--scalar--?lang=en ADDP + // VADDP is the ADDP(vector) instruction. https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Vector-Instructions/ADDP--vector- + // Note: prefixed by V to distinguish from the non-vector variant of ADDP(scalar). + VADDP // TBL1 is the TBL instruction whose source is one vector. https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/TBL--Table-vector-Lookup- TBL1 // TBL2 is the TBL instruction whose source is two vectors. 
https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/TBL--Table-vector-Lookup- TBL2 + // NOT is the NOT(vector) instruction https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/NOT--Bitwise-NOT--vector--?lang=en + NOT + // VAND is the AND(vector) instruction https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/AND--vector---Bitwise-AND--vector-- + // Note: prefixed by V to distinguish from the non-vector variant of AND. + VAND + // VORR is the ORR(vector) instruction https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/ORR--vector--register---Bitwise-inclusive-OR--vector--register-- + // Note: prefixed by V to distinguish from the non-vector variant of ORR. + VORR + // BSL https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/BSL--Bitwise-Select- + BSL + // BIC is the BIC(vector) instruction https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/BIC--vector--register---Bitwise-bit-Clear--vector--register-- + BIC + // VFNEG is the FNEG(vector) instruction https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/FNEG--vector---Floating-point-Negate--vector-- + // Note: prefixed by V to distinguish from the non-vector variant of FNEG. + VFNEG + // ADDV is the ADDV instruction https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/ADDV--Add-across-Vector- + ADDV + // ZIP1 is the ZIP1 instruction https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ZIP1--Zip-vectors--primary--?lang=en + ZIP1 + // SSHR is the SSHR(immediate,vector) instruction https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHR--Signed-Shift-Right--immediate--?lang=en + SSHR + // EXT is the EXT instruction https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/EXT--Extract-vector-from-pair-of-vectors-?lang=en + EXT // instructionEnd is always placed at the bottom of this iota definition to be used in the test. instructionEnd @@ -742,17 +781,17 @@ const ( VectorArrangement8B // VectorArrangement16B is an arrangement of 16 bytes (128-bit vector) VectorArrangement16B - // VectorArrangement4H is an arrangement of 4 halfwords (64-bit vector) + // VectorArrangement4H is an arrangement of 4 half precisions (64-bit vector) VectorArrangement4H - // VectorArrangement8H is an arrangement of 8 halfwords (128-bit vector) + // VectorArrangement8H is an arrangement of 8 half precisions (128-bit vector) VectorArrangement8H - // VectorArrangement2S is an arrangement of 2 words (64-bit vector) + // VectorArrangement2S is an arrangement of 2 single precisions (64-bit vector) VectorArrangement2S - // VectorArrangement4S is an arrangement of 4 words (128-bit vector) + // VectorArrangement4S is an arrangement of 4 single precisions (128-bit vector) VectorArrangement4S - // VectorArrangement1D is an arrangement of 1 doubleword (64-bit vector) + // VectorArrangement1D is an arrangement of 1 double precision (64-bit vector) VectorArrangement1D - // VectorArrangement2D is an arrangement of 2 doublewords (128-bit vector) + // VectorArrangement2D is an arrangement of 2 double precisions (128-bit vector) VectorArrangement2D // Assign each vector size specifier to a vector arrangement ID. 
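The new two-source vector opcodes above (for example ZIP1 and EXT) are emitted through the CompileTwoVectorRegistersToVectorRegister and CompileTwoVectorRegistersToVectorRegisterWithConst methods added to the Assembler interface earlier in this patch. A minimal usage sketch follows; it assumes vector register constants such as RegV1, RegV2, and RegV3 that are defined elsewhere in the package (they are not part of this diff), and it makes no claim about which source maps to Rn versus Rm in the final encoding.

	// emitVectorOps shows how a compiler backend might drive the new two-source
	// vector entry points. Illustrative only; RegV1/RegV2/RegV3 are assumed names.
	func emitVectorOps(a Assembler) {
		// ZIP1: interleave elements of the two source vectors into the destination.
		a.CompileTwoVectorRegistersToVectorRegister(ZIP1, RegV1, RegV2, RegV3, VectorArrangement16B)

		// EXT: the extraction start position travels as the trailing constant,
		// which is the case the *WithConst variant exists for.
		a.CompileTwoVectorRegistersToVectorRegisterWithConst(EXT, RegV1, RegV2, RegV3, VectorArrangement16B, 8)
	}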
@@ -831,6 +870,10 @@ func InstructionName(i asm.Instruction) string { return "ADR" case AND: return "AND" + case ANDIMM32: + return "ANDIMM32" + case ANDIMM64: + return "ANDIMM64" case ANDW: return "ANDW" case ASR: @@ -839,32 +882,32 @@ func InstructionName(i asm.Instruction) string { return "ASRW" case B: return "B" - case BEQ: - return "BEQ" - case BGE: - return "BGE" - case BGT: - return "BGT" - case BHI: - return "BHI" - case BHS: - return "BHS" - case BLE: - return "BLE" - case BLO: - return "BLO" - case BLS: - return "BLS" - case BLT: - return "BLT" - case BMI: - return "BMI" - case BPL: - return "BPL" - case BNE: - return "BNE" - case BVS: - return "BVS" + case BCONDEQ: + return "BCONDEQ" + case BCONDGE: + return "BCONDGE" + case BCONDGT: + return "BCONDGT" + case BCONDHI: + return "BCONDHI" + case BCONDHS: + return "BCONDHS" + case BCONDLE: + return "BCONDLE" + case BCONDLO: + return "BCONDLO" + case BCONDLS: + return "BCONDLS" + case BCONDLT: + return "BCONDLT" + case BCONDMI: + return "BCONDMI" + case BCONDPL: + return "BCONDPL" + case BCONDNE: + return "BCONDNE" + case BCONDVS: + return "BCONDVS" case CLZ: return "CLZ" case CLZW: @@ -1057,6 +1100,12 @@ func InstructionName(i asm.Instruction) string { return "VUADDLV" case VMOV: return "VMOV" + case INSELEM: + return "INSELEM" + case UMOV: + return "UMOV" + case INSGEN: + return "INSGEN" case VADD: return "VADD" case VFADDS: @@ -1069,16 +1118,22 @@ func InstructionName(i asm.Instruction) string { return "VFSUBS" case VFSUBD: return "VFSUBD" - case SSHLL: - return "SSHLL" - case USHLL: - return "USHLL" + case SSHL: + return "SSHL" + case USHL: + return "USHL" + case SSHLLIMM: + return "SSHLLIMM" + case USHLLIMM: + return "USHLLIMM" case LD1R: return "LD1R" - case SMOV: - return "SMOV" - case DUP: - return "DUP" + case SMOV32: + return "SMOV32" + case DUPGEN: + return "DUPGEN" + case DUPELEM: + return "DUPELEM" case UMAXP: return "UMAXP" case UMINV: @@ -1087,10 +1142,34 @@ func InstructionName(i asm.Instruction) string { return "CMEQ" case ADDP: return "ADDP" + case VADDP: + return "VADDP" case TBL1: return "TBL1" case TBL2: return "TBL2" + case NOT: + return "NOT" + case VAND: + return "VAND" + case VORR: + return "VORR" + case BSL: + return "BSL" + case BIC: + return "BIC" + case VFNEG: + return "VFNEG" + case ADDV: + return "ADDV" + case CMEQZERO: + return "CMEQZERO" + case ZIP1: + return "ZIP1" + case SSHR: + return "SSHR" + case EXT: + return "EXT" } panic(fmt.Errorf("unknown instruction %d", i)) } diff --git a/internal/asm/arm64/impl.go b/internal/asm/arm64/impl.go index 197fcb50db..927e4f5ac5 100644 --- a/internal/asm/arm64/impl.go +++ b/internal/asm/arm64/impl.go @@ -142,6 +142,7 @@ const ( OperandTypeSIMDByte OperandTypeTwoSIMDBytes OperandTypeVectorRegister + OperandTypeTwoVectorRegisters OperandTypeStaticConst ) @@ -172,6 +173,8 @@ func (o OperandType) String() (ret string) { ret = "vector-register" case OperandTypeStaticConst: ret = "static-const" + case OperandTypeTwoVectorRegisters: + ret = "two-vector-registers" } return } @@ -180,28 +183,29 @@ func (o OperandType) String() (ret string) { type OperandTypes struct{ src, dst OperandType } var ( - OperandTypesNoneToNone = OperandTypes{OperandTypeNone, OperandTypeNone} - OperandTypesNoneToRegister = OperandTypes{OperandTypeNone, OperandTypeRegister} - OperandTypesNoneToMemory = OperandTypes{OperandTypeNone, OperandTypeMemory} - OperandTypesNoneToBranch = OperandTypes{OperandTypeNone, OperandTypeBranch} - OperandTypesRegisterToRegister = OperandTypes{OperandTypeRegister, 
OperandTypeRegister} - OperandTypesLeftShiftedRegisterToRegister = OperandTypes{OperandTypeLeftShiftedRegister, OperandTypeRegister} - OperandTypesTwoRegistersToRegister = OperandTypes{OperandTypeTwoRegisters, OperandTypeRegister} - OperandTypesThreeRegistersToRegister = OperandTypes{OperandTypeThreeRegisters, OperandTypeRegister} - OperandTypesTwoRegistersToNone = OperandTypes{OperandTypeTwoRegisters, OperandTypeNone} - OperandTypesRegisterAndConstToNone = OperandTypes{OperandTypeRegisterAndConst, OperandTypeNone} - OperandTypesRegisterToMemory = OperandTypes{OperandTypeRegister, OperandTypeMemory} - OperandTypesMemoryToRegister = OperandTypes{OperandTypeMemory, OperandTypeRegister} - OperandTypesConstToRegister = OperandTypes{OperandTypeConst, OperandTypeRegister} - OperandTypesSIMDByteToSIMDByte = OperandTypes{OperandTypeSIMDByte, OperandTypeSIMDByte} - OperandTypesSIMDByteToRegister = OperandTypes{OperandTypeSIMDByte, OperandTypeRegister} - OperandTypesTwoSIMDBytesToSIMDByteRegister = OperandTypes{OperandTypeTwoSIMDBytes, OperandTypeSIMDByte} - OperandTypesRegisterToVectorRegister = OperandTypes{OperandTypeRegister, OperandTypeVectorRegister} - OperandTypesVectorRegisterToRegister = OperandTypes{OperandTypeVectorRegister, OperandTypeRegister} - OperandTypesMemoryToVectorRegister = OperandTypes{OperandTypeMemory, OperandTypeVectorRegister} - OperandTypesVectorRegisterToMemory = OperandTypes{OperandTypeVectorRegister, OperandTypeMemory} - OperandTypesVectorRegisterToVectorRegister = OperandTypes{OperandTypeVectorRegister, OperandTypeVectorRegister} - OperandTypesStaticConstToVectorRegister = OperandTypes{OperandTypeStaticConst, OperandTypeVectorRegister} + OperandTypesNoneToNone = OperandTypes{OperandTypeNone, OperandTypeNone} + OperandTypesNoneToRegister = OperandTypes{OperandTypeNone, OperandTypeRegister} + OperandTypesNoneToMemory = OperandTypes{OperandTypeNone, OperandTypeMemory} + OperandTypesNoneToBranch = OperandTypes{OperandTypeNone, OperandTypeBranch} + OperandTypesRegisterToRegister = OperandTypes{OperandTypeRegister, OperandTypeRegister} + OperandTypesLeftShiftedRegisterToRegister = OperandTypes{OperandTypeLeftShiftedRegister, OperandTypeRegister} + OperandTypesTwoRegistersToRegister = OperandTypes{OperandTypeTwoRegisters, OperandTypeRegister} + OperandTypesThreeRegistersToRegister = OperandTypes{OperandTypeThreeRegisters, OperandTypeRegister} + OperandTypesTwoRegistersToNone = OperandTypes{OperandTypeTwoRegisters, OperandTypeNone} + OperandTypesRegisterAndConstToNone = OperandTypes{OperandTypeRegisterAndConst, OperandTypeNone} + OperandTypesRegisterToMemory = OperandTypes{OperandTypeRegister, OperandTypeMemory} + OperandTypesMemoryToRegister = OperandTypes{OperandTypeMemory, OperandTypeRegister} + OperandTypesConstToRegister = OperandTypes{OperandTypeConst, OperandTypeRegister} + OperandTypesSIMDByteToSIMDByte = OperandTypes{OperandTypeSIMDByte, OperandTypeSIMDByte} + OperandTypesSIMDByteToRegister = OperandTypes{OperandTypeSIMDByte, OperandTypeRegister} + OperandTypesTwoSIMDBytesToSIMDByteRegister = OperandTypes{OperandTypeTwoSIMDBytes, OperandTypeSIMDByte} + OperandTypesRegisterToVectorRegister = OperandTypes{OperandTypeRegister, OperandTypeVectorRegister} + OperandTypesVectorRegisterToRegister = OperandTypes{OperandTypeVectorRegister, OperandTypeRegister} + OperandTypesMemoryToVectorRegister = OperandTypes{OperandTypeMemory, OperandTypeVectorRegister} + OperandTypesVectorRegisterToMemory = OperandTypes{OperandTypeVectorRegister, OperandTypeMemory} + 
OperandTypesVectorRegisterToVectorRegister = OperandTypes{OperandTypeVectorRegister, OperandTypeVectorRegister} + OperandTypesTwoVectorRegistersToVectorRegister = OperandTypes{OperandTypeTwoVectorRegisters, OperandTypeVectorRegister} + OperandTypesStaticConstToVectorRegister = OperandTypes{OperandTypeStaticConst, OperandTypeVectorRegister} ) // String implements fmt.Stringer @@ -434,6 +438,8 @@ func (a *AssemblerImpl) EncodeNode(n *NodeImpl) (err error) { err = a.EncodeVectorRegisterToVectorRegister(n) case OperandTypesStaticConstToVectorRegister: err = a.EncodeStaticConstToVectorRegister(n) + case OperandTypesTwoVectorRegistersToVectorRegister: + err = a.encodeTwoVectorRegistersToVectorRegister(n) default: err = fmt.Errorf("encoder undefined for [%s] operand type", n.Types) } @@ -706,8 +712,7 @@ func (a *AssemblerImpl) CompileVectorRegisterToVectorRegisterWithConst(instructi n.VectorArrangement = arrangement } -// CompileLoadStaticConstToVectorRegister adds an instruction where the source operand is StaticConstant located in the memory -// and the destination is the dstReg. +// CompileLoadStaticConstToVectorRegister implements Assembler.CompileLoadStaticConstToVectorRegister func (a *AssemblerImpl) CompileLoadStaticConstToVectorRegister(instruction asm.Instruction, c asm.StaticConst, dstReg asm.Register, arrangement VectorArrangement) { n := a.newNode(instruction, OperandTypesStaticConstToVectorRegister) @@ -716,6 +721,27 @@ func (a *AssemblerImpl) CompileLoadStaticConstToVectorRegister(instruction asm.I n.VectorArrangement = arrangement } +// CompileTwoVectorRegistersToVectorRegister implements Assembler.CompileTwoVectorRegistersToVectorRegister. +func (a *AssemblerImpl) CompileTwoVectorRegistersToVectorRegister(instruction asm.Instruction, srcReg, srcReg2, dstReg asm.Register, + arrangement VectorArrangement) { + n := a.newNode(instruction, OperandTypesTwoVectorRegistersToVectorRegister) + n.SrcReg = srcReg + n.SrcReg2 = srcReg2 + n.DstReg = dstReg + n.VectorArrangement = arrangement +} + +// CompileTwoVectorRegistersToVectorRegisterWithConst implements Assembler.CompileTwoVectorRegistersToVectorRegisterWithConst. +func (a *AssemblerImpl) CompileTwoVectorRegistersToVectorRegisterWithConst(instruction asm.Instruction, + srcReg, srcReg2, dstReg asm.Register, arrangement VectorArrangement, c asm.ConstantValue) { + n := a.newNode(instruction, OperandTypesTwoVectorRegistersToVectorRegister) + n.SrcReg = srcReg + n.SrcReg2 = srcReg2 + n.SrcConst = c + n.DstReg = dstReg + n.VectorArrangement = arrangement +} + func errorEncodingUnsupported(n *NodeImpl) error { return fmt.Errorf("%s is unsupported for %s type", InstructionName(n.Instruction), n.Types) } @@ -761,7 +787,7 @@ func (a *AssemblerImpl) EncodeJumpToRegister(n *NodeImpl) (err error) { // TODO: unexport after golang-asm complete removal. 
func (a *AssemblerImpl) EncodeRelativeBranch(n *NodeImpl) (err error) { switch n.Instruction { - case B, BEQ, BGE, BGT, BHI, BHS, BLE, BLO, BLS, BLT, BMI, BNE, BVS, BPL: + case B, BCONDEQ, BCONDGE, BCONDGT, BCONDHI, BCONDHS, BCONDLE, BCONDLO, BCONDLS, BCONDLT, BCONDMI, BCONDNE, BCONDVS, BCONDPL: default: return errorEncodingUnsupported(n) } @@ -781,31 +807,31 @@ func (a *AssemblerImpl) EncodeRelativeBranch(n *NodeImpl) (err error) { switch n.Instruction { case B: condBits = condBitsUnconditional - case BEQ: + case BCONDEQ: condBits = 0b0000 - case BGE: + case BCONDGE: condBits = 0b1010 - case BGT: + case BCONDGT: condBits = 0b1100 - case BHI: + case BCONDHI: condBits = 0b1000 - case BHS: + case BCONDHS: condBits = 0b0010 - case BLE: + case BCONDLE: condBits = 0b1101 - case BLO: + case BCONDLO: condBits = 0b0011 - case BLS: + case BCONDLS: condBits = 0b1001 - case BLT: + case BCONDLT: condBits = 0b1011 - case BMI: + case BCONDMI: condBits = 0b0100 - case BPL: + case BCONDPL: condBits = 0b0101 - case BNE: + case BCONDNE: condBits = 0b0001 - case BVS: + case BCONDVS: condBits = 0b0110 } @@ -2010,6 +2036,74 @@ func (a *AssemblerImpl) addOrSub64BitRegisters(sfops byte, src1RegBits byte, src }) } +// See "Logical (immediate)" in +// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Immediate +var logicalImmediate = map[asm.Instruction]struct { + sf, opc byte + resolver func(imm asm.ConstantValue) (imms, immr, N byte, err error) +}{ + ANDIMM32: {sf: 0b0, opc: 0b00, resolver: func(imm asm.ConstantValue) (imms, immr, N byte, err error) { + if !isBitMaskImmediate(uint64(imm)) { + err = fmt.Errorf("const %d must be valid bitmask immediate for %s", imm, InstructionName(ANDIMM64)) + return + } + immr, imms, N = bitmaskImmediate(uint64(imm), false) + return + }}, + ANDIMM64: {sf: 0b1, opc: 0b00, resolver: func(imm asm.ConstantValue) (imms, immr, N byte, err error) { + if !isBitMaskImmediate(uint64(imm)) { + err = fmt.Errorf("const %d must be valid bitmask immediate for %s", imm, InstructionName(ANDIMM64)) + return + } + immr, imms, N = bitmaskImmediate(uint64(imm), true) + return + }}, +} + +func bitmaskImmediate(c uint64, is64bit bool) (immr, imms, N byte) { + var size uint32 + switch { + case c != c>>32|c<<32: + size = 64 + case c != c>>16|c<<48: + size = 32 + c = uint64(int32(c)) + case c != c>>8|c<<56: + size = 16 + c = uint64(int16(c)) + case c != c>>4|c<<60: + size = 8 + c = uint64(int8(c)) + case c != c>>2|c<<62: + size = 4 + c = uint64(int64(c<<60) >> 60) + default: + size = 2 + c = uint64(int64(c<<62) >> 62) + } + + neg := false + if int64(c) < 0 { + c = ^c + neg = true + } + + onesSize, nonZeroPos := getOnesSequenceSize(c) + if neg { + nonZeroPos = onesSize + nonZeroPos + onesSize = size - onesSize + } + + var mode byte = 32 + if is64bit { + N, mode = 0b1, 64 + } + + immr = byte((size - nonZeroPos) & (size - 1) & uint32(mode-1)) + imms = byte((onesSize - 1) | 63&^(size<<1-1)) + return +} + // Exported for inter-op testing with golang-asm. // TODO: unexport after golang-asm complete removal. 
func (a *AssemblerImpl) EncodeConstToRegister(n *NodeImpl) (err error) { @@ -2021,6 +2115,24 @@ func (a *AssemblerImpl) EncodeConstToRegister(n *NodeImpl) (err error) { return err } + if log, ok := logicalImmediate[n.Instruction]; ok { + // See "Logical (immediate)" in + // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Immediate + imms, immr, N, err := log.resolver(c) + if err != nil { + return err + } + + a.Buf.Write([]byte{ + (dstRegBits << 5) | dstRegBits, + imms<<2 | dstRegBits>>3, + N<<6 | immr, + log.sf<<7 | log.opc<<5 | 0b10010, + }) + return nil + } + + // TODO: refactor and generalize the following like ^ logicalImmediate, etc. switch inst := n.Instruction; inst { case ADD, ADDS, SUB, SUBS: var sfops byte @@ -2595,141 +2707,6 @@ func checkArrangementIndexPair(arr VectorArrangement, index VectorIndex) (err er return } -func (a *AssemblerImpl) EncodeVectorRegisterToRegister(n *NodeImpl) (err error) { - if err = checkArrangementIndexPair(n.VectorArrangement, n.SrcVectorIndex); err != nil { - return - } - - srcVecRegBits, err := vectorRegisterBits(n.SrcReg) - if err != nil { - return err - } - - dstRegBits, err := intRegisterBits(n.DstReg) - if err != nil { - return err - } - - switch n.Instruction { - case VMOV, SMOV: - var imm4 byte // imm4 as in "Advanced SIMD copy" https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en - isSMOV := n.Instruction == SMOV - if isSMOV { - // SMOV: https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SMOV--Signed-Move-vector-element-to-general-purpose-register- - imm4 = 0b0101 - } else { - // VMOV is translated as "UMOV": https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/UMOV--Unsigned-Move-vector-element-to-general-purpose-register- - imm4 = 0b0111 - } - - var imm5 byte - var q byte - switch n.VectorArrangement { - case VectorArrangementB: - imm5 |= 0b1 - imm5 |= byte(n.SrcVectorIndex) << 1 - case VectorArrangementH: - imm5 |= 0b10 - imm5 |= byte(n.SrcVectorIndex) << 2 - case VectorArrangementS: - if isSMOV { - return fmt.Errorf("invalid arrangement for SMOV: %s", n.VectorArrangement.String()) - } - imm5 |= 0b100 - imm5 |= byte(n.SrcVectorIndex) << 3 - case VectorArrangementD: - if isSMOV { - return fmt.Errorf("invalid arrangement for SMOV: %s", n.VectorArrangement.String()) - } - - imm5 |= 0b1000 - imm5 |= byte(n.SrcVectorIndex) << 4 - q = 0b1 - default: - return fmt.Errorf("unsupported arrangement for VMOV: %s", n.VectorArrangement) - } - a.Buf.Write([]byte{ - (srcVecRegBits << 5) | dstRegBits, - imm4<<3 | 0b100 | srcVecRegBits>>3, - imm5, - q<<6 | 0b00001110, - }) - default: - return errorEncodingUnsupported(n) - } - return -} - -func (a *AssemblerImpl) EncodeRegisterToVectorRegister(n *NodeImpl) (err error) { - srcRegBits, err := intRegisterBits(n.SrcReg) - if err != nil { - return err - } - - dstVectorRegBits, err := vectorRegisterBits(n.DstReg) - if err != nil { - return err - } - - switch n.Instruction { - case DUP: - // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/DUP--general---Duplicate-general-purpose-register-to-vector- - var imm5 byte - switch n.VectorArrangement { - case VectorArrangementB: - imm5 = 0b1 - case VectorArrangementH: - imm5 = 0b10 - case VectorArrangementS: - imm5 = 0b100 - case VectorArrangementD: - imm5 = 0b1000 - default: - return fmt.Errorf("unsupported arrangement for DUP: %s", n.VectorArrangement) - } - 
a.Buf.Write([]byte{ - (srcRegBits << 5) | dstVectorRegBits, - 0b11<<2 | srcRegBits>>3, - imm5, - 0b01_001110, - }) - case VMOV: - if err = checkArrangementIndexPair(n.VectorArrangement, n.DstVectorIndex); err != nil { - return - } - - // VMOV is translated as "INS(Vector, Element)" - // Description: https://developer.arm.com/documentation/dui0802/a/A64-Advanced-SIMD-Vector-Instructions/INS--vector---general- - // Encoding: https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/INS--general---Insert-vector-element-from-general-purpose-register-?lang=en - var imm5 byte - switch n.VectorArrangement { - case VectorArrangementB: - imm5 |= 0b1 - imm5 |= byte(n.DstVectorIndex) << 1 - case VectorArrangementH: - imm5 |= 0b10 - imm5 |= byte(n.DstVectorIndex) << 2 - case VectorArrangementS: - imm5 |= 0b100 - imm5 |= byte(n.DstVectorIndex) << 3 - case VectorArrangementD: - imm5 |= 0b1000 - imm5 |= byte(n.DstVectorIndex) << 4 - default: - return fmt.Errorf("unsupported arrangement for VMOV: %s", n.VectorArrangement) - } - a.Buf.Write([]byte{ - (srcRegBits << 5) | dstVectorRegBits, - 0b000111_00 | srcRegBits>>3, - imm5, - 0b01001110, - }) - default: - return errorEncodingUnsupported(n) - } - return -} - func (a *AssemblerImpl) EncodeMemoryToVectorRegister(n *NodeImpl) (err error) { srcBaseRegBits, err := intRegisterBits(n.SrcReg) if err != nil { @@ -2919,285 +2896,763 @@ func (a *AssemblerImpl) EncodeStaticConstToVectorRegister(n *NodeImpl) (err erro return } -func (a *AssemblerImpl) EncodeVectorRegisterToVectorRegister(n *NodeImpl) (err error) { - var srcVectorRegBits byte - if n.SrcReg != RegRZR { - srcVectorRegBits, err = vectorRegisterBits(n.SrcReg) - if err != nil { - return err +// advancedSIMDAcrossLanes holds information to encode instructions as "Advanced SIMD two-register miscellaneous" in +// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +var advancedSIMDTwoRegisterMisc = map[asm.Instruction]struct { + U, Opcode byte + // TODO: extract common implementation of qAndSizeResolver. 
+ qAndSizeResolver func(arrangement VectorArrangement) (Q, Size byte, err error) +}{ + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/NOT--Bitwise-NOT--vector--?lang=en + NOT: {U: 0b1, Opcode: 0b00101, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size = 0b00 + switch arrangement { + case VectorArrangement16B: + Q = 0b1 + case VectorArrangement8B: + Q = 0b0 + default: + err = fmt.Errorf("unsupported arrangement %s for %s", arrangement.String(), InstructionName(NOT)) } - } - - dstVectorRegBits, err := vectorRegisterBits(n.DstReg) - if err != nil { - return err - } + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FNEG--vector---Floating-point-Negate--vector--?lang=en + VFNEG: {U: 0b1, Opcode: 0b01111, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size = 0b00 + switch arrangement { + case VectorArrangement4S: + size, Q = 0b10, 0b1 + case VectorArrangement2S: + size, Q = 0b10, 0b0 + case VectorArrangement2D: + size, Q = 0b11, 0b1 + default: + err = fmt.Errorf("unsupported arrangement %s for %s", arrangement.String(), InstructionName(VFNEG)) + } + return + }}, +} - switch n.Instruction { - case DUP: - // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/DUP--element---Duplicate-vector-element-to-vector-or-scalar- - if n.SrcVectorIndex == VectorIndexNone { - return fmt.Errorf("source vector index must be given for %s", InstructionName(DUP)) +// advancedSIMDAcrossLanes holds information to encode instructions as "Advanced SIMD three same" in +// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +var advancedSIMDThreeSame = map[asm.Instruction]struct { + U, Opcode byte + qAndSizeResolver func(arrangement VectorArrangement) (Q, Size byte, err error) +}{ + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/AND--vector---Bitwise-AND--vector--?lang=en + VAND: {U: 0b0, Opcode: 0b00011, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size = 0b00 + switch arrangement { + case VectorArrangement16B: + Q = 0b1 + case VectorArrangement8B: + Q = 0b0 + default: + err = fmt.Errorf("unsupported arrangement %s for %s", arrangement.String(), InstructionName(AND)) } - var imm5 byte - switch n.VectorArrangement { - case VectorArrangementB: - imm5 |= 0b1 - imm5 |= byte(n.SrcVectorIndex) << 1 - case VectorArrangementH: - imm5 |= 0b10 - imm5 |= byte(n.SrcVectorIndex) << 2 - case VectorArrangementS: - imm5 |= 0b100 - imm5 |= byte(n.SrcVectorIndex) << 3 - case VectorArrangementD: - imm5 |= 0b1000 - imm5 |= byte(n.SrcVectorIndex) << 4 + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/BSL--Bitwise-Select-?lang=en + BSL: {U: 0b1, Opcode: 0b00011, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size = 0b01 + switch arrangement { + case VectorArrangement16B: + Q = 0b1 + case VectorArrangement8B: + Q = 0b0 default: - return fmt.Errorf("unsupported arrangement for VMOV: %d", n.VectorArrangement) + err = fmt.Errorf("unsupported arrangement %s for %s", arrangement.String(), InstructionName(BSL)) } - a.Buf.Write([]byte{ - (srcVectorRegBits << 5) | dstVectorRegBits, - 0b1<<2 | srcVectorRegBits>>3, - imm5, - 0b0100_1110, - }) - case VMOV: - if n.SrcVectorIndex != VectorIndexNone && n.DstVectorIndex != VectorIndexNone 
{ - // This case VMOV is translated as MOV(vector, element) - // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MOV--element---Move-vector-element-to-another-vector-element--an-alias-of-INS--element-- - var imm5, imm4 byte - switch n.VectorArrangement { - case VectorArrangementB: - imm5 |= 0b1 - imm5 |= byte(n.DstVectorIndex) << 1 - imm4 = byte(n.SrcVectorIndex) - case VectorArrangementH: - imm5 |= 0b10 - imm5 |= byte(n.DstVectorIndex) << 2 - imm4 = byte(n.SrcVectorIndex) << 1 - case VectorArrangementS: - imm5 |= 0b100 - imm5 |= byte(n.DstVectorIndex) << 3 - imm4 = byte(n.SrcVectorIndex) << 2 - case VectorArrangementD: - imm5 |= 0b1000 - imm5 |= byte(n.DstVectorIndex) << 4 - imm4 = byte(n.SrcVectorIndex) << 3 - default: - return fmt.Errorf("unsupported arrangement for VMOV: %d", n.VectorArrangement) - } - a.Buf.Write([]byte{ - (srcVectorRegBits << 5) | dstVectorRegBits, - imm4<<3 | 1<<2 | srcVectorRegBits>>3, - imm5, - 0b01101110, - }) - } else { - // This case VMOV is translated as MOV(vector) - if n.VectorArrangement != VectorArrangement16B { - return fmt.Errorf("unsupported arrangement for VMOV: %s", n.VectorArrangement) - } - // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MOV--vector---Move-vector--an-alias-of-ORR--vector--register-- - a.Buf.Write([]byte{ - (srcVectorRegBits << 5) | dstVectorRegBits, - 0b000111<<2 | srcVectorRegBits>>3, - 0b101<<5 | srcVectorRegBits, - 0b0100_1110, - }) + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/EOR--vector---Bitwise-Exclusive-OR--vector--?lang=en + EOR: {U: 0b1, Opcode: 0b00011, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size = 0b00 + switch arrangement { + case VectorArrangement16B: + Q = 0b1 + case VectorArrangement8B: + Q = 0b0 + default: + err = fmt.Errorf("unsupported arrangement %s for %s", arrangement.String(), InstructionName(BSL)) } - case VADD, VSUB: - if n.VectorArrangement == VectorArrangementNone || (n.VectorArrangement >= VectorArrangementB && n.VectorArrangement <= VectorArrangementD) || - (n.VectorArrangement == VectorArrangement1D) { - return fmt.Errorf("unsupported arrangement for VADD: %s", n.VectorArrangement) + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ORR--vector--register---Bitwise-inclusive-OR--vector--register--?lang=en + VORR: {U: 0b0, Opcode: 0b00011, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size = 0b10 + switch arrangement { + case VectorArrangement16B: + Q = 0b1 + case VectorArrangement8B: + Q = 0b0 + default: + err = fmt.Errorf("unsupported arrangement %s for %s", arrangement.String(), InstructionName(VORR)) } - - var u byte - switch n.Instruction { - case VADD: - // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/ADD--vector---Add--vector-- - u = 0b0 - case VSUB: - // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SUB--vector---Subtract--vector-- - u = 0b1 + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/BIC--vector--register---Bitwise-bit-Clear--vector--register--?lang=en + BIC: {U: 0b0, Opcode: 0b00011, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size = 0b01 + switch arrangement { + case VectorArrangement16B: + Q = 0b1 + case VectorArrangement8B: + Q = 0b0 + default: + err = fmt.Errorf("unsupported arrangement %s for %s", 
arrangement.String(), InstructionName(BIC)) } - - size, q := arrangementSizeQ(n.VectorArrangement) - a.Buf.Write([]byte{ - (dstVectorRegBits << 5) | dstVectorRegBits, - 0b100001<<2 | dstVectorRegBits>>3, - size<<6 | 0b1<<5 | srcVectorRegBits, - q<<6 | u<<5 | 0b1110, - }) - case VFADDS, VFADDD, VFSUBS, VFSUBD: - var sz, b byte - switch n.Instruction { - case VFADDS: - case VFADDD: - sz = 0b1 - case VFSUBS: - b = 0b1 - case VFSUBD: - b = 0b1 - sz = 0b1 + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FADD--vector---Floating-point-Add--vector--?lang=en + VFADDS: {U: 0b0, Opcode: 0b11010, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + switch arrangement { + case VectorArrangement2S: + size, Q = 0b00, 0 + case VectorArrangement4S: + size, Q = 0b00, 1 + default: + err = fmt.Errorf("unsupported arrangement %s for %s", arrangement.String(), InstructionName(VFADDS)) } - - // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/FADD--vector---Floating-point-Add--vector-- - a.Buf.Write([]byte{ - (srcVectorRegBits << 5) | dstVectorRegBits, - 0b110101<<2 | srcVectorRegBits>>3, - b<<7 | sz<<6 | 0b1<<5 | dstVectorRegBits, - 0b1<<6 | 0b1110, - }) - - case SSHLL, USHLL: - // SSHLL: https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SSHLL--SSHLL2--Signed-Shift-Left-Long--immediate-- - // USHLL: https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/USHLL--USHLL2--Unsigned-Shift-Left-Long--immediate-- - var u byte - switch n.Instruction { - case SSHLL: - u = 0b0 - case USHLL: - u = 0b1 + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FADD--vector---Floating-point-Add--vector--?lang=en + VFADDD: {U: 0b0, Opcode: 0b11010, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + switch arrangement { + case VectorArrangement2D: + size, Q = 0b01, 1 + default: + err = fmt.Errorf("unsupported arrangement %s for %s", arrangement.String(), InstructionName(VFADDD)) } - - var immb, immh byte - switch n.VectorArrangement { - case VectorArrangement8B: - if n.SrcConst < 0 || n.SrcConst > 7 { - return fmt.Errorf("shift amount on %s must be between 0 and 7 for %s but was %d", - InstructionName(n.Instruction), n.VectorArrangement, n.SrcConst) - } - immb = byte(n.SrcConst) - immh = 0b0001 - case VectorArrangement4H: - if n.SrcConst < 0 || n.SrcConst > 15 { - return fmt.Errorf("shift amount on %s must be between 0 and 15 for %s but was %d", - InstructionName(n.Instruction), n.VectorArrangement, n.SrcConst) - } - immb = byte(n.SrcConst) & 0b111 - immh = 0b0010 | byte(n.SrcConst>>3) + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FSUB--vector---Floating-point-Subtract--vector--?lang=en + VFSUBS: {U: 0b0, Opcode: 0b11010, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + switch arrangement { case VectorArrangement2S: - if n.SrcConst < 0 || n.SrcConst > 31 { - return fmt.Errorf("shift amount on %s must be between 0 and 31 for %s but was %d", - InstructionName(n.Instruction), n.VectorArrangement, n.SrcConst) - } - immb = byte(n.SrcConst) & 0b111 - immh = 0b0100 | byte(n.SrcConst>>3) + size, Q = 0b10, 0 + case VectorArrangement4S: + size, Q = 0b10, 1 default: - return fmt.Errorf("unsupported arrangement for %s: %s", - InstructionName(n.Instruction), n.VectorArrangement) + err = fmt.Errorf("unsupported arrangement %s for 
%s", arrangement.String(), InstructionName(VFSUBS)) } - - a.Buf.Write([]byte{ - (srcVectorRegBits << 5) | dstVectorRegBits, - 0b101001<<2 | srcVectorRegBits>>3, - immh<<3 | immb, - u<<5 | 0b1111, - }) - case ADDP: - var opcode byte - var size, q byte - var rm, op byte - switch n.VectorArrangement { + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FSUB--vector---Floating-point-Subtract--vector--?lang=en + VFSUBD: {U: 0b0, Opcode: 0b11010, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + switch arrangement { + case VectorArrangement2D: + size, Q = 0b11, 1 + default: + err = fmt.Errorf("unsupported arrangement %s for %s", arrangement.String(), InstructionName(VFSUBD)) + } + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMAXP--Unsigned-Maximum-Pairwise-?lang=en + UMAXP: {U: 0b1, Opcode: 0b10100, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size, Q = arrangementSizeQ(arrangement) + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMEQ--register---Compare-bitwise-Equal--vector--?lang=en + CMEQ: {U: 0b1, Opcode: 0b10001, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size, Q = arrangementSizeQ(arrangement) + return + }}, + // https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Vector-Instructions/ADDP--vector- + VADDP: {U: 0b0, Opcode: 0b10111, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size, Q = arrangementSizeQ(arrangement) + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADD--vector---Add--vector--?lang=en + VADD: {U: 0, Opcode: 0b10000, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size, Q = arrangementSizeQ(arrangement) + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SUB--vector---Subtract--vector--?lang=en + VSUB: {U: 1, Opcode: 0b10000, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size, Q = arrangementSizeQ(arrangement) + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHL--Signed-Shift-Left--register--?lang=en + SSHL: {U: 0, Opcode: 0b01000, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size, Q = arrangementSizeQ(arrangement) + return + }}, + USHL: {U: 0b1, Opcode: 0b01000, qAndSizeResolver: func(arrangement VectorArrangement) (Q, size byte, err error) { + size, Q = arrangementSizeQ(arrangement) + return + }}, +} + +// advancedSIMDAcrossLanes holds information to encode instructions as "Advanced SIMD across lanes" in +// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +var advancedSIMDAcrossLanes = map[asm.Instruction]struct { + U, Opcode byte + // TODO: extract common implementation of qAndSizeResolver. 
+ qAndSizeResolver func(arrangement VectorArrangement) (Q, Size byte, err error) +}{ + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADDV--Add-across-Vector-?lang=en + ADDV: {U: 0b0, Opcode: 0b11011, qAndSizeResolver: func(arrangement VectorArrangement) (Q, Size byte, err error) { + switch arrangement { + case VectorArrangement16B: + Size, Q = 0b00, 0b1 + case VectorArrangement8B: + Size, Q = 0b00, 0b0 + case VectorArrangement8H: + Size, Q = 0b01, 0b1 + case VectorArrangement4H: + Size, Q = 0b01, 0b0 + case VectorArrangement4S: + Size, Q = 0b10, 0b1 + default: + err = fmt.Errorf("unsupported arrangement %s for %s", arrangement.String(), InstructionName(ADDV)) + } + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMINV--Unsigned-Minimum-across-Vector-?lang=en + UMINV: {U: 0b1, Opcode: 0b11010, qAndSizeResolver: func(arrangement VectorArrangement) (Q, Size byte, err error) { + switch arrangement { + case VectorArrangement16B: + Size, Q = 0b00, 0b1 + case VectorArrangement8B: + Size, Q = 0b00, 0b0 + case VectorArrangement8H: + Size, Q = 0b01, 0b1 + case VectorArrangement4H: + Size, Q = 0b01, 0b0 + case VectorArrangement4S: + Size, Q = 0b10, 0b1 + default: + err = fmt.Errorf("unsupported arrangement %s for %s", arrangement.String(), InstructionName(UMINV)) + } + return + }}, +} + +// advancedSIMDScalarPairwise holds information to encode instructions as "Advanced SIMD scalar pairwise" in +// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +var advancedSIMDScalarPairwise = map[asm.Instruction]struct { + U, Opcode byte + sizeResolver func(arrangement VectorArrangement) (Size byte) +}{ + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADDP--scalar---Add-Pair-of-elements--scalar--?lang=en + ADDP: {U: 0b0, Opcode: 0b11011, sizeResolver: func(arrangement VectorArrangement) (size byte) { + size = 0b11 + return + }}, +} + +// advancedSIMDCopy holds information to encode instructions as "Advanced SIMD copy" in +// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +var advancedSIMDCopy = map[asm.Instruction]struct { + op byte + // TODO: extract common implementation of resolver. 
+ resolver func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) +}{ + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/DUP--element---Duplicate-vector-element-to-vector-or-scalar-?lang=en + DUPELEM: {op: 0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) { + imm4 = 0b0000 + q = 0b1 + + switch arr { + case VectorArrangementB: + imm5 |= 0b1 + imm5 |= byte(srcIndex) << 1 + case VectorArrangementH: + imm5 |= 0b10 + imm5 |= byte(srcIndex) << 2 + case VectorArrangementS: + imm5 |= 0b100 + imm5 |= byte(srcIndex) << 3 case VectorArrangementD: - opcode = 0b10111_0 - size, q = 0b11, 0b1 - // ADDP (scalar) https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADDP--scalar---Add-Pair-of-elements--scalar--?lang=en - rm = 0b10001 - op = 0b1 + imm5 |= 0b1000 + imm5 |= byte(srcIndex) << 4 + default: + err = fmt.Errorf("unsupported arrangement for DUPELEM: %d", arr) + } + + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/DUP--general---Duplicate-general-purpose-register-to-vector-?lang=en + DUPGEN: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) { + imm4 = 0b0001 + switch arr { + case VectorArrangement8B: + imm5 = 0b1 + case VectorArrangement16B: + imm5 = 0b1 + q = 0b1 + case VectorArrangement4H: + imm5 = 0b10 + case VectorArrangement8H: + imm5 = 0b10 + q = 0b1 + case VectorArrangement2S: + imm5 = 0b100 + case VectorArrangement4S: + imm5 = 0b100 + q = 0b1 + case VectorArrangement2D: + imm5 = 0b1000 + q = 0b1 default: - // ADDP (vector) https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADDP--vector---Add-Pairwise--vector--?lang=en - opcode = 0b10111_1 - size, q = arrangementSizeQ(n.VectorArrangement) - rm = dstVectorRegBits - op = 0b0 + err = fmt.Errorf("unsupported arrangement for DUPGEN: %s", arr) + } + return + }}, + // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/INS--general---Insert-vector-element-from-general-purpose-register-?lang=en + INSGEN: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) { + imm4, q = 0b0011, 0b1 + switch arr { + case VectorArrangementB: + imm5 |= 0b1 + imm5 |= byte(dstIndex) << 1 + case VectorArrangementH: + imm5 |= 0b10 + imm5 |= byte(dstIndex) << 2 + case VectorArrangementS: + imm5 |= 0b100 + imm5 |= byte(dstIndex) << 3 + case VectorArrangementD: + imm5 |= 0b1000 + imm5 |= byte(dstIndex) << 4 + default: + err = fmt.Errorf("unsupported arrangement for INSGEN: %s", arr) + } + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMOV--Unsigned-Move-vector-element-to-general-purpose-register-?lang=en + UMOV: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) { + imm4 = 0b0111 + switch arr { + case VectorArrangementB: + imm5 |= 0b1 + imm5 |= byte(srcIndex) << 1 + case VectorArrangementH: + imm5 |= 0b10 + imm5 |= byte(srcIndex) << 2 + case VectorArrangementS: + imm5 |= 0b100 + imm5 |= byte(srcIndex) << 3 + case VectorArrangementD: + imm5 |= 0b1000 + imm5 |= byte(srcIndex) << 4 + q = 0b1 + default: + err = fmt.Errorf("unsupported arrangement for UMOV: %s", arr) + } + return + }}, + // 
https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMOV--Signed-Move-vector-element-to-general-purpose-register-?lang=en + SMOV32: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) { + imm4 = 0b0101 + switch arr { + case VectorArrangementB: + imm5 |= 0b1 + imm5 |= byte(srcIndex) << 1 + case VectorArrangementH: + imm5 |= 0b10 + imm5 |= byte(srcIndex) << 2 + default: + err = fmt.Errorf("unsupported arrangement for SMOV32: %s", arr) + } + return + }}, + // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/INS--element---Insert-vector-element-from-another-vector-element-?lang=en + INSELEM: {op: 0b1, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) { + q = 0b1 + switch arr { + case VectorArrangementB: + imm5 |= 0b1 + imm5 |= byte(dstIndex) << 1 + imm4 = byte(srcIndex) + case VectorArrangementH: + imm5 |= 0b10 + imm5 |= byte(dstIndex) << 2 + imm4 = byte(srcIndex) << 1 + case VectorArrangementS: + imm5 |= 0b100 + imm5 |= byte(dstIndex) << 3 + imm4 = byte(srcIndex) << 2 + case VectorArrangementD: + imm5 |= 0b1000 + imm5 |= byte(dstIndex) << 4 + imm4 = byte(srcIndex) << 3 + default: + err = fmt.Errorf("unsupported arrangement for INSELEM: %d", arr) + } + return + }}, +} + +// advancedSIMDTableLookup holds information to encode instructions as "Advanced SIMD table lookup" in +// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +var advancedSIMDTableLookup = map[asm.Instruction]struct { + op, op2, Len byte + qResolver func(arr VectorArrangement) (q byte) +}{ + TBL1: {op: 0, op2: 0, Len: 0b00, qResolver: func(arr VectorArrangement) (q byte) { + switch arr { + case VectorArrangement16B: + q = 0b1 + case VectorArrangement8B: + q = 0b0 + } + return + }}, + TBL2: {op: 0, op2: 0, Len: 0b01, qResolver: func(arr VectorArrangement) (q byte) { + switch arr { + case VectorArrangement16B: + q = 0b1 + case VectorArrangement8B: + q = 0b0 + } + return + }}, +} + +// advancedSIMDScalarTwoRegisterMisc holds information to encode instructions as "Advanced SIMD scalar two-register miscellaneous" in +// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +var advancedSIMDScalarTwoRegisterMisc = map[asm.Instruction]struct { + U, opcode byte + qAndSizeResolver func(arr VectorArrangement) (q, size byte) +}{ + CMEQZERO: {U: 0b0, opcode: 0b01001, qAndSizeResolver: func(arr VectorArrangement) (q, size byte) { + size, q = arrangementSizeQ(arr) + return + }}, +} + +// advancedSIMDShiftByImmediate holds information to encode instructions as "Advanced SIMD shift by immediate" in +// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +var advancedSIMDShiftByImmediate = map[asm.Instruction]struct { + U, opcode byte + immQResolver func(shiftAmount int64, arr VectorArrangement) (immh, immb, q byte, err error) +}{ + // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SSHLL--SSHLL2--Signed-Shift-Left-Long--immediate-- + SSHLLIMM: {U: 0b0, opcode: 0b10100, immQResolver: immResolverForSIMDSiftLeftByImmediate}, + // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/USHLL--USHLL2--Unsigned-Shift-Left-Long--immediate-- + USHLLIMM: {U: 0b1, opcode: 0b10100, 
immQResolver: immResolverForSIMDSiftLeftByImmediate},
+	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHR--Signed-Shift-Right--immediate--?lang=en
+	SSHR: {U: 0b0, opcode: 0b00000, immQResolver: func(shiftAmount int64, arr VectorArrangement) (immh, immb, q byte, err error) {
+		// TODO:
+		switch arr {
+		case VectorArrangement16B, VectorArrangement8B:
+			immh = 0b0001
+			immb = 8 - byte(shiftAmount&0b111)
+			if arr == VectorArrangement16B {
+				q = 1
+			}
+		case VectorArrangement8H, VectorArrangement4H:
+			v := 16 - byte(shiftAmount&0b1111)
+			immb = v & 0b111
+			immh = 0b0010 | (v >> 3)
+			if arr == VectorArrangement8H {
+				q = 1
+			}
+		case VectorArrangement4S, VectorArrangement2S:
+			v := 32 - byte(shiftAmount&0b11111)
+			immb = v & 0b111
+			immh = 0b0100 | (v >> 3)
+			if arr == VectorArrangement4S {
+				q = 1
+			}
+		case VectorArrangement2D:
+			v := 64 - byte(shiftAmount&0b111111)
+			immb = v & 0b111
+			immh = 0b1000 | (v >> 3)
+			q = 1
+		default:
+			err = fmt.Errorf("unsupported arrangement %s", arr)
+		}
+		return
+	}},
+}
+
+// advancedSIMDPermute holds information to encode instructions as "Advanced SIMD permute" in
+// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
+var advancedSIMDPermute = map[asm.Instruction]struct {
+	opcode byte
+}{
+	ZIP1: {opcode: 0b011},
+}
+
+func immResolverForSIMDSiftLeftByImmediate(shiftAmount int64, arr VectorArrangement) (immh, immb, q byte, err error) {
+	q = 0b0
+
+	switch arr {
+	case VectorArrangement8B:
+		immb = byte(shiftAmount)
+		immh = 0b0001
+	case VectorArrangement4H:
+		immb = byte(shiftAmount) & 0b111
+		immh = 0b0010 | byte(shiftAmount>>3)
+	case VectorArrangement2S:
+		immb = byte(shiftAmount) & 0b111
+		immh = 0b0100 | byte(shiftAmount>>3)
+	default:
+		err = fmt.Errorf("unsupported arrangement %s", arr)
+	}
+	return
+}
+
+// encodeAdvancedSIMDCopy encodes instruction as "Advanced SIMD copy" in
+// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
+func (a *AssemblerImpl) encodeAdvancedSIMDCopy(srcRegBits, dstRegBits, op, imm5, imm4, q byte) {
+	a.Buf.Write([]byte{
+		(srcRegBits << 5) | dstRegBits,
+		imm4<<3 | 0b1<<2 | srcRegBits>>3,
+		imm5,
+		q<<6 | op<<5 | 0b1110,
+	})
+}
+
+// encodeAdvancedSIMDThreeSame encodes instruction as "Advanced SIMD three same" in
+// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
+func (a *AssemblerImpl) encodeAdvancedSIMDThreeSame(src1, src2, dst, opcode, size, q, u byte) {
+	a.Buf.Write([]byte{
+		(src2 << 5) | dst,
+		opcode<<3 | 1<<2 | src2>>3,
+		size<<6 | 0b1<<5 | src1,
+		q<<6 | u<<5 | 0b1110,
+	})
+}
+
+// encodeAdvancedSIMDPermute encodes instruction as "Advanced SIMD permute" in
+// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
+func (a *AssemblerImpl) encodeAdvancedSIMDPermute(src1, src2, dst, opcode, size, q byte) {
+	a.Buf.Write([]byte{
+		(src2 << 5) | dst,
+		opcode<<4 | 0b1<<3 | src2>>3,
+		size<<6 | src1,
+		q<<6 | 0b1110,
+	})
+}
+
+func (a *AssemblerImpl) EncodeVectorRegisterToVectorRegister(n *NodeImpl) (err error) {
+	var srcVectorRegBits byte
+	if n.SrcReg != RegRZR {
+		srcVectorRegBits, err = vectorRegisterBits(n.SrcReg)
+		if err != nil {
+			return err
+		}
+	}
+
+	dstVectorRegBits, err :=
vectorRegisterBits(n.DstReg) + if err != nil { + return err + } + + if simdCopy, ok := advancedSIMDCopy[n.Instruction]; ok { + imm5, imm4, q, err := simdCopy.resolver(n.SrcVectorIndex, n.DstVectorIndex, n.VectorArrangement) + if err != nil { + return err } + a.encodeAdvancedSIMDCopy(srcVectorRegBits, dstVectorRegBits, simdCopy.op, imm5, imm4, q) + return nil + } + + if scalarPairwise, ok := advancedSIMDScalarPairwise[n.Instruction]; ok { + // See "Advanced SIMD scalar pairwise" in + // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en + size := scalarPairwise.sizeResolver(n.VectorArrangement) a.Buf.Write([]byte{ (srcVectorRegBits << 5) | dstVectorRegBits, - opcode<<2 | srcVectorRegBits>>3, - size<<6 | 0b1<<5 | rm, - q<<6 | op<<4 | 0b01110, + scalarPairwise.Opcode<<4 | 1<<3 | srcVectorRegBits>>3, + size<<6 | 0b11<<4 | scalarPairwise.Opcode>>4, + 0b1<<6 | scalarPairwise.U<<5 | 0b11110, }) - case UMAXP: - // "Advanced SIMD three same" in https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en - var opcode, u byte - switch n.Instruction { - case UMAXP: - // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/UMAXP--Unsigned-Maximum-Pairwise- - opcode, u = 0b10100, 0b1 + return + } + + if twoRegMisc, ok := advancedSIMDTwoRegisterMisc[n.Instruction]; ok { + // See "Advanced SIMD two-register miscellaneous" in + // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en + q, size, err := twoRegMisc.qAndSizeResolver(n.VectorArrangement) + if err != nil { + return err } - var size, q byte = arrangementSizeQ(n.VectorArrangement) a.Buf.Write([]byte{ (srcVectorRegBits << 5) | dstVectorRegBits, - opcode<<3 | 0b1<<2 | srcVectorRegBits>>3, - size<<6 | 0b1<<5 | dstVectorRegBits, - q<<6 | u<<5 | 0b01110, + twoRegMisc.Opcode<<4 | 0b1<<3 | srcVectorRegBits>>3, + size<<6 | 0b1<<5 | twoRegMisc.Opcode>>4, + q<<6 | twoRegMisc.U<<5 | 0b01110, }) - case UMINV: - // "Advanced SIMD across lanes" in https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en - var opcode, u byte = 0b11010, 0b1 - var size, q byte = arrangementSizeQ(n.VectorArrangement) + return nil + } + + if threeSame, ok := advancedSIMDThreeSame[n.Instruction]; ok { + q, size, err := threeSame.qAndSizeResolver(n.VectorArrangement) + if err != nil { + return err + } + a.encodeAdvancedSIMDThreeSame(srcVectorRegBits, dstVectorRegBits, dstVectorRegBits, threeSame.Opcode, size, q, threeSame.U) + return nil + } + if acrossLanes, ok := advancedSIMDAcrossLanes[n.Instruction]; ok { + // See "Advanced SIMD across lanes" in + // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en + q, size, err := acrossLanes.qAndSizeResolver(n.VectorArrangement) + if err != nil { + return err + } a.Buf.Write([]byte{ (srcVectorRegBits << 5) | dstVectorRegBits, - opcode<<4 | 0b1<<3 | srcVectorRegBits>>3, - size<<6 | 0b11000<<1 | opcode>>4, - q<<6 | u<<5 | 0b01110, + acrossLanes.Opcode<<4 | 0b1<<3 | srcVectorRegBits>>3, + size<<6 | 0b11000<<1 | acrossLanes.Opcode>>4, + q<<6 | acrossLanes.U<<5 | 0b01110, }) - case CMEQ: - const size byte = 0b11 - if n.SrcReg == RegRZR { - // CMEQ (zero, vector) - // 
https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMEQ--zero---Compare-bitwise-Equal-to-zero--vector--?lang=en - a.Buf.Write([]byte{ - (dstVectorRegBits << 5) | dstVectorRegBits, - 0b100110<<2 | dstVectorRegBits>>3, - size<<6 | 0b1<<5, - 0b01001110, - }) - } else { - // CMEQ (register, vector) - // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMEQ--register---Compare-bitwise-Equal--vector--?lang=en - a.Buf.Write([]byte{ - (srcVectorRegBits << 5) | dstVectorRegBits, - 0b100011<<2 | srcVectorRegBits>>3, - size<<6 | 0b1<<5 | dstVectorRegBits, - 0b01101110, - }) + return nil + } + + if lookup, ok := advancedSIMDTableLookup[n.Instruction]; ok { + q := lookup.qResolver(n.VectorArrangement) + + a.Buf.Write([]byte{ + (srcVectorRegBits << 5) | dstVectorRegBits, + lookup.Len<<5 | lookup.op<<4 | srcVectorRegBits>>3, + lookup.op2<<6 | dstVectorRegBits, + q<<6 | 0b1110, + }) + return + } + + if scalaTwoMisc, ok := advancedSIMDScalarTwoRegisterMisc[n.Instruction]; ok { + q, size := scalaTwoMisc.qAndSizeResolver(n.VectorArrangement) + a.Buf.Write([]byte{ + (dstVectorRegBits << 5) | dstVectorRegBits, + 0b100110<<2 | dstVectorRegBits>>3, + size<<6 | 0b1<<5, + q<<6 | scalaTwoMisc.U<<5 | 0b01001110, + }) + return + } + + if shiftByImmediate, ok := advancedSIMDShiftByImmediate[n.Instruction]; ok { + immh, immb, q, err := shiftByImmediate.immQResolver(n.SrcConst, n.VectorArrangement) + if err != nil { + return err } + a.Buf.Write([]byte{ + (srcVectorRegBits << 5) | dstVectorRegBits, + shiftByImmediate.opcode<<3 | 0b1<<2 | srcVectorRegBits>>3, + immh<<3 | immb, + q<<6 | shiftByImmediate.U<<5 | 0b1111, + }) + return nil + } - case TBL1, TBL2: - // Interpret dstVectorRegBits as the index register (`Rm` in the doc) - // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/TBL--Table-vector-Lookup-?lang=en + if permute, ok := advancedSIMDPermute[n.Instruction]; ok { + size, q := arrangementSizeQ(n.VectorArrangement) + a.encodeAdvancedSIMDPermute(srcVectorRegBits, dstVectorRegBits, dstVectorRegBits, permute.opcode, size, q) + return + } + return errorEncodingUnsupported(n) +} - var l byte // `len` in the doc. - switch n.Instruction { - case TBL1: - l = 0b00 - case TBL2: - l = 0b01 +func (a *AssemblerImpl) encodeTwoVectorRegistersToVectorRegister(n *NodeImpl) (err error) { + var srcRegBits, srcRegBits2, dstRegBits byte + srcRegBits, err = vectorRegisterBits(n.SrcReg) + if err != nil { + return err + } + + srcRegBits2, err = vectorRegisterBits(n.SrcReg2) + if err != nil { + return err + } + + dstRegBits, err = vectorRegisterBits(n.DstReg) + if err != nil { + return err + } + + if threeSame, ok := advancedSIMDThreeSame[n.Instruction]; ok { + q, size, err := threeSame.qAndSizeResolver(n.VectorArrangement) + if err != nil { + return err } + a.encodeAdvancedSIMDThreeSame(srcRegBits, srcRegBits2, dstRegBits, threeSame.Opcode, size, q, threeSame.U) + return nil + } + + if permute, ok := advancedSIMDPermute[n.Instruction]; ok { + size, q := arrangementSizeQ(n.VectorArrangement) + a.encodeAdvancedSIMDPermute(srcRegBits, srcRegBits2, dstRegBits, permute.opcode, size, q) + return + } - var q byte + if n.Instruction == EXT { + // EXT is the only instruction in "Advanced SIMD extract", so inline the encoding here. 
+ // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/EXT--Extract-vector-from-pair-of-vectors-?lang=en + var q, imm4 byte switch n.VectorArrangement { case VectorArrangement16B: + imm4 = 0b1111 & byte(n.SrcConst) q = 0b1 case VectorArrangement8B: - q = 0b0 + imm4 = 0b111 & byte(n.SrcConst) + default: + return fmt.Errorf("invalid arrangement %s for EXT", n.VectorArrangement) } - a.Buf.Write([]byte{ - (srcVectorRegBits << 5) | dstVectorRegBits, - l<<5 | srcVectorRegBits>>3, - dstVectorRegBits, - q<<6 | 0b1110, + (srcRegBits2 << 5) | dstRegBits, + imm4<<3 | srcRegBits2>>3, + srcRegBits, + q<<6 | 0b101110, }) - default: - return errorEncodingUnsupported(n) + return + } + return +} + +func (a *AssemblerImpl) EncodeVectorRegisterToRegister(n *NodeImpl) (err error) { + if err = checkArrangementIndexPair(n.VectorArrangement, n.SrcVectorIndex); err != nil { + return + } + + srcVecRegBits, err := vectorRegisterBits(n.SrcReg) + if err != nil { + return err + } + + dstRegBits, err := intRegisterBits(n.DstReg) + if err != nil { + return err } - return nil + if simdCopy, ok := advancedSIMDCopy[n.Instruction]; ok { + imm5, imm4, q, err := simdCopy.resolver(n.SrcVectorIndex, n.DstVectorIndex, n.VectorArrangement) + if err != nil { + return err + } + a.encodeAdvancedSIMDCopy(srcVecRegBits, dstRegBits, simdCopy.op, imm5, imm4, q) + return nil + } + return errorEncodingUnsupported(n) +} + +func (a *AssemblerImpl) EncodeRegisterToVectorRegister(n *NodeImpl) (err error) { + srcRegBits, err := intRegisterBits(n.SrcReg) + if err != nil { + return err + } + + dstVectorRegBits, err := vectorRegisterBits(n.DstReg) + if err != nil { + return err + } + + if simdCopy, ok := advancedSIMDCopy[n.Instruction]; ok { + imm5, imm4, q, err := simdCopy.resolver(n.SrcVectorIndex, n.DstVectorIndex, n.VectorArrangement) + if err != nil { + return err + } + a.encodeAdvancedSIMDCopy(srcRegBits, dstVectorRegBits, simdCopy.op, imm5, imm4, q) + return nil + } + return errorEncodingUnsupported(n) } var zeroRegisterBits byte = 0b11111 diff --git a/internal/asm/arm64/impl_test.go b/internal/asm/arm64/impl_test.go index 26c49abea1..3d36195e3c 100644 --- a/internal/asm/arm64/impl_test.go +++ b/internal/asm/arm64/impl_test.go @@ -37,16 +37,16 @@ func TestNodeImpl_String(t *testing.T) { exp: "NOP", }, { - in: &NodeImpl{Instruction: BEQ, Types: OperandTypesNoneToRegister, DstReg: RegR1}, - exp: "BEQ R1", + in: &NodeImpl{Instruction: BCONDEQ, Types: OperandTypesNoneToRegister, DstReg: RegR1}, + exp: "BCONDEQ R1", }, { - in: &NodeImpl{Instruction: BNE, Types: OperandTypesNoneToMemory, DstReg: RegR1, DstConst: 0x1234}, - exp: "BNE [R1 + 0x1234]", + in: &NodeImpl{Instruction: BCONDNE, Types: OperandTypesNoneToMemory, DstReg: RegR1, DstConst: 0x1234}, + exp: "BCONDNE [R1 + 0x1234]", }, { - in: &NodeImpl{Instruction: BNE, Types: OperandTypesNoneToBranch, JumpTarget: &NodeImpl{Instruction: NOP}}, - exp: "BNE {NOP}", + in: &NodeImpl{Instruction: BCONDNE, Types: OperandTypesNoneToBranch, JumpTarget: &NodeImpl{Instruction: NOP}}, + exp: "BCONDNE {NOP}", }, { in: &NodeImpl{Instruction: ADD, Types: OperandTypesRegisterToRegister, SrcReg: RegV0, DstReg: RegV10}, @@ -229,9 +229,9 @@ func TestAssemblerImpl_CompileJump(t *testing.T) { func TestAssemblerImpl_CompileJumpToRegister(t *testing.T) { a := NewAssemblerImpl(RegR10) - a.CompileJumpToRegister(BNE, RegR27) + a.CompileJumpToRegister(BCONDNE, RegR27) actualNode := a.Current - require.Equal(t, BNE, actualNode.Instruction) + require.Equal(t, BCONDNE, actualNode.Instruction) 
require.Equal(t, RegR27, actualNode.DstReg) require.Equal(t, OperandTypeNone, actualNode.Types.src) require.Equal(t, OperandTypeRegister, actualNode.Types.dst) @@ -239,9 +239,9 @@ func TestAssemblerImpl_CompileJumpToRegister(t *testing.T) { func TestAssemblerImpl_CompileJumpToMemory(t *testing.T) { a := NewAssemblerImpl(RegR10) - a.CompileJumpToMemory(BNE, RegR27) + a.CompileJumpToMemory(BCONDNE, RegR27) actualNode := a.Current - require.Equal(t, BNE, actualNode.Instruction) + require.Equal(t, BCONDNE, actualNode.Instruction) require.Equal(t, RegR27, actualNode.DstReg) require.Equal(t, OperandTypeNone, actualNode.Types.src) require.Equal(t, OperandTypeMemory, actualNode.Types.dst) @@ -453,6 +453,19 @@ func Test_CompileVectorRegisterToVectorRegister(t *testing.T) { require.Equal(t, VectorIndex(2), actualNode.DstVectorIndex) } +func Test_CompileTwoVectorRegistersToVectorRegister(t *testing.T) { + a := NewAssemblerImpl(RegR10) + a.CompileTwoVectorRegistersToVectorRegister(VMOV, RegV3, RegV15, RegV10, VectorArrangement1D) + actualNode := a.Current + require.Equal(t, VMOV, actualNode.Instruction) + require.Equal(t, RegV3, actualNode.SrcReg) + require.Equal(t, RegV15, actualNode.SrcReg2) + require.Equal(t, RegV10, actualNode.DstReg) + require.Equal(t, OperandTypeTwoVectorRegisters, actualNode.Types.src) + require.Equal(t, OperandTypeVectorRegister, actualNode.Types.dst) + require.Equal(t, VectorArrangement1D, actualNode.VectorArrangement) +} + func Test_checkRegisterToRegisterType(t *testing.T) { tests := []struct { src, dst asm.Register @@ -876,70 +889,257 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { srcIndex, dstIndex VectorIndex exp []byte }{ - // These are not supported in golang-asm, so test it here instead of integration tests. 
{ + inst: ZIP1, + name: "zip1 v10.16b, v10.16b, v2.16b", + x1: RegV2, + x2: RegV10, + arr: VectorArrangement16B, + exp: []byte{0x4a, 0x39, 0x2, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + inst: ADDV, + name: "addv b10, v2.16b", + x1: RegV2, + x2: RegV10, + arr: VectorArrangement16B, + exp: []byte{0x4a, 0xb8, 0x31, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + inst: VORR, + name: "orr v10.16b, v10.16b, v2.16b", + x1: RegV2, + x2: RegV10, + arr: VectorArrangement16B, + exp: []byte{0x4a, 0x1d, 0xa2, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + inst: VORR, + name: "orr v10.8b, v10.8b, v2.8b", + x1: RegV2, + x2: RegV10, + arr: VectorArrangement8B, + exp: []byte{0x4a, 0x1d, 0xa2, 0xe, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "fadd v10.2d, v10.2d, v2.2d", x1: RegV2, x2: RegV10, inst: VFADDD, - exp: []byte{ - 0x4a, 0xd4, 0x6a, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, - }, + arr: VectorArrangement2D, + exp: []byte{0x4a, 0xd5, 0x62, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, }, { + name: "fadd v10.4s, v10.4s, v2.4s", x1: RegV2, x2: RegV10, inst: VFADDS, - exp: []byte{ - 0x4a, 0xd4, 0x2a, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, - }, + arr: VectorArrangement4S, + exp: []byte{0x4a, 0xd5, 0x22, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, }, { + name: "fsub v10.2d, v10.2d, v2.2d", x1: RegV2, x2: RegV10, inst: VFSUBD, - exp: []byte{ - 0x4a, 0xd4, 0xea, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, - }, + arr: VectorArrangement2D, + exp: []byte{0x4a, 0xd5, 0xe2, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, }, { + name: "fsub v10.4s, v10.4s, v2.4s", x1: RegV2, x2: RegV10, inst: VFSUBS, - exp: []byte{ - 0x4a, 0xd4, 0xaa, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, - }, + arr: VectorArrangement4S, + exp: []byte{0x4a, 0xd5, 0xa2, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "ushll v10.8h, v2.8b, #0", + x1: RegV2, + x2: RegV10, + inst: USHLLIMM, + exp: []byte{0x4a, 0xa4, 0x8, 0x2f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement8B, + }, + { + name: "ushll v10.8h, v2.8b, #7", + x1: RegV2, + x2: RegV10, + inst: USHLLIMM, + exp: []byte{0x4a, 0xa4, 0xf, 0x2f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement8B, + c: 7, + }, + { + name: "10.8h, v2.8b, #0", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0x8, 0x4f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement16B, + c: 8, + }, + { + name: "sshr v10.16b, v2.16b, #3", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0xd, 0x4f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement16B, + c: 3, + }, + { + name: "sshr v10.16b, v2.16b, #1", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0xf, 0x4f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement16B, + c: 1, + }, + { + name: "sshr v10.8b, v2.8b, #3", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0xd, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement8B, + c: 3, + }, + { + name: "sshr v10.8h, v2.8h, #0x10", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0x10, 0x4f, 0x0, 0x0, 0x0, 0x0, 
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement8H, + c: 16, + }, + { + name: "sshr v10.8h, v2.8h, #0xf", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0x11, 0x4f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement8H, + c: 15, + }, + { + name: "sshr v10.8h, v2.8h, #3", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0x1d, 0x4f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement8H, + c: 3, + }, + { + name: "sshr v10.4h, v2.4h, #0xf", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0x11, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement4H, + c: 15, + }, + { + name: "sshr v10.2s, v2.2s, #0x20", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0x20, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement2S, + c: 32, + }, + { + name: "sshr v10.2s, v2.2s, #0x1f", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0x21, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement2S, + c: 31, + }, + { + name: "sshr v10.2s, v2.2s, #7", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0x39, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement2S, + c: 7, }, { + name: "sshr v10.4s, v2.4s, #7", x1: RegV2, x2: RegV10, - inst: SSHLL, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0x39, 0x4f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement4S, + c: 7, + }, + { + name: "sshr v10.2d, v2.2d, #0x3f", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0x41, 0x4f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement2D, + c: 63, + }, + { + name: "sshr v10.2d, v2.2d, #0x21", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0x5f, 0x4f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement2D, + c: 33, + }, + { + name: "sshr v10.2d, v2.2d, #1", + x1: RegV2, + x2: RegV10, + inst: SSHR, + exp: []byte{0x4a, 0x4, 0x7f, 0x4f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement2D, + c: 1, + }, + { + name: "sshll v10.8h, v2.8b, #0", + x1: RegV2, + x2: RegV10, + inst: SSHLLIMM, exp: []byte{ 0x4a, 0xa4, 0x8, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, }, arr: VectorArrangement8B, }, { + name: "sshll v10.8h, v2.8b, #7", x1: RegV2, x2: RegV10, - inst: SSHLL, exp: []byte{ + inst: SSHLLIMM, exp: []byte{ 0x4a, 0xa4, 0xf, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, }, arr: VectorArrangement8B, c: 7, }, { + name: "sshll v10.4s, v2.4h, #0", x1: RegV2, x2: RegV10, - inst: SSHLL, + inst: SSHLLIMM, exp: []byte{ 0x4a, 0xa4, 0x10, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, }, arr: VectorArrangement4H, }, { + name: "sshll v10.4s, v2.4h, #0xf", x1: RegV2, x2: RegV10, - inst: SSHLL, + inst: SSHLLIMM, exp: []byte{ 0x4a, 0xa4, 0x1f, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, }, @@ -947,18 +1147,20 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { c: 15, }, { + name: "sshll v10.2d, v2.2s, #0", x1: RegV2, x2: RegV10, - inst: SSHLL, + inst: SSHLLIMM, exp: []byte{ 0x4a, 0xa4, 0x20, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, }, arr: VectorArrangement2S, }, { + name: "sshll v10.2d, v2.2s, #0x1f", x1: RegV2, x2: RegV10, - inst: 
SSHLL, + inst: SSHLLIMM, exp: []byte{ 0x4a, 0xa4, 0x3f, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, }, @@ -969,7 +1171,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { x1: RegV2, x2: RegV10, name: "ins v10.s[2], v2.s[1]", - inst: VMOV, + inst: INSELEM, exp: []byte{0x4a, 0x24, 0x14, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangementS, srcIndex: 1, @@ -979,7 +1181,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { x1: RegV2, x2: RegV10, name: "ins v10.s[0], v2.s[3]", - inst: VMOV, + inst: INSELEM, exp: []byte{0x4a, 0x64, 0x4, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangementS, srcIndex: 3, @@ -989,7 +1191,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { x1: RegV2, x2: RegV10, name: "ins v10.b[0], v2.b[0xf]", - inst: VMOV, + inst: INSELEM, exp: []byte{0x4a, 0x7c, 0x1, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangementB, srcIndex: 15, @@ -999,7 +1201,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { x1: RegV2, x2: RegV10, name: "ins v10.d[1], v2.d[0]", - inst: VMOV, + inst: INSELEM, exp: []byte{0x4a, 0x4, 0x18, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangementD, srcIndex: 0, @@ -1009,7 +1211,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { x1: RegV2, x2: RegV10, name: "dup v10.2d, v2.d[0]", - inst: DUP, + inst: DUPELEM, exp: []byte{0x4a, 0x4, 0x8, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangementD, srcIndex: 0, @@ -1018,7 +1220,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { x1: RegV2, x2: RegV10, name: "dup v10.2d, v2.d[1]", - inst: DUP, + inst: DUPELEM, exp: []byte{0x4a, 0x4, 0x18, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangementD, srcIndex: 1, @@ -1027,7 +1229,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { x1: RegV2, x2: RegV10, name: "dup v10.4s, v2.s[3]", - inst: DUP, + inst: DUPELEM, exp: []byte{0x4a, 0x4, 0x1c, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangementS, srcIndex: 3, @@ -1036,7 +1238,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { x1: RegV2, x2: RegV10, name: "dup v10.8h, v2.h[7]", - inst: DUP, + inst: DUPELEM, exp: []byte{0x4a, 0x4, 0x1e, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangementH, srcIndex: 7, @@ -1045,7 +1247,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { x1: RegV2, x2: RegV10, name: "dup v10.16b, v2.b[0xf]", - inst: DUP, + inst: DUPELEM, exp: []byte{0x4a, 0x4, 0x1f, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangementB, srcIndex: 15, @@ -1053,25 +1255,25 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { { x1: RegV2, x2: RegV10, - name: "umaxp v10.16b, v2.16b, v10.16b", + name: "umaxp v10.16b, v10.16b, v2.16b", inst: UMAXP, - exp: []byte{0x4a, 0xa4, 0x2a, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + exp: []byte{0x4a, 0xa5, 0x22, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangement16B, }, { x1: RegV2, x2: RegV10, - name: "umaxp v10.8h, v2.8h, v10.8h", + name: "umaxp v10.8h, v10.8h, v2.8h", inst: UMAXP, - exp: []byte{0x4a, 
0xa4, 0x6a, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + exp: []byte{0x4a, 0xa5, 0x62, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangement8H, }, { x1: RegV2, x2: RegV10, - name: "umaxp v10.4s, v2.8h, v10.4s", + name: "umaxp v10.4s, v10.4s, v2.4s", inst: UMAXP, - exp: []byte{0x4a, 0xa4, 0xaa, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + exp: []byte{0x4a, 0xa5, 0xa2, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangement4S, }, { @@ -1080,30 +1282,29 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { name: "addp d11, v11.2d", inst: ADDP, exp: []byte{0x6b, 0xb9, 0xf1, 0x5e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, - arr: VectorArrangementD, }, { x1: RegV2, x2: RegV10, - name: "addp v10.16b, v2.16b, v10.16b", - inst: ADDP, - exp: []byte{0x4a, 0xbc, 0x2a, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + name: "addp v10.16b, v10.16b, v2.16b", + inst: VADDP, + exp: []byte{0x4a, 0xbd, 0x22, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangement16B, }, { x1: RegV2, x2: RegV10, - name: "addp v10.8h, v2.8h, v10.8h", - inst: ADDP, - exp: []byte{0x4a, 0xbc, 0x6a, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + name: "addp v10.8h, v10.8h, v2.8h", + inst: VADDP, + exp: []byte{0x4a, 0xbd, 0x62, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangement8H, }, { x1: RegV2, x2: RegV10, - name: "addp v10.4s, v2.8h, v10.4s", - inst: ADDP, - exp: []byte{0x4a, 0xbc, 0xaa, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + name: "addp v10.4s, v10.4s, v2.4s", + inst: VADDP, + exp: []byte{0x4a, 0xbd, 0xa2, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, arr: VectorArrangement4S, }, { @@ -1133,15 +1334,17 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { { x1: RegV2, x2: RegV10, - name: "cmeq v10.2d, v2.2d, v10.2d", + name: "cmeq v10.2d, v10.2d, v2.2d", + arr: VectorArrangement2D, inst: CMEQ, - exp: []byte{0x4a, 0x8c, 0xea, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + exp: []byte{0x4a, 0x8d, 0xe2, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, }, { x1: RegRZR, x2: RegV30, name: "cmeq v30.2d, v30.2d, #0", - inst: CMEQ, + inst: CMEQZERO, + arr: VectorArrangement2D, exp: []byte{0xde, 0x9b, 0xe0, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, }, { @@ -1176,6 +1379,135 @@ func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { arr: VectorArrangement16B, exp: []byte{0xe1, 0x23, 0x1, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, }, + { + x1: RegV2, + x2: RegV10, + name: "add v10.4s, v10.4s, v2.4s", + inst: VADD, + exp: []byte{0x4a, 0x85, 0xa2, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement4S, + }, + { + x1: RegV2, + x2: RegV10, + name: "add v10.2d, v10.2d, v2.2d", + inst: VADD, + exp: []byte{0x4a, 0x85, 0xe2, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement2D, + }, + { + x1: RegV2, + x2: RegV10, + name: "sub v10.8h, v10.8h, v2.8h", + inst: VSUB, + exp: []byte{0x4a, 0x85, 0x62, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement8H, + }, + { + x1: RegV29, + x2: RegV30, + name: "sub v30.16b, v30.16b, v29.16b", + inst: VSUB, + exp: []byte{0xde, 0x87, 0x3d, 0x6e, 
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement16B, + }, + { + name: "bic v10.16b, v10.16b, v2.16b", + x1: RegV2, + x2: RegV10, + inst: BIC, + arr: VectorArrangement16B, + exp: []byte{0x4a, 0x1d, 0x62, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "eor v10.16b, v10.16b, v2.16b", + x1: RegV2, + x2: RegV10, + inst: EOR, + arr: VectorArrangement16B, + exp: []byte{0x4a, 0x1d, 0x22, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "bsl v10.16b, v10.16b, v2.16b", + x1: RegV2, + x2: RegV10, + inst: BSL, + arr: VectorArrangement16B, + exp: []byte{0x4a, 0x1d, 0x62, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "bsl v10.16b, v10.16b, v2.16b", + x1: RegV2, + x2: RegV10, + inst: BSL, + arr: VectorArrangement16B, + exp: []byte{0x4a, 0x1d, 0x62, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "and v10.16b, v10.16b, v2.16b", + x1: RegV2, + x2: RegV10, + inst: VAND, + arr: VectorArrangement16B, + exp: []byte{0x4a, 0x1d, 0x22, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + // mvn is an alias of NOT: https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MVN--Bitwise-NOT--vector---an-alias-of-NOT-?lang=en + name: "mvn v10.16b, v2.16b", + x1: RegV2, + x2: RegV10, + inst: NOT, + arr: VectorArrangement16B, + exp: []byte{0x4a, 0x58, 0x20, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "fneg v10.2d, v2.2d", + x1: RegV2, + x2: RegV10, + inst: VFNEG, + arr: VectorArrangement2D, + exp: []byte{0x4a, 0xf8, 0xe0, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "fneg v10.4s, v2.4s", + x1: RegV2, + x2: RegV10, + inst: VFNEG, + arr: VectorArrangement4S, + exp: []byte{0x4a, 0xf8, 0xa0, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + x1: RegV2, + x2: RegV10, + name: "sshl v10.2d, v10.2d, v2.2d", + inst: SSHL, + exp: []byte{0x4a, 0x45, 0xe2, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement2D, + }, + { + x1: RegV25, + x2: RegV30, + name: "sshl v30.4s, v30.4s, v25.4s", + inst: SSHL, + exp: []byte{0xde, 0x47, 0xb9, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement4S, + }, + { + x1: RegV2, + x2: RegV10, + name: "ushl v10.8h, v10.8h, v2.8h", + inst: USHL, + exp: []byte{0x4a, 0x45, 0x62, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement8H, + }, + { + x1: RegV25, + x2: RegV30, + name: "ushl v30.16b, v30.16b, v25.16b", + inst: USHL, + exp: []byte{0xde, 0x47, 0x39, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + arr: VectorArrangement16B, + }, } for _, tt := range tests { @@ -1207,10 +1539,43 @@ func TestAssemblerImpl_EncodeVectorRegisterToRegister(t *testing.T) { exp []byte }{ // These are not supported in golang-asm, so test it here instead of integration tests. 
+ { + name: "umov w10, v0.b[0xf]", + n: &NodeImpl{ + Instruction: UMOV, + SrcReg: RegV0, + DstReg: RegR10, + VectorArrangement: VectorArrangementB, + SrcVectorIndex: 15, + }, + exp: []byte{0xa, 0x3c, 0x1f, 0xe, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "mov w10, v0.s[3]", + n: &NodeImpl{ + Instruction: UMOV, + SrcReg: RegV0, + DstReg: RegR10, + VectorArrangement: VectorArrangementS, + SrcVectorIndex: 3, + }, + exp: []byte{0xa, 0x3c, 0x1c, 0xe, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "mov x5, v30.d[1]", + n: &NodeImpl{ + Instruction: UMOV, + SrcReg: RegV30, + DstReg: RegR5, + VectorArrangement: VectorArrangementD, + SrcVectorIndex: 1, + }, + exp: []byte{0xc5, 0x3f, 0x18, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, { name: "smov w10, v0.b[0xf]", n: &NodeImpl{ - Instruction: SMOV, + Instruction: SMOV32, SrcReg: RegV0, DstReg: RegR10, VectorArrangement: VectorArrangementB, @@ -1221,7 +1586,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToRegister(t *testing.T) { { name: "smov w10, v0.b[0]", n: &NodeImpl{ - Instruction: SMOV, + Instruction: SMOV32, SrcReg: RegV0, DstReg: RegR10, VectorArrangement: VectorArrangementB, @@ -1232,7 +1597,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToRegister(t *testing.T) { { name: "smov w1, v30.h[7]", n: &NodeImpl{ - Instruction: SMOV, + Instruction: SMOV32, SrcReg: RegV30, DstReg: RegR1, VectorArrangement: VectorArrangementH, @@ -1243,7 +1608,7 @@ func TestAssemblerImpl_EncodeVectorRegisterToRegister(t *testing.T) { { name: "smov w1, v30.h[0]", n: &NodeImpl{ - Instruction: SMOV, + Instruction: SMOV32, SrcReg: RegV30, DstReg: RegR1, VectorArrangement: VectorArrangementH, @@ -1267,6 +1632,253 @@ func TestAssemblerImpl_EncodeVectorRegisterToRegister(t *testing.T) { } } +func TestAssemblerImpl_encodeTwoVectorRegistersToVectorRegister(t *testing.T) { + tests := []struct { + name string + n *NodeImpl + exp []byte + }{ + { + name: "orr v30.16b, v10.16b, v1.16b", + n: &NodeImpl{ + Instruction: VORR, + DstReg: RegV30, + SrcReg: RegV1, + SrcReg2: RegV10, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0x5e, 0x1d, 0xa1, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "orr v30.8b, v10.8b, v1.8b", + n: &NodeImpl{ + Instruction: VORR, + DstReg: RegV30, + SrcReg: RegV1, + SrcReg2: RegV10, + VectorArrangement: VectorArrangement8B, + }, + exp: []byte{0x5e, 0x1d, 0xa1, 0xe, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "bsl v0.8b, v15.8b, v1.8b", + n: &NodeImpl{ + Instruction: BSL, + DstReg: RegV0, + SrcReg: RegV1, + SrcReg2: RegV15, + VectorArrangement: VectorArrangement8B, + }, + exp: []byte{0xe0, 0x1d, 0x61, 0x2e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "zip1 v0.4s, v15.4s, v1.4s", + n: &NodeImpl{ + Instruction: ZIP1, + DstReg: RegV0, + SrcReg: RegV1, + SrcReg2: RegV15, + VectorArrangement: VectorArrangement4S, + }, + exp: []byte{0xe0, 0x39, 0x81, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "zip1 v0.2d, v15.2d, v1.2d", + n: &NodeImpl{ + Instruction: ZIP1, + DstReg: RegV0, + SrcReg: RegV1, + SrcReg2: RegV15, + VectorArrangement: VectorArrangement2D, + }, + exp: []byte{0xe0, 0x39, 0xc1, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "ext v0.16b, v15.16b, v1.16b, #0xf", + n: &NodeImpl{ + Instruction: EXT, + DstReg: RegV0, + SrcReg: RegV1, + SrcReg2: RegV15, + SrcConst: 0xf, + 
VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xe0, 0x79, 0x1, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "ext v0.16b, v15.16b, v1.16b, #8", + n: &NodeImpl{ + Instruction: EXT, + DstReg: RegV0, + SrcReg: RegV1, + SrcReg2: RegV15, + SrcConst: 8, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xe0, 0x41, 0x1, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "ext v0.16b, v15.16b, v1.16b, #0", + n: &NodeImpl{ + Instruction: EXT, + DstReg: RegV0, + SrcReg: RegV1, + SrcReg2: RegV15, + SrcConst: 0, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xe0, 0x1, 0x1, 0x6e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "ext v0.8b, v15.8b, v1.8b, #7", + n: &NodeImpl{ + Instruction: EXT, + DstReg: RegV0, + SrcReg: RegV1, + SrcReg2: RegV15, + SrcConst: 7, + VectorArrangement: VectorArrangement8B, + }, + exp: []byte{0xe0, 0x39, 0x1, 0x2e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + } + + for _, tt := range tests { + tc := tt + t.Run(tc.name, func(t *testing.T) { + a := NewAssemblerImpl(asm.NilRegister) + err := a.encodeTwoVectorRegistersToVectorRegister(tc.n) + require.NoError(t, err) + actual, err := a.Assemble() + require.NoError(t, err) + + require.Equal(t, tc.exp, actual, hex.EncodeToString(actual)) + }) + } +} + +func TestAssemblerImpl_EncodeConstToRegister(t *testing.T) { + tests := []struct { + name string + n *NodeImpl + exp []byte + }{ + { + name: "and w30, w30, #1", + n: &NodeImpl{ + Instruction: ANDIMM32, + DstReg: RegR30, + SrcConst: 1, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xde, 0x3, 0x0, 0x12, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "and w30, w30, #7", + n: &NodeImpl{ + Instruction: ANDIMM32, + DstReg: RegR30, + SrcConst: 0x7, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xde, 0xb, 0x0, 0x12, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "and w30, w30, #0xf", + n: &NodeImpl{ + Instruction: ANDIMM32, + DstReg: RegR30, + SrcConst: 0xf, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xde, 0xf, 0x0, 0x12, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "and w30, w30, #0x1f", + n: &NodeImpl{ + Instruction: ANDIMM32, + DstReg: RegR30, + SrcConst: 0x1f, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xde, 0x13, 0x0, 0x12, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "and w30, w30, #0x3f", + n: &NodeImpl{ + Instruction: ANDIMM32, + DstReg: RegR30, + SrcConst: 0x3f, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xde, 0x17, 0x0, 0x12, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "and x30, x30, #1", + n: &NodeImpl{ + Instruction: ANDIMM64, + DstReg: RegR30, + SrcConst: 1, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xde, 0x3, 0x40, 0x92, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "and x30, x30, #7", + n: &NodeImpl{ + Instruction: ANDIMM64, + DstReg: RegR30, + SrcConst: 0x7, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xde, 0xb, 0x40, 0x92, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "and x30, x30, #0xf", + n: &NodeImpl{ + Instruction: ANDIMM64, + DstReg: RegR30, + SrcConst: 0xf, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xde, 0xf, 0x40, 0x92, 0x0, 0x0, 0x0, 0x0, 
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "and x30, x30, #0x1f", + n: &NodeImpl{ + Instruction: ANDIMM64, + DstReg: RegR30, + SrcConst: 0x1f, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xde, 0x13, 0x40, 0x92, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "and x30, x30, #0x3f", + n: &NodeImpl{ + Instruction: ANDIMM64, + DstReg: RegR30, + SrcConst: 0x3f, + VectorArrangement: VectorArrangement16B, + }, + exp: []byte{0xde, 0x17, 0x40, 0x92, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + } + + for _, tt := range tests { + tc := tt + t.Run(tc.name, func(t *testing.T) { + a := NewAssemblerImpl(asm.NilRegister) + err := a.EncodeConstToRegister(tc.n) + require.NoError(t, err) + actual, err := a.Assemble() + require.NoError(t, err) + + require.Equal(t, tc.exp, actual, hex.EncodeToString(actual)) + }) + } +} + func TestAssemblerImpl_EncodeRegisterToVectorRegister(t *testing.T) { tests := []struct { name string @@ -1275,42 +1887,63 @@ func TestAssemblerImpl_EncodeRegisterToVectorRegister(t *testing.T) { }{ // These are not supported in golang-asm, so test it here instead of integration tests. { - name: "dup v10.2d, x10", + name: "ins v10.d[0], x10", n: &NodeImpl{ - Instruction: DUP, + Instruction: INSGEN, + DstReg: RegV10, SrcReg: RegR10, + VectorArrangement: VectorArrangementD, + }, + exp: []byte{0x4a, 0x1d, 0x8, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "ins v10.d[1], x10", + n: &NodeImpl{ + Instruction: INSGEN, DstReg: RegV10, + SrcReg: RegR10, VectorArrangement: VectorArrangementD, + DstVectorIndex: 1, + }, + exp: []byte{0x4a, 0x1d, 0x18, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + }, + { + name: "dup v10.2d, x10", + n: &NodeImpl{ + Instruction: DUPGEN, + SrcReg: RegR10, + DstReg: RegV10, + VectorArrangement: VectorArrangement2D, }, exp: []byte{0x4a, 0xd, 0x8, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, }, { name: "dup v1.4s, w30", n: &NodeImpl{ - Instruction: DUP, + Instruction: DUPGEN, SrcReg: RegR30, DstReg: RegV1, - VectorArrangement: VectorArrangementS, + VectorArrangement: VectorArrangement4S, }, exp: []byte{0xc1, 0xf, 0x4, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, }, { name: "dup v30.8h, w1", n: &NodeImpl{ - Instruction: DUP, + Instruction: DUPGEN, SrcReg: RegR1, DstReg: RegV30, - VectorArrangement: VectorArrangementH, + VectorArrangement: VectorArrangement8H, }, exp: []byte{0x3e, 0xc, 0x2, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, }, { name: "dup v30.16b, w1", n: &NodeImpl{ - Instruction: DUP, + Instruction: DUPGEN, SrcReg: RegR1, DstReg: RegV30, - VectorArrangement: VectorArrangementB, + VectorArrangement: VectorArrangement16B, }, exp: []byte{0x3e, 0xc, 0x1, 0x4e, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, }, diff --git a/internal/engine/compiler/compiler_vec_test.go b/internal/engine/compiler/compiler_vec_test.go index 3bd8a28108..6df5be5f7b 100644 --- a/internal/engine/compiler/compiler_vec_test.go +++ b/internal/engine/compiler/compiler_vec_test.go @@ -110,10 +110,6 @@ func TestCompiler_compileV128Add(t *testing.T) { } func TestCompiler_compileV128Sub(t *testing.T) { - if runtime.GOARCH != "amd64" { - // TODO: implement on amd64. 
- t.Skip() - } tests := []struct { name string @@ -1902,10 +1898,6 @@ func TestCompiler_compileV128Shuffle(t *testing.T) { } func TestCompiler_compileV128Bitmask(t *testing.T) { - if runtime.GOARCH != "amd64" { - // TODO: implement on amd64. - t.Skip() - } u16x8 := func(u1, u2, u3, u4, u5, u6, u7, u8 uint16) (ret [16]byte) { binary.LittleEndian.PutUint16(ret[0:], u1) @@ -2028,6 +2020,7 @@ func TestCompiler_compileV128Bitmask(t *testing.T) { require.NoError(t, err) // Generate and run the code under test. + code, _, _, err := compiler.compile() require.NoError(t, err) env.exec(code) @@ -2039,11 +2032,6 @@ func TestCompiler_compileV128Bitmask(t *testing.T) { } func TestCompiler_compileV128_Not(t *testing.T) { - if runtime.GOARCH != "amd64" { - // TODO: implement on amd64. - t.Skip() - } - env := newCompilerEnvironment() compiler := env.requireNewCompiler(t, newCompiler, &wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}}) @@ -2079,10 +2067,6 @@ func TestCompiler_compileV128_Not(t *testing.T) { } func TestCompiler_compileV128_And_Or_Xor_AndNot(t *testing.T) { - if runtime.GOARCH != "amd64" { - // TODO: implement on amd64. - t.Skip() - } tests := []struct { name string @@ -2315,11 +2299,6 @@ func TestCompiler_compileV128_And_Or_Xor_AndNot(t *testing.T) { } func TestCompiler_compileV128Bitselect(t *testing.T) { - if runtime.GOARCH != "amd64" { - // TODO: implement on amd64. - t.Skip() - } - tests := []struct { name string selector, x1, x2, exp [16]byte @@ -2414,11 +2393,6 @@ func TestCompiler_compileV128Bitselect(t *testing.T) { } func TestCompiler_compileV128Shl(t *testing.T) { - if runtime.GOARCH != "amd64" { - // TODO: implement on amd64. - t.Skip() - } - tests := []struct { name string shape wazeroir.Shape @@ -2704,11 +2678,6 @@ func TestCompiler_compileV128Shl(t *testing.T) { } func TestCompiler_compileV128Shr(t *testing.T) { - if runtime.GOARCH != "amd64" { - // TODO: implement on amd64. - t.Skip() - } - tests := []struct { name string signed bool diff --git a/internal/engine/compiler/impl_arm64.go b/internal/engine/compiler/impl_arm64.go index 91fd60cd36..614fd826a6 100644 --- a/internal/engine/compiler/impl_arm64.go +++ b/internal/engine/compiler/impl_arm64.go @@ -276,7 +276,7 @@ func (c *arm64Compiler) compileMaybeGrowValueStack() error { c.assembler.CompileTwoRegistersToNone(arm64.CMP, tmpX, tmpY) // If ceil > valueStackLen - stack base pointer, we need to grow the stack by calling builtin Go function. - brIfValueStackOK := c.assembler.CompileJump(arm64.BLS) + brIfValueStackOK := c.assembler.CompileJump(arm64.BCONDLS) if err := c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction, builtinFunctionIndexGrowValueStack); err != nil { return err } @@ -319,7 +319,7 @@ func (c *arm64Compiler) compileReturnFunction() error { c.assembler.CompileTwoRegistersToNone(arm64.CMP, callFramePointerReg, arm64.RegRZR) // If the values are identical, we return back to the Go code with returned status. - brIfNotEqual := c.assembler.CompileJump(arm64.BNE) + brIfNotEqual := c.assembler.CompileJump(arm64.BCONDNE) c.compileExitFromNativeCode(nativeCallStatusCodeReturned) // Otherwise, we have to jump to the caller's return address. @@ -663,31 +663,31 @@ func (c *arm64Compiler) compileBrIf(o *wazeroir.OperationBrIf) error { // Here we represent the conditional codes by using arm64.COND_** registers, and that means the // conditional jump can be performed if we use arm64.B**. 
// For example, if we have arm64.CondEQ on cond, that means we performed compileEq right before - // this compileBrIf and BrIf can be achieved by arm64.BEQ. + // this compileBrIf and BrIf can be achieved by arm64.BCONDEQ. var brInst asm.Instruction switch cond.conditionalRegister { case arm64.CondEQ: - brInst = arm64.BEQ + brInst = arm64.BCONDEQ case arm64.CondNE: - brInst = arm64.BNE + brInst = arm64.BCONDNE case arm64.CondHS: - brInst = arm64.BHS + brInst = arm64.BCONDHS case arm64.CondLO: - brInst = arm64.BLO + brInst = arm64.BCONDLO case arm64.CondMI: - brInst = arm64.BMI + brInst = arm64.BCONDMI case arm64.CondHI: - brInst = arm64.BHI + brInst = arm64.BCONDHI case arm64.CondLS: - brInst = arm64.BLS + brInst = arm64.BCONDLS case arm64.CondGE: - brInst = arm64.BGE + brInst = arm64.BCONDGE case arm64.CondLT: - brInst = arm64.BLT + brInst = arm64.BCONDLT case arm64.CondGT: - brInst = arm64.BGT + brInst = arm64.BCONDGT case arm64.CondLE: - brInst = arm64.BLE + brInst = arm64.BCONDLE default: // BUG: This means that we use the cond.conditionalRegister somewhere in this file, // but not covered in switch ^. That shouldn't happen. @@ -704,7 +704,7 @@ func (c *arm64Compiler) compileBrIf(o *wazeroir.OperationBrIf) error { // so we use CMPW (32-bit compare) here. c.assembler.CompileTwoRegistersToNone(arm64.CMPW, cond.register, arm64.RegRZR) - conditionalBR = c.assembler.CompileJump(arm64.BNE) + conditionalBR = c.assembler.CompileJump(arm64.BCONDNE) c.markRegisterUnused(cond.register) } @@ -815,7 +815,7 @@ func (c *arm64Compiler) compileBrTable(o *wazeroir.OperationBrTable) error { // Compare the length with offset. c.assembler.CompileTwoRegistersToNone(arm64.CMPW, tmpReg, index.register) // If the value exceeds the length, we will branch into the default target (corresponding to len(o.Targets) index). - brDefaultIndex := c.assembler.CompileJump(arm64.BLO) + brDefaultIndex := c.assembler.CompileJump(arm64.BCONDLO) c.assembler.CompileRegisterToRegister(arm64.MOVWU, tmpReg, index.register) c.assembler.SetJumpTargetOnNext(brDefaultIndex) @@ -946,7 +946,7 @@ func (c *arm64Compiler) compileCallImpl(index wasm.Index, targetFunctionAddressR ) // Compare tmp(len(ce.callFrameStack)) with callFrameStackPointerRegister(ce.callFrameStackPointer). c.assembler.CompileTwoRegistersToNone(arm64.CMP, tmp, callFrameStackPointerRegister) - brIfCallFrameStackOK := c.assembler.CompileJump(arm64.BNE) + brIfCallFrameStackOK := c.assembler.CompileJump(arm64.BCONDNE) // If these values equal, we need to grow the callFrame stack. // For call_indirect, we need to push the value back to the register. @@ -1192,7 +1192,7 @@ func (c *arm64Compiler) compileCallIndirect(o *wazeroir.OperationCallIndirect) e c.assembler.CompileTwoRegistersToNone(arm64.CMP, tmp2, offset.register) // If it exceeds len(table), we exit the execution. - brIfOffsetOK := c.assembler.CompileJump(arm64.BLO) + brIfOffsetOK := c.assembler.CompileJump(arm64.BCONDLO) c.compileExitFromNativeCode(nativeCallStatusCodeInvalidTableAccess) // Otherwise, we proceed to do function type check. @@ -1220,7 +1220,7 @@ func (c *arm64Compiler) compileCallIndirect(o *wazeroir.OperationCallIndirect) e // Check if the value of table[offset] equals zero, meaning that the target element is uninitialized. 
c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, offset.register) - brIfInitialized := c.assembler.CompileJump(arm64.BNE) + brIfInitialized := c.assembler.CompileJump(arm64.BCONDNE) c.compileExitFromNativeCode(nativeCallStatusCodeInvalidTableAccess) c.assembler.SetJumpTargetOnNext(brIfInitialized) @@ -1244,7 +1244,7 @@ func (c *arm64Compiler) compileCallIndirect(o *wazeroir.OperationCallIndirect) e // Compare these two values, and if they equal, we are ready to make function call. c.assembler.CompileTwoRegistersToNone(arm64.CMPW, tmp, tmp2) - brIfTypeMatched := c.assembler.CompileJump(arm64.BEQ) + brIfTypeMatched := c.assembler.CompileJump(arm64.BCONDEQ) c.compileExitFromNativeCode(nativeCallStatusCodeTypeMismatchOnIndirectCall) c.assembler.SetJumpTargetOnNext(brIfTypeMatched) @@ -1352,7 +1352,7 @@ func (c *arm64Compiler) compileSelect() error { // At this point, x1 is non-zero register, and x2 is either general purpose or zero register. c.assembler.CompileTwoRegistersToNone(arm64.CMPW, arm64.RegRZR, cv.register) - brIfNotZero := c.assembler.CompileJump(arm64.BNE) + brIfNotZero := c.assembler.CompileJump(arm64.BCONDNE) // If cv == 0, we move the value of x2 to the x1.register. @@ -1403,8 +1403,8 @@ func (c *arm64Compiler) compilePick(o *wazeroir.OperationPick) error { case runtimeValueTypeF64: c.assembler.CompileRegisterToRegister(arm64.FMOVD, pickTarget.register, pickedRegister) case runtimeValueTypeV128Lo: - c.assembler.CompileVectorRegisterToVectorRegister(arm64.VMOV, - pickTarget.register, pickedRegister, arm64.VectorArrangement16B, arm64.VectorIndexNone, arm64.VectorIndexNone) + c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.VORR, + pickTarget.register, pickTarget.register, pickedRegister, arm64.VectorArrangement16B) case runtimeValueTypeV128Hi: panic("BUG") // since pick target must point to the lower 64-bits of vectors. } @@ -1744,7 +1744,7 @@ func (c *arm64Compiler) compileIntegerDivPrecheck(is32Bit, isSigned bool, divide c.assembler.CompileTwoRegistersToNone(cmpInst, arm64.RegRZR, divisor) // If it is zero, we exit with nativeCallStatusIntegerDivisionByZero. - brIfDivisorNonZero := c.assembler.CompileJump(arm64.BNE) + brIfDivisorNonZero := c.assembler.CompileJump(arm64.BCONDNE) c.compileExitFromNativeCode(nativeCallStatusIntegerDivisionByZero) // Otherwise, we proceed. @@ -1760,7 +1760,7 @@ func (c *arm64Compiler) compileIntegerDivPrecheck(is32Bit, isSigned bool, divide c.assembler.CompileTwoRegistersToNone(cmpInst, arm64ReservedRegisterForTemporary, divisor) // If they not equal, we skip the following check. - brIfDivisorNonMinusOne := c.assembler.CompileJump(arm64.BNE) + brIfDivisorNonMinusOne := c.assembler.CompileJump(arm64.BCONDNE) // Otherwise, we further check if the dividend equals math.MinInt32 or MinInt64. c.assembler.CompileMemoryToRegister( @@ -1771,7 +1771,7 @@ func (c *arm64Compiler) compileIntegerDivPrecheck(is32Bit, isSigned bool, divide c.assembler.CompileTwoRegistersToNone(cmpInst, arm64ReservedRegisterForTemporary, dividend) // If they not equal, we are safe to execute the division. - brIfDividendNotMinInt := c.assembler.CompileJump(arm64.BNE) + brIfDividendNotMinInt := c.assembler.CompileJump(arm64.BCONDNE) // Otherwise, we raise overflow error. c.compileExitFromNativeCode(nativeCallStatusIntegerOverflow) @@ -1823,7 +1823,7 @@ func (c *arm64Compiler) compileRem(o *wazeroir.OperationRem) error { c.assembler.CompileTwoRegistersToNone(cmpInst, arm64.RegRZR, divisorReg) // If it is zero, we exit with nativeCallStatusIntegerDivisionByZero. 
- brIfDivisorNonZero := c.assembler.CompileJump(arm64.BNE) + brIfDivisorNonZero := c.assembler.CompileJump(arm64.BCONDNE) c.compileExitFromNativeCode(nativeCallStatusIntegerDivisionByZero) // Otherwise, we proceed. @@ -2256,7 +2256,7 @@ func (c *arm64Compiler) compileITruncFromF(o *wazeroir.OperationITruncFromF) err // See https://developer.arm.com/documentation/ddi0595/2020-12/AArch64-Registers/FPSR--Floating-point-Status-Register c.assembler.CompileRegisterAndConstToNone(arm64.CMP, arm64ReservedRegisterForTemporary, 1) - brOK := c.assembler.CompileJump(arm64.BNE) + brOK := c.assembler.CompileJump(arm64.BCONDNE) // If so, exit the execution with errors depending on whether or not the source value is NaN. var floatcmp asm.Instruction @@ -2268,7 +2268,7 @@ func (c *arm64Compiler) compileITruncFromF(o *wazeroir.OperationITruncFromF) err c.assembler.CompileTwoRegistersToNone(floatcmp, source.register, source.register) // VS flag is set if at least one of values for FCMP is NaN. // https://developer.arm.com/documentation/dui0801/g/Condition-Codes/Comparison-of-condition-code-meanings-in-integer-and-floating-point-code - brIfSourceNaN := c.assembler.CompileJump(arm64.BVS) + brIfSourceNaN := c.assembler.CompileJump(arm64.BCONDVS) // If the source value is not NaN, the operation was overflow. c.compileExitFromNativeCode(nativeCallStatusIntegerOverflow) @@ -2847,7 +2847,7 @@ func (c *arm64Compiler) compileMemoryAccessOffsetSetup(offsetArg uint32, targetS // Check if offsetRegister(= base+offsetArg+targetSizeInBytes) > len(memory.Buffer). c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, offsetRegister) - boundsOK := c.assembler.CompileJump(arm64.BLS) + boundsOK := c.assembler.CompileJump(arm64.BCONDLS) // If offsetRegister(= base+offsetArg+targetSizeInBytes) exceeds the memory length, // we exit the function with nativeCallStatusCodeMemoryOutOfBounds. @@ -3135,7 +3135,7 @@ func (c *arm64Compiler) compileInitImpl(isTable bool, index, tableIndex uint32) arm64ReservedRegisterForTemporary) c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, sourceOffset.register) - sourceBoundsOK := c.assembler.CompileJump(arm64.BLS) + sourceBoundsOK := c.assembler.CompileJump(arm64.BCONDLS) // If not, raise out of bounds memory access error. c.compileExitFromNativeCode(outOfBoundsErrorStatus) @@ -3165,7 +3165,7 @@ func (c *arm64Compiler) compileInitImpl(isTable bool, index, tableIndex uint32) } c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, destinationOffset.register) - destinationBoundsOK := c.assembler.CompileJump(arm64.BLS) + destinationBoundsOK := c.assembler.CompileJump(arm64.BCONDLS) // If not, raise out of bounds memory access error. c.compileExitFromNativeCode(outOfBoundsErrorStatus) @@ -3176,7 +3176,7 @@ func (c *arm64Compiler) compileInitImpl(isTable bool, index, tableIndex uint32) if !isZeroRegister(copySize.register) { // If the size equals zero, we can skip the entire instructions beflow. c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, copySize.register) - skipCopyJump := c.assembler.CompileJump(arm64.BEQ) + skipCopyJump := c.assembler.CompileJump(arm64.BCONDEQ) var movInst asm.Instruction var movSize int64 @@ -3231,7 +3231,7 @@ func (c *arm64Compiler) compileInitImpl(isTable bool, index, tableIndex uint32) // Decrement the size counter and if the value is still negative, continue the loop. 
c.assembler.CompileConstToRegister(arm64.ADDS, movSize, copySize.register) - c.assembler.CompileJump(arm64.BMI).AssignJumpTarget(beginCopyLoop) + c.assembler.CompileJump(arm64.BCONDMI).AssignJumpTarget(beginCopyLoop) c.assembler.SetJumpTargetOnNext(skipCopyJump) } @@ -3351,7 +3351,7 @@ func (c *arm64Compiler) compileCopyImpl(isTable bool, srcTableIndex, dstTableInd // Check memory len >= sourceOffset. c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, sourceOffset.register) - sourceBoundsOK := c.assembler.CompileJump(arm64.BLS) + sourceBoundsOK := c.assembler.CompileJump(arm64.BCONDLS) // If not, raise out of bounds memory access error. c.compileExitFromNativeCode(outOfBoundsErrorStatus) @@ -3377,7 +3377,7 @@ func (c *arm64Compiler) compileCopyImpl(isTable bool, srcTableIndex, dstTableInd } c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, destinationOffset.register) - destinationBoundsOK := c.assembler.CompileJump(arm64.BLS) + destinationBoundsOK := c.assembler.CompileJump(arm64.BCONDLS) // If not, raise out of bounds memory access error. c.compileExitFromNativeCode(outOfBoundsErrorStatus) @@ -3398,11 +3398,11 @@ func (c *arm64Compiler) compileCopyImpl(isTable bool, srcTableIndex, dstTableInd // If the size equals zero, we can skip the entire instructions beflow. if !isZeroRegister(copySize.register) { c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, copySize.register) - skipCopyJump := c.assembler.CompileJump(arm64.BEQ) + skipCopyJump := c.assembler.CompileJump(arm64.BCONDEQ) // If source offet < destination offset: for (i = size-1; i >= 0; i--) dst[i] = src[i]; c.assembler.CompileTwoRegistersToNone(arm64.CMP, sourceOffset.register, destinationOffset.register) - destLowerThanSourceJump := c.assembler.CompileJump(arm64.BLS) + destLowerThanSourceJump := c.assembler.CompileJump(arm64.BCONDLS) var endJump asm.Node { // sourceOffset -= size. @@ -3464,7 +3464,7 @@ func (c *arm64Compiler) compileCopyImpl(isTable bool, srcTableIndex, dstTableInd ) // If the value on the copySize.register is not equal zero, continue the loop. - c.assembler.CompileJump(arm64.BNE).AssignJumpTarget(beginCopyLoop) + c.assembler.CompileJump(arm64.BCONDNE).AssignJumpTarget(beginCopyLoop) // Otherwise, exit the loop. endJump = c.assembler.CompileJump(arm64.B) @@ -3529,7 +3529,7 @@ func (c *arm64Compiler) compileCopyImpl(isTable bool, srcTableIndex, dstTableInd // size += 1 c.assembler.CompileConstToRegister(arm64.ADDS, movSize, copySize.register) - c.assembler.CompileJump(arm64.BMI).AssignJumpTarget(beginCopyLoop) + c.assembler.CompileJump(arm64.BCONDMI).AssignJumpTarget(beginCopyLoop) } c.assembler.SetJumpTargetOnNext(skipCopyJump, endJump) } @@ -3602,7 +3602,7 @@ func (c *arm64Compiler) compileFillImpl(isTable bool, tableIndex uint32) error { // Check len >= destinationOffset. c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64ReservedRegisterForTemporary, destinationOffset.register) - destinationBoundsOK := c.assembler.CompileJump(arm64.BLS) + destinationBoundsOK := c.assembler.CompileJump(arm64.BCONDLS) // If not, raise the runtime error. if isTable { @@ -3616,7 +3616,7 @@ func (c *arm64Compiler) compileFillImpl(isTable bool, tableIndex uint32) error { // If the size equals zero, we can skip the entire instructions beflow. 
c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, fillSize.register) - skipCopyJump := c.assembler.CompileJump(arm64.BEQ) + skipCopyJump := c.assembler.CompileJump(arm64.BCONDEQ) // destinationOffset -= size. c.assembler.CompileRegisterToRegister(arm64.SUB, fillSize.register, destinationOffset.register) @@ -3664,7 +3664,7 @@ func (c *arm64Compiler) compileFillImpl(isTable bool, tableIndex uint32) error { ) // If the value on the copySizeRgister.register is not equal zero, continue the loop. - continueJump := c.assembler.CompileJump(arm64.BNE) + continueJump := c.assembler.CompileJump(arm64.BCONDNE) continueJump.AssignJumpTarget(beginCopyLoop) // Mark all of the operand registers. @@ -3774,7 +3774,7 @@ func (c *arm64Compiler) compileTableGet(o *wazeroir.OperationTableGet) error { c.assembler.CompileTwoRegistersToNone(arm64.CMP, ref, offset.register) // If it exceeds len(table), we exit the execution. - brIfBoundsOK := c.assembler.CompileJump(arm64.BLO) + brIfBoundsOK := c.assembler.CompileJump(arm64.BCONDLO) c.compileExitFromNativeCode(nativeCallStatusCodeInvalidTableAccess) c.assembler.SetJumpTargetOnNext(brIfBoundsOK) @@ -3835,7 +3835,7 @@ func (c *arm64Compiler) compileTableSet(o *wazeroir.OperationTableSet) error { c.assembler.CompileTwoRegistersToNone(arm64.CMP, tmp, offset.register) // If it exceeds len(table), we exit the execution. - brIfBoundsOK := c.assembler.CompileJump(arm64.BLO) + brIfBoundsOK := c.assembler.CompileJump(arm64.BCONDLO) c.compileExitFromNativeCode(nativeCallStatusCodeInvalidTableAccess) c.assembler.SetJumpTargetOnNext(brIfBoundsOK) @@ -4156,7 +4156,7 @@ func (c *arm64Compiler) compileModuleContextInitialization() error { // If the module instance address stays the same, we could skip the entire code below. c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64CallingConventionModuleInstanceAddressRegister, tmpX) - brIfModuleUnchanged := c.assembler.CompileJump(arm64.BEQ) + brIfModuleUnchanged := c.assembler.CompileJump(arm64.BCONDEQ) // Otherwise, update the moduleEngine.moduleContext.ModuleInstanceAddress. 
c.assembler.CompileRegisterToMemory(arm64.MOVD, diff --git a/internal/engine/compiler/impl_vec_arm64.go b/internal/engine/compiler/impl_vec_arm64.go index c406972ceb..44def48679 100644 --- a/internal/engine/compiler/impl_vec_arm64.go +++ b/internal/engine/compiler/impl_vec_arm64.go @@ -34,7 +34,7 @@ func (c *arm64Compiler) compileV128Const(o *wazeroir.OperationV128Const) error { c.assembler.CompileConstToRegister(arm64.MOVD, int64(o.Hi), arm64ReservedRegisterForTemporary) } // "ins Vn.D[1], intReg" - c.assembler.CompileRegisterToVectorRegister(arm64.VMOV, intReg, result, arm64.VectorArrangementD, 1) + c.assembler.CompileRegisterToVectorRegister(arm64.INSGEN, intReg, result, arm64.VectorArrangementD, 1) c.pushVectorRuntimeValueLocationOnRegister(result) return nil @@ -151,7 +151,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV, arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementD, ) - c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLL, result, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLLIMM, result, result, arm64.VectorArrangement8B, arm64.VectorIndexNone, arm64.VectorIndexNone) case wazeroir.V128LoadType8x8u: offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8) @@ -161,7 +161,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV, arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementD, ) - c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLL, result, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLLIMM, result, result, arm64.VectorArrangement8B, arm64.VectorIndexNone, arm64.VectorIndexNone) case wazeroir.V128LoadType16x4s: offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8) @@ -171,7 +171,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV, arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementD, ) - c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLL, result, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLLIMM, result, result, arm64.VectorArrangement4H, arm64.VectorIndexNone, arm64.VectorIndexNone) case wazeroir.V128LoadType16x4u: offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8) @@ -181,7 +181,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV, arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementD, ) - c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLL, result, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLLIMM, result, result, arm64.VectorArrangement4H, arm64.VectorIndexNone, arm64.VectorIndexNone) case wazeroir.V128LoadType32x2s: offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8) @@ -191,7 +191,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV, arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementD, ) - c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLL, result, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLLIMM, result, result, arm64.VectorArrangement2S, 
arm64.VectorIndexNone, arm64.VectorIndexNone) case wazeroir.V128LoadType32x2u: offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8) @@ -201,7 +201,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV, arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementD, ) - c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLL, result, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLLIMM, result, result, arm64.VectorArrangement2S, arm64.VectorIndexNone, arm64.VectorIndexNone) case wazeroir.V128LoadType8Splat: offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 1) @@ -284,7 +284,7 @@ func (c *arm64Compiler) compileV128LoadLane(o *wazeroir.OperationV128LoadLane) ( } c.assembler.CompileMemoryWithRegisterOffsetToRegister(loadInst, arm64ReservedRegisterForMemory, source, source) - c.assembler.CompileRegisterToVectorRegister(arm64.VMOV, source, targetVector.register, arr, arm64.VectorIndex(o.LaneIndex)) + c.assembler.CompileRegisterToVectorRegister(arm64.INSGEN, source, targetVector.register, arr, arm64.VectorIndex(o.LaneIndex)) c.pushVectorRuntimeValueLocationOnRegister(targetVector.register) c.locationStack.markRegisterUnused(source) @@ -341,7 +341,7 @@ func (c *arm64Compiler) compileV128StoreLane(o *wazeroir.OperationV128StoreLane) return err } - c.assembler.CompileVectorRegisterToRegister(arm64.VMOV, v.register, arm64ReservedRegisterForTemporary, arr, + c.assembler.CompileVectorRegisterToRegister(arm64.UMOV, v.register, arm64ReservedRegisterForTemporary, arr, arm64.VectorIndex(o.LaneIndex)) c.assembler.CompileRegisterToMemoryWithRegisterOffset(storeInst, @@ -366,9 +366,9 @@ func (c *arm64Compiler) compileV128ExtractLane(o *wazeroir.OperationV128ExtractL } var inst asm.Instruction if o.Signed { - inst = arm64.SMOV + inst = arm64.SMOV32 } else { - inst = arm64.VMOV + inst = arm64.UMOV } c.assembler.CompileVectorRegisterToRegister(inst, v.register, result, arm64.VectorArrangementB, arm64.VectorIndex(o.LaneIndex)) @@ -382,9 +382,9 @@ func (c *arm64Compiler) compileV128ExtractLane(o *wazeroir.OperationV128ExtractL } var inst asm.Instruction if o.Signed { - inst = arm64.SMOV + inst = arm64.SMOV32 } else { - inst = arm64.VMOV + inst = arm64.UMOV } c.assembler.CompileVectorRegisterToRegister(inst, v.register, result, arm64.VectorArrangementH, arm64.VectorIndex(o.LaneIndex)) @@ -396,7 +396,7 @@ func (c *arm64Compiler) compileV128ExtractLane(o *wazeroir.OperationV128ExtractL if err != nil { return err } - c.assembler.CompileVectorRegisterToRegister(arm64.VMOV, v.register, result, + c.assembler.CompileVectorRegisterToRegister(arm64.UMOV, v.register, result, arm64.VectorArrangementS, arm64.VectorIndex(o.LaneIndex)) c.locationStack.markRegisterUnused(v.register) @@ -406,17 +406,17 @@ func (c *arm64Compiler) compileV128ExtractLane(o *wazeroir.OperationV128ExtractL if err != nil { return err } - c.assembler.CompileVectorRegisterToRegister(arm64.VMOV, v.register, result, + c.assembler.CompileVectorRegisterToRegister(arm64.UMOV, v.register, result, arm64.VectorArrangementD, arm64.VectorIndex(o.LaneIndex)) c.locationStack.markRegisterUnused(v.register) c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI64) case wazeroir.ShapeF32x4: - c.assembler.CompileVectorRegisterToVectorRegister(arm64.VMOV, v.register, v.register, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.INSELEM, v.register, v.register, arm64.VectorArrangementS, 
arm64.VectorIndex(o.LaneIndex), 0) c.pushRuntimeValueLocationOnRegister(v.register, runtimeValueTypeF32) case wazeroir.ShapeF64x2: - c.assembler.CompileVectorRegisterToVectorRegister(arm64.VMOV, v.register, v.register, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.INSELEM, v.register, v.register, arm64.VectorArrangementD, arm64.VectorIndex(o.LaneIndex), 0) c.pushRuntimeValueLocationOnRegister(v.register, runtimeValueTypeF64) } @@ -437,22 +437,22 @@ func (c *arm64Compiler) compileV128ReplaceLane(o *wazeroir.OperationV128ReplaceL switch o.Shape { case wazeroir.ShapeI8x16: - c.assembler.CompileRegisterToVectorRegister(arm64.VMOV, origin.register, vector.register, + c.assembler.CompileRegisterToVectorRegister(arm64.INSGEN, origin.register, vector.register, arm64.VectorArrangementB, arm64.VectorIndex(o.LaneIndex)) case wazeroir.ShapeI16x8: - c.assembler.CompileRegisterToVectorRegister(arm64.VMOV, origin.register, vector.register, + c.assembler.CompileRegisterToVectorRegister(arm64.INSGEN, origin.register, vector.register, arm64.VectorArrangementH, arm64.VectorIndex(o.LaneIndex)) case wazeroir.ShapeI32x4: - c.assembler.CompileRegisterToVectorRegister(arm64.VMOV, origin.register, vector.register, + c.assembler.CompileRegisterToVectorRegister(arm64.INSGEN, origin.register, vector.register, arm64.VectorArrangementS, arm64.VectorIndex(o.LaneIndex)) case wazeroir.ShapeI64x2: - c.assembler.CompileRegisterToVectorRegister(arm64.VMOV, origin.register, vector.register, + c.assembler.CompileRegisterToVectorRegister(arm64.INSGEN, origin.register, vector.register, arm64.VectorArrangementD, arm64.VectorIndex(o.LaneIndex)) case wazeroir.ShapeF32x4: - c.assembler.CompileVectorRegisterToVectorRegister(arm64.VMOV, origin.register, vector.register, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.INSELEM, origin.register, vector.register, arm64.VectorArrangementS, 0, arm64.VectorIndex(o.LaneIndex)) case wazeroir.ShapeF64x2: - c.assembler.CompileVectorRegisterToVectorRegister(arm64.VMOV, origin.register, vector.register, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.INSELEM, origin.register, vector.register, arm64.VectorArrangementD, 0, arm64.VectorIndex(o.LaneIndex)) } @@ -475,36 +475,36 @@ func (c *arm64Compiler) compileV128Splat(o *wazeroir.OperationV128Splat) (err er if err != nil { return } - c.assembler.CompileRegisterToVectorRegister(arm64.DUP, origin.register, result, - arm64.VectorArrangementB, arm64.VectorIndexNone) + c.assembler.CompileRegisterToVectorRegister(arm64.DUPGEN, origin.register, result, + arm64.VectorArrangement16B, arm64.VectorIndexNone) case wazeroir.ShapeI16x8: result, err = c.allocateRegister(registerTypeVector) if err != nil { return } - c.assembler.CompileRegisterToVectorRegister(arm64.DUP, origin.register, result, - arm64.VectorArrangementH, arm64.VectorIndexNone) + c.assembler.CompileRegisterToVectorRegister(arm64.DUPGEN, origin.register, result, + arm64.VectorArrangement8H, arm64.VectorIndexNone) case wazeroir.ShapeI32x4: result, err = c.allocateRegister(registerTypeVector) if err != nil { return } - c.assembler.CompileRegisterToVectorRegister(arm64.DUP, origin.register, result, - arm64.VectorArrangementS, arm64.VectorIndexNone) + c.assembler.CompileRegisterToVectorRegister(arm64.DUPGEN, origin.register, result, + arm64.VectorArrangement4S, arm64.VectorIndexNone) case wazeroir.ShapeI64x2: result, err = c.allocateRegister(registerTypeVector) if err != nil { return } - c.assembler.CompileRegisterToVectorRegister(arm64.DUP, origin.register, result, - 
arm64.VectorArrangementD, arm64.VectorIndexNone) + c.assembler.CompileRegisterToVectorRegister(arm64.DUPGEN, origin.register, result, + arm64.VectorArrangement2D, arm64.VectorIndexNone) case wazeroir.ShapeF32x4: result = origin.register - c.assembler.CompileVectorRegisterToVectorRegister(arm64.DUP, origin.register, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.DUPELEM, origin.register, result, arm64.VectorArrangementS, 0, arm64.VectorIndexNone) case wazeroir.ShapeF64x2: result = origin.register - c.assembler.CompileVectorRegisterToVectorRegister(arm64.DUP, origin.register, result, + c.assembler.CompileVectorRegisterToVectorRegister(arm64.DUPELEM, origin.register, result, arm64.VectorArrangementD, 0, arm64.VectorIndexNone) } @@ -536,8 +536,8 @@ func (c *arm64Compiler) compileV128Shuffle(o *wazeroir.OperationV128Shuffle) (er c.onValueReleaseRegisterToStack(wReg) if w.onRegister() { - c.assembler.CompileVectorRegisterToVectorRegister(arm64.VMOV, w.register, wReg, - arm64.VectorArrangement16B, arm64.VectorIndexNone, arm64.VectorIndexNone) + c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.VORR, + w.register, w.register, wReg, arm64.VectorArrangement16B) // We no longer use the old register. c.markRegisterUnused(w.register) } else { // on stack @@ -553,8 +553,8 @@ func (c *arm64Compiler) compileV128Shuffle(o *wazeroir.OperationV128Shuffle) (er c.onValueReleaseRegisterToStack(vReg) if v.onRegister() { - c.assembler.CompileVectorRegisterToVectorRegister(arm64.VMOV, v.register, vReg, - arm64.VectorArrangement16B, arm64.VectorIndexNone, arm64.VectorIndexNone) + c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.VORR, + v.register, v.register, vReg, arm64.VectorArrangement16B) // We no longer use the old register. c.markRegisterUnused(v.register) } else { // on stack @@ -607,7 +607,7 @@ func (c *arm64Compiler) compileV128AnyTrue(*wazeroir.OperationV128AnyTrue) (err v := vector.register c.assembler.CompileVectorRegisterToVectorRegister(arm64.UMAXP, v, v, arm64.VectorArrangement16B, arm64.VectorIndexNone, arm64.VectorIndexNone) - c.assembler.CompileVectorRegisterToRegister(arm64.VMOV, v, arm64ReservedRegisterForTemporary, + c.assembler.CompileVectorRegisterToRegister(arm64.UMOV, v, arm64ReservedRegisterForTemporary, arm64.VectorArrangementD, 0) c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, arm64ReservedRegisterForTemporary) c.locationStack.pushRuntimeValueLocationOnConditionalRegister(arm64.CondNE) @@ -625,10 +625,10 @@ func (c *arm64Compiler) compileV128AllTrue(o *wazeroir.OperationV128AllTrue) (er v := vector.register if o.Shape == wazeroir.ShapeI64x2 { - c.assembler.CompileVectorRegisterToVectorRegister(arm64.CMEQ, arm64.RegRZR, v, - arm64.VectorArrangementNone, arm64.VectorIndexNone, arm64.VectorIndexNone) + c.assembler.CompileVectorRegisterToVectorRegister(arm64.CMEQZERO, arm64.RegRZR, v, + arm64.VectorArrangement2D, arm64.VectorIndexNone, arm64.VectorIndexNone) c.assembler.CompileVectorRegisterToVectorRegister(arm64.ADDP, v, v, - arm64.VectorArrangementD, arm64.VectorIndexNone, arm64.VectorIndexNone) + arm64.VectorArrangementNone, arm64.VectorIndexNone, arm64.VectorIndexNone) c.assembler.CompileTwoRegistersToNone(arm64.FCMPD, v, v) c.locationStack.pushRuntimeValueLocationOnConditionalRegister(arm64.CondEQ) } else { @@ -644,7 +644,7 @@ func (c *arm64Compiler) compileV128AllTrue(o *wazeroir.OperationV128AllTrue) (er c.assembler.CompileVectorRegisterToVectorRegister(arm64.UMINV, v, v, arr, arm64.VectorIndexNone, arm64.VectorIndexNone) - 
c.assembler.CompileVectorRegisterToRegister(arm64.VMOV, v, arm64ReservedRegisterForTemporary,
+		c.assembler.CompileVectorRegisterToRegister(arm64.UMOV, v, arm64ReservedRegisterForTemporary,
 			arm64.VectorArrangementD, 0)
 		c.assembler.CompileTwoRegistersToNone(arm64.CMP, arm64.RegRZR, arm64ReservedRegisterForTemporary)
 		c.locationStack.pushRuntimeValueLocationOnConditionalRegister(arm64.CondNE)
@@ -653,49 +653,288 @@ func (c *arm64Compiler) compileV128AllTrue(o *wazeroir.OperationV128AllTrue) (er
 	return
 }
 
+var (
+	i8x16BitmaskConst = [16]byte{
+		0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+		0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+	}
+	i16x8BitmaskConst = [16]byte{
+		0x01, 0x00, 0x02, 0x00, 0x04, 0x00, 0x08, 0x00,
+		0x10, 0x00, 0x20, 0x00, 0x40, 0x00, 0x80, 0x00,
+	}
+	i32x4BitmaskConst = [16]byte{
+		0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+		0x04, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+	}
+)
+
 // compileV128BitMask implements compiler.compileV128BitMask for arm64.
-func (c *arm64Compiler) compileV128BitMask(o *wazeroir.OperationV128BitMask) error {
-	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+func (c *arm64Compiler) compileV128BitMask(o *wazeroir.OperationV128BitMask) (err error) {
+	vector := c.locationStack.popV128()
+	if err = c.compileEnsureOnGeneralPurposeRegister(vector); err != nil {
+		return
+	}
+
+	v := vector.register
+
+	result, err := c.allocateRegister(registerTypeGeneralPurpose)
+	if err != nil {
+		return err
+	}
+
+	switch o.Shape {
+	case wazeroir.ShapeI8x16:
+		vecTmp, err := c.allocateRegister(registerTypeVector)
+		if err != nil {
+			return err
+		}
+		// Right arithmetic shift on the original vector in place, so that we have:
+		// v[i] = 0xff if vi<0, 0 otherwise.
+		c.assembler.CompileVectorRegisterToVectorRegisterWithConst(arm64.SSHR, v, v, arm64.VectorArrangement16B, 7)
+
+		// Load the bit mask into vecTmp.
+		c.assembler.CompileLoadStaticConstToVectorRegister(arm64.VMOV, i8x16BitmaskConst[:], vecTmp, arm64.VectorArrangementQ)
+
+		// Lane-wise logical AND with i8x16BitmaskConst, meaning that we have
+		// v[i] = (1 << i) if vi<0, 0 otherwise.
+		//
+		// Below, we use the following notation:
+		// wi := (1 << i) if vi<0, 0 otherwise.
+		c.assembler.CompileVectorRegisterToVectorRegister(arm64.VAND, vecTmp, v, arm64.VectorArrangement16B,
+			arm64.VectorIndexNone, arm64.VectorIndexNone)
+
+		// Swap the lower and higher 8 byte elements, and write it into vecTmp, meaning that we have
+		// vecTmp[i] = w(i+8) if i < 8, w(i-8) otherwise.
+		//
+		c.assembler.CompileTwoVectorRegistersToVectorRegisterWithConst(arm64.EXT, v, v, vecTmp, arm64.VectorArrangement16B, 0x8)
+
+		// v = [w0, w8, ..., w7, w15]
+		c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.ZIP1, vecTmp, v, v, arm64.VectorArrangement16B)
+
+		// v.h[0] = w0 + ... + w15
+		c.assembler.CompileVectorRegisterToVectorRegister(arm64.ADDV, v, v,
+			arm64.VectorArrangement8H, arm64.VectorIndexNone, arm64.VectorIndexNone)
+
+		// Extract the v.h[0] as the result.
+		c.assembler.CompileVectorRegisterToRegister(arm64.UMOV, v, result, arm64.VectorArrangementH, 0)
+	case wazeroir.ShapeI16x8:
+		vecTmp, err := c.allocateRegister(registerTypeVector)
+		if err != nil {
+			return err
+		}
+		// Right arithmetic shift on the original vector in place, so that we have:
+		// v[i] = 0xffff if vi<0, 0 otherwise.
+		c.assembler.CompileVectorRegisterToVectorRegisterWithConst(arm64.SSHR, v, v, arm64.VectorArrangement8H, 15)
+
+		// Load the bit mask into vecTmp.
+		c.assembler.CompileLoadStaticConstToVectorRegister(arm64.VMOV, i16x8BitmaskConst[:], vecTmp, arm64.VectorArrangementQ)
+
+		// Lane-wise logical AND with i16x8BitmaskConst, meaning that we have
+		// v[i] = (1 << i) if vi<0, 0 otherwise for i = 0..7.
+		c.assembler.CompileVectorRegisterToVectorRegister(arm64.VAND, vecTmp, v, arm64.VectorArrangement16B,
+			arm64.VectorIndexNone, arm64.VectorIndexNone)
+
+		c.assembler.CompileVectorRegisterToVectorRegister(arm64.ADDV, v, v,
+			arm64.VectorArrangement8H, arm64.VectorIndexNone, arm64.VectorIndexNone)
+
+		c.assembler.CompileVectorRegisterToRegister(arm64.UMOV, v, result, arm64.VectorArrangementH, 0)
+	case wazeroir.ShapeI32x4:
+		vecTmp, err := c.allocateRegister(registerTypeVector)
+		if err != nil {
+			return err
+		}
+
+		// Right arithmetic shift on the original vector in place, so that we have:
+		// v[i] = 0xffffffff if vi<0, 0 otherwise.
+		c.assembler.CompileVectorRegisterToVectorRegisterWithConst(arm64.SSHR, v, v, arm64.VectorArrangement4S, 32)
+
+		// Load the bit mask into vecTmp.
+		c.assembler.CompileLoadStaticConstToVectorRegister(arm64.VMOV, i32x4BitmaskConst[:], vecTmp, arm64.VectorArrangementQ)
+
+		// Lane-wise logical AND with i32x4BitmaskConst, meaning that we have
+		// v[i] = (1 << i) if vi<0, 0 otherwise for i = 0..3.
+		c.assembler.CompileVectorRegisterToVectorRegister(arm64.VAND, vecTmp, v, arm64.VectorArrangement16B,
+			arm64.VectorIndexNone, arm64.VectorIndexNone)
+
+		c.assembler.CompileVectorRegisterToVectorRegister(arm64.ADDV, v, v,
+			arm64.VectorArrangement4S, arm64.VectorIndexNone, arm64.VectorIndexNone)
+
+		c.assembler.CompileVectorRegisterToRegister(arm64.UMOV, v, result, arm64.VectorArrangementS, 0)
+	case wazeroir.ShapeI64x2:
+		// Move the lower 64-bit int into result.
+		c.assembler.CompileVectorRegisterToRegister(arm64.UMOV, v, result,
+			arm64.VectorArrangementD, 0)
+		// Move the higher 64-bit int into arm64ReservedRegisterForTemporary.
+		c.assembler.CompileVectorRegisterToRegister(arm64.UMOV, v, arm64ReservedRegisterForTemporary,
+			arm64.VectorArrangementD, 1)
+
+		// Move the sign bit into the least significant bit.
+		c.assembler.CompileConstToRegister(arm64.LSR, 63, result)
+		c.assembler.CompileConstToRegister(arm64.LSR, 63, arm64ReservedRegisterForTemporary)
+
+		// result = (arm64ReservedRegisterForTemporary<<1) | result
+		c.assembler.CompileLeftShiftedRegisterToRegister(arm64.ADD,
+			arm64ReservedRegisterForTemporary, 1, result, result)
+	}
+
+	c.markRegisterUnused(v)
+	c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
+	return
 }
 
 // compileV128And implements compiler.compileV128And for arm64.
-func (c *arm64Compiler) compileV128And(o *wazeroir.OperationV128And) error {
-	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+func (c *arm64Compiler) compileV128And(*wazeroir.OperationV128And) error {
+	return c.compileV128x2BinOp(arm64.VAND, arm64.VectorArrangement16B)
 }
 
 // compileV128Not implements compiler.compileV128Not for arm64.
-func (c *arm64Compiler) compileV128Not(o *wazeroir.OperationV128Not) error {
-	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+func (c *arm64Compiler) compileV128Not(*wazeroir.OperationV128Not) error {
+	return c.compileV128UniOp(arm64.NOT, arm64.VectorArrangement16B)
 }
 
 // compileV128Or implements compiler.compileV128Or for arm64.
-func (c *arm64Compiler) compileV128Or(o *wazeroir.OperationV128Or) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) +func (c *arm64Compiler) compileV128Or(*wazeroir.OperationV128Or) error { + return c.compileV128x2BinOp(arm64.VORR, arm64.VectorArrangement16B) } // compileV128Xor implements compiler.compileV128Xor for arm64. -func (c *arm64Compiler) compileV128Xor(o *wazeroir.OperationV128Xor) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) +func (c *arm64Compiler) compileV128Xor(*wazeroir.OperationV128Xor) error { + return c.compileV128x2BinOp(arm64.EOR, arm64.VectorArrangement16B) } // compileV128Bitselect implements compiler.compileV128Bitselect for arm64. -func (c *arm64Compiler) compileV128Bitselect(o *wazeroir.OperationV128Bitselect) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) +func (c *arm64Compiler) compileV128Bitselect(*wazeroir.OperationV128Bitselect) error { + selector := c.locationStack.popV128() + if err := c.compileEnsureOnGeneralPurposeRegister(selector); err != nil { + return err + } + + x2 := c.locationStack.popV128() + if err := c.compileEnsureOnGeneralPurposeRegister(x2); err != nil { + return err + } + + x1 := c.locationStack.popV128() + if err := c.compileEnsureOnGeneralPurposeRegister(x1); err != nil { + return err + } + + c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.BSL, + x2.register, x1.register, selector.register, arm64.VectorArrangement16B) + + c.markRegisterUnused(x1.register, x2.register) + c.pushVectorRuntimeValueLocationOnRegister(selector.register) + return nil } // compileV128AndNot implements compiler.compileV128AndNot for arm64. -func (c *arm64Compiler) compileV128AndNot(o *wazeroir.OperationV128AndNot) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) +func (c *arm64Compiler) compileV128AndNot(*wazeroir.OperationV128AndNot) error { + return c.compileV128x2BinOp(arm64.BIC, arm64.VectorArrangement16B) +} + +func (c *arm64Compiler) compileV128UniOp(inst asm.Instruction, arr arm64.VectorArrangement) error { + v := c.locationStack.popV128() + if err := c.compileEnsureOnGeneralPurposeRegister(v); err != nil { + return err + } + + c.assembler.CompileVectorRegisterToVectorRegister(inst, v.register, v.register, arr, arm64.VectorIndexNone, arm64.VectorIndexNone) + + c.pushVectorRuntimeValueLocationOnRegister(v.register) + return nil +} + +func (c *arm64Compiler) compileV128x2BinOp(inst asm.Instruction, arr arm64.VectorArrangement) error { + x2 := c.locationStack.popV128() + if err := c.compileEnsureOnGeneralPurposeRegister(x2); err != nil { + return err + } + + x1 := c.locationStack.popV128() + if err := c.compileEnsureOnGeneralPurposeRegister(x1); err != nil { + return err + } + + c.assembler.CompileVectorRegisterToVectorRegister(inst, x2.register, x1.register, arr, arm64.VectorIndexNone, arm64.VectorIndexNone) + + c.markRegisterUnused(x2.register) + c.pushVectorRuntimeValueLocationOnRegister(x1.register) + return nil } // compileV128Shr implements compiler.compileV128Shr for arm64. func (c *arm64Compiler) compileV128Shr(o *wazeroir.OperationV128Shr) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) + var inst asm.Instruction + if o.Signed { + inst = arm64.SSHL + } else { + inst = arm64.USHL + } + return c.compileV128ShiftImpl(o.Shape, inst, true) } // compileV128Shl implements compiler.compileV128Shl for arm64. 
func (c *arm64Compiler) compileV128Shl(o *wazeroir.OperationV128Shl) error { - return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind()) + return c.compileV128ShiftImpl(o.Shape, arm64.SSHL, false) +} + +func (c *arm64Compiler) compileV128ShiftImpl(shape wazeroir.Shape, ins asm.Instruction, rightShift bool) error { + s := c.locationStack.pop() + if s.register == arm64.RegRZR { + // If the shift amount is zero register, nothing to do here. + return nil + } + + var modulo asm.ConstantValue + var arr arm64.VectorArrangement + switch shape { + case wazeroir.ShapeI8x16: + modulo = 0x7 // modulo 8. + arr = arm64.VectorArrangement16B + case wazeroir.ShapeI16x8: + modulo = 0xf // modulo 16. + arr = arm64.VectorArrangement8H + case wazeroir.ShapeI32x4: + modulo = 0x1f // modulo 32. + arr = arm64.VectorArrangement4S + case wazeroir.ShapeI64x2: + modulo = 0x3f // modulo 64. + arr = arm64.VectorArrangement2D + } + + if err := c.compileEnsureOnGeneralPurposeRegister(s); err != nil { + return err + } + + v := c.locationStack.popV128() + if err := c.compileEnsureOnGeneralPurposeRegister(v); err != nil { + return err + } + + tmp, err := c.allocateRegister(registerTypeVector) + if err != nil { + return err + } + + c.assembler.CompileConstToRegister(arm64.ANDIMM32, modulo, s.register) + + if rightShift { + // Negate the amount to make this as right shift. + c.assembler.CompileRegisterToRegister(arm64.NEG, s.register, s.register) + } + + // Copy the shift amount into a vector register as SSHL requires it to be there. + c.assembler.CompileRegisterToVectorRegister(arm64.DUPGEN, s.register, tmp, + arr, arm64.VectorIndexNone) + + c.assembler.CompileVectorRegisterToVectorRegister(ins, tmp, v.register, arr, + arm64.VectorIndexNone, arm64.VectorIndexNone) + + c.markRegisterUnused(s.register) + c.pushVectorRuntimeValueLocationOnRegister(v.register) + return nil } // compileV128Cmp implements compiler.compileV128Cmp for arm64. diff --git a/internal/engine/compiler/impl_vec_arm64_test.go b/internal/engine/compiler/impl_vec_arm64_test.go index 5ce04d5a3e..03d25e37d4 100644 --- a/internal/engine/compiler/impl_vec_arm64_test.go +++ b/internal/engine/compiler/impl_vec_arm64_test.go @@ -67,8 +67,8 @@ func TestArm64Compiler_V128Shuffle_ConstTable_MiddleOfFunction(t *testing.T) { func TestArm64Compiler_V128Shuffle_combinations(t *testing.T) { movValueRegisterToRegister := func(t *testing.T, c *arm64Compiler, src *runtimeValueLocation, dst asm.Register) { - c.assembler.CompileVectorRegisterToVectorRegister(arm64.VMOV, src.register, dst, - arm64.VectorArrangement16B, arm64.VectorIndexNone, arm64.VectorIndexNone) + c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.VORR, src.register, src.register, dst, + arm64.VectorArrangement16B) c.locationStack.markRegisterUnused(src.register) src.setRegister(dst) // We have to set the lower 64-bits' location as well. diff --git a/internal/integration_test/asm/arm64_debug/debug_assembler.go b/internal/integration_test/asm/arm64_debug/debug_assembler.go index 72e411f0d1..24c772e97b 100644 --- a/internal/integration_test/asm/arm64_debug/debug_assembler.go +++ b/internal/integration_test/asm/arm64_debug/debug_assembler.go @@ -309,3 +309,17 @@ func (ta *testAssembler) CompileLoadStaticConstToVectorRegister(instruction asm. 
ta.goasm.CompileLoadStaticConstToVectorRegister(instruction, c, dstReg, arrangement) ta.a.CompileLoadStaticConstToVectorRegister(instruction, c, dstReg, arrangement) } + +// CompileTwoVectorRegistersToVectorRegister implements the same method as documented on arm64.Assembler. +func (ta *testAssembler) CompileTwoVectorRegistersToVectorRegister(instruction asm.Instruction, + srcReg, srcReg2, dstReg asm.Register, arrangement arm64.VectorArrangement) { + ta.goasm.CompileTwoVectorRegistersToVectorRegister(instruction, srcReg, srcReg2, dstReg, arrangement) + ta.a.CompileTwoVectorRegistersToVectorRegister(instruction, srcReg, srcReg2, dstReg, arrangement) +} + +// CompileTwoVectorRegistersToVectorRegisterWithConst implements the same method as documented on arm64.Assembler. +func (ta *testAssembler) CompileTwoVectorRegistersToVectorRegisterWithConst(instruction asm.Instruction, + srcReg, srcReg2, dstReg asm.Register, arrangement arm64.VectorArrangement, c asm.ConstantValue) { + ta.goasm.CompileTwoVectorRegistersToVectorRegisterWithConst(instruction, srcReg, srcReg2, dstReg, arrangement, c) + ta.a.CompileTwoVectorRegistersToVectorRegisterWithConst(instruction, srcReg, srcReg2, dstReg, arrangement, c) +} diff --git a/internal/integration_test/asm/arm64_debug/golang_asm.go b/internal/integration_test/asm/arm64_debug/golang_asm.go index 34d44669de..50e63b2def 100644 --- a/internal/integration_test/asm/arm64_debug/golang_asm.go +++ b/internal/integration_test/asm/arm64_debug/golang_asm.go @@ -18,13 +18,13 @@ func newAssembler(temporaryRegister asm.Register) (*assemblerGoAsmImpl, error) { return &assemblerGoAsmImpl{GolangAsmBaseAssembler: g, temporaryRegister: temporaryRegister}, err } -// assemblerGoAsmImpl implements asm_arm64.Assembler for golang-asm library. +// assemblerGoAsmImpl implements arm64.Assembler for golang-asm library. type assemblerGoAsmImpl struct { *golang_asm.GolangAsmBaseAssembler temporaryRegister asm.Register } -// CompileConstToRegister implements the same method as documented on asm_arm64.Assembler. +// CompileConstToRegister implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileConstToRegister(instruction asm.Instruction, constValue asm.ConstantValue, destinationReg asm.Register) asm.Node { inst := a.NewProg() inst.As = castAsGolangAsmInstruction[instruction] @@ -45,7 +45,7 @@ func (a *assemblerGoAsmImpl) CompileConstToRegister(instruction asm.Instruction, return golang_asm.NewGolangAsmNode(inst) } -// CompileMemoryToRegister implements the same method as documented on asm_arm64.Assembler. +// CompileMemoryToRegister implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileMemoryToRegister(instruction asm.Instruction, sourceBaseReg asm.Register, sourceOffsetConst asm.ConstantValue, destinationReg asm.Register) { inst := a.NewProg() inst.As = castAsGolangAsmInstruction[instruction] @@ -57,7 +57,7 @@ func (a *assemblerGoAsmImpl) CompileMemoryToRegister(instruction asm.Instruction a.AddInstruction(inst) } -// CompileMemoryWithRegisterOffsetToRegister implements the same method as documented on asm_arm64.Assembler. +// CompileMemoryWithRegisterOffsetToRegister implements the same method as documented on arm64.Assembler. 
func (a *assemblerGoAsmImpl) CompileMemoryWithRegisterOffsetToRegister(instruction asm.Instruction, sourceBaseReg, sourceOffsetReg, destinationReg asm.Register) { inst := a.NewProg() inst.As = castAsGolangAsmInstruction[instruction] @@ -70,7 +70,7 @@ func (a *assemblerGoAsmImpl) CompileMemoryWithRegisterOffsetToRegister(instructi a.AddInstruction(inst) } -// CompileRegisterToMemory implements the same method as documented on asm_arm64.Assembler. +// CompileRegisterToMemory implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileRegisterToMemory(instruction asm.Instruction, sourceReg asm.Register, destinationBaseReg asm.Register, destinationOffsetConst asm.ConstantValue) { inst := a.NewProg() inst.As = castAsGolangAsmInstruction[instruction] @@ -95,7 +95,7 @@ func (a *assemblerGoAsmImpl) CompileRegisterToMemoryWithRegisterOffset(instructi a.AddInstruction(inst) } -// CompileRegisterToRegister implements the same method as documented on asm_arm64.Assembler. +// CompileRegisterToRegister implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileRegisterToRegister(instruction asm.Instruction, from, to asm.Register) { inst := a.NewProg() inst.As = castAsGolangAsmInstruction[instruction] @@ -106,7 +106,7 @@ func (a *assemblerGoAsmImpl) CompileRegisterToRegister(instruction asm.Instructi a.AddInstruction(inst) } -// CompileTwoRegistersToRegister implements the same method as documented on asm_arm64.Assembler. +// CompileTwoRegistersToRegister implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileTwoRegistersToRegister(instruction asm.Instruction, src1, src2, destination asm.Register) { inst := a.NewProg() inst.As = castAsGolangAsmInstruction[instruction] @@ -118,7 +118,7 @@ func (a *assemblerGoAsmImpl) CompileTwoRegistersToRegister(instruction asm.Instr a.AddInstruction(inst) } -// CompileThreeRegistersToRegister implements the same method as documented on asm_arm64.Assembler. +// CompileThreeRegistersToRegister implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileThreeRegistersToRegister(instruction asm.Instruction, src1, src2, src3, dst asm.Register) { inst := a.NewProg() inst.As = castAsGolangAsmInstruction[instruction] @@ -131,7 +131,7 @@ func (a *assemblerGoAsmImpl) CompileThreeRegistersToRegister(instruction asm.Ins a.AddInstruction(inst) } -// CompileTwoRegistersToNone implements the same method as documented on asm_arm64.Assembler. +// CompileTwoRegistersToNone implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileTwoRegistersToNone(instruction asm.Instruction, src1, src2 asm.Register) { inst := a.NewProg() inst.As = castAsGolangAsmInstruction[instruction] @@ -144,7 +144,7 @@ func (a *assemblerGoAsmImpl) CompileTwoRegistersToNone(instruction asm.Instructi a.AddInstruction(inst) } -// CompileRegisterAndConstToNone implements the same method as documented on asm_arm64.Assembler. +// CompileRegisterAndConstToNone implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileRegisterAndConstToNone(instruction asm.Instruction, src asm.Register, srcConst asm.ConstantValue) { inst := a.NewProg() inst.As = castAsGolangAsmInstruction[instruction] @@ -192,7 +192,7 @@ func (a *assemblerGoAsmImpl) CompileStandAlone(instruction asm.Instruction) asm. 
return golang_asm.NewGolangAsmNode(prog) } -// CompileLeftShiftedRegisterToRegister implements the same method as documented on asm_arm64.Assembler. +// CompileLeftShiftedRegisterToRegister implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileLeftShiftedRegisterToRegister(instruction asm.Instruction, shiftedSourceReg asm.Register, shiftNum asm.ConstantValue, srcReg, destinationReg asm.Register) { inst := a.NewProg() inst.As = castAsGolangAsmInstruction[instruction] @@ -205,7 +205,7 @@ func (a *assemblerGoAsmImpl) CompileLeftShiftedRegisterToRegister(instruction as a.AddInstruction(inst) } -// CompileReadInstructionAddress implements the same method as documented on asm_arm64.Assembler. +// CompileReadInstructionAddress implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileReadInstructionAddress(destinationReg asm.Register, beforeAcquisitionTargetInstruction asm.Instruction) { // Emit ADR instruction to read the specified instruction's absolute address. // Note: we cannot emit the "ADR REG, $(target's offset from here)" due to the @@ -262,7 +262,7 @@ func (a *assemblerGoAsmImpl) CompileReadInstructionAddress(destinationReg asm.Re }) } -// CompileConditionalRegisterSet implements the same method as documented on asm_arm64.Assembler. +// CompileConditionalRegisterSet implements the same method as documented on arm64.Assembler. // // We use CSET instruction to set 1 on the register if the condition satisfies: // https://developer.arm.com/documentation/100076/0100/a64-instruction-set-reference/a64-general-instructions/cset @@ -282,7 +282,7 @@ func simdRegisterForScalarFloatRegister(freg int16) int16 { return freg + (arm64.REG_F31 - arm64.REG_F0) + 1 } -// CompileTwoSIMDBytesToSIMDByteRegister implements the same method as documented on asm_arm64.Assembler. +// CompileTwoSIMDBytesToSIMDByteRegister implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileTwoSIMDBytesToSIMDByteRegister(instruction asm.Instruction, srcReg1, srcReg2, dstReg asm.Register) { src1FloatReg, src2FloatReg, dstFloatReg := castAsGolangAsmRegister[srcReg1], castAsGolangAsmRegister[srcReg2], castAsGolangAsmRegister[dstReg] src1VReg, src2VReg, dstVReg := simdRegisterForScalarFloatRegister(src1FloatReg), simdRegisterForScalarFloatRegister(src2FloatReg), simdRegisterForScalarFloatRegister(dstFloatReg) @@ -300,7 +300,7 @@ func (a *assemblerGoAsmImpl) CompileTwoSIMDBytesToSIMDByteRegister(instruction a } -// CompileSIMDByteToSIMDByte implements the same method as documented on asm_arm64.Assembler. +// CompileSIMDByteToSIMDByte implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileSIMDByteToSIMDByte(instruction asm.Instruction, srcReg, dstReg asm.Register) { srcFloatReg, dstFloatReg := castAsGolangAsmRegister[srcReg], castAsGolangAsmRegister[dstReg] srcVReg, dstVReg := simdRegisterForScalarFloatRegister(srcFloatReg), simdRegisterForScalarFloatRegister(dstFloatReg) @@ -316,7 +316,7 @@ func (a *assemblerGoAsmImpl) CompileSIMDByteToSIMDByte(instruction asm.Instructi a.AddInstruction(inst) } -// CompileSIMDByteToRegister implements the same method as documented on asm_arm64.Assembler. +// CompileSIMDByteToRegister implements the same method as documented on arm64.Assembler. 
func (a *assemblerGoAsmImpl) CompileSIMDByteToRegister(instruction asm.Instruction, srcReg, dstReg asm.Register) { srcFloatReg, dstFlaotReg := castAsGolangAsmRegister[srcReg], castAsGolangAsmRegister[dstReg] srcVReg, dstVReg := simdRegisterForScalarFloatRegister(srcFloatReg), simdRegisterForScalarFloatRegister(dstFlaotReg) @@ -332,30 +332,42 @@ func (a *assemblerGoAsmImpl) CompileSIMDByteToRegister(instruction asm.Instructi a.AddInstruction(inst) } -// CompileMemoryToVectorRegister implements the same method as documented on asm_arm64.Assembler. +// CompileMemoryToVectorRegister implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileMemoryToVectorRegister( _ asm.Instruction, _ asm.Register, _ asm.ConstantValue, _ asm.Register, _ asm_arm64.VectorArrangement, ) { panic("CompileMemoryToVectorRegister is unsupported with golang-asm") } -// CompileVectorRegisterToMemory implements the same method as documented on asm_arm64.Assembler. +// CompileVectorRegisterToMemory implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileVectorRegisterToMemory(_ asm.Instruction, _, _ asm.Register, _ asm.ConstantValue, _ asm_arm64.VectorArrangement) { panic("CompileVectorRegisterToMemory is unsupported with golang-asm") } -// CompileMemoryWithRegisterOffsetToVectorRegister implements the same method as documented on asm_arm64.Assembler. +// CompileMemoryWithRegisterOffsetToVectorRegister implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileMemoryWithRegisterOffsetToVectorRegister(_ asm.Instruction, _, _ asm.Register, _ asm.Register, _ asm_arm64.VectorArrangement) { panic("CompileMemoryWithRegisterOffsetToVectorRegister is unsupported with golang-asm") } -// CompileVectorRegisterToMemoryWithRegisterOffset implements the same method as documented on asm_arm64.Assembler. +// CompileVectorRegisterToMemoryWithRegisterOffset implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileVectorRegisterToMemoryWithRegisterOffset(_ asm.Instruction, _, _, _ asm.Register, _ asm_arm64.VectorArrangement) { panic("CompileVectorRegisterToMemoryWithRegisterOffset is unsupported with golang-asm") } -// CompileRegisterToVectorRegister implements the same method as documented on asm_arm64.Assembler. +// CompileTwoVectorRegistersToVectorRegister implements the same method as documented on arm64.Assembler. +func (a *assemblerGoAsmImpl) CompileTwoVectorRegistersToVectorRegister(instruction asm.Instruction, + srcReg, srcReg2, dstReg asm.Register, arrangement asm_arm64.VectorArrangement) { + panic("CompileTwoVectorRegistersToVectorRegister is unsupported with golang-asm") +} + +// CompileTwoVectorRegistersToVectorRegisterWithConst implements the same method as documented on arm64.Assembler. +func (a *assemblerGoAsmImpl) CompileTwoVectorRegistersToVectorRegisterWithConst(instruction asm.Instruction, + srcReg, srcReg2, dstReg asm.Register, arrangement asm_arm64.VectorArrangement, value asm.ConstantValue) { + panic("CompileTwoVectorRegistersToVectorRegisterWithConst is unsupported with golang-asm") +} + +// CompileRegisterToVectorRegister implements the same method as documented on arm64.Assembler. 
func (a *assemblerGoAsmImpl) CompileRegisterToVectorRegister(instruction asm.Instruction, srcReg, dstReg asm.Register, arrangement asm_arm64.VectorArrangement, index asm_arm64.VectorIndex) { inst := a.NewProg() @@ -369,7 +381,7 @@ func (a *assemblerGoAsmImpl) CompileRegisterToVectorRegister(instruction asm.Ins a.AddInstruction(inst) } -// CompileVectorRegisterToVectorRegister implements the same method as documented on asm_arm64.Assembler. +// CompileVectorRegisterToVectorRegister implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileVectorRegisterToVectorRegister(instruction asm.Instruction, srcReg, dstReg asm.Register, arrangement asm_arm64.VectorArrangement, srcIndex, dstIndex asm_arm64.VectorIndex) { inst := a.NewProg() inst.As = castAsGolangAsmInstruction[instruction] @@ -395,11 +407,11 @@ func (a *assemblerGoAsmImpl) CompileVectorRegisterToVectorRegister(instruction a } } -// CompileVectorRegisterToVectorRegisterWithConst implements the same method as documented on asm_arm64.Assembler. +// CompileVectorRegisterToVectorRegisterWithConst implements the same method as documented on arm64.Assembler. func (a *assemblerGoAsmImpl) CompileVectorRegisterToVectorRegisterWithConst(instruction asm.Instruction, srcReg, dstReg asm.Register, arrangement asm_arm64.VectorArrangement, c asm.ConstantValue) { switch instruction { - case asm_arm64.USHLL: + case asm_arm64.USHLLIMM: var dstArrangement asm_arm64.VectorArrangement if arrangement == asm_arm64.VectorArrangement8B { dstArrangement = asm_arm64.VectorArrangement8H @@ -421,7 +433,7 @@ func (a *assemblerGoAsmImpl) CompileVectorRegisterToVectorRegisterWithConst(inst } } -// CompileVectorRegisterToRegister implements the same method as documented on asm_arm64.Assembler. +// CompileVectorRegisterToRegister implements the same method as documented on arm64.Assembler. 
func (a *assemblerGoAsmImpl) CompileVectorRegisterToRegister(instruction asm.Instruction, srcReg, dstReg asm.Register, arrangement asm_arm64.VectorArrangement, index asm_arm64.VectorIndex) { inst := a.NewProg() @@ -594,19 +606,19 @@ var castAsGolangAsmInstruction = [...]obj.As{ asm_arm64.ASR: arm64.AASR, asm_arm64.ASRW: arm64.AASRW, asm_arm64.B: arm64.AB, - asm_arm64.BEQ: arm64.ABEQ, - asm_arm64.BGE: arm64.ABGE, - asm_arm64.BGT: arm64.ABGT, - asm_arm64.BHI: arm64.ABHI, - asm_arm64.BHS: arm64.ABHS, - asm_arm64.BLE: arm64.ABLE, - asm_arm64.BLO: arm64.ABLO, - asm_arm64.BLS: arm64.ABLS, - asm_arm64.BLT: arm64.ABLT, - asm_arm64.BMI: arm64.ABMI, - asm_arm64.BPL: arm64.ABPL, - asm_arm64.BNE: arm64.ABNE, - asm_arm64.BVS: arm64.ABVS, + asm_arm64.BCONDEQ: arm64.ABEQ, + asm_arm64.BCONDGE: arm64.ABGE, + asm_arm64.BCONDGT: arm64.ABGT, + asm_arm64.BCONDHI: arm64.ABHI, + asm_arm64.BCONDHS: arm64.ABHS, + asm_arm64.BCONDLE: arm64.ABLE, + asm_arm64.BCONDLO: arm64.ABLO, + asm_arm64.BCONDLS: arm64.ABLS, + asm_arm64.BCONDLT: arm64.ABLT, + asm_arm64.BCONDMI: arm64.ABMI, + asm_arm64.BCONDPL: arm64.ABPL, + asm_arm64.BCONDNE: arm64.ABNE, + asm_arm64.BCONDVS: arm64.ABVS, asm_arm64.CLZ: arm64.ACLZ, asm_arm64.CLZW: arm64.ACLZW, asm_arm64.CMP: arm64.ACMP, @@ -705,5 +717,5 @@ var castAsGolangAsmInstruction = [...]obj.As{ asm_arm64.VMOV: arm64.AVMOV, asm_arm64.VADD: arm64.AVADD, asm_arm64.VSUB: arm64.AVSUB, - asm_arm64.USHLL: arm64.AVUSHLL, + asm_arm64.USHLLIMM: arm64.AVUSHLL, } diff --git a/internal/integration_test/asm/arm64_debug/impl_test.go b/internal/integration_test/asm/arm64_debug/impl_test.go index 4a254efeb0..bae42276b0 100644 --- a/internal/integration_test/asm/arm64_debug/impl_test.go +++ b/internal/integration_test/asm/arm64_debug/impl_test.go @@ -1,7 +1,6 @@ package arm64debug import ( - "encoding/hex" "fmt" "math" "testing" @@ -1138,9 +1137,9 @@ func TestAssemblerImpl_EncodeRelativeJump(t *testing.T) { }) for _, inst := range []asm.Instruction{ - arm64.B, arm64.BEQ, arm64.BGE, arm64.BGT, arm64.BHI, arm64.BHS, - arm64.BLE, arm64.BLO, arm64.BLS, arm64.BLT, arm64.BMI, arm64.BNE, arm64.BVS, - arm64.BPL, + arm64.B, arm64.BCONDEQ, arm64.BCONDGE, arm64.BCONDGT, arm64.BCONDHI, arm64.BCONDHS, + arm64.BCONDLE, arm64.BCONDLO, arm64.BCONDLS, arm64.BCONDLT, arm64.BCONDMI, arm64.BCONDNE, arm64.BCONDVS, + arm64.BCONDPL, } { inst := inst t.Run(arm64.InstructionName(inst), func(t *testing.T) { @@ -1276,374 +1275,6 @@ func TestAssemblerImpl_EncodeTwoSIMDBytesToSIMDByteRegister(t *testing.T) { } } -func TestAssemblerImpl_EncodeVectorRegisterToVectorRegister(t *testing.T) { - t.Run("error", func(t *testing.T) { - tests := []struct { - n *arm64.NodeImpl - expErr string - }{ - { - n: &arm64.NodeImpl{Instruction: arm64.B, - SrcReg: arm64.RegV21, - DstReg: arm64.RegV21, - Types: arm64.OperandTypesVectorRegisterToVectorRegister, - SrcVectorIndex: arm64.VectorIndexNone, - DstVectorIndex: arm64.VectorIndexNone, - }, - expErr: "B is unsupported for from:vector-register,to:vector-register type", - }, - { - n: &arm64.NodeImpl{Instruction: arm64.VMOV, - SrcReg: arm64.RegV21, - DstReg: arm64.RegV21, - Types: arm64.OperandTypesVectorRegisterToVectorRegister, - SrcVectorIndex: arm64.VectorIndexNone, - DstVectorIndex: arm64.VectorIndexNone, - }, - expErr: "unsupported arrangement for VMOV: none", - }, - { - n: &arm64.NodeImpl{Instruction: arm64.VADD, - SrcReg: arm64.RegV21, - DstReg: arm64.RegV21, - Types: arm64.OperandTypesVectorRegisterToVectorRegister, - SrcVectorIndex: arm64.VectorIndexNone, - DstVectorIndex: arm64.VectorIndexNone, - 
}, - expErr: "unsupported arrangement for VADD: none", - }, - { - n: &arm64.NodeImpl{Instruction: arm64.VADD, - SrcReg: arm64.RegV21, - DstReg: arm64.RegV21, - Types: arm64.OperandTypesVectorRegisterToVectorRegister, - VectorArrangement: arm64.VectorArrangement1D, - SrcVectorIndex: arm64.VectorIndexNone, - DstVectorIndex: arm64.VectorIndexNone, - }, - expErr: "unsupported arrangement for VADD: 1D", - }, - } - - for _, tt := range tests { - tc := tt - t.Run(tc.expErr, func(t *testing.T) { - a := arm64.NewAssemblerImpl(asm.NilRegister) - err := a.EncodeVectorRegisterToVectorRegister(tc.n) - require.EqualError(t, err, tc.expErr) - }) - } - }) - - vectorRegs := []asm.Register{arm64.RegV10, arm64.RegV2, arm64.RegV30} - tests := []struct { - name string - inst asm.Instruction - arr arm64.VectorArrangement - needConst bool - c asm.ConstantValue - srcIndex, dstIndex arm64.VectorIndex - }{ - {inst: arm64.VMOV, arr: arm64.VectorArrangement16B, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone}, - {inst: arm64.VADD, arr: arm64.VectorArrangement2D, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone}, - {inst: arm64.VADD, arr: arm64.VectorArrangement4S, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone}, - {inst: arm64.VADD, arr: arm64.VectorArrangement8H, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone}, - {inst: arm64.VADD, arr: arm64.VectorArrangement16B, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone}, - { - name: "VSUB 2d", - inst: arm64.VSUB, arr: arm64.VectorArrangement2D, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone, - }, - { - name: "VSUB 4s", - inst: arm64.VSUB, arr: arm64.VectorArrangement4S, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone, - }, - { - name: "VSUB 8h", - inst: arm64.VSUB, arr: arm64.VectorArrangement8H, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone, - }, - { - name: "VSUB 16b", - inst: arm64.VSUB, arr: arm64.VectorArrangement16B, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone, - }, - {inst: arm64.USHLL, arr: arm64.VectorArrangement8B, needConst: true, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone}, - {inst: arm64.USHLL, arr: arm64.VectorArrangement4H, needConst: true, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone}, - {inst: arm64.USHLL, arr: arm64.VectorArrangement2S, needConst: true, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone}, - {inst: arm64.USHLL, arr: arm64.VectorArrangement8B, needConst: true, c: 7, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone}, - {inst: arm64.USHLL, arr: arm64.VectorArrangement4H, needConst: true, c: 15, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone}, - {inst: arm64.USHLL, arr: arm64.VectorArrangement2S, needConst: true, c: 31, srcIndex: arm64.VectorIndexNone, dstIndex: arm64.VectorIndexNone}, - } - - for _, tt := range tests { - tc := tt - t.Run(tc.name, func(t *testing.T) { - for _, src := range vectorRegs { - for _, dst := range vectorRegs { - src, dst := src, dst - t.Run(fmt.Sprintf("src=%s.%s,dst=%s.%s", - arm64.RegisterName(src), tc.arr, arm64.RegisterName(dst), tc.arr), func(t *testing.T) { - goasm := newGoasmAssembler(t, asm.NilRegister) - a := arm64.NewAssemblerImpl(asm.NilRegister) - - for _, assembler := range []arm64.Assembler{goasm, a} { - if tc.needConst { - assembler.CompileVectorRegisterToVectorRegisterWithConst(tc.inst, src, dst, tc.arr, tc.c) - } else { - 
assembler.CompileVectorRegisterToVectorRegister(tc.inst, src, dst, tc.arr, tc.srcIndex, tc.dstIndex) - } - } - - expected, err := goasm.Assemble() - require.NoError(t, err) - - actual, err := a.Assemble() - require.NoError(t, err) - require.Equal(t, expected, actual, hex.EncodeToString(expected)) - }) - } - } - }) - } -} - -func TestAssemblerImpl_EncodeRegisterToVectorRegister(t *testing.T) { - t.Run("error", func(t *testing.T) { - tests := []struct { - n *arm64.NodeImpl - exp string - }{ - { - n: &arm64.NodeImpl{ - Instruction: arm64.B, Types: arm64.OperandTypesRegisterToVectorRegister, - SrcReg: arm64.RegR0, - DstReg: arm64.RegV3, - }, - exp: "B is unsupported for from:register,to:vector-register type", - }, - { - n: &arm64.NodeImpl{Instruction: arm64.VMOV, - SrcReg: arm64.RegR0, - DstReg: arm64.RegV3, - Types: arm64.OperandTypesRegisterToVectorRegister, - DstVectorIndex: 100, VectorArrangement: arm64.VectorArrangement1D, - }, - exp: "invalid arrangement and index pair: 1D[100]", - }, - { - n: &arm64.NodeImpl{Instruction: arm64.VMOV, - Types: arm64.OperandTypesRegisterToVectorRegister, - SrcReg: arm64.RegR0, - DstReg: arm64.RegV3, - DstVectorIndex: 0, VectorArrangement: arm64.VectorArrangement1D, - }, - exp: "unsupported arrangement for VMOV: 1D", - }, - } - - for _, tt := range tests { - tc := tt - t.Run(tc.exp, func(t *testing.T) { - a := arm64.NewAssemblerImpl(asm.NilRegister) - err := a.EncodeRegisterToVectorRegister(tc.n) - require.EqualError(t, err, tc.exp) - }) - } - }) - - regs := []asm.Register{arm64.RegR0, arm64.RegR10, arm64.RegR30} - vectorRegs := []asm.Register{arm64.RegV0, arm64.RegV10, arm64.RegV30} - - tests := []struct { - inst asm.Instruction - arrangement arm64.VectorArrangement - index arm64.VectorIndex - }{ - { - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementD, - index: 0, - }, - { - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementD, - index: 1, - }, - { - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementB, - index: 0, - }, - { - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementB, - index: 5, - }, - { - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementH, - index: 1, - }, - { - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementH, - index: 4, - }, - } - - for _, tt := range tests { - tc := tt - t.Run(arm64.InstructionName(tc.inst), func(t *testing.T) { - for _, r := range regs { - for _, vr := range vectorRegs { - r, vr := r, vr - t.Run(fmt.Sprintf("src=%s,dst=%s.%s[%d]", - arm64.RegisterName(r), arm64.RegisterName(vr), tc.arrangement, tc.index), func(t *testing.T) { - goasm := newGoasmAssembler(t, asm.NilRegister) - a := arm64.NewAssemblerImpl(asm.NilRegister) - - for _, assembler := range []arm64.Assembler{goasm, a} { - assembler.CompileRegisterToVectorRegister(tc.inst, r, vr, tc.arrangement, tc.index) - } - - expected, err := goasm.Assemble() - require.NoError(t, err) - - actual, err := a.Assemble() - require.NoError(t, err) - require.Equal(t, expected, actual) - }) - } - } - }) - } -} - -func TestAssemblerImpl_EncodeVectorRegisterToRegister(t *testing.T) { - t.Run("error", func(t *testing.T) { - tests := []struct { - n *arm64.NodeImpl - expErr string - }{ - { - n: &arm64.NodeImpl{Instruction: arm64.B, Types: arm64.OperandTypesVectorRegisterToRegister, - SrcReg: arm64.RegV0, - DstReg: arm64.RegR3, - }, - expErr: "B is unsupported for from:vector-register,to:register type", - }, - { - n: &arm64.NodeImpl{Instruction: arm64.VMOV, - Types: arm64.OperandTypesVectorRegisterToRegister, - SrcReg: arm64.RegV0, - DstReg: 
arm64.RegR3, - SrcVectorIndex: 100, VectorArrangement: arm64.VectorArrangement1D, - }, - expErr: "invalid arrangement and index pair: 1D[100]", - }, - { - n: &arm64.NodeImpl{Instruction: arm64.VMOV, - Types: arm64.OperandTypesVectorRegisterToRegister, - SrcReg: arm64.RegV0, - DstReg: arm64.RegR3, - SrcVectorIndex: 0, VectorArrangement: arm64.VectorArrangement1D, - }, - expErr: "unsupported arrangement for VMOV: 1D", - }, - } - - for _, tt := range tests { - tc := tt - a := arm64.NewAssemblerImpl(asm.NilRegister) - err := a.EncodeVectorRegisterToRegister(tc.n) - require.EqualError(t, err, tc.expErr) - } - }) - - regs := []asm.Register{arm64.RegR0, arm64.RegR10, arm64.RegR30} - vectorRegs := []asm.Register{arm64.RegV0, arm64.RegV10, arm64.RegV30} - - tests := []struct { - name string - inst asm.Instruction - arrangement arm64.VectorArrangement - index arm64.VectorIndex - }{ - { - name: "VMOV D[0]", - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementD, - index: 0, - }, - { - name: "VMOV D[1]", - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementD, - index: 1, - }, - { - name: "VMOV B[0]", - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementB, - index: 0, - }, - { - name: "VMOV B[15]", - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementB, - index: 15, - }, - { - name: "VMOV H[1]", - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementH, - index: 1, - }, - { - name: "VMOV H[4]", - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementH, - index: 7, - }, - { - name: "VMOV S[2]", - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementS, - index: 2, - }, - { - name: "VMOV S[3]", - inst: arm64.VMOV, - arrangement: arm64.VectorArrangementS, - index: 3, - }, - } - - for _, tt := range tests { - tc := tt - t.Run(tc.name, func(t *testing.T) { - for _, r := range regs { - for _, vr := range vectorRegs { - r, vr := r, vr - t.Run(fmt.Sprintf("dst=%s,src=%s.%s[%d]", - arm64.RegisterName(r), arm64.RegisterName(vr), tc.arrangement, tc.index), func(t *testing.T) { - goasm := newGoasmAssembler(t, asm.NilRegister) - a := arm64.NewAssemblerImpl(asm.NilRegister) - - for _, assembler := range []arm64.Assembler{goasm, a} { - assembler.CompileVectorRegisterToRegister(tc.inst, vr, r, tc.arrangement, tc.index) - } - - expected, err := goasm.Assemble() - require.NoError(t, err) - actual, err := a.Assemble() - require.NoError(t, err) - require.Equal(t, expected, actual) - }) - } - } - }) - } -} - func conditionalRegisterToState(r asm.Register) asm.ConditionalRegisterState { switch r { case arm64.RegCondEQ: diff --git a/internal/integration_test/spectest/v2/spec_test.go b/internal/integration_test/spectest/v2/spec_test.go index 76838f372c..f928f4b015 100644 --- a/internal/integration_test/spectest/v2/spec_test.go +++ b/internal/integration_test/spectest/v2/spec_test.go @@ -26,8 +26,7 @@ func TestCompiler(t *testing.T) { spectest.Run(t, testcases, compiler.NewEngine, enabledFeatures, func(jsonname string) bool { switch path.Base(jsonname) { - case "simd_bitwise.json", "simd_boolean.json", "simd_bit_shift.json", - "simd_i8x16_cmp.json", "simd_i16x8_cmp.json", "simd_i32x4_cmp.json", "simd_i64x2_cmp.json", + case "simd_i8x16_cmp.json", "simd_i16x8_cmp.json", "simd_i32x4_cmp.json", "simd_i64x2_cmp.json", "simd_f32x4_cmp.json", "simd_f64x2_cmp.json", "simd_f32x4_arith.json", "simd_f64x2_arith.json", "simd_i16x8_arith.json", "simd_i64x2_arith.json", "simd_i32x4_arith.json", "simd_i8x16_arith.json", "simd_i16x8_sat_arith.json", "simd_i8x16_sat_arith.json",