diff --git a/internal/asm/amd64/consts.go b/internal/asm/amd64/consts.go index ec71cb86b1a..b7083e4eb44 100644 --- a/internal/asm/amd64/consts.go +++ b/internal/asm/amd64/consts.go @@ -323,16 +323,16 @@ const ( PADDB // PADDW is the PADDW instruction. https://www.felixcloutier.com/x86/paddb:paddw:paddd:paddq PADDW - // PADDL is the PADDD instruction. https://www.felixcloutier.com/x86/paddb:paddw:paddd:paddq - PADDL + // PADDD is the PADDD instruction. https://www.felixcloutier.com/x86/paddb:paddw:paddd:paddq + PADDD // PADDQ is the PADDQ instruction. https://www.felixcloutier.com/x86/paddb:paddw:paddd:paddq PADDQ // PSUBB is the PSUBB instruction. https://www.felixcloutier.com/x86/psubb:psubw:psubd PSUBB // PSUBW is the PSUBW instruction. https://www.felixcloutier.com/x86/psubb:psubw:psubd PSUBW - // PSUBL is the PSUBD instruction. https://www.felixcloutier.com/x86/psubb:psubw:psubd - PSUBL + // PSUBD is the PSUBD instruction. https://www.felixcloutier.com/x86/psubb:psubw:psubd + PSUBD // PSUBQ is the PSUBQ instruction. https://www.felixcloutier.com/x86/psubq PSUBQ // ADDPS is the ADDPS instruction. https://www.felixcloutier.com/x86/addps @@ -449,6 +449,112 @@ const ( PMAXUW // PMAXUB is the PMAXUB instruction https://www.felixcloutier.com/x86/pmaxub:pmaxuw PMAXUB + // PMULLW is the PMULLW instruction https://www.felixcloutier.com/x86/pmullw + PMULLW + // PMULLD is the PMULLD instruction https://www.felixcloutier.com/x86/pmulld:pmullq + PMULLD + // PMULUDQ is the PMULUDQ instruction https://www.felixcloutier.com/x86/pmuludq + PMULUDQ + // PSUBSB is the PSUBSB instruction https://www.felixcloutier.com/x86/psubsb:psubsw + PSUBSB + // PSUBSW is the PSUBSW instruction https://www.felixcloutier.com/x86/psubsb:psubsw + PSUBSW + // PSUBUSB is the PSUBUSB instruction https://www.felixcloutier.com/x86/psubusb:psubusw + PSUBUSB + // PSUBUSW is the PSUBUSW instruction https://www.felixcloutier.com/x86/psubusb:psubusw + PSUBUSW + // PADDSW is the PADDSW instruction https://www.felixcloutier.com/x86/paddsb:paddsw + PADDSW + // PADDSB is the PADDSB instruction https://www.felixcloutier.com/x86/paddsb:paddsw + PADDSB + // PADDUSW is the PADDUSW instruction https://www.felixcloutier.com/x86/paddusb:paddusw + PADDUSW + // PAVGB is the PAVGB instruction https://www.felixcloutier.com/x86/pavgb:pavgw + PAVGB + // PAVGW is the PAVGW instruction https://www.felixcloutier.com/x86/pavgb:pavgw + PAVGW + // PABSB is the PABSB instruction https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq + PABSB + // PABSW is the PABSW instruction https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq + PABSW + // PABSD is the PABSD instruction https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq + PABSD + // BLENDVPD is the BLENDVPD instruction https://www.felixcloutier.com/x86/blendvpd + BLENDVPD + // MAXPD is the MAXPD instruction https://www.felixcloutier.com/x86/maxpd + MAXPD + // MAXPS is the MAXPS instruction https://www.felixcloutier.com/x86/maxps + MAXPS + // MINPD is the MINPD instruction https://www.felixcloutier.com/x86/minpd + MINPD + // MINPS is the MINPS instruction https://www.felixcloutier.com/x86/minps + MINPS + // ANDNPD is the ANDNPD instruction https://www.felixcloutier.com/x86/andnpd + ANDNPD + // ANDNPS is the ANDNPS instruction https://www.felixcloutier.com/x86/andnps + ANDNPS + // MULPS is the MULPS instruction https://www.felixcloutier.com/x86/mulps + MULPS + // MULPD is the MULPD instruction https://www.felixcloutier.com/x86/mulpd + MULPD + // DIVPS is the DIVPS instruction 
https://www.felixcloutier.com/x86/divps + DIVPS + // DIVPD is the DIVPD instruction https://www.felixcloutier.com/x86/divpd + DIVPD + // SQRTPS is the SQRTPS instruction https://www.felixcloutier.com/x86/sqrtps + SQRTPS + // SQRTPD is the SQRTPD instruction https://www.felixcloutier.com/x86/sqrtpd + SQRTPD + // ROUNDPS is the ROUNDPS instruction https://www.felixcloutier.com/x86/roundps + ROUNDPS + // ROUNDPD is the ROUNDPD instruction https://www.felixcloutier.com/x86/roundpd + ROUNDPD + // PALIGNR is the PALIGNR instruction https://www.felixcloutier.com/x86/palignr + PALIGNR + // PUNPCKLWD is the PUNPCKLWD instruction https://www.felixcloutier.com/x86/punpcklbw:punpcklwd:punpckldq:punpcklqdq + PUNPCKLWD + // PUNPCKHWD is the PUNPCKHWD instruction https://www.felixcloutier.com/x86/punpckhbw:punpckhwd:punpckhdq:punpckhqdq + PUNPCKHWD + // PMULHUW is the PMULHUW instruction https://www.felixcloutier.com/x86/pmulhuw + PMULHUW + // PMULDQ is the PMULDQ instruction https://www.felixcloutier.com/x86/pmuldq + PMULDQ + // PMULHRSW is the PMULHRSW instruction https://www.felixcloutier.com/x86/pmulhrsw + PMULHRSW + // PMULHW is the PMULHW instruction https://www.felixcloutier.com/x86/pmulhw + PMULHW + // CMPEQPS is the CMPEQPS instruction https://www.felixcloutier.com/x86/cmpps + CMPEQPS + // CMPEQPD is the CMPEQPD instruction https://www.felixcloutier.com/x86/cmppd + CMPEQPD + // CVTTPS2DQ is the CVTTPS2DQ instruction https://www.felixcloutier.com/x86/cvttps2dq + CVTTPS2DQ + // CVTDQ2PS is the CVTDQ2PS instruction https://www.felixcloutier.com/x86/cvtdq2ps + CVTDQ2PS + // MOVUPD is the MOVUPD instruction https://www.felixcloutier.com/x86/movupd + MOVUPD + // SHUFPS is the SHUFPS instruction https://www.felixcloutier.com/x86/shufps + SHUFPS + // PMADDWD is the PMADDWD instruction https://www.felixcloutier.com/x86/pmaddwd + PMADDWD + // CVTDQ2PD is the CVTDQ2PD instruction https://www.felixcloutier.com/x86/cvtdq2pd + CVTDQ2PD + // UNPCKLPS is the UNPCKLPS instruction https://www.felixcloutier.com/x86/unpcklps + UNPCKLPS + // PACKUSWB is the PACKUSWB instruction https://www.felixcloutier.com/x86/packuswb + PACKUSWB + // PACKSSDW is the PACKSSDW instruction https://www.felixcloutier.com/x86/packsswb:packssdw + PACKSSDW + // PACKUSDW is the PACKUSDW instruction https://www.felixcloutier.com/x86/packusdw + PACKUSDW + // CVTPS2PD is the CVTPS2PD instruction https://www.felixcloutier.com/x86/cvtps2pd + CVTPS2PD + // CVTPD2PS is the CVTPD2PS instruction https://www.felixcloutier.com/x86/cvtpd2ps + CVTPD2PS + // PMADDUBSW is the PMADDUBSW instruction https://www.felixcloutier.com/x86/pmaddubsw + PMADDUBSW + // CVTTPD2DQ is the CVTTPD2DQ instruction https://www.felixcloutier.com/x86/cvttpd2dq + CVTTPD2DQ // instructionEnd is always placed at the bottom of this iota definition to be used in the test. 
 	instructionEnd
@@ -731,8 +837,8 @@ func InstructionName(instruction asm.Instruction) string {
 		return "PADDB"
 	case PADDW:
 		return "PADDW"
-	case PADDL:
-		return "PADDL"
+	case PADDD:
+		return "PADDD"
 	case PADDQ:
 		return "PADDQ"
 	case ADDPS:
@@ -743,7 +849,7 @@ func InstructionName(instruction asm.Instruction) string {
 		return "PSUBB"
 	case PSUBW:
 		return "PSUBW"
-	case PSUBL:
-		return "PSUBL"
+	case PSUBD:
+		return "PSUBD"
 	case PSUBQ:
 		return "PSUBQ"
@@ -863,6 +969,112 @@ func InstructionName(instruction asm.Instruction) string {
 		return "PMAXSW"
 	case PMAXSB:
 		return "PMAXSB"
+	case PMULLW:
+		return "PMULLW"
+	case PMULLD:
+		return "PMULLD"
+	case PMULUDQ:
+		return "PMULUDQ"
+	case PSUBSB:
+		return "PSUBSB"
+	case PSUBUSB:
+		return "PSUBUSB"
+	case PADDSW:
+		return "PADDSW"
+	case PADDSB:
+		return "PADDSB"
+	case PADDUSW:
+		return "PADDUSW"
+	case PSUBSW:
+		return "PSUBSW"
+	case PSUBUSW:
+		return "PSUBUSW"
+	case PAVGB:
+		return "PAVGB"
+	case PAVGW:
+		return "PAVGW"
+	case PABSB:
+		return "PABSB"
+	case PABSW:
+		return "PABSW"
+	case PABSD:
+		return "PABSD"
+	case BLENDVPD:
+		return "BLENDVPD"
+	case MAXPD:
+		return "MAXPD"
+	case MAXPS:
+		return "MAXPS"
+	case MINPD:
+		return "MINPD"
+	case MINPS:
+		return "MINPS"
+	case ANDNPD:
+		return "ANDNPD"
+	case ANDNPS:
+		return "ANDNPS"
+	case MULPS:
+		return "MULPS"
+	case MULPD:
+		return "MULPD"
+	case DIVPS:
+		return "DIVPS"
+	case DIVPD:
+		return "DIVPD"
+	case SQRTPS:
+		return "SQRTPS"
+	case SQRTPD:
+		return "SQRTPD"
+	case ROUNDPS:
+		return "ROUNDPS"
+	case ROUNDPD:
+		return "ROUNDPD"
+	case PALIGNR:
+		return "PALIGNR"
+	case PUNPCKLWD:
+		return "PUNPCKLWD"
+	case PUNPCKHWD:
+		return "PUNPCKHWD"
+	case PMULHUW:
+		return "PMULHUW"
+	case PMULDQ:
+		return "PMULDQ"
+	case PMULHRSW:
+		return "PMULHRSW"
+	case PMULHW:
+		return "PMULHW"
+	case CMPEQPS:
+		return "CMPEQPS"
+	case CMPEQPD:
+		return "CMPEQPD"
+	case CVTTPS2DQ:
+		return "CVTTPS2DQ"
+	case CVTDQ2PS:
+		return "CVTDQ2PS"
+	case MOVUPD:
+		return "MOVUPD"
+	case SHUFPS:
+		return "SHUFPS"
+	case PMADDWD:
+		return "PMADDWD"
+	case CVTDQ2PD:
+		return "CVTDQ2PD"
+	case UNPCKLPS:
+		return "UNPCKLPS"
+	case PACKUSWB:
+		return "PACKUSWB"
+	case PACKSSDW:
+		return "PACKSSDW"
+	case PACKUSDW:
+		return "PACKUSDW"
+	case CVTPS2PD:
+		return "CVTPS2PD"
+	case CVTPD2PS:
+		return "CVTPD2PS"
+	case PMADDUBSW:
+		return "PMADDUBSW"
+	case CVTTPD2DQ:
+		return "CVTTPD2DQ"
 	}
 	panic(fmt.Errorf("unknown instruction %d", instruction))
 }
diff --git a/internal/asm/amd64/impl.go b/internal/asm/amd64/impl.go
index c9b213aec59..50f849a8d58 100644
--- a/internal/asm/amd64/impl.go
+++ b/internal/asm/amd64/impl.go
@@ -1214,12 +1214,12 @@ var registerToRegisterOpcode = map[asm.Instruction]struct {
 	// https://www.felixcloutier.com/x86/paddb:paddw:paddd:paddq
 	PADDB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfc}, requireSrcFloat: true, requireDstFloat: true},
 	PADDW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfd}, requireSrcFloat: true, requireDstFloat: true},
-	PADDL: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfe}, requireSrcFloat: true, requireDstFloat: true},
+	PADDD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfe}, requireSrcFloat: true, requireDstFloat: true},
 	PADDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd4}, requireSrcFloat: true, requireDstFloat: true},
 	// https://www.felixcloutier.com/x86/psubb:psubw:psubd
 	PSUBB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf8}, requireSrcFloat: true, requireDstFloat: true},
 	PSUBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf9}, requireSrcFloat: true, requireDstFloat: true},
-
PSUBL: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfa}, requireSrcFloat: true, requireDstFloat: true}, + PSUBD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfa}, requireSrcFloat: true, requireDstFloat: true}, // https://www.felixcloutier.com/x86/psubq PSUBQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfb}, requireSrcFloat: true, requireDstFloat: true}, // https://www.felixcloutier.com/x86/addps @@ -1330,6 +1330,122 @@ var registerToRegisterOpcode = map[asm.Instruction]struct { PMAXUW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3e}, requireSrcFloat: true, requireDstFloat: true}, // https://www.felixcloutier.com/x86/pmaxub:pmaxuw PMAXUB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xde}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/pmullw + PMULLW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd5}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/pmulld:pmullq + PMULLD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x40}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/pmuludq + PMULUDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf4}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/psubsb:psubsw + PSUBSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe8}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/psubsb:psubsw + PSUBSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe9}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/psubusb:psubusw + PSUBUSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd8}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/psubusb:psubusw + PSUBUSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd9}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/paddsb:paddsw + PADDSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xed}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/paddsb:paddsw + PADDSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xec}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/paddusb:paddusw + PADDUSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdd}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/pavgb:pavgw + PAVGB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe0}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/pavgb:pavgw + PAVGW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe3}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq + PABSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x1c}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq + PABSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x1d}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq + PABSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x1e}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/blendvpd + BLENDVPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x15}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/maxpd + MAXPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5f}, requireSrcFloat: true, requireDstFloat: true}, + 
// https://www.felixcloutier.com/x86/maxps + MAXPS: {opcode: []byte{0x0f, 0x5f}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/minpd + MINPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5d}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/minps + MINPS: {opcode: []byte{0x0f, 0x5d}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/andnpd + ANDNPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x55}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/andnps + ANDNPS: {opcode: []byte{0x0f, 0x55}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/mulps + MULPS: {opcode: []byte{0x0f, 0x59}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/mulpd + MULPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x59}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/divps + DIVPS: {opcode: []byte{0x0f, 0x5e}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/divpd + DIVPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5e}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/sqrtps + SQRTPS: {opcode: []byte{0x0f, 0x51}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/sqrtpd + SQRTPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x51}, requireSrcFloat: true, requireDstFloat: true}, + ROUNDPS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x08}, requireSrcFloat: true, requireDstFloat: true, needArg: true}, + ROUNDPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x09}, requireSrcFloat: true, requireDstFloat: true, needArg: true}, + // https://www.felixcloutier.com/x86/palignr + PALIGNR: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x0f}, requireSrcFloat: true, requireDstFloat: true, needArg: true}, + // https://www.felixcloutier.com/x86/punpcklbw:punpcklwd:punpckldq:punpcklqdq + PUNPCKLWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x61}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/punpckhbw:punpckhwd:punpckhdq:punpckhqdq + PUNPCKHWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x69}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/pmulhuw + PMULHUW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe4}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/pmuldq + PMULDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x28}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/pmulhrsw + PMULHRSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x0b}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/pmovsx + PMOVSXBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x20}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/pmovsx + PMOVSXWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x23}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/pmovsx + PMOVSXDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x25}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/pmovzx + PMOVZXBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x30}, requireSrcFloat: true, requireDstFloat: true}, + // 
https://www.felixcloutier.com/x86/pmovzx + PMOVZXWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x33}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/pmovzx + PMOVZXDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x35}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/pmulhw + PMULHW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe5}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/cmpps + CMPEQPS: {opcode: []byte{0x0f, 0xc2}, requireSrcFloat: true, requireDstFloat: true, needArg: true}, + // https://www.felixcloutier.com/x86/cmppd + CMPEQPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc2}, requireSrcFloat: true, requireDstFloat: true, needArg: true}, + // https://www.felixcloutier.com/x86/cvttps2dq + CVTTPS2DQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5b}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/cvtdq2ps + CVTDQ2PS: {opcode: []byte{0x0f, 0x5b}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/cvtdq2pd + CVTDQ2PD: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xe6}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/cvtpd2ps + CVTPD2PS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5a}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/cvtps2pd + CVTPS2PD: {opcode: []byte{0x0f, 0x5a}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/movupd + MOVUPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x10}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/shufps + SHUFPS: {opcode: []byte{0x0f, 0xc6}, requireSrcFloat: true, requireDstFloat: true, needArg: true}, + // https://www.felixcloutier.com/x86/pmaddwd + PMADDWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf5}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/unpcklps + UNPCKLPS: {opcode: []byte{0x0f, 0x14}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/packuswb + PACKUSWB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x67}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/packsswb:packssdw + PACKSSDW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x6b}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/packusdw + PACKUSDW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x2b}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/pmaddubsw + PMADDUBSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x04}, requireSrcFloat: true, requireDstFloat: true}, + // https://www.felixcloutier.com/x86/cvttpd2dq + CVTTPD2DQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe6}, requireDstFloat: true, requireSrcFloat: true}, } var RegisterToRegisterShiftOpcode = map[asm.Instruction]struct { @@ -1931,7 +2047,7 @@ func (a *AssemblerImpl) EncodeConstToRegister(n *NodeImpl) (err error) { isFloatReg := IsVectorRegister(n.DstReg) switch n.Instruction { - case PSLLD, PSLLQ, PSRLD, PSRLQ, PSRAW, PSRLW, PSLLW: + case PSLLD, PSLLQ, PSRLD, PSRLQ, PSRAW, PSRLW, PSLLW, PSRAD: if !isFloatReg { return fmt.Errorf("%s needs float register but got %s", InstructionName(n.Instruction), RegisterName(n.DstReg)) } @@ -2093,7 +2209,7 @@ func (a *AssemblerImpl) EncodeConstToRegister(n *NodeImpl) (err error) { a.Buf.Write([]byte{0x66, 
0x0f, 0x73, modRM})
 		a.WriteConst(n.SrcConst, 8)
 	}
-	case PSRAW:
+	case PSRAW, PSRAD:
 		// https://www.felixcloutier.com/x86/psraw:psrad:psraq
 		modRM := 0b11_000_000 | // Specifying that operand is register.
-			0b00_100_000 | // PSRAW with immediate needs "/4" extension.
+			0b00_100_000 | // Both PSRAW and PSRAD with immediate need the "/4" extension.
 			regBits
 		if rexPrefix != RexPrefixNone {
 			a.Buf.WriteByte(rexPrefix)
 		}
-		a.Buf.Write([]byte{0x0f, 0x71, modRM})
+
+		var op byte
+		if n.Instruction == PSRAD {
+			op = 0x72
+		} else {
+			op = 0x71
+		}
+
+		a.Buf.Write([]byte{0x0f, op, modRM})
 		a.WriteConst(n.SrcConst, 8)
 	case PSRLW:
 		// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
diff --git a/internal/asm/amd64/impl_staticconst.go b/internal/asm/amd64/impl_staticconst.go
index 376ae625eb1..63c3684fc75 100644
--- a/internal/asm/amd64/impl_staticconst.go
+++ b/internal/asm/amd64/impl_staticconst.go
@@ -110,6 +110,10 @@ func (a *AssemblerImpl) encodeStaticConstToRegister(n *NodeImpl) (err error) {
 		// https://www.felixcloutier.com/x86/lea
 		rexPrefix |= RexPrefixW
 		opcodes = []byte{0x8d}
+	case MOVUPD:
+		// https://www.felixcloutier.com/x86/movupd
+		mandatoryPrefix = 0x66
+		opcodes = []byte{0x0f, 0x10}
 	default:
 		err = errorEncodingUnsupported(n)
 		return
diff --git a/internal/asm/amd64/impl_staticconst_test.go b/internal/asm/amd64/impl_staticconst_test.go
index 89c96b6cfb2..602e37524af 100644
--- a/internal/asm/amd64/impl_staticconst_test.go
+++ b/internal/asm/amd64/impl_staticconst_test.go
@@ -1,6 +1,7 @@
 package amd64
 
 import (
+	"encoding/hex"
 	"testing"
 
 	"github.com/tetratelabs/wazero/internal/asm"
@@ -138,7 +139,7 @@ func TestAssemblerImpl_encodeStaticConstToRegister(t *testing.T) {
 	a.CompileStandAlone(UD2) // insert any dummy instruction before MOVDQUs.
 	err := a.CompileLoadStaticConstToRegister(MOVDQU, consts[0], RegX12)
 	require.NoError(t, err)
-	err = a.CompileLoadStaticConstToRegister(MOVDQU, consts[1], RegX0)
+	err = a.CompileLoadStaticConstToRegister(MOVUPD, consts[1], RegX0)
 	require.NoError(t, err)
 	err = a.CompileLoadStaticConstToRegister(LEAQ, consts[0], RegX0)
 	require.NoError(t, err)
@@ -153,9 +154,9 @@ func TestAssemblerImpl_encodeStaticConstToRegister(t *testing.T) {
 		// 0x2: movdqu xmm12, xmmword ptr [rip + 0x18]
 		// where rip = 0x0b, therefore [rip + 0x18] = [0x23] = consts[0].
 		0xf3, 0x44, 0x0f, 0x6f, 0x25, 0x18, 0x00, 0x00, 0x00,
-		// 0x0b: movdqu xmm0, xmmword ptr [rip + 0x18]
+		// 0x0b: movupd xmm0, xmmword ptr [rip + 0x18]
 		// where rip = 0x13, therefore [rip + 0x18] = [0x2b] = consts[1].
-		0xf3, 0x0f, 0x6f, 0x05, 0x18, 0x00, 0x00, 0x00,
+		0x66, 0x0f, 0x10, 0x05, 0x18, 0x00, 0x00, 0x00,
 		// 0x13: lea rax, [rip + 9]
 		// where rip = 0x1a, therefore [rip + 0x9] = [0x23] = consts[0].
0x48, 0x8d, 0x05, 0x09, 0x00, 0x00, 0x00, @@ -168,6 +169,5 @@ func TestAssemblerImpl_encodeStaticConstToRegister(t *testing.T) { 0x22, 0x22, 0x22, 0x22, // 0x2f: consts[2] 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, - }, actual) - + }, actual, hex.EncodeToString(actual)) } diff --git a/internal/asm/amd64/impl_test.go b/internal/asm/amd64/impl_test.go index 0a49b7ab838..1571701a1ec 100644 --- a/internal/asm/amd64/impl_test.go +++ b/internal/asm/amd64/impl_test.go @@ -487,7 +487,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "MOVDQU", n: &NodeImpl{ Instruction: MOVDQU, - Types: OperandTypesRegisterToRegister, SrcReg: RegX3, DstReg: RegX10, }, @@ -497,7 +496,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "MOVDQU", n: &NodeImpl{ Instruction: MOVDQU, - Types: OperandTypesRegisterToRegister, SrcReg: RegX10, DstReg: RegX3, }, @@ -507,7 +505,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "MOVDQU", n: &NodeImpl{ Instruction: MOVDQU, - Types: OperandTypesRegisterToRegister, SrcReg: RegX10, DstReg: RegX15, }, @@ -517,7 +514,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "MOVDQA", n: &NodeImpl{ Instruction: MOVDQA, - Types: OperandTypesRegisterToRegister, SrcReg: RegX3, DstReg: RegX10, }, @@ -527,7 +523,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "MOVDQA", n: &NodeImpl{ Instruction: MOVDQA, - Types: OperandTypesRegisterToRegister, SrcReg: RegX10, DstReg: RegX3, }, @@ -537,7 +532,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "MOVDQA", n: &NodeImpl{ Instruction: MOVDQA, - Types: OperandTypesRegisterToRegister, SrcReg: RegX10, DstReg: RegX15, }, @@ -547,7 +541,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "PACKSSWB", n: &NodeImpl{ Instruction: PACKSSWB, - Types: OperandTypesRegisterToRegister, SrcReg: RegX10, DstReg: RegX15, }, @@ -557,7 +550,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "pmovmskb r15d, xmm10", n: &NodeImpl{ Instruction: PMOVMSKB, - Types: OperandTypesRegisterToRegister, SrcReg: RegX10, DstReg: RegR15, }, @@ -567,7 +559,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "movmskps eax, xmm10", n: &NodeImpl{ Instruction: MOVMSKPS, - Types: OperandTypesRegisterToRegister, SrcReg: RegX10, DstReg: RegAX, }, @@ -577,7 +568,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "movmskps r13d, xmm1", n: &NodeImpl{ Instruction: MOVMSKPS, - Types: OperandTypesRegisterToRegister, SrcReg: RegX1, DstReg: RegR13, }, @@ -587,7 +577,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "movmskpd eax, xmm10", n: &NodeImpl{ Instruction: MOVMSKPD, - Types: OperandTypesRegisterToRegister, SrcReg: RegX10, DstReg: RegAX, }, @@ -597,7 +586,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "movmskpd r15d, xmm1", n: &NodeImpl{ Instruction: MOVMSKPD, - Types: OperandTypesRegisterToRegister, SrcReg: RegX1, DstReg: RegR15, }, @@ -607,7 +595,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "pand xmm15, xmm1", n: &NodeImpl{ Instruction: PAND, - Types: OperandTypesRegisterToRegister, SrcReg: RegX1, DstReg: RegX15, }, @@ -617,7 +604,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "por xmm1, xmm15", n: &NodeImpl{ Instruction: POR, - Types: OperandTypesRegisterToRegister, SrcReg: RegX15, DstReg: RegX1, }, @@ -627,7 +613,6 @@ func 
TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "pandn xmm13, xmm15", n: &NodeImpl{ Instruction: PANDN, - Types: OperandTypesRegisterToRegister, SrcReg: RegX15, DstReg: RegX13, }, @@ -637,7 +622,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "psrad xmm13, xmm15", n: &NodeImpl{ Instruction: PSRAD, - Types: OperandTypesRegisterToRegister, SrcReg: RegX15, DstReg: RegX13, }, @@ -647,7 +631,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "psraw xmm1, xmm1", n: &NodeImpl{ Instruction: PSRAW, - Types: OperandTypesRegisterToRegister, SrcReg: RegX1, DstReg: RegX1, }, @@ -657,7 +640,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "psrlq xmm14, xmm14", n: &NodeImpl{ Instruction: PSRLQ, - Types: OperandTypesRegisterToRegister, SrcReg: RegX14, DstReg: RegX14, }, @@ -667,7 +649,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "psrld xmm3, xmm3", n: &NodeImpl{ Instruction: PSRLD, - Types: OperandTypesRegisterToRegister, SrcReg: RegX3, DstReg: RegX3, }, @@ -677,7 +658,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "psrlw xmm15, xmm1", n: &NodeImpl{ Instruction: PSRLW, - Types: OperandTypesRegisterToRegister, SrcReg: RegX1, DstReg: RegX15, }, @@ -687,7 +667,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "psllw xmm1, xmm15", n: &NodeImpl{ Instruction: PSLLW, - Types: OperandTypesRegisterToRegister, SrcReg: RegX15, DstReg: RegX1, }, @@ -697,7 +676,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "punpcklbw xmm1, xmm15", n: &NodeImpl{ Instruction: PUNPCKLBW, - Types: OperandTypesRegisterToRegister, SrcReg: RegX15, DstReg: RegX1, }, @@ -707,7 +685,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "punpckhbw xmm11, xmm1", n: &NodeImpl{ Instruction: PUNPCKHBW, - Types: OperandTypesRegisterToRegister, SrcReg: RegX1, DstReg: RegX11, }, @@ -717,7 +694,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "pslld xmm11, xmm1", n: &NodeImpl{ Instruction: PSLLD, - Types: OperandTypesRegisterToRegister, SrcReg: RegX1, DstReg: RegX11, }, @@ -727,7 +703,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "psllq xmm11, xmm15", n: &NodeImpl{ Instruction: PSLLQ, - Types: OperandTypesRegisterToRegister, SrcReg: RegX15, DstReg: RegX11, }, @@ -737,7 +712,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "cmpeqps xmm11, xmm15", n: &NodeImpl{ Instruction: CMPPS, - Types: OperandTypesRegisterToRegister, SrcReg: RegX15, DstReg: RegX11, Arg: 0, // CMPPS with arg=0 == Pseudo-Op CMPEQPS. @@ -748,7 +722,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "cmpordps xmm1, xmm5", n: &NodeImpl{ Instruction: CMPPS, - Types: OperandTypesRegisterToRegister, SrcReg: RegX5, DstReg: RegX1, Arg: 7, // CMPPS with arg=7 == Pseudo-Op CMPORDPS. @@ -759,7 +732,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "cmplepd xmm11, xmm15", n: &NodeImpl{ Instruction: CMPPD, - Types: OperandTypesRegisterToRegister, SrcReg: RegX15, DstReg: RegX11, Arg: 2, // CMPPD with arg=2 == Pseudo-Op CMPLEPD. @@ -770,7 +742,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "cmpneqpd xmm1, xmm5", n: &NodeImpl{ Instruction: CMPPD, - Types: OperandTypesRegisterToRegister, SrcReg: RegX5, DstReg: RegX1, Arg: 4, // CMPPD with arg=4 == Pseudo-Op CMPNEQPD. 
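
A note on the displacement arithmetic in the `encodeStaticConstToRegister` expectations above: x86-64 RIP-relative operands are encoded relative to the address of the *next* instruction, which is why each comment adds the instruction length before the disp32. A minimal, self-contained sketch of that arithmetic — the offsets are taken from the test comments, and nothing here is wazero API:

```go
package main

import "fmt"

func main() {
	// movupd xmm0, xmmword ptr [rip + 0x18] sits at offset 0x0b and is
	// 8 bytes long (66 0f 10 05 + disp32), so RIP = 0x0b + 8 = 0x13, and
	// the operand address is 0x13 + 0x18 = 0x2b, where consts[1] lives.
	instrOffset, instrLen, disp32 := 0x0b, 8, 0x18
	rip := instrOffset + instrLen
	fmt.Printf("rip=%#x, operand=%#x\n", rip, rip+disp32) // rip=0x13, operand=0x2b
}
```
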
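The `Arg` comments in the CMPPS/CMPPD cases above ("arg=0 == Pseudo-Op CMPEQPS", "arg=7 == CMPORDPS", and so on) follow the standard SSE compare-predicate encoding: 0=EQ, 1=LT, 2=LE, 3=UNORD, 4=NEQ, 5=NLT, 6=NLE, 7=ORD. A small illustrative helper — not part of the assembler — that reconstructs the pseudo-op mnemonics the test names use:

```go
// ssePredicateName maps the imm8 predicate of CMPPS/CMPPD to its pseudo-op
// mnemonic. The predicate table is the standard SSE/SSE2 encoding; the
// helper itself exists only to illustrate the Arg values in the tests.
func ssePredicateName(base string, imm8 byte) string {
	predicates := [...]string{"EQ", "LT", "LE", "UNORD", "NEQ", "NLT", "NLE", "ORD"}
	if int(imm8) >= len(predicates) {
		return base // imm8 > 7 has no pseudo-op mnemonic in baseline SSE/SSE2
	}
	// e.g. ssePredicateName("CMPPS", 0) == "CMPEQPS",
	//      ssePredicateName("CMPPD", 4) == "CMPNEQPD".
	return "CMP" + predicates[imm8] + base[len(base)-2:]
}
```
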
@@ -781,7 +752,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "pcmpgtq xmm10, xmm3", n: &NodeImpl{ Instruction: PCMPGTQ, - Types: OperandTypesRegisterToRegister, SrcReg: RegX3, DstReg: RegX10, }, @@ -791,7 +761,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "pcmpgtd xmm10, xmm3", n: &NodeImpl{ Instruction: PCMPGTD, - Types: OperandTypesRegisterToRegister, SrcReg: RegX3, DstReg: RegX10, }, @@ -801,7 +770,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "pminsd xmm10, xmm3", n: &NodeImpl{ Instruction: PMINSD, - Types: OperandTypesRegisterToRegister, SrcReg: RegX3, DstReg: RegX10, }, @@ -811,7 +779,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "pmaxsd xmm1, xmm12", n: &NodeImpl{ Instruction: PMAXSD, - Types: OperandTypesRegisterToRegister, SrcReg: RegX12, DstReg: RegX1, }, @@ -821,7 +788,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "pmaxsw xmm1, xmm12", n: &NodeImpl{ Instruction: PMAXSW, - Types: OperandTypesRegisterToRegister, SrcReg: RegX12, DstReg: RegX1, }, @@ -831,7 +797,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "pminsw xmm1, xmm12", n: &NodeImpl{ Instruction: PMINSW, - Types: OperandTypesRegisterToRegister, SrcReg: RegX12, DstReg: RegX1, }, @@ -841,7 +806,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "pcmpgtb xmm1, xmm12", n: &NodeImpl{ Instruction: PCMPGTB, - Types: OperandTypesRegisterToRegister, SrcReg: RegX12, DstReg: RegX1, }, @@ -851,7 +815,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "pminsb xmm1, xmm12", n: &NodeImpl{ Instruction: PMINSB, - Types: OperandTypesRegisterToRegister, SrcReg: RegX12, DstReg: RegX1, }, @@ -861,7 +824,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "pmaxsb xmm1, xmm2", n: &NodeImpl{ Instruction: PMAXSB, - Types: OperandTypesRegisterToRegister, SrcReg: RegX2, DstReg: RegX1, }, @@ -871,7 +833,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "pminud xmm1, xmm2", n: &NodeImpl{ Instruction: PMINUD, - Types: OperandTypesRegisterToRegister, SrcReg: RegX2, DstReg: RegX1, }, @@ -881,7 +842,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "pminuw xmm1, xmm2", n: &NodeImpl{ Instruction: PMINUW, - Types: OperandTypesRegisterToRegister, SrcReg: RegX2, DstReg: RegX1, }, @@ -891,7 +851,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "pminub xmm1, xmm2", n: &NodeImpl{ Instruction: PMINUB, - Types: OperandTypesRegisterToRegister, SrcReg: RegX2, DstReg: RegX1, }, @@ -901,7 +860,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "pmaxud xmm1, xmm2", n: &NodeImpl{ Instruction: PMAXUD, - Types: OperandTypesRegisterToRegister, SrcReg: RegX2, DstReg: RegX1, }, @@ -911,7 +869,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "pmaxuw xmm1, xmm2", n: &NodeImpl{ Instruction: PMAXUW, - Types: OperandTypesRegisterToRegister, SrcReg: RegX2, DstReg: RegX1, }, @@ -921,7 +878,6 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "pmaxub xmm1, xmm2", n: &NodeImpl{ Instruction: PMAXUB, - Types: OperandTypesRegisterToRegister, SrcReg: RegX2, DstReg: RegX1, }, @@ -931,12 +887,495 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { name: "pcmpgtw xmm1, xmm2", n: &NodeImpl{ Instruction: PCMPGTW, - Types: OperandTypesRegisterToRegister, SrcReg: RegX2, DstReg: RegX1, }, exp: 
[]byte{0x66, 0xf, 0x65, 0xca}, }, + + { + name: "pmullw xmm13, xmm1", + n: &NodeImpl{ + Instruction: PMULLW, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0xd5, 0xe9}, + }, + { + name: "pmulld xmm1, xmm11", + n: &NodeImpl{ + Instruction: PMULLD, + SrcReg: RegX11, + DstReg: RegX1, + }, + exp: []byte{0x66, 0x41, 0xf, 0x38, 0x40, 0xcb}, + }, + { + name: "pmuludq xmm13, xmm1", + n: &NodeImpl{ + Instruction: PMULUDQ, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0xf4, 0xe9}, + }, + { + name: "psubsb xmm13, xmm1", + n: &NodeImpl{ + Instruction: PSUBSB, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0xe8, 0xe9}, + }, + { + name: "psubsw xmm13, xmm1", + n: &NodeImpl{ + Instruction: PSUBSW, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0xe9, 0xe9}, + }, + { + name: "psubusb xmm13, xmm1", + n: &NodeImpl{ + Instruction: PSUBUSB, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0xd8, 0xe9}, + }, + { + name: "psubusw xmm13, xmm1", + n: &NodeImpl{ + Instruction: PSUBUSW, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0xd9, 0xe9}, + }, + { + name: "paddsw xmm13, xmm1", + n: &NodeImpl{ + Instruction: PADDSW, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0xed, 0xe9}, + }, + { + name: "paddsb xmm13, xmm1", + n: &NodeImpl{ + Instruction: PADDSB, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0xec, 0xe9}, + }, + { + name: "paddusw xmm13, xmm1", + n: &NodeImpl{ + Instruction: PADDUSW, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0xdd, 0xe9}, + }, + { + name: "pavgb xmm13, xmm1", + n: &NodeImpl{ + Instruction: PAVGB, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0xe0, 0xe9}, + }, + { + name: "pavgw xmm13, xmm1", + n: &NodeImpl{ + Instruction: PAVGW, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0xe3, 0xe9}, + }, + { + name: "pabsb xmm13, xmm1", + n: &NodeImpl{ + Instruction: PABSB, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0x38, 0x1c, 0xe9}, + }, + { + name: "pabsw xmm13, xmm1", + n: &NodeImpl{ + Instruction: PABSW, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0x38, 0x1d, 0xe9}, + }, + { + name: "pabsd xmm13, xmm1", + n: &NodeImpl{ + Instruction: PABSD, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0x38, 0x1e, 0xe9}, + }, + { + name: "blendvpd xmm13, xmm1", + n: &NodeImpl{ + Instruction: BLENDVPD, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0x38, 0x15, 0xe9}, + }, + { + name: "maxpd xmm13, xmm1", + n: &NodeImpl{ + Instruction: MAXPD, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0x5f, 0xe9}, + }, + { + name: "maxps xmm13, xmm1", + n: &NodeImpl{ + Instruction: MAXPS, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x44, 0xf, 0x5f, 0xe9}, + }, + { + name: "minpd xmm13, xmm1", + n: &NodeImpl{ + Instruction: MINPD, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0x5d, 0xe9}, + }, + { + name: "minps xmm13, xmm1", + n: &NodeImpl{ + Instruction: MINPS, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x44, 0xf, 0x5d, 0xe9}, + }, + { + name: "andnpd xmm13, xmm1", + n: &NodeImpl{ + Instruction: ANDNPD, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0x55, 0xe9}, + }, + { + name: "andnps xmm13, xmm1", + n: &NodeImpl{ + Instruction: ANDNPS, + SrcReg: RegX1, + DstReg: 
RegX13, + }, + exp: []byte{0x44, 0xf, 0x55, 0xe9}, + }, + { + name: "mulps xmm13, xmm1", + n: &NodeImpl{ + Instruction: MULPS, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x44, 0xf, 0x59, 0xe9}, + }, + { + name: "mulpd xmm13, xmm1", + n: &NodeImpl{ + Instruction: MULPD, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0x59, 0xe9}, + }, + { + name: "divps xmm13, xmm1", + n: &NodeImpl{ + Instruction: DIVPS, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x44, 0xf, 0x5e, 0xe9}, + }, + { + name: "divpd xmm13, xmm1", + n: &NodeImpl{ + Instruction: DIVPD, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0x5e, 0xe9}, + }, + { + name: "sqrtps xmm13, xmm1", + n: &NodeImpl{ + Instruction: SQRTPS, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x44, 0xf, 0x51, 0xe9}, + }, + { + name: "sqrtpd xmm13, xmm1", + n: &NodeImpl{ + Instruction: SQRTPD, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0x51, 0xe9}, + }, + { + name: "roundps xmm13, xmm1, 0", + n: &NodeImpl{ + Instruction: ROUNDPS, + SrcReg: RegX1, + DstReg: RegX13, + Arg: 0, + }, + exp: []byte{0x66, 0x44, 0xf, 0x3a, 0x8, 0xe9, 0x0}, + }, + { + name: "roundps xmm13, xmm1, 1", + n: &NodeImpl{ + Instruction: ROUNDPS, + SrcReg: RegX1, + DstReg: RegX13, + Arg: 1, + }, + exp: []byte{0x66, 0x44, 0xf, 0x3a, 0x8, 0xe9, 0x1}, + }, + { + name: "roundps xmm13, xmm1, 3", + n: &NodeImpl{ + Instruction: ROUNDPS, + SrcReg: RegX1, + DstReg: RegX13, + Arg: 3, + }, + exp: []byte{0x66, 0x44, 0xf, 0x3a, 0x8, 0xe9, 0x3}, + }, + { + name: "roundpd xmm13, xmm1, 0", + n: &NodeImpl{ + Instruction: ROUNDPD, + SrcReg: RegX1, + DstReg: RegX13, + Arg: 0, + }, + exp: []byte{0x66, 0x44, 0xf, 0x3a, 0x9, 0xe9, 0x0}, + }, + { + name: "roundpd xmm13, xmm1, 1", + n: &NodeImpl{ + Instruction: ROUNDPD, + SrcReg: RegX1, + DstReg: RegX13, + Arg: 1, + }, + exp: []byte{0x66, 0x44, 0xf, 0x3a, 0x9, 0xe9, 0x1}, + }, + { + name: "roundpd xmm13, xmm1, 3", + n: &NodeImpl{ + Instruction: ROUNDPD, + SrcReg: RegX1, + DstReg: RegX13, + Arg: 3, + }, + exp: []byte{0x66, 0x44, 0xf, 0x3a, 0x9, 0xe9, 0x3}, + }, + { + name: "palignr xmm13, xmm1, 3", + n: &NodeImpl{ + Instruction: PALIGNR, + SrcReg: RegX1, + DstReg: RegX13, + Arg: 3, + }, + exp: []byte{0x66, 0x44, 0xf, 0x3a, 0xf, 0xe9, 0x3}, + }, + { + name: "punpcklwd xmm13, xmm1", + n: &NodeImpl{ + Instruction: PUNPCKLWD, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0x61, 0xe9}, + }, + { + name: "punpckhwd xmm13, xmm1", + n: &NodeImpl{ + Instruction: PUNPCKHWD, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0x69, 0xe9}, + }, + { + name: "pmulhuw xmm13, xmm1", + n: &NodeImpl{ + Instruction: PMULHUW, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0xe4, 0xe9}, + }, + { + name: "pmuldq xmm13, xmm1", + n: &NodeImpl{ + Instruction: PMULDQ, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0x38, 0x28, 0xe9}, + }, + { + name: "pmulhrsw xmm13, xmm1", + n: &NodeImpl{ + Instruction: PMULHRSW, + SrcReg: RegX1, + DstReg: RegX13, + }, + exp: []byte{0x66, 0x44, 0xf, 0x38, 0xb, 0xe9}, + }, + + { + name: "pmovsxbw xmm5, xmm10", + n: &NodeImpl{Instruction: PMOVSXBW, SrcReg: RegX10, DstReg: RegX5}, + exp: []byte{0x66, 0x41, 0xf, 0x38, 0x20, 0xea}, + }, + { + name: "pmovsxwd xmm5, xmm10", + n: &NodeImpl{Instruction: PMOVSXWD, SrcReg: RegX10, DstReg: RegX5}, + exp: []byte{0x66, 0x41, 0xf, 0x38, 0x23, 0xea}, + }, + { + name: "pmovsxdq xmm5, xmm10", + n: &NodeImpl{Instruction: PMOVSXDQ, 
SrcReg: RegX10, DstReg: RegX5}, + exp: []byte{0x66, 0x41, 0xf, 0x38, 0x25, 0xea}, + }, + { + name: "pmovzxbw xmm5, xmm10", + n: &NodeImpl{Instruction: PMOVZXBW, SrcReg: RegX10, DstReg: RegX5}, + exp: []byte{0x66, 0x41, 0xf, 0x38, 0x30, 0xea}, + }, + { + name: "pmovzxwd xmm5, xmm10", + n: &NodeImpl{Instruction: PMOVZXWD, SrcReg: RegX10, DstReg: RegX5}, + exp: []byte{0x66, 0x41, 0xf, 0x38, 0x33, 0xea}, + }, + { + name: "pmovzxdq xmm5, xmm10", + n: &NodeImpl{Instruction: PMOVZXDQ, SrcReg: RegX10, DstReg: RegX5}, + exp: []byte{0x66, 0x41, 0xf, 0x38, 0x35, 0xea}, + }, + { + name: "pmulhw xmm2, xmm1", + n: &NodeImpl{Instruction: PMULHW, SrcReg: RegX1, DstReg: RegX2}, + exp: []byte{0x66, 0xf, 0xe5, 0xd1}, + }, + { + name: "cmpltps xmm1, xmm14", + n: &NodeImpl{Instruction: CMPEQPS, SrcReg: RegX14, DstReg: RegX1, Arg: 1}, + exp: []byte{0x41, 0xf, 0xc2, 0xce, 0x1}, + }, + { + name: "cmpunordpd xmm1, xmm14", + n: &NodeImpl{Instruction: CMPEQPD, SrcReg: RegX14, DstReg: RegX1, Arg: 3}, + exp: []byte{0x66, 0x41, 0xf, 0xc2, 0xce, 0x3}, + }, + { + name: "cvttps2dq xmm1, xmm14", + n: &NodeImpl{Instruction: CVTTPS2DQ, SrcReg: RegX14, DstReg: RegX1}, + exp: []byte{0xf3, 0x41, 0xf, 0x5b, 0xce}, + }, + { + name: "cvtdq2ps xmm1, xmm14", + n: &NodeImpl{Instruction: CVTDQ2PS, SrcReg: RegX14, DstReg: RegX1}, + exp: []byte{0x41, 0xf, 0x5b, 0xce}, + }, + { + name: "movupd xmm1, xmm14", + n: &NodeImpl{Instruction: MOVUPD, SrcReg: RegX14, DstReg: RegX1}, + exp: []byte{0x66, 0x41, 0xf, 0x10, 0xce}, + }, + { + name: "shufps xmm1, xmm14, 5", + n: &NodeImpl{Instruction: SHUFPS, SrcReg: RegX14, DstReg: RegX1, Arg: 5}, + exp: []byte{0x41, 0xf, 0xc6, 0xce, 0x5}, + }, + { + name: "pmaddwd xmm1, xmm14", + n: &NodeImpl{Instruction: PMADDWD, SrcReg: RegX14, DstReg: RegX1}, + exp: []byte{0x66, 0x41, 0xf, 0xf5, 0xce}, + }, + { + name: "cvtdq2pd xmm1, xmm14", + n: &NodeImpl{Instruction: CVTDQ2PD, SrcReg: RegX14, DstReg: RegX1}, + exp: []byte{0xf3, 0x41, 0xf, 0xe6, 0xce}, + }, + { + name: "unpcklps xmm1, xmm14", + n: &NodeImpl{Instruction: UNPCKLPS, SrcReg: RegX14, DstReg: RegX1}, + exp: []byte{0x41, 0xf, 0x14, 0xce}, + }, + { + name: "packuswb xmm1, xmm14", + n: &NodeImpl{Instruction: PACKUSWB, SrcReg: RegX14, DstReg: RegX1}, + exp: []byte{0x66, 0x41, 0xf, 0x67, 0xce}, + }, + { + name: "packssdw xmm1, xmm14", + n: &NodeImpl{Instruction: PACKSSDW, SrcReg: RegX14, DstReg: RegX1}, + exp: []byte{0x66, 0x41, 0xf, 0x6b, 0xce}, + }, + { + name: "packusdw xmm1, xmm14", + n: &NodeImpl{Instruction: PACKUSDW, SrcReg: RegX14, DstReg: RegX1}, + exp: []byte{0x66, 0x41, 0xf, 0x38, 0x2b, 0xce}, + }, + { + name: "cvtps2pd xmm1, xmm14", + n: &NodeImpl{Instruction: CVTPS2PD, SrcReg: RegX14, DstReg: RegX1}, + exp: []byte{0x41, 0xf, 0x5a, 0xce}, + }, + { + name: "cvtpd2ps xmm1, xmm14", + n: &NodeImpl{Instruction: CVTPD2PS, SrcReg: RegX14, DstReg: RegX1}, + exp: []byte{0x66, 0x41, 0xf, 0x5a, 0xce}, + }, + { + name: "pmaddubsw xmm1, xmm14", + n: &NodeImpl{Instruction: PMADDUBSW, SrcReg: RegX14, DstReg: RegX1}, + exp: []byte{0x66, 0x41, 0xf, 0x38, 0x4, 0xce}, + }, + { + name: "cvttpd2dq xmm1, xmm14", + n: &NodeImpl{Instruction: CVTTPD2DQ, SrcReg: RegX14, DstReg: RegX1}, + exp: []byte{0x66, 0x41, 0xf, 0xe6, 0xce}, + }, } for _, tt := range tests { @@ -1021,6 +1460,16 @@ func TestAssemblerImpl_EncodeConstToRegister(t *testing.T) { }, exp: []byte{0x66, 0x41, 0xf, 0x71, 0xf2, 0x8}, }, + { + name: "psrad xmm10, 0x1f", + n: &NodeImpl{ + Instruction: PSRAD, + Types: OperandTypesRegisterToRegister, + SrcConst: 0x1f, + DstReg: RegX10, + }, + exp: []byte{0x66, 
0x41, 0xf, 0x72, 0xe2, 0x1f},
+		},
 	}
 
 	for _, tt := range tests {
diff --git a/internal/asm/assembler.go b/internal/asm/assembler.go
index 100037f04cc..421954f8b98 100644
--- a/internal/asm/assembler.go
+++ b/internal/asm/assembler.go
@@ -15,7 +15,7 @@ type Register byte
 const NilRegister Register = 0
 
 // Instruction represents architecture-specific instructions.
-type Instruction byte
+type Instruction uint16
 
 // ConditionalRegisterState represents architecture-specific conditional
 // register's states.
diff --git a/internal/engine/compiler/compiler.go b/internal/engine/compiler/compiler.go
index 8e57a6eab23..3fd689e2687 100644
--- a/internal/engine/compiler/compiler.go
+++ b/internal/engine/compiler/compiler.go
@@ -16,474 +16,286 @@ type compiler interface {
 	// stackPointerCeil is the max stack pointer that the target function would reach.
 	// staticData is codeStaticData for the resulting native code.
 	compile() (code []byte, staticData codeStaticData, stackPointerCeil uint64, err error)
-	// compileHostFunction emits the trampoline code from which native code can jump into the host function.
+	// compileHostFunction adds the trampoline code from which native code can jump into the host function.
 	// TODO: maybe we wouldn't need to have trampoline for host functions.
 	compileHostFunction() error
 	// compileLabel notify compilers of the beginning of a label.
 	// Return true if the compiler decided to skip the entire label.
 	// See wazeroir.OperationLabel
 	compileLabel(o *wazeroir.OperationLabel) (skipThisLabel bool)
-	// compileUnreachable adds instructions to return to engine with nativeCallStatusCodeUnreachable status.
-	// See wasm.OpcodeUnreachable
+	// compileUnreachable adds instructions to perform wazeroir.OperationUnreachable.
 	compileUnreachable() error
-	// compileSwap adds instruction to swap the stack top value with the target in the Wasm value stack.
-	// The values are might be on registers or memory-stack at runtime, so compiler implementations
-	// emit instructions to swap values depending these locations.
-	// See wazeroir.OperationBrIf
+	// compileSwap adds instructions to perform wazeroir.OperationSwap.
 	compileSwap(o *wazeroir.OperationSwap) error
-	// compileGlobalGet adds instructions to read the value of the given index in the ModuleInstance.Globals
-	// and push the value onto the stack.
-	// See wasm.OpcodeGlobalGet
+	// compileGlobalGet adds instructions to perform wazeroir.OperationGlobalGet.
 	compileGlobalGet(o *wazeroir.OperationGlobalGet) error
-	// compileGlobalSet adds instructions to set the top value on the stack to the given index in the ModuleInstance.Globals.
-	// See wasm.OpcodeGlobalSet
+	// compileGlobalSet adds instructions to perform wazeroir.OperationGlobalSet.
 	compileGlobalSet(o *wazeroir.OperationGlobalSet) error
-	// compileBr adds instructions to branch into the given label.
-	// See wasm.OpcodeBr
+	// compileBr adds instructions to perform wazeroir.OperationBr.
 	compileBr(o *wazeroir.OperationBr) error
-	// compileBrIf adds instructions to pops a value and branch into ".then" label if the value equals 1.
-	// Otherwise, the code branches into ".else" label.
-	// See wasm.OpcodeBrIf and wazeroir.OperationBrIf
+	// compileBrIf adds instructions to perform wazeroir.OperationBrIf.
 	compileBrIf(o *wazeroir.OperationBrIf) error
-	// compileBrTable adds instructions to do br_table operation.
- // A br_table operation has list of targets and default target, and - // this pops a value from the stack (called "index") and decide which branch we go into next - // based on the value. - // - // For example, assume we have operations like {default: L_DEFAULT, targets: [L0, L1, L2]}. - // If "index" >= len(defaults), then branch into the L_DEFAULT label. - // Otherwise, we enter label of targets[index]. - // See wasm.OpcodeBrTable + // compileBrTable adds instructions to perform wazeroir.OperationBrTable. compileBrTable(o *wazeroir.OperationBrTable) error - // compileCall adds instructions to call into a function of the given index. - // See wasm.OpcodeCall + // compileCall adds instructions to perform wazeroir.OperationCall. compileCall(o *wazeroir.OperationCall) error - // compileCallIndirect adds instructions to perform call_indirect operation. - // This consumes the one value from the top of stack (called "offset"), - // and make a function call against the function whose function address equals "table[offset]". - // - // Note: This is called indirect function call in the sense that the target function is indirectly - // determined by the current state (top value) of the stack. - // Therefore, two checks are performed at runtime before entering the target function: - // 1) If "offset" exceeds the length of table, the function exits with nativeCallStatusCodeInvalidTableAccess. - // 2) If the type of the function table[offset] doesn't match the specified function type, the function exits with nativeCallStatusCodeTypeMismatchOnIndirectCall. - // Otherwise, we successfully enter the target function. - // - // See wasm.CallIndirect + // compileCallIndirect adds instructions to perform wazeroir.OperationCallIndirect. compileCallIndirect(o *wazeroir.OperationCallIndirect) error - // compileDrop adds instructions to drop values within the given inclusive range from the value stack. - // See wazeroir.OperationDrop + // compileDrop adds instructions to perform wazeroir.OperationDrop. compileDrop(o *wazeroir.OperationDrop) error - // compileSelect uses top three values on the stack. For example, if we have stack as [..., x1, x2, c] - // and the value "c" equals zero, then the stack results in [..., x1], otherwise, [..., x2]. - // See wasm.OpcodeSelect + // compileSelect adds instructions to perform wazeroir.OperationSelect. compileSelect() error - // compilePick adds instructions to copy a value on the given location in the Wasm value stack, - // and push the copied value onto the top of the stack. - // See wazeroir.OperationPick + // compilePick adds instructions to perform wazeroir.OperationPick. compilePick(o *wazeroir.OperationPick) error - // compileAdd adds instructions to pop two values from the stack, add these two values, and push - // back the result onto the stack. - // See wasm.OpcodeI32Add wasm.OpcodeI64Add wasm.OpcodeF32Add wasm.OpcodeF64Add + // compileAdd adds instructions to perform wazeroir.OperationAdd. compileAdd(o *wazeroir.OperationAdd) error - // compileSub adds instructions to pop two values from the stack, subtract the top from the second one, and push - // back the result onto the stack. - // See wasm.OpcodeI32Sub wasm.OpcodeI64Sub wasm.OpcodeF32Sub wasm.OpcodeF64Sub + // compileSub adds instructions to perform wazeroir.OperationSub. compileSub(o *wazeroir.OperationSub) error - // compileMul adds instructions to pop two values from the stack, multiply these two values, and push - // back the result onto the stack. 
- // See wasm.OpcodeI32Mul wasm.OpcodeI64Mul wasm.OpcodeF32Mul wasm.OpcodeF64Mul + // compileMul adds instructions to perform wazeroir.OperationMul. compileMul(o *wazeroir.OperationMul) error - // compileClz emits instructions to count up the leading zeros in the - // current top of the stack, and push the count result. - // For example, stack of [..., 0x00_ff_ff_ff] results in [..., 8]. - // See wasm.OpcodeI32Clz wasm.OpcodeI64Clz + // compileClz adds instructions to perform wazeroir.OperationClz. compileClz(o *wazeroir.OperationClz) error - // compileCtz emits instructions to count up the trailing zeros in the - // current top of the stack, and push the count result. - // For example, stack of [..., 0xff_ff_ff_00] results in [..., 8]. - // See wasm.OpcodeI32Ctz wasm.OpcodeI64Ctz + // compileCtz adds instructions to perform wazeroir.OperationCtz. compileCtz(o *wazeroir.OperationCtz) error - // compilePopcnt emits instructions to count up the number of set bits in the - // current top of the stack, and push the count result. - // For example, stack of [..., 0b00_00_00_11] results in [..., 2]. - // See wasm.OpcodeI32Popcnt wasm.OpcodeI64Popcnt + // compilePopcnt adds instructions to perform wazeroir.OperationPopcnt. compilePopcnt(o *wazeroir.OperationPopcnt) error - // compileDiv emits the instructions to perform division on the top two values on the stack. - // See wasm.OpcodeI32DivS wasm.OpcodeI32DivU wasm.OpcodeI64DivS wasm.OpcodeI64DivU wasm.OpcodeF32Div wasm.OpcodeF64Div + // compileDiv adds instructions to perform wazeroir.OperationDiv. compileDiv(o *wazeroir.OperationDiv) error - // compileRem emits the instructions to perform division on the top - // two values of integer type on the stack and puts the remainder of the result - // onto the stack. For example, stack [..., 10, 3] results in [..., 1] where - // the quotient is discarded. - // See wasm.OpcodeI32RemS wasm.OpcodeI32RemU wasm.OpcodeI64RemS wasm.OpcodeI64RemU + // compileRem adds instructions to perform wazeroir.OperationRem. compileRem(o *wazeroir.OperationRem) error - // compileAnd emits instructions to perform logical "and" operation on - // top two values on the stack, and push the result. - // See wasm.OpcodeI32And wasm.OpcodeI64And + // compileAnd adds instructions to perform wazeroir.OperationAnd. compileAnd(o *wazeroir.OperationAnd) error - // compileOr emits instructions to perform logical "or" operation on - // top two values on the stack, and pushes the result. - // See wasm.OpcodeI32Or wasm.OpcodeI64Or + // compileOr adds instructions to perform wazeroir.OperationOr. compileOr(o *wazeroir.OperationOr) error - // compileXor emits instructions to perform logical "xor" operation on - // top two values on the stack, and pushes the result. - // See wasm.OpcodeI32Xor wasm.OpcodeI64Xor + // compileXor adds instructions to perform wazeroir.OperationXor. compileXor(o *wazeroir.OperationXor) error - // compileShl emits instructions to perform a shift-left operation on - // top two values on the stack, and pushes the result. - // See wasm.OpcodeI32Shl wasm.OpcodeI64Shl + // compileShl adds instructions to perform wazeroir.OperationShl. compileShl(o *wazeroir.OperationShl) error - // compileShr emits instructions to perform a shift-right operation on - // top two values on the stack, and pushes the result. - // See wasm.OpcodeI32Shr wasm.OpcodeI64Shr + // compileShr adds instructions to perform wazeroir.OperationShr. 
compileShr(o *wazeroir.OperationShr) error - // compileRotl emits instructions to perform a rotate-left operation on - // top two values on the stack, and pushes the result. - // See wasm.OpcodeI32Rotl wasm.OpcodeI64Rotl + // compileRotl adds instructions to perform wazeroir.OperationRotl. compileRotl(o *wazeroir.OperationRotl) error - // compileRotr emits instructions to perform a rotate-right operation on - // top two values on the stack, and pushes the result. - // See wasm.OpcodeI32Rotr wasm.OpcodeI64Rotr + // compileRotr adds instructions to perform wazeroir.OperationRotr. compileRotr(o *wazeroir.OperationRotr) error - // compileAbs adds instructions to replace the top value of float type on the stack with its absolute value. - // For example, stack [..., -1.123] results in [..., 1.123]. - // See wasm.OpcodeF32Abs wasm.OpcodeF64Abs + // compileNeg adds instructions to perform wazeroir.OperationAbs. compileAbs(o *wazeroir.OperationAbs) error - // compileNeg adds instructions to replace the top value of float type on the stack with its negated value. - // For example, stack [..., -1.123] results in [..., 1.123]. - // See wasm.OpcodeF32Neg wasm.OpcodeF64Neg + // compileNeg adds instructions to perform wazeroir.OperationNeg. compileNeg(o *wazeroir.OperationNeg) error - // compileCeil adds instructions to replace the top value of float type on the stack with its ceiling value. - // For example, stack [..., 1.123] results in [..., 2.0]. This is equivalent to math.Ceil. - // See wasm.OpcodeF32Ceil wasm.OpcodeF64Ceil + // compileCeil adds instructions to perform wazeroir.OperationCeil. compileCeil(o *wazeroir.OperationCeil) error - // compileFloor adds instructions to replace the top value of float type on the stack with its floor value. - // For example, stack [..., 1.123] results in [..., 1.0]. This is equivalent to math.Floor. - // See wasm.OpcodeF32Floor wasm.OpcodeF64Floor + // compileFloor adds instructions to perform wazeroir.OperationFloor. compileFloor(o *wazeroir.OperationFloor) error - // compileTrunc adds instructions to replace the top value of float type on the stack with its truncated value. - // For example, stack [..., 1.9] results in [..., 1.0]. This is equivalent to math.Trunc. - // See wasm.OpcodeF32Trunc wasm.OpcodeF64Trunc + // compileTrunc adds instructions to perform wazeroir.OperationTrunc. compileTrunc(o *wazeroir.OperationTrunc) error - // compileNearest adds instructions to replace the top value of float type on the stack with its nearest integer value. - // For example, stack [..., 1.9] results in [..., 2.0]. This is *not* equivalent to math.Round and instead has the same - // the semantics of LLVM's rint intrinsic. See https://llvm.org/docs/LangRef.html#llvm-rint-intrinsic. - // For example, math.Round(-4.5) produces -5 while we want to produce -4. - // See wasm.OpcodeF32Nearest wasm.OpcodeF64Nearest + // compileNearest adds instructions to perform wazeroir.OperationNearest. compileNearest(o *wazeroir.OperationNearest) error - // compileSqrt adds instructions to replace the top value of float type on the stack with its square root. - // For example, stack [..., 9.0] results in [..., 3.0]. This is equivalent to "math.Sqrt". - // See wasm.OpcodeF32Sqrt wasm.OpcodeF64Sqrt + // compileSqrt adds instructions perform wazeroir.OperationSqrt. compileSqrt(o *wazeroir.OperationSqrt) error - // compileMin adds instructions to pop two values from the stack, and push back the maximum of - // these two values onto the stack. 
For example, stack [..., 100.1, 1.9] results in [..., 1.9]. - // Note: WebAssembly specifies that min/max must always return NaN if one of values is NaN, - // which is a different behavior different from math.Min. - // See wasm.OpcodeF32Min wasm.OpcodeF64Min + // compileMin adds instructions to perform wazeroir.OperationMin. compileMin(o *wazeroir.OperationMin) error - // compileMax adds instructions to pop two values from the stack, and push back the maximum of - // these two values onto the stack. For example, stack [..., 100.1, 1.9] results in [..., 100.1]. - // Note: WebAssembly specifies that min/max must always return NaN if one of values is NaN, - // which is a different behavior different from math.Max. - // See wasm.OpcodeF32Max wasm.OpcodeF64Max + // compileMax adds instructions to perform wazeroir.OperationMax. compileMax(o *wazeroir.OperationMax) error - // compileCopysign adds instructions to pop two float values from the stack, and copy the signbit of - // the first-popped value to the last one. - // For example, stack [..., 1.213, -5.0] results in [..., -1.213]. - // See wasm.OpcodeF32Copysign wasm.OpcodeF64Copysign + // compileCopysign adds instructions to perform wazeroir.OperationCopysign. compileCopysign(o *wazeroir.OperationCopysign) error - // compileI32WrapFromI64 adds instructions to replace the 64-bit int on top of the stack - // with the corresponding 32-bit integer. This is equivalent to uint64(uint32(v)) in Go. - // See wasm.OpcodeI32WrapI64. + // compileI32WrapFromI64 adds instructions to perform wazeroir.OperationI32WrapFromI64. compileI32WrapFromI64() error - // compileITruncFromF adds instructions to replace the top value of float type on the stack with - // the corresponding int value. This is equivalent to int32(math.Trunc(float32(x))), uint32(math.Trunc(float64(x))), etc in Go. - // - // Please refer to [1] and [2] for when we encounter undefined behavior in the WebAssembly specification. - // To summarize, if the source float value is NaN or doesn't fit in the destination range of integers (incl. +=Inf), - // then the runtime behavior is undefined. In wazero, we exit the function in these undefined cases with - // nativeCallStatusCodeInvalidFloatToIntConversion or nativeCallStatusIntegerOverflow status code. - // [1] https://www.w3.org/TR/2019/REC-wasm-core-1-20191205/#-hrefop-trunc-umathrmtruncmathsfu_m-n-z for unsigned integers. - // [2] https://www.w3.org/TR/2019/REC-wasm-core-1-20191205/#-hrefop-trunc-smathrmtruncmathsfs_m-n-z for signed integers. - // See OpcodeI32TruncF32S OpcodeI32TruncF32U OpcodeI32TruncF64S OpcodeI32TruncF64U - // See OpcodeI64TruncF32S OpcodeI64TruncF32U OpcodeI64TruncF64S OpcodeI64TruncF64U + // compileITruncFromF adds instructions to perform wazeroir.OperationITruncFromF. compileITruncFromF(o *wazeroir.OperationITruncFromF) error - // compileFConvertFromI adds instructions to replace the top value of int type on the stack with - // the corresponding float value. This is equivalent to float32(uint32(x)), float32(int32(x)), etc in Go. - // See OpcodeI32ConvertF32S OpcodeI32ConvertF32U OpcodeI32ConvertF64S OpcodeI32ConvertF64U - // See OpcodeI64ConvertF32S OpcodeI64ConvertF32U OpcodeI64ConvertF64S OpcodeI64ConvertF64U + // compileFConvertFromI adds instructions to perform wazeroir.OperationFConvertFromI. compileFConvertFromI(o *wazeroir.OperationFConvertFromI) error - // compileF32DemoteFromF64 adds instructions to replace the 64-bit float on top of the stack - // with the corresponding 32-bit float.
This is equivalent to float32(float64(v)) in Go. - // See wasm.OpcodeF32DemoteF64 + // compileF32DemoteFromF64 adds instructions to perform wazeroir.OperationF32DemoteFromF64. compileF32DemoteFromF64() error - // compileF64PromoteFromF32 adds instructions to replace the 32-bit float on top of the stack - // with the corresponding 64-bit float. This is equivalent to float64(float32(v)) in Go. - // See wasm.OpcodeF64PromoteF32 + // compileF64PromoteFromF32 adds instructions to perform wazeroir.OperationF64PromoteFromF32. compileF64PromoteFromF32() error - // compileI32ReinterpretFromF32 adds instructions to reinterpret the 32-bit float on top of the stack - // as a 32-bit integer by preserving the bit representation. If the value is on the stack, - // this is no-op as there is nothing to do for converting type. - // See wasm.OpcodeI32ReinterpretF32. + // compileI32ReinterpretFromF32 adds instructions to perform wazeroir.OperationI32ReinterpretFromF32. compileI32ReinterpretFromF32() error - // compileI64ReinterpretFromF64 adds instructions to reinterpret the 64-bit float on top of the stack - // as a 64-bit integer by preserving the bit representation. - // See wasm.OpcodeI64ReinterpretF64. + // compileI64ReinterpretFromF64 adds instructions to perform wazeroir.OperationI64ReinterpretFromF64. compileI64ReinterpretFromF64() error - // compileF32ReinterpretFromI32 adds instructions to reinterpret the 32-bit int on top of the stack - // as a 32-bit float by preserving the bit representation. - // See wasm.OpcodeF32ReinterpretI32. + // compileF32ReinterpretFromI32 adds instructions to perform wazeroir.OperationF32ReinterpretFromI32. compileF32ReinterpretFromI32() error - // compileF64ReinterpretFromI64 adds instructions to reinterpret the 64-bit int on top of the stack - // as a 64-bit float by preserving the bit representation. - // See wasm.OpcodeF64ReinterpretI64. + // compileF64ReinterpretFromI64 adds instructions to perform wazeroir.OperationF64ReinterpretFromI64. compileF64ReinterpretFromI64() error - // compileExtend adds instructions to extend the 32-bit signed or unsigned int on top of the stack - // as a 64-bit integer of corresponding signedness. For unsigned case, this is just reinterpreting the - // underlying bit pattern as 64-bit integer. For signed case, this is sign-extension which preserves the - // original integer's sign. - // See wasm.OpcodeI64ExtendI32S wasm.OpcodeI64ExtendI32U + // compileExtend adds instructions to perform wazeroir.OperationExtend. compileExtend(o *wazeroir.OperationExtend) error - // compileEq adds instructions to pop two values from the stack and push 1 if they equal otherwise 0. - // See wasm.OpcodeI32Eq wasm.OpcodeI64Eq + // compileEq adds instructions to perform wazeroir.OperationEq. compileEq(o *wazeroir.OperationEq) error - // compileEq adds instructions to pop two values from the stack and push 0 if they equal otherwise 1. - // See wasm.OpcodeI32Ne wasm.OpcodeI64Ne + // compileNe adds instructions to perform wazeroir.OperationNe. compileNe(o *wazeroir.OperationNe) error - // compileEq adds instructions to pop a value from the stack and push 1 if it equals zero, 0. - // See wasm.OpcodeI32Eqz wasm.OpcodeI64Eqz + // compileEqz adds instructions to perform wazeroir.OperationEqz. compileEqz(o *wazeroir.OperationEqz) error - // compileLt adds instructions to pop two values from the stack and push 1 if the second is less than the top one. Otherwise 0.
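Aside: the removed comments spelled out that each comparison pops two operands and pushes an i32 flag (1 when the relation holds, else 0). A minimal sketch of that contract for signed i32 less-than (illustrative only, not wazero code):

package main

import "fmt"

// i32LtS models the value-level behavior behind compileLt for signed
// 32-bit integers: pop two operands, push an i32 flag.
func i32LtS(x1, x2 int32) uint32 {
	if x1 < x2 {
		return 1
	}
	return 0
}

func main() {
	fmt.Println(i32LtS(-1, 0)) // 1: signed comparison, -1 < 0
	fmt.Println(i32LtS(0, -1)) // 0
}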
- // See wasm.OpcodeI32Lt wasm.OpcodeI64Lt + // compileLt adds instructions to perform wazeroir.OperationLt. compileLt(o *wazeroir.OperationLt) error - // compileGt adds instructions to pop two values from the stack and push 1 if the second is greater than the top one. Otherwise 0. - // See wasm.OpcodeI32Gt wasm.OpcodeI64Gt + // compileGt adds instructions to perform wazeroir.OperationGt. compileGt(o *wazeroir.OperationGt) error - // compileLe adds instructions to pop two values from the stack and push 1 if the second is less than or equals the top one. Otherwise 0. - // See wasm.OpcodeI32Le wasm.OpcodeI64Le + // compileLe adds instructions to perform wazeroir.OperationLe. compileLe(o *wazeroir.OperationLe) error - // compileLe adds instructions to pop two values from the stack and push 1 if the second is greater than or equals the top one. Otherwise 0. - // See wasm.OpcodeI32Ge wasm.OpcodeI64Ge + // compileGe adds instructions to perform wazeroir.OperationGe. compileGe(o *wazeroir.OperationGe) error - // compileLoad adds instructions to perform load instruction in WebAssembly. - // See wasm.OpcodeI32Load wasm.OpcodeI64Load wasm.OpcodeF32Load wasm.OpcodeF64Load + // compileLoad adds instructions to perform wazeroir.OperationLoad. compileLoad(o *wazeroir.OperationLoad) error - // compileLoad8 adds instructions to perform load8 instruction in WebAssembly. - // The resulting code checks the memory boundary at runtime, and exit the function with nativeCallStatusCodeMemoryOutOfBounds if out-of-bounds access happens. - // See wasm.OpcodeI32Load8S wasm.OpcodeI32Load8U wasm.OpcodeI64Load8S wasm.OpcodeI64Load8U + // compileLoad8 adds instructions to perform wazeroir.OperationLoad8. compileLoad8(o *wazeroir.OperationLoad8) error - // compileLoad16 adds instructions to perform load16 instruction in WebAssembly. - // The resulting code checks the memory boundary at runtime, and exit the function with nativeCallStatusCodeMemoryOutOfBounds if out-of-bounds access happens. - // See wasm.OpcodeI32Load16S wasm.OpcodeI32Load16U wasm.OpcodeI64Load16S wasm.OpcodeI64Load16U + // compileLoad16 adds instructions to perform wazeroir.OperationLoad16. compileLoad16(o *wazeroir.OperationLoad16) error - // compileLoad32 adds instructions to perform load32 instruction in WebAssembly. - // The resulting code checks the memory boundary at runtime, and exit the function with nativeCallStatusCodeMemoryOutOfBounds - // if out-of-bounds access happens. - // See wasm.OpcodeI64Load32S wasm.OpcodeI64Load32U + // compileLoad32 adds instructions to perform wazeroir.OperationLoad32. compileLoad32(o *wazeroir.OperationLoad32) error - // compileStore adds instructions to perform store instruction in WebAssembly. - // The resulting code checks the memory boundary at runtime, and exit the function with nativeCallStatusCodeMemoryOutOfBounds - // if out-of-bounds access happens. - // See wasm.OpcodeI32Store wasm.OpcodeI64Store wasm.OpcodeF32Store wasm.OpcodeF64Store + // compileStore adds instructions to perform wazeroir.OperationStore. compileStore(o *wazeroir.OperationStore) error - // compileStore8 adds instructions to perform store8 instruction in WebAssembly. - // The resulting code checks the memory boundary at runtime, and exit the function with nativeCallStatusCodeMemoryOutOfBounds - // if out-of-bounds access happens. - // See wasm.OpcodeI32Store8S wasm.OpcodeI32Store8U wasm.OpcodeI64Store8S wasm.OpcodeI64Store8U + // compileStore8 adds instructions to perform wazeroir.OperationStore8.
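Aside: the removed load/store comments document a runtime bounds check that exits with nativeCallStatusCodeMemoryOutOfBounds on out-of-bounds access. A plain-Go sketch of such a check follows; loadU32 is a made-up name for illustration, not a wazero helper:

package main

import (
	"encoding/binary"
	"fmt"
)

// loadU32 models the guard a compiled 32-bit load performs: compute the
// effective address in 64 bits so the sum cannot overflow, then verify
// the full access stays inside linear memory.
func loadU32(mem []byte, base, offset uint32) (uint32, bool) {
	ea := uint64(base) + uint64(offset)
	if ea+4 > uint64(len(mem)) {
		return 0, false // compiled code exits with the out-of-bounds status here
	}
	return binary.LittleEndian.Uint32(mem[ea : ea+4]), true
}

func main() {
	mem := []byte{1, 0, 0, 0, 2, 0, 0, 0}
	v, ok := loadU32(mem, 4, 0)
	fmt.Println(v, ok) // 2 true
	_, ok = loadU32(mem, 8, 0)
	fmt.Println(ok) // false: 4 bytes at address 8 overrun an 8-byte memory
}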
compileStore8(o *wazeroir.OperationStore8) error - // compileStore16 adds instructions to perform store16 instruction in WebAssembly. - // The resulting code checks the memory boundary at runtime, and exit the function with nativeCallStatusCodeMemoryOutOfBounds - // if out-of-bounds access happens. - // See wasm.OpcodeI32Store16S wasm.OpcodeI32Store16U wasm.OpcodeI64Store16S wasm.OpcodeI64Store16U + // compileStore16 adds instructions to perform wazeroir.OperationStore16. compileStore16(o *wazeroir.OperationStore16) error - // compileStore32 adds instructions to perform store32 instruction in WebAssembly. - // The resulting code checks the memory boundary at runtime, and exit the function with nativeCallStatusCodeMemoryOutOfBounds - // if out-of-bounds access happens. - // See wasm.OpcodeI64Store32S wasm.OpcodeI64Store32U + // compileStore32 adds instructions to perform wazeroir.OperationStore32. compileStore32(o *wazeroir.OperationStore32) error - // compileMemorySize adds instruction to pop a value from the stack, grow the memory buffer according to the value, - // and push the previous page size onto the stack. - // See wasm.OpcodeMemoryGrow + // compileMemoryGrow adds instruction to perform wazeroir.OperationMemoryGrow. compileMemoryGrow() error - // compileMemorySize adds instruction to read the current page size of memory instance and push it onto the stack. - // See wasm.OpcodeMemorySize + // compileMemorySize adds instruction to perform wazeroir.OperationMemorySize. compileMemorySize() error - // compileConstI32 adds instruction to push the given constant i32 value onto the stack. - // See wasm.OpcodeI32Const + // compileConstI32 adds instruction to perform wazeroir.OperationConstI32. compileConstI32(o *wazeroir.OperationConstI32) error - // compileConstI32 adds instruction to push the given constant i64 value onto the stack. - // See wasm.OpcodeI64Const + // compileConstI64 adds instruction to perform wazeroir.OperationConstI64. compileConstI64(o *wazeroir.OperationConstI64) error - // compileConstI32 adds instruction to push the given constant f32 value onto the stack. - // See wasm.OpcodeF32Const + // compileConstF32 adds instruction to perform wazeroir.OperationConstF32. compileConstF32(o *wazeroir.OperationConstF32) error - // compileConstI32 adds instruction to push the given constant f64 value onto the stack. - // See wasm.OpcodeF64Const + // compileConstF64 adds instruction to perform wazeroir.OperationConstF64. compileConstF64(o *wazeroir.OperationConstF64) error - // compileSignExtend32From8 adds instruction to sign-extends the first 8-bits of 32-bit in as signed 32-bit int. - // See wasm.OpcodeI32Extend8S + // compileSignExtend32From8 adds instructions to perform wazeroir.OperationSignExtend32From8. compileSignExtend32From8() error - // compileSignExtend32From16 adds instruction to sign-extends the first 16-bits of 32-bit in as signed 32-bit int. - // See wasm.OpcodeI32Extend16S + // compileSignExtend32From16 adds instructions to perform wazeroir.OperationSignExtend32From16. compileSignExtend32From16() error - // compileSignExtend64From8 adds instruction to sign-extends the first 8-bits of 64-bit in as signed 64-bit int. - // See wasm.OpcodeI64Extend8S + // compileSignExtend64From8 adds instructions to perform wazeroir.OperationSignExtend64From8. compileSignExtend64From8() error - // compileSignExtend64From16 adds instruction to sign-extends the first 16-bits of 64-bit in as signed 64-bit int.
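Aside: the sign-extension methods above amount to narrowing-then-widening conversions. A minimal sketch under that reading (illustrative only, not wazero code):

package main

import "fmt"

// signExtend32From8 models wazeroir.OperationSignExtend32From8: the low
// 8 bits are reinterpreted as a signed int8, then widened back to 32 bits.
func signExtend32From8(v uint32) uint32 { return uint32(int32(int8(v))) }

// signExtend64From16 models wazeroir.OperationSignExtend64From16.
func signExtend64From16(v uint64) uint64 { return uint64(int64(int16(v))) }

func main() {
	fmt.Printf("%#x\n", signExtend32From8(0xff))    // 0xffffffff
	fmt.Printf("%#x\n", signExtend64From16(0x8000)) // 0xffffffffffff8000
}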
- // See wasm.OpcodeI64Extend16S + // compileSignExtend64From16 adds instructions to perform wazeroir.OperationSignExtend64From16. compileSignExtend64From16() error - // compileSignExtend64From32 adds instruction to sign-extends the first 32-bits of 64-bit in as signed 64-bit int. - // See wasm.OpcodeI64Extend32S + // compileSignExtend64From32 adds instructions to perform wazeroir.OperationSignExtend64From32. compileSignExtend64From32() error - // compileMemoryInit adds instructions to perform operations corresponding to the wasm.OpcodeMemoryInitName instruction in - // wasm.FeatureBulkMemoryOperations. - // - // https://www.w3.org/TR/2022/WD-wasm-core-2-20220419/appendix/changes.html#bulk-memory-and-table-instructions + // compileMemoryInit adds instructions to perform wazeroir.OperationMemoryInit. compileMemoryInit(*wazeroir.OperationMemoryInit) error - // compileDataDrop adds instructions to perform operations corresponding to the wasm.OpcodeDataDropName instruction in - // wasm.FeatureBulkMemoryOperations. - // - // https://www.w3.org/TR/2022/WD-wasm-core-2-20220419/appendix/changes.html#bulk-memory-and-table-instructions + // compileDataDrop adds instructions to perform wazeroir.OperationDataDrop. compileDataDrop(*wazeroir.OperationDataDrop) error - // compileMemoryCopy adds instructions to perform operations corresponding to the wasm.OpcodeMemoryCopylName instruction in - // wasm.FeatureBulkMemoryOperations. - // - // https://www.w3.org/TR/2022/WD-wasm-core-2-20220419/appendix/changes.html#bulk-memory-and-table-instructions + // compileMemoryCopy adds instructions to perform wazeroir.OperationMemoryCopy. compileMemoryCopy() error - // compileMemoryCopy adds instructions to perform operations corresponding to the wasm.OpcodeMemoryFillName instruction in - // wasm.FeatureBulkMemoryOperations. - // - // https://www.w3.org/TR/2022/WD-wasm-core-2-20220419/appendix/changes.html#bulk-memory-and-table-instructions + // compileMemoryFill adds instructions to perform wazeroir.OperationMemoryFill. compileMemoryFill() error - // compileTableInit adds instructions to perform operations corresponding to the wasm.OpcodeTableInit instruction in - // wasm.FeatureBulkMemoryOperations. - // - // https://www.w3.org/TR/2022/WD-wasm-core-2-20220419/appendix/changes.html#bulk-memory-and-table-instructions + // compileTableInit adds instructions to perform wazeroir.OperationTableInit. compileTableInit(*wazeroir.OperationTableInit) error - // compileTableCopy adds instructions to perform operations corresponding to the wasm.OpcodeTableCopy instruction in - // wasm.FeatureBulkMemoryOperations. - // - // https://www.w3.org/TR/2022/WD-wasm-core-2-20220419/appendix/changes.html#bulk-memory-and-table-instructions + // compileTableCopy adds instructions to perform wazeroir.OperationTableCopy. compileTableCopy(*wazeroir.OperationTableCopy) error - // compileElemDrop adds instructions to perform operations corresponding to the wasm.OpcodeElemDrop instruction in - // wasm.FeatureBulkMemoryOperations. - // - // https://www.w3.org/TR/2022/WD-wasm-core-2-20220419/appendix/changes.html#bulk-memory-and-table-instructions + // compileElemDrop adds instructions to perform wazeroir.OperationElemDrop. compileElemDrop(*wazeroir.OperationElemDrop) error - // compileRefFunc adds instructions to perform operations corresponding to wasm.OpcodeRefFunc instruction introduced in - // wasm.FeatureReferenceTypes. - // - // Note: in wazero, we express any reference types (funcref or externref) as opaque pointers which is uint64. 
- // Therefore, the compilers implementations emit instructions to push the address of *function onto the stack. - // - // https://www.w3.org/TR/2022/WD-wasm-core-2-20220419/valid/instructions.html#xref-syntax-instructions-syntax-instr-ref-mathsf-ref-func-x + // compileRefFunc adds instructions to perform wazeroir.OperationRefFunc. compileRefFunc(*wazeroir.OperationRefFunc) error - // compileTableGet adds instructions to perform operations corresponding to wasm.OpcodeTableGet instruction introduced in - // wasm.FeatureReferenceTypes. - // - // https://www.w3.org/TR/2022/WD-wasm-core-2-20220419/valid/instructions.html#xref-syntax-instructions-syntax-instr-table-mathsf-table-get-x + // compileTableGet adds instructions to perform wazeroir.OperationTableGet. compileTableGet(*wazeroir.OperationTableGet) error - // compileTableSet adds instructions to perform operations corresponding to wasm.OpcodeTableSet instruction introduced in - // wasm.FeatureReferenceTypes. - // - // https://www.w3.org/TR/2022/WD-wasm-core-2-20220419/valid/instructions.html#xref-syntax-instructions-syntax-instr-table-mathsf-table-set-x + // compileTableSet adds instructions to perform wazeroir.OperationTableSet. compileTableSet(*wazeroir.OperationTableSet) error - // compileTableGrow adds instructions to perform operations corresponding to wasm.OpcodeMiscTableGrow instruction introduced in - // wasm.FeatureReferenceTypes. - // - // https://www.w3.org/TR/2022/WD-wasm-core-2-20220419/valid/instructions.html#xref-syntax-instructions-syntax-instr-table-mathsf-table-grow-x + // compileTableGrow adds instructions to perform wazeroir.OperationTableGrow. compileTableGrow(*wazeroir.OperationTableGrow) error - // compileTableSize adds instructions to perform operations corresponding to wasm.OpcodeMiscTableSize instruction introduced in - // wasm.FeatureReferenceTypes. - // - // https://www.w3.org/TR/2022/WD-wasm-core-2-20220419/valid/instructions.html#xref-syntax-instructions-syntax-instr-table-mathsf-table-size-x + // compileTableSize adds instructions to perform wazeroir.OperationTableSize. compileTableSize(*wazeroir.OperationTableSize) error - // compileTableFill adds instructions to perform operations corresponding to wasm.OpcodeMiscTableFill instruction introduced in - // wasm.FeatureReferenceTypes. - // - // https://www.w3.org/TR/2022/WD-wasm-core-2-20220419/valid/instructions.html#xref-syntax-instructions-syntax-instr-table-mathsf-table-fill-x + // compileTableFill adds instructions to perform wazeroir.OperationTableFill. compileTableFill(*wazeroir.OperationTableFill) error - // compileV128Const adds instructions to push a constant V128 value onto the stack. - // See wasm.OpcodeVecV128Const + // compileV128Const adds instructions to perform wazeroir.OperationV128Const. compileV128Const(*wazeroir.OperationV128Const) error - // compileV128Add adds instruction to add two vector values whose shape is specified as `o.Shape`. - // See wasm.OpcodeVecI8x16Add wasm.OpcodeVecI16x8Add wasm.OpcodeVecI32x4Add wasm.OpcodeVecI64x2Add wasm.OpcodeVecF32x4Add wasm.OpcodeVecF64x2Add + // compileV128Add adds instructions to perform wazeroir.OperationV128Add. compileV128Add(o *wazeroir.OperationV128Add) error - // compileV128Sub adds instruction to subtract two vector values whose shape is specified as `o.Shape`. - // See wasm.OpcodeVecI8x16Sub wasm.OpcodeVecI16x8Sub wasm.OpcodeVecI32x4Sub wasm.OpcodeVecI64x2Sub wasm.OpcodeVecF32x4Sub wasm.OpcodeVecF64x2Sub + // compileV128Sub adds instructions to perform wazeroir.OperationV128Sub. 
compileV128Sub(o *wazeroir.OperationV128Sub) error - // compileV128Load adds instruction to perform vector load kind instructions. - // See wasm.OpcodeVecV128Load* instructions. + // compileV128Load adds instructions to perform wazeroir.OperationV128Load. compileV128Load(o *wazeroir.OperationV128Load) error - // compileV128LoadLane adds instructions which are equivalent to wasm.OpcodeVecV128LoadXXLane instructions. - // See wasm.OpcodeVecV128Load8LaneName wasm.OpcodeVecV128Load16LaneName wasm.OpcodeVecV128Load32LaneName wasm.OpcodeVecV128Load64LaneName + // compileV128LoadLane adds instructions to perform wazeroir.OperationV128LoadLane. compileV128LoadLane(o *wazeroir.OperationV128LoadLane) error - // compileV128Store adds instructions which are equivalent to wasm.OpcodeVecV128StoreName. + // compileV128Store adds instructions to perform wazeroir.OperationV128Store. compileV128Store(o *wazeroir.OperationV128Store) error - // compileV128StoreLane adds instructions which are equivalent to wasm.OpcodeVecV128StoreXXLane instructions. - // See wasm.OpcodeVecV128Load8LaneName wasm.OpcodeVecV128Load16LaneName wasm.OpcodeVecV128Load32LaneName wasm.OpcodeVecV128Load64LaneName. + // compileV128StoreLane adds instructions to perform wazeroir.OperationV128StoreLane. compileV128StoreLane(o *wazeroir.OperationV128StoreLane) error - // compileV128ExtractLane adds instructions which are equivalent to wasm.OpcodeVecXXXXExtractLane instructions. - // See wasm.OpcodeVecI8x16ExtractLaneSName wasm.OpcodeVecI8x16ExtractLaneUName wasm.OpcodeVecI16x8ExtractLaneSName wasm.OpcodeVecI16x8ExtractLaneUName - // wasm.OpcodeVecI32x4ExtractLaneName wasm.OpcodeVecI64x2ExtractLaneName wasm.OpcodeVecF32x4ExtractLaneName wasm.OpcodeVecF64x2ExtractLaneName. + // compileV128ExtractLane adds instructions to perform wazeroir.OperationV128ExtractLane. compileV128ExtractLane(o *wazeroir.OperationV128ExtractLane) error - // compileV128ReplaceLane adds instructions which are equivalent to wasm.OpcodeVecXXXXReplaceLane instructions. - // See wasm.OpcodeVecI8x16ReplaceLaneName wasm.OpcodeVecI16x8ReplaceLaneName wasm.OpcodeVecI32x4ReplaceLaneName wasm.OpcodeVecI64x2ReplaceLaneName - // wasm.OpcodeVecF32x4ReplaceLaneName wasm.OpcodeVecF64x2ReplaceLaneName. + // compileV128ReplaceLane adds instructions to perform wazeroir.OperationV128ReplaceLane. compileV128ReplaceLane(o *wazeroir.OperationV128ReplaceLane) error - // compileV128Splat adds instructions which are equivalent to wasm.OpcodeVecXXXSplat instructions. - // See wasm.OpcodeVecI8x16SplatName wasm.OpcodeVecI16x8SplatName wasm.OpcodeVecI32x4SplatName wasm.OpcodeVecI64x2SplatName - // wasm.OpcodeVecF32x4SplatName wasm.OpcodeVecF64x2SplatName. + // compileV128Splat adds instructions to perform wazeroir.OperationV128Splat. compileV128Splat(o *wazeroir.OperationV128Splat) error - // compileV128Shuffle adds instructions which are equivalent to wasm.OpcodeVecV128i8x16ShuffleName instruction. + // compileV128Shuffle adds instructions to perform wazeroir.OperationV128Shuffle. compileV128Shuffle(o *wazeroir.OperationV128Shuffle) error - // compileV128Swizzle adds instructions which are equivalent to wasm.OpcodeVecI8x16SwizzleName instruction. + // compileV128Swizzle adds instructions to perform wazeroir.OperationV128Swizzle. compileV128Swizzle(o *wazeroir.OperationV128Swizzle) error - // compileV128Swizzle adds instructions which are equivalent to wasm.OpcodeVecV128AnyTrueName instruction.
+ // compileV128AnyTrue adds instructions to perform wazeroir.OperationV128AnyTrue. compileV128AnyTrue(o *wazeroir.OperationV128AnyTrue) error - // compileV128AllTrue adds instructions which are equivalent to wasm.OpcodeVecXXXAllTrue instructions. - // See wasm.OpcodeVecI8x16AllTrueName wasm.OpcodeVecI16x8AllTrueName wasm.OpcodeVecI32x4AllTrueName wasm.OpcodeVecI64x2AllTrueName. + // compileV128AllTrue adds instructions to perform wazeroir.OperationV128AllTrue. compileV128AllTrue(o *wazeroir.OperationV128AllTrue) error - // compileV128BitMask adds instructions which are equivalent to wasm.OpcodeVecV128XXXBitMask instruction. - // See wasm.OpcodeVecI8x16BitMaskName wasm.OpcodeVecI16x8BitMaskName wasm.OpcodeVecI32x4BitMaskName wasm.OpcodeVecI64x2BitMaskName. + // compileV128BitMask adds instructions to perform wazeroir.OperationV128BitMask. compileV128BitMask(*wazeroir.OperationV128BitMask) error - // compileV128And adds instructions which are equivalent to wasm.OpcodeVecV128AndName instruction. - // See wasm.OpcodeVecV128AndName. + // compileV128And adds instructions to perform wazeroir.OperationV128And. compileV128And(*wazeroir.OperationV128And) error - // compileV128Not adds instructions which are equivalent to wasm.OpcodeVecV128NotName instruction. - // See wasm.OpcodeVecV128NotName. + // compileV128Not adds instructions to perform wazeroir.OperationV128Not. compileV128Not(*wazeroir.OperationV128Not) error - // compileV128Or adds instructions which are equivalent to wasm.OpcodeVecV128OrName instruction. - // See wasm.OpcodeVecV128OrName. + // compileV128Or adds instructions to perform wazeroir.OperationV128Or. compileV128Or(*wazeroir.OperationV128Or) error - // compileV128Xor adds instructions which are equivalent to wasm.OpcodeVecV128XorName instruction. - // See wasm.OpcodeVecV128XorName. + // compileV128Xor adds instructions to perform wazeroir.OperationV128Xor. compileV128Xor(*wazeroir.OperationV128Xor) error - // compileV128Bitselect adds instructions which are equivalent to wasm.OpcodeVecV128BitselectName instruction. - // See wasm.OpcodeVecV128BitselectName. + // compileV128Bitselect adds instructions to perform wazeroir.OperationV128Bitselect. compileV128Bitselect(*wazeroir.OperationV128Bitselect) error - // compileV128AndNot adds instructions which are equivalent to wasm.OpcodeVecV128AndNotName instruction. - // See wasm.OpcodeVecV128AndNotName. + // compileV128AndNot adds instructions to perform wazeroir.OperationV128AndNot. compileV128AndNot(*wazeroir.OperationV128AndNot) error - // compileV128Shr adds instructions which are equivalent to wasm.OpcodeVecXXXShrYYYY instructions. - // See wasm.OpcodeVecI8x16ShrSName wasm.OpcodeVecI8x16ShrUName wasm.OpcodeVecI16x8ShrSName - // wasm.OpcodeVecI16x8ShrUName wasm.OpcodeVecI32x4ShrSName wasm.OpcodeVecI32x4ShrUName. - // wasm.OpcodeVecI64x2ShrSName wasm.OpcodeVecI64x2ShrUName. + // compileV128Shr adds instructions to perform wazeroir.OperationV128Shr. compileV128Shr(*wazeroir.OperationV128Shr) error - // compileV128Shl adds instructions which are equivalent to wasm.OpcodeVecXXXShl instructions. - // See wasm.OpcodeVecI8x16ShlName wasm.OpcodeVecI16x8ShlName wasm.OpcodeVecI32x4ShlName wasm.OpcodeVecI64x2ShlName + // compileV128Shl adds instructions to perform wazeroir.OperationV128Shl. compileV128Shl(*wazeroir.OperationV128Shl) error - // compileV128Cmp adds instructions which are equivalent to various vector comparison instructions. 
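Aside: among the V128 bitwise methods above, bitselect is the least obvious; per the spec it computes (x1 AND c) OR (x2 AND NOT c) bitwise. A sketch (illustrative only, not wazero code):

package main

import "fmt"

// bitselect picks each result bit from x1 where the mask bit is 1 and
// from x2 where it is 0, matching wasm v128.bitselect.
func bitselect(x1, x2, c [16]byte) (out [16]byte) {
	for i := range out {
		out[i] = (x1[i] & c[i]) | (x2[i] &^ c[i])
	}
	return
}

func main() {
	var x1, x2, c [16]byte
	x1[0], x2[0], c[0] = 0xAA, 0x55, 0xF0
	fmt.Printf("%#x\n", bitselect(x1, x2, c)[0]) // 0xa5: high nibble from x1, low from x2
}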
- // See wasm.OpcodeVecI8x16EqName, wasm.OpcodeVecI8x16NeName, wasm.OpcodeVecI8x16LtSName, wasm.OpcodeVecI8x16LtUName, wasm.OpcodeVecI8x16GtSName, - // wasm.OpcodeVecI8x16GtUName, wasm.OpcodeVecI8x16LeSName, wasm.OpcodeVecI8x16LeUName, wasm.OpcodeVecI8x16GeSName, wasm.OpcodeVecI8x16GeUName, - // wasm.OpcodeVecI16x8EqName, wasm.OpcodeVecI16x8NeName, wasm.OpcodeVecI16x8LtSName, wasm.OpcodeVecI16x8LtUName, wasm.OpcodeVecI16x8GtSName, - // wasm.OpcodeVecI16x8GtUName, wasm.OpcodeVecI16x8LeSName, wasm.OpcodeVecI16x8LeUName, wasm.OpcodeVecI16x8GeSName, wasm.OpcodeVecI16x8GeUName, - // wasm.OpcodeVecI32x4EqName, wasm.OpcodeVecI32x4NeName, wasm.OpcodeVecI32x4LtSName, wasm.OpcodeVecI32x4LtUName, wasm.OpcodeVecI32x4GtSName, - // wasm.OpcodeVecI32x4GtUName, wasm.OpcodeVecI32x4LeSName, wasm.OpcodeVecI32x4LeUName, wasm.OpcodeVecI32x4GeSName, wasm.OpcodeVecI32x4GeUName, - // wasm.OpcodeVecI64x2EqName, wasm.OpcodeVecI64x2NeName, wasm.OpcodeVecI64x2LtSName, wasm.OpcodeVecI64x2GtSName, wasm.OpcodeVecI64x2LeSName, - // wasm.OpcodeVecI64x2GeSName, wasm.OpcodeVecF32x4EqName, wasm.OpcodeVecF32x4NeName, wasm.OpcodeVecF32x4LtName, wasm.OpcodeVecF32x4GtName, - // wasm.OpcodeVecF32x4LeName, wasm.OpcodeVecF32x4GeName, wasm.OpcodeVecF64x2EqName, wasm.OpcodeVecF64x2NeName, wasm.OpcodeVecF64x2LtName, - // wasm.OpcodeVecF64x2GtName, wasm.OpcodeVecF64x2LeName, wasm.OpcodeVecF64x2GeName + // compileV128Cmp adds instructions to perform wazeroir.OperationV128Cmp. compileV128Cmp(*wazeroir.OperationV128Cmp) error + // compileV128AddSat adds instructions to perform wazeroir.OperationV128AddSat. + compileV128AddSat(*wazeroir.OperationV128AddSat) error + // compileV128SubSat adds instructions to perform wazeroir.OperationV128SubSat. + compileV128SubSat(*wazeroir.OperationV128SubSat) error + // compileV128Mul adds instructions to perform wazeroir.OperationV128Mul. + compileV128Mul(*wazeroir.OperationV128Mul) error + // compileV128Div adds instructions to perform wazeroir.OperationV128Div. + compileV128Div(*wazeroir.OperationV128Div) error + // compileV128Neg adds instructions to perform wazeroir.OperationV128Neg. + compileV128Neg(*wazeroir.OperationV128Neg) error + // compileV128Sqrt adds instructions to perform wazeroir.OperationV128Sqrt. + compileV128Sqrt(*wazeroir.OperationV128Sqrt) error + // compileV128Abs adds instructions to perform wazeroir.OperationV128Abs. + compileV128Abs(*wazeroir.OperationV128Abs) error + // compileV128Popcnt adds instructions to perform wazeroir.OperationV128Popcnt. + compileV128Popcnt(*wazeroir.OperationV128Popcnt) error + // compileV128Min adds instructions to perform wazeroir.OperationV128Min. + compileV128Min(*wazeroir.OperationV128Min) error + // compileV128Max adds instructions to perform wazeroir.OperationV128Max. + compileV128Max(*wazeroir.OperationV128Max) error + // compileV128AvgrU adds instructions to perform wazeroir.OperationV128AvgrU. + compileV128AvgrU(*wazeroir.OperationV128AvgrU) error + // compileV128Pmin adds instructions to perform wazeroir.OperationV128Pmin. + compileV128Pmin(*wazeroir.OperationV128Pmin) error + // compileV128Pmax adds instructions to perform wazeroir.OperationV128Pmax. + compileV128Pmax(*wazeroir.OperationV128Pmax) error + // compileV128Ceil adds instructions to perform wazeroir.OperationV128Ceil. + compileV128Ceil(*wazeroir.OperationV128Ceil) error + // compileV128Floor adds instructions to perform wazeroir.OperationV128Floor. + compileV128Floor(*wazeroir.OperationV128Floor) error + // compileV128Trunc adds instructions to perform wazeroir.OperationV128Trunc. 
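Aside: compileV128AddSat/compileV128SubSat introduce saturating lane arithmetic, where results clamp at the lane type's bounds instead of wrapping. A sketch of the signed 8-bit rule (illustrative only, not wazero code):

package main

import (
	"fmt"
	"math"
)

// addSatI8 widens to int16 so the true sum is representable, then clamps
// to the int8 range, as wasm i8x16.add_sat_s does per lane.
func addSatI8(a, b int8) int8 {
	s := int16(a) + int16(b)
	if s > math.MaxInt8 {
		return math.MaxInt8
	}
	if s < math.MinInt8 {
		return math.MinInt8
	}
	return int8(s)
}

func main() {
	fmt.Println(addSatI8(100, 100))   // 127, not the wrapped -56
	fmt.Println(addSatI8(-100, -100)) // -128, not the wrapped 56
}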
+ compileV128Trunc(*wazeroir.OperationV128Trunc) error + // compileV128Nearest adds instructions to perform wazeroir.OperationV128Nearest. + compileV128Nearest(*wazeroir.OperationV128Nearest) error + // compileV128Extend adds instructions to perform wazeroir.OperationV128Extend. + compileV128Extend(*wazeroir.OperationV128Extend) error + // compileV128ExtMul adds instructions to perform wazeroir.OperationV128ExtMul. + compileV128ExtMul(*wazeroir.OperationV128ExtMul) error + // compileV128Q15mulrSatS adds instructions to perform wazeroir.OperationV128Q15mulrSatS. + compileV128Q15mulrSatS(*wazeroir.OperationV128Q15mulrSatS) error + // compileV128ExtAddPairwise adds instructions to perform wazeroir.OperationV128ExtAddPairwise. + compileV128ExtAddPairwise(o *wazeroir.OperationV128ExtAddPairwise) error + // compileV128FloatPromote adds instructions to perform wazeroir.OperationV128FloatPromote. + compileV128FloatPromote(o *wazeroir.OperationV128FloatPromote) error + // compileV128FloatDemote adds instructions to perform wazeroir.OperationV128FloatDemote. + compileV128FloatDemote(o *wazeroir.OperationV128FloatDemote) error + // compileV128FConvertFromI adds instructions to perform wazeroir.OperationV128FConvertFromI. + compileV128FConvertFromI(o *wazeroir.OperationV128FConvertFromI) error + // compileV128Dot adds instructions to perform wazeroir.OperationV128Dot. + compileV128Dot(o *wazeroir.OperationV128Dot) error + // compileV128Narrow adds instructions to perform wazeroir.OperationV128Narrow. + compileV128Narrow(o *wazeroir.OperationV128Narrow) error + // compileV128ITruncSatFromF adds instructions to perform wazeroir.OperationV128ITruncSatFromF. + compileV128ITruncSatFromF(o *wazeroir.OperationV128ITruncSatFromF) error } diff --git a/internal/engine/compiler/compiler_vec_test.go b/internal/engine/compiler/compiler_vec_test.go index a658d72d051..3bd8a281089 100644 --- a/internal/engine/compiler/compiler_vec_test.go +++ b/internal/engine/compiler/compiler_vec_test.go @@ -6,16 +6,115 @@ import ( "runtime" "testing" + "github.com/tetratelabs/wazero/internal/moremath" "github.com/tetratelabs/wazero/internal/testing/require" "github.com/tetratelabs/wazero/internal/wasm" "github.com/tetratelabs/wazero/internal/wazeroir" ) func TestCompiler_compileV128Add(t *testing.T) { - // TODO + tests := []struct { + name string + shape wazeroir.Shape + x1, x2, exp [16]byte + }{ + { + name: "i8x16", + shape: wazeroir.ShapeI8x16, + x1: [16]byte{0: 1, 2: 10, 10: 10}, + x2: [16]byte{0: 10, 4: 5, 10: 5}, + exp: [16]byte{0: 11, 2: 10, 4: 5, 10: 15}, + }, + { + name: "i16x8", + shape: wazeroir.ShapeI16x8, + x1: i16x8(1123, 0, 123, 1, 1, 5, 8, 1), + x2: i16x8(0, 123, 123, 0, 1, 5, 9, 1), + exp: i16x8(1123, 123, 246, 1, 2, 10, 17, 2), + }, + { + name: "i32x4", + shape: wazeroir.ShapeI32x4, + x1: i32x4(i32ToU32(-123), 5, 4, math.MaxUint32), + x2: i32x4(i32ToU32(-10), 1, i32ToU32(-104), math.MaxUint32), + exp: i32x4(i32ToU32(-133), 6, i32ToU32(-100), math.MaxUint32-1), + }, + { + name: "i64x2", + shape: wazeroir.ShapeI64x2, + x1: i64x2(i64ToU64(math.MinInt64), 12345), + x2: i64x2(i64ToU64(-1), i64ToU64(-12345)), + exp: i64x2(i64ToU64(math.MinInt64)+i64ToU64(-1), 0), + }, + { + name: "f32x4", + shape: wazeroir.ShapeF32x4, + x1: f32x4(1.0, 123, float32(math.Inf(1)), float32(math.Inf(-1))), + x2: f32x4(51234.12341, 123, math.MaxFloat32, -123), + exp: f32x4(51235.12341, 246, float32(math.Inf(1)), float32(math.Inf(-1))), + }, + { + name: "f64x2", + shape: wazeroir.ShapeF64x2, + x1: f64x2(1.123, 
math.Inf(1)), + x2: f64x2(1.123, math.MinInt64), + exp: f64x2(2.246, math.Inf(1)), + }, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + env := newCompilerEnvironment() + compiler := env.requireNewCompiler(t, newCompiler, + &wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}}) + + err := compiler.compilePreamble() + require.NoError(t, err) + + err = compiler.compileV128Const(&wazeroir.OperationV128Const{ + Lo: binary.LittleEndian.Uint64(tc.x1[:8]), + Hi: binary.LittleEndian.Uint64(tc.x1[8:]), + }) + require.NoError(t, err) + + err = compiler.compileV128Const(&wazeroir.OperationV128Const{ + Lo: binary.LittleEndian.Uint64(tc.x2[:8]), + Hi: binary.LittleEndian.Uint64(tc.x2[8:]), + }) + require.NoError(t, err) + + err = compiler.compileV128Add(&wazeroir.OperationV128Add{Shape: tc.shape}) + require.NoError(t, err) + + require.Equal(t, uint64(2), compiler.runtimeValueLocationStack().sp) + require.Equal(t, 1, len(compiler.runtimeValueLocationStack().usedRegisters)) + + err = compiler.compileReturnFunction() + require.NoError(t, err) + + // Generate and run the code under test. + code, _, _, err := compiler.compile() + require.NoError(t, err) + env.exec(code) + + require.Equal(t, nativeCallStatusCodeReturned, env.callEngine().statusCode) + + lo, hi := env.stackTopAsV128() + var actual [16]byte + binary.LittleEndian.PutUint64(actual[:8], lo) + binary.LittleEndian.PutUint64(actual[8:], hi) + require.Equal(t, tc.exp, actual) + }) + } } func TestCompiler_compileV128Sub(t *testing.T) { + if runtime.GOARCH != "amd64" { + // TODO: implement on arm64. + t.Skip() + } + tests := []struct { name string shape wazeroir.Shape @@ -28,7 +127,41 @@ func TestCompiler_compileV128Sub(t *testing.T) { x2: [16]byte{0: 10, 4: 5, 10: 5}, exp: [16]byte{0: i8ToU8(-9), 2: 10, 4: i8ToU8(-5), 10: 5}, }, - // TODO: add more cases.
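Aside: the test tables in this file build vectors with lane-packing helpers (i16x8, i32x4, i64x2, f32x4, f64x2) whose definitions fall outside this hunk. Judging from the call sites, they presumably pack lanes little-endian into a [16]byte, roughly like:

package main

import (
	"encoding/binary"
	"fmt"
	"math"
)

// i16x8 packs eight 16-bit lanes little-endian, lane 0 first.
func i16x8(w1, w2, w3, w4, w5, w6, w7, w8 uint16) (ret [16]byte) {
	for i, w := range []uint16{w1, w2, w3, w4, w5, w6, w7, w8} {
		binary.LittleEndian.PutUint16(ret[i*2:], w)
	}
	return
}

// f32x4 packs four float32 lanes via their IEEE 754 bit patterns.
func f32x4(f1, f2, f3, f4 float32) (ret [16]byte) {
	for i, f := range []float32{f1, f2, f3, f4} {
		binary.LittleEndian.PutUint32(ret[i*4:], math.Float32bits(f))
	}
	return
}

func main() {
	fmt.Println(i16x8(1, 0, 0, 0, 0, 0, 0, 0)[0]) // 1
	fmt.Println(f32x4(1, 0, 0, 0)[3])             // 63 (0x3f), the top byte of float32(1)
}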
+ { + name: "i16x8", + shape: wazeroir.ShapeI16x8, + x1: i16x8(1123, 0, 123, 1, 1, 5, 8, 1), + x2: i16x8(0, 123, 123, 0, 1, 5, 9, 1), + exp: i16x8(1123, i16ToU16(-123), 0, 1, 0, 0, i16ToU16(-1), 0), + }, + { + name: "i32x4", + shape: wazeroir.ShapeI32x4, + x1: i32x4(i32ToU32(-123), 5, 4, math.MaxUint32), + x2: i32x4(i32ToU32(-10), 1, i32ToU32(-104), math.MaxUint32), + exp: i32x4(i32ToU32(-113), 4, 108, 0), + }, + { + name: "i64x2", + shape: wazeroir.ShapeI64x2, + x1: i64x2(i64ToU64(math.MinInt64), 12345), + x2: i64x2(i64ToU64(-1), i64ToU64(-12345)), + exp: i64x2(i64ToU64(math.MinInt64+1), 12345*2), + }, + { + name: "f32x4", + shape: wazeroir.ShapeF32x4, + x1: f32x4(1.0, 123, float32(math.Inf(1)), float32(math.Inf(-1))), + x2: f32x4(51234.12341, 123, math.MaxFloat32, -123), + exp: f32x4(-51233.12341, 0, float32(math.Inf(1)), float32(math.Inf(-1))), + }, + { + name: "f64x2", + shape: wazeroir.ShapeF64x2, + x1: f64x2(1.123, math.Inf(1)), + x2: f64x2(1.123, math.MinInt64), + exp: f64x2(0, math.Inf(1)), + }, } for _, tc := range tests { @@ -82,26 +215,26 @@ func TestCompiler_compileV128Load(t *testing.T) { tests := []struct { name string memSetupFn func(buf []byte) - loadType wazeroir.LoadV128Type + loadType wazeroir.V128LoadType offset uint32 exp [16]byte }{ { - name: "v128 offset=0", loadType: wazeroir.LoadV128Type128, offset: 0, + name: "v128 offset=0", loadType: wazeroir.V128LoadType128, offset: 0, memSetupFn: func(buf []byte) { copy(buf, []byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}) }, exp: [16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, }, { - name: "v128 offset=2", loadType: wazeroir.LoadV128Type128, offset: 2, + name: "v128 offset=2", loadType: wazeroir.V128LoadType128, offset: 2, memSetupFn: func(buf []byte) { copy(buf, []byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}) }, exp: [16]byte{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}, }, { - name: "8x8s offset=0", loadType: wazeroir.LoadV128Type8x8s, offset: 0, + name: "8x8s offset=0", loadType: wazeroir.V128LoadType8x8s, offset: 0, memSetupFn: func(buf []byte) { copy(buf, []byte{ 1, 0xff, 3, 0xff, 5, 0xff, 7, 0xff, 9, 10, @@ -113,7 +246,7 @@ func TestCompiler_compileV128Load(t *testing.T) { }, }, { - name: "8x8s offset=3", loadType: wazeroir.LoadV128Type8x8s, offset: 3, + name: "8x8s offset=3", loadType: wazeroir.V128LoadType8x8s, offset: 3, memSetupFn: func(buf []byte) { copy(buf, []byte{ 1, 0xff, 3, 0xff, 5, 0xff, 7, 0xff, 9, 10, @@ -125,7 +258,7 @@ func TestCompiler_compileV128Load(t *testing.T) { }, }, { - name: "8x8u offset=0", loadType: wazeroir.LoadV128Type8x8u, offset: 0, + name: "8x8u offset=0", loadType: wazeroir.V128LoadType8x8u, offset: 0, memSetupFn: func(buf []byte) { copy(buf, []byte{ 1, 0xff, 3, 0xff, 5, 0xff, 7, 0xff, 9, 10, @@ -137,7 +270,7 @@ func TestCompiler_compileV128Load(t *testing.T) { }, }, { - name: "8x8i offset=3", loadType: wazeroir.LoadV128Type8x8u, offset: 3, + name: "8x8u offset=3", loadType: wazeroir.V128LoadType8x8u, offset: 3, memSetupFn: func(buf []byte) { copy(buf, []byte{ 1, 0xff, 3, 0xff, 5, 0xff, 7, 0xff, 9, 10, @@ -149,7 +282,7 @@ func TestCompiler_compileV128Load(t *testing.T) { }, }, { - name: "16x4s offset=0", loadType: wazeroir.LoadV128Type16x4s, offset: 0, + name: "16x4s offset=0", loadType: wazeroir.V128LoadType16x4s, offset: 0, memSetupFn: func(buf []byte) { copy(buf, []byte{ 1, 0xff, 3, 0xff, 5, 0xff, 7, 0xff, 9, 10, @@ -164,7 +297,7 @@ func TestCompiler_compileV128Load(t *testing.T) { }, }, { - name:
"16x4s offset=3", loadType: wazeroir.LoadV128Type16x4s, offset: 3, + name: "16x4s offset=3", loadType: wazeroir.V128LoadType16x4s, offset: 3, memSetupFn: func(buf []byte) { copy(buf, []byte{ 1, 0xff, 3, 0xff, 5, 6, 0xff, 0xff, 9, 10, @@ -179,7 +312,7 @@ func TestCompiler_compileV128Load(t *testing.T) { }, }, { - name: "16x4u offset=0", loadType: wazeroir.LoadV128Type16x4u, offset: 0, + name: "16x4u offset=0", loadType: wazeroir.V128LoadType16x4u, offset: 0, memSetupFn: func(buf []byte) { copy(buf, []byte{ 1, 0xff, 3, 0xff, 5, 0xff, 7, 0xff, 9, 10, @@ -194,7 +327,7 @@ func TestCompiler_compileV128Load(t *testing.T) { }, }, { - name: "16x4u offset=3", loadType: wazeroir.LoadV128Type16x4u, offset: 3, + name: "16x4u offset=3", loadType: wazeroir.V128LoadType16x4u, offset: 3, memSetupFn: func(buf []byte) { copy(buf, []byte{ 1, 0xff, 3, 0xff, 5, 6, 0xff, 0xff, 9, 10, @@ -209,7 +342,7 @@ func TestCompiler_compileV128Load(t *testing.T) { }, }, { - name: "32x2s offset=0", loadType: wazeroir.LoadV128Type32x2s, offset: 0, + name: "32x2s offset=0", loadType: wazeroir.V128LoadType32x2s, offset: 0, memSetupFn: func(buf []byte) { copy(buf, []byte{ 1, 0xff, 3, 0xff, 5, 6, 7, 0xff, 9, 10, @@ -222,7 +355,7 @@ func TestCompiler_compileV128Load(t *testing.T) { }, }, { - name: "32x2s offset=2", loadType: wazeroir.LoadV128Type32x2s, offset: 2, + name: "32x2s offset=2", loadType: wazeroir.V128LoadType32x2s, offset: 2, memSetupFn: func(buf []byte) { copy(buf, []byte{ 1, 0xff, 3, 0xff, 5, 6, 7, 0xff, 9, 0xff, @@ -235,7 +368,7 @@ func TestCompiler_compileV128Load(t *testing.T) { }, }, { - name: "32x2u offset=0", loadType: wazeroir.LoadV128Type32x2u, offset: 0, + name: "32x2u offset=0", loadType: wazeroir.V128LoadType32x2u, offset: 0, memSetupFn: func(buf []byte) { copy(buf, []byte{ 1, 0xff, 3, 0xff, 5, 6, 7, 0xff, 9, 10, @@ -248,7 +381,7 @@ func TestCompiler_compileV128Load(t *testing.T) { }, }, { - name: "32x2u offset=2", loadType: wazeroir.LoadV128Type32x2u, offset: 2, + name: "32x2u offset=2", loadType: wazeroir.V128LoadType32x2u, offset: 2, memSetupFn: func(buf []byte) { copy(buf, []byte{ 1, 0xff, 3, 0xff, 5, 6, 7, 0xff, 9, 0xff, @@ -261,7 +394,7 @@ func TestCompiler_compileV128Load(t *testing.T) { }, }, { - name: "32zero offset=0", loadType: wazeroir.LoadV128Type32zero, offset: 0, + name: "32zero offset=0", loadType: wazeroir.V128LoadType32zero, offset: 0, memSetupFn: func(buf []byte) { copy(buf, []byte{ 1, 0xff, 3, 0xff, 5, 6, 7, 0xff, 9, 0xff, @@ -274,7 +407,7 @@ func TestCompiler_compileV128Load(t *testing.T) { }, }, { - name: "32zero offset=3", loadType: wazeroir.LoadV128Type32zero, offset: 3, + name: "32zero offset=3", loadType: wazeroir.V128LoadType32zero, offset: 3, memSetupFn: func(buf []byte) { copy(buf, []byte{ 1, 0xff, 3, 0xff, 5, 6, 0xff, 8, 9, 0xff, @@ -287,7 +420,7 @@ func TestCompiler_compileV128Load(t *testing.T) { }, }, { - name: "64zero offset=0", loadType: wazeroir.LoadV128Type64zero, offset: 0, + name: "64zero offset=0", loadType: wazeroir.V128LoadType64zero, offset: 0, memSetupFn: func(buf []byte) { copy(buf, []byte{ 1, 0xff, 3, 0xff, 5, 6, 7, 0xff, 9, 0xff, @@ -300,7 +433,7 @@ func TestCompiler_compileV128Load(t *testing.T) { }, }, { - name: "64zero offset=2", loadType: wazeroir.LoadV128Type64zero, offset: 2, + name: "64zero offset=2", loadType: wazeroir.V128LoadType64zero, offset: 2, memSetupFn: func(buf []byte) { copy(buf, []byte{ 1, 0xff, 3, 0xff, 5, 6, 7, 0xff, 9, 0xff, @@ -313,7 +446,7 @@ func TestCompiler_compileV128Load(t *testing.T) { }, }, { - name: "8splat offset=0", loadType: 
wazeroir.LoadV128Type8Splat, offset: 0, + name: "8splat offset=0", loadType: wazeroir.V128LoadType8Splat, offset: 0, memSetupFn: func(buf []byte) { copy(buf, []byte{ 1, 0xff, 3, 0xff, 5, 6, 7, 0xff, 9, 0xff, @@ -322,7 +455,7 @@ func TestCompiler_compileV128Load(t *testing.T) { exp: [16]byte{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, }, { - name: "8splat offset=1", loadType: wazeroir.LoadV128Type8Splat, offset: 1, + name: "8splat offset=1", loadType: wazeroir.V128LoadType8Splat, offset: 1, memSetupFn: func(buf []byte) { copy(buf, []byte{ 1, 0xff, 3, 0xff, 5, 6, 7, 0xff, 9, 0xff, @@ -332,7 +465,7 @@ func TestCompiler_compileV128Load(t *testing.T) { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, }, { - name: "16splat offset=0", loadType: wazeroir.LoadV128Type16Splat, offset: 0, + name: "16splat offset=0", loadType: wazeroir.V128LoadType16Splat, offset: 0, memSetupFn: func(buf []byte) { copy(buf, []byte{ 1, 0xff, 3, 0xff, 5, 6, 7, 0xff, 9, 0xff, @@ -341,7 +474,7 @@ func TestCompiler_compileV128Load(t *testing.T) { exp: [16]byte{1, 0xff, 1, 0xff, 1, 0xff, 1, 0xff, 1, 0xff, 1, 0xff, 1, 0xff, 1, 0xff}, }, { - name: "16splat offset=5", loadType: wazeroir.LoadV128Type16Splat, offset: 5, + name: "16splat offset=5", loadType: wazeroir.V128LoadType16Splat, offset: 5, memSetupFn: func(buf []byte) { copy(buf, []byte{ 1, 0xff, 3, 0xff, 5, 6, 7, 0xff, 9, 0xff, @@ -350,7 +483,7 @@ func TestCompiler_compileV128Load(t *testing.T) { exp: [16]byte{6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7}, }, { - name: "32splat offset=0", loadType: wazeroir.LoadV128Type32Splat, offset: 0, + name: "32splat offset=0", loadType: wazeroir.V128LoadType32Splat, offset: 0, memSetupFn: func(buf []byte) { copy(buf, []byte{ 1, 0xff, 3, 0xff, 5, 6, 7, 0xff, 9, 0xff, @@ -359,7 +492,7 @@ func TestCompiler_compileV128Load(t *testing.T) { exp: [16]byte{1, 0xff, 3, 0xff, 1, 0xff, 3, 0xff, 1, 0xff, 3, 0xff, 1, 0xff, 3, 0xff}, }, { - name: "32splat offset=1", loadType: wazeroir.LoadV128Type32Splat, offset: 1, + name: "32splat offset=1", loadType: wazeroir.V128LoadType32Splat, offset: 1, memSetupFn: func(buf []byte) { copy(buf, []byte{ 1, 0xff, 3, 0xff, 5, 6, 7, 0xff, 9, 0xff, @@ -368,7 +501,7 @@ func TestCompiler_compileV128Load(t *testing.T) { exp: [16]byte{0xff, 3, 0xff, 5, 0xff, 3, 0xff, 5, 0xff, 3, 0xff, 5, 0xff, 3, 0xff, 5}, }, { - name: "64splat offset=0", loadType: wazeroir.LoadV128Type64Splat, offset: 0, + name: "64splat offset=0", loadType: wazeroir.V128LoadType64Splat, offset: 0, memSetupFn: func(buf []byte) { copy(buf, []byte{ 1, 0xff, 3, 0xff, 5, 6, 7, 0xff, 9, 0xff, @@ -377,7 +510,7 @@ func TestCompiler_compileV128Load(t *testing.T) { exp: [16]byte{1, 0xff, 3, 0xff, 5, 6, 7, 0xff, 1, 0xff, 3, 0xff, 5, 6, 7, 0xff}, }, { - name: "64splat offset=1", loadType: wazeroir.LoadV128Type64Splat, offset: 1, + name: "64splat offset=1", loadType: wazeroir.V128LoadType64Splat, offset: 1, memSetupFn: func(buf []byte) { copy(buf, []byte{ 1, 0xff, 3, 0xff, 5, 6, 7, 0xff, 9, 0xff, @@ -1557,6 +1690,18 @@ func i8ToU8(v int8) byte { return byte(v) } +func i16ToU16(v int16) uint16 { + return uint16(v) +} + +func i32ToU32(v int32) uint32 { + return uint32(v) +} + +func i64ToU64(v int64) uint64 { + return uint64(v) +} + func TestCompiler_compileV128Swizzle(t *testing.T) { tests := []struct { @@ -3274,3 +3419,4011 @@ func TestCompiler_compileV128Cmp(t *testing.T) { }) } } + +func TestCompiler_compileV128AvgrU(t *testing.T) { + if runtime.GOARCH != "amd64" { + // TODO: implement on arm64.
+ t.Skip() + } + + tests := []struct { + name string + shape wazeroir.Shape + x1, x2, exp [16]byte + }{ + { + name: "i8x16", + shape: wazeroir.ShapeI8x16, + x1: [16]byte{0: 1, 2: 10, 10: 10, 15: math.MaxUint8}, + x2: [16]byte{0: 10, 4: 5, 10: 5, 15: 10}, + exp: [16]byte{ + 0: byte((uint16(1) + uint16(10) + 1) / 2), + 2: byte((uint16(10) + 1) / 2), + 4: byte((uint16(5) + 1) / 2), + 10: byte((uint16(10) + uint16(5) + 1) / 2), + 15: byte((uint16(math.MaxUint8) + uint16(10) + 1) / 2), + }, + }, + { + name: "i16x8", + shape: wazeroir.ShapeI16x8, + x1: i16x8(1, 0, 100, 0, 0, math.MaxUint16, 0, 0), + x2: i16x8(10, 0, math.MaxUint16, 0, 0, 1, 0, 0), + exp: i16x8( + uint16((uint32(1)+uint32(10)+1)/2), + 0, + uint16((uint32(100)+uint32(math.MaxUint16)+1)/2), + 0, + 0, + uint16((uint32(1)+uint32(math.MaxUint16)+1)/2), + 0, 0, + ), + }, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + env := newCompilerEnvironment() + compiler := env.requireNewCompiler(t, newCompiler, + &wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}}) + + err := compiler.compilePreamble() + require.NoError(t, err) + + err = compiler.compileV128Const(&wazeroir.OperationV128Const{ + Lo: binary.LittleEndian.Uint64(tc.x1[:8]), + Hi: binary.LittleEndian.Uint64(tc.x1[8:]), + }) + require.NoError(t, err) + + err = compiler.compileV128Const(&wazeroir.OperationV128Const{ + Lo: binary.LittleEndian.Uint64(tc.x2[:8]), + Hi: binary.LittleEndian.Uint64(tc.x2[8:]), + }) + require.NoError(t, err) + + err = compiler.compileV128AvgrU(&wazeroir.OperationV128AvgrU{Shape: tc.shape}) + require.NoError(t, err) + + require.Equal(t, uint64(2), compiler.runtimeValueLocationStack().sp) + require.Equal(t, 1, len(compiler.runtimeValueLocationStack().usedRegisters)) + + err = compiler.compileReturnFunction() + require.NoError(t, err) + + // Generate and run the code under test. + code, _, _, err := compiler.compile() + require.NoError(t, err) + env.exec(code) + + require.Equal(t, nativeCallStatusCodeReturned, env.callEngine().statusCode) + + lo, hi := env.stackTopAsV128() + var actual [16]byte + binary.LittleEndian.PutUint64(actual[:8], lo) + binary.LittleEndian.PutUint64(actual[8:], hi) + require.Equal(t, tc.exp, actual) + }) + } +} + +func TestCompiler_compileV128Sqrt(t *testing.T) { + if runtime.GOARCH != "amd64" { + // TODO: implement on arm64.
+ t.Skip() + } + + tests := []struct { + name string + shape wazeroir.Shape + v, exp [16]byte + }{ + { + name: "f32x4", + shape: wazeroir.ShapeF32x4, + v: f32x4(1.23, -123.1231, math.MaxFloat32, float32(math.Inf(1))), + exp: f32x4( + float32(math.Sqrt(float64(float32(1.23)))), + float32(math.Sqrt(float64(float32(-123.1231)))), + float32(math.Sqrt(float64(float32(math.MaxFloat32)))), + float32(math.Sqrt(float64(float32(math.Inf(1))))), + ), + }, + { + name: "f64x2", + shape: wazeroir.ShapeF64x2, + v: f64x2(1.2314, math.MaxFloat64), + exp: f64x2(math.Sqrt(1.2314), math.Sqrt(math.MaxFloat64)), + }, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + env := newCompilerEnvironment() + compiler := env.requireNewCompiler(t, newCompiler, + &wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}}) + + err := compiler.compilePreamble() + require.NoError(t, err) + + err = compiler.compileV128Const(&wazeroir.OperationV128Const{ + Lo: binary.LittleEndian.Uint64(tc.v[:8]), + Hi: binary.LittleEndian.Uint64(tc.v[8:]), + }) + require.NoError(t, err) + + err = compiler.compileV128Sqrt(&wazeroir.OperationV128Sqrt{Shape: tc.shape}) + require.NoError(t, err) + + require.Equal(t, uint64(2), compiler.runtimeValueLocationStack().sp) + require.Equal(t, 1, len(compiler.runtimeValueLocationStack().usedRegisters)) + + err = compiler.compileReturnFunction() + require.NoError(t, err) + + // Generate and run the code under test. + code, _, _, err := compiler.compile() + require.NoError(t, err) + env.exec(code) + + require.Equal(t, nativeCallStatusCodeReturned, env.callEngine().statusCode) + + lo, hi := env.stackTopAsV128() + var actual [16]byte + binary.LittleEndian.PutUint64(actual[:8], lo) + binary.LittleEndian.PutUint64(actual[8:], hi) + require.Equal(t, tc.exp, actual) + }) + } +} + +func TestCompiler_compileV128Mul(t *testing.T) { + if runtime.GOARCH != "amd64" { + // TODO: implement on arm64.
+ t.Skip() + } + + tests := []struct { + name string + shape wazeroir.Shape + x1, x2, exp [16]byte + }{ + { + name: "i16x8", + shape: wazeroir.ShapeI16x8, + x1: i16x8(1123, 0, 123, 1, 1, 5, 8, 1), + x2: i16x8(0, 123, 123, 0, 1, 5, 9, 1), + exp: i16x8(0, 0, 123*123, 0, 1, 25, 8*9, 1), + }, + { + name: "i32x4", + shape: wazeroir.ShapeI32x4, + x1: i32x4(i32ToU32(-123), 5, 4, math.MaxUint32), + x2: i32x4(i32ToU32(-10), 1, i32ToU32(-104), 0), + exp: i32x4(1230, 5, i32ToU32(-416), 0), + }, + { + name: "i64x2", + shape: wazeroir.ShapeI64x2, + x1: i64x2(1, 12345), + x2: i64x2(100, i64ToU64(-10)), + exp: i64x2(100, i64ToU64(-123450)), + }, + { + name: "f32x4", + shape: wazeroir.ShapeF32x4, + x1: f32x4(1.0, 123, float32(math.Inf(1)), float32(math.Inf(-1))), + x2: f32x4(51234.12341, 123, math.MaxFloat32, -123), + exp: f32x4(51234.12341, 123*123, float32(math.Inf(1)), float32(math.Inf(1))), + }, + { + name: "f64x2", + shape: wazeroir.ShapeF64x2, + x1: f64x2(1.123, math.Inf(1)), + x2: f64x2(1.123, math.MinInt64), + exp: f64x2(1.123*1.123, math.Inf(-1)), + }, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + env := newCompilerEnvironment() + compiler := env.requireNewCompiler(t, newCompiler, + &wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}}) + + err := compiler.compilePreamble() + require.NoError(t, err) + + err = compiler.compileV128Const(&wazeroir.OperationV128Const{ + Lo: binary.LittleEndian.Uint64(tc.x1[:8]), + Hi: binary.LittleEndian.Uint64(tc.x1[8:]), + }) + require.NoError(t, err) + + err = compiler.compileV128Const(&wazeroir.OperationV128Const{ + Lo: binary.LittleEndian.Uint64(tc.x2[:8]), + Hi: binary.LittleEndian.Uint64(tc.x2[8:]), + }) + require.NoError(t, err) + + err = compiler.compileV128Mul(&wazeroir.OperationV128Mul{Shape: tc.shape}) + require.NoError(t, err) + + require.Equal(t, uint64(2), compiler.runtimeValueLocationStack().sp) + require.Equal(t, 1, len(compiler.runtimeValueLocationStack().usedRegisters)) + + err = compiler.compileReturnFunction() + require.NoError(t, err) + + // Generate and run the code under test. + code, _, _, err := compiler.compile() + require.NoError(t, err) + env.exec(code) + + require.Equal(t, nativeCallStatusCodeReturned, env.callEngine().statusCode) + + lo, hi := env.stackTopAsV128() + var actual [16]byte + binary.LittleEndian.PutUint64(actual[:8], lo) + binary.LittleEndian.PutUint64(actual[8:], hi) + require.Equal(t, tc.exp, actual) + }) + } +} + +func TestCompiler_compileV128Neg(t *testing.T) { + if runtime.GOARCH != "amd64" { + // TODO: implement on arm64.
+ t.Skip() + } + + tests := []struct { + name string + shape wazeroir.Shape + v, exp [16]byte + }{ + { + name: "i8x16", + shape: wazeroir.ShapeI8x16, + v: [16]byte{1: 123, 5: i8ToU8(-1), 15: i8ToU8(-125)}, + exp: [16]byte{1: i8ToU8(-123), 5: 1, 15: 125}, + }, + { + name: "i16x8", + shape: wazeroir.ShapeI16x8, + v: i16x8(0, 0, i16ToU16(-123), 0, 1, 25, 8, i16ToU16(-1)), + exp: i16x8(0, 0, 123, 0, i16ToU16(-1), i16ToU16(-25), i16ToU16(-8), 1), + }, + { + name: "i32x4", + shape: wazeroir.ShapeI32x4, + v: i32x4(1230, 5, i32ToU32(-416), 0), + exp: i32x4(i32ToU32(-1230), i32ToU32(-5), 416, 0), + }, + { + name: "i64x2", + shape: wazeroir.ShapeI64x2, + v: i64x2(100, i64ToU64(-123450)), + exp: i64x2(i64ToU64(-100), 123450), + }, + { + name: "f32x4", + shape: wazeroir.ShapeF32x4, + v: f32x4(51234.12341, -123, float32(math.Inf(1)), 0.1), + exp: f32x4(-51234.12341, 123, float32(math.Inf(-1)), -0.1), + }, + { + name: "f32x4", + shape: wazeroir.ShapeF32x4, + v: f32x4(51234.12341, 0, float32(math.Inf(1)), 0.1), + exp: f32x4(-51234.12341, float32(math.Copysign(0, -1)), float32(math.Inf(-1)), -0.1), + }, + { + name: "f64x2", + shape: wazeroir.ShapeF64x2, + v: f64x2(1.123, math.Inf(-1)), + exp: f64x2(-1.123, math.Inf(1)), + }, + { + name: "f64x2", + shape: wazeroir.ShapeF64x2, + v: f64x2(0, math.Inf(-1)), + exp: f64x2(math.Copysign(0, -1), math.Inf(1)), + }, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + env := newCompilerEnvironment() + compiler := env.requireNewCompiler(t, newCompiler, + &wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}}) + + err := compiler.compilePreamble() + require.NoError(t, err) + + err = compiler.compileV128Const(&wazeroir.OperationV128Const{ + Lo: binary.LittleEndian.Uint64(tc.v[:8]), + Hi: binary.LittleEndian.Uint64(tc.v[8:]), + }) + require.NoError(t, err) + + err = compiler.compileV128Neg(&wazeroir.OperationV128Neg{Shape: tc.shape}) + require.NoError(t, err) + + require.Equal(t, uint64(2), compiler.runtimeValueLocationStack().sp) + require.Equal(t, 1, len(compiler.runtimeValueLocationStack().usedRegisters)) + + err = compiler.compileReturnFunction() + require.NoError(t, err) + + // Generate and run the code under test. + code, _, _, err := compiler.compile() + require.NoError(t, err) + env.exec(code) + + require.Equal(t, nativeCallStatusCodeReturned, env.callEngine().statusCode) + + lo, hi := env.stackTopAsV128() + var actual [16]byte + binary.LittleEndian.PutUint64(actual[:8], lo) + binary.LittleEndian.PutUint64(actual[8:], hi) + require.Equal(t, tc.exp, actual) + }) + } +} + +func TestCompiler_compileV128Abs(t *testing.T) { + if runtime.GOARCH != "amd64" { + // TODO: implement on arm64.
+ t.Skip() + } + + tests := []struct { + name string + shape wazeroir.Shape + v, exp [16]byte + }{ + { + name: "i8x16", + shape: wazeroir.ShapeI8x16, + v: [16]byte{1: 123, 5: i8ToU8(-1), 15: i8ToU8(-125)}, + exp: [16]byte{1: 123, 5: 1, 15: 125}, + }, + { + name: "i16x8", + shape: wazeroir.ShapeI16x8, + v: i16x8(0, 0, i16ToU16(-123), 0, 1, 25, 8, i16ToU16(-1)), + exp: i16x8(0, 0, 123, 0, 1, 25, 8, 1), + }, + { + name: "i32x4", + shape: wazeroir.ShapeI32x4, + v: i32x4(i32ToU32(-1230), 5, i32ToU32(-416), 0), + exp: i32x4(1230, 5, 416, 0), + }, + { + name: "i64x2", + shape: wazeroir.ShapeI64x2, + v: i64x2(i64ToU64(-100), i64ToU64(-123450)), + exp: i64x2(100, 123450), + }, + { + name: "f32x4", + shape: wazeroir.ShapeF32x4, + v: f32x4(51234.12341, -123, float32(math.Inf(1)), 0.1), + exp: f32x4(51234.12341, 123, float32(math.Inf(1)), 0.1), + }, + { + name: "f32x4", + shape: wazeroir.ShapeF32x4, + v: f32x4(51234.12341, 0, float32(math.Inf(1)), -0.1), + exp: f32x4(51234.12341, 0, float32(math.Inf(1)), 0.1), + }, + { + name: "f64x2", + shape: wazeroir.ShapeF64x2, + v: f64x2(-1.123, math.Inf(-1)), + exp: f64x2(1.123, math.Inf(1)), + }, + { + name: "f64x2", + shape: wazeroir.ShapeF64x2, + v: f64x2(0, math.Inf(-1)), + exp: f64x2(0, math.Inf(1)), + }, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + env := newCompilerEnvironment() + compiler := env.requireNewCompiler(t, newCompiler, + &wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}}) + + err := compiler.compilePreamble() + require.NoError(t, err) + + err = compiler.compileV128Const(&wazeroir.OperationV128Const{ + Lo: binary.LittleEndian.Uint64(tc.v[:8]), + Hi: binary.LittleEndian.Uint64(tc.v[8:]), + }) + require.NoError(t, err) + + err = compiler.compileV128Abs(&wazeroir.OperationV128Abs{Shape: tc.shape}) + require.NoError(t, err) + + require.Equal(t, uint64(2), compiler.runtimeValueLocationStack().sp) + require.Equal(t, 1, len(compiler.runtimeValueLocationStack().usedRegisters)) + + err = compiler.compileReturnFunction() + require.NoError(t, err) + + // Generate and run the code under test. + code, _, _, err := compiler.compile() + require.NoError(t, err) + env.exec(code) + + require.Equal(t, nativeCallStatusCodeReturned, env.callEngine().statusCode) + + lo, hi := env.stackTopAsV128() + var actual [16]byte + binary.LittleEndian.PutUint64(actual[:8], lo) + binary.LittleEndian.PutUint64(actual[8:], hi) + require.Equal(t, tc.exp, actual) + }) + } +} + +func TestCompiler_compileV128Div(t *testing.T) { + if runtime.GOARCH != "amd64" { + // TODO: implement on amd64. 
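+		// Note: v128 division exists only for the float shapes f32x4 and f64x2; wasm SIMD defines no integer division.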
+		t.Skip()
+	}
+
+	tests := []struct {
+		name        string
+		shape       wazeroir.Shape
+		x1, x2, exp [16]byte
+	}{
+		{
+			name:  "f32x4",
+			shape: wazeroir.ShapeF32x4,
+			x1:    f32x4(1.0, 123, float32(math.Inf(1)), float32(math.Inf(-1))),
+			x2:    f32x4(123.12, 123, math.MaxFloat32, -123),
+			exp:   f32x4(float32(1.0)/float32(123.12), 1, float32(math.Inf(1)), float32(math.Inf(1))),
+		},
+		{
+			name:  "f64x2",
+			shape: wazeroir.ShapeF64x2,
+			x1:    f64x2(1.123, math.Inf(1)),
+			x2:    f64x2(1.123, math.MinInt64),
+			exp:   f64x2(1.0, math.Inf(-1)),
+		},
+		{
+			name:  "f64x2",
+			shape: wazeroir.ShapeF64x2,
+			x1:    f64x2(0, math.Inf(1)),
+			x2:    f64x2(1.123, math.MaxInt64),
+			exp:   f64x2(0, math.Inf(1)),
+		},
+	}
+
+	for _, tc := range tests {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			env := newCompilerEnvironment()
+			compiler := env.requireNewCompiler(t, newCompiler,
+				&wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}})
+
+			err := compiler.compilePreamble()
+			require.NoError(t, err)
+
+			err = compiler.compileV128Const(&wazeroir.OperationV128Const{
+				Lo: binary.LittleEndian.Uint64(tc.x1[:8]),
+				Hi: binary.LittleEndian.Uint64(tc.x1[8:]),
+			})
+			require.NoError(t, err)
+
+			err = compiler.compileV128Const(&wazeroir.OperationV128Const{
+				Lo: binary.LittleEndian.Uint64(tc.x2[:8]),
+				Hi: binary.LittleEndian.Uint64(tc.x2[8:]),
+			})
+			require.NoError(t, err)
+
+			err = compiler.compileV128Div(&wazeroir.OperationV128Div{Shape: tc.shape})
+			require.NoError(t, err)
+
+			require.Equal(t, uint64(2), compiler.runtimeValueLocationStack().sp)
+			require.Equal(t, 1, len(compiler.runtimeValueLocationStack().usedRegisters))
+
+			err = compiler.compileReturnFunction()
+			require.NoError(t, err)
+
+			// Generate and run the code under test.
+			code, _, _, err := compiler.compile()
+			require.NoError(t, err)
+			env.exec(code)
+
+			require.Equal(t, nativeCallStatusCodeReturned, env.callEngine().statusCode)
+
+			lo, hi := env.stackTopAsV128()
+			var actual [16]byte
+			binary.LittleEndian.PutUint64(actual[:8], lo)
+			binary.LittleEndian.PutUint64(actual[8:], hi)
+			require.Equal(t, tc.exp, actual)
+		})
+	}
+}
+
+func TestCompiler_compileV128Min(t *testing.T) {
+	if runtime.GOARCH != "amd64" {
+		// TODO: implement on arm64.
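+		// Note: unlike x86 MINPS/MINPD, wasm's float min propagates NaN (a NaN in either lane yields NaN), hence the NaN-aware assertions below.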
+ t.Skip() + } + + tests := []struct { + name string + shape wazeroir.Shape + signed bool + x1, x2, exp [16]byte + }{ + { + name: "i8x16s", + shape: wazeroir.ShapeI8x16, + signed: true, + x1: [16]byte{0: 123, 5: i8ToU8(-1), 15: 2}, + x2: [16]byte{0: 1, 5: 0, 15: i8ToU8(-1)}, + exp: [16]byte{0: 1, 5: i8ToU8(-1), 15: i8ToU8(-1)}, + }, + { + name: "i8x16u", + shape: wazeroir.ShapeI8x16, + signed: false, + x1: [16]byte{0: 123, 5: i8ToU8(-1), 15: 2}, + x2: [16]byte{0: 1, 5: 0, 15: i8ToU8(-1)}, + exp: [16]byte{0: 1, 5: 0, 15: 2}, + }, + { + name: "i16x8s", + shape: wazeroir.ShapeI16x8, + signed: true, + x1: i16x8(1123, 0, 123, 1, 1, 6, i16ToU16(-123), 1), + x2: i16x8(0, 123, i16ToU16(-123), 3, 1, 4, 5, 1), + exp: i16x8(0, 0, i16ToU16(-123), 1, 1, 4, i16ToU16(-123), 1), + }, + { + name: "i16x8u", + shape: wazeroir.ShapeI16x8, + signed: false, + x1: i16x8(1123, 0, 123, 1, 1, 6, i16ToU16(-123), 1), + x2: i16x8(0, 123, i16ToU16(-123), 3, 1, 4, 5, 1), + exp: i16x8(0, 0, 123, 1, 1, 4, 5, 1), + }, + { + name: "i32x4s", + shape: wazeroir.ShapeI32x4, + signed: true, + x1: i32x4(i32ToU32(-123), 0, 1, i32ToU32(math.MinInt32)), + x2: i32x4(123, 5, 1, 0), + exp: i32x4(i32ToU32(-123), 0, 1, i32ToU32(math.MinInt32)), + }, + { + name: "i32x4u", + shape: wazeroir.ShapeI32x4, + signed: false, + x1: i32x4(i32ToU32(-123), 0, 1, i32ToU32(math.MinInt32)), + x2: i32x4(123, 5, 1, 0), + exp: i32x4(123, 0, 1, 0), + }, + { + name: "f32x4", + shape: wazeroir.ShapeF32x4, + x1: f32x4(float32(math.NaN()), -123.12, 2.3, float32(math.Inf(1))), + x2: f32x4(5.5, 123.12, 5.0, float32(math.Inf(-1))), + exp: f32x4(float32(math.NaN()), -123.12, 2.3, float32(math.Inf(-1))), + }, + { + name: "f32x4", + shape: wazeroir.ShapeF32x4, + x1: f32x4(5.5, 123.12, -5.0, float32(math.Inf(-1))), + x2: f32x4(-123.12, float32(math.NaN()), 2.3, float32(math.Inf(-1))), + exp: f32x4(-123.12, float32(math.NaN()), -5.0, float32(math.Inf(-1))), + }, + { + name: "f32x4", + shape: wazeroir.ShapeF32x4, + x1: f32x4(float32(math.Inf(1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(1))), + x2: f32x4(float32(math.NaN()), float32(math.NaN()), float32(math.NaN()), float32(math.NaN())), + exp: f32x4(float32(math.NaN()), float32(math.NaN()), float32(math.NaN()), float32(math.NaN())), + }, + { + name: "f32x4", + shape: wazeroir.ShapeF32x4, + x1: f32x4(float32(math.NaN()), float32(math.NaN()), float32(math.NaN()), float32(math.NaN())), + x2: f32x4(float32(math.Inf(1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(1))), + exp: f32x4(float32(math.NaN()), float32(math.NaN()), float32(math.NaN()), float32(math.NaN())), + }, + { + name: "f64x2", + shape: wazeroir.ShapeF64x2, + x1: f64x2(math.MinInt64, 0), + x2: f64x2(math.MaxInt64, -12.3), + exp: f64x2(math.MinInt64, -12.3), + }, + { + name: "f64x2", + shape: wazeroir.ShapeF64x2, + x1: f64x2(math.MaxInt64, -12.3), + x2: f64x2(math.MinInt64, 0), + exp: f64x2(math.MinInt64, -12.3), + }, + { + name: "f64x2", + shape: wazeroir.ShapeF64x2, + x1: f64x2(math.NaN(), math.NaN()), + x2: f64x2(math.Inf(1), math.Inf(-1)), + exp: f64x2(math.NaN(), math.NaN()), + }, + { + name: "f64x2", + shape: wazeroir.ShapeF64x2, + x1: f64x2(math.Inf(1), math.Inf(-1)), + x2: f64x2(math.NaN(), math.NaN()), + exp: f64x2(math.NaN(), math.NaN()), + }, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + env := newCompilerEnvironment() + compiler := env.requireNewCompiler(t, newCompiler, + &wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}}) + + err := 
compiler.compilePreamble() + require.NoError(t, err) + + err = compiler.compileV128Const(&wazeroir.OperationV128Const{ + Lo: binary.LittleEndian.Uint64(tc.x1[:8]), + Hi: binary.LittleEndian.Uint64(tc.x1[8:]), + }) + require.NoError(t, err) + + err = compiler.compileV128Const(&wazeroir.OperationV128Const{ + Lo: binary.LittleEndian.Uint64(tc.x2[:8]), + Hi: binary.LittleEndian.Uint64(tc.x2[8:]), + }) + require.NoError(t, err) + + err = compiler.compileV128Min(&wazeroir.OperationV128Min{Shape: tc.shape, Signed: tc.signed}) + require.NoError(t, err) + + require.Equal(t, uint64(2), compiler.runtimeValueLocationStack().sp) + require.Equal(t, 1, len(compiler.runtimeValueLocationStack().usedRegisters)) + + err = compiler.compileReturnFunction() + require.NoError(t, err) + + // Generate and run the code under test. + code, _, _, err := compiler.compile() + require.NoError(t, err) + env.exec(code) + + require.Equal(t, nativeCallStatusCodeReturned, env.callEngine().statusCode) + + lo, hi := env.stackTopAsV128() + switch tc.shape { + case wazeroir.ShapeF64x2: + for _, vs := range [][2]float64{ + {math.Float64frombits(lo), math.Float64frombits(binary.LittleEndian.Uint64(tc.exp[:8]))}, + {math.Float64frombits(hi), math.Float64frombits(binary.LittleEndian.Uint64(tc.exp[8:]))}, + } { + actual, exp := vs[0], vs[1] + if math.IsNaN(exp) { + require.True(t, math.IsNaN(actual)) + } else { + require.Equal(t, exp, actual) + } + } + case wazeroir.ShapeF32x4: + for _, vs := range [][2]float32{ + {math.Float32frombits(uint32(lo)), math.Float32frombits(binary.LittleEndian.Uint32(tc.exp[:4]))}, + {math.Float32frombits(uint32(lo >> 32)), math.Float32frombits(binary.LittleEndian.Uint32(tc.exp[4:8]))}, + {math.Float32frombits(uint32(hi)), math.Float32frombits(binary.LittleEndian.Uint32(tc.exp[8:12]))}, + {math.Float32frombits(uint32(hi >> 32)), math.Float32frombits(binary.LittleEndian.Uint32(tc.exp[12:]))}, + } { + actual, exp := vs[0], vs[1] + if math.IsNaN(float64(exp)) { + require.True(t, math.IsNaN(float64(actual))) + } else { + require.Equal(t, exp, actual) + } + } + default: + var actual [16]byte + binary.LittleEndian.PutUint64(actual[:8], lo) + binary.LittleEndian.PutUint64(actual[8:], hi) + require.Equal(t, tc.exp, actual) + } + }) + } +} + +func TestCompiler_compileV128Max(t *testing.T) { + if runtime.GOARCH != "amd64" { + // TODO: implement on amd64. 
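+		// Note: as with min, wasm's float max is NaN-propagating, so a NaN in either input lane must yield a NaN output lane.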
+ t.Skip() + } + + tests := []struct { + name string + shape wazeroir.Shape + signed bool + x1, x2, exp [16]byte + }{ + { + name: "i8x16s", + shape: wazeroir.ShapeI8x16, + signed: true, + x1: [16]byte{0: 123, 5: i8ToU8(-1), 15: 2}, + x2: [16]byte{0: 1, 5: 0, 15: i8ToU8(-1)}, + exp: [16]byte{0: 123, 5: 0, 15: 2}, + }, + { + name: "i8x16u", + shape: wazeroir.ShapeI8x16, + signed: false, + x1: [16]byte{0: 123, 5: i8ToU8(-1), 15: 2}, + x2: [16]byte{0: 1, 5: 0, 15: i8ToU8(-1)}, + exp: [16]byte{0: 123, 5: i8ToU8(-1), 15: i8ToU8(-1)}, + }, + { + name: "i16x8s", + shape: wazeroir.ShapeI16x8, + signed: true, + x1: i16x8(1123, 0, 123, 1, 1, 6, i16ToU16(-123), 1), + x2: i16x8(0, 123, i16ToU16(-123), 3, 1, 4, 5, 1), + exp: i16x8(1123, 123, 123, 3, 1, 6, 5, 1), + }, + { + name: "i16x8u", + shape: wazeroir.ShapeI16x8, + signed: false, + x1: i16x8(1123, 0, 123, 1, 1, 6, i16ToU16(-123), 1), + x2: i16x8(0, 123, i16ToU16(-123), 3, 1, 4, 5, 1), + exp: i16x8(1123, 123, i16ToU16(-123), 3, 1, 6, i16ToU16(-123), 1), + }, + { + name: "i32x4s", + shape: wazeroir.ShapeI32x4, + signed: true, + x1: i32x4(i32ToU32(-123), 0, 1, i32ToU32(math.MinInt32)), + x2: i32x4(123, 5, 1, 0), + exp: i32x4(123, 5, 1, 0), + }, + { + name: "i32x4u", + shape: wazeroir.ShapeI32x4, + signed: false, + x1: i32x4(i32ToU32(-123), 0, 1, i32ToU32(math.MinInt32)), + x2: i32x4(123, 5, 1, 0), + exp: i32x4(i32ToU32(-123), 5, 1, i32ToU32(math.MinInt32)), + }, + { + name: "f32x4", + shape: wazeroir.ShapeF32x4, + x1: f32x4(float32(math.NaN()), -123.12, 2.3, float32(math.Inf(1))), + x2: f32x4(5.5, 123.12, 5.0, float32(math.Inf(-1))), + exp: f32x4(float32(math.NaN()), 123.12, 5.0, float32(math.Inf(1))), + }, + { + name: "f32x4", + shape: wazeroir.ShapeF32x4, + x1: f32x4(5.5, 123.12, -5.0, float32(math.Inf(-1))), + x2: f32x4(-123.12, float32(math.NaN()), 2.3, float32(math.Inf(-1))), + exp: f32x4(5.5, float32(math.NaN()), 2.3, float32(math.Inf(-1))), + }, + { + name: "f32x4", + shape: wazeroir.ShapeF32x4, + x1: f32x4(float32(math.Inf(1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(1))), + x2: f32x4(float32(math.NaN()), float32(math.NaN()), float32(math.NaN()), float32(math.NaN())), + exp: f32x4(float32(math.NaN()), float32(math.NaN()), float32(math.NaN()), float32(math.NaN())), + }, + { + name: "f32x4", + shape: wazeroir.ShapeF32x4, + x1: f32x4(float32(math.NaN()), float32(math.NaN()), float32(math.NaN()), float32(math.NaN())), + x2: f32x4(float32(math.Inf(1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(1))), + exp: f32x4(float32(math.NaN()), float32(math.NaN()), float32(math.NaN()), float32(math.NaN())), + }, + { + name: "f64x2", + shape: wazeroir.ShapeF64x2, + x1: f64x2(math.MinInt64, 0), + x2: f64x2(math.MaxInt64, -12.3), + exp: f64x2(math.MaxInt64, 0), + }, + { + name: "f64x2", + shape: wazeroir.ShapeF64x2, + x1: f64x2(math.MaxInt64, -12.3), + x2: f64x2(math.MinInt64, 0), + exp: f64x2(math.MaxInt64, 0), + }, + { + name: "f64x2", + shape: wazeroir.ShapeF64x2, + x1: f64x2(math.NaN(), -12.3), + x2: f64x2(math.MinInt64, math.NaN()), + exp: f64x2(math.NaN(), math.NaN()), + }, + { + name: "f64x2", + shape: wazeroir.ShapeF64x2, + x1: f64x2(math.MinInt64, math.NaN()), + x2: f64x2(math.NaN(), -12.3), + exp: f64x2(math.NaN(), math.NaN()), + }, + { + name: "f64x2", + shape: wazeroir.ShapeF64x2, + x1: f64x2(math.NaN(), math.NaN()), + x2: f64x2(math.Inf(1), math.Inf(-1)), + exp: f64x2(math.NaN(), math.NaN()), + }, + { + name: "f64x2", + shape: wazeroir.ShapeF64x2, + x1: f64x2(math.Inf(1), math.Inf(-1)), + x2: 
f64x2(math.NaN(), math.NaN()), + exp: f64x2(math.NaN(), math.NaN()), + }, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + env := newCompilerEnvironment() + compiler := env.requireNewCompiler(t, newCompiler, + &wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}}) + + err := compiler.compilePreamble() + require.NoError(t, err) + + err = compiler.compileV128Const(&wazeroir.OperationV128Const{ + Lo: binary.LittleEndian.Uint64(tc.x1[:8]), + Hi: binary.LittleEndian.Uint64(tc.x1[8:]), + }) + require.NoError(t, err) + + err = compiler.compileV128Const(&wazeroir.OperationV128Const{ + Lo: binary.LittleEndian.Uint64(tc.x2[:8]), + Hi: binary.LittleEndian.Uint64(tc.x2[8:]), + }) + require.NoError(t, err) + + err = compiler.compileV128Max(&wazeroir.OperationV128Max{Shape: tc.shape, Signed: tc.signed}) + require.NoError(t, err) + + require.Equal(t, uint64(2), compiler.runtimeValueLocationStack().sp) + require.Equal(t, 1, len(compiler.runtimeValueLocationStack().usedRegisters)) + + err = compiler.compileReturnFunction() + require.NoError(t, err) + + // Generate and run the code under test. + code, _, _, err := compiler.compile() + require.NoError(t, err) + env.exec(code) + + require.Equal(t, nativeCallStatusCodeReturned, env.callEngine().statusCode) + + lo, hi := env.stackTopAsV128() + switch tc.shape { + case wazeroir.ShapeF64x2: + for _, vs := range [][2]float64{ + {math.Float64frombits(lo), math.Float64frombits(binary.LittleEndian.Uint64(tc.exp[:8]))}, + {math.Float64frombits(hi), math.Float64frombits(binary.LittleEndian.Uint64(tc.exp[8:]))}, + } { + actual, exp := vs[0], vs[1] + if math.IsNaN(exp) { + require.True(t, math.IsNaN(actual)) + } else { + require.Equal(t, exp, actual) + } + } + case wazeroir.ShapeF32x4: + for _, vs := range [][2]float32{ + {math.Float32frombits(uint32(lo)), math.Float32frombits(binary.LittleEndian.Uint32(tc.exp[:4]))}, + {math.Float32frombits(uint32(lo >> 32)), math.Float32frombits(binary.LittleEndian.Uint32(tc.exp[4:8]))}, + {math.Float32frombits(uint32(hi)), math.Float32frombits(binary.LittleEndian.Uint32(tc.exp[8:12]))}, + {math.Float32frombits(uint32(hi >> 32)), math.Float32frombits(binary.LittleEndian.Uint32(tc.exp[12:]))}, + } { + actual, exp := vs[0], vs[1] + if math.IsNaN(float64(exp)) { + require.True(t, math.IsNaN(float64(actual))) + } else { + require.Equal(t, exp, actual) + } + } + default: + var actual [16]byte + binary.LittleEndian.PutUint64(actual[:8], lo) + binary.LittleEndian.PutUint64(actual[8:], hi) + require.Equal(t, tc.exp, actual) + } + }) + } +} + +func TestCompiler_compileV128AddSat(t *testing.T) { + if runtime.GOARCH != "amd64" { + // TODO: implement on amd64. 
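+		// Note: add_sat clamps instead of wrapping: signed i8 127+1 stays 127, and unsigned u8 255+1 stays 255.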
+ t.Skip() + } + + tests := []struct { + name string + shape wazeroir.Shape + signed bool + x1, x2, exp [16]byte + }{ + { + name: "i8x16s", + shape: wazeroir.ShapeI8x16, + signed: true, + x1: [16]byte{ + 0: i8ToU8(math.MaxInt8), + 5: i8ToU8(-1), + 15: i8ToU8(math.MinInt8), + }, + x2: [16]byte{ + 0: 1, + 5: 0, + 15: i8ToU8(-1), + }, + exp: [16]byte{ + 0: i8ToU8(math.MaxInt8), + 5: i8ToU8(-1), + 15: i8ToU8(math.MinInt8), + }, + }, + { + name: "i8x16u", + shape: wazeroir.ShapeI8x16, + signed: false, + x1: [16]byte{ + 0: i8ToU8(math.MaxInt8), + 5: 0, + 15: math.MaxUint8, + }, + x2: [16]byte{ + 0: 1, + 5: i8ToU8(-1), + 15: 1, + }, + exp: [16]byte{ + 0: i8ToU8(math.MaxInt8) + 1, + 5: i8ToU8(-1), + 15: math.MaxUint8, + }, + }, + { + name: "i16x8s", + shape: wazeroir.ShapeI16x8, + signed: true, + x1: i16x8(i16ToU16(math.MinInt16), 0, 123, 1, 1, 6, i16ToU16(-123), i16ToU16(math.MaxInt16)), + x2: i16x8(i16ToU16(-1), 123, i16ToU16(-123), 3, 1, 4, 5, 1), + exp: i16x8(i16ToU16(math.MinInt16), 123, 0, 4, 2, 10, i16ToU16(-118), i16ToU16(math.MaxInt16)), + }, + { + name: "i16x8u", + shape: wazeroir.ShapeI16x8, + signed: false, + x1: i16x8(1123, 0, 123, 1, 1, 6, i16ToU16(-123), math.MaxUint16), + x2: i16x8(0, 123, math.MaxUint16, 3, 1, 4, 0, 1), + exp: i16x8(1123, 123, math.MaxUint16, 4, 2, 10, i16ToU16(-123), math.MaxUint16), + }, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + env := newCompilerEnvironment() + compiler := env.requireNewCompiler(t, newCompiler, + &wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}}) + + err := compiler.compilePreamble() + require.NoError(t, err) + + err = compiler.compileV128Const(&wazeroir.OperationV128Const{ + Lo: binary.LittleEndian.Uint64(tc.x1[:8]), + Hi: binary.LittleEndian.Uint64(tc.x1[8:]), + }) + require.NoError(t, err) + + err = compiler.compileV128Const(&wazeroir.OperationV128Const{ + Lo: binary.LittleEndian.Uint64(tc.x2[:8]), + Hi: binary.LittleEndian.Uint64(tc.x2[8:]), + }) + require.NoError(t, err) + + err = compiler.compileV128AddSat(&wazeroir.OperationV128AddSat{Shape: tc.shape, Signed: tc.signed}) + require.NoError(t, err) + + require.Equal(t, uint64(2), compiler.runtimeValueLocationStack().sp) + require.Equal(t, 1, len(compiler.runtimeValueLocationStack().usedRegisters)) + + err = compiler.compileReturnFunction() + require.NoError(t, err) + + // Generate and run the code under test. + code, _, _, err := compiler.compile() + require.NoError(t, err) + env.exec(code) + + require.Equal(t, nativeCallStatusCodeReturned, env.callEngine().statusCode) + + lo, hi := env.stackTopAsV128() + var actual [16]byte + binary.LittleEndian.PutUint64(actual[:8], lo) + binary.LittleEndian.PutUint64(actual[8:], hi) + require.Equal(t, tc.exp, actual) + }) + } +} + +func TestCompiler_compileV128SubSat(t *testing.T) { + if runtime.GOARCH != "amd64" { + // TODO: implement on amd64. 
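+		// Note: sub_sat clamps as well: signed i8 -128-1 stays -128, and unsigned lanes floor at 0 (e.g. 0-1 = 0).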
+ t.Skip() + } + + tests := []struct { + name string + shape wazeroir.Shape + signed bool + x1, x2, exp [16]byte + }{ + { + name: "i8x16s", + shape: wazeroir.ShapeI8x16, + signed: true, + x1: [16]byte{ + 0: i8ToU8(math.MinInt8), + 5: i8ToU8(-1), + 15: i8ToU8(math.MaxInt8), + }, + x2: [16]byte{ + 0: 1, + 5: 0, + 15: i8ToU8(-1), + }, + exp: [16]byte{ + 0: i8ToU8(math.MinInt8), + 5: i8ToU8(-1), + 15: i8ToU8(math.MaxInt8), + }, + }, + { + name: "i8x16u", + shape: wazeroir.ShapeI8x16, + signed: false, + x1: [16]byte{ + 0: i8ToU8(math.MinInt8), + 5: i8ToU8(-1), + 15: 0, + }, + x2: [16]byte{ + 0: 1, + 5: 0, + 15: 1, + }, + exp: [16]byte{ + 0: i8ToU8(math.MinInt8) - 1, + 5: i8ToU8(-1), + 15: 0, + }, + }, + { + name: "i16x8s", + shape: wazeroir.ShapeI16x8, + signed: true, + x1: i16x8(i16ToU16(math.MinInt16), 0, 123, 1, 1, 6, i16ToU16(-123), i16ToU16(math.MaxInt16)), + x2: i16x8(1, 123, i16ToU16(-123), 3, 1, 4, 5, i16ToU16(-123)), + exp: i16x8(i16ToU16(math.MinInt16), i16ToU16(-123), 246, i16ToU16(-2), 0, 2, i16ToU16(-128), i16ToU16(math.MaxInt16)), + }, + { + name: "i16x8u", + shape: wazeroir.ShapeI16x8, + signed: false, + x1: i16x8(1123, 0, 123, 1, 1, 6, 200, math.MaxUint16), + x2: i16x8(0, 123, math.MaxUint16, 3, 1, 4, i16ToU16(-1), 12), + exp: i16x8(1123, 0, 0, 0, 0, 2, 0, math.MaxUint16-12), + }, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + env := newCompilerEnvironment() + compiler := env.requireNewCompiler(t, newCompiler, + &wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}}) + + err := compiler.compilePreamble() + require.NoError(t, err) + + err = compiler.compileV128Const(&wazeroir.OperationV128Const{ + Lo: binary.LittleEndian.Uint64(tc.x1[:8]), + Hi: binary.LittleEndian.Uint64(tc.x1[8:]), + }) + require.NoError(t, err) + + err = compiler.compileV128Const(&wazeroir.OperationV128Const{ + Lo: binary.LittleEndian.Uint64(tc.x2[:8]), + Hi: binary.LittleEndian.Uint64(tc.x2[8:]), + }) + require.NoError(t, err) + + err = compiler.compileV128SubSat(&wazeroir.OperationV128SubSat{Shape: tc.shape, Signed: tc.signed}) + require.NoError(t, err) + + require.Equal(t, uint64(2), compiler.runtimeValueLocationStack().sp) + require.Equal(t, 1, len(compiler.runtimeValueLocationStack().usedRegisters)) + + err = compiler.compileReturnFunction() + require.NoError(t, err) + + // Generate and run the code under test. + code, _, _, err := compiler.compile() + require.NoError(t, err) + env.exec(code) + + require.Equal(t, nativeCallStatusCodeReturned, env.callEngine().statusCode) + + lo, hi := env.stackTopAsV128() + var actual [16]byte + binary.LittleEndian.PutUint64(actual[:8], lo) + binary.LittleEndian.PutUint64(actual[8:], hi) + require.Equal(t, tc.exp, actual) + }) + } +} + +func TestCompiler_compileV128Popcnt(t *testing.T) { + if runtime.GOARCH != "amd64" { + // TODO: implement on amd64. 
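+		// Note: i8x16.popcnt replaces each byte with the count of its set bits, e.g. 0b11111111 -> 8.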
+		t.Skip()
+	}
+
+	tests := []struct {
+		name   string
+		v, exp [16]byte
+	}{
+		{
+			name: "ones",
+			v: [16]byte{
+				1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7,
+				0, 1 << 2, 0, 1 << 4, 0, 1 << 6, 0, 0,
+			},
+			exp: [16]byte{
+				1, 1, 1, 1, 1, 1, 1, 1,
+				0, 1, 0, 1, 0, 1, 0, 0,
+			},
+		},
+		{
+			name: "mix",
+			v: [16]byte{
+				0b1, 0b11, 0b111, 0b1111, 0b11111, 0b111111, 0b1111111, 0b11111111,
+				0b10000001, 0b10000010, 0b10000100, 0b10001000, 0b10010000, 0b10100000, 0b11000000, 0,
+			},
+			exp: [16]byte{
+				1, 2, 3, 4, 5, 6, 7, 8,
+				2, 2, 2, 2, 2, 2, 2, 0,
+			},
+		},
+	}
+
+	for _, tc := range tests {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			env := newCompilerEnvironment()
+			compiler := env.requireNewCompiler(t, newCompiler,
+				&wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}})
+
+			err := compiler.compilePreamble()
+			require.NoError(t, err)
+
+			err = compiler.compileV128Const(&wazeroir.OperationV128Const{
+				Lo: binary.LittleEndian.Uint64(tc.v[:8]),
+				Hi: binary.LittleEndian.Uint64(tc.v[8:]),
+			})
+			require.NoError(t, err)
+
+			err = compiler.compileV128Popcnt(&wazeroir.OperationV128Popcnt{})
+			require.NoError(t, err)
+
+			require.Equal(t, uint64(2), compiler.runtimeValueLocationStack().sp)
+			require.Equal(t, 1, len(compiler.runtimeValueLocationStack().usedRegisters))
+
+			err = compiler.compileReturnFunction()
+			require.NoError(t, err)
+
+			// Generate and run the code under test.
+			code, _, _, err := compiler.compile()
+			require.NoError(t, err)
+			env.exec(code)
+
+			require.Equal(t, nativeCallStatusCodeReturned, env.callEngine().statusCode)
+
+			lo, hi := env.stackTopAsV128()
+			var actual [16]byte
+			binary.LittleEndian.PutUint64(actual[:8], lo)
+			binary.LittleEndian.PutUint64(actual[8:], hi)
+			require.Equal(t, tc.exp, actual)
+		})
+	}
+}
+
+func TestCompiler_compileV128Round(t *testing.T) {
+	if runtime.GOARCH != "amd64" {
+		// TODO: implement on arm64.
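+		// Note: wasm's "nearest" rounds ties to even, unlike Go's math.Round, which is why the expectations below use moremath.WasmCompatNearestF32/F64.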
+ t.Skip() + } + + tests := []struct { + name string + shape wazeroir.Shape + kind wazeroir.OperationKind + v [16]byte + }{ + { + name: "f32 ceil", + shape: wazeroir.ShapeF32x4, + kind: wazeroir.OperationKindV128Ceil, + v: f32x4(1.4, -1.5, 1.5, float32(math.Inf(1))), + }, + { + name: "f32 ceil", + shape: wazeroir.ShapeF32x4, + kind: wazeroir.OperationKindV128Ceil, + v: f32x4(math.Pi, -1231231.123, float32(math.NaN()), float32(math.Inf(-1))), + }, + { + name: "f64 ceil", + shape: wazeroir.ShapeF64x2, + kind: wazeroir.OperationKindV128Ceil, + v: f64x2(1.231, -123.12313), + }, + { + name: "f64 ceil", + shape: wazeroir.ShapeF64x2, + kind: wazeroir.OperationKindV128Ceil, + v: f64x2(math.Inf(1), math.NaN()), + }, + { + name: "f64 ceil", + shape: wazeroir.ShapeF64x2, + kind: wazeroir.OperationKindV128Ceil, + v: f64x2(math.Inf(-1), math.Pi), + }, + { + name: "f32 floor", + shape: wazeroir.ShapeF32x4, + kind: wazeroir.OperationKindV128Floor, + v: f32x4(1.4, -1.5, 1.5, float32(math.Inf(1))), + }, + { + name: "f32 floor", + shape: wazeroir.ShapeF32x4, + kind: wazeroir.OperationKindV128Floor, + v: f32x4(math.Pi, -1231231.123, float32(math.NaN()), float32(math.Inf(-1))), + }, + { + name: "f64 floor", + shape: wazeroir.ShapeF64x2, + kind: wazeroir.OperationKindV128Floor, + v: f64x2(1.231, -123.12313), + }, + { + name: "f64 floor", + shape: wazeroir.ShapeF64x2, + kind: wazeroir.OperationKindV128Floor, + v: f64x2(math.Inf(1), math.NaN()), + }, + { + name: "f64 floor", + shape: wazeroir.ShapeF64x2, + kind: wazeroir.OperationKindV128Floor, + v: f64x2(math.Inf(-1), math.Pi), + }, + { + name: "f32 trunc", + shape: wazeroir.ShapeF32x4, + kind: wazeroir.OperationKindV128Trunc, + v: f32x4(1.4, -1.5, 1.5, float32(math.Inf(1))), + }, + { + name: "f32 trunc", + shape: wazeroir.ShapeF32x4, + kind: wazeroir.OperationKindV128Trunc, + v: f32x4(math.Pi, -1231231.123, float32(math.NaN()), float32(math.Inf(-1))), + }, + { + name: "f64 trunc", + shape: wazeroir.ShapeF64x2, + kind: wazeroir.OperationKindV128Trunc, + v: f64x2(1.231, -123.12313), + }, + { + name: "f64 trunc", + shape: wazeroir.ShapeF64x2, + kind: wazeroir.OperationKindV128Trunc, + v: f64x2(math.Inf(1), math.NaN()), + }, + { + name: "f64 trunc", + shape: wazeroir.ShapeF64x2, + kind: wazeroir.OperationKindV128Trunc, + v: f64x2(math.Inf(-1), math.Pi), + }, + { + name: "f32 nearest", + shape: wazeroir.ShapeF32x4, + kind: wazeroir.OperationKindV128Nearest, + v: f32x4(1.4, -1.5, 1.5, float32(math.Inf(1))), + }, + { + name: "f32 nearest", + shape: wazeroir.ShapeF32x4, + kind: wazeroir.OperationKindV128Nearest, + v: f32x4(math.Pi, -1231231.123, float32(math.NaN()), float32(math.Inf(-1))), + }, + { + name: "f64 nearest", + shape: wazeroir.ShapeF64x2, + kind: wazeroir.OperationKindV128Nearest, + v: f64x2(1.231, -123.12313), + }, + { + name: "f64 nearest", + shape: wazeroir.ShapeF64x2, + kind: wazeroir.OperationKindV128Nearest, + v: f64x2(math.Inf(1), math.NaN()), + }, + { + name: "f64 nearest", + shape: wazeroir.ShapeF64x2, + kind: wazeroir.OperationKindV128Nearest, + v: f64x2(math.Inf(-1), math.Pi), + }, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + env := newCompilerEnvironment() + compiler := env.requireNewCompiler(t, newCompiler, + &wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}}) + + err := compiler.compilePreamble() + require.NoError(t, err) + + err = compiler.compileV128Const(&wazeroir.OperationV128Const{ + Lo: binary.LittleEndian.Uint64(tc.v[:8]), + Hi: binary.LittleEndian.Uint64(tc.v[8:]), + 
}) + require.NoError(t, err) + + is32bit := tc.shape == wazeroir.ShapeF32x4 + switch tc.kind { + case wazeroir.OperationKindV128Ceil: + err = compiler.compileV128Ceil(&wazeroir.OperationV128Ceil{Shape: tc.shape}) + case wazeroir.OperationKindV128Floor: + err = compiler.compileV128Floor(&wazeroir.OperationV128Floor{Shape: tc.shape}) + case wazeroir.OperationKindV128Trunc: + err = compiler.compileV128Trunc(&wazeroir.OperationV128Trunc{Shape: tc.shape}) + case wazeroir.OperationKindV128Nearest: + err = compiler.compileV128Nearest(&wazeroir.OperationV128Nearest{Shape: tc.shape}) + } + require.NoError(t, err) + + require.Equal(t, uint64(2), compiler.runtimeValueLocationStack().sp) + require.Equal(t, 1, len(compiler.runtimeValueLocationStack().usedRegisters)) + + err = compiler.compileReturnFunction() + require.NoError(t, err) + + // Generate and run the code under test. + code, _, _, err := compiler.compile() + require.NoError(t, err) + env.exec(code) + + require.Equal(t, nativeCallStatusCodeReturned, env.callEngine().statusCode) + + lo, hi := env.stackTopAsV128() + + if is32bit { + actualFs := [4]float32{ + math.Float32frombits(uint32(lo)), + math.Float32frombits(uint32(lo >> 32)), + math.Float32frombits(uint32(hi)), + math.Float32frombits(uint32(hi >> 32))} + f1Original, f2Original, f3Original, f4Original := + math.Float32frombits(binary.LittleEndian.Uint32(tc.v[:4])), + math.Float32frombits(binary.LittleEndian.Uint32(tc.v[4:8])), + math.Float32frombits(binary.LittleEndian.Uint32(tc.v[8:12])), + math.Float32frombits(binary.LittleEndian.Uint32(tc.v[12:])) + + var expFs [4]float32 + switch tc.kind { + case wazeroir.OperationKindV128Ceil: + expFs[0] = float32(math.Ceil(float64(f1Original))) + expFs[1] = float32(math.Ceil(float64(f2Original))) + expFs[2] = float32(math.Ceil(float64(f3Original))) + expFs[3] = float32(math.Ceil(float64(f4Original))) + case wazeroir.OperationKindV128Floor: + expFs[0] = float32(math.Floor(float64(f1Original))) + expFs[1] = float32(math.Floor(float64(f2Original))) + expFs[2] = float32(math.Floor(float64(f3Original))) + expFs[3] = float32(math.Floor(float64(f4Original))) + case wazeroir.OperationKindV128Trunc: + expFs[0] = float32(math.Trunc(float64(f1Original))) + expFs[1] = float32(math.Trunc(float64(f2Original))) + expFs[2] = float32(math.Trunc(float64(f3Original))) + expFs[3] = float32(math.Trunc(float64(f4Original))) + case wazeroir.OperationKindV128Nearest: + expFs[0] = moremath.WasmCompatNearestF32(f1Original) + expFs[1] = moremath.WasmCompatNearestF32(f2Original) + expFs[2] = moremath.WasmCompatNearestF32(f3Original) + expFs[3] = moremath.WasmCompatNearestF32(f4Original) + } + + for i := range expFs { + exp, actual := expFs[i], actualFs[i] + if math.IsNaN(float64(exp)) { + require.True(t, math.IsNaN(float64(actual))) + } else { + require.Equal(t, exp, actual) + } + } + } else { + actualFs := [2]float64{math.Float64frombits(lo), math.Float64frombits(hi)} + f1Original, f2Original := + math.Float64frombits(binary.LittleEndian.Uint64(tc.v[:8])), math.Float64frombits(binary.LittleEndian.Uint64(tc.v[8:])) + + var expFs [2]float64 + switch tc.kind { + case wazeroir.OperationKindV128Ceil: + expFs[0] = math.Ceil(f1Original) + expFs[1] = math.Ceil(f2Original) + case wazeroir.OperationKindV128Floor: + expFs[0] = math.Floor(f1Original) + expFs[1] = math.Floor(f2Original) + case wazeroir.OperationKindV128Trunc: + expFs[0] = math.Trunc(f1Original) + expFs[1] = math.Trunc(f2Original) + case wazeroir.OperationKindV128Nearest: + expFs[0] = 
moremath.WasmCompatNearestF64(f1Original) + expFs[1] = moremath.WasmCompatNearestF64(f2Original) + } + + for i := range expFs { + exp, actual := expFs[i], actualFs[i] + if math.IsNaN(exp) { + require.True(t, math.IsNaN(actual)) + } else { + require.Equal(t, exp, actual) + } + } + } + }) + } +} + +func TestCompiler_compileV128_Pmax_Pmin(t *testing.T) { + if runtime.GOARCH != "amd64" { + // TODO: implement on amd64. + t.Skip() + } + + tests := []struct { + name string + shape wazeroir.Shape + kind wazeroir.OperationKind + x1, x2, exp [16]byte + }{ + { + name: "f32 pmin", + shape: wazeroir.ShapeF32x4, + kind: wazeroir.OperationKindV128Pmin, + x1: f32x4(float32(math.Inf(1)), -1.5, 1123.5, float32(math.Inf(1))), + x2: f32x4(1.4, float32(math.Inf(-1)), -1231.5, float32(math.Inf(1))), + exp: f32x4(1.4, float32(math.Inf(-1)), -1231.5, float32(math.Inf(1))), + }, + { + name: "f32 pmin", + shape: wazeroir.ShapeF32x4, + kind: wazeroir.OperationKindV128Pmin, + x1: f32x4(float32(math.NaN()), float32(math.NaN()), float32(math.NaN()), float32(math.NaN())), + x2: f32x4(1.4, -1.5, 1.5, float32(math.Inf(1))), + exp: f32x4(float32(math.NaN()), float32(math.NaN()), float32(math.NaN()), float32(math.NaN())), + }, + { + name: "f32 pmin", + shape: wazeroir.ShapeF32x4, + kind: wazeroir.OperationKindV128Pmin, + x1: f32x4(1.4, -1.5, 1.5, float32(math.Inf(1))), + x2: f32x4(float32(math.NaN()), float32(math.NaN()), float32(math.NaN()), float32(math.NaN())), + exp: f32x4(1.4, -1.5, 1.5, float32(math.Inf(1))), + }, + { + name: "f32 pmin", + shape: wazeroir.ShapeF32x4, + kind: wazeroir.OperationKindV128Pmin, + x1: f32x4(float32(math.Inf(1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(1))), + x2: f32x4(float32(math.NaN()), float32(math.NaN()), float32(math.NaN()), float32(math.NaN())), + exp: f32x4(float32(math.Inf(1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(1))), + }, + { + name: "f32 pmin", + shape: wazeroir.ShapeF32x4, + kind: wazeroir.OperationKindV128Pmin, + x1: f32x4(float32(math.NaN()), float32(math.NaN()), float32(math.NaN()), float32(math.NaN())), + x2: f32x4(float32(math.Inf(1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(1))), + exp: f32x4(float32(math.NaN()), float32(math.NaN()), float32(math.NaN()), float32(math.NaN())), + }, + { + name: "f64 pmin", + shape: wazeroir.ShapeF64x2, + kind: wazeroir.OperationKindV128Pmin, + x1: f64x2(math.Inf(1), -123123.1231), + x2: f64x2(-123123.1, math.Inf(-1)), + exp: f64x2(-123123.1, math.Inf(-1)), + }, + { + name: "f64 pmin", + shape: wazeroir.ShapeF64x2, + kind: wazeroir.OperationKindV128Pmin, + x1: f64x2(math.NaN(), math.NaN()), + x2: f64x2(-123123.1, 1.0), + exp: f64x2(math.NaN(), math.NaN()), + }, + { + name: "f64 pmin", + shape: wazeroir.ShapeF64x2, + kind: wazeroir.OperationKindV128Pmin, + x1: f64x2(-123123.1, 1.0), + x2: f64x2(math.NaN(), math.NaN()), + exp: f64x2(-123123.1, 1.0), + }, + { + name: "f64 pmin", + shape: wazeroir.ShapeF64x2, + kind: wazeroir.OperationKindV128Pmin, + x1: f64x2(math.NaN(), math.NaN()), + x2: f64x2(math.Inf(1), math.Inf(-1)), + exp: f64x2(math.NaN(), math.NaN()), + }, + { + name: "f64 pmin", + shape: wazeroir.ShapeF64x2, + kind: wazeroir.OperationKindV128Pmin, + x1: f64x2(math.Inf(1), math.Inf(-1)), + x2: f64x2(math.NaN(), math.NaN()), + exp: f64x2(math.Inf(1), math.Inf(-1)), + }, + { + name: "f32 pmax", + shape: wazeroir.ShapeF32x4, + kind: wazeroir.OperationKindV128Pmax, + x1: f32x4(float32(math.Inf(1)), -1.5, 1123.5, float32(math.Inf(1))), + x2: f32x4(1.4, 
float32(math.Inf(-1)), -1231.5, float32(math.Inf(1))), + exp: f32x4(float32(math.Inf(1)), -1.5, 1123.5, float32(math.Inf(1))), + }, + { + name: "f32 pmax", + shape: wazeroir.ShapeF32x4, + kind: wazeroir.OperationKindV128Pmax, + x1: f32x4(float32(math.NaN()), float32(math.NaN()), float32(math.NaN()), float32(math.NaN())), + x2: f32x4(1.4, -1.5, 1.5, float32(math.Inf(1))), + exp: f32x4(float32(math.NaN()), float32(math.NaN()), float32(math.NaN()), float32(math.NaN())), + }, + { + name: "f32 pmax", + shape: wazeroir.ShapeF32x4, + kind: wazeroir.OperationKindV128Pmax, + x1: f32x4(1.4, -1.5, 1.5, float32(math.Inf(1))), + x2: f32x4(float32(math.NaN()), float32(math.NaN()), float32(math.NaN()), float32(math.NaN())), + exp: f32x4(1.4, -1.5, 1.5, float32(math.Inf(1))), + }, + { + name: "f32 pmax", + shape: wazeroir.ShapeF32x4, + kind: wazeroir.OperationKindV128Pmax, + x1: f32x4(float32(math.Inf(1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(1))), + x2: f32x4(float32(math.NaN()), float32(math.NaN()), float32(math.NaN()), float32(math.NaN())), + exp: f32x4(float32(math.Inf(1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(1))), + }, + { + name: "f32 pmax", + shape: wazeroir.ShapeF32x4, + kind: wazeroir.OperationKindV128Pmax, + x1: f32x4(float32(math.NaN()), float32(math.NaN()), float32(math.NaN()), float32(math.NaN())), + x2: f32x4(float32(math.Inf(1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(1))), + exp: f32x4(float32(math.NaN()), float32(math.NaN()), float32(math.NaN()), float32(math.NaN())), + }, + { + name: "f64 pmax", + shape: wazeroir.ShapeF64x2, + kind: wazeroir.OperationKindV128Pmax, + x1: f64x2(math.Inf(1), -123123.1231), + x2: f64x2(-123123.1, math.Inf(-1)), + exp: f64x2(math.Inf(1), -123123.1231), + }, + { + name: "f64 pmax", + shape: wazeroir.ShapeF64x2, + kind: wazeroir.OperationKindV128Pmax, + x1: f64x2(math.NaN(), math.NaN()), + x2: f64x2(-123123.1, 1.0), + exp: f64x2(math.NaN(), math.NaN()), + }, + { + name: "f64 pmax", + shape: wazeroir.ShapeF64x2, + kind: wazeroir.OperationKindV128Pmax, + x1: f64x2(-123123.1, 1.0), + x2: f64x2(math.NaN(), math.NaN()), + exp: f64x2(-123123.1, 1.0), + }, + { + name: "f64 pmax", + shape: wazeroir.ShapeF64x2, + kind: wazeroir.OperationKindV128Pmax, + x1: f64x2(math.NaN(), math.NaN()), + x2: f64x2(math.Inf(1), math.Inf(-1)), + exp: f64x2(math.NaN(), math.NaN()), + }, + { + name: "f64 pmax", + shape: wazeroir.ShapeF64x2, + kind: wazeroir.OperationKindV128Pmax, + x1: f64x2(math.Inf(1), math.Inf(-1)), + x2: f64x2(math.NaN(), math.NaN()), + exp: f64x2(math.Inf(1), math.Inf(-1)), + }, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + env := newCompilerEnvironment() + compiler := env.requireNewCompiler(t, newCompiler, + &wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}}) + + err := compiler.compilePreamble() + require.NoError(t, err) + + err = compiler.compileV128Const(&wazeroir.OperationV128Const{ + Lo: binary.LittleEndian.Uint64(tc.x1[:8]), + Hi: binary.LittleEndian.Uint64(tc.x1[8:]), + }) + require.NoError(t, err) + + err = compiler.compileV128Const(&wazeroir.OperationV128Const{ + Lo: binary.LittleEndian.Uint64(tc.x2[:8]), + Hi: binary.LittleEndian.Uint64(tc.x2[8:]), + }) + require.NoError(t, err) + + is32bit := tc.shape == wazeroir.ShapeF32x4 + switch tc.kind { + case wazeroir.OperationKindV128Pmin: + err = compiler.compileV128Pmin(&wazeroir.OperationV128Pmin{Shape: tc.shape}) + case wazeroir.OperationKindV128Pmax: + err = 
compiler.compileV128Pmax(&wazeroir.OperationV128Pmax{Shape: tc.shape}) + } + require.NoError(t, err) + + require.Equal(t, uint64(2), compiler.runtimeValueLocationStack().sp) + require.Equal(t, 1, len(compiler.runtimeValueLocationStack().usedRegisters)) + + err = compiler.compileReturnFunction() + require.NoError(t, err) + + // Generate and run the code under test. + code, _, _, err := compiler.compile() + require.NoError(t, err) + env.exec(code) + + require.Equal(t, nativeCallStatusCodeReturned, env.callEngine().statusCode) + + lo, hi := env.stackTopAsV128() + + if is32bit { + actualFs := [4]float32{ + math.Float32frombits(uint32(lo)), + math.Float32frombits(uint32(lo >> 32)), + math.Float32frombits(uint32(hi)), + math.Float32frombits(uint32(hi >> 32))} + expFs := [4]float32{ + math.Float32frombits(binary.LittleEndian.Uint32(tc.exp[:4])), + math.Float32frombits(binary.LittleEndian.Uint32(tc.exp[4:8])), + math.Float32frombits(binary.LittleEndian.Uint32(tc.exp[8:12])), + math.Float32frombits(binary.LittleEndian.Uint32(tc.exp[12:])), + } + for i := range expFs { + exp, actual := expFs[i], actualFs[i] + if math.IsNaN(float64(exp)) { + require.True(t, math.IsNaN(float64(actual))) + } else { + require.Equal(t, exp, actual) + } + } + } else { + actualFs := [2]float64{ + math.Float64frombits(lo), math.Float64frombits(hi), + } + expFs := [2]float64{ + math.Float64frombits(binary.LittleEndian.Uint64(tc.exp[:8])), + math.Float64frombits(binary.LittleEndian.Uint64(tc.exp[8:])), + } + for i := range expFs { + exp, actual := expFs[i], actualFs[i] + if math.IsNaN(exp) { + require.True(t, math.IsNaN(actual)) + } else { + require.Equal(t, exp, actual) + } + } + } + }) + } +} + +func TestCompiler_compileV128ExtMul(t *testing.T) { + if runtime.GOARCH != "amd64" { + // TODO: implement on amd64. 
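+		// Note: extmul_{low,high} widens the chosen half of the input lanes and multiplies into lanes of twice the width, so the products cannot overflow.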
+ t.Skip() + } + + tests := []struct { + name string + shape wazeroir.Shape + signed, useLow bool + x1, x2, exp [16]byte + }{ + { + name: "i8x16s low", + shape: wazeroir.ShapeI8x16, + signed: true, + useLow: true, + x1: [16]byte{}, x2: [16]byte{}, + exp: i16x8(0, 0, 0, 0, 0, 0, 0, 0), + }, + { + name: "i8x16s low", + shape: wazeroir.ShapeI8x16, + signed: true, + useLow: true, + x1: [16]byte{ + 255, 255, 255, 255, 255, 255, 255, 255, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + x2: [16]byte{ + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + }, + exp: i16x8(128, 128, 128, 128, 128, 128, 128, 128), + }, + { + name: "i8x16s low", + shape: wazeroir.ShapeI8x16, + signed: true, + useLow: true, + x1: [16]byte{ + 255, 255, 255, 255, 255, 255, 255, 255, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + x2: [16]byte{ + 255, 255, 255, 255, 255, 255, 255, 255, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + exp: i16x8(1, 1, 1, 1, 1, 1, 1, 1), + }, + { + name: "i8x16s low", + shape: wazeroir.ShapeI8x16, + signed: true, + useLow: true, + x1: [16]byte{ + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + 0, 0, 0, 0, 0, 0, 0, 0, + }, + x2: [16]byte{ + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + 0, 0, 0, 0, 0, 0, 0, 0, + }, + exp: i16x8(16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384), + }, + { + name: "i8x16s hi", + shape: wazeroir.ShapeI8x16, + signed: true, + useLow: false, + x1: [16]byte{}, x2: [16]byte{}, + exp: i16x8(0, 0, 0, 0, 0, 0, 0, 0), + }, + { + name: "i8x16s hi", + shape: wazeroir.ShapeI8x16, + signed: true, + useLow: false, + x1: [16]byte{ + 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, + }, + x2: [16]byte{ + 0, 0, 0, 0, 0, 0, 0, 0, + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + }, + exp: i16x8(128, 128, 128, 128, 128, 128, 128, 128), + }, + { + name: "i8x16s hi", + shape: wazeroir.ShapeI8x16, + signed: true, + useLow: false, + x1: [16]byte{ + 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, + }, + x2: [16]byte{ + 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, + }, + exp: i16x8(1, 1, 1, 1, 1, 1, 1, 1), + }, + { + name: "i8x16s hi", + shape: wazeroir.ShapeI8x16, + signed: true, + useLow: false, + x1: [16]byte{ + 0, 0, 0, 0, 0, 0, 0, 0, + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + }, + x2: [16]byte{ + 0, 0, 0, 0, 0, 0, 0, 0, + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + }, + exp: i16x8(16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384), + }, + { + name: "i8x16u low", + shape: wazeroir.ShapeI8x16, + signed: false, + useLow: true, + x1: [16]byte{}, x2: [16]byte{}, + exp: i16x8(0, 0, 0, 0, 0, 0, 0, 0), + }, + { + name: "i8x16u low", + shape: wazeroir.ShapeI8x16, + signed: false, + useLow: true, + x1: [16]byte{ + 255, 255, 255, 255, 255, 255, 255, 255, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + x2: [16]byte{ + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + 0, 0, 0, 0, + }, + exp: i16x8(32640, 32640, 32640, 32640, 32640, 32640, 32640, 
32640), + }, + { + name: "i8x16u low", + shape: wazeroir.ShapeI8x16, + signed: false, + useLow: true, + x1: [16]byte{ + 255, 255, 255, 255, 255, 255, 255, 255, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + x2: [16]byte{ + 255, 255, 255, 255, 255, 255, 255, 255, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + exp: i16x8(i16ToU16(-511), i16ToU16(-511), i16ToU16(-511), i16ToU16(-511), + i16ToU16(-511), i16ToU16(-511), i16ToU16(-511), i16ToU16(-511)), + }, + { + name: "i8x16u low", + shape: wazeroir.ShapeI8x16, + signed: false, + useLow: true, + x1: [16]byte{ + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + 0, 0, 0, 0, 0, 0, 0, 0, + }, + x2: [16]byte{ + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + 0, 0, 0, 0, 0, 0, 0, 0, + }, + exp: i16x8(16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384), + }, + { + name: "i8x16u hi", + shape: wazeroir.ShapeI8x16, + signed: false, + useLow: false, + x1: [16]byte{}, x2: [16]byte{}, + exp: i16x8(0, 0, 0, 0, 0, 0, 0, 0), + }, + { + name: "i8x16u hi", + shape: wazeroir.ShapeI8x16, + signed: false, + useLow: false, + x1: [16]byte{ + 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, + }, + x2: [16]byte{ + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + 0, 0, 0, 0, + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + }, + exp: i16x8(32640, 32640, 32640, 32640, 32640, 32640, 32640, 32640), + }, + { + name: "i8x16u hi", + shape: wazeroir.ShapeI8x16, + signed: false, + useLow: false, + x1: [16]byte{ + 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, + }, + x2: [16]byte{ + 0, 0, 0, 0, 0, 0, 0, 0, + 255, 255, 255, 255, 255, 255, 255, 255, + }, + exp: i16x8(i16ToU16(-511), i16ToU16(-511), i16ToU16(-511), i16ToU16(-511), + i16ToU16(-511), i16ToU16(-511), i16ToU16(-511), i16ToU16(-511)), + }, + { + name: "i8x16u hi", + shape: wazeroir.ShapeI8x16, + signed: false, + useLow: false, + x1: [16]byte{ + 0, 0, 0, 0, 0, 0, 0, 0, + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + }, + x2: [16]byte{ + 0, 0, 0, 0, 0, 0, 0, 0, + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + }, + exp: i16x8(16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384), + }, + { + name: "i16x8s lo", + shape: wazeroir.ShapeI16x8, + signed: true, + useLow: true, + x1: [16]byte{}, + x2: [16]byte{}, + exp: [16]byte{}, + }, + { + name: "i16x8s lo", + shape: wazeroir.ShapeI16x8, + signed: true, + useLow: true, + x1: i16x8( + 16383, 16383, 16383, 16383, + 0, 0, 1, 0, + ), + x2: i16x8( + 16384, 16384, 16384, 16384, + 0, 0, 1, 0, + ), + exp: i32x4(268419072, 268419072, 268419072, 268419072), + }, + { + name: "i16x8s lo", + shape: wazeroir.ShapeI16x8, + signed: true, + useLow: true, + x1: i16x8( + i16ToU16(-32768), i16ToU16(-32768), i16ToU16(-32768), i16ToU16(-32768), + 0, 0, 1, 0, + ), + x2: i16x8( + i16ToU16(-32767), 0, i16ToU16(-32767), 0, + 0, 0, 1, 0, + ), + exp: i32x4(1073709056, 0, 1073709056, 0), + }, + { + name: "i16x8s lo", + shape: wazeroir.ShapeI16x8, + signed: true, + useLow: true, + x1: i16x8( + 65535, 65535, 65535, 65535, + 0, 0, 1, 0, + ), + x2: i16x8( + 65535, 0, 65535, 0, + 0, 0, 1, 0, + ), + exp: i32x4(1, 0, 1, 0), + }, + { + name: "i16x8s hi", + shape: wazeroir.ShapeI16x8, + signed: true, + useLow: false, + x1: [16]byte{}, + x2: [16]byte{}, + 
exp: [16]byte{}, + }, + { + name: "i16x8s hi", + shape: wazeroir.ShapeI16x8, + signed: true, + useLow: false, + x1: i16x8( + 0, 0, 1, 0, + 16383, 16383, 16383, 16383, + ), + x2: i16x8( + 0, 0, 1, 0, + 16384, 16384, 16384, 16384, + ), + exp: i32x4(268419072, 268419072, 268419072, 268419072), + }, + { + name: "i16x8s hi", + shape: wazeroir.ShapeI16x8, + signed: true, + useLow: false, + x1: i16x8( + 0, 0, 1, 0, + i16ToU16(-32768), i16ToU16(-32768), i16ToU16(-32768), i16ToU16(-32768), + ), + x2: i16x8( + 0, 0, 1, 0, + i16ToU16(-32767), 0, i16ToU16(-32767), 0, + ), + exp: i32x4(1073709056, 0, 1073709056, 0), + }, + { + name: "i16x8s hi", + shape: wazeroir.ShapeI16x8, + signed: true, + useLow: false, + x1: i16x8( + 0, 0, 1, 0, + 65535, 65535, 65535, 65535, + ), + x2: i16x8( + 0, 0, 1, 0, + + 65535, 0, 65535, 0, + ), + exp: i32x4(1, 0, 1, 0), + }, + { + name: "i16x8u lo", + shape: wazeroir.ShapeI16x8, + signed: false, + useLow: true, + x1: [16]byte{}, + x2: [16]byte{}, + exp: [16]byte{}, + }, + { + name: "i16x8u lo", + shape: wazeroir.ShapeI16x8, + signed: false, + useLow: true, + x1: i16x8( + 16383, 16383, 16383, 16383, + 0, 0, 1, 0, + ), + x2: i16x8( + 16384, 16384, 16384, 16384, + 0, 0, 1, 0, + ), + exp: i32x4(268419072, 268419072, 268419072, 268419072), + }, + { + name: "i16x8u lo", + shape: wazeroir.ShapeI16x8, + signed: false, + useLow: true, + x1: i16x8( + i16ToU16(-32768), i16ToU16(-32768), i16ToU16(-32768), i16ToU16(-32768), + 0, 0, 1, 0, + ), + x2: i16x8( + i16ToU16(-32767), 0, i16ToU16(-32767), 0, + 0, 0, 1, 0, + ), + exp: i32x4(1073774592, 0, 1073774592, 0), + }, + { + name: "i16x8u lo", + shape: wazeroir.ShapeI16x8, + signed: false, + useLow: true, + x1: i16x8( + 65535, 65535, 65535, 65535, + 0, 0, 1, 0, + ), + x2: i16x8( + 65535, 0, 65535, 0, + 0, 0, 1, 0, + ), + exp: i32x4(i32ToU32(-131071), 0, i32ToU32(-131071), 0), + }, + { + name: "i16x8u hi", + shape: wazeroir.ShapeI16x8, + signed: false, + useLow: false, + x1: [16]byte{}, + x2: [16]byte{}, + exp: [16]byte{}, + }, + { + name: "i16x8u hi", + shape: wazeroir.ShapeI16x8, + signed: false, + useLow: false, + x1: i16x8( + 0, 0, 1, 0, + 16383, 16383, 16383, 16383, + ), + x2: i16x8( + 0, 0, 1, 0, + 16384, 16384, 16384, 16384, + ), + exp: i32x4(268419072, 268419072, 268419072, 268419072), + }, + { + name: "i16x8u hi", + shape: wazeroir.ShapeI16x8, + signed: false, + useLow: false, + x1: i16x8( + 0, 0, 1, 0, + i16ToU16(-32768), i16ToU16(-32768), i16ToU16(-32768), i16ToU16(-32768), + ), + x2: i16x8( + 0, 0, 1, 0, + i16ToU16(-32767), 0, i16ToU16(-32767), 0, + ), + exp: i32x4(1073774592, 0, 1073774592, 0), + }, + { + name: "i16x8u hi", + shape: wazeroir.ShapeI16x8, + signed: false, + useLow: false, + x1: i16x8( + 0, 0, 1, 0, + 65535, 65535, 65535, 65535, + ), + x2: i16x8( + 0, 0, 1, 0, + 65535, 0, 65535, 0, + ), + exp: i32x4(i32ToU32(-131071), 0, i32ToU32(-131071), 0), + }, + { + name: "i32x4s lo", + shape: wazeroir.ShapeI32x4, + signed: true, + useLow: true, + x1: [16]byte{}, + x2: [16]byte{}, + exp: [16]byte{}, + }, + { + name: "i32x4s lo", + shape: wazeroir.ShapeI32x4, + signed: true, + useLow: true, + x1: i32x4( + 1, i32ToU32(-1), + 0, 0, + ), + x2: i32x4( + i32ToU32(-1), 1, + 0, 0, + ), + exp: i64x2(i64ToU64(-1), i64ToU64(-1)), + }, + { + name: "i32x4s lo", + shape: wazeroir.ShapeI32x4, + signed: true, + useLow: true, + x1: i32x4( + 1073741824, 4294967295, + 0, 0, + ), + x2: i32x4( + 1073741824, 4294967295, + 0, 0, + ), + exp: i64x2(1152921504606846976, 1), + }, + { + name: "i32x4s hi", + shape: wazeroir.ShapeI32x4, + signed: true, + 
useLow: false, + x1: [16]byte{}, + x2: [16]byte{}, + exp: [16]byte{}, + }, + { + name: "i32x4s hi", + shape: wazeroir.ShapeI32x4, + signed: true, + useLow: false, + x1: i32x4( + 0, 0, + 1, i32ToU32(-1), + ), + x2: i32x4( + 0, 0, + i32ToU32(-1), 1, + ), + exp: i64x2(i64ToU64(-1), i64ToU64(-1)), + }, + { + name: "i32x4s hi", + shape: wazeroir.ShapeI32x4, + signed: true, + useLow: false, + x1: i32x4( + 0, 0, + 1073741824, 4294967295, + ), + x2: i32x4( + 0, 0, + 1073741824, 4294967295, + ), + exp: i64x2(1152921504606846976, 1), + }, + { + name: "i32x4u lo", + shape: wazeroir.ShapeI32x4, + signed: false, + useLow: true, + x1: [16]byte{}, + x2: [16]byte{}, + exp: [16]byte{}, + }, + { + name: "i32x4u lo", + shape: wazeroir.ShapeI32x4, + signed: false, + useLow: true, + x1: i32x4( + 1, i32ToU32(-1), + 0, 0, + ), + x2: i32x4( + i32ToU32(-1), 1, + 0, 0, + ), + exp: i64x2(4294967295, 4294967295), + }, + { + name: "i32x4u lo", + shape: wazeroir.ShapeI32x4, + signed: false, + useLow: true, + x1: i32x4( + 1073741824, 4294967295, + 0, 0, + ), + x2: i32x4( + 1073741824, 4294967295, + 0, 0, + ), + exp: i64x2(1152921504606846976, i64ToU64(-8589934591)), + }, + { + name: "i32x4u hi", + shape: wazeroir.ShapeI32x4, + signed: false, + useLow: false, + x1: [16]byte{}, + x2: [16]byte{}, + exp: [16]byte{}, + }, + { + name: "i32x4u hi", + shape: wazeroir.ShapeI32x4, + signed: false, + useLow: false, + x1: i32x4( + 0, 0, + 1, i32ToU32(-1), + ), + x2: i32x4( + 0, 0, + i32ToU32(-1), 1, + ), + exp: i64x2(4294967295, 4294967295), + }, + { + name: "i32x4u hi", + shape: wazeroir.ShapeI32x4, + signed: false, + useLow: false, + x1: i32x4( + 0, 0, + 1073741824, 4294967295, + ), + x2: i32x4( + 0, 0, + 1073741824, 4294967295, + ), + exp: i64x2(1152921504606846976, i64ToU64(-8589934591)), + }, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + env := newCompilerEnvironment() + compiler := env.requireNewCompiler(t, newCompiler, + &wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}}) + + err := compiler.compilePreamble() + require.NoError(t, err) + + err = compiler.compileV128Const(&wazeroir.OperationV128Const{ + Lo: binary.LittleEndian.Uint64(tc.x1[:8]), + Hi: binary.LittleEndian.Uint64(tc.x1[8:]), + }) + require.NoError(t, err) + + err = compiler.compileV128Const(&wazeroir.OperationV128Const{ + Lo: binary.LittleEndian.Uint64(tc.x2[:8]), + Hi: binary.LittleEndian.Uint64(tc.x2[8:]), + }) + require.NoError(t, err) + + err = compiler.compileV128ExtMul(&wazeroir.OperationV128ExtMul{ + OriginShape: tc.shape, Signed: tc.signed, UseLow: tc.useLow, + }) + require.NoError(t, err) + + require.Equal(t, uint64(2), compiler.runtimeValueLocationStack().sp) + require.Equal(t, 1, len(compiler.runtimeValueLocationStack().usedRegisters)) + + err = compiler.compileReturnFunction() + require.NoError(t, err) + + // Generate and run the code under test. + code, _, _, err := compiler.compile() + require.NoError(t, err) + env.exec(code) + + require.Equal(t, nativeCallStatusCodeReturned, env.callEngine().statusCode) + + lo, hi := env.stackTopAsV128() + var actual [16]byte + binary.LittleEndian.PutUint64(actual[:8], lo) + binary.LittleEndian.PutUint64(actual[8:], hi) + require.Equal(t, tc.exp, actual) + }) + } +} + +func TestCompiler_compileV128Extend(t *testing.T) { + if runtime.GOARCH != "amd64" { + // TODO: implement on amd64. 
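+		// Note: extend_{low,high}_..._{s,u} takes half of the input lanes and sign- or zero-extends each one to twice its width.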
+ t.Skip() + } + + tests := []struct { + name string + shape wazeroir.Shape + signed, useLow bool + v, exp [16]byte + }{ + { + name: "i8x16s hi", + shape: wazeroir.ShapeI8x16, + signed: true, + useLow: false, + v: [16]byte{}, + exp: [16]byte{}, + }, + { + name: "i8x16s hi", + shape: wazeroir.ShapeI8x16, + signed: true, + useLow: false, + v: [16]byte{ + 0, 0, 0, 0, 0, 0, 0, 0, + i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), + }, + exp: i16x8(i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), i16ToU16(-1)), + }, + { + name: "i8x16s hi", + shape: wazeroir.ShapeI8x16, + signed: true, + useLow: false, + v: [16]byte{ + 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, + }, + exp: i16x8(1, 1, 1, 1, 1, 1, 1, 1), + }, + { + name: "i8x16s hi", + shape: wazeroir.ShapeI8x16, + signed: true, + useLow: false, + v: [16]byte{ + i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), + 0, 0, 0, 0, 0, 0, 0, 0, + }, + exp: i16x8(0, 0, 0, 0, 0, 0, 0, 0), + }, + { + name: "i8x16s lo", + shape: wazeroir.ShapeI8x16, + signed: true, + useLow: true, + v: [16]byte{}, + exp: [16]byte{}, + }, + { + name: "i8x16s lo", + shape: wazeroir.ShapeI8x16, + signed: true, + useLow: true, + v: [16]byte{ + i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), + 0, 0, 0, 0, 0, 0, 0, 0, + }, + exp: i16x8(i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), i16ToU16(-1)), + }, + { + name: "i8x16s lo", + shape: wazeroir.ShapeI8x16, + signed: true, + useLow: true, + v: [16]byte{ + 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + exp: i16x8(1, 1, 1, 1, 1, 1, 1, 1), + }, + { + name: "i8x16s lo", + shape: wazeroir.ShapeI8x16, + signed: true, + useLow: true, + v: [16]byte{ + 0, 0, 0, 0, 0, 0, 0, 0, + i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), + }, + exp: i16x8(0, 0, 0, 0, 0, 0, 0, 0), + }, + // unsigned + { + name: "i8x16u hi", + shape: wazeroir.ShapeI8x16, + signed: false, + useLow: false, + v: [16]byte{}, + exp: [16]byte{}, + }, + { + name: "i8x16u hi", + shape: wazeroir.ShapeI8x16, + signed: false, + useLow: false, + v: [16]byte{ + 0, 0, 0, 0, 0, 0, 0, 0, + i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), + }, + exp: i16x8(255, 255, 255, 255, 255, 255, 255, 255), + }, + { + name: "i8x16u hi", + shape: wazeroir.ShapeI8x16, + signed: false, + useLow: false, + v: [16]byte{ + 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, + }, + exp: i16x8(1, 1, 1, 1, 1, 1, 1, 1), + }, + { + name: "i8x16u hi", + shape: wazeroir.ShapeI8x16, + signed: false, + useLow: false, + v: [16]byte{ + i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), + 0, 0, 0, 0, 0, 0, 0, 0, + }, + exp: i16x8(0, 0, 0, 0, 0, 0, 0, 0), + }, + { + name: "i8x16u lo", + shape: wazeroir.ShapeI8x16, + signed: false, + useLow: true, + v: [16]byte{}, + exp: [16]byte{}, + }, + { + name: "i8x16u lo", + shape: wazeroir.ShapeI8x16, + signed: false, + useLow: true, + v: [16]byte{ + i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), + 0, 0, 0, 0, 0, 0, 0, 0, + }, + exp: i16x8(255, 255, 255, 255, 255, 255, 255, 255), + }, + { + name: "i8x16u lo", + shape: wazeroir.ShapeI8x16, + signed: false, + useLow: true, + v: [16]byte{ + 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + 
exp: i16x8(1, 1, 1, 1, 1, 1, 1, 1), + }, + { + name: "i8x16u lo", + shape: wazeroir.ShapeI8x16, + signed: false, + useLow: true, + v: [16]byte{ + 0, 0, 0, 0, 0, 0, 0, 0, + i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), + }, + exp: i16x8(0, 0, 0, 0, 0, 0, 0, 0), + }, + { + name: "i16x8s hi", + shape: wazeroir.ShapeI16x8, + signed: true, + useLow: false, + v: [16]byte{}, + exp: [16]byte{}, + }, + { + name: "i16x8s hi", + shape: wazeroir.ShapeI16x8, + signed: true, + useLow: false, + v: i16x8(1, 1, 1, 1, 0, 0, 0, 0), + exp: i32x4(0, 0, 0, 0), + }, + { + name: "i16x8s hi", + shape: wazeroir.ShapeI16x8, + signed: true, + useLow: false, + v: i16x8(0, 0, 0, 0, i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), i16ToU16(-1)), + exp: i32x4(i32ToU32(-1), i32ToU32(-1), i32ToU32(-1), i32ToU32(-1)), + }, + { + name: "i16x8s hi", + shape: wazeroir.ShapeI16x8, + signed: true, + useLow: false, + v: i16x8(0, 0, 0, 0, 123, 0, 123, 0), + exp: i32x4(123, 0, 123, 0), + }, + { + name: "i16x8s lo", + shape: wazeroir.ShapeI16x8, + signed: true, + useLow: true, + v: [16]byte{}, + exp: [16]byte{}, + }, + { + name: "i16x8s lo", + shape: wazeroir.ShapeI16x8, + signed: true, + useLow: true, + v: i16x8(0, 0, 0, 0, 1, 1, 1, 1), + exp: i32x4(0, 0, 0, 0), + }, + { + name: "i16x8s lo", + shape: wazeroir.ShapeI16x8, + signed: true, + useLow: true, + v: i16x8(i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), 0, 0, 0, 0), + exp: i32x4(i32ToU32(-1), i32ToU32(-1), i32ToU32(-1), i32ToU32(-1)), + }, + { + name: "i16x8s lo", + shape: wazeroir.ShapeI16x8, + signed: true, + useLow: true, + v: i16x8(123, 0, 123, 0, 0, 0, 0, 0), + exp: i32x4(123, 0, 123, 0), + }, + { + name: "i16x8u hi", + shape: wazeroir.ShapeI16x8, + signed: false, + useLow: false, + v: [16]byte{}, + exp: [16]byte{}, + }, + { + name: "i16x8u hi", + shape: wazeroir.ShapeI16x8, + signed: false, + useLow: false, + v: i16x8(1, 1, 1, 1, 0, 0, 0, 0), + exp: i32x4(0, 0, 0, 0), + }, + { + name: "i16x8u hi", + shape: wazeroir.ShapeI16x8, + signed: false, + useLow: false, + v: i16x8(0, 0, 0, 0, i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), i16ToU16(-1)), + exp: i32x4(65535, 65535, 65535, 65535), + }, + { + name: "i16x8u hi", + shape: wazeroir.ShapeI16x8, + signed: false, + useLow: false, + v: i16x8(0, 0, 0, 0, 123, 0, 123, 0), + exp: i32x4(123, 0, 123, 0), + }, + { + name: "i16x8u lo", + shape: wazeroir.ShapeI16x8, + signed: false, + useLow: true, + v: [16]byte{}, + exp: [16]byte{}, + }, + { + name: "i16x8u lo", + shape: wazeroir.ShapeI16x8, + signed: false, + useLow: true, + v: i16x8(0, 0, 0, 0, 1, 1, 1, 1), + exp: i32x4(0, 0, 0, 0), + }, + { + name: "i16x8u lo", + shape: wazeroir.ShapeI16x8, + signed: false, + useLow: true, + v: i16x8(i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), 0, 0, 0, 0), + exp: i32x4(65535, 65535, 65535, 65535), + }, + { + name: "i16x8u lo", + shape: wazeroir.ShapeI16x8, + signed: false, + useLow: true, + v: i16x8(123, 0, 123, 0, 0, 0, 0, 0), + exp: i32x4(123, 0, 123, 0), + }, + { + name: "i32x4s hi", + shape: wazeroir.ShapeI32x4, + signed: true, + useLow: false, + v: [16]byte{}, + exp: [16]byte{}, + }, + { + name: "i32x4s hi", + shape: wazeroir.ShapeI32x4, + signed: true, + useLow: false, + v: i32x4(0, 0, 1, i32ToU32(-1)), + exp: i64x2(1, i64ToU64(-1)), + }, + { + name: "i32x4s hi", + shape: wazeroir.ShapeI32x4, + signed: true, + useLow: false, + v: i32x4(1, i32ToU32(-1), 0, 0), + exp: i64x2(0, 0), + }, + { + name: "i32x4s hi", + shape: wazeroir.ShapeI32x4, + signed: true, + useLow: false, + v: 
i32x4(1, i32ToU32(-1), 123, 123),
+			exp: i64x2(123, 123),
+		},
+		{
+			name: "i32x4s lo",
+			shape: wazeroir.ShapeI32x4,
+			signed: true,
+			useLow: true,
+			v: [16]byte{},
+			exp: [16]byte{},
+		},
+		{
+			name: "i32x4s lo",
+			shape: wazeroir.ShapeI32x4,
+			signed: true,
+			useLow: true,
+			v: i32x4(1, i32ToU32(-1), 0, 0),
+			exp: i64x2(1, i64ToU64(-1)),
+		},
+		{
+			name: "i32x4s lo",
+			shape: wazeroir.ShapeI32x4,
+			signed: true,
+			useLow: true,
+			v: i32x4(0, 0, 1, i32ToU32(-1)),
+			exp: i64x2(0, 0),
+		},
+		{
+			name: "i32x4s lo",
+			shape: wazeroir.ShapeI32x4,
+			signed: true,
+			useLow: true,
+			v: i32x4(123, 123, 1, i32ToU32(-1)),
+			exp: i64x2(123, 123),
+		},
+		{
+			name: "i32x4u hi",
+			shape: wazeroir.ShapeI32x4,
+			signed: false,
+			useLow: false,
+			v: [16]byte{},
+			exp: [16]byte{},
+		},
+		{
+			name: "i32x4u hi",
+			shape: wazeroir.ShapeI32x4,
+			signed: false,
+			useLow: false,
+			v: i32x4(0, 0, 1, i32ToU32(-1)),
+			exp: i64x2(1, 4294967295),
+		},
+		{
+			name: "i32x4u hi",
+			shape: wazeroir.ShapeI32x4,
+			signed: false,
+			useLow: false,
+			v: i32x4(1, i32ToU32(-1), 0, 0),
+			exp: i64x2(0, 0),
+		},
+		{
+			name: "i32x4u hi",
+			shape: wazeroir.ShapeI32x4,
+			signed: false,
+			useLow: false,
+			v: i32x4(1, i32ToU32(-1), 123, 123),
+			exp: i64x2(123, 123),
+		},
+		{
+			name: "i32x4u lo",
+			shape: wazeroir.ShapeI32x4,
+			signed: false,
+			useLow: true,
+			v: [16]byte{},
+			exp: [16]byte{},
+		},
+		{
+			name: "i32x4u lo",
+			shape: wazeroir.ShapeI32x4,
+			signed: false,
+			useLow: true,
+			v: i32x4(1, i32ToU32(-1), 0, 0),
+			exp: i64x2(1, 4294967295),
+		},
+		{
+			name: "i32x4u lo",
+			shape: wazeroir.ShapeI32x4,
+			signed: false,
+			useLow: true,
+			v: i32x4(0, 0, 1, i32ToU32(-1)),
+			exp: i64x2(0, 0),
+		},
+		{
+			name: "i32x4u lo",
+			shape: wazeroir.ShapeI32x4,
+			signed: false,
+			useLow: true,
+			v: i32x4(123, 123, 1, i32ToU32(-1)),
+			exp: i64x2(123, 123),
+		},
+	}
+
+	for _, tc := range tests {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			env := newCompilerEnvironment()
+			compiler := env.requireNewCompiler(t, newCompiler,
+				&wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}})
+
+			err := compiler.compilePreamble()
+			require.NoError(t, err)
+
+			err = compiler.compileV128Const(&wazeroir.OperationV128Const{
+				Lo: binary.LittleEndian.Uint64(tc.v[:8]),
+				Hi: binary.LittleEndian.Uint64(tc.v[8:]),
+			})
+			require.NoError(t, err)
+
+			err = compiler.compileV128Extend(&wazeroir.OperationV128Extend{
+				OriginShape: tc.shape, Signed: tc.signed, UseLow: tc.useLow,
+			})
+			require.NoError(t, err)
+
+			require.Equal(t, uint64(2), compiler.runtimeValueLocationStack().sp)
+			require.Equal(t, 1, len(compiler.runtimeValueLocationStack().usedRegisters))
+
+			err = compiler.compileReturnFunction()
+			require.NoError(t, err)
+
+			// Generate and run the code under test.
+			code, _, _, err := compiler.compile()
+			require.NoError(t, err)
+			env.exec(code)
+
+			require.Equal(t, nativeCallStatusCodeReturned, env.callEngine().statusCode)
+
+			lo, hi := env.stackTopAsV128()
+			var actual [16]byte
+			binary.LittleEndian.PutUint64(actual[:8], lo)
+			binary.LittleEndian.PutUint64(actual[8:], hi)
+			require.Equal(t, tc.exp, actual)
+		})
+	}
+}
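The expected vectors in these extend tests follow directly from the WebAssembly `extend_{low,high}` semantics: one half of the input is widened lane by lane, with sign- or zero-extension. A minimal reference sketch of the i8x16-to-i16x8 case, independent of the compiler under test (the helper name and use of "encoding/binary" are illustrative, not part of the patch):

```go
// extendI8x16 widens one half of a 16-byte vector into eight 16-bit lanes.
// Sketch; assumes import "encoding/binary".
func extendI8x16(v [16]byte, signed, useLow bool) (res [16]byte) {
	half := v[8:]
	if useLow {
		half = v[:8]
	}
	for i := 0; i < 8; i++ {
		var w int16
		if signed {
			w = int16(int8(half[i])) // sign-extend
		} else {
			w = int16(half[i]) // zero-extend
		}
		binary.LittleEndian.PutUint16(res[i*2:], uint16(w))
	}
	return
}
```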
+
+func TestCompiler_compileV128Q15mulrSatS(t *testing.T) {
+	if runtime.GOARCH != "amd64" {
+		// TODO: implement on arm64.
+		t.Skip()
+	}
+
+	tests := []struct {
+		name        string
+		x1, x2, exp [16]byte
+	}{
+		{
+			name: "1",
+			x1: i16x8(0, 0, 0, 0, 0, 0, 0, 0),
+			x2: i16x8(0, 0, 0, 0, 0, 0, 0, 0),
+			exp: i16x8(0, 0, 0, 0, 0, 0, 0, 0),
+		},
+		{
+			name: "2",
+			x1: i16x8(0, 0, 0, 0, 0, 0, 0, 0),
+			x2: i16x8(1, 1, 1, 1, 1, 1, 1, 1),
+			exp: i16x8(0, 0, 0, 0, 0, 0, 0, 0),
+		},
+		{
+			name: "3",
+			x1: i16x8(1, 1, 1, 1, 1, 1, 1, 1),
+			x2: i16x8(1, 1, 1, 1, 1, 1, 1, 1),
+			exp: i16x8(0, 0, 0, 0, 0, 0, 0, 0),
+		},
+		{
+			name: "4",
+			x1: i16x8(65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535),
+			x2: i16x8(1, 1, 1, 1, 1, 1, 1, 1),
+			exp: i16x8(0, 0, 0, 0, 0, 0, 0, 0),
+		},
+		{
+			name: "5",
+			x1: i16x8(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767),
+			x2: i16x8(32767, 32767, 32767, 32767, 32767, 32767, 32767, 32767),
+			exp: i16x8(32766, 32766, 32766, 32766, 32766, 32766, 32766, 32766),
+		},
+	}
+
+	for _, tc := range tests {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			env := newCompilerEnvironment()
+			compiler := env.requireNewCompiler(t, newCompiler,
+				&wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}})
+
+			err := compiler.compilePreamble()
+			require.NoError(t, err)
+
+			err = compiler.compileV128Const(&wazeroir.OperationV128Const{
+				Lo: binary.LittleEndian.Uint64(tc.x1[:8]),
+				Hi: binary.LittleEndian.Uint64(tc.x1[8:]),
+			})
+			require.NoError(t, err)
+
+			err = compiler.compileV128Const(&wazeroir.OperationV128Const{
+				Lo: binary.LittleEndian.Uint64(tc.x2[:8]),
+				Hi: binary.LittleEndian.Uint64(tc.x2[8:]),
+			})
+			require.NoError(t, err)
+
+			err = compiler.compileV128Q15mulrSatS(&wazeroir.OperationV128Q15mulrSatS{})
+			require.NoError(t, err)
+
+			require.Equal(t, uint64(2), compiler.runtimeValueLocationStack().sp)
+			require.Equal(t, 1, len(compiler.runtimeValueLocationStack().usedRegisters))
+
+			err = compiler.compileReturnFunction()
+			require.NoError(t, err)
+
+			// Generate and run the code under test.
+			code, _, _, err := compiler.compile()
+			require.NoError(t, err)
+			env.exec(code)
+
+			require.Equal(t, nativeCallStatusCodeReturned, env.callEngine().statusCode)
+
+			lo, hi := env.stackTopAsV128()
+			var actual [16]byte
+			binary.LittleEndian.PutUint64(actual[:8], lo)
+			binary.LittleEndian.PutUint64(actual[8:], hi)
+			require.Equal(t, tc.exp, actual)
+		})
+	}
+}
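The expected values above follow from the spec's q15mulr_sat_s definition: multiply the signed 16-bit lanes, add the rounding constant 2^14, arithmetic-shift right by 15, and saturate to the int16 range (the only overflowing input pair is -32768 x -32768). For example, case 5 is (32767*32767 + 16384) >> 15 = 32766. Per-lane sketch (helper name illustrative):

```go
// q15MulrSatS returns sat16((x*y + 2^14) >> 15).
func q15MulrSatS(x, y int16) int16 {
	v := (int32(x)*int32(y) + (1 << 14)) >> 15
	if v > 32767 { // only reachable for x == y == -32768
		return 32767
	}
	return int16(v)
}
```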
+
+func TestCompiler_compileFloatPromote(t *testing.T) {
+	if runtime.GOARCH != "amd64" {
+		// TODO: implement on arm64.
+		t.Skip()
+	}
+
+	tests := []struct {
+		name   string
+		v, exp [16]byte
+	}{
+		{
+			name: "1",
+			v: f32x4(float32(0x1.8f867ep+125), float32(0x1.8f867ep+125), float32(0x1.8f867ep+125), float32(0x1.8f867ep+125)),
+			exp: f64x2(6.6382536710104395e+37, 6.6382536710104395e+37),
+		},
+		{
+			name: "2",
+			v: f32x4(float32(0x1.8f867ep+125), float32(0x1.8f867ep+125), 0, 0),
+			exp: f64x2(6.6382536710104395e+37, 6.6382536710104395e+37),
+		},
+		{
+			name: "3",
+			v: f32x4(0, 0, float32(0x1.8f867ep+125), float32(0x1.8f867ep+125)),
+			exp: f64x2(0, 0),
+		},
+		{
+			name: "4",
+			v: f32x4(float32(math.NaN()), float32(math.NaN()), float32(0x1.8f867ep+125), float32(0x1.8f867ep+125)),
+			exp: f64x2(math.NaN(), math.NaN()),
+		},
+		{
+			name: "5",
+			v: f32x4(float32(math.Inf(1)), float32(math.Inf(-1)), float32(0x1.8f867ep+125), float32(0x1.8f867ep+125)),
+			exp: f64x2(math.Inf(1), math.Inf(-1)),
+		},
+	}
+
+	for _, tc := range tests {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			env := newCompilerEnvironment()
+			compiler := env.requireNewCompiler(t, newCompiler,
+				&wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}})
+
+			err := compiler.compilePreamble()
+			require.NoError(t, err)
+
+			err = compiler.compileV128Const(&wazeroir.OperationV128Const{
+				Lo: binary.LittleEndian.Uint64(tc.v[:8]),
+				Hi: binary.LittleEndian.Uint64(tc.v[8:]),
+			})
+			require.NoError(t, err)
+
+			err = compiler.compileV128FloatPromote(&wazeroir.OperationV128FloatPromote{})
+			require.NoError(t, err)
+
+			require.Equal(t, uint64(2), compiler.runtimeValueLocationStack().sp)
+			require.Equal(t, 1, len(compiler.runtimeValueLocationStack().usedRegisters))
+
+			err = compiler.compileReturnFunction()
+			require.NoError(t, err)
+
+			// Generate and run the code under test.
+			code, _, _, err := compiler.compile()
+			require.NoError(t, err)
+			env.exec(code)
+
+			require.Equal(t, nativeCallStatusCodeReturned, env.callEngine().statusCode)
+
+			lo, hi := env.stackTopAsV128()
+			actualFs := [2]float64{
+				math.Float64frombits(lo), math.Float64frombits(hi),
+			}
+			expFs := [2]float64{
+				math.Float64frombits(binary.LittleEndian.Uint64(tc.exp[:8])),
+				math.Float64frombits(binary.LittleEndian.Uint64(tc.exp[8:])),
+			}
+			for i := range expFs {
+				exp, actual := expFs[i], actualFs[i]
+				if math.IsNaN(exp) {
+					require.True(t, math.IsNaN(actual))
+				} else {
+					require.Equal(t, exp, actual)
+				}
+			}
+		})
+	}
+}
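f64x2.promote_low_f32x4 reads only the two low f32 lanes and widens them; the tests above confirm that the high lanes are ignored (case 3) and that NaN and the infinities survive promotion (cases 4 and 5). Reference semantics as a sketch (helper name illustrative):

```go
// floatPromoteLow widens the two low f32 lanes into two f64 lanes.
// Sketch; assumes imports "encoding/binary" and "math".
func floatPromoteLow(v [16]byte) (res [16]byte) {
	for i := 0; i < 2; i++ {
		f := math.Float32frombits(binary.LittleEndian.Uint32(v[i*4:]))
		binary.LittleEndian.PutUint64(res[i*8:], math.Float64bits(float64(f)))
	}
	return
}
```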
+
+func TestCompiler_compileV128FloatDemote(t *testing.T) {
+	if runtime.GOARCH != "amd64" {
+		// TODO: implement on arm64.
+		t.Skip()
+	}
+
+	tests := []struct {
+		name   string
+		v, exp [16]byte
+	}{
+		{
+			name: "1",
+			v: f64x2(0, 0),
+			exp: f32x4(0, 0, 0, 0),
+		},
+		{
+			name: "2",
+			v: f64x2(0x1.fffffe0000000p-127, 0x1.fffffe0000000p-127),
+			exp: f32x4(0x1p-126, 0x1p-126, 0, 0),
+		},
+		{
+			name: "3",
+			v: f64x2(0x1.fffffep+127, 0x1.fffffep+127),
+			exp: f32x4(0x1.fffffep+127, 0x1.fffffep+127, 0, 0),
+		},
+		{
+			name: "4",
+			v: f64x2(math.NaN(), math.NaN()),
+			exp: f32x4(float32(math.NaN()), float32(math.NaN()), 0, 0),
+		},
+		{
+			name: "5",
+			v: f64x2(math.Inf(1), math.Inf(-1)),
+			exp: f32x4(float32(math.Inf(1)), float32(math.Inf(-1)), 0, 0),
+		},
+	}
+
+	for _, tc := range tests {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			env := newCompilerEnvironment()
+			compiler := env.requireNewCompiler(t, newCompiler,
+				&wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}})
+
+			err := compiler.compilePreamble()
+			require.NoError(t, err)
+
+			err = compiler.compileV128Const(&wazeroir.OperationV128Const{
+				Lo: binary.LittleEndian.Uint64(tc.v[:8]),
+				Hi: binary.LittleEndian.Uint64(tc.v[8:]),
+			})
+			require.NoError(t, err)
+
+			err = compiler.compileV128FloatDemote(&wazeroir.OperationV128FloatDemote{})
+			require.NoError(t, err)
+
+			require.Equal(t, uint64(2), compiler.runtimeValueLocationStack().sp)
+			require.Equal(t, 1, len(compiler.runtimeValueLocationStack().usedRegisters))
+
+			err = compiler.compileReturnFunction()
+			require.NoError(t, err)
+
+			// Generate and run the code under test.
+			code, _, _, err := compiler.compile()
+			require.NoError(t, err)
+			env.exec(code)
+
+			require.Equal(t, nativeCallStatusCodeReturned, env.callEngine().statusCode)
+
+			lo, hi := env.stackTopAsV128()
+			actualFs := [4]float32{
+				math.Float32frombits(uint32(lo)),
+				math.Float32frombits(uint32(lo >> 32)),
+				math.Float32frombits(uint32(hi)),
+				math.Float32frombits(uint32(hi >> 32)),
+			}
+			expFs := [4]float32{
+				math.Float32frombits(binary.LittleEndian.Uint32(tc.exp[:4])),
+				math.Float32frombits(binary.LittleEndian.Uint32(tc.exp[4:8])),
+				math.Float32frombits(binary.LittleEndian.Uint32(tc.exp[8:12])),
+				math.Float32frombits(binary.LittleEndian.Uint32(tc.exp[12:])),
+			}
+			for i := range expFs {
+				exp, actual := expFs[i], actualFs[i]
+				if math.IsNaN(float64(exp)) {
+					require.True(t, math.IsNaN(float64(actual)))
+				} else {
+					require.Equal(t, exp, actual)
+				}
+			}
+		})
+	}
+}
+
+func TestCompiler_compileV128ExtAddPairwise(t *testing.T) {
+	if runtime.GOARCH != "amd64" {
+		// TODO: implement on arm64.
+ t.Skip() + } + + tests := []struct { + name string + shape wazeroir.Shape + signed bool + v, exp [16]byte + }{ + { + name: "i8x16 s", + shape: wazeroir.ShapeI8x16, + signed: true, + v: [16]byte{}, + exp: [16]byte{}, + }, + { + name: "i8x16 s", + shape: wazeroir.ShapeI8x16, + signed: true, + v: [16]byte{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + exp: i16x8(2, 2, 2, 2, 2, 2, 2, 2), + }, + { + name: "i8x16 s", + shape: wazeroir.ShapeI8x16, + signed: true, + v: [16]byte{ + i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), + i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), + }, + exp: i16x8( + i16ToU16(-2), i16ToU16(-2), i16ToU16(-2), i16ToU16(-2), + i16ToU16(-2), i16ToU16(-2), i16ToU16(-2), i16ToU16(-2), + ), + }, + { + name: "i8x16 s", + shape: wazeroir.ShapeI8x16, + signed: true, + v: [16]byte{ + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + }, + exp: i16x8( + i16ToU16(-256), i16ToU16(-256), i16ToU16(-256), i16ToU16(-256), + i16ToU16(-256), i16ToU16(-256), i16ToU16(-256), i16ToU16(-256), + ), + }, + { + name: "i8x16 u", + shape: wazeroir.ShapeI8x16, + signed: false, + v: [16]byte{}, + exp: [16]byte{}, + }, + { + name: "i8x16 u", + shape: wazeroir.ShapeI8x16, + signed: false, + v: [16]byte{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + exp: i16x8(2, 2, 2, 2, 2, 2, 2, 2), + }, + { + name: "i8x16 u", + shape: wazeroir.ShapeI8x16, + signed: false, + v: [16]byte{ + i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), + i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), i8ToU8(-1), + }, + exp: i16x8(510, 510, 510, 510, 510, 510, 510, 510), + }, + { + name: "i8x16 u", + shape: wazeroir.ShapeI8x16, + signed: false, + v: [16]byte{ + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), i8ToU8(-128), + }, + exp: i16x8(256, 256, 256, 256, 256, 256, 256, 256), + }, + { + name: "i16x8 s", + shape: wazeroir.ShapeI16x8, + signed: true, + v: [16]byte{}, + exp: [16]byte{}, + }, + { + name: "i16x8 s", + shape: wazeroir.ShapeI16x8, + signed: true, + v: i16x8(1, 1, 1, 1, 1, 1, 1, 1), + exp: i32x4(2, 2, 2, 2), + }, + { + name: "i16x8 s", + shape: wazeroir.ShapeI16x8, + signed: true, + v: i16x8( + i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), + i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), + ), + exp: i32x4(i32ToU32(-2), i32ToU32(-2), i32ToU32(-2), i32ToU32(-2)), + }, + { + name: "i16x8 s", + shape: wazeroir.ShapeI16x8, + signed: true, + v: i16x8( + i16ToU16(-32768), i16ToU16(-32768), i16ToU16(-32768), i16ToU16(-32768), + i16ToU16(-32768), i16ToU16(-32768), i16ToU16(-32768), i16ToU16(-32768), + ), + exp: i32x4(i32ToU32(-65536), i32ToU32(-65536), i32ToU32(-65536), i32ToU32(-65536)), + }, + { + name: "i16x8 u", + shape: wazeroir.ShapeI16x8, + signed: false, + v: [16]byte{}, + exp: [16]byte{}, + }, + { + name: "i16x8 u", + shape: wazeroir.ShapeI16x8, + signed: false, + v: i16x8(1, 1, 1, 1, 1, 1, 1, 1), + exp: i32x4(2, 2, 2, 2), + }, + { + name: "i16x8 u", + shape: wazeroir.ShapeI16x8, + signed: false, + v: i16x8( + i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), + 
i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), i16ToU16(-1),
+			),
+			exp: i32x4(131070, 131070, 131070, 131070),
+		},
+		{
+			name: "i16x8 u",
+			shape: wazeroir.ShapeI16x8,
+			signed: false,
+			v: i16x8(
+				i16ToU16(-32768), i16ToU16(-32768), i16ToU16(-32768), i16ToU16(-32768),
+				i16ToU16(-32768), i16ToU16(-32768), i16ToU16(-32768), i16ToU16(-32768),
+			),
+			exp: i32x4(65536, 65536, 65536, 65536),
+		},
+	}
+
+	for _, tc := range tests {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			env := newCompilerEnvironment()
+			compiler := env.requireNewCompiler(t, newCompiler,
+				&wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}})
+
+			err := compiler.compilePreamble()
+			require.NoError(t, err)
+
+			err = compiler.compileV128Const(&wazeroir.OperationV128Const{
+				Lo: binary.LittleEndian.Uint64(tc.v[:8]),
+				Hi: binary.LittleEndian.Uint64(tc.v[8:]),
+			})
+			require.NoError(t, err)
+
+			err = compiler.compileV128ExtAddPairwise(&wazeroir.OperationV128ExtAddPairwise{
+				OriginShape: tc.shape, Signed: tc.signed,
+			})
+			require.NoError(t, err)
+
+			require.Equal(t, uint64(2), compiler.runtimeValueLocationStack().sp)
+			require.Equal(t, 1, len(compiler.runtimeValueLocationStack().usedRegisters))
+
+			err = compiler.compileReturnFunction()
+			require.NoError(t, err)
+
+			// Generate and run the code under test.
+			code, _, _, err := compiler.compile()
+			require.NoError(t, err)
+			env.exec(code)
+
+			require.Equal(t, nativeCallStatusCodeReturned, env.callEngine().statusCode)
+
+			lo, hi := env.stackTopAsV128()
+			var actual [16]byte
+			binary.LittleEndian.PutUint64(actual[:8], lo)
+			binary.LittleEndian.PutUint64(actual[8:], hi)
+			require.Equal(t, tc.exp, actual)
+		})
+	}
+}
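Each expected vector above is the pairwise sum of adjacent lanes, extended to the doubled lane width: two 0xff bytes give -1 + -1 = -2 signed, but 255 + 255 = 510 unsigned, which is exactly what the cases encode. A sketch of the i8x16-to-i16x8 case (helper name illustrative):

```go
// extAddPairwiseI8x16 sums adjacent byte lanes into eight 16-bit lanes.
// Sketch; assumes import "encoding/binary".
func extAddPairwiseI8x16(v [16]byte, signed bool) (res [16]byte) {
	for i := 0; i < 8; i++ {
		var sum int16
		for _, b := range v[i*2 : i*2+2] {
			if signed {
				sum += int16(int8(b))
			} else {
				sum += int16(b)
			}
		}
		binary.LittleEndian.PutUint16(res[i*2:], uint16(sum))
	}
	return
}
```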
+
+func TestCompiler_compileV128Narrow(t *testing.T) {
+	if runtime.GOARCH != "amd64" {
+		// TODO: implement on arm64.
+		t.Skip()
+	}
+	tests := []struct {
+		name        string
+		shape       wazeroir.Shape
+		signed      bool
+		x1, x2, exp [16]byte
+	}{
+		{
+			name: "i16x8 s",
+			shape: wazeroir.ShapeI16x8,
+			signed: true,
+			x1: i16x8(0, 0, 0, 0, 0, 0, 0, 0),
+			x2: i16x8(0, 0, 0, 0, 0, 0, 0, 0),
+			exp: [16]byte{},
+		},
+		{
+			name: "i16x8 s",
+			shape: wazeroir.ShapeI16x8,
+			signed: true,
+			x1: i16x8(0, 0, 0, 0, 0, 0, 0, 0),
+			x2: i16x8(1, 1, 1, 1, 1, 1, 1, 1),
+			exp: [16]byte{
+				0, 0, 0, 0, 0, 0, 0, 0,
+				1, 1, 1, 1, 1, 1, 1, 1,
+			},
+		},
+		{
+			name: "i16x8 s",
+			shape: wazeroir.ShapeI16x8,
+			signed: true,
+			x1: i16x8(1, 1, 1, 1, 1, 1, 1, 1),
+			x2: i16x8(0, 0, 0, 0, 0, 0, 0, 0),
+			exp: [16]byte{
+				1, 1, 1, 1, 1, 1, 1, 1,
+				0, 0, 0, 0, 0, 0, 0, 0,
+			},
+		},
+		{
+			name: "i16x8 s",
+			shape: wazeroir.ShapeI16x8,
+			signed: true,
+			x1: i16x8(i16ToU16(-0x8000), 0, i16ToU16(-0x8000), 0, i16ToU16(-0x8000), 0, i16ToU16(-0x8000), 0),
+			x2: i16x8(0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff),
+			exp: [16]byte{
+				0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+			},
+		},
+		{
+			name: "i16x8 s",
+			shape: wazeroir.ShapeI16x8,
+			signed: true,
+			x1: i16x8(0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff),
+			x2: i16x8(i16ToU16(-0x8000), 0, i16ToU16(-0x8000), 0, i16ToU16(-0x8000), 0, i16ToU16(-0x8000), 0),
+			exp: [16]byte{
+				0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+				0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+			},
+		},
+		// unsigned
+		{
+			name: "i16x8 u",
+			shape: wazeroir.ShapeI16x8,
+			signed: false,
+			x1: i16x8(0, 0, 0, 0, 0, 0, 0, 0),
+			x2: i16x8(0, 0, 0, 0, 0, 0, 0, 0),
+			exp: [16]byte{},
+		},
+		{
+			name: "i16x8 u",
+			shape: wazeroir.ShapeI16x8,
+			signed: false,
+			x1: i16x8(0, 0, 0, 0, 0, 0, 0, 0),
+			x2: i16x8(1, 1, 1, 1, 1, 1, 1, 1),
+			exp: [16]byte{
+				0, 0, 0, 0, 0, 0, 0, 0,
+				1, 1, 1, 1, 1, 1, 1, 1,
+			},
+		},
+		{
+			name: "i16x8 u",
+			shape: wazeroir.ShapeI16x8,
+			signed: false,
+			x1: i16x8(1, 1, 1, 1, 1, 1, 1, 1),
+			x2: i16x8(0, 0, 0, 0, 0, 0, 0, 0),
+			exp: [16]byte{
+				1, 1, 1, 1, 1, 1, 1, 1,
+				0, 0, 0, 0, 0, 0, 0, 0,
+			},
+		},
+		{
+			name: "i16x8 u",
+			shape: wazeroir.ShapeI16x8,
+			signed: false,
+			x1: i16x8(i16ToU16(-0x8000), 0, i16ToU16(-0x8000), 0, i16ToU16(-0x8000), 0, i16ToU16(-0x8000), 0),
+			x2: i16x8(0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff),
+			exp: [16]byte{
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			},
+		},
+		{
+			name: "i16x8 u",
+			shape: wazeroir.ShapeI16x8,
+			signed: false,
+			x1: i16x8(0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff),
+			x2: i16x8(i16ToU16(-0x8000), 0, i16ToU16(-0x8000), 0, i16ToU16(-0x8000), 0, i16ToU16(-0x8000), 0),
+			exp: [16]byte{
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			},
+		},
+		{
+			name: "i16x8 u",
+			shape: wazeroir.ShapeI16x8,
+			signed: false,
+			x1: i16x8(i16ToU16(-1), 0, i16ToU16(-1), 0, i16ToU16(-1), 0, i16ToU16(-1), 0),
+			x2: i16x8(0, 0x100, 0, 0x100, 0, 0x100, 0, 0x100),
+			exp: [16]byte{
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff,
+			},
+		},
+		{
+			name: "i16x8 u",
+			shape: wazeroir.ShapeI16x8,
+			signed: false,
+			x1: i16x8(0, 0x100, 0, 0x100, 0, 0x100, 0, 0x100),
+			x2: i16x8(i16ToU16(-1), 0, i16ToU16(-1), 0, i16ToU16(-1), 0, i16ToU16(-1), 0),
+			exp: [16]byte{
+				0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			},
+		},
+		{
+			name: "i32x4 s",
+			shape: wazeroir.ShapeI32x4,
+			signed: true,
+			x1: i32x4(0, 0, 0, 0),
+			x2: i32x4(0, 0, 0, 0),
+			exp: i16x8(0, 0, 0, 0, 0, 0, 0, 0),
+		},
+		{
+			name: "i32x4 s",
+			shape: wazeroir.ShapeI32x4,
+			signed: true,
+			x1: i32x4(0, 0, 0, 0),
+			x2: i32x4(1, 1, 1, 1),
+			exp: i16x8(0, 0, 0, 0, 1, 1, 1, 1),
+		},
+		{
+			name: "i32x4 s",
+			shape: wazeroir.ShapeI32x4,
+			signed: true,
+			x1: i32x4(1, 1, 1, 1),
+			x2: i32x4(0, 0, 0, 0),
+			exp: i16x8(1, 1, 1, 1, 0, 0, 0, 0),
+		},
+		{
+			name: "i32x4 s",
+			shape: wazeroir.ShapeI32x4,
+			signed: true,
+			x1: i32x4(0x8000, 0x8000, 0x7fff, 0x7fff),
+			x2: i32x4(0x7fff, 0x7fff, 0x8000, 0x8000),
+			exp: i16x8(0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff),
+		},
+		{
+			name: "i32x4 u",
+			shape: wazeroir.ShapeI32x4,
+			signed: false,
+			x1: i32x4(0, 0, 0, 0),
+			x2: i32x4(0, 0, 0, 0),
+			exp: i16x8(0, 0, 0, 0, 0, 0, 0, 0),
+		},
+		{
+			name: "i32x4 u",
+			shape: wazeroir.ShapeI32x4,
+			signed: false,
+			x1: i32x4(0, 0, 0, 0),
+			x2: i32x4(1, 1, 1, 1),
+			exp: i16x8(0, 0, 0, 0, 1, 1, 1, 1),
+		},
+		{
+			name: "i32x4 u",
+			shape: wazeroir.ShapeI32x4,
+			signed: false,
+			x1: i32x4(1, 1, 1, 1),
+			x2: i32x4(0, 0, 0, 0),
+			exp: i16x8(1, 1, 1, 1, 0, 0, 0, 0),
+		},
+		{
+			name: "i32x4 u",
+			shape: wazeroir.ShapeI32x4,
+			signed: false,
+			x1: i32x4(0x8000, 0x8000, 0x7fff, 0x7fff),
+			x2: i32x4(0x7fff, 0x7fff, 0x8000, 0x8000),
+			exp: i16x8(0x8000, 0x8000, 0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x8000, 0x8000),
+		},
+	}
+
+	for _, tc := range tests {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			env := newCompilerEnvironment()
+			compiler := env.requireNewCompiler(t, newCompiler,
+				&wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}})
+
+			err := compiler.compilePreamble()
+			require.NoError(t, err)
+
+			err = compiler.compileV128Const(&wazeroir.OperationV128Const{
+				Lo: binary.LittleEndian.Uint64(tc.x1[:8]),
+				Hi: binary.LittleEndian.Uint64(tc.x1[8:]),
+			})
+			require.NoError(t, err)
+
+			err = compiler.compileV128Const(&wazeroir.OperationV128Const{
+				Lo: binary.LittleEndian.Uint64(tc.x2[:8]),
+				Hi: binary.LittleEndian.Uint64(tc.x2[8:]),
+			})
+			require.NoError(t, err)
+
+			err = compiler.compileV128Narrow(&wazeroir.OperationV128Narrow{
+				OriginShape: tc.shape, Signed: tc.signed,
+			})
+			require.NoError(t, err)
+
+			require.Equal(t, uint64(2), compiler.runtimeValueLocationStack().sp)
+			require.Equal(t, 1, len(compiler.runtimeValueLocationStack().usedRegisters))
+
+			err = compiler.compileReturnFunction()
+			require.NoError(t, err)
+
+			// Generate and run the code under test.
+			code, _, _, err := compiler.compile()
+			require.NoError(t, err)
+			env.exec(code)
+
+			require.Equal(t, nativeCallStatusCodeReturned, env.callEngine().statusCode)
+
+			lo, hi := env.stackTopAsV128()
+			var actual [16]byte
+			binary.LittleEndian.PutUint64(actual[:8], lo)
+			binary.LittleEndian.PutUint64(actual[8:], hi)
+			require.Equal(t, tc.exp, actual)
+		})
+	}
+}
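Narrow concatenates the saturated lanes of x1 (low half of the result) and x2 (high half), which is what the PACK* family provides; note that the unsigned form saturates negative inputs to zero, as the 0x8000/0xffff cases above show. Per-lane sketch for the i16x8-to-i8x16 case (helper name illustrative):

```go
// narrowI16x8 packs two i16x8 vectors into one i8x16 with saturation.
// Sketch; assumes import "encoding/binary".
func narrowI16x8(x1, x2 [16]byte, signed bool) (res [16]byte) {
	sat := func(v int16) byte {
		lo, hi := int16(0), int16(255) // unsigned target range
		if signed {
			lo, hi = -128, 127
		}
		if v < lo {
			v = lo
		} else if v > hi {
			v = hi
		}
		return byte(v)
	}
	for i := 0; i < 8; i++ {
		res[i] = sat(int16(binary.LittleEndian.Uint16(x1[i*2:])))
		res[i+8] = sat(int16(binary.LittleEndian.Uint16(x2[i*2:])))
	}
	return
}
```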
+
+func TestCompiler_compileV128FConvertFromI(t *testing.T) {
+	if runtime.GOARCH != "amd64" {
+		// TODO: implement on arm64.
+		t.Skip()
+	}
+
+	tests := []struct {
+		name      string
+		destShape wazeroir.Shape
+		signed    bool
+		v, exp    [16]byte
+	}{
+		{
+			name: "f32x4 s",
+			destShape: wazeroir.ShapeF32x4,
+			signed: true,
+			v: i32x4(0, 0, 0, 0),
+			exp: f32x4(0, 0, 0, 0),
+		},
+		{
+			name: "f32x4 s",
+			destShape: wazeroir.ShapeF32x4,
+			signed: true,
+			v: i32x4(1, 0, 2, 3),
+			exp: f32x4(1, 0, 2.0, 3),
+		},
+		{
+			name: "f32x4 s",
+			destShape: wazeroir.ShapeF32x4,
+			signed: true,
+			v: i32x4(1234567890, i32ToU32(-2147483648.0), 2147483647, 1234567890),
+			exp: f32x4(0x1.26580cp+30, -2147483648.0, 2147483647, 0x1.26580cp+30),
+		},
+		{
+			name: "f32x4 u",
+			destShape: wazeroir.ShapeF32x4,
+			signed: false,
+			v: i32x4(0, 0, 0, 0),
+			exp: f32x4(0, 0, 0, 0),
+		},
+		{
+			name: "f32x4 u",
+			destShape: wazeroir.ShapeF32x4,
+			signed: false,
+			v: i32x4(1, 0, 2, 3),
+			exp: f32x4(1, 0, 2.0, 3),
+		},
+		{
+			name: "f32x4 u",
+			destShape: wazeroir.ShapeF32x4,
+			signed: false,
+			v: i32x4(2147483647, i32ToU32(-2147483648.0), 2147483647, i32ToU32(-1)),
+			exp: f32x4(2147483648.0, 2147483648.0, 2147483648.0, 4294967295.0),
+		},
+		{
+			name: "f64x2 s",
+			destShape: wazeroir.ShapeF64x2,
+			signed: true,
+			v: i32x4(0, 0, 0, 0),
+			exp: f64x2(0, 0),
+		},
+		{
+			name: "f64x2 s",
+			destShape: wazeroir.ShapeF64x2,
+			signed: true,
+			v: i32x4(0, 0, i32ToU32(-1), i32ToU32(-32)),
+			exp: f64x2(0, 0),
+		},
+		{
+			name: "f64x2 s",
+			destShape: wazeroir.ShapeF64x2,
+			signed: true,
+			v: i32x4(2147483647, i32ToU32(-2147483648), 0, 0),
+			exp: f64x2(2147483647, -2147483648),
+		},
+		{
+			name: "f64x2 u",
+			destShape: wazeroir.ShapeF64x2,
+			signed: false,
+			v: i32x4(0, 0, 0, 0),
+			exp: f64x2(0, 0),
+		},
+		{
+			name: "f64x2 u",
+			destShape: wazeroir.ShapeF64x2,
+			signed: false,
+			v: i32x4(0, 0, i32ToU32(-1), i32ToU32(-32)),
+			exp: f64x2(0, 0),
+		},
+		{
+			name: "f64x2 u",
+			destShape: wazeroir.ShapeF64x2,
+			signed: false,
+			v: i32x4(2147483647, i32ToU32(-2147483648), 0, 0),
+			exp: f64x2(2147483647, 2147483648),
+		},
+	}
+
+	for _, tc := range tests {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			env := newCompilerEnvironment()
+			compiler := env.requireNewCompiler(t, newCompiler,
+				&wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}})
+
+			err := compiler.compilePreamble()
+			require.NoError(t, err)
+
+			err = compiler.compileV128Const(&wazeroir.OperationV128Const{
+				Lo: binary.LittleEndian.Uint64(tc.v[:8]),
+				Hi: binary.LittleEndian.Uint64(tc.v[8:]),
+			})
+			require.NoError(t, err)
+
+			err = compiler.compileV128FConvertFromI(&wazeroir.OperationV128FConvertFromI{
+				DestinationShape: tc.destShape,
+				Signed: tc.signed,
+			})
+			require.NoError(t, err)
+
+			require.Equal(t, uint64(2), compiler.runtimeValueLocationStack().sp)
+			require.Equal(t, 1, len(compiler.runtimeValueLocationStack().usedRegisters))
+
+			err = compiler.compileReturnFunction()
+			require.NoError(t, err)
+
+			// Generate and run the code under test.
+			code, _, _, err := compiler.compile()
+			require.NoError(t, err)
+			env.exec(code)
+
+			require.Equal(t, nativeCallStatusCodeReturned, env.callEngine().statusCode)
+
+			lo, hi := env.stackTopAsV128()
+			var actual [16]byte
+			binary.LittleEndian.PutUint64(actual[:8], lo)
+			binary.LittleEndian.PutUint64(actual[8:], hi)
+			require.Equal(t, tc.exp, actual)
+		})
+	}
+}
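The f32x4 form converts one float per integer lane, while the f64x2 form converts only the two low i32 lanes (the -1/-32 lanes are ignored in the cases above); the signed/unsigned split is visible in how the bit pattern of -2147483648 converts. Sketch of the f64x2 path (helper name illustrative):

```go
// fConvertLowI32x4ToF64x2 converts the two low i32 lanes to f64 lanes.
// Sketch; assumes imports "encoding/binary" and "math".
func fConvertLowI32x4ToF64x2(v [16]byte, signed bool) (res [16]byte) {
	for i := 0; i < 2; i++ {
		u := binary.LittleEndian.Uint32(v[i*4:])
		f := float64(u)
		if signed {
			f = float64(int32(u))
		}
		binary.LittleEndian.PutUint64(res[i*8:], math.Float64bits(f))
	}
	return
}
```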
+
+func TestCompiler_compileV128Dot(t *testing.T) {
+	if runtime.GOARCH != "amd64" {
+		// TODO: implement on arm64.
+		t.Skip()
+	}
+
+	tests := []struct {
+		name        string
+		x1, x2, exp [16]byte
+	}{
+		{
+			name: "1",
+			x1: i16x8(0, 0, 0, 0, 0, 0, 0, 0),
+			x2: i16x8(0, 0, 0, 0, 0, 0, 0, 0),
+			exp: i32x4(0, 0, 0, 0),
+		},
+		{
+			name: "2",
+			x1: i16x8(1, 1, 1, 1, i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), i16ToU16(-1)),
+			x2: i16x8(i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), i16ToU16(-1), 2, 2, 2, 2),
+			exp: i32x4(i32ToU32(-2), i32ToU32(-2), i32ToU32(-4), i32ToU32(-4)),
+		},
+		{
+			name: "3",
+			x1: i16x8(65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535),
+			x2: i16x8(65535, 65535, 65535, 65535, 65535, 65535, 65535, 65535),
+			exp: i32x4(2, 2, 2, 2),
+		},
+	}
+
+	for _, tc := range tests {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			env := newCompilerEnvironment()
+			compiler := env.requireNewCompiler(t, newCompiler,
+				&wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}})
+
+			err := compiler.compilePreamble()
+			require.NoError(t, err)
+
+			err = compiler.compileV128Const(&wazeroir.OperationV128Const{
+				Lo: binary.LittleEndian.Uint64(tc.x2[:8]),
+				Hi: binary.LittleEndian.Uint64(tc.x2[8:]),
+			})
+			require.NoError(t, err)
+
+			err = compiler.compileV128Const(&wazeroir.OperationV128Const{
+				Lo: binary.LittleEndian.Uint64(tc.x1[:8]),
+				Hi: binary.LittleEndian.Uint64(tc.x1[8:]),
+			})
+			require.NoError(t, err)
+
+			err = compiler.compileV128Dot(&wazeroir.OperationV128Dot{})
+			require.NoError(t, err)
+
+			require.Equal(t, uint64(2), compiler.runtimeValueLocationStack().sp)
+			require.Equal(t, 1, len(compiler.runtimeValueLocationStack().usedRegisters))
+
+			err = compiler.compileReturnFunction()
+			require.NoError(t, err)
+
+			// Generate and run the code under test.
+			code, _, _, err := compiler.compile()
+			require.NoError(t, err)
+			env.exec(code)
+
+			require.Equal(t, nativeCallStatusCodeReturned, env.callEngine().statusCode)
+
+			lo, hi := env.stackTopAsV128()
+			var actual [16]byte
+			binary.LittleEndian.PutUint64(actual[:8], lo)
+			binary.LittleEndian.PutUint64(actual[8:], hi)
+			require.Equal(t, tc.exp, actual)
+		})
+	}
+}
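i32x4.dot_i16x8_s is exactly PMADDWD: multiply corresponding signed 16-bit lanes and add each adjacent pair of 32-bit products. That is why eight 0xffff lanes (-1 as i16) squared and pair-summed give 2 per lane in case 3. Per-lane sketch (helper name illustrative):

```go
// dotI16x8S computes i32x4.dot_i16x8_s over two 16-byte vectors.
// Sketch; assumes import "encoding/binary".
func dotI16x8S(x1, x2 [16]byte) (res [16]byte) {
	for i := 0; i < 4; i++ {
		var sum int32
		for j := 0; j < 2; j++ {
			a := int16(binary.LittleEndian.Uint16(x1[(i*2+j)*2:]))
			b := int16(binary.LittleEndian.Uint16(x2[(i*2+j)*2:]))
			sum += int32(a) * int32(b)
		}
		binary.LittleEndian.PutUint32(res[i*4:], uint32(sum))
	}
	return
}
```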
+
+func TestCompiler_compileV128ITruncSatFromF(t *testing.T) {
+	if runtime.GOARCH != "amd64" {
+		// TODO: implement on arm64.
+		t.Skip()
+	}
+
+	tests := []struct {
+		name        string
+		originShape wazeroir.Shape
+		signed      bool
+		v, exp      [16]byte
+	}{
+		{
+			name: "f32x4 s",
+			originShape: wazeroir.ShapeF32x4,
+			signed: true,
+			v: i32x4(0, 0, 0, 0),
+			exp: f32x4(0, 0, 0, 0),
+		},
+		{
+			name: "f32x4 s",
+			originShape: wazeroir.ShapeF32x4,
+			signed: true,
+			v: f32x4(1.5, -1.9, -1.9, 1.5),
+			exp: i32x4(1, i32ToU32(-1), i32ToU32(-1), 1),
+		},
+		{
+			name: "f32x4 s",
+			originShape: wazeroir.ShapeF32x4,
+			signed: true,
+			v: f32x4(float32(math.NaN()), -4294967294.0, float32(math.Inf(-1)), float32(math.Inf(1))),
+			exp: i32x4(0, i32ToU32(-2147483648), i32ToU32(-2147483648), 2147483647),
+		},
+		{
+			name: "f32x4 u",
+			originShape: wazeroir.ShapeF32x4,
+			signed: false,
+			v: i32x4(0, 0, 0, 0),
+			exp: f32x4(0, 0, 0, 0),
+		},
+		{
+			name: "f32x4 u",
+			originShape: wazeroir.ShapeF32x4,
+			signed: false,
+			v: f32x4(1.5, -1.9, -1.9, 1.5),
+			exp: i32x4(1, 0, 0, 1),
+		},
+		{
+			name: "f32x4 u",
+			originShape: wazeroir.ShapeF32x4,
+			signed: false,
+			v: f32x4(float32(math.NaN()), -4294967294.0, 4294967294.0, float32(math.Inf(1))),
+			exp: i32x4(0, 0, 4294967295, 4294967295),
+		},
+		{
+			name: "f64x2 s",
+			originShape: wazeroir.ShapeF64x2,
+			signed: true,
+			v: f64x2(0, 0),
+			exp: i32x4(0, 0, 0, 0),
+		},
+		{
+			name: "f64x2 s",
+			originShape: wazeroir.ShapeF64x2,
+			signed: true,
+			v: f64x2(5.123, -2.0),
+			exp: i32x4(5, i32ToU32(-2), 0, 0),
+		},
+		{
+			name: "f64x2 s",
+			originShape: wazeroir.ShapeF64x2,
+			signed: true,
+			v: f64x2(math.NaN(), math.Inf(1)),
+			exp: i32x4(0, 2147483647, 0, 0),
+		},
+		{
+			name: "f64x2 s",
+			originShape: wazeroir.ShapeF64x2,
+			signed: true,
+			v: f64x2(math.Inf(-1), 4294967294.0),
+			exp: i32x4(i32ToU32(-2147483648), 2147483647, 0, 0),
+		},
+		{
+			name: "f64x2 u",
+			originShape: wazeroir.ShapeF64x2,
+			signed: false,
+			v: f64x2(0, 0),
+			exp: i32x4(0, 0, 0, 0),
+		},
+		{
+			name: "f64x2 u",
+			originShape: wazeroir.ShapeF64x2,
+			signed: false,
+			v: f64x2(5.123, -2.0),
+			exp: i32x4(5, 0, 0, 0),
+		},
+		{
+			name: "f64x2 u",
+			originShape: wazeroir.ShapeF64x2,
+			signed: false,
+			v: f64x2(math.NaN(), math.Inf(1)),
+			exp: i32x4(0, 4294967295, 0, 0),
+		},
+		{
+			name: "f64x2 u",
+			originShape: wazeroir.ShapeF64x2,
+			signed: false,
+			v: f64x2(math.Inf(-1), 4294967296.0),
+			exp: i32x4(0, 4294967295, 0, 0),
+		},
+	}
+
+	for _, tc := range tests {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			env := newCompilerEnvironment()
+			compiler := env.requireNewCompiler(t, newCompiler,
+				&wazeroir.CompilationResult{HasMemory: true, Signature: &wasm.FunctionType{}})
+
+			err := compiler.compilePreamble()
+			require.NoError(t, err)
+
+			err = compiler.compileV128Const(&wazeroir.OperationV128Const{
+				Lo: binary.LittleEndian.Uint64(tc.v[:8]),
+				Hi: binary.LittleEndian.Uint64(tc.v[8:]),
+			})
+			require.NoError(t, err)
+
+			err = compiler.compileV128ITruncSatFromF(&wazeroir.OperationV128ITruncSatFromF{
+				OriginShape: tc.originShape,
+				Signed: tc.signed,
+			})
+			require.NoError(t, err)
+
+			require.Equal(t, uint64(2), compiler.runtimeValueLocationStack().sp)
+			require.Equal(t, 1, len(compiler.runtimeValueLocationStack().usedRegisters))
+
+			err = compiler.compileReturnFunction()
+			require.NoError(t, err)
+
+			// Generate and run the code under test.
+ code, _, _, err := compiler.compile() + require.NoError(t, err) + env.exec(code) + + require.Equal(t, nativeCallStatusCodeReturned, env.callEngine().statusCode) + + lo, hi := env.stackTopAsV128() + var actual [16]byte + binary.LittleEndian.PutUint64(actual[:8], lo) + binary.LittleEndian.PutUint64(actual[8:], hi) + require.Equal(t, tc.exp, actual) + }) + } +} diff --git a/internal/engine/compiler/engine.go b/internal/engine/compiler/engine.go index fbf7c07809d..e68c5da181f 100644 --- a/internal/engine/compiler/engine.go +++ b/internal/engine/compiler/engine.go @@ -148,14 +148,14 @@ type ( // Where we store the status code of Compiler execution. statusCode nativeCallStatusCode - // Set when statusCode == compilerStatusCallBuiltInFunction} + // Set when statusCode == compilerStatusCallBuiltInFunction // Indicating the function call index. builtinFunctionCallIndex wasm.Index } // callFrame holds the information to which the caller function can return. // callFrame is created for currently executed function frame as well, - // so some of the fields are not yet set when native code is currently executing it. + // so some fields are not yet set when native code is currently executing it. // That is, callFrameTop().returnAddress or returnStackBasePointer are not set // until it makes a function call. callFrame struct { @@ -187,8 +187,8 @@ type ( parent *code } - // code corresponds to a function in a module (not insantaited one). This holds the machine code - // compiled by Wazero's compiler. + // code corresponds to a function in a module (not instantiated one). This holds the machine code + // compiled by wazero compiler. code struct { // codeSegment is holding the compiled native code as a byte slice. codeSegment []byte @@ -203,7 +203,7 @@ type ( sourceModule *wasm.Module } - // staticData holds the read-only data (i.e. out side of codeSegment which is marked as executable) per function. + // staticData holds the read-only data (i.e. outside codeSegment which is marked as executable) per function. // This is used to store jump tables for br_table instructions. // The primary index is the logical separation of multiple data, for example data[0] and data[1] // correspond to different jump tables for different br_table instructions. @@ -250,7 +250,7 @@ const ( callEngineValueStackContextStackBasePointerOffset = 120 // Offsets for callEngine exitContext. - callEngineExitContextnativeCallStatusCodeOffset = 128 + callEngineExitContextNativeCallStatusCodeOffset = 128 callEngineExitContextBuiltinFunctionCallAddressOffset = 132 // Offsets for callFrame. @@ -294,10 +294,10 @@ const ( // https://github.com/golang/go/blob/release-branch.go1.17/src/runtime/runtime2.go#L207-L210 interfaceDataOffset = 8 - // Consts for DataInstance. + // Consts for wasm.DataInstance. dataInstanceStructSize = 24 - // Consts for ElementInstance. + // Consts for wasm.ElementInstance. elementInstanceStructSize = 32 // pointerSizeLog2 satisfies: 1 << pointerSizeLog2 = sizeOf(uintptr) @@ -330,7 +330,7 @@ const ( nativeCallStatusIntegerDivisionByZero ) -// causePanic causes a panic with the corresponding error to the status code. +// causePanic causes a panic with the corresponding error to the nativeCallStatusCode. 
func (s nativeCallStatusCode) causePanic() { var err error switch s { @@ -495,9 +495,9 @@ func (e *engine) NewModuleEngine(name string, module *wasm.Module, importedFunct return me, wasm.ErrElementOffsetOutOfBounds } - for i, funcindex := range init.FunctionIndexes { - if funcindex != nil { - references[init.Offset+uint32(i)] = uintptr(unsafe.Pointer(me.functions[*funcindex])) + for i, funcIdx := range init.FunctionIndexes { + if funcIdx != nil { + references[init.Offset+uint32(i)] = uintptr(unsafe.Pointer(me.functions[*funcIdx])) } } } @@ -820,7 +820,7 @@ func (ce *callEngine) builtinFunctionMemoryGrow(ctx context.Context, mem *wasm.M func (ce *callEngine) builtinFunctionTableGrow(ctx context.Context, tables []*wasm.TableInstance) { tableIndex := ce.popValue() - table := tables[tableIndex] // verifed not to be out of range by the func validation at compilation phase. + table := tables[tableIndex] // verified not to be out of range by the func validation at compilation phase. num := ce.popValue() ref := ce.popValue() res := table.Grow(ctx, uint32(num), uintptr(ref)) @@ -1094,6 +1094,60 @@ func compileWasmFunction(_ wasm.Features, ir *wazeroir.CompilationResult) (*code err = compiler.compileV128Shl(o) case *wazeroir.OperationV128Cmp: err = compiler.compileV128Cmp(o) + case *wazeroir.OperationV128AddSat: + err = compiler.compileV128AddSat(o) + case *wazeroir.OperationV128SubSat: + err = compiler.compileV128SubSat(o) + case *wazeroir.OperationV128Mul: + err = compiler.compileV128Mul(o) + case *wazeroir.OperationV128Div: + err = compiler.compileV128Div(o) + case *wazeroir.OperationV128Neg: + err = compiler.compileV128Neg(o) + case *wazeroir.OperationV128Sqrt: + err = compiler.compileV128Sqrt(o) + case *wazeroir.OperationV128Abs: + err = compiler.compileV128Abs(o) + case *wazeroir.OperationV128Popcnt: + err = compiler.compileV128Popcnt(o) + case *wazeroir.OperationV128Min: + err = compiler.compileV128Min(o) + case *wazeroir.OperationV128Max: + err = compiler.compileV128Max(o) + case *wazeroir.OperationV128AvgrU: + err = compiler.compileV128AvgrU(o) + case *wazeroir.OperationV128Pmin: + err = compiler.compileV128Pmin(o) + case *wazeroir.OperationV128Pmax: + err = compiler.compileV128Pmax(o) + case *wazeroir.OperationV128Ceil: + err = compiler.compileV128Ceil(o) + case *wazeroir.OperationV128Floor: + err = compiler.compileV128Floor(o) + case *wazeroir.OperationV128Trunc: + err = compiler.compileV128Trunc(o) + case *wazeroir.OperationV128Nearest: + err = compiler.compileV128Nearest(o) + case *wazeroir.OperationV128Extend: + err = compiler.compileV128Extend(o) + case *wazeroir.OperationV128ExtMul: + err = compiler.compileV128ExtMul(o) + case *wazeroir.OperationV128Q15mulrSatS: + err = compiler.compileV128Q15mulrSatS(o) + case *wazeroir.OperationV128ExtAddPairwise: + err = compiler.compileV128ExtAddPairwise(o) + case *wazeroir.OperationV128FloatPromote: + err = compiler.compileV128FloatPromote(o) + case *wazeroir.OperationV128FloatDemote: + err = compiler.compileV128FloatDemote(o) + case *wazeroir.OperationV128FConvertFromI: + err = compiler.compileV128FConvertFromI(o) + case *wazeroir.OperationV128Dot: + err = compiler.compileV128Dot(o) + case *wazeroir.OperationV128Narrow: + err = compiler.compileV128Narrow(o) + case *wazeroir.OperationV128ITruncSatFromF: + err = compiler.compileV128ITruncSatFromF(o) default: err = errors.New("unsupported") } diff --git a/internal/engine/compiler/engine_test.go b/internal/engine/compiler/engine_test.go index 6ea3bfa0db4..2cf04e0da30 100644 --- 
a/internal/engine/compiler/engine_test.go +++ b/internal/engine/compiler/engine_test.go @@ -46,7 +46,7 @@ func TestCompiler_VerifyOffsetValue(t *testing.T) { require.Equal(t, int(unsafe.Offsetof(ce.stackBasePointer)), callEngineValueStackContextStackBasePointerOffset) // Offsets for callEngine.exitContext. - require.Equal(t, int(unsafe.Offsetof(ce.statusCode)), callEngineExitContextnativeCallStatusCodeOffset) + require.Equal(t, int(unsafe.Offsetof(ce.statusCode)), callEngineExitContextNativeCallStatusCodeOffset) require.Equal(t, int(unsafe.Offsetof(ce.builtinFunctionCallIndex)), callEngineExitContextBuiltinFunctionCallAddressOffset) // Size and offsets for callFrame. diff --git a/internal/engine/compiler/impl_amd64.go b/internal/engine/compiler/impl_amd64.go index c0c97968266..81cd9cb93dc 100644 --- a/internal/engine/compiler/impl_amd64.go +++ b/internal/engine/compiler/impl_amd64.go @@ -1814,33 +1814,7 @@ func (c *amd64Compiler) compileTrunc(o *wazeroir.OperationTrunc) error { // compileNearest implements compiler.compileNearest for the amd64 architecture. func (c *amd64Compiler) compileNearest(o *wazeroir.OperationNearest) error { - // Internally, nearest can be performed via ROUND instruction with 0x00 mode. - // If we compile the following Wat by "wasmtime wasm2obj", - // - // (module - // (func (export "nearest_f32") (param $x f32) (result f32) (f32.nearest (local.get $x))) - // (func (export "nearest_f64") (param $x f64) (result f64) (f64.nearest (local.get $x))) - // ) - // - // we see a disassemble of the object via "objdump --disassemble-all" like: - // - // 0000000000000000 <_wasm_function_0>: - // 0: 55 push %rbp - // 1: 48 89 e5 mov %rsp,%rbp - // 4: 66 0f 3a 0a c0 00 roundss $0x0,%xmm0,%xmm0 - // a: 48 89 ec mov %rbp,%rsp - // d: 5d pop %rbp - // e: c3 retq - // - // 000000000000000f <_wasm_function_1>: - // f: 55 push %rbp - // 10: 48 89 e5 mov %rsp,%rbp - // 13: 66 0f 3a 0b c0 00 roundsd $0x0,%xmm0,%xmm0 - // 19: 48 89 ec mov %rbp,%rsp - // 1c: 5d pop %rbp - // 1d: c3 retq - // - // Below, we use the same implementation: "rounds{s,d} $0x0,%xmm0,%xmm0" where the mode is set to zero. + // Nearest can be performed via ROUND instruction with 0x00 mode. return c.compileRoundInstruction(o.Type == wazeroir.Float32, 0x00) } @@ -1886,7 +1860,7 @@ func (c *amd64Compiler) compileMax(o *wazeroir.OperationMax) error { // Native min/max instructions return non-NaN value if exactly one of target values // is NaN. For example native_{min,max}(5.0, NaN) returns always 5.0, not NaN. // However, WebAssembly specifies that min/max must always return NaN if one of values is NaN. -// Therefore in this function, we have to add conditional jumps to check if one of values is NaN before +// Therefore, in this function, we have to add conditional jumps to check if one of values is NaN before // the native min/max, which is why we cannot simply emit a native min/max instruction here. // // For the semantics, see wazeroir.Min and wazeroir.Max for detail. @@ -4675,7 +4649,7 @@ func (c *amd64Compiler) compileReleaseRegisterToStack(loc *runtimeValueLocation) } func (c *amd64Compiler) compileExitFromNativeCode(status nativeCallStatusCode) { - c.assembler.CompileConstToMemory(amd64.MOVB, int64(status), amd64ReservedRegisterForCallEngine, callEngineExitContextnativeCallStatusCodeOffset) + c.assembler.CompileConstToMemory(amd64.MOVB, int64(status), amd64ReservedRegisterForCallEngine, callEngineExitContextNativeCallStatusCodeOffset) // Write back the cached SP to the actual eng.stackPointer. 
c.assembler.CompileConstToMemory(amd64.MOVQ, int64(c.locationStack.sp), amd64ReservedRegisterForCallEngine, callEngineValueStackContextStackPointerOffset) diff --git a/internal/engine/compiler/impl_arm64.go b/internal/engine/compiler/impl_arm64.go index 9ffb4ebf7e9..91fd60cd363 100644 --- a/internal/engine/compiler/impl_arm64.go +++ b/internal/engine/compiler/impl_arm64.go @@ -396,10 +396,10 @@ func (c *arm64Compiler) compileExitFromNativeCode(status nativeCallStatusCode) { if status != 0 { c.assembler.CompileConstToRegister(arm64.MOVW, int64(status), arm64ReservedRegisterForTemporary) - c.assembler.CompileRegisterToMemory(arm64.MOVWU, arm64ReservedRegisterForTemporary, arm64ReservedRegisterForCallEngine, callEngineExitContextnativeCallStatusCodeOffset) + c.assembler.CompileRegisterToMemory(arm64.MOVWU, arm64ReservedRegisterForTemporary, arm64ReservedRegisterForCallEngine, callEngineExitContextNativeCallStatusCodeOffset) } else { // If the status == 0, we use zero register to store zero. - c.assembler.CompileRegisterToMemory(arm64.MOVWU, arm64.RegRZR, arm64ReservedRegisterForCallEngine, callEngineExitContextnativeCallStatusCodeOffset) + c.assembler.CompileRegisterToMemory(arm64.MOVWU, arm64.RegRZR, arm64ReservedRegisterForCallEngine, callEngineExitContextNativeCallStatusCodeOffset) } // The return address to the Go code is stored in archContext.compilerReturnAddress which diff --git a/internal/engine/compiler/impl_vec_amd64.go b/internal/engine/compiler/impl_vec_amd64.go index d280faa0d77..c2c74956d4c 100644 --- a/internal/engine/compiler/impl_vec_amd64.go +++ b/internal/engine/compiler/impl_vec_amd64.go @@ -1,6 +1,8 @@ package compiler import ( + "errors" + "github.com/tetratelabs/wazero/internal/asm" "github.com/tetratelabs/wazero/internal/asm/amd64" "github.com/tetratelabs/wazero/internal/wazeroir" @@ -60,7 +62,7 @@ func (c *amd64Compiler) compileV128Add(o *wazeroir.OperationV128Add) error { case wazeroir.ShapeI16x8: inst = amd64.PADDW case wazeroir.ShapeI32x4: - inst = amd64.PADDL + inst = amd64.PADDD case wazeroir.ShapeI64x2: inst = amd64.PADDQ case wazeroir.ShapeF32x4: @@ -93,7 +95,7 @@ func (c *amd64Compiler) compileV128Sub(o *wazeroir.OperationV128Sub) error { case wazeroir.ShapeI16x8: inst = amd64.PSUBW case wazeroir.ShapeI32x4: - inst = amd64.PSUBL + inst = amd64.PSUBD case wazeroir.ShapeI64x2: inst = amd64.PSUBQ case wazeroir.ShapeF32x4: @@ -116,21 +118,21 @@ func (c *amd64Compiler) compileV128Load(o *wazeroir.OperationV128Load) error { } switch o.Type { - case wazeroir.LoadV128Type128: + case wazeroir.V128LoadType128: err = c.compileV128LoadImpl(amd64.MOVDQU, o.Arg.Offset, 16, result) - case wazeroir.LoadV128Type8x8s: + case wazeroir.V128LoadType8x8s: err = c.compileV128LoadImpl(amd64.PMOVSXBW, o.Arg.Offset, 8, result) - case wazeroir.LoadV128Type8x8u: + case wazeroir.V128LoadType8x8u: err = c.compileV128LoadImpl(amd64.PMOVZXBW, o.Arg.Offset, 8, result) - case wazeroir.LoadV128Type16x4s: + case wazeroir.V128LoadType16x4s: err = c.compileV128LoadImpl(amd64.PMOVSXWD, o.Arg.Offset, 8, result) - case wazeroir.LoadV128Type16x4u: + case wazeroir.V128LoadType16x4u: err = c.compileV128LoadImpl(amd64.PMOVZXWD, o.Arg.Offset, 8, result) - case wazeroir.LoadV128Type32x2s: + case wazeroir.V128LoadType32x2s: err = c.compileV128LoadImpl(amd64.PMOVSXDQ, o.Arg.Offset, 8, result) - case wazeroir.LoadV128Type32x2u: + case wazeroir.V128LoadType32x2u: err = c.compileV128LoadImpl(amd64.PMOVZXDQ, o.Arg.Offset, 8, result) - case wazeroir.LoadV128Type8Splat: + case 
wazeroir.V128LoadType8Splat: reg, err := c.compileMemoryAccessCeilSetup(o.Arg.Offset, 1) if err != nil { return err @@ -148,7 +150,7 @@ func (c *amd64Compiler) compileV128Load(o *wazeroir.OperationV128Load) error { c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRB, reg, result, 0) c.assembler.CompileRegisterToRegister(amd64.PXOR, tmpVReg, tmpVReg) c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmpVReg, result) - case wazeroir.LoadV128Type16Splat: + case wazeroir.V128LoadType16Splat: reg, err := c.compileMemoryAccessCeilSetup(o.Arg.Offset, 2) if err != nil { return err @@ -161,7 +163,7 @@ func (c *amd64Compiler) compileV128Load(o *wazeroir.OperationV128Load) error { c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, reg, result, 0) c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRW, reg, result, 1) c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0) - case wazeroir.LoadV128Type32Splat: + case wazeroir.V128LoadType32Splat: reg, err := c.compileMemoryAccessCeilSetup(o.Arg.Offset, 4) if err != nil { return err @@ -172,7 +174,7 @@ func (c *amd64Compiler) compileV128Load(o *wazeroir.OperationV128Load) error { // pshufd $0, result, result (result = result[0,0,0,0]) c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRD, reg, result, 0) c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, result, result, 0) - case wazeroir.LoadV128Type64Splat: + case wazeroir.V128LoadType64Splat: reg, err := c.compileMemoryAccessCeilSetup(o.Arg.Offset, 8) if err != nil { return err @@ -183,9 +185,9 @@ func (c *amd64Compiler) compileV128Load(o *wazeroir.OperationV128Load) error { // pinsrq $1, reg, result c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, reg, result, 0) c.assembler.CompileRegisterToRegisterWithArg(amd64.PINSRQ, reg, result, 1) - case wazeroir.LoadV128Type32zero: + case wazeroir.V128LoadType32zero: err = c.compileV128LoadImpl(amd64.MOVL, o.Arg.Offset, 4, result) - case wazeroir.LoadV128Type64zero: + case wazeroir.V128LoadType64zero: err = c.compileV128LoadImpl(amd64.MOVQ, o.Arg.Offset, 8, result) } @@ -827,7 +829,7 @@ func (c *amd64Compiler) compileV128ShrImpl(o *wazeroir.OperationV128Shr) error { return nil } -// compileV128ShrI64x2SignedImpl implements compiler.compileV128Shr for i64x4 signed (arithmetic) shift. +// compileV128ShrI64x2SignedImpl implements compiler.compileV128Shr for i64x2 signed (arithmetic) shift. // PSRAQ instruction requires AVX, so we emulate it without AVX instructions. https://www.felixcloutier.com/x86/psraw:psrad:psraq func (c *amd64Compiler) compileV128ShrI64x2SignedImpl() error { const shiftCountRegister = amd64.RegCX @@ -1330,3 +1332,1375 @@ func (c *amd64Compiler) compileV128Cmp(o *wazeroir.OperationV128Cmp) error { c.pushVectorRuntimeValueLocationOnRegister(result) return nil } + +// compileV128AddSat implements compiler.compileV128AddSat for amd64. 
+func (c *amd64Compiler) compileV128AddSat(o *wazeroir.OperationV128AddSat) error {
+	var inst asm.Instruction
+	switch o.Shape {
+	case wazeroir.ShapeI8x16:
+		if o.Signed {
+			inst = amd64.PADDSB
+		} else {
+			inst = amd64.PADDUSB
+		}
+	case wazeroir.ShapeI16x8:
+		if o.Signed {
+			inst = amd64.PADDSW
+		} else {
+			inst = amd64.PADDUSW
+		}
+	}
+
+	x2 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x2); err != nil {
+		return err
+	}
+
+	x1 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x1); err != nil {
+		return err
+	}
+
+	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
+
+	c.locationStack.markRegisterUnused(x2.register)
+	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
+	return nil
+}
+
+// compileV128SubSat implements compiler.compileV128SubSat for amd64.
+func (c *amd64Compiler) compileV128SubSat(o *wazeroir.OperationV128SubSat) error {
+	var inst asm.Instruction
+	switch o.Shape {
+	case wazeroir.ShapeI8x16:
+		if o.Signed {
+			inst = amd64.PSUBSB
+		} else {
+			inst = amd64.PSUBUSB
+		}
+	case wazeroir.ShapeI16x8:
+		if o.Signed {
+			inst = amd64.PSUBSW
+		} else {
+			inst = amd64.PSUBUSW
+		}
+	}
+
+	x2 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x2); err != nil {
+		return err
+	}
+
+	x1 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x1); err != nil {
+		return err
+	}
+
+	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
+
+	c.locationStack.markRegisterUnused(x2.register)
+	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
+	return nil
+}
+
+// compileV128Mul implements compiler.compileV128Mul for amd64.
+func (c *amd64Compiler) compileV128Mul(o *wazeroir.OperationV128Mul) error {
+	var inst asm.Instruction
+	switch o.Shape {
+	case wazeroir.ShapeI16x8:
+		inst = amd64.PMULLW
+	case wazeroir.ShapeI32x4:
+		inst = amd64.PMULLD
+	case wazeroir.ShapeI64x2:
+		return c.compileV128MulI64x2()
+	case wazeroir.ShapeF32x4:
+		inst = amd64.MULPS
+	case wazeroir.ShapeF64x2:
+		inst = amd64.MULPD
+	}
+
+	x2 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x2); err != nil {
+		return err
+	}
+
+	x1 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x1); err != nil {
+		return err
+	}
+
+	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
+
+	c.locationStack.markRegisterUnused(x2.register)
+	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
+	return nil
+}
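The saturating variants selected above (PADDS*/PADDUS* and PSUBS*/PSUBUS*) clamp the exact result to the lane's range instead of wrapping. Per-lane semantics for the signed byte case, as a sketch (helper name illustrative):

```go
// addSatI8 is the per-lane semantics of PADDSB: add and clamp to [-128, 127].
func addSatI8(a, b int8) int8 {
	s := int16(a) + int16(b)
	if s > 127 {
		return 127
	}
	if s < -128 {
		return -128
	}
	return int8(s)
}
```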
+
+// compileV128MulI64x2 implements V128Mul for i64x2.
+func (c *amd64Compiler) compileV128MulI64x2() error {
+	x2 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x2); err != nil {
+		return err
+	}
+
+	x1 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x1); err != nil {
+		return err
+	}
+
+	x1r, x2r := x1.register, x2.register
+
+	tmp1, err := c.allocateRegister(registerTypeVector)
+	if err != nil {
+		return err
+	}
+
+	c.locationStack.markRegisterUsed(tmp1)
+
+	tmp2, err := c.allocateRegister(registerTypeVector)
+	if err != nil {
+		return err
+	}
+
+	// Assuming that we have
+	//	x1r = [p1, p2] = [p1_lo, p1_hi, p2_lo, p2_hi]
+	//	x2r = [q1, q2] = [q1_lo, q1_hi, q2_lo, q2_hi]
+	// where pN and qN are 64-bit (quad word) lanes, and pN_lo, pN_hi, qN_lo and qN_hi are 32-bit (double word) lanes.
+
+	// Copy x1's value into tmp1.
+	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, tmp1)
+	// And do the logical right shift by 32-bit on tmp1, which makes tmp1 = [0, p1_hi, 0, p2_hi]
+	c.assembler.CompileConstToRegister(amd64.PSRLQ, 32, tmp1)
+
+	// Execute "pmuludq x2r,tmp1", which makes tmp1 = [p1_hi*q1_lo, p2_hi*q2_lo] where each lane is 64-bit.
+	c.assembler.CompileRegisterToRegister(amd64.PMULUDQ, x2r, tmp1)
+
+	// Copy x2's value into tmp2.
+	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x2r, tmp2)
+	// And do the logical right shift by 32-bit on tmp2, which makes tmp2 = [0, q1_hi, 0, q2_hi]
+	c.assembler.CompileConstToRegister(amd64.PSRLQ, 32, tmp2)
+
+	// Execute "pmuludq x1r,tmp2", which makes tmp2 = [p1_lo*q1_hi, p2_lo*q2_hi] where each lane is 64-bit.
+	c.assembler.CompileRegisterToRegister(amd64.PMULUDQ, x1r, tmp2)
+
+	// Add tmp1 and tmp2, then do the logical left shift by 32-bit,
+	// which makes tmp1 = [(p1_lo*q1_hi+p1_hi*q1_lo)<<32, (p2_lo*q2_hi+p2_hi*q2_lo)<<32]
+	c.assembler.CompileRegisterToRegister(amd64.PADDQ, tmp2, tmp1)
+	c.assembler.CompileConstToRegister(amd64.PSLLQ, 32, tmp1)
+
+	// Execute "pmuludq x2r,x1r", which makes x1r = [p1_lo*q1_lo, p2_lo*q2_lo] where each lane is 64-bit.
+	c.assembler.CompileRegisterToRegister(amd64.PMULUDQ, x2r, x1r)
+
+	// Finally, we get the result by adding x1r and tmp1,
+	// which makes x1r = [(p1_lo*q1_hi+p1_hi*q1_lo)<<32+p1_lo*q1_lo, (p2_lo*q2_hi+p2_hi*q2_lo)<<32+p2_lo*q2_lo]
+	c.assembler.CompileRegisterToRegister(amd64.PADDQ, tmp1, x1r)
+
+	c.locationStack.markRegisterUnused(x2r, tmp1)
+	c.pushVectorRuntimeValueLocationOnRegister(x1r)
+	return nil
+}
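The PMULUDQ/PSRLQ/PSLLQ/PADDQ sequence above is the schoolbook decomposition of a 64-bit multiply modulo 2^64: p*q = ((p_lo*q_hi + p_hi*q_lo) << 32) + p_lo*q_lo, since the p_hi*q_hi term is shifted out entirely. A scalar sketch of the same identity:

```go
// mulI64 mirrors the per-lane math of compileV128MulI64x2 on scalars.
func mulI64(p, q uint64) uint64 {
	pLo, pHi := p&0xffffffff, p>>32
	qLo, qHi := q&0xffffffff, q>>32
	cross := (pLo*qHi + pHi*qLo) << 32 // bits above 63 vanish, as in the vector code
	return cross + pLo*qLo
}
```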
+func (c *amd64Compiler) compileV128NegInt(s wazeroir.Shape) error {
+	v := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(v); err != nil {
+		return err
+	}
+
+	result, err := c.allocateRegister(registerTypeVector)
+	if err != nil {
+		return err
+	}
+
+	var subInst asm.Instruction
+	switch s {
+	case wazeroir.ShapeI8x16:
+		subInst = amd64.PSUBB
+	case wazeroir.ShapeI16x8:
+		subInst = amd64.PSUBW
+	case wazeroir.ShapeI32x4:
+		subInst = amd64.PSUBD
+	case wazeroir.ShapeI64x2:
+		subInst = amd64.PSUBQ
+	}
+
+	c.assembler.CompileRegisterToRegister(amd64.PXOR, result, result)
+	c.assembler.CompileRegisterToRegister(subInst, v.register, result)
+
+	c.locationStack.markRegisterUnused(v.register)
+	c.pushVectorRuntimeValueLocationOnRegister(result)
+	return nil
+}
+
+// compileV128NegFloat implements compiler.compileV128Neg for float lanes.
+func (c *amd64Compiler) compileV128NegFloat(s wazeroir.Shape) error {
+	v := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(v); err != nil {
+		return err
+	}
+
+	tmp, err := c.allocateRegister(registerTypeVector)
+	if err != nil {
+		return err
+	}
+
+	var leftShiftInst, xorInst asm.Instruction
+	var leftShiftAmount asm.ConstantValue
+	if s == wazeroir.ShapeF32x4 {
+		leftShiftInst, leftShiftAmount, xorInst = amd64.PSLLD, 31, amd64.XORPS
+	} else {
+		leftShiftInst, leftShiftAmount, xorInst = amd64.PSLLQ, 63, amd64.XORPD
+	}
+
+	// Set all bits on tmp by CMPPD with arg=0 (== the pseudo CMPEQPD instruction).
+	// See https://www.felixcloutier.com/x86/cmpps
+	c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPD, tmp, tmp, 0)
+	// Do the left shift on each lane to set only the most significant bit in each.
+	c.assembler.CompileConstToRegister(leftShiftInst, leftShiftAmount, tmp)
+	// Get the negated result by XOR on each lane with tmp (flipping only the sign bit).
+	c.assembler.CompileRegisterToRegister(xorInst, tmp, v.register)
+
+	c.pushVectorRuntimeValueLocationOnRegister(v.register)
+	return nil
+}
+
+// compileV128Sqrt implements compiler.compileV128Sqrt for amd64.
+func (c *amd64Compiler) compileV128Sqrt(o *wazeroir.OperationV128Sqrt) error {
+	v := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(v); err != nil {
+		return err
+	}
+
+	var inst asm.Instruction
+	switch o.Shape {
+	case wazeroir.ShapeF64x2:
+		inst = amd64.SQRTPD
+	case wazeroir.ShapeF32x4:
+		inst = amd64.SQRTPS
+	}
+
+	c.assembler.CompileRegisterToRegister(inst, v.register, v.register)
+	c.pushVectorRuntimeValueLocationOnRegister(v.register)
+	return nil
+}
+
+// compileV128Abs implements compiler.compileV128Abs for amd64.
+func (c *amd64Compiler) compileV128Abs(o *wazeroir.OperationV128Abs) error {
+	if o.Shape == wazeroir.ShapeI64x2 {
+		return c.compileV128AbsI64x2()
+	}
+
+	v := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(v); err != nil {
+		return err
+	}
+
+	result := v.register
+	switch o.Shape {
+	case wazeroir.ShapeI8x16:
+		c.assembler.CompileRegisterToRegister(amd64.PABSB, result, result)
+	case wazeroir.ShapeI16x8:
+		c.assembler.CompileRegisterToRegister(amd64.PABSW, result, result)
+	case wazeroir.ShapeI32x4:
+		c.assembler.CompileRegisterToRegister(amd64.PABSD, result, result)
+	case wazeroir.ShapeF32x4:
+		tmp, err := c.allocateRegister(registerTypeVector)
+		if err != nil {
+			return err
+		}
+		// Set all bits on tmp.
+		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, tmp)
+		// Shift each 32-bit lane right by 1 to clear the sign bits.
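+		// (All-ones shifted right by one is 0x7fffffff in each lane: the abs mask.)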
+		c.assembler.CompileConstToRegister(amd64.PSRLD, 1, tmp)
+		// Clear the sign bit of each lane in result.
+		c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmp, result)
+	case wazeroir.ShapeF64x2:
+		tmp, err := c.allocateRegister(registerTypeVector)
+		if err != nil {
+			return err
+		}
+		// Set all bits on tmp.
+		c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, tmp)
+		// Shift each 64-bit lane right by 1 to clear the sign bits.
+		c.assembler.CompileConstToRegister(amd64.PSRLQ, 1, tmp)
+		// Clear the sign bit of each lane in result.
+		c.assembler.CompileRegisterToRegister(amd64.ANDPD, tmp, result)
+	}
+
+	c.pushVectorRuntimeValueLocationOnRegister(result)
+	return nil
+}
+
+// compileV128AbsI64x2 implements compileV128Abs for i64x2 lanes.
+func (c *amd64Compiler) compileV128AbsI64x2() error {
+	// BLENDVPD implicitly takes its blend mask from XMM0.
+	// See https://www.felixcloutier.com/x86/blendvpd
+	const blendMaskReg = amd64.RegX0
+	c.onValueReleaseRegisterToStack(blendMaskReg)
+	c.locationStack.markRegisterUsed(blendMaskReg)
+
+	v := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(v); err != nil {
+		return err
+	}
+	vr := v.register
+
+	if vr == blendMaskReg {
+		return errors.New("BUG: X0 must not be used")
+	}
+
+	tmp, err := c.allocateRegister(registerTypeVector)
+	if err != nil {
+		return err
+	}
+	c.locationStack.markRegisterUsed(tmp)
+
+	// Copy the value to tmp.
+	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp)
+
+	// Clear all bits on blendMaskReg.
+	c.assembler.CompileRegisterToRegister(amd64.PXOR, blendMaskReg, blendMaskReg)
+	// Subtract vr from blendMaskReg, which makes blendMaskReg = -v.
+	c.assembler.CompileRegisterToRegister(amd64.PSUBQ, vr, blendMaskReg)
+	// Copy the negated value back into vr.
+	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, blendMaskReg, vr)
+
+	// Each lane of vr (holding -v) is replaced by the original value in tmp whenever
+	// the mask lane in X0 (also -v) has its sign bit set, i.e. whenever v was positive,
+	// so vr ends up holding |v|.
+	c.assembler.CompileRegisterToRegister(amd64.BLENDVPD, tmp, vr)
+
+	c.locationStack.markRegisterUnused(blendMaskReg, tmp)
+	c.pushVectorRuntimeValueLocationOnRegister(vr)
+	return nil
+}
+
+var (
+	popcntMask = [16]byte{
+		0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+		0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+	}
+	// popcntTable holds the popcount of each 4-bit index, for example popcntTable[5] holds popcnt(0x05).
+	popcntTable = [16]byte{
+		0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03,
+		0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04,
+	}
+)
+
+// compileV128Popcnt implements compiler.compileV128Popcnt for amd64.
+func (c *amd64Compiler) compileV128Popcnt(*wazeroir.OperationV128Popcnt) error {
+	v := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(v); err != nil {
+		return err
+	}
+	vr := v.register
+
+	tmp1, err := c.allocateRegister(registerTypeVector)
+	if err != nil {
+		return err
+	}
+
+	c.locationStack.markRegisterUsed(tmp1)
+
+	tmp2, err := c.allocateRegister(registerTypeVector)
+	if err != nil {
+		return err
+	}
+
+	c.locationStack.markRegisterUsed(tmp2)
+
+	tmp3, err := c.allocateRegister(registerTypeVector)
+	if err != nil {
+		return err
+	}
+
+	// Read the popcntMask into tmp1, and we have
+	// 	tmp1 = [0xf, ..., 0xf]
+	if err := c.assembler.CompileLoadStaticConstToRegister(amd64.MOVDQU, popcntMask[:], tmp1); err != nil {
+		return err
+	}
+
+	// Copy the original value into tmp2.
+	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp2)
+
+	// Given that we have:
+	// 	v = [b1, ..., b16] where bn = hn:ln and hn and ln are the higher and lower 4 bits of bn.
+	//
+	// Take PAND on tmp1 and tmp2, and we have
+	// 	tmp2 = [l1, ..., l16].
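+	// (Per byte, this computes popcnt(bn) = popcntTable[ln] + popcntTable[hn];
+	// the two PSHUFB table lookups below perform these lookups for all 16 bytes at once.)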
+	c.assembler.CompileRegisterToRegister(amd64.PAND, tmp1, tmp2)
+
+	// Do a logical (packed word) right shift by 4 on vr and PAND it with tmp1, meaning that we have
+	// 	vr = [h1, ..., h16].
+	c.assembler.CompileConstToRegister(amd64.PSRLW, 4, vr)
+	c.assembler.CompileRegisterToRegister(amd64.PAND, tmp1, vr)
+
+	// Read the popcntTable into tmp1, and we have
+	// 	tmp1 = [0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04]
+	if err := c.assembler.CompileLoadStaticConstToRegister(amd64.MOVDQU, popcntTable[:], tmp1); err != nil {
+		return err
+	}
+
+	// Copy tmp1 into tmp3, and we have
+	// 	tmp3 = [0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04]
+	c.assembler.CompileRegisterToRegister(amd64.MOVDQU, tmp1, tmp3)
+
+	// tmp3 = [popcnt(l1), ..., popcnt(l16)].
+	c.assembler.CompileRegisterToRegister(amd64.PSHUFB, tmp2, tmp3)
+
+	// tmp1 = [popcnt(h1), ..., popcnt(h16)].
+	c.assembler.CompileRegisterToRegister(amd64.PSHUFB, vr, tmp1)
+
+	// vr = tmp1 = [popcnt(h1), ..., popcnt(h16)].
+	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, tmp1, vr)
+
+	// vr += tmp3 = [popcnt(h1)+popcnt(l1), ..., popcnt(h16)+popcnt(l16)] = [popcnt(b1), ..., popcnt(b16)].
+	c.assembler.CompileRegisterToRegister(amd64.PADDB, tmp3, vr)
+
+	c.locationStack.markRegisterUnused(tmp1, tmp2)
+	c.pushVectorRuntimeValueLocationOnRegister(vr)
+	return nil
+}
+
+// compileV128Min implements compiler.compileV128Min for amd64.
+func (c *amd64Compiler) compileV128Min(o *wazeroir.OperationV128Min) error {
+	if o.Shape >= wazeroir.ShapeF32x4 {
+		return c.compileV128MinOrMaxFloat(o.Shape, true)
+	}
+
+	var inst asm.Instruction
+	switch o.Shape {
+	case wazeroir.ShapeI8x16:
+		if o.Signed {
+			inst = amd64.PMINSB
+		} else {
+			inst = amd64.PMINUB
+		}
+	case wazeroir.ShapeI16x8:
+		if o.Signed {
+			inst = amd64.PMINSW
+		} else {
+			inst = amd64.PMINUW
+		}
+	case wazeroir.ShapeI32x4:
+		if o.Signed {
+			inst = amd64.PMINSD
+		} else {
+			inst = amd64.PMINUD
+		}
+	}
+
+	x2 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x2); err != nil {
+		return err
+	}
+
+	x1 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x1); err != nil {
+		return err
+	}
+
+	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
+
+	c.locationStack.markRegisterUnused(x2.register)
+	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
+	return nil
+}
+
+// compileV128MinOrMaxFloat implements compiler.compileV128Min and compiler.compileV128Max for float lanes.
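+// Unlike Wasm's min/max, x86's MINPS/MINPD/MAXPS/MAXPD do not propagate NaN:
+// they simply return the second (source) operand whenever either operand is NaN,
+// so the NaN lanes are patched up explicitly below.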
+func (c *amd64Compiler) compileV128MinOrMaxFloat(o wazeroir.Shape, isMin bool) error {
+	x2 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x2); err != nil {
+		return err
+	}
+
+	x1 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x1); err != nil {
+		return err
+	}
+
+	x1r, x2r := x1.register, x2.register
+
+	tmp, err := c.allocateRegister(registerTypeVector)
+	if err != nil {
+		return err
+	}
+
+	var minOrMaxInst, cmpInst, andnInst, orInst, logicalRightShiftInst asm.Instruction
+	var shiftNumToInverseNaN asm.ConstantValue
+	if o == wazeroir.ShapeF32x4 {
+		cmpInst, andnInst, orInst, logicalRightShiftInst, shiftNumToInverseNaN =
+			amd64.CMPPS, amd64.ANDNPS, amd64.ORPS, amd64.PSRLD, 0xa
+		if isMin {
+			minOrMaxInst = amd64.MINPS
+		} else {
+			minOrMaxInst = amd64.MAXPS
+		}
+	} else {
+		cmpInst, andnInst, orInst, logicalRightShiftInst, shiftNumToInverseNaN =
+			amd64.CMPPD, amd64.ANDNPD, amd64.ORPD, amd64.PSRLQ, 0xd
+		if isMin {
+			minOrMaxInst = amd64.MINPD
+		} else {
+			minOrMaxInst = amd64.MAXPD
+		}
+	}
+
+	// Copy the value on x1 to tmp.
+	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, tmp)
+
+	// Denote the original vectors in x1r and x2r as v1 and v2 below.
+	//
+	// Execute MINPS/MINPD/MAXPS/MAXPD with destination = tmp (holding v1), and we have
+	// 	tmp = [ if (v1[i] != NaN && v2[i] != NaN) {min_max(v1[i], v2[i])} else {v1[i]} for i in 0..LANE_NUM]
+	c.assembler.CompileRegisterToRegister(minOrMaxInst, x2r, tmp)
+
+	// Execute MINPS/MINPD/MAXPS/MAXPD with destination = x2r (holding v2), and we have
+	// 	x2r = [ if (v1[i] != NaN && v2[i] != NaN) {min_max(v1[i], v2[i])} else {v2[i]} for i in 0..LANE_NUM]
+	c.assembler.CompileRegisterToRegister(minOrMaxInst, x1r, x2r)
+
+	// Copy the current tmp into x1r.
+	c.assembler.CompileRegisterToRegister(amd64.MOVDQA, tmp, x1r)
+
+	// Set all bits on each lane where either v1[i] or v2[i] is NaN via CMPPS/CMPPD (arg=3, "unordered").
+	// That means we have:
+	// 	x1r = [ if (v1[i] != NaN && v2[i] != NaN) {0} else {^0} for i in 0..LANE_NUM]
+	//
+	// See https://www.felixcloutier.com/x86/cmpps.
+	c.assembler.CompileRegisterToRegisterWithArg(cmpInst, x2r, x1r, 3)
+
+	// Mask all the lanes where either v1[i] or v2[i] is NaN, meaning that we have
+	// 	tmp = [ if (v1[i] != NaN && v2[i] != NaN) {min_max(v1[i], v2[i])} else {^0} for i in 0..LANE_NUM]
+	c.assembler.CompileRegisterToRegister(orInst, x1r, tmp)
+
+	// Shift right so that each NaN lane of x1r holds the inverse of a NaN bit pattern,
+	// and each non-NaN lane holds zero. That means we have:
+	// 	x1r = [ if (v1[i] != NaN && v2[i] != NaN) {0} else {^NaN} for i in 0..LANE_NUM]
+	c.assembler.CompileConstToRegister(logicalRightShiftInst, shiftNumToInverseNaN, x1r)
+
+	// Finally, we get the result by putting NaN on each lane where either v1[i] or v2[i] is NaN,
+	// and min_max(v1[i], v2[i]) otherwise. That means we have:
+	// 	x1r = [ if (v1[i] != NaN && v2[i] != NaN) {min_max(v1[i], v2[i])} else {NaN} for i in 0..LANE_NUM]
+	c.assembler.CompileRegisterToRegister(andnInst, tmp, x1r)
+
+	c.locationStack.markRegisterUnused(x2r)
+	c.pushVectorRuntimeValueLocationOnRegister(x1r)
+	return nil
+}
+
+// compileV128Max implements compiler.compileV128Max for amd64.
+func (c *amd64Compiler) compileV128Max(o *wazeroir.OperationV128Max) error {
+	if o.Shape >= wazeroir.ShapeF32x4 {
+		return c.compileV128MinOrMaxFloat(o.Shape, false)
+	}
+
+	var inst asm.Instruction
+	switch o.Shape {
+	case wazeroir.ShapeI8x16:
+		if o.Signed {
+			inst = amd64.PMAXSB
+		} else {
+			inst = amd64.PMAXUB
+		}
+	case wazeroir.ShapeI16x8:
+		if o.Signed {
+			inst = amd64.PMAXSW
+		} else {
+			inst = amd64.PMAXUW
+		}
+	case wazeroir.ShapeI32x4:
+		if o.Signed {
+			inst = amd64.PMAXSD
+		} else {
+			inst = amd64.PMAXUD
+		}
+	}
+
+	x2 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x2); err != nil {
+		return err
+	}
+
+	x1 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x1); err != nil {
+		return err
+	}
+
+	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
+
+	c.locationStack.markRegisterUnused(x2.register)
+	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
+	return nil
+}
+
+// compileV128AvgrU implements compiler.compileV128AvgrU for amd64.
+func (c *amd64Compiler) compileV128AvgrU(o *wazeroir.OperationV128AvgrU) error {
+	x2 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x2); err != nil {
+		return err
+	}
+
+	x1 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x1); err != nil {
+		return err
+	}
+
+	var inst asm.Instruction
+	switch o.Shape {
+	case wazeroir.ShapeI8x16:
+		inst = amd64.PAVGB
+	case wazeroir.ShapeI16x8:
+		inst = amd64.PAVGW
+	}
+
+	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
+
+	c.locationStack.markRegisterUnused(x2.register)
+	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
+	return nil
+}
+
+// compileV128Pmin implements compiler.compileV128Pmin for amd64.
+func (c *amd64Compiler) compileV128Pmin(o *wazeroir.OperationV128Pmin) error {
+	x2 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x2); err != nil {
+		return err
+	}
+
+	x1 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x1); err != nil {
+		return err
+	}
+
+	var min asm.Instruction
+	if o.Shape == wazeroir.ShapeF32x4 {
+		min = amd64.MINPS
+	} else {
+		min = amd64.MINPD
+	}
+
+	x1r, x2r := x1.register, x2.register
+
+	// Wasm's pmin(x1, x2) is defined as "x2 < x1 ? x2 : x1", which matches
+	// MINPS/MINPD with x2 as the destination.
+	c.assembler.CompileRegisterToRegister(min, x1r, x2r)
+
+	c.locationStack.markRegisterUnused(x1r)
+	c.pushVectorRuntimeValueLocationOnRegister(x2r)
+	return nil
+}
+
+// compileV128Pmax implements compiler.compileV128Pmax for amd64.
+func (c *amd64Compiler) compileV128Pmax(o *wazeroir.OperationV128Pmax) error {
+	x2 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x2); err != nil {
+		return err
+	}
+
+	x1 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x1); err != nil {
+		return err
+	}
+
+	var max asm.Instruction
+	if o.Shape == wazeroir.ShapeF32x4 {
+		max = amd64.MAXPS
+	} else {
+		max = amd64.MAXPD
+	}
+
+	x1r, x2r := x1.register, x2.register
+
+	c.assembler.CompileRegisterToRegister(max, x1r, x2r)
+
+	c.locationStack.markRegisterUnused(x1r)
+	c.pushVectorRuntimeValueLocationOnRegister(x2r)
+	return nil
+}
+
+// compileV128Ceil implements compiler.compileV128Ceil for amd64.
+func (c *amd64Compiler) compileV128Ceil(o *wazeroir.OperationV128Ceil) error {
+	// See https://www.felixcloutier.com/x86/roundpd
+	const roundModeCeil = 0x2
+	return c.compileV128RoundImpl(o.Shape == wazeroir.ShapeF32x4, roundModeCeil)
+}
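+
+// The ROUNDPS/ROUNDPD immediate selects the rounding mode:
+// 0x0 rounds to nearest (ties to even), 0x1 rounds toward -inf (floor),
+// 0x2 rounds toward +inf (ceil), and 0x3 rounds toward zero (trunc).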
+
+// compileV128Floor implements compiler.compileV128Floor for amd64.
+func (c *amd64Compiler) compileV128Floor(o *wazeroir.OperationV128Floor) error {
+	// See https://www.felixcloutier.com/x86/roundpd
+	const roundModeFloor = 0x1
+	return c.compileV128RoundImpl(o.Shape == wazeroir.ShapeF32x4, roundModeFloor)
+}
+
+// compileV128Trunc implements compiler.compileV128Trunc for amd64.
+func (c *amd64Compiler) compileV128Trunc(o *wazeroir.OperationV128Trunc) error {
+	// See https://www.felixcloutier.com/x86/roundpd
+	const roundModeTrunc = 0x3
+	return c.compileV128RoundImpl(o.Shape == wazeroir.ShapeF32x4, roundModeTrunc)
+}
+
+// compileV128Nearest implements compiler.compileV128Nearest for amd64.
+func (c *amd64Compiler) compileV128Nearest(o *wazeroir.OperationV128Nearest) error {
+	// See https://www.felixcloutier.com/x86/roundpd
+	const roundModeNearest = 0x0
+	return c.compileV128RoundImpl(o.Shape == wazeroir.ShapeF32x4, roundModeNearest)
+}
+
+// compileV128RoundImpl implements compileV128Nearest, compileV128Trunc, compileV128Floor and compileV128Ceil
+// with ROUNDPS (32-bit lanes) and ROUNDPD (64-bit lanes).
+func (c *amd64Compiler) compileV128RoundImpl(is32bit bool, mode byte) error {
+	v := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(v); err != nil {
+		return err
+	}
+	vr := v.register
+
+	var round asm.Instruction
+	if is32bit {
+		round = amd64.ROUNDPS
+	} else {
+		round = amd64.ROUNDPD
+	}
+
+	c.assembler.CompileRegisterToRegisterWithArg(round, vr, vr, mode)
+	c.pushVectorRuntimeValueLocationOnRegister(vr)
+	return nil
+}
+
+// compileV128Extend implements compiler.compileV128Extend for amd64.
+func (c *amd64Compiler) compileV128Extend(o *wazeroir.OperationV128Extend) error {
+	v := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(v); err != nil {
+		return err
+	}
+	vr := v.register
+
+	if !o.UseLow {
+		// We have to shift the higher 64-bits into the lower ones before the actual extending instruction.
+		// Shifting right by 0x8 * 8 = 64 bits and concatenating the register with itself.
+		// See https://www.felixcloutier.com/x86/palignr
+		c.assembler.CompileRegisterToRegisterWithArg(amd64.PALIGNR, vr, vr, 0x8)
+	}
+
+	var extend asm.Instruction
+	switch o.OriginShape {
+	case wazeroir.ShapeI8x16:
+		if o.Signed {
+			extend = amd64.PMOVSXBW
+		} else {
+			extend = amd64.PMOVZXBW
+		}
+	case wazeroir.ShapeI16x8:
+		if o.Signed {
+			extend = amd64.PMOVSXWD
+		} else {
+			extend = amd64.PMOVZXWD
+		}
+	case wazeroir.ShapeI32x4:
+		if o.Signed {
+			extend = amd64.PMOVSXDQ
+		} else {
+			extend = amd64.PMOVZXDQ
+		}
+	}
+
+	c.assembler.CompileRegisterToRegister(extend, vr, vr)
+	c.pushVectorRuntimeValueLocationOnRegister(vr)
+	return nil
+}
+
+// compileV128ExtMul implements compiler.compileV128ExtMul for amd64.
+func (c *amd64Compiler) compileV128ExtMul(o *wazeroir.OperationV128ExtMul) error {
+	x2 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x2); err != nil {
+		return err
+	}
+
+	x1 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x1); err != nil {
+		return err
+	}
+
+	x1r, x2r := x1.register, x2.register
+
+	switch o.OriginShape {
+	case wazeroir.ShapeI8x16:
+		if !o.UseLow {
+			// We have to shift the higher 64-bits into the lower ones before the actual extending instruction.
+			// Shifting right by 0x8 * 8 = 64 bits and concatenating the register with itself.
+			// See https://www.felixcloutier.com/x86/palignr
+			c.assembler.CompileRegisterToRegisterWithArg(amd64.PALIGNR, x1r, x1r, 0x8)
+			c.assembler.CompileRegisterToRegisterWithArg(amd64.PALIGNR, x2r, x2r, 0x8)
+		}
+
+		var ext asm.Instruction
+		if o.Signed {
+			ext = amd64.PMOVSXBW
+		} else {
+			ext = amd64.PMOVZXBW
+		}
+
+		// Sign- or zero-extend the lower half packed bytes to packed words.
+		c.assembler.CompileRegisterToRegister(ext, x1r, x1r)
+		c.assembler.CompileRegisterToRegister(ext, x2r, x2r)
+
+		c.assembler.CompileRegisterToRegister(amd64.PMULLW, x2r, x1r)
+	case wazeroir.ShapeI16x8:
+		tmp, err := c.allocateRegister(registerTypeVector)
+		if err != nil {
+			return err
+		}
+
+		// Copy the value on x1r to tmp.
+		c.assembler.CompileRegisterToRegister(amd64.MOVDQA, x1r, tmp)
+
+		// Multiply the values and store the lower 16-bits into x1r.
+		c.assembler.CompileRegisterToRegister(amd64.PMULLW, x2r, x1r)
+		if o.Signed {
+			// Signed multiply the values and store the higher 16-bits into tmp.
+			c.assembler.CompileRegisterToRegister(amd64.PMULHW, x2r, tmp)
+		} else {
+			// Unsigned multiply the values and store the higher 16-bits into tmp.
+			c.assembler.CompileRegisterToRegister(amd64.PMULHUW, x2r, tmp)
+		}
+
+		// Unpack the lower or higher half of the vectors (tmp and x1r) and interleave them,
+		// reassembling the full 32-bit products.
+		if o.UseLow {
+			c.assembler.CompileRegisterToRegister(amd64.PUNPCKLWD, tmp, x1r)
+		} else {
+			c.assembler.CompileRegisterToRegister(amd64.PUNPCKHWD, tmp, x1r)
+		}
+	case wazeroir.ShapeI32x4:
+		var shuffleOrder byte
+		// Given that the original state of the register is [v1, v2, v3, v4] where each vN is a 32-bit (double word) lane,
+		if o.UseLow {
+			// this makes the register [v1, v1, v2, v2]
+			shuffleOrder = 0b01010000
+		} else {
+			// this makes the register [v3, v3, v4, v4]
+			shuffleOrder = 0b11111010
+		}
+		// See https://www.felixcloutier.com/x86/pshufd
+		c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, x1r, x1r, shuffleOrder)
+		c.assembler.CompileRegisterToRegisterWithArg(amd64.PSHUFD, x2r, x2r, shuffleOrder)
+
+		var mul asm.Instruction
+		if o.Signed {
+			mul = amd64.PMULDQ
+		} else {
+			mul = amd64.PMULUDQ
+		}
+		c.assembler.CompileRegisterToRegister(mul, x2r, x1r)
+	}
+
+	c.locationStack.markRegisterUnused(x2r)
+	c.pushVectorRuntimeValueLocationOnRegister(x1r)
+	return nil
+}
+
+// q15mulrSatSMask holds 0x8000 in each of the eight 16-bit lanes.
+var q15mulrSatSMask = [16]byte{
+	0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+	0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+}
+
+// compileV128Q15mulrSatS implements compiler.compileV128Q15mulrSatS for amd64.
+func (c *amd64Compiler) compileV128Q15mulrSatS(*wazeroir.OperationV128Q15mulrSatS) error {
+	x2 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x2); err != nil {
+		return err
+	}
+
+	x1 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x1); err != nil {
+		return err
+	}
+
+	tmp, err := c.allocateRegister(registerTypeVector)
+	if err != nil {
+		return err
+	}
+
+	x1r, x2r := x1.register, x2.register
+
+	// See https://github.com/WebAssembly/simd/pull/365 for the following logic.
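+	// PMULHRSW computes ((x*y >> 14) + 1) >> 1 per lane, which matches Wasm's
+	// q15mulr_sat_s everywhere except x = y = -32768 (0x8000), where it yields
+	// 0x8000 instead of the saturated 0x7fff. The PCMPEQW/PXOR pair below flips
+	// exactly those lanes to 0x7fff.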
+	if err := c.assembler.CompileLoadStaticConstToRegister(amd64.MOVDQU, q15mulrSatSMask[:], tmp); err != nil {
+		return err
+	}
+
+	// x1r = [q15mulr(x1, x2) per lane], with overflowed lanes holding 0x8000.
+	c.assembler.CompileRegisterToRegister(amd64.PMULHRSW, x2r, x1r)
+	// tmp = [0xffff if the lane overflowed to 0x8000, 0 otherwise].
+	c.assembler.CompileRegisterToRegister(amd64.PCMPEQW, x1r, tmp)
+	// Flip the overflowed lanes from 0x8000 to the saturated 0x7fff.
+	c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, x1r)
+
+	c.locationStack.markRegisterUnused(x2r)
+	c.pushVectorRuntimeValueLocationOnRegister(x1r)
+	return nil
+}
+
+var (
+	// allOnesI8x16 holds 0x01 in each byte lane.
+	allOnesI8x16 = [16]byte{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1}
+	// allOnesI16x8 holds 0x0001 in each 16-bit lane.
+	allOnesI16x8 = [16]byte{0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0}
+
+	// extAddPairwiseI16x8uMask's first 16 bytes hold 0x8000 in each 16-bit lane (the sign-flip mask),
+	// and its last 16 bytes hold 0x00010000 in each 32-bit lane.
+	extAddPairwiseI16x8uMask = [16 * 2]byte{
+		0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+		0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00,
+	}
+)
+
+// compileV128ExtAddPairwise implements compiler.compileV128ExtAddPairwise for amd64.
+func (c *amd64Compiler) compileV128ExtAddPairwise(o *wazeroir.OperationV128ExtAddPairwise) error {
+	v := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(v); err != nil {
+		return err
+	}
+	vr := v.register
+
+	switch o.OriginShape {
+	case wazeroir.ShapeI8x16:
+		allOnesReg, err := c.allocateRegister(registerTypeVector)
+		if err != nil {
+			return err
+		}
+
+		if err = c.assembler.CompileLoadStaticConstToRegister(amd64.MOVDQU,
+			allOnesI8x16[:], allOnesReg); err != nil {
+			return err
+		}
+
+		var result asm.Register
+		// See https://www.felixcloutier.com/x86/pmaddubsw for detail.
+		if o.Signed {
+			// Interpret vr's bytes as signed, multiply them by one and add adjacent
+			// products pairwise, which results in a pairwise signed extadd.
+			c.assembler.CompileRegisterToRegister(amd64.PMADDUBSW, vr, allOnesReg)
+			result = allOnesReg
+		} else {
+			// Interpret allOnesReg (all ones) as the signed-byte operand, so vr's bytes
+			// are treated as unsigned and the multiply-add is an unsigned extadd.
+			c.assembler.CompileRegisterToRegister(amd64.PMADDUBSW, allOnesReg, vr)
+			result = vr
+		}
+
+		if result != vr {
+			c.locationStack.markRegisterUnused(vr)
+		}
+		c.pushVectorRuntimeValueLocationOnRegister(result)
+	case wazeroir.ShapeI16x8:
+		tmp, err := c.allocateRegister(registerTypeVector)
+		if err != nil {
+			return err
+		}
+
+		if o.Signed {
+			// See https://www.felixcloutier.com/x86/pmaddwd
+			if err = c.assembler.CompileLoadStaticConstToRegister(amd64.MOVDQU, allOnesI16x8[:], tmp); err != nil {
+				return err
+			}
+
+			c.assembler.CompileRegisterToRegister(amd64.PMADDWD, tmp, vr)
+			c.pushVectorRuntimeValueLocationOnRegister(vr)
+		} else {
+			if err = c.assembler.CompileLoadStaticConstToRegister(amd64.MOVDQU, extAddPairwiseI16x8uMask[:16], tmp); err != nil {
+				return err
+			}
+
+			// Flip the sign bits on vr.
+			//
+			// Assuming that vr = [w1, ..., w8], we now have
+			// 	vr[i] = int16(wi - 0x8000) for i = 1..8
+			// since flipping the sign bit subtracts 0x8000 in two's complement.
+			c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, vr)
+
+			if err = c.assembler.CompileLoadStaticConstToRegister(amd64.MOVDQU, allOnesI16x8[:], tmp); err != nil {
+				return err
+			}
+
+			// For i = 1..4 (as this results in i32x4 lanes), we now have
+			// 	vr[i] = int32((wn - 0x8000) + (w(n+1) - 0x8000)) = int32(wn + w(n+1) - 0x10000)
+			c.assembler.CompileRegisterToRegister(amd64.PMADDWD, tmp, vr)
+
+			// tmp[i] = [0x00, 0x00, 0x01, 0x00] = int32(0x10000) (= math.MaxUint16+1)
+			if err = c.assembler.CompileLoadStaticConstToRegister(amd64.MOVDQU, extAddPairwiseI16x8uMask[16:], tmp); err != nil {
+				return err
+			}
+
+			// vr[i] = int32(wn + w(n+1) - 0x10000) + 0x10000 = int32(wn + w(n+1)) = uint32(wn + w(n+1)).
+			c.assembler.CompileRegisterToRegister(amd64.PADDD, tmp, vr)
+			c.pushVectorRuntimeValueLocationOnRegister(vr)
+		}
+	}
+	return nil
+}
+
+// compileV128FloatPromote implements compiler.compileV128FloatPromote for amd64.
+func (c *amd64Compiler) compileV128FloatPromote(*wazeroir.OperationV128FloatPromote) error {
+	v := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(v); err != nil {
+		return err
+	}
+	vr := v.register
+
+	c.assembler.CompileRegisterToRegister(amd64.CVTPS2PD, vr, vr)
+	c.pushVectorRuntimeValueLocationOnRegister(vr)
+	return nil
+}
+
+// compileV128FloatDemote implements compiler.compileV128FloatDemote for amd64.
+func (c *amd64Compiler) compileV128FloatDemote(*wazeroir.OperationV128FloatDemote) error {
+	v := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(v); err != nil {
+		return err
+	}
+	vr := v.register
+
+	c.assembler.CompileRegisterToRegister(amd64.CVTPD2PS, vr, vr)
+	c.pushVectorRuntimeValueLocationOnRegister(vr)
+	return nil
+}
+
+// compileV128Dot implements compiler.compileV128Dot for amd64.
+func (c *amd64Compiler) compileV128Dot(*wazeroir.OperationV128Dot) error {
+	x2 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x2); err != nil {
+		return err
+	}
+
+	x1 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x1); err != nil {
+		return err
+	}
+
+	// PMADDWD's multiply-and-add of adjacent signed 16-bit lanes is exactly i32x4.dot_i16x8_s.
+	c.assembler.CompileRegisterToRegister(amd64.PMADDWD, x2.register, x1.register)
+
+	c.locationStack.markRegisterUnused(x2.register)
+	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
+	return nil
+}
+
+// fConvertFromIMask holds the pattern 0x43300000 (the high double word of float64 0x1.0p52)
+// in its first two 32-bit lanes.
+var fConvertFromIMask = [16]byte{
+	0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+}
+
+// compileV128FConvertFromI implements compiler.compileV128FConvertFromI for amd64.
+func (c *amd64Compiler) compileV128FConvertFromI(o *wazeroir.OperationV128FConvertFromI) error {
+	v := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(v); err != nil {
+		return err
+	}
+	vr := v.register
+
+	switch o.DestinationShape {
+	case wazeroir.ShapeF32x4:
+		if o.Signed {
+			c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PS, vr, vr)
+		} else {
+			tmp, err := c.allocateRegister(registerTypeVector)
+			if err != nil {
+				return err
+			}
+
+			// Copy the value into tmp.
+			c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp)
+
+			// Clear the higher 10 bits of each lane in tmp, keeping the lower 22 bits.
+			c.assembler.CompileConstToRegister(amd64.PSLLD, 0xa, tmp)
+			c.assembler.CompileConstToRegister(amd64.PSRLD, 0xa, tmp)
+
+			// Subtract tmp (the lower 22 bits) from vr == clear the lower 22 bits of vr.
+			c.assembler.CompileRegisterToRegister(amd64.PSUBD, tmp, vr)
+
+			// Convert the lower 22 bits in tmp (exact, as they fit in float32's 24-bit mantissa).
+			c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PS, tmp, tmp)
+
+			// Shift right by one and convert vr, giving the halved conversion result of the higher bits
+			// (halving first keeps the value within the signed conversion range).
+			c.assembler.CompileConstToRegister(amd64.PSRLD, 1, vr)
+			c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PS, vr, vr)
+
+			// Double the converted halved higher bits.
+			c.assembler.CompileRegisterToRegister(amd64.ADDPS, vr, vr)
+
+			// Get the conversion result by adding tmp (holding the lower 22-bit conversion) into vr.
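+			// (In scalar terms, this computes float32(x) as
+			// 	float32(x & 0x3fffff) + 2*float32((x &^ 0x3fffff) >> 1)
+			// with a single rounding at this final addition.)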
+			c.assembler.CompileRegisterToRegister(amd64.ADDPS, tmp, vr)
+		}
+	case wazeroir.ShapeF64x2:
+		if o.Signed {
+			c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PD, vr, vr)
+		} else {
+			tmp, err := c.allocateRegister(registerTypeVector)
+			if err != nil {
+				return err
+			}
+
+			// tmp = [0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]
+			if err = c.assembler.CompileLoadStaticConstToRegister(amd64.MOVDQU, fConvertFromIMask[:16], tmp); err != nil {
+				return err
+			}
+
+			// Given that we have vr = [d1, d2, d3, d4], this results in
+			// 	vr = [d1, [0x00, 0x00, 0x30, 0x43], d2, [0x00, 0x00, 0x30, 0x43]]
+			// 	   = [float64(uint32(d1)) + 0x1.0p52, float64(uint32(d2)) + 0x1.0p52]
+			// 	     ^See https://stackoverflow.com/questions/13269523/can-all-32-bit-ints-be-exactly-represented-as-a-double
+			c.assembler.CompileRegisterToRegister(amd64.UNPCKLPS, tmp, vr)
+
+			// tmp = [float64(0x1.0p52), float64(0x1.0p52)]
+			if err = c.assembler.CompileLoadStaticConstToRegister(amd64.MOVDQU, twop52[:], tmp); err != nil {
+				return err
+			}
+
+			// Now we get the result as
+			// 	vr = [float64(uint32(d1)), float64(uint32(d2))]
+			// because the following always holds exactly (a uint32 fits in the 52-bit mantissa):
+			// 	(0x1.0p52 + float64(uint32(d))) - 0x1.0p52 = float64(uint32(d))
+			c.assembler.CompileRegisterToRegister(amd64.SUBPD, tmp, vr)
+		}
+	}
+
+	c.pushVectorRuntimeValueLocationOnRegister(vr)
+	return nil
+}
+
+// compileV128Narrow implements compiler.compileV128Narrow for amd64.
+func (c *amd64Compiler) compileV128Narrow(o *wazeroir.OperationV128Narrow) error {
+	x2 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x2); err != nil {
+		return err
+	}
+
+	x1 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x1); err != nil {
+		return err
+	}
+
+	var narrow asm.Instruction
+	switch o.OriginShape {
+	case wazeroir.ShapeI16x8:
+		if o.Signed {
+			narrow = amd64.PACKSSWB
+		} else {
+			narrow = amd64.PACKUSWB
+		}
+	case wazeroir.ShapeI32x4:
+		if o.Signed {
+			narrow = amd64.PACKSSDW
+		} else {
+			narrow = amd64.PACKUSDW
+		}
+	}
+	c.assembler.CompileRegisterToRegister(narrow, x2.register, x1.register)
+
+	c.locationStack.markRegisterUnused(x2.register)
+	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
+	return nil
+}
+
+var (
+	// i32sMaxOnF64x2 holds math.MaxInt32(=2147483647.0) on two f64 lanes.
+	i32sMaxOnF64x2 = [16]byte{
+		0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
+		0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
+	}
+
+	// i32uMaxOnF64x2 holds math.MaxUint32(=4294967295.0) on two f64 lanes.
+	i32uMaxOnF64x2 = [16]byte{
+		0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
+		0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
+	}
+
+	// twop52 holds two float64(0x1.0p52) on two f64 lanes. 0x1.0p52 is special in the sense that
+	// with this exponent, the low 32 bits of the mantissa represent a corresponding uint32 number
+	// exactly, so additions and subtractions of such small integers keep exactly the same 32-bit
+	// integer bit representation in the mantissa.
+	//
+	// Note: the name twop52 is common across various compiler ecosystems.
+	// 	E.g. https://github.com/llvm/llvm-project/blob/92ab024f81e5b64e258b7c3baaf213c7c26fcf40/compiler-rt/lib/builtins/floatdidf.c#L28
+	// 	E.g. https://opensource.apple.com/source/clang/clang-425.0.24/src/projects/compiler-rt/lib/floatdidf.c.auto.html
+	twop52 = [16]byte{
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
+	}
+)
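+
+// For reference, the scalar form of the twop52 trick (illustrative only, not part of this patch):
+//
+//	func f64FromU32(x uint32) float64 {
+//		// OR-ing x into the low mantissa bits of 0x1.0p52 (bit pattern 0x4330000000000000)
+//		// yields exactly 0x1.0p52 + float64(x), so subtracting 0x1.0p52 recovers float64(x)
+//		// without any rounding.
+//		return math.Float64frombits(0x4330000000000000|uint64(x)) - 0x1p52
+//	}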
+
+// compileV128ITruncSatFromF implements compiler.compileV128ITruncSatFromF for amd64.
+func (c *amd64Compiler) compileV128ITruncSatFromF(o *wazeroir.OperationV128ITruncSatFromF) error {
+	v := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(v); err != nil {
+		return err
+	}
+	vr := v.register
+
+	tmp, err := c.allocateRegister(registerTypeVector)
+	if err != nil {
+		return err
+	}
+
+	c.locationStack.markRegisterUsed(tmp)
+
+	switch o.OriginShape {
+	case wazeroir.ShapeF32x4:
+		if o.Signed {
+			// Copy the value into tmp.
+			c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp)
+
+			// Assuming we have vr = [v1, v2, v3, v4].
+			//
+			// Set all bits on tmp if the lane is not NaN.
+			// 	tmp[i] = 0xffffffff if vi != NaN
+			// 	       = 0          if vi == NaN
+			c.assembler.CompileRegisterToRegister(amd64.CMPEQPS, tmp, tmp)
+
+			// Clear the NaN lanes on vr, meaning that
+			// 	vr[i] = vi if vi != NaN
+			// 	      = 0  if vi == NaN
+			c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmp, vr)
+
+			// 	tmp[i] = ^vi if vi != NaN
+			// 	       = 0   if vi == NaN
+			// which means that tmp[i] & 0x80000000 != 0 if and only if vi's sign bit is clear (vi is positive or +0).
+			c.assembler.CompileRegisterToRegister(amd64.PXOR, vr, tmp)
+
+			// 	vr[i] = int32(vi)  if vi != NaN and vr is not overflowing
+			// 	      = 0x80000000 if vi != NaN and vr is overflowing (see https://www.felixcloutier.com/x86/cvttps2dq)
+			// 	      = 0          if vi == NaN
+			c.assembler.CompileRegisterToRegister(amd64.CVTTPS2DQ, vr, vr)
+
+			// Below, we have to convert 0x80000000 into 0x7FFFFFFF for the positive overflowing lanes.
+			//
+			// After this, tmp[i] & 0x80000000 != 0 if and only if vi is positive and overflowed the conversion.
+			c.assembler.CompileRegisterToRegister(amd64.PAND, vr, tmp)
+
+			// Arithmetic right shift tmp by 31, meaning that we have
+			// 	tmp[i] = 0xffffffff if vi is a positive overflow, 0 otherwise.
+			c.assembler.CompileConstToRegister(amd64.PSRAD, 0x1f, tmp)
+
+			// Flip 0x80000000 into 0x7fffffff on the positive overflowing lanes, keeping the rest intact.
+			c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, vr)
+		} else {
+			tmp2, err := c.allocateRegister(registerTypeVector)
+			if err != nil {
+				return err
+			}
+
+			// See https://github.com/bytecodealliance/wasmtime/pull/2440
+			// Note: even v8 doesn't seem to support i32x4.trunc_sat_f32x4_u natively.
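+			// The sequence below clamps negative and NaN lanes to zero via MAXPS with zero,
+			// truncates lanes below 2^31 directly, handles lanes in [2^31, 2^32) by
+			// truncating (v - 2^31) and adding the 0x80000000 bias back, and saturates
+			// lanes at or above 2^32 to 0xffffffff.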
+			c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp)
+			c.assembler.CompileRegisterToRegister(amd64.MAXPS, tmp, vr)
+			c.assembler.CompileRegisterToRegister(amd64.PCMPEQD, tmp, tmp)
+			c.assembler.CompileConstToRegister(amd64.PSRLD, 0x1, tmp)
+			c.assembler.CompileRegisterToRegister(amd64.CVTDQ2PS, tmp, tmp)
+			c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp2)
+			c.assembler.CompileRegisterToRegister(amd64.CVTTPS2DQ, vr, vr)
+			c.assembler.CompileRegisterToRegister(amd64.SUBPS, tmp, tmp2)
+			c.assembler.CompileRegisterToRegisterWithArg(amd64.CMPPS, tmp2, tmp, 0x2) // == CMPLEPS
+			c.assembler.CompileRegisterToRegister(amd64.CVTTPS2DQ, tmp2, tmp2)
+			c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp2)
+			c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp)
+			c.assembler.CompileRegisterToRegister(amd64.PMAXSD, tmp, tmp2)
+			c.assembler.CompileRegisterToRegister(amd64.PADDD, tmp2, vr)
+		}
+	case wazeroir.ShapeF64x2:
+		tmp2, err := c.allocateRegister(registerTypeVector)
+		if err != nil {
+			return err
+		}
+
+		if o.Signed {
+			// Copy the value into tmp.
+			c.assembler.CompileRegisterToRegister(amd64.MOVDQA, vr, tmp)
+
+			// Set all bits for the non-NaN lanes, zeros otherwise.
+			// I.e. tmp[i] = 0xffffffff_ffffffff if vi != NaN, 0 otherwise.
+			c.assembler.CompileRegisterToRegister(amd64.CMPEQPD, tmp, tmp)
+
+			// Load 2147483647.0 into each lane of tmp2.
+			if err = c.assembler.CompileLoadStaticConstToRegister(amd64.MOVUPD, i32sMaxOnF64x2[:], tmp2); err != nil {
+				return err
+			}
+
+			// tmp[i] = 2147483647.0 if vi != NaN, 0 otherwise.
+			c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmp2, tmp)
+
+			// MINPD returns the source operand whenever either input is NaN, so we have
+			// 	vr[i] = min(vi, 2147483647.0) if vi != NaN
+			// 	      = 0                     if vi == NaN
+			c.assembler.CompileRegisterToRegister(amd64.MINPD, tmp, vr)
+
+			c.assembler.CompileRegisterToRegister(amd64.CVTTPD2DQ, vr, vr)
+		} else {
+			// Clear all bits on tmp.
+			c.assembler.CompileRegisterToRegister(amd64.PXOR, tmp, tmp)
+
+			// 	vr[i] = vi if vi != NaN && vi > 0
+			// 	      = 0  if vi == NaN || vi <= 0
+			c.assembler.CompileRegisterToRegister(amd64.MAXPD, tmp, vr)
+
+			// tmp2[i] = float64(math.MaxUint32) = 4294967295.0
+			if err = c.assembler.CompileLoadStaticConstToRegister(amd64.MOVUPD, i32uMaxOnF64x2[:], tmp2); err != nil {
+				return err
+			}
+
+			// 	vr[i] = min(vi, 4294967295.0) if vi != NaN && vi > 0
+			// 	      = 0                      if vi == NaN || vi <= 0
+			c.assembler.CompileRegisterToRegister(amd64.MINPD, tmp2, vr)
+
+			// Truncate the floating point values (round toward zero).
+			c.assembler.CompileRegisterToRegisterWithArg(amd64.ROUNDPD, vr, vr, 0x3)
+
+			// tmp2[i] = float64(0x1.0p52)
+			if err = c.assembler.CompileLoadStaticConstToRegister(amd64.MOVUPD, twop52[:], tmp2); err != nil {
+				return err
+			}
+
+			// 	vr[i] = float64(0x1.0p52) + float64(uint32(vi)) if vi != NaN && 0 < vi <= math.MaxUint32
+			// 	      = float64(0x1.0p52)                        otherwise (its lower 32 mantissa bits are zero)
+			//
+			// This means that vr[i] holds exactly the same bits as uint32(vi) in its lower 32-bits.
+			c.assembler.CompileRegisterToRegister(amd64.ADDPD, tmp2, vr)
+
+			// At this point, viewed as 32-bit lanes, we have
+			// 	vr = [uint32(v0), 0x43300000, uint32(v1), 0x43300000]
+			// 	tmp = [0, 0, 0, 0]
+			// Therefore, SHUFPS with 0b00_00_10_00 results in
+			// 	vr = [vr[00], vr[10], tmp[00], tmp[00]] = [vr[00], vr[10], 0, 0]
+			// meaning that for i = 0 and 1, we have
+			// 	vr[i] = uint32(min(vi, math.MaxUint32)) if vi != NaN && vi > 0
+			// 	      = 0 otherwise.
+			c.assembler.CompileRegisterToRegisterWithArg(amd64.SHUFPS, tmp, vr, 0b00_00_10_00)
+		}
+	}
+
+	c.locationStack.markRegisterUnused(tmp)
+	c.pushVectorRuntimeValueLocationOnRegister(vr)
+	return nil
+}
diff --git a/internal/engine/compiler/impl_vec_arm64.go b/internal/engine/compiler/impl_vec_arm64.go
index ee1d368cdd9..c406972ceb2 100644
--- a/internal/engine/compiler/impl_vec_arm64.go
+++ b/internal/engine/compiler/impl_vec_arm64.go
@@ -135,7 +135,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro
 	}
 
 	switch o.Type {
-	case wazeroir.LoadV128Type128:
+	case wazeroir.V128LoadType128:
 		offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 16)
 		if err != nil {
 			return err
@@ -143,7 +143,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro
 		c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV,
 			arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementQ,
 		)
-	case wazeroir.LoadV128Type8x8s:
+	case wazeroir.V128LoadType8x8s:
 		offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8)
 		if err != nil {
 			return err
@@ -153,7 +153,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro
 		)
 		c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLL, result, result,
 			arm64.VectorArrangement8B, arm64.VectorIndexNone, arm64.VectorIndexNone)
-	case wazeroir.LoadV128Type8x8u:
+	case wazeroir.V128LoadType8x8u:
 		offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8)
 		if err != nil {
 			return err
@@ -163,7 +163,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro
 		)
 		c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLL, result, result,
 			arm64.VectorArrangement8B, arm64.VectorIndexNone, arm64.VectorIndexNone)
-	case wazeroir.LoadV128Type16x4s:
+	case wazeroir.V128LoadType16x4s:
 		offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8)
 		if err != nil {
 			return err
@@ -173,7 +173,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro
 		)
 		c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLL, result, result,
 			arm64.VectorArrangement4H, arm64.VectorIndexNone, arm64.VectorIndexNone)
-	case wazeroir.LoadV128Type16x4u:
+	case wazeroir.V128LoadType16x4u:
 		offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8)
 		if err != nil {
 			return err
@@ -183,7 +183,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro
 		)
 		c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLL, result, result,
 			arm64.VectorArrangement4H, arm64.VectorIndexNone, arm64.VectorIndexNone)
-	case wazeroir.LoadV128Type32x2s:
+	case wazeroir.V128LoadType32x2s:
 		offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8)
 		if err != nil {
 			return err
@@ -193,7 +193,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro
 		)
 		c.assembler.CompileVectorRegisterToVectorRegister(arm64.SSHLL, result, result,
 			arm64.VectorArrangement2S, arm64.VectorIndexNone, arm64.VectorIndexNone)
-	case wazeroir.LoadV128Type32x2u:
+	case wazeroir.V128LoadType32x2u:
 		offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8)
 		if err != nil {
 			return err
@@ -203,35 +203,35 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro
 		)
 		c.assembler.CompileVectorRegisterToVectorRegister(arm64.USHLL, result, result,
 			arm64.VectorArrangement2S, arm64.VectorIndexNone, arm64.VectorIndexNone)
-	case wazeroir.LoadV128Type8Splat:
+	case wazeroir.V128LoadType8Splat:
 		offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 1)
 		if err != nil {
 			return err
 		}
 		c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, offset)
 		c.assembler.CompileMemoryToVectorRegister(arm64.LD1R, offset, 0, result, arm64.VectorArrangement16B)
-	case wazeroir.LoadV128Type16Splat:
+	case wazeroir.V128LoadType16Splat:
 		offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 2)
 		if err != nil {
 			return err
 		}
 		c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, offset)
 		c.assembler.CompileMemoryToVectorRegister(arm64.LD1R, offset, 0, result, arm64.VectorArrangement8H)
-	case wazeroir.LoadV128Type32Splat:
+	case wazeroir.V128LoadType32Splat:
 		offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 4)
 		if err != nil {
 			return err
 		}
 		c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, offset)
 		c.assembler.CompileMemoryToVectorRegister(arm64.LD1R, offset, 0, result, arm64.VectorArrangement4S)
-	case wazeroir.LoadV128Type64Splat:
+	case wazeroir.V128LoadType64Splat:
 		offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 8)
 		if err != nil {
 			return err
 		}
 		c.assembler.CompileRegisterToRegister(arm64.ADD, arm64ReservedRegisterForMemory, offset)
 		c.assembler.CompileMemoryToVectorRegister(arm64.LD1R, offset, 0, result, arm64.VectorArrangement2D)
-	case wazeroir.LoadV128Type32zero:
+	case wazeroir.V128LoadType32zero:
 		offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 16)
 		if err != nil {
 			return err
@@ -239,7 +239,7 @@ func (c *arm64Compiler) compileV128Load(o *wazeroir.OperationV128Load) (err erro
 		c.assembler.CompileMemoryWithRegisterOffsetToVectorRegister(arm64.VMOV,
 			arm64ReservedRegisterForMemory, offset, result, arm64.VectorArrangementS,
 		)
-	case wazeroir.LoadV128Type64zero:
+	case wazeroir.V128LoadType64zero:
 		offset, err := c.compileMemoryAccessOffsetSetup(o.Arg.Offset, 16)
 		if err != nil {
 			return err
@@ -702,3 +702,138 @@ func (c *arm64Compiler) compileV128Shl(o *wazeroir.OperationV128Shl) error {
 func (c *arm64Compiler) compileV128Cmp(o *wazeroir.OperationV128Cmp) error {
 	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
 }
+
+// compileV128AddSat implements compiler.compileV128AddSat for arm64.
+func (c *arm64Compiler) compileV128AddSat(o *wazeroir.OperationV128AddSat) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128SubSat implements compiler.compileV128SubSat for arm64.
+func (c *arm64Compiler) compileV128SubSat(o *wazeroir.OperationV128SubSat) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128Mul implements compiler.compileV128Mul for arm64.
+func (c *arm64Compiler) compileV128Mul(o *wazeroir.OperationV128Mul) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128Div implements compiler.compileV128Div for arm64.
+func (c *arm64Compiler) compileV128Div(o *wazeroir.OperationV128Div) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128Neg implements compiler.compileV128Neg for arm64.
+func (c *arm64Compiler) compileV128Neg(o *wazeroir.OperationV128Neg) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128Sqrt implements compiler.compileV128Sqrt for arm64.
+func (c *arm64Compiler) compileV128Sqrt(o *wazeroir.OperationV128Sqrt) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128Abs implements compiler.compileV128Abs for arm64.
+func (c *arm64Compiler) compileV128Abs(o *wazeroir.OperationV128Abs) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128Popcnt implements compiler.compileV128Popcnt for arm64.
+func (c *arm64Compiler) compileV128Popcnt(o *wazeroir.OperationV128Popcnt) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128Min implements compiler.compileV128Min for arm64.
+func (c *arm64Compiler) compileV128Min(o *wazeroir.OperationV128Min) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128Max implements compiler.compileV128Max for arm64.
+func (c *arm64Compiler) compileV128Max(o *wazeroir.OperationV128Max) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128AvgrU implements compiler.compileV128AvgrU for arm64.
+func (c *arm64Compiler) compileV128AvgrU(o *wazeroir.OperationV128AvgrU) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128Pmin implements compiler.compileV128Pmin for arm64.
+func (c *arm64Compiler) compileV128Pmin(o *wazeroir.OperationV128Pmin) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128Pmax implements compiler.compileV128Pmax for arm64.
+func (c *arm64Compiler) compileV128Pmax(o *wazeroir.OperationV128Pmax) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128Ceil implements compiler.compileV128Ceil for arm64.
+func (c *arm64Compiler) compileV128Ceil(o *wazeroir.OperationV128Ceil) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128Floor implements compiler.compileV128Floor for arm64.
+func (c *arm64Compiler) compileV128Floor(o *wazeroir.OperationV128Floor) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128Trunc implements compiler.compileV128Trunc for arm64.
+func (c *arm64Compiler) compileV128Trunc(o *wazeroir.OperationV128Trunc) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128Nearest implements compiler.compileV128Nearest for arm64.
+func (c *arm64Compiler) compileV128Nearest(o *wazeroir.OperationV128Nearest) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128Extend implements compiler.compileV128Extend for arm64.
+func (c *arm64Compiler) compileV128Extend(o *wazeroir.OperationV128Extend) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128ExtMul implements compiler.compileV128ExtMul for arm64.
+func (c *arm64Compiler) compileV128ExtMul(o *wazeroir.OperationV128ExtMul) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128Q15mulrSatS implements compiler.compileV128Q15mulrSatS for arm64.
+func (c *arm64Compiler) compileV128Q15mulrSatS(o *wazeroir.OperationV128Q15mulrSatS) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128ExtAddPairwise implements compiler.compileV128ExtAddPairwise for arm64.
+func (c *arm64Compiler) compileV128ExtAddPairwise(o *wazeroir.OperationV128ExtAddPairwise) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128FloatPromote implements compiler.compileV128FloatPromote for arm64.
+func (c *arm64Compiler) compileV128FloatPromote(o *wazeroir.OperationV128FloatPromote) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128FloatDemote implements compiler.compileV128FloatDemote for arm64.
+func (c *arm64Compiler) compileV128FloatDemote(o *wazeroir.OperationV128FloatDemote) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128FConvertFromI implements compiler.compileV128FConvertFromI for arm64.
+func (c *arm64Compiler) compileV128FConvertFromI(o *wazeroir.OperationV128FConvertFromI) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128Dot implements compiler.compileV128Dot for arm64.
+func (c *arm64Compiler) compileV128Dot(o *wazeroir.OperationV128Dot) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128Narrow implements compiler.compileV128Narrow for arm64.
+func (c *arm64Compiler) compileV128Narrow(o *wazeroir.OperationV128Narrow) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
+
+// compileV128ITruncSatFromF implements compiler.compileV128ITruncSatFromF for arm64.
+func (c *arm64Compiler) compileV128ITruncSatFromF(o *wazeroir.OperationV128ITruncSatFromF) error {
+	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+}
diff --git a/internal/engine/interpreter/interpreter.go b/internal/engine/interpreter/interpreter.go
index 75dfd41a33b..38b800f08da 100644
--- a/internal/engine/interpreter/interpreter.go
+++ b/internal/engine/interpreter/interpreter.go
@@ -90,7 +90,7 @@ type callEngine struct {
 	frames []*callFrame
 }
 
-func (me *moduleEngine) newCallEngine() *callEngine {
+func (e *moduleEngine) newCallEngine() *callEngine {
 	return &callEngine{}
 }
 
@@ -276,9 +276,9 @@ func (e *engine) NewModuleEngine(name string, module *wasm.Module, importedFunct
 			return me, wasm.ErrElementOffsetOutOfBounds
 		}
 
-		for i, funcindex := range init.FunctionIndexes {
-			if funcindex != nil {
-				references[init.Offset+uint32(i)] = uintptr(unsafe.Pointer(me.functions[*funcindex]))
+		for i, fnIndex := range init.FunctionIndexes {
+			if fnIndex != nil {
+				references[init.Offset+uint32(i)] = uintptr(unsafe.Pointer(me.functions[*fnIndex]))
 			}
 		}
 	}
@@ -430,12 +430,10 @@ func (e *engine) lowerIR(ir *wazeroir.CompilationResult) (*code, error) {
 			op.us[0] = uint64(o.Arg.Alignment)
 			op.us[1] = uint64(o.Arg.Offset)
 		case *wazeroir.OperationStore8:
-			op.b1 = byte(o.Type)
 			op.us = make([]uint64, 2)
 			op.us[0] = uint64(o.Arg.Alignment)
 			op.us[1] = uint64(o.Arg.Offset)
 		case *wazeroir.OperationStore16:
-			op.b1 = byte(o.Type)
 			op.us = make([]uint64, 2)
 			op.us[0] = uint64(o.Arg.Alignment)
 			op.us[1] = uint64(o.Arg.Offset)
@@ -536,7 +534,7 @@ func (e *engine) lowerIR(ir *wazeroir.CompilationResult) (*code, error) {
 			*wazeroir.OperationF32ReinterpretFromI32,
 			*wazeroir.OperationF64ReinterpretFromI64:
 			// Reinterpret ops are essentially nop for engine mode
-			// because we treat all values as uint64, and the reinterpret is only used at module
+			// because we treat all values as uint64, and Reinterpret* is only used at module
 			// validation phase where we check type soundness of all the operations.
 			// So just eliminate the ops.
 			continue
@@ -645,6 +643,71 @@ func (e *engine) lowerIR(ir *wazeroir.CompilationResult) (*code, error) {
 			op.b1 = o.Shape
 		case *wazeroir.OperationV128Cmp:
 			op.b1 = o.Type
+		case *wazeroir.OperationV128AddSat:
+			op.b1 = o.Shape
+			op.b3 = o.Signed
+		case *wazeroir.OperationV128SubSat:
+			op.b1 = o.Shape
+			op.b3 = o.Signed
+		case *wazeroir.OperationV128Mul:
+			op.b1 = o.Shape
+		case *wazeroir.OperationV128Div:
+			op.b1 = o.Shape
+		case *wazeroir.OperationV128Neg:
+			op.b1 = o.Shape
+		case *wazeroir.OperationV128Sqrt:
+			op.b1 = o.Shape
+		case *wazeroir.OperationV128Abs:
+			op.b1 = o.Shape
+		case *wazeroir.OperationV128Popcnt:
+		case *wazeroir.OperationV128Min:
+			op.b1 = o.Shape
+			op.b3 = o.Signed
+		case *wazeroir.OperationV128Max:
+			op.b1 = o.Shape
+			op.b3 = o.Signed
+		case *wazeroir.OperationV128AvgrU:
+			op.b1 = o.Shape
+		case *wazeroir.OperationV128Pmin:
+			op.b1 = o.Shape
+		case *wazeroir.OperationV128Pmax:
+			op.b1 = o.Shape
+		case *wazeroir.OperationV128Ceil:
+			op.b1 = o.Shape
+		case *wazeroir.OperationV128Floor:
+			op.b1 = o.Shape
+		case *wazeroir.OperationV128Trunc:
+			op.b1 = o.Shape
+		case *wazeroir.OperationV128Nearest:
+			op.b1 = o.Shape
+		case *wazeroir.OperationV128Extend:
+			op.b1 = o.OriginShape
+			if o.Signed {
+				op.b2 = 1
+			}
+			op.b3 = o.UseLow
+		case *wazeroir.OperationV128ExtMul:
+			op.b1 = o.OriginShape
+			if o.Signed {
+				op.b2 = 1
+			}
+			op.b3 = o.UseLow
+		case *wazeroir.OperationV128Q15mulrSatS:
+		case *wazeroir.OperationV128ExtAddPairwise:
+			op.b1 = o.OriginShape
+			op.b3 = o.Signed
+		case *wazeroir.OperationV128FloatPromote:
+		case *wazeroir.OperationV128FloatDemote:
+		case *wazeroir.OperationV128FConvertFromI:
+			op.b1 = o.DestinationShape
+			op.b3 = o.Signed
+		case *wazeroir.OperationV128Dot:
+		case *wazeroir.OperationV128Narrow:
+			op.b1 = o.OriginShape
+			op.b3 = o.Signed
+		case *wazeroir.OperationV128ITruncSatFromF:
+			op.b1 = o.OriginShape
+			op.b3 = o.Signed
 		default:
 			panic(fmt.Errorf("BUG: unimplemented operation %s", op.kind.String()))
 		}
@@ -662,16 +725,16 @@ func (e *engine) lowerIR(ir *wazeroir.CompilationResult) (*code, error) {
 }
 
 // Name implements the same method as documented on wasm.ModuleEngine.
-func (me *moduleEngine) Name() string {
-	return me.name
+func (e *moduleEngine) Name() string {
+	return e.name
 }
 
 // CreateFuncElementInstance implements the same method as documented on wasm.ModuleEngine.
-func (me *moduleEngine) CreateFuncElementInstance(indexes []*wasm.Index) *wasm.ElementInstance {
+func (e *moduleEngine) CreateFuncElementInstance(indexes []*wasm.Index) *wasm.ElementInstance {
 	refs := make([]wasm.Reference, len(indexes))
 	for i, index := range indexes {
 		if index != nil {
-			refs[i] = uintptr(unsafe.Pointer(me.functions[*index]))
+			refs[i] = uintptr(unsafe.Pointer(e.functions[*index]))
 		}
 	}
 	return &wasm.ElementInstance{
@@ -681,27 +744,27 @@ func (me *moduleEngine) CreateFuncElementInstance(indexes []*wasm.Index) *wasm.E
 }
 
 // InitializeFuncrefGlobals implements the same method as documented on wasm.InitializeFuncrefGlobals.
-func (me *moduleEngine) InitializeFuncrefGlobals(globals []*wasm.GlobalInstance) {
+func (e *moduleEngine) InitializeFuncrefGlobals(globals []*wasm.GlobalInstance) {
 	for _, g := range globals {
 		if g.Type.ValType == wasm.ValueTypeFuncref {
 			if int64(g.Val) == wasm.GlobalInstanceNullFuncRefValue {
 				g.Val = 0 // Null funcref is expressed as zero.
 			} else {
 				// Lowers the stored function index into the interpreter specific function's opaque pointer.
-				g.Val = uint64(uintptr(unsafe.Pointer(me.functions[g.Val])))
+				g.Val = uint64(uintptr(unsafe.Pointer(e.functions[g.Val])))
 			}
 		}
 	}
 }
 
 // Call implements the same method as documented on wasm.ModuleEngine.
-func (me *moduleEngine) Call(ctx context.Context, m *wasm.CallContext, f *wasm.FunctionInstance, params ...uint64) (results []uint64, err error) {
+func (e *moduleEngine) Call(ctx context.Context, m *wasm.CallContext, f *wasm.FunctionInstance, params ...uint64) (results []uint64, err error) {
 	// Note: The input parameters are pre-validated, so a compiled function is only absent on close. Updates to
 	// code on close aren't locked, neither is this read.
-	compiled := me.functions[f.Idx]
+	compiled := e.functions[f.Idx]
 	if compiled == nil {
 		// Lazy check the cause as it could be because the module was already closed.
 		if err = m.FailIfClosed(); err == nil {
-			panic(fmt.Errorf("BUG: %s.codes[%d] was nil before close", me.name, f.Idx))
+			panic(fmt.Errorf("BUG: %s.codes[%d] was nil before close", e.name, f.Idx))
 		}
 		return
 	}
@@ -712,7 +775,7 @@ func (me *moduleEngine) Call(ctx context.Context, m *wasm.CallContext, f *wasm.F
 		return nil, fmt.Errorf("expected %d params, but passed %d", paramSignature, paramCount)
 	}
 
-	ce := me.newCallEngine()
+	ce := e.newCallEngine()
 	defer func() {
 		// If the module closed during the call, and the call didn't err for another reason, set an ExitError.
 		if err == nil {
@@ -1363,7 +1426,7 @@ func (ce *callEngine) callNativeFunc(ctx context.Context, callCtx *wasm.CallCont
 			} else { // Float64
 				const mask uint64 = 1 << 63
-				ce.pushValue(uint64(ce.popValue() &^ mask))
+				ce.pushValue(ce.popValue() &^ mask)
 			}
 			frame.pc++
 		case wazeroir.OperationKindNeg:
@@ -1384,7 +1447,7 @@ func (ce *callEngine) callNativeFunc(ctx context.Context, callCtx *wasm.CallCont
 				ce.pushValue(uint64(math.Float32bits(float32(v))))
 			} else { // Float64
-				v := math.Ceil(float64(math.Float64frombits(ce.popValue())))
+				v := math.Ceil(math.Float64frombits(ce.popValue()))
 				ce.pushValue(math.Float64bits(v))
 			}
 			frame.pc++
@@ -1395,7 +1458,7 @@ func (ce *callEngine) callNativeFunc(ctx context.Context, callCtx *wasm.CallCont
 				ce.pushValue(uint64(math.Float32bits(float32(v))))
 			} else { // Float64
-				v := math.Floor(float64(math.Float64frombits(ce.popValue())))
+				v := math.Floor(math.Float64frombits(ce.popValue()))
 				ce.pushValue(math.Float64bits(v))
 			}
 			frame.pc++
@@ -1406,7 +1469,7 @@ func (ce *callEngine) callNativeFunc(ctx context.Context, callCtx *wasm.CallCont
 				ce.pushValue(uint64(math.Float32bits(float32(v))))
 			} else { // Float64
-				v := math.Trunc(float64(math.Float64frombits(ce.popValue())))
+				v := math.Trunc(math.Float64frombits(ce.popValue()))
 				ce.pushValue(math.Float64bits(v))
 			}
 			frame.pc++
@@ -1428,7 +1491,7 @@ func (ce *callEngine) callNativeFunc(ctx context.Context, callCtx *wasm.CallCont
 				ce.pushValue(uint64(math.Float32bits(float32(v))))
 			} else { // Float64
-				v := math.Sqrt(float64(math.Float64frombits(ce.popValue())))
+				v := math.Sqrt(math.Float64frombits(ce.popValue()))
 				ce.pushValue(math.Float64bits(v))
 			}
 			frame.pc++
@@ -1883,8 +1946,8 @@ func (ce *callEngine) callNativeFunc(ctx context.Context, callCtx *wasm.CallCont
 			ce.pushValue(hi)
 			frame.pc++
 		case wazeroir.OperationKindV128Add:
-			xHigh, xLow := ce.popValue(), ce.popValue()
 			yHigh, yLow := ce.popValue(), ce.popValue()
+			xHigh, xLow := ce.popValue(), ce.popValue()
 			switch op.b1 {
 			case wazeroir.ShapeI8x16:
 				ce.pushValue(
@@ -1914,6 +1977,18 @@ func (ce *callEngine) callNativeFunc(ctx context.Context, callCtx *wasm.CallCont
 			case wazeroir.ShapeI64x2:
yLow) ce.pushValue(xHigh + yHigh) + case wazeroir.ShapeF32x4: + ce.pushValue( + uint64(math.Float32bits(math.Float32frombits(uint32(xLow))+math.Float32frombits(uint32(yLow)))) | + (uint64(math.Float32bits(math.Float32frombits(uint32(xLow>>32))+math.Float32frombits(uint32(yLow>>32)))) << 32), + ) + ce.pushValue( + uint64(math.Float32bits(math.Float32frombits(uint32(xHigh))+math.Float32frombits(uint32(yHigh)))) | + (uint64(math.Float32bits(math.Float32frombits(uint32(xHigh>>32))+math.Float32frombits(uint32(yHigh>>32)))) << 32), + ) + case wazeroir.ShapeF64x2: + ce.pushValue(math.Float64bits(math.Float64frombits(xLow) + math.Float64frombits(yLow))) + ce.pushValue(math.Float64bits(math.Float64frombits(xHigh) + math.Float64frombits(yHigh))) } frame.pc++ case wazeroir.OperationKindV128Sub: @@ -1948,12 +2023,24 @@ func (ce *callEngine) callNativeFunc(ctx context.Context, callCtx *wasm.CallCont case wazeroir.ShapeI64x2: ce.pushValue(xLow - yLow) ce.pushValue(xHigh - yHigh) + case wazeroir.ShapeF32x4: + ce.pushValue( + uint64(math.Float32bits(math.Float32frombits(uint32(xLow))-math.Float32frombits(uint32(yLow)))) | + (uint64(math.Float32bits(math.Float32frombits(uint32(xLow>>32))-math.Float32frombits(uint32(yLow>>32)))) << 32), + ) + ce.pushValue( + uint64(math.Float32bits(math.Float32frombits(uint32(xHigh))-math.Float32frombits(uint32(yHigh)))) | + (uint64(math.Float32bits(math.Float32frombits(uint32(xHigh>>32))-math.Float32frombits(uint32(yHigh>>32)))) << 32), + ) + case wazeroir.ShapeF64x2: + ce.pushValue(math.Float64bits(math.Float64frombits(xLow) - math.Float64frombits(yLow))) + ce.pushValue(math.Float64bits(math.Float64frombits(xHigh) - math.Float64frombits(yHigh))) } frame.pc++ case wazeroir.OperationKindV128Load: offset := ce.popMemoryOffset(op) switch op.b1 { - case wazeroir.LoadV128Type128: + case wazeroir.V128LoadType128: lo, ok := memoryInst.ReadUint64Le(ctx, offset) if !ok { panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) @@ -1964,7 +2051,7 @@ func (ce *callEngine) callNativeFunc(ctx context.Context, callCtx *wasm.CallCont panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) } ce.pushValue(hi) - case wazeroir.LoadV128Type8x8s: + case wazeroir.V128LoadType8x8s: data, ok := memoryInst.Read(ctx, offset, 8) if !ok { panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) @@ -1975,7 +2062,7 @@ func (ce *callEngine) callNativeFunc(ctx context.Context, callCtx *wasm.CallCont ce.pushValue( uint64(uint16(int8(data[7])))<<48 | uint64(uint16(int8(data[6])))<<32 | uint64(uint16(int8(data[5])))<<16 | uint64(uint16(int8(data[4]))), ) - case wazeroir.LoadV128Type8x8u: + case wazeroir.V128LoadType8x8u: data, ok := memoryInst.Read(ctx, offset, 8) if !ok { panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) @@ -1986,7 +2073,7 @@ func (ce *callEngine) callNativeFunc(ctx context.Context, callCtx *wasm.CallCont ce.pushValue( uint64(data[7])<<48 | uint64(data[6])<<32 | uint64(data[5])<<16 | uint64(data[4]), ) - case wazeroir.LoadV128Type16x4s: + case wazeroir.V128LoadType16x4s: data, ok := memoryInst.Read(ctx, offset, 8) if !ok { panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) @@ -1999,7 +2086,7 @@ func (ce *callEngine) callNativeFunc(ctx context.Context, callCtx *wasm.CallCont uint64(uint32(int16(binary.LittleEndian.Uint16(data[6:]))))<<32 | uint64(uint32(int16(binary.LittleEndian.Uint16(data[4:])))), ) - case wazeroir.LoadV128Type16x4u: + case wazeroir.V128LoadType16x4u: data, ok := memoryInst.Read(ctx, offset, 8) if !ok { panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) @@ -2010,21 +2097,21 
@@ func (ce *callEngine) callNativeFunc(ctx context.Context, callCtx *wasm.CallCont ce.pushValue( uint64(binary.LittleEndian.Uint16(data[6:]))<<32 | uint64(binary.LittleEndian.Uint16(data[4:])), ) - case wazeroir.LoadV128Type32x2s: + case wazeroir.V128LoadType32x2s: data, ok := memoryInst.Read(ctx, offset, 8) if !ok { panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) } ce.pushValue(uint64(int32(binary.LittleEndian.Uint32(data)))) ce.pushValue(uint64(int32(binary.LittleEndian.Uint32(data[4:])))) - case wazeroir.LoadV128Type32x2u: + case wazeroir.V128LoadType32x2u: data, ok := memoryInst.Read(ctx, offset, 8) if !ok { panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) } ce.pushValue(uint64(binary.LittleEndian.Uint32(data))) ce.pushValue(uint64(binary.LittleEndian.Uint32(data[4:]))) - case wazeroir.LoadV128Type8Splat: + case wazeroir.V128LoadType8Splat: v, ok := memoryInst.ReadByte(ctx, offset) if !ok { panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) @@ -2033,7 +2120,7 @@ func (ce *callEngine) callNativeFunc(ctx context.Context, callCtx *wasm.CallCont uint64(v)<<24 | uint64(v)<<16 | uint64(v)<<8 | uint64(v) ce.pushValue(v8) ce.pushValue(v8) - case wazeroir.LoadV128Type16Splat: + case wazeroir.V128LoadType16Splat: v, ok := memoryInst.ReadUint16Le(ctx, offset) if !ok { panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) @@ -2041,7 +2128,7 @@ func (ce *callEngine) callNativeFunc(ctx context.Context, callCtx *wasm.CallCont v4 := uint64(v)<<48 | uint64(v)<<32 | uint64(v)<<16 | uint64(v) ce.pushValue(v4) ce.pushValue(v4) - case wazeroir.LoadV128Type32Splat: + case wazeroir.V128LoadType32Splat: v, ok := memoryInst.ReadUint32Le(ctx, offset) if !ok { panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) @@ -2049,21 +2136,21 @@ func (ce *callEngine) callNativeFunc(ctx context.Context, callCtx *wasm.CallCont vv := uint64(v)<<32 | uint64(v) ce.pushValue(vv) ce.pushValue(vv) - case wazeroir.LoadV128Type64Splat: + case wazeroir.V128LoadType64Splat: lo, ok := memoryInst.ReadUint64Le(ctx, offset) if !ok { panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) } ce.pushValue(lo) ce.pushValue(lo) - case wazeroir.LoadV128Type32zero: + case wazeroir.V128LoadType32zero: lo, ok := memoryInst.ReadUint32Le(ctx, offset) if !ok { panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) } ce.pushValue(uint64(lo)) ce.pushValue(0) - case wazeroir.LoadV128Type64zero: + case wazeroir.V128LoadType64zero: lo, ok := memoryInst.ReadUint64Le(ctx, offset) if !ok { panic(wasmruntime.ErrRuntimeOutOfBoundsMemoryAccess) @@ -2924,9 +3011,1290 @@ func (ce *callEngine) callNativeFunc(ctx context.Context, callCtx *wasm.CallCont ce.pushValue(retLo) ce.pushValue(retHi) frame.pc++ - } - } - ce.popFrame() + case wazeroir.OperationKindV128AddSat: + x2hi, x2Lo := ce.popValue(), ce.popValue() + x1hi, x1Lo := ce.popValue(), ce.popValue() + + var retLo, retHi uint64 + switch op.b1 { + case wazeroir.ShapeI8x16: + for i := 0; i < 16; i++ { + var v, w byte + if i < 8 { + v, w = byte(x1Lo>>(i*8)), byte(x2Lo>>(i*8)) + } else { + v, w = byte(x1hi>>((i-8)*8)), byte(x2hi>>((i-8)*8)) + } + + var uv uint64 + if op.b3 { // signed + if subbed := int64(int8(v)) + int64(int8(w)); subbed < math.MinInt8 { + uv = uint64(byte(0x80)) + } else if subbed > math.MaxInt8 { + uv = uint64(byte(0x7f)) + } else { + uv = uint64(byte(int8(subbed))) + } + } else { + if subbed := int64(v) + int64(w); subbed < 0 { + uv = uint64(byte(0)) + } else if subbed > math.MaxUint8 { + uv = uint64(byte(0xff)) + } else { + uv = uint64(byte(subbed)) + } + } + + if i < 8 { + retLo |= uv 
<< (i * 8) + } else { + retHi |= uv << ((i - 8) * 8) + } + } + case wazeroir.ShapeI16x8: + for i := 0; i < 8; i++ { + var v, w uint16 + if i < 4 { + v, w = uint16(x1Lo>>(i*16)), uint16(x2Lo>>(i*16)) + } else { + v, w = uint16(x1hi>>((i-4)*16)), uint16(x2hi>>((i-4)*16)) + } + + var uv uint64 + if op.b3 { // signed + if added := int64(int16(v)) + int64(int16(w)); added < math.MinInt16 { + uv = uint64(uint16(0x8000)) + } else if added > math.MaxInt16 { + uv = uint64(uint16(0x7fff)) + } else { + uv = uint64(uint16(int16(added))) + } + } else { + if added := int64(v) + int64(w); added < 0 { + uv = uint64(uint16(0)) + } else if added > math.MaxUint16 { + uv = uint64(uint16(0xffff)) + } else { + uv = uint64(uint16(added)) + } + } + + if i < 4 { + retLo |= uv << (i * 16) + } else { + retHi |= uv << ((i - 4) * 16) + } + } + } + + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case wazeroir.OperationKindV128SubSat: + x2hi, x2Lo := ce.popValue(), ce.popValue() + x1hi, x1Lo := ce.popValue(), ce.popValue() + + var retLo, retHi uint64 + switch op.b1 { + case wazeroir.ShapeI8x16: + for i := 0; i < 16; i++ { + var v, w byte + if i < 8 { + v, w = byte(x1Lo>>(i*8)), byte(x2Lo>>(i*8)) + } else { + v, w = byte(x1hi>>((i-8)*8)), byte(x2hi>>((i-8)*8)) + } + + var uv uint64 + if op.b3 { // signed + if subbed := int64(int8(v)) - int64(int8(w)); subbed < math.MinInt8 { + uv = uint64(byte(0x80)) + } else if subbed > math.MaxInt8 { + uv = uint64(byte(0x7f)) + } else { + uv = uint64(byte(int8(subbed))) + } + } else { + if subbed := int64(v) - int64(w); subbed < 0 { + uv = uint64(byte(0)) + } else if subbed > math.MaxUint8 { + uv = uint64(byte(0xff)) + } else { + uv = uint64(byte(subbed)) + } + } + + if i < 8 { + retLo |= uv << (i * 8) + } else { + retHi |= uv << ((i - 8) * 8) + } + } + case wazeroir.ShapeI16x8: + for i := 0; i < 8; i++ { + var v, w uint16 + if i < 4 { + v, w = uint16(x1Lo>>(i*16)), uint16(x2Lo>>(i*16)) + } else { + v, w = uint16(x1hi>>((i-4)*16)), uint16(x2hi>>((i-4)*16)) + } + + var uv uint64 + if op.b3 { // signed + if subbed := int64(int16(v)) - int64(int16(w)); subbed < math.MinInt16 { + uv = uint64(uint16(0x8000)) + } else if subbed > math.MaxInt16 { + uv = uint64(uint16(0x7fff)) + } else { + uv = uint64(uint16(int16(subbed))) + } + } else { + if subbed := int64(v) - int64(w); subbed < 0 { + uv = uint64(uint16(0)) + } else if subbed > math.MaxUint16 { + uv = uint64(uint16(0xffff)) + } else { + uv = uint64(uint16(subbed)) + } + } + + if i < 4 { + retLo |= uv << (i * 16) + } else { + retHi |= uv << ((i - 4) * 16) + } + } + } + + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case wazeroir.OperationKindV128Mul: + x2hi, x2lo := ce.popValue(), ce.popValue() + x1hi, x1lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + switch op.b1 { + case wazeroir.ShapeI16x8: + retHi = uint64(uint16(x1hi)*uint16(x2hi)) | (uint64(uint16(x1hi>>16)*uint16(x2hi>>16)) << 16) | + (uint64(uint16(x1hi>>32)*uint16(x2hi>>32)) << 32) | (uint64(uint16(x1hi>>48)*uint16(x2hi>>48)) << 48) + retLo = uint64(uint16(x1lo)*uint16(x2lo)) | (uint64(uint16(x1lo>>16)*uint16(x2lo>>16)) << 16) | + (uint64(uint16(x1lo>>32)*uint16(x2lo>>32)) << 32) | (uint64(uint16(x1lo>>48)*uint16(x2lo>>48)) << 48) + case wazeroir.ShapeI32x4: + retHi = uint64(uint32(x1hi)*uint32(x2hi)) | (uint64(uint32(x1hi>>32)*uint32(x2hi>>32)) << 32) + retLo = uint64(uint32(x1lo)*uint32(x2lo)) | (uint64(uint32(x1lo>>32)*uint32(x2lo>>32)) << 32) + case wazeroir.ShapeI64x2: + retHi = x1hi * x2hi + retLo = x1lo * x2lo + case wazeroir.ShapeF32x4: 
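+ // Each 64-bit half packs two f32 lanes (bits [0,32) and [32,64)), so each half is multiplied lane-wise and the results repacked.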
+ retHi = uint64(math.Float32bits(math.Float32frombits(uint32(x1hi))*math.Float32frombits(uint32(x2hi)))) | + (uint64(math.Float32bits(math.Float32frombits(uint32(x1hi>>32))*math.Float32frombits(uint32(x2hi>>32)))) << 32) + retLo = uint64(math.Float32bits(math.Float32frombits(uint32(x1lo))*math.Float32frombits(uint32(x2lo)))) | + (uint64(math.Float32bits(math.Float32frombits(uint32(x1lo>>32))*math.Float32frombits(uint32(x2lo>>32)))) << 32) + case wazeroir.ShapeF64x2: + retHi = math.Float64bits(math.Float64frombits(x1hi) * math.Float64frombits(x2hi)) + retLo = math.Float64bits(math.Float64frombits(x1lo) * math.Float64frombits(x2lo)) + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case wazeroir.OperationKindV128Div: + x2hi, x2lo := ce.popValue(), ce.popValue() + x1hi, x1lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + if op.b1 == wazeroir.ShapeF64x2 { + retHi = math.Float64bits(math.Float64frombits(x1hi) / math.Float64frombits(x2hi)) + retLo = math.Float64bits(math.Float64frombits(x1lo) / math.Float64frombits(x2lo)) + } else { + retHi = uint64(math.Float32bits(math.Float32frombits(uint32(x1hi))/math.Float32frombits(uint32(x2hi)))) | + (uint64(math.Float32bits(math.Float32frombits(uint32(x1hi>>32))/math.Float32frombits(uint32(x2hi>>32)))) << 32) + retLo = uint64(math.Float32bits(math.Float32frombits(uint32(x1lo))/math.Float32frombits(uint32(x2lo)))) | + (uint64(math.Float32bits(math.Float32frombits(uint32(x1lo>>32))/math.Float32frombits(uint32(x2lo>>32)))) << 32) + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case wazeroir.OperationKindV128Neg: + hi, lo := ce.popValue(), ce.popValue() + switch op.b1 { + case wazeroir.ShapeI8x16: + lo = uint64(-byte(lo)) | (uint64(-byte(lo>>8)) << 8) | + (uint64(-byte(lo>>16)) << 16) | (uint64(-byte(lo>>24)) << 24) | + (uint64(-byte(lo>>32)) << 32) | (uint64(-byte(lo>>40)) << 40) | + (uint64(-byte(lo>>48)) << 48) | (uint64(-byte(lo>>56)) << 56) + hi = uint64(-byte(hi)) | (uint64(-byte(hi>>8)) << 8) | + (uint64(-byte(hi>>16)) << 16) | (uint64(-byte(hi>>24)) << 24) | + (uint64(-byte(hi>>32)) << 32) | (uint64(-byte(hi>>40)) << 40) | + (uint64(-byte(hi>>48)) << 48) | (uint64(-byte(hi>>56)) << 56) + case wazeroir.ShapeI16x8: + hi = uint64(-uint16(hi)) | (uint64(-uint16(hi>>16)) << 16) | + (uint64(-uint16(hi>>32)) << 32) | (uint64(-uint16(hi>>48)) << 48) + lo = uint64(-uint16(lo)) | (uint64(-uint16(lo>>16)) << 16) | + (uint64(-uint16(lo>>32)) << 32) | (uint64(-uint16(lo>>48)) << 48) + case wazeroir.ShapeI32x4: + hi = uint64(-uint32(hi)) | (uint64(-uint32(hi>>32)) << 32) + lo = uint64(-uint32(lo)) | (uint64(-uint32(lo>>32)) << 32) + case wazeroir.ShapeI64x2: + hi = -hi + lo = -lo + case wazeroir.ShapeF32x4: + hi = uint64(math.Float32bits(-math.Float32frombits(uint32(hi)))) | + (uint64(math.Float32bits(-math.Float32frombits(uint32(hi>>32)))) << 32) + lo = uint64(math.Float32bits(-math.Float32frombits(uint32(lo)))) | + (uint64(math.Float32bits(-math.Float32frombits(uint32(lo>>32)))) << 32) + case wazeroir.ShapeF64x2: + hi = math.Float64bits(-math.Float64frombits(hi)) + lo = math.Float64bits(-math.Float64frombits(lo)) + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case wazeroir.OperationKindV128Sqrt: + hi, lo := ce.popValue(), ce.popValue() + if op.b1 == wazeroir.ShapeF64x2 { + hi = math.Float64bits(math.Sqrt(math.Float64frombits(hi))) + lo = math.Float64bits(math.Sqrt(math.Float64frombits(lo))) + } else { + hi = uint64(math.Float32bits(float32(math.Sqrt(float64(math.Float32frombits(uint32(hi))))))) | + 
(uint64(math.Float32bits(float32(math.Sqrt(float64(math.Float32frombits(uint32(hi>>32))))))) << 32) + lo = uint64(math.Float32bits(float32(math.Sqrt(float64(math.Float32frombits(uint32(lo))))))) | + (uint64(math.Float32bits(float32(math.Sqrt(float64(math.Float32frombits(uint32(lo>>32))))))) << 32) + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case wazeroir.OperationKindV128Abs: + hi, lo := ce.popValue(), ce.popValue() + switch op.b1 { + case wazeroir.ShapeI8x16: + lo = uint64(i8Abs(byte(lo))) | (uint64(i8Abs(byte(lo>>8))) << 8) | + (uint64(i8Abs(byte(lo>>16))) << 16) | (uint64(i8Abs(byte(lo>>24))) << 24) | + (uint64(i8Abs(byte(lo>>32))) << 32) | (uint64(i8Abs(byte(lo>>40))) << 40) | + (uint64(i8Abs(byte(lo>>48))) << 48) | (uint64(i8Abs(byte(lo>>56))) << 56) + hi = uint64(i8Abs(byte(hi))) | (uint64(i8Abs(byte(hi>>8))) << 8) | + (uint64(i8Abs(byte(hi>>16))) << 16) | (uint64(i8Abs(byte(hi>>24))) << 24) | + (uint64(i8Abs(byte(hi>>32))) << 32) | (uint64(i8Abs(byte(hi>>40))) << 40) | + (uint64(i8Abs(byte(hi>>48))) << 48) | (uint64(i8Abs(byte(hi>>56))) << 56) + case wazeroir.ShapeI16x8: + hi = uint64(i16Abs(uint16(hi))) | (uint64(i16Abs(uint16(hi>>16))) << 16) | + (uint64(i16Abs(uint16(hi>>32))) << 32) | (uint64(i16Abs(uint16(hi>>48))) << 48) + lo = uint64(i16Abs(uint16(lo))) | (uint64(i16Abs(uint16(lo>>16))) << 16) | + (uint64(i16Abs(uint16(lo>>32))) << 32) | (uint64(i16Abs(uint16(lo>>48))) << 48) + case wazeroir.ShapeI32x4: + hi = uint64(i32Abs(uint32(hi))) | (uint64(i32Abs(uint32(hi>>32))) << 32) + lo = uint64(i32Abs(uint32(lo))) | (uint64(i32Abs(uint32(lo>>32))) << 32) + case wazeroir.ShapeI64x2: + if int64(hi) < 0 { + hi = -hi + } + if int64(lo) < 0 { + lo = -lo + } + case wazeroir.ShapeF32x4: + hi = hi &^ (1<<31 | 1<<63) + lo = lo &^ (1<<31 | 1<<63) + case wazeroir.ShapeF64x2: + hi = hi &^ (1 << 63) + lo = lo &^ (1 << 63) + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case wazeroir.OperationKindV128Popcnt: + hi, lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + for i := 0; i < 16; i++ { + var v byte + if i < 8 { + v = byte(lo >> (i * 8)) + } else { + v = byte(hi >> ((i - 8) * 8)) + } + + var cnt uint64 + for i := 0; i < 8; i++ { + if (v>>i)&0b1 != 0 { + cnt++ + } + } + + if i < 8 { + retLo |= cnt << (i * 8) + } else { + retHi |= cnt << ((i - 8) * 8) + } + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case wazeroir.OperationKindV128Min: + x2hi, x2lo := ce.popValue(), ce.popValue() + x1hi, x1lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + switch op.b1 { + case wazeroir.ShapeI8x16: + if op.b3 { // signed + retLo = uint64(i8MinS(uint8(x1lo>>8), uint8(x2lo>>8)))<<8 | uint64(i8MinS(uint8(x1lo), uint8(x2lo))) | + uint64(i8MinS(uint8(x1lo>>24), uint8(x2lo>>24)))<<24 | uint64(i8MinS(uint8(x1lo>>16), uint8(x2lo>>16)))<<16 | + uint64(i8MinS(uint8(x1lo>>40), uint8(x2lo>>40)))<<40 | uint64(i8MinS(uint8(x1lo>>32), uint8(x2lo>>32)))<<32 | + uint64(i8MinS(uint8(x1lo>>56), uint8(x2lo>>56)))<<56 | uint64(i8MinS(uint8(x1lo>>48), uint8(x2lo>>48)))<<48 + retHi = uint64(i8MinS(uint8(x1hi>>8), uint8(x2hi>>8)))<<8 | uint64(i8MinS(uint8(x1hi), uint8(x2hi))) | + uint64(i8MinS(uint8(x1hi>>24), uint8(x2hi>>24)))<<24 | uint64(i8MinS(uint8(x1hi>>16), uint8(x2hi>>16)))<<16 | + uint64(i8MinS(uint8(x1hi>>40), uint8(x2hi>>40)))<<40 | uint64(i8MinS(uint8(x1hi>>32), uint8(x2hi>>32)))<<32 | + uint64(i8MinS(uint8(x1hi>>56), uint8(x2hi>>56)))<<56 | uint64(i8MinS(uint8(x1hi>>48), uint8(x2hi>>48)))<<48 + } else { + retLo = uint64(i8MinU(uint8(x1lo>>8), 
uint8(x2lo>>8)))<<8 | uint64(i8MinU(uint8(x1lo), uint8(x2lo))) | + uint64(i8MinU(uint8(x1lo>>24), uint8(x2lo>>24)))<<24 | uint64(i8MinU(uint8(x1lo>>16), uint8(x2lo>>16)))<<16 | + uint64(i8MinU(uint8(x1lo>>40), uint8(x2lo>>40)))<<40 | uint64(i8MinU(uint8(x1lo>>32), uint8(x2lo>>32)))<<32 | + uint64(i8MinU(uint8(x1lo>>56), uint8(x2lo>>56)))<<56 | uint64(i8MinU(uint8(x1lo>>48), uint8(x2lo>>48)))<<48 + retHi = uint64(i8MinU(uint8(x1hi>>8), uint8(x2hi>>8)))<<8 | uint64(i8MinU(uint8(x1hi), uint8(x2hi))) | + uint64(i8MinU(uint8(x1hi>>24), uint8(x2hi>>24)))<<24 | uint64(i8MinU(uint8(x1hi>>16), uint8(x2hi>>16)))<<16 | + uint64(i8MinU(uint8(x1hi>>40), uint8(x2hi>>40)))<<40 | uint64(i8MinU(uint8(x1hi>>32), uint8(x2hi>>32)))<<32 | + uint64(i8MinU(uint8(x1hi>>56), uint8(x2hi>>56)))<<56 | uint64(i8MinU(uint8(x1hi>>48), uint8(x2hi>>48)))<<48 + } + case wazeroir.ShapeI16x8: + if op.b3 { // signed + retLo = uint64(i16MinS(uint16(x1lo), uint16(x2lo))) | + uint64(i16MinS(uint16(x1lo>>16), uint16(x2lo>>16)))<<16 | + uint64(i16MinS(uint16(x1lo>>32), uint16(x2lo>>32)))<<32 | + uint64(i16MinS(uint16(x1lo>>48), uint16(x2lo>>48)))<<48 + retHi = uint64(i16MinS(uint16(x1hi), uint16(x2hi))) | + uint64(i16MinS(uint16(x1hi>>16), uint16(x2hi>>16)))<<16 | + uint64(i16MinS(uint16(x1hi>>32), uint16(x2hi>>32)))<<32 | + uint64(i16MinS(uint16(x1hi>>48), uint16(x2hi>>48)))<<48 + } else { + retLo = uint64(i16MinU(uint16(x1lo), uint16(x2lo))) | + uint64(i16MinU(uint16(x1lo>>16), uint16(x2lo>>16)))<<16 | + uint64(i16MinU(uint16(x1lo>>32), uint16(x2lo>>32)))<<32 | + uint64(i16MinU(uint16(x1lo>>48), uint16(x2lo>>48)))<<48 + retHi = uint64(i16MinU(uint16(x1hi), uint16(x2hi))) | + uint64(i16MinU(uint16(x1hi>>16), uint16(x2hi>>16)))<<16 | + uint64(i16MinU(uint16(x1hi>>32), uint16(x2hi>>32)))<<32 | + uint64(i16MinU(uint16(x1hi>>48), uint16(x2hi>>48)))<<48 + } + case wazeroir.ShapeI32x4: + if op.b3 { // signed + retLo = uint64(i32MinS(uint32(x1lo), uint32(x2lo))) | + uint64(i32MinS(uint32(x1lo>>32), uint32(x2lo>>32)))<<32 + retHi = uint64(i32MinS(uint32(x1hi), uint32(x2hi))) | + uint64(i32MinS(uint32(x1hi>>32), uint32(x2hi>>32)))<<32 + } else { + retLo = uint64(i32MinU(uint32(x1lo), uint32(x2lo))) | + uint64(i32MinU(uint32(x1lo>>32), uint32(x2lo>>32)))<<32 + retHi = uint64(i32MinU(uint32(x1hi), uint32(x2hi))) | + uint64(i32MinU(uint32(x1hi>>32), uint32(x2hi>>32)))<<32 + } + case wazeroir.ShapeF32x4: + retHi = uint64(math.Float32bits(float32(moremath.WasmCompatMin( + float64(math.Float32frombits(uint32(x1hi))), + float64(math.Float32frombits(uint32(x2hi))), + )))) | uint64(math.Float32bits(float32(moremath.WasmCompatMin( + float64(math.Float32frombits(uint32(x1hi>>32))), + float64(math.Float32frombits(uint32(x2hi>>32))), + ))))<<32 + retLo = uint64(math.Float32bits(float32(moremath.WasmCompatMin( + float64(math.Float32frombits(uint32(x1lo))), + float64(math.Float32frombits(uint32(x2lo))), + )))) | uint64(math.Float32bits(float32(moremath.WasmCompatMin( + float64(math.Float32frombits(uint32(x1lo>>32))), + float64(math.Float32frombits(uint32(x2lo>>32))), + ))))<<32 + case wazeroir.ShapeF64x2: + retHi = math.Float64bits(moremath.WasmCompatMin( + math.Float64frombits(x1hi), + math.Float64frombits(x2hi), + )) + retLo = math.Float64bits(moremath.WasmCompatMin( + math.Float64frombits(x1lo), + math.Float64frombits(x2lo), + )) + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case wazeroir.OperationKindV128Max: + x2hi, x2lo := ce.popValue(), ce.popValue() + x1hi, x1lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + switch 
op.b1 { + case wazeroir.ShapeI8x16: + if op.b3 { // signed + retLo = uint64(i8MaxS(uint8(x1lo>>8), uint8(x2lo>>8)))<<8 | uint64(i8MaxS(uint8(x1lo), uint8(x2lo))) | + uint64(i8MaxS(uint8(x1lo>>24), uint8(x2lo>>24)))<<24 | uint64(i8MaxS(uint8(x1lo>>16), uint8(x2lo>>16)))<<16 | + uint64(i8MaxS(uint8(x1lo>>40), uint8(x2lo>>40)))<<40 | uint64(i8MaxS(uint8(x1lo>>32), uint8(x2lo>>32)))<<32 | + uint64(i8MaxS(uint8(x1lo>>56), uint8(x2lo>>56)))<<56 | uint64(i8MaxS(uint8(x1lo>>48), uint8(x2lo>>48)))<<48 + retHi = uint64(i8MaxS(uint8(x1hi>>8), uint8(x2hi>>8)))<<8 | uint64(i8MaxS(uint8(x1hi), uint8(x2hi))) | + uint64(i8MaxS(uint8(x1hi>>24), uint8(x2hi>>24)))<<24 | uint64(i8MaxS(uint8(x1hi>>16), uint8(x2hi>>16)))<<16 | + uint64(i8MaxS(uint8(x1hi>>40), uint8(x2hi>>40)))<<40 | uint64(i8MaxS(uint8(x1hi>>32), uint8(x2hi>>32)))<<32 | + uint64(i8MaxS(uint8(x1hi>>56), uint8(x2hi>>56)))<<56 | uint64(i8MaxS(uint8(x1hi>>48), uint8(x2hi>>48)))<<48 + } else { + retLo = uint64(i8MaxU(uint8(x1lo>>8), uint8(x2lo>>8)))<<8 | uint64(i8MaxU(uint8(x1lo), uint8(x2lo))) | + uint64(i8MaxU(uint8(x1lo>>24), uint8(x2lo>>24)))<<24 | uint64(i8MaxU(uint8(x1lo>>16), uint8(x2lo>>16)))<<16 | + uint64(i8MaxU(uint8(x1lo>>40), uint8(x2lo>>40)))<<40 | uint64(i8MaxU(uint8(x1lo>>32), uint8(x2lo>>32)))<<32 | + uint64(i8MaxU(uint8(x1lo>>56), uint8(x2lo>>56)))<<56 | uint64(i8MaxU(uint8(x1lo>>48), uint8(x2lo>>48)))<<48 + retHi = uint64(i8MaxU(uint8(x1hi>>8), uint8(x2hi>>8)))<<8 | uint64(i8MaxU(uint8(x1hi), uint8(x2hi))) | + uint64(i8MaxU(uint8(x1hi>>24), uint8(x2hi>>24)))<<24 | uint64(i8MaxU(uint8(x1hi>>16), uint8(x2hi>>16)))<<16 | + uint64(i8MaxU(uint8(x1hi>>40), uint8(x2hi>>40)))<<40 | uint64(i8MaxU(uint8(x1hi>>32), uint8(x2hi>>32)))<<32 | + uint64(i8MaxU(uint8(x1hi>>56), uint8(x2hi>>56)))<<56 | uint64(i8MaxU(uint8(x1hi>>48), uint8(x2hi>>48)))<<48 + } + case wazeroir.ShapeI16x8: + if op.b3 { // signed + retLo = uint64(i16MaxS(uint16(x1lo), uint16(x2lo))) | + uint64(i16MaxS(uint16(x1lo>>16), uint16(x2lo>>16)))<<16 | + uint64(i16MaxS(uint16(x1lo>>32), uint16(x2lo>>32)))<<32 | + uint64(i16MaxS(uint16(x1lo>>48), uint16(x2lo>>48)))<<48 + retHi = uint64(i16MaxS(uint16(x1hi), uint16(x2hi))) | + uint64(i16MaxS(uint16(x1hi>>16), uint16(x2hi>>16)))<<16 | + uint64(i16MaxS(uint16(x1hi>>32), uint16(x2hi>>32)))<<32 | + uint64(i16MaxS(uint16(x1hi>>48), uint16(x2hi>>48)))<<48 + } else { + retLo = uint64(i16MaxU(uint16(x1lo), uint16(x2lo))) | + uint64(i16MaxU(uint16(x1lo>>16), uint16(x2lo>>16)))<<16 | + uint64(i16MaxU(uint16(x1lo>>32), uint16(x2lo>>32)))<<32 | + uint64(i16MaxU(uint16(x1lo>>48), uint16(x2lo>>48)))<<48 + retHi = uint64(i16MaxU(uint16(x1hi), uint16(x2hi))) | + uint64(i16MaxU(uint16(x1hi>>16), uint16(x2hi>>16)))<<16 | + uint64(i16MaxU(uint16(x1hi>>32), uint16(x2hi>>32)))<<32 | + uint64(i16MaxU(uint16(x1hi>>48), uint16(x2hi>>48)))<<48 + } + case wazeroir.ShapeI32x4: + if op.b3 { // signed + retLo = uint64(i32MaxS(uint32(x1lo), uint32(x2lo))) | + uint64(i32MaxS(uint32(x1lo>>32), uint32(x2lo>>32)))<<32 + retHi = uint64(i32MaxS(uint32(x1hi), uint32(x2hi))) | + uint64(i32MaxS(uint32(x1hi>>32), uint32(x2hi>>32)))<<32 + } else { + retLo = uint64(i32MaxU(uint32(x1lo), uint32(x2lo))) | + uint64(i32MaxU(uint32(x1lo>>32), uint32(x2lo>>32)))<<32 + retHi = uint64(i32MaxU(uint32(x1hi), uint32(x2hi))) | + uint64(i32MaxU(uint32(x1hi>>32), uint32(x2hi>>32)))<<32 + } + case wazeroir.ShapeF32x4: + retHi = uint64(math.Float32bits(float32(moremath.WasmCompatMax( + float64(math.Float32frombits(uint32(x1hi))), + float64(math.Float32frombits(uint32(x2hi))), + )))) | 
uint64(math.Float32bits(float32(moremath.WasmCompatMax( + float64(math.Float32frombits(uint32(x1hi>>32))), + float64(math.Float32frombits(uint32(x2hi>>32))), + ))))<<32 + retLo = uint64(math.Float32bits(float32(moremath.WasmCompatMax( + float64(math.Float32frombits(uint32(x1lo))), + float64(math.Float32frombits(uint32(x2lo))), + )))) | uint64(math.Float32bits(float32(moremath.WasmCompatMax( + float64(math.Float32frombits(uint32(x1lo>>32))), + float64(math.Float32frombits(uint32(x2lo>>32))), + ))))<<32 + case wazeroir.ShapeF64x2: + retHi = math.Float64bits(moremath.WasmCompatMax( + math.Float64frombits(x1hi), + math.Float64frombits(x2hi), + )) + retLo = math.Float64bits(moremath.WasmCompatMax( + math.Float64frombits(x1lo), + math.Float64frombits(x2lo), + )) + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case wazeroir.OperationKindV128AvgrU: + x2hi, x2lo := ce.popValue(), ce.popValue() + x1hi, x1lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + switch op.b1 { + case wazeroir.ShapeI8x16: + retLo = uint64(i8RoundingAverage(uint8(x1lo>>8), uint8(x2lo>>8)))<<8 | uint64(i8RoundingAverage(uint8(x1lo), uint8(x2lo))) | + uint64(i8RoundingAverage(uint8(x1lo>>24), uint8(x2lo>>24)))<<24 | uint64(i8RoundingAverage(uint8(x1lo>>16), uint8(x2lo>>16)))<<16 | + uint64(i8RoundingAverage(uint8(x1lo>>40), uint8(x2lo>>40)))<<40 | uint64(i8RoundingAverage(uint8(x1lo>>32), uint8(x2lo>>32)))<<32 | + uint64(i8RoundingAverage(uint8(x1lo>>56), uint8(x2lo>>56)))<<56 | uint64(i8RoundingAverage(uint8(x1lo>>48), uint8(x2lo>>48)))<<48 + retHi = uint64(i8RoundingAverage(uint8(x1hi>>8), uint8(x2hi>>8)))<<8 | uint64(i8RoundingAverage(uint8(x1hi), uint8(x2hi))) | + uint64(i8RoundingAverage(uint8(x1hi>>24), uint8(x2hi>>24)))<<24 | uint64(i8RoundingAverage(uint8(x1hi>>16), uint8(x2hi>>16)))<<16 | + uint64(i8RoundingAverage(uint8(x1hi>>40), uint8(x2hi>>40)))<<40 | uint64(i8RoundingAverage(uint8(x1hi>>32), uint8(x2hi>>32)))<<32 | + uint64(i8RoundingAverage(uint8(x1hi>>56), uint8(x2hi>>56)))<<56 | uint64(i8RoundingAverage(uint8(x1hi>>48), uint8(x2hi>>48)))<<48 + case wazeroir.ShapeI16x8: + retLo = uint64(i16RoundingAverage(uint16(x1lo), uint16(x2lo))) | + uint64(i16RoundingAverage(uint16(x1lo>>16), uint16(x2lo>>16)))<<16 | + uint64(i16RoundingAverage(uint16(x1lo>>32), uint16(x2lo>>32)))<<32 | + uint64(i16RoundingAverage(uint16(x1lo>>48), uint16(x2lo>>48)))<<48 + retHi = uint64(i16RoundingAverage(uint16(x1hi), uint16(x2hi))) | + uint64(i16RoundingAverage(uint16(x1hi>>16), uint16(x2hi>>16)))<<16 | + uint64(i16RoundingAverage(uint16(x1hi>>32), uint16(x2hi>>32)))<<32 | + uint64(i16RoundingAverage(uint16(x1hi>>48), uint16(x2hi>>48)))<<48 + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case wazeroir.OperationKindV128Pmin: + x2hi, x2lo := ce.popValue(), ce.popValue() + x1hi, x1lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + if op.b1 == wazeroir.ShapeF32x4 { + if flt32(math.Float32frombits(uint32(x2lo)), math.Float32frombits(uint32(x1lo))) { + retLo = x2lo & 0x00000000_ffffffff + } else { + retLo = x1lo & 0x00000000_ffffffff + } + if flt32(math.Float32frombits(uint32(x2lo>>32)), math.Float32frombits(uint32(x1lo>>32))) { + retLo |= x2lo & 0xffffffff_00000000 + } else { + retLo |= x1lo & 0xffffffff_00000000 + } + if flt32(math.Float32frombits(uint32(x2hi)), math.Float32frombits(uint32(x1hi))) { + retHi = x2hi & 0x00000000_ffffffff + } else { + retHi = x1hi & 0x00000000_ffffffff + } + if flt32(math.Float32frombits(uint32(x2hi>>32)), math.Float32frombits(uint32(x1hi>>32))) { + retHi |= 
x2hi & 0xffffffff_00000000 + } else { + retHi |= x1hi & 0xffffffff_00000000 + } + } else { + if flt64(math.Float64frombits(x2lo), math.Float64frombits(x1lo)) { + retLo = x2lo + } else { + retLo = x1lo + } + if flt64(math.Float64frombits(x2hi), math.Float64frombits(x1hi)) { + retHi = x2hi + } else { + retHi = x1hi + } + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case wazeroir.OperationKindV128Pmax: + x2hi, x2lo := ce.popValue(), ce.popValue() + x1hi, x1lo := ce.popValue(), ce.popValue() + var retLo, retHi uint64 + if op.b1 == wazeroir.ShapeF32x4 { + if flt32(math.Float32frombits(uint32(x1lo)), math.Float32frombits(uint32(x2lo))) { + retLo = x2lo & 0x00000000_ffffffff + } else { + retLo = x1lo & 0x00000000_ffffffff + } + if flt32(math.Float32frombits(uint32(x1lo>>32)), math.Float32frombits(uint32(x2lo>>32))) { + retLo |= x2lo & 0xffffffff_00000000 + } else { + retLo |= x1lo & 0xffffffff_00000000 + } + if flt32(math.Float32frombits(uint32(x1hi)), math.Float32frombits(uint32(x2hi))) { + retHi = x2hi & 0x00000000_ffffffff + } else { + retHi = x1hi & 0x00000000_ffffffff + } + if flt32(math.Float32frombits(uint32(x1hi>>32)), math.Float32frombits(uint32(x2hi>>32))) { + retHi |= x2hi & 0xffffffff_00000000 + } else { + retHi |= x1hi & 0xffffffff_00000000 + } + } else { + if flt64(math.Float64frombits(x1lo), math.Float64frombits(x2lo)) { + retLo = x2lo + } else { + retLo = x1lo + } + if flt64(math.Float64frombits(x1hi), math.Float64frombits(x2hi)) { + retHi = x2hi + } else { + retHi = x1hi + } + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case wazeroir.OperationKindV128Ceil: + hi, lo := ce.popValue(), ce.popValue() + if op.b1 == wazeroir.ShapeF32x4 { + lo = uint64(math.Float32bits(float32(math.Ceil(float64(math.Float32frombits(uint32(lo))))))) | + (uint64(math.Float32bits(float32(math.Ceil(float64(math.Float32frombits(uint32(lo>>32))))))) << 32) + hi = uint64(math.Float32bits(float32(math.Ceil(float64(math.Float32frombits(uint32(hi))))))) | + (uint64(math.Float32bits(float32(math.Ceil(float64(math.Float32frombits(uint32(hi>>32))))))) << 32) + } else { + lo = math.Float64bits(math.Ceil(math.Float64frombits(lo))) + hi = math.Float64bits(math.Ceil(math.Float64frombits(hi))) + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case wazeroir.OperationKindV128Floor: + hi, lo := ce.popValue(), ce.popValue() + if op.b1 == wazeroir.ShapeF32x4 { + lo = uint64(math.Float32bits(float32(math.Floor(float64(math.Float32frombits(uint32(lo))))))) | + (uint64(math.Float32bits(float32(math.Floor(float64(math.Float32frombits(uint32(lo>>32))))))) << 32) + hi = uint64(math.Float32bits(float32(math.Floor(float64(math.Float32frombits(uint32(hi))))))) | + (uint64(math.Float32bits(float32(math.Floor(float64(math.Float32frombits(uint32(hi>>32))))))) << 32) + } else { + lo = math.Float64bits(math.Floor(math.Float64frombits(lo))) + hi = math.Float64bits(math.Floor(math.Float64frombits(hi))) + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case wazeroir.OperationKindV128Trunc: + hi, lo := ce.popValue(), ce.popValue() + if op.b1 == wazeroir.ShapeF32x4 { + lo = uint64(math.Float32bits(float32(math.Trunc(float64(math.Float32frombits(uint32(lo))))))) | + (uint64(math.Float32bits(float32(math.Trunc(float64(math.Float32frombits(uint32(lo>>32))))))) << 32) + hi = uint64(math.Float32bits(float32(math.Trunc(float64(math.Float32frombits(uint32(hi))))))) | + (uint64(math.Float32bits(float32(math.Trunc(float64(math.Float32frombits(uint32(hi>>32))))))) << 32) + } else { + lo = 
math.Float64bits(math.Trunc(math.Float64frombits(lo))) + hi = math.Float64bits(math.Trunc(math.Float64frombits(hi))) + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case wazeroir.OperationKindV128Nearest: + hi, lo := ce.popValue(), ce.popValue() + if op.b1 == wazeroir.ShapeF32x4 { + lo = uint64(math.Float32bits(moremath.WasmCompatNearestF32(math.Float32frombits(uint32(lo))))) | + (uint64(math.Float32bits(moremath.WasmCompatNearestF32(math.Float32frombits(uint32(lo>>32))))) << 32) + hi = uint64(math.Float32bits(moremath.WasmCompatNearestF32(math.Float32frombits(uint32(hi))))) | + (uint64(math.Float32bits(moremath.WasmCompatNearestF32(math.Float32frombits(uint32(hi>>32))))) << 32) + } else { + lo = math.Float64bits(moremath.WasmCompatNearestF64(math.Float64frombits(lo))) + hi = math.Float64bits(moremath.WasmCompatNearestF64(math.Float64frombits(hi))) + } + ce.pushValue(lo) + ce.pushValue(hi) + frame.pc++ + case wazeroir.OperationKindV128Extend: + hi, lo := ce.popValue(), ce.popValue() + var origin uint64 + if op.b3 { // use lower 64 bits + origin = lo + } else { + origin = hi + } + + signed := op.b2 == 1 + + var retHi, retLo uint64 + switch op.b1 { + case wazeroir.ShapeI8x16: + for i := 0; i < 8; i++ { + v8 := byte(origin >> (i * 8)) + + var v16 uint16 + if signed { + v16 = uint16(int8(v8)) + } else { + v16 = uint16(v8) + } + + if i < 4 { + retLo |= uint64(v16) << (i * 16) + } else { + retHi |= uint64(v16) << ((i - 4) * 16) + } + } + case wazeroir.ShapeI16x8: + for i := 0; i < 4; i++ { + v16 := uint16(origin >> (i * 16)) + + var v32 uint32 + if signed { + v32 = uint32(int16(v16)) + } else { + v32 = uint32(v16) + } + + if i < 2 { + retLo |= uint64(v32) << (i * 32) + } else { + retHi |= uint64(v32) << ((i - 2) * 32) + } + } + case wazeroir.ShapeI32x4: + v32Lo := uint32(origin) + v32Hi := uint32(origin >> 32) + if signed { + retLo = uint64(int32(v32Lo)) + retHi = uint64(int32(v32Hi)) + } else { + retLo = uint64(v32Lo) + retHi = uint64(v32Hi) + } + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case wazeroir.OperationKindV128ExtMul: + x2Hi, x2Lo := ce.popValue(), ce.popValue() + x1Hi, x1Lo := ce.popValue(), ce.popValue() + var x1, x2 uint64 + if op.b3 { // use lower 64 bits + x1, x2 = x1Lo, x2Lo + } else { + x1, x2 = x1Hi, x2Hi + } + + signed := op.b2 == 1 + + var retLo, retHi uint64 + switch op.b1 { + case wazeroir.ShapeI8x16: + for i := 0; i < 8; i++ { + v1, v2 := byte(x1>>(i*8)), byte(x2>>(i*8)) + + var v16 uint16 + if signed { + v16 = uint16(int16(int8(v1)) * int16(int8(v2))) + } else { + v16 = uint16(v1) * uint16(v2) + } + + if i < 4 { + retLo |= uint64(v16) << (i * 16) + } else { + retHi |= uint64(v16) << ((i - 4) * 16) + } + } + case wazeroir.ShapeI16x8: + for i := 0; i < 4; i++ { + v1, v2 := uint16(x1>>(i*16)), uint16(x2>>(i*16)) + + var v32 uint32 + if signed { + v32 = uint32(int32(int16(v1)) * int32(int16(v2))) + } else { + v32 = uint32(v1) * uint32(v2) + } + + if i < 2 { + retLo |= uint64(v32) << (i * 32) + } else { + retHi |= uint64(v32) << ((i - 2) * 32) + } + } + case wazeroir.ShapeI32x4: + v1Lo, v2Lo := uint32(x1), uint32(x2) + v1Hi, v2Hi := uint32(x1>>32), uint32(x2>>32) + if signed { + retLo = uint64(int64(int32(v1Lo)) * int64(int32(v2Lo))) + retHi = uint64(int64(int32(v1Hi)) * int64(int32(v2Hi))) + } else { + retLo = uint64(v1Lo) * uint64(v2Lo) + retHi = uint64(v1Hi) * uint64(v2Hi) + } + } + + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case wazeroir.OperationKindV128Q15mulrSatS: + x2hi, x2Lo := ce.popValue(), ce.popValue() + x1hi, x1Lo := 
ce.popValue(), ce.popValue() + var retLo, retHi uint64 + for i := 0; i < 8; i++ { + var v, w int16 + if i < 4 { + v, w = int16(uint16(x1Lo>>(i*16))), int16(uint16(x2Lo>>(i*16))) + } else { + v, w = int16(uint16(x1hi>>((i-4)*16))), int16(uint16(x2hi>>((i-4)*16))) + } + + var uv uint64 + // https://github.com/WebAssembly/spec/blob/main/proposals/simd/SIMD.md#saturating-integer-q-format-rounding-multiplication + if calc := ((int32(v) * int32(w)) + 0x4000) >> 15; calc < math.MinInt16 { + uv = uint64(uint16(0x8000)) + } else if calc > math.MaxInt16 { + uv = uint64(uint16(0x7fff)) + } else { + uv = uint64(uint16(int16(calc))) + } + + if i < 4 { + retLo |= uv << (i * 16) + } else { + retHi |= uv << ((i - 4) * 16) + } + } + + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case wazeroir.OperationKindV128ExtAddPairwise: + hi, lo := ce.popValue(), ce.popValue() + + signed := op.b3 + + var retLo, retHi uint64 + switch op.b1 { + case wazeroir.ShapeI8x16: + for i := 0; i < 8; i++ { + var v1, v2 byte + if i < 4 { + v1, v2 = byte(lo>>((i*2)*8)), byte(lo>>((i*2+1)*8)) + } else { + v1, v2 = byte(hi>>(((i-4)*2)*8)), byte(hi>>(((i-4)*2+1)*8)) + } + + var v16 uint16 + if signed { + v16 = uint16(int16(int8(v1)) + int16(int8(v2))) + } else { + v16 = uint16(v1) + uint16(v2) + } + + if i < 4 { + retLo |= uint64(v16) << (i * 16) + } else { + retHi |= uint64(v16) << ((i - 4) * 16) + } + } + case wazeroir.ShapeI16x8: + for i := 0; i < 4; i++ { + var v1, v2 uint16 + if i < 2 { + v1, v2 = uint16(lo>>((i*2)*16)), uint16(lo>>((i*2+1)*16)) + } else { + v1, v2 = uint16(hi>>(((i-2)*2)*16)), uint16(hi>>(((i-2)*2+1)*16)) + } + + var v32 uint32 + if signed { + v32 = uint32(int32(int16(v1)) + int32(int16(v2))) + } else { + v32 = uint32(v1) + uint32(v2) + } + + if i < 2 { + retLo |= uint64(v32) << (i * 32) + } else { + retHi |= uint64(v32) << ((i - 2) * 32) + } + } + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case wazeroir.OperationKindV128FloatPromote: + hi, lo := ce.popValue(), ce.popValue() + ce.pushValue(math.Float64bits(float64(math.Float32frombits(uint32(lo))))) + ce.pushValue(math.Float64bits(float64(math.Float32frombits(uint32(hi))))) + frame.pc++ + case wazeroir.OperationKindV128FloatDemote: + hi, lo := ce.popValue(), ce.popValue() + ce.pushValue( + uint64(math.Float32bits(float32(math.Float64frombits(lo)))) | + (uint64(math.Float32bits(float32(math.Float64frombits(hi)))) << 32), + ) + ce.pushValue(0) + frame.pc++ + case wazeroir.OperationKindV128FConvertFromI: + hi, lo := ce.popValue(), ce.popValue() + v1, v2, v3, v4 := uint32(lo), uint32(lo>>32), uint32(hi), uint32(hi>>32) + signed := op.b3 + + var retLo, retHi uint64 + switch op.b1 { // Destination shape. 
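+ // v1..v4 hold the four 32-bit source lanes in ascending lane order; op.b3 selects the signed interpretation of those lanes.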
+ case wazeroir.ShapeF32x4: // f32x4 from signed/unsigned i32x4 + if signed { + retLo = uint64(math.Float32bits(float32(int32(v1)))) | + (uint64(math.Float32bits(float32(int32(v2)))) << 32) + retHi = uint64(math.Float32bits(float32(int32(v3)))) | + (uint64(math.Float32bits(float32(int32(v4)))) << 32) + } else { + retLo = uint64(math.Float32bits(float32(v1))) | + (uint64(math.Float32bits(float32(v2))) << 32) + retHi = uint64(math.Float32bits(float32(v3))) | + (uint64(math.Float32bits(float32(v4))) << 32) + } + case wazeroir.ShapeF64x2: // f64x2 from signed/unsigned i32x4 + if signed { + retLo, retHi = math.Float64bits(float64(int32(v1))), math.Float64bits(float64(int32(v2))) + } else { + retLo, retHi = math.Float64bits(float64(v1)), math.Float64bits(float64(v2)) + } + } + + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case wazeroir.OperationKindV128Narrow: + x2Hi, x2Lo := ce.popValue(), ce.popValue() + x1Hi, x1Lo := ce.popValue(), ce.popValue() + signed := op.b3 + + var retLo, retHi uint64 + switch op.b1 { + case wazeroir.ShapeI16x8: // signed/unsigned i16x8 to i8x16 + for i := 0; i < 8; i++ { + var v16 uint16 + if i < 4 { + v16 = uint16(x1Lo >> (i * 16)) + } else { + v16 = uint16(x1Hi >> ((i - 4) * 16)) + } + + var v byte + if signed { + if s := int16(v16); s > math.MaxInt8 { + v = math.MaxInt8 + } else if s < math.MinInt8 { + s = math.MinInt8 + v = byte(s) + } else { + v = byte(v16) + } + } else { + if s := int16(v16); s > math.MaxUint8 { + v = math.MaxUint8 + } else if s < 0 { + v = 0 + } else { + v = byte(v16) + } + } + retLo |= uint64(v) << (i * 8) + } + for i := 0; i < 8; i++ { + var v16 uint16 + if i < 4 { + v16 = uint16(x2Lo >> (i * 16)) + } else { + v16 = uint16(x2Hi >> ((i - 4) * 16)) + } + + var v byte + if signed { + if s := int16(v16); s > math.MaxInt8 { + v = math.MaxInt8 + } else if s < math.MinInt8 { + s = math.MinInt8 + v = byte(s) + } else { + v = byte(v16) + } + } else { + if s := int16(v16); s > math.MaxUint8 { + v = math.MaxUint8 + } else if s < 0 { + v = 0 + } else { + v = byte(v16) + } + } + retHi |= uint64(v) << (i * 8) + } + case wazeroir.ShapeI32x4: // signed/unsigned i32x4 to i16x8 + for i := 0; i < 4; i++ { + var v32 uint32 + if i < 2 { + v32 = uint32(x1Lo >> (i * 32)) + } else { + v32 = uint32(x1Hi >> ((i - 2) * 32)) + } + + var v uint16 + if signed { + if s := int32(v32); s > math.MaxInt16 { + v = math.MaxInt16 + } else if s < math.MinInt16 { + s = math.MinInt16 + v = uint16(s) + } else { + v = uint16(v32) + } + } else { + if s := int32(v32); s > math.MaxUint16 { + v = math.MaxUint16 + } else if s < 0 { + v = 0 + } else { + v = uint16(v32) + } + } + retLo |= uint64(v) << (i * 16) + } + + for i := 0; i < 4; i++ { + var v32 uint32 + if i < 2 { + v32 = uint32(x2Lo >> (i * 32)) + } else { + v32 = uint32(x2Hi >> ((i - 2) * 32)) + } + + var v uint16 + if signed { + if s := int32(v32); s > math.MaxInt16 { + v = math.MaxInt16 + } else if s < math.MinInt16 { + s = math.MinInt16 + v = uint16(s) + } else { + v = uint16(v32) + } + } else { + if s := int32(v32); s > math.MaxUint16 { + v = math.MaxUint16 + } else if s < 0 { + v = 0 + } else { + v = uint16(v32) + } + } + retHi |= uint64(v) << (i * 16) + } + } + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + case wazeroir.OperationKindV128Dot: + x2Hi, x2Lo := ce.popValue(), ce.popValue() + x1Hi, x1Lo := ce.popValue(), ce.popValue() + ce.pushValue( + uint64(uint32(int32(int16(x1Lo>>0))*int32(int16(x2Lo>>0))+int32(int16(x1Lo>>16))*int32(int16(x2Lo>>16)))) | + 
(uint64(uint32(int32(int16(x1Lo>>32))*int32(int16(x2Lo>>32))+int32(int16(x1Lo>>48))*int32(int16(x2Lo>>48)))) << 32), + ) + ce.pushValue( + uint64(uint32(int32(int16(x1Hi>>0))*int32(int16(x2Hi>>0))+int32(int16(x1Hi>>16))*int32(int16(x2Hi>>16)))) | + (uint64(uint32(int32(int16(x1Hi>>32))*int32(int16(x2Hi>>32))+int32(int16(x1Hi>>48))*int32(int16(x2Hi>>48)))) << 32), + ) + frame.pc++ + case wazeroir.OperationKindV128ITruncSatFromF: + hi, lo := ce.popValue(), ce.popValue() + signed := op.b3 + var retLo, retHi uint64 + + switch op.b1 { + case wazeroir.ShapeF32x4: // f32x4 to i32x4 + for i, f64 := range [4]float64{ + math.Trunc(float64(math.Float32frombits(uint32(lo)))), + math.Trunc(float64(math.Float32frombits(uint32(lo >> 32)))), + math.Trunc(float64(math.Float32frombits(uint32(hi)))), + math.Trunc(float64(math.Float32frombits(uint32(hi >> 32))))} { + + var v uint32 + if math.IsNaN(f64) { + v = 0 + } else if signed { + if f64 < math.MinInt32 { + f64 = math.MinInt32 + } else if f64 > math.MaxInt32 { + f64 = math.MaxInt32 + } + v = uint32(int32(f64)) + } else { + if f64 < 0 { + f64 = 0 + } else if f64 > math.MaxUint32 { + f64 = math.MaxUint32 + } + v = uint32(f64) + } + + if i < 2 { + retLo |= uint64(v) << (i * 32) + } else { + retHi |= uint64(v) << ((i - 2) * 32) + } + } + + case wazeroir.ShapeF64x2: // f64x2 to i32x4 + for i, f := range [2]float64{ + math.Trunc(math.Float64frombits(lo)), + math.Trunc(math.Float64frombits(hi)), + } { + var v uint32 + if math.IsNaN(f) { + v = 0 + } else if signed { + if f < math.MinInt32 { + f = math.MinInt32 + } else if f > math.MaxInt32 { + f = math.MaxInt32 + } + v = uint32(int32(f)) + } else { + if f < 0 { + f = 0 + } else if f > math.MaxUint32 { + f = math.MaxUint32 + } + v = uint32(f) + } + + retLo |= uint64(v) << (i * 32) + } + } + + ce.pushValue(retLo) + ce.pushValue(retHi) + frame.pc++ + } + } + ce.popFrame() +} + +// https://www.w3.org/TR/2022/WD-wasm-core-2-20220419/exec/numerics.html#xref-exec-numerics-op-flt-mathrm-flt-n-z-1-z-2 +func flt32(z1, z2 float32) bool { + if z1 != z1 || z2 != z2 { + return false + } else if z1 == z2 { + return false + } else if math.IsInf(float64(z1), 1) { + return false + } else if math.IsInf(float64(z1), -1) { + return true + } else if math.IsInf(float64(z2), 1) { + return true + } else if math.IsInf(float64(z2), -1) { + return false + } + return z1 < z2 +} + +// https://www.w3.org/TR/2022/WD-wasm-core-2-20220419/exec/numerics.html#xref-exec-numerics-op-flt-mathrm-flt-n-z-1-z-2 +func flt64(z1, z2 float64) bool { + if z1 != z1 || z2 != z2 { + return false + } else if z1 == z2 { + return false + } else if math.IsInf(z1, 1) { + return false + } else if math.IsInf(z1, -1) { + return true + } else if math.IsInf(z2, 1) { + return true + } else if math.IsInf(z2, -1) { + return false + } + return z1 < z2 +} + +func i8RoundingAverage(v1, v2 byte) byte { + // https://github.com/WebAssembly/spec/blob/main/proposals/simd/SIMD.md#lane-wise-integer-rounding-average + return byte((uint16(v1) + uint16(v2) + uint16(1)) / 2) +} + +func i16RoundingAverage(v1, v2 uint16) uint16 { + // https://github.com/WebAssembly/spec/blob/main/proposals/simd/SIMD.md#lane-wise-integer-rounding-average + return uint16((uint32(v1) + uint32(v2) + 1) / 2) +} + +func i8Abs(v byte) byte { + if i := int8(v); i < 0 { + return byte(-i) + } else { + return byte(i) + } +} + +func i8MaxU(v1, v2 byte) byte { + if v1 < v2 { + return v2 + } else { + return v1 + } +} + +func i8MinU(v1, v2 byte) byte { + if v1 > v2 { + return v2 + } else { + 
return v1 + } +} + +func i8MaxS(v1, v2 byte) byte { + if int8(v1) < int8(v2) { + return v2 + } else { + return v1 + } +} + +func i8MinS(v1, v2 byte) byte { + if int8(v1) > int8(v2) { + return v2 + } else { + return v1 + } +} + +func i16MaxU(v1, v2 uint16) uint16 { + if v1 < v2 { + return v2 + } else { + return v1 + } +} + +func i16MinU(v1, v2 uint16) uint16 { + if v1 > v2 { + return v2 + } else { + return v1 + } +} + +func i16MaxS(v1, v2 uint16) uint16 { + if int16(v1) < int16(v2) { + return v2 + } else { + return v1 + } +} + +func i16MinS(v1, v2 uint16) uint16 { + if int16(v1) > int16(v2) { + return v2 + } else { + return v1 + } +} + +func i32MaxU(v1, v2 uint32) uint32 { + if v1 < v2 { + return v2 + } else { + return v1 + } +} + +func i32MinU(v1, v2 uint32) uint32 { + if v1 > v2 { + return v2 + } else { + return v1 + } +} + +func i32MaxS(v1, v2 uint32) uint32 { + if int32(v1) < int32(v2) { + return v2 + } else { + return v1 + } +} + +func i32MinS(v1, v2 uint32) uint32 { + if int32(v1) > int32(v2) { + return v2 + } else { + return v1 + } +} + +func i16Abs(v uint16) uint16 { + if i := int16(v); i < 0 { + return uint16(-i) + } else { + return uint16(i) + } +} + +func i32Abs(v uint32) uint32 { + if i := int32(v); i < 0 { + return uint32(-i) + } else { + return uint32(i) + } } func (ce *callEngine) callNativeFuncWithListener(ctx context.Context, callCtx *wasm.CallContext, f *function, fnl experimental.FunctionListener) context.Context { diff --git a/internal/integration_test/asm/amd64_debug/golang_asm.go b/internal/integration_test/asm/amd64_debug/golang_asm.go index f4bf0b13429..cfbbe0a8faf 100644 --- a/internal/integration_test/asm/amd64_debug/golang_asm.go +++ b/internal/integration_test/asm/amd64_debug/golang_asm.go @@ -560,13 +560,13 @@ var castAsGolangAsmInstruction = [...]obj.As{ amd64.PINSRQ: x86.APINSRQ, amd64.PADDB: x86.APADDB, amd64.PADDW: x86.APADDW, - amd64.PADDL: x86.APADDL, + amd64.PADDD: x86.APADDL, amd64.PADDQ: x86.APADDQ, amd64.ADDPS: x86.AADDPS, amd64.ADDPD: x86.AADDPD, amd64.PSUBB: x86.APSUBB, amd64.PSUBW: x86.APSUBW, - amd64.PSUBL: x86.APSUBL, + amd64.PSUBD: x86.APSUBL, amd64.PSUBQ: x86.APSUBQ, amd64.SUBPS: x86.ASUBPS, amd64.SUBPD: x86.ASUBPD, diff --git a/internal/integration_test/asm/amd64_debug/impl_test.go b/internal/integration_test/asm/amd64_debug/impl_test.go index c6695b1e09e..e932375f256 100644 --- a/internal/integration_test/asm/amd64_debug/impl_test.go +++ b/internal/integration_test/asm/amd64_debug/impl_test.go @@ -812,13 +812,13 @@ func TestAssemblerImpl_EncodeRegisterToRegister(t *testing.T) { }{ {instruction: amd64.PADDB, srcRegs: floatRegisters, DstRegs: floatRegisters}, {instruction: amd64.PADDW, srcRegs: floatRegisters, DstRegs: floatRegisters}, - {instruction: amd64.PADDL, srcRegs: floatRegisters, DstRegs: floatRegisters}, + {instruction: amd64.PADDD, srcRegs: floatRegisters, DstRegs: floatRegisters}, {instruction: amd64.PADDQ, srcRegs: floatRegisters, DstRegs: floatRegisters}, {instruction: amd64.ADDPS, srcRegs: floatRegisters, DstRegs: floatRegisters}, {instruction: amd64.ADDPD, srcRegs: floatRegisters, DstRegs: floatRegisters}, {instruction: amd64.PSUBB, srcRegs: floatRegisters, DstRegs: floatRegisters}, {instruction: amd64.PSUBW, srcRegs: floatRegisters, DstRegs: floatRegisters}, - {instruction: amd64.PSUBL, srcRegs: floatRegisters, DstRegs: floatRegisters}, + {instruction: amd64.PSUBD, srcRegs: floatRegisters, DstRegs: floatRegisters}, {instruction: amd64.PSUBQ, srcRegs: floatRegisters, DstRegs: floatRegisters}, {instruction: amd64.SUBPS, srcRegs: 
floatRegisters, DstRegs: floatRegisters}, {instruction: amd64.SUBPD, srcRegs: floatRegisters, DstRegs: floatRegisters}, diff --git a/internal/integration_test/spectest/spectest.go b/internal/integration_test/spectest/spectest.go index 1281bca2a5e..4293c9c8315 100644 --- a/internal/integration_test/spectest/spectest.go +++ b/internal/integration_test/spectest/spectest.go @@ -66,25 +66,51 @@ type ( } commandActionVal struct { - ValType string `json:"type"` - LaneType string `json:"lane_type"` + ValType string `json:"type"` + // LaneType is not empty if ValueType == "v128" + LaneType laneType `json:"lane_type"` Value interface{} `json:"value"` } ) +// laneType is a type of each lane of vector value. +// +// See https://github.com/WebAssembly/wabt/blob/main/docs/wast2json.md#const +type laneType = string + +const ( + laneTypeI8 laneType = "i8" + laneTypeI16 laneType = "i16" + laneTypeI32 laneType = "i32" + laneTypeI64 laneType = "i64" + laneTypeF32 laneType = "f32" + laneTypeF64 laneType = "f64" +) + func (c commandActionVal) String() string { var v string + valTypeStr := c.ValType switch c.ValType { case "i32": v = c.Value.(string) case "f32": - ret, _ := strconv.ParseUint(c.Value.(string), 10, 32) - v = fmt.Sprintf("%f", math.Float32frombits(uint32(ret))) + str := c.Value.(string) + if strings.Contains(str, "nan") { + v = "nan" + } else { + ret, _ := strconv.ParseUint(str, 10, 32) + v = fmt.Sprintf("%f", math.Float32frombits(uint32(ret))) + } case "i64": v = c.Value.(string) case "f64": - ret, _ := strconv.ParseUint(c.Value.(string), 10, 64) - v = fmt.Sprintf("%f", math.Float64frombits(ret)) + str := c.Value.(string) + if strings.Contains(str, "nan") { + v = "nan" + } else { + ret, _ := strconv.ParseUint(str, 10, 64) + v = fmt.Sprintf("%f", math.Float64frombits(ret)) + } case "externref": if c.Value == "null" { v = "null" @@ -107,8 +133,9 @@ func (c commandActionVal) String() string { strs = append(strs, v.(string)) } v = strings.Join(strs, ",") + valTypeStr = fmt.Sprintf("v128[lane=%s]", c.LaneType) } - return fmt.Sprintf("{type: %s, value: %v}", c.ValType, v) + return fmt.Sprintf("{type: %s, value: %v}", valTypeStr, v) } func (c command) String() string { @@ -153,15 +180,14 @@ func (c command) getAssertReturnArgs() []uint64 { return args } -func (c command) getAssertReturnArgsExps() ([]uint64, []uint64) { - var args, exps []uint64 +func (c command) getAssertReturnArgsExps() (args []uint64, exps []uint64) { for _, arg := range c.Action.Args { args = append(args, arg.toUint64s()...) } for _, exp := range c.Exps { exps = append(exps, exp.toUint64s()...) 
} - return args, exps + return } func (c commandActionVal) toUint64s() (ret []uint64) { @@ -170,7 +196,6 @@ func (c commandActionVal) toUint64s() (ret []uint64) { if !ok { panic("BUG") } - var low, high uint64 var width, valNum int switch c.LaneType { case "i8": @@ -188,24 +213,39 @@ func (c commandActionVal) toUint64s() (ret []uint64) { default: panic("BUG") } - for i := 0; i < valNum/2; i++ { - v, err := strconv.ParseUint(strValues[i].(string), 10, width) - if err != nil { - panic(err) + lo, hi := buildLaneUint64(strValues, width, valNum) + return []uint64{lo, hi} + } else { + return []uint64{c.toUint64()} + } +} + +func buildLaneUint64(raw []interface{}, width, valNum int) (lo, hi uint64) { + for i := 0; i < valNum; i++ { + str := raw[i].(string) + + var v uint64 + var err error + if strings.Contains(str, "nan") { + if width == 64 { + v = math.Float64bits(math.NaN()) + } else { + v = uint64(math.Float32bits(float32(math.NaN()))) } - low |= (v << (i * width)) - } - for i := valNum / 2; i < valNum; i++ { - v, err := strconv.ParseUint(strValues[i].(string), 10, width) + } else { + v, err = strconv.ParseUint(str, 10, width) if err != nil { panic(err) } - high |= (v << ((i - valNum/2) * width)) } - return []uint64{low, high} - } else { - return []uint64{c.toUint64()} + + if half := valNum / 2; i < half { + lo |= v << (i * width) + } else { + hi |= v << ((i - half) * width) + } } + return } func (c commandActionVal) toUint64() (ret uint64) { @@ -441,7 +481,14 @@ func Run(t *testing.T, testDataFS embed.FS, newEngine func(wasm.Features) wasm.E vals, types, err := callFunction(ns, moduleName, c.Action.Field, args...) require.NoError(t, err, msg) require.Equal(t, len(exps), len(vals), msg) - requireValuesEq(t, vals, exps, types, msg) + laneTypes := map[int]string{} + for i, expV := range c.Exps { + if expV.ValType == "v128" { + laneTypes[i] = expV.LaneType + } + } + matched, valuesMsg := valuesEq(vals, exps, types, laneTypes) + require.True(t, matched, msg+"\n"+valuesMsg) case "get": _, exps := c.getAssertReturnArgsExps() require.Equal(t, 1, len(exps)) @@ -597,52 +644,157 @@ func testdataPath(filename string) string { return fmt.Sprintf("testdata/%s", filename) } -func requireValuesEq(t *testing.T, actual, exps []uint64, valTypes []wasm.ValueType, msg string) { - var expectedTypesVectorFlattend []wasm.ValueType - for _, tp := range valTypes { - if tp != wasm.ValueTypeV128 { - expectedTypesVectorFlattend = append(expectedTypesVectorFlattend, tp) - } else { - expectedTypesVectorFlattend = append(expectedTypesVectorFlattend, wasm.ValueTypeI64) - expectedTypesVectorFlattend = append(expectedTypesVectorFlattend, wasm.ValueTypeI64) +// valuesEq returns true if all the actual result matches exps which are all expressed as uint64. +// * actual,exps: comparison target values which are all represented as uint64, meaning that if valTypes = [V128,I32], then +// we have actual/exp = [(lower-64bit of the first V128), (higher-64bit of the first V128), I32]. +// * valTypes holds the wasm.ValueType(s) of the original values in Wasm. +// * laneTypes maps the index of valueTypes to laneType if valueTypes[i] == wasm.ValueTypeV128. +// +// Also, if matched == false this returns non-empty valuesMsg which can be used to augment the test failure message. +func valuesEq(actual, exps []uint64, valTypes []wasm.ValueType, laneTypes map[int]laneType) (matched bool, valuesMsg string) { + matched = true + + var msgExpValuesStrs, msgActualValuesStrs []string + var uint64RepPos int // the index to actual and exps slice. 
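+ // matched accumulates across all values so the have/want message covers every value, not just the first mismatch.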
+ for i, tp := range valTypes { + switch tp { + case wasm.ValueTypeI32: + msgExpValuesStrs = append(msgExpValuesStrs, fmt.Sprintf("%d", uint32(exps[uint64RepPos]))) + msgActualValuesStrs = append(msgActualValuesStrs, fmt.Sprintf("%d", uint32(actual[uint64RepPos]))) + matched = matched && (uint32(exps[uint64RepPos]) == uint32(actual[uint64RepPos])) + uint64RepPos++ + case wasm.ValueTypeI64, wasm.ValueTypeExternref, wasm.ValueTypeFuncref: + msgExpValuesStrs = append(msgExpValuesStrs, fmt.Sprintf("%d", exps[uint64RepPos])) + msgActualValuesStrs = append(msgActualValuesStrs, fmt.Sprintf("%d", actual[uint64RepPos])) + matched = matched && (exps[uint64RepPos] == actual[uint64RepPos]) + uint64RepPos++ + case wasm.ValueTypeF32: + a := math.Float32frombits(uint32(actual[uint64RepPos])) + e := math.Float32frombits(uint32(exps[uint64RepPos])) + msgExpValuesStrs = append(msgExpValuesStrs, fmt.Sprintf("%f", e)) + msgActualValuesStrs = append(msgActualValuesStrs, fmt.Sprintf("%f", a)) + matched = matched && f32Equal(e, a) + uint64RepPos++ + case wasm.ValueTypeF64: + e := math.Float64frombits(exps[uint64RepPos]) + a := math.Float64frombits(actual[uint64RepPos]) + msgExpValuesStrs = append(msgExpValuesStrs, fmt.Sprintf("%f", e)) + msgActualValuesStrs = append(msgActualValuesStrs, fmt.Sprintf("%f", a)) + matched = matched && f64Equal(e, a) + uint64RepPos++ + case wasm.ValueTypeV128: + actualLo, actualHi := actual[uint64RepPos], actual[uint64RepPos+1] + expLo, expHi := exps[uint64RepPos], exps[uint64RepPos+1] + switch laneTypes[i] { + case laneTypeI8: + msgExpValuesStrs = append(msgExpValuesStrs, + fmt.Sprintf("i8x16(%#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x)", + byte(expLo), byte(expLo>>8), byte(expLo>>16), byte(expLo>>24), + byte(expLo>>32), byte(expLo>>40), byte(expLo>>48), byte(expLo>>56), + byte(expHi), byte(expHi>>8), byte(expHi>>16), byte(expHi>>24), + byte(expHi>>32), byte(expHi>>40), byte(expHi>>48), byte(expHi>>56), + ), + ) + msgActualValuesStrs = append(msgActualValuesStrs, + fmt.Sprintf("i8x16(%#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x)", + byte(actualLo), byte(actualLo>>8), byte(actualLo>>16), byte(actualLo>>24), + byte(actualLo>>32), byte(actualLo>>40), byte(actualLo>>48), byte(actualLo>>56), + byte(actualHi), byte(actualHi>>8), byte(actualHi>>16), byte(actualHi>>24), + byte(actualHi>>32), byte(actualHi>>40), byte(actualHi>>48), byte(actualHi>>56), + ), + ) + matched = matched && (expLo == actualLo) && (expHi == actualHi) + case laneTypeI16: + msgExpValuesStrs = append(msgExpValuesStrs, + fmt.Sprintf("i16x8(%#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x)", + uint16(expLo), uint16(expLo>>16), uint16(expLo>>32), uint16(expLo>>48), + uint16(expHi), uint16(expHi>>16), uint16(expHi>>32), uint16(expHi>>48), + ), + ) + msgActualValuesStrs = append(msgActualValuesStrs, + fmt.Sprintf("i16x8(%#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x)", + uint16(actualLo), uint16(actualLo>>16), uint16(actualLo>>32), uint16(actualLo>>48), + uint16(actualHi), uint16(actualHi>>16), uint16(actualHi>>32), uint16(actualHi>>48), + ), + ) + matched = matched && (expLo == actualLo) && (expHi == actualHi) + case laneTypeI32: + msgExpValuesStrs = append(msgExpValuesStrs, + fmt.Sprintf("i32x4(%#x, %#x, %#x, %#x)", uint32(expLo), uint32(expLo>>32), uint32(expHi), uint32(expHi>>32)), + ) + msgActualValuesStrs = append(msgActualValuesStrs, + fmt.Sprintf("i32x4(%#x, %#x, %#x, %#x)", uint32(actualLo), uint32(actualLo>>32), uint32(actualHi), uint32(actualHi>>32)), + ) + 
matched = matched && (expLo == actualLo) && (expHi == actualHi) + case laneTypeI64: + msgExpValuesStrs = append(msgExpValuesStrs, + fmt.Sprintf("i64x2(%#x, %#x)", expLo, expHi), + ) + msgActualValuesStrs = append(msgActualValuesStrs, + fmt.Sprintf("i64x2(%#x, %#x)", actualLo, actualHi), + ) + matched = matched && (expLo == actualLo) && (expHi == actualHi) + case laneTypeF32: + msgExpValuesStrs = append(msgExpValuesStrs, + fmt.Sprintf("f32x4(%f, %f, %f, %f)", + math.Float32frombits(uint32(expLo)), math.Float32frombits(uint32(expLo>>32)), + math.Float32frombits(uint32(expHi)), math.Float32frombits(uint32(expHi>>32)), + ), + ) + msgActualValuesStrs = append(msgActualValuesStrs, + fmt.Sprintf("f32x4(%f, %f, %f, %f)", + math.Float32frombits(uint32(actualLo)), math.Float32frombits(uint32(actualLo>>32)), + math.Float32frombits(uint32(actualHi)), math.Float32frombits(uint32(actualHi>>32)), + ), + ) + matched = matched && + f32Equal(math.Float32frombits(uint32(expLo)), math.Float32frombits(uint32(actualLo))) && + f32Equal(math.Float32frombits(uint32(expLo>>32)), math.Float32frombits(uint32(actualLo>>32))) && + f32Equal(math.Float32frombits(uint32(expHi)), math.Float32frombits(uint32(actualHi))) && + f32Equal(math.Float32frombits(uint32(expHi>>32)), math.Float32frombits(uint32(actualHi>>32))) + case laneTypeF64: + msgExpValuesStrs = append(msgExpValuesStrs, + fmt.Sprintf("f64x2(%f, %f)", math.Float64frombits(expLo), math.Float64frombits(expHi)), + ) + msgActualValuesStrs = append(msgActualValuesStrs, + fmt.Sprintf("f64x2(%f, %f)", math.Float64frombits(actualLo), math.Float64frombits(actualHi)), + ) + matched = matched && + f64Equal(math.Float64frombits(expLo), math.Float64frombits(actualLo)) && + f64Equal(math.Float64frombits(expHi), math.Float64frombits(actualHi)) + default: + panic("BUG") + } + uint64RepPos += 2 + default: + panic("BUG") } } - result := fmt.Sprintf("\thave (%v)\n\twant (%v)", actual, exps) - for i := range exps { - requireValueEq(t, actual[i], exps[i], expectedTypesVectorFlattend[i], msg+"\n"+result) + if !matched { + valuesMsg = fmt.Sprintf("\thave [%s]\n\twant [%s]", + strings.Join(msgActualValuesStrs, ", "), + strings.Join(msgExpValuesStrs, ", ")) } + return } -func requireValueEq(t *testing.T, actual, expected uint64, valType wasm.ValueType, msg string) { - switch valType { - case wasm.ValueTypeI32: - require.Equal(t, uint32(expected), uint32(actual), msg) - case wasm.ValueTypeI64: - require.Equal(t, expected, actual, msg) - case wasm.ValueTypeF32: - expF := math.Float32frombits(uint32(expected)) - actualF := math.Float32frombits(uint32(actual)) - if math.IsNaN(float64(expF)) { // NaN cannot be compared with themselves, so we have to use IsNaN - require.True(t, math.IsNaN(float64(actualF)), msg) - } else { - require.Equal(t, expF, actualF, msg) - } - case wasm.ValueTypeF64: - expF := math.Float64frombits(expected) - actualF := math.Float64frombits(actual) - if math.IsNaN(expF) { // NaN cannot be compared with themselves, so we have to use IsNaN - require.True(t, math.IsNaN(actualF), msg) - } else { - require.Equal(t, expF, actualF, msg) - } - case wasm.ValueTypeExternref: - require.Equal(t, expected, actual, msg) - case wasm.ValueTypeFuncref: - require.Equal(t, expected, actual, msg) - default: - t.Fatal(msg) +func f32Equal(expected, actual float32) (matched bool) { + if math.IsNaN(float64(expected)) { // NaN cannot be compared with themselves, so we have to use IsNaN + matched = math.IsNaN(float64(actual)) + } else { + matched = expected == actual + } + return +} + +func 
f64Equal(expected, actual float64) (matched bool) { + if math.IsNaN(expected) { // NaN cannot be compared with themselves, so we have to use IsNaN + matched = math.IsNaN(actual) + } else { + matched = expected == actual } + return } // callFunction is inlined here as the spectest needs to validate the signature was correct diff --git a/internal/integration_test/spectest/spectest_test.go b/internal/integration_test/spectest/spectest_test.go new file mode 100644 index 00000000000..b11ef1dc639 --- /dev/null +++ b/internal/integration_test/spectest/spectest_test.go @@ -0,0 +1,581 @@ +package spectest + +import ( + "encoding/json" + "math" + "testing" + + "github.com/tetratelabs/wazero/internal/testing/require" + "github.com/tetratelabs/wazero/internal/wasm" +) + +func Test_f32Equal(t *testing.T) { + tests := []struct { + name string + f1, f2 float32 + exp bool + }{ + {name: "1", f1: 1.1, f2: 1.1, exp: true}, + {name: "2", f1: float32(math.NaN()), f2: float32(math.NaN()), exp: true}, + {name: "3", f1: float32(math.Inf(1)), f2: float32(math.Inf(1)), exp: true}, + {name: "4", f1: float32(math.Inf(-1)), f2: float32(math.Inf(-1)), exp: true}, + {name: "5", f1: 1.1, f2: -1.1, exp: false}, + {name: "6", f1: float32(math.NaN()), f2: -1.1, exp: false}, + {name: "7", f1: -1.1, f2: float32(math.NaN()), exp: false}, + {name: "8", f1: float32(math.NaN()), f2: float32(math.Inf(1)), exp: false}, + {name: "9", f1: float32(math.Inf(1)), f2: float32(math.NaN()), exp: false}, + {name: "10", f1: float32(math.NaN()), f2: float32(math.Inf(-1)), exp: false}, + {name: "11", f1: float32(math.Inf(-1)), f2: float32(math.NaN()), exp: false}, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + require.Equal(t, tc.exp, f32Equal(tc.f1, tc.f2)) + }) + } +} + +func Test_f64Equal(t *testing.T) { + tests := []struct { + name string + f1, f2 float64 + exp bool + }{ + {name: "1", f1: 1.1, f2: 1.1, exp: true}, + {name: "2", f1: math.NaN(), f2: math.NaN(), exp: true}, + {name: "3", f1: math.Inf(1), f2: math.Inf(1), exp: true}, + {name: "4", f1: math.Inf(-1), f2: math.Inf(-1), exp: true}, + {name: "5", f1: 1.1, f2: -1.1, exp: false}, + {name: "6", f1: math.NaN(), f2: -1.1, exp: false}, + {name: "7", f1: -1.1, f2: math.NaN(), exp: false}, + {name: "8", f1: math.NaN(), f2: math.Inf(1), exp: false}, + {name: "9", f1: math.Inf(1), f2: math.NaN(), exp: false}, + {name: "10", f1: math.NaN(), f2: math.Inf(-1), exp: false}, + {name: "11", f1: math.Inf(-1), f2: math.NaN(), exp: false}, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + require.Equal(t, tc.exp, f64Equal(tc.f1, tc.f2)) + }) + } +} + +func Test_valuesEq(t *testing.T) { + i32, i64, f32, f64, v128 := wasm.ValueTypeI32, wasm.ValueTypeI64, wasm.ValueTypeF32, wasm.ValueTypeF64, wasm.ValueTypeV128 + tests := []struct { + name string + exps, actual []uint64 + valueTypes []wasm.ValueType + laneTypes map[int]laneType + expMatched bool + expValuesMsg string + }{ + { + name: "matched/i32", + exps: []uint64{0}, + actual: []uint64{0}, + valueTypes: []wasm.ValueType{i32}, + expMatched: true, + }, + { + name: "unmatched/i32", + exps: []uint64{1}, + actual: []uint64{0}, + valueTypes: []wasm.ValueType{i32}, + expMatched: false, + expValuesMsg: ` have [0] + want [1]`, + }, + { + name: "unmatched/i32", + exps: []uint64{math.MaxUint32}, + actual: []uint64{1123}, + valueTypes: []wasm.ValueType{i32}, + expMatched: false, + expValuesMsg: ` have [1123] + want [4294967295]`, + }, + { + name: "matched/i64", + exps: 
[]uint64{0}, + actual: []uint64{0}, + valueTypes: []wasm.ValueType{i64}, + expMatched: true, + }, + { + name: "unmatched/i64", + exps: []uint64{1}, + actual: []uint64{0}, + valueTypes: []wasm.ValueType{i64}, + expMatched: false, + expValuesMsg: ` have [0] + want [1]`, + }, + { + name: "unmatched/i64", + exps: []uint64{math.MaxUint64}, + actual: []uint64{1123}, + valueTypes: []wasm.ValueType{i64}, + expMatched: false, + expValuesMsg: ` have [1123] + want [18446744073709551615]`, + }, + { + name: "matched/f32", + exps: []uint64{0}, + actual: []uint64{0}, + valueTypes: []wasm.ValueType{f32}, + expMatched: true, + }, + { + name: "unmatched/f32", + exps: []uint64{uint64(math.Float32bits(-13123.1))}, + actual: []uint64{0}, + valueTypes: []wasm.ValueType{f32}, + expMatched: false, + expValuesMsg: ` have [0.000000] + want [-13123.099609]`, + }, + { + name: "matched/f64", + exps: []uint64{0}, + actual: []uint64{0}, + valueTypes: []wasm.ValueType{f64}, + expMatched: true, + }, + { + name: "unmatched/f64", + exps: []uint64{math.Float64bits(1.0)}, + actual: []uint64{0}, + valueTypes: []wasm.ValueType{f64}, + expMatched: false, + expValuesMsg: ` have [0.000000] + want [1.000000]`, + }, + { + name: "unmatched/f64", + actual: []uint64{math.Float64bits(-1231231.0)}, + exps: []uint64{0}, + valueTypes: []wasm.ValueType{f64}, + expMatched: false, + expValuesMsg: ` have [-1231231.000000] + want [0.000000]`, + }, + { + name: "matched/i8x16", + exps: []uint64{math.MaxUint64, 123}, + actual: []uint64{math.MaxUint64, 123}, + laneTypes: map[int]laneType{0: laneTypeI8}, + valueTypes: []wasm.ValueType{v128}, + expMatched: true, + }, + { + name: "unmatched/i8x16", + exps: []uint64{0, 0xff<<56 | 0xaa}, + actual: []uint64{math.MaxUint64, 0xff<<48 | 0xcc}, + laneTypes: map[int]laneType{0: laneTypeI8}, + valueTypes: []wasm.ValueType{v128}, + expMatched: false, + expValuesMsg: ` have [i8x16(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xcc, 0x0, 0x0, 0x0, 0x0, 0x0, 0xff, 0x0)] + want [i8x16(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xaa, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xff)]`, + }, + { + name: "matched/i16x8", + exps: []uint64{math.MaxUint64, 123}, + actual: []uint64{math.MaxUint64, 123}, + laneTypes: map[int]laneType{0: laneTypeI16}, + valueTypes: []wasm.ValueType{v128}, + expMatched: true, + }, + { + name: "unmatched/i16x8", + exps: []uint64{0xffff << 32, 0}, + actual: []uint64{0xaabb << 16, ^uint64(0)}, + laneTypes: map[int]laneType{0: laneTypeI16}, + valueTypes: []wasm.ValueType{v128}, + expMatched: false, + expValuesMsg: ` have [i16x8(0x0, 0xaabb, 0x0, 0x0, 0xffff, 0xffff, 0xffff, 0xffff)] + want [i16x8(0x0, 0x0, 0xffff, 0x0, 0x0, 0x0, 0x0, 0x0)]`, + }, + { + name: "matched/i32x4", + exps: []uint64{math.MaxUint64, 123}, + actual: []uint64{math.MaxUint64, 123}, + laneTypes: map[int]laneType{0: laneTypeI32}, + valueTypes: []wasm.ValueType{v128}, + expMatched: true, + }, + { + name: "unmatched/i32x4", + exps: []uint64{0xffff_ffff<<32 | 0xa, 123}, + actual: []uint64{0x1a1a_1a1a<<32 | 0xa, 123}, + laneTypes: map[int]laneType{0: laneTypeI32}, + valueTypes: []wasm.ValueType{v128}, + expMatched: false, + expValuesMsg: ` have [i32x4(0xa, 0x1a1a1a1a, 0x7b, 0x0)] + want [i32x4(0xa, 0xffffffff, 0x7b, 0x0)]`, + }, + { + name: "matched/i64x2", + exps: []uint64{math.MaxUint64, 123}, + actual: []uint64{math.MaxUint64, 123}, + laneTypes: map[int]laneType{0: laneTypeI64}, + valueTypes: []wasm.ValueType{v128}, + expMatched: true, + }, + { + name: "unmatched/i64x2", + exps: []uint64{math.MaxUint64, 123}, + actual: []uint64{math.MaxUint64, 
0}, + laneTypes: map[int]laneType{0: laneTypeI64}, + valueTypes: []wasm.ValueType{v128}, + expMatched: false, + expValuesMsg: ` have [i64x2(0xffffffffffffffff, 0x0)] + want [i64x2(0xffffffffffffffff, 0x7b)]`, + }, + { + name: "matched/f32x4", + exps: []uint64{ + (uint64(math.Float32bits(float32(math.NaN()))) << 32) | uint64(math.Float32bits(float32(math.NaN()))), + (uint64(math.Float32bits(float32(math.NaN()))) << 32) | uint64(math.Float32bits(float32(math.NaN()))), + }, + actual: []uint64{ + (uint64(math.Float32bits(float32(math.NaN()))) << 32) | uint64(math.Float32bits(float32(math.NaN()))), + (uint64(math.Float32bits(float32(math.NaN()))) << 32) | uint64(math.Float32bits(float32(math.NaN()))), + }, + valueTypes: []wasm.ValueType{v128}, + laneTypes: map[int]laneType{0: laneTypeF32}, + expMatched: true, + }, + { + name: "unmatched/f32x4", + exps: []uint64{ + (uint64(math.Float32bits(float32(1.213))) << 32) | uint64(math.Float32bits(float32(math.NaN()))), + (uint64(math.Float32bits(float32(math.NaN()))) << 32) | uint64(math.Float32bits(float32(math.NaN()))), + }, + actual: []uint64{ + (uint64(math.Float32bits(float32(math.NaN()))) << 32) | uint64(math.Float32bits(float32(math.Inf(1)))), + (uint64(math.Float32bits(float32(math.Inf(-1)))) << 32) | uint64(math.Float32bits(float32(math.NaN()))), + }, + valueTypes: []wasm.ValueType{v128}, + laneTypes: map[int]laneType{0: laneTypeF32}, + expMatched: false, + expValuesMsg: ` have [f32x4(+Inf, NaN, NaN, -Inf)] + want [f32x4(NaN, 1.213000, NaN, NaN)]`, + }, + { + name: "matched/f64x2", + exps: []uint64{math.Float64bits(1.0), math.Float64bits(math.NaN())}, + actual: []uint64{math.Float64bits(1.0), math.Float64bits(math.NaN())}, + valueTypes: []wasm.ValueType{v128}, + laneTypes: map[int]laneType{0: laneTypeF64}, + expMatched: true, + }, + { + name: "unmatched/f64x2", + exps: []uint64{math.Float64bits(1.0), math.Float64bits(math.NaN())}, + actual: []uint64{math.Float64bits(-1.0), math.Float64bits(math.Inf(1))}, + valueTypes: []wasm.ValueType{v128}, + laneTypes: map[int]laneType{0: laneTypeF64}, + expMatched: false, + expValuesMsg: ` have [f64x2(-1.000000, +Inf)] + want [f64x2(1.000000, NaN)]`, + }, + { + name: "unmatched/f64x2", + exps: []uint64{math.Float64bits(math.Inf(1)), math.Float64bits(math.NaN())}, + actual: []uint64{math.Float64bits(math.Inf(-1)), math.Float64bits(math.NaN())}, + valueTypes: []wasm.ValueType{v128}, + laneTypes: map[int]laneType{0: laneTypeF64}, + expMatched: false, + expValuesMsg: ` have [f64x2(-Inf, NaN)] + want [f64x2(+Inf, NaN)]`, + }, + { + name: "matched/[i32,f64x2]", + exps: []uint64{1, math.Float64bits(1.0), math.Float64bits(math.NaN())}, + actual: []uint64{1, math.Float64bits(1.0), math.Float64bits(math.NaN())}, + valueTypes: []wasm.ValueType{i32, v128}, + laneTypes: map[int]laneType{1: laneTypeF64}, + expMatched: true, + }, + { + name: "unmatched/[i32,f64x2]", + exps: []uint64{123, math.Float64bits(math.Inf(1)), math.Float64bits(math.NaN())}, + actual: []uint64{123, math.Float64bits(math.Inf(-1)), math.Float64bits(math.NaN())}, + valueTypes: []wasm.ValueType{i32, v128}, + laneTypes: map[int]laneType{1: laneTypeF64}, + expMatched: false, + expValuesMsg: ` have [123, f64x2(-Inf, NaN)] + want [123, f64x2(+Inf, NaN)]`, + }, + { + name: "matched/[f64x2,i32]", + exps: []uint64{math.Float64bits(1.0), math.Float64bits(math.NaN()), 1}, + actual: []uint64{math.Float64bits(1.0), math.Float64bits(math.NaN()), 1}, + valueTypes: []wasm.ValueType{v128, i32}, + laneTypes: map[int]laneType{0: laneTypeF64}, + expMatched: true, + }, + 
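// The following cases mix scalar and v128 results; as described in valuesEq's doc comment, e.g. + // valueTypes = [v128, i32] flattens to [v128-lo64, v128-hi64, i32-value] in both exps and actual. +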
{ + name: "unmatched/[f64x2,i32]", + exps: []uint64{math.Float64bits(math.Inf(1)), math.Float64bits(math.NaN()), 123}, + actual: []uint64{math.Float64bits(math.Inf(-1)), math.Float64bits(math.NaN()), 123}, + valueTypes: []wasm.ValueType{v128, i32}, + laneTypes: map[int]laneType{0: laneTypeF64}, + expMatched: false, + expValuesMsg: ` have [f64x2(-Inf, NaN), 123] + want [f64x2(+Inf, NaN), 123]`, + }, + { + name: "matched/[f32,i32,f64x2]", + exps: []uint64{uint64(math.Float32bits(float32(math.NaN()))), math.Float64bits(1.0), math.Float64bits(math.NaN()), 1}, + actual: []uint64{uint64(math.Float32bits(float32(math.NaN()))), math.Float64bits(1.0), math.Float64bits(math.NaN()), 1}, + valueTypes: []wasm.ValueType{f32, v128, i32}, + laneTypes: map[int]laneType{1: laneTypeF64}, + expMatched: true, + }, + { + name: "unmatched/[f32,f64x2,i32]", + exps: []uint64{uint64(math.Float32bits(1.0)), math.Float64bits(math.Inf(1)), math.Float64bits(math.NaN()), 123}, + actual: []uint64{uint64(math.Float32bits(1.0)), math.Float64bits(math.Inf(-1)), math.Float64bits(math.NaN()), 123}, + valueTypes: []wasm.ValueType{f32, v128, i32}, + laneTypes: map[int]laneType{1: laneTypeF64}, + expMatched: false, + expValuesMsg: ` have [1.000000, f64x2(-Inf, NaN), 123] + want [1.000000, f64x2(+Inf, NaN), 123]`, + }, + { + name: "matched/[i8x16,f64x2]", + exps: []uint64{0, 0, math.Float64bits(1.0), math.Float64bits(math.NaN())}, + actual: []uint64{0, 0, math.Float64bits(1.0), math.Float64bits(math.NaN())}, + valueTypes: []wasm.ValueType{v128, v128}, + laneTypes: map[int]laneType{0: laneTypeI8, 1: laneTypeF64}, + expMatched: true, + }, + { + name: "unmatched/[i8x16,f64x2]", + exps: []uint64{0, 0xff << 56, math.Float64bits(1.0), math.Float64bits(math.NaN())}, + actual: []uint64{0, 0xaa << 56, math.Float64bits(1.0), math.Float64bits(math.NaN())}, + valueTypes: []wasm.ValueType{v128, v128}, + laneTypes: map[int]laneType{0: laneTypeI8, 1: laneTypeF64}, + expMatched: false, + expValuesMsg: ` have [i8x16(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xaa), f64x2(1.000000, NaN)] + want [i8x16(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xff), f64x2(1.000000, NaN)]`, + }, + { + name: "unmatched/[i8x16,f64x2]", + exps: []uint64{0, 0xff << 56, math.Float64bits(1.0), math.Float64bits(math.NaN())}, + actual: []uint64{0, 0xff << 56, math.Float64bits(1.0), math.Float64bits(math.Inf(1))}, + valueTypes: []wasm.ValueType{v128, v128}, + laneTypes: map[int]laneType{0: laneTypeI8, 1: laneTypeF64}, + expMatched: false, + expValuesMsg: ` have [i8x16(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xff), f64x2(1.000000, +Inf)] + want [i8x16(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xff), f64x2(1.000000, NaN)]`, + }, + { + name: "matched/[i8x16,i32,f64x2]", + exps: []uint64{0, 0, math.MaxUint32, math.Float64bits(1.0), math.Float64bits(math.NaN())}, + actual: []uint64{0, 0, math.MaxUint32, math.Float64bits(1.0), math.Float64bits(math.NaN())}, + valueTypes: []wasm.ValueType{v128, i32, v128}, + laneTypes: map[int]laneType{0: laneTypeI8, 2: laneTypeF64}, + expMatched: true, + }, + { + name: "matched/[i8x16,i32,f64x2]", + exps: []uint64{0, 0, math.MaxUint32, math.Float64bits(1.0), math.Float64bits(math.NaN())}, + actual: []uint64{0, 0, math.MaxUint32 - 1, math.Float64bits(1.0), math.Float64bits(math.NaN())}, + valueTypes: []wasm.ValueType{v128, i32, v128}, + laneTypes: map[int]laneType{0: laneTypeI8, 2: laneTypeF64}, + expMatched: false, 
+ expValuesMsg: ` have [i8x16(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0), 4294967294, f64x2(1.000000, NaN)] + want [i8x16(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0), 4294967295, f64x2(1.000000, NaN)]`, + }, + { + name: "unmatched/[i8x16,i32,f64x2]", + exps: []uint64{0, 0, math.MaxUint32, math.Float64bits(1.0), math.Float64bits(math.NaN())}, + actual: []uint64{0, 0xff << 16, math.MaxUint32, math.Float64bits(1.0), math.Float64bits(math.NaN())}, + valueTypes: []wasm.ValueType{v128, i32, v128}, + laneTypes: map[int]laneType{0: laneTypeI8, 2: laneTypeF64}, + expMatched: false, + expValuesMsg: ` have [i8x16(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xff, 0x0, 0x0, 0x0, 0x0, 0x0), 4294967295, f64x2(1.000000, NaN)] + want [i8x16(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0), 4294967295, f64x2(1.000000, NaN)]`, + }, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + actualMatched, actualValuesMsg := valuesEq(tc.actual, tc.exps, tc.valueTypes, tc.laneTypes) + require.Equal(t, tc.expMatched, actualMatched) + require.Equal(t, tc.expValuesMsg, actualValuesMsg) + }) + } +} + +func TestCommandActionVal_toUint64s(t *testing.T) { + tests := []struct { + name string + rawCommandActionVal string + exp []uint64 + }{ + { + name: "i32", + rawCommandActionVal: `{"type": "i32", "value": "0"}`, + exp: []uint64{0}, + }, + { + name: "i32", + rawCommandActionVal: `{"type": "i32", "value": "4294967295"}`, + exp: []uint64{4294967295}, + }, + { + name: "i64", + rawCommandActionVal: `{"type": "i64", "value": "0"}`, + exp: []uint64{0}, + }, + { + name: "i64", + rawCommandActionVal: `{"type": "i64", "value": "7034535277573963776"}`, + exp: []uint64{7034535277573963776}, + }, + { + name: "f32", + rawCommandActionVal: `{"type": "f32", "value": "0"}`, + exp: []uint64{0}, + }, + { + name: "f32", + rawCommandActionVal: `{"type": "f32", "value": "2147483648"}`, + exp: []uint64{2147483648}, + }, + { + name: "f64", + rawCommandActionVal: `{"type": "f64", "value": "0"}`, + exp: []uint64{0}, + }, + { + name: "f64", + rawCommandActionVal: `{"type": "f64", "value": "4616189618054758400"}`, + exp: []uint64{4616189618054758400}, + }, + { + name: "f32x4", + rawCommandActionVal: `{"type": "v128", "lane_type": "f32", "value": ["645922816", "645922816", "645922816", "645922816"]}`, + exp: []uint64{645922816<<32 | 645922816, 645922816<<32 | 645922816}, + }, + { + name: "f32x4", + rawCommandActionVal: `{"type": "v128", "lane_type": "f32", "value": ["nan:canonical", "nan:arithmetic", "nan:canonical", "nan:arithmetic"]}`, + exp: []uint64{ + uint64(math.Float32bits(float32(math.NaN()))) | (uint64(math.Float32bits(float32(math.NaN()))) << 32), + uint64(math.Float32bits(float32(math.NaN()))) | (uint64(math.Float32bits(float32(math.NaN()))) << 32), + }, + }, + { + name: "f64x2", + rawCommandActionVal: `{"type": "v128", "lane_type": "f64", "value": ["9223372036854775808", "9223372036854775808"]}`, + exp: []uint64{9223372036854775808, 9223372036854775808}, + }, + { + name: "f64x2", + rawCommandActionVal: `{"type": "v128", "lane_type": "f64", "value": ["nan:canonical", "nan:arithmetic"]}`, + exp: []uint64{math.Float64bits(math.NaN()), math.Float64bits(math.NaN())}, + }, + { + name: "i8x16", + rawCommandActionVal: `{"type": "v128", "lane_type": "i8", "value": ["128", "129", "130", "131", "253", "254", "255", "0", "0", "1", "2", "127", "128", "253", "254", "255"]}`, + exp: []uint64{ + 128 | (129 << 8) | 
(130 << 16) | (131 << 24) | (253 << 32) | (254 << 40) | (255 << 48), + 1<<8 | 2<<16 | 127<<24 | 128<<32 | 253<<40 | 254<<48 | 255<<56, + }, + }, + { + name: "i16x8", + rawCommandActionVal: `{"type": "v128", "lane_type": "i16", "value": ["256", "770", "1284", "1798", "2312", "2826", "3340", "3854"]}`, + exp: []uint64{ + 256 | 770<<16 | 1284<<32 | 1798<<48, + 2312 | 2826<<16 | 3340<<32 | 3854<<48, + }, + }, + { + name: "i32x4", + rawCommandActionVal: `{"type": "v128", "lane_type": "i32", "value": ["123", "32766", "32766", "40000"]}`, + exp: []uint64{ + 123 | 32766<<32, + 32766 | 40000<<32, + }, + }, + { + name: "i64x2", + rawCommandActionVal: `{"type": "v128", "lane_type": "i64", "value": ["18446744073709551615", "123124"]}`, + exp: []uint64{ + 18446744073709551615, + 123124, + }, + }, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + var c commandActionVal + err := json.Unmarshal([]byte(tc.rawCommandActionVal), &c) + require.NoError(t, err) + actual := c.toUint64s() + require.Equal(t, tc.exp, actual) + }) + } +} + +func TestCommand_getAssertReturnArgsExps(t *testing.T) { + tests := []struct { + name string + rawCommand string + args, exps []uint64 + }{ + { + name: "1", + rawCommand: ` +{ + "type": "assert_return", + "line": 148, + "action": { + "type": "invoke", "field": "f32x4.min", + "args": [ + {"type": "v128", "lane_type": "f32", "value": ["2147483648", "123", "2147483648", "1"]}, + {"type": "v128", "lane_type": "i8", "value": ["128", "129", "130", "131", "253", "254", "255", "0", "0", "1", "2", "127", "128", "253", "254", "255"]} + ] + }, + "expected": [ + {"type": "v128", "lane_type": "f32", "value": ["2147483648", "0", "0", "2147483648"]} + ] +}`, + args: []uint64{ + 123<<32 | 2147483648, + 1<<32 | 2147483648, + 128 | (129 << 8) | (130 << 16) | (131 << 24) | (253 << 32) | (254 << 40) | (255 << 48), + 1<<8 | 2<<16 | 127<<24 | 128<<32 | 253<<40 | 254<<48 | 255<<56, + }, + exps: []uint64{ + 2147483648, + 2147483648 << 32, + }, + }, + } + + for _, tc := range tests { + tc := tc + t.Run(tc.name, func(t *testing.T) { + var c command + err := json.Unmarshal([]byte(tc.rawCommand), &c) + require.NoError(t, err) + actualArgs, actualExps := c.getAssertReturnArgsExps() + require.Equal(t, tc.args, actualArgs) + require.Equal(t, tc.exps, actualExps) + }) + } +} diff --git a/internal/integration_test/spectest/v2/spec_test.go b/internal/integration_test/spectest/v2/spec_test.go index 7ebf37d9153..76838f372c6 100644 --- a/internal/integration_test/spectest/v2/spec_test.go +++ b/internal/integration_test/spectest/v2/spec_test.go @@ -4,7 +4,6 @@ import ( "embed" "path" "runtime" - "strings" "testing" "github.com/tetratelabs/wazero/internal/engine/compiler" @@ -26,43 +25,28 @@ func TestCompiler(t *testing.T) { } spectest.Run(t, testcases, compiler.NewEngine, enabledFeatures, func(jsonname string) bool { - // TODO: remove after SIMD proposal - if strings.Contains(jsonname, "simd") { - switch path.Base(jsonname) { - case "simd_address.json", "simd_const.json", "simd_align.json", "simd_load16_lane.json", "simd_load32_lane.json", - "simd_load64_lane.json", "simd_load8_lane.json", "simd_lane.json", "simd_load_extend.json", - "simd_load_splat.json", "simd_load_zero.json", "simd_store.json", "simd_store16_lane.json", - "simd_store32_lane.json", "simd_store64_lane.json", "simd_store8_lane.json": - return true - case "simd_bitwise.json", "simd_boolean.json", "simd_bit_shift.json", - "simd_i8x16_cmp.json", "simd_i16x8_cmp.json", "simd_i32x4_cmp.json", 
"simd_i64x2_cmp.json", - "simd_f32x4_cmp.json", "simd_f64x2_cmp.json": - // TODO: implement on arm64. - return runtime.GOARCH == "amd64" - default: - return false // others not supported, yet! - } + switch path.Base(jsonname) { + case "simd_bitwise.json", "simd_boolean.json", "simd_bit_shift.json", + "simd_i8x16_cmp.json", "simd_i16x8_cmp.json", "simd_i32x4_cmp.json", "simd_i64x2_cmp.json", + "simd_f32x4_cmp.json", "simd_f64x2_cmp.json", "simd_f32x4_arith.json", "simd_f64x2_arith.json", + "simd_i16x8_arith.json", "simd_i64x2_arith.json", "simd_i32x4_arith.json", "simd_i8x16_arith.json", + "simd_i16x8_sat_arith.json", "simd_i8x16_sat_arith.json", + "simd_i16x8_arith2.json", "simd_i8x16_arith2.json", "simd_i32x4_arith2.json", "simd_i64x2_arith2.json", + "simd_f64x2.json", "simd_f32x4.json", "simd_f32x4_rounding.json", "simd_f64x2_rounding.json", + "simd_f64x2_pmin_pmax.json", "simd_f32x4_pmin_pmax.json", "simd_int_to_int_extend.json", + "simd_i64x2_extmul_i32x4.json", "simd_i32x4_extmul_i16x8.json", "simd_i16x8_extmul_i8x16.json", + "simd_i16x8_q15mulr_sat_s.json", "simd_i16x8_extadd_pairwise_i8x16.json", "simd_i32x4_extadd_pairwise_i16x8.json", + "simd_i32x4_dot_i16x8.json", "simd_i32x4_trunc_sat_f32x4.json", + "simd_splat.json", "simd_load.json", "simd_i32x4_trunc_sat_f64x2.json", + "simd_conversions.json": + // TODO: implement on arm64. + return runtime.GOARCH == "amd64" + default: + return true } - return true }) } func TestInterpreter(t *testing.T) { - spectest.Run(t, testcases, interpreter.NewEngine, enabledFeatures, func(jsonname string) bool { - // TODO: remove after SIMD proposal - if strings.Contains(jsonname, "simd") { - switch path.Base(jsonname) { - case "simd_address.json", "simd_const.json", "simd_align.json", "simd_load16_lane.json", - "simd_load32_lane.json", "simd_load64_lane.json", "simd_load8_lane.json", "simd_lane.json", - "simd_load_extend.json", "simd_load_splat.json", "simd_load_zero.json", "simd_store.json", - "simd_store16_lane.json", "simd_store32_lane.json", "simd_store64_lane.json", "simd_store8_lane.json", - "simd_bitwise.json", "simd_boolean.json", "simd_bit_shift.json", "simd_i8x16_cmp.json", "simd_i16x8_cmp.json", - "simd_i32x4_cmp.json", "simd_i64x2_cmp.json", "simd_f32x4_cmp.json", "simd_f64x2_cmp.json": - return true - default: - return false // others not supported, yet! 
- } - } - return true - }) + spectest.Run(t, testcases, interpreter.NewEngine, enabledFeatures, func(string) bool { return true }) } diff --git a/internal/wasm/func_validation.go b/internal/wasm/func_validation.go index 899edda04db..a8cd9f9198a 100644 --- a/internal/wasm/func_validation.go +++ b/internal/wasm/func_validation.go @@ -714,7 +714,7 @@ func (m *Module) validateFunctionWithMaxStackValues( return fmt.Errorf("cannot pop the f64 operand for %s: %v", InstructionName(op), err) } valueTypeStack.push(ValueTypeI64) - case OpcodeF32ConvertI32s, OpcodeF32ConvertI32U: + case OpcodeF32ConvertI32S, OpcodeF32ConvertI32U: if err := valueTypeStack.popAndVerifyType(ValueTypeI32); err != nil { return fmt.Errorf("cannot pop the i32 operand for %s: %v", InstructionName(op), err) } @@ -1071,14 +1071,6 @@ func (m *Module) validateFunctionWithMaxStackValues( } pc += 16 valueTypeStack.push(ValueTypeV128) - case OpcodeVecI8x16Add, OpcodeVecI16x8Add, OpcodeVecI32x4Add, OpcodeVecI64x2Add, - OpcodeVecI8x16Sub, OpcodeVecI16x8Sub, OpcodeVecI32x4Sub, OpcodeVecI64x2Sub: - for i := 0; i < 2; i++ { - if err := valueTypeStack.popAndVerifyType(ValueTypeV128); err != nil { - return fmt.Errorf("cannot pop the operand for %s: %v", vectorInstructionName[vecOpcode], err) - } - } - valueTypeStack.push(ValueTypeV128) case OpcodeVecV128AnyTrue, OpcodeVecI8x16AllTrue, OpcodeVecI16x8AllTrue, OpcodeVecI32x4AllTrue, OpcodeVecI64x2AllTrue, OpcodeVecI8x16BitMask, OpcodeVecI16x8BitMask, OpcodeVecI32x4BitMask, OpcodeVecI64x2BitMask: if err := valueTypeStack.popAndVerifyType(ValueTypeV128); err != nil { @@ -1309,7 +1301,53 @@ func (m *Module) validateFunctionWithMaxStackValues( OpcodeVecI64x2Eq, OpcodeVecI64x2Ne, OpcodeVecI64x2LtS, OpcodeVecI64x2GtS, OpcodeVecI64x2LeS, OpcodeVecI64x2GeS, OpcodeVecF32x4Eq, OpcodeVecF32x4Ne, OpcodeVecF32x4Lt, OpcodeVecF32x4Gt, OpcodeVecF32x4Le, OpcodeVecF32x4Ge, OpcodeVecF64x2Eq, OpcodeVecF64x2Ne, OpcodeVecF64x2Lt, - OpcodeVecF64x2Gt, OpcodeVecF64x2Le, OpcodeVecF64x2Ge: + OpcodeVecF64x2Gt, OpcodeVecF64x2Le, OpcodeVecF64x2Ge, + OpcodeVecI32x4DotI16x8S, + OpcodeVecI8x16NarrowI16x8S, OpcodeVecI8x16NarrowI16x8U, OpcodeVecI16x8NarrowI32x4S, OpcodeVecI16x8NarrowI32x4U: + if err := valueTypeStack.popAndVerifyType(ValueTypeV128); err != nil { + return fmt.Errorf("cannot pop the operand for %s: %v", vectorInstructionName[vecOpcode], err) + } + if err := valueTypeStack.popAndVerifyType(ValueTypeV128); err != nil { + return fmt.Errorf("cannot pop the operand for %s: %v", vectorInstructionName[vecOpcode], err) + } + valueTypeStack.push(ValueTypeV128) + case OpcodeVecI8x16Neg, OpcodeVecI16x8Neg, OpcodeVecI32x4Neg, OpcodeVecI64x2Neg, OpcodeVecF32x4Neg, OpcodeVecF64x2Neg, + OpcodeVecF32x4Sqrt, OpcodeVecF64x2Sqrt, + OpcodeVecI8x16Abs, OpcodeVecI8x16Popcnt, OpcodeVecI16x8Abs, OpcodeVecI32x4Abs, OpcodeVecI64x2Abs, + OpcodeVecF32x4Abs, OpcodeVecF64x2Abs, + OpcodeVecF32x4Ceil, OpcodeVecF32x4Floor, OpcodeVecF32x4Trunc, OpcodeVecF32x4Nearest, + OpcodeVecF64x2Ceil, OpcodeVecF64x2Floor, OpcodeVecF64x2Trunc, OpcodeVecF64x2Nearest, + OpcodeVecI16x8ExtendLowI8x16S, OpcodeVecI16x8ExtendHighI8x16S, OpcodeVecI16x8ExtendLowI8x16U, OpcodeVecI16x8ExtendHighI8x16U, + OpcodeVecI32x4ExtendLowI16x8S, OpcodeVecI32x4ExtendHighI16x8S, OpcodeVecI32x4ExtendLowI16x8U, OpcodeVecI32x4ExtendHighI16x8U, + OpcodeVecI64x2ExtendLowI32x4S, OpcodeVecI64x2ExtendHighI32x4S, OpcodeVecI64x2ExtendLowI32x4U, OpcodeVecI64x2ExtendHighI32x4U, + OpcodeVecI16x8ExtaddPairwiseI8x16S, OpcodeVecI16x8ExtaddPairwiseI8x16U, + OpcodeVecI32x4ExtaddPairwiseI16x8S, 
OpcodeVecI32x4ExtaddPairwiseI16x8U, + OpcodeVecF64x2PromoteLowF32x4Zero, OpcodeVecF32x4DemoteF64x2Zero, + OpcodeVecF32x4ConvertI32x4S, OpcodeVecF32x4ConvertI32x4U, + OpcodeVecF64x2ConvertLowI32x4S, OpcodeVecF64x2ConvertLowI32x4U, + OpcodeVecI32x4TruncSatF32x4S, OpcodeVecI32x4TruncSatF32x4U, OpcodeVecI32x4TruncSatF64x2SZero, OpcodeVecI32x4TruncSatF64x2UZero: + if err := valueTypeStack.popAndVerifyType(ValueTypeV128); err != nil { + return fmt.Errorf("cannot pop the operand for %s: %v", vectorInstructionName[vecOpcode], err) + } + valueTypeStack.push(ValueTypeV128) + + case OpcodeVecI8x16Add, OpcodeVecI8x16AddSatS, OpcodeVecI8x16AddSatU, OpcodeVecI8x16Sub, OpcodeVecI8x16SubSatS, OpcodeVecI8x16SubSatU, + OpcodeVecI16x8Add, OpcodeVecI16x8AddSatS, OpcodeVecI16x8AddSatU, OpcodeVecI16x8Sub, OpcodeVecI16x8SubSatS, OpcodeVecI16x8SubSatU, OpcodeVecI16x8Mul, + OpcodeVecI32x4Add, OpcodeVecI32x4Sub, OpcodeVecI32x4Mul, + OpcodeVecI64x2Add, OpcodeVecI64x2Sub, OpcodeVecI64x2Mul, + OpcodeVecF32x4Add, OpcodeVecF32x4Sub, OpcodeVecF32x4Mul, OpcodeVecF32x4Div, + OpcodeVecF64x2Add, OpcodeVecF64x2Sub, OpcodeVecF64x2Mul, OpcodeVecF64x2Div, + OpcodeVecI8x16MinS, OpcodeVecI8x16MinU, OpcodeVecI8x16MaxS, OpcodeVecI8x16MaxU, + OpcodeVecI8x16AvgrU, + OpcodeVecI16x8MinS, OpcodeVecI16x8MinU, OpcodeVecI16x8MaxS, OpcodeVecI16x8MaxU, + OpcodeVecI16x8AvgrU, + OpcodeVecI32x4MinS, OpcodeVecI32x4MinU, OpcodeVecI32x4MaxS, OpcodeVecI32x4MaxU, + OpcodeVecF32x4Min, OpcodeVecF32x4Max, OpcodeVecF64x2Min, OpcodeVecF64x2Max, + OpcodeVecF32x4Pmin, OpcodeVecF32x4Pmax, OpcodeVecF64x2Pmin, OpcodeVecF64x2Pmax, + OpcodeVecI16x8Q15mulrSatS, + OpcodeVecI16x8ExtMulLowI8x16S, OpcodeVecI16x8ExtMulHighI8x16S, OpcodeVecI16x8ExtMulLowI8x16U, OpcodeVecI16x8ExtMulHighI8x16U, + OpcodeVecI32x4ExtMulLowI16x8S, OpcodeVecI32x4ExtMulHighI16x8S, OpcodeVecI32x4ExtMulLowI16x8U, OpcodeVecI32x4ExtMulHighI16x8U, + OpcodeVecI64x2ExtMulLowI32x4S, OpcodeVecI64x2ExtMulHighI32x4S, OpcodeVecI64x2ExtMulLowI32x4U, OpcodeVecI64x2ExtMulHighI32x4U: if err := valueTypeStack.popAndVerifyType(ValueTypeV128); err != nil { return fmt.Errorf("cannot pop the operand for %s: %v", vectorInstructionName[vecOpcode], err) } diff --git a/internal/wasm/func_validation_test.go b/internal/wasm/func_validation_test.go index d926bb77ea5..11e37a90599 100644 --- a/internal/wasm/func_validation_test.go +++ b/internal/wasm/func_validation_test.go @@ -3020,6 +3020,122 @@ func TestModule_funcValidation_SIMD(t *testing.T) { {name: OpcodeVecF64x2GtName, body: vv2v(OpcodeVecF64x2Gt)}, {name: OpcodeVecF64x2LeName, body: vv2v(OpcodeVecF64x2Le)}, {name: OpcodeVecF64x2GeName, body: vv2v(OpcodeVecF64x2Ge)}, + {name: OpcodeVecI8x16AddName, body: vv2v(OpcodeVecI8x16Add)}, + {name: OpcodeVecI8x16AddSatSName, body: vv2v(OpcodeVecI8x16AddSatS)}, + {name: OpcodeVecI8x16AddSatUName, body: vv2v(OpcodeVecI8x16AddSatU)}, + {name: OpcodeVecI8x16SubName, body: vv2v(OpcodeVecI8x16Sub)}, + {name: OpcodeVecI8x16SubSatSName, body: vv2v(OpcodeVecI8x16SubSatS)}, + {name: OpcodeVecI8x16SubSatUName, body: vv2v(OpcodeVecI8x16SubSatU)}, + {name: OpcodeVecI16x8AddName, body: vv2v(OpcodeVecI16x8Add)}, + {name: OpcodeVecI16x8AddSatSName, body: vv2v(OpcodeVecI16x8AddSatS)}, + {name: OpcodeVecI16x8AddSatUName, body: vv2v(OpcodeVecI16x8AddSatU)}, + {name: OpcodeVecI16x8SubName, body: vv2v(OpcodeVecI16x8Sub)}, + {name: OpcodeVecI16x8SubSatSName, body: vv2v(OpcodeVecI16x8SubSatS)}, + {name: OpcodeVecI16x8SubSatUName, body: vv2v(OpcodeVecI16x8SubSatU)}, + {name: OpcodeVecI16x8MulName, body: vv2v(OpcodeVecI16x8Mul)}, + {name: 
OpcodeVecI32x4AddName, body: vv2v(OpcodeVecI32x4Add)}, + {name: OpcodeVecI32x4SubName, body: vv2v(OpcodeVecI32x4Sub)}, + {name: OpcodeVecI32x4MulName, body: vv2v(OpcodeVecI32x4Mul)}, + {name: OpcodeVecI64x2AddName, body: vv2v(OpcodeVecI64x2Add)}, + {name: OpcodeVecI64x2SubName, body: vv2v(OpcodeVecI64x2Sub)}, + {name: OpcodeVecI64x2MulName, body: vv2v(OpcodeVecI64x2Mul)}, + {name: OpcodeVecF32x4AddName, body: vv2v(OpcodeVecF32x4Add)}, + {name: OpcodeVecF32x4SubName, body: vv2v(OpcodeVecF32x4Sub)}, + {name: OpcodeVecF32x4MulName, body: vv2v(OpcodeVecF32x4Mul)}, + {name: OpcodeVecF32x4DivName, body: vv2v(OpcodeVecF32x4Div)}, + {name: OpcodeVecF64x2AddName, body: vv2v(OpcodeVecF64x2Add)}, + {name: OpcodeVecF64x2SubName, body: vv2v(OpcodeVecF64x2Sub)}, + {name: OpcodeVecF64x2MulName, body: vv2v(OpcodeVecF64x2Mul)}, + {name: OpcodeVecF64x2DivName, body: vv2v(OpcodeVecF64x2Div)}, + {name: OpcodeVecI8x16NegName, body: v2v(OpcodeVecI8x16Neg)}, + {name: OpcodeVecI16x8NegName, body: v2v(OpcodeVecI16x8Neg)}, + {name: OpcodeVecI32x4NegName, body: v2v(OpcodeVecI32x4Neg)}, + {name: OpcodeVecI64x2NegName, body: v2v(OpcodeVecI64x2Neg)}, + {name: OpcodeVecF32x4NegName, body: v2v(OpcodeVecF32x4Neg)}, + {name: OpcodeVecF64x2NegName, body: v2v(OpcodeVecF64x2Neg)}, + {name: OpcodeVecF32x4SqrtName, body: v2v(OpcodeVecF32x4Sqrt)}, + {name: OpcodeVecF64x2SqrtName, body: v2v(OpcodeVecF64x2Sqrt)}, + {name: OpcodeVecI8x16MinSName, body: vv2v(OpcodeVecI8x16MinS)}, + {name: OpcodeVecI8x16MinUName, body: vv2v(OpcodeVecI8x16MinU)}, + {name: OpcodeVecI8x16MaxSName, body: vv2v(OpcodeVecI8x16MaxS)}, + {name: OpcodeVecI8x16MaxUName, body: vv2v(OpcodeVecI8x16MaxU)}, + {name: OpcodeVecI8x16AvgrUName, body: vv2v(OpcodeVecI8x16AvgrU)}, + {name: OpcodeVecI8x16AbsName, body: v2v(OpcodeVecI8x16Abs)}, + {name: OpcodeVecI8x16PopcntName, body: v2v(OpcodeVecI8x16Popcnt)}, + {name: OpcodeVecI16x8MinSName, body: vv2v(OpcodeVecI16x8MinS)}, + {name: OpcodeVecI16x8MinUName, body: vv2v(OpcodeVecI16x8MinU)}, + {name: OpcodeVecI16x8MaxSName, body: vv2v(OpcodeVecI16x8MaxS)}, + {name: OpcodeVecI16x8MaxUName, body: vv2v(OpcodeVecI16x8MaxU)}, + {name: OpcodeVecI16x8AvgrUName, body: vv2v(OpcodeVecI16x8AvgrU)}, + {name: OpcodeVecI16x8AbsName, body: v2v(OpcodeVecI16x8Abs)}, + {name: OpcodeVecI32x4MinSName, body: vv2v(OpcodeVecI32x4MinS)}, + {name: OpcodeVecI32x4MinUName, body: vv2v(OpcodeVecI32x4MinU)}, + {name: OpcodeVecI32x4MaxSName, body: vv2v(OpcodeVecI32x4MaxS)}, + {name: OpcodeVecI32x4MaxUName, body: vv2v(OpcodeVecI32x4MaxU)}, + {name: OpcodeVecI32x4AbsName, body: v2v(OpcodeVecI32x4Abs)}, + {name: OpcodeVecI64x2AbsName, body: v2v(OpcodeVecI64x2Abs)}, + {name: OpcodeVecF32x4AbsName, body: v2v(OpcodeVecF32x4Abs)}, + {name: OpcodeVecF64x2AbsName, body: v2v(OpcodeVecF64x2Abs)}, + {name: OpcodeVecF32x4MinName, body: vv2v(OpcodeVecF32x4Min)}, + {name: OpcodeVecF32x4MaxName, body: vv2v(OpcodeVecF32x4Max)}, + {name: OpcodeVecF64x2MinName, body: vv2v(OpcodeVecF64x2Min)}, + {name: OpcodeVecF64x2MaxName, body: vv2v(OpcodeVecF64x2Max)}, + {name: OpcodeVecF32x4CeilName, body: v2v(OpcodeVecF32x4Ceil)}, + {name: OpcodeVecF32x4FloorName, body: v2v(OpcodeVecF32x4Floor)}, + {name: OpcodeVecF32x4TruncName, body: v2v(OpcodeVecF32x4Trunc)}, + {name: OpcodeVecF32x4NearestName, body: v2v(OpcodeVecF32x4Nearest)}, + {name: OpcodeVecF64x2CeilName, body: v2v(OpcodeVecF64x2Ceil)}, + {name: OpcodeVecF64x2FloorName, body: v2v(OpcodeVecF64x2Floor)}, + {name: OpcodeVecF64x2TruncName, body: v2v(OpcodeVecF64x2Trunc)}, + {name: OpcodeVecF64x2NearestName, body: 
v2v(OpcodeVecF64x2Nearest)}, + {name: OpcodeVecF32x4PminName, body: vv2v(OpcodeVecF32x4Pmin)}, + {name: OpcodeVecF32x4PmaxName, body: vv2v(OpcodeVecF32x4Pmax)}, + {name: OpcodeVecF64x2PminName, body: vv2v(OpcodeVecF64x2Pmin)}, + {name: OpcodeVecF64x2PmaxName, body: vv2v(OpcodeVecF64x2Pmax)}, + {name: OpcodeVecI16x8ExtendLowI8x16SName, body: v2v(OpcodeVecI16x8ExtendLowI8x16S)}, + {name: OpcodeVecI16x8ExtendHighI8x16SName, body: v2v(OpcodeVecI16x8ExtendHighI8x16S)}, + {name: OpcodeVecI16x8ExtendLowI8x16UName, body: v2v(OpcodeVecI16x8ExtendLowI8x16U)}, + {name: OpcodeVecI16x8ExtendHighI8x16UName, body: v2v(OpcodeVecI16x8ExtendHighI8x16U)}, + {name: OpcodeVecI32x4ExtendLowI16x8SName, body: v2v(OpcodeVecI32x4ExtendLowI16x8S)}, + {name: OpcodeVecI32x4ExtendHighI16x8SName, body: v2v(OpcodeVecI32x4ExtendHighI16x8S)}, + {name: OpcodeVecI32x4ExtendLowI16x8UName, body: v2v(OpcodeVecI32x4ExtendLowI16x8U)}, + {name: OpcodeVecI32x4ExtendHighI16x8UName, body: v2v(OpcodeVecI32x4ExtendHighI16x8U)}, + {name: OpcodeVecI64x2ExtendLowI32x4SName, body: v2v(OpcodeVecI64x2ExtendLowI32x4S)}, + {name: OpcodeVecI64x2ExtendHighI32x4SName, body: v2v(OpcodeVecI64x2ExtendHighI32x4S)}, + {name: OpcodeVecI64x2ExtendLowI32x4UName, body: v2v(OpcodeVecI64x2ExtendLowI32x4U)}, + {name: OpcodeVecI64x2ExtendHighI32x4UName, body: v2v(OpcodeVecI64x2ExtendHighI32x4U)}, + {name: OpcodeVecI16x8Q15mulrSatSName, body: vv2v(OpcodeVecI16x8Q15mulrSatS)}, + {name: OpcodeVecI16x8ExtMulLowI8x16SName, body: vv2v(OpcodeVecI16x8ExtMulLowI8x16S)}, + {name: OpcodeVecI16x8ExtMulHighI8x16SName, body: vv2v(OpcodeVecI16x8ExtMulHighI8x16S)}, + {name: OpcodeVecI16x8ExtMulLowI8x16UName, body: vv2v(OpcodeVecI16x8ExtMulLowI8x16U)}, + {name: OpcodeVecI16x8ExtMulHighI8x16UName, body: vv2v(OpcodeVecI16x8ExtMulHighI8x16U)}, + {name: OpcodeVecI32x4ExtMulLowI16x8SName, body: vv2v(OpcodeVecI32x4ExtMulLowI16x8S)}, + {name: OpcodeVecI32x4ExtMulHighI16x8SName, body: vv2v(OpcodeVecI32x4ExtMulHighI16x8S)}, + {name: OpcodeVecI32x4ExtMulLowI16x8UName, body: vv2v(OpcodeVecI32x4ExtMulLowI16x8U)}, + {name: OpcodeVecI32x4ExtMulHighI16x8UName, body: vv2v(OpcodeVecI32x4ExtMulHighI16x8U)}, + {name: OpcodeVecI64x2ExtMulLowI32x4SName, body: vv2v(OpcodeVecI64x2ExtMulLowI32x4S)}, + {name: OpcodeVecI64x2ExtMulHighI32x4SName, body: vv2v(OpcodeVecI64x2ExtMulHighI32x4S)}, + {name: OpcodeVecI64x2ExtMulLowI32x4UName, body: vv2v(OpcodeVecI64x2ExtMulLowI32x4U)}, + {name: OpcodeVecI64x2ExtMulHighI32x4UName, body: vv2v(OpcodeVecI64x2ExtMulHighI32x4U)}, + {name: OpcodeVecI16x8ExtaddPairwiseI8x16SName, body: v2v(OpcodeVecI16x8ExtaddPairwiseI8x16S)}, + {name: OpcodeVecI16x8ExtaddPairwiseI8x16UName, body: v2v(OpcodeVecI16x8ExtaddPairwiseI8x16U)}, + {name: OpcodeVecI32x4ExtaddPairwiseI16x8SName, body: v2v(OpcodeVecI32x4ExtaddPairwiseI16x8S)}, + {name: OpcodeVecI32x4ExtaddPairwiseI16x8UName, body: v2v(OpcodeVecI32x4ExtaddPairwiseI16x8U)}, + {name: OpcodeVecF64x2PromoteLowF32x4ZeroName, body: v2v(OpcodeVecF64x2PromoteLowF32x4Zero)}, + {name: OpcodeVecF32x4DemoteF64x2ZeroName, body: v2v(OpcodeVecF32x4DemoteF64x2Zero)}, + {name: OpcodeVecF32x4ConvertI32x4SName, body: v2v(OpcodeVecF32x4ConvertI32x4S)}, + {name: OpcodeVecF32x4ConvertI32x4UName, body: v2v(OpcodeVecF32x4ConvertI32x4U)}, + {name: OpcodeVecF64x2ConvertLowI32x4SName, body: v2v(OpcodeVecF64x2ConvertLowI32x4S)}, + {name: OpcodeVecF64x2ConvertLowI32x4UName, body: v2v(OpcodeVecF64x2ConvertLowI32x4U)}, + {name: OpcodeVecI32x4DotI16x8SName, body: vv2v(OpcodeVecI32x4DotI16x8S)}, + {name: OpcodeVecI8x16NarrowI16x8SName, body: 
vv2v(OpcodeVecI8x16NarrowI16x8S)}, + {name: OpcodeVecI8x16NarrowI16x8UName, body: vv2v(OpcodeVecI8x16NarrowI16x8U)}, + {name: OpcodeVecI16x8NarrowI32x4SName, body: vv2v(OpcodeVecI16x8NarrowI32x4S)}, + {name: OpcodeVecI16x8NarrowI32x4UName, body: vv2v(OpcodeVecI16x8NarrowI32x4U)}, + {name: OpcodeVecI32x4TruncSatF32x4SName, body: v2v(OpcodeVecI32x4TruncSatF32x4S)}, + {name: OpcodeVecI32x4TruncSatF32x4UName, body: v2v(OpcodeVecI32x4TruncSatF32x4U)}, + {name: OpcodeVecI32x4TruncSatF64x2SZeroName, body: v2v(OpcodeVecI32x4TruncSatF64x2SZero)}, + {name: OpcodeVecI32x4TruncSatF64x2UZeroName, body: v2v(OpcodeVecI32x4TruncSatF64x2UZero)}, } for _, tt := range tests { @@ -3115,16 +3231,6 @@ func TestModule_funcValidation_SIMD_error(t *testing.T) { }, expectedErr: "invalid lane index[0] 255 >= 32 for v128.shuffle", }, - { - // TODO delete this case after SIMD impl completion. - name: "unimplemented", - body: []byte{ - OpcodeVecPrefix, - OpcodeVecF32x4DemoteF64x2Zero, - }, - flag: FeatureSIMD, - expectedErr: "TODO: SIMD instruction f32x4.demote_f64x2_zero will be implemented in #506", - }, } addExtractOrReplaceLaneOutOfIndexCase := func(op OpcodeVec, lane, laneCeil byte) { diff --git a/internal/wasm/instruction.go b/internal/wasm/instruction.go index 8cab0bd2d68..857633ffcb4 100644 --- a/internal/wasm/instruction.go +++ b/internal/wasm/instruction.go @@ -218,7 +218,7 @@ const ( OpcodeI64TruncF64S Opcode = 0xb0 OpcodeI64TruncF64U Opcode = 0xb1 - OpcodeF32ConvertI32s Opcode = 0xb2 + OpcodeF32ConvertI32S Opcode = 0xb2 OpcodeF32ConvertI32U Opcode = 0xb3 OpcodeF32ConvertI64S Opcode = 0xb4 OpcodeF32ConvertI64U Opcode = 0xb5 @@ -487,7 +487,7 @@ const ( OpcodeVecI8x16MinU OpcodeVec = 0x77 OpcodeVecI8x16MaxS OpcodeVec = 0x78 OpcodeVecI8x16MaxU OpcodeVec = 0x79 - OpcodeVecI8x16ArgrU OpcodeVec = 0x7b + OpcodeVecI8x16AvgrU OpcodeVec = 0x7b // i16 misc. 
@@ -495,7 +495,7 @@ const ( OpcodeVecI16x8ExtaddPairwiseI8x16U OpcodeVec = 0x7d OpcodeVecI16x8Abs OpcodeVec = 0x80 OpcodeVecI16x8Neg OpcodeVec = 0x81 - OpcodeVecI16x8Q16mulrSatS OpcodeVec = 0x82 + OpcodeVecI16x8Q15mulrSatS OpcodeVec = 0x82 OpcodeVecI16x8AllTrue OpcodeVec = 0x83 OpcodeVecI16x8BitMask OpcodeVec = 0x84 OpcodeVecI16x8NarrowI32x4S OpcodeVec = 0x85 @@ -518,7 +518,7 @@ const ( OpcodeVecI16x8MinU OpcodeVec = 0x97 OpcodeVecI16x8MaxS OpcodeVec = 0x98 OpcodeVecI16x8MaxU OpcodeVec = 0x99 - OpcodeVecI16x8ArgrU OpcodeVec = 0x9b + OpcodeVecI16x8AvgrU OpcodeVec = 0x9b OpcodeVecI16x8ExtMulLowI8x16S OpcodeVec = 0x9c OpcodeVecI16x8ExtMulHighI8x16S OpcodeVec = 0x9d OpcodeVecI16x8ExtMulLowI8x16U OpcodeVec = 0x9e @@ -617,8 +617,8 @@ const ( OpcodeVecF32x4ConvertI32x4U OpcodeVec = 0xfb OpcodeVecI32x4TruncSatF64x2SZero OpcodeVec = 0xfc OpcodeVecI32x4TruncSatF64x2UZero OpcodeVec = 0xfd - OpcodeVecF64x2ConvertI32x4S OpcodeVec = 0xfe - OpcodeVecF64x2ConvertI32x4U OpcodeVec = 0xff + OpcodeVecF64x2ConvertLowI32x4S OpcodeVec = 0xfe + OpcodeVecF64x2ConvertLowI32x4U OpcodeVec = 0xff OpcodeVecF32x4DemoteF64x2Zero OpcodeVec = 0x5e OpcodeVecF64x2PromoteLowF32x4Zero OpcodeVec = 0x5f ) @@ -783,7 +783,7 @@ const ( OpcodeI64TruncF32UName = "i64.trunc_f32_u" OpcodeI64TruncF64SName = "i64.trunc_f64_s" OpcodeI64TruncF64UName = "i64.trunc_f64_u" - OpcodeF32ConvertI32sName = "f32.convert_i32_s" + OpcodeF32ConvertI32SName = "f32.convert_i32_s" OpcodeF32ConvertI32UName = "f32.convert_i32_u" OpcodeF32ConvertI64SName = "f32.convert_i64_s" OpcodeF32ConvertI64UName = "f32.convert_i64u" @@ -977,7 +977,7 @@ var instructionNames = [256]string{ OpcodeI64TruncF32U: OpcodeI64TruncF32UName, OpcodeI64TruncF64S: OpcodeI64TruncF64SName, OpcodeI64TruncF64U: OpcodeI64TruncF64UName, - OpcodeF32ConvertI32s: OpcodeF32ConvertI32sName, + OpcodeF32ConvertI32S: OpcodeF32ConvertI32SName, OpcodeF32ConvertI32U: OpcodeF32ConvertI32UName, OpcodeF32ConvertI64S: OpcodeF32ConvertI64SName, OpcodeF32ConvertI64U: OpcodeF32ConvertI64UName, @@ -1187,12 +1187,12 @@ const ( OpcodeVecI8x16MinUName = "i8x16.min_u" OpcodeVecI8x16MaxSName = "i8x16.max_s" OpcodeVecI8x16MaxUName = "i8x16.max_u" - OpcodeVecI8x16ArgrUName = "i8x16.argr_u" + OpcodeVecI8x16AvgrUName = "i8x16.avgr_u" OpcodeVecI16x8ExtaddPairwiseI8x16SName = "i16x8.extadd_pairwise_i8x16_s" OpcodeVecI16x8ExtaddPairwiseI8x16UName = "i16x8.extadd_pairwise_i8x16_u" OpcodeVecI16x8AbsName = "i16x8.abs" OpcodeVecI16x8NegName = "i16x8.neg" - OpcodeVecI16x8Q16mulrSatSName = "i16x8.q15mulr_sat_s" + OpcodeVecI16x8Q15mulrSatSName = "i16x8.q15mulr_sat_s" OpcodeVecI16x8AllTrueName = "i16x8.all_true" OpcodeVecI16x8BitMaskName = "i16x8.bitmask" OpcodeVecI16x8NarrowI32x4SName = "i16x8.narrow_i32x4_s" @@ -1215,7 +1215,7 @@ const ( OpcodeVecI16x8MinUName = "i16x8.min_u" OpcodeVecI16x8MaxSName = "i16x8.max_s" OpcodeVecI16x8MaxUName = "i16x8.max_u" - OpcodeVecI16x8ArgrUName = "i16x8.argr_u" + OpcodeVecI16x8AvgrUName = "i16x8.avgr_u" OpcodeVecI16x8ExtMulLowI8x16SName = "i16x8.extmul_low_i8x16_s" OpcodeVecI16x8ExtMulHighI8x16SName = "i16x8.extmul_high_i8x16_s" OpcodeVecI16x8ExtMulLowI8x16UName = "i16x8.extmul_low_i8x16_u" @@ -1299,8 +1299,8 @@ const ( OpcodeVecF32x4ConvertI32x4UName = "f32x4.convert_i32x4_u" OpcodeVecI32x4TruncSatF64x2SZeroName = "i32x4.trunc_sat_f64x2_s_zero" OpcodeVecI32x4TruncSatF64x2UZeroName = "i32x4.trunc_sat_f64x2_u_zero" - OpcodeVecF64x2ConvertI32x4SName = "f64x2.convert_low_i32x4_s" - OpcodeVecF64x2ConvertI32x4UName = "f64x2.convert_low_i32x4_u" + OpcodeVecF64x2ConvertLowI32x4SName = 
"f64x2.convert_low_i32x4_s" + OpcodeVecF64x2ConvertLowI32x4UName = "f64x2.convert_low_i32x4_u" OpcodeVecF32x4DemoteF64x2ZeroName = "f32x4.demote_f64x2_zero" OpcodeVecF64x2PromoteLowF32x4ZeroName = "f64x2.promote_low_f32x4" ) @@ -1426,12 +1426,12 @@ var vectorInstructionName = map[OpcodeVec]string{ OpcodeVecI8x16MinU: OpcodeVecI8x16MinUName, OpcodeVecI8x16MaxS: OpcodeVecI8x16MaxSName, OpcodeVecI8x16MaxU: OpcodeVecI8x16MaxUName, - OpcodeVecI8x16ArgrU: OpcodeVecI8x16ArgrUName, + OpcodeVecI8x16AvgrU: OpcodeVecI8x16AvgrUName, OpcodeVecI16x8ExtaddPairwiseI8x16S: OpcodeVecI16x8ExtaddPairwiseI8x16SName, OpcodeVecI16x8ExtaddPairwiseI8x16U: OpcodeVecI16x8ExtaddPairwiseI8x16UName, OpcodeVecI16x8Abs: OpcodeVecI16x8AbsName, OpcodeVecI16x8Neg: OpcodeVecI16x8NegName, - OpcodeVecI16x8Q16mulrSatS: OpcodeVecI16x8Q16mulrSatSName, + OpcodeVecI16x8Q15mulrSatS: OpcodeVecI16x8Q15mulrSatSName, OpcodeVecI16x8AllTrue: OpcodeVecI16x8AllTrueName, OpcodeVecI16x8BitMask: OpcodeVecI16x8BitMaskName, OpcodeVecI16x8NarrowI32x4S: OpcodeVecI16x8NarrowI32x4SName, @@ -1454,7 +1454,7 @@ var vectorInstructionName = map[OpcodeVec]string{ OpcodeVecI16x8MinU: OpcodeVecI16x8MinUName, OpcodeVecI16x8MaxS: OpcodeVecI16x8MaxSName, OpcodeVecI16x8MaxU: OpcodeVecI16x8MaxUName, - OpcodeVecI16x8ArgrU: OpcodeVecI16x8ArgrUName, + OpcodeVecI16x8AvgrU: OpcodeVecI16x8AvgrUName, OpcodeVecI16x8ExtMulLowI8x16S: OpcodeVecI16x8ExtMulLowI8x16SName, OpcodeVecI16x8ExtMulHighI8x16S: OpcodeVecI16x8ExtMulHighI8x16SName, OpcodeVecI16x8ExtMulLowI8x16U: OpcodeVecI16x8ExtMulLowI8x16UName, @@ -1538,8 +1538,8 @@ var vectorInstructionName = map[OpcodeVec]string{ OpcodeVecF32x4ConvertI32x4U: OpcodeVecF32x4ConvertI32x4UName, OpcodeVecI32x4TruncSatF64x2SZero: OpcodeVecI32x4TruncSatF64x2SZeroName, OpcodeVecI32x4TruncSatF64x2UZero: OpcodeVecI32x4TruncSatF64x2UZeroName, - OpcodeVecF64x2ConvertI32x4S: OpcodeVecF64x2ConvertI32x4SName, - OpcodeVecF64x2ConvertI32x4U: OpcodeVecF64x2ConvertI32x4UName, + OpcodeVecF64x2ConvertLowI32x4S: OpcodeVecF64x2ConvertLowI32x4SName, + OpcodeVecF64x2ConvertLowI32x4U: OpcodeVecF64x2ConvertLowI32x4UName, OpcodeVecF32x4DemoteF64x2Zero: OpcodeVecF32x4DemoteF64x2ZeroName, OpcodeVecF64x2PromoteLowF32x4Zero: OpcodeVecF64x2PromoteLowF32x4ZeroName, } diff --git a/internal/wasm/module.go b/internal/wasm/module.go index cac6207f7cf..c5b4be9ffd4 100644 --- a/internal/wasm/module.go +++ b/internal/wasm/module.go @@ -1006,7 +1006,7 @@ const ( ValueTypeF32 = api.ValueTypeF32 ValueTypeF64 = api.ValueTypeF64 // TODO: ValueTypeV128 is not exposed in the api pkg yet. - ValueTypeV128 = 0x7b + ValueTypeV128 ValueType = 0x7b // TODO: ValueTypeFuncref is not exposed in the api pkg yet. 
ValueTypeFuncref ValueType = 0x70 ValueTypeExternref = api.ValueTypeExternref diff --git a/internal/wazeroir/compiler.go b/internal/wazeroir/compiler.go index 0aa1a81778e..59b08350517 100644 --- a/internal/wazeroir/compiler.go +++ b/internal/wazeroir/compiler.go @@ -947,7 +947,7 @@ operatorSwitch: return err } c.emit( - &OperationStore8{Type: UnsignedInt32, Arg: imm}, + &OperationStore8{Arg: imm}, ) case wasm.OpcodeI32Store16: imm, err := c.readMemoryArg(wasm.OpcodeI32Store16Name) @@ -955,7 +955,7 @@ operatorSwitch: return err } c.emit( - &OperationStore16{Type: UnsignedInt32, Arg: imm}, + &OperationStore16{Arg: imm}, ) case wasm.OpcodeI64Store8: imm, err := c.readMemoryArg(wasm.OpcodeI64Store8Name) @@ -963,7 +963,7 @@ operatorSwitch: return err } c.emit( - &OperationStore8{Type: UnsignedInt64, Arg: imm}, + &OperationStore8{Arg: imm}, ) case wasm.OpcodeI64Store16: imm, err := c.readMemoryArg(wasm.OpcodeI64Store16Name) @@ -971,7 +971,7 @@ operatorSwitch: return err } c.emit( - &OperationStore16{Type: UnsignedInt64, Arg: imm}, + &OperationStore16{Arg: imm}, ) case wasm.OpcodeI64Store32: imm, err := c.readMemoryArg(wasm.OpcodeI64Store32Name) @@ -1457,7 +1457,7 @@ operatorSwitch: c.emit( &OperationITruncFromF{InputType: Float64, OutputType: SignedUint64}, ) - case wasm.OpcodeF32ConvertI32s: + case wasm.OpcodeF32ConvertI32S: c.emit( &OperationFConvertFromI{InputType: SignedInt32, OutputType: Float32}, ) @@ -1730,45 +1730,13 @@ operatorSwitch: &OperationV128Const{Lo: lo, Hi: hi}, ) c.pc += 7 - case wasm.OpcodeVecI8x16Add: - c.emit( - &OperationV128Add{Shape: ShapeI8x16}, - ) - case wasm.OpcodeVecI16x8Add: - c.emit( - &OperationV128Add{Shape: ShapeI16x8}, - ) - case wasm.OpcodeVecI32x4Add: - c.emit( - &OperationV128Add{Shape: ShapeI32x4}, - ) - case wasm.OpcodeVecI64x2Add: - c.emit( - &OperationV128Add{Shape: ShapeI64x2}, - ) - case wasm.OpcodeVecI8x16Sub: - c.emit( - &OperationV128Sub{Shape: ShapeI8x16}, - ) - case wasm.OpcodeVecI16x8Sub: - c.emit( - &OperationV128Sub{Shape: ShapeI16x8}, - ) - case wasm.OpcodeVecI32x4Sub: - c.emit( - &OperationV128Sub{Shape: ShapeI32x4}, - ) - case wasm.OpcodeVecI64x2Sub: - c.emit( - &OperationV128Sub{Shape: ShapeI64x2}, - ) case wasm.OpcodeVecV128Load: arg, err := c.readMemoryArg(wasm.OpcodeI32LoadName) if err != nil { return err } c.emit( - &OperationV128Load{Type: LoadV128Type128, Arg: arg}, + &OperationV128Load{Type: V128LoadType128, Arg: arg}, ) case wasm.OpcodeVecV128Load8x8s: arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load8x8SName) @@ -1776,7 +1744,7 @@ operatorSwitch: return err } c.emit( - &OperationV128Load{Type: LoadV128Type8x8s, Arg: arg}, + &OperationV128Load{Type: V128LoadType8x8s, Arg: arg}, ) case wasm.OpcodeVecV128Load8x8u: arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load8x8UName) @@ -1784,7 +1752,7 @@ operatorSwitch: return err } c.emit( - &OperationV128Load{Type: LoadV128Type8x8u, Arg: arg}, + &OperationV128Load{Type: V128LoadType8x8u, Arg: arg}, ) case wasm.OpcodeVecV128Load16x4s: arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load16x4SName) @@ -1792,7 +1760,7 @@ operatorSwitch: return err } c.emit( - &OperationV128Load{Type: LoadV128Type16x4s, Arg: arg}, + &OperationV128Load{Type: V128LoadType16x4s, Arg: arg}, ) case wasm.OpcodeVecV128Load16x4u: arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load16x4UName) @@ -1800,7 +1768,7 @@ operatorSwitch: return err } c.emit( - &OperationV128Load{Type: LoadV128Type16x4u, Arg: arg}, + &OperationV128Load{Type: V128LoadType16x4u, Arg: arg}, ) case wasm.OpcodeVecV128Load32x2s: arg, err := 
c.readMemoryArg(wasm.OpcodeVecV128Load32x2SName) @@ -1808,7 +1776,7 @@ operatorSwitch: return err } c.emit( - &OperationV128Load{Type: LoadV128Type32x2s, Arg: arg}, + &OperationV128Load{Type: V128LoadType32x2s, Arg: arg}, ) case wasm.OpcodeVecV128Load32x2u: arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load32x2UName) @@ -1816,7 +1784,7 @@ operatorSwitch: return err } c.emit( - &OperationV128Load{Type: LoadV128Type32x2u, Arg: arg}, + &OperationV128Load{Type: V128LoadType32x2u, Arg: arg}, ) case wasm.OpcodeVecV128Load8Splat: arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load8SplatName) @@ -1824,7 +1792,7 @@ operatorSwitch: return err } c.emit( - &OperationV128Load{Type: LoadV128Type8Splat, Arg: arg}, + &OperationV128Load{Type: V128LoadType8Splat, Arg: arg}, ) case wasm.OpcodeVecV128Load16Splat: arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load16SplatName) @@ -1832,7 +1800,7 @@ operatorSwitch: return err } c.emit( - &OperationV128Load{Type: LoadV128Type16Splat, Arg: arg}, + &OperationV128Load{Type: V128LoadType16Splat, Arg: arg}, ) case wasm.OpcodeVecV128Load32Splat: arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load32SplatName) @@ -1840,7 +1808,7 @@ operatorSwitch: return err } c.emit( - &OperationV128Load{Type: LoadV128Type32Splat, Arg: arg}, + &OperationV128Load{Type: V128LoadType32Splat, Arg: arg}, ) case wasm.OpcodeVecV128Load64Splat: arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load64SplatName) @@ -1848,7 +1816,7 @@ operatorSwitch: return err } c.emit( - &OperationV128Load{Type: LoadV128Type64Splat, Arg: arg}, + &OperationV128Load{Type: V128LoadType64Splat, Arg: arg}, ) case wasm.OpcodeVecV128Load32zero: arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load32zeroName) @@ -1856,7 +1824,7 @@ operatorSwitch: return err } c.emit( - &OperationV128Load{Type: LoadV128Type32zero, Arg: arg}, + &OperationV128Load{Type: V128LoadType32zero, Arg: arg}, ) case wasm.OpcodeVecV128Load64zero: arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load64zeroName) @@ -1864,7 +1832,7 @@ operatorSwitch: return err } c.emit( - &OperationV128Load{Type: LoadV128Type64zero, Arg: arg}, + &OperationV128Load{Type: V128LoadType64zero, Arg: arg}, ) case wasm.OpcodeVecV128Load8Lane: arg, err := c.readMemoryArg(wasm.OpcodeVecV128Load8LaneName) @@ -2372,6 +2340,470 @@ operatorSwitch: c.emit( &OperationV128Cmp{Type: V128CmpTypeF64x2Ge}, ) + case wasm.OpcodeVecI8x16Neg: + c.emit( + &OperationV128Neg{Shape: ShapeI8x16}, + ) + case wasm.OpcodeVecI16x8Neg: + c.emit( + &OperationV128Neg{Shape: ShapeI16x8}, + ) + case wasm.OpcodeVecI32x4Neg: + c.emit( + &OperationV128Neg{Shape: ShapeI32x4}, + ) + case wasm.OpcodeVecI64x2Neg: + c.emit( + &OperationV128Neg{Shape: ShapeI64x2}, + ) + case wasm.OpcodeVecF32x4Neg: + c.emit( + &OperationV128Neg{Shape: ShapeF32x4}, + ) + case wasm.OpcodeVecF64x2Neg: + c.emit( + &OperationV128Neg{Shape: ShapeF64x2}, + ) + case wasm.OpcodeVecI8x16Add: + c.emit( + &OperationV128Add{Shape: ShapeI8x16}, + ) + case wasm.OpcodeVecI16x8Add: + c.emit( + &OperationV128Add{Shape: ShapeI16x8}, + ) + case wasm.OpcodeVecI32x4Add: + c.emit( + &OperationV128Add{Shape: ShapeI32x4}, + ) + case wasm.OpcodeVecI64x2Add: + c.emit( + &OperationV128Add{Shape: ShapeI64x2}, + ) + case wasm.OpcodeVecF32x4Add: + c.emit( + &OperationV128Add{Shape: ShapeF32x4}, + ) + case wasm.OpcodeVecF64x2Add: + c.emit( + &OperationV128Add{Shape: ShapeF64x2}, + ) + case wasm.OpcodeVecI8x16Sub: + c.emit( + &OperationV128Sub{Shape: ShapeI8x16}, + ) + case wasm.OpcodeVecI16x8Sub: + c.emit( + &OperationV128Sub{Shape: ShapeI16x8}, + ) + case 
wasm.OpcodeVecI32x4Sub: + c.emit( + &OperationV128Sub{Shape: ShapeI32x4}, + ) + case wasm.OpcodeVecI64x2Sub: + c.emit( + &OperationV128Sub{Shape: ShapeI64x2}, + ) + case wasm.OpcodeVecF32x4Sub: + c.emit( + &OperationV128Sub{Shape: ShapeF32x4}, + ) + case wasm.OpcodeVecF64x2Sub: + c.emit( + &OperationV128Sub{Shape: ShapeF64x2}, + ) + case wasm.OpcodeVecI8x16AddSatS: + c.emit( + &OperationV128AddSat{Shape: ShapeI8x16, Signed: true}, + ) + case wasm.OpcodeVecI8x16AddSatU: + c.emit( + &OperationV128AddSat{Shape: ShapeI8x16, Signed: false}, + ) + case wasm.OpcodeVecI16x8AddSatS: + c.emit( + &OperationV128AddSat{Shape: ShapeI16x8, Signed: true}, + ) + case wasm.OpcodeVecI16x8AddSatU: + c.emit( + &OperationV128AddSat{Shape: ShapeI16x8, Signed: false}, + ) + case wasm.OpcodeVecI8x16SubSatS: + c.emit( + &OperationV128SubSat{Shape: ShapeI8x16, Signed: true}, + ) + case wasm.OpcodeVecI8x16SubSatU: + c.emit( + &OperationV128SubSat{Shape: ShapeI8x16, Signed: false}, + ) + case wasm.OpcodeVecI16x8SubSatS: + c.emit( + &OperationV128SubSat{Shape: ShapeI16x8, Signed: true}, + ) + case wasm.OpcodeVecI16x8SubSatU: + c.emit( + &OperationV128SubSat{Shape: ShapeI16x8, Signed: false}, + ) + case wasm.OpcodeVecI16x8Mul: + c.emit( + &OperationV128Mul{Shape: ShapeI16x8}, + ) + case wasm.OpcodeVecI32x4Mul: + c.emit( + &OperationV128Mul{Shape: ShapeI32x4}, + ) + case wasm.OpcodeVecI64x2Mul: + c.emit( + &OperationV128Mul{Shape: ShapeI64x2}, + ) + case wasm.OpcodeVecF32x4Mul: + c.emit( + &OperationV128Mul{Shape: ShapeF32x4}, + ) + case wasm.OpcodeVecF64x2Mul: + c.emit( + &OperationV128Mul{Shape: ShapeF64x2}, + ) + case wasm.OpcodeVecF32x4Sqrt: + c.emit( + &OperationV128Sqrt{Shape: ShapeF32x4}, + ) + case wasm.OpcodeVecF64x2Sqrt: + c.emit( + &OperationV128Sqrt{Shape: ShapeF64x2}, + ) + case wasm.OpcodeVecF32x4Div: + c.emit( + &OperationV128Div{Shape: ShapeF32x4}, + ) + case wasm.OpcodeVecF64x2Div: + c.emit( + &OperationV128Div{Shape: ShapeF64x2}, + ) + case wasm.OpcodeVecI8x16Abs: + c.emit( + &OperationV128Abs{Shape: ShapeI8x16}, + ) + case wasm.OpcodeVecI8x16Popcnt: + c.emit( + &OperationV128Popcnt{}, + ) + case wasm.OpcodeVecI16x8Abs: + c.emit( + &OperationV128Abs{Shape: ShapeI16x8}, + ) + case wasm.OpcodeVecI32x4Abs: + c.emit( + &OperationV128Abs{Shape: ShapeI32x4}, + ) + case wasm.OpcodeVecI64x2Abs: + c.emit( + &OperationV128Abs{Shape: ShapeI64x2}, + ) + case wasm.OpcodeVecF32x4Abs: + c.emit( + &OperationV128Abs{Shape: ShapeF32x4}, + ) + case wasm.OpcodeVecF64x2Abs: + c.emit( + &OperationV128Abs{Shape: ShapeF64x2}, + ) + case wasm.OpcodeVecI8x16MinS: + c.emit( + &OperationV128Min{Signed: true, Shape: ShapeI8x16}, + ) + case wasm.OpcodeVecI8x16MinU: + c.emit( + &OperationV128Min{Shape: ShapeI8x16}, + ) + case wasm.OpcodeVecI8x16MaxS: + c.emit( + &OperationV128Max{Shape: ShapeI8x16, Signed: true}, + ) + case wasm.OpcodeVecI8x16MaxU: + c.emit( + &OperationV128Max{Shape: ShapeI8x16}, + ) + case wasm.OpcodeVecI8x16AvgrU: + c.emit( + &OperationV128AvgrU{Shape: ShapeI8x16}, + ) + case wasm.OpcodeVecI16x8MinS: + c.emit( + &OperationV128Min{Signed: true, Shape: ShapeI16x8}, + ) + case wasm.OpcodeVecI16x8MinU: + c.emit( + &OperationV128Min{Shape: ShapeI16x8}, + ) + case wasm.OpcodeVecI16x8MaxS: + c.emit( + &OperationV128Max{Shape: ShapeI16x8, Signed: true}, + ) + case wasm.OpcodeVecI16x8MaxU: + c.emit( + &OperationV128Max{Shape: ShapeI16x8}, + ) + case wasm.OpcodeVecI16x8AvgrU: + c.emit( + &OperationV128AvgrU{Shape: ShapeI16x8}, + ) + case wasm.OpcodeVecI32x4MinS: + c.emit( + &OperationV128Min{Signed: true, Shape: 
ShapeI32x4}, + ) + case wasm.OpcodeVecI32x4MinU: + c.emit( + &OperationV128Min{Shape: ShapeI32x4}, + ) + case wasm.OpcodeVecI32x4MaxS: + c.emit( + &OperationV128Max{Shape: ShapeI32x4, Signed: true}, + ) + case wasm.OpcodeVecI32x4MaxU: + c.emit( + &OperationV128Max{Shape: ShapeI32x4}, + ) + case wasm.OpcodeVecF32x4Min: + c.emit( + &OperationV128Min{Shape: ShapeF32x4}, + ) + case wasm.OpcodeVecF32x4Max: + c.emit( + &OperationV128Max{Shape: ShapeF32x4}, + ) + case wasm.OpcodeVecF64x2Min: + c.emit( + &OperationV128Min{Shape: ShapeF64x2}, + ) + case wasm.OpcodeVecF64x2Max: + c.emit( + &OperationV128Max{Shape: ShapeF64x2}, + ) + case wasm.OpcodeVecF32x4Pmin: + c.emit( + &OperationV128Pmin{Shape: ShapeF32x4}, + ) + case wasm.OpcodeVecF32x4Pmax: + c.emit( + &OperationV128Pmax{Shape: ShapeF32x4}, + ) + case wasm.OpcodeVecF64x2Pmin: + c.emit( + &OperationV128Pmin{Shape: ShapeF64x2}, + ) + case wasm.OpcodeVecF64x2Pmax: + c.emit( + &OperationV128Pmax{Shape: ShapeF64x2}, + ) + case wasm.OpcodeVecF32x4Ceil: + c.emit( + &OperationV128Ceil{Shape: ShapeF32x4}, + ) + case wasm.OpcodeVecF32x4Floor: + c.emit( + &OperationV128Floor{Shape: ShapeF32x4}, + ) + case wasm.OpcodeVecF32x4Trunc: + c.emit( + &OperationV128Trunc{Shape: ShapeF32x4}, + ) + case wasm.OpcodeVecF32x4Nearest: + c.emit( + &OperationV128Nearest{Shape: ShapeF32x4}, + ) + case wasm.OpcodeVecF64x2Ceil: + c.emit( + &OperationV128Ceil{Shape: ShapeF64x2}, + ) + case wasm.OpcodeVecF64x2Floor: + c.emit( + &OperationV128Floor{Shape: ShapeF64x2}, + ) + case wasm.OpcodeVecF64x2Trunc: + c.emit( + &OperationV128Trunc{Shape: ShapeF64x2}, + ) + case wasm.OpcodeVecF64x2Nearest: + c.emit( + &OperationV128Nearest{Shape: ShapeF64x2}, + ) + case wasm.OpcodeVecI16x8ExtendLowI8x16S: + c.emit( + &OperationV128Extend{OriginShape: ShapeI8x16, Signed: true, UseLow: true}, + ) + case wasm.OpcodeVecI16x8ExtendHighI8x16S: + c.emit( + &OperationV128Extend{OriginShape: ShapeI8x16, Signed: true, UseLow: false}, + ) + case wasm.OpcodeVecI16x8ExtendLowI8x16U: + c.emit( + &OperationV128Extend{OriginShape: ShapeI8x16, Signed: false, UseLow: true}, + ) + case wasm.OpcodeVecI16x8ExtendHighI8x16U: + c.emit( + &OperationV128Extend{OriginShape: ShapeI8x16, Signed: false, UseLow: false}, + ) + case wasm.OpcodeVecI32x4ExtendLowI16x8S: + c.emit( + &OperationV128Extend{OriginShape: ShapeI16x8, Signed: true, UseLow: true}, + ) + case wasm.OpcodeVecI32x4ExtendHighI16x8S: + c.emit( + &OperationV128Extend{OriginShape: ShapeI16x8, Signed: true, UseLow: false}, + ) + case wasm.OpcodeVecI32x4ExtendLowI16x8U: + c.emit( + &OperationV128Extend{OriginShape: ShapeI16x8, Signed: false, UseLow: true}, + ) + case wasm.OpcodeVecI32x4ExtendHighI16x8U: + c.emit( + &OperationV128Extend{OriginShape: ShapeI16x8, Signed: false, UseLow: false}, + ) + case wasm.OpcodeVecI64x2ExtendLowI32x4S: + c.emit( + &OperationV128Extend{OriginShape: ShapeI32x4, Signed: true, UseLow: true}, + ) + case wasm.OpcodeVecI64x2ExtendHighI32x4S: + c.emit( + &OperationV128Extend{OriginShape: ShapeI32x4, Signed: true, UseLow: false}, + ) + case wasm.OpcodeVecI64x2ExtendLowI32x4U: + c.emit( + &OperationV128Extend{OriginShape: ShapeI32x4, Signed: false, UseLow: true}, + ) + case wasm.OpcodeVecI64x2ExtendHighI32x4U: + c.emit( + &OperationV128Extend{OriginShape: ShapeI32x4, Signed: false, UseLow: false}, + ) + case wasm.OpcodeVecI16x8Q15mulrSatS: + c.emit( + &OperationV128Q15mulrSatS{}, + ) + case wasm.OpcodeVecI16x8ExtMulLowI8x16S: + c.emit( + &OperationV128ExtMul{OriginShape: ShapeI8x16, Signed: true, UseLow: true}, + ) + case 
wasm.OpcodeVecI16x8ExtMulHighI8x16S: + c.emit( + &OperationV128ExtMul{OriginShape: ShapeI8x16, Signed: true, UseLow: false}, + ) + case wasm.OpcodeVecI16x8ExtMulLowI8x16U: + c.emit( + &OperationV128ExtMul{OriginShape: ShapeI8x16, Signed: false, UseLow: true}, + ) + case wasm.OpcodeVecI16x8ExtMulHighI8x16U: + c.emit( + &OperationV128ExtMul{OriginShape: ShapeI8x16, Signed: false, UseLow: false}, + ) + case wasm.OpcodeVecI32x4ExtMulLowI16x8S: + c.emit( + &OperationV128ExtMul{OriginShape: ShapeI16x8, Signed: true, UseLow: true}, + ) + case wasm.OpcodeVecI32x4ExtMulHighI16x8S: + c.emit( + &OperationV128ExtMul{OriginShape: ShapeI16x8, Signed: true, UseLow: false}, + ) + case wasm.OpcodeVecI32x4ExtMulLowI16x8U: + c.emit( + &OperationV128ExtMul{OriginShape: ShapeI16x8, Signed: false, UseLow: true}, + ) + case wasm.OpcodeVecI32x4ExtMulHighI16x8U: + c.emit( + &OperationV128ExtMul{OriginShape: ShapeI16x8, Signed: false, UseLow: false}, + ) + case wasm.OpcodeVecI64x2ExtMulLowI32x4S: + c.emit( + &OperationV128ExtMul{OriginShape: ShapeI32x4, Signed: true, UseLow: true}, + ) + case wasm.OpcodeVecI64x2ExtMulHighI32x4S: + c.emit( + &OperationV128ExtMul{OriginShape: ShapeI32x4, Signed: true, UseLow: false}, + ) + case wasm.OpcodeVecI64x2ExtMulLowI32x4U: + c.emit( + &OperationV128ExtMul{OriginShape: ShapeI32x4, Signed: false, UseLow: true}, + ) + case wasm.OpcodeVecI64x2ExtMulHighI32x4U: + c.emit( + &OperationV128ExtMul{OriginShape: ShapeI32x4, Signed: false, UseLow: false}, + ) + case wasm.OpcodeVecI16x8ExtaddPairwiseI8x16S: + c.emit( + &OperationV128ExtAddPairwise{OriginShape: ShapeI8x16, Signed: true}, + ) + case wasm.OpcodeVecI16x8ExtaddPairwiseI8x16U: + c.emit( + &OperationV128ExtAddPairwise{OriginShape: ShapeI8x16, Signed: false}, + ) + case wasm.OpcodeVecI32x4ExtaddPairwiseI16x8S: + c.emit( + &OperationV128ExtAddPairwise{OriginShape: ShapeI16x8, Signed: true}, + ) + case wasm.OpcodeVecI32x4ExtaddPairwiseI16x8U: + c.emit( + &OperationV128ExtAddPairwise{OriginShape: ShapeI16x8, Signed: false}, + ) + case wasm.OpcodeVecF64x2PromoteLowF32x4Zero: + c.emit( + &OperationV128FloatPromote{}, + ) + case wasm.OpcodeVecF32x4DemoteF64x2Zero: + c.emit( + &OperationV128FloatDemote{}, + ) + case wasm.OpcodeVecF32x4ConvertI32x4S: + c.emit( + &OperationV128FConvertFromI{DestinationShape: ShapeF32x4, Signed: true}, + ) + case wasm.OpcodeVecF32x4ConvertI32x4U: + c.emit( + &OperationV128FConvertFromI{DestinationShape: ShapeF32x4, Signed: false}, + ) + case wasm.OpcodeVecF64x2ConvertLowI32x4S: + c.emit( + &OperationV128FConvertFromI{DestinationShape: ShapeF64x2, Signed: true}, + ) + case wasm.OpcodeVecF64x2ConvertLowI32x4U: + c.emit( + &OperationV128FConvertFromI{DestinationShape: ShapeF64x2, Signed: false}, + ) + case wasm.OpcodeVecI32x4DotI16x8S: + c.emit( + &OperationV128Dot{}, + ) + case wasm.OpcodeVecI8x16NarrowI16x8S: + c.emit( + &OperationV128Narrow{OriginShape: ShapeI16x8, Signed: true}, + ) + case wasm.OpcodeVecI8x16NarrowI16x8U: + c.emit( + &OperationV128Narrow{OriginShape: ShapeI16x8, Signed: false}, + ) + case wasm.OpcodeVecI16x8NarrowI32x4S: + c.emit( + &OperationV128Narrow{OriginShape: ShapeI32x4, Signed: true}, + ) + case wasm.OpcodeVecI16x8NarrowI32x4U: + c.emit( + &OperationV128Narrow{OriginShape: ShapeI32x4, Signed: false}, + ) + case wasm.OpcodeVecI32x4TruncSatF32x4S: + c.emit( + &OperationV128ITruncSatFromF{OriginShape: ShapeF32x4, Signed: true}, + ) + case wasm.OpcodeVecI32x4TruncSatF32x4U: + c.emit( + &OperationV128ITruncSatFromF{OriginShape: ShapeF32x4, Signed: false}, + ) + case 
wasm.OpcodeVecI32x4TruncSatF64x2SZero: + c.emit( + &OperationV128ITruncSatFromF{OriginShape: ShapeF64x2, Signed: true}, + ) + case wasm.OpcodeVecI32x4TruncSatF64x2UZero: + c.emit( + &OperationV128ITruncSatFromF{OriginShape: ShapeF64x2, Signed: false}, + ) default: return fmt.Errorf("unsupported vector instruction in wazeroir: %s", wasm.VectorInstructionName(vecOp)) } diff --git a/internal/wazeroir/compiler_test.go b/internal/wazeroir/compiler_test.go index 640835f9530..e81f559831c 100644 --- a/internal/wazeroir/compiler_test.go +++ b/internal/wazeroir/compiler_test.go @@ -1256,102 +1256,102 @@ func TestCompile_Vec(t *testing.T) { { name: wasm.OpcodeVecV128LoadName, body: load(wasm.OpcodeVecV128Load, 0, 0), needDropBeforeReturn: true, - expected: &OperationV128Load{Type: LoadV128Type128, Arg: &MemoryArg{Alignment: 0, Offset: 0}}, + expected: &OperationV128Load{Type: V128LoadType128, Arg: &MemoryArg{Alignment: 0, Offset: 0}}, }, { name: wasm.OpcodeVecV128LoadName + "/align=4", body: load(wasm.OpcodeVecV128Load, 0, 4), needDropBeforeReturn: true, - expected: &OperationV128Load{Type: LoadV128Type128, Arg: &MemoryArg{Alignment: 4, Offset: 0}}, + expected: &OperationV128Load{Type: V128LoadType128, Arg: &MemoryArg{Alignment: 4, Offset: 0}}, }, { name: wasm.OpcodeVecV128Load8x8SName, body: load(wasm.OpcodeVecV128Load8x8s, 1, 0), needDropBeforeReturn: true, - expected: &OperationV128Load{Type: LoadV128Type8x8s, Arg: &MemoryArg{Alignment: 0, Offset: 1}}, + expected: &OperationV128Load{Type: V128LoadType8x8s, Arg: &MemoryArg{Alignment: 0, Offset: 1}}, }, { name: wasm.OpcodeVecV128Load8x8SName + "/align=1", body: load(wasm.OpcodeVecV128Load8x8s, 0, 1), needDropBeforeReturn: true, - expected: &OperationV128Load{Type: LoadV128Type8x8s, Arg: &MemoryArg{Alignment: 1, Offset: 0}}, + expected: &OperationV128Load{Type: V128LoadType8x8s, Arg: &MemoryArg{Alignment: 1, Offset: 0}}, }, { name: wasm.OpcodeVecV128Load8x8UName, body: load(wasm.OpcodeVecV128Load8x8u, 0, 0), needDropBeforeReturn: true, - expected: &OperationV128Load{Type: LoadV128Type8x8u, Arg: &MemoryArg{Alignment: 0, Offset: 0}}, + expected: &OperationV128Load{Type: V128LoadType8x8u, Arg: &MemoryArg{Alignment: 0, Offset: 0}}, }, { name: wasm.OpcodeVecV128Load8x8UName + "/align=1", body: load(wasm.OpcodeVecV128Load8x8u, 0, 1), needDropBeforeReturn: true, - expected: &OperationV128Load{Type: LoadV128Type8x8u, Arg: &MemoryArg{Alignment: 1, Offset: 0}}, + expected: &OperationV128Load{Type: V128LoadType8x8u, Arg: &MemoryArg{Alignment: 1, Offset: 0}}, }, { name: wasm.OpcodeVecV128Load16x4SName, body: load(wasm.OpcodeVecV128Load16x4s, 1, 0), needDropBeforeReturn: true, - expected: &OperationV128Load{Type: LoadV128Type16x4s, Arg: &MemoryArg{Alignment: 0, Offset: 1}}, + expected: &OperationV128Load{Type: V128LoadType16x4s, Arg: &MemoryArg{Alignment: 0, Offset: 1}}, }, { name: wasm.OpcodeVecV128Load16x4SName + "/align=2", body: load(wasm.OpcodeVecV128Load16x4s, 0, 2), needDropBeforeReturn: true, - expected: &OperationV128Load{Type: LoadV128Type16x4s, Arg: &MemoryArg{Alignment: 2, Offset: 0}}, + expected: &OperationV128Load{Type: V128LoadType16x4s, Arg: &MemoryArg{Alignment: 2, Offset: 0}}, }, { name: wasm.OpcodeVecV128Load16x4UName, body: load(wasm.OpcodeVecV128Load16x4u, 0, 0), needDropBeforeReturn: true, - expected: &OperationV128Load{Type: LoadV128Type16x4u, Arg: &MemoryArg{Alignment: 0, Offset: 0}}, + expected: &OperationV128Load{Type: V128LoadType16x4u, Arg: &MemoryArg{Alignment: 0, Offset: 0}}, }, { name: wasm.OpcodeVecV128Load16x4UName + "/align=2", 
body: load(wasm.OpcodeVecV128Load16x4u, 0, 2), needDropBeforeReturn: true, - expected: &OperationV128Load{Type: LoadV128Type16x4u, Arg: &MemoryArg{Alignment: 2, Offset: 0}}, + expected: &OperationV128Load{Type: V128LoadType16x4u, Arg: &MemoryArg{Alignment: 2, Offset: 0}}, }, { name: wasm.OpcodeVecV128Load32x2SName, body: load(wasm.OpcodeVecV128Load32x2s, 1, 0), needDropBeforeReturn: true, - expected: &OperationV128Load{Type: LoadV128Type32x2s, Arg: &MemoryArg{Alignment: 0, Offset: 1}}, + expected: &OperationV128Load{Type: V128LoadType32x2s, Arg: &MemoryArg{Alignment: 0, Offset: 1}}, }, { name: wasm.OpcodeVecV128Load32x2SName + "/align=3", body: load(wasm.OpcodeVecV128Load32x2s, 0, 3), needDropBeforeReturn: true, - expected: &OperationV128Load{Type: LoadV128Type32x2s, Arg: &MemoryArg{Alignment: 3, Offset: 0}}, + expected: &OperationV128Load{Type: V128LoadType32x2s, Arg: &MemoryArg{Alignment: 3, Offset: 0}}, }, { name: wasm.OpcodeVecV128Load32x2UName, body: load(wasm.OpcodeVecV128Load32x2u, 0, 0), needDropBeforeReturn: true, - expected: &OperationV128Load{Type: LoadV128Type32x2u, Arg: &MemoryArg{Alignment: 0, Offset: 0}}, + expected: &OperationV128Load{Type: V128LoadType32x2u, Arg: &MemoryArg{Alignment: 0, Offset: 0}}, }, { name: wasm.OpcodeVecV128Load32x2UName + "/align=3", body: load(wasm.OpcodeVecV128Load32x2u, 0, 3), needDropBeforeReturn: true, - expected: &OperationV128Load{Type: LoadV128Type32x2u, Arg: &MemoryArg{Alignment: 3, Offset: 0}}, + expected: &OperationV128Load{Type: V128LoadType32x2u, Arg: &MemoryArg{Alignment: 3, Offset: 0}}, }, { name: wasm.OpcodeVecV128Load8SplatName, body: load(wasm.OpcodeVecV128Load8Splat, 2, 0), needDropBeforeReturn: true, - expected: &OperationV128Load{Type: LoadV128Type8Splat, Arg: &MemoryArg{Alignment: 0, Offset: 2}}, + expected: &OperationV128Load{Type: V128LoadType8Splat, Arg: &MemoryArg{Alignment: 0, Offset: 2}}, }, { name: wasm.OpcodeVecV128Load16SplatName, body: load(wasm.OpcodeVecV128Load16Splat, 0, 1), needDropBeforeReturn: true, - expected: &OperationV128Load{Type: LoadV128Type16Splat, Arg: &MemoryArg{Alignment: 1, Offset: 0}}, + expected: &OperationV128Load{Type: V128LoadType16Splat, Arg: &MemoryArg{Alignment: 1, Offset: 0}}, }, { name: wasm.OpcodeVecV128Load32SplatName, body: load(wasm.OpcodeVecV128Load32Splat, 3, 2), needDropBeforeReturn: true, - expected: &OperationV128Load{Type: LoadV128Type32Splat, Arg: &MemoryArg{Alignment: 2, Offset: 3}}, + expected: &OperationV128Load{Type: V128LoadType32Splat, Arg: &MemoryArg{Alignment: 2, Offset: 3}}, }, { name: wasm.OpcodeVecV128Load64SplatName, body: load(wasm.OpcodeVecV128Load64Splat, 0, 3), needDropBeforeReturn: true, - expected: &OperationV128Load{Type: LoadV128Type64Splat, Arg: &MemoryArg{Alignment: 3, Offset: 0}}, + expected: &OperationV128Load{Type: V128LoadType64Splat, Arg: &MemoryArg{Alignment: 3, Offset: 0}}, }, { name: wasm.OpcodeVecV128Load32zeroName, body: load(wasm.OpcodeVecV128Load32zero, 0, 2), needDropBeforeReturn: true, - expected: &OperationV128Load{Type: LoadV128Type32zero, Arg: &MemoryArg{Alignment: 2, Offset: 0}}, + expected: &OperationV128Load{Type: V128LoadType32zero, Arg: &MemoryArg{Alignment: 2, Offset: 0}}, }, { name: wasm.OpcodeVecV128Load64zeroName, body: load(wasm.OpcodeVecV128Load64zero, 5, 3), needDropBeforeReturn: true, - expected: &OperationV128Load{Type: LoadV128Type64zero, Arg: &MemoryArg{Alignment: 3, Offset: 5}}, + expected: &OperationV128Load{Type: V128LoadType64zero, Arg: &MemoryArg{Alignment: 3, Offset: 5}}, }, {name: wasm.OpcodeVecV128Load8LaneName, 
needDropBeforeReturn: true, body: loadLane(wasm.OpcodeVecV128Load8Lane, 5, 0, 10), @@ -1899,6 +1899,535 @@ func TestCompile_Vec(t *testing.T) { needDropBeforeReturn: true, expected: &OperationV128AnyTrue{}, }, + {name: wasm.OpcodeVecI8x16AddName, body: vv2v(wasm.OpcodeVecI8x16Add), + needDropBeforeReturn: true, + expected: &OperationV128Add{Shape: ShapeI8x16}, + }, + {name: wasm.OpcodeVecI8x16AddSatSName, body: vv2v(wasm.OpcodeVecI8x16AddSatS), + needDropBeforeReturn: true, + expected: &OperationV128AddSat{Shape: ShapeI8x16, Signed: true}, + }, + {name: wasm.OpcodeVecI8x16AddSatUName, body: vv2v(wasm.OpcodeVecI8x16AddSatU), + needDropBeforeReturn: true, + expected: &OperationV128AddSat{Shape: ShapeI8x16, Signed: false}, + }, + {name: wasm.OpcodeVecI8x16SubName, body: vv2v(wasm.OpcodeVecI8x16Sub), + needDropBeforeReturn: true, + expected: &OperationV128Sub{Shape: ShapeI8x16}, + }, + {name: wasm.OpcodeVecI8x16SubSatSName, body: vv2v(wasm.OpcodeVecI8x16SubSatS), + needDropBeforeReturn: true, + expected: &OperationV128SubSat{Shape: ShapeI8x16, Signed: true}, + }, + {name: wasm.OpcodeVecI8x16SubSatUName, body: vv2v(wasm.OpcodeVecI8x16SubSatU), + needDropBeforeReturn: true, + expected: &OperationV128SubSat{Shape: ShapeI8x16, Signed: false}, + }, + {name: wasm.OpcodeVecI16x8AddName, body: vv2v(wasm.OpcodeVecI16x8Add), + needDropBeforeReturn: true, + expected: &OperationV128Add{Shape: ShapeI16x8}, + }, + {name: wasm.OpcodeVecI16x8AddSatSName, body: vv2v(wasm.OpcodeVecI16x8AddSatS), + needDropBeforeReturn: true, + expected: &OperationV128AddSat{Shape: ShapeI16x8, Signed: true}, + }, + {name: wasm.OpcodeVecI16x8AddSatUName, body: vv2v(wasm.OpcodeVecI16x8AddSatU), + needDropBeforeReturn: true, + expected: &OperationV128AddSat{Shape: ShapeI16x8, Signed: false}, + }, + {name: wasm.OpcodeVecI16x8SubName, body: vv2v(wasm.OpcodeVecI16x8Sub), + needDropBeforeReturn: true, + expected: &OperationV128Sub{Shape: ShapeI16x8}, + }, + {name: wasm.OpcodeVecI16x8SubSatSName, body: vv2v(wasm.OpcodeVecI16x8SubSatS), + needDropBeforeReturn: true, + expected: &OperationV128SubSat{Shape: ShapeI16x8, Signed: true}, + }, + {name: wasm.OpcodeVecI16x8SubSatUName, body: vv2v(wasm.OpcodeVecI16x8SubSatU), + needDropBeforeReturn: true, + expected: &OperationV128SubSat{Shape: ShapeI16x8, Signed: false}, + }, + {name: wasm.OpcodeVecI16x8MulName, body: vv2v(wasm.OpcodeVecI16x8Mul), + needDropBeforeReturn: true, + expected: &OperationV128Mul{Shape: ShapeI16x8}, + }, + {name: wasm.OpcodeVecI32x4AddName, body: vv2v(wasm.OpcodeVecI32x4Add), + needDropBeforeReturn: true, + expected: &OperationV128Add{Shape: ShapeI32x4}, + }, + {name: wasm.OpcodeVecI32x4SubName, body: vv2v(wasm.OpcodeVecI32x4Sub), + needDropBeforeReturn: true, + expected: &OperationV128Sub{Shape: ShapeI32x4}, + }, + {name: wasm.OpcodeVecI32x4MulName, body: vv2v(wasm.OpcodeVecI32x4Mul), + needDropBeforeReturn: true, + expected: &OperationV128Mul{Shape: ShapeI32x4}, + }, + {name: wasm.OpcodeVecI64x2AddName, body: vv2v(wasm.OpcodeVecI64x2Add), + needDropBeforeReturn: true, + expected: &OperationV128Add{Shape: ShapeI64x2}, + }, + {name: wasm.OpcodeVecI64x2SubName, body: vv2v(wasm.OpcodeVecI64x2Sub), + needDropBeforeReturn: true, + expected: &OperationV128Sub{Shape: ShapeI64x2}, + }, + {name: wasm.OpcodeVecI64x2MulName, body: vv2v(wasm.OpcodeVecI64x2Mul), + needDropBeforeReturn: true, + expected: &OperationV128Mul{Shape: ShapeI64x2}, + }, + {name: wasm.OpcodeVecF32x4AddName, body: vv2v(wasm.OpcodeVecF32x4Add), + needDropBeforeReturn: true, + expected: 
&OperationV128Add{Shape: ShapeF32x4}, + }, + {name: wasm.OpcodeVecF32x4SubName, body: vv2v(wasm.OpcodeVecF32x4Sub), + needDropBeforeReturn: true, + expected: &OperationV128Sub{Shape: ShapeF32x4}, + }, + {name: wasm.OpcodeVecF32x4MulName, body: vv2v(wasm.OpcodeVecF32x4Mul), + needDropBeforeReturn: true, + expected: &OperationV128Mul{Shape: ShapeF32x4}, + }, + {name: wasm.OpcodeVecF32x4DivName, body: vv2v(wasm.OpcodeVecF32x4Div), + needDropBeforeReturn: true, + expected: &OperationV128Div{Shape: ShapeF32x4}, + }, + {name: wasm.OpcodeVecF64x2AddName, body: vv2v(wasm.OpcodeVecF64x2Add), + needDropBeforeReturn: true, + expected: &OperationV128Add{Shape: ShapeF64x2}, + }, + {name: wasm.OpcodeVecF64x2SubName, body: vv2v(wasm.OpcodeVecF64x2Sub), + needDropBeforeReturn: true, + expected: &OperationV128Sub{Shape: ShapeF64x2}, + }, + {name: wasm.OpcodeVecF64x2MulName, body: vv2v(wasm.OpcodeVecF64x2Mul), + needDropBeforeReturn: true, + expected: &OperationV128Mul{Shape: ShapeF64x2}, + }, + {name: wasm.OpcodeVecF64x2DivName, body: vv2v(wasm.OpcodeVecF64x2Div), + needDropBeforeReturn: true, + expected: &OperationV128Div{Shape: ShapeF64x2}, + }, + {name: wasm.OpcodeVecI8x16MinSName, body: vv2v(wasm.OpcodeVecI8x16MinS), + needDropBeforeReturn: true, + expected: &OperationV128Min{Shape: ShapeI8x16, Signed: true}, + }, + {name: wasm.OpcodeVecI8x16MinUName, body: vv2v(wasm.OpcodeVecI8x16MinU), + needDropBeforeReturn: true, + expected: &OperationV128Min{Shape: ShapeI8x16}, + }, + {name: wasm.OpcodeVecI8x16MaxSName, body: vv2v(wasm.OpcodeVecI8x16MaxS), + needDropBeforeReturn: true, + expected: &OperationV128Max{Shape: ShapeI8x16, Signed: true}, + }, + {name: wasm.OpcodeVecI8x16MaxUName, body: vv2v(wasm.OpcodeVecI8x16MaxU), + needDropBeforeReturn: true, + expected: &OperationV128Max{Shape: ShapeI8x16}, + }, + {name: wasm.OpcodeVecI8x16AvgrUName, body: vv2v(wasm.OpcodeVecI8x16AvgrU), + needDropBeforeReturn: true, + expected: &OperationV128AvgrU{Shape: ShapeI8x16}, + }, + {name: wasm.OpcodeVecI16x8MinSName, body: vv2v(wasm.OpcodeVecI16x8MinS), + needDropBeforeReturn: true, + expected: &OperationV128Min{Shape: ShapeI16x8, Signed: true}, + }, + {name: wasm.OpcodeVecI16x8MinUName, body: vv2v(wasm.OpcodeVecI16x8MinU), + needDropBeforeReturn: true, + expected: &OperationV128Min{Shape: ShapeI16x8}, + }, + {name: wasm.OpcodeVecI16x8MaxSName, body: vv2v(wasm.OpcodeVecI16x8MaxS), + needDropBeforeReturn: true, + expected: &OperationV128Max{Shape: ShapeI16x8, Signed: true}, + }, + {name: wasm.OpcodeVecI16x8MaxUName, body: vv2v(wasm.OpcodeVecI16x8MaxU), + needDropBeforeReturn: true, + expected: &OperationV128Max{Shape: ShapeI16x8}, + }, + {name: wasm.OpcodeVecI16x8AvgrUName, body: vv2v(wasm.OpcodeVecI16x8AvgrU), + needDropBeforeReturn: true, + expected: &OperationV128AvgrU{Shape: ShapeI16x8}, + }, + {name: wasm.OpcodeVecI32x4MinSName, body: vv2v(wasm.OpcodeVecI32x4MinS), + needDropBeforeReturn: true, + expected: &OperationV128Min{Shape: ShapeI32x4, Signed: true}, + }, + {name: wasm.OpcodeVecI32x4MinUName, body: vv2v(wasm.OpcodeVecI32x4MinU), + needDropBeforeReturn: true, + expected: &OperationV128Min{Shape: ShapeI32x4}, + }, + {name: wasm.OpcodeVecI32x4MaxSName, body: vv2v(wasm.OpcodeVecI32x4MaxS), + needDropBeforeReturn: true, + expected: &OperationV128Max{Shape: ShapeI32x4, Signed: true}, + }, + {name: wasm.OpcodeVecI32x4MaxUName, body: vv2v(wasm.OpcodeVecI32x4MaxU), + needDropBeforeReturn: true, + expected: &OperationV128Max{Shape: ShapeI32x4}, + }, + {name: wasm.OpcodeVecF32x4MinName, body: vv2v(wasm.OpcodeVecF32x4Min), 
+ needDropBeforeReturn: true, + expected: &OperationV128Min{Shape: ShapeF32x4}, + }, + {name: wasm.OpcodeVecF32x4MaxName, body: vv2v(wasm.OpcodeVecF32x4Max), + needDropBeforeReturn: true, + expected: &OperationV128Max{Shape: ShapeF32x4}, + }, + {name: wasm.OpcodeVecF64x2MinName, body: vv2v(wasm.OpcodeVecF64x2Min), + needDropBeforeReturn: true, + expected: &OperationV128Min{Shape: ShapeF64x2}, + }, + {name: wasm.OpcodeVecF64x2MaxName, body: vv2v(wasm.OpcodeVecF64x2Max), + needDropBeforeReturn: true, + expected: &OperationV128Max{Shape: ShapeF64x2}, + }, + {name: wasm.OpcodeVecI8x16AbsName, body: v2v(wasm.OpcodeVecI8x16Abs), + needDropBeforeReturn: true, + expected: &OperationV128Abs{Shape: ShapeI8x16}, + }, + {name: wasm.OpcodeVecI8x16PopcntName, body: v2v(wasm.OpcodeVecI8x16Popcnt), + needDropBeforeReturn: true, + expected: &OperationV128Popcnt{}, + }, + {name: wasm.OpcodeVecI16x8AbsName, body: v2v(wasm.OpcodeVecI16x8Abs), + needDropBeforeReturn: true, + expected: &OperationV128Abs{Shape: ShapeI16x8}, + }, + {name: wasm.OpcodeVecI32x4AbsName, body: v2v(wasm.OpcodeVecI32x4Abs), + needDropBeforeReturn: true, + expected: &OperationV128Abs{Shape: ShapeI32x4}, + }, + {name: wasm.OpcodeVecI64x2AbsName, body: v2v(wasm.OpcodeVecI64x2Abs), + needDropBeforeReturn: true, + expected: &OperationV128Abs{Shape: ShapeI64x2}, + }, + {name: wasm.OpcodeVecF32x4AbsName, body: v2v(wasm.OpcodeVecF32x4Abs), + needDropBeforeReturn: true, + expected: &OperationV128Abs{Shape: ShapeF32x4}, + }, + {name: wasm.OpcodeVecF64x2AbsName, body: v2v(wasm.OpcodeVecF64x2Abs), + needDropBeforeReturn: true, + expected: &OperationV128Abs{Shape: ShapeF64x2}, + }, + {name: wasm.OpcodeVecF32x4CeilName, body: v2v(wasm.OpcodeVecF32x4Ceil), + needDropBeforeReturn: true, + expected: &OperationV128Ceil{Shape: ShapeF32x4}, + }, + {name: wasm.OpcodeVecF32x4FloorName, body: v2v(wasm.OpcodeVecF32x4Floor), + needDropBeforeReturn: true, + expected: &OperationV128Floor{Shape: ShapeF32x4}, + }, + {name: wasm.OpcodeVecF32x4TruncName, body: v2v(wasm.OpcodeVecF32x4Trunc), + needDropBeforeReturn: true, + expected: &OperationV128Trunc{Shape: ShapeF32x4}, + }, + {name: wasm.OpcodeVecF32x4NearestName, body: v2v(wasm.OpcodeVecF32x4Nearest), + needDropBeforeReturn: true, + expected: &OperationV128Nearest{Shape: ShapeF32x4}, + }, + {name: wasm.OpcodeVecF64x2CeilName, body: v2v(wasm.OpcodeVecF64x2Ceil), + needDropBeforeReturn: true, + expected: &OperationV128Ceil{Shape: ShapeF64x2}, + }, + {name: wasm.OpcodeVecF64x2FloorName, body: v2v(wasm.OpcodeVecF64x2Floor), + needDropBeforeReturn: true, + expected: &OperationV128Floor{Shape: ShapeF64x2}, + }, + {name: wasm.OpcodeVecF64x2TruncName, body: v2v(wasm.OpcodeVecF64x2Trunc), + needDropBeforeReturn: true, + expected: &OperationV128Trunc{Shape: ShapeF64x2}, + }, + {name: wasm.OpcodeVecF64x2NearestName, body: v2v(wasm.OpcodeVecF64x2Nearest), + needDropBeforeReturn: true, + expected: &OperationV128Nearest{Shape: ShapeF64x2}, + }, + {name: wasm.OpcodeVecF32x4PminName, body: vv2v(wasm.OpcodeVecF32x4Pmin), + needDropBeforeReturn: true, + expected: &OperationV128Pmin{Shape: ShapeF32x4}, + }, + {name: wasm.OpcodeVecF32x4PmaxName, body: vv2v(wasm.OpcodeVecF32x4Pmax), + needDropBeforeReturn: true, + expected: &OperationV128Pmax{Shape: ShapeF32x4}, + }, + {name: wasm.OpcodeVecF64x2PminName, body: vv2v(wasm.OpcodeVecF64x2Pmin), + needDropBeforeReturn: true, + expected: &OperationV128Pmin{Shape: ShapeF64x2}, + }, + {name: wasm.OpcodeVecF64x2PmaxName, body: vv2v(wasm.OpcodeVecF64x2Pmax), + needDropBeforeReturn: true, + 
expected: &OperationV128Pmax{Shape: ShapeF64x2}, + }, + {name: wasm.OpcodeVecI16x8Q15mulrSatSName, body: vv2v(wasm.OpcodeVecI16x8Q15mulrSatS), + needDropBeforeReturn: true, + expected: &OperationV128Q15mulrSatS{}, + }, + {name: wasm.OpcodeVecI16x8ExtMulLowI8x16SName, body: vv2v(wasm.OpcodeVecI16x8ExtMulLowI8x16S), + needDropBeforeReturn: true, + expected: &OperationV128ExtMul{ + OriginShape: ShapeI8x16, + Signed: true, + UseLow: true, + }, + }, + {name: wasm.OpcodeVecI16x8ExtMulHighI8x16SName, body: vv2v(wasm.OpcodeVecI16x8ExtMulHighI8x16S), + needDropBeforeReturn: true, + expected: &OperationV128ExtMul{ + OriginShape: ShapeI8x16, + Signed: true, + UseLow: false, + }, + }, + {name: wasm.OpcodeVecI16x8ExtMulLowI8x16UName, body: vv2v(wasm.OpcodeVecI16x8ExtMulLowI8x16U), + needDropBeforeReturn: true, + expected: &OperationV128ExtMul{ + OriginShape: ShapeI8x16, + Signed: false, + UseLow: true, + }, + }, + {name: wasm.OpcodeVecI16x8ExtMulHighI8x16UName, body: vv2v(wasm.OpcodeVecI16x8ExtMulHighI8x16U), + needDropBeforeReturn: true, + expected: &OperationV128ExtMul{ + OriginShape: ShapeI8x16, + Signed: false, + UseLow: false, + }, + }, + {name: wasm.OpcodeVecI32x4ExtMulLowI16x8SName, body: vv2v(wasm.OpcodeVecI32x4ExtMulLowI16x8S), + needDropBeforeReturn: true, + expected: &OperationV128ExtMul{ + OriginShape: ShapeI16x8, + Signed: true, + UseLow: true, + }, + }, + {name: wasm.OpcodeVecI32x4ExtMulHighI16x8SName, body: vv2v(wasm.OpcodeVecI32x4ExtMulHighI16x8S), + needDropBeforeReturn: true, + expected: &OperationV128ExtMul{ + OriginShape: ShapeI16x8, + Signed: true, + UseLow: false, + }, + }, + {name: wasm.OpcodeVecI32x4ExtMulLowI16x8UName, body: vv2v(wasm.OpcodeVecI32x4ExtMulLowI16x8U), + needDropBeforeReturn: true, + expected: &OperationV128ExtMul{ + OriginShape: ShapeI16x8, + Signed: false, + UseLow: true, + }, + }, + {name: wasm.OpcodeVecI32x4ExtMulHighI16x8UName, body: vv2v(wasm.OpcodeVecI32x4ExtMulHighI16x8U), + needDropBeforeReturn: true, + expected: &OperationV128ExtMul{ + OriginShape: ShapeI16x8, + Signed: false, + UseLow: false, + }, + }, + {name: wasm.OpcodeVecI64x2ExtMulLowI32x4SName, body: vv2v(wasm.OpcodeVecI64x2ExtMulLowI32x4S), + needDropBeforeReturn: true, + expected: &OperationV128ExtMul{ + OriginShape: ShapeI32x4, + Signed: true, + UseLow: true, + }, + }, + {name: wasm.OpcodeVecI64x2ExtMulHighI32x4SName, body: vv2v(wasm.OpcodeVecI64x2ExtMulHighI32x4S), + needDropBeforeReturn: true, + expected: &OperationV128ExtMul{ + OriginShape: ShapeI32x4, + Signed: true, + UseLow: false, + }, + }, + {name: wasm.OpcodeVecI64x2ExtMulLowI32x4UName, body: vv2v(wasm.OpcodeVecI64x2ExtMulLowI32x4U), + needDropBeforeReturn: true, + expected: &OperationV128ExtMul{ + OriginShape: ShapeI32x4, + Signed: false, + UseLow: true, + }, + }, + {name: wasm.OpcodeVecI64x2ExtMulHighI32x4UName, body: vv2v(wasm.OpcodeVecI64x2ExtMulHighI32x4U), + needDropBeforeReturn: true, + expected: &OperationV128ExtMul{ + OriginShape: ShapeI32x4, + Signed: false, + UseLow: false, + }, + }, + {name: wasm.OpcodeVecI16x8ExtendLowI8x16SName, body: v2v(wasm.OpcodeVecI16x8ExtendLowI8x16S), + needDropBeforeReturn: true, + expected: &OperationV128Extend{ + OriginShape: ShapeI8x16, + Signed: true, + UseLow: true, + }, + }, + {name: wasm.OpcodeVecI16x8ExtendHighI8x16SName, body: v2v(wasm.OpcodeVecI16x8ExtendHighI8x16S), + needDropBeforeReturn: true, + expected: &OperationV128Extend{ + OriginShape: ShapeI8x16, + Signed: true, + UseLow: false, + }, + }, + {name: wasm.OpcodeVecI16x8ExtendLowI8x16UName, body: 
v2v(wasm.OpcodeVecI16x8ExtendLowI8x16U), + needDropBeforeReturn: true, + expected: &OperationV128Extend{ + OriginShape: ShapeI8x16, + Signed: false, + UseLow: true, + }, + }, + {name: wasm.OpcodeVecI16x8ExtendHighI8x16UName, body: v2v(wasm.OpcodeVecI16x8ExtendHighI8x16U), + needDropBeforeReturn: true, + expected: &OperationV128Extend{ + OriginShape: ShapeI8x16, + Signed: false, + UseLow: false, + }, + }, + {name: wasm.OpcodeVecI32x4ExtendLowI16x8SName, body: v2v(wasm.OpcodeVecI32x4ExtendLowI16x8S), + needDropBeforeReturn: true, + expected: &OperationV128Extend{ + OriginShape: ShapeI16x8, + Signed: true, + UseLow: true, + }, + }, + {name: wasm.OpcodeVecI32x4ExtendHighI16x8SName, body: v2v(wasm.OpcodeVecI32x4ExtendHighI16x8S), + needDropBeforeReturn: true, + expected: &OperationV128Extend{ + OriginShape: ShapeI16x8, + Signed: true, + UseLow: false, + }, + }, + {name: wasm.OpcodeVecI32x4ExtendLowI16x8UName, body: v2v(wasm.OpcodeVecI32x4ExtendLowI16x8U), + needDropBeforeReturn: true, + expected: &OperationV128Extend{ + OriginShape: ShapeI16x8, + Signed: false, + UseLow: true, + }, + }, + {name: wasm.OpcodeVecI32x4ExtendHighI16x8UName, body: v2v(wasm.OpcodeVecI32x4ExtendHighI16x8U), + needDropBeforeReturn: true, + expected: &OperationV128Extend{ + OriginShape: ShapeI16x8, + Signed: false, + UseLow: false, + }, + }, + {name: wasm.OpcodeVecI64x2ExtendLowI32x4SName, body: v2v(wasm.OpcodeVecI64x2ExtendLowI32x4S), + needDropBeforeReturn: true, + expected: &OperationV128Extend{ + OriginShape: ShapeI32x4, + Signed: true, + UseLow: true, + }, + }, + {name: wasm.OpcodeVecI64x2ExtendHighI32x4SName, body: v2v(wasm.OpcodeVecI64x2ExtendHighI32x4S), + needDropBeforeReturn: true, + expected: &OperationV128Extend{ + OriginShape: ShapeI32x4, + Signed: true, + UseLow: false, + }, + }, + {name: wasm.OpcodeVecI64x2ExtendLowI32x4UName, body: v2v(wasm.OpcodeVecI64x2ExtendLowI32x4U), + needDropBeforeReturn: true, + expected: &OperationV128Extend{ + OriginShape: ShapeI32x4, + Signed: false, + UseLow: true, + }, + }, + {name: wasm.OpcodeVecI64x2ExtendHighI32x4UName, body: v2v(wasm.OpcodeVecI64x2ExtendHighI32x4U), + needDropBeforeReturn: true, + expected: &OperationV128Extend{ + OriginShape: ShapeI32x4, + Signed: false, + UseLow: false, + }, + }, + + {name: wasm.OpcodeVecI16x8ExtaddPairwiseI8x16SName, body: v2v(wasm.OpcodeVecI16x8ExtaddPairwiseI8x16S), + needDropBeforeReturn: true, + expected: &OperationV128ExtAddPairwise{OriginShape: ShapeI8x16, Signed: true}, + }, + {name: wasm.OpcodeVecI16x8ExtaddPairwiseI8x16UName, body: v2v(wasm.OpcodeVecI16x8ExtaddPairwiseI8x16U), + needDropBeforeReturn: true, + expected: &OperationV128ExtAddPairwise{OriginShape: ShapeI8x16, Signed: false}, + }, + {name: wasm.OpcodeVecI32x4ExtaddPairwiseI16x8SName, body: v2v(wasm.OpcodeVecI32x4ExtaddPairwiseI16x8S), + needDropBeforeReturn: true, + expected: &OperationV128ExtAddPairwise{OriginShape: ShapeI16x8, Signed: true}, + }, + {name: wasm.OpcodeVecI32x4ExtaddPairwiseI16x8UName, body: v2v(wasm.OpcodeVecI32x4ExtaddPairwiseI16x8U), + needDropBeforeReturn: true, + expected: &OperationV128ExtAddPairwise{OriginShape: ShapeI16x8, Signed: false}, + }, + {name: wasm.OpcodeVecF64x2PromoteLowF32x4ZeroName, body: v2v(wasm.OpcodeVecF64x2PromoteLowF32x4Zero), + needDropBeforeReturn: true, + expected: &OperationV128FloatPromote{}, + }, + {name: wasm.OpcodeVecF32x4DemoteF64x2ZeroName, body: v2v(wasm.OpcodeVecF32x4DemoteF64x2Zero), + needDropBeforeReturn: true, + expected: &OperationV128FloatDemote{}, + }, + {name: wasm.OpcodeVecF32x4ConvertI32x4SName, body: 
v2v(wasm.OpcodeVecF32x4ConvertI32x4S), + needDropBeforeReturn: true, + expected: &OperationV128FConvertFromI{DestinationShape: ShapeF32x4, Signed: true}, + }, + {name: wasm.OpcodeVecF32x4ConvertI32x4UName, body: v2v(wasm.OpcodeVecF32x4ConvertI32x4U), + needDropBeforeReturn: true, + expected: &OperationV128FConvertFromI{DestinationShape: ShapeF32x4, Signed: false}, + }, + {name: wasm.OpcodeVecF64x2ConvertLowI32x4SName, body: v2v(wasm.OpcodeVecF64x2ConvertLowI32x4S), + needDropBeforeReturn: true, + expected: &OperationV128FConvertFromI{DestinationShape: ShapeF64x2, Signed: true}, + }, + {name: wasm.OpcodeVecF64x2ConvertLowI32x4UName, body: v2v(wasm.OpcodeVecF64x2ConvertLowI32x4U), + needDropBeforeReturn: true, + expected: &OperationV128FConvertFromI{DestinationShape: ShapeF64x2, Signed: false}, + }, + {name: wasm.OpcodeVecI32x4DotI16x8SName, body: vv2v(wasm.OpcodeVecI32x4DotI16x8S), + needDropBeforeReturn: true, + expected: &OperationV128Dot{}, + }, + {name: wasm.OpcodeVecI8x16NarrowI16x8SName, body: vv2v(wasm.OpcodeVecI8x16NarrowI16x8S), + needDropBeforeReturn: true, + expected: &OperationV128Narrow{OriginShape: ShapeI16x8, Signed: true}, + }, + {name: wasm.OpcodeVecI8x16NarrowI16x8UName, body: vv2v(wasm.OpcodeVecI8x16NarrowI16x8U), + needDropBeforeReturn: true, + expected: &OperationV128Narrow{OriginShape: ShapeI16x8, Signed: false}, + }, + {name: wasm.OpcodeVecI16x8NarrowI32x4SName, body: vv2v(wasm.OpcodeVecI16x8NarrowI32x4S), + needDropBeforeReturn: true, + expected: &OperationV128Narrow{OriginShape: ShapeI32x4, Signed: true}, + }, + {name: wasm.OpcodeVecI16x8NarrowI32x4UName, body: vv2v(wasm.OpcodeVecI16x8NarrowI32x4U), + needDropBeforeReturn: true, + expected: &OperationV128Narrow{OriginShape: ShapeI32x4, Signed: false}, + }, + {name: wasm.OpcodeVecI32x4TruncSatF32x4SName, body: v2v(wasm.OpcodeVecI32x4TruncSatF32x4S), + needDropBeforeReturn: true, + expected: &OperationV128ITruncSatFromF{OriginShape: ShapeF32x4, Signed: true}, + }, + {name: wasm.OpcodeVecI32x4TruncSatF32x4UName, body: v2v(wasm.OpcodeVecI32x4TruncSatF32x4U), + needDropBeforeReturn: true, + expected: &OperationV128ITruncSatFromF{OriginShape: ShapeF32x4, Signed: false}, + }, + {name: wasm.OpcodeVecI32x4TruncSatF64x2SZeroName, body: v2v(wasm.OpcodeVecI32x4TruncSatF64x2SZero), + needDropBeforeReturn: true, + expected: &OperationV128ITruncSatFromF{OriginShape: ShapeF64x2, Signed: true}, + }, + {name: wasm.OpcodeVecI32x4TruncSatF64x2UZeroName, body: v2v(wasm.OpcodeVecI32x4TruncSatF64x2UZero), + needDropBeforeReturn: true, + expected: &OperationV128ITruncSatFromF{OriginShape: ShapeF64x2, Signed: false}, + }, } for _, tt := range tests { diff --git a/internal/wazeroir/format.go b/internal/wazeroir/format.go index c665d032c5b..39772b31a49 100644 --- a/internal/wazeroir/format.go +++ b/internal/wazeroir/format.go @@ -72,9 +72,9 @@ func formatOperation(w io.StringWriter, b Operation) { case *OperationStore: str = fmt.Sprintf("%s.store (align=%d, offset=%d)", o.Type, o.Arg.Alignment, o.Arg.Offset) case *OperationStore8: - str = fmt.Sprintf("%s.store8 (align=%d, offset=%d)", o.Type, o.Arg.Alignment, o.Arg.Offset) + str = fmt.Sprintf("store8 (align=%d, offset=%d)", o.Arg.Alignment, o.Arg.Offset) case *OperationStore16: - str = fmt.Sprintf("%s.store16 (align=%d, offset=%d)", o.Type, o.Arg.Alignment, o.Arg.Offset) + str = fmt.Sprintf("store16 (align=%d, offset=%d)", o.Arg.Alignment, o.Arg.Offset) case *OperationStore32: str = fmt.Sprintf("i64.store32 (align=%d, offset=%d)", o.Arg.Alignment, o.Arg.Offset) case *OperationMemorySize: diff 
--git a/internal/wazeroir/operations.go b/internal/wazeroir/operations.go index db25340bae6..c7603c224a4 100644 --- a/internal/wazeroir/operations.go +++ b/internal/wazeroir/operations.go @@ -2,6 +2,7 @@ package wazeroir import "fmt" +// UnsignedInt represents unsigned 32-bit or 64-bit integers. type UnsignedInt byte const ( @@ -9,6 +10,7 @@ const ( UnsignedInt64 ) +// String implements fmt.Stringer. func (s UnsignedInt) String() (ret string) { switch s { case UnsignedInt32: @@ -19,6 +21,7 @@ func (s UnsignedInt) String() (ret string) { return } +// SignedInt represents signed or unsigned integers. type SignedInt byte const ( @@ -28,6 +31,7 @@ const ( SignedUint64 ) +// String implements fmt.Stringer. func (s SignedInt) String() (ret string) { switch s { case SignedUint32: @@ -42,6 +46,7 @@ func (s SignedInt) String() (ret string) { return } +// Float represents scalar double- or single-precision floating-point values. type Float byte const ( @@ -49,6 +54,7 @@ const ( Float64 ) +// String implements fmt.Stringer. func (s Float) String() (ret string) { switch s { case Float32: @@ -59,6 +65,7 @@ func (s Float) String() (ret string) { return } +// UnsignedType is the union of UnsignedInt, Float, and the V128 vector type. type UnsignedType byte const ( @@ -70,6 +77,7 @@ const ( UnsignedTypeUnknown ) +// String implements fmt.Stringer. func (s UnsignedType) String() (ret string) { switch s { case UnsignedTypeI32: @@ -88,6 +96,7 @@ func (s UnsignedType) String() (ret string) { return } +// SignedType is the union of SignedInt and Float types. type SignedType byte const ( @@ -99,6 +108,7 @@ const ( SignedTypeFloat64 ) +// String implements fmt.Stringer. func (s SignedType) String() (ret string) { switch s { case SignedTypeInt32: @@ -117,12 +127,17 @@ func (s SignedType) String() (ret string) { return } +// Operation is the interface implemented by each individual operation. type Operation interface { + // Kind returns the kind of the implementation. Kind() OperationKind + // TODO String() } +// OperationKind is the kind of each implementation of the Operation interface. type OperationKind uint16 +// String implements fmt.Stringer.
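+// It panics if the OperationKind is unknown, per the default case below.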
func (o OperationKind) String() (ret string) { switch o { case OperationKindUnreachable: @@ -205,7 +220,6 @@ func (o OperationKind) String() (ret string) { ret = "Ctz" case OperationKindPopcnt: ret = "Popcnt" - case OperationKindDiv: ret = "Div" case OperationKindRem: @@ -348,6 +362,60 @@ func (o OperationKind) String() (ret string) { ret = "SignExtend64From16" case OperationKindSignExtend64From32: ret = "SignExtend64From32" + case OperationKindV128AddSat: + ret = "V128AddSat" + case OperationKindV128SubSat: + ret = "V128SubSat" + case OperationKindV128Mul: + ret = "V128Mul" + case OperationKindV128Div: + ret = "V128Div" + case OperationKindV128Neg: + ret = "V128Neg" + case OperationKindV128Sqrt: + ret = "V128Sqrt" + case OperationKindV128Abs: + ret = "V128Abs" + case OperationKindV128Popcnt: + ret = "V128Popcnt" + case OperationKindV128Min: + ret = "V128Min" + case OperationKindV128Max: + ret = "V128Max" + case OperationKindV128AvgrU: + ret = "V128AvgrU" + case OperationKindV128Ceil: + ret = "V128Ceil" + case OperationKindV128Floor: + ret = "V128Floor" + case OperationKindV128Trunc: + ret = "V128Trunc" + case OperationKindV128Nearest: + ret = "V128Nearest" + case OperationKindV128Pmin: + ret = "V128Pmin" + case OperationKindV128Pmax: + ret = "V128Pmax" + case OperationKindV128Extend: + ret = "V128Extend" + case OperationKindV128ExtMul: + ret = "V128ExtMul" + case OperationKindV128Q15mulrSatS: + ret = "V128Q15mulrSatS" + case OperationKindV128ExtAddPairwise: + ret = "V128ExtAddPairwise" + case OperationKindV128FloatPromote: + ret = "V128FloatPromote" + case OperationKindV128FloatDemote: + ret = "V128FloatDemote" + case OperationKindV128FConvertFromI: + ret = "V128FConvertFromI" + case OperationKindV128Dot: + ret = "V128Dot" + case OperationKindV128Narrow: + ret = "V128Narrow" + case OperationKindV128ITruncSatFromF: + ret = "V128ITruncSatFromF" default: panic(fmt.Errorf("unknown operation %d", o)) } @@ -355,130 +423,298 @@ func (o OperationKind) String() (ret string) { } const ( + // OperationKindUnreachable is the kind for OperationUnreachable. OperationKindUnreachable OperationKind = iota + // OperationKindLabel is the kind for OperationLabel. OperationKindLabel + // OperationKindBr is the kind for OperationBr. OperationKindBr + // OperationKindBrIf is the kind for OperationBrIf. OperationKindBrIf + // OperationKindBrTable is the kind for OperationBrTable. OperationKindBrTable + // OperationKindCall is the kind for OperationCall. OperationKindCall + // OperationKindCallIndirect is the kind for OperationCallIndirect. OperationKindCallIndirect + // OperationKindDrop is the kind for OperationDrop. OperationKindDrop + // OperationKindSelect is the kind for OperationSelect. OperationKindSelect + // OperationKindPick is the kind for OperationPick. OperationKindPick + // OperationKindSwap is the kind for OperationSwap. OperationKindSwap + // OperationKindGlobalGet is the kind for OperationGlobalGet. OperationKindGlobalGet + // OperationKindGlobalSet is the kind for OperationGlobalSet. OperationKindGlobalSet + // OperationKindLoad is the kind for OperationLoad. OperationKindLoad + // OperationKindLoad8 is the kind for OperationLoad8. OperationKindLoad8 + // OperationKindLoad16 is the kind for OperationLoad16. OperationKindLoad16 + // OperationKindLoad32 is the kind for OperationLoad32. OperationKindLoad32 + // OperationKindStore is the kind for OperationStore. OperationKindStore + // OperationKindStore8 is the kind for OperationStore8. 
OperationKindStore8 + // OperationKindStore16 is the kind for OperationStore16. OperationKindStore16 + // OperationKindStore32 is the kind for OperationStore32. OperationKindStore32 + // OperationKindMemorySize is the kind for OperationMemorySize. OperationKindMemorySize + // OperationKindMemoryGrow is the kind for OperationMemoryGrow. OperationKindMemoryGrow + // OperationKindConstI32 is the kind for OperationConstI32. OperationKindConstI32 + // OperationKindConstI64 is the kind for OperationConstI64. OperationKindConstI64 + // OperationKindConstF32 is the kind for OperationConstF32. OperationKindConstF32 + // OperationKindConstF64 is the kind for OperationConstF64. OperationKindConstF64 + // OperationKindEq is the kind for OperationEq. OperationKindEq + // OperationKindNe is the kind for OperationNe. OperationKindNe + // OperationKindEqz is the kind for OperationEqz. OperationKindEqz + // OperationKindLt is the kind for OperationLt. OperationKindLt + // OperationKindGt is the kind for OperationGt. OperationKindGt + // OperationKindLe is the kind for OperationLe. OperationKindLe + // OperationKindGe is the kind for OperationGe. OperationKindGe + // OperationKindAdd is the kind for OperationAdd. OperationKindAdd + // OperationKindSub is the kind for OperationSub. OperationKindSub + // OperationKindMul is the kind for OperationMul. OperationKindMul + // OperationKindClz is the kind for OperationClz. OperationKindClz + // OperationKindCtz is the kind for OperationCtz. OperationKindCtz + // OperationKindPopcnt is the kind for OperationPopcnt. OperationKindPopcnt + // OperationKindDiv is the kind for OperationDiv. OperationKindDiv + // OperationKindRem is the kind for OperationRem. OperationKindRem + // OperationKindAnd is the kind for OperationAnd. OperationKindAnd + // OperationKindOr is the kind for OperationOr. OperationKindOr + // OperationKindXor is the kind for OperationXor. OperationKindXor + // OperationKindShl is the kind for OperationShl. OperationKindShl + // OperationKindShr is the kind for OperationShr. OperationKindShr + // OperationKindRotl is the kind for OperationRotl. OperationKindRotl + // OperationKindRotr is the kind for OperationRotr. OperationKindRotr + // OperationKindAbs is the kind for OperationAbs. OperationKindAbs + // OperationKindNeg is the kind for OperationNeg. OperationKindNeg + // OperationKindCeil is the kind for OperationCeil. OperationKindCeil + // OperationKindFloor is the kind for OperationFloor. OperationKindFloor + // OperationKindTrunc is the kind for OperationTrunc. OperationKindTrunc + // OperationKindNearest is the kind for OperationNearest. OperationKindNearest + // OperationKindSqrt is the kind for OperationSqrt. OperationKindSqrt + // OperationKindMin is the kind for OperationMin. OperationKindMin + // OperationKindMax is the kind for OperationMax. OperationKindMax + // OperationKindCopysign is the kind for OperationCopysign. OperationKindCopysign + // OperationKindI32WrapFromI64 is the kind for OperationI32WrapFromI64. OperationKindI32WrapFromI64 + // OperationKindITruncFromF is the kind for OperationITruncFromF. OperationKindITruncFromF + // OperationKindFConvertFromI is the kind for OperationFConvertFromI. OperationKindFConvertFromI + // OperationKindF32DemoteFromF64 is the kind for OperationF32DemoteFromF64. OperationKindF32DemoteFromF64 + // OperationKindF64PromoteFromF32 is the kind for OperationF64PromoteFromF32. OperationKindF64PromoteFromF32 + // OperationKindI32ReinterpretFromF32 is the kind for OperationI32ReinterpretFromF32. 
OperationKindI32ReinterpretFromF32 + // OperationKindI64ReinterpretFromF64 is the kind for OperationI64ReinterpretFromF64. OperationKindI64ReinterpretFromF64 + // OperationKindF32ReinterpretFromI32 is the kind for OperationF32ReinterpretFromI32. OperationKindF32ReinterpretFromI32 + // OperationKindF64ReinterpretFromI64 is the kind for OperationF64ReinterpretFromI64. OperationKindF64ReinterpretFromI64 + // OperationKindExtend is the kind for OperationExtend. OperationKindExtend + // OperationKindSignExtend32From8 is the kind for OperationSignExtend32From8. OperationKindSignExtend32From8 + // OperationKindSignExtend32From16 is the kind for OperationSignExtend32From16. OperationKindSignExtend32From16 + // OperationKindSignExtend64From8 is the kind for OperationSignExtend64From8. OperationKindSignExtend64From8 + // OperationKindSignExtend64From16 is the kind for OperationSignExtend64From16. OperationKindSignExtend64From16 + // OperationKindSignExtend64From32 is the kind for OperationSignExtend64From32. OperationKindSignExtend64From32 + // OperationKindMemoryInit is the kind for OperationMemoryInit. OperationKindMemoryInit + // OperationKindDataDrop is the kind for OperationDataDrop. OperationKindDataDrop + // OperationKindMemoryCopy is the kind for OperationMemoryCopy. OperationKindMemoryCopy + // OperationKindMemoryFill is the kind for OperationMemoryFill. OperationKindMemoryFill + // OperationKindTableInit is the kind for OperationTableInit. OperationKindTableInit + // OperationKindElemDrop is the kind for OperationElemDrop. OperationKindElemDrop + // OperationKindTableCopy is the kind for OperationTableCopy. OperationKindTableCopy + // OperationKindRefFunc is the kind for OperationRefFunc. OperationKindRefFunc + // OperationKindTableGet is the kind for OperationTableGet. OperationKindTableGet + // OperationKindTableSet is the kind for OperationTableSet. OperationKindTableSet + // OperationKindTableSize is the kind for OperationTableSize. OperationKindTableSize + // OperationKindTableGrow is the kind for OperationTableGrow. OperationKindTableGrow + // OperationKindTableFill is the kind for OperationTableFill. OperationKindTableFill // Vector value related instructions are prefixed by V128. + // OperationKindV128Const is the kind for OperationV128Const. OperationKindV128Const + // OperationKindV128Add is the kind for OperationV128Add. OperationKindV128Add + // OperationKindV128Sub is the kind for OperationV128Sub. OperationKindV128Sub + // OperationKindV128Load is the kind for OperationV128Load. OperationKindV128Load + // OperationKindV128LoadLane is the kind for OperationV128LoadLane. OperationKindV128LoadLane + // OperationKindV128Store is the kind for OperationV128Store. OperationKindV128Store + // OperationKindV128StoreLane is the kind for OperationV128StoreLane. OperationKindV128StoreLane + // OperationKindV128ExtractLane is the kind for OperationV128ExtractLane. OperationKindV128ExtractLane + // OperationKindV128ReplaceLane is the kind for OperationV128ReplaceLane. OperationKindV128ReplaceLane + // OperationKindV128Splat is the kind for OperationV128Splat. OperationKindV128Splat + // OperationKindV128Shuffle is the kind for OperationV128Shuffle. OperationKindV128Shuffle + // OperationKindV128Swizzle is the kind for OperationV128Swizzle. OperationKindV128Swizzle + // OperationKindV128AnyTrue is the kind for OperationV128AnyTrue. OperationKindV128AnyTrue + // OperationKindV128AllTrue is the kind for OperationV128AllTrue. 
OperationKindV128AllTrue + // OperationKindV128BitMask is the kind for OperationV128BitMask. OperationKindV128BitMask + // OperationKindV128And is the kind for OperationV128And. OperationKindV128And + // OperationKindV128Not is the kind for OperationV128Not. OperationKindV128Not + // OperationKindV128Or is the kind for OperationV128Or. OperationKindV128Or + // OperationKindV128Xor is the kind for OperationV128Xor. OperationKindV128Xor + // OperationKindV128Bitselect is the kind for OperationV128Bitselect. OperationKindV128Bitselect + // OperationKindV128AndNot is the kind for OperationV128AndNot. OperationKindV128AndNot + // OperationKindV128Shl is the kind for OperationV128Shl. OperationKindV128Shl + // OperationKindV128Shr is the kind for OperationV128Shr. OperationKindV128Shr + // OperationKindV128Cmp is the kind for OperationV128Cmp. OperationKindV128Cmp + // OperationKindV128AddSat is the kind for OperationV128AddSat. + OperationKindV128AddSat + // OperationKindV128SubSat is the kind for OperationV128SubSat. + OperationKindV128SubSat + // OperationKindV128Mul is the kind for OperationV128Mul. + OperationKindV128Mul + // OperationKindV128Div is the kind for OperationV128Div. + OperationKindV128Div + // OperationKindV128Neg is the kind for OperationV128Neg. + OperationKindV128Neg + // OperationKindV128Sqrt is the kind for OperationV128Sqrt. + OperationKindV128Sqrt + // OperationKindV128Abs is the kind for OperationV128Abs. + OperationKindV128Abs + // OperationKindV128Popcnt is the kind for OperationV128Popcnt. + OperationKindV128Popcnt + // OperationKindV128Min is the kind for OperationV128Min. + OperationKindV128Min + // OperationKindV128Max is the kind for OperationV128Max. + OperationKindV128Max + // OperationKindV128AvgrU is the kind for OperationV128AvgrU. + OperationKindV128AvgrU + // OperationKindV128Pmin is the kind for OperationV128Pmin. + OperationKindV128Pmin + // OperationKindV128Pmax is the kind for OperationV128Pmax. + OperationKindV128Pmax + // OperationKindV128Ceil is the kind for OperationV128Ceil. + OperationKindV128Ceil + // OperationKindV128Floor is the kind for OperationV128Floor. + OperationKindV128Floor + // OperationKindV128Trunc is the kind for OperationV128Trunc. + OperationKindV128Trunc + // OperationKindV128Nearest is the kind for OperationV128Nearest. + OperationKindV128Nearest + // OperationKindV128Extend is the kind for OperationV128Extend. + OperationKindV128Extend + // OperationKindV128ExtMul is the kind for OperationV128ExtMul. + OperationKindV128ExtMul + // OperationKindV128Q15mulrSatS is the kind for OperationV128Q15mulrSatS. + OperationKindV128Q15mulrSatS + // OperationKindV128ExtAddPairwise is the kind for OperationV128ExtAddPairwise. + OperationKindV128ExtAddPairwise + // OperationKindV128FloatPromote is the kind for OperationV128FloatPromote. + OperationKindV128FloatPromote + // OperationKindV128FloatDemote is the kind for OperationV128FloatDemote. + OperationKindV128FloatDemote + // OperationKindV128FConvertFromI is the kind for OperationV128FConvertFromI. + OperationKindV128FConvertFromI + // OperationKindV128Dot is the kind for OperationV128Dot. + OperationKindV128Dot + // OperationKindV128Narrow is the kind for OperationV128Narrow. + OperationKindV128Narrow + // OperationKindV128ITruncSatFromF is the kind for OperationV128ITruncSatFromF. + OperationKindV128ITruncSatFromF // operationKindEnd is always placed at the bottom of this iota definition to be used in the test. 
operationKindEnd ) +// Label is the label of each block in wazeroir, where a "block" consists of multiple operations +// and must end with a branching operation (e.g. OperationBr or OperationBrIf). type Label struct { FrameID uint32 Kind LabelKind } +// String implements fmt.Stringer. func (l *Label) String() (ret string) { if l == nil { // Sometimes String() is called on the nil label which is interpreted @@ -496,11 +732,23 @@ func (l *Label) String() (ret string) { return } +// LabelKind is the kind of the label. type LabelKind = byte const ( + // LabelKindHeader is the header for various blocks. For example, the "then" block of + // wasm.OpcodeIfName in Wasm has the label of this kind. LabelKindHeader LabelKind = iota + // LabelKindElse is the kind of label for the "else" block of wasm.OpcodeIfName in Wasm. LabelKindElse + // LabelKindContinuation is the kind of label which is the continuation of blocks. + // For example, for wasm text like + // (func + // .... + // (if (local.get 0) (then (nop)) (else (nop))) + // return + // ) + // we have the continuation block (of the if-block) corresponding to the "return" opcode. LabelKindContinuation ) @@ -512,14 +760,19 @@ func (l *Label) asBranchTargetDrop() *BranchTargetDrop { return &BranchTargetDrop{Target: l.asBranchTarget()} } +// BranchTarget represents the branch operation's target such as OperationBr or OperationBrIf. type BranchTarget struct { + // Label holds the target label. Note that this is nullable and in that case + // the branch target is the "return" of the function. Label *Label } +// IsReturnTarget returns true if the branch target is the function return, false otherwise. func (b *BranchTarget) IsReturnTarget() bool { return b.Label == nil } +// String implements fmt.Stringer. func (b *BranchTarget) String() (ret string) { if b.IsReturnTarget() { ret = ".return" @@ -529,11 +782,14 @@ func (b *BranchTarget) String() (ret string) { return } +// BranchTargetDrop represents the branch target and the drop range which must be dropped +// before giving control over to the target label. type BranchTargetDrop struct { Target *BranchTarget ToDrop *InclusiveRange } +// String implements fmt.Stringer. func (b *BranchTargetDrop) String() (ret string) { if b.ToDrop != nil { ret = fmt.Sprintf("%s(drop %d..%d)", b.Target, b.ToDrop.Start, b.ToDrop.End) @@ -543,80 +799,149 @@ func (b *BranchTargetDrop) String() (ret string) { return } +// OperationUnreachable implements Operation. +// +// This corresponds to wasm.OpcodeUnreachable. +// +// The engines are expected to exit the execution with the wasmruntime.ErrRuntimeUnreachable error. type OperationUnreachable struct{} -func (o *OperationUnreachable) Kind() OperationKind { +// Kind implements Operation.Kind +func (*OperationUnreachable) Kind() OperationKind { return OperationKindUnreachable } +// OperationLabel implements Operation. +// +// This is used to inform the engines of the beginning of a label. type OperationLabel struct { Label *Label } -func (o *OperationLabel) Kind() OperationKind { +// Kind implements Operation.Kind +func (*OperationLabel) Kind() OperationKind { return OperationKindLabel } +// OperationBr implements Operation. +// +// The engines are expected to branch into the OperationBr.Target label. type OperationBr struct { Target *BranchTarget } -func (o *OperationBr) Kind() OperationKind { +// Kind implements Operation.Kind +func (*OperationBr) Kind() OperationKind { return OperationKindBr } +// OperationBrIf implements Operation.
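+// +// This corresponds to wasm.OpcodeBrIfName.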
+// +// The engines are expected to pop a value and branch into the OperationBrIf.Then label if the value is non-zero. +// Otherwise (i.e. the popped value equals zero), the code branches into the OperationBrIf.Else label. type OperationBrIf struct { Then, Else *BranchTargetDrop } -func (o *OperationBrIf) Kind() OperationKind { +// Kind implements Operation.Kind +func (*OperationBrIf) Kind() OperationKind { return OperationKindBrIf } -type InclusiveRange struct { - Start, End int -} - +// OperationBrTable implements Operation. +// +// This corresponds to wasm.OpcodeBrTableName except that the labels +// here are wazeroir-level labels, not the Wasm ones. +// +// The engines are expected to do the br_table operation based on the +// OperationBrTable.Default and OperationBrTable.Targets. More precisely, +// this pops a value from the stack (called "index") and decides which branch to take next +// based on that value. +// +// For example, assume we have operations like {default: L_DEFAULT, targets: [L0, L1, L2]}. +// If "index" >= len(targets), then branch into the L_DEFAULT label. +// Otherwise, we enter the label of targets[index]. type OperationBrTable struct { Targets []*BranchTargetDrop Default *BranchTargetDrop } -func (o *OperationBrTable) Kind() OperationKind { +// Kind implements Operation.Kind +func (*OperationBrTable) Kind() OperationKind { return OperationKindBrTable } +// OperationCall implements Operation. +// +// This corresponds to wasm.OpcodeCallName, and engines are expected to +// enter into a function whose index equals OperationCall.FunctionIndex. type OperationCall struct { FunctionIndex uint32 } -func (o *OperationCall) Kind() OperationKind { +// Kind implements Operation.Kind +func (*OperationCall) Kind() OperationKind { return OperationKindCall } +// OperationCallIndirect implements Operation. +// +// This corresponds to wasm.OpcodeCallIndirectName, and engines are expected to +// consume one value from the top of the stack (called "offset"), +// and make a function call against the function whose function address equals +// Tables[OperationCallIndirect.TableIndex][offset]. +// +// Note: This is called an indirect function call in the sense that the target function is indirectly +// determined by the current state (top value) of the stack. +// Therefore, two checks are performed at runtime before entering the target function: +// 1) whether "offset" exceeds the length of table Tables[OperationCallIndirect.TableIndex]. +// 2) whether the type of the function table[offset] matches the function type specified by OperationCallIndirect.TypeIndex. type OperationCallIndirect struct { TypeIndex, TableIndex uint32 } -func (o *OperationCallIndirect) Kind() OperationKind { +// Kind implements Operation.Kind +func (*OperationCallIndirect) Kind() OperationKind { return OperationKindCallIndirect } +// InclusiveRange is the range which spans across the value stack starting from the top to the bottom, and +// both boundaries are included in the range. +type InclusiveRange struct { + Start, End int +} + +// OperationDrop implements Operation. +// +// The engines are expected to discard the values selected by OperationDrop.Depth which +// starts from the top of the stack to the bottom. type OperationDrop struct { - // Depths spans across the uint64 value stack at runtime to be dopped by this operation. + // Depth spans across the uint64 value stack at runtime to be dropped by this operation.
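+ // For example, a Depth of &InclusiveRange{Start: 0, End: 2} drops the top three values.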
+// OperationCall implements Operation. +// +// This corresponds to wasm.OpcodeCallName, and engines are expected to +// enter the function whose index equals OperationCall.FunctionIndex. type OperationCall struct { FunctionIndex uint32 } -func (o *OperationCall) Kind() OperationKind { +// Kind implements Operation.Kind +func (*OperationCall) Kind() OperationKind { return OperationKindCall } +// OperationCallIndirect implements Operation. +// +// This corresponds to wasm.OpcodeCallIndirectName, and engines are expected to +// consume one value from the top of the stack (called "offset"), +// and make a function call against the function whose function address equals +// Tables[OperationCallIndirect.TableIndex][offset]. +// +// Note: This is called an indirect function call in the sense that the target function is indirectly +// determined by the current state (top value) of the stack. +// Therefore, two checks are performed at runtime before entering the target function: +// 1) whether "offset" exceeds the length of the table Tables[OperationCallIndirect.TableIndex]. +// 2) whether the type of the function table[offset] matches the function type specified by OperationCallIndirect.TypeIndex. type OperationCallIndirect struct { TypeIndex, TableIndex uint32 } -func (o *OperationCallIndirect) Kind() OperationKind { +// Kind implements Operation.Kind +func (*OperationCallIndirect) Kind() OperationKind { return OperationKindCallIndirect } +// InclusiveRange is the range which spans across the value stack starting from the top to the bottom, and +// both boundaries are included in the range. +type InclusiveRange struct { + Start, End int +} + +// OperationDrop implements Operation. +// +// The engines are expected to discard the values selected by OperationDrop.Depth which +// starts from the top of the stack to the bottom. type OperationDrop struct { - // Depths spans across the uint64 value stack at runtime to be dopped by this operation. + // Depth spans across the uint64 value stack at runtime to be dropped by this operation. Depth *InclusiveRange } -func (o *OperationDrop) Kind() OperationKind { +// Kind implements Operation.Kind +func (*OperationDrop) Kind() OperationKind { return OperationKindDrop } +// OperationSelect implements Operation. +// +// This corresponds to wasm.OpcodeSelect. +// +// The engines are expected to pop three values, say [..., x2, x1, c]; then if the value "c" equals zero, +// "x1" is pushed back onto the stack; otherwise, "x2" is pushed back. type OperationSelect struct{} -func (o *OperationSelect) Kind() OperationKind { +// Kind implements Operation.Kind +func (*OperationSelect) Kind() OperationKind { return OperationKindSelect }
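Since the operand order in that comment is easy to misread, a tiny illustrative helper over a uint64 value stack (hypothetical, not part of this change) makes the discipline explicit:

// applySelect pops [..., x2, x1, c] and pushes back x1 when c == 0, x2 otherwise.
func applySelect(stack []uint64) []uint64 {
	c, x1, x2 := stack[len(stack)-1], stack[len(stack)-2], stack[len(stack)-3]
	stack = stack[:len(stack)-3]
	if c == 0 {
		return append(stack, x1)
	}
	return append(stack, x2)
}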
+// OperationPick implements Operation. +// +// The engines are expected to copy the value pointed to by OperationPick.Depth, and push the +// copied value onto the top of the stack. type OperationPick struct { // Depth is the location of the pick target in the uint64 value stack at runtime. // If IsTargetVector=true, this points to the location of the lower 64-bits of the vector. @@ -624,10 +949,15 @@ type OperationPick struct { IsTargetVector bool } -func (o *OperationPick) Kind() OperationKind { +// Kind implements Operation.Kind +func (*OperationPick) Kind() OperationKind { return OperationKindPick } +// OperationSwap implements Operation. +// +// The engines are expected to swap the top value of the stack and the one specified by +// OperationSwap.Depth. type OperationSwap struct { // Depth is the location of the pick target in the uint64 value stack at runtime. // If IsTargetVector=true, this points to the location of the lower 64-bits of the vector. @@ -635,19 +965,34 @@ type OperationSwap struct { IsTargetVector bool } -func (o *OperationSwap) Kind() OperationKind { +// Kind implements Operation.Kind +func (*OperationSwap) Kind() OperationKind { return OperationKindSwap } +// OperationGlobalGet implements Operation. +// +// The engines are expected to read the global value specified by OperationGlobalGet.Index, +// and push the copy of the value onto the stack. +// +// See wasm.OpcodeGlobalGet. type OperationGlobalGet struct{ Index uint32 } -func (o *OperationGlobalGet) Kind() OperationKind { +// Kind implements Operation.Kind +func (*OperationGlobalGet) Kind() OperationKind { return OperationKindGlobalGet } +// OperationGlobalSet implements Operation. +// +// The engines are expected to consume the value from the top of the stack, +// and write the value into the global specified by OperationGlobalSet.Index. +// +// See wasm.OpcodeGlobalSet. type OperationGlobalSet struct{ Index uint32 } -func (o *OperationGlobalSet) Kind() OperationKind { +// Kind implements Operation.Kind +func (*OperationGlobalSet) Kind() OperationKind { return OperationKindGlobalSet } @@ -666,353 +1011,606 @@ type MemoryArg struct { Offset uint32 } +// OperationLoad implements Operation. +// +// This corresponds to wasm.OpcodeI32LoadName wasm.OpcodeI64LoadName wasm.OpcodeF32LoadName and wasm.OpcodeF64LoadName. +// +// The engines are expected to check the boundary of memory length, and exit the execution if this exceeds the boundary, +// otherwise load the corresponding value following the semantics of the corresponding WebAssembly instruction. type OperationLoad struct { Type UnsignedType Arg *MemoryArg } -func (o *OperationLoad) Kind() OperationKind { +// Kind implements Operation.Kind +func (*OperationLoad) Kind() OperationKind { return OperationKindLoad } +// OperationLoad8 implements Operation. +// +// This corresponds to wasm.OpcodeI32Load8SName wasm.OpcodeI32Load8UName wasm.OpcodeI64Load8SName wasm.OpcodeI64Load8UName. +// +// The engines are expected to check the boundary of memory length, and exit the execution if this exceeds the boundary, +// otherwise load the corresponding value following the semantics of the corresponding WebAssembly instruction. type OperationLoad8 struct { Type SignedInt Arg *MemoryArg } -func (o *OperationLoad8) Kind() OperationKind { +// Kind implements Operation.Kind +func (OperationLoad8) Kind() OperationKind { return OperationKindLoad8 } +// OperationLoad16 implements Operation. +// +// This corresponds to wasm.OpcodeI32Load16SName wasm.OpcodeI32Load16UName wasm.OpcodeI64Load16SName wasm.OpcodeI64Load16UName. +// +// The engines are expected to check the boundary of memory length, and exit the execution if this exceeds the boundary, +// otherwise load the corresponding value following the semantics of the corresponding WebAssembly instruction. type OperationLoad16 struct { Type SignedInt Arg *MemoryArg } -func (o *OperationLoad16) Kind() OperationKind { +// Kind implements Operation.Kind +func (OperationLoad16) Kind() OperationKind { return OperationKindLoad16 } +// OperationLoad32 implements Operation. +// +// This corresponds to wasm.OpcodeI64Load32SName wasm.OpcodeI64Load32UName. +// +// The engines are expected to check the boundary of memory length, and exit the execution if this exceeds the boundary, +// otherwise load the corresponding value following the semantics of the corresponding WebAssembly instruction. type OperationLoad32 struct { Signed bool Arg *MemoryArg } -func (o *OperationLoad32) Kind() OperationKind { +// Kind implements Operation.Kind +func (OperationLoad32) Kind() OperationKind { return OperationKindLoad32 }
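The boundary check these comments keep referring to amounts to comparing the effective address plus the access size against the memory length. A sketch for a 32-bit load (hypothetical helper; each engine implements this in its own code):

import (
	"encoding/binary"
	"errors"
)

// loadU32 computes base+offset in 64 bits so the addition cannot wrap, then
// rejects any access whose last byte falls outside the memory.
func loadU32(mem []byte, base, offset uint32) (uint32, error) {
	ea := uint64(base) + uint64(offset)
	if ea+4 > uint64(len(mem)) {
		return 0, errors.New("out of bounds memory access")
	}
	return binary.LittleEndian.Uint32(mem[ea:]), nil
}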
+// OperationStore implements Operation. +// +// This corresponds to wasm.OpcodeI32StoreName wasm.OpcodeI64StoreName wasm.OpcodeF32StoreName wasm.OpcodeF64StoreName +// +// The engines are expected to check the boundary of memory length, and exit the execution if this exceeds the boundary, +// otherwise store the corresponding value following the semantics of the corresponding WebAssembly instruction. type OperationStore struct { Type UnsignedType Arg *MemoryArg } -func (o *OperationStore) Kind() OperationKind { +// Kind implements Operation.Kind +func (*OperationStore) Kind() OperationKind { return OperationKindStore } +// OperationStore8 implements Operation. +// +// This corresponds to wasm.OpcodeI32Store8Name wasm.OpcodeI64Store8Name +// +// The engines are expected to check the boundary of memory length, and exit the execution if this exceeds the boundary, +// otherwise store the corresponding value following the semantics of the corresponding WebAssembly instruction. type OperationStore8 struct { - // TODO: Semantically Type doesn't affect operation so consider deleting this field. - Type UnsignedInt - Arg *MemoryArg + Arg *MemoryArg } -func (o *OperationStore8) Kind() OperationKind { +// Kind implements Operation.Kind +func (OperationStore8) Kind() OperationKind { return OperationKindStore8 } +// OperationStore16 implements Operation. +// +// This corresponds to wasm.OpcodeI32Store16Name wasm.OpcodeI64Store16Name +// +// The engines are expected to check the boundary of memory length, and exit the execution if this exceeds the boundary, +// otherwise store the corresponding value following the semantics of the corresponding WebAssembly instruction. type OperationStore16 struct { - // TODO: Semantically Type doesn't affect operation so consider deleting this field. - Type UnsignedInt - Arg *MemoryArg + Arg *MemoryArg } -func (o *OperationStore16) Kind() OperationKind { +// Kind implements Operation.Kind +func (OperationStore16) Kind() OperationKind { return OperationKindStore16 } +// OperationStore32 implements Operation. +// +// This corresponds to wasm.OpcodeI64Store32Name +// +// The engines are expected to check the boundary of memory length, and exit the execution if this exceeds the boundary, +// otherwise store the corresponding value following the semantics of the corresponding WebAssembly instruction. type OperationStore32 struct { Arg *MemoryArg } // Kind implements Operation.Kind. -func (o *OperationStore32) Kind() OperationKind { +func (OperationStore32) Kind() OperationKind { return OperationKindStore32 } +// OperationMemorySize implements Operation. +// +// This corresponds to wasm.OpcodeMemorySize. +// +// The engines are expected to push the current size of the memory in pages onto the stack. type OperationMemorySize struct{} // Kind implements Operation.Kind. -func (o *OperationMemorySize) Kind() OperationKind { +func (OperationMemorySize) Kind() OperationKind { return OperationKindMemorySize } +// OperationMemoryGrow implements Operation. +// +// This corresponds to wasm.OpcodeMemoryGrow. +// +// The engines are expected to pop one value from the top of the stack, then +// execute wasm.MemoryInstance Grow with the value, and push the previous +// size of the memory in pages onto the stack. type OperationMemoryGrow struct{ Alignment uint64 } // Kind implements Operation.Kind. -func (o *OperationMemoryGrow) Kind() OperationKind { +func (OperationMemoryGrow) Kind() OperationKind { return OperationKindMemoryGrow }
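memory.grow's contract is worth spelling out: the popped value is a page delta, and the previous size in pages (or 0xffffffff on failure) is what gets pushed back. A sketch under the assumption of a 64KiB page and a byte-slice memory (hypothetical; wasm.MemoryInstance has its own Grow):

const wasmPageSize = 65536

// growMemory returns the previous size in pages, or 0xffffffff when the
// request would exceed the configured maximum, mirroring the Wasm spec.
func growMemory(mem *[]byte, deltaPages, maxPages uint32) uint32 {
	prev := uint32(len(*mem) / wasmPageSize)
	if uint64(prev)+uint64(deltaPages) > uint64(maxPages) {
		return 0xffffffff
	}
	*mem = append(*mem, make([]byte, int(deltaPages)*wasmPageSize)...)
	return prev
}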
+// OperationConstI32 implements Operation. +// +// This corresponds to wasm.OpcodeI32Const. type OperationConstI32 struct{ Value uint32 } // Kind implements Operation.Kind. -func (o *OperationConstI32) Kind() OperationKind { +func (OperationConstI32) Kind() OperationKind { return OperationKindConstI32 } +// OperationConstI64 implements Operation. +// +// This corresponds to wasm.OpcodeI64Const. type OperationConstI64 struct{ Value uint64 } // Kind implements Operation.Kind. -func (o *OperationConstI64) Kind() OperationKind { +func (OperationConstI64) Kind() OperationKind { return OperationKindConstI64 } +// OperationConstF32 implements Operation. +// +// This corresponds to wasm.OpcodeF32Const. type OperationConstF32 struct{ Value float32 } // Kind implements Operation.Kind. -func (o *OperationConstF32) Kind() OperationKind { +func (OperationConstF32) Kind() OperationKind { return OperationKindConstF32 } +// OperationConstF64 implements Operation. +// +// This corresponds to wasm.OpcodeF64Const. type OperationConstF64 struct{ Value float64 } // Kind implements Operation.Kind. -func (o *OperationConstF64) Kind() OperationKind { +func (OperationConstF64) Kind() OperationKind { return OperationKindConstF64 } +// OperationEq implements Operation. +// +// This corresponds to wasm.OpcodeI32EqName wasm.OpcodeI64EqName wasm.OpcodeF32EqName wasm.OpcodeF64EqName type OperationEq struct{ Type UnsignedType } // Kind implements Operation.Kind. -func (o *OperationEq) Kind() OperationKind { +func (OperationEq) Kind() OperationKind { return OperationKindEq } +// OperationNe implements Operation. +// +// This corresponds to wasm.OpcodeI32NeName wasm.OpcodeI64NeName wasm.OpcodeF32NeName wasm.OpcodeF64NeName type OperationNe struct{ Type UnsignedType } // Kind implements Operation.Kind. -func (o *OperationNe) Kind() OperationKind { +func (OperationNe) Kind() OperationKind { return OperationKindNe } +// OperationEqz implements Operation. +// +// This corresponds to wasm.OpcodeI32EqzName wasm.OpcodeI64EqzName type OperationEqz struct{ Type UnsignedInt } // Kind implements Operation.Kind. -func (o *OperationEqz) Kind() OperationKind { +func (OperationEqz) Kind() OperationKind { return OperationKindEqz } +// OperationLt implements Operation. +// +// This corresponds to wasm.OpcodeI32LtS wasm.OpcodeI32LtU wasm.OpcodeI64LtS wasm.OpcodeI64LtU wasm.OpcodeF32Lt wasm.OpcodeF64Lt type OperationLt struct{ Type SignedType } // Kind implements Operation.Kind. -func (o *OperationLt) Kind() OperationKind { +func (OperationLt) Kind() OperationKind { return OperationKindLt } +// OperationGt implements Operation. +// +// This corresponds to wasm.OpcodeI32GtS wasm.OpcodeI32GtU wasm.OpcodeI64GtS wasm.OpcodeI64GtU wasm.OpcodeF32Gt wasm.OpcodeF64Gt type OperationGt struct{ Type SignedType } // Kind implements Operation.Kind. -func (o *OperationGt) Kind() OperationKind { +func (OperationGt) Kind() OperationKind { return OperationKindGt } +// OperationLe implements Operation. +// +// This corresponds to wasm.OpcodeI32LeS wasm.OpcodeI32LeU wasm.OpcodeI64LeS wasm.OpcodeI64LeU wasm.OpcodeF32Le wasm.OpcodeF64Le type OperationLe struct{ Type SignedType } // Kind implements Operation.Kind. -func (o *OperationLe) Kind() OperationKind { +func (OperationLe) Kind() OperationKind { return OperationKindLe } +// OperationGe implements Operation. +// +// This corresponds to wasm.OpcodeI32GeS wasm.OpcodeI32GeU wasm.OpcodeI64GeS wasm.OpcodeI64GeU wasm.OpcodeF32Ge wasm.OpcodeF64Ge type OperationGe struct{ Type SignedType } // Kind implements Operation.Kind. -func (o *OperationGe) Kind() OperationKind { +func (OperationGe) Kind() OperationKind { return OperationKindGe } +// OperationAdd implements Operation. +// +// This corresponds to wasm.OpcodeI32AddName wasm.OpcodeI64AddName wasm.OpcodeF32AddName wasm.OpcodeF64AddName. type OperationAdd struct{ Type UnsignedType } // Kind implements Operation.Kind. -func (o *OperationAdd) Kind() OperationKind { +func (OperationAdd) Kind() OperationKind { return OperationKindAdd } +// OperationSub implements Operation. +// +// This corresponds to wasm.OpcodeI32SubName wasm.OpcodeI64SubName wasm.OpcodeF32SubName wasm.OpcodeF64SubName. type OperationSub struct{ Type UnsignedType } // Kind implements Operation.Kind. -func (o *OperationSub) Kind() OperationKind { +func (OperationSub) Kind() OperationKind { return OperationKindSub } +// OperationMul implements Operation. +// +// This corresponds to wasm.OpcodeI32MulName wasm.OpcodeI64MulName wasm.OpcodeF32MulName wasm.OpcodeF64MulName. type OperationMul struct{ Type UnsignedType } // Kind implements Operation.Kind. -func (o *OperationMul) Kind() OperationKind { +func (OperationMul) Kind() OperationKind { return OperationKindMul }
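The comparison operations carry a SignedType because one wazeroir opcode covers both signednesses, leaving the interpretation to the engine at execution time. An illustrative 32-bit "less than" (hypothetical helper):

// i32Lt returns 1 or 0, reinterpreting the operands as signed only when asked.
func i32Lt(x, y uint32, signed bool) uint32 {
	var lt bool
	if signed {
		lt = int32(x) < int32(y)
	} else {
		lt = x < y
	}
	if lt {
		return 1
	}
	return 0
}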
+// OperationClz implements Operation. +// +// This corresponds to wasm.OpcodeI32ClzName wasm.OpcodeI64ClzName. +// +// The engines are expected to count up the leading zeros in the +// current top of the stack, and push the count result. +// For example, stack of [..., 0x00_ff_ff_ff] results in [..., 8]. type OperationClz struct{ Type UnsignedInt } // Kind implements Operation.Kind. -func (o *OperationClz) Kind() OperationKind { +func (OperationClz) Kind() OperationKind { return OperationKindClz } +// OperationCtz implements Operation. +// +// This corresponds to wasm.OpcodeI32CtzName wasm.OpcodeI64CtzName. +// +// The engines are expected to count up the trailing zeros in the +// current top of the stack, and push the count result. +// For example, stack of [..., 0xff_ff_ff_00] results in [..., 8]. type OperationCtz struct{ Type UnsignedInt } // Kind implements Operation.Kind. -func (o *OperationCtz) Kind() OperationKind { +func (OperationCtz) Kind() OperationKind { return OperationKindCtz } +// OperationPopcnt implements Operation. +// +// This corresponds to wasm.OpcodeI32PopcntName wasm.OpcodeI64PopcntName. +// +// The engines are expected to count up the number of set bits in the +// current top of the stack, and push the count result. +// For example, stack of [..., 0b00_00_00_11] results in [..., 2]. type OperationPopcnt struct{ Type UnsignedInt } // Kind implements Operation.Kind. -func (o *OperationPopcnt) Kind() OperationKind { +func (OperationPopcnt) Kind() OperationKind { return OperationKindPopcnt }
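In a Go-based engine these three operations map directly onto math/bits, which is a handy way to double-check the examples in the comments above (illustrative only):

import "math/bits"

func i32Clz(v uint32) uint32    { return uint32(bits.LeadingZeros32(v)) }  // clz(0x00_ff_ff_ff) == 8
func i32Ctz(v uint32) uint32    { return uint32(bits.TrailingZeros32(v)) } // ctz(0xff_ff_ff_00) == 8
func i32Popcnt(v uint32) uint32 { return uint32(bits.OnesCount32(v)) }     // popcnt(0b11) == 2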
+// OperationDiv implements Operation. +// +// This corresponds to wasm.OpcodeI32DivS wasm.OpcodeI32DivU wasm.OpcodeI64DivS +// wasm.OpcodeI64DivU wasm.OpcodeF32Div wasm.OpcodeF64Div. type OperationDiv struct{ Type SignedType } // Kind implements Operation.Kind. -func (o *OperationDiv) Kind() OperationKind { +func (OperationDiv) Kind() OperationKind { return OperationKindDiv } +// OperationRem implements Operation. +// +// This corresponds to wasm.OpcodeI32RemS wasm.OpcodeI32RemU wasm.OpcodeI64RemS wasm.OpcodeI64RemU. +// +// The engines are expected to perform division on the top +// two values of integer type on the stack and put the remainder of the result +// onto the stack. For example, stack [..., 10, 3] results in [..., 1] where +// the quotient is discarded. type OperationRem struct{ Type SignedInt } // Kind implements Operation.Kind. -func (o *OperationRem) Kind() OperationKind { +func (OperationRem) Kind() OperationKind { return OperationKindRem } +// OperationAnd implements Operation. +// +// This corresponds to wasm.OpcodeI32AndName wasm.OpcodeI64AndName +// +// The engines are expected to perform the "And" operation on +// the top two values on the stack, and push the result. type OperationAnd struct{ Type UnsignedInt } // Kind implements Operation.Kind. -func (o *OperationAnd) Kind() OperationKind { +func (OperationAnd) Kind() OperationKind { return OperationKindAnd } +// OperationOr implements Operation. +// +// This corresponds to wasm.OpcodeI32OrName wasm.OpcodeI64OrName +// +// The engines are expected to perform the "Or" operation on +// the top two values on the stack, and push the result. type OperationOr struct{ Type UnsignedInt } // Kind implements Operation.Kind. -func (o *OperationOr) Kind() OperationKind { +func (OperationOr) Kind() OperationKind { return OperationKindOr } +// OperationXor implements Operation. +// +// This corresponds to wasm.OpcodeI32XorName wasm.OpcodeI64XorName +// +// The engines are expected to perform the "Xor" operation on +// the top two values on the stack, and push the result. type OperationXor struct{ Type UnsignedInt } // Kind implements Operation.Kind. -func (o *OperationXor) Kind() OperationKind { +func (OperationXor) Kind() OperationKind { return OperationKindXor } +// OperationShl implements Operation. +// +// This corresponds to wasm.OpcodeI32ShlName wasm.OpcodeI64ShlName +// +// The engines are expected to perform the "Shl" operation on +// the top two values on the stack, and push the result. type OperationShl struct{ Type UnsignedInt } // Kind implements Operation.Kind. -func (o *OperationShl) Kind() OperationKind { +func (OperationShl) Kind() OperationKind { return OperationKindShl } +// OperationShr implements Operation. +// +// This corresponds to wasm.OpcodeI32ShrSName wasm.OpcodeI32ShrUName wasm.OpcodeI64ShrSName wasm.OpcodeI64ShrUName +// +// If OperationShr.Type is a signed integer, the engines are expected to perform an arithmetic right shift +// on the top two values on the stack; otherwise, a logical right shift. type OperationShr struct{ Type SignedInt } // Kind implements Operation.Kind. -func (o *OperationShr) Kind() OperationKind { +func (OperationShr) Kind() OperationKind { return OperationKindShr }
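The arithmetic-versus-logical distinction in OperationShr corresponds to Go's shift on a signed versus unsigned operand; note that Wasm also takes the shift count modulo the bit width. A 32-bit sketch (hypothetical helper):

// i32Shr masks the count to 0..31 per the Wasm spec, then shifts with or
// without sign replication depending on the signedness of the operation.
func i32Shr(v, count uint32, signed bool) uint32 {
	count &= 31
	if signed {
		return uint32(int32(v) >> count)
	}
	return v >> count
}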
+// OperationRotl implements Operation. +// +// This corresponds to wasm.OpcodeI32RotlName wasm.OpcodeI64RotlName +// +// The engines are expected to perform the "Rotl" operation on +// the top two values on the stack, and push the result. type OperationRotl struct{ Type UnsignedInt } // Kind implements Operation.Kind. -func (o *OperationRotl) Kind() OperationKind { +func (OperationRotl) Kind() OperationKind { return OperationKindRotl } +// OperationRotr implements Operation. +// +// This corresponds to wasm.OpcodeI32RotrName wasm.OpcodeI64RotrName +// +// The engines are expected to perform the "Rotr" operation on +// the top two values on the stack, and push the result. type OperationRotr struct{ Type UnsignedInt } // Kind implements Operation.Kind. -func (o *OperationRotr) Kind() OperationKind { +func (OperationRotr) Kind() OperationKind { return OperationKindRotr } +// OperationAbs implements Operation. +// +// This corresponds to wasm.OpcodeF32Abs wasm.OpcodeF64Abs type OperationAbs struct{ Type Float } // Kind implements Operation.Kind. -func (o *OperationAbs) Kind() OperationKind { +func (OperationAbs) Kind() OperationKind { return OperationKindAbs } +// OperationNeg implements Operation. +// +// This corresponds to wasm.OpcodeF32Neg wasm.OpcodeF64Neg type OperationNeg struct{ Type Float } // Kind implements Operation.Kind. -func (o *OperationNeg) Kind() OperationKind { +func (OperationNeg) Kind() OperationKind { return OperationKindNeg } +// OperationCeil implements Operation. +// +// This corresponds to wasm.OpcodeF32CeilName wasm.OpcodeF64CeilName type OperationCeil struct{ Type Float } // Kind implements Operation.Kind. -func (o *OperationCeil) Kind() OperationKind { +func (OperationCeil) Kind() OperationKind { return OperationKindCeil } +// OperationFloor implements Operation. +// +// This corresponds to wasm.OpcodeF32FloorName wasm.OpcodeF64FloorName type OperationFloor struct{ Type Float } // Kind implements Operation.Kind. -func (o *OperationFloor) Kind() OperationKind { +func (OperationFloor) Kind() OperationKind { return OperationKindFloor } +// OperationTrunc implements Operation. +// +// This corresponds to wasm.OpcodeF32TruncName wasm.OpcodeF64TruncName type OperationTrunc struct{ Type Float } // Kind implements Operation.Kind. -func (o *OperationTrunc) Kind() OperationKind { +func (OperationTrunc) Kind() OperationKind { return OperationKindTrunc } +// OperationNearest implements Operation. +// +// This corresponds to wasm.OpcodeF32NearestName wasm.OpcodeF64NearestName +// +// Note: this is *not* equivalent to math.Round and instead has the same +// semantics as LLVM's rint intrinsic. See https://llvm.org/docs/LangRef.html#llvm-rint-intrinsic. +// For example, math.Round(-4.5) produces -5 while we want to produce -4. type OperationNearest struct{ Type Float } // Kind implements Operation.Kind. -func (o *OperationNearest) Kind() OperationKind { +func (OperationNearest) Kind() OperationKind { return OperationKindNearest } +// OperationSqrt implements Operation. +// +// This corresponds to wasm.OpcodeF32SqrtName wasm.OpcodeF64SqrtName type OperationSqrt struct{ Type Float } // Kind implements Operation.Kind. -func (o *OperationSqrt) Kind() OperationKind { +func (OperationSqrt) Kind() OperationKind { return OperationKindSqrt } +// OperationMin implements Operation. +// +// This corresponds to wasm.OpcodeF32MinName wasm.OpcodeF64MinName +// +// The engines are expected to pop two values from the stack, and push back the minimum of +// these two values onto the stack. For example, stack [..., 100.1, 1.9] results in [..., 1.9]. +// +// Note: WebAssembly specifies that min/max must always return NaN if one of the values is NaN, +// which is behavior different from math.Min. type OperationMin struct{ Type Float } // Kind implements Operation.Kind. -func (o *OperationMin) Kind() OperationKind { +func (OperationMin) Kind() OperationKind { return OperationKindMin } +// OperationMax implements Operation. +// +// This corresponds to wasm.OpcodeF32MaxName wasm.OpcodeF64MaxName +// +// The engines are expected to pop two values from the stack, and push back the maximum of +// these two values onto the stack. For example, stack [..., 100.1, 1.9] results in [..., 100.1]. +// +// Note: WebAssembly specifies that min/max must always return NaN if one of the values is NaN, +// which is behavior different from math.Max. type OperationMax struct{ Type Float } // Kind implements Operation.Kind. -func (o *OperationMax) Kind() OperationKind { +func (OperationMax) Kind() OperationKind { return OperationKindMax }
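The NaN note on OperationMin/OperationMax is the part engines most often get wrong: a naive comparison silently prefers one operand when the other is NaN, because every comparison with NaN is false. A sketch of the spec-conforming shape (hypothetical; it also glosses over the -0/+0 ordering the spec additionally requires):

import "math"

func wasmMin(x, y float64) float64 {
	if math.IsNaN(x) || math.IsNaN(y) {
		return math.NaN() // NaN always wins, regardless of the other operand.
	}
	if x < y {
		return x
	}
	return y
}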
+// OperationCopysign implements Operation. +// +// This corresponds to wasm.OpcodeF32CopysignName wasm.OpcodeF64CopysignName +// +// The engines are expected to pop two float values from the stack, and copy the sign bit of +// the first-popped value to the last one. +// For example, stack [..., 1.213, -5.0] results in [..., -1.213]. type OperationCopysign struct{ Type Float } // Kind implements Operation.Kind. -func (o *OperationCopysign) Kind() OperationKind { +func (OperationCopysign) Kind() OperationKind { return OperationKindCopysign } +// OperationI32WrapFromI64 implements Operation. +// +// This corresponds to wasm.OpcodeI32WrapI64 and is equivalent to uint64(uint32(v)) in Go. +// +// The engines are expected to replace the 64-bit int on top of the stack +// with the corresponding 32-bit integer. type OperationI32WrapFromI64 struct{} // Kind implements Operation.Kind. -func (o *OperationI32WrapFromI64) Kind() OperationKind { +func (OperationI32WrapFromI64) Kind() OperationKind { return OperationKindI32WrapFromI64 } +// OperationITruncFromF implements Operation. +// +// This corresponds to +// wasm.OpcodeI32TruncF32SName wasm.OpcodeI32TruncF32UName wasm.OpcodeI32TruncF64SName +// wasm.OpcodeI32TruncF64UName wasm.OpcodeI64TruncF32SName wasm.OpcodeI64TruncF32UName wasm.OpcodeI64TruncF64SName +// wasm.OpcodeI64TruncF64UName wasm.OpcodeI32TruncSatF32SName wasm.OpcodeI32TruncSatF32UName +// wasm.OpcodeI32TruncSatF64SName wasm.OpcodeI32TruncSatF64UName wasm.OpcodeI64TruncSatF32SName +// wasm.OpcodeI64TruncSatF32UName wasm.OpcodeI64TruncSatF64SName wasm.OpcodeI64TruncSatF64UName. +// +// See [1] and [2] for the cases which are undefined behavior in the WebAssembly specification when OperationITruncFromF.NonTrapping == false. +// To summarize, if the source float value is NaN or doesn't fit in the destination range of integers (incl. ±Inf), +// then the runtime behavior is undefined. In wazero, the engines are expected to exit the execution in these undefined cases with +// the wasmruntime.ErrRuntimeInvalidConversionToInteger error. +// +// [1] https://www.w3.org/TR/2019/REC-wasm-core-1-20191205/#-hrefop-trunc-umathrmtruncmathsfu_m-n-z for unsigned integers. +// [2] https://www.w3.org/TR/2019/REC-wasm-core-1-20191205/#-hrefop-trunc-smathrmtruncmathsfs_m-n-z for signed integers. type OperationITruncFromF struct { InputType Float OutputType SignedInt @@ -1022,103 +1620,164 @@ type OperationITruncFromF struct { } // Kind implements Operation.Kind. -func (o *OperationITruncFromF) Kind() OperationKind { +func (OperationITruncFromF) Kind() OperationKind { return OperationKindITruncFromF }
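The trapping-versus-saturating split described above can be made concrete for the f64-to-i32 case; the error returns stand in for the runtime errors the engines raise (hypothetical sketch):

import (
	"errors"
	"math"
)

// truncF64ToI32 truncates toward zero, then either saturates (NonTrapping)
// or reports the two undefined-behavior cases the comment above lists.
func truncF64ToI32(v float64, nonTrapping bool) (int32, error) {
	if math.IsNaN(v) {
		if nonTrapping {
			return 0, nil // the saturating form maps NaN to 0.
		}
		return 0, errors.New("invalid conversion to integer")
	}
	t := math.Trunc(v)
	if t < math.MinInt32 || t > math.MaxInt32 {
		if nonTrapping { // saturate to the nearest representable bound.
			if t < 0 {
				return math.MinInt32, nil
			}
			return math.MaxInt32, nil
		}
		return 0, errors.New("integer overflow")
	}
	return int32(t), nil
}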
+// OperationFConvertFromI implements Operation. +// +// This corresponds to +// wasm.OpcodeF32ConvertI32SName wasm.OpcodeF32ConvertI32UName wasm.OpcodeF32ConvertI64SName wasm.OpcodeF32ConvertI64UName +// wasm.OpcodeF64ConvertI32SName wasm.OpcodeF64ConvertI32UName wasm.OpcodeF64ConvertI64SName wasm.OpcodeF64ConvertI64UName +// and is equivalent to float32(uint32(x)), float32(int32(x)), etc. in Go. type OperationFConvertFromI struct { InputType SignedInt OutputType Float } // Kind implements Operation.Kind. -func (o *OperationFConvertFromI) Kind() OperationKind { +func (OperationFConvertFromI) Kind() OperationKind { return OperationKindFConvertFromI } +// OperationF32DemoteFromF64 implements Operation. +// +// This corresponds to wasm.OpcodeF32DemoteF64 and is equivalent to float32(float64(v)). type OperationF32DemoteFromF64 struct{} // Kind implements Operation.Kind. -func (o *OperationF32DemoteFromF64) Kind() OperationKind { +func (OperationF32DemoteFromF64) Kind() OperationKind { return OperationKindF32DemoteFromF64 } +// OperationF64PromoteFromF32 implements Operation. +// +// This corresponds to wasm.OpcodeF64PromoteF32 and is equivalent to float64(float32(v)). type OperationF64PromoteFromF32 struct{} // Kind implements Operation.Kind. -func (o *OperationF64PromoteFromF32) Kind() OperationKind { +func (OperationF64PromoteFromF32) Kind() OperationKind { return OperationKindF64PromoteFromF32 } +// OperationI32ReinterpretFromF32 implements Operation. +// +// This corresponds to wasm.OpcodeI32ReinterpretF32Name. type OperationI32ReinterpretFromF32 struct{} // Kind implements Operation.Kind. -func (o *OperationI32ReinterpretFromF32) Kind() OperationKind { +func (OperationI32ReinterpretFromF32) Kind() OperationKind { return OperationKindI32ReinterpretFromF32 } +// OperationI64ReinterpretFromF64 implements Operation. +// +// This corresponds to wasm.OpcodeI64ReinterpretF64Name. type OperationI64ReinterpretFromF64 struct{} // Kind implements Operation.Kind. -func (o *OperationI64ReinterpretFromF64) Kind() OperationKind { +func (OperationI64ReinterpretFromF64) Kind() OperationKind { return OperationKindI64ReinterpretFromF64 } +// OperationF32ReinterpretFromI32 implements Operation. +// +// This corresponds to wasm.OpcodeF32ReinterpretI32Name. type OperationF32ReinterpretFromI32 struct{} // Kind implements Operation.Kind. -func (o *OperationF32ReinterpretFromI32) Kind() OperationKind { +func (OperationF32ReinterpretFromI32) Kind() OperationKind { return OperationKindF32ReinterpretFromI32 } +// OperationF64ReinterpretFromI64 implements Operation. +// +// This corresponds to wasm.OpcodeF64ReinterpretI64Name. type OperationF64ReinterpretFromI64 struct{} // Kind implements Operation.Kind. -func (o *OperationF64ReinterpretFromI64) Kind() OperationKind { +func (OperationF64ReinterpretFromI64) Kind() OperationKind { return OperationKindF64ReinterpretFromI64 } +// OperationExtend implements Operation. +// +// This corresponds to wasm.OpcodeI64ExtendI32SName wasm.OpcodeI64ExtendI32UName +// +// The engines are expected to extend the 32-bit signed or unsigned int on top of the stack +// as a 64-bit integer of corresponding signedness. For the unsigned case, this is just reinterpreting the +// underlying bit pattern as a 64-bit integer. For the signed case, this is a sign-extension which preserves the +// original integer's sign. type OperationExtend struct{ Signed bool } -func (o *OperationExtend) Kind() OperationKind { +// Kind implements Operation.Kind. +func (OperationExtend) Kind() OperationKind { return OperationKindExtend } +// OperationSignExtend32From8 implements Operation. +// +// This corresponds to wasm.OpcodeI32Extend8SName. +// +// The engines are expected to sign-extend the first 8 bits of the 32-bit int as a signed 32-bit int. type OperationSignExtend32From8 struct{} // Kind implements Operation.Kind. -func (o *OperationSignExtend32From8) Kind() OperationKind { +func (OperationSignExtend32From8) Kind() OperationKind { return OperationKindSignExtend32From8 } +// OperationSignExtend32From16 implements Operation. +// +// This corresponds to wasm.OpcodeI32Extend16SName. +// +// The engines are expected to sign-extend the first 16 bits of the 32-bit int as a signed 32-bit int. type OperationSignExtend32From16 struct{} // Kind implements Operation.Kind. -func (o *OperationSignExtend32From16) Kind() OperationKind { +func (OperationSignExtend32From16) Kind() OperationKind { return OperationKindSignExtend32From16 } +// OperationSignExtend64From8 implements Operation. +// +// This corresponds to wasm.OpcodeI64Extend8SName. +// +// The engines are expected to sign-extend the first 8 bits of the 64-bit int as a signed 64-bit int. type OperationSignExtend64From8 struct{} // Kind implements Operation.Kind. -func (o *OperationSignExtend64From8) Kind() OperationKind { +func (OperationSignExtend64From8) Kind() OperationKind { return OperationKindSignExtend64From8 } +// OperationSignExtend64From16 implements Operation. +// +// This corresponds to wasm.OpcodeI64Extend16SName. +// +// The engines are expected to sign-extend the first 16 bits of the 64-bit int as a signed 64-bit int. type OperationSignExtend64From16 struct{} // Kind implements Operation.Kind. -func (o *OperationSignExtend64From16) Kind() OperationKind { +func (OperationSignExtend64From16) Kind() OperationKind { return OperationKindSignExtend64From16 } +// OperationSignExtend64From32 implements Operation. +// +// This corresponds to wasm.OpcodeI64Extend32SName. +// +// The engines are expected to sign-extend the first 32 bits of the 64-bit int as a signed 64-bit int. type OperationSignExtend64From32 struct{} // Kind implements Operation.Kind. -func (o *OperationSignExtend64From32) Kind() OperationKind { +func (OperationSignExtend64From32) Kind() OperationKind { return OperationKindSignExtend64From32 }
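Each of these sign-extension operations is, in Go terms, a narrowing conversion followed by a signed widening; the 32-from-8 case makes the pattern clear (illustrative):

// signExtend32From8 keeps only the low byte of v, then widens it back with
// its sign bit replicated, e.g. 0x0000_0080 becomes 0xffff_ff80.
func signExtend32From8(v uint32) uint32 {
	return uint32(int32(int8(v)))
}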
+// OperationMemoryInit implements Operation. +// +// This corresponds to wasm.OpcodeMemoryInitName. type OperationMemoryInit struct { // DataIndex is the index of the data instance in ModuleInstance.DataInstances // by which this operation instantiates a part of the memory. @@ -1126,10 +1785,13 @@ type OperationMemoryInit struct { } // Kind implements Operation.Kind. -func (o *OperationMemoryInit) Kind() OperationKind { +func (OperationMemoryInit) Kind() OperationKind { return OperationKindMemoryInit } +// OperationDataDrop implements Operation. +// +// This corresponds to wasm.OpcodeDataDropName. type OperationDataDrop struct { // DataIndex is the index of the data instance in ModuleInstance.DataInstances // which this operation drops. @@ -1137,24 +1799,33 @@ type OperationDataDrop struct { } // Kind implements Operation.Kind. -func (o *OperationDataDrop) Kind() OperationKind { +func (OperationDataDrop) Kind() OperationKind { return OperationKindDataDrop } +// OperationMemoryCopy implements Operation. +// +// This corresponds to wasm.OpcodeMemoryCopyName. type OperationMemoryCopy struct{} // Kind implements Operation.Kind. -func (o *OperationMemoryCopy) Kind() OperationKind { +func (OperationMemoryCopy) Kind() OperationKind { return OperationKindMemoryCopy } +// OperationMemoryFill implements Operation. +// +// This corresponds to wasm.OpcodeMemoryFillName. type OperationMemoryFill struct{} // Kind implements Operation.Kind. -func (o *OperationMemoryFill) Kind() OperationKind { +func (OperationMemoryFill) Kind() OperationKind { return OperationKindMemoryFill } +// OperationTableInit implements Operation. +// +// This corresponds to wasm.OpcodeTableInitName. type OperationTableInit struct { // ElemIndex is the index of the element by which this operation initializes a part of the table. ElemIndex uint32 @@ -1163,89 +1834,108 @@ type OperationTableInit struct { } // Kind implements Operation.Kind. -func (o *OperationTableInit) Kind() OperationKind { +func (OperationTableInit) Kind() OperationKind { return OperationKindTableInit } +// OperationElemDrop implements Operation. +// +// This corresponds to wasm.OpcodeElemDropName. type OperationElemDrop struct { // ElemIndex is the index of the element which this operation drops. ElemIndex uint32 } // Kind implements Operation.Kind. -func (o *OperationElemDrop) Kind() OperationKind { +func (OperationElemDrop) Kind() OperationKind { return OperationKindElemDrop } +// OperationTableCopy implements Operation. +// +// This corresponds to wasm.OpcodeTableCopyName. type OperationTableCopy struct { SrcTableIndex, DstTableIndex uint32 } // Kind implements Operation.Kind. -func (o *OperationTableCopy) Kind() OperationKind { +func (OperationTableCopy) Kind() OperationKind { return OperationKindTableCopy } -// OperationRefFunc corresponds to OpcodeRefFunc, and engines are expected to +// OperationRefFunc implements Operation. +// +// This corresponds to wasm.OpcodeRefFuncName, and engines are expected to // push the opaque pointer value of engine specific func for the given FunctionIndex. // -// OperationRefFunc implements Operation. +// Note: in wazero, we express any reference types (funcref or externref) as opaque pointers, which are uint64. +// Therefore, the engine implementations emit instructions to push the address of *function onto the stack.
type OperationRefFunc struct { FunctionIndex uint32 } // Kind implements Operation.Kind. -func (o *OperationRefFunc) Kind() OperationKind { +func (OperationRefFunc) Kind() OperationKind { return OperationKindRefFunc } // OperationTableGet implements Operation. +// +// This corresponds to wasm.OpcodeTableGetName. type OperationTableGet struct { TableIndex uint32 } // Kind implements Operation.Kind. -func (o *OperationTableGet) Kind() OperationKind { +func (OperationTableGet) Kind() OperationKind { return OperationKindTableGet } // OperationTableSet implements Operation. +// +// This corresponds to wasm.OpcodeTableSetName. type OperationTableSet struct { TableIndex uint32 } // Kind implements Operation.Kind. -func (o *OperationTableSet) Kind() OperationKind { +func (OperationTableSet) Kind() OperationKind { return OperationKindTableSet } // OperationTableSize implements Operation. +// +// This corresponds to wasm.OpcodeTableSizeName. type OperationTableSize struct { TableIndex uint32 } // Kind implements Operation.Kind. -func (o *OperationTableSize) Kind() OperationKind { +func (OperationTableSize) Kind() OperationKind { return OperationKindTableSize } // OperationTableGrow implements Operation. +// +// This corresponds to wasm.OpcodeTableGrowName. type OperationTableGrow struct { TableIndex uint32 } // Kind implements Operation.Kind. -func (o *OperationTableGrow) Kind() OperationKind { +func (OperationTableGrow) Kind() OperationKind { return OperationKindTableGrow } // OperationTableFill implements Operation. +// +// This corresponds to wasm.OpcodeTableFillName. type OperationTableFill struct { TableIndex uint32 } // Kind implements Operation.Kind. -func (o *OperationTableFill) Kind() OperationKind { +func (OperationTableFill) Kind() OperationKind { return OperationKindTableFill } @@ -1255,7 +1945,9 @@ type OperationV128Const struct { } // Kind implements Operation.Kind. -func (o *OperationV128Const) Kind() OperationKind { +// +// This corresponds to wasm.OpcodeVecV128Const. +func (OperationV128Const) Kind() OperationKind { return OperationKindV128Const } @@ -1291,68 +1983,85 @@ func shapeName(s Shape) (ret string) { } // OperationV128Add implements Operation. +// +// This corresponds to wasm.OpcodeVecI8x16AddName wasm.OpcodeVecI16x8AddName wasm.OpcodeVecI32x4AddName +// wasm.OpcodeVecI64x2AddName wasm.OpcodeVecF32x4AddName wasm.OpcodeVecF64x2AddName type OperationV128Add struct { Shape Shape } // Kind implements Operation.Kind. -func (o *OperationV128Add) Kind() OperationKind { +func (OperationV128Add) Kind() OperationKind { return OperationKindV128Add } // OperationV128Sub implements Operation. +// +// This corresponds to wasm.OpcodeVecI8x16SubName wasm.OpcodeVecI16x8SubName wasm.OpcodeVecI32x4SubName +// wasm.OpcodeVecI64x2SubName wasm.OpcodeVecF32x4SubName wasm.OpcodeVecF64x2SubName type OperationV128Sub struct { Shape Shape } // Kind implements Operation.Kind. -func (o *OperationV128Sub) Kind() OperationKind { +func (OperationV128Sub) Kind() OperationKind { return OperationKindV128Sub } -type LoadV128Type = byte +// V128LoadType represents a type of wasm.OpcodeVecV128Load* instructions. +type V128LoadType = byte const ( - // LoadV128Type128 corresponds to wasm.OpcodeVecV128LoadName. - LoadV128Type128 LoadV128Type = iota - // LoadV128Type8x8s corresponds to wasm.OpcodeVecV128Load8x8SName. - LoadV128Type8x8s - // LoadV128Type8x8u corresponds to wasm.OpcodeVecV128Load8x8UName. 
- LoadV128Type8x8u - // LoadV128Type16x4s corresponds to wasm.OpcodeVecV128Load16x4SName - LoadV128Type16x4s - // LoadV128Type16x4u corresponds to wasm.OpcodeVecV128Load16x4UName - LoadV128Type16x4u - // LoadV128Type32x2s corresponds to wasm.OpcodeVecV128Load32x2SName - LoadV128Type32x2s - // LoadV128Type32x2u corresponds to wasm.OpcodeVecV128Load32x2UName - LoadV128Type32x2u - // LoadV128Type8Splat corresponds to wasm.OpcodeVecV128Load8SplatName - LoadV128Type8Splat - // LoadV128Type16Splat corresponds to wasm.OpcodeVecV128Load16SplatName - LoadV128Type16Splat - // LoadV128Type32Splat corresponds to wasm.OpcodeVecV128Load32SplatName - LoadV128Type32Splat - // LoadV128Type64Splat corresponds to wasm.OpcodeVecV128Load64SplatName - LoadV128Type64Splat - // LoadV128Type32zero corresponds to wasm.OpcodeVecV128Load32zeroName - LoadV128Type32zero - // LoadV128Type64zero corresponds to wasm.OpcodeVecV128Load64zeroName - LoadV128Type64zero + // V128LoadType128 corresponds to wasm.OpcodeVecV128LoadName. + V128LoadType128 V128LoadType = iota + // V128LoadType8x8s corresponds to wasm.OpcodeVecV128Load8x8SName. + V128LoadType8x8s + // V128LoadType8x8u corresponds to wasm.OpcodeVecV128Load8x8UName. + V128LoadType8x8u + // V128LoadType16x4s corresponds to wasm.OpcodeVecV128Load16x4SName + V128LoadType16x4s + // V128LoadType16x4u corresponds to wasm.OpcodeVecV128Load16x4UName + V128LoadType16x4u + // V128LoadType32x2s corresponds to wasm.OpcodeVecV128Load32x2SName + V128LoadType32x2s + // V128LoadType32x2u corresponds to wasm.OpcodeVecV128Load32x2UName + V128LoadType32x2u + // V128LoadType8Splat corresponds to wasm.OpcodeVecV128Load8SplatName + V128LoadType8Splat + // V128LoadType16Splat corresponds to wasm.OpcodeVecV128Load16SplatName + V128LoadType16Splat + // V128LoadType32Splat corresponds to wasm.OpcodeVecV128Load32SplatName + V128LoadType32Splat + // V128LoadType64Splat corresponds to wasm.OpcodeVecV128Load64SplatName + V128LoadType64Splat + // V128LoadType32zero corresponds to wasm.OpcodeVecV128Load32zeroName + V128LoadType32zero + // V128LoadType64zero corresponds to wasm.OpcodeVecV128Load64zeroName + V128LoadType64zero ) // OperationV128Load implements Operation. +// +// This corresponds to +// wasm.OpcodeVecV128LoadName wasm.OpcodeVecV128Load8x8SName wasm.OpcodeVecV128Load8x8UName +// wasm.OpcodeVecV128Load16x4SName wasm.OpcodeVecV128Load16x4UName wasm.OpcodeVecV128Load32x2SName +// wasm.OpcodeVecV128Load32x2UName wasm.OpcodeVecV128Load8SplatName wasm.OpcodeVecV128Load16SplatName +// wasm.OpcodeVecV128Load32SplatName wasm.OpcodeVecV128Load64SplatName wasm.OpcodeVecV128Load32zeroName +// wasm.OpcodeVecV128Load64zeroName type OperationV128Load struct { - Type LoadV128Type + Type V128LoadType Arg *MemoryArg } // Kind implements Operation.Kind. -func (o *OperationV128Load) Kind() OperationKind { +func (OperationV128Load) Kind() OperationKind { return OperationKindV128Load } // OperationV128LoadLane implements Operation. +// +// This corresponds to wasm.OpcodeVecV128Load8LaneName wasm.OpcodeVecV128Load16LaneName +// wasm.OpcodeVecV128Load32LaneName wasm.OpcodeVecV128Load64LaneName. type OperationV128LoadLane struct { // LaneIndex is >=0 && <(128/LaneSize). LaneIndex byte @@ -1362,21 +2071,27 @@ type OperationV128LoadLane struct { } // Kind implements Operation.Kind. -func (o *OperationV128LoadLane) Kind() OperationKind { +func (OperationV128LoadLane) Kind() OperationKind { return OperationKindV128LoadLane } // OperationV128Store implements Operation. 
+// +// This corresponds to wasm.OpcodeVecV128StoreName. type OperationV128Store struct { Arg *MemoryArg } // Kind implements Operation.Kind. -func (o *OperationV128Store) Kind() OperationKind { +func (OperationV128Store) Kind() OperationKind { return OperationKindV128Store } // OperationV128StoreLane implements Operation. +// +// This corresponds to wasm.OpcodeVecV128Store8LaneName wasm.OpcodeVecV128Store16LaneName +// wasm.OpcodeVecV128Store32LaneName wasm.OpcodeVecV128Store64LaneName. type OperationV128StoreLane struct { // LaneIndex is >=0 && <(128/LaneSize). LaneIndex byte @@ -1386,11 +2101,17 @@ type OperationV128StoreLane struct { } // Kind implements Operation.Kind. -func (o *OperationV128StoreLane) Kind() OperationKind { +func (OperationV128StoreLane) Kind() OperationKind { return OperationKindV128StoreLane } // OperationV128ExtractLane implements Operation. +// +// This corresponds to +// wasm.OpcodeVecI8x16ExtractLaneSName wasm.OpcodeVecI8x16ExtractLaneUName +// wasm.OpcodeVecI16x8ExtractLaneSName wasm.OpcodeVecI16x8ExtractLaneUName +// wasm.OpcodeVecI32x4ExtractLaneName wasm.OpcodeVecI64x2ExtractLaneName +// wasm.OpcodeVecF32x4ExtractLaneName wasm.OpcodeVecF64x2ExtractLaneName. type OperationV128ExtractLane struct { // LaneIndex is >=0 && <M where shape = NxM.