From 77b8cd0fa8cfb249208685efa4fa716ea4142201 Mon Sep 17 00:00:00 2001
From: Takeshi Yoneda
Date: Mon, 20 Jun 2022 17:36:31 +0900
Subject: [PATCH 1/3] arm64: implement dot

Signed-off-by: Takeshi Yoneda
---
 internal/engine/compiler/compiler_vec_test.go |  5 ---
 internal/engine/compiler/impl_vec_arm64.go    | 31 +++++++++++++++++--
 .../integration_test/spectest/v2/spec_test.go |  3 +-
 3 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/internal/engine/compiler/compiler_vec_test.go b/internal/engine/compiler/compiler_vec_test.go
index c4801c5f52..1542311177 100644
--- a/internal/engine/compiler/compiler_vec_test.go
+++ b/internal/engine/compiler/compiler_vec_test.go
@@ -7084,11 +7084,6 @@ func TestCompiler_compileV128FConvertFromI(t *testing.T) {
 }
 
 func TestCompiler_compileV128Dot(t *testing.T) {
-	if runtime.GOARCH != "amd64" {
-		// TODO: implement on amd64.
-		t.Skip()
-	}
-
 	tests := []struct {
 		name        string
 		x1, x2, exp [16]byte
diff --git a/internal/engine/compiler/impl_vec_arm64.go b/internal/engine/compiler/impl_vec_arm64.go
index e873c6fad0..da69086c1f 100644
--- a/internal/engine/compiler/impl_vec_arm64.go
+++ b/internal/engine/compiler/impl_vec_arm64.go
@@ -1356,8 +1356,35 @@ func (c *arm64Compiler) compileV128FConvertFromI(o *wazeroir.OperationV128FConve
 }
 
 // compileV128Dot implements compiler.compileV128Dot for arm64.
-func (c *arm64Compiler) compileV128Dot(o *wazeroir.OperationV128Dot) error {
-	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+func (c *arm64Compiler) compileV128Dot(*wazeroir.OperationV128Dot) error {
+	x2 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x2); err != nil {
+		return err
+	}
+
+	x1 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x1); err != nil {
+		return err
+	}
+
+	tmp, err := c.allocateRegister(registerTypeVector)
+	if err != nil {
+		return err
+	}
+
+	x1r, x2r := x1.register, x2.register
+
+	// Multiply lower integers and get the 32-bit results into tmp.
+	c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.SMULL, x1r, x2r, tmp, arm64.VectorArrangement4H)
+	// Multiply higher integers and get the 32-bit results into x1r.
+	c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.SMULL2, x1r, x2r, x1r, arm64.VectorArrangement8H)
+	// Adds these two results into x1r.
+	c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.VADDP, x1r, tmp, x1r, arm64.VectorArrangement4S)
+
+	c.markRegisterUnused(x2r)
+	c.pushVectorRuntimeValueLocationOnRegister(x1r)
+
+	return nil
 }
 
 // compileV128Narrow implements compiler.compileV128Narrow for arm64.
diff --git a/internal/integration_test/spectest/v2/spec_test.go b/internal/integration_test/spectest/v2/spec_test.go
index 8aa41a3af5..a949e40e6a 100644
--- a/internal/integration_test/spectest/v2/spec_test.go
+++ b/internal/integration_test/spectest/v2/spec_test.go
@@ -27,8 +27,7 @@ func TestCompiler(t *testing.T) {
 	spectest.Run(t, testcases, compiler.NewEngine, enabledFeatures, func(jsonname string) bool {
 		switch path.Base(jsonname) {
 		case "simd_f64x2_pmin_pmax.json", "simd_f32x4_pmin_pmax.json",
-			"simd_i32x4_dot_i16x8.json", "simd_splat.json", "simd_load.json",
-			"simd_conversions.json":
+			"simd_splat.json", "simd_load.json", "simd_conversions.json":
 			// TODO: implement on arm64.
 			return runtime.GOARCH == "amd64"
 		default:
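For context on the lowering above: `i32x4.dot_i16x8_s` is implemented with three instructions. SMULL widening-multiplies the low four 16-bit lane pairs into 32-bit products, SMULL2 does the same for the high four, and ADDP adds adjacent products pairwise, so output lane i holds x1[2i]*x2[2i] + x1[2i+1]*x2[2i+1]. A minimal scalar sketch of the same semantics, not part of the patch (the helper name is illustrative):

```go
package main

import "fmt"

// dotI16x8S mirrors the SMULL/SMULL2+ADDP lowering of i32x4.dot_i16x8_s.
func dotI16x8S(x1, x2 [8]int16) (out [4]int32) {
	var lo, hi [4]int32
	// SMULL: widening multiply of the lower four 16-bit lanes.
	for i := 0; i < 4; i++ {
		lo[i] = int32(x1[i]) * int32(x2[i])
	}
	// SMULL2: widening multiply of the upper four 16-bit lanes.
	for i := 0; i < 4; i++ {
		hi[i] = int32(x1[i+4]) * int32(x2[i+4])
	}
	// ADDP: pairwise addition; pairs within lo fill the low half of the
	// result vector, pairs within hi fill the high half.
	out[0], out[1] = lo[0]+lo[1], lo[2]+lo[3]
	out[2], out[3] = hi[0]+hi[1], hi[2]+hi[3]
	return
}

func main() {
	x1 := [8]int16{1, 2, 3, 4, 5, 6, 7, 8}
	x2 := [8]int16{1, 1, 1, 1, 1, 1, 1, 1}
	fmt.Println(dotI16x8S(x1, x2)) // [3 7 11 15]
}
```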
From b900be25fde8613e8fe83b080c6ff82ca862d050 Mon Sep 17 00:00:00 2001
From: Takeshi Yoneda
Date: Mon, 20 Jun 2022 17:45:32 +0900
Subject: [PATCH 2/3] arm64: implement pseudo max/min

Signed-off-by: Takeshi Yoneda
---
 internal/engine/compiler/compiler_vec_test.go |  5 ---
 internal/engine/compiler/impl_vec_arm64.go    | 35 +++++++++++++++++--
 .../integration_test/spectest/v2/spec_test.go |  3 +-
 3 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/internal/engine/compiler/compiler_vec_test.go b/internal/engine/compiler/compiler_vec_test.go
index 1542311177..310f925112 100644
--- a/internal/engine/compiler/compiler_vec_test.go
+++ b/internal/engine/compiler/compiler_vec_test.go
@@ -4833,11 +4833,6 @@ func TestCompiler_compileV128Round(t *testing.T) {
 }
 
 func TestCompiler_compileV128_Pmax_Pmin(t *testing.T) {
-	if runtime.GOARCH != "amd64" {
-		// TODO: implement on amd64.
-		t.Skip()
-	}
-
 	tests := []struct {
 		name  string
 		shape wazeroir.Shape
diff --git a/internal/engine/compiler/impl_vec_arm64.go b/internal/engine/compiler/impl_vec_arm64.go
index da69086c1f..9829c18ae8 100644
--- a/internal/engine/compiler/impl_vec_arm64.go
+++ b/internal/engine/compiler/impl_vec_arm64.go
@@ -1206,12 +1206,43 @@ func (c *arm64Compiler) compileV128AvgrU(o *wazeroir.OperationV128AvgrU) error {
 
 // compileV128Pmin implements compiler.compileV128Pmin for arm64.
 func (c *arm64Compiler) compileV128Pmin(o *wazeroir.OperationV128Pmin) error {
-	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+	return c.compileV128PseudoMinOrMax(defaultArrangementForShape(o.Shape), false)
 }
 
 // compileV128Pmax implements compiler.compileV128Pmax for arm64.
 func (c *arm64Compiler) compileV128Pmax(o *wazeroir.OperationV128Pmax) error {
-	return fmt.Errorf("TODO: %s is not implemented yet on arm64 compiler", o.Kind())
+	return c.compileV128PseudoMinOrMax(defaultArrangementForShape(o.Shape), true)
+}
+
+// compileV128PseudoMinOrMax implements compileV128Pmax and compileV128Pmin.
+func (c *arm64Compiler) compileV128PseudoMinOrMax(arr arm64.VectorArrangement, max bool) error {
+	x2 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x2); err != nil {
+		return err
+	}
+
+	x1 := c.locationStack.popV128()
+	if err := c.compileEnsureOnGeneralPurposeRegister(x1); err != nil {
+		return err
+	}
+
+	result, err := c.allocateRegister(registerTypeVector)
+	if err != nil {
+		return err
+	}
+
+	x1r, x2r := x1.register, x2.register
+
+	if max {
+		c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.FCMGT, x1r, x2r, result, arr)
+	} else {
+		c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.FCMGT, x2r, x1r, result, arr)
+	}
+	c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.BSL, x1r, x2r, result, arm64.VectorArrangement16B)
+
+	c.markRegisterUnused(x1r, x2r)
+	c.pushVectorRuntimeValueLocationOnRegister(result)
+	return nil
 }
 
 // compileV128Ceil implements compiler.compileV128Ceil for arm64.
diff --git a/internal/integration_test/spectest/v2/spec_test.go b/internal/integration_test/spectest/v2/spec_test.go
index a949e40e6a..c46aa589f4 100644
--- a/internal/integration_test/spectest/v2/spec_test.go
+++ b/internal/integration_test/spectest/v2/spec_test.go
@@ -26,8 +26,7 @@ func TestCompiler(t *testing.T) {
 
 	spectest.Run(t, testcases, compiler.NewEngine, enabledFeatures, func(jsonname string) bool {
 		switch path.Base(jsonname) {
-		case "simd_f64x2_pmin_pmax.json", "simd_f32x4_pmin_pmax.json",
-			"simd_splat.json", "simd_load.json", "simd_conversions.json":
+		case "simd_splat.json", "simd_load.json", "simd_conversions.json":
 			// TODO: implement on arm64.
 			return runtime.GOARCH == "amd64"
 		default:
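The shared helper above compiles both opcodes to a single compare plus a bitwise select: FCMGT writes an all-ones or all-zeros mask per lane, swapping the FCMGT operands is what distinguishes pmin from pmax, and BSL then merges x1 and x2 under that mask. A scalar sketch of the per-lane semantics being implemented, following the Wasm spec definition of the pseudo ops (illustrative only, not wazero code):

```go
package main

import "fmt"

// pmax is Wasm's pseudo-maximum: x1 < x2 ? x2 : x1. Unlike IEEE max it does
// not canonicalize NaN or order ±0; a single comparison decides the select,
// which is why one FCMGT plus one BSL suffices per lane.
func pmax(x1, x2 float32) float32 {
	if x1 < x2 {
		return x2
	}
	return x1
}

// pmin is Wasm's pseudo-minimum: x2 < x1 ? x2 : x1.
func pmin(x1, x2 float32) float32 {
	if x2 < x1 {
		return x2
	}
	return x1
}

func main() {
	fmt.Println(pmax(1, 2), pmin(1, 2)) // 2 1
}
```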
From 0449eb465c9627644a23d3be16081692a1dfefe4 Mon Sep 17 00:00:00 2001
From: Takeshi Yoneda
Date: Mon, 20 Jun 2022 17:49:53 +0900
Subject: [PATCH 3/3] comment

Signed-off-by: Takeshi Yoneda
---
 internal/engine/compiler/impl_vec_arm64.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/internal/engine/compiler/impl_vec_arm64.go b/internal/engine/compiler/impl_vec_arm64.go
index 9829c18ae8..6c5093db0f 100644
--- a/internal/engine/compiler/impl_vec_arm64.go
+++ b/internal/engine/compiler/impl_vec_arm64.go
@@ -1233,11 +1233,13 @@ func (c *arm64Compiler) compileV128PseudoMinOrMax(arr arm64.VectorArrangement, m
 
 	x1r, x2r := x1.register, x2.register
 
+	// Set all bits in each lane if x1r's lane satisfies the condition (min or max); zero them otherwise.
 	if max {
 		c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.FCMGT, x1r, x2r, result, arr)
 	} else {
 		c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.FCMGT, x2r, x1r, result, arr)
 	}
+	// Select each bit of x1r or x2r based on the comparison mask computed above.
 	c.assembler.CompileTwoVectorRegistersToVectorRegister(arm64.BSL, x1r, x2r, result, arm64.VectorArrangement16B)
 
 	c.markRegisterUnused(x1r, x2r)
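The comments added above describe a mask-then-select pattern. Assuming the AArch64 definition of BSL (bitwise select: Vd = (Vd AND Vn) OR (NOT(Vd) AND Vm), with the destination pre-loaded with the FCMGT mask), the bit-level behavior looks like the following sketch; since FCMGT lanes are all-ones or all-zeros, selecting bits selects whole lanes (names are illustrative, not wazero code):

```go
package main

import "fmt"

// bsl takes bits from n where mask bits are set and from m elsewhere,
// matching BSL when the comparison mask is pre-loaded in the destination.
func bsl(mask, n, m uint64) uint64 {
	return (mask & n) | (^mask & m)
}

func main() {
	allOnes := ^uint64(0) // an FCMGT lane where the condition held
	fmt.Printf("%#x\n", bsl(allOnes, 0x1111, 0x2222)) // 0x1111: first source selected
	fmt.Printf("%#x\n", bsl(0, 0x1111, 0x2222))       // 0x2222: second source selected
}
```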