Adding more SIMD constant folding support (#82190)
* Adding more SIMD constant folding support

* Adding tests for the new SIMD constant folding paths

* Ensure bitcasting float/double is using well-defined behavior
tannergooding authored Feb 23, 2023
1 parent 2ffe0fe commit 7219954
Showing 6 changed files with 1,687 additions and 100 deletions.
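
The third commit-message bullet, about well-defined bitcasting, refers to reinterpreting floating-point bits through memcpy rather than pointer type-punning. A minimal standalone sketch of that pattern (the function name here is illustrative, not part of the change):

#include <cstdint>
#include <cstring>

// Reinterpret a float's bits as an integer. memcpy between objects of the
// same size is well-defined in C++, unlike *(uint32_t*)&value or a union read.
static uint32_t FloatToBitsSketch(float value)
{
    uint32_t result;
    std::memcpy(&result, &value, sizeof(result));
    return result;
}
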
172 changes: 170 additions & 2 deletions src/coreclr/jit/simd.h
@@ -149,6 +149,39 @@ struct simd32_t
}
};

template <typename TBase>
TBase EvaluateUnaryScalarSpecialized(genTreeOps oper, TBase arg0)
{
switch (oper)
{
case GT_NOT:
{
return ~arg0;
}

default:
{
unreached();
}
}
}

template <>
inline float EvaluateUnaryScalarSpecialized<float>(genTreeOps oper, float arg0)
{
uint32_t arg0Bits = BitOperations::SingleToUInt32Bits(arg0);
uint32_t resultBits = EvaluateUnaryScalarSpecialized<uint32_t>(oper, arg0Bits);
return BitOperations::UInt32BitsToSingle(resultBits);
}

template <>
inline double EvaluateUnaryScalarSpecialized<double>(genTreeOps oper, double arg0)
{
uint64_t arg0Bits = BitOperations::DoubleToUInt64Bits(arg0);
uint64_t resultBits = EvaluateUnaryScalarSpecialized<uint64_t>(oper, arg0Bits);
return BitOperations::UInt64BitsToDouble(resultBits);
}

template <typename TBase>
TBase EvaluateUnaryScalar(genTreeOps oper, TBase arg0)
{
@@ -161,7 +194,7 @@ TBase EvaluateUnaryScalar(genTreeOps oper, TBase arg0)

default:
{
unreached();
return EvaluateUnaryScalarSpecialized<TBase>(oper, arg0);
}
}
}
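
The float and double specializations above fold a bitwise operator by round-tripping through the value's integer bits. A standalone analogue of what folding GT_NOT over a float element amounts to (illustrative name, outside the JIT):

#include <cstdint>
#include <cstring>

// Standalone analogue: apply bitwise NOT to a float's underlying bits and
// reinterpret the result as a float again.
static float NotFloatSketch(float arg0)
{
    uint32_t bits;
    std::memcpy(&bits, &arg0, sizeof(bits));
    bits = ~bits;
    float result;
    std::memcpy(&result, &bits, sizeof(result));
    return result;
}
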
@@ -268,6 +301,119 @@ void EvaluateUnarySimd(genTreeOps oper, bool scalar, var_types baseType, TSimd*
}
}

template <typename TBase>
TBase EvaluateBinaryScalarRSZ(TBase arg0, TBase arg1)
{
return arg0 >> (arg1 & ((sizeof(TBase) * 8) - 1));
}

template <>
inline int8_t EvaluateBinaryScalarRSZ<int8_t>(int8_t arg0, int8_t arg1)
{
uint8_t arg0Bits = static_cast<uint8_t>(arg0);
uint8_t arg1Bits = static_cast<uint8_t>(arg1);

uint8_t resultBits = EvaluateBinaryScalarRSZ<uint8_t>(arg0Bits, arg1Bits);
return static_cast<int8_t>(resultBits);
}

template <>
inline int16_t EvaluateBinaryScalarRSZ<int16_t>(int16_t arg0, int16_t arg1)
{
uint16_t arg0Bits = static_cast<uint16_t>(arg0);
uint16_t arg1Bits = static_cast<uint16_t>(arg1);

uint16_t resultBits = EvaluateBinaryScalarRSZ<uint16_t>(arg0Bits, arg1Bits);
return static_cast<int16_t>(resultBits);
}

template <>
inline int32_t EvaluateBinaryScalarRSZ<int32_t>(int32_t arg0, int32_t arg1)
{
uint32_t arg0Bits = static_cast<uint32_t>(arg0);
uint32_t arg1Bits = static_cast<uint32_t>(arg1);

uint32_t resultBits = EvaluateBinaryScalarRSZ<uint32_t>(arg0Bits, arg1Bits);
return static_cast<int32_t>(resultBits);
}

template <>
inline int64_t EvaluateBinaryScalarRSZ<int64_t>(int64_t arg0, int64_t arg1)
{
uint64_t arg0Bits = static_cast<uint64_t>(arg0);
uint64_t arg1Bits = static_cast<uint64_t>(arg1);

uint64_t resultBits = EvaluateBinaryScalarRSZ<uint64_t>(arg0Bits, arg1Bits);
return static_cast<int64_t>(resultBits);
}
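
The signed specializations above exist because `>>` on a signed operand in C++ is an arithmetic (sign-extending) shift, while RSZ must shift in zero bits; casting to the unsigned counterpart forces the zero fill. A small contrast (illustrative, not JIT code):

#include <cstdint>

// For int8_t, -2 has the bit pattern 0xFE. The arithmetic shift sign-extends
// (here -1, 0xFF); the logical shift used for RSZ gives 127 (0x7F).
static void ShiftContrastSketch()
{
    int8_t value      = -2;
    int8_t arithmetic = static_cast<int8_t>(value >> 1);
    int8_t logical    = static_cast<int8_t>(static_cast<uint8_t>(value) >> 1);
    (void)arithmetic; // -1
    (void)logical;    // 127
}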

template <typename TBase>
TBase EvaluateBinaryScalarSpecialized(genTreeOps oper, TBase arg0, TBase arg1)
{
switch (oper)
{
case GT_AND:
{
return arg0 & arg1;
}

case GT_AND_NOT:
{
return arg0 & ~arg1;
}

case GT_LSH:
{
return arg0 << (arg1 & ((sizeof(TBase) * 8) - 1));
}

case GT_OR:
{
return arg0 | arg1;
}

case GT_RSH:
{
return arg0 >> (arg1 & ((sizeof(TBase) * 8) - 1));
}

case GT_RSZ:
{
return EvaluateBinaryScalarRSZ<TBase>(arg0, arg1);
}

case GT_XOR:
{
return arg0 ^ arg1;
}

default:
{
unreached();
}
}
}

template <>
inline float EvaluateBinaryScalarSpecialized<float>(genTreeOps oper, float arg0, float arg1)
{
uint32_t arg0Bits = BitOperations::SingleToUInt32Bits(arg0);
uint32_t arg1Bits = BitOperations::SingleToUInt32Bits(arg1);

uint32_t resultBits = EvaluateBinaryScalarSpecialized<uint32_t>(oper, arg0Bits, arg1Bits);
return BitOperations::UInt32BitsToSingle(resultBits);
}

template <>
inline double EvaluateBinaryScalarSpecialized<double>(genTreeOps oper, double arg0, double arg1)
{
uint64_t arg0Bits = BitOperations::DoubleToUInt64Bits(arg0);
uint64_t arg1Bits = BitOperations::DoubleToUInt64Bits(arg1);

uint64_t resultBits = EvaluateBinaryScalarSpecialized<uint64_t>(oper, arg0Bits, arg1Bits);
return BitOperations::UInt64BitsToDouble(resultBits);
}
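
Routing GT_AND, GT_OR, and GT_XOR for float and double through their bits matches how bitwise vector instructions treat floating-point lanes; clearing the sign bit, for instance, yields the absolute value. A standalone sketch of that idea (illustrative name):

#include <cstdint>
#include <cstring>

// AND-ing a float's bits with 0x7FFFFFFF clears the sign bit, the usual way a
// vectorized fabs is expressed; the specialization above lets such an
// operation fold element by element.
static float AbsViaAndSketch(float arg0)
{
    uint32_t bits;
    std::memcpy(&bits, &arg0, sizeof(bits));
    bits &= UINT32_C(0x7FFFFFFF);
    float result;
    std::memcpy(&result, &bits, sizeof(result));
    return result;
}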

template <typename TBase>
TBase EvaluateBinaryScalar(genTreeOps oper, TBase arg0, TBase arg1)
{
@@ -278,14 +424,24 @@ TBase EvaluateBinaryScalar(genTreeOps oper, TBase arg0, TBase arg1)
return arg0 + arg1;
}

case GT_DIV:
{
return arg0 / arg1;
}

case GT_MUL:
{
return arg0 * arg1;
}

case GT_SUB:
{
return arg0 - arg1;
}

default:
{
unreached();
return EvaluateBinaryScalarSpecialized<TBase>(oper, arg0, arg1);
}
}
}
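
Note that the GT_LSH and GT_RSH cases mask the shift amount with `(sizeof(TBase) * 8) - 1`, so a count at or beyond the element width wraps instead of invoking undefined behavior in C++. A quick illustration (illustrative name):

#include <cstdint>

// Shifting a 32-bit value by 33 would be undefined behavior in C++; masking
// the count with 31 turns it into a shift by 1, mirroring the folding above.
static uint32_t MaskedShiftSketch(uint32_t arg0, uint32_t arg1)
{
    return arg0 << (arg1 & ((sizeof(uint32_t) * 8) - 1));
}
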
@@ -395,6 +551,18 @@ void EvaluateBinarySimd(genTreeOps oper, bool scalar, var_types baseType, TSimd*
}
}

template <typename TSimd, typename TBase>
void BroadcastConstantToSimd(TSimd* result, TBase arg0)
{
uint32_t count = sizeof(TSimd) / sizeof(TBase);

for (uint32_t i = 0; i < count; i++)
{
// Safely execute `result[i] = arg0`
memcpy(&result->u8[i * sizeof(TBase)], &arg0, sizeof(TBase));
}
}
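
BroadcastConstantToSimd writes the constant into every lane through the SIMD type's raw byte view (`u8`), again via memcpy. A hedged usage sketch with a stand-in 16-byte vector type (the struct and names are illustrative, not the JIT's simd16_t):

#include <cstdint>
#include <cstring>

struct Simd16Sketch
{
    uint8_t u8[16]; // raw byte view, mirroring the u8 member of the JIT simd types
};

template <typename TSimd, typename TBase>
static void BroadcastSketch(TSimd* result, TBase arg0)
{
    uint32_t count = sizeof(TSimd) / sizeof(TBase);

    for (uint32_t i = 0; i < count; i++)
    {
        std::memcpy(&result->u8[i * sizeof(TBase)], &arg0, sizeof(TBase));
    }
}

// Usage: Simd16Sketch v; BroadcastSketch<Simd16Sketch, uint32_t>(&v, 0x2Au);
// fills all four 32-bit lanes with 0x2A.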

#ifdef FEATURE_SIMD

#ifdef TARGET_XARCH
64 changes: 64 additions & 0 deletions src/coreclr/jit/utils.cpp
@@ -2645,6 +2645,22 @@ uint32_t BitOperations::BitScanReverse(uint64_t value)
#endif
}

//------------------------------------------------------------------------
// BitOperations::DoubleToUInt64Bits: Gets the underlying bits for a double-precision floating-point value.
//
// Arguments:
// value - The number to convert
//
// Return Value:
// The underlying bits for value.
//
uint64_t BitOperations::DoubleToUInt64Bits(double value)
{
uint64_t result;
memcpy(&result, &value, sizeof(double));
return result;
}

//------------------------------------------------------------------------
// BitOperations::LeadingZeroCount: Count the number of leading zero bits in a mask.
//
@@ -2932,6 +2948,22 @@ uint64_t BitOperations::RotateRight(uint64_t value, uint32_t offset)
return (value >> (offset & 0x3F)) | (value << ((64 - offset) & 0x3F));
}

//------------------------------------------------------------------------
// BitOperations::SingleToUInt32Bits: Gets the underlying bits for a single-precision floating-point value.
//
// Arguments:
// value - The number to convert
//
// Return Value:
// The underlying bits for value.
//
uint32_t BitOperations::SingleToUInt32Bits(float value)
{
uint32_t result;
memcpy(&result, &value, sizeof(float));
return result;
}

//------------------------------------------------------------------------
// BitOperations::TrailingZeroCount: Count the number of trailing zero bits in an integer value.
//
@@ -2980,6 +3012,38 @@ uint32_t BitOperations::TrailingZeroCount(uint64_t value)
#endif
}

//------------------------------------------------------------------------
// BitOperations::UInt32BitsToSingle: Gets a single-precision floating-point value from its underlying bits.
//
// Arguments:
// value - The underlying bit value.
//
// Return Value:
// The single-precision floating-point value represented by value's bits.
//
float BitOperations::UInt32BitsToSingle(uint32_t value)
{
float result;
memcpy(&result, &value, sizeof(uint32_t));
return result;
}

//------------------------------------------------------------------------
// BitOperations::UInt64BitsToDouble: Gets a double-precision floating-point value from its underlying bits.
//
// Arguments:
// value - The underlying bit value.
//
// Return Value:
// The double-precision floating-point value represented by value's bits.
//
double BitOperations::UInt64BitsToDouble(uint64_t value)
{
double result;
memcpy(&result, &value, sizeof(uint64_t));
return result;
}
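
The four bit-conversion helpers are exact inverses, so a value round-trips losslessly through its bit representation, including NaN payloads. A small round-trip sketch using the same memcpy pattern (standalone, not calling the JIT's BitOperations):

#include <cassert>
#include <cstdint>
#include <cstring>

// Round-trip a double through its bits; the bit patterns compare equal even
// for NaNs, where comparing the doubles themselves with == would be false.
static void RoundTripSketch(double value)
{
    uint64_t bits;
    std::memcpy(&bits, &value, sizeof(bits));

    double back;
    std::memcpy(&back, &bits, sizeof(back));

    uint64_t bitsAgain;
    std::memcpy(&bitsAgain, &back, sizeof(bitsAgain));
    assert(bits == bitsAgain);
}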

namespace MagicDivide
{
template <int TableBase = 0, int TableSize, typename Magic>
8 changes: 8 additions & 0 deletions src/coreclr/jit/utils.h
@@ -751,6 +751,8 @@ class BitOperations

static uint32_t BitScanReverse(uint64_t value);

static uint64_t DoubleToUInt64Bits(double value);

static uint32_t LeadingZeroCount(uint32_t value);

static uint32_t LeadingZeroCount(uint64_t value);
@@ -775,9 +777,15 @@

static uint64_t RotateRight(uint64_t value, uint32_t offset);

static uint32_t SingleToUInt32Bits(float value);

static uint32_t TrailingZeroCount(uint32_t value);

static uint32_t TrailingZeroCount(uint64_t value);

static float UInt32BitsToSingle(uint32_t value);

static double UInt64BitsToDouble(uint64_t value);
};

// The CLR requires that critical section locks be initialized via its ClrCreateCriticalSection API...but