JCSDA · DavidHuber-NOAA · Nov 26, 2024 · Nov 26, 2024 · Nov 26, 2024 · Nov 27, 2024
diff --git a/var/spack/repos/builtin/packages/py-numpy/intel_mask_conversion.patch b/var/spack/repos/builtin/packages/py-numpy/intel_mask_conversion.patch
@@ -0,0 +1,52 @@
+--- numpy-1.25.0/numpy/core/src/common/simd/avx512/conversion.h.org	2021-02-04 17:38:42.000000000 +0900
++++ numpy-1.25.0/numpy/core/src/common/simd/avx512/conversion.h	2021-02-04 18:29:21.000000000 +0900
+@@ -131,20 +131,44 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
+     __mmask16 gh = _mm512_kunpackb((__mmask16)h, (__mmask16)g);
+     return npyv_pack_b8_b32(ab, cd, ef, gh);
+ }
+-
++/*
++ * A compiler bug workaround on Intel Compiler Classic.
++ * The bug manifests specifically when the
++ * scalar result of _cvtmask64_u64 is compared against the constant -1. This
++ * comparison uniquely triggers a bug under conditions of equality (==) or
++ * inequality (!=) checks, which are typically used in reduction operations like
++ * np.logical_or.
++ *
++ * The underlying issue arises from the compiler's optimizer. When the last
++ * vector comparison instruction operates on zmm, the optimizer erroneously
++ * emits a duplicate of this instruction but on the lower half register ymm. It
++ * then performs a bitwise XOR operation between the mask produced by this
++ * duplicated instruction and the mask from the original comparison instruction.
++ * This erroneous behavior leads to incorrect results.
++ *
++ * See https://github.com/numpy/numpy/issues/26197#issuecomment-2056750975
++ */
++#ifdef __INTEL_COMPILER
++#define NPYV__VOLATILE_CVTMASK64 volatile
++#else
++#define NPYV__VOLATILE_CVTMASK64
++#endif
+ // convert boolean vectors to integer bitfield
+-NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+-{
++NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a) {
+ #ifdef NPY_HAVE_AVX512BW_MASK
+-    return (npy_uint64)_cvtmask64_u64(a);
++    npy_uint64 NPYV__VOLATILE_CVTMASK64 t = (npy_uint64)_cvtmask64_u64(a);
++    return t;
+ #elif defined(NPY_HAVE_AVX512BW)
+-    return (npy_uint64)a;
++    npy_uint64 NPYV__VOLATILE_CVTMASK64 t = (npy_uint64)a;
++    return t;
+ #else
+     int mask_lo = _mm256_movemask_epi8(npyv512_lower_si256(a));
+     int mask_hi = _mm256_movemask_epi8(npyv512_higher_si256(a));
+     return (unsigned)mask_lo | ((npy_uint64)(unsigned)mask_hi << 32);
+ #endif
+ }
++#undef NPYV__VOLATILE_CVTMASK64
++
+ NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+ {
+ #ifdef NPY_HAVE_AVX512BW_MASK
diff --git a/var/spack/repos/builtin/packages/py-numpy/package.py b/var/spack/repos/builtin/packages/py-numpy/package.py
@@ -139,6 +139,8 @@ class PyNumpy(PythonPackage):
     patch("check_executables2.patch", when="@1.19.0:1.19.5")
     patch("check_executables3.patch", when="@1.16.0:1.18.5")
 
+    patch("intel_mask_conversion.patch", when="@1.25.0:1.26.5")
+
     # Backport bug fix for f2py's define for threading when building with Mingw
     patch(
         "https://github.com/numpy/numpy/pull/20881.patch?full_index=1",
@@ -281,21 +283,45 @@ def blas_lapack_pkg_config(self) -> Tuple[str, str]:
 
         return blas, lapack
 
-    @when("@1.26:")
+    @when("@1.25:")
+    def config_settings(self, spec, prefix):
+        """Return the config-settings dict: build dir, compile jobs and setup-args."""
+        settings = {"builddir": "build", "compile-args": f"-j{make_jobs}"}
+
+        if self.spec.satisfies("@1.26:"):
+            settings.update(self.blas_config_settings())
+
+        # Disable AVX512 features for Intel Classic compilers
+        # https://numpy.org/doc/stable/reference/simd/build-options.html
+        # https://github.com/numpy/numpy/issues/27840
+        # https://github.com/matplotlib/matplotlib/issues/28762
+        archs = ("x86_64_v4:", "cannonlake:", "mic_knl")
+        if any(self.spec.satisfies(f"target={arch} %intel") for arch in archs):
+            # Merge into the existing "setup-args" instead of replacing the whole
+            # dict, so the -Dblas/-Dlapack entries added above for @1.26: survive.
+            setup_args = settings.setdefault("setup-args", {})
+            no_avx512 = (
+                "MAX -AVX512F -AVX512CD -AVX512_KNL -AVX512_KNM -AVX512_SKX "
+                "-AVX512_CLX -AVX512_CNL -AVX512_ICL -AVX512_SPR"
+            )
+            # Append to any dispatch value already configured rather than dropping it.
+            dispatch = setup_args.get("-Dcpu-dispatch", "")
+            setup_args["-Dcpu-dispatch"] = f"{dispatch} {no_avx512}".strip()
+
+        return settings
+
+    def blas_config_settings(self):
         blas, lapack = self.blas_lapack_pkg_config()
         return {
-            "builddir": "build",
-            "compile-args": f"-j{make_jobs}",
             "setup-args": {
                 # https://scipy.github.io/devdocs/building/blas_lapack.html
                 "-Dblas": blas,
                 "-Dlapack": lapack,
                 # https://numpy.org/doc/stable/reference/simd/build-options.html
                 # TODO: get this working in CI
                 # "-Dcpu-baseline": "native",
-                # "-Dcpu-dispatch": "none",
-            },
+                # "-Dcpu-dispatch": "none",
+            }
         }
 
     def blas_lapack_site_cfg(self) -> None: