Skip to content

Commit

Permalink
[X86][AVX10.2] Support AVX10.2 VNNI FP16/INT8/INT16 new instructions (l…
Browse files Browse the repository at this point in the history
  • Loading branch information
phoebewang authored and banach-space committed Aug 7, 2024
1 parent ca56835 commit aef5d6a
Show file tree
Hide file tree
Showing 28 changed files with 8,815 additions and 251 deletions.
69 changes: 45 additions & 24 deletions clang/include/clang/Basic/BuiltinsX86.def
Original file line number Diff line number Diff line change
Expand Up @@ -773,18 +773,18 @@ TARGET_BUILTIN(__builtin_ia32_vpdpwssds256, "V8iV8iV8iV8i", "ncV:256:", "avx512v
TARGET_BUILTIN(__builtin_ia32_vpdpwssds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni,evex512")

// AVX-VNNI-INT8
TARGET_BUILTIN(__builtin_ia32_vpdpbssd128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
TARGET_BUILTIN(__builtin_ia32_vpdpbssd256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
TARGET_BUILTIN(__builtin_ia32_vpdpbssds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
TARGET_BUILTIN(__builtin_ia32_vpdpbssds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
TARGET_BUILTIN(__builtin_ia32_vpdpbsud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
TARGET_BUILTIN(__builtin_ia32_vpdpbsud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
TARGET_BUILTIN(__builtin_ia32_vpdpbsuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
TARGET_BUILTIN(__builtin_ia32_vpdpbsuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
TARGET_BUILTIN(__builtin_ia32_vpdpbuud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
TARGET_BUILTIN(__builtin_ia32_vpdpbuud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
TARGET_BUILTIN(__builtin_ia32_vpdpbuuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8")
TARGET_BUILTIN(__builtin_ia32_vpdpbuuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8")
TARGET_BUILTIN(__builtin_ia32_vpdpbssd128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vpdpbssd256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vpdpbssds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vpdpbssds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vpdpbsud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vpdpbsud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vpdpbsuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vpdpbsuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vpdpbuud128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vpdpbuud256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vpdpbuuds128, "V4iV4iV4iV4i", "ncV:128:", "avxvnniint8|avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vpdpbuuds256, "V8iV8iV8iV8i", "ncV:256:", "avxvnniint8|avx10.2-256")

TARGET_BUILTIN(__builtin_ia32_gather3div2df, "V2dV2dvC*V2OiUcIi", "nV:128:", "avx512vl")
TARGET_BUILTIN(__builtin_ia32_gather3div2di, "V2OiV2OivC*V2OiUcIi", "nV:128:", "avx512vl")
Expand Down Expand Up @@ -1959,6 +1959,27 @@ TARGET_HEADER_BUILTIN(__readgsword, "UsUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES,
TARGET_HEADER_BUILTIN(__readgsdword, "UNiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(__readgsqword, "ULLiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")

// AVX10.2 VNNI FP16
TARGET_BUILTIN(__builtin_ia32_vdpphps128, "V4fV4fV8xV8x", "ncV:128:", "avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vdpphps256, "V8fV8fV16xV16x", "ncV:256:", "avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vdpphps512, "V16fV16fV32xV32x", "ncV:512:", "avx10.2-512")

// AVX10.2 VNNI INT8
TARGET_BUILTIN(__builtin_ia32_vpdpbssd512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_vpdpbssds512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_vpdpbsud512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_vpdpbsuds512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_vpdpbuud512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_vpdpbuuds512, "V16iV16iV16iV16i", "ncV:512:", "avx10.2-512")

// AVX10.2 VNNI INT16
TARGET_BUILTIN(__builtin_ia32_vpdpwsud512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_vpdpwsuds512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_vpdpwusd512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_vpdpwusds512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_vpdpwuud512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_vpdpwuuds512, "V16iV16iV16iV16i", "nV:512:", "avx10.2-512")

// AVX10.2 VMPSADBW
TARGET_BUILTIN(__builtin_ia32_mpsadbw512, "V32sV64cV64cIc", "ncV:512:", "avx10.2-512")

Expand Down Expand Up @@ -2088,18 +2109,18 @@ TARGET_BUILTIN(__builtin_ia32_vsubph256_round, "V16xV16xV16xIi", "nV:256:", "avx
TARGET_BUILTIN(__builtin_ia32_vsubps256_round, "V8fV8fV8fIi", "nV:256:", "avx10.2-256")

// AVX-VNNI-INT16
TARGET_BUILTIN(__builtin_ia32_vpdpwsud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
TARGET_BUILTIN(__builtin_ia32_vpdpwsud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
TARGET_BUILTIN(__builtin_ia32_vpdpwsuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
TARGET_BUILTIN(__builtin_ia32_vpdpwsuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
TARGET_BUILTIN(__builtin_ia32_vpdpwusd128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
TARGET_BUILTIN(__builtin_ia32_vpdpwusd256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
TARGET_BUILTIN(__builtin_ia32_vpdpwusds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
TARGET_BUILTIN(__builtin_ia32_vpdpwusds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
TARGET_BUILTIN(__builtin_ia32_vpdpwuud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
TARGET_BUILTIN(__builtin_ia32_vpdpwuud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
TARGET_BUILTIN(__builtin_ia32_vpdpwuuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16")
TARGET_BUILTIN(__builtin_ia32_vpdpwuuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16")
TARGET_BUILTIN(__builtin_ia32_vpdpwsud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vpdpwsud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vpdpwsuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vpdpwsuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vpdpwusd128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vpdpwusd256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vpdpwusds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vpdpwusds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vpdpwuud128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vpdpwuud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vpdpwuuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vpdpwuuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")

// AVX-NE-CONVERT
TARGET_BUILTIN(__builtin_ia32_vbcstnebf162ps128, "V4fyC*", "nV:128:", "avxneconvert")
Expand Down
279 changes: 279 additions & 0 deletions clang/lib/Headers/avx10_2_512niintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,35 @@
#ifndef __AVX10_2_512NIINTRIN_H
#define __AVX10_2_512NIINTRIN_H

#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \
__min_vector_width__(512)))

/* VNNI FP16 */
static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_dpph_ps(__m512 __W,
__m512h __A,
__m512h __B) {
return (__m512)__builtin_ia32_vdpphps512((__v16sf)__W, (__v32hf)__A,
(__v32hf)__B);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_dpph_ps(__m512 __W,
__mmask16 __U,
__m512h __A,
__m512h __B) {
return (__m512)__builtin_ia32_selectps_512(
(__mmask16)__U, (__v16sf)_mm512_dpph_ps(__W, __A, __B), (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_dpph_ps(__mmask16 __U,
__m512 __W,
__m512h __A,
__m512h __B) {
return (__m512)__builtin_ia32_selectps_512(
(__mmask16)__U, (__v16sf)_mm512_dpph_ps(__W, __A, __B),
(__v16sf)_mm512_setzero_ps());
}

/* VMPSADBW */
#define _mm512_mpsadbw_epu8(A, B, imm) \
((__m512i)__builtin_ia32_mpsadbw512((__v64qi)(__m512i)(A), \
Expand All @@ -31,5 +60,255 @@
(__mmask32)(U), (__v32hi)_mm512_mpsadbw_epu8((A), (B), (imm)), \
(__v32hi)_mm512_setzero_si512()))

/* VNNI INT8 */
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbssd_epi32(__m512i __W,
__m512i __A,
__m512i __B) {
return (__m512i)__builtin_ia32_vpdpbssd512((__v16si)__W, (__v16si)__A,
(__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbssd_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectd_512(
__U, (__v16si)_mm512_dpbssd_epi32(__W, __A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbssd_epi32(
__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectd_512(
__U, (__v16si)_mm512_dpbssd_epi32(__W, __A, __B),
(__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbssds_epi32(__m512i __W,
__m512i __A,
__m512i __B) {
return (__m512i)__builtin_ia32_vpdpbssds512((__v16si)__W, (__v16si)__A,
(__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbssds_epi32(
__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectd_512(
__U, (__v16si)_mm512_dpbssds_epi32(__W, __A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbssds_epi32(
__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectd_512(
__U, (__v16si)_mm512_dpbssds_epi32(__W, __A, __B),
(__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbsud_epi32(__m512i __W,
__m512i __A,
__m512i __B) {
return (__m512i)__builtin_ia32_vpdpbsud512((__v16si)__W, (__v16si)__A,
(__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbsud_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectd_512(
__U, (__v16si)_mm512_dpbsud_epi32(__W, __A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbsud_epi32(
__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectd_512(
__U, (__v16si)_mm512_dpbsud_epi32(__W, __A, __B),
(__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbsuds_epi32(__m512i __W,
__m512i __A,
__m512i __B) {
return (__m512i)__builtin_ia32_vpdpbsuds512((__v16si)__W, (__v16si)__A,
(__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbsuds_epi32(
__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectd_512(
__U, (__v16si)_mm512_dpbsuds_epi32(__W, __A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbsuds_epi32(
__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectd_512(
__U, (__v16si)_mm512_dpbsuds_epi32(__W, __A, __B),
(__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbuud_epi32(__m512i __W,
__m512i __A,
__m512i __B) {
return (__m512i)__builtin_ia32_vpdpbuud512((__v16si)__W, (__v16si)__A,
(__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbuud_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectd_512(
__U, (__v16si)_mm512_dpbuud_epi32(__W, __A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuud_epi32(
__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectd_512(
__U, (__v16si)_mm512_dpbuud_epi32(__W, __A, __B),
(__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbuuds_epi32(__m512i __W,
__m512i __A,
__m512i __B) {
return (__m512i)__builtin_ia32_vpdpbuuds512((__v16si)__W, (__v16si)__A,
(__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbuuds_epi32(
__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectd_512(
__U, (__v16si)_mm512_dpbuuds_epi32(__W, __A, __B), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuuds_epi32(
__mmask16 __U, __m512i __W, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectd_512(
__U, (__v16si)_mm512_dpbuuds_epi32(__W, __A, __B),
(__v16si)_mm512_setzero_si512());
}

/* VNNI INT16 */
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsud_epi32(__m512i __A,
__m512i __B,
__m512i __C) {
return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v16si)__B,
(__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwsud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
return (__m512i)__builtin_ia32_selectd_512(
(__mmask16)__U, (__v16si)_mm512_dpwsud_epi32(__A, __B, __C),
(__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsud_epi32(
__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
return (__m512i)__builtin_ia32_selectd_512(
(__mmask16)__U, (__v16si)_mm512_dpwsud_epi32(__A, __B, __C),
(__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsuds_epi32(__m512i __A,
__m512i __B,
__m512i __C) {
return (__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v16si)__B,
(__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwsuds_epi32(
__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
return (__m512i)__builtin_ia32_selectd_512(
(__mmask16)__U, (__v16si)_mm512_dpwsuds_epi32(__A, __B, __C),
(__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsuds_epi32(
__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
return (__m512i)__builtin_ia32_selectd_512(
(__mmask16)__U, (__v16si)_mm512_dpwsuds_epi32(__A, __B, __C),
(__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusd_epi32(__m512i __A,
__m512i __B,
__m512i __C) {
return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v16si)__B,
(__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwusd_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
return (__m512i)__builtin_ia32_selectd_512(
(__mmask16)__U, (__v16si)_mm512_dpwusd_epi32(__A, __B, __C),
(__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusd_epi32(
__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
return (__m512i)__builtin_ia32_selectd_512(
(__mmask16)__U, (__v16si)_mm512_dpwusd_epi32(__A, __B, __C),
(__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusds_epi32(__m512i __A,
__m512i __B,
__m512i __C) {
return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v16si)__B,
(__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwusds_epi32(
__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
return (__m512i)__builtin_ia32_selectd_512(
(__mmask16)__U, (__v16si)_mm512_dpwusds_epi32(__A, __B, __C),
(__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusds_epi32(
__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
return (__m512i)__builtin_ia32_selectd_512(
(__mmask16)__U, (__v16si)_mm512_dpwusds_epi32(__A, __B, __C),
(__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuud_epi32(__m512i __A,
__m512i __B,
__m512i __C) {
return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v16si)__B,
(__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwuud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
return (__m512i)__builtin_ia32_selectd_512(
(__mmask16)__U, (__v16si)_mm512_dpwuud_epi32(__A, __B, __C),
(__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuud_epi32(
__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
return (__m512i)__builtin_ia32_selectd_512(
(__mmask16)__U, (__v16si)_mm512_dpwuud_epi32(__A, __B, __C),
(__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuuds_epi32(__m512i __A,
__m512i __B,
__m512i __C) {
return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v16si)__B,
(__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwuuds_epi32(
__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
return (__m512i)__builtin_ia32_selectd_512(
(__mmask16)__U, (__v16si)_mm512_dpwuuds_epi32(__A, __B, __C),
(__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuuds_epi32(
__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
return (__m512i)__builtin_ia32_selectd_512(
(__mmask16)__U, (__v16si)_mm512_dpwuuds_epi32(__A, __B, __C),
(__v16si)_mm512_setzero_si512());
}

#undef __DEFAULT_FN_ATTRS

#endif /* __SSE2__ */
#endif /* __AVX10_2_512NIINTRIN_H */
Loading

0 comments on commit aef5d6a

Please sign in to comment.