Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add LoongArch SX SIMD extension implementation #981

Merged
merged 1 commit into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,7 @@ jobs:
{ name: 'RISC-V', xcc_pkg: gcc-riscv64-linux-gnu, xcc: riscv64-linux-gnu-gcc, xemu_pkg: qemu-system-riscv64,xemu: qemu-riscv64-static, os: ubuntu-latest, },
# SPARC64 qemu emulation seems broken on Ubuntu-22
{ name: 'SPARC', xcc_pkg: gcc-sparc64-linux-gnu, xcc: sparc64-linux-gnu-gcc, xemu_pkg: qemu-system-sparc, xemu: qemu-sparc64-static, os: ubuntu-20.04, },
{ name: 'LoongArch', xcc_pkg: gcc-14-loongarch64-linux-gnu, xcc: loongarch64-linux-gnu-gcc-14, xemu_pkg: qemu-system-loongarch64, xemu: qemu-loongarch64-static, os: ubuntu-24.04, },

{ name: 'ARM, gcc-10', xcc_pkg: gcc-10-arm-linux-gnueabi, xcc: arm-linux-gnueabi-gcc-10, xemu_pkg: qemu-system-arm, xemu: qemu-arm-static, os: ubuntu-20.04, },
{ name: 'AARCH64, gcc-10', xcc_pkg: gcc-10-aarch64-linux-gnu, xcc: aarch64-linux-gnu-gcc-10, xemu_pkg: qemu-system-arm, xemu: qemu-aarch64-static, os: ubuntu-20.04, },
Expand Down Expand Up @@ -474,6 +475,11 @@ jobs:
run: |
make clean; LDFLAGS="-static" CC=$XCC RUN_ENV=$XEMU make check

- name: LoongArch (XXH_VECTOR=[ scalar, LSX ])
if: ${{ startsWith(matrix.name, 'LoongArch') }}
run: |
CPPFLAGS="-DXXH_VECTOR=XXH_SCALAR" LDFLAGS="-static" CC=$XCC RUN_ENV=$XEMU make clean check
CPPFLAGS=-DXXH_VECTOR=XXH_LSX CFLAGS="-O3 -march=la464 -mlsx" LDFLAGS="-static" CC=$XCC RUN_ENV=$XEMU make clean check

# macOS

Expand Down
2 changes: 2 additions & 0 deletions cli/xsum_arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,8 @@
# else
# define XSUM_ARCH "wasm/asmjs"
# endif
#elif defined(__loongarch_lp64)
# define XSUM_ARCH "loongarch"
#else
# define XSUM_ARCH "unknown"
#endif
Expand Down
79 changes: 79 additions & 0 deletions xxhash.h
Original file line number Diff line number Diff line change
Expand Up @@ -3749,6 +3749,8 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can
# include <immintrin.h>
# elif defined(__SSE2__)
# include <emmintrin.h>
# elif defined(__loongarch_sx)
# include <lsxintrin.h>
# endif
#endif

Expand Down Expand Up @@ -3871,6 +3873,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
*/
XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */
XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */
XXH_LSX = 7, /*!< LSX (128-bit SIMD) for LoongArch64 */
};
/*!
* @ingroup tuning
Expand All @@ -3893,6 +3896,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
# define XXH_NEON 4
# define XXH_VSX 5
# define XXH_SVE 6
# define XXH_LSX 7
#endif

#ifndef XXH_VECTOR /* can be defined on command line */
Expand All @@ -3917,6 +3921,8 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
|| (defined(__s390x__) && defined(__VEC__)) \
&& defined(__GNUC__) /* TODO: IBM XL */
# define XXH_VECTOR XXH_VSX
# elif defined(__loongarch_sx)
# define XXH_VECTOR XXH_LSX
# else
# define XXH_VECTOR XXH_SCALAR
# endif
Expand Down Expand Up @@ -3954,6 +3960,8 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
# define XXH_ACC_ALIGN 64
# elif XXH_VECTOR == XXH_SVE /* sve */
# define XXH_ACC_ALIGN 64
# elif XXH_VECTOR == XXH_LSX /* lsx */
# define XXH_ACC_ALIGN 64
# endif
#endif

Expand Down Expand Up @@ -5591,6 +5599,71 @@ XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc,

#endif

#if (XXH_VECTOR == XXH_LSX)
#define _LSX_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

XXH_FORCE_INLINE void
XXH3_accumulate_512_lsx( void* XXH_RESTRICT acc,
const void* XXH_RESTRICT input,
const void* XXH_RESTRICT secret)
{
XXH_ASSERT((((size_t)acc) & 15) == 0);
{
__m128i* const xacc = (__m128i *) acc;
const __m128i* const xinput = (const __m128i *) input;
const __m128i* const xsecret = (const __m128i *) secret;

for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {
/* data_vec = xinput[i]; */
__m128i const data_vec = __lsx_vld(xinput + i, 0);
/* key_vec = xsecret[i]; */
__m128i const key_vec = __lsx_vld(xsecret + i, 0);
/* data_key = data_vec ^ key_vec; */
__m128i const data_key = __lsx_vxor_v(data_vec, key_vec);
/* data_key_lo = data_key >> 32; */
__m128i const data_key_lo = __lsx_vsrli_d(data_key, 32);
// __m128i const data_key_lo = __lsx_vsrli_d(data_key, 32);
/* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
__m128i const product = __lsx_vmulwev_d_wu(data_key, data_key_lo);
/* xacc[i] += swap(data_vec); */
__m128i const data_swap = __lsx_vshuf4i_w(data_vec, _LSX_SHUFFLE(1, 0, 3, 2));
__m128i const sum = __lsx_vadd_d(xacc[i], data_swap);
/* xacc[i] += product; */
xacc[i] = __lsx_vadd_d(product, sum);
}
}
}
XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(lsx)

XXH_FORCE_INLINE void
XXH3_scrambleAcc_lsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
XXH_ASSERT((((size_t)acc) & 15) == 0);
{
__m128i* const xacc = (__m128i*) acc;
const __m128i* const xsecret = (const __m128i *) secret;
const __m128i prime32 = __lsx_vreplgr2vr_w((int)XXH_PRIME32_1);

for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {
/* xacc[i] ^= (xacc[i] >> 47) */
__m128i const acc_vec = xacc[i];
__m128i const shifted = __lsx_vsrli_d(acc_vec, 47);
__m128i const data_vec = __lsx_vxor_v(acc_vec, shifted);
/* xacc[i] ^= xsecret[i]; */
__m128i const key_vec = __lsx_vld(xsecret + i, 0);
__m128i const data_key = __lsx_vxor_v(data_vec, key_vec);

/* xacc[i] *= XXH_PRIME32_1; */
__m128i const data_key_hi = __lsx_vsrli_d(data_key, 32);
__m128i const prod_lo = __lsx_vmulwev_d_wu(data_key, prime32);
__m128i const prod_hi = __lsx_vmulwev_d_wu(data_key_hi, prime32);
xacc[i] = __lsx_vadd_d(prod_lo, __lsx_vslli_d(prod_hi, 32));
}
}
}

#endif

/* scalar variants - universal */

#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__))
Expand Down Expand Up @@ -5821,6 +5894,12 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar

#elif (XXH_VECTOR == XXH_LSX)
#define XXH3_accumulate_512 XXH3_accumulate_512_lsx
#define XXH3_accumulate XXH3_accumulate_lsx
#define XXH3_scrambleAcc XXH3_scrambleAcc_lsx
#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar

#else /* scalar */

#define XXH3_accumulate_512 XXH3_accumulate_512_scalar
Expand Down
Loading