From 7b217fec9dea4867f22874ada8020f1f7ee8a137 Mon Sep 17 00:00:00 2001 From: Andrea Mazzoleni Date: Sun, 6 Nov 2016 11:04:34 +0100 Subject: [PATCH] Update to latest libdeflate --- HISTORY | 2 +- Makefile.am | 2 - doc/history.1 | 2 +- doc/history.d | 2 +- doc/history.txt | 2 +- libdeflate/NEWS | 18 ++ libdeflate/README.md | 2 +- libdeflate/adler32.c | 29 ++-- libdeflate/adler32.h | 12 -- libdeflate/adler32_impl.h | 6 +- libdeflate/common_defs.h | 15 +- libdeflate/compiler_gcc.h | 65 ++++++-- libdeflate/crc32.c | 127 ++++++++++++-- libdeflate/crc32.h | 12 -- libdeflate/crc32_impl.h | 286 ++++++++++++++++++++++++++++++++ libdeflate/deflate_compress.c | 2 + libdeflate/deflate_compress.h | 2 + libdeflate/deflate_decompress.c | 2 +- libdeflate/gzip_compress.c | 3 +- libdeflate/gzip_decompress.c | 4 +- libdeflate/lib_common.h | 7 +- libdeflate/libdeflate.h | 27 ++- libdeflate/x86_cpu_features.h | 6 +- libdeflate/zlib_compress.c | 3 +- libdeflate/zlib_decompress.c | 4 +- 25 files changed, 544 insertions(+), 98 deletions(-) delete mode 100644 libdeflate/adler32.h delete mode 100644 libdeflate/crc32.h create mode 100644 libdeflate/crc32_impl.h diff --git a/HISTORY b/HISTORY index 5a197ba..ebc3aa0 100644 --- a/HISTORY +++ b/HISTORY @@ -9,7 +9,7 @@ ADVANCECOMP VERSION 1.21 2016/11 * Added libdeflate support. It's the new default because it provides better performance and compression than 7z. From https://github.com/ebiggers/libdeflate - at commit 64dc75786d12cc4df005de50add12e36503f579a. + at commit 28cc14994b8b57f590d31a7340c8fffc5cc37d88. * Update to the latest zopfli library. From https://github.com/google/zopfli at commit 6818a0859063b946094fb6f94732836404a0d89a. diff --git a/Makefile.am b/Makefile.am index a9b902d..d21a8c2 100644 --- a/Makefile.am +++ b/Makefile.am @@ -201,13 +201,11 @@ noinst_HEADERS = \ 7z/RangeCoder.h \ 7z/WindowIn.h \ 7z/WindowOut.h \ - libdeflate/adler32.h \ libdeflate/adler32_impl.h \ libdeflate/aligned_malloc.h \ libdeflate/bt_matchfinder.h \ libdeflate/common_defs.h \ libdeflate/compiler_gcc.h \ - libdeflate/crc32.h \ libdeflate/crc32_table.h \ libdeflate/decompress_impl.h \ libdeflate/deflate_compress.h \ diff --git a/doc/history.1 b/doc/history.1 index d1c8823..d8a90cf 100644 --- a/doc/history.1 +++ b/doc/history.1 @@ -7,7 +7,7 @@ advcomp \- History For AdvanceCOMP Added libdeflate support. It\'s the new default because it provides better performance and compression than 7z. From https://github.com/ebiggers/libdeflate -at commit 64dc75786d12cc4df005de50add12e36503f579a. +at commit 28cc14994b8b57f590d31a7340c8fffc5cc37d88. .IP \(bu Update to the latest zopfli library. From https://github.com/google/zopfli diff --git a/doc/history.d b/doc/history.d index 5009ab2..7de9890 100644 --- a/doc/history.d +++ b/doc/history.d @@ -5,7 +5,7 @@ AdvanceCOMP Version 1.21 2016/11 ) Added libdeflate support. It's the new default because it provides better performance and compression than 7z. From https://github.com/ebiggers/libdeflate - at commit 64dc75786d12cc4df005de50add12e36503f579a. + at commit 28cc14994b8b57f590d31a7340c8fffc5cc37d88. ) Update to the latest zopfli library. From https://github.com/google/zopfli at commit 6818a0859063b946094fb6f94732836404a0d89a. diff --git a/doc/history.txt b/doc/history.txt index 962204b..5588a76 100644 --- a/doc/history.txt +++ b/doc/history.txt @@ -9,7 +9,7 @@ ADVANCECOMP VERSION 1.21 2016/11 * Added libdeflate support. It's the new default because it provides better performance and compression than 7z.
From https://github.com/ebiggers/libdeflate - at commit 64dc75786d12cc4df005de50add12e36503f579a. + at commit 28cc14994b8b57f590d31a7340c8fffc5cc37d88. * Update to the latest zopfli library. From https://github.com/google/zopfli at commit 6818a0859063b946094fb6f94732836404a0d89a. diff --git a/libdeflate/NEWS b/libdeflate/NEWS index 7f1d1fa..9fe9e9f 100644 --- a/libdeflate/NEWS +++ b/libdeflate/NEWS @@ -1,3 +1,21 @@ +Version 0.6: + Various improvements to the gzip program's behavior. + + Faster CRC-32 on AVX-capable processors. + + Other minor changes. + +Version 0.5: + The CRC-32 checksum algorithm has been optimized with carryless + multiplication instructions for x86_64 (PCLMUL). This speeds up gzip + compression and decompression. + + Build fixes for certain platforms and compilers. + + Added more test programs and scripts. + + libdeflate is now entirely MIT-licensed. + Version 0.4: The Adler-32 checksum algorithm has been optimized with vector instructions for x86_64 (SSE2 and AVX2) and ARM (NEON). This speeds up diff --git a/libdeflate/README.md b/libdeflate/README.md index e995116..87b67b5 100644 --- a/libdeflate/README.md +++ b/libdeflate/README.md @@ -18,7 +18,7 @@ libdeflate itself is a library, but the following command-line programs which use this library are also provided: * gzip (or gunzip), a program which mostly behaves like the standard equivalent, - except that it does not yet support reading from standard input and does not + except that it does not yet have good streaming support and therefore does not yet support very large files * benchmark, a program for benchmarking in-memory compression and decompression diff --git a/libdeflate/adler32.c b/libdeflate/adler32.c index 5f2c667..2148802 100644 --- a/libdeflate/adler32.c +++ b/libdeflate/adler32.c @@ -27,9 +27,10 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#include "adler32.h" #include "x86_cpu_features.h" +#include "libdeflate.h" + /* The Adler-32 divisor, or "base", value. */ #define DIVISOR 65521 @@ -73,7 +74,8 @@ /* Include the AVX2 implementation? */ #define NEED_AVX2_IMPL 0 #if defined(__AVX2__) || \ - (X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_AVX2_TARGET) + (X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_AVX2_TARGET && \ + COMPILER_SUPPORTS_TARGET_INTRINSICS) # include <immintrin.h> # undef NEED_AVX2_IMPL # define NEED_AVX2_IMPL 1 @@ -101,10 +103,10 @@ /* Define the generic implementation if needed.
*/ #if NEED_GENERIC_IMPL -static u32 adler32_generic(const void *buffer, size_t size) +static u32 adler32_generic(u32 adler, const void *buffer, size_t size) { - u32 s1 = 1; - u32 s2 = 0; + u32 s1 = adler & 0xFFFF; + u32 s2 = adler >> 16; const u8 *p = buffer; const u8 * const end = p + size; @@ -177,7 +179,7 @@ static u32 adler32_generic(const void *buffer, size_t size) # include "adler32_impl.h" #endif -typedef u32 (*adler32_func_t)(const void *, size_t); +typedef u32 (*adler32_func_t)(u32, const void *, size_t); /* * If multiple implementations are available, then dispatch among them based on @@ -186,23 +188,26 @@ typedef u32 (*adler32_func_t)(const void *, size_t); #if NUM_IMPLS == 1 # define adler32_impl DEFAULT_IMPL #else -static u32 dispatch(const void *, size_t); +static u32 dispatch(u32, const void *, size_t); static adler32_func_t adler32_impl = dispatch; -static u32 dispatch(const void *buffer, size_t size) +static u32 dispatch(u32 adler, const void *buffer, size_t size) { adler32_func_t f = DEFAULT_IMPL; #if NEED_AVX2_IMPL && !defined(__AVX2__) - if (x86_have_cpu_feature(X86_CPU_FEATURE_AVX2)) + if (x86_have_cpu_features(X86_CPU_FEATURE_AVX2)) f = adler32_avx2; #endif adler32_impl = f; - return adler32_impl(buffer, size); + return adler32_impl(adler, buffer, size); } #endif /* NUM_IMPLS != 1 */ -u32 adler32(const void *buffer, size_t size) +LIBDEFLATEAPI u32 +libdeflate_adler32(u32 adler, const void *buffer, size_t size) { - return adler32_impl(buffer, size); + if (buffer == NULL) /* return initial value */ + return 1; + return adler32_impl(adler, buffer, size); } diff --git a/libdeflate/adler32.h b/libdeflate/adler32.h deleted file mode 100644 index 73f0260..0000000 --- a/libdeflate/adler32.h +++ /dev/null @@ -1,12 +0,0 @@ -/* - * adler32.h - Adler-32 checksum algorithm - */ - -#ifndef LIB_ADLER32_H -#define LIB_ADLER32_H - -#include "lib_common.h" - -extern u32 adler32(const void *buffer, size_t size); - -#endif /* LIB_ADLER32_H */ diff --git a/libdeflate/adler32_impl.h b/libdeflate/adler32_impl.h index 0a9e2d7..78e0ae3 100644 --- a/libdeflate/adler32_impl.h +++ b/libdeflate/adler32_impl.h @@ -62,10 +62,10 @@ */ static u32 ATTRIBUTES -FUNCNAME(const void *buffer, size_t size) +FUNCNAME(u32 adler, const void *buffer, size_t size) { - u32 s1 = 1; - u32 s2 = 0; + u32 s1 = adler & 0xFFFF; + u32 s2 = adler >> 16; const u8 *p = buffer; const u8 * const end = p + size; const u8 *vend; diff --git a/libdeflate/common_defs.h b/libdeflate/common_defs.h index 7ab1702..677f97e 100644 --- a/libdeflate/common_defs.h +++ b/libdeflate/common_defs.h @@ -121,12 +121,21 @@ typedef size_t machine_word_t; # define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 0 #endif -/* Does the compiler support __attribute__((target("bmi2")))? */ +/* Are target-specific intrinsics supported in 'target' attribute functions? */ +#ifndef COMPILER_SUPPORTS_TARGET_INTRINSICS +# define COMPILER_SUPPORTS_TARGET_INTRINSICS 0 +#endif + +/* Which targets are supported with the 'target' function attribute? */ +#ifndef COMPILER_SUPPORTS_PCLMUL_TARGET +# define COMPILER_SUPPORTS_PCLMUL_TARGET 0 +#endif #ifndef COMPILER_SUPPORTS_BMI2_TARGET # define COMPILER_SUPPORTS_BMI2_TARGET 0 #endif - -/* Does the compiler support __attribute__((target("avx2")))? 
*/ +#ifndef COMPILER_SUPPORTS_AVX_TARGET +# define COMPILER_SUPPORTS_AVX_TARGET 0 +#endif #ifndef COMPILER_SUPPORTS_AVX2_TARGET # define COMPILER_SUPPORTS_AVX2_TARGET 0 #endif diff --git a/libdeflate/compiler_gcc.h b/libdeflate/compiler_gcc.h index 917de16..3e2d7f5 100644 --- a/libdeflate/compiler_gcc.h +++ b/libdeflate/compiler_gcc.h @@ -3,10 +3,24 @@ * handles clang and the Intel C Compiler. */ -#define GCC_PREREQ(major, minor) \ - (!defined(__clang__) && !defined(__INTEL_COMPILER) && \ - (__GNUC__ > (major) || \ - (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))) +#define GCC_PREREQ(major, minor) \ + (__GNUC__ > (major) || \ + (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor))) + +/* Note: only check the clang version when absolutely necessary! + * "Vendors" such as Apple can use different version numbers. */ +#ifdef __clang__ +# ifdef __apple_build_version__ +# define CLANG_PREREQ(major, minor, apple_version) \ + (__apple_build_version__ >= (apple_version)) +# else +# define CLANG_PREREQ(major, minor, apple_version) \ + (__clang_major__ > (major) || \ + (__clang_major__ == (major) && __clang_minor__ >= (minor))) +# endif +#else +# define CLANG_PREREQ(major, minor, apple_version) 0 +#endif #ifndef __has_attribute # define __has_attribute(attribute) 0 @@ -33,20 +47,39 @@ #define prefetchw(addr) __builtin_prefetch((addr), 1) #define _aligned_attribute(n) __attribute__((aligned(n))) -#define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE \ - (GCC_PREREQ(4, 4) || __has_attribute(target)) - -#define COMPILER_SUPPORTS_BMI2_TARGET \ - (COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE && \ - (GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_pdep_di))) - /* - * Note: AVX2 support was added in gcc 4.7, but AVX2 intrinsics don't work in - * __attribute__((target("avx2"))) functions until gcc 4.9. + * Support for the following x86 instruction set extensions was introduced by + * the following gcc versions: + * + * PCLMUL 4.4 + * AVX 4.6 + * BMI2 4.7 + * AVX2 4.7 + * + * With clang, __has_builtin() can be used to detect the presence of one of the + * associated builtins. + * + * Additionally, gcc 4.4 introduced the 'target' function attribute. With + * clang, support for this can be detected with __has_attribute(target). + * + * However, prior to gcc 4.9 and clang 3.8, x86 intrinsics not available in the + * main target could not be used in 'target' attribute functions. Unfortunately + * clang has no feature test macro for this, so we have to check its version. */ -#define COMPILER_SUPPORTS_AVX2_TARGET \ - (COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE && \ - (GCC_PREREQ(4, 9) || __has_builtin(__builtin_ia32_pmaddwd256))) +#define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE \ + (GCC_PREREQ(4, 4) || __has_attribute(target)) +#if COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE +# define COMPILER_SUPPORTS_TARGET_INTRINSICS \ + (GCC_PREREQ(4, 9) || CLANG_PREREQ(3, 8, 7030000)) +# define COMPILER_SUPPORTS_PCLMUL_TARGET \ + (GCC_PREREQ(4, 4) || __has_builtin(__builtin_ia32_pclmulqdq128)) +# define COMPILER_SUPPORTS_AVX_TARGET \ + (GCC_PREREQ(4, 6) || __has_builtin(__builtin_ia32_maxps256)) +# define COMPILER_SUPPORTS_BMI2_TARGET \ + (GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_pdep_di)) +# define COMPILER_SUPPORTS_AVX2_TARGET \ + (GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_pmaddwd256)) +#endif /* Newer gcc supports __BYTE_ORDER__. Older gcc doesn't.
*/ #ifdef __BYTE_ORDER__ diff --git a/libdeflate/crc32.c b/libdeflate/crc32.c index efc97e6..5c3f8ba 100644 --- a/libdeflate/crc32.c +++ b/libdeflate/crc32.c @@ -31,7 +31,7 @@ * High-level description of CRC * ============================= * - * Consider a bit sequence 'bits[1...len]'. Interpet 'bits' as the "message" + * Consider a bit sequence 'bits[1...len]'. Interpret 'bits' as the "message" * polynomial M(x) with coefficients in GF(2) (the field of integers modulo 2), * where the coefficient of 'x^i' is 'bits[len - i]'. Then, compute: * @@ -39,7 +39,7 @@ * * where G(x) is a selected "generator" polynomial of degree 'n'. The remainder * R(x) is a polynomial of max degree 'n - 1'. The CRC of 'bits' is R(x) - * interpeted as a bitstring of length 'n'. + * interpreted as a bitstring of length 'n'. * * CRC used in gzip * ================ @@ -166,11 +166,62 @@ * * In crc32_slice8(), this method is extended to 8 bytes at a time. The * intermediate remainder (which we never actually store explicitly) is 96 bits. + * + * On CPUs that support fast carryless multiplication, CRCs can be computed even + * more quickly via "folding". See crc32_pclmul() for an example. + */ + +#include "x86_cpu_features.h" + +#include "libdeflate.h" + +/* Select the implementations to compile in. */ + +#define NEED_GENERIC_IMPL 1 /* include generic impl unless overridden */ +#define DEFAULT_IMPL crc32_slice8 + +/* Include the PCLMUL implementation? */ +#define NEED_PCLMUL_IMPL 0 +#if defined(__PCLMUL__) || \ + (X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_PCLMUL_TARGET && \ + COMPILER_SUPPORTS_TARGET_INTRINSICS) +# include <wmmintrin.h> +# undef NEED_PCLMUL_IMPL +# define NEED_PCLMUL_IMPL 1 +# ifdef __PCLMUL__ /* compiling for PCLMUL, i.e. can we assume it's there? */ +# undef NEED_GENERIC_IMPL +# define NEED_GENERIC_IMPL 0 /* generic impl not needed */ +# undef DEFAULT_IMPL +# define DEFAULT_IMPL crc32_pclmul +# endif /* otherwise, we can build a PCLMUL version, but we won't know whether we can use it until runtime */ +#endif + +/* + * Include the PCLMUL/AVX implementation? Although our PCLMUL-optimized CRC-32 + * function doesn't use any AVX intrinsics specifically, it can benefit a lot + * from being compiled for an AVX target: on Skylake, ~16700 MB/s vs. ~10100 + * MB/s. I expect this is related to the PCLMULQDQ instructions being assembled + * in the newer three-operand form rather than the older two-operand form. + * + * Note: this is only needed if __AVX__ is *not* defined, since otherwise the + * "regular" PCLMUL implementation would already be AVX enabled.
*/ +#define NEED_PCLMUL_AVX_IMPL 0 +#if NEED_PCLMUL_IMPL && !defined(__AVX__) && \ + X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_AVX_TARGET +# undef NEED_PCLMUL_AVX_IMPL +# define NEED_PCLMUL_AVX_IMPL 1 +#endif -#define CRC32_SLICE8 +#define NUM_IMPLS (NEED_GENERIC_IMPL + NEED_PCLMUL_IMPL + NEED_PCLMUL_AVX_IMPL) -#include "crc32.h" +/* Define the CRC-32 table */ +#if NEED_GENERIC_IMPL +# define CRC32_SLICE8 +#else +# define CRC32_SLICE1 /* only need short table for unaligned ends */ +#endif #include "crc32_table.h" static forceinline u32 @@ -179,7 +230,7 @@ crc32_update_byte(u32 remainder, u8 next_byte) return (remainder >> 8) ^ crc32_table[(u8)remainder ^ next_byte]; } -#ifdef CRC32_SLICE1 +#if defined(CRC32_SLICE1) || (NUM_IMPLS > NEED_GENERIC_IMPL) static u32 crc32_slice1(u32 remainder, const u8 *buffer, size_t nbytes) { @@ -258,18 +309,60 @@ crc32_slice8(u32 remainder, const u8 *buffer, size_t nbytes) } #endif -u32 -crc32_gzip(const void *buffer, size_t nbytes) -{ - u32 remainder = ~0; -#if defined(CRC32_SLICE1) - remainder = crc32_slice1(remainder, buffer, nbytes); -#elif defined(CRC32_SLICE4) - remainder = crc32_slice4(remainder, buffer, nbytes); -#elif defined(CRC32_SLICE8) - remainder = crc32_slice8(remainder, buffer, nbytes); +/* Define the PCLMUL implementation if needed. */ +#if NEED_PCLMUL_IMPL +# define FUNCNAME crc32_pclmul +# define FUNCNAME_ALIGNED crc32_pclmul_aligned +# ifdef __PCLMUL__ +# define ATTRIBUTES +# else +# define ATTRIBUTES __attribute__((target("pclmul"))) +# endif +# include "crc32_impl.h" +#endif + +/* Define the PCLMUL/AVX implementation if needed. */ +#if NEED_PCLMUL_AVX_IMPL +# define FUNCNAME crc32_pclmul_avx +# define FUNCNAME_ALIGNED crc32_pclmul_avx_aligned +# define ATTRIBUTES __attribute__((target("pclmul,avx"))) +# include "crc32_impl.h" +#endif + +typedef u32 (*crc32_func_t)(u32, const u8 *, size_t); + +/* + * If multiple implementations are available, then dispatch among them based on + * CPU features at runtime. Otherwise just call the single one directly. + */ +#if NUM_IMPLS == 1 +# define crc32_impl DEFAULT_IMPL #else -# error "don't know which CRC-32 implementation to use!" 
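The '#else' branch added just below replaces the old '#error' with dispatch at runtime: a function pointer starts out aimed at a one-shot resolver, which probes the CPU once, caches the chosen implementation back into the pointer, and forwards the first call. Here is a minimal sketch of that idiom, using hypothetical stand-in names (work_generic, work_fast, cpu_has_fast_path), not libdeflate's own:

#include <stdbool.h>
#include <stddef.h>

typedef unsigned (*work_func_t)(const unsigned char *, size_t);

/* Portable fallback; stands in for crc32_slice8(). */
static unsigned work_generic(const unsigned char *p, size_t n)
{
    unsigned acc = 0;
    while (n--)
        acc += *p++;
    return acc;
}

/* Optimized variant, only safe if the CPU supports the needed extension;
 * stands in for crc32_pclmul(). The body here is just a placeholder. */
static unsigned work_fast(const unsigned char *p, size_t n)
{
    return work_generic(p, n);
}

/* Stands in for x86_have_cpu_features(...). */
static bool cpu_has_fast_path(void)
{
    return false;
}

static unsigned resolve(const unsigned char *, size_t);

/* Every caller goes through this pointer; it starts at the resolver. */
static work_func_t work_impl = resolve;

static unsigned resolve(const unsigned char *p, size_t n)
{
    work_func_t f = work_generic;

    if (cpu_has_fast_path())
        f = work_fast;
    work_impl = f;  /* rebind so later calls skip the CPU check */
    return f(p, n);
}

The store into the pointer is idempotent, so concurrent first calls race benignly; the dispatch() functions added by this patch appear to rely on the same property.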
+static u32 dispatch(u32, const u8 *, size_t); + +static crc32_func_t crc32_impl = dispatch; + +static u32 dispatch(u32 remainder, const u8 *buffer, size_t nbytes) +{ + crc32_func_t f = DEFAULT_IMPL; +#if NEED_PCLMUL_IMPL && !defined(__PCLMUL__) + if (x86_have_cpu_features(X86_CPU_FEATURE_PCLMULQDQ)) + f = crc32_pclmul; +#endif +#if NEED_PCLMUL_AVX_IMPL + if (x86_have_cpu_features(X86_CPU_FEATURE_PCLMULQDQ | + X86_CPU_FEATURE_AVX)) + f = crc32_pclmul_avx; #endif - return ~remainder; + crc32_impl = f; + return crc32_impl(remainder, buffer, nbytes); +} +#endif /* NUM_IMPLS != 1 */ + +LIBDEFLATEAPI u32 +libdeflate_crc32(u32 remainder, const void *buffer, size_t nbytes) +{ + if (buffer == NULL) /* return initial value */ + return 0; + return ~crc32_impl(~remainder, buffer, nbytes); } diff --git a/libdeflate/crc32.h b/libdeflate/crc32.h deleted file mode 100644 index 2b9e957..0000000 --- a/libdeflate/crc32.h +++ /dev/null @@ -1,12 +0,0 @@ -/* - * crc32.h - CRC-32 checksum algorithm for the gzip format - */ - -#ifndef LIB_CRC32_H -#define LIB_CRC32_H - -#include "lib_common.h" - -extern u32 crc32_gzip(const void *buffer, size_t size); - -#endif /* LIB_CRC32_H */ diff --git a/libdeflate/crc32_impl.h b/libdeflate/crc32_impl.h new file mode 100644 index 0000000..625bc18 --- /dev/null +++ b/libdeflate/crc32_impl.h @@ -0,0 +1,286 @@ +/* + * crc32_impl.h + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * CRC-32 folding with PCLMULQDQ. + * + * The basic idea is to repeatedly "fold" each 512 bits into the next 512 bits, + * producing an abbreviated message which is congruent to the original message + * modulo the generator polynomial G(x). + * + * Folding each 512 bits is implemented as eight 64-bit folds, each of which + * uses one carryless multiplication instruction. It's expected that CPUs may + * be able to execute some of these multiplications in parallel. + * + * Explanation of "folding": let A(x) be 64 bits from the message, and let B(x) + * be 95 bits from a constant distance D later in the message. The relevant + * portion of the message can be written as: + * + * M(x) = A(x)*x^D + B(x) + * + * ... where + and * represent addition and multiplication, respectively, of + * polynomials over GF(2). Note that when implemented on a computer, these + * operations are equivalent to XOR and carryless multiplication, respectively.
+ * + * For the purpose of CRC calculation, only the remainder modulo the generator + * polynomial G(x) matters: + * + * M(x) mod G(x) = (A(x)*x^D + B(x)) mod G(x) + * + * Since the modulo operation can be applied anywhere in a sequence of additions + * and multiplications without affecting the result, this is equivalent to: + * + * M(x) mod G(x) = (A(x)*(x^D mod G(x)) + B(x)) mod G(x) + * + * For any D, 'x^D mod G(x)' will be a polynomial with maximum degree 31, i.e. + * a 32-bit quantity. So 'A(x) * (x^D mod G(x))' is equivalent to a carryless + * multiplication of a 64-bit quantity by a 32-bit quantity, producing a 95-bit + * product. Then, adding (XOR-ing) the product to B(x) produces a polynomial + * with the same length as B(x) but with the same remainder as 'A(x)*x^D + + * B(x)'. This is the basic fold operation with 64 bits. + * + * Note that the carryless multiplication instruction PCLMULQDQ actually takes + * two 64-bit inputs and produces a 127-bit product in the low-order bits of a + * 128-bit XMM register. This works fine, but care must be taken to account for + * "bit endianness". With the CRC version implemented here, bits are always + * ordered such that the lowest-order bit represents the coefficient of highest + * power of x and the highest-order bit represents the coefficient of the lowest + * power of x. This is backwards from the more intuitive order. Still, + * carryless multiplication works essentially the same either way. It just must + * be accounted for that when we XOR the 95-bit product in the low-order 95 bits + * of a 128-bit XMM register into 128 bits of later data held in another XMM + * register, we'll really be XOR-ing the product into the mathematically higher + * degree end of those later bits, not the lower degree end as may be expected. + * + * So given that caveat and the fact that we process 512 bits per iteration, the + * 'D' values we need for the two 64-bit halves of each 128 bits of data are: + * + * D = (512 + 95) - 64 for the higher-degree half of each 128 bits, + * i.e. the lower order bits in the XMM register + * + * D = (512 + 95) - 128 for the lower-degree half of each 128 bits, + * i.e. the higher order bits in the XMM register + * + * The required 'x^D mod G(x)' values were precomputed. + * + * When <= 512 bits remain in the message, we finish up by folding across + * smaller distances. This works similarly; the distance D is just different, + * so different constant multipliers must be used. Finally, once the remaining + * message is just 64 bits, it is reduced to the CRC-32 using Barrett + * reduction (explained later). + * + * For more information see the original paper from Intel: + * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" + * December 2009 + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf + */ +static u32 ATTRIBUTES +FUNCNAME_ALIGNED(u32 remainder, const __m128i *p, size_t vec_count) +{ + /* Constants precomputed by gen_crc32_multipliers.c. Do not edit!
*/ + const __v2di multipliers_4 = (__v2di){ 0x8F352D95, 0x1D9513D7 }; + const __v2di multipliers_2 = (__v2di){ 0xF1DA05AA, 0x81256527 }; + const __v2di multipliers_1 = (__v2di){ 0xAE689191, 0xCCAA009E }; + const __v2di final_multiplier = (__v2di){ 0xB8BC6765 }; + const __m128i mask32 = (__m128i)(__v4si){ 0xFFFFFFFF }; + const __v2di barrett_reduction_constants = + (__v2di){ 0x00000001F7011641, 0x00000001DB710641 }; + + const __m128i * const end = p + vec_count; + const __m128i * const end512 = p + (vec_count & ~3); + __m128i x0, x1, x2, x3; + + /* + * Account for the current 'remainder', i.e. the CRC of the part of the + * message already processed. Explanation: rewrite the message + * polynomial M(x) in terms of the first part A(x), the second part + * B(x), and the length of the second part in bits |B(x)| >= 32: + * + * M(x) = A(x)*x^|B(x)| + B(x) + * + * Then the CRC of M(x) is: + * + * CRC(M(x)) = CRC(A(x)*x^|B(x)| + B(x)) + * = CRC(A(x)*x^32*x^(|B(x)| - 32) + B(x)) + * = CRC(CRC(A(x))*x^(|B(x)| - 32) + B(x)) + * + * Note: all arithmetic is modulo G(x), the generator polynomial; that's + * why A(x)*x^32 can be replaced with CRC(A(x)) = A(x)*x^32 mod G(x). + * + * So the CRC of the full message is the CRC of the second part of the + * message where the first 32 bits of the second part of the message + * have been XOR'ed with the CRC of the first part of the message. + */ + x0 = *p++; + x0 ^= (__m128i)(__v4si){ remainder }; + + if (p > end512) /* only 128, 256, or 384 bits of input? */ + goto _128_bits_at_a_time; + x1 = *p++; + x2 = *p++; + x3 = *p++; + + /* Fold 512 bits at a time */ + for (; p != end512; p += 4) { + __m128i y0, y1, y2, y3; + + y0 = p[0]; + y1 = p[1]; + y2 = p[2]; + y3 = p[3]; + + /* + * Note: the immediate constant for PCLMULQDQ specifies which + * 64-bit halves of the 128-bit vectors to multiply: + * + * 0x00 means low halves (higher degree polynomial terms for us) + * 0x11 means high halves (lower degree polynomial terms for us) + */ + y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x00); + y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x00); + y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x00); + y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x00); + y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x11); + y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x11); + y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x11); + y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x11); + + x0 = y0; + x1 = y1; + x2 = y2; + x3 = y3; + } + + /* Fold 512 bits => 128 bits */ + x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x00); + x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x00); + x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x11); + x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x11); + x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x00); + x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x11); + x0 = x3; + +_128_bits_at_a_time: + while (p != end) { + /* Fold 128 bits into next 128 bits */ + x1 = *p++; + x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x00); + x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x11); + x0 = x1; + } + + /* Now there are just 128 bits left, stored in 'x0'. */ + + /* + * Fold 128 => 96 bits. This also implicitly appends 32 zero bits, + * which is equivalent to multiplying by x^32. This is needed because + * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x). 
+ */ + x0 = _mm_srli_si128(x0, 8) ^ + _mm_clmulepi64_si128(x0, multipliers_1, 0x10); + + /* Fold 96 => 64 bits */ + x0 = _mm_srli_si128(x0, 4) ^ + _mm_clmulepi64_si128(x0 & mask32, final_multiplier, 0x00); + + /* + * Finally, reduce 64 => 32 bits using Barrett reduction. + * + * Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to + * compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)): + * + * R(x) = (A(x)*x^32 + B(x)) mod G(x) + * = (A(x)*x^32) mod G(x) + B(x) + * + * Then, by the Division Algorithm there exists a unique q(x) such that: + * + * A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x) + * + * Since the left-hand side is of maximum degree 31, the right-hand side + * must be too. This implies that we can apply 'mod x^32' to the + * right-hand side without changing its value: + * + * (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32 + * + * Note that '+' is equivalent to '-' in polynomials over GF(2). + * + * We also know that: + * + * / A(x)*x^32 \ + * q(x) = floor ( --------- ) + * \ G(x) / + * + * To compute this efficiently, we can multiply the top and bottom by + * x^32 and move the division by G(x) to the top: + * + * / A(x) * floor(x^64 / G(x)) \ + * q(x) = floor ( ------------------------- ) + * \ x^32 / + * + * Note that floor(x^64 / G(x)) is a constant. + * + * So finally we have: + * + * / A(x) * floor(x^64 / G(x)) \ + * R(x) = B(x) + G(x)*floor ( ------------------------- ) + * \ x^32 / + */ + x1 = x0; + x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x00); + x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x10); + return _mm_cvtsi128_si32(_mm_srli_si128(x0 ^ x1, 4)); +} + +/* + * Fast CRC-32 implementation for x86_64 processors that have the carryless + * multiplication extension (PCLMUL). + * + * Note: on unaligned ends of the buffer, we fall back to crc32_slice1() instead + * of crc32_slice8() because only a few bytes need to be processed, so a smaller + * table is preferable. + */ +static u32 ATTRIBUTES +FUNCNAME(u32 remainder, const u8 *buffer, size_t nbytes) +{ + if ((uintptr_t)buffer & 15) { + size_t n = MIN(nbytes, -(uintptr_t)buffer & 15); + remainder = crc32_slice1(remainder, buffer, n); + buffer += n; + nbytes -= n; + } + if (nbytes >= 16) { + remainder = FUNCNAME_ALIGNED(remainder, (const __m128i *)buffer, + nbytes / 16); + buffer += nbytes & ~15; + nbytes &= 15; + } + return crc32_slice1(remainder, buffer, nbytes); +} + +#undef FUNCNAME +#undef FUNCNAME_ALIGNED +#undef ATTRIBUTES diff --git a/libdeflate/deflate_compress.c b/libdeflate/deflate_compress.c index 255a6cd..a1e6c74 100644 --- a/libdeflate/deflate_compress.c +++ b/libdeflate/deflate_compress.c @@ -2787,6 +2787,8 @@ libdeflate_deflate_compress(struct libdeflate_compressor *c, if (unlikely(in_nbytes < 16)) { struct deflate_output_bitstream os; deflate_init_output(&os, out, out_nbytes_avail); + if (in_nbytes == 0) + in = &os; /* Avoid passing NULL to memcpy() */ deflate_write_uncompressed_block(&os, in, in_nbytes, true); return deflate_flush_output(&os); } diff --git a/libdeflate/deflate_compress.h b/libdeflate/deflate_compress.h index 1491e00..f4bb23b 100644 --- a/libdeflate/deflate_compress.h +++ b/libdeflate/deflate_compress.h @@ -1,6 +1,8 @@ #ifndef LIB_DEFLATE_COMPRESS_H #define LIB_DEFLATE_COMPRESS_H +#include "lib_common.h" + /* DEFLATE compression is private to deflate_compress.c, but we do need to be * able to query the compression level for zlib and gzip header generation. 
*/ diff --git a/libdeflate/deflate_decompress.c b/libdeflate/deflate_decompress.c index 8fcbd8a..8284fc8 100644 --- a/libdeflate/deflate_decompress.c +++ b/libdeflate/deflate_decompress.c @@ -842,7 +842,7 @@ dispatch(struct libdeflate_decompressor * restrict d, { decompress_func_t f = deflate_decompress_default; #if X86_CPU_FEATURES_ENABLED - if (x86_have_cpu_feature(X86_CPU_FEATURE_BMI2)) + if (x86_have_cpu_features(X86_CPU_FEATURE_BMI2)) f = deflate_decompress_bmi2; #endif decompress_impl = f; diff --git a/libdeflate/gzip_compress.c b/libdeflate/gzip_compress.c index 2f52dda..bfc75e2 100644 --- a/libdeflate/gzip_compress.c +++ b/libdeflate/gzip_compress.c @@ -27,7 +27,6 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#include "crc32.h" #include "deflate_compress.h" #include "gzip_constants.h" #include "unaligned.h" @@ -77,7 +76,7 @@ libdeflate_gzip_compress(struct libdeflate_compressor *c, out_next += deflate_size; /* CRC32 */ - put_unaligned_le32(crc32_gzip(in, in_size), out_next); + put_unaligned_le32(libdeflate_crc32(0, in, in_size), out_next); out_next += 4; /* ISIZE */ diff --git a/libdeflate/gzip_decompress.c b/libdeflate/gzip_decompress.c index 467de32..e3ce3d7 100644 --- a/libdeflate/gzip_decompress.c +++ b/libdeflate/gzip_decompress.c @@ -27,7 +27,6 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#include "crc32.h" #include "gzip_constants.h" #include "unaligned.h" @@ -118,7 +117,8 @@ libdeflate_gzip_decompress(struct libdeflate_decompressor *d, in_next = in_end - GZIP_FOOTER_SIZE; /* CRC32 */ - if (crc32_gzip(out, actual_out_nbytes) != get_unaligned_le32(in_next)) + if (libdeflate_crc32(0, out, actual_out_nbytes) != + get_unaligned_le32(in_next)) return LIBDEFLATE_BAD_DATA; in_next += 4; diff --git a/libdeflate/lib_common.h b/libdeflate/lib_common.h index b041ac9..149f1df 100644 --- a/libdeflate/lib_common.h +++ b/libdeflate/lib_common.h @@ -5,6 +5,11 @@ #ifndef LIB_LIB_COMMON_H #define LIB_LIB_COMMON_H +#ifdef LIBDEFLATE_H +# error "lib_common.h must always be included before libdeflate.h" + /* because BUILDING_LIBDEFLATE must be set first */ +#endif + #define BUILDING_LIBDEFLATE #include "common_defs.h" @@ -21,10 +26,8 @@ * shared library, since these symbols are not exported. 
*/ #define SYM_FIXUP(sym) _libdeflate_##sym -#define adler32 SYM_FIXUP(adler32) #define aligned_malloc SYM_FIXUP(aligned_malloc) #define aligned_free SYM_FIXUP(aligned_free) -#define crc32_gzip SYM_FIXUP(crc32_gzip) #define deflate_get_compression_level SYM_FIXUP(deflate_get_compression_level) #define _x86_cpu_features SYM_FIXUP(_x86_cpu_features) #define x86_setup_cpu_features SYM_FIXUP(x86_setup_cpu_features) diff --git a/libdeflate/libdeflate.h b/libdeflate/libdeflate.h index df09462..c64e9bf 100644 --- a/libdeflate/libdeflate.h +++ b/libdeflate/libdeflate.h @@ -10,10 +10,11 @@ extern "C" { #endif #define LIBDEFLATE_VERSION_MAJOR 0 -#define LIBDEFLATE_VERSION_MINOR 4 -#define LIBDEFLATE_VERSION_STRING "0.4" +#define LIBDEFLATE_VERSION_MINOR 6 +#define LIBDEFLATE_VERSION_STRING "0.6" #include <stddef.h> +#include <stdint.h> /* * On Windows, if you want to link to the DLL version of libdeflate, then @@ -243,6 +244,28 @@ libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor, LIBDEFLATEAPI void libdeflate_free_decompressor(struct libdeflate_decompressor *decompressor); +/* ========================================================================== */ +/* Checksums */ +/* ========================================================================== */ + +/* + * libdeflate_adler32() updates a running Adler-32 checksum with 'len' bytes of + * data and returns the updated checksum. When starting a new checksum, the + * required initial value for 'adler' is 1. This value is also returned when + * 'buffer' is specified as NULL. + */ +LIBDEFLATEAPI uint32_t +libdeflate_adler32(uint32_t adler, const void *buffer, size_t len); + + +/* + * libdeflate_crc32() updates a running CRC-32 checksum with 'len' bytes of data + * and returns the updated checksum. When starting a new checksum, the required + * initial value for 'crc' is 0. This value is also returned when 'buffer' is + * specified as NULL. + */ +LIBDEFLATEAPI uint32_t +libdeflate_crc32(uint32_t crc, const void *buffer, size_t len); #ifdef __cplusplus } diff --git a/libdeflate/x86_cpu_features.h b/libdeflate/x86_cpu_features.h index 50dd9c7..4221bcc 100644 --- a/libdeflate/x86_cpu_features.h +++ b/libdeflate/x86_cpu_features.h @@ -34,13 +34,13 @@ extern u32 _x86_cpu_features; extern void x86_setup_cpu_features(void); -/* Does the processor have the specified feature? */ +/* Does the processor have the specified feature(s)? */ static inline bool -x86_have_cpu_feature(u32 feature) +x86_have_cpu_features(u32 features) { if (_x86_cpu_features == 0) x86_setup_cpu_features(); - return _x86_cpu_features & feature; + return (_x86_cpu_features & features) == features; } #endif /* X86_CPU_FEATURES_ENABLED */ diff --git a/libdeflate/zlib_compress.c b/libdeflate/zlib_compress.c index af86afa..b4cebaf 100644 --- a/libdeflate/zlib_compress.c +++ b/libdeflate/zlib_compress.c @@ -27,7 +27,6 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#include "adler32.h" #include "deflate_compress.h" #include "unaligned.h" #include "zlib_constants.h" @@ -73,7 +72,7 @@ libdeflate_zlib_compress(struct libdeflate_compressor *c, out_next += deflate_size; /* ADLER32 */ - put_unaligned_be32(adler32(in, in_size), out_next); + put_unaligned_be32(libdeflate_adler32(1, in, in_size), out_next); out_next += 4; return out_next - (u8 *)out; diff --git a/libdeflate/zlib_decompress.c b/libdeflate/zlib_decompress.c index b9ecd08..c5a15ca 100644 --- a/libdeflate/zlib_decompress.c +++ b/libdeflate/zlib_decompress.c @@ -27,7 +27,6 @@ * OTHER DEALINGS IN THE SOFTWARE.
*/ -#include "adler32.h" #include "unaligned.h" #include "zlib_constants.h" @@ -84,7 +83,8 @@ libdeflate_zlib_decompress(struct libdeflate_decompressor *d, in_next = in_end - ZLIB_FOOTER_SIZE; /* ADLER32 */ - if (adler32(out, actual_out_nbytes) != get_unaligned_be32(in_next)) + if (libdeflate_adler32(1, out, actual_out_nbytes) != + get_unaligned_be32(in_next)) return LIBDEFLATE_BAD_DATA; return LIBDEFLATE_SUCCESS;