From 7b217fec9dea4867f22874ada8020f1f7ee8a137 Mon Sep 17 00:00:00 2001 From: Andrea Mazzoleni Date: Sun, 6 Nov 2016 11:04:34 +0100 Subject: [PATCH] Update to latest libdeflate --- HISTORY | 2 +- Makefile.am | 2 - doc/history.1 | 2 +- doc/history.d | 2 +- doc/history.txt | 2 +- libdeflate/NEWS | 18 ++ libdeflate/README.md | 2 +- libdeflate/adler32.c | 29 ++-- libdeflate/adler32.h | 12 -- libdeflate/adler32_impl.h | 6 +- libdeflate/common_defs.h | 15 +- libdeflate/compiler_gcc.h | 65 ++++++-- libdeflate/crc32.c | 127 ++++++++++++-- libdeflate/crc32.h | 12 -- libdeflate/crc32_impl.h | 286 ++++++++++++++++++++++++++++++++ libdeflate/deflate_compress.c | 2 + libdeflate/deflate_compress.h | 2 + libdeflate/deflate_decompress.c | 2 +- libdeflate/gzip_compress.c | 3 +- libdeflate/gzip_decompress.c | 4 +- libdeflate/lib_common.h | 7 +- libdeflate/libdeflate.h | 27 ++- libdeflate/x86_cpu_features.h | 6 +- libdeflate/zlib_compress.c | 3 +- libdeflate/zlib_decompress.c | 4 +- 25 files changed, 544 insertions(+), 98 deletions(-) delete mode 100644 libdeflate/adler32.h delete mode 100644 libdeflate/crc32.h create mode 100644 libdeflate/crc32_impl.h diff --git a/HISTORY b/HISTORY index 5a197ba..ebc3aa0 100644 --- a/HISTORY +++ b/HISTORY @@ -9,7 +9,7 @@ ADVANCECOMP VERSION 1.21 2016/11 * Added libdeflate support. It's the new default because it provides better performance and compression than 7z. From https://github.com/ebiggers/libdeflate - at commit 64dc75786d12cc4df005de50add12e36503f579a. + at commit 28cc14994b8b57f590d31a7340c8fffc5cc37d88. * Update to the latest zopfli library. From https://github.com/google/zopfli at commit 6818a0859063b946094fb6f94732836404a0d89a. diff --git a/Makefile.am b/Makefile.am index a9b902d..d21a8c2 100644 --- a/Makefile.am +++ b/Makefile.am @@ -201,13 +201,11 @@ noinst_HEADERS = \ 7z/RangeCoder.h \ 7z/WindowIn.h \ 7z/WindowOut.h \ - libdeflate/adler32.h \ libdeflate/adler32_impl.h \ libdeflate/aligned_malloc.h \ libdeflate/bt_matchfinder.h \ libdeflate/common_defs.h \ libdeflate/compiler_gcc.h \ - libdeflate/crc32.h \ libdeflate/crc32_table.h \ libdeflate/decompress_impl.h \ libdeflate/deflate_compress.h \ diff --git a/doc/history.1 b/doc/history.1 index d1c8823..d8a90cf 100644 --- a/doc/history.1 +++ b/doc/history.1 @@ -7,7 +7,7 @@ advcomp \- History For AdvanceCOMP Added libdeflate support. It\'s the new default because it provides better performance and compression than 7z. From https://github.com/ebiggers/libdeflate -at commit 64dc75786d12cc4df005de50add12e36503f579a. +at commit 28cc14994b8b57f590d31a7340c8fffc5cc37d88. .IP \(bu Update to the latest zopfli library. From https://github.com/google/zopfli diff --git a/doc/history.d b/doc/history.d index 5009ab2..7de9890 100644 --- a/doc/history.d +++ b/doc/history.d @@ -5,7 +5,7 @@ AdvanceCOMP Version 1.21 2016/11 ) Added libdeflate support. It's the new default because it provides better performance and compression than 7z. From https://github.com/ebiggers/libdeflate - at commit 64dc75786d12cc4df005de50add12e36503f579a. + at commit 28cc14994b8b57f590d31a7340c8fffc5cc37d88. ) Update to the latest zopfli library. From https://github.com/google/zopfli at commit 6818a0859063b946094fb6f94732836404a0d89a. diff --git a/doc/history.txt b/doc/history.txt index 962204b..5588a76 100644 --- a/doc/history.txt +++ b/doc/history.txt @@ -9,7 +9,7 @@ ADVANCECOMP VERSION 1.21 2016/11 * Added libdeflate support. It's the new default because it provides better performance and compression than 7z.
From https://github.com/ebiggers/libdeflate - at commit 64dc75786d12cc4df005de50add12e36503f579a. + at commit 28cc14994b8b57f590d31a7340c8fffc5cc37d88. * Update to the latest zopfli library. From https://github.com/google/zopfli at commit 6818a0859063b946094fb6f94732836404a0d89a. diff --git a/libdeflate/NEWS b/libdeflate/NEWS index 7f1d1fa..9fe9e9f 100644 --- a/libdeflate/NEWS +++ b/libdeflate/NEWS @@ -1,3 +1,21 @@ +Version 0.6: + Various improvements to the gzip program's behavior. + + Faster CRC-32 on AVX-capable processors. + + Other minor changes. + +Version 0.5: + The CRC-32 checksum algorithm has been optimized with carryless + multiplication instructions for x86_64 (PCLMUL). This speeds up gzip + compression and decompression. + + Build fixes for certain platforms and compilers. + + Added more test programs and scripts. + + libdeflate is now entirely MIT-licensed. + Version 0.4: The Adler-32 checksum algorithm has been optimized with vector instructions for x86_64 (SSE2 and AVX2) and ARM (NEON). This speeds up diff --git a/libdeflate/README.md b/libdeflate/README.md index e995116..87b67b5 100644 --- a/libdeflate/README.md +++ b/libdeflate/README.md @@ -18,7 +18,7 @@ libdeflate itself is a library, but the following command-line programs which use this library are also provided: * gzip (or gunzip), a program which mostly behaves like the standard equivalent, - except that it does not yet support reading from standard input and does not + except that it does not yet have good streaming support and therefore does not yet support very large files * benchmark, a program for benchmarking in-memory compression and decompression diff --git a/libdeflate/adler32.c b/libdeflate/adler32.c index 5f2c667..2148802 100644 --- a/libdeflate/adler32.c +++ b/libdeflate/adler32.c @@ -27,9 +27,10 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#include "adler32.h" #include "x86_cpu_features.h" +#include "libdeflate.h" + /* The Adler-32 divisor, or "base", value. */ #define DIVISOR 65521 @@ -73,7 +74,8 @@ /* Include the AVX2 implementation? */ #define NEED_AVX2_IMPL 0 #if defined(__AVX2__) || \ - (X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_AVX2_TARGET) + (X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_AVX2_TARGET && \ + COMPILER_SUPPORTS_TARGET_INTRINSICS) # include <immintrin.h> # undef NEED_AVX2_IMPL # define NEED_AVX2_IMPL 1 @@ -101,10 +103,10 @@ /* Define the generic implementation if needed.
*/ #if NEED_GENERIC_IMPL -static u32 adler32_generic(const void *buffer, size_t size) +static u32 adler32_generic(u32 adler, const void *buffer, size_t size) { - u32 s1 = 1; - u32 s2 = 0; + u32 s1 = adler & 0xFFFF; + u32 s2 = adler >> 16; const u8 *p = buffer; const u8 * const end = p + size; @@ -177,7 +179,7 @@ static u32 adler32_generic(const void *buffer, size_t size) # include "adler32_impl.h" #endif -typedef u32 (*adler32_func_t)(const void *, size_t); +typedef u32 (*adler32_func_t)(u32, const void *, size_t); /* * If multiple implementations are available, then dispatch among them based on @@ -186,23 +188,26 @@ typedef u32 (*adler32_func_t)(const void *, size_t); #if NUM_IMPLS == 1 # define adler32_impl DEFAULT_IMPL #else -static u32 dispatch(const void *, size_t); +static u32 dispatch(u32, const void *, size_t); static adler32_func_t adler32_impl = dispatch; -static u32 dispatch(const void *buffer, size_t size) +static u32 dispatch(u32 adler, const void *buffer, size_t size) { adler32_func_t f = DEFAULT_IMPL; #if NEED_AVX2_IMPL && !defined(__AVX2__) - if (x86_have_cpu_feature(X86_CPU_FEATURE_AVX2)) + if (x86_have_cpu_features(X86_CPU_FEATURE_AVX2)) f = adler32_avx2; #endif adler32_impl = f; - return adler32_impl(buffer, size); + return adler32_impl(adler, buffer, size); } #endif /* NUM_IMPLS != 1 */ -u32 adler32(const void *buffer, size_t size) +LIBDEFLATEAPI u32 +libdeflate_adler32(u32 adler, const void *buffer, size_t size) { - return adler32_impl(buffer, size); + if (buffer == NULL) /* return initial value */ + return 1; + return adler32_impl(adler, buffer, size); } diff --git a/libdeflate/adler32.h b/libdeflate/adler32.h deleted file mode 100644 index 73f0260..0000000 --- a/libdeflate/adler32.h +++ /dev/null @@ -1,12 +0,0 @@ -/* - * adler32.h - Adler-32 checksum algorithm - */ - -#ifndef LIB_ADLER32_H -#define LIB_ADLER32_H - -#include "lib_common.h" - -extern u32 adler32(const void *buffer, size_t size); - -#endif /* LIB_ADLER32_H */ diff --git a/libdeflate/adler32_impl.h b/libdeflate/adler32_impl.h index 0a9e2d7..78e0ae3 100644 --- a/libdeflate/adler32_impl.h +++ b/libdeflate/adler32_impl.h @@ -62,10 +62,10 @@ */ static u32 ATTRIBUTES -FUNCNAME(const void *buffer, size_t size) +FUNCNAME(u32 adler, const void *buffer, size_t size) { - u32 s1 = 1; - u32 s2 = 0; + u32 s1 = adler & 0xFFFF; + u32 s2 = adler >> 16; const u8 *p = buffer; const u8 * const end = p + size; const u8 *vend; diff --git a/libdeflate/common_defs.h b/libdeflate/common_defs.h index 7ab1702..677f97e 100644 --- a/libdeflate/common_defs.h +++ b/libdeflate/common_defs.h @@ -121,12 +121,21 @@ typedef size_t machine_word_t; # define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 0 #endif -/* Does the compiler support __attribute__((target("bmi2")))? */ +/* Are target-specific intrinsics supported in 'target' attribute functions? */ +#ifndef COMPILER_SUPPORTS_TARGET_INTRINSICS +# define COMPILER_SUPPORTS_TARGET_INTRINSICS 0 +#endif + +/* Which targets are supported with the 'target' function attribute? */ +#ifndef COMPILER_SUPPORTS_PCLMUL_TARGET +# define COMPILER_SUPPORTS_PCLMUL_TARGET 0 +#endif #ifndef COMPILER_SUPPORTS_BMI2_TARGET # define COMPILER_SUPPORTS_BMI2_TARGET 0 #endif - -/* Does the compiler support __attribute__((target("avx2")))? 
*/ +#ifndef COMPILER_SUPPORTS_AVX_TARGET +# define COMPILER_SUPPORTS_AVX_TARGET 0 +#endif #ifndef COMPILER_SUPPORTS_AVX2_TARGET # define COMPILER_SUPPORTS_AVX2_TARGET 0 #endif diff --git a/libdeflate/compiler_gcc.h b/libdeflate/compiler_gcc.h index 917de16..3e2d7f5 100644 --- a/libdeflate/compiler_gcc.h +++ b/libdeflate/compiler_gcc.h @@ -3,10 +3,24 @@ * handles clang and the Intel C Compiler. */ -#define GCC_PREREQ(major, minor) \ - (!defined(__clang__) && !defined(__INTEL_COMPILER) && \ - (__GNUC__ > (major) || \ - (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))) +#define GCC_PREREQ(major, minor) \ + (__GNUC__ > (major) || \ + (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor))) + +/* Note: only check the clang version when absolutely necessary! + * "Vendors" such as Apple can use different version numbers. */ +#ifdef __clang__ +# ifdef __apple_build_version__ +# define CLANG_PREREQ(major, minor, apple_version) \ + (__apple_build_version__ >= (apple_version)) +# else +# define CLANG_PREREQ(major, minor, apple_version) \ + (__clang_major__ > (major) || \ + (__clang_major__ == (major) && __clang_minor__ >= (minor))) +# endif +#else +# define CLANG_PREREQ(major, minor, apple_version) 0 +#endif #ifndef __has_attribute # define __has_attribute(attribute) 0 @@ -33,20 +47,39 @@ #define prefetchw(addr) __builtin_prefetch((addr), 1) #define _aligned_attribute(n) __attribute__((aligned(n))) -#define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE \ - (GCC_PREREQ(4, 4) || __has_attribute(target)) - -#define COMPILER_SUPPORTS_BMI2_TARGET \ - (COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE && \ - (GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_pdep_di))) - /* - * Note: AVX2 support was added in gcc 4.7, but AVX2 intrinsics don't work in - * __attribute__((target("avx2"))) functions until gcc 4.9. + * Support for the following x86 instruction set extensions was introduced by + * the following gcc versions: + * + * PCLMUL 4.4 + * AVX 4.6 + * BMI2 4.7 + * AVX2 4.7 + * + * With clang, __has_builtin() can be used to detect the presence of one of the + * associated builtins. + * + * Additionally, gcc 4.4 introduced the 'target' function attribute. With + * clang, support for this can be detected with __has_attribute(target). + * + * However, prior to gcc 4.9 and clang 3.8, x86 intrinsics not available in the + * main target could not be used in 'target' attribute functions. Unfortunately + * clang has no feature test macro for this, so we have to check its version. */ -#define COMPILER_SUPPORTS_AVX2_TARGET \ - (COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE && \ - (GCC_PREREQ(4, 9) || __has_builtin(__builtin_ia32_pmaddwd256))) +#define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE \ + (GCC_PREREQ(4, 4) || __has_attribute(target)) +#if COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE +# define COMPILER_SUPPORTS_TARGET_INTRINSICS \ + (GCC_PREREQ(4, 9) || CLANG_PREREQ(3, 8, 7030000)) +# define COMPILER_SUPPORTS_PCLMUL_TARGET \ + (GCC_PREREQ(4, 4) || __has_builtin(__builtin_ia32_pclmulqdq128)) +# define COMPILER_SUPPORTS_AVX_TARGET \ + (GCC_PREREQ(4, 6) || __has_builtin(__builtin_ia32_maxps256)) +# define COMPILER_SUPPORTS_BMI2_TARGET \ + (GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_pdep_di)) +# define COMPILER_SUPPORTS_AVX2_TARGET \ + (GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_pmaddwd256)) +#endif /* Newer gcc supports __BYTE_ORDER__. Older gcc doesn't.
*/ #ifdef __BYTE_ORDER__ diff --git a/libdeflate/crc32.c b/libdeflate/crc32.c index efc97e6..5c3f8ba 100644 --- a/libdeflate/crc32.c +++ b/libdeflate/crc32.c @@ -31,7 +31,7 @@ * High-level description of CRC * ============================= * - * Consider a bit sequence 'bits[1...len]'. Interpet 'bits' as the "message" + * Consider a bit sequence 'bits[1...len]'. Interpret 'bits' as the "message" * polynomial M(x) with coefficients in GF(2) (the field of integers modulo 2), * where the coefficient of 'x^i' is 'bits[len - i]'. Then, compute: * @@ -39,7 +39,7 @@ * * where G(x) is a selected "generator" polynomial of degree 'n'. The remainder * R(x) is a polynomial of max degree 'n - 1'. The CRC of 'bits' is R(x) - * interpeted as a bitstring of length 'n'. + * interpreted as a bitstring of length 'n'. * * CRC used in gzip * ================ @@ -166,11 +166,62 @@ * * In crc32_slice8(), this method is extended to 8 bytes at a time. The * intermediate remainder (which we never actually store explicitly) is 96 bits. + * + * On CPUs that support fast carryless multiplication, CRCs can be computed even + * more quickly via "folding". See crc32_pclmul() for an example. + */ + +#include "x86_cpu_features.h" + +#include "libdeflate.h" + +/* Select the implementations to compile in. */ + +#define NEED_GENERIC_IMPL 1 /* include generic impl unless overridden */ +#define DEFAULT_IMPL crc32_slice8 + +/* Include the PCLMUL implementation? */ +#define NEED_PCLMUL_IMPL 0 +#if defined(__PCLMUL__) || \ + (X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_PCLMUL_TARGET && \ + COMPILER_SUPPORTS_TARGET_INTRINSICS) +# include <wmmintrin.h> +# undef NEED_PCLMUL_IMPL +# define NEED_PCLMUL_IMPL 1 +# ifdef __PCLMUL__ /* compiling for PCLMUL, i.e. can we assume it's there? */ +# undef NEED_GENERIC_IMPL +# define NEED_GENERIC_IMPL 0 /* generic impl not needed */ +# undef DEFAULT_IMPL +# define DEFAULT_IMPL crc32_pclmul +# endif /* otherwise, we can build a PCLMUL version, but we won't know whether we can use it until runtime */ +#endif + +/* + * Include the PCLMUL/AVX implementation? Although our PCLMUL-optimized CRC-32 + * function doesn't use any AVX intrinsics specifically, it can benefit a lot + * from being compiled for an AVX target: on Skylake, ~16700 MB/s vs. ~10100 + * MB/s. I expect this is related to the PCLMULQDQ instructions being assembled + * in the newer three-operand form rather than the older two-operand form. + * + * Note: this is only needed if __AVX__ is *not* defined, since otherwise the + * "regular" PCLMUL implementation would already be AVX enabled.
*/ +#define NEED_PCLMUL_AVX_IMPL 0 +#if NEED_PCLMUL_IMPL && !defined(__AVX__) && \ + X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_AVX_TARGET +# undef NEED_PCLMUL_AVX_IMPL +# define NEED_PCLMUL_AVX_IMPL 1 +#endif -#define CRC32_SLICE8 +#define NUM_IMPLS (NEED_GENERIC_IMPL + NEED_PCLMUL_IMPL + NEED_PCLMUL_AVX_IMPL) -#include "crc32.h" +/* Define the CRC-32 table */ +#if NEED_GENERIC_IMPL +# define CRC32_SLICE8 +#else +# define CRC32_SLICE1 /* only need short table for unaligned ends */ +#endif #include "crc32_table.h" static forceinline u32 @@ -179,7 +230,7 @@ crc32_update_byte(u32 remainder, u8 next_byte) return (remainder >> 8) ^ crc32_table[(u8)remainder ^ next_byte]; } -#ifdef CRC32_SLICE1 +#if defined(CRC32_SLICE1) || (NUM_IMPLS > NEED_GENERIC_IMPL) static u32 crc32_slice1(u32 remainder, const u8 *buffer, size_t nbytes) { @@ -258,18 +309,60 @@ crc32_slice8(u32 remainder, const u8 *buffer, size_t nbytes) } #endif -u32 -crc32_gzip(const void *buffer, size_t nbytes) -{ - u32 remainder = ~0; -#if defined(CRC32_SLICE1) - remainder = crc32_slice1(remainder, buffer, nbytes); -#elif defined(CRC32_SLICE4) - remainder = crc32_slice4(remainder, buffer, nbytes); -#elif defined(CRC32_SLICE8) - remainder = crc32_slice8(remainder, buffer, nbytes); +/* Define the PCLMUL implementation if needed. */ +#if NEED_PCLMUL_IMPL +# define FUNCNAME crc32_pclmul +# define FUNCNAME_ALIGNED crc32_pclmul_aligned +# ifdef __PCLMUL__ +# define ATTRIBUTES +# else +# define ATTRIBUTES __attribute__((target("pclmul"))) +# endif +# include "crc32_impl.h" +#endif + +/* Define the PCLMUL/AVX implementation if needed. */ +#if NEED_PCLMUL_AVX_IMPL +# define FUNCNAME crc32_pclmul_avx +# define FUNCNAME_ALIGNED crc32_pclmul_avx_aligned +# define ATTRIBUTES __attribute__((target("pclmul,avx"))) +# include "crc32_impl.h" +#endif + +typedef u32 (*crc32_func_t)(u32, const u8 *, size_t); + +/* + * If multiple implementations are available, then dispatch among them based on + * CPU features at runtime. Otherwise just call the single one directly. + */ +#if NUM_IMPLS == 1 +# define crc32_impl DEFAULT_IMPL #else -# error "don't know which CRC-32 implementation to use!" 
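The '#else' branch added just below replaces the old '#error' with dispatch at runtime: a function pointer starts out aimed at a one-shot resolver, which probes the CPU once, caches the chosen implementation back into the pointer, and forwards the first call. Here is a minimal sketch of that idiom, using hypothetical stand-in names (work_generic, work_fast, cpu_has_fast_path), not libdeflate's own:

#include <stdbool.h>
#include <stddef.h>

typedef unsigned (*work_func_t)(const unsigned char *, size_t);

/* Portable fallback; stands in for crc32_slice8(). */
static unsigned work_generic(const unsigned char *p, size_t n)
{
    unsigned acc = 0;
    while (n--)
        acc += *p++;
    return acc;
}

/* Optimized variant, only safe if the CPU supports the needed extension;
 * stands in for crc32_pclmul(). The body here is just a placeholder. */
static unsigned work_fast(const unsigned char *p, size_t n)
{
    return work_generic(p, n);
}

/* Stands in for x86_have_cpu_features(...). */
static bool cpu_has_fast_path(void)
{
    return false;
}

static unsigned resolve(const unsigned char *, size_t);

/* Every caller goes through this pointer; it starts at the resolver. */
static work_func_t work_impl = resolve;

static unsigned resolve(const unsigned char *p, size_t n)
{
    work_func_t f = work_generic;

    if (cpu_has_fast_path())
        f = work_fast;
    work_impl = f;  /* rebind so later calls skip the CPU check */
    return f(p, n);
}

The store into the pointer is idempotent, so concurrent first calls race benignly; the dispatch() functions added by this patch appear to rely on the same property.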
+static u32 dispatch(u32, const u8 *, size_t); + +static crc32_func_t crc32_impl = dispatch; + +static u32 dispatch(u32 remainder, const u8 *buffer, size_t nbytes) +{ + crc32_func_t f = DEFAULT_IMPL; +#if NEED_PCLMUL_IMPL && !defined(__PCLMUL__) + if (x86_have_cpu_features(X86_CPU_FEATURE_PCLMULQDQ)) + f = crc32_pclmul; +#endif +#if NEED_PCLMUL_AVX_IMPL + if (x86_have_cpu_features(X86_CPU_FEATURE_PCLMULQDQ | + X86_CPU_FEATURE_AVX)) + f = crc32_pclmul_avx; #endif - return ~remainder; + crc32_impl = f; + return crc32_impl(remainder, buffer, nbytes); +} +#endif /* NUM_IMPLS != 1 */ + +LIBDEFLATEAPI u32 +libdeflate_crc32(u32 remainder, const void *buffer, size_t nbytes) +{ + if (buffer == NULL) /* return initial value */ + return 0; + return ~crc32_impl(~remainder, buffer, nbytes); } diff --git a/libdeflate/crc32.h b/libdeflate/crc32.h deleted file mode 100644 index 2b9e957..0000000 --- a/libdeflate/crc32.h +++ /dev/null @@ -1,12 +0,0 @@ -/* - * crc32.h - CRC-32 checksum algorithm for the gzip format - */ - -#ifndef LIB_CRC32_H -#define LIB_CRC32_H - -#include "lib_common.h" - -extern u32 crc32_gzip(const void *buffer, size_t size); - -#endif /* LIB_CRC32_H */ diff --git a/libdeflate/crc32_impl.h b/libdeflate/crc32_impl.h new file mode 100644 index 0000000..625bc18 --- /dev/null +++ b/libdeflate/crc32_impl.h @@ -0,0 +1,286 @@ +/* + * crc32_impl.h + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * CRC-32 folding with PCLMULQDQ. + * + * The basic idea is to repeatedly "fold" each 512 bits into the next 512 bits, + * producing an abbreviated message which is congruent to the original message + * modulo the generator polynomial G(x). + * + * Folding each 512 bits is implemented as eight 64-bit folds, each of which + * uses one carryless multiplication instruction. It's expected that CPUs may + * be able to execute some of these multiplications in parallel. + * + * Explanation of "folding": let A(x) be 64 bits from the message, and let B(x) + * be 95 bits from a constant distance D later in the message. The relevant + * portion of the message can be written as: + * + * M(x) = A(x)*x^D + B(x) + * + * ... where + and * represent addition and multiplication, respectively, of + * polynomials over GF(2). Note that when implemented on a computer, these + * operations are equivalent to XOR and carryless multiplication, respectively.
+ * + * For the purpose of CRC calculation, only the remainder modulo the generator + * polynomial G(x) matters: + * + * M(x) mod G(x) = (A(x)*x^D + B(x)) mod G(x) + * + * Since the modulo operation can be applied anywhere in a sequence of additions + * and multiplications without affecting the result, this is equivalent to: + * + * M(x) mod G(x) = (A(x)*(x^D mod G(x)) + B(x)) mod G(x) + * + * For any D, 'x^D mod G(x)' will be a polynomial with maximum degree 31, i.e. + * a 32-bit quantity. So 'A(x) * (x^D mod G(x))' is equivalent to a carryless + * multiplication of a 64-bit quantity by a 32-bit quantity, producing a 95-bit + * product. Then, adding (XOR-ing) the product to B(x) produces a polynomial + * with the same length as B(x) but with the same remainder as 'A(x)*x^D + + * B(x)'. This is the basic fold operation with 64 bits. + * + * Note that the carryless multiplication instruction PCLMULQDQ actually takes + * two 64-bit inputs and produces a 127-bit product in the low-order bits of a + * 128-bit XMM register. This works fine, but care must be taken to account for + * "bit endianness". With the CRC version implemented here, bits are always + * ordered such that the lowest-order bit represents the coefficient of highest + * power of x and the highest-order bit represents the coefficient of the lowest + * power of x. This is backwards from the more intuitive order. Still, + * carryless multiplication works essentially the same either way. It just must + * be accounted for that when we XOR the 95-bit product in the low-order 95 bits + * of a 128-bit XMM register into 128 bits of later data held in another XMM + * register, we'll really be XOR-ing the product into the mathematically higher + * degree end of those later bits, not the lower degree end as may be expected. + * + * So given that caveat and the fact that we process 512 bits per iteration, the + * 'D' values we need for the two 64-bit halves of each 128 bits of data are: + * + * D = (512 + 95) - 64 for the higher-degree half of each 128 bits, + * i.e. the lower order bits in the XMM register + * + * D = (512 + 95) - 128 for the lower-degree half of each 128 bits, + * i.e. the higher order bits in the XMM register + * + * The required 'x^D mod G(x)' values were precomputed. + * + * When <= 512 bits remain in the message, we finish up by folding across + * smaller distances. This works similarly; the distance D is just different, + * so different constant multipliers must be used. Finally, once the remaining + * message is just 64 bits, it is reduced to the CRC-32 using Barrett + * reduction (explained later). + * + * For more information see the original paper from Intel: + * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" + * December 2009 + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf + */ +static u32 ATTRIBUTES +FUNCNAME_ALIGNED(u32 remainder, const __m128i *p, size_t vec_count) +{ + /* Constants precomputed by gen_crc32_multipliers.c. Do not edit!
*/ + const __v2di multipliers_4 = (__v2di){ 0x8F352D95, 0x1D9513D7 }; + const __v2di multipliers_2 = (__v2di){ 0xF1DA05AA, 0x81256527 }; + const __v2di multipliers_1 = (__v2di){ 0xAE689191, 0xCCAA009E }; + const __v2di final_multiplier = (__v2di){ 0xB8BC6765 }; + const __m128i mask32 = (__m128i)(__v4si){ 0xFFFFFFFF }; + const __v2di barrett_reduction_constants = + (__v2di){ 0x00000001F7011641, 0x00000001DB710641 }; + + const __m128i * const end = p + vec_count; + const __m128i * const end512 = p + (vec_count & ~3); + __m128i x0, x1, x2, x3; + + /* + * Account for the current 'remainder', i.e. the CRC of the part of the + * message already processed. Explanation: rewrite the message + * polynomial M(x) in terms of the first part A(x), the second part + * B(x), and the length of the second part in bits |B(x)| >= 32: + * + * M(x) = A(x)*x^|B(x)| + B(x) + * + * Then the CRC of M(x) is: + * + * CRC(M(x)) = CRC(A(x)*x^|B(x)| + B(x)) + * = CRC(A(x)*x^32*x^(|B(x)| - 32) + B(x)) + * = CRC(CRC(A(x))*x^(|B(x)| - 32) + B(x)) + * + * Note: all arithmetic is modulo G(x), the generator polynomial; that's + * why A(x)*x^32 can be replaced with CRC(A(x)) = A(x)*x^32 mod G(x). + * + * So the CRC of the full message is the CRC of the second part of the + * message where the first 32 bits of the second part of the message + * have been XOR'ed with the CRC of the first part of the message. + */ + x0 = *p++; + x0 ^= (__m128i)(__v4si){ remainder }; + + if (p > end512) /* only 128, 256, or 384 bits of input? */ + goto _128_bits_at_a_time; + x1 = *p++; + x2 = *p++; + x3 = *p++; + + /* Fold 512 bits at a time */ + for (; p != end512; p += 4) { + __m128i y0, y1, y2, y3; + + y0 = p[0]; + y1 = p[1]; + y2 = p[2]; + y3 = p[3]; + + /* + * Note: the immediate constant for PCLMULQDQ specifies which + * 64-bit halves of the 128-bit vectors to multiply: + * + * 0x00 means low halves (higher degree polynomial terms for us) + * 0x11 means high halves (lower degree polynomial terms for us) + */ + y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x00); + y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x00); + y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x00); + y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x00); + y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x11); + y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x11); + y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x11); + y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x11); + + x0 = y0; + x1 = y1; + x2 = y2; + x3 = y3; + } + + /* Fold 512 bits => 128 bits */ + x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x00); + x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x00); + x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x11); + x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x11); + x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x00); + x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x11); + x0 = x3; + +_128_bits_at_a_time: + while (p != end) { + /* Fold 128 bits into next 128 bits */ + x1 = *p++; + x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x00); + x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x11); + x0 = x1; + } + + /* Now there are just 128 bits left, stored in 'x0'. */ + + /* + * Fold 128 => 96 bits. This also implicitly appends 32 zero bits, + * which is equivalent to multiplying by x^32. This is needed because + * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x). 
+ */ + x0 = _mm_srli_si128(x0, 8) ^ + _mm_clmulepi64_si128(x0, multipliers_1, 0x10); + + /* Fold 96 => 64 bits */ + x0 = _mm_srli_si128(x0, 4) ^ + _mm_clmulepi64_si128(x0 & mask32, final_multiplier, 0x00); + + /* + * Finally, reduce 64 => 32 bits using Barrett reduction. + * + * Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to + * compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)): + * + * R(x) = (A(x)*x^32 + B(x)) mod G(x) + * = (A(x)*x^32) mod G(x) + B(x) + * + * Then, by the Division Algorithm there exists a unique q(x) such that: + * + * A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x) + * + * Since the left-hand side is of maximum degree 31, the right-hand side + * must be too. This implies that we can apply 'mod x^32' to the + * right-hand side without changing its value: + * + * (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32 + * + * Note that '+' is equivalent to '-' in polynomials over GF(2). + * + * We also know that: + * + * / A(x)*x^32 \ + * q(x) = floor ( --------- ) + * \ G(x) / + * + * To compute this efficiently, we can multiply the top and bottom by + * x^32 and move the division by G(x) to the top: + * + * / A(x) * floor(x^64 / G(x)) \ + * q(x) = floor ( ------------------------- ) + * \ x^32 / + * + * Note that floor(x^64 / G(x)) is a constant. + * + * So finally we have: + * + * / A(x) * floor(x^64 / G(x)) \ + * R(x) = B(x) + G(x)*floor ( ------------------------- ) + * \ x^32 / + */ + x1 = x0; + x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x00); + x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x10); + return _mm_cvtsi128_si32(_mm_srli_si128(x0 ^ x1, 4)); +} + +/* + * Fast CRC-32 implementation for x86_64 processors that have the carryless + * multiplication extension (PCLMUL). + * + * Note: on unaligned ends of the buffer, we fall back to crc32_slice1() instead + * of crc32_slice8() because only a few bytes need to be processed, so a smaller + * table is preferable. + */ +static u32 ATTRIBUTES +FUNCNAME(u32 remainder, const u8 *buffer, size_t nbytes) +{ + if ((uintptr_t)buffer & 15) { + size_t n = MIN(nbytes, -(uintptr_t)buffer & 15); + remainder = crc32_slice1(remainder, buffer, n); + buffer += n; + nbytes -= n; + } + if (nbytes >= 16) { + remainder = FUNCNAME_ALIGNED(remainder, (const __m128i *)buffer, + nbytes / 16); + buffer += nbytes & ~15; + nbytes &= 15; + } + return crc32_slice1(remainder, buffer, nbytes); +} + +#undef FUNCNAME +#undef FUNCNAME_ALIGNED +#undef ATTRIBUTES diff --git a/libdeflate/deflate_compress.c b/libdeflate/deflate_compress.c index 255a6cd..a1e6c74 100644 --- a/libdeflate/deflate_compress.c +++ b/libdeflate/deflate_compress.c @@ -2787,6 +2787,8 @@ libdeflate_deflate_compress(struct libdeflate_compressor *c, if (unlikely(in_nbytes < 16)) { struct deflate_output_bitstream os; deflate_init_output(&os, out, out_nbytes_avail); + if (in_nbytes == 0) + in = &os; /* Avoid passing NULL to memcpy() */ deflate_write_uncompressed_block(&os, in, in_nbytes, true); return deflate_flush_output(&os); } diff --git a/libdeflate/deflate_compress.h b/libdeflate/deflate_compress.h index 1491e00..f4bb23b 100644 --- a/libdeflate/deflate_compress.h +++ b/libdeflate/deflate_compress.h @@ -1,6 +1,8 @@ #ifndef LIB_DEFLATE_COMPRESS_H #define LIB_DEFLATE_COMPRESS_H +#include "lib_common.h" + /* DEFLATE compression is private to deflate_compress.c, but we do need to be * able to query the compression level for zlib and gzip header generation. 
*/ diff --git a/libdeflate/deflate_decompress.c b/libdeflate/deflate_decompress.c index 8fcbd8a..8284fc8 100644 --- a/libdeflate/deflate_decompress.c +++ b/libdeflate/deflate_decompress.c @@ -842,7 +842,7 @@ dispatch(struct libdeflate_decompressor * restrict d, { decompress_func_t f = deflate_decompress_default; #if X86_CPU_FEATURES_ENABLED - if (x86_have_cpu_feature(X86_CPU_FEATURE_BMI2)) + if (x86_have_cpu_features(X86_CPU_FEATURE_BMI2)) f = deflate_decompress_bmi2; #endif decompress_impl = f; diff --git a/libdeflate/gzip_compress.c b/libdeflate/gzip_compress.c index 2f52dda..bfc75e2 100644 --- a/libdeflate/gzip_compress.c +++ b/libdeflate/gzip_compress.c @@ -27,7 +27,6 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#include "crc32.h" #include "deflate_compress.h" #include "gzip_constants.h" #include "unaligned.h" @@ -77,7 +76,7 @@ libdeflate_gzip_compress(struct libdeflate_compressor *c, out_next += deflate_size; /* CRC32 */ - put_unaligned_le32(crc32_gzip(in, in_size), out_next); + put_unaligned_le32(libdeflate_crc32(0, in, in_size), out_next); out_next += 4; /* ISIZE */ diff --git a/libdeflate/gzip_decompress.c b/libdeflate/gzip_decompress.c index 467de32..e3ce3d7 100644 --- a/libdeflate/gzip_decompress.c +++ b/libdeflate/gzip_decompress.c @@ -27,7 +27,6 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#include "crc32.h" #include "gzip_constants.h" #include "unaligned.h" @@ -118,7 +117,8 @@ libdeflate_gzip_decompress(struct libdeflate_decompressor *d, in_next = in_end - GZIP_FOOTER_SIZE; /* CRC32 */ - if (crc32_gzip(out, actual_out_nbytes) != get_unaligned_le32(in_next)) + if (libdeflate_crc32(0, out, actual_out_nbytes) != + get_unaligned_le32(in_next)) return LIBDEFLATE_BAD_DATA; in_next += 4; diff --git a/libdeflate/lib_common.h b/libdeflate/lib_common.h index b041ac9..149f1df 100644 --- a/libdeflate/lib_common.h +++ b/libdeflate/lib_common.h @@ -5,6 +5,11 @@ #ifndef LIB_LIB_COMMON_H #define LIB_LIB_COMMON_H +#ifdef LIBDEFLATE_H +# error "lib_common.h must always be included before libdeflate.h" + /* because BUILDING_LIBDEFLATE must be set first */ +#endif + #define BUILDING_LIBDEFLATE #include "common_defs.h" @@ -21,10 +26,8 @@ * shared library, since these symbols are not exported. 
*/ #define SYM_FIXUP(sym) _libdeflate_##sym -#define adler32 SYM_FIXUP(adler32) #define aligned_malloc SYM_FIXUP(aligned_malloc) #define aligned_free SYM_FIXUP(aligned_free) -#define crc32_gzip SYM_FIXUP(crc32_gzip) #define deflate_get_compression_level SYM_FIXUP(deflate_get_compression_level) #define _x86_cpu_features SYM_FIXUP(_x86_cpu_features) #define x86_setup_cpu_features SYM_FIXUP(x86_setup_cpu_features) diff --git a/libdeflate/libdeflate.h b/libdeflate/libdeflate.h index df09462..c64e9bf 100644 --- a/libdeflate/libdeflate.h +++ b/libdeflate/libdeflate.h @@ -10,10 +10,11 @@ extern "C" { #endif #define LIBDEFLATE_VERSION_MAJOR 0 -#define LIBDEFLATE_VERSION_MINOR 4 -#define LIBDEFLATE_VERSION_STRING "0.4" +#define LIBDEFLATE_VERSION_MINOR 6 +#define LIBDEFLATE_VERSION_STRING "0.6" #include <stddef.h> +#include <stdint.h> /* * On Windows, if you want to link to the DLL version of libdeflate, then @@ -243,6 +244,28 @@ libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor, LIBDEFLATEAPI void libdeflate_free_decompressor(struct libdeflate_decompressor *decompressor); +/* ========================================================================== */ +/* Checksums */ +/* ========================================================================== */ + +/* + * libdeflate_adler32() updates a running Adler-32 checksum with 'len' bytes of + * data and returns the updated checksum. When starting a new checksum, the + * required initial value for 'adler' is 1. This value is also returned when + * 'buffer' is specified as NULL. + */ +LIBDEFLATEAPI uint32_t +libdeflate_adler32(uint32_t adler, const void *buffer, size_t len); + + +/* + * libdeflate_crc32() updates a running CRC-32 checksum with 'len' bytes of data + * and returns the updated checksum. When starting a new checksum, the required + * initial value for 'crc' is 0. This value is also returned when 'buffer' is + * specified as NULL. + */ +LIBDEFLATEAPI uint32_t +libdeflate_crc32(uint32_t crc, const void *buffer, size_t len); #ifdef __cplusplus } diff --git a/libdeflate/x86_cpu_features.h b/libdeflate/x86_cpu_features.h index 50dd9c7..4221bcc 100644 --- a/libdeflate/x86_cpu_features.h +++ b/libdeflate/x86_cpu_features.h @@ -34,13 +34,13 @@ extern u32 _x86_cpu_features; extern void x86_setup_cpu_features(void); -/* Does the processor have the specified feature? */ +/* Does the processor have the specified feature(s)? */ static inline bool -x86_have_cpu_feature(u32 feature) +x86_have_cpu_features(u32 features) { if (_x86_cpu_features == 0) x86_setup_cpu_features(); - return _x86_cpu_features & feature; + return (_x86_cpu_features & features) == features; } #endif /* X86_CPU_FEATURES_ENABLED */ diff --git a/libdeflate/zlib_compress.c b/libdeflate/zlib_compress.c index af86afa..b4cebaf 100644 --- a/libdeflate/zlib_compress.c +++ b/libdeflate/zlib_compress.c @@ -27,7 +27,6 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -#include "adler32.h" #include "deflate_compress.h" #include "unaligned.h" #include "zlib_constants.h" @@ -73,7 +72,7 @@ libdeflate_zlib_compress(struct libdeflate_compressor *c, out_next += deflate_size; /* ADLER32 */ - put_unaligned_be32(adler32(in, in_size), out_next); + put_unaligned_be32(libdeflate_adler32(1, in, in_size), out_next); out_next += 4; return out_next - (u8 *)out; diff --git a/libdeflate/zlib_decompress.c b/libdeflate/zlib_decompress.c index b9ecd08..c5a15ca 100644 --- a/libdeflate/zlib_decompress.c +++ b/libdeflate/zlib_decompress.c @@ -27,7 +27,6 @@ * OTHER DEALINGS IN THE SOFTWARE.
*/ -#include "adler32.h" #include "unaligned.h" #include "zlib_constants.h" @@ -84,7 +83,8 @@ libdeflate_zlib_decompress(struct libdeflate_decompressor *d, in_next = in_end - ZLIB_FOOTER_SIZE; /* ADLER32 */ - if (adler32(out, actual_out_nbytes) != get_unaligned_be32(in_next)) + if (libdeflate_adler32(1, out, actual_out_nbytes) != + get_unaligned_be32(in_next)) return LIBDEFLATE_BAD_DATA; return LIBDEFLATE_SUCCESS;