diff --git a/go.mod b/go.mod
index 34b96dcb5aa..8d74c1afc97 100644
--- a/go.mod
+++ b/go.mod
@@ -9,15 +9,15 @@ require (
 	github.com/Soontao/goHttpDigestClient v0.0.0-20170320082612-6d28bb1415c5
 	github.com/andybalholm/brotli v1.0.4
 	github.com/dop251/goja v0.0.0-20220214123719-b09a6bfa842f
-	github.com/fatih/color v1.12.0
+	github.com/fatih/color v1.13.0
 	github.com/golang/protobuf v1.5.2
 	github.com/gorilla/websocket v1.5.0
 	github.com/influxdata/influxdb1-client v0.0.0-20190402204710-8ff2fc3824fc
 	github.com/jhump/protoreflect v1.12.0
-	github.com/klauspost/compress v1.13.6
+	github.com/klauspost/compress v1.15.1
 	github.com/mailru/easyjson v0.7.7
-	github.com/mattn/go-colorable v0.1.8
-	github.com/mattn/go-isatty v0.0.13
+	github.com/mattn/go-colorable v0.1.12
+	github.com/mattn/go-isatty v0.0.14
 	github.com/mccutchen/go-httpbin v1.1.2-0.20190116014521-c5cb2f4802fa
 	github.com/nu7hatch/gouuid v0.0.0-20131221200532-179d4d0c4d8d
 	github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c
diff --git a/go.sum b/go.sum
index 28e75502240..0b84cfc4318 100644
--- a/go.sum
+++ b/go.sum
@@ -40,8 +40,8 @@ github.com/envoyproxy/go-control-plane v0.9.9-0.20201210154907-fd9021fe5dad/go.m
 github.com/envoyproxy/go-control-plane v0.9.9-0.20210217033140-668b12f5399d/go.mod h1:cXg6YxExXjJnVBQHBLXeUAgxn2UodCpnH306RInaBQk=
 github.com/envoyproxy/go-control-plane v0.9.10-0.20210907150352-cf90f659a021/go.mod h1:AFq3mo9L8Lqqiid3OhADV3RfLJnjiw63cSpi+fDTRC0=
 github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
-github.com/fatih/color v1.12.0 h1:mRhaKNwANqRgUBGKmnI5ZxEk7QXmjQeCcuYFMX2bfcc=
-github.com/fatih/color v1.12.0/go.mod h1:ELkj/draVOlAH/xkhN6mQ50Qd0MPOk5AAr3maGEBuJM=
+github.com/fatih/color v1.13.0 h1:8LOYc1KYPPmyKMuN8QV2DNRWNbLo6LZ0iLs8+mlH53w=
+github.com/fatih/color v1.13.0/go.mod h1:kLAiJbzzSOZDVNGyDpeOxJ47H46qBXwg5ILebYFFOfk=
 github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
 github.com/fsnotify/fsnotify v1.4.9 h1:hsms1Qyu0jgnwNXIxa+/V/PDsU6CfLf6CNO8H7IWoS4=
 github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ=
@@ -89,8 +89,8 @@ github.com/jhump/protoreflect v1.12.0 h1:1NQ4FpWMgn3by/n1X0fbeKEUxP1wBt7+Oitpv01
 github.com/jhump/protoreflect v1.12.0/go.mod h1:JytZfP5d0r8pVNLZvai7U/MCuTWITgrI4tTg7puQFKI=
 github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
 github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
-github.com/klauspost/compress v1.13.6 h1:P76CopJELS0TiO2mebmnzgWaajssP/EszplttgQxcgc=
-github.com/klauspost/compress v1.13.6/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk=
+github.com/klauspost/compress v1.15.1 h1:y9FcTHGyrebwfP0ZZqFiaxTaiDnUrGkJkI+f583BL1A=
+github.com/klauspost/compress v1.15.1/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk=
 github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
 github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI=
 github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
@@ -100,11 +100,12 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
 github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
 github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
 github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
-github.com/mattn/go-colorable v0.1.8 h1:c1ghPdyEDarC70ftn0y+A/Ee++9zz8ljHG1b13eJ0s8=
-github.com/mattn/go-colorable v0.1.8/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc=
+github.com/mattn/go-colorable v0.1.9/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc=
+github.com/mattn/go-colorable v0.1.12 h1:jF+Du6AlPIjs2BiUiQlKOX0rt3SujHxPnksPKZbaA40=
+github.com/mattn/go-colorable v0.1.12/go.mod h1:u5H1YNBxpqRaxsYJYSkiCWKzEfiAb1Gb520KVy5xxl4=
 github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU=
-github.com/mattn/go-isatty v0.0.13 h1:qdl+GuBjcsKKDco5BsxPJlId98mSWNKqYA+Co0SC1yA=
-github.com/mattn/go-isatty v0.0.13/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU=
+github.com/mattn/go-isatty v0.0.14 h1:yVuAays6BHfxijgZPzw+3Zlu5yQgKGP2/hcQbHb7S9Y=
+github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94=
 github.com/mccutchen/go-httpbin v1.1.2-0.20190116014521-c5cb2f4802fa h1:lx8ZnNPwjkXSzOROz0cg69RlErRXs+L3eDkggASWKLo=
 github.com/mccutchen/go-httpbin v1.1.2-0.20190116014521-c5cb2f4802fa/go.mod h1:fhpOYavp5g2K74XDl/ao2y4KvhqVtKlkg1e+0UaQv7I=
 github.com/mstoykov/envconfig v1.4.1-0.20220114105314-765c6d8c76f1 h1:94EkGmhXrVUEal+uLwFUf4fMXPhZpM5tYxuIsxrCCbI=
@@ -195,6 +196,8 @@ golang.org/x/sys v0.0.0-20200519105757-fe76b779f299/go.mod h1:h1NjWce9XRLGQEsW7w
 golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20210927094055-39ccf1dd6fa6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e h1:fLOSk5Q00efkSvAm+4xcoXD+RRmLmmulPn5I3Y9F2EM=
 golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
diff --git a/vendor/github.com/fatih/color/README.md b/vendor/github.com/fatih/color/README.md
index 5c751f2158c..5152bf59bf8 100644
--- a/vendor/github.com/fatih/color/README.md
+++ b/vendor/github.com/fatih/color/README.md
@@ -78,7 +78,7 @@ notice("Don't forget this...")
 ### Custom fprint functions (FprintFunc)
 
 ```go
-blue := color.New(FgBlue).FprintfFunc()
+blue := color.New(color.FgBlue).FprintfFunc()
 blue(myWriter, "important notice: %s", stars)
 
 // Mix up with multiple attributes
diff --git a/vendor/github.com/klauspost/compress/README.md b/vendor/github.com/klauspost/compress/README.md
index 3429879eb69..0e2dc116ad2 100644
--- a/vendor/github.com/klauspost/compress/README.md
+++ b/vendor/github.com/klauspost/compress/README.md
@@ -17,6 +17,49 @@ This package provides various compression algorithms.
 
 # changelog
 
+* Mar 3, 2022 (v1.15.0)
+	* zstd: Refactor decoder by @klauspost in [#498](https://github.com/klauspost/compress/pull/498)
+	* zstd: Add stream encoding without goroutines by @klauspost in [#505](https://github.com/klauspost/compress/pull/505)
+	* huff0: Prevent single blocks exceeding 16 bits by @klauspost in [#507](https://github.com/klauspost/compress/pull/507)
+	* flate: Inline literal emission by @klauspost in [#509](https://github.com/klauspost/compress/pull/509)
+	* gzhttp: Add zstd to transport by @klauspost in [#400](https://github.com/klauspost/compress/pull/400)
+	* gzhttp: Make content-type optional by @klauspost in [#510](https://github.com/klauspost/compress/pull/510)
+
+<details>
+	<summary>See details</summary>
+Both compression and decompression now support "synchronous" stream operations. This means that whenever "concurrency" is set to 1, they will operate without spawning goroutines.
+
+Asynchronous stream decompression is now faster, since the goroutine allocation splits the workload much more effectively. On typical streams this will use 2 cores fully for decompression. When a stream has finished decoding no goroutines will be left over, so decoders can now safely be pooled and still be garbage collected.
+
+While the release has been extensively tested, it is recommended to test when upgrading.
+</details>
+
+* Feb 22, 2022 (v1.14.4)
+	* flate: Fix rare huffman only (-2) corruption. [#503](https://github.com/klauspost/compress/pull/503)
+	* zip: Update deprecated CreateHeaderRaw to correctly call CreateRaw by @saracen in [#502](https://github.com/klauspost/compress/pull/502)
+	* zip: don't read data descriptor early by @saracen in [#501](https://github.com/klauspost/compress/pull/501)
+	* huff0: Use static decompression buffer up to 30% faster by @klauspost in [#499](https://github.com/klauspost/compress/pull/499) [#500](https://github.com/klauspost/compress/pull/500)
+
+* Feb 17, 2022 (v1.14.3)
+	* flate: Improve fastest levels compression speed ~10% more throughput. [#482](https://github.com/klauspost/compress/pull/482) [#489](https://github.com/klauspost/compress/pull/489) [#490](https://github.com/klauspost/compress/pull/490) [#491](https://github.com/klauspost/compress/pull/491) [#494](https://github.com/klauspost/compress/pull/494)  [#478](https://github.com/klauspost/compress/pull/478)
+	* flate: Faster decompression speed, ~5-10%. [#483](https://github.com/klauspost/compress/pull/483)
+	* s2: Faster compression with Go v1.18 and amd64 microarch level 3+. [#484](https://github.com/klauspost/compress/pull/484) [#486](https://github.com/klauspost/compress/pull/486)
+
+* Jan 25, 2022 (v1.14.2)
+	* zstd: improve header decoder by @dsnet  [#476](https://github.com/klauspost/compress/pull/476)
+	* zstd: Add bigger default blocks  [#469](https://github.com/klauspost/compress/pull/469)
+	* zstd: Remove unused decompression buffer [#470](https://github.com/klauspost/compress/pull/470)
+	* zstd: Fix logically dead code by @ningmingxiao [#472](https://github.com/klauspost/compress/pull/472)
+	* flate: Improve level 7-9 [#471](https://github.com/klauspost/compress/pull/471) [#473](https://github.com/klauspost/compress/pull/473)
+	* zstd: Add noasm tag for xxhash [#475](https://github.com/klauspost/compress/pull/475)
+
+* Jan 11, 2022 (v1.14.1)
+	* s2: Add stream index in [#462](https://github.com/klauspost/compress/pull/462)
+	* flate: Speed and efficiency improvements in [#439](https://github.com/klauspost/compress/pull/439) [#461](https://github.com/klauspost/compress/pull/461) [#455](https://github.com/klauspost/compress/pull/455) [#452](https://github.com/klauspost/compress/pull/452) [#458](https://github.com/klauspost/compress/pull/458)
+	* zstd: Performance improvement in [#420](https://github.com/klauspost/compress/pull/420) [#456](https://github.com/klauspost/compress/pull/456) [#437](https://github.com/klauspost/compress/pull/437) [#467](https://github.com/klauspost/compress/pull/467) [#468](https://github.com/klauspost/compress/pull/468)
+	* zstd: add arm64 xxhash assembly in [#464](https://github.com/klauspost/compress/pull/464)
+	* Add garbled for binaries for s2 in [#445](https://github.com/klauspost/compress/pull/445)
+
 * Aug 30, 2021 (v1.13.5)
 	* gz/zlib/flate: Alias stdlib errors [#425](https://github.com/klauspost/compress/pull/425)
 	* s2: Add block support to commandline tools [#413](https://github.com/klauspost/compress/pull/413)
@@ -46,6 +89,9 @@ This package provides various compression algorithms.
 	* zstd: Detect short invalid signatures [#382](https://github.com/klauspost/compress/pull/382)
 	* zstd: Spawn decoder goroutine only if needed. [#380](https://github.com/klauspost/compress/pull/380)
 
+<details>
+	<summary>See changes to v1.12.x</summary>
+	
 * May 25, 2021 (v1.12.3)
 	* deflate: Better/faster Huffman encoding [#374](https://github.com/klauspost/compress/pull/374)
 	* deflate: Allocate less for history. [#375](https://github.com/klauspost/compress/pull/375)
@@ -67,9 +113,10 @@ This package provides various compression algorithms.
 	* s2c/s2d/s2sx: Always truncate when writing files [#352](https://github.com/klauspost/compress/pull/352)
 	* zstd: Reduce memory usage further when using [WithLowerEncoderMem](https://pkg.go.dev/github.com/klauspost/compress/zstd#WithLowerEncoderMem) [#346](https://github.com/klauspost/compress/pull/346)
 	* s2: Fix potential problem with amd64 assembly and profilers [#349](https://github.com/klauspost/compress/pull/349)
+</details>
 
 <details>
-	<summary>See changes prior to v1.12.1</summary>
+	<summary>See changes to v1.11.x</summary>
 	
 * Mar 26, 2021 (v1.11.13)
 	* zstd: Big speedup on small dictionary encodes [#344](https://github.com/klauspost/compress/pull/344) [#345](https://github.com/klauspost/compress/pull/345)
@@ -128,7 +175,7 @@ This package provides various compression algorithms.
 </details>
 
 <details>
-	<summary>See changes prior to v1.11.0</summary>
+	<summary>See changes to v1.10.x</summary>
  
 * July 8, 2020 (v1.10.11) 
 	* zstd: Fix extra block when compressing with ReadFrom. [#278](https://github.com/klauspost/compress/pull/278)
@@ -290,11 +337,6 @@ This package provides various compression algorithms.
 
 # deflate usage
 
-* [High Throughput Benchmark](http://blog.klauspost.com/go-gzipdeflate-benchmarks/).
-* [Small Payload/Webserver Benchmarks](http://blog.klauspost.com/gzip-performance-for-go-webservers/).
-* [Linear Time Compression](http://blog.klauspost.com/constant-time-gzipzip-compression/).
-* [Re-balancing Deflate Compression Levels](https://blog.klauspost.com/rebalancing-deflate-compression-levels/)
-
 The packages are drop-in replacements for standard libraries. Simply replace the import path to use them:
 
 | old import         | new import                              | Documentation
@@ -316,6 +358,8 @@ Memory usage is typically 1MB for a Writer. stdlib is in the same range.
 If you expect to have a lot of concurrently allocated Writers consider using 
 the stateless compress described below.
 
+For compression performance, see: [this spreadsheet](https://docs.google.com/spreadsheets/d/1nuNE2nPfuINCZJRMt6wFWhKpToF95I47XjSsc-1rbPQ/edit?usp=sharing).
+
 # Stateless compression
 
 This package offers stateless compression as a special option for gzip/deflate. 
@@ -432,6 +476,13 @@ For more information see my blog post on [Fast Linear Time Compression](http://b
 
 This is implemented on Go 1.7 as "Huffman Only" mode, though not exposed for gzip.
 
+# Other packages
+
+Here are other packages of good quality and pure Go (no cgo wrappers or autoconverted code):
+
+* [github.com/pierrec/lz4](https://github.com/pierrec/lz4) - strong multithreaded LZ4 compression.
+* [github.com/cosnicolaou/pbzip2](https://github.com/cosnicolaou/pbzip2) - multithreaded bzip2 decompression.
+* [github.com/dsnet/compress](https://github.com/dsnet/compress) - brotli decompression, bzip2 writer.
 
 # license
 
diff --git a/vendor/github.com/klauspost/compress/flate/deflate.go b/vendor/github.com/klauspost/compress/flate/deflate.go
index 5283ac5a538..bffa2f33236 100644
--- a/vendor/github.com/klauspost/compress/flate/deflate.go
+++ b/vendor/github.com/klauspost/compress/flate/deflate.go
@@ -6,6 +6,7 @@
 package flate
 
 import (
+	"encoding/binary"
 	"fmt"
 	"io"
 	"math"
@@ -37,15 +38,17 @@ const (
 	maxMatchLength   = 258 // The longest match for the compressor
 	minOffsetSize    = 1   // The shortest offset that makes any sense
 
-	// The maximum number of tokens we put into a single flat block, just too
-	// stop things from getting too large.
-	maxFlateBlockTokens = 1 << 14
+	// The maximum number of tokens we will encode at a time.
+	// Smaller sizes usually create less optimal blocks.
+	// Bigger can make context switching slow.
+	// We use this for levels 7-9, so we make it big.
+	maxFlateBlockTokens = 1 << 15
 	maxStoreBlockSize   = 65535
 	hashBits            = 17 // After 17 performance degrades
 	hashSize            = 1 << hashBits
 	hashMask            = (1 << hashBits) - 1
 	hashShift           = (hashBits + minMatchLength - 1) / minMatchLength
-	maxHashOffset       = 1 << 24
+	maxHashOffset       = 1 << 28
 
 	skipNever = math.MaxInt32
 
@@ -70,9 +73,9 @@ var levels = []compressionLevel{
 	{0, 0, 0, 0, 0, 6},
 	// Levels 7-9 use increasingly more lazy matching
 	// and increasingly stringent conditions for "good enough".
-	{8, 8, 24, 16, skipNever, 7},
-	{10, 16, 24, 64, skipNever, 8},
-	{32, 258, 258, 4096, skipNever, 9},
+	{8, 12, 16, 24, skipNever, 7},
+	{16, 30, 40, 64, skipNever, 8},
+	{32, 258, 258, 1024, skipNever, 9},
 }
 
 // advancedState contains state for the advanced levels, with bigger hash tables, etc.
@@ -93,8 +96,9 @@ type advancedState struct {
 	hashOffset int
 
 	// input window: unprocessed data is window[index:windowEnd]
-	index     int
-	hashMatch [maxMatchLength + minMatchLength]uint32
+	index          int
+	estBitsPerByte int
+	hashMatch      [maxMatchLength + minMatchLength]uint32
 
 	hash uint32
 	ii   uint16 // position of last match, intended to overflow to reset.
@@ -103,6 +107,7 @@ type advancedState struct {
 type compressor struct {
 	compressionLevel
 
+	h *huffmanEncoder
 	w *huffmanBitWriter
 
 	// compression algorithm
@@ -170,7 +175,8 @@ func (d *compressor) writeBlock(tok *tokens, index int, eof bool) error {
 			window = d.window[d.blockStart:index]
 		}
 		d.blockStart = index
-		d.w.writeBlock(tok, eof, window)
+		//d.w.writeBlock(tok, eof, window)
+		d.w.writeBlockDynamic(tok, eof, window, d.sync)
 		return d.w.err
 	}
 	return nil
@@ -263,7 +269,7 @@ func (d *compressor) fillWindow(b []byte) {
 // Try to find a match starting at index whose length is greater than prevSize.
 // We only look at chainCount possibilities before giving up.
 // pos = s.index, prevHead = s.chainHead-s.hashOffset, prevLength=minMatchLength-1, lookahead
-func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) {
+func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, offset int, ok bool) {
 	minMatchLook := maxMatchLength
 	if lookahead < minMatchLook {
 		minMatchLook = lookahead
@@ -279,36 +285,75 @@ func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead
 
 	// If we've got a match that's good enough, only look in 1/4 the chain.
 	tries := d.chain
-	length = prevLength
-	if length >= d.good {
-		tries >>= 2
-	}
+	length = minMatchLength - 1
 
 	wEnd := win[pos+length]
 	wPos := win[pos:]
 	minIndex := pos - windowSize
+	if minIndex < 0 {
+		minIndex = 0
+	}
+	offset = 0
+
+	cGain := 0
+	if d.chain < 100 {
+		for i := prevHead; tries > 0; tries-- {
+			if wEnd == win[i+length] {
+				n := matchLen(win[i:i+minMatchLook], wPos)
+				if n > length {
+					length = n
+					offset = pos - i
+					ok = true
+					if n >= nice {
+						// The match is good enough that we don't try to find a better one.
+						break
+					}
+					wEnd = win[pos+n]
+				}
+			}
+			if i <= minIndex {
+				// hashPrev[i & windowMask] has already been overwritten, so stop now.
+				break
+			}
+			i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset
+			if i < minIndex {
+				break
+			}
+		}
+		return
+	}
 
+	// Some like it higher (CSV), some like it lower (JSON)
+	const baseCost = 6
+	// Base is 4 bytes with an additional cost.
+	// Matches must be better than this.
 	for i := prevHead; tries > 0; tries-- {
 		if wEnd == win[i+length] {
 			n := matchLen(win[i:i+minMatchLook], wPos)
-
-			if n > length && (n > minMatchLength || pos-i <= 4096) {
-				length = n
-				offset = pos - i
-				ok = true
-				if n >= nice {
-					// The match is good enough that we don't try to find a better one.
-					break
+			if n > length {
+				// Calculate gain. Estimate
+				newGain := d.h.bitLengthRaw(wPos[:n]) - int(offsetExtraBits[offsetCode(uint32(pos-i))]) - baseCost - int(lengthExtraBits[lengthCodes[(n-3)&255]])
+
+				//fmt.Println(n, "gain:", newGain, "prev:", cGain, "raw:", d.h.bitLengthRaw(wPos[:n]))
+				if newGain > cGain {
+					length = n
+					offset = pos - i
+					cGain = newGain
+					ok = true
+					if n >= nice {
+						// The match is good enough that we don't try to find a better one.
+						break
+					}
+					wEnd = win[pos+n]
 				}
-				wEnd = win[pos+n]
 			}
 		}
-		if i == minIndex {
+		if i <= minIndex {
 			// hashPrev[i & windowMask] has already been overwritten, so stop now.
 			break
 		}
 		i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset
-		if i < minIndex || i < 0 {
+		if i < minIndex {
 			break
 		}
 	}
@@ -327,8 +372,7 @@ func (d *compressor) writeStoredBlock(buf []byte) error {
 // of the supplied slice.
 // The caller must ensure that len(b) >= 4.
 func hash4(b []byte) uint32 {
-	b = b[:4]
-	return hash4u(uint32(b[3])|uint32(b[2])<<8|uint32(b[1])<<16|uint32(b[0])<<24, hashBits)
+	return hash4u(binary.LittleEndian.Uint32(b), hashBits)
 }
 
 // bulkHash4 will compute hashes using the same
@@ -337,11 +381,12 @@ func bulkHash4(b []byte, dst []uint32) {
 	if len(b) < 4 {
 		return
 	}
-	hb := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
+	hb := binary.LittleEndian.Uint32(b)
+
 	dst[0] = hash4u(hb, hashBits)
 	end := len(b) - 4 + 1
 	for i := 1; i < end; i++ {
-		hb = (hb << 8) | uint32(b[i+3])
+		hb = (hb >> 8) | uint32(b[i+3])<<24
 		dst[i] = hash4u(hb, hashBits)
 	}
 }
@@ -374,10 +419,21 @@ func (d *compressor) deflateLazy() {
 	if d.windowEnd-s.index < minMatchLength+maxMatchLength && !d.sync {
 		return
 	}
+	if d.windowEnd != s.index && d.chain > 100 {
+		// Get literal huffman coder.
+		if d.h == nil {
+			d.h = newHuffmanEncoder(maxFlateBlockTokens)
+		}
+		var tmp [256]uint16
+		for _, v := range d.window[s.index:d.windowEnd] {
+			tmp[v]++
+		}
+		d.h.generate(tmp[:], 15)
+	}
 
 	s.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
 	if s.index < s.maxInsertIndex {
-		s.hash = hash4(d.window[s.index : s.index+minMatchLength])
+		s.hash = hash4(d.window[s.index:])
 	}
 
 	for {
@@ -410,7 +466,7 @@ func (d *compressor) deflateLazy() {
 		}
 		if s.index < s.maxInsertIndex {
 			// Update the hash
-			s.hash = hash4(d.window[s.index : s.index+minMatchLength])
+			s.hash = hash4(d.window[s.index:])
 			ch := s.hashHead[s.hash&hashMask]
 			s.chainHead = int(ch)
 			s.hashPrev[s.index&windowMask] = ch
@@ -426,12 +482,37 @@ func (d *compressor) deflateLazy() {
 		}
 
 		if s.chainHead-s.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy {
-			if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, minMatchLength-1, lookahead); ok {
+			if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, lookahead); ok {
 				s.length = newLength
 				s.offset = newOffset
 			}
 		}
+
 		if prevLength >= minMatchLength && s.length <= prevLength {
+			// Check for better match at end...
+			//
+			// checkOff must be >=2 since we otherwise risk checking s.index
+			// Offset of 2 seems to yield best results.
+			const checkOff = 2
+			prevIndex := s.index - 1
+			if prevIndex+prevLength+checkOff < s.maxInsertIndex {
+				end := lookahead
+				if lookahead > maxMatchLength {
+					end = maxMatchLength
+				}
+				end += prevIndex
+				idx := prevIndex + prevLength - (4 - checkOff)
+				h := hash4(d.window[idx:])
+				ch2 := int(s.hashHead[h&hashMask]) - s.hashOffset - prevLength + (4 - checkOff)
+				if ch2 > minIndex {
+					length := matchLen(d.window[prevIndex:end], d.window[ch2:])
+					// It seems like a pure length metric is best.
+					if length > prevLength {
+						prevLength = length
+						prevOffset = prevIndex - ch2
+					}
+				}
+			}
 			// There was a match at the previous step, and the current match is
 			// not better. Output the previous match.
 			d.tokens.AddMatch(uint32(prevLength-3), uint32(prevOffset-minOffsetSize))
@@ -479,6 +560,7 @@ func (d *compressor) deflateLazy() {
 				}
 				d.tokens.Reset()
 			}
+			s.ii = 0
 		} else {
 			// Reset, if we got a match this run.
 			if s.length >= minMatchLength {
@@ -498,13 +580,12 @@ func (d *compressor) deflateLazy() {
 
 				// If we have a long run of no matches, skip additional bytes
 				// Resets when s.ii overflows after 64KB.
-				if s.ii > 31 {
-					n := int(s.ii >> 5)
+				if n := int(s.ii) - d.chain; n > 0 {
+					n = 1 + int(n>>6)
 					for j := 0; j < n; j++ {
 						if s.index >= d.windowEnd-1 {
 							break
 						}
-
 						d.tokens.AddLiteral(d.window[s.index-1])
 						if d.tokens.n == maxFlateBlockTokens {
 							if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
@@ -512,6 +593,14 @@ func (d *compressor) deflateLazy() {
 							}
 							d.tokens.Reset()
 						}
+						// Index...
+						if s.index < s.maxInsertIndex {
+							h := hash4(d.window[s.index:])
+							ch := s.hashHead[h]
+							s.chainHead = int(ch)
+							s.hashPrev[s.index&windowMask] = ch
+							s.hashHead[h] = uint32(s.index + s.hashOffset)
+						}
 						s.index++
 					}
 					// Flush last byte
@@ -611,7 +700,9 @@ func (d *compressor) write(b []byte) (n int, err error) {
 	}
 	n = len(b)
 	for len(b) > 0 {
-		d.step(d)
+		if d.windowEnd == len(d.window) || d.sync {
+			d.step(d)
+		}
 		b = b[d.fill(d, b):]
 		if d.err != nil {
 			return 0, d.err
@@ -652,13 +743,13 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
 		level = 5
 		fallthrough
 	case level >= 1 && level <= 6:
-		d.w.logNewTablePenalty = 8
+		d.w.logNewTablePenalty = 7
 		d.fast = newFastEnc(level)
 		d.window = make([]byte, maxStoreBlockSize)
 		d.fill = (*compressor).fillBlock
 		d.step = (*compressor).storeFast
 	case 7 <= level && level <= 9:
-		d.w.logNewTablePenalty = 10
+		d.w.logNewTablePenalty = 8
 		d.state = &advancedState{}
 		d.compressionLevel = levels[level]
 		d.initDeflate()
diff --git a/vendor/github.com/klauspost/compress/flate/fast_encoder.go b/vendor/github.com/klauspost/compress/flate/fast_encoder.go
index a746eb73387..d55ea2a7759 100644
--- a/vendor/github.com/klauspost/compress/flate/fast_encoder.go
+++ b/vendor/github.com/klauspost/compress/flate/fast_encoder.go
@@ -179,7 +179,7 @@ func (e *fastGen) matchlen(s, t int32, src []byte) int32 {
 // matchlenLong will return the match length between offsets and t in src.
 // It is assumed that s > t, that t >=0 and s < len(src).
 func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 {
-	if debugDecode {
+	if debugDeflate {
 		if t >= s {
 			panic(fmt.Sprint("t >=s:", t, s))
 		}
@@ -213,11 +213,9 @@ func (e *fastGen) Reset() {
 // matchLen returns the maximum length.
 // 'a' must be the shortest of the two.
 func matchLen(a, b []byte) int {
-	b = b[:len(a)]
 	var checked int
 
 	for len(a) >= 8 {
-		b = b[:len(a)]
 		if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 {
 			return checked + (bits.TrailingZeros64(diff) >> 3)
 		}
diff --git a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
index 3ad5e980724..25f6d1108fc 100644
--- a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
+++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
@@ -8,6 +8,7 @@ import (
 	"encoding/binary"
 	"fmt"
 	"io"
+	"math"
 )
 
 const (
@@ -24,6 +25,10 @@ const (
 	codegenCodeCount = 19
 	badCode          = 255
 
+	// maxPredefinedTokens is the maximum number of tokens
+	// where we check if fixed size is smaller.
+	maxPredefinedTokens = 250
+
 	// bufferFlushSize indicates the buffer size
 	// after which bytes are flushed to the writer.
 	// Should preferably be a multiple of 6, since
@@ -36,8 +41,11 @@ const (
 	bufferSize = bufferFlushSize + 8
 )
 
+// Minimum length code that emits bits.
+const lengthExtraBitsMinCode = 8
+
 // The number of extra bits needed by length code X - LENGTH_CODES_START.
-var lengthExtraBits = [32]int8{
+var lengthExtraBits = [32]uint8{
 	/* 257 */ 0, 0, 0,
 	/* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2,
 	/* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
@@ -51,19 +59,22 @@ var lengthBase = [32]uint8{
 	64, 80, 96, 112, 128, 160, 192, 224, 255,
 }
 
+// Minimum offset code that emits bits.
+const offsetExtraBitsMinCode = 4
+
 // offset code word extra bits.
-var offsetExtraBits = [64]int8{
+var offsetExtraBits = [32]int8{
 	0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
 	4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
 	9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
 	/* extended window */
-	14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20,
+	14, 14,
 }
 
 var offsetCombined = [32]uint32{}
 
 func init() {
-	var offsetBase = [64]uint32{
+	var offsetBase = [32]uint32{
 		/* normal deflate */
 		0x000000, 0x000001, 0x000002, 0x000003, 0x000004,
 		0x000006, 0x000008, 0x00000c, 0x000010, 0x000018,
@@ -73,17 +84,15 @@ func init() {
 		0x001800, 0x002000, 0x003000, 0x004000, 0x006000,
 
 		/* extended window */
-		0x008000, 0x00c000, 0x010000, 0x018000, 0x020000,
-		0x030000, 0x040000, 0x060000, 0x080000, 0x0c0000,
-		0x100000, 0x180000, 0x200000, 0x300000,
+		0x008000, 0x00c000,
 	}
 
 	for i := range offsetCombined[:] {
 		// Don't use extended window values...
-		if offsetBase[i] > 0x006000 {
+		if offsetExtraBits[i] == 0 || offsetBase[i] > 0x006000 {
 			continue
 		}
-		offsetCombined[i] = uint32(offsetExtraBits[i])<<16 | (offsetBase[i])
+		offsetCombined[i] = uint32(offsetExtraBits[i]) | (offsetBase[i] << 8)
 	}
 }
 
@@ -99,7 +108,7 @@ type huffmanBitWriter struct {
 	// Data waiting to be written is bytes[0:nbytes]
 	// and then the low nbits of bits.
 	bits            uint64
-	nbits           uint16
+	nbits           uint8
 	nbytes          uint8
 	lastHuffMan     bool
 	literalEncoding *huffmanEncoder
@@ -155,37 +164,33 @@ func (w *huffmanBitWriter) reset(writer io.Writer) {
 	w.lastHuffMan = false
 }
 
-func (w *huffmanBitWriter) canReuse(t *tokens) (offsets, lits bool) {
-	offsets, lits = true, true
+func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) {
 	a := t.offHist[:offsetCodeCount]
-	b := w.offsetFreq[:len(a)]
-	for i := range a {
-		if b[i] == 0 && a[i] != 0 {
-			offsets = false
-			break
+	b := w.offsetEncoding.codes
+	b = b[:len(a)]
+	for i, v := range a {
+		if v != 0 && b[i].len == 0 {
+			return false
 		}
 	}
 
 	a = t.extraHist[:literalCount-256]
-	b = w.literalFreq[256:literalCount]
+	b = w.literalEncoding.codes[256:literalCount]
 	b = b[:len(a)]
-	for i := range a {
-		if b[i] == 0 && a[i] != 0 {
-			lits = false
-			break
+	for i, v := range a {
+		if v != 0 && b[i].len == 0 {
+			return false
 		}
 	}
-	if lits {
-		a = t.litHist[:]
-		b = w.literalFreq[:len(a)]
-		for i := range a {
-			if b[i] == 0 && a[i] != 0 {
-				lits = false
-				break
-			}
+
+	a = t.litHist[:256]
+	b = w.literalEncoding.codes[:len(a)]
+	for i, v := range a {
+		if v != 0 && b[i].len == 0 {
+			return false
 		}
 	}
-	return
+	return true
 }
 
 func (w *huffmanBitWriter) flush() {
@@ -221,8 +226,8 @@ func (w *huffmanBitWriter) write(b []byte) {
 	_, w.err = w.writer.Write(b)
 }
 
-func (w *huffmanBitWriter) writeBits(b int32, nb uint16) {
-	w.bits |= uint64(b) << w.nbits
+func (w *huffmanBitWriter) writeBits(b int32, nb uint8) {
+	w.bits |= uint64(b) << (w.nbits & 63)
 	w.nbits += nb
 	if w.nbits >= 48 {
 		w.writeOutBits()
@@ -423,7 +428,7 @@ func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) {
 
 func (w *huffmanBitWriter) writeCode(c hcode) {
 	// The function does not get inlined if we "& 63" the shift.
-	w.bits |= uint64(c.code) << w.nbits
+	w.bits |= uint64(c.code) << (w.nbits & 63)
 	w.nbits += c.len
 	if w.nbits >= 48 {
 		w.writeOutBits()
@@ -566,7 +571,7 @@ func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) {
 		w.lastHeader = 0
 	}
 	numLiterals, numOffsets := w.indexTokens(tokens, false)
-	w.generate(tokens)
+	w.generate()
 	var extraBits int
 	storedSize, storable := w.storedSize(input)
 	if storable {
@@ -577,7 +582,10 @@ func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) {
 	// Fixed Huffman baseline.
 	var literalEncoding = fixedLiteralEncoding
 	var offsetEncoding = fixedOffsetEncoding
-	var size = w.fixedSize(extraBits)
+	var size = math.MaxInt32
+	if tokens.n < maxPredefinedTokens {
+		size = w.fixedSize(extraBits)
+	}
 
 	// Dynamic Huffman?
 	var numCodegens int
@@ -595,7 +603,7 @@ func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) {
 	}
 
 	// Stored bytes?
-	if storable && storedSize < size {
+	if storable && storedSize <= size {
 		w.writeStoredHeader(len(input), eof)
 		w.writeBytes(input)
 		return
@@ -634,22 +642,39 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b
 		w.lastHeader = 0
 		w.lastHuffMan = false
 	}
-	if !sync {
-		tokens.Fill()
+
+	// fillReuse enables filling of empty values.
+	// This will make encodings always reusable without testing.
+	// However, this does not appear to benefit on most cases.
+	const fillReuse = false
+
+	// Check if we can reuse...
+	if !fillReuse && w.lastHeader > 0 && !w.canReuse(tokens) {
+		w.writeCode(w.literalEncoding.codes[endBlockMarker])
+		w.lastHeader = 0
 	}
+
 	numLiterals, numOffsets := w.indexTokens(tokens, !sync)
+	extraBits := 0
+	ssize, storable := w.storedSize(input)
+
+	const usePrefs = true
+	if storable || w.lastHeader > 0 {
+		extraBits = w.extraBitSize()
+	}
 
 	var size int
+
 	// Check if we should reuse.
 	if w.lastHeader > 0 {
 		// Estimate size for using a new table.
 		// Use the previous header size as the best estimate.
 		newSize := w.lastHeader + tokens.EstimatedBits()
-		newSize += newSize >> w.logNewTablePenalty
+		newSize += int(w.literalEncoding.codes[endBlockMarker].len) + newSize>>w.logNewTablePenalty
 
 		// The estimated size is calculated as an optimal table.
 		// We add a penalty to make it more realistic and re-use a bit more.
-		reuseSize := w.dynamicReuseSize(w.literalEncoding, w.offsetEncoding) + w.extraBitSize()
+		reuseSize := w.dynamicReuseSize(w.literalEncoding, w.offsetEncoding) + extraBits
 
 		// Check if a new table is better.
 		if newSize < reuseSize {
@@ -660,35 +685,83 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b
 		} else {
 			size = reuseSize
 		}
+
+		if tokens.n < maxPredefinedTokens {
+			if preSize := w.fixedSize(extraBits) + 7; usePrefs && preSize < size {
+				// Check if we get a reasonable size decrease.
+				if storable && ssize <= size {
+					w.writeStoredHeader(len(input), eof)
+					w.writeBytes(input)
+					return
+				}
+				w.writeFixedHeader(eof)
+				if !sync {
+					tokens.AddEOB()
+				}
+				w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes)
+				return
+			}
+		}
 		// Check if we get a reasonable size decrease.
-		if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
+		if storable && ssize <= size {
 			w.writeStoredHeader(len(input), eof)
 			w.writeBytes(input)
-			w.lastHeader = 0
 			return
 		}
 	}
 
 	// We want a new block/table
 	if w.lastHeader == 0 {
-		w.generate(tokens)
+		if fillReuse && !sync {
+			w.fillTokens()
+			numLiterals, numOffsets = maxNumLit, maxNumDist
+		} else {
+			w.literalFreq[endBlockMarker] = 1
+		}
+
+		w.generate()
 		// Generate codegen and codegenFrequencies, which indicates how to encode
 		// the literalEncoding and the offsetEncoding.
 		w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
 		w.codegenEncoding.generate(w.codegenFreq[:], 7)
+
 		var numCodegens int
-		size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, w.extraBitSize())
-		// Store bytes, if we don't get a reasonable improvement.
-		if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
+		if fillReuse && !sync {
+			// Reindex for accurate size...
+			w.indexTokens(tokens, true)
+		}
+		size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits)
+
+		// Store predefined, if we don't get a reasonable improvement.
+		if tokens.n < maxPredefinedTokens {
+			if preSize := w.fixedSize(extraBits); usePrefs && preSize <= size {
+				// Store bytes, if we don't get an improvement.
+				if storable && ssize <= preSize {
+					w.writeStoredHeader(len(input), eof)
+					w.writeBytes(input)
+					return
+				}
+				w.writeFixedHeader(eof)
+				if !sync {
+					tokens.AddEOB()
+				}
+				w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes)
+				return
+			}
+		}
+
+		if storable && ssize <= size {
+			// Store bytes, if we don't get an improvement.
 			w.writeStoredHeader(len(input), eof)
 			w.writeBytes(input)
-			w.lastHeader = 0
 			return
 		}
 
 		// Write Huffman table.
 		w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
-		w.lastHeader, _ = w.headerSize()
+		if !sync {
+			w.lastHeader, _ = w.headerSize()
+		}
 		w.lastHuffMan = false
 	}
 
@@ -699,6 +772,19 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b
 	w.writeTokens(tokens.Slice(), w.literalEncoding.codes, w.offsetEncoding.codes)
 }
 
+func (w *huffmanBitWriter) fillTokens() {
+	for i, v := range w.literalFreq[:literalCount] {
+		if v == 0 {
+			w.literalFreq[i] = 1
+		}
+	}
+	for i, v := range w.offsetFreq[:offsetCodeCount] {
+		if v == 0 {
+			w.offsetFreq[i] = 1
+		}
+	}
+}
+
 // indexTokens indexes a slice of tokens, and updates
 // literalFreq and offsetFreq, and generates literalEncoding
 // and offsetEncoding.
@@ -733,7 +819,7 @@ func (w *huffmanBitWriter) indexTokens(t *tokens, filled bool) (numLiterals, num
 	return
 }
 
-func (w *huffmanBitWriter) generate(t *tokens) {
+func (w *huffmanBitWriter) generate() {
 	w.literalEncoding.generate(w.literalFreq[:literalCount], 15)
 	w.offsetEncoding.generate(w.offsetFreq[:offsetCodeCount], 15)
 }
@@ -765,10 +851,10 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 	bits, nbits, nbytes := w.bits, w.nbits, w.nbytes
 
 	for _, t := range tokens {
-		if t < matchType {
+		if t < 256 {
 			//w.writeCode(lits[t.literal()])
-			c := lits[t.literal()]
-			bits |= uint64(c.code) << nbits
+			c := lits[t]
+			bits |= uint64(c.code) << (nbits & 63)
 			nbits += c.len
 			if nbits >= 48 {
 				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
@@ -790,13 +876,13 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 
 		// Write the length
 		length := t.length()
-		lengthCode := lengthCode(length)
+		lengthCode := lengthCode(length) & 31
 		if false {
-			w.writeCode(lengths[lengthCode&31])
+			w.writeCode(lengths[lengthCode])
 		} else {
 			// inlined
-			c := lengths[lengthCode&31]
-			bits |= uint64(c.code) << nbits
+			c := lengths[lengthCode]
+			bits |= uint64(c.code) << (nbits & 63)
 			nbits += c.len
 			if nbits >= 48 {
 				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
@@ -815,11 +901,11 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 			}
 		}
 
-		extraLengthBits := uint16(lengthExtraBits[lengthCode&31])
-		if extraLengthBits > 0 {
+		if lengthCode >= lengthExtraBitsMinCode {
+			extraLengthBits := lengthExtraBits[lengthCode]
 			//w.writeBits(extraLength, extraLengthBits)
-			extraLength := int32(length - lengthBase[lengthCode&31])
-			bits |= uint64(extraLength) << nbits
+			extraLength := int32(length - lengthBase[lengthCode])
+			bits |= uint64(extraLength) << (nbits & 63)
 			nbits += extraLengthBits
 			if nbits >= 48 {
 				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
@@ -839,14 +925,13 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 		}
 		// Write the offset
 		offset := t.offset()
-		offsetCode := offset >> 16
-		offset &= matchOffsetOnlyMask
+		offsetCode := (offset >> 16) & 31
 		if false {
-			w.writeCode(offs[offsetCode&31])
+			w.writeCode(offs[offsetCode])
 		} else {
 			// inlined
 			c := offs[offsetCode]
-			bits |= uint64(c.code) << nbits
+			bits |= uint64(c.code) << (nbits & 63)
 			nbits += c.len
 			if nbits >= 48 {
 				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
@@ -864,11 +949,12 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 				}
 			}
 		}
-		offsetComb := offsetCombined[offsetCode]
-		if offsetComb > 1<<16 {
+
+		if offsetCode >= offsetExtraBitsMinCode {
+			offsetComb := offsetCombined[offsetCode]
 			//w.writeBits(extraOffset, extraOffsetBits)
-			bits |= uint64(offset&matchOffsetOnlyMask-(offsetComb&0xffff)) << nbits
-			nbits += uint16(offsetComb >> 16)
+			bits |= uint64((offset-(offsetComb>>8))&matchOffsetOnlyMask) << (nbits & 63)
+			nbits += uint8(offsetComb)
 			if nbits >= 48 {
 				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
 				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
@@ -934,6 +1020,29 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 	// https://stackoverflow.com/a/25454430
 	const guessHeaderSizeBits = 70 * 8
 	histogram(input, w.literalFreq[:numLiterals], fill)
+	ssize, storable := w.storedSize(input)
+	if storable && len(input) > 1024 {
+		// Quick check for incompressible content.
+		abs := float64(0)
+		avg := float64(len(input)) / 256
+		max := float64(len(input) * 2)
+		for _, v := range w.literalFreq[:256] {
+			diff := float64(v) - avg
+			abs += diff * diff
+			if abs > max {
+				break
+			}
+		}
+		if abs < max {
+			if debugDeflate {
+				fmt.Println("stored", abs, "<", max)
+			}
+			// No chance we can compress this...
+			w.writeStoredHeader(len(input), eof)
+			w.writeBytes(input)
+			return
+		}
+	}
 	w.literalFreq[endBlockMarker] = 1
 	w.tmpLitEncoding.generate(w.literalFreq[:numLiterals], 15)
 	if fill {
@@ -951,8 +1060,10 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 	estBits += estBits >> w.logNewTablePenalty
 
 	// Store bytes, if we don't get a reasonable improvement.
-	ssize, storable := w.storedSize(input)
 	if storable && ssize <= estBits {
+		if debugDeflate {
+			fmt.Println("stored,", ssize, "<=", estBits)
+		}
 		w.writeStoredHeader(len(input), eof)
 		w.writeBytes(input)
 		return
@@ -963,7 +1074,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 
 		if estBits < reuseSize {
 			if debugDeflate {
-				//fmt.Println("not reusing, reuse:", reuseSize/8, "> new:", estBits/8, "- header est:", w.lastHeader/8)
+				fmt.Println("NOT reusing, reuse:", reuseSize/8, "> new:", estBits/8, "header est:", w.lastHeader/8, "bytes")
 			}
 			// We owe an EOB
 			w.writeCode(w.literalEncoding.codes[endBlockMarker])
@@ -996,14 +1107,44 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 	encoding := w.literalEncoding.codes[:256]
 	// Go 1.16 LOVES having these on stack. At least 1.5x the speed.
 	bits, nbits, nbytes := w.bits, w.nbits, w.nbytes
-	for _, t := range input {
-		// Bitwriting inlined, ~30% speedup
-		c := encoding[t]
-		bits |= uint64(c.code) << nbits
-		nbits += c.len
-		if debugDeflate {
-			count += int(c.len)
+
+	if debugDeflate {
+		count -= int(nbytes)*8 + int(nbits)
+	}
+	// Unroll, write 3 codes/loop.
+	// Fastest number of unrolls.
+	for len(input) > 3 {
+		// We must have at least 48 bits free.
+		if nbits >= 8 {
+			n := nbits >> 3
+			binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+			bits >>= (n * 8) & 63
+			nbits -= n * 8
+			nbytes += n
+		}
+		if nbytes >= bufferFlushSize {
+			if w.err != nil {
+				nbytes = 0
+				return
+			}
+			if debugDeflate {
+				count += int(nbytes) * 8
+			}
+			_, w.err = w.writer.Write(w.bytes[:nbytes])
+			nbytes = 0
 		}
+		a, b := encoding[input[0]], encoding[input[1]]
+		bits |= uint64(a.code) << (nbits & 63)
+		bits |= uint64(b.code) << ((nbits + a.len) & 63)
+		c := encoding[input[2]]
+		nbits += b.len + a.len
+		bits |= uint64(c.code) << (nbits & 63)
+		nbits += c.len
+		input = input[3:]
+	}
+
+	// Remaining...
+	for _, t := range input {
 		if nbits >= 48 {
 			binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
 			//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
@@ -1015,17 +1156,33 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 					nbytes = 0
 					return
 				}
+				if debugDeflate {
+					count += int(nbytes) * 8
+				}
 				_, w.err = w.writer.Write(w.bytes[:nbytes])
 				nbytes = 0
 			}
 		}
+		// Bitwriting inlined, ~30% speedup
+		c := encoding[t]
+		bits |= uint64(c.code) << (nbits & 63)
+		nbits += c.len
+		if debugDeflate {
+			count += int(c.len)
+		}
 	}
 	// Restore...
 	w.bits, w.nbits, w.nbytes = bits, nbits, nbytes
 
 	if debugDeflate {
-		fmt.Println("wrote", count/8, "bytes")
+		nb := count + int(nbytes)*8 + int(nbits)
+		fmt.Println("wrote", nb, "bits,", nb/8, "bytes.")
 	}
+	// Flush if needed to have space.
+	if w.nbits >= 48 {
+		w.writeOutBits()
+	}
+
 	if eof || sync {
 		w.writeCode(w.literalEncoding.codes[endBlockMarker])
 		w.lastHeader = 0
diff --git a/vendor/github.com/klauspost/compress/flate/huffman_code.go b/vendor/github.com/klauspost/compress/flate/huffman_code.go
index 67b2b387284..9ab497c275b 100644
--- a/vendor/github.com/klauspost/compress/flate/huffman_code.go
+++ b/vendor/github.com/klauspost/compress/flate/huffman_code.go
@@ -17,7 +17,8 @@ const (
 
 // hcode is a huffman code with a bit code and bit length.
 type hcode struct {
-	code, len uint16
+	code uint16
+	len  uint8
 }
 
 type huffmanEncoder struct {
@@ -56,7 +57,7 @@ type levelInfo struct {
 }
 
 // set sets the code and length of an hcode.
-func (h *hcode) set(code uint16, length uint16) {
+func (h *hcode) set(code uint16, length uint8) {
 	h.len = length
 	h.code = code
 }
@@ -80,7 +81,7 @@ func generateFixedLiteralEncoding() *huffmanEncoder {
 	var ch uint16
 	for ch = 0; ch < literalCount; ch++ {
 		var bits uint16
-		var size uint16
+		var size uint8
 		switch {
 		case ch < 144:
 			// size 8, 000110000  .. 10111111
@@ -99,7 +100,7 @@ func generateFixedLiteralEncoding() *huffmanEncoder {
 			bits = ch + 192 - 280
 			size = 8
 		}
-		codes[ch] = hcode{code: reverseBits(bits, byte(size)), len: size}
+		codes[ch] = hcode{code: reverseBits(bits, size), len: size}
 	}
 	return h
 }
@@ -129,9 +130,7 @@ func (h *huffmanEncoder) bitLength(freq []uint16) int {
 func (h *huffmanEncoder) bitLengthRaw(b []byte) int {
 	var total int
 	for _, f := range b {
-		if f != 0 {
-			total += int(h.codes[f].len)
-		}
+		total += int(h.codes[f].len)
 	}
 	return total
 }
@@ -189,14 +188,19 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {
 	// of the level j ancestor.
 	var leafCounts [maxBitsLimit][maxBitsLimit]int32
 
+	// Descending to only have 1 bounds check.
+	l2f := int32(list[2].freq)
+	l1f := int32(list[1].freq)
+	l0f := int32(list[0].freq) + int32(list[1].freq)
+
 	for level := int32(1); level <= maxBits; level++ {
 		// For every level, the first two items are the first two characters.
 		// We initialize the levels as if we had already figured this out.
 		levels[level] = levelInfo{
 			level:        level,
-			lastFreq:     int32(list[1].freq),
-			nextCharFreq: int32(list[2].freq),
-			nextPairFreq: int32(list[0].freq) + int32(list[1].freq),
+			lastFreq:     l1f,
+			nextCharFreq: l2f,
+			nextPairFreq: l0f,
 		}
 		leafCounts[level][level] = 2
 		if level == 1 {
@@ -207,8 +211,8 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {
 	// We need a total of 2*n - 2 items at top level and have already generated 2.
 	levels[maxBits].needed = 2*n - 4
 
-	level := maxBits
-	for {
+	level := uint32(maxBits)
+	for level < 16 {
 		l := &levels[level]
 		if l.nextPairFreq == math.MaxInt32 && l.nextCharFreq == math.MaxInt32 {
 			// We've run out of both leafs and pairs.
@@ -240,7 +244,13 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {
 			// more values in the level below
 			l.lastFreq = l.nextPairFreq
 			// Take leaf counts from the lower level, except counts[level] remains the same.
-			copy(leafCounts[level][:level], leafCounts[level-1][:level])
+			if true {
+				save := leafCounts[level][level]
+				leafCounts[level] = leafCounts[level-1]
+				leafCounts[level][level] = save
+			} else {
+				copy(leafCounts[level][:level], leafCounts[level-1][:level])
+			}
 			levels[l.level-1].needed = 2
 		}
 
@@ -298,7 +308,7 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN
 
 		sortByLiteral(chunk)
 		for _, node := range chunk {
-			h.codes[node.literal] = hcode{code: reverseBits(code, uint8(n)), len: uint16(n)}
+			h.codes[node.literal] = hcode{code: reverseBits(code, uint8(n)), len: uint8(n)}
 			code++
 		}
 		list = list[0 : len(list)-int(bits)]
@@ -311,6 +321,7 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN
 // maxBits  The maximum number of bits to use for any literal.
 func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) {
 	list := h.freqcache[:len(freq)+1]
+	codes := h.codes[:len(freq)]
 	// Number of non-zero literals
 	count := 0
 	// Set list to be the set of all non-zero literals and their frequencies
@@ -319,11 +330,10 @@ func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) {
 			list[count] = literalNode{uint16(i), f}
 			count++
 		} else {
-			list[count] = literalNode{}
-			h.codes[i].len = 0
+			codes[i].len = 0
 		}
 	}
-	list[len(freq)] = literalNode{}
+	list[count] = literalNode{}
 
 	list = list[:count]
 	if count <= 2 {
diff --git a/vendor/github.com/klauspost/compress/flate/inflate.go b/vendor/github.com/klauspost/compress/flate/inflate.go
index d1edb356c4b..414c0bea9fa 100644
--- a/vendor/github.com/klauspost/compress/flate/inflate.go
+++ b/vendor/github.com/klauspost/compress/flate/inflate.go
@@ -36,6 +36,13 @@ type lengthExtra struct {
 
 var decCodeToLen = [32]lengthExtra{{length: 0x0, extra: 0x0}, {length: 0x1, extra: 0x0}, {length: 0x2, extra: 0x0}, {length: 0x3, extra: 0x0}, {length: 0x4, extra: 0x0}, {length: 0x5, extra: 0x0}, {length: 0x6, extra: 0x0}, {length: 0x7, extra: 0x0}, {length: 0x8, extra: 0x1}, {length: 0xa, extra: 0x1}, {length: 0xc, extra: 0x1}, {length: 0xe, extra: 0x1}, {length: 0x10, extra: 0x2}, {length: 0x14, extra: 0x2}, {length: 0x18, extra: 0x2}, {length: 0x1c, extra: 0x2}, {length: 0x20, extra: 0x3}, {length: 0x28, extra: 0x3}, {length: 0x30, extra: 0x3}, {length: 0x38, extra: 0x3}, {length: 0x40, extra: 0x4}, {length: 0x50, extra: 0x4}, {length: 0x60, extra: 0x4}, {length: 0x70, extra: 0x4}, {length: 0x80, extra: 0x5}, {length: 0xa0, extra: 0x5}, {length: 0xc0, extra: 0x5}, {length: 0xe0, extra: 0x5}, {length: 0xff, extra: 0x0}, {length: 0x0, extra: 0x0}, {length: 0x0, extra: 0x0}, {length: 0x0, extra: 0x0}}
 
+var bitMask32 = [32]uint32{
+	0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF,
+	0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF,
+	0x1ffff, 0x3ffff, 0x7FFFF, 0xfFFFF, 0x1fFFFF, 0x3fFFFF, 0x7fFFFF, 0xffFFFF,
+	0x1ffFFFF, 0x3ffFFFF, 0x7ffFFFF, 0xfffFFFF, 0x1fffFFFF, 0x3fffFFFF, 0x7fffFFFF,
+} // up to 32 bits
+
 // Initialize the fixedHuffmanDecoder only once upon first use.
 var fixedOnce sync.Once
 var fixedHuffmanDecoder huffmanDecoder
@@ -328,11 +335,17 @@ func (f *decompressor) nextBlock() {
 	switch typ {
 	case 0:
 		f.dataBlock()
+		if debugDecode {
+			fmt.Println("stored block")
+		}
 	case 1:
 		// compressed, fixed Huffman tables
 		f.hl = &fixedHuffmanDecoder
 		f.hd = nil
 		f.huffmanBlockDecoder()()
+		if debugDecode {
+			fmt.Println("predefinied huffman block")
+		}
 	case 2:
 		// compressed, dynamic Huffman tables
 		if f.err = f.readHuffman(); f.err != nil {
@@ -341,6 +354,9 @@ func (f *decompressor) nextBlock() {
 		f.hl = &f.h1
 		f.hd = &f.h2
 		f.huffmanBlockDecoder()()
+		if debugDecode {
+			fmt.Println("dynamic huffman block")
+		}
 	default:
 		// 3 is reserved.
 		if debugDecode {
@@ -550,221 +566,6 @@ func (f *decompressor) readHuffman() error {
 	return nil
 }
 
-// Decode a single Huffman block from f.
-// hl and hd are the Huffman states for the lit/length values
-// and the distance values, respectively. If hd == nil, using the
-// fixed distance encoding associated with fixed Huffman blocks.
-func (f *decompressor) huffmanBlockGeneric() {
-	const (
-		stateInit = iota // Zero value must be stateInit
-		stateDict
-	)
-
-	switch f.stepState {
-	case stateInit:
-		goto readLiteral
-	case stateDict:
-		goto copyHistory
-	}
-
-readLiteral:
-	// Read literal and/or (length, distance) according to RFC section 3.2.3.
-	{
-		var v int
-		{
-			// Inlined v, err := f.huffSym(f.hl)
-			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
-			// with single element, huffSym must error on these two edge cases. In both
-			// cases, the chunks slice will be 0 for the invalid sequence, leading it
-			// satisfy the n == 0 check below.
-			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
-			for {
-				for nb < n {
-					c, err := f.r.ReadByte()
-					if err != nil {
-						f.b = b
-						f.nb = nb
-						f.err = noEOF(err)
-						return
-					}
-					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
-				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
-				n = uint(chunk & huffmanCountMask)
-				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
-					n = uint(chunk & huffmanCountMask)
-				}
-				if n <= nb {
-					if n == 0 {
-						f.b = b
-						f.nb = nb
-						if debugDecode {
-							fmt.Println("huffsym: n==0")
-						}
-						f.err = CorruptInputError(f.roffset)
-						return
-					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
-					v = int(chunk >> huffmanValueShift)
-					break
-				}
-			}
-		}
-
-		var n uint // number of bits extra
-		var length int
-		var err error
-		switch {
-		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
-				f.step = (*decompressor).huffmanBlockGeneric
-				f.stepState = stateInit
-				return
-			}
-			goto readLiteral
-		case v == 256:
-			f.finishBlock()
-			return
-		// otherwise, reference to older data
-		case v < 265:
-			length = v - (257 - 3)
-			n = 0
-		case v < 269:
-			length = v*2 - (265*2 - 11)
-			n = 1
-		case v < 273:
-			length = v*4 - (269*4 - 19)
-			n = 2
-		case v < 277:
-			length = v*8 - (273*8 - 35)
-			n = 3
-		case v < 281:
-			length = v*16 - (277*16 - 67)
-			n = 4
-		case v < 285:
-			length = v*32 - (281*32 - 131)
-			n = 5
-		case v < maxNumLit:
-			length = 258
-			n = 0
-		default:
-			if debugDecode {
-				fmt.Println(v, ">= maxNumLit")
-			}
-			f.err = CorruptInputError(f.roffset)
-			return
-		}
-		if n > 0 {
-			for f.nb < n {
-				if err = f.moreBits(); err != nil {
-					if debugDecode {
-						fmt.Println("morebits n>0:", err)
-					}
-					f.err = err
-					return
-				}
-			}
-			length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1))
-			f.b >>= n & regSizeMaskUint32
-			f.nb -= n
-		}
-
-		var dist uint32
-		if f.hd == nil {
-			for f.nb < 5 {
-				if err = f.moreBits(); err != nil {
-					if debugDecode {
-						fmt.Println("morebits f.nb<5:", err)
-					}
-					f.err = err
-					return
-				}
-			}
-			dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
-		} else {
-			sym, err := f.huffSym(f.hd)
-			if err != nil {
-				if debugDecode {
-					fmt.Println("huffsym:", err)
-				}
-				f.err = err
-				return
-			}
-			dist = uint32(sym)
-		}
-
-		switch {
-		case dist < 4:
-			dist++
-		case dist < maxNumDist:
-			nb := uint(dist-2) >> 1
-			// have 1 bit in bottom of dist, need nb more.
-			extra := (dist & 1) << (nb & regSizeMaskUint32)
-			for f.nb < nb {
-				if err = f.moreBits(); err != nil {
-					if debugDecode {
-						fmt.Println("morebits f.nb<nb:", err)
-					}
-					f.err = err
-					return
-				}
-			}
-			extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1)
-			f.b >>= nb & regSizeMaskUint32
-			f.nb -= nb
-			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
-		default:
-			if debugDecode {
-				fmt.Println("dist too big:", dist, maxNumDist)
-			}
-			f.err = CorruptInputError(f.roffset)
-			return
-		}
-
-		// No check on length; encoding can be prescient.
-		if dist > uint32(f.dict.histSize()) {
-			if debugDecode {
-				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
-			}
-			f.err = CorruptInputError(f.roffset)
-			return
-		}
-
-		f.copyLen, f.copyDist = length, int(dist)
-		goto copyHistory
-	}
-
-copyHistory:
-	// Perform a backwards copy according to RFC section 3.2.3.
-	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
-		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
-		}
-		f.copyLen -= cnt
-
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
-			f.step = (*decompressor).huffmanBlockGeneric // We need to continue this work
-			f.stepState = stateDict
-			return
-		}
-		goto readLiteral
-	}
-}
-
 // Copy a single uncompressed data block from input to output.
 func (f *decompressor) dataBlock() {
 	// Uncompressed.
diff --git a/vendor/github.com/klauspost/compress/flate/inflate_gen.go b/vendor/github.com/klauspost/compress/flate/inflate_gen.go
index cc6db27925c..8d632cea0f5 100644
--- a/vendor/github.com/klauspost/compress/flate/inflate_gen.go
+++ b/vendor/github.com/klauspost/compress/flate/inflate_gen.go
@@ -21,6 +21,11 @@ func (f *decompressor) huffmanBytesBuffer() {
 	)
 	fr := f.r.(*bytes.Buffer)
 
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	fnb, fb := f.nb, f.b
+
 	switch f.stepState {
 	case stateInit:
 		goto readLiteral
@@ -39,41 +44,35 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					v = int(chunk >> huffmanValueShift)
 					break
 				}
@@ -88,10 +87,12 @@ readLiteral:
 				f.toRead = f.dict.readFlush()
 				f.step = (*decompressor).huffmanBytesBuffer
 				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
 				return
 			}
 			goto readLiteral
 		case v == 256:
+			f.b, f.nb = fb, fnb
 			f.finishBlock()
 			return
 		// otherwise, reference to older data
@@ -101,9 +102,10 @@ readLiteral:
 			val := decCodeToLen[(v - 257)]
 			length = int(val.length) + 3
 			n := uint(val.extra)
-			for f.nb < n {
+			for fnb < n {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits n>0:", err)
 					}
@@ -111,25 +113,27 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1))
-			f.b >>= n & regSizeMaskUint32
-			f.nb -= n
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
 		default:
 			if debugDecode {
 				fmt.Println(v, ">= maxNumLit")
 			}
 			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
 			return
 		}
 
 		var dist uint32
 		if f.hd == nil {
-			for f.nb < 5 {
+			for fnb < 5 {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<5:", err)
 					}
@@ -137,12 +141,12 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
 		} else {
 			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
 			// with single element, huffSym must error on these two edge cases. In both
@@ -152,38 +156,35 @@ readLiteral:
 			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
 			// but is smart enough to keep local variables in registers, so use nb and b,
 			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hd.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask]
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					dist = uint32(chunk >> huffmanValueShift)
 					break
 				}
@@ -197,9 +198,10 @@ readLiteral:
 			nb := uint(dist-2) >> 1
 			// have 1 bit in bottom of dist, need nb more.
 			extra := (dist & 1) << (nb & regSizeMaskUint32)
-			for f.nb < nb {
+			for fnb < nb {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<nb:", err)
 					}
@@ -207,14 +209,16 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1)
-			f.b >>= nb & regSizeMaskUint32
-			f.nb -= nb
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
 			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
 		default:
+			f.b, f.nb = fb, fnb
 			if debugDecode {
 				fmt.Println("dist too big:", dist, maxNumDist)
 			}
@@ -224,6 +228,7 @@ readLiteral:
 
 		// No check on length; encoding can be prescient.
 		if dist > uint32(f.dict.histSize()) {
+			f.b, f.nb = fb, fnb
 			if debugDecode {
 				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
 			}
@@ -248,10 +253,12 @@ copyHistory:
 			f.toRead = f.dict.readFlush()
 			f.step = (*decompressor).huffmanBytesBuffer // We need to continue this work
 			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
 			return
 		}
 		goto readLiteral
 	}
+	// Not reached
 }
 
 // Decode a single Huffman block from f.
@@ -265,6 +272,11 @@ func (f *decompressor) huffmanBytesReader() {
 	)
 	fr := f.r.(*bytes.Reader)
 
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	fnb, fb := f.nb, f.b
+
 	switch f.stepState {
 	case stateInit:
 		goto readLiteral
@@ -283,41 +295,35 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					v = int(chunk >> huffmanValueShift)
 					break
 				}
@@ -332,10 +338,12 @@ readLiteral:
 				f.toRead = f.dict.readFlush()
 				f.step = (*decompressor).huffmanBytesReader
 				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
 				return
 			}
 			goto readLiteral
 		case v == 256:
+			f.b, f.nb = fb, fnb
 			f.finishBlock()
 			return
 		// otherwise, reference to older data
@@ -345,9 +353,10 @@ readLiteral:
 			val := decCodeToLen[(v - 257)]
 			length = int(val.length) + 3
 			n := uint(val.extra)
-			for f.nb < n {
+			for fnb < n {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits n>0:", err)
 					}
@@ -355,25 +364,27 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1))
-			f.b >>= n & regSizeMaskUint32
-			f.nb -= n
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
 		default:
 			if debugDecode {
 				fmt.Println(v, ">= maxNumLit")
 			}
 			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
 			return
 		}
 
 		var dist uint32
 		if f.hd == nil {
-			for f.nb < 5 {
+			for fnb < 5 {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<5:", err)
 					}
@@ -381,12 +392,12 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
 		} else {
 			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
 			// with single element, huffSym must error on these two edge cases. In both
@@ -396,38 +407,35 @@ readLiteral:
 			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
 			// but is smart enough to keep local variables in registers, so use nb and b,
 			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hd.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask]
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					dist = uint32(chunk >> huffmanValueShift)
 					break
 				}
@@ -441,9 +449,10 @@ readLiteral:
 			nb := uint(dist-2) >> 1
 			// have 1 bit in bottom of dist, need nb more.
 			extra := (dist & 1) << (nb & regSizeMaskUint32)
-			for f.nb < nb {
+			for fnb < nb {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<nb:", err)
 					}
@@ -451,14 +460,16 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1)
-			f.b >>= nb & regSizeMaskUint32
-			f.nb -= nb
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
 			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
 		default:
+			f.b, f.nb = fb, fnb
 			if debugDecode {
 				fmt.Println("dist too big:", dist, maxNumDist)
 			}
@@ -468,6 +479,7 @@ readLiteral:
 
 		// No check on length; encoding can be prescient.
 		if dist > uint32(f.dict.histSize()) {
+			f.b, f.nb = fb, fnb
 			if debugDecode {
 				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
 			}
@@ -492,10 +504,12 @@ copyHistory:
 			f.toRead = f.dict.readFlush()
 			f.step = (*decompressor).huffmanBytesReader // We need to continue this work
 			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
 			return
 		}
 		goto readLiteral
 	}
+	// Not reached
 }
 
 // Decode a single Huffman block from f.
@@ -509,6 +523,11 @@ func (f *decompressor) huffmanBufioReader() {
 	)
 	fr := f.r.(*bufio.Reader)
 
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	fnb, fb := f.nb, f.b
+
 	switch f.stepState {
 	case stateInit:
 		goto readLiteral
@@ -527,41 +546,35 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					v = int(chunk >> huffmanValueShift)
 					break
 				}
@@ -576,10 +589,12 @@ readLiteral:
 				f.toRead = f.dict.readFlush()
 				f.step = (*decompressor).huffmanBufioReader
 				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
 				return
 			}
 			goto readLiteral
 		case v == 256:
+			f.b, f.nb = fb, fnb
 			f.finishBlock()
 			return
 		// otherwise, reference to older data
@@ -589,9 +604,10 @@ readLiteral:
 			val := decCodeToLen[(v - 257)]
 			length = int(val.length) + 3
 			n := uint(val.extra)
-			for f.nb < n {
+			for fnb < n {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits n>0:", err)
 					}
@@ -599,25 +615,27 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1))
-			f.b >>= n & regSizeMaskUint32
-			f.nb -= n
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
 		default:
 			if debugDecode {
 				fmt.Println(v, ">= maxNumLit")
 			}
 			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
 			return
 		}
 
 		var dist uint32
 		if f.hd == nil {
-			for f.nb < 5 {
+			for fnb < 5 {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<5:", err)
 					}
@@ -625,12 +643,12 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
 		} else {
 			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
 			// with single element, huffSym must error on these two edge cases. In both
@@ -640,38 +658,35 @@ readLiteral:
 			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
 			// but is smart enough to keep local variables in registers, so use nb and b,
 			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hd.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask]
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					dist = uint32(chunk >> huffmanValueShift)
 					break
 				}
@@ -685,9 +700,10 @@ readLiteral:
 			nb := uint(dist-2) >> 1
 			// have 1 bit in bottom of dist, need nb more.
 			extra := (dist & 1) << (nb & regSizeMaskUint32)
-			for f.nb < nb {
+			for fnb < nb {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<nb:", err)
 					}
@@ -695,14 +711,16 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1)
-			f.b >>= nb & regSizeMaskUint32
-			f.nb -= nb
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
 			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
 		default:
+			f.b, f.nb = fb, fnb
 			if debugDecode {
 				fmt.Println("dist too big:", dist, maxNumDist)
 			}
@@ -712,6 +730,7 @@ readLiteral:
 
 		// No check on length; encoding can be prescient.
 		if dist > uint32(f.dict.histSize()) {
+			f.b, f.nb = fb, fnb
 			if debugDecode {
 				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
 			}
@@ -736,10 +755,12 @@ copyHistory:
 			f.toRead = f.dict.readFlush()
 			f.step = (*decompressor).huffmanBufioReader // We need to continue this work
 			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
 			return
 		}
 		goto readLiteral
 	}
+	// Not reached
 }
 
 // Decode a single Huffman block from f.
@@ -753,6 +774,11 @@ func (f *decompressor) huffmanStringsReader() {
 	)
 	fr := f.r.(*strings.Reader)
 
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	fnb, fb := f.nb, f.b
+
 	switch f.stepState {
 	case stateInit:
 		goto readLiteral
@@ -771,41 +797,286 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hl.maxRead)
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
+				}
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var length int
+		switch {
+		case v < 256:
+			f.dict.writeByte(byte(v))
+			if f.dict.availWrite() == 0 {
+				f.toRead = f.dict.readFlush()
+				f.step = (*decompressor).huffmanStringsReader
+				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
+				return
+			}
+			goto readLiteral
+		case v == 256:
+			f.b, f.nb = fb, fnb
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+		case v < maxNumLit:
+			val := decCodeToLen[(v - 257)]
+			length = int(val.length) + 3
+			n := uint(val.extra)
+			for fnb < n {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits n>0:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
+		default:
+			if debugDecode {
+				fmt.Println(v, ">= maxNumLit")
+			}
+			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
+			return
+		}
+
+		var dist uint32
+		if f.hd == nil {
+			for fnb < 5 {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits f.nb<5:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
+		} else {
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hd.maxRead)
 			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
 			// but is smart enough to keep local variables in registers, so use nb and b,
 			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
+				}
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
+					dist = uint32(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << (nb & regSizeMaskUint32)
+			for fnb < nb {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits f.nb<nb:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
+			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
+		default:
+			f.b, f.nb = fb, fnb
+			if debugDecode {
+				fmt.Println("dist too big:", dist, maxNumDist)
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > uint32(f.dict.histSize()) {
+			f.b, f.nb = fb, fnb
+			if debugDecode {
+				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, int(dist)
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if f.dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = f.dict.readFlush()
+			f.step = (*decompressor).huffmanStringsReader // We need to continue this work
+			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
+			return
+		}
+		goto readLiteral
+	}
+	// Not reached
+}
+
+// Decode a single Huffman block from f.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, using the
+// fixed distance encoding associated with fixed Huffman blocks.
+func (f *decompressor) huffmanGenericReader() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.(Reader)
+
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	fnb, fb := f.nb, f.b
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			for {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					v = int(chunk >> huffmanValueShift)
 					break
 				}
@@ -818,12 +1089,14 @@ readLiteral:
 			f.dict.writeByte(byte(v))
 			if f.dict.availWrite() == 0 {
 				f.toRead = f.dict.readFlush()
-				f.step = (*decompressor).huffmanStringsReader
+				f.step = (*decompressor).huffmanGenericReader
 				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
 				return
 			}
 			goto readLiteral
 		case v == 256:
+			f.b, f.nb = fb, fnb
 			f.finishBlock()
 			return
 		// otherwise, reference to older data
@@ -833,9 +1106,10 @@ readLiteral:
 			val := decCodeToLen[(v - 257)]
 			length = int(val.length) + 3
 			n := uint(val.extra)
-			for f.nb < n {
+			for fnb < n {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits n>0:", err)
 					}
@@ -843,25 +1117,27 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1))
-			f.b >>= n & regSizeMaskUint32
-			f.nb -= n
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
 		default:
 			if debugDecode {
 				fmt.Println(v, ">= maxNumLit")
 			}
 			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
 			return
 		}
 
 		var dist uint32
 		if f.hd == nil {
-			for f.nb < 5 {
+			for fnb < 5 {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<5:", err)
 					}
@@ -869,12 +1145,12 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
 		} else {
 			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
 			// with single element, huffSym must error on these two edge cases. In both
@@ -884,38 +1160,35 @@ readLiteral:
 			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
 			// but is smart enough to keep local variables in registers, so use nb and b,
 			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & regSizeMaskUint32)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hd.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask]
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & regSizeMaskUint32)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					dist = uint32(chunk >> huffmanValueShift)
 					break
 				}
@@ -929,9 +1202,10 @@ readLiteral:
 			nb := uint(dist-2) >> 1
 			// have 1 bit in bottom of dist, need nb more.
 			extra := (dist & 1) << (nb & regSizeMaskUint32)
-			for f.nb < nb {
+			for fnb < nb {
 				c, err := fr.ReadByte()
 				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<nb:", err)
 					}
@@ -939,14 +1213,16 @@ readLiteral:
 					return
 				}
 				f.roffset++
-				f.b |= uint32(c) << f.nb
-				f.nb += 8
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1)
-			f.b >>= nb & regSizeMaskUint32
-			f.nb -= nb
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
 			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
 		default:
+			f.b, f.nb = fb, fnb
 			if debugDecode {
 				fmt.Println("dist too big:", dist, maxNumDist)
 			}
@@ -956,6 +1232,7 @@ readLiteral:
 
 		// No check on length; encoding can be prescient.
 		if dist > uint32(f.dict.histSize()) {
+			f.b, f.nb = fb, fnb
 			if debugDecode {
 				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
 			}
@@ -978,12 +1255,14 @@ copyHistory:
 
 		if f.dict.availWrite() == 0 || f.copyLen > 0 {
 			f.toRead = f.dict.readFlush()
-			f.step = (*decompressor).huffmanStringsReader // We need to continue this work
+			f.step = (*decompressor).huffmanGenericReader // We need to continue this work
 			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
 			return
 		}
 		goto readLiteral
 	}
+	// Not reached
 }
 
 func (f *decompressor) huffmanBlockDecoder() func() {
@@ -996,7 +1275,9 @@ func (f *decompressor) huffmanBlockDecoder() func() {
 		return f.huffmanBufioReader
 	case *strings.Reader:
 		return f.huffmanStringsReader
+	case Reader:
+		return f.huffmanGenericReader
 	default:
-		return f.huffmanBlockGeneric
+		return f.huffmanGenericReader
 	}
 }
diff --git a/vendor/github.com/klauspost/compress/flate/level1.go b/vendor/github.com/klauspost/compress/flate/level1.go
index 1e5eea3968a..0f14f8d63b4 100644
--- a/vendor/github.com/klauspost/compress/flate/level1.go
+++ b/vendor/github.com/klauspost/compress/flate/level1.go
@@ -1,6 +1,10 @@
 package flate
 
-import "fmt"
+import (
+	"encoding/binary"
+	"fmt"
+	"math/bits"
+)
 
 // fastGen maintains the table for matches,
 // and the previous byte block for level 2.
@@ -116,7 +120,32 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 
 			// Extend the 4-byte match as long as possible.
 			t := candidate.offset - e.cur
-			l := e.matchlenLong(s+4, t+4, src) + 4
+			var l = int32(4)
+			if false {
+				l = e.matchlenLong(s+4, t+4, src) + 4
+			} else {
+				// inlined:
+				a := src[s+4:]
+				b := src[t+4:]
+				for len(a) >= 8 {
+					if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 {
+						l += int32(bits.TrailingZeros64(diff) >> 3)
+						break
+					}
+					l += 8
+					a = a[8:]
+					b = b[8:]
+				}
+				if len(a) < 8 {
+					b = b[:len(a)]
+					for i := range a {
+						if a[i] != b[i] {
+							break
+						}
+						l++
+					}
+				}
+			}
 
 			// Extend backwards
 			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
@@ -125,11 +154,43 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 				l++
 			}
 			if nextEmit < s {
-				emitLiteral(dst, src[nextEmit:s])
+				if false {
+					emitLiteral(dst, src[nextEmit:s])
+				} else {
+					for _, v := range src[nextEmit:s] {
+						dst.tokens[dst.n] = token(v)
+						dst.litHist[v]++
+						dst.n++
+					}
+				}
 			}
 
 			// Save the match found
-			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+			if false {
+				dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+			} else {
+				// Inlined...
+				xoffset := uint32(s - t - baseMatchOffset)
+				xlength := l
+				oc := offsetCode(xoffset)
+				xoffset |= oc << 16
+				for xlength > 0 {
+					xl := xlength
+					if xl > 258 {
+						if xl > 258+baseMatchLength {
+							xl = 258
+						} else {
+							xl = 258 - baseMatchLength
+						}
+					}
+					xlength -= xl
+					xl -= baseMatchLength
+					dst.extraHist[lengthCodes1[uint8(xl)]]++
+					dst.offHist[oc]++
+					dst.tokens[dst.n] = token(matchType | uint32(xl)<<lengthShift | xoffset)
+					dst.n++
+				}
+			}
 			s += l
 			nextEmit = s
 			if nextS >= s {
diff --git a/vendor/github.com/klauspost/compress/flate/level2.go b/vendor/github.com/klauspost/compress/flate/level2.go
index 234c4389ab3..8603fbd55ad 100644
--- a/vendor/github.com/klauspost/compress/flate/level2.go
+++ b/vendor/github.com/klauspost/compress/flate/level2.go
@@ -134,7 +134,15 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 				l++
 			}
 			if nextEmit < s {
-				emitLiteral(dst, src[nextEmit:s])
+				if false {
+					emitLiteral(dst, src[nextEmit:s])
+				} else {
+					for _, v := range src[nextEmit:s] {
+						dst.tokens[dst.n] = token(v)
+						dst.litHist[v]++
+						dst.n++
+					}
+				}
 			}
 
 			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
diff --git a/vendor/github.com/klauspost/compress/flate/level3.go b/vendor/github.com/klauspost/compress/flate/level3.go
index c22b4244a5c..039639f8989 100644
--- a/vendor/github.com/klauspost/compress/flate/level3.go
+++ b/vendor/github.com/klauspost/compress/flate/level3.go
@@ -5,7 +5,7 @@ import "fmt"
 // fastEncL3
 type fastEncL3 struct {
 	fastGen
-	table [tableSize]tableEntryPrev
+	table [1 << 16]tableEntryPrev
 }
 
 // Encode uses a similar algorithm to level 2, will check up to two candidates.
@@ -13,6 +13,8 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 	const (
 		inputMargin            = 8 - 1
 		minNonLiteralBlockSize = 1 + 1 + inputMargin
+		tableBits              = 16
+		tableSize              = 1 << tableBits
 	)
 
 	if debugDeflate && e.cur < 0 {
@@ -73,7 +75,7 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 		nextS := s
 		var candidate tableEntry
 		for {
-			nextHash := hash(cv)
+			nextHash := hash4u(cv, tableBits)
 			s = nextS
 			nextS = s + 1 + (s-nextEmit)>>skipLog
 			if nextS > sLimit {
@@ -141,7 +143,15 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 				l++
 			}
 			if nextEmit < s {
-				emitLiteral(dst, src[nextEmit:s])
+				if false {
+					emitLiteral(dst, src[nextEmit:s])
+				} else {
+					for _, v := range src[nextEmit:s] {
+						dst.tokens[dst.n] = token(v)
+						dst.litHist[v]++
+						dst.n++
+					}
+				}
 			}
 
 			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
@@ -156,7 +166,7 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 				// Index first pair after match end.
 				if int(t+4) < len(src) && t > 0 {
 					cv := load3232(src, t)
-					nextHash := hash(cv)
+					nextHash := hash4u(cv, tableBits)
 					e.table[nextHash] = tableEntryPrev{
 						Prev: e.table[nextHash].Cur,
 						Cur:  tableEntry{offset: e.cur + t},
@@ -165,30 +175,31 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 				goto emitRemainder
 			}
 
-			// We could immediately start working at s now, but to improve
-			// compression we first update the hash table at s-3 to s.
-			x := load6432(src, s-3)
-			prevHash := hash(uint32(x))
-			e.table[prevHash] = tableEntryPrev{
-				Prev: e.table[prevHash].Cur,
-				Cur:  tableEntry{offset: e.cur + s - 3},
+			// Store every 5th hash in-between.
+			for i := s - l + 2; i < s-5; i += 5 {
+				nextHash := hash4u(load3232(src, i), tableBits)
+				e.table[nextHash] = tableEntryPrev{
+					Prev: e.table[nextHash].Cur,
+					Cur:  tableEntry{offset: e.cur + i}}
 			}
-			x >>= 8
-			prevHash = hash(uint32(x))
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-2 to s.
+			x := load6432(src, s-2)
+			prevHash := hash4u(uint32(x), tableBits)
 
 			e.table[prevHash] = tableEntryPrev{
 				Prev: e.table[prevHash].Cur,
 				Cur:  tableEntry{offset: e.cur + s - 2},
 			}
 			x >>= 8
-			prevHash = hash(uint32(x))
+			prevHash = hash4u(uint32(x), tableBits)
 
 			e.table[prevHash] = tableEntryPrev{
 				Prev: e.table[prevHash].Cur,
 				Cur:  tableEntry{offset: e.cur + s - 1},
 			}
 			x >>= 8
-			currHash := hash(uint32(x))
+			currHash := hash4u(uint32(x), tableBits)
 			candidates := e.table[currHash]
 			cv = uint32(x)
 			e.table[currHash] = tableEntryPrev{
@@ -200,15 +211,15 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 			candidate = candidates.Cur
 			minOffset := e.cur + s - (maxMatchOffset - 4)
 
-			if candidate.offset > minOffset && cv != load3232(src, candidate.offset-e.cur) {
-				// We only check if value mismatches.
-				// Offset will always be invalid in other cases.
+			if candidate.offset > minOffset {
+				if cv == load3232(src, candidate.offset-e.cur) {
+					// Found a match...
+					continue
+				}
 				candidate = candidates.Prev
 				if candidate.offset > minOffset && cv == load3232(src, candidate.offset-e.cur) {
-					offset := s - (candidate.offset - e.cur)
-					if offset <= maxMatchOffset {
-						continue
-					}
+					// Match at prev...
+					continue
 				}
 			}
 			cv = uint32(x >> 8)
diff --git a/vendor/github.com/klauspost/compress/flate/level4.go b/vendor/github.com/klauspost/compress/flate/level4.go
index e62f0c02b1e..1cbffa1aefe 100644
--- a/vendor/github.com/klauspost/compress/flate/level4.go
+++ b/vendor/github.com/klauspost/compress/flate/level4.go
@@ -135,7 +135,15 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) {
 			l++
 		}
 		if nextEmit < s {
-			emitLiteral(dst, src[nextEmit:s])
+			if false {
+				emitLiteral(dst, src[nextEmit:s])
+			} else {
+				for _, v := range src[nextEmit:s] {
+					dst.tokens[dst.n] = token(v)
+					dst.litHist[v]++
+					dst.n++
+				}
+			}
 		}
 		if debugDeflate {
 			if t >= s {
diff --git a/vendor/github.com/klauspost/compress/flate/level5.go b/vendor/github.com/klauspost/compress/flate/level5.go
index 293a3a320b7..4b97576bd38 100644
--- a/vendor/github.com/klauspost/compress/flate/level5.go
+++ b/vendor/github.com/klauspost/compress/flate/level5.go
@@ -210,7 +210,15 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 			l++
 		}
 		if nextEmit < s {
-			emitLiteral(dst, src[nextEmit:s])
+			if false {
+				emitLiteral(dst, src[nextEmit:s])
+			} else {
+				for _, v := range src[nextEmit:s] {
+					dst.tokens[dst.n] = token(v)
+					dst.litHist[v]++
+					dst.n++
+				}
+			}
 		}
 		if debugDeflate {
 			if t >= s {
diff --git a/vendor/github.com/klauspost/compress/flate/level6.go b/vendor/github.com/klauspost/compress/flate/level6.go
index a709977ec49..62888edf3cd 100644
--- a/vendor/github.com/klauspost/compress/flate/level6.go
+++ b/vendor/github.com/klauspost/compress/flate/level6.go
@@ -243,7 +243,15 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 			l++
 		}
 		if nextEmit < s {
-			emitLiteral(dst, src[nextEmit:s])
+			if false {
+				emitLiteral(dst, src[nextEmit:s])
+			} else {
+				for _, v := range src[nextEmit:s] {
+					dst.tokens[dst.n] = token(v)
+					dst.litHist[v]++
+					dst.n++
+				}
+			}
 		}
 		if false {
 			if t >= s {
diff --git a/vendor/github.com/klauspost/compress/flate/stateless.go b/vendor/github.com/klauspost/compress/flate/stateless.go
index 53e89912463..544162a4318 100644
--- a/vendor/github.com/klauspost/compress/flate/stateless.go
+++ b/vendor/github.com/klauspost/compress/flate/stateless.go
@@ -249,7 +249,15 @@ func statelessEnc(dst *tokens, src []byte, startAt int16) {
 				l++
 			}
 			if nextEmit < s {
-				emitLiteral(dst, src[nextEmit:s])
+				if false {
+					emitLiteral(dst, src[nextEmit:s])
+				} else {
+					for _, v := range src[nextEmit:s] {
+						dst.tokens[dst.n] = token(v)
+						dst.litHist[v]++
+						dst.n++
+					}
+				}
 			}
 
 			// Save the match found
diff --git a/vendor/github.com/klauspost/compress/flate/token.go b/vendor/github.com/klauspost/compress/flate/token.go
index eb862d7a920..d818790c132 100644
--- a/vendor/github.com/klauspost/compress/flate/token.go
+++ b/vendor/github.com/klauspost/compress/flate/token.go
@@ -13,11 +13,10 @@ import (
 )
 
 const (
-	// From top
-	// 2 bits:   type   0 = literal  1=EOF  2=Match   3=Unused
-	// 8 bits:   xlength = length - MIN_MATCH_LENGTH
-	// 5 bits    offsetcode
-	// 16 bits   xoffset = offset - MIN_OFFSET_SIZE, or literal
+	// bits 0-16  	xoffset = offset - MIN_OFFSET_SIZE, or literal - 16 bits
+	// bits 16-22	offsetcode - 5 bits
+	// bits 22-30   xlength = length - MIN_MATCH_LENGTH - 8 bits
+	// bits 30-32   type   0 = literal  1=EOF  2=Match   3=Unused - 2 bits
 	lengthShift         = 22
 	offsetMask          = 1<<lengthShift - 1
 	typeMask            = 3 << 30
@@ -129,11 +128,11 @@ var offsetCodes14 = [256]uint32{
 type token uint32
 
 type tokens struct {
-	nLits     int
 	extraHist [32]uint16  // codes 256->maxnumlit
 	offHist   [32]uint16  // offset codes
 	litHist   [256]uint16 // codes 0->255
-	n         uint16      // Must be able to contain maxStoreBlockSize
+	nFilled   int
+	n         uint16 // Must be able to contain maxStoreBlockSize
 	tokens    [maxStoreBlockSize + 1]token
 }
 
@@ -142,7 +141,7 @@ func (t *tokens) Reset() {
 		return
 	}
 	t.n = 0
-	t.nLits = 0
+	t.nFilled = 0
 	for i := range t.litHist[:] {
 		t.litHist[i] = 0
 	}
@@ -161,12 +160,12 @@ func (t *tokens) Fill() {
 	for i, v := range t.litHist[:] {
 		if v == 0 {
 			t.litHist[i] = 1
-			t.nLits++
+			t.nFilled++
 		}
 	}
 	for i, v := range t.extraHist[:literalCount-256] {
 		if v == 0 {
-			t.nLits++
+			t.nFilled++
 			t.extraHist[i] = 1
 		}
 	}
@@ -196,20 +195,17 @@ func (t *tokens) indexTokens(in []token) {
 
 // emitLiteral writes a literal chunk and returns the number of bytes written.
 func emitLiteral(dst *tokens, lit []byte) {
-	ol := int(dst.n)
-	for i, v := range lit {
-		dst.tokens[(i+ol)&maxStoreBlockSize] = token(v)
+	for _, v := range lit {
+		dst.tokens[dst.n] = token(v)
 		dst.litHist[v]++
+		dst.n++
 	}
-	dst.n += uint16(len(lit))
-	dst.nLits += len(lit)
 }
 
 func (t *tokens) AddLiteral(lit byte) {
 	t.tokens[t.n] = token(lit)
 	t.litHist[lit]++
 	t.n++
-	t.nLits++
 }
 
 // from https://stackoverflow.com/a/28730362
@@ -230,8 +226,9 @@ func (t *tokens) EstimatedBits() int {
 	shannon := float32(0)
 	bits := int(0)
 	nMatches := 0
-	if t.nLits > 0 {
-		invTotal := 1.0 / float32(t.nLits)
+	total := int(t.n) + t.nFilled
+	if total > 0 {
+		invTotal := 1.0 / float32(total)
 		for _, v := range t.litHist[:] {
 			if v > 0 {
 				n := float32(v)
@@ -275,10 +272,9 @@ func (t *tokens) AddMatch(xlength uint32, xoffset uint32) {
 	}
 	oCode := offsetCode(xoffset)
 	xoffset |= oCode << 16
-	t.nLits++
 
 	t.extraHist[lengthCodes1[uint8(xlength)]]++
-	t.offHist[oCode]++
+	t.offHist[oCode&31]++
 	t.tokens[t.n] = token(matchType | xlength<<lengthShift | xoffset)
 	t.n++
 }
@@ -297,13 +293,16 @@ func (t *tokens) AddMatchLong(xlength int32, xoffset uint32) {
 		xl := xlength
 		if xl > 258 {
 			// We need to have at least baseMatchLength left over for next loop.
-			xl = 258 - baseMatchLength
+			if xl > 258+baseMatchLength {
+				xl = 258
+			} else {
+				xl = 258 - baseMatchLength
+			}
 		}
 		xlength -= xl
 		xl -= baseMatchLength
-		t.nLits++
 		t.extraHist[lengthCodes1[uint8(xl)]]++
-		t.offHist[oc]++
+		t.offHist[oc&31]++
 		t.tokens[t.n] = token(matchType | uint32(xl)<<lengthShift | xoffset)
 		t.n++
 	}
@@ -359,8 +358,8 @@ func (t token) offset() uint32 { return uint32(t) & offsetMask }
 
 func (t token) length() uint8 { return uint8(t >> lengthShift) }
 
-// The code is never more than 8 bits, but is returned as uint32 for convenience.
-func lengthCode(len uint8) uint32 { return uint32(lengthCodes[len]) }
+// Convert length to code.
+func lengthCode(len uint8) uint8 { return lengthCodes[len] }
 
 // Returns the offset code corresponding to a specific offset
 func offsetCode(off uint32) uint32 {
diff --git a/vendor/github.com/klauspost/compress/huff0/autogen.go b/vendor/github.com/klauspost/compress/huff0/autogen.go
new file mode 100644
index 00000000000..ff2c69d60cf
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/autogen.go
@@ -0,0 +1,5 @@
+package huff0
+
+//go:generate go run generate.go
+//go:generate asmfmt -w decompress_amd64.s
+//go:generate asmfmt -w decompress_8b_amd64.s
diff --git a/vendor/github.com/klauspost/compress/huff0/bitreader.go b/vendor/github.com/klauspost/compress/huff0/bitreader.go
index a4979e8868a..451160edda3 100644
--- a/vendor/github.com/klauspost/compress/huff0/bitreader.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitreader.go
@@ -8,115 +8,10 @@ package huff0
 import (
 	"encoding/binary"
 	"errors"
+	"fmt"
 	"io"
 )
 
-// bitReader reads a bitstream in reverse.
-// The last set bit indicates the start of the stream and is used
-// for aligning the input.
-type bitReader struct {
-	in       []byte
-	off      uint // next byte to read is at in[off - 1]
-	value    uint64
-	bitsRead uint8
-}
-
-// init initializes and resets the bit reader.
-func (b *bitReader) init(in []byte) error {
-	if len(in) < 1 {
-		return errors.New("corrupt stream: too short")
-	}
-	b.in = in
-	b.off = uint(len(in))
-	// The highest bit of the last byte indicates where to start
-	v := in[len(in)-1]
-	if v == 0 {
-		return errors.New("corrupt stream, did not find end of stream")
-	}
-	b.bitsRead = 64
-	b.value = 0
-	if len(in) >= 8 {
-		b.fillFastStart()
-	} else {
-		b.fill()
-		b.fill()
-	}
-	b.bitsRead += 8 - uint8(highBit32(uint32(v)))
-	return nil
-}
-
-// peekBitsFast requires that at least one bit is requested every time.
-// There are no checks if the buffer is filled.
-func (b *bitReader) peekBitsFast(n uint8) uint16 {
-	const regMask = 64 - 1
-	v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
-	return v
-}
-
-// fillFast() will make sure at least 32 bits are available.
-// There must be at least 4 bytes available.
-func (b *bitReader) fillFast() {
-	if b.bitsRead < 32 {
-		return
-	}
-
-	// 2 bounds checks.
-	v := b.in[b.off-4 : b.off]
-	v = v[:4]
-	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	b.value = (b.value << 32) | uint64(low)
-	b.bitsRead -= 32
-	b.off -= 4
-}
-
-func (b *bitReader) advance(n uint8) {
-	b.bitsRead += n
-}
-
-// fillFastStart() assumes the bitreader is empty and there is at least 8 bytes to read.
-func (b *bitReader) fillFastStart() {
-	// Do single re-slice to avoid bounds checks.
-	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
-	b.bitsRead = 0
-	b.off -= 8
-}
-
-// fill() will make sure at least 32 bits are available.
-func (b *bitReader) fill() {
-	if b.bitsRead < 32 {
-		return
-	}
-	if b.off > 4 {
-		v := b.in[b.off-4:]
-		v = v[:4]
-		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-		b.value = (b.value << 32) | uint64(low)
-		b.bitsRead -= 32
-		b.off -= 4
-		return
-	}
-	for b.off > 0 {
-		b.value = (b.value << 8) | uint64(b.in[b.off-1])
-		b.bitsRead -= 8
-		b.off--
-	}
-}
-
-// finished returns true if all bits have been read from the bit stream.
-func (b *bitReader) finished() bool {
-	return b.off == 0 && b.bitsRead >= 64
-}
-
-// close the bitstream and returns an error if out-of-buffer reads occurred.
-func (b *bitReader) close() error {
-	// Release reference.
-	b.in = nil
-	if b.bitsRead > 64 {
-		return io.ErrUnexpectedEOF
-	}
-	return nil
-}
-
 // bitReader reads a bitstream in reverse.
 // The last set bit indicates the start of the stream and is used
 // for aligning the input.
@@ -213,10 +108,17 @@ func (b *bitReaderBytes) finished() bool {
 	return b.off == 0 && b.bitsRead >= 64
 }
 
+func (b *bitReaderBytes) remaining() uint {
+	return b.off*8 + uint(64-b.bitsRead)
+}
+
 // close the bitstream and returns an error if out-of-buffer reads occurred.
 func (b *bitReaderBytes) close() error {
 	// Release reference.
 	b.in = nil
+	if b.remaining() > 0 {
+		return fmt.Errorf("corrupt input: %d bits remain on stream", b.remaining())
+	}
 	if b.bitsRead > 64 {
 		return io.ErrUnexpectedEOF
 	}
@@ -263,6 +165,11 @@ func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 {
 	return uint16(b.value >> ((64 - n) & 63))
 }
 
+// peekTopBits(n) is equivalent to peekBitsFast(64 - n).
+func (b *bitReaderShifted) peekTopBits(n uint8) uint16 {
+	return uint16(b.value >> n)
+}
+
 func (b *bitReaderShifted) advance(n uint8) {
 	b.bitsRead += n
 	b.value <<= n & 63
@@ -318,10 +225,17 @@ func (b *bitReaderShifted) finished() bool {
 	return b.off == 0 && b.bitsRead >= 64
 }
 
+func (b *bitReaderShifted) remaining() uint {
+	return b.off*8 + uint(64-b.bitsRead)
+}
+
 // close the bitstream and returns an error if out-of-buffer reads occurred.
 func (b *bitReaderShifted) close() error {
 	// Release reference.
 	b.in = nil
+	if b.remaining() > 0 {
+		return fmt.Errorf("corrupt input: %d bits remain on stream", b.remaining())
+	}
 	if b.bitsRead > 64 {
 		return io.ErrUnexpectedEOF
 	}
diff --git a/vendor/github.com/klauspost/compress/huff0/compress.go b/vendor/github.com/klauspost/compress/huff0/compress.go
index 8323dc05389..bc95ac623bd 100644
--- a/vendor/github.com/klauspost/compress/huff0/compress.go
+++ b/vendor/github.com/klauspost/compress/huff0/compress.go
@@ -2,6 +2,7 @@ package huff0
 
 import (
 	"fmt"
+	"math"
 	"runtime"
 	"sync"
 )
@@ -289,6 +290,10 @@ func (s *Scratch) compress4X(src []byte) ([]byte, error) {
 		if err != nil {
 			return nil, err
 		}
+		if len(s.Out)-idx > math.MaxUint16 {
+			// We cannot store the size in the jump table
+			return nil, ErrIncompressible
+		}
 		// Write compressed length as little endian before block.
 		if i < 3 {
 			// Last length is not written.
@@ -332,6 +337,10 @@ func (s *Scratch) compress4Xp(src []byte) ([]byte, error) {
 			return nil, errs[i]
 		}
 		o := s.tmpOut[i]
+		if len(o) > math.MaxUint16 {
+			// We cannot store the size in the jump table
+			return nil, ErrIncompressible
+		}
 		// Write compressed length as little endian before block.
 		if i < 3 {
 			// Last length is not written.
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go
index 9b7cc8e97bb..04f6529955e 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress.go
@@ -4,6 +4,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"sync"
 
 	"github.com/klauspost/compress/fse"
 )
@@ -20,7 +21,7 @@ type dEntrySingle struct {
 
 // double-symbols decoding
 type dEntryDouble struct {
-	seq   uint16
+	seq   [4]byte
 	nBits uint8
 	len   uint8
 }
@@ -216,6 +217,7 @@ func (s *Scratch) Decoder() *Decoder {
 	return &Decoder{
 		dt:             s.dt,
 		actualTableLog: s.actualTableLog,
+		bufs:           &s.decPool,
 	}
 }
 
@@ -223,6 +225,15 @@ func (s *Scratch) Decoder() *Decoder {
 type Decoder struct {
 	dt             dTable
 	actualTableLog uint8
+	bufs           *sync.Pool
+}
+
+func (d *Decoder) buffer() *[4][256]byte {
+	buf, ok := d.bufs.Get().(*[4][256]byte)
+	if ok {
+		return buf
+	}
+	return &[4][256]byte{}
 }
 
 // Decompress1X will decompress a 1X encoded stream.
@@ -249,7 +260,8 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 	dt := d.dt.single[:tlSize]
 
 	// Use temp table to avoid bound checks/append penalty.
-	var buf [256]byte
+	bufs := d.buffer()
+	buf := &bufs[0]
 	var off uint8
 
 	for br.off >= 8 {
@@ -277,6 +289,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 		if off == 0 {
 			if len(dst)+256 > maxDecodedSize {
 				br.close()
+				d.bufs.Put(bufs)
 				return nil, ErrMaxDecodedSizeExceeded
 			}
 			dst = append(dst, buf[:]...)
@@ -284,6 +297,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 	}
 
 	if len(dst)+int(off) > maxDecodedSize {
+		d.bufs.Put(bufs)
 		br.close()
 		return nil, ErrMaxDecodedSizeExceeded
 	}
@@ -310,6 +324,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 			}
 		}
 		if len(dst) >= maxDecodedSize {
+			d.bufs.Put(bufs)
 			br.close()
 			return nil, ErrMaxDecodedSizeExceeded
 		}
@@ -319,6 +334,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 		bitsLeft -= nBits
 		dst = append(dst, uint8(v.entry>>8))
 	}
+	d.bufs.Put(bufs)
 	return dst, br.close()
 }
 
@@ -341,7 +357,8 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 	dt := d.dt.single[:256]
 
 	// Use temp table to avoid bound checks/append penalty.
-	var buf [256]byte
+	bufs := d.buffer()
+	buf := &bufs[0]
 	var off uint8
 
 	switch d.actualTableLog {
@@ -369,6 +386,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 			if off == 0 {
 				if len(dst)+256 > maxDecodedSize {
 					br.close()
+					d.bufs.Put(bufs)
 					return nil, ErrMaxDecodedSizeExceeded
 				}
 				dst = append(dst, buf[:]...)
@@ -398,6 +416,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 			if off == 0 {
 				if len(dst)+256 > maxDecodedSize {
 					br.close()
+					d.bufs.Put(bufs)
 					return nil, ErrMaxDecodedSizeExceeded
 				}
 				dst = append(dst, buf[:]...)
@@ -426,6 +445,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 			off += 4
 			if off == 0 {
 				if len(dst)+256 > maxDecodedSize {
+					d.bufs.Put(bufs)
 					br.close()
 					return nil, ErrMaxDecodedSizeExceeded
 				}
@@ -455,6 +475,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 			off += 4
 			if off == 0 {
 				if len(dst)+256 > maxDecodedSize {
+					d.bufs.Put(bufs)
 					br.close()
 					return nil, ErrMaxDecodedSizeExceeded
 				}
@@ -484,6 +505,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 			off += 4
 			if off == 0 {
 				if len(dst)+256 > maxDecodedSize {
+					d.bufs.Put(bufs)
 					br.close()
 					return nil, ErrMaxDecodedSizeExceeded
 				}
@@ -513,6 +535,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 			off += 4
 			if off == 0 {
 				if len(dst)+256 > maxDecodedSize {
+					d.bufs.Put(bufs)
 					br.close()
 					return nil, ErrMaxDecodedSizeExceeded
 				}
@@ -542,6 +565,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 			off += 4
 			if off == 0 {
 				if len(dst)+256 > maxDecodedSize {
+					d.bufs.Put(bufs)
 					br.close()
 					return nil, ErrMaxDecodedSizeExceeded
 				}
@@ -571,6 +595,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 			off += 4
 			if off == 0 {
 				if len(dst)+256 > maxDecodedSize {
+					d.bufs.Put(bufs)
 					br.close()
 					return nil, ErrMaxDecodedSizeExceeded
 				}
@@ -578,10 +603,12 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 			}
 		}
 	default:
+		d.bufs.Put(bufs)
 		return nil, fmt.Errorf("invalid tablelog: %d", d.actualTableLog)
 	}
 
 	if len(dst)+int(off) > maxDecodedSize {
+		d.bufs.Put(bufs)
 		br.close()
 		return nil, ErrMaxDecodedSizeExceeded
 	}
@@ -601,6 +628,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 		}
 		if len(dst) >= maxDecodedSize {
 			br.close()
+			d.bufs.Put(bufs)
 			return nil, ErrMaxDecodedSizeExceeded
 		}
 		v := dt[br.peekByteFast()>>shift]
@@ -609,6 +637,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 		bitsLeft -= int8(nBits)
 		dst = append(dst, uint8(v.entry>>8))
 	}
+	d.bufs.Put(bufs)
 	return dst, br.close()
 }
 
@@ -628,7 +657,8 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
 	dt := d.dt.single[:256]
 
 	// Use temp table to avoid bound checks/append penalty.
-	var buf [256]byte
+	bufs := d.buffer()
+	buf := &bufs[0]
 	var off uint8
 
 	const shift = 56
@@ -655,6 +685,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
 		off += 4
 		if off == 0 {
 			if len(dst)+256 > maxDecodedSize {
+				d.bufs.Put(bufs)
 				br.close()
 				return nil, ErrMaxDecodedSizeExceeded
 			}
@@ -663,6 +694,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
 	}
 
 	if len(dst)+int(off) > maxDecodedSize {
+		d.bufs.Put(bufs)
 		br.close()
 		return nil, ErrMaxDecodedSizeExceeded
 	}
@@ -679,6 +711,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
 			}
 		}
 		if len(dst) >= maxDecodedSize {
+			d.bufs.Put(bufs)
 			br.close()
 			return nil, ErrMaxDecodedSizeExceeded
 		}
@@ -688,199 +721,10 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
 		bitsLeft -= int8(nBits)
 		dst = append(dst, uint8(v.entry>>8))
 	}
+	d.bufs.Put(bufs)
 	return dst, br.close()
 }
 
-// Decompress4X will decompress a 4X encoded stream.
-// The length of the supplied input must match the end of a block exactly.
-// The *capacity* of the dst slice must match the destination size of
-// the uncompressed data exactly.
-func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
-	if len(d.dt.single) == 0 {
-		return nil, errors.New("no table loaded")
-	}
-	if len(src) < 6+(4*1) {
-		return nil, errors.New("input too small")
-	}
-	if use8BitTables && d.actualTableLog <= 8 {
-		return d.decompress4X8bit(dst, src)
-	}
-
-	var br [4]bitReaderShifted
-	start := 6
-	for i := 0; i < 3; i++ {
-		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
-		if start+length >= len(src) {
-			return nil, errors.New("truncated input (or invalid offset)")
-		}
-		err := br[i].init(src[start : start+length])
-		if err != nil {
-			return nil, err
-		}
-		start += length
-	}
-	err := br[3].init(src[start:])
-	if err != nil {
-		return nil, err
-	}
-
-	// destination, offset to match first output
-	dstSize := cap(dst)
-	dst = dst[:dstSize]
-	out := dst
-	dstEvery := (dstSize + 3) / 4
-
-	const tlSize = 1 << tableLogMax
-	const tlMask = tlSize - 1
-	single := d.dt.single[:tlSize]
-
-	// Use temp table to avoid bound checks/append penalty.
-	var buf [256]byte
-	var off uint8
-	var decoded int
-
-	// Decode 2 values from each decoder/loop.
-	const bufoff = 256 / 4
-	for {
-		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
-			break
-		}
-
-		{
-			const stream = 0
-			const stream2 = 1
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			val := br[stream].peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask]
-			br[stream].advance(uint8(v.entry))
-			buf[off+bufoff*stream] = uint8(v.entry >> 8)
-
-			val2 := br[stream2].peekBitsFast(d.actualTableLog)
-			v2 := single[val2&tlMask]
-			br[stream2].advance(uint8(v2.entry))
-			buf[off+bufoff*stream2] = uint8(v2.entry >> 8)
-
-			val = br[stream].peekBitsFast(d.actualTableLog)
-			v = single[val&tlMask]
-			br[stream].advance(uint8(v.entry))
-			buf[off+bufoff*stream+1] = uint8(v.entry >> 8)
-
-			val2 = br[stream2].peekBitsFast(d.actualTableLog)
-			v2 = single[val2&tlMask]
-			br[stream2].advance(uint8(v2.entry))
-			buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8)
-		}
-
-		{
-			const stream = 2
-			const stream2 = 3
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			val := br[stream].peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask]
-			br[stream].advance(uint8(v.entry))
-			buf[off+bufoff*stream] = uint8(v.entry >> 8)
-
-			val2 := br[stream2].peekBitsFast(d.actualTableLog)
-			v2 := single[val2&tlMask]
-			br[stream2].advance(uint8(v2.entry))
-			buf[off+bufoff*stream2] = uint8(v2.entry >> 8)
-
-			val = br[stream].peekBitsFast(d.actualTableLog)
-			v = single[val&tlMask]
-			br[stream].advance(uint8(v.entry))
-			buf[off+bufoff*stream+1] = uint8(v.entry >> 8)
-
-			val2 = br[stream2].peekBitsFast(d.actualTableLog)
-			v2 = single[val2&tlMask]
-			br[stream2].advance(uint8(v2.entry))
-			buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8)
-		}
-
-		off += 2
-
-		if off == bufoff {
-			if bufoff > dstEvery {
-				return nil, errors.New("corruption detected: stream overrun 1")
-			}
-			copy(out, buf[:bufoff])
-			copy(out[dstEvery:], buf[bufoff:bufoff*2])
-			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
-			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
-			off = 0
-			out = out[bufoff:]
-			decoded += 256
-			// There must at least be 3 buffers left.
-			if len(out) < dstEvery*3 {
-				return nil, errors.New("corruption detected: stream overrun 2")
-			}
-		}
-	}
-	if off > 0 {
-		ioff := int(off)
-		if len(out) < dstEvery*3+ioff {
-			return nil, errors.New("corruption detected: stream overrun 3")
-		}
-		copy(out, buf[:off])
-		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
-		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
-		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
-		decoded += int(off) * 4
-		out = out[off:]
-	}
-
-	// Decode remaining.
-	for i := range br {
-		offset := dstEvery * i
-		br := &br[i]
-		bitsLeft := br.off*8 + uint(64-br.bitsRead)
-		for bitsLeft > 0 {
-			br.fill()
-			if false && br.bitsRead >= 32 {
-				if br.off >= 4 {
-					v := br.in[br.off-4:]
-					v = v[:4]
-					low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-					br.value = (br.value << 32) | uint64(low)
-					br.bitsRead -= 32
-					br.off -= 4
-				} else {
-					for br.off > 0 {
-						br.value = (br.value << 8) | uint64(br.in[br.off-1])
-						br.bitsRead -= 8
-						br.off--
-					}
-				}
-			}
-			// end inline...
-			if offset >= len(out) {
-				return nil, errors.New("corruption detected: stream overrun 4")
-			}
-
-			// Read value and increment offset.
-			val := br.peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask].entry
-			nBits := uint8(v)
-			br.advance(nBits)
-			bitsLeft -= uint(nBits)
-			out[offset] = uint8(v >> 8)
-			offset++
-		}
-		decoded += offset - dstEvery*i
-		err = br.close()
-		if err != nil {
-			return nil, err
-		}
-	}
-	if dstSize != decoded {
-		return nil, errors.New("corruption detected: short output block")
-	}
-	return dst, nil
-}
-
 // Decompress4X will decompress a 4X encoded stream.
 // The length of the supplied input must match the end of a block exactly.
 // The *capacity* of the dst slice must match the destination size of
@@ -914,18 +758,18 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
 	out := dst
 	dstEvery := (dstSize + 3) / 4
 
-	shift := (8 - d.actualTableLog) & 7
+	shift := (56 + (8 - d.actualTableLog)) & 63
 
 	const tlSize = 1 << 8
 	single := d.dt.single[:tlSize]
 
 	// Use temp table to avoid bound checks/append penalty.
-	var buf [256]byte
+	buf := d.buffer()
 	var off uint8
 	var decoded int
 
 	// Decode 4 values from each decoder/loop.
-	const bufoff = 256 / 4
+	const bufoff = 256
 	for {
 		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
 			break
@@ -935,96 +779,109 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
 			// Interleave 2 decodes.
 			const stream = 0
 			const stream2 = 1
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			v := single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 := single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+1] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+2] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+3] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
+			br1 := &br[stream]
+			br2 := &br[stream2]
+			br1.fillFast()
+			br2.fillFast()
+
+			v := single[uint8(br1.value>>shift)].entry
+			v2 := single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off] = uint8(v >> 8)
+			buf[stream2][off] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+1] = uint8(v >> 8)
+			buf[stream2][off+1] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+2] = uint8(v >> 8)
+			buf[stream2][off+2] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+3] = uint8(v >> 8)
+			buf[stream2][off+3] = uint8(v2 >> 8)
 		}
 
 		{
 			const stream = 2
 			const stream2 = 3
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			v := single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 := single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+1] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+2] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+3] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
+			br1 := &br[stream]
+			br2 := &br[stream2]
+			br1.fillFast()
+			br2.fillFast()
+
+			v := single[uint8(br1.value>>shift)].entry
+			v2 := single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off] = uint8(v >> 8)
+			buf[stream2][off] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+1] = uint8(v >> 8)
+			buf[stream2][off+1] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+2] = uint8(v >> 8)
+			buf[stream2][off+2] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+3] = uint8(v >> 8)
+			buf[stream2][off+3] = uint8(v2 >> 8)
 		}
 
 		off += 4
 
-		if off == bufoff {
+		if off == 0 {
 			if bufoff > dstEvery {
+				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 1")
 			}
-			copy(out, buf[:bufoff])
-			copy(out[dstEvery:], buf[bufoff:bufoff*2])
-			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
-			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
-			off = 0
+			copy(out, buf[0][:])
+			copy(out[dstEvery:], buf[1][:])
+			copy(out[dstEvery*2:], buf[2][:])
+			copy(out[dstEvery*3:], buf[3][:])
 			out = out[bufoff:]
-			decoded += 256
+			decoded += bufoff * 4
 			// There must at least be 3 buffers left.
 			if len(out) < dstEvery*3 {
+				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 2")
 			}
 		}
@@ -1032,23 +889,31 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
 	if off > 0 {
 		ioff := int(off)
 		if len(out) < dstEvery*3+ioff {
+			d.bufs.Put(buf)
 			return nil, errors.New("corruption detected: stream overrun 3")
 		}
-		copy(out, buf[:off])
-		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
-		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
-		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
+		copy(out, buf[0][:off])
+		copy(out[dstEvery:], buf[1][:off])
+		copy(out[dstEvery*2:], buf[2][:off])
+		copy(out[dstEvery*3:], buf[3][:off])
 		decoded += int(off) * 4
 		out = out[off:]
 	}
 
 	// Decode remaining.
+	// Each stream may emit at most remainBytes more bytes (see endsAt below).
+	remainBytes := dstEvery - (decoded / 4)
 	for i := range br {
 		offset := dstEvery * i
+		endsAt := offset + remainBytes
+		if endsAt > len(out) {
+			endsAt = len(out)
+		}
 		br := &br[i]
-		bitsLeft := int(br.off*8) + int(64-br.bitsRead)
+		bitsLeft := br.remaining()
 		for bitsLeft > 0 {
 			if br.finished() {
+				d.bufs.Put(buf)
 				return nil, io.ErrUnexpectedEOF
 			}
 			if br.bitsRead >= 56 {
@@ -1068,24 +933,31 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
 				}
 			}
 			// end inline...
-			if offset >= len(out) {
+			if offset >= endsAt {
+				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 4")
 			}
 
 			// Read value and increment offset.
-			v := single[br.peekByteFast()>>shift].entry
+			v := single[uint8(br.value>>shift)].entry
 			nBits := uint8(v)
 			br.advance(nBits)
-			bitsLeft -= int(nBits)
+			bitsLeft -= uint(nBits)
 			out[offset] = uint8(v >> 8)
 			offset++
 		}
+		if offset != endsAt {
+			d.bufs.Put(buf)
+			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+		}
 		decoded += offset - dstEvery*i
 		err = br.close()
 		if err != nil {
+			d.bufs.Put(buf)
 			return nil, err
 		}
 	}
+	d.bufs.Put(buf)
 	if dstSize != decoded {
 		return nil, errors.New("corruption detected: short output block")
 	}
@@ -1121,18 +993,18 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
 	out := dst
 	dstEvery := (dstSize + 3) / 4
 
-	const shift = 0
+	const shift = 56
 	const tlSize = 1 << 8
 	const tlMask = tlSize - 1
 	single := d.dt.single[:tlSize]
 
 	// Use temp table to avoid bound checks/append penalty.
-	var buf [256]byte
+	buf := d.buffer()
 	var off uint8
 	var decoded int
 
 	// Decode 4 values from each decoder/loop.
-	const bufoff = 256 / 4
+	const bufoff = 256
 	for {
 		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
 			break
@@ -1142,96 +1014,109 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
 			// Interleave 2 decodes.
 			const stream = 0
 			const stream2 = 1
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			v := single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 := single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+1] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+2] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+3] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
+			br1 := &br[stream]
+			br2 := &br[stream2]
+			br1.fillFast()
+			br2.fillFast()
+
+			v := single[uint8(br1.value>>shift)].entry
+			v2 := single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off] = uint8(v >> 8)
+			buf[stream2][off] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+1] = uint8(v >> 8)
+			buf[stream2][off+1] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+2] = uint8(v >> 8)
+			buf[stream2][off+2] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+3] = uint8(v >> 8)
+			buf[stream2][off+3] = uint8(v2 >> 8)
 		}
 
 		{
 			const stream = 2
 			const stream2 = 3
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			v := single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 := single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+1] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+2] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+3] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
+			br1 := &br[stream]
+			br2 := &br[stream2]
+			br1.fillFast()
+			br2.fillFast()
+
+			v := single[uint8(br1.value>>shift)].entry
+			v2 := single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off] = uint8(v >> 8)
+			buf[stream2][off] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+1] = uint8(v >> 8)
+			buf[stream2][off+1] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+2] = uint8(v >> 8)
+			buf[stream2][off+2] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+3] = uint8(v >> 8)
+			buf[stream2][off+3] = uint8(v2 >> 8)
 		}
 
 		off += 4
 
-		if off == bufoff {
+		if off == 0 {
 			if bufoff > dstEvery {
+				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 1")
 			}
-			copy(out, buf[:bufoff])
-			copy(out[dstEvery:], buf[bufoff:bufoff*2])
-			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
-			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
-			off = 0
+			copy(out, buf[0][:])
+			copy(out[dstEvery:], buf[1][:])
+			copy(out[dstEvery*2:], buf[2][:])
+			copy(out[dstEvery*3:], buf[3][:])
 			out = out[bufoff:]
-			decoded += 256
+			decoded += bufoff * 4
 			// There must at least be 3 buffers left.
 			if len(out) < dstEvery*3 {
+				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 2")
 			}
 		}
@@ -1241,21 +1126,27 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
 		if len(out) < dstEvery*3+ioff {
 			return nil, errors.New("corruption detected: stream overrun 3")
 		}
-		copy(out, buf[:off])
-		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
-		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
-		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
+		copy(out, buf[0][:off])
+		copy(out[dstEvery:], buf[1][:off])
+		copy(out[dstEvery*2:], buf[2][:off])
+		copy(out[dstEvery*3:], buf[3][:off])
 		decoded += int(off) * 4
 		out = out[off:]
 	}
 
 	// Decode remaining.
+	remainBytes := dstEvery - (decoded / 4)
 	for i := range br {
 		offset := dstEvery * i
+		endsAt := offset + remainBytes
+		if endsAt > len(out) {
+			endsAt = len(out)
+		}
 		br := &br[i]
-		bitsLeft := int(br.off*8) + int(64-br.bitsRead)
+		bitsLeft := br.remaining()
 		for bitsLeft > 0 {
 			if br.finished() {
+				d.bufs.Put(buf)
 				return nil, io.ErrUnexpectedEOF
 			}
 			if br.bitsRead >= 56 {
@@ -1275,24 +1166,32 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
 				}
 			}
 			// end inline...
-			if offset >= len(out) {
+			if offset >= endsAt {
+				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 4")
 			}
 
 			// Read value and increment offset.
-			v := single[br.peekByteFast()>>shift].entry
+			v := single[br.peekByteFast()].entry
 			nBits := uint8(v)
 			br.advance(nBits)
-			bitsLeft -= int(nBits)
+			bitsLeft -= uint(nBits)
 			out[offset] = uint8(v >> 8)
 			offset++
 		}
+		if offset != endsAt {
+			d.bufs.Put(buf)
+			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+		}
+
 		decoded += offset - dstEvery*i
 		err = br.close()
 		if err != nil {
+			d.bufs.Put(buf)
 			return nil, err
 		}
 	}
+	d.bufs.Put(buf)
 	if dstSize != decoded {
 		return nil, errors.New("corruption detected: short output block")
 	}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s
new file mode 100644
index 00000000000..0d6cb1a962b
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s
@@ -0,0 +1,488 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+#include "funcdata.h"
+#include "go_asm.h"
+
+#define bufoff      256 // see decompress.go, we're using [4][256]byte table
+
+// func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+//	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
+#define off             R8
+#define buffer          DI
+#define table           SI
+
+#define br_bits_read    R9
+#define br_value        R10
+#define br_offset       R11
+#define peek_bits       R12
+#define exhausted       DX
+
+#define br0             R13
+#define br1             R14
+#define br2             R15
+#define br3             BP
+
+	MOVQ BP, 0(SP)
+
+	XORQ exhausted, exhausted // exhausted = false
+	XORQ off, off             // off = 0
+
+	MOVBQZX peekBits+32(FP), peek_bits
+	MOVQ    buf+40(FP), buffer
+	MOVQ    tbl+48(FP), table
+
+	MOVQ pbr0+0(FP), br0
+	MOVQ pbr1+8(FP), br1
+	MOVQ pbr2+16(FP), br2
+	MOVQ pbr3+24(FP), br3
+
+main_loop:
+
+	// const stream = 0
+	// br0.fillFast()
+	MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
+	MOVQ    bitReaderShifted_value(br0), br_value
+	MOVQ    bitReaderShifted_off(br0), br_offset
+
+	// if b.bitsRead >= 32 {
+	CMPQ br_bits_read, $32
+	JB   skip_fill0
+
+	SUBQ $32, br_bits_read // b.bitsRead -= 32
+	SUBQ $4, br_offset     // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	MOVQ bitReaderShifted_in(br0), AX
+	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVQ br_bits_read, CX
+	SHLQ CL, AX
+	ORQ  AX, br_value
+
+	// exhausted = exhausted || (br0.off < 4)
+	CMPQ  br_offset, $4
+	SETLT DL
+	ORB   DL, DH
+
+	// }
+skip_fill0:
+
+	// val0 := br0.peekTopBits(peekBits)
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v0 := table[val0&mask]
+	MOVW 0(table)(AX*2), AX // AX - v0
+
+	// br0.advance(uint8(v0.entry))
+	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// val1 := br0.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v1 := table[val1&mask]
+	MOVW 0(table)(AX*2), AX // AX - v1
+
+	// br0.advance(uint8(v1.entry))
+	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CX, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off] = uint8(v0.entry >> 8)
+	// buf[stream][off+1] = uint8(v1.entry >> 8)
+	MOVW BX, 0(buffer)(off*1)
+
+	// SECOND PART:
+	// val2 := br0.peekTopBits(peekBits)
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v2 := table[val2&mask]
+	MOVW 0(table)(AX*2), AX // AX - v2
+
+	// br0.advance(uint8(v2.entry))
+	MOVB    AH, BL           // BL = uint8(v2.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// val3 := br0.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v3 := table[val3&mask]
+	MOVW 0(table)(AX*2), AX // AX - v3
+
+	// br0.advance(uint8(v3.entry))
+	MOVB    AH, BH           // BH = uint8(v3.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CX, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off+2] = uint8(v2.entry >> 8)
+	// buf[stream][off+3] = uint8(v3.entry >> 8)
+	MOVW BX, 0+2(buffer)(off*1)
+
+	// update the bit reader structure
+	MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
+	MOVQ br_value, bitReaderShifted_value(br0)
+	MOVQ br_offset, bitReaderShifted_off(br0)
+
+	// const stream = 1
+	// br1.fillFast()
+	MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
+	MOVQ    bitReaderShifted_value(br1), br_value
+	MOVQ    bitReaderShifted_off(br1), br_offset
+
+	// if b.bitsRead >= 32 {
+	CMPQ br_bits_read, $32
+	JB   skip_fill1
+
+	SUBQ $32, br_bits_read // b.bitsRead -= 32
+	SUBQ $4, br_offset     // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	MOVQ bitReaderShifted_in(br1), AX
+	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVQ br_bits_read, CX
+	SHLQ CL, AX
+	ORQ  AX, br_value
+
+	// exhausted = exhausted || (br1.off < 4)
+	CMPQ  br_offset, $4
+	SETLT DL
+	ORB   DL, DH
+
+	// }
+skip_fill1:
+
+	// val0 := br1.peekTopBits(peekBits)
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v0 := table[val0&mask]
+	MOVW 0(table)(AX*2), AX // AX - v0
+
+	// br1.advance(uint8(v0.entry))
+	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// val1 := br1.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v1 := table[val1&mask]
+	MOVW 0(table)(AX*2), AX // AX - v1
+
+	// br1.advance(uint8(v1.entry))
+	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CX, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off] = uint8(v0.entry >> 8)
+	// buf[stream][off+1] = uint8(v1.entry >> 8)
+	MOVW BX, 256(buffer)(off*1)
+
+	// SECOND PART:
+	// val2 := br1.peekTopBits(peekBits)
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v2 := table[val2&mask]
+	MOVW 0(table)(AX*2), AX // AX - v2
+
+	// br1.advance(uint8(v2.entry))
+	MOVB    AH, BL           // BL = uint8(v2.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// val3 := br1.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v3 := table[val3&mask]
+	MOVW 0(table)(AX*2), AX // AX - v3
+
+	// br1.advance(uint8(v3.entry))
+	MOVB    AH, BH           // BH = uint8(v3.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CX, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off+2] = uint8(v2.entry >> 8)
+	// buf[stream][off+3] = uint8(v3.entry >> 8)
+	MOVW BX, 256+2(buffer)(off*1)
+
+	// update the bit reader structure
+	MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
+	MOVQ br_value, bitReaderShifted_value(br1)
+	MOVQ br_offset, bitReaderShifted_off(br1)
+
+	// const stream = 2
+	// br2.fillFast()
+	MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
+	MOVQ    bitReaderShifted_value(br2), br_value
+	MOVQ    bitReaderShifted_off(br2), br_offset
+
+	// if b.bitsRead >= 32 {
+	CMPQ br_bits_read, $32
+	JB   skip_fill2
+
+	SUBQ $32, br_bits_read // b.bitsRead -= 32
+	SUBQ $4, br_offset     // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	MOVQ bitReaderShifted_in(br2), AX
+	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVQ br_bits_read, CX
+	SHLQ CL, AX
+	ORQ  AX, br_value
+
+	// exhausted = exhausted || (br2.off < 4)
+	CMPQ  br_offset, $4
+	SETLT DL
+	ORB   DL, DH
+
+	// }
+skip_fill2:
+
+	// val0 := br2.peekTopBits(peekBits)
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v0 := table[val0&mask]
+	MOVW 0(table)(AX*2), AX // AX - v0
+
+	// br2.advance(uint8(v0.entry))
+	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// val1 := br2.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v1 := table[val1&mask]
+	MOVW 0(table)(AX*2), AX // AX - v1
+
+	// br2.advance(uint8(v1.entry))
+	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CX, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off] = uint8(v0.entry >> 8)
+	// buf[stream][off+1] = uint8(v1.entry >> 8)
+	MOVW BX, 512(buffer)(off*1)
+
+	// SECOND PART:
+	// val2 := br2.peekTopBits(peekBits)
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v2 := table[val2&mask]
+	MOVW 0(table)(AX*2), AX // AX - v2
+
+	// br2.advance(uint8(v2.entry))
+	MOVB    AH, BL           // BL = uint8(v2.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// val3 := br2.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v3 := table[val3&mask]
+	MOVW 0(table)(AX*2), AX // AX - v3
+
+	// br2.advance(uint8(v3.entry))
+	MOVB    AH, BH           // BH = uint8(v3.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CX, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off+2] = uint8(v2.entry >> 8)
+	// buf[stream][off+3] = uint8(v3.entry >> 8)
+	MOVW BX, 512+2(buffer)(off*1)
+
+	// update the bit reader structure
+	MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
+	MOVQ br_value, bitReaderShifted_value(br2)
+	MOVQ br_offset, bitReaderShifted_off(br2)
+
+	// const stream = 3
+	// br3.fillFast()
+	MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
+	MOVQ    bitReaderShifted_value(br3), br_value
+	MOVQ    bitReaderShifted_off(br3), br_offset
+
+	// if b.bitsRead >= 32 {
+	CMPQ br_bits_read, $32
+	JB   skip_fill3
+
+	SUBQ $32, br_bits_read // b.bitsRead -= 32
+	SUBQ $4, br_offset     // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	MOVQ bitReaderShifted_in(br3), AX
+	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVQ br_bits_read, CX
+	SHLQ CL, AX
+	ORQ  AX, br_value
+
+	// exhausted = exhausted || (br3.off < 4)
+	CMPQ  br_offset, $4
+	SETLT DL
+	ORB   DL, DH
+
+	// }
+skip_fill3:
+
+	// val0 := br3.peekTopBits(peekBits)
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v0 := table[val0&mask]
+	MOVW 0(table)(AX*2), AX // AX - v0
+
+	// br3.advance(uint8(v0.entry))
+	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// val1 := br3.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v1 := table[val1&mask]
+	MOVW 0(table)(AX*2), AX // AX - v1
+
+	// br3.advance(uint8(v1.entry))
+	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CX, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off] = uint8(v0.entry >> 8)
+	// buf[stream][off+1] = uint8(v1.entry >> 8)
+	MOVW BX, 768(buffer)(off*1)
+
+	// SECOND PART:
+	// val2 := br3.peekTopBits(peekBits)
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v2 := table[val2&mask]
+	MOVW 0(table)(AX*2), AX // AX - v2
+
+	// br3.advance(uint8(v2.entry))
+	MOVB    AH, BL           // BL = uint8(v2.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// val3 := br3.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+	// v3 := table[val3&mask]
+	MOVW 0(table)(AX*2), AX // AX - v3
+
+	// br3.advance(uint8(v3.entry))
+	MOVB    AH, BH           // BH = uint8(v3.entry >> 8)
+	MOVBQZX AL, CX
+	SHLQ    CX, br_value     // value <<= n
+	ADDQ    CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off+2] = uint8(v2.entry >> 8)
+	// buf[stream][off+3] = uint8(v3.entry >> 8)
+	MOVW BX, 768+2(buffer)(off*1)
+
+	// update the bit reader structure
+	MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
+	MOVQ br_value, bitReaderShifted_value(br3)
+	MOVQ br_offset, bitReaderShifted_off(br3)
+
+	ADDQ $4, off // off += 4
+
+	TESTB DH, DH // any br[i].ofs < 4?
+	JNZ   end
+
+	CMPQ off, $bufoff
+	JL   main_loop
+
+end:
+	MOVQ 0(SP), BP
+
+	MOVB off, ret+56(FP)
+	RET
+
+#undef off
+#undef buffer
+#undef table
+
+#undef br_bits_read
+#undef br_value
+#undef br_offset
+#undef peek_bits
+#undef exhausted
+
+#undef br0
+#undef br1
+#undef br2
+#undef br3
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in b/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in
new file mode 100644
index 00000000000..6d477a2c11e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in
@@ -0,0 +1,197 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+#include "funcdata.h"
+#include "go_asm.h"
+
+
+#define bufoff      256     // see decompress.go, we're using [4][256]byte table
+
+//func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+//	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
+#define off             R8
+#define buffer          DI
+#define table           SI
+
+#define br_bits_read    R9
+#define br_value        R10
+#define br_offset       R11
+#define peek_bits       R12
+#define exhausted       DX
+
+#define br0             R13
+#define br1             R14
+#define br2             R15
+#define br3             BP
+
+    MOVQ    BP, 0(SP)
+
+    XORQ    exhausted, exhausted    // exhausted = false
+    XORQ    off, off                // off = 0
+
+    MOVBQZX peekBits+32(FP), peek_bits
+    MOVQ    buf+40(FP), buffer
+    MOVQ    tbl+48(FP), table
+
+    MOVQ    pbr0+0(FP), br0
+    MOVQ    pbr1+8(FP), br1
+    MOVQ    pbr2+16(FP), br2
+    MOVQ    pbr3+24(FP), br3
+
+main_loop:
+{{ define "decode_2_values_x86" }}
+    // const stream = {{ var "id" }}
+    // br{{ var "id"}}.fillFast()
+    MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
+    MOVQ    bitReaderShifted_value(br{{ var "id" }}), br_value
+    MOVQ    bitReaderShifted_off(br{{ var "id" }}), br_offset
+
+	// if b.bitsRead >= 32 {
+    CMPQ    br_bits_read, $32
+    JB      skip_fill{{ var "id" }}
+
+    SUBQ    $32, br_bits_read       // b.bitsRead -= 32
+    SUBQ    $4, br_offset           // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+    MOVQ    bitReaderShifted_in(br{{ var "id" }}), AX
+    MOVL    0(br_offset)(AX*1), AX  // AX = uint32(b.in[b.off:b.off+4])
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+    MOVQ    br_bits_read, CX
+    SHLQ    CL, AX
+    ORQ     AX, br_value
+
+    // exhausted = exhausted || (br{{ var "id"}}.off < 4)
+    CMPQ    br_offset, $4
+    SETLT   DL
+    ORB     DL, DH
+    // }
+skip_fill{{ var "id" }}:
+
+    // val0 := br{{ var "id"}}.peekTopBits(peekBits)
+    MOVQ    br_value, AX
+    MOVQ    peek_bits, CX
+    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
+
+    // v0 := table[val0&mask]
+    MOVW    0(table)(AX*2), AX      // AX - v0
+
+    // br{{ var "id"}}.advance(uint8(v0.entry))
+    MOVB    AH, BL                  // BL = uint8(v0.entry >> 8)
+    MOVBQZX AL, CX
+    SHLQ    CL, br_value            // value <<= n
+    ADDQ    CX, br_bits_read        // bits_read += n
+
+    // val1 := br{{ var "id"}}.peekTopBits(peekBits)
+    MOVQ    peek_bits, CX
+    MOVQ    br_value, AX
+    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
+
+    // v1 := table[val1&mask]
+    MOVW    0(table)(AX*2), AX      // AX - v1
+
+    // br{{ var "id"}}.advance(uint8(v1.entry))
+    MOVB    AH, BH                  // BH = uint8(v1.entry >> 8)
+    MOVBQZX AL, CX
+    SHLQ    CX, br_value            // value <<= n
+    ADDQ    CX, br_bits_read        // bits_read += n
+
+
+    // these two writes get coalesced
+    // buf[stream][off] = uint8(v0.entry >> 8)
+    // buf[stream][off+1] = uint8(v1.entry >> 8)
+    MOVW    BX, {{ var "bufofs" }}(buffer)(off*1)
+
+    // SECOND PART:
+    // val2 := br{{ var "id"}}.peekTopBits(peekBits)
+    MOVQ    br_value, AX
+    MOVQ    peek_bits, CX
+    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
+
+    // v2 := table[val2&mask]
+    MOVW    0(table)(AX*2), AX      // AX - v2
+
+    // br{{ var "id"}}.advance(uint8(v2.entry))
+    MOVB    AH, BL                  // BL = uint8(v2.entry >> 8)
+    MOVBQZX AL, CX
+    SHLQ    CL, br_value            // value <<= n
+    ADDQ    CX, br_bits_read        // bits_read += n
+
+    // val3 := br{{ var "id"}}.peekTopBits(peekBits)
+    MOVQ    peek_bits, CX
+    MOVQ    br_value, AX
+    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
+
+    // v3 := table[val3&mask]
+    MOVW    0(table)(AX*2), AX      // AX - v3
+
+    // br{{ var "id"}}.advance(uint8(v3.entry))
+    MOVB    AH, BH                  // BH = uint8(v3.entry >> 8)
+    MOVBQZX AL, CX
+    SHLQ    CX, br_value            // value <<= n
+    ADDQ    CX, br_bits_read        // bits_read += n
+
+
+    // these two writes get coalesced
+    // buf[stream][off+2] = uint8(v2.entry >> 8)
+    // buf[stream][off+3] = uint8(v3.entry >> 8)
+    MOVW    BX, {{ var "bufofs" }}+2(buffer)(off*1)
+
+    // update the bit reader structure
+    MOVB    br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
+    MOVQ    br_value, bitReaderShifted_value(br{{ var "id" }})
+    MOVQ    br_offset, bitReaderShifted_off(br{{ var "id" }})
+{{ end }}
+
+    {{ set "id" "0" }}
+    {{ set "ofs" "0" }}
+    {{ set "bufofs" "0" }} {{/* id * bufoff */}}
+    {{ template "decode_2_values_x86" . }}
+
+    {{ set "id" "1" }}
+    {{ set "ofs" "8" }}
+    {{ set "bufofs" "256" }}
+    {{ template "decode_2_values_x86" . }}
+
+    {{ set "id" "2" }}
+    {{ set "ofs" "16" }}
+    {{ set "bufofs" "512" }}
+    {{ template "decode_2_values_x86" . }}
+
+    {{ set "id" "3" }}
+    {{ set "ofs" "24" }}
+    {{ set "bufofs" "768" }}
+    {{ template "decode_2_values_x86" . }}
+
+    ADDQ    $4, off     // off += 4
+
+    TESTB   DH, DH      // any br[i].ofs < 4?
+    JNZ     end
+
+    CMPQ    off, $bufoff
+    JL      main_loop
+end:
+    MOVQ    0(SP), BP
+
+    MOVB    off, ret+56(FP)
+    RET
+#undef  off
+#undef  buffer
+#undef  table
+
+#undef  br_bits_read
+#undef  br_value
+#undef  br_offset
+#undef  peek_bits
+#undef  exhausted
+
+#undef  br0
+#undef  br1
+#undef  br2
+#undef  br3
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
new file mode 100644
index 00000000000..d47f6644f3f
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
@@ -0,0 +1,181 @@
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+// This file contains the specialisation of Decoder.Decompress4X
+// that uses an asm implementation of its main loop.
+package huff0
+
+import (
+	"errors"
+	"fmt"
+)
+
+// decompress4x_main_loop_x86 is the amd64 assembler implementation of the
+// Decompress4X main loop when tablelog > 8; it returns the output offset as uint8.
+//go:noescape
+func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+
+// decompress4x_8b_loop_x86 is the amd64 assembler implementation of the
+// Decompress4X main loop when tablelog <= 8, decoding 4 entries per stream
+// per loop iteration; it returns the output offset as uint8.
+//go:noescape
+func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+
+// fallback8BitSize is the dst capacity below which 8-bit tables use the Go version (decompress4X8bit), as it is faster.
+const fallback8BitSize = 800
+
+// Decompress4X will decompress a 4X encoded stream using the amd64 assembly main loop.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly. On failure it returns a nil slice and an error.
+func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
+	if len(d.dt.single) == 0 {
+		return nil, errors.New("no table loaded")
+	}
+	if len(src) < 6+(4*1) {
+		return nil, errors.New("input too small")
+	}
+
+	use8BitTables := d.actualTableLog <= 8
+	if cap(dst) < fallback8BitSize && use8BitTables {
+		return d.decompress4X8bit(dst, src)
+	}
+	var br [4]bitReaderShifted
+	// Decode "jump table": three little-endian 16-bit lengths for streams 0-2; stream 3 takes the rest.
+	start := 6
+	for i := 0; i < 3; i++ {
+		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+		if start+length >= len(src) {
+			return nil, errors.New("truncated input (or invalid offset)")
+		}
+		err := br[i].init(src[start : start+length])
+		if err != nil {
+			return nil, err
+		}
+		start += length
+	}
+	err := br[3].init(src[start:])
+	if err != nil {
+		return nil, err
+	}
+
+	// destination, offset to match first output
+	dstSize := cap(dst)
+	dst = dst[:dstSize]
+	out := dst
+	dstEvery := (dstSize + 3) / 4
+
+	const tlSize = 1 << tableLogMax
+	const tlMask = tlSize - 1
+	single := d.dt.single[:tlSize]
+
+	// Use temp table to avoid bound checks/append penalty.
+	buf := d.buffer()
+	var off uint8
+	var decoded int
+
+	const debug = false
+
+	// see: bitReaderShifted.peekBitsFast()
+	peekBits := uint8((64 - d.actualTableLog) & 63)
+
+	// Each asm call fills buf until it is full (returned off wraps to 0) or a stream runs low on input.
+	const bufoff = 256
+	for {
+		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
+			break
+		}
+
+		if use8BitTables {
+			off = decompress4x_8b_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
+		} else {
+			off = decompress4x_main_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
+		}
+		if debug {
+			fmt.Print("DEBUG: ")
+			fmt.Printf("off=%d,", off)
+			for i := 0; i < 4; i++ {
+				fmt.Printf(" br[%d]={bitsRead=%d, value=%x, off=%d}",
+					i, br[i].bitsRead, br[i].value, br[i].off)
+			}
+			fmt.Println("")
+		}
+
+		if off != 0 {
+			break
+		}
+
+		if bufoff > dstEvery {
+			d.bufs.Put(buf)
+			return nil, errors.New("corruption detected: stream overrun 1")
+		}
+		copy(out, buf[0][:])
+		copy(out[dstEvery:], buf[1][:])
+		copy(out[dstEvery*2:], buf[2][:])
+		copy(out[dstEvery*3:], buf[3][:])
+		out = out[bufoff:]
+		decoded += bufoff * 4
+		// There must at least be 3 buffers left.
+		if len(out) < dstEvery*3 {
+			d.bufs.Put(buf)
+			return nil, errors.New("corruption detected: stream overrun 2")
+		}
+	}
+	if off > 0 {
+		ioff := int(off)
+		if len(out) < dstEvery*3+ioff {
+			d.bufs.Put(buf)
+			return nil, errors.New("corruption detected: stream overrun 3")
+		}
+		copy(out, buf[0][:off])
+		copy(out[dstEvery:], buf[1][:off])
+		copy(out[dstEvery*2:], buf[2][:off])
+		copy(out[dstEvery*3:], buf[3][:off])
+		decoded += int(off) * 4
+		out = out[off:]
+	}
+
+	// Decode remaining bytes of each stream one symbol at a time.
+	remainBytes := dstEvery - (decoded / 4)
+	for i := range br {
+		offset := dstEvery * i
+		endsAt := offset + remainBytes
+		if endsAt > len(out) {
+			endsAt = len(out)
+		}
+		br := &br[i]
+		bitsLeft := br.remaining()
+		for bitsLeft > 0 {
+			br.fill()
+			if offset >= endsAt {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 4")
+			}
+
+			// Read value and increment offset.
+			val := br.peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask].entry
+			nBits := uint8(v)
+			br.advance(nBits)
+			bitsLeft -= uint(nBits)
+			out[offset] = uint8(v >> 8)
+			offset++
+		}
+		if offset != endsAt {
+			d.bufs.Put(buf)
+			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+		}
+		decoded += offset - dstEvery*i
+		if err = br.close(); err != nil {
+			d.bufs.Put(buf) // release the scratch buffer here too, like every other error path
+			return nil, err
+		}
+	}
+	d.bufs.Put(buf)
+	if dstSize != decoded {
+		return nil, errors.New("corruption detected: short output block")
+	}
+	return dst, nil
+}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
new file mode 100644
index 00000000000..2edad3ea5a4
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
@@ -0,0 +1,506 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+#include "funcdata.h"
+#include "go_asm.h"
+
+#ifdef GOAMD64_v4
+#ifndef GOAMD64_v3
+#define GOAMD64_v3
+#endif
+#endif
+
+#define bufoff      256 // see decompress.go, we're using [4][256]byte table
+
+// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+//	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
+#define off             R8
+#define buffer          DI
+#define table           SI
+
+#define br_bits_read    R9
+#define br_value        R10
+#define br_offset       R11
+#define peek_bits       R12
+#define exhausted       DX
+
+#define br0             R13
+#define br1             R14
+#define br2             R15
+#define br3             BP
+
+	MOVQ BP, 0(SP)
+
+	XORQ exhausted, exhausted // exhausted = false
+	XORQ off, off             // off = 0
+
+	MOVBQZX peekBits+32(FP), peek_bits
+	MOVQ    buf+40(FP), buffer
+	MOVQ    tbl+48(FP), table
+
+	MOVQ pbr0+0(FP), br0
+	MOVQ pbr1+8(FP), br1
+	MOVQ pbr2+16(FP), br2
+	MOVQ pbr3+24(FP), br3
+
+main_loop:
+
+	// const stream = 0
+	// br0.fillFast()
+	MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
+	MOVQ    bitReaderShifted_value(br0), br_value
+	MOVQ    bitReaderShifted_off(br0), br_offset
+
+	// We must have at least 2 * max tablelog left
+	CMPQ br_bits_read, $64-22
+	JBE  skip_fill0
+
+	SUBQ $32, br_bits_read // b.bitsRead -= 32
+	SUBQ $4, br_offset     // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	MOVQ bitReaderShifted_in(br0), AX
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+#ifdef GOAMD64_v3
+	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
+
+#else
+	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
+	MOVQ br_bits_read, CX
+	SHLQ CL, AX
+
+#endif
+
+	ORQ AX, br_value
+
+	// exhausted = exhausted || (br0.off < 4)
+	CMPQ  br_offset, $4
+	SETLT DL
+	ORB   DL, DH
+
+	// }
+skip_fill0:
+
+	// val0 := br0.peekTopBits(peekBits)
+#ifdef GOAMD64_v3
+	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+#endif
+
+	// v0 := table[val0&mask]
+	MOVW 0(table)(AX*2), AX // AX - v0
+
+	// br0.advance(uint8(v0.entry))
+	MOVB AH, BL // BL = uint8(v0.entry >> 8)
+
+#ifdef GOAMD64_v3
+	MOVBQZX AL, CX
+	SHLXQ   AX, br_value, br_value // value <<= n
+
+#else
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value // value <<= n
+
+#endif
+
+	ADDQ CX, br_bits_read // bits_read += n
+
+#ifdef GOAMD64_v3
+	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+	// val1 := br0.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+#endif
+
+	// v1 := table[val1&mask]
+	MOVW 0(table)(AX*2), AX // AX - v1
+
+	// br0.advance(uint8(v1.entry))
+	MOVB AH, BH // BH = uint8(v1.entry >> 8)
+
+#ifdef GOAMD64_v3
+	MOVBQZX AL, CX
+	SHLXQ   AX, br_value, br_value // value <<= n
+
+#else
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value // value <<= n
+
+#endif
+
+	ADDQ CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off] = uint8(v0.entry >> 8)
+	// buf[stream][off+1] = uint8(v1.entry >> 8)
+	MOVW BX, 0(buffer)(off*1)
+
+	// update the bitrader reader structure
+	MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
+	MOVQ br_value, bitReaderShifted_value(br0)
+	MOVQ br_offset, bitReaderShifted_off(br0)
+
+	// const stream = 1
+	// br1.fillFast()
+	MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
+	MOVQ    bitReaderShifted_value(br1), br_value
+	MOVQ    bitReaderShifted_off(br1), br_offset
+
+	// We must have at least 2 * max tablelog left
+	CMPQ br_bits_read, $64-22
+	JBE  skip_fill1
+
+	SUBQ $32, br_bits_read // b.bitsRead -= 32
+	SUBQ $4, br_offset     // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	MOVQ bitReaderShifted_in(br1), AX
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+#ifdef GOAMD64_v3
+	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
+
+#else
+	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
+	MOVQ br_bits_read, CX
+	SHLQ CL, AX
+
+#endif
+
+	ORQ AX, br_value
+
+	// exhausted = exhausted || (br1.off < 4)
+	CMPQ  br_offset, $4
+	SETLT DL
+	ORB   DL, DH
+
+	// }
+skip_fill1:
+
+	// val0 := br1.peekTopBits(peekBits)
+#ifdef GOAMD64_v3
+	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+#endif
+
+	// v0 := table[val0&mask]
+	MOVW 0(table)(AX*2), AX // AX - v0
+
+	// br1.advance(uint8(v0.entry))
+	MOVB AH, BL // BL = uint8(v0.entry >> 8)
+
+#ifdef GOAMD64_v3
+	MOVBQZX AL, CX
+	SHLXQ   AX, br_value, br_value // value <<= n
+
+#else
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value // value <<= n
+
+#endif
+
+	ADDQ CX, br_bits_read // bits_read += n
+
+#ifdef GOAMD64_v3
+	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+	// val1 := br1.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+#endif
+
+	// v1 := table[val1&mask]
+	MOVW 0(table)(AX*2), AX // AX - v1
+
+	// br1.advance(uint8(v1.entry))
+	MOVB AH, BH // BH = uint8(v1.entry >> 8)
+
+#ifdef GOAMD64_v3
+	MOVBQZX AL, CX
+	SHLXQ   AX, br_value, br_value // value <<= n
+
+#else
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value // value <<= n
+
+#endif
+
+	ADDQ CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off] = uint8(v0.entry >> 8)
+	// buf[stream][off+1] = uint8(v1.entry >> 8)
+	MOVW BX, 256(buffer)(off*1)
+
+	// update the bitrader reader structure
+	MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
+	MOVQ br_value, bitReaderShifted_value(br1)
+	MOVQ br_offset, bitReaderShifted_off(br1)
+
+	// const stream = 2
+	// br2.fillFast()
+	MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
+	MOVQ    bitReaderShifted_value(br2), br_value
+	MOVQ    bitReaderShifted_off(br2), br_offset
+
+	// We must have at least 2 * max tablelog left
+	CMPQ br_bits_read, $64-22
+	JBE  skip_fill2
+
+	SUBQ $32, br_bits_read // b.bitsRead -= 32
+	SUBQ $4, br_offset     // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	MOVQ bitReaderShifted_in(br2), AX
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+#ifdef GOAMD64_v3
+	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
+
+#else
+	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
+	MOVQ br_bits_read, CX
+	SHLQ CL, AX
+
+#endif
+
+	ORQ AX, br_value
+
+	// exhausted = exhausted || (br2.off < 4)
+	CMPQ  br_offset, $4
+	SETLT DL
+	ORB   DL, DH
+
+	// }
+skip_fill2:
+
+	// val0 := br2.peekTopBits(peekBits)
+#ifdef GOAMD64_v3
+	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+#endif
+
+	// v0 := table[val0&mask]
+	MOVW 0(table)(AX*2), AX // AX - v0
+
+	// br2.advance(uint8(v0.entry))
+	MOVB AH, BL // BL = uint8(v0.entry >> 8)
+
+#ifdef GOAMD64_v3
+	MOVBQZX AL, CX
+	SHLXQ   AX, br_value, br_value // value <<= n
+
+#else
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value // value <<= n
+
+#endif
+
+	ADDQ CX, br_bits_read // bits_read += n
+
+#ifdef GOAMD64_v3
+	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+	// val1 := br2.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+#endif
+
+	// v1 := table[val1&mask]
+	MOVW 0(table)(AX*2), AX // AX - v1
+
+	// br2.advance(uint8(v1.entry))
+	MOVB AH, BH // BH = uint8(v1.entry >> 8)
+
+#ifdef GOAMD64_v3
+	MOVBQZX AL, CX
+	SHLXQ   AX, br_value, br_value // value <<= n
+
+#else
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value // value <<= n
+
+#endif
+
+	ADDQ CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off] = uint8(v0.entry >> 8)
+	// buf[stream][off+1] = uint8(v1.entry >> 8)
+	MOVW BX, 512(buffer)(off*1)
+
+	// update the bitrader reader structure
+	MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
+	MOVQ br_value, bitReaderShifted_value(br2)
+	MOVQ br_offset, bitReaderShifted_off(br2)
+
+	// const stream = 3
+	// br3.fillFast()
+	MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
+	MOVQ    bitReaderShifted_value(br3), br_value
+	MOVQ    bitReaderShifted_off(br3), br_offset
+
+	// We must have at least 2 * max tablelog left
+	CMPQ br_bits_read, $64-22
+	JBE  skip_fill3
+
+	SUBQ $32, br_bits_read // b.bitsRead -= 32
+	SUBQ $4, br_offset     // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	MOVQ bitReaderShifted_in(br3), AX
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+#ifdef GOAMD64_v3
+	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
+
+#else
+	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
+	MOVQ br_bits_read, CX
+	SHLQ CL, AX
+
+#endif
+
+	ORQ AX, br_value
+
+	// exhausted = exhausted || (br3.off < 4)
+	CMPQ  br_offset, $4
+	SETLT DL
+	ORB   DL, DH
+
+	// }
+skip_fill3:
+
+	// val0 := br3.peekTopBits(peekBits)
+#ifdef GOAMD64_v3
+	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+	MOVQ br_value, AX
+	MOVQ peek_bits, CX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+#endif
+
+	// v0 := table[val0&mask]
+	MOVW 0(table)(AX*2), AX // AX - v0
+
+	// br3.advance(uint8(v0.entry))
+	MOVB AH, BL // BL = uint8(v0.entry >> 8)
+
+#ifdef GOAMD64_v3
+	MOVBQZX AL, CX
+	SHLXQ   AX, br_value, br_value // value <<= n
+
+#else
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value // value <<= n
+
+#endif
+
+	ADDQ CX, br_bits_read // bits_read += n
+
+#ifdef GOAMD64_v3
+	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+
+#else
+	// val1 := br3.peekTopBits(peekBits)
+	MOVQ peek_bits, CX
+	MOVQ br_value, AX
+	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+
+#endif
+
+	// v1 := table[val1&mask]
+	MOVW 0(table)(AX*2), AX // AX - v1
+
+	// br3.advance(uint8(v1.entry))
+	MOVB AH, BH // BH = uint8(v1.entry >> 8)
+
+#ifdef GOAMD64_v3
+	MOVBQZX AL, CX
+	SHLXQ   AX, br_value, br_value // value <<= n
+
+#else
+	MOVBQZX AL, CX
+	SHLQ    CL, br_value // value <<= n
+
+#endif
+
+	ADDQ CX, br_bits_read // bits_read += n
+
+	// these two writes get coalesced
+	// buf[stream][off] = uint8(v0.entry >> 8)
+	// buf[stream][off+1] = uint8(v1.entry >> 8)
+	MOVW BX, 768(buffer)(off*1)
+
+	// update the bitrader reader structure
+	MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
+	MOVQ br_value, bitReaderShifted_value(br3)
+	MOVQ br_offset, bitReaderShifted_off(br3)
+
+	ADDQ $2, off // off += 2
+
+	TESTB DH, DH // any br[i].ofs < 4?
+	JNZ   end
+
+	CMPQ off, $bufoff
+	JL   main_loop
+
+end:
+	MOVQ 0(SP), BP
+
+	MOVB off, ret+56(FP)
+	RET
+
+#undef off
+#undef buffer
+#undef table
+
+#undef br_bits_read
+#undef br_value
+#undef br_offset
+#undef peek_bits
+#undef exhausted
+
+#undef br0
+#undef br1
+#undef br2
+#undef br3
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in
new file mode 100644
index 00000000000..330d86ae155
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in
@@ -0,0 +1,195 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+#include "funcdata.h"
+#include "go_asm.h"
+
+#ifdef GOAMD64_v4
+#ifndef GOAMD64_v3
+#define GOAMD64_v3
+#endif
+#endif
+
+#define bufoff      256     // see decompress.go, we're using [4][256]byte table
+
+//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
+//	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
+TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
+#define off             R8
+#define buffer          DI
+#define table           SI
+
+#define br_bits_read    R9
+#define br_value        R10
+#define br_offset       R11
+#define peek_bits       R12
+#define exhausted       DX
+
+#define br0             R13
+#define br1             R14
+#define br2             R15
+#define br3             BP
+
+    MOVQ    BP, 0(SP)               // save BP: it is reused as br3 below
+
+    XORQ    exhausted, exhausted    // exhausted = false
+    XORQ    off, off                // off = 0
+
+    MOVBQZX peekBits+32(FP), peek_bits
+    MOVQ    buf+40(FP), buffer
+    MOVQ    tbl+48(FP), table
+
+    MOVQ    pbr0+0(FP), br0
+    MOVQ    pbr1+8(FP), br1
+    MOVQ    pbr2+16(FP), br2
+    MOVQ    pbr3+24(FP), br3
+
+main_loop:
+{{ define "decode_2_values_x86" }}
+    // const stream = {{ var "id" }}
+    // br{{ var "id"}}.fillFast()
+    MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
+    MOVQ    bitReaderShifted_value(br{{ var "id" }}), br_value
+    MOVQ    bitReaderShifted_off(br{{ var "id" }}), br_offset
+
+    // We must have at least 2 * max tablelog left; skip the refill when bitsRead <= 64-22
+    CMPQ    br_bits_read, $64-22
+    JBE     skip_fill{{ var "id" }}
+
+    SUBQ    $32, br_bits_read       // b.bitsRead -= 32
+    SUBQ    $4, br_offset           // b.off -= 4
+
+	// v := b.in[b.off-4 : b.off]
+	// v = v[:4]
+	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+    MOVQ    bitReaderShifted_in(br{{ var "id" }}), AX
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+#ifdef GOAMD64_v3
+    SHLXQ   br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
+#else
+    MOVL    0(br_offset)(AX*1), AX  // AX = uint32(b.in[b.off:b.off+4])
+    MOVQ    br_bits_read, CX
+    SHLQ    CL, AX
+#endif
+
+    ORQ     AX, br_value
+
+    // exhausted = exhausted || (br{{ var "id"}}.off < 4)
+    CMPQ    br_offset, $4
+    SETLT   DL
+    ORB     DL, DH
+    // }
+skip_fill{{ var "id" }}:
+
+    // val0 := br{{ var "id"}}.peekTopBits(peekBits)
+#ifdef GOAMD64_v3
+    SHRXQ   peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
+#else
+    MOVQ    br_value, AX
+    MOVQ    peek_bits, CX
+    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
+#endif
+
+    // v0 := table[val0&mask]
+    MOVW    0(table)(AX*2), AX      // AX - v0
+
+    // br{{ var "id"}}.advance(uint8(v0.entry))
+    MOVB    AH, BL                  // BL = uint8(v0.entry >> 8)
+
+#ifdef GOAMD64_v3
+    MOVBQZX AL, CX
+    SHLXQ   AX, br_value, br_value // value <<= n
+#else
+    MOVBQZX AL, CX
+    SHLQ    CL, br_value            // value <<= n
+#endif
+
+    ADDQ    CX, br_bits_read        // bits_read += n
+
+
+#ifdef GOAMD64_v3
+    SHRXQ    peek_bits, br_value, AX  // AX = (value >> peek_bits) & mask
+#else
+    // val1 := br{{ var "id"}}.peekTopBits(peekBits)
+    MOVQ    peek_bits, CX
+    MOVQ    br_value, AX
+    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
+#endif
+
+    // v1 := table[val1&mask]
+    MOVW    0(table)(AX*2), AX      // AX - v1
+
+    // br{{ var "id"}}.advance(uint8(v1.entry))
+    MOVB    AH, BH                  // BH = uint8(v1.entry >> 8)
+
+#ifdef GOAMD64_v3
+    MOVBQZX AL, CX
+    SHLXQ   AX, br_value, br_value // value <<= n
+#else
+    MOVBQZX AL, CX
+    SHLQ    CL, br_value            // value <<= n
+#endif
+
+    ADDQ    CX, br_bits_read        // bits_read += n
+
+
+    // these two writes get coalesced
+    // buf[stream][off] = uint8(v0.entry >> 8)
+    // buf[stream][off+1] = uint8(v1.entry >> 8)
+    MOVW    BX, {{ var "bufofs" }}(buffer)(off*1)
+
+    // write the updated state back to the bit reader structure
+    MOVB    br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
+    MOVQ    br_value, bitReaderShifted_value(br{{ var "id" }})
+    MOVQ    br_offset, bitReaderShifted_off(br{{ var "id" }})
+{{ end }}
+
+    {{ set "id" "0" }}
+    {{ set "ofs" "0" }}
+    {{ set "bufofs" "0" }} {{/* id * bufoff */}}
+    {{ template "decode_2_values_x86" . }}
+
+    {{ set "id" "1" }}
+    {{ set "ofs" "8" }}
+    {{ set "bufofs" "256" }}
+    {{ template "decode_2_values_x86" . }}
+
+    {{ set "id" "2" }}
+    {{ set "ofs" "16" }}
+    {{ set "bufofs" "512" }}
+    {{ template "decode_2_values_x86" . }}
+
+    {{ set "id" "3" }}
+    {{ set "ofs" "24" }}
+    {{ set "bufofs" "768" }}
+    {{ template "decode_2_values_x86" . }}
+
+    ADDQ    $2, off     // off += 2
+
+    TESTB   DH, DH      // any br[i].off < 4?
+    JNZ     end
+
+    CMPQ    off, $bufoff
+    JL      main_loop
+end:
+    MOVQ    0(SP), BP   // restore BP saved on entry
+
+    MOVB    off, ret+56(FP) // only the low byte is returned, so off == bufoff (256) is seen as 0
+    RET
+#undef  off
+#undef  buffer
+#undef  table
+
+#undef  br_bits_read
+#undef  br_value
+#undef  br_offset
+#undef  peek_bits
+#undef  exhausted
+
+#undef  br0
+#undef  br1
+#undef  br2
+#undef  br3
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_generic.go b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
new file mode 100644
index 00000000000..126b4d68a94
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
@@ -0,0 +1,193 @@
+//go:build !amd64 || appengine || !gc || noasm
+// +build !amd64 appengine !gc noasm
+
+// This file contains a generic implementation of Decoder.Decompress4X.
+package huff0
+
+import (
+	"errors"
+	"fmt"
+)
+
+// Decompress4X will decompress a 4X encoded stream using the generic Go implementation.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly. On failure it returns a nil slice and an error.
+func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
+	if len(d.dt.single) == 0 {
+		return nil, errors.New("no table loaded")
+	}
+	if len(src) < 6+(4*1) {
+		return nil, errors.New("input too small")
+	}
+	if use8BitTables && d.actualTableLog <= 8 {
+		return d.decompress4X8bit(dst, src)
+	}
+
+	var br [4]bitReaderShifted
+	// Decode "jump table": three little-endian 16-bit lengths for streams 0-2; stream 3 takes the rest.
+	start := 6
+	for i := 0; i < 3; i++ {
+		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+		if start+length >= len(src) {
+			return nil, errors.New("truncated input (or invalid offset)")
+		}
+		err := br[i].init(src[start : start+length])
+		if err != nil {
+			return nil, err
+		}
+		start += length
+	}
+	err := br[3].init(src[start:])
+	if err != nil {
+		return nil, err
+	}
+
+	// destination, offset to match first output
+	dstSize := cap(dst)
+	dst = dst[:dstSize]
+	out := dst
+	dstEvery := (dstSize + 3) / 4
+
+	const tlSize = 1 << tableLogMax
+	const tlMask = tlSize - 1
+	single := d.dt.single[:tlSize]
+
+	// Use temp table to avoid bound checks/append penalty.
+	buf := d.buffer()
+	var off uint8
+	var decoded int
+
+	// Decode 2 values from each decoder/loop; off is a uint8 and wraps to 0 when buf is full.
+	const bufoff = 256
+	for {
+		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
+			break
+		}
+
+		{
+			const stream = 0
+			const stream2 = 1
+			br[stream].fillFast()
+			br[stream2].fillFast()
+
+			val := br[stream].peekBitsFast(d.actualTableLog)
+			val2 := br[stream2].peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask]
+			v2 := single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off] = uint8(v.entry >> 8)
+			buf[stream2][off] = uint8(v2.entry >> 8)
+
+			val = br[stream].peekBitsFast(d.actualTableLog)
+			val2 = br[stream2].peekBitsFast(d.actualTableLog)
+			v = single[val&tlMask]
+			v2 = single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off+1] = uint8(v.entry >> 8)
+			buf[stream2][off+1] = uint8(v2.entry >> 8)
+		}
+
+		{
+			const stream = 2
+			const stream2 = 3
+			br[stream].fillFast()
+			br[stream2].fillFast()
+
+			val := br[stream].peekBitsFast(d.actualTableLog)
+			val2 := br[stream2].peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask]
+			v2 := single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off] = uint8(v.entry >> 8)
+			buf[stream2][off] = uint8(v2.entry >> 8)
+
+			val = br[stream].peekBitsFast(d.actualTableLog)
+			val2 = br[stream2].peekBitsFast(d.actualTableLog)
+			v = single[val&tlMask]
+			v2 = single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off+1] = uint8(v.entry >> 8)
+			buf[stream2][off+1] = uint8(v2.entry >> 8)
+		}
+
+		off += 2
+
+		if off == 0 {
+			if bufoff > dstEvery {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 1")
+			}
+			copy(out, buf[0][:])
+			copy(out[dstEvery:], buf[1][:])
+			copy(out[dstEvery*2:], buf[2][:])
+			copy(out[dstEvery*3:], buf[3][:])
+			out = out[bufoff:]
+			decoded += bufoff * 4
+			// There must at least be 3 buffers left.
+			if len(out) < dstEvery*3 {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 2")
+			}
+		}
+	}
+	if off > 0 {
+		ioff := int(off)
+		if len(out) < dstEvery*3+ioff {
+			d.bufs.Put(buf)
+			return nil, errors.New("corruption detected: stream overrun 3")
+		}
+		copy(out, buf[0][:off])
+		copy(out[dstEvery:], buf[1][:off])
+		copy(out[dstEvery*2:], buf[2][:off])
+		copy(out[dstEvery*3:], buf[3][:off])
+		decoded += int(off) * 4
+		out = out[off:]
+	}
+
+	// Decode remaining bytes of each stream one symbol at a time.
+	remainBytes := dstEvery - (decoded / 4)
+	for i := range br {
+		offset := dstEvery * i
+		endsAt := offset + remainBytes
+		if endsAt > len(out) {
+			endsAt = len(out)
+		}
+		br := &br[i]
+		bitsLeft := br.remaining()
+		for bitsLeft > 0 {
+			br.fill()
+			if offset >= endsAt {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 4")
+			}
+
+			// Read value and increment offset.
+			val := br.peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask].entry
+			nBits := uint8(v)
+			br.advance(nBits)
+			bitsLeft -= uint(nBits)
+			out[offset] = uint8(v >> 8)
+			offset++
+		}
+		if offset != endsAt {
+			d.bufs.Put(buf)
+			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+		}
+		decoded += offset - dstEvery*i
+		if err = br.close(); err != nil {
+			d.bufs.Put(buf) // release the scratch buffer here too, like every other error path
+			return nil, err
+		}
+	}
+	d.bufs.Put(buf)
+	if dstSize != decoded {
+		return nil, errors.New("corruption detected: short output block")
+	}
+	return dst, nil
+}
diff --git a/vendor/github.com/klauspost/compress/huff0/huff0.go b/vendor/github.com/klauspost/compress/huff0/huff0.go
index 3ee00ecb470..e8ad17ad08e 100644
--- a/vendor/github.com/klauspost/compress/huff0/huff0.go
+++ b/vendor/github.com/klauspost/compress/huff0/huff0.go
@@ -8,6 +8,7 @@ import (
 	"fmt"
 	"math"
 	"math/bits"
+	"sync"
 
 	"github.com/klauspost/compress/fse"
 )
@@ -116,6 +117,7 @@ type Scratch struct {
 	nodes          []nodeElt
 	tmpOut         [4][]byte
 	fse            *fse.Scratch
+	decPool        sync.Pool // *[4][256]byte buffers.
 	huffWeight     [maxSymbolValue + 1]byte
 }
 
diff --git a/vendor/github.com/klauspost/compress/zstd/README.md b/vendor/github.com/klauspost/compress/zstd/README.md
index c8f0f16fc1e..e3445ac194e 100644
--- a/vendor/github.com/klauspost/compress/zstd/README.md
+++ b/vendor/github.com/klauspost/compress/zstd/README.md
@@ -78,6 +78,9 @@ of a stream. This is independent of the `WithEncoderConcurrency(n)`, but that is
 in the future. So if you want to limit concurrency for future updates, specify the concurrency
 you would like.
 
+If you would like stream encoding to be done without spawning async goroutines, use `WithEncoderConcurrency(1)`
+which will compress input as each block is completed, blocking on writes until each has completed.
+
 You can specify your desired compression level using `WithEncoderLevel()` option. Currently only pre-defined 
 compression settings can be specified.
 
@@ -104,7 +107,8 @@ and seems to ignore concatenated streams, even though [it is part of the spec](h
 For compressing small blocks, the returned encoder has a function called `EncodeAll(src, dst []byte) []byte`.
 
 `EncodeAll` will encode all input in src and append it to dst.
-This function can be called concurrently, but each call will only run on a single goroutine.
+This function can be called concurrently.
+Each call will only run on the same goroutine as the caller.
 
 Encoded blocks can be concatenated and the result will be the combined input stream.
 Data compressed with EncodeAll can be decoded with the Decoder, using either a stream or `DecodeAll`.
@@ -149,10 +153,10 @@ http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip
 
 This package:
 file    out     level   insize      outsize     millis  mb/s
-silesia.tar zskp    1   211947520   73101992    643     313.87
-silesia.tar zskp    2   211947520   67504318    969     208.38
-silesia.tar zskp    3   211947520   64595893    2007    100.68
-silesia.tar zskp    4   211947520   60995370    8825    22.90
+silesia.tar zskp    1   211947520   73821326    634     318.47
+silesia.tar zskp    2   211947520   67655404    1508    133.96
+silesia.tar zskp    3   211947520   64746933    3000    67.37
+silesia.tar zskp    4   211947520   60073508    16926   11.94
 
 cgo zstd:
 silesia.tar zstd    1   211947520   73605392    543     371.56
@@ -161,94 +165,94 @@ silesia.tar zstd    6   211947520   62916450    1913    105.66
 silesia.tar zstd    9   211947520   60212393    5063    39.92
 
 gzip, stdlib/this package:
-silesia.tar gzstd   1   211947520   80007735    1654    122.21
-silesia.tar gzkp    1   211947520   80136201    1152    175.45
+silesia.tar gzstd   1   211947520   80007735    1498    134.87
+silesia.tar gzkp    1   211947520   80088272    1009    200.31
 
 GOB stream of binary data. Highly compressible.
 https://files.klauspost.com/compress/gob-stream.7z
 
 file        out     level   insize  outsize     millis  mb/s
-gob-stream  zskp    1   1911399616  235022249   3088    590.30
-gob-stream  zskp    2   1911399616  205669791   3786    481.34
-gob-stream  zskp    3   1911399616  175034659   9636    189.17
-gob-stream  zskp    4   1911399616  165609838   50369   36.19
+gob-stream  zskp    1   1911399616  233948096   3230    564.34
+gob-stream  zskp    2   1911399616  203997694   4997    364.73
+gob-stream  zskp    3   1911399616  173526523   13435   135.68
+gob-stream  zskp    4   1911399616  162195235   47559   38.33
 
 gob-stream  zstd    1   1911399616  249810424   2637    691.26
 gob-stream  zstd    3   1911399616  208192146   3490    522.31
 gob-stream  zstd    6   1911399616  193632038   6687    272.56
 gob-stream  zstd    9   1911399616  177620386   16175   112.70
 
-gob-stream  gzstd   1   1911399616  357382641   10251   177.82
-gob-stream  gzkp    1   1911399616  359753026   5438    335.20
+gob-stream  gzstd   1   1911399616  357382013   9046    201.49
+gob-stream  gzkp    1   1911399616  359136669   4885    373.08
 
 The test data for the Large Text Compression Benchmark is the first
 10^9 bytes of the English Wikipedia dump on Mar. 3, 2006.
 http://mattmahoney.net/dc/textdata.html
 
 file    out level   insize      outsize     millis  mb/s
-enwik9  zskp    1   1000000000  343848582   3609    264.18
-enwik9  zskp    2   1000000000  317276632   5746    165.97
-enwik9  zskp    3   1000000000  292243069   12162   78.41
-enwik9  zskp    4   1000000000  262183768   82837   11.51
+enwik9  zskp    1   1000000000  343833605   3687    258.64
+enwik9  zskp    2   1000000000  317001237   7672    124.29
+enwik9  zskp    3   1000000000  291915823   15923   59.89
+enwik9  zskp    4   1000000000  261710291   77697   12.27
 
 enwik9  zstd    1   1000000000  358072021   3110    306.65
 enwik9  zstd    3   1000000000  313734672   4784    199.35
 enwik9  zstd    6   1000000000  295138875   10290   92.68
 enwik9  zstd    9   1000000000  278348700   28549   33.40
 
-enwik9  gzstd   1   1000000000  382578136   9604    99.30
-enwik9  gzkp    1   1000000000  383825945   6544    145.73
+enwik9  gzstd   1   1000000000  382578136   8608    110.78
+enwik9  gzkp    1   1000000000  382781160   5628    169.45
 
 Highly compressible JSON file.
 https://files.klauspost.com/compress/github-june-2days-2019.json.zst
 
 file                        out level   insize      outsize     millis  mb/s
-github-june-2days-2019.json zskp    1   6273951764  699045015   10620   563.40
-github-june-2days-2019.json zskp    2   6273951764  617881763   11687   511.96
-github-june-2days-2019.json zskp    3   6273951764  524340691   34043   175.75
-github-june-2days-2019.json zskp    4   6273951764  470320075   170190  35.16
+github-june-2days-2019.json zskp    1   6273951764  697439532   9789    611.17
+github-june-2days-2019.json zskp    2   6273951764  610876538   18553   322.49
+github-june-2days-2019.json zskp    3   6273951764  517662858   44186   135.41
+github-june-2days-2019.json zskp    4   6273951764  464617114   165373  36.18
 
 github-june-2days-2019.json zstd    1   6273951764  766284037   8450    708.00
 github-june-2days-2019.json zstd    3   6273951764  661889476   10927   547.57
 github-june-2days-2019.json zstd    6   6273951764  642756859   22996   260.18
 github-june-2days-2019.json zstd    9   6273951764  601974523   52413   114.16
 
-github-june-2days-2019.json gzstd   1   6273951764  1164400847  29948   199.79
-github-june-2days-2019.json gzkp    1   6273951764  1125417694  21788   274.61
+github-june-2days-2019.json gzstd   1   6273951764  1164397768  26793   223.32
+github-june-2days-2019.json gzkp    1   6273951764  1120631856  17693   338.16
 
 VM Image, Linux mint with a few installed applications:
 https://files.klauspost.com/compress/rawstudio-mint14.7z
 
 file                    out level   insize      outsize     millis  mb/s
-rawstudio-mint14.tar    zskp    1   8558382592  3667489370  20210   403.84
-rawstudio-mint14.tar    zskp    2   8558382592  3364592300  31873   256.07
-rawstudio-mint14.tar    zskp    3   8558382592  3158085214  77675   105.08
-rawstudio-mint14.tar    zskp    4   8558382592  2965110639  857750  9.52
+rawstudio-mint14.tar    zskp    1   8558382592  3718400221  18206   448.29
+rawstudio-mint14.tar    zskp    2   8558382592  3326118337  37074   220.15
+rawstudio-mint14.tar    zskp    3   8558382592  3163842361  87306   93.49
+rawstudio-mint14.tar    zskp    4   8558382592  2970480650  783862  10.41
 
 rawstudio-mint14.tar    zstd    1   8558382592  3609250104  17136   476.27
 rawstudio-mint14.tar    zstd    3   8558382592  3341679997  29262   278.92
 rawstudio-mint14.tar    zstd    6   8558382592  3235846406  77904   104.77
 rawstudio-mint14.tar    zstd    9   8558382592  3160778861  140946  57.91
 
-rawstudio-mint14.tar    gzstd   1   8558382592  3926257486  57722   141.40
-rawstudio-mint14.tar    gzkp    1   8558382592  3962605659  45113   180.92
+rawstudio-mint14.tar    gzstd   1   8558382592  3926234992  51345   158.96
+rawstudio-mint14.tar    gzkp    1   8558382592  3960117298  36722   222.26
 
 CSV data:
 https://files.klauspost.com/compress/nyc-taxi-data-10M.csv.zst
 
 file                    out level   insize      outsize     millis  mb/s
-nyc-taxi-data-10M.csv   zskp    1   3325605752  641339945   8925    355.35
-nyc-taxi-data-10M.csv   zskp    2   3325605752  591748091   11268   281.44
-nyc-taxi-data-10M.csv   zskp    3   3325605752  530289687   25239   125.66
-nyc-taxi-data-10M.csv   zskp    4   3325605752  476268884   135958  23.33
+nyc-taxi-data-10M.csv   zskp    1   3325605752  641319332   9462    335.17
+nyc-taxi-data-10M.csv   zskp    2   3325605752  588976126   17570   180.50
+nyc-taxi-data-10M.csv   zskp    3   3325605752  529329260   32432   97.79
+nyc-taxi-data-10M.csv   zskp    4   3325605752  474949772   138025  22.98
 
 nyc-taxi-data-10M.csv   zstd    1   3325605752  687399637   8233    385.18
 nyc-taxi-data-10M.csv   zstd    3   3325605752  598514411   10065   315.07
 nyc-taxi-data-10M.csv   zstd    6   3325605752  570522953   20038   158.27
 nyc-taxi-data-10M.csv   zstd    9   3325605752  517554797   64565   49.12
 
-nyc-taxi-data-10M.csv   gzstd   1   3325605752  928656485   23876   132.83
-nyc-taxi-data-10M.csv   gzkp    1   3325605752  922257165   16780   189.00
+nyc-taxi-data-10M.csv   gzstd   1   3325605752  928654908   21270   149.11
+nyc-taxi-data-10M.csv   gzkp    1   3325605752  922273214   13929   227.68
 ```
 
 ## Decompressor
@@ -283,8 +287,13 @@ func Decompress(in io.Reader, out io.Writer) error {
 }
 ```
 
-It is important to use the "Close" function when you no longer need the Reader to stop running goroutines. 
-See "Allocation-less operation" below.
+It is important to use the "Close" function when you no longer need the Reader to stop running goroutines, 
+when running with default settings.
+Goroutines will exit once an error has been returned, including `io.EOF` at the end of a stream.
+
+Streams are decoded concurrently in 4 asynchronous stages to give the best possible throughput.
+However, if you prefer synchronous decompression, use `WithDecoderConcurrency(1)` which will decompress data 
+as it is being requested only.
 
 For decoding buffers, it could look something like this:
 
@@ -293,7 +302,7 @@ import "github.com/klauspost/compress/zstd"
 
 // Create a reader that caches decompressors.
 // For this operation type we supply a nil Reader.
-var decoder, _ = zstd.NewReader(nil)
+var decoder, _ = zstd.NewReader(nil, zstd.WithDecoderConcurrency(0))
 
 // Decompress a buffer. We don't supply a destination buffer,
 // so it will be allocated by the decoder.
@@ -303,9 +312,12 @@ func Decompress(src []byte) ([]byte, error) {
 ```
 
 Both of these cases should provide the functionality needed. 
-The decoder can be used for *concurrent* decompression of multiple buffers. 
+The decoder can be used for *concurrent* decompression of multiple buffers.
+By default 4 decompressors will be created. 
+
 It will only allow a certain number of concurrent operations to run. 
-To tweak that yourself use the `WithDecoderConcurrency(n)` option when creating the decoder.   
+To tweak that yourself use the `WithDecoderConcurrency(n)` option when creating the decoder.
+It is possible to use `WithDecoderConcurrency(0)` to create GOMAXPROCS decoders.
 
 ### Dictionaries
 
@@ -357,19 +369,21 @@ In this case no unneeded allocations should be made.
 The buffer decoder does everything on the same goroutine and does nothing concurrently.
 It can however decode several buffers concurrently. Use `WithDecoderConcurrency(n)` to limit that.
 
-The stream decoder operates on
+The stream decoder will create goroutines that:
 
-* One goroutine reads input and splits the input to several block decoders.
-* A number of decoders will decode blocks.
-* A goroutine coordinates these blocks and sends history from one to the next.
+1) Read input and split the input into blocks.
+2) Decompress literals.
+3) Decompress sequences.
+4) Reconstruct the output stream.
 
 So effectively this also means the decoder will "read ahead" and prepare data to always be available for output.
 
+The concurrency level will, for streams, determine how many blocks ahead the decompression will start.
+
 Since "blocks" are quite dependent on the output of the previous block stream decoding will only have limited concurrency.
 
-In practice this means that concurrency is often limited to utilizing about 2 cores effectively.
- 
- 
+In practice this means that concurrency is often limited to utilizing about 3 cores effectively.
+  
 ### Benchmarks
 
 These are some examples of performance compared to [datadog cgo library](https://github.com/DataDog/zstd).
diff --git a/vendor/github.com/klauspost/compress/zstd/bitreader.go b/vendor/github.com/klauspost/compress/zstd/bitreader.go
index 85445853715..d7cd15ba29d 100644
--- a/vendor/github.com/klauspost/compress/zstd/bitreader.go
+++ b/vendor/github.com/klauspost/compress/zstd/bitreader.go
@@ -7,6 +7,7 @@ package zstd
 import (
 	"encoding/binary"
 	"errors"
+	"fmt"
 	"io"
 	"math/bits"
 )
@@ -50,16 +51,23 @@ func (b *bitReader) getBits(n uint8) int {
 	if n == 0 /*|| b.bitsRead >= 64 */ {
 		return 0
 	}
-	return b.getBitsFast(n)
+	return int(b.get32BitsFast(n))
 }
 
-// getBitsFast requires that at least one bit is requested every time.
+// get32BitsFast requires that at least one bit is requested every time.
 // There are no checks if the buffer is filled.
-func (b *bitReader) getBitsFast(n uint8) int {
+func (b *bitReader) get32BitsFast(n uint8) uint32 {
 	const regMask = 64 - 1
 	v := uint32((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
 	b.bitsRead += n
-	return int(v)
+	return v
+}
+
+func (b *bitReader) get16BitsFast(n uint8) uint16 {
+	const regMask = 64 - 1
+	v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
+	b.bitsRead += n
+	return v
 }
 
 // fillFast() will make sure at least 32 bits are available.
@@ -125,6 +133,9 @@ func (b *bitReader) remain() uint {
 func (b *bitReader) close() error {
 	// Release reference.
 	b.in = nil
+	if !b.finished() {
+		return fmt.Errorf("%d extra bits on block, should be 0", b.remain())
+	}
 	if b.bitsRead > 64 {
 		return io.ErrUnexpectedEOF
 	}
diff --git a/vendor/github.com/klauspost/compress/zstd/bitwriter.go b/vendor/github.com/klauspost/compress/zstd/bitwriter.go
index 303ae90f944..b3661828509 100644
--- a/vendor/github.com/klauspost/compress/zstd/bitwriter.go
+++ b/vendor/github.com/klauspost/compress/zstd/bitwriter.go
@@ -38,7 +38,7 @@ func (b *bitWriter) addBits16NC(value uint16, bits uint8) {
 	b.nBits += bits
 }
 
-// addBits32NC will add up to 32 bits.
+// addBits32NC will add up to 31 bits.
 // It will not check if there is space for them,
 // so the caller must ensure that it has flushed recently.
 func (b *bitWriter) addBits32NC(value uint32, bits uint8) {
@@ -46,6 +46,26 @@ func (b *bitWriter) addBits32NC(value uint32, bits uint8) {
 	b.nBits += bits
 }
 
+// addBits64NC will add up to 64 bits.
+// There must be space for 32 bits.
+func (b *bitWriter) addBits64NC(value uint64, bits uint8) {
+	if bits <= 31 {
+		b.addBits32Clean(uint32(value), bits)
+		return
+	}
+	b.addBits32Clean(uint32(value), 32)
+	b.flush32()
+	b.addBits32Clean(uint32(value>>32), bits-32)
+}
+
+// addBits32Clean will add up to 32 bits.
+// It will not check if there is space for them.
+// The input must not contain more bits than specified.
+func (b *bitWriter) addBits32Clean(value uint32, bits uint8) {
+	b.bitContainer |= uint64(value) << (b.nBits & 63)
+	b.nBits += bits
+}
+
 // addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
 // It will not check if there is space for them, so the caller must ensure that it has flushed recently.
 func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
diff --git a/vendor/github.com/klauspost/compress/zstd/blockdec.go b/vendor/github.com/klauspost/compress/zstd/blockdec.go
index 8a98c4562e0..7d567a54a05 100644
--- a/vendor/github.com/klauspost/compress/zstd/blockdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockdec.go
@@ -76,17 +76,25 @@ type blockDec struct {
 	// Window size of the block.
 	WindowSize uint64
 
-	history     chan *history
-	input       chan struct{}
-	result      chan decodeOutput
-	sequenceBuf []seq
-	err         error
-	decWG       sync.WaitGroup
+	err error
+
+	// Check against this crc
+	checkCRC []byte
 
 	// Frame to use for singlethreaded decoding.
 	// Should not be used by the decoder itself since parent may be another frame.
 	localFrame *frameDec
 
+	sequence []seqVals
+
+	async struct {
+		newHist  *history
+		literals []byte
+		seqData  []byte
+		seqSize  int // Size of uncompressed sequences
+		fcs      uint64
+	}
+
 	// Block is RLE, this is the size.
 	RLESize uint32
 	tmp     [4]byte
@@ -109,13 +117,8 @@ func (b *blockDec) String() string {
 
 func newBlockDec(lowMem bool) *blockDec {
 	b := blockDec{
-		lowMem:  lowMem,
-		result:  make(chan decodeOutput, 1),
-		input:   make(chan struct{}, 1),
-		history: make(chan *history, 1),
+		lowMem: lowMem,
 	}
-	b.decWG.Add(1)
-	go b.startDecoder()
 	return &b
 }
 
@@ -138,6 +141,12 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
 	case blockTypeReserved:
 		return ErrReservedBlockType
 	case blockTypeRLE:
+		if cSize > maxCompressedBlockSize || cSize > int(b.WindowSize) {
+			if debugDecoder {
+				printf("rle block too big: csize:%d block: %+v\n", uint64(cSize), b)
+			}
+			return ErrWindowSizeExceeded
+		}
 		b.RLESize = uint32(cSize)
 		if b.lowMem {
 			maxSize = cSize
@@ -158,7 +167,19 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
 			}
 			return ErrCompressedSizeTooBig
 		}
+		// Empty compressed blocks must at least be 2 bytes
+		// for Literals_Block_Type and one for Sequences_Section_Header.
+		if cSize < 2 {
+			return ErrBlockTooSmall
+		}
 	case blockTypeRaw:
+		if cSize > maxCompressedBlockSize || cSize > int(b.WindowSize) {
+			if debugDecoder {
+				printf("rle block too big: csize:%d block: %+v\n", uint64(cSize), b)
+			}
+			return ErrWindowSizeExceeded
+		}
+
 		b.RLESize = 0
 		// We do not need a destination for raw blocks.
 		maxSize = -1
@@ -193,85 +214,14 @@ func (b *blockDec) sendErr(err error) {
 	b.Last = true
 	b.Type = blockTypeReserved
 	b.err = err
-	b.input <- struct{}{}
 }
 
 // Close will release resources.
 // Closed blockDec cannot be reset.
 func (b *blockDec) Close() {
-	close(b.input)
-	close(b.history)
-	close(b.result)
-	b.decWG.Wait()
 }
 
-// decodeAsync will prepare decoding the block when it receives input.
-// This will separate output and history.
-func (b *blockDec) startDecoder() {
-	defer b.decWG.Done()
-	for range b.input {
-		//println("blockDec: Got block input")
-		switch b.Type {
-		case blockTypeRLE:
-			if cap(b.dst) < int(b.RLESize) {
-				if b.lowMem {
-					b.dst = make([]byte, b.RLESize)
-				} else {
-					b.dst = make([]byte, maxBlockSize)
-				}
-			}
-			o := decodeOutput{
-				d:   b,
-				b:   b.dst[:b.RLESize],
-				err: nil,
-			}
-			v := b.data[0]
-			for i := range o.b {
-				o.b[i] = v
-			}
-			hist := <-b.history
-			hist.append(o.b)
-			b.result <- o
-		case blockTypeRaw:
-			o := decodeOutput{
-				d:   b,
-				b:   b.data,
-				err: nil,
-			}
-			hist := <-b.history
-			hist.append(o.b)
-			b.result <- o
-		case blockTypeCompressed:
-			b.dst = b.dst[:0]
-			err := b.decodeCompressed(nil)
-			o := decodeOutput{
-				d:   b,
-				b:   b.dst,
-				err: err,
-			}
-			if debugDecoder {
-				println("Decompressed to", len(b.dst), "bytes, error:", err)
-			}
-			b.result <- o
-		case blockTypeReserved:
-			// Used for returning errors.
-			<-b.history
-			b.result <- decodeOutput{
-				d:   b,
-				b:   nil,
-				err: b.err,
-			}
-		default:
-			panic("Invalid block type")
-		}
-		if debugDecoder {
-			println("blockDec: Finished block")
-		}
-	}
-}
-
-// decodeAsync will prepare decoding the block when it receives the history.
-// If history is provided, it will not fetch it from the channel.
+// decodeBuf decodes the block according to its type, using hist as the history.
 func (b *blockDec) decodeBuf(hist *history) error {
 	switch b.Type {
 	case blockTypeRLE:
@@ -294,14 +244,23 @@ func (b *blockDec) decodeBuf(hist *history) error {
 		return nil
 	case blockTypeCompressed:
 		saved := b.dst
-		b.dst = hist.b
-		hist.b = nil
+		// Append directly to history
+		if hist.ignoreBuffer == 0 {
+			b.dst = hist.b
+			hist.b = nil
+		} else {
+			b.dst = b.dst[:0]
+		}
 		err := b.decodeCompressed(hist)
 		if debugDecoder {
 			println("Decompressed to total", len(b.dst), "bytes, hash:", xxhash.Sum64(b.dst), "error:", err)
 		}
-		hist.b = b.dst
-		b.dst = saved
+		if hist.ignoreBuffer == 0 {
+			hist.b = b.dst
+			b.dst = saved
+		} else {
+			hist.appendKeep(b.dst)
+		}
 		return err
 	case blockTypeReserved:
 		// Used for returning errors.
@@ -311,30 +270,18 @@ func (b *blockDec) decodeBuf(hist *history) error {
 	}
 }
 
-// decodeCompressed will start decompressing a block.
-// If no history is supplied the decoder will decodeAsync as much as possible
-// before fetching from blockDec.history
-func (b *blockDec) decodeCompressed(hist *history) error {
-	in := b.data
-	delayedHistory := hist == nil
-
-	if delayedHistory {
-		// We must always grab history.
-		defer func() {
-			if hist == nil {
-				<-b.history
-			}
-		}()
-	}
+func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err error) {
 	// There must be at least one byte for Literals_Block_Type and one for Sequences_Section_Header
 	if len(in) < 2 {
-		return ErrBlockTooSmall
+		return in, ErrBlockTooSmall
 	}
+
 	litType := literalsBlockType(in[0] & 3)
 	var litRegenSize int
 	var litCompSize int
 	sizeFormat := (in[0] >> 2) & 3
 	var fourStreams bool
+	var literals []byte
 	switch litType {
 	case literalsBlockRaw, literalsBlockRLE:
 		switch sizeFormat {
@@ -350,7 +297,7 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 			//  Regenerated_Size uses 20 bits (0-1048575). Literals_Section_Header uses 3 bytes.
 			if len(in) < 3 {
 				println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
-				return ErrBlockTooSmall
+				return in, ErrBlockTooSmall
 			}
 			litRegenSize = int(in[0]>>4) + (int(in[1]) << 4) + (int(in[2]) << 12)
 			in = in[3:]
@@ -361,7 +308,7 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 			// Both Regenerated_Size and Compressed_Size use 10 bits (0-1023).
 			if len(in) < 3 {
 				println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
-				return ErrBlockTooSmall
+				return in, ErrBlockTooSmall
 			}
 			n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12)
 			litRegenSize = int(n & 1023)
@@ -372,7 +319,7 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 			fourStreams = true
 			if len(in) < 4 {
 				println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
-				return ErrBlockTooSmall
+				return in, ErrBlockTooSmall
 			}
 			n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12) + (uint64(in[3]) << 20)
 			litRegenSize = int(n & 16383)
@@ -382,7 +329,7 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 			fourStreams = true
 			if len(in) < 5 {
 				println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
-				return ErrBlockTooSmall
+				return in, ErrBlockTooSmall
 			}
 			n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12) + (uint64(in[3]) << 20) + (uint64(in[4]) << 28)
 			litRegenSize = int(n & 262143)
@@ -393,13 +340,15 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 	if debugDecoder {
 		println("literals type:", litType, "litRegenSize:", litRegenSize, "litCompSize:", litCompSize, "sizeFormat:", sizeFormat, "4X:", fourStreams)
 	}
-	var literals []byte
-	var huff *huff0.Scratch
+	if litRegenSize > int(b.WindowSize) || litRegenSize > maxCompressedBlockSize {
+		return in, ErrWindowSizeExceeded
+	}
+
 	switch litType {
 	case literalsBlockRaw:
 		if len(in) < litRegenSize {
 			println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litRegenSize)
-			return ErrBlockTooSmall
+			return in, ErrBlockTooSmall
 		}
 		literals = in[:litRegenSize]
 		in = in[litRegenSize:]
@@ -407,7 +356,7 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 	case literalsBlockRLE:
 		if len(in) < 1 {
 			println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", 1)
-			return ErrBlockTooSmall
+			return in, ErrBlockTooSmall
 		}
 		if cap(b.literalBuf) < litRegenSize {
 			if b.lowMem {
@@ -418,7 +367,6 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 					b.literalBuf = make([]byte, litRegenSize)
 				} else {
 					b.literalBuf = make([]byte, litRegenSize, maxCompressedLiteralSize)
-
 				}
 			}
 		}
@@ -434,7 +382,7 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 	case literalsBlockTreeless:
 		if len(in) < litCompSize {
 			println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litCompSize)
-			return ErrBlockTooSmall
+			return in, ErrBlockTooSmall
 		}
 		// Store compressed literals, so we defer decoding until we get history.
 		literals = in[:litCompSize]
@@ -442,31 +390,65 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		if debugDecoder {
 			printf("Found %d compressed literals\n", litCompSize)
 		}
+		huff := hist.huffTree
+		if huff == nil {
+			return in, errors.New("literal block was treeless, but no history was defined")
+		}
+		// Ensure we have space to store it.
+		if cap(b.literalBuf) < litRegenSize {
+			if b.lowMem {
+				b.literalBuf = make([]byte, 0, litRegenSize)
+			} else {
+				b.literalBuf = make([]byte, 0, maxCompressedLiteralSize)
+			}
+		}
+		var err error
+		// Use our out buffer.
+		huff.MaxDecodedSize = maxCompressedBlockSize
+		if fourStreams {
+			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
+		} else {
+			literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals)
+		}
+		// Make sure we don't leak our literals buffer
+		if err != nil {
+			println("decompressing literals:", err)
+			return in, err
+		}
+		if len(literals) != litRegenSize {
+			return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
+		}
+
 	case literalsBlockCompressed:
 		if len(in) < litCompSize {
 			println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litCompSize)
-			return ErrBlockTooSmall
+			return in, ErrBlockTooSmall
 		}
 		literals = in[:litCompSize]
 		in = in[litCompSize:]
-		huff = huffDecoderPool.Get().(*huff0.Scratch)
-		var err error
 		// Ensure we have space to store it.
 		if cap(b.literalBuf) < litRegenSize {
 			if b.lowMem {
 				b.literalBuf = make([]byte, 0, litRegenSize)
 			} else {
-				b.literalBuf = make([]byte, 0, maxCompressedLiteralSize)
+				b.literalBuf = make([]byte, 0, maxCompressedBlockSize)
 			}
 		}
-		if huff == nil {
-			huff = &huff0.Scratch{}
+		huff := hist.huffTree
+		if huff == nil || (hist.dict != nil && huff == hist.dict.litEnc) {
+			huff = huffDecoderPool.Get().(*huff0.Scratch)
+			if huff == nil {
+				huff = &huff0.Scratch{}
+			}
 		}
+		var err error
 		huff, literals, err = huff0.ReadTable(literals, huff)
 		if err != nil {
 			println("reading huffman table:", err)
-			return err
+			return in, err
 		}
+		hist.huffTree = huff
+		huff.MaxDecodedSize = maxCompressedBlockSize
 		// Use our out buffer.
 		if fourStreams {
 			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
@@ -475,27 +457,56 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		}
 		if err != nil {
 			println("decoding compressed literals:", err)
-			return err
+			return in, err
 		}
 		// Make sure we don't leak our literals buffer
 		if len(literals) != litRegenSize {
-			return fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
+			return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
 		}
 		if debugDecoder {
 			printf("Decompressed %d literals into %d bytes\n", litCompSize, litRegenSize)
 		}
 	}
+	hist.decoders.literals = literals
+	return in, nil
+}
+
+// decodeCompressed will start decompressing a block.
+func (b *blockDec) decodeCompressed(hist *history) error {
+	in := b.data
+	in, err := b.decodeLiterals(in, hist)
+	if err != nil {
+		return err
+	}
+	err = b.prepareSequences(in, hist)
+	if err != nil {
+		return err
+	}
+	if hist.decoders.nSeqs == 0 {
+		b.dst = append(b.dst, hist.decoders.literals...)
+		return nil
+	}
+	err = hist.decoders.decodeSync(hist)
+	if err != nil {
+		return err
+	}
+	b.dst = hist.decoders.out
+	hist.recentOffsets = hist.decoders.prevOffset
+	return nil
+}
 
+func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) {
+	if debugDecoder {
+		printf("prepareSequences: %d byte(s) input\n", len(in))
+	}
 	// Decode Sequences
 	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#sequences-section
 	if len(in) < 1 {
 		return ErrBlockTooSmall
 	}
+	var nSeqs int
 	seqHeader := in[0]
-	nSeqs := 0
 	switch {
-	case seqHeader == 0:
-		in = in[1:]
 	case seqHeader < 128:
 		nSeqs = int(seqHeader)
 		in = in[1:]
@@ -512,19 +523,16 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		nSeqs = 0x7f00 + int(in[1]) + (int(in[2]) << 8)
 		in = in[3:]
 	}
-	// Allocate sequences
-	if cap(b.sequenceBuf) < nSeqs {
-		if b.lowMem {
-			b.sequenceBuf = make([]seq, nSeqs)
-		} else {
-			// Allocate max
-			b.sequenceBuf = make([]seq, nSeqs, maxSequences)
+	if nSeqs == 0 && len(in) != 0 {
+		// When no sequences, there should not be any more data...
+		if debugDecoder {
+			printf("prepareSequences: 0 sequences, but %d byte(s) left on stream\n", len(in))
 		}
-	} else {
-		// Reuse buffer
-		b.sequenceBuf = b.sequenceBuf[:nSeqs]
+		return ErrUnexpectedBlockSize
 	}
-	var seqs = &sequenceDecs{}
+
+	var seqs = &hist.decoders
+	seqs.nSeqs = nSeqs
 	if nSeqs > 0 {
 		if len(in) < 1 {
 			return ErrBlockTooSmall
@@ -553,6 +561,9 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 			}
 			switch mode {
 			case compModePredefined:
+				if seq.fse != nil && !seq.fse.preDefined {
+					fseDecoderPool.Put(seq.fse)
+				}
 				seq.fse = &fsePredef[i]
 			case compModeRLE:
 				if br.remain() < 1 {
@@ -560,34 +571,36 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 				}
 				v := br.Uint8()
 				br.advance(1)
-				dec := fseDecoderPool.Get().(*fseDecoder)
+				if seq.fse == nil || seq.fse.preDefined {
+					seq.fse = fseDecoderPool.Get().(*fseDecoder)
+				}
 				symb, err := decSymbolValue(v, symbolTableX[i])
 				if err != nil {
 					printf("RLE Transform table (%v) error: %v", tableIndex(i), err)
 					return err
 				}
-				dec.setRLE(symb)
-				seq.fse = dec
+				seq.fse.setRLE(symb)
 				if debugDecoder {
 					printf("RLE set to %+v, code: %v", symb, v)
 				}
 			case compModeFSE:
 				println("Reading table for", tableIndex(i))
-				dec := fseDecoderPool.Get().(*fseDecoder)
-				err := dec.readNCount(&br, uint16(maxTableSymbol[i]))
+				if seq.fse == nil || seq.fse.preDefined {
+					seq.fse = fseDecoderPool.Get().(*fseDecoder)
+				}
+				err := seq.fse.readNCount(&br, uint16(maxTableSymbol[i]))
 				if err != nil {
 					println("Read table error:", err)
 					return err
 				}
-				err = dec.transform(symbolTableX[i])
+				err = seq.fse.transform(symbolTableX[i])
 				if err != nil {
 					println("Transform table error:", err)
 					return err
 				}
 				if debugDecoder {
-					println("Read table ok", "symbolLen:", dec.symbolLen)
+					println("Read table ok", "symbolLen:", seq.fse.symbolLen)
 				}
-				seq.fse = dec
 			case compModeRepeat:
 				seq.repeat = true
 			}
@@ -597,140 +610,89 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		}
 		in = br.unread()
 	}
-
-	// Wait for history.
-	// All time spent after this is critical since it is strictly sequential.
-	if hist == nil {
-		hist = <-b.history
-		if hist.error {
-			return ErrDecoderClosed
-		}
-	}
-
-	// Decode treeless literal block.
-	if litType == literalsBlockTreeless {
-		// TODO: We could send the history early WITHOUT the stream history.
-		//   This would allow decoding treeless literals before the byte history is available.
-		//   Silencia stats: Treeless 4393, with: 32775, total: 37168, 11% treeless.
-		//   So not much obvious gain here.
-
-		if hist.huffTree == nil {
-			return errors.New("literal block was treeless, but no history was defined")
-		}
-		// Ensure we have space to store it.
-		if cap(b.literalBuf) < litRegenSize {
-			if b.lowMem {
-				b.literalBuf = make([]byte, 0, litRegenSize)
-			} else {
-				b.literalBuf = make([]byte, 0, maxCompressedLiteralSize)
-			}
-		}
-		var err error
-		// Use our out buffer.
-		huff = hist.huffTree
-		if fourStreams {
-			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
-		} else {
-			literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals)
-		}
-		// Make sure we don't leak our literals buffer
-		if err != nil {
-			println("decompressing literals:", err)
-			return err
-		}
-		if len(literals) != litRegenSize {
-			return fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
-		}
-	} else {
-		if hist.huffTree != nil && huff != nil {
-			if hist.dict == nil || hist.dict.litEnc != hist.huffTree {
-				huffDecoderPool.Put(hist.huffTree)
-			}
-			hist.huffTree = nil
-		}
-	}
-	if huff != nil {
-		hist.huffTree = huff
-	}
 	if debugDecoder {
-		println("Final literals:", len(literals), "hash:", xxhash.Sum64(literals), "and", nSeqs, "sequences.")
+		println("Literals:", len(seqs.literals), "hash:", xxhash.Sum64(seqs.literals), "and", seqs.nSeqs, "sequences.")
 	}
 
 	if nSeqs == 0 {
-		// Decompressed content is defined entirely as Literals Section content.
-		b.dst = append(b.dst, literals...)
-		if delayedHistory {
-			hist.append(literals)
+		if len(b.sequence) > 0 {
+			b.sequence = b.sequence[:0]
 		}
 		return nil
 	}
-
-	seqs, err := seqs.mergeHistory(&hist.decoders)
-	if err != nil {
-		return err
-	}
-	if debugDecoder {
-		println("History merged ok")
+	br := seqs.br
+	if br == nil {
+		br = &bitReader{}
 	}
-	br := &bitReader{}
 	if err := br.init(in); err != nil {
 		return err
 	}
 
-	// TODO: Investigate if sending history without decoders are faster.
-	//   This would allow the sequences to be decoded async and only have to construct stream history.
-	//   If only recent offsets were not transferred, this would be an obvious win.
-	// 	 Also, if first 3 sequences don't reference recent offsets, all sequences can be decoded.
+	if err := seqs.initialize(br, hist, b.dst); err != nil {
+		println("initializing sequences:", err)
+		return err
+	}
+	return nil
+}
+
+func (b *blockDec) decodeSequences(hist *history) error {
+	if cap(b.sequence) < hist.decoders.nSeqs {
+		if b.lowMem {
+			b.sequence = make([]seqVals, 0, hist.decoders.nSeqs)
+		} else {
+			b.sequence = make([]seqVals, 0, 0x7F00+0xffff)
+		}
+	}
+	b.sequence = b.sequence[:hist.decoders.nSeqs]
+	if hist.decoders.nSeqs == 0 {
+		hist.decoders.seqSize = len(hist.decoders.literals)
+		return nil
+	}
+	hist.decoders.windowSize = hist.windowSize
+	hist.decoders.prevOffset = hist.recentOffsets
+	err := hist.decoders.decode(b.sequence)
+	hist.recentOffsets = hist.decoders.prevOffset
+	return err
+}
 
+func (b *blockDec) executeSequences(hist *history) error {
 	hbytes := hist.b
 	if len(hbytes) > hist.windowSize {
 		hbytes = hbytes[len(hbytes)-hist.windowSize:]
-		// We do not need history any more.
+		// We do not need history anymore.
 		if hist.dict != nil {
 			hist.dict.content = nil
 		}
 	}
-
-	if err := seqs.initialize(br, hist, literals, b.dst); err != nil {
-		println("initializing sequences:", err)
-		return err
-	}
-
-	err = seqs.decode(nSeqs, br, hbytes)
+	hist.decoders.windowSize = hist.windowSize
+	hist.decoders.out = b.dst[:0]
+	err := hist.decoders.execute(b.sequence, hbytes)
 	if err != nil {
 		return err
 	}
-	if !br.finished() {
-		return fmt.Errorf("%d extra bits on block, should be 0", br.remain())
-	}
+	return b.updateHistory(hist)
+}
 
-	err = br.close()
-	if err != nil {
-		printf("Closing sequences: %v, %+v\n", err, *br)
-	}
+func (b *blockDec) updateHistory(hist *history) error {
 	if len(b.data) > maxCompressedBlockSize {
 		return fmt.Errorf("compressed block size too large (%d)", len(b.data))
 	}
 	// Set output and release references.
-	b.dst = seqs.out
-	seqs.out, seqs.literals, seqs.hist = nil, nil, nil
+	b.dst = hist.decoders.out
+	hist.recentOffsets = hist.decoders.prevOffset
 
-	if !delayedHistory {
-		// If we don't have delayed history, no need to update.
-		hist.recentOffsets = seqs.prevOffset
-		return nil
-	}
 	if b.Last {
 		// if last block we don't care about history.
 		println("Last block, no history returned")
 		hist.b = hist.b[:0]
 		return nil
+	} else {
+		hist.append(b.dst)
+		if debugDecoder {
+			println("Finished block with ", len(b.sequence), "sequences. Added", len(b.dst), "to history, now length", len(hist.b))
+		}
 	}
-	hist.append(b.dst)
-	hist.recentOffsets = seqs.prevOffset
-	if debugDecoder {
-		println("Finished block with literals:", len(literals), "and", nSeqs, "sequences.")
-	}
+	hist.decoders.out, hist.decoders.literals = nil, nil
 
 	return nil
 }
diff --git a/vendor/github.com/klauspost/compress/zstd/blockenc.go b/vendor/github.com/klauspost/compress/zstd/blockenc.go
index 3df185ee465..12e8f6f0b61 100644
--- a/vendor/github.com/klauspost/compress/zstd/blockenc.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockenc.go
@@ -51,7 +51,7 @@ func (b *blockEnc) init() {
 		if cap(b.literals) < maxCompressedBlockSize {
 			b.literals = make([]byte, 0, maxCompressedBlockSize)
 		}
-		const defSeqs = 200
+		const defSeqs = 2000
 		if cap(b.sequences) < defSeqs {
 			b.sequences = make([]seq, 0, defSeqs)
 		}
@@ -426,7 +426,7 @@ func fuzzFseEncoder(data []byte) int {
 		return 0
 	}
 	enc := fseEncoder{}
-	hist := enc.Histogram()[:256]
+	hist := enc.Histogram()
 	maxSym := uint8(0)
 	for i, v := range data {
 		v = v & 63
@@ -722,52 +722,53 @@ func (b *blockEnc) encode(org []byte, raw, rawAllLits bool) error {
 		println("Encoded seq", seq, s, "codes:", s.llCode, s.mlCode, s.ofCode, "states:", ll.state, ml.state, of.state, "bits:", llB, mlB, ofB)
 	}
 	seq--
-	if llEnc.maxBits+mlEnc.maxBits+ofEnc.maxBits <= 32 {
-		// No need to flush (common)
-		for seq >= 0 {
-			s = b.sequences[seq]
-			wr.flush32()
-			llB, ofB, mlB := llTT[s.llCode], ofTT[s.ofCode], mlTT[s.mlCode]
-			// tabelog max is 8 for all.
-			of.encode(ofB)
-			ml.encode(mlB)
-			ll.encode(llB)
-			wr.flush32()
-
-			// We checked that all can stay within 32 bits
-			wr.addBits32NC(s.litLen, llB.outBits)
-			wr.addBits32NC(s.matchLen, mlB.outBits)
-			wr.addBits32NC(s.offset, ofB.outBits)
-
-			if debugSequences {
-				println("Encoded seq", seq, s)
-			}
-
-			seq--
-		}
-	} else {
-		for seq >= 0 {
-			s = b.sequences[seq]
-			wr.flush32()
-			llB, ofB, mlB := llTT[s.llCode], ofTT[s.ofCode], mlTT[s.mlCode]
-			// tabelog max is below 8 for each.
-			of.encode(ofB)
-			ml.encode(mlB)
-			ll.encode(llB)
-			wr.flush32()
-
-			// ml+ll = max 32 bits total
-			wr.addBits32NC(s.litLen, llB.outBits)
-			wr.addBits32NC(s.matchLen, mlB.outBits)
-			wr.flush32()
-			wr.addBits32NC(s.offset, ofB.outBits)
-
-			if debugSequences {
-				println("Encoded seq", seq, s)
-			}
-
-			seq--
-		}
+	// Store sequences in reverse...
+	for seq >= 0 {
+		s = b.sequences[seq]
+
+		ofB := ofTT[s.ofCode]
+		wr.flush32() // tablelog max is below 8 for each, so it will fill max 24 bits.
+		//of.encode(ofB)
+		nbBitsOut := (uint32(of.state) + ofB.deltaNbBits) >> 16
+		dstState := int32(of.state>>(nbBitsOut&15)) + int32(ofB.deltaFindState)
+		wr.addBits16NC(of.state, uint8(nbBitsOut))
+		of.state = of.stateTable[dstState]
+
+		// Accumulate extra bits.
+		outBits := ofB.outBits & 31
+		extraBits := uint64(s.offset & bitMask32[outBits])
+		extraBitsN := outBits
+
+		mlB := mlTT[s.mlCode]
+		//ml.encode(mlB)
+		nbBitsOut = (uint32(ml.state) + mlB.deltaNbBits) >> 16
+		dstState = int32(ml.state>>(nbBitsOut&15)) + int32(mlB.deltaFindState)
+		wr.addBits16NC(ml.state, uint8(nbBitsOut))
+		ml.state = ml.stateTable[dstState]
+
+		outBits = mlB.outBits & 31
+		extraBits = extraBits<<outBits | uint64(s.matchLen&bitMask32[outBits])
+		extraBitsN += outBits
+
+		llB := llTT[s.llCode]
+		//ll.encode(llB)
+		nbBitsOut = (uint32(ll.state) + llB.deltaNbBits) >> 16
+		dstState = int32(ll.state>>(nbBitsOut&15)) + int32(llB.deltaFindState)
+		wr.addBits16NC(ll.state, uint8(nbBitsOut))
+		ll.state = ll.stateTable[dstState]
+
+		outBits = llB.outBits & 31
+		extraBits = extraBits<<outBits | uint64(s.litLen&bitMask32[outBits])
+		extraBitsN += outBits
+
+		wr.flush32()
+		wr.addBits64NC(extraBits, extraBitsN)
+
+		if debugSequences {
+			println("Encoded seq", seq, s)
+		}
+
+		seq--
 	}
 	ml.flush(mlEnc.actualTableLog)
 	of.flush(ofEnc.actualTableLog)
@@ -801,14 +802,13 @@ func (b *blockEnc) genCodes() {
 		// nothing to do
 		return
 	}
-
 	if len(b.sequences) > math.MaxUint16 {
 		panic("can only encode up to 64K sequences")
 	}
 	// No bounds checks after here:
-	llH := b.coders.llEnc.Histogram()[:256]
-	ofH := b.coders.ofEnc.Histogram()[:256]
-	mlH := b.coders.mlEnc.Histogram()[:256]
+	llH := b.coders.llEnc.Histogram()
+	ofH := b.coders.ofEnc.Histogram()
+	mlH := b.coders.mlEnc.Histogram()
 	for i := range llH {
 		llH[i] = 0
 	}
@@ -820,7 +820,8 @@ func (b *blockEnc) genCodes() {
 	}
 
 	var llMax, ofMax, mlMax uint8
-	for i, seq := range b.sequences {
+	for i := range b.sequences {
+		seq := &b.sequences[i]
 		v := llCode(seq.litLen)
 		seq.llCode = v
 		llH[v]++
@@ -844,7 +845,6 @@ func (b *blockEnc) genCodes() {
 				panic(fmt.Errorf("mlMax > maxMatchLengthSymbol (%d), matchlen: %d", mlMax, seq.matchLen))
 			}
 		}
-		b.sequences[i] = seq
 	}
 	maxCount := func(a []uint32) int {
 		var max uint32
diff --git a/vendor/github.com/klauspost/compress/zstd/bytebuf.go b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
index aab71c6cf85..b80191e4b1e 100644
--- a/vendor/github.com/klauspost/compress/zstd/bytebuf.go
+++ b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
@@ -113,6 +113,9 @@ func (r *readerWrapper) readBig(n int, dst []byte) ([]byte, error) {
 func (r *readerWrapper) readByte() (byte, error) {
 	n2, err := r.r.Read(r.tmp[:1])
 	if err != nil {
+		if err == io.EOF {
+			err = io.ErrUnexpectedEOF
+		}
 		return 0, err
 	}
 	if n2 != 1 {
diff --git a/vendor/github.com/klauspost/compress/zstd/decodeheader.go b/vendor/github.com/klauspost/compress/zstd/decodeheader.go
index 69736e8d4bb..5022e71c836 100644
--- a/vendor/github.com/klauspost/compress/zstd/decodeheader.go
+++ b/vendor/github.com/klauspost/compress/zstd/decodeheader.go
@@ -5,6 +5,7 @@ package zstd
 
 import (
 	"bytes"
+	"encoding/binary"
 	"errors"
 	"io"
 )
@@ -15,18 +16,50 @@ const HeaderMaxSize = 14 + 3
 
 // Header contains information about the first frame and block within that.
 type Header struct {
-	// Window Size the window of data to keep while decoding.
-	// Will only be set if HasFCS is false.
-	WindowSize uint64
+	// SingleSegment specifies whether the data is to be decompressed into a
+	// single contiguous memory segment.
+	// It implies that WindowSize is invalid and that FrameContentSize is valid.
+	SingleSegment bool
 
-	// Frame content size.
-	// Expected size of the entire frame.
-	FrameContentSize uint64
+	// WindowSize is the window of data to keep while decoding.
+	// Will only be set if SingleSegment is false.
+	WindowSize uint64
 
 	// Dictionary ID.
 	// If 0, no dictionary.
 	DictionaryID uint32
 
+	// HasFCS specifies whether FrameContentSize has a valid value.
+	HasFCS bool
+
+	// FrameContentSize is the expected uncompressed size of the entire frame.
+	FrameContentSize uint64
+
+	// Skippable will be true if the frame is meant to be skipped.
+	// This implies that FirstBlock.OK is false.
+	Skippable bool
+
+	// SkippableID is the user-specific ID for the skippable frame.
+	// Valid values are between 0 and 15, inclusive.
+	SkippableID int
+
+	// SkippableSize is the length of the user data to skip following
+	// the header.
+	SkippableSize uint32
+
+	// HeaderSize is the raw size of the frame header.
+	//
+	// For normal frames, it includes the size of the magic number and
+	// the size of the header (per section 3.1.1.1).
+	// It does not include the size for any data blocks (section 3.1.1.2) nor
+	// the size for the trailing content checksum.
+	//
+	// For skippable frames, this counts the size of the magic number
+	// along with the size of the size field of the payload.
+	// It does not include the size of the skippable payload itself.
+	// The total frame size is the HeaderSize plus the SkippableSize.
+	HeaderSize int
+
 	// First block information.
 	FirstBlock struct {
 		// OK will be set if first block could be decoded.
@@ -51,17 +84,9 @@ type Header struct {
 		CompressedSize int
 	}
 
-	// Skippable will be true if the frame is meant to be skipped.
-	// No other information will be populated.
-	Skippable bool
-
 	// If set there is a checksum present for the block content.
+	// The checksum field at the end is always 4 bytes long.
 	HasCheckSum bool
-
-	// If this is true FrameContentSize will have a valid value
-	HasFCS bool
-
-	SingleSegment bool
 }
 
 // Decode the header from the beginning of the stream.
@@ -71,39 +96,46 @@ type Header struct {
 // If there isn't enough input, io.ErrUnexpectedEOF is returned.
 // The FirstBlock.OK will indicate if enough information was available to decode the first block header.
 func (h *Header) Decode(in []byte) error {
+	*h = Header{}
 	if len(in) < 4 {
 		return io.ErrUnexpectedEOF
 	}
+	h.HeaderSize += 4
 	b, in := in[:4], in[4:]
 	if !bytes.Equal(b, frameMagic) {
 		if !bytes.Equal(b[1:4], skippableFrameMagic) || b[0]&0xf0 != 0x50 {
 			return ErrMagicMismatch
 		}
-		*h = Header{Skippable: true}
+		if len(in) < 4 {
+			return io.ErrUnexpectedEOF
+		}
+		h.HeaderSize += 4
+		h.Skippable = true
+		h.SkippableID = int(b[0] & 0xf)
+		h.SkippableSize = binary.LittleEndian.Uint32(in)
 		return nil
 	}
+
+	// Read Window_Descriptor
+	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#window_descriptor
 	if len(in) < 1 {
 		return io.ErrUnexpectedEOF
 	}
-
-	// Clear output
-	*h = Header{}
 	fhd, in := in[0], in[1:]
+	h.HeaderSize++
 	h.SingleSegment = fhd&(1<<5) != 0
 	h.HasCheckSum = fhd&(1<<2) != 0
-
 	if fhd&(1<<3) != 0 {
 		return errors.New("reserved bit set on frame header")
 	}
 
-	// Read Window_Descriptor
-	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#window_descriptor
 	if !h.SingleSegment {
 		if len(in) < 1 {
 			return io.ErrUnexpectedEOF
 		}
 		var wd byte
 		wd, in = in[0], in[1:]
+		h.HeaderSize++
 		windowLog := 10 + (wd >> 3)
 		windowBase := uint64(1) << windowLog
 		windowAdd := (windowBase / 8) * uint64(wd&0x7)
@@ -120,9 +152,7 @@ func (h *Header) Decode(in []byte) error {
 			return io.ErrUnexpectedEOF
 		}
 		b, in = in[:size], in[size:]
-		if b == nil {
-			return io.ErrUnexpectedEOF
-		}
+		h.HeaderSize += int(size)
 		switch size {
 		case 1:
 			h.DictionaryID = uint32(b[0])
@@ -152,9 +182,7 @@ func (h *Header) Decode(in []byte) error {
 			return io.ErrUnexpectedEOF
 		}
 		b, in = in[:fcsSize], in[fcsSize:]
-		if b == nil {
-			return io.ErrUnexpectedEOF
-		}
+		h.HeaderSize += int(fcsSize)
 		switch fcsSize {
 		case 1:
 			h.FrameContentSize = uint64(b[0])
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go
index f430f58b572..9fcdaac1dc7 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder.go
@@ -5,9 +5,13 @@
 package zstd
 
 import (
-	"errors"
+	"bytes"
+	"context"
+	"encoding/binary"
 	"io"
 	"sync"
+
+	"github.com/klauspost/compress/zstd/internal/xxhash"
 )
 
 // Decoder provides decoding of zstandard streams.
@@ -22,12 +26,19 @@ type Decoder struct {
 	// Unreferenced decoders, ready for use.
 	decoders chan *blockDec
 
-	// Streams ready to be decoded.
-	stream chan decodeStream
-
 	// Current read position used for Reader functionality.
 	current decoderState
 
+	// sync stream decoding
+	syncStream struct {
+		decodedFrame uint64
+		br           readerWrapper
+		enabled      bool
+		inFrame      bool
+	}
+
+	frame *frameDec
+
 	// Custom dictionaries.
 	// Always uses copies.
 	dicts map[uint32]dict
@@ -46,7 +57,10 @@ type decoderState struct {
 	output chan decodeOutput
 
 	// cancel remaining output.
-	cancel chan struct{}
+	cancel context.CancelFunc
+
+	// crc of current frame
+	crc *xxhash.Digest
 
 	flushed bool
 }
@@ -81,7 +95,7 @@ func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) {
 			return nil, err
 		}
 	}
-	d.current.output = make(chan decodeOutput, d.o.concurrent)
+	d.current.crc = xxhash.New()
 	d.current.flushed = true
 
 	if r == nil {
@@ -130,7 +144,7 @@ func (d *Decoder) Read(p []byte) (int, error) {
 				break
 			}
 			if !d.nextBlock(n == 0) {
-				return n, nil
+				return n, d.current.err
 			}
 		}
 	}
@@ -162,6 +176,7 @@ func (d *Decoder) Reset(r io.Reader) error {
 
 	d.drainOutput()
 
+	d.syncStream.br.r = nil
 	if r == nil {
 		d.current.err = ErrDecoderNilInput
 		if len(d.current.b) > 0 {
@@ -195,33 +210,39 @@ func (d *Decoder) Reset(r io.Reader) error {
 		}
 		return nil
 	}
-
-	if d.stream == nil {
-		d.stream = make(chan decodeStream, 1)
-		d.streamWg.Add(1)
-		go d.startStreamDecoder(d.stream)
-	}
-
 	// Remove current block.
+	d.stashDecoder()
 	d.current.decodeOutput = decodeOutput{}
 	d.current.err = nil
-	d.current.cancel = make(chan struct{})
 	d.current.flushed = false
 	d.current.d = nil
 
-	d.stream <- decodeStream{
-		r:      r,
-		output: d.current.output,
-		cancel: d.current.cancel,
+	// Ensure no-one else is still running...
+	d.streamWg.Wait()
+	if d.frame == nil {
+		d.frame = newFrameDec(d.o)
+	}
+
+	if d.o.concurrent == 1 {
+		return d.startSyncDecoder(r)
 	}
+
+	d.current.output = make(chan decodeOutput, d.o.concurrent)
+	ctx, cancel := context.WithCancel(context.Background())
+	d.current.cancel = cancel
+	d.streamWg.Add(1)
+	go d.startStreamDecoder(ctx, r, d.current.output)
+
 	return nil
 }
 
 // drainOutput will drain the output until errEndOfStream is sent.
 func (d *Decoder) drainOutput() {
 	if d.current.cancel != nil {
-		println("cancelling current")
-		close(d.current.cancel)
+		if debugDecoder {
+			println("cancelling current")
+		}
+		d.current.cancel()
 		d.current.cancel = nil
 	}
 	if d.current.d != nil {
@@ -243,12 +264,9 @@ func (d *Decoder) drainOutput() {
 			}
 			d.decoders <- v.d
 		}
-		if v.err == errEndOfStream {
-			println("current flushed")
-			d.current.flushed = true
-			return
-		}
 	}
+	d.current.output = nil
+	d.current.flushed = true
 }
 
 // WriteTo writes data to w until there's no more data to write or when an error occurs.
@@ -287,7 +305,7 @@ func (d *Decoder) WriteTo(w io.Writer) (int64, error) {
 // DecodeAll can be used concurrently.
 // The Decoder concurrency limits will be respected.
 func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
-	if d.current.err == ErrDecoderClosed {
+	if d.decoders == nil {
 		return dst, ErrDecoderClosed
 	}
 
@@ -300,6 +318,9 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
 		}
 		frame.rawInput = nil
 		frame.bBuf = nil
+		if frame.history.decoders.br != nil {
+			frame.history.decoders.br.in = nil
+		}
 		d.decoders <- block
 	}()
 	frame.bBuf = input
@@ -307,27 +328,31 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
 	for {
 		frame.history.reset()
 		err := frame.reset(&frame.bBuf)
-		if err == io.EOF {
-			if debugDecoder {
-				println("frame reset return EOF")
+		if err != nil {
+			if err == io.EOF {
+				if debugDecoder {
+					println("frame reset return EOF")
+				}
+				return dst, nil
 			}
-			return dst, nil
+			return dst, err
 		}
 		if frame.DictionaryID != nil {
 			dict, ok := d.dicts[*frame.DictionaryID]
 			if !ok {
 				return nil, ErrUnknownDictionary
 			}
+			if debugDecoder {
+				println("setting dict", frame.DictionaryID)
+			}
 			frame.history.setDict(&dict)
 		}
-		if err != nil {
-			return dst, err
-		}
-		if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
+
+		if frame.FrameContentSize != fcsUnknown && frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
 			return dst, ErrDecoderSizeExceeded
 		}
-		if frame.FrameContentSize > 0 && frame.FrameContentSize < 1<<30 {
-			// Never preallocate moe than 1 GB up front.
+		if frame.FrameContentSize < 1<<30 {
+			// Never preallocate more than 1 GB up front.
 			if cap(dst)-len(dst) < int(frame.FrameContentSize) {
 				dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize))
 				copy(dst2, dst)
@@ -368,33 +393,170 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
 // If non-blocking mode is used the returned boolean will be false
 // if no data was available without blocking.
 func (d *Decoder) nextBlock(blocking bool) (ok bool) {
-	if d.current.d != nil {
-		if debugDecoder {
-			printf("re-adding current decoder %p", d.current.d)
-		}
-		d.decoders <- d.current.d
-		d.current.d = nil
-	}
 	if d.current.err != nil {
 		// Keep error state.
-		return blocking
+		return false
 	}
+	d.current.b = d.current.b[:0]
 
+	// SYNC:
+	if d.syncStream.enabled {
+		if !blocking {
+			return false
+		}
+		ok = d.nextBlockSync()
+		if !ok {
+			d.stashDecoder()
+		}
+		return ok
+	}
+
+	// ASYNC:
+	d.stashDecoder()
 	if blocking {
-		d.current.decodeOutput = <-d.current.output
+		d.current.decodeOutput, ok = <-d.current.output
 	} else {
 		select {
-		case d.current.decodeOutput = <-d.current.output:
+		case d.current.decodeOutput, ok = <-d.current.output:
 		default:
 			return false
 		}
 	}
+	if !ok {
+		// This should not happen, so signal error state...
+		d.current.err = io.ErrUnexpectedEOF
+		return false
+	}
+	next := d.current.decodeOutput
+	if next.d != nil && next.d.async.newHist != nil {
+		d.current.crc.Reset()
+	}
 	if debugDecoder {
-		println("got", len(d.current.b), "bytes, error:", d.current.err)
+		var tmp [4]byte
+		binary.LittleEndian.PutUint32(tmp[:], uint32(xxhash.Sum64(next.b)))
+		println("got", len(d.current.b), "bytes, error:", d.current.err, "data crc:", tmp)
+	}
+
+	if len(next.b) > 0 {
+		n, err := d.current.crc.Write(next.b)
+		if err == nil {
+			if n != len(next.b) {
+				d.current.err = io.ErrShortWrite
+			}
+		}
+	}
+	if next.err == nil && next.d != nil && len(next.d.checkCRC) != 0 {
+		got := d.current.crc.Sum64()
+		var tmp [4]byte
+		binary.LittleEndian.PutUint32(tmp[:], uint32(got))
+		if !bytes.Equal(tmp[:], next.d.checkCRC) && !ignoreCRC {
+			if debugDecoder {
+				println("CRC Check Failed:", tmp[:], " (got) !=", next.d.checkCRC, "(on stream)")
+			}
+			d.current.err = ErrCRCMismatch
+		} else {
+			if debugDecoder {
+				println("CRC ok", tmp[:])
+			}
+		}
+	}
+
+	return true
+}
+
+func (d *Decoder) nextBlockSync() (ok bool) {
+	if d.current.d == nil {
+		d.current.d = <-d.decoders
+	}
+	for len(d.current.b) == 0 {
+		if !d.syncStream.inFrame {
+			d.frame.history.reset()
+			d.current.err = d.frame.reset(&d.syncStream.br)
+			if d.current.err != nil {
+				return false
+			}
+			if d.frame.DictionaryID != nil {
+				dict, ok := d.dicts[*d.frame.DictionaryID]
+				if !ok {
+					d.current.err = ErrUnknownDictionary
+					return false
+				} else {
+					d.frame.history.setDict(&dict)
+				}
+			}
+			if d.frame.WindowSize > d.o.maxDecodedSize || d.frame.WindowSize > d.o.maxWindowSize {
+				d.current.err = ErrDecoderSizeExceeded
+				return false
+			}
+
+			d.syncStream.decodedFrame = 0
+			d.syncStream.inFrame = true
+		}
+		d.current.err = d.frame.next(d.current.d)
+		if d.current.err != nil {
+			return false
+		}
+		d.frame.history.ensureBlock()
+		if debugDecoder {
+			println("History trimmed:", len(d.frame.history.b), "decoded already:", d.syncStream.decodedFrame)
+		}
+		histBefore := len(d.frame.history.b)
+		d.current.err = d.current.d.decodeBuf(&d.frame.history)
+
+		if d.current.err != nil {
+			println("error after:", d.current.err)
+			return false
+		}
+		d.current.b = d.frame.history.b[histBefore:]
+		if debugDecoder {
+			println("history after:", len(d.frame.history.b))
+		}
+
+		// Check frame size (before CRC)
+		d.syncStream.decodedFrame += uint64(len(d.current.b))
+		if d.syncStream.decodedFrame > d.frame.FrameContentSize {
+			if debugDecoder {
+				printf("DecodedFrame (%d) > FrameContentSize (%d)\n", d.syncStream.decodedFrame, d.frame.FrameContentSize)
+			}
+			d.current.err = ErrFrameSizeExceeded
+			return false
+		}
+
+		// Check FCS
+		if d.current.d.Last && d.frame.FrameContentSize != fcsUnknown && d.syncStream.decodedFrame != d.frame.FrameContentSize {
+			if debugDecoder {
+				printf("DecodedFrame (%d) != FrameContentSize (%d)\n", d.syncStream.decodedFrame, d.frame.FrameContentSize)
+			}
+			d.current.err = ErrFrameSizeMismatch
+			return false
+		}
+
+		// Update/Check CRC
+		if d.frame.HasCheckSum {
+			d.frame.crc.Write(d.current.b)
+			if d.current.d.Last {
+				d.current.err = d.frame.checkCRC()
+				if d.current.err != nil {
+					println("CRC error:", d.current.err)
+					return false
+				}
+			}
+		}
+		d.syncStream.inFrame = !d.current.d.Last
 	}
 	return true
 }
 
+func (d *Decoder) stashDecoder() {
+	if d.current.d != nil {
+		if debugDecoder {
+			printf("re-adding current decoder %p", d.current.d)
+		}
+		d.decoders <- d.current.d
+		d.current.d = nil
+	}
+}
+
 // Close will release all resources.
 // It is NOT possible to reuse the decoder after this.
 func (d *Decoder) Close() {
@@ -402,10 +564,10 @@ func (d *Decoder) Close() {
 		return
 	}
 	d.drainOutput()
-	if d.stream != nil {
-		close(d.stream)
+	if d.current.cancel != nil {
+		d.current.cancel()
 		d.streamWg.Wait()
-		d.stream = nil
+		d.current.cancel = nil
 	}
 	if d.decoders != nil {
 		close(d.decoders)
@@ -456,100 +618,307 @@ type decodeOutput struct {
 	err error
 }
 
-type decodeStream struct {
-	r io.Reader
-
-	// Blocks ready to be written to output.
-	output chan decodeOutput
-
-	// cancel reading from the input
-	cancel chan struct{}
+func (d *Decoder) startSyncDecoder(r io.Reader) error {
+	d.frame.history.reset()
+	d.syncStream.br = readerWrapper{r: r}
+	d.syncStream.inFrame = false
+	d.syncStream.enabled = true
+	d.syncStream.decodedFrame = 0
+	return nil
 }
 
-// errEndOfStream indicates that everything from the stream was read.
-var errEndOfStream = errors.New("end-of-stream")
-
 // Create Decoder:
-// Spawn n block decoders. These accept tasks to decode a block.
-// Create goroutine that handles stream processing, this will send history to decoders as they are available.
-// Decoders update the history as they decode.
-// When a block is returned:
-// 		a) history is sent to the next decoder,
-// 		b) content written to CRC.
-// 		c) return data to WRITER.
-// 		d) wait for next block to return data.
-// Once WRITTEN, the decoders reused by the writer frame decoder for re-use.
-func (d *Decoder) startStreamDecoder(inStream chan decodeStream) {
+// ASYNC:
+// Spawn 4 go routines.
+// 0: Read frames and decode blocks.
+// 1: Decode block and literals. Receives hufftree and seqdecs, returns seqdecs and huff tree.
+// 2: Wait for recentOffsets if needed. Decode sequences, send recentOffsets.
+// 3: Wait for stream history, execute sequences, send stream history.
+func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output chan decodeOutput) {
 	defer d.streamWg.Done()
-	frame := newFrameDec(d.o)
-	for stream := range inStream {
-		if debugDecoder {
-			println("got new stream")
+	br := readerWrapper{r: r}
+
+	var seqPrepare = make(chan *blockDec, d.o.concurrent)
+	var seqDecode = make(chan *blockDec, d.o.concurrent)
+	var seqExecute = make(chan *blockDec, d.o.concurrent)
+
+	// Async 1: Prepare blocks...
+	go func() {
+		var hist history
+		var hasErr bool
+		for block := range seqPrepare {
+			if hasErr {
+				if block != nil {
+					seqDecode <- block
+				}
+				continue
+			}
+			if block.async.newHist != nil {
+				if debugDecoder {
+					println("Async 1: new history")
+				}
+				hist.reset()
+				if block.async.newHist.dict != nil {
+					hist.setDict(block.async.newHist.dict)
+				}
+			}
+			if block.err != nil || block.Type != blockTypeCompressed {
+				hasErr = block.err != nil
+				seqDecode <- block
+				continue
+			}
+
+			remain, err := block.decodeLiterals(block.data, &hist)
+			block.err = err
+			hasErr = block.err != nil
+			if err == nil {
+				block.async.literals = hist.decoders.literals
+				block.async.seqData = remain
+			} else if debugDecoder {
+				println("decodeLiterals error:", err)
+			}
+			seqDecode <- block
 		}
-		br := readerWrapper{r: stream.r}
-	decodeStream:
-		for {
-			frame.history.reset()
-			err := frame.reset(&br)
-			if debugDecoder && err != nil {
-				println("Frame decoder returned", err)
+		close(seqDecode)
+	}()
+
+	// Async 2: Decode sequences...
+	go func() {
+		var hist history
+		var hasErr bool
+
+		for block := range seqDecode {
+			if hasErr {
+				if block != nil {
+					seqExecute <- block
+				}
+				continue
 			}
-			if err == nil && frame.DictionaryID != nil {
-				dict, ok := d.dicts[*frame.DictionaryID]
-				if !ok {
-					err = ErrUnknownDictionary
-				} else {
-					frame.history.setDict(&dict)
+			if block.async.newHist != nil {
+				if debugDecoder {
+					println("Async 2: new history, recent:", block.async.newHist.recentOffsets)
+				}
+				hist.decoders = block.async.newHist.decoders
+				hist.recentOffsets = block.async.newHist.recentOffsets
+				hist.windowSize = block.async.newHist.windowSize
+				if block.async.newHist.dict != nil {
+					hist.setDict(block.async.newHist.dict)
 				}
 			}
-			if err != nil {
-				stream.output <- decodeOutput{
-					err: err,
+			if block.err != nil || block.Type != blockTypeCompressed {
+				hasErr = block.err != nil
+				seqExecute <- block
+				continue
+			}
+
+			hist.decoders.literals = block.async.literals
+			block.err = block.prepareSequences(block.async.seqData, &hist)
+			if debugDecoder && block.err != nil {
+				println("prepareSequences returned:", block.err)
+			}
+			hasErr = block.err != nil
+			if block.err == nil {
+				block.err = block.decodeSequences(&hist)
+				if debugDecoder && block.err != nil {
+					println("decodeSequences returned:", block.err)
 				}
-				break
+				hasErr = block.err != nil
+				//				block.async.sequence = hist.decoders.seq[:hist.decoders.nSeqs]
+				block.async.seqSize = hist.decoders.seqSize
 			}
-			if debugDecoder {
-				println("starting frame decoder")
-			}
-
-			// This goroutine will forward history between frames.
-			frame.frameDone.Add(1)
-			frame.initAsync()
-
-			go frame.startDecoder(stream.output)
-		decodeFrame:
-			// Go through all blocks of the frame.
-			for {
-				dec := <-d.decoders
-				select {
-				case <-stream.cancel:
-					if !frame.sendErr(dec, io.EOF) {
-						// To not let the decoder dangle, send it back.
-						stream.output <- decodeOutput{d: dec}
+			seqExecute <- block
+		}
+		close(seqExecute)
+	}()
+
+	var wg sync.WaitGroup
+	wg.Add(1)
+
+	// Async 3: Execute sequences...
+	frameHistCache := d.frame.history.b
+	go func() {
+		var hist history
+		var decodedFrame uint64
+		var fcs uint64
+		var hasErr bool
+		for block := range seqExecute {
+			out := decodeOutput{err: block.err, d: block}
+			if block.err != nil || hasErr {
+				hasErr = true
+				output <- out
+				continue
+			}
+			if block.async.newHist != nil {
+				if debugDecoder {
+					println("Async 3: new history")
+				}
+				hist.windowSize = block.async.newHist.windowSize
+				hist.allocFrameBuffer = block.async.newHist.allocFrameBuffer
+				if block.async.newHist.dict != nil {
+					hist.setDict(block.async.newHist.dict)
+				}
+
+				if cap(hist.b) < hist.allocFrameBuffer {
+					if cap(frameHistCache) >= hist.allocFrameBuffer {
+						hist.b = frameHistCache
+					} else {
+						hist.b = make([]byte, 0, hist.allocFrameBuffer)
+						println("Alloc history sized", hist.allocFrameBuffer)
+					}
+				}
+				hist.b = hist.b[:0]
+				fcs = block.async.fcs
+				decodedFrame = 0
+			}
+			do := decodeOutput{err: block.err, d: block}
+			switch block.Type {
+			case blockTypeRLE:
+				if debugDecoder {
+					println("add rle block length:", block.RLESize)
+				}
+
+				if cap(block.dst) < int(block.RLESize) {
+					if block.lowMem {
+						block.dst = make([]byte, block.RLESize)
+					} else {
+						block.dst = make([]byte, maxBlockSize)
 					}
-					break decodeStream
-				default:
 				}
-				err := frame.next(dec)
-				switch err {
-				case io.EOF:
-					// End of current frame, no error
-					println("EOF on next block")
-					break decodeFrame
-				case nil:
-					continue
-				default:
-					println("block decoder returned", err)
-					break decodeStream
+				block.dst = block.dst[:block.RLESize]
+				v := block.data[0]
+				for i := range block.dst {
+					block.dst[i] = v
+				}
+				hist.append(block.dst)
+				do.b = block.dst
+			case blockTypeRaw:
+				if debugDecoder {
+					println("add raw block length:", len(block.data))
+				}
+				hist.append(block.data)
+				do.b = block.data
+			case blockTypeCompressed:
+				if debugDecoder {
+					println("execute with history length:", len(hist.b), "window:", hist.windowSize)
+				}
+				hist.decoders.seqSize = block.async.seqSize
+				hist.decoders.literals = block.async.literals
+				do.err = block.executeSequences(&hist)
+				hasErr = do.err != nil
+				if debugDecoder && hasErr {
+					println("executeSequences returned:", do.err)
+				}
+				do.b = block.dst
+			}
+			if !hasErr {
+				decodedFrame += uint64(len(do.b))
+				if decodedFrame > fcs {
+					println("fcs exceeded", block.Last, fcs, decodedFrame)
+					do.err = ErrFrameSizeExceeded
+					hasErr = true
+				} else if block.Last && fcs != fcsUnknown && decodedFrame != fcs {
+					do.err = ErrFrameSizeMismatch
+					hasErr = true
+				} else {
+					if debugDecoder {
+						println("fcs ok", block.Last, fcs, decodedFrame)
+					}
 				}
 			}
-			// All blocks have started decoding, check if there are more frames.
-			println("waiting for done")
-			frame.frameDone.Wait()
-			println("done waiting...")
+			output <- do
+		}
+		close(output)
+		frameHistCache = hist.b
+		wg.Done()
+		if debugDecoder {
+			println("decoder goroutines finished")
+		}
+	}()
+
+decodeStream:
+	for {
+		frame := d.frame
+		if debugDecoder {
+			println("New frame...")
+		}
+		var historySent bool
+		frame.history.reset()
+		err := frame.reset(&br)
+		if debugDecoder && err != nil {
+			println("Frame decoder returned", err)
+		}
+		if err == nil && frame.DictionaryID != nil {
+			dict, ok := d.dicts[*frame.DictionaryID]
+			if !ok {
+				err = ErrUnknownDictionary
+			} else {
+				frame.history.setDict(&dict)
+			}
+		}
+		if err == nil && d.frame.WindowSize > d.o.maxWindowSize {
+			err = ErrDecoderSizeExceeded
+		}
+		if err != nil {
+			select {
+			case <-ctx.Done():
+			case dec := <-d.decoders:
+				dec.sendErr(err)
+				seqPrepare <- dec
+			}
+			break decodeStream
+		}
+
+		// Go through all blocks of the frame.
+		for {
+			var dec *blockDec
+			select {
+			case <-ctx.Done():
+				break decodeStream
+			case dec = <-d.decoders:
+				// Once we have a decoder, we MUST return it.
+			}
+			err := frame.next(dec)
+			if !historySent {
+				h := frame.history
+				if debugDecoder {
+					println("Alloc History:", h.allocFrameBuffer)
+				}
+				dec.async.newHist = &h
+				dec.async.fcs = frame.FrameContentSize
+				historySent = true
+			} else {
+				dec.async.newHist = nil
+			}
+			if debugDecoder && err != nil {
+				println("next block returned error:", err)
+			}
+			dec.err = err
+			dec.checkCRC = nil
+			if dec.Last && frame.HasCheckSum && err == nil {
+				crc, err := frame.rawInput.readSmall(4)
+				if err != nil {
+					println("CRC missing?", err)
+					dec.err = err
+				}
+				var tmp [4]byte
+				copy(tmp[:], crc)
+				dec.checkCRC = tmp[:]
+				if debugDecoder {
+					println("found crc to check:", dec.checkCRC)
+				}
+			}
+			err = dec.err
+			last := dec.Last
+			seqPrepare <- dec
+			if err != nil {
+				break decodeStream
+			}
+			if last {
+				break
+			}
 		}
-		frame.frameDone.Wait()
-		println("Sending EOS")
-		stream.output <- decodeOutput{err: errEndOfStream}
 	}
+	close(seqPrepare)
+	wg.Wait()
+	d.frame.history.b = frameHistCache
 }
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder_options.go b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
index 95cc9b8b81f..fd05c9bb012 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
@@ -28,6 +28,9 @@ func (o *decoderOptions) setDefault() {
 		concurrent:    runtime.GOMAXPROCS(0),
 		maxWindowSize: MaxWindowSize,
 	}
+	if o.concurrent > 4 {
+		o.concurrent = 4
+	}
 	o.maxDecodedSize = 1 << 63
 }
 
@@ -37,16 +40,25 @@ func WithDecoderLowmem(b bool) DOption {
 	return func(o *decoderOptions) error { o.lowMem = b; return nil }
 }
 
-// WithDecoderConcurrency will set the concurrency,
-// meaning the maximum number of decoders to run concurrently.
-// The value supplied must be at least 1.
-// By default this will be set to GOMAXPROCS.
+// WithDecoderConcurrency sets the number of created decoders.
+// When decoding block with DecodeAll, this will limit the number
+// of possible concurrently running decodes.
+// When decoding streams, this will limit the number of
+// inflight blocks.
+// When decoding streams and setting maximum to 1,
+// no async decoding will be done.
+// When a value of 0 is provided GOMAXPROCS will be used.
+// By default this will be set to 4 or GOMAXPROCS, whatever is lower.
 func WithDecoderConcurrency(n int) DOption {
 	return func(o *decoderOptions) error {
-		if n <= 0 {
+		if n < 0 {
 			return errors.New("concurrency must be at least 1")
 		}
-		o.concurrent = n
+		if n == 0 {
+			o.concurrent = runtime.GOMAXPROCS(0)
+		} else {
+			o.concurrent = n
+		}
 		return nil
 	}
 }
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_base.go b/vendor/github.com/klauspost/compress/zstd/enc_base.go
index 295cd602a42..15ae8ee8077 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_base.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_base.go
@@ -108,11 +108,6 @@ func (e *fastBase) UseBlock(enc *blockEnc) {
 	e.blk = enc
 }
 
-func (e *fastBase) matchlenNoHist(s, t int32, src []byte) int32 {
-	// Extend the match to be as long as possible.
-	return int32(matchLen(src[s:], src[t:]))
-}
-
 func (e *fastBase) matchlen(s, t int32, src []byte) int32 {
 	if debugAsserts {
 		if s < 0 {
@@ -131,9 +126,24 @@ func (e *fastBase) matchlen(s, t int32, src []byte) int32 {
 			panic(fmt.Sprintf("len(src)-s (%d) > maxCompressedBlockSize (%d)", len(src)-int(s), maxCompressedBlockSize))
 		}
 	}
+	a := src[s:]
+	b := src[t:]
+	b = b[:len(a)]
+	end := int32((len(a) >> 3) << 3)
+	for i := int32(0); i < end; i += 8 {
+		if diff := load6432(a, i) ^ load6432(b, i); diff != 0 {
+			return i + int32(bits.TrailingZeros64(diff)>>3)
+		}
+	}
 
-	// Extend the match to be as long as possible.
-	return int32(matchLen(src[s:], src[t:]))
+	a = a[end:]
+	b = b[end:]
+	for i := range a {
+		if a[i] != b[i] {
+			return int32(i) + end
+		}
+	}
+	return int32(len(a)) + end
 }
 
 // Reset the encoding table.
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_fast.go b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
index f2502629bc5..f51ab529a0b 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_fast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
@@ -6,8 +6,6 @@ package zstd
 
 import (
 	"fmt"
-	"math"
-	"math/bits"
 )
 
 const (
@@ -87,7 +85,7 @@ func (e *fastEncoder) Encode(blk *blockEnc, src []byte) {
 	// TEMPLATE
 	const hashLog = tableBits
 	// seems global, but would be nice to tweak.
-	const kSearchStrength = 7
+	const kSearchStrength = 6
 
 	// nextEmit is where in src the next emitLiteral should start from.
 	nextEmit := s
@@ -136,20 +134,7 @@ encodeLoop:
 				// Consider history as well.
 				var seq seq
 				var length int32
-				// length = 4 + e.matchlen(s+6, repIndex+4, src)
-				{
-					a := src[s+6:]
-					b := src[repIndex+4:]
-					endI := len(a) & (math.MaxInt32 - 7)
-					length = int32(endI) + 4
-					for i := 0; i < endI; i += 8 {
-						if diff := load64(a, i) ^ load64(b, i); diff != 0 {
-							length = int32(i+bits.TrailingZeros64(diff)>>3) + 4
-							break
-						}
-					}
-				}
-
+				length = 4 + e.matchlen(s+6, repIndex+4, src)
 				seq.matchLen = uint32(length - zstdMinMatch)
 
 				// We might be able to match backwards.
@@ -236,20 +221,7 @@ encodeLoop:
 		}
 
 		// Extend the 4-byte match as long as possible.
-		//l := e.matchlen(s+4, t+4, src) + 4
-		var l int32
-		{
-			a := src[s+4:]
-			b := src[t+4:]
-			endI := len(a) & (math.MaxInt32 - 7)
-			l = int32(endI) + 4
-			for i := 0; i < endI; i += 8 {
-				if diff := load64(a, i) ^ load64(b, i); diff != 0 {
-					l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
-					break
-				}
-			}
-		}
+		l := e.matchlen(s+4, t+4, src) + 4
 
 		// Extend backwards
 		tMin := s - e.maxMatchOff
@@ -286,20 +258,7 @@ encodeLoop:
 		if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) {
 			// We have at least 4 byte match.
 			// No need to check backwards. We come straight from a match
-			//l := 4 + e.matchlen(s+4, o2+4, src)
-			var l int32
-			{
-				a := src[s+4:]
-				b := src[o2+4:]
-				endI := len(a) & (math.MaxInt32 - 7)
-				l = int32(endI) + 4
-				for i := 0; i < endI; i += 8 {
-					if diff := load64(a, i) ^ load64(b, i); diff != 0 {
-						l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
-						break
-					}
-				}
-			}
+			l := 4 + e.matchlen(s+4, o2+4, src)
 
 			// Store this, since we have it.
 			nextHash := hashLen(cv, hashLog, tableFastHashLen)
@@ -375,7 +334,7 @@ func (e *fastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
 	// TEMPLATE
 	const hashLog = tableBits
 	// seems global, but would be nice to tweak.
-	const kSearchStrength = 8
+	const kSearchStrength = 6
 
 	// nextEmit is where in src the next emitLiteral should start from.
 	nextEmit := s
@@ -418,21 +377,7 @@ encodeLoop:
 			if len(blk.sequences) > 2 && load3232(src, repIndex) == uint32(cv>>16) {
 				// Consider history as well.
 				var seq seq
-				// length := 4 + e.matchlen(s+6, repIndex+4, src)
-				// length := 4 + int32(matchLen(src[s+6:], src[repIndex+4:]))
-				var length int32
-				{
-					a := src[s+6:]
-					b := src[repIndex+4:]
-					endI := len(a) & (math.MaxInt32 - 7)
-					length = int32(endI) + 4
-					for i := 0; i < endI; i += 8 {
-						if diff := load64(a, i) ^ load64(b, i); diff != 0 {
-							length = int32(i+bits.TrailingZeros64(diff)>>3) + 4
-							break
-						}
-					}
-				}
+				length := 4 + e.matchlen(s+6, repIndex+4, src)
 
 				seq.matchLen = uint32(length - zstdMinMatch)
 
@@ -522,21 +467,7 @@ encodeLoop:
 			panic(fmt.Sprintf("t (%d) < 0 ", t))
 		}
 		// Extend the 4-byte match as long as possible.
-		//l := e.matchlenNoHist(s+4, t+4, src) + 4
-		// l := int32(matchLen(src[s+4:], src[t+4:])) + 4
-		var l int32
-		{
-			a := src[s+4:]
-			b := src[t+4:]
-			endI := len(a) & (math.MaxInt32 - 7)
-			l = int32(endI) + 4
-			for i := 0; i < endI; i += 8 {
-				if diff := load64(a, i) ^ load64(b, i); diff != 0 {
-					l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
-					break
-				}
-			}
-		}
+		l := e.matchlen(s+4, t+4, src) + 4
 
 		// Extend backwards
 		tMin := s - e.maxMatchOff
@@ -573,21 +504,7 @@ encodeLoop:
 		if o2 := s - offset2; len(blk.sequences) > 2 && load3232(src, o2) == uint32(cv) {
 			// We have at least 4 byte match.
 			// No need to check backwards. We come straight from a match
-			//l := 4 + e.matchlenNoHist(s+4, o2+4, src)
-			// l := 4 + int32(matchLen(src[s+4:], src[o2+4:]))
-			var l int32
-			{
-				a := src[s+4:]
-				b := src[o2+4:]
-				endI := len(a) & (math.MaxInt32 - 7)
-				l = int32(endI) + 4
-				for i := 0; i < endI; i += 8 {
-					if diff := load64(a, i) ^ load64(b, i); diff != 0 {
-						l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
-						break
-					}
-				}
-			}
+			l := 4 + e.matchlen(s+4, o2+4, src)
 
 			// Store this, since we have it.
 			nextHash := hashLen(cv, hashLog, tableFastHashLen)
@@ -731,19 +648,7 @@ encodeLoop:
 				// Consider history as well.
 				var seq seq
 				var length int32
-				// length = 4 + e.matchlen(s+6, repIndex+4, src)
-				{
-					a := src[s+6:]
-					b := src[repIndex+4:]
-					endI := len(a) & (math.MaxInt32 - 7)
-					length = int32(endI) + 4
-					for i := 0; i < endI; i += 8 {
-						if diff := load64(a, i) ^ load64(b, i); diff != 0 {
-							length = int32(i+bits.TrailingZeros64(diff)>>3) + 4
-							break
-						}
-					}
-				}
+				length = 4 + e.matchlen(s+6, repIndex+4, src)
 
 				seq.matchLen = uint32(length - zstdMinMatch)
 
@@ -831,20 +736,7 @@ encodeLoop:
 		}
 
 		// Extend the 4-byte match as long as possible.
-		//l := e.matchlen(s+4, t+4, src) + 4
-		var l int32
-		{
-			a := src[s+4:]
-			b := src[t+4:]
-			endI := len(a) & (math.MaxInt32 - 7)
-			l = int32(endI) + 4
-			for i := 0; i < endI; i += 8 {
-				if diff := load64(a, i) ^ load64(b, i); diff != 0 {
-					l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
-					break
-				}
-			}
-		}
+		l := e.matchlen(s+4, t+4, src) + 4
 
 		// Extend backwards
 		tMin := s - e.maxMatchOff
@@ -881,20 +773,7 @@ encodeLoop:
 		if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) {
 			// We have at least 4 byte match.
 			// No need to check backwards. We come straight from a match
-			//l := 4 + e.matchlen(s+4, o2+4, src)
-			var l int32
-			{
-				a := src[s+4:]
-				b := src[o2+4:]
-				endI := len(a) & (math.MaxInt32 - 7)
-				l = int32(endI) + 4
-				for i := 0; i < endI; i += 8 {
-					if diff := load64(a, i) ^ load64(b, i); diff != 0 {
-						l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
-						break
-					}
-				}
-			}
+			l := 4 + e.matchlen(s+4, o2+4, src)
 
 			// Store this, since we have it.
 			nextHash := hashLen(cv, hashLog, tableFastHashLen)
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder.go b/vendor/github.com/klauspost/compress/zstd/encoder.go
index e6e315969b0..dcc987a7cb6 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder.go
@@ -98,23 +98,25 @@ func (e *Encoder) Reset(w io.Writer) {
 	if cap(s.filling) == 0 {
 		s.filling = make([]byte, 0, e.o.blockSize)
 	}
-	if cap(s.current) == 0 {
-		s.current = make([]byte, 0, e.o.blockSize)
-	}
-	if cap(s.previous) == 0 {
-		s.previous = make([]byte, 0, e.o.blockSize)
+	if e.o.concurrent > 1 {
+		if cap(s.current) == 0 {
+			s.current = make([]byte, 0, e.o.blockSize)
+		}
+		if cap(s.previous) == 0 {
+			s.previous = make([]byte, 0, e.o.blockSize)
+		}
+		s.current = s.current[:0]
+		s.previous = s.previous[:0]
+		if s.writing == nil {
+			s.writing = &blockEnc{lowMem: e.o.lowMem}
+			s.writing.init()
+		}
+		s.writing.initNewEncode()
 	}
 	if s.encoder == nil {
 		s.encoder = e.o.encoder()
 	}
-	if s.writing == nil {
-		s.writing = &blockEnc{lowMem: e.o.lowMem}
-		s.writing.init()
-	}
-	s.writing.initNewEncode()
 	s.filling = s.filling[:0]
-	s.current = s.current[:0]
-	s.previous = s.previous[:0]
 	s.encoder.Reset(e.o.dict, false)
 	s.headerWritten = false
 	s.eofWritten = false
@@ -258,6 +260,46 @@ func (e *Encoder) nextBlock(final bool) error {
 		return s.err
 	}
 
+	// SYNC:
+	if e.o.concurrent == 1 {
+		src := s.filling
+		s.nInput += int64(len(s.filling))
+		if debugEncoder {
+			println("Adding sync block,", len(src), "bytes, final:", final)
+		}
+		enc := s.encoder
+		blk := enc.Block()
+		blk.reset(nil)
+		enc.Encode(blk, src)
+		blk.last = final
+		if final {
+			s.eofWritten = true
+		}
+
+		err := errIncompressible
+		// If we got the exact same number of literals as input,
+		// assume the literals cannot be compressed.
+		if len(src) != len(blk.literals) || len(src) != e.o.blockSize {
+			err = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy)
+		}
+		switch err {
+		case errIncompressible:
+			if debugEncoder {
+				println("Storing incompressible block as raw")
+			}
+			blk.encodeRaw(src)
+			// In fast mode, we do not transfer offsets, so we don't have to deal with changing the.
+		case nil:
+		default:
+			s.err = err
+			return err
+		}
+		_, s.err = s.w.Write(blk.output)
+		s.nWritten += int64(len(blk.output))
+		s.filling = s.filling[:0]
+		return s.err
+	}
+
 	// Move blocks forward.
 	s.filling, s.current, s.previous = s.previous[:0], s.filling, s.current
 	s.nInput += int64(len(s.current))
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder_options.go b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
index 7d29e1d689e..44d8dbd199a 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
@@ -24,6 +24,7 @@ type encoderOptions struct {
 	allLitEntropy   bool
 	customWindow    bool
 	customALEntropy bool
+	customBlockSize bool
 	lowMem          bool
 	dict            *dict
 }
@@ -33,7 +34,7 @@ func (o *encoderOptions) setDefault() {
 		concurrent:    runtime.GOMAXPROCS(0),
 		crc:           true,
 		single:        nil,
-		blockSize:     1 << 16,
+		blockSize:     maxCompressedBlockSize,
 		windowSize:    8 << 20,
 		level:         SpeedDefault,
 		allLitEntropy: true,
@@ -75,6 +76,7 @@ func WithEncoderCRC(b bool) EOption {
 // WithEncoderConcurrency will set the concurrency,
 // meaning the maximum number of encoders to run concurrently.
 // The value supplied must be at least 1.
+// For streams, setting a value of 1 will disable async compression.
 // By default this will be set to GOMAXPROCS.
 func WithEncoderConcurrency(n int) EOption {
 	return func(o *encoderOptions) error {
@@ -106,6 +108,7 @@ func WithWindowSize(n int) EOption {
 		o.customWindow = true
 		if o.blockSize > o.windowSize {
 			o.blockSize = o.windowSize
+			o.customBlockSize = true
 		}
 		return nil
 	}
@@ -188,10 +191,9 @@ func EncoderLevelFromZstd(level int) EncoderLevel {
 		return SpeedDefault
 	case level >= 6 && level < 10:
 		return SpeedBetterCompression
-	case level >= 10:
+	default:
 		return SpeedBestCompression
 	}
-	return SpeedDefault
 }
 
 // String provides a string representation of the compression level.
@@ -222,6 +224,9 @@ func WithEncoderLevel(l EncoderLevel) EOption {
 			switch o.level {
 			case SpeedFastest:
 				o.windowSize = 4 << 20
+				if !o.customBlockSize {
+					o.blockSize = 1 << 16
+				}
 			case SpeedDefault:
 				o.windowSize = 8 << 20
 			case SpeedBetterCompression:
diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go
index 989c79f8c31..11089d22328 100644
--- a/vendor/github.com/klauspost/compress/zstd/framedec.go
+++ b/vendor/github.com/klauspost/compress/zstd/framedec.go
@@ -8,23 +8,17 @@ import (
 	"bytes"
 	"encoding/hex"
 	"errors"
-	"hash"
 	"io"
-	"sync"
 
 	"github.com/klauspost/compress/zstd/internal/xxhash"
 )
 
 type frameDec struct {
-	o      decoderOptions
-	crc    hash.Hash64
-	offset int64
+	o   decoderOptions
+	crc *xxhash.Digest
 
 	WindowSize uint64
 
-	// In order queue of blocks being decoded.
-	decoding chan *blockDec
-
 	// Frame history passed between blocks
 	history history
 
@@ -34,15 +28,10 @@ type frameDec struct {
 	bBuf byteBuf
 
 	FrameContentSize uint64
-	frameDone        sync.WaitGroup
 
 	DictionaryID  *uint32
 	HasCheckSum   bool
 	SingleSegment bool
-
-	// asyncRunning indicates whether the async routine processes input on 'decoding'.
-	asyncRunningMu sync.Mutex
-	asyncRunning   bool
 }
 
 const (
@@ -208,7 +197,7 @@ func (d *frameDec) reset(br byteBuffer) error {
 	default:
 		fcsSize = 1 << v
 	}
-	d.FrameContentSize = 0
+	d.FrameContentSize = fcsUnknown
 	if fcsSize > 0 {
 		b, err := br.readSmall(fcsSize)
 		if err != nil {
@@ -229,9 +218,10 @@ func (d *frameDec) reset(br byteBuffer) error {
 			d.FrameContentSize = uint64(d1) | (uint64(d2) << 32)
 		}
 		if debugDecoder {
-			println("field size bits:", v, "fcsSize:", fcsSize, "FrameContentSize:", d.FrameContentSize, hex.EncodeToString(b[:fcsSize]), "singleseg:", d.SingleSegment, "window:", d.WindowSize)
+			println("Read FCS:", d.FrameContentSize)
 		}
 	}
+
 	// Move this to shared.
 	d.HasCheckSum = fhd&(1<<2) != 0
 	if d.HasCheckSum {
@@ -264,10 +254,16 @@ func (d *frameDec) reset(br byteBuffer) error {
 	}
 	d.history.windowSize = int(d.WindowSize)
 	if d.o.lowMem && d.history.windowSize < maxBlockSize {
-		d.history.maxSize = d.history.windowSize * 2
+		d.history.allocFrameBuffer = d.history.windowSize * 2
+		// TODO: Maybe use FrameContent size
 	} else {
-		d.history.maxSize = d.history.windowSize + maxBlockSize
+		d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize
 	}
+
+	if debugDecoder {
+		println("Frame: Dict:", d.DictionaryID, "FrameContentSize:", d.FrameContentSize, "singleseg:", d.SingleSegment, "window:", d.WindowSize, "crc:", d.HasCheckSum)
+	}
+
 	// history contains input - maybe we do something
 	d.rawInput = br
 	return nil
@@ -276,49 +272,18 @@ func (d *frameDec) reset(br byteBuffer) error {
 // next will start decoding the next block from stream.
 func (d *frameDec) next(block *blockDec) error {
 	if debugDecoder {
-		printf("decoding new block %p:%p", block, block.data)
+		println("decoding new block")
 	}
 	err := block.reset(d.rawInput, d.WindowSize)
 	if err != nil {
 		println("block error:", err)
 		// Signal the frame decoder we have a problem.
-		d.sendErr(block, err)
+		block.sendErr(err)
 		return err
 	}
-	block.input <- struct{}{}
-	if debugDecoder {
-		println("next block:", block)
-	}
-	d.asyncRunningMu.Lock()
-	defer d.asyncRunningMu.Unlock()
-	if !d.asyncRunning {
-		return nil
-	}
-	if block.Last {
-		// We indicate the frame is done by sending io.EOF
-		d.decoding <- block
-		return io.EOF
-	}
-	d.decoding <- block
 	return nil
 }
 
-// sendEOF will queue an error block on the frame.
-// This will cause the frame decoder to return when it encounters the block.
-// Returns true if the decoder was added.
-func (d *frameDec) sendErr(block *blockDec, err error) bool {
-	d.asyncRunningMu.Lock()
-	defer d.asyncRunningMu.Unlock()
-	if !d.asyncRunning {
-		return false
-	}
-
-	println("sending error", err.Error())
-	block.sendErr(err)
-	d.decoding <- block
-	return true
-}
-
 // checkCRC will check the checksum if the frame has one.
 // Will return ErrCRCMismatch if crc check failed, otherwise nil.
 func (d *frameDec) checkCRC() error {
@@ -340,7 +305,7 @@ func (d *frameDec) checkCRC() error {
 		return err
 	}
 
-	if !bytes.Equal(tmp[:], want) {
+	if !bytes.Equal(tmp[:], want) && !ignoreCRC {
 		if debugDecoder {
 			println("CRC Check Failed:", tmp[:], "!=", want)
 		}
@@ -352,131 +317,13 @@ func (d *frameDec) checkCRC() error {
 	return nil
 }
 
-func (d *frameDec) initAsync() {
-	if !d.o.lowMem && !d.SingleSegment {
-		// set max extra size history to 2MB.
-		d.history.maxSize = d.history.windowSize + maxBlockSize
-	}
-	// re-alloc if more than one extra block size.
-	if d.o.lowMem && cap(d.history.b) > d.history.maxSize+maxBlockSize {
-		d.history.b = make([]byte, 0, d.history.maxSize)
-	}
-	if cap(d.history.b) < d.history.maxSize {
-		d.history.b = make([]byte, 0, d.history.maxSize)
-	}
-	if cap(d.decoding) < d.o.concurrent {
-		d.decoding = make(chan *blockDec, d.o.concurrent)
-	}
-	if debugDecoder {
-		h := d.history
-		printf("history init. len: %d, cap: %d", len(h.b), cap(h.b))
-	}
-	d.asyncRunningMu.Lock()
-	d.asyncRunning = true
-	d.asyncRunningMu.Unlock()
-}
-
-// startDecoder will start decoding blocks and write them to the writer.
-// The decoder will stop as soon as an error occurs or at end of frame.
-// When the frame has finished decoding the *bufio.Reader
-// containing the remaining input will be sent on frameDec.frameDone.
-func (d *frameDec) startDecoder(output chan decodeOutput) {
-	written := int64(0)
-
-	defer func() {
-		d.asyncRunningMu.Lock()
-		d.asyncRunning = false
-		d.asyncRunningMu.Unlock()
-
-		// Drain the currently decoding.
-		d.history.error = true
-	flushdone:
-		for {
-			select {
-			case b := <-d.decoding:
-				b.history <- &d.history
-				output <- <-b.result
-			default:
-				break flushdone
-			}
-		}
-		println("frame decoder done, signalling done")
-		d.frameDone.Done()
-	}()
-	// Get decoder for first block.
-	block := <-d.decoding
-	block.history <- &d.history
-	for {
-		var next *blockDec
-		// Get result
-		r := <-block.result
-		if r.err != nil {
-			println("Result contained error", r.err)
-			output <- r
-			return
-		}
-		if debugDecoder {
-			println("got result, from ", d.offset, "to", d.offset+int64(len(r.b)))
-			d.offset += int64(len(r.b))
-		}
-		if !block.Last {
-			// Send history to next block
-			select {
-			case next = <-d.decoding:
-				if debugDecoder {
-					println("Sending ", len(d.history.b), "bytes as history")
-				}
-				next.history <- &d.history
-			default:
-				// Wait until we have sent the block, so
-				// other decoders can potentially get the decoder.
-				next = nil
-			}
-		}
-
-		// Add checksum, async to decoding.
-		if d.HasCheckSum {
-			n, err := d.crc.Write(r.b)
-			if err != nil {
-				r.err = err
-				if n != len(r.b) {
-					r.err = io.ErrShortWrite
-				}
-				output <- r
-				return
-			}
-		}
-		written += int64(len(r.b))
-		if d.SingleSegment && uint64(written) > d.FrameContentSize {
-			println("runDecoder: single segment and", uint64(written), ">", d.FrameContentSize)
-			r.err = ErrFrameSizeExceeded
-			output <- r
-			return
-		}
-		if block.Last {
-			r.err = d.checkCRC()
-			output <- r
-			return
-		}
-		output <- r
-		if next == nil {
-			// There was no decoder available, we wait for one now that we have sent to the writer.
-			if debugDecoder {
-				println("Sending ", len(d.history.b), " bytes as history")
-			}
-			next = <-d.decoding
-			next.history <- &d.history
-		}
-		block = next
-	}
-}
-
 // runDecoder will create a sync decoder that will decode a block of data.
 func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
 	saved := d.history.b
 
 	// We use the history for output to avoid copying it.
 	d.history.b = dst
+	d.history.ignoreBuffer = len(dst)
 	// Store input length, so we only check new data.
 	crcStart := len(dst)
 	var err error
@@ -489,22 +336,30 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
 			println("next block:", dec)
 		}
 		err = dec.decodeBuf(&d.history)
-		if err != nil || dec.Last {
+		if err != nil {
 			break
 		}
 		if uint64(len(d.history.b)) > d.o.maxDecodedSize {
 			err = ErrDecoderSizeExceeded
 			break
 		}
-		if d.SingleSegment && uint64(len(d.history.b)) > d.o.maxDecodedSize {
-			println("runDecoder: single segment and", uint64(len(d.history.b)), ">", d.o.maxDecodedSize)
+		if uint64(len(d.history.b)-crcStart) > d.FrameContentSize {
+			println("runDecoder: FrameContentSize exceeded", uint64(len(d.history.b)-crcStart), ">", d.FrameContentSize)
 			err = ErrFrameSizeExceeded
 			break
 		}
+		if dec.Last {
+			break
+		}
+		if debugDecoder {
+			println("runDecoder: FrameContentSize", uint64(len(d.history.b)-crcStart), "<=", d.FrameContentSize)
+		}
 	}
 	dst = d.history.b
 	if err == nil {
-		if d.HasCheckSum {
+		if d.FrameContentSize != fcsUnknown && uint64(len(d.history.b)-crcStart) != d.FrameContentSize {
+			err = ErrFrameSizeMismatch
+		} else if d.HasCheckSum {
 			var n int
 			n, err = d.crc.Write(dst[crcStart:])
 			if err == nil {
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
index e6d3d49b39c..bb3d4fd6c31 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
@@ -379,7 +379,7 @@ func (s decSymbol) final() (int, uint8) {
 // This can only be used if no symbols are 0 bits.
 // At least tablelog bits must be available in the bit reader.
 func (s *fseState) nextFast(br *bitReader) (uint32, uint8) {
-	lowBits := uint16(br.getBitsFast(s.state.nbBits()))
+	lowBits := br.get16BitsFast(s.state.nbBits())
 	s.state = s.dt[s.state.newState()+lowBits]
 	return s.state.baseline(), s.state.addBits()
 }
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
index b4757ee3f03..5442061b18d 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
@@ -62,9 +62,8 @@ func (s symbolTransform) String() string {
 // To indicate that you have populated the histogram call HistogramFinished
 // with the value of the highest populated symbol, as well as the number of entries
 // in the most populated entry. These are accepted at face value.
-// The returned slice will always be length 256.
-func (s *fseEncoder) Histogram() []uint32 {
-	return s.count[:]
+func (s *fseEncoder) Histogram() *[256]uint32 {
+	return &s.count
 }
 
 // HistogramFinished can be called to indicate that the histogram has been populated.
diff --git a/vendor/github.com/klauspost/compress/zstd/fuzz.go b/vendor/github.com/klauspost/compress/zstd/fuzz.go
new file mode 100644
index 00000000000..7f2210e0530
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/fuzz.go
@@ -0,0 +1,11 @@
+//go:build ignorecrc
+// +build ignorecrc
+
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+// ignoreCRC can be used for fuzz testing to ignore CRC values...
+const ignoreCRC = true
diff --git a/vendor/github.com/klauspost/compress/zstd/fuzz_none.go b/vendor/github.com/klauspost/compress/zstd/fuzz_none.go
new file mode 100644
index 00000000000..6811c68a893
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/fuzz_none.go
@@ -0,0 +1,11 @@
+//go:build !ignorecrc
+// +build !ignorecrc
+
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+// ignoreCRC can be used for fuzz testing to ignore CRC values...
+const ignoreCRC = false
diff --git a/vendor/github.com/klauspost/compress/zstd/history.go b/vendor/github.com/klauspost/compress/zstd/history.go
index f783e32d251..28b40153cc2 100644
--- a/vendor/github.com/klauspost/compress/zstd/history.go
+++ b/vendor/github.com/klauspost/compress/zstd/history.go
@@ -10,20 +10,31 @@ import (
 
 // history contains the information transferred between blocks.
 type history struct {
-	b             []byte
-	huffTree      *huff0.Scratch
-	recentOffsets [3]int
+	// Literal decompression
+	huffTree *huff0.Scratch
+
+	// Sequence decompression
 	decoders      sequenceDecs
-	windowSize    int
-	maxSize       int
-	error         bool
-	dict          *dict
+	recentOffsets [3]int
+
+	// History buffer...
+	b []byte
+
+	// ignoreBuffer is meant to ignore a number of bytes
+	// when checking for matches in history
+	ignoreBuffer int
+
+	windowSize       int
+	allocFrameBuffer int // needed?
+	error            bool
+	dict             *dict
 }
 
 // reset will reset the history to initial state of a frame.
 // The history must already have been initialized to the desired size.
 func (h *history) reset() {
 	h.b = h.b[:0]
+	h.ignoreBuffer = 0
 	h.error = false
 	h.recentOffsets = [3]int{1, 4, 8}
 	if f := h.decoders.litLengths.fse; f != nil && !f.preDefined {
@@ -35,7 +46,7 @@ func (h *history) reset() {
 	if f := h.decoders.matchLengths.fse; f != nil && !f.preDefined {
 		fseDecoderPool.Put(f)
 	}
-	h.decoders = sequenceDecs{}
+	h.decoders = sequenceDecs{br: h.decoders.br}
 	if h.huffTree != nil {
 		if h.dict == nil || h.dict.litEnc != h.huffTree {
 			huffDecoderPool.Put(h.huffTree)
@@ -54,6 +65,7 @@ func (h *history) setDict(dict *dict) {
 	h.decoders.litLengths = dict.llDec
 	h.decoders.offsets = dict.ofDec
 	h.decoders.matchLengths = dict.mlDec
+	h.decoders.dict = dict.content
 	h.recentOffsets = dict.offsets
 	h.huffTree = dict.litEnc
 }
@@ -83,6 +95,24 @@ func (h *history) append(b []byte) {
 	copy(h.b[h.windowSize-len(b):], b)
 }
 
+// ensureBlock will ensure there is space for at least one block...
+func (h *history) ensureBlock() {
+	if cap(h.b) < h.allocFrameBuffer {
+		h.b = make([]byte, 0, h.allocFrameBuffer)
+		return
+	}
+
+	avail := cap(h.b) - len(h.b)
+	if avail >= h.windowSize || avail > maxCompressedBlockSize {
+		return
+	}
+	// Move data down so we only have window size left.
+	// Fewer than windowSize bytes of capacity are free in b at this point.
+	discard := len(h.b) - h.windowSize
+	copy(h.b, h.b[discard:])
+	h.b = h.b[:h.windowSize]
+}
+
 // append bytes to history without ever discarding anything.
 func (h *history) appendKeep(b []byte) {
 	h.b = append(h.b, b...)
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
index be8db5bf796..cea17856197 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
@@ -1,6 +1,7 @@
 // +build !appengine
 // +build gc
 // +build !purego
+// +build !noasm
 
 #include "textflag.h"
 
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s
new file mode 100644
index 00000000000..4d64a17d69c
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s
@@ -0,0 +1,186 @@
+// +build gc,!purego,!noasm
+
+#include "textflag.h"
+
+// Register allocation.
+#define digest	R1
+#define h	R2 // Return value.
+#define p	R3 // Input pointer.
+#define len	R4
+#define nblocks	R5 // len / 32.
+#define prime1	R7
+#define prime2	R8
+#define prime3	R9
+#define prime4	R10
+#define prime5	R11
+#define v1	R12
+#define v2	R13
+#define v3	R14
+#define v4	R15
+#define x1	R20
+#define x2	R21
+#define x3	R22
+#define x4	R23
+
+#define round(acc, x) \
+	MADD prime2, acc, x, acc \
+	ROR  $64-31, acc         \
+	MUL  prime1, acc         \
+
+// x = round(0, x).
+#define round0(x) \
+	MUL prime2, x \
+	ROR $64-31, x \
+	MUL prime1, x \
+
+#define mergeRound(x) \
+	round0(x)                 \
+	EOR  x, h                 \
+	MADD h, prime4, prime1, h \
+
+// Update v[1-4] with 32-byte blocks. Assumes len >= 32.
+#define blocksLoop() \
+	LSR     $5, len, nblocks \
+	PCALIGN $16              \
+	loop:                    \
+	LDP.P   32(p), (x1, x2)  \
+	round(v1, x1)            \
+	LDP     -16(p), (x3, x4) \
+	round(v2, x2)            \
+	SUB     $1, nblocks      \
+	round(v3, x3)            \
+	round(v4, x4)            \
+	CBNZ    nblocks, loop    \
+
+// The primes are repeated here to ensure that they're stored
+// in a contiguous array, so we can load them with LDP.
+DATA primes<> +0(SB)/8, $11400714785074694791
+DATA primes<> +8(SB)/8, $14029467366897019727
+DATA primes<>+16(SB)/8, $1609587929392839161
+DATA primes<>+24(SB)/8, $9650029242287828579
+DATA primes<>+32(SB)/8, $2870177450012600261
+GLOBL primes<>(SB), NOPTR+RODATA, $40
+
+// func Sum64(b []byte) uint64
+TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32
+	LDP b_base+0(FP), (p, len)
+
+	LDP  primes<> +0(SB), (prime1, prime2)
+	LDP  primes<>+16(SB), (prime3, prime4)
+	MOVD primes<>+32(SB), prime5
+
+	CMP  $32, len
+	CSEL LO, prime5, ZR, h // if len < 32 { h = prime5 } else { h = 0 }
+	BLO  afterLoop
+
+	ADD  prime1, prime2, v1
+	MOVD prime2, v2
+	MOVD $0, v3
+	NEG  prime1, v4
+
+	blocksLoop()
+
+	ROR $64-1, v1, x1
+	ROR $64-7, v2, x2
+	ADD x1, x2
+	ROR $64-12, v3, x3
+	ROR $64-18, v4, x4
+	ADD x3, x4
+	ADD x2, x4, h
+
+	mergeRound(v1)
+	mergeRound(v2)
+	mergeRound(v3)
+	mergeRound(v4)
+
+afterLoop:
+	ADD len, h
+
+	TBZ   $4, len, try8
+	LDP.P 16(p), (x1, x2)
+
+	round0(x1)
+	ROR  $64-27, h
+	EOR  x1 @> 64-27, h, h
+	MADD h, prime4, prime1, h
+
+	round0(x2)
+	ROR  $64-27, h
+	EOR  x2 @> 64-27, h
+	MADD h, prime4, prime1, h
+
+try8:
+	TBZ    $3, len, try4
+	MOVD.P 8(p), x1
+
+	round0(x1)
+	ROR  $64-27, h
+	EOR  x1 @> 64-27, h
+	MADD h, prime4, prime1, h
+
+try4:
+	TBZ     $2, len, try2
+	MOVWU.P 4(p), x2
+
+	MUL  prime1, x2
+	ROR  $64-23, h
+	EOR  x2 @> 64-23, h
+	MADD h, prime3, prime2, h
+
+try2:
+	TBZ     $1, len, try1
+	MOVHU.P 2(p), x3
+	AND     $255, x3, x1
+	LSR     $8, x3, x2
+
+	MUL prime5, x1
+	ROR $64-11, h
+	EOR x1 @> 64-11, h
+	MUL prime1, h
+
+	MUL prime5, x2
+	ROR $64-11, h
+	EOR x2 @> 64-11, h
+	MUL prime1, h
+
+try1:
+	TBZ   $0, len, end
+	MOVBU (p), x4
+
+	MUL prime5, x4
+	ROR $64-11, h
+	EOR x4 @> 64-11, h
+	MUL prime1, h
+
+end:
+	EOR h >> 33, h
+	MUL prime2, h
+	EOR h >> 29, h
+	MUL prime3, h
+	EOR h >> 32, h
+
+	MOVD h, ret+24(FP)
+	RET
+
+// func writeBlocks(d *Digest, b []byte) int
+//
+// Assumes len(b) >= 32.
+TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40
+	LDP primes<>(SB), (prime1, prime2)
+
+	// Load state. Assume v[1-4] are stored contiguously.
+	MOVD d+0(FP), digest
+	LDP  0(digest), (v1, v2)
+	LDP  16(digest), (v3, v4)
+
+	LDP b_base+8(FP), (p, len)
+
+	blocksLoop()
+
+	// Store updated state.
+	STP (v1, v2), 0(digest)
+	STP (v3, v4), 16(digest)
+
+	BIC  $31, len
+	MOVD len, ret+32(FP)
+	RET
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go
similarity index 51%
rename from vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.go
rename to vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go
index 0ae847f75b0..1a1fac9c261 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.go
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go
@@ -1,5 +1,9 @@
-//go:build !appengine && gc && !purego
-// +build !appengine,gc,!purego
+//go:build (amd64 || arm64) && !appengine && gc && !purego && !noasm
+// +build amd64 arm64
+// +build !appengine
+// +build gc
+// +build !purego
+// +build !noasm
 
 package xxhash
 
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go
index 1f52f296e71..209cb4a999c 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go
@@ -1,5 +1,5 @@
-//go:build !amd64 || appengine || !gc || purego
-// +build !amd64 appengine !gc purego
+//go:build (!amd64 && !arm64) || appengine || !gc || purego || noasm
+// +build !amd64,!arm64 appengine !gc purego noasm
 
 package xxhash
 
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec.go b/vendor/github.com/klauspost/compress/zstd/seqdec.go
index 1dd39e63b7e..819f1461b70 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec.go
@@ -20,6 +20,10 @@ type seq struct {
 	llCode, mlCode, ofCode uint8
 }
 
+type seqVals struct {
+	ll, ml, mo int
+}
+
 func (s seq) String() string {
 	if s.offset <= 3 {
 		if s.offset == 0 {
@@ -61,16 +65,18 @@ type sequenceDecs struct {
 	offsets      sequenceDec
 	matchLengths sequenceDec
 	prevOffset   [3]int
-	hist         []byte
 	dict         []byte
 	literals     []byte
 	out          []byte
+	nSeqs        int
+	br           *bitReader
+	seqSize      int
 	windowSize   int
 	maxBits      uint8
 }
 
 // initialize all 3 decoders from the stream input.
-func (s *sequenceDecs) initialize(br *bitReader, hist *history, literals, out []byte) error {
+func (s *sequenceDecs) initialize(br *bitReader, hist *history, out []byte) error {
 	if err := s.litLengths.init(br); err != nil {
 		return errors.New("litLengths:" + err.Error())
 	}
@@ -80,8 +86,7 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, literals, out []
 	if err := s.matchLengths.init(br); err != nil {
 		return errors.New("matchLengths:" + err.Error())
 	}
-	s.literals = literals
-	s.hist = hist.b
+	s.br = br
 	s.prevOffset = hist.recentOffsets
 	s.maxBits = s.litLengths.fse.maxBits + s.offsets.fse.maxBits + s.matchLengths.fse.maxBits
 	s.windowSize = hist.windowSize
@@ -94,11 +99,261 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, literals, out []
 }
 
 // decode sequences from the stream with the provided history.
-func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
+func (s *sequenceDecs) decode(seqs []seqVals) error {
+	br := s.br
+
+	// Grab full sizes tables, to avoid bounds checks.
+	llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
+	llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
+	s.seqSize = 0
+	litRemain := len(s.literals)
+	maxBlockSize := maxCompressedBlockSize
+	if s.windowSize < maxBlockSize {
+		maxBlockSize = s.windowSize
+	}
+	for i := range seqs {
+		var ll, mo, ml int
+		if br.off > 4+((maxOffsetBits+16+16)>>3) {
+			// inlined function:
+			// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
+
+			// Final will not read from stream.
+			var llB, mlB, moB uint8
+			ll, llB = llState.final()
+			ml, mlB = mlState.final()
+			mo, moB = ofState.final()
+
+			// extra bits are stored in reverse order.
+			br.fillFast()
+			mo += br.getBits(moB)
+			if s.maxBits > 32 {
+				br.fillFast()
+			}
+			ml += br.getBits(mlB)
+			ll += br.getBits(llB)
+
+			if moB > 1 {
+				s.prevOffset[2] = s.prevOffset[1]
+				s.prevOffset[1] = s.prevOffset[0]
+				s.prevOffset[0] = mo
+			} else {
+				// mo = s.adjustOffset(mo, ll, moB)
+				// Inlined for rather big speedup
+				if ll == 0 {
+					// There is an exception though, when current sequence's literals_length = 0.
+					// In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
+					// an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
+					mo++
+				}
+
+				if mo == 0 {
+					mo = s.prevOffset[0]
+				} else {
+					var temp int
+					if mo == 3 {
+						temp = s.prevOffset[0] - 1
+					} else {
+						temp = s.prevOffset[mo]
+					}
+
+					if temp == 0 {
+						// 0 is not valid; input is corrupted; force offset to 1
+						println("WARNING: temp was 0")
+						temp = 1
+					}
+
+					if mo != 1 {
+						s.prevOffset[2] = s.prevOffset[1]
+					}
+					s.prevOffset[1] = s.prevOffset[0]
+					s.prevOffset[0] = temp
+					mo = temp
+				}
+			}
+			br.fillFast()
+		} else {
+			if br.overread() {
+				if debugDecoder {
+					printf("reading sequence %d, exceeded available data\n", i)
+				}
+				return io.ErrUnexpectedEOF
+			}
+			ll, mo, ml = s.next(br, llState, mlState, ofState)
+			br.fill()
+		}
+
+		if debugSequences {
+			println("Seq", i, "Litlen:", ll, "mo:", mo, "(abs) ml:", ml)
+		}
+		// Evaluate.
+		// We might be doing this async, so do it early.
+		if mo == 0 && ml > 0 {
+			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
+		}
+		if ml > maxMatchLen {
+			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
+		}
+		s.seqSize += ll + ml
+		if s.seqSize > maxBlockSize {
+			return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+		}
+		litRemain -= ll
+		if litRemain < 0 {
+			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, litRemain+ll)
+		}
+		seqs[i] = seqVals{
+			ll: ll,
+			ml: ml,
+			mo: mo,
+		}
+		if i == len(seqs)-1 {
+			// This is the last sequence, so we shouldn't update state.
+			break
+		}
+
+		// Manually inlined, ~ 5-20% faster
+		// Update all 3 states at once. Approx 20% faster.
+		nBits := llState.nbBits() + mlState.nbBits() + ofState.nbBits()
+		if nBits == 0 {
+			llState = llTable[llState.newState()&maxTableMask]
+			mlState = mlTable[mlState.newState()&maxTableMask]
+			ofState = ofTable[ofState.newState()&maxTableMask]
+		} else {
+			bits := br.get32BitsFast(nBits)
+			lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
+			llState = llTable[(llState.newState()+lowBits)&maxTableMask]
+
+			lowBits = uint16(bits >> (ofState.nbBits() & 31))
+			lowBits &= bitMask[mlState.nbBits()&15]
+			mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask]
+
+			lowBits = uint16(bits) & bitMask[ofState.nbBits()&15]
+			ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask]
+		}
+	}
+	s.seqSize += litRemain
+	if s.seqSize > maxBlockSize {
+		return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+	}
+	err := br.close()
+	if err != nil {
+		printf("Closing sequences: %v, %+v\n", err, *br)
+	}
+	return err
+}
+
+// execute will write the output of the decoded sequences to s.out using the provided history.
+// The sequences must be evaluated (see decode, which also sets s.seqSize) before being sent.
+func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
+	// Ensure we have enough output size...
+	if len(s.out)+s.seqSize > cap(s.out) {
+		addBytes := s.seqSize + len(s.out)
+		s.out = append(s.out, make([]byte, addBytes)...)
+		s.out = s.out[:len(s.out)-addBytes]
+	}
+
+	if debugDecoder {
+		printf("Execute %d seqs with hist %d, dict %d, literals: %d into %d bytes\n", len(seqs), len(hist), len(s.dict), len(s.literals), s.seqSize)
+	}
+
+	var t = len(s.out)
+	out := s.out[:t+s.seqSize]
+
+	for _, seq := range seqs {
+		// Add literals
+		copy(out[t:], s.literals[:seq.ll])
+		t += seq.ll
+		s.literals = s.literals[seq.ll:]
+
+		// Copy from dictionary...
+		if seq.mo > t+len(hist) || seq.mo > s.windowSize {
+			if len(s.dict) == 0 {
+				return fmt.Errorf("match offset (%d) bigger than current history (%d)", seq.mo, t+len(hist))
+			}
+
+			// we may be in dictionary.
+			dictO := len(s.dict) - (seq.mo - (t + len(hist)))
+			if dictO < 0 || dictO >= len(s.dict) {
+				return fmt.Errorf("match offset (%d) bigger than current history+dict (%d)", seq.mo, t+len(hist)+len(s.dict))
+			}
+			end := dictO + seq.ml
+			if end > len(s.dict) {
+				n := len(s.dict) - dictO
+				copy(out[t:], s.dict[dictO:])
+				t += n
+				seq.ml -= n
+			} else {
+				copy(out[t:], s.dict[dictO:end])
+				t += end - dictO
+				continue
+			}
+		}
+
+		// Copy from history.
+		if v := seq.mo - t; v > 0 {
+			// v is the start position in history from end.
+			start := len(hist) - v
+			if seq.ml > v {
+				// Some goes into current block.
+				// Copy remainder of history
+				copy(out[t:], hist[start:])
+				t += v
+				seq.ml -= v
+			} else {
+				copy(out[t:], hist[start:start+seq.ml])
+				t += seq.ml
+				continue
+			}
+		}
+		// We must be in current buffer now
+		if seq.ml > 0 {
+			start := t - seq.mo
+			if seq.ml <= t-start {
+				// No overlap
+				copy(out[t:], out[start:start+seq.ml])
+				t += seq.ml
+				continue
+			} else {
+				// Overlapping copy
+				// Slice the destination from the pre-sized out and copy one byte at a time.
+				src := out[start : start+seq.ml]
+				dst := out[t:]
+				dst = dst[:len(src)]
+				t += len(src)
+				// Destination starts inside the source range, so copy forwards byte by byte.
+				for i := range src {
+					dst[i] = src[i]
+				}
+			}
+		}
+	}
+	// Add final literals
+	copy(out[t:], s.literals)
+	if debugDecoder {
+		t += len(s.literals)
+		if t != len(out) {
+			panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize))
+		}
+	}
+	s.out = out
+
+	return nil
+}
+
+// decodeSync decodes sequences from the stream and immediately writes their output to s.out, using the provided history.
+func (s *sequenceDecs) decodeSync(history *history) error {
+	br := s.br
+	seqs := s.nSeqs
 	startSize := len(s.out)
 	// Grab full sizes tables, to avoid bounds checks.
 	llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
 	llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
+	hist := history.b[history.ignoreBuffer:]
+	out := s.out
+	maxBlockSize := maxCompressedBlockSize
+	if s.windowSize < maxBlockSize {
+		maxBlockSize = s.windowSize
+	}
 
 	for i := seqs - 1; i >= 0; i-- {
 		if br.overread() {
@@ -151,7 +406,7 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 
 					if temp == 0 {
 						// 0 is not valid; input is corrupted; force offset to 1
-						println("temp was 0")
+						println("WARNING: temp was 0")
 						temp = 1
 					}
 
@@ -176,51 +431,49 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 		if ll > len(s.literals) {
 			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, len(s.literals))
 		}
-		size := ll + ml + len(s.out)
+		size := ll + ml + len(out)
 		if size-startSize > maxBlockSize {
-			return fmt.Errorf("output (%d) bigger than max block size", size)
+			return fmt.Errorf("output (%d) bigger than max block size (%d)", size, maxBlockSize)
 		}
-		if size > cap(s.out) {
+		if size > cap(out) {
 			// Not enough size, which can happen under high volume block streaming conditions
 			// but could be if destination slice is too small for sync operations.
 			// over-allocating here can create a large amount of GC pressure so we try to keep
 			// it as contained as possible
-			used := len(s.out) - startSize
+			used := len(out) - startSize
 			addBytes := 256 + ll + ml + used>>2
 			// Clamp to max block size.
 			if used+addBytes > maxBlockSize {
 				addBytes = maxBlockSize - used
 			}
-			s.out = append(s.out, make([]byte, addBytes)...)
-			s.out = s.out[:len(s.out)-addBytes]
+			out = append(out, make([]byte, addBytes)...)
+			out = out[:len(out)-addBytes]
 		}
 		if ml > maxMatchLen {
 			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
 		}
 
 		// Add literals
-		s.out = append(s.out, s.literals[:ll]...)
+		out = append(out, s.literals[:ll]...)
 		s.literals = s.literals[ll:]
-		out := s.out
 
 		if mo == 0 && ml > 0 {
 			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
 		}
 
-		if mo > len(s.out)+len(hist) || mo > s.windowSize {
+		if mo > len(out)+len(hist) || mo > s.windowSize {
 			if len(s.dict) == 0 {
-				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(s.out)+len(hist))
+				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist))
 			}
 
 			// we may be in dictionary.
-			dictO := len(s.dict) - (mo - (len(s.out) + len(hist)))
+			dictO := len(s.dict) - (mo - (len(out) + len(hist)))
 			if dictO < 0 || dictO >= len(s.dict) {
-				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(s.out)+len(hist))
+				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist))
 			}
 			end := dictO + ml
 			if end > len(s.dict) {
 				out = append(out, s.dict[dictO:]...)
-				mo -= len(s.dict) - dictO
 				ml -= len(s.dict) - dictO
 			} else {
 				out = append(out, s.dict[dictO:end]...)
@@ -231,26 +484,25 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 
 		// Copy from history.
 		// TODO: Blocks without history could be made to ignore this completely.
-		if v := mo - len(s.out); v > 0 {
+		if v := mo - len(out); v > 0 {
 			// v is the start position in history from end.
-			start := len(s.hist) - v
+			start := len(hist) - v
 			if ml > v {
 				// Some goes into current block.
 				// Copy remainder of history
-				out = append(out, s.hist[start:]...)
-				mo -= v
+				out = append(out, hist[start:]...)
 				ml -= v
 			} else {
-				out = append(out, s.hist[start:start+ml]...)
+				out = append(out, hist[start:start+ml]...)
 				ml = 0
 			}
 		}
 		// We must be in current buffer now
 		if ml > 0 {
-			start := len(s.out) - mo
-			if ml <= len(s.out)-start {
+			start := len(out) - mo
+			if ml <= len(out)-start {
 				// No overlap
-				out = append(out, s.out[start:start+ml]...)
+				out = append(out, out[start:start+ml]...)
 			} else {
 				// Overlapping copy
 				// Extend destination slice and copy one byte at the time.
@@ -264,7 +516,6 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 				}
 			}
 		}
-		s.out = out
 		if i == 0 {
 			// This is the last sequence, so we shouldn't update state.
 			break
@@ -278,7 +529,7 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 			mlState = mlTable[mlState.newState()&maxTableMask]
 			ofState = ofTable[ofState.newState()&maxTableMask]
 		} else {
-			bits := br.getBitsFast(nBits)
+			bits := br.get32BitsFast(nBits)
 			lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
 			llState = llTable[(llState.newState()+lowBits)&maxTableMask]
 
@@ -291,9 +542,14 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 		}
 	}
 
+	// Check if space for literals
+	if len(s.literals)+len(s.out)-startSize > maxBlockSize {
+		return fmt.Errorf("output (%d) bigger than max block size (%d)", len(s.out), maxBlockSize)
+	}
+
 	// Add final literals
-	s.out = append(s.out, s.literals...)
-	return nil
+	s.out = append(out, s.literals...)
+	return br.close()
 }
 
 // update states, at least 27 bits must be available.
@@ -326,7 +582,7 @@ func (s *sequenceDecs) updateAlt(br *bitReader) {
 		s.offsets.state.state = s.offsets.state.dt[c.newState()]
 		return
 	}
-	bits := br.getBitsFast(nBits)
+	bits := br.get32BitsFast(nBits)
 	lowBits := uint16(bits >> ((c.nbBits() + b.nbBits()) & 31))
 	s.litLengths.state.state = s.litLengths.state.dt[a.newState()+lowBits]
 
@@ -457,36 +713,3 @@ func (s *sequenceDecs) adjustOffset(offset, litLen int, offsetB uint8) int {
 	s.prevOffset[0] = temp
 	return temp
 }
-
-// mergeHistory will merge history.
-func (s *sequenceDecs) mergeHistory(hist *sequenceDecs) (*sequenceDecs, error) {
-	for i := uint(0); i < 3; i++ {
-		var sNew, sHist *sequenceDec
-		switch i {
-		default:
-			// same as "case 0":
-			sNew = &s.litLengths
-			sHist = &hist.litLengths
-		case 1:
-			sNew = &s.offsets
-			sHist = &hist.offsets
-		case 2:
-			sNew = &s.matchLengths
-			sHist = &hist.matchLengths
-		}
-		if sNew.repeat {
-			if sHist.fse == nil {
-				return nil, fmt.Errorf("sequence stream %d, repeat requested, but no history", i)
-			}
-			continue
-		}
-		if sNew.fse == nil {
-			return nil, fmt.Errorf("sequence stream %d, no fse found", i)
-		}
-		if sHist.fse != nil && !sHist.fse.preDefined {
-			fseDecoderPool.Put(sHist.fse)
-		}
-		sHist.fse = sNew.fse
-	}
-	return hist, nil
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/zip.go b/vendor/github.com/klauspost/compress/zstd/zip.go
index 967f29b3120..ffffcbc254e 100644
--- a/vendor/github.com/klauspost/compress/zstd/zip.go
+++ b/vendor/github.com/klauspost/compress/zstd/zip.go
@@ -20,7 +20,7 @@ const ZipMethodPKWare = 20
 
 var zipReaderPool sync.Pool
 
-// newZipReader cannot be used since we would leak goroutines...
+// newZipReader creates a pooled zip decompressor.
 func newZipReader(r io.Reader) io.ReadCloser {
 	dec, ok := zipReaderPool.Get().(*Decoder)
 	if ok {
@@ -44,10 +44,14 @@ func (r *pooledZipReader) Read(p []byte) (n int, err error) {
 	r.mu.Lock()
 	defer r.mu.Unlock()
 	if r.dec == nil {
-		return 0, errors.New("Read after Close")
+		return 0, errors.New("read after close or EOF")
 	}
 	dec, err := r.dec.Read(p)
-
+	if err == io.EOF {
+		err = r.dec.Reset(nil)
+		zipReaderPool.Put(r.dec)
+		r.dec = nil
+	}
 	return dec, err
 }
 
@@ -112,11 +116,5 @@ func ZipCompressor(opts ...EOption) func(w io.Writer) (io.WriteCloser, error) {
 // ZipDecompressor returns a decompressor that can be registered with zip libraries.
 // See ZipCompressor for example.
 func ZipDecompressor() func(r io.Reader) io.ReadCloser {
-	return func(r io.Reader) io.ReadCloser {
-		d, err := NewReader(r, WithDecoderConcurrency(1), WithDecoderLowmem(true))
-		if err != nil {
-			panic(err)
-		}
-		return d.IOReadCloser()
-	}
+	return newZipReader
 }
diff --git a/vendor/github.com/klauspost/compress/zstd/zstd.go b/vendor/github.com/klauspost/compress/zstd/zstd.go
index ef1d49a009c..c1c90b4a072 100644
--- a/vendor/github.com/klauspost/compress/zstd/zstd.go
+++ b/vendor/github.com/klauspost/compress/zstd/zstd.go
@@ -39,6 +39,9 @@ const zstdMinMatch = 3
 // Reset the buffer offset when reaching this.
 const bufferReset = math.MaxInt32 - MaxWindowSize
 
+// fcsUnknown is used for unknown frame content size.
+const fcsUnknown = math.MaxUint64
+
 var (
 	// ErrReservedBlockType is returned when a reserved block type is found.
 	// Typically this indicates wrong or corrupted input.
@@ -52,6 +55,10 @@ var (
 	// Typically returned on invalid input.
 	ErrBlockTooSmall = errors.New("block too small")
 
+	// ErrUnexpectedBlockSize is returned when a block has unexpected size.
+	// Typically returned on invalid input.
+	ErrUnexpectedBlockSize = errors.New("unexpected block size")
+
 	// ErrMagicMismatch is returned when a "magic" number isn't what is expected.
 	// Typically this indicates wrong or corrupted input.
 	ErrMagicMismatch = errors.New("invalid input: magic number mismatch")
@@ -75,6 +82,10 @@ var (
 	// This is only returned if SingleSegment is specified on the frame.
 	ErrFrameSizeExceeded = errors.New("frame size exceeded")
 
+	// ErrFrameSizeMismatch is returned if the stated frame size does not match the expected size.
+	// The synchronous decoder returns this whenever the frame declares a content size and the decoded size differs.
+	ErrFrameSizeMismatch = errors.New("frame size does not match size on stream")
+
 	// ErrCRCMismatch is returned if CRC mismatches.
 	ErrCRCMismatch = errors.New("CRC check failed")
 
diff --git a/vendor/github.com/mattn/go-colorable/README.md b/vendor/github.com/mattn/go-colorable/README.md
index e055952b667..ca0483711c9 100644
--- a/vendor/github.com/mattn/go-colorable/README.md
+++ b/vendor/github.com/mattn/go-colorable/README.md
@@ -1,6 +1,6 @@
 # go-colorable
 
-[![Build Status](https://travis-ci.org/mattn/go-colorable.svg?branch=master)](https://travis-ci.org/mattn/go-colorable)
+[![Build Status](https://github.com/mattn/go-colorable/workflows/test/badge.svg)](https://github.com/mattn/go-colorable/actions?query=workflow%3Atest)
 [![Codecov](https://codecov.io/gh/mattn/go-colorable/branch/master/graph/badge.svg)](https://codecov.io/gh/mattn/go-colorable)
 [![GoDoc](https://godoc.org/github.com/mattn/go-colorable?status.svg)](http://godoc.org/github.com/mattn/go-colorable)
 [![Go Report Card](https://goreportcard.com/badge/mattn/go-colorable)](https://goreportcard.com/report/mattn/go-colorable)
diff --git a/vendor/github.com/mattn/go-colorable/colorable_appengine.go b/vendor/github.com/mattn/go-colorable/colorable_appengine.go
index 1f7806fe16b..416d1bbbf83 100644
--- a/vendor/github.com/mattn/go-colorable/colorable_appengine.go
+++ b/vendor/github.com/mattn/go-colorable/colorable_appengine.go
@@ -1,3 +1,4 @@
+//go:build appengine
 // +build appengine
 
 package colorable
diff --git a/vendor/github.com/mattn/go-colorable/colorable_others.go b/vendor/github.com/mattn/go-colorable/colorable_others.go
index 08cbd1e0fa2..766d94603ac 100644
--- a/vendor/github.com/mattn/go-colorable/colorable_others.go
+++ b/vendor/github.com/mattn/go-colorable/colorable_others.go
@@ -1,5 +1,5 @@
-// +build !windows
-// +build !appengine
+//go:build !windows && !appengine
+// +build !windows,!appengine
 
 package colorable
 
diff --git a/vendor/github.com/mattn/go-colorable/colorable_windows.go b/vendor/github.com/mattn/go-colorable/colorable_windows.go
index 41215d7fc4f..1846ad5ab41 100644
--- a/vendor/github.com/mattn/go-colorable/colorable_windows.go
+++ b/vendor/github.com/mattn/go-colorable/colorable_windows.go
@@ -1,5 +1,5 @@
-// +build windows
-// +build !appengine
+//go:build windows && !appengine
+// +build windows,!appengine
 
 package colorable
 
@@ -452,18 +452,22 @@ func (w *Writer) Write(data []byte) (n int, err error) {
 	} else {
 		er = bytes.NewReader(data)
 	}
-	var bw [1]byte
+	var plaintext bytes.Buffer
 loop:
 	for {
 		c1, err := er.ReadByte()
 		if err != nil {
+			plaintext.WriteTo(w.out)
 			break loop
 		}
 		if c1 != 0x1b {
-			bw[0] = c1
-			w.out.Write(bw[:])
+			plaintext.WriteByte(c1)
 			continue
 		}
+		_, err = plaintext.WriteTo(w.out)
+		if err != nil {
+			break loop
+		}
 		c2, err := er.ReadByte()
 		if err != nil {
 			break loop
diff --git a/vendor/github.com/mattn/go-colorable/noncolorable.go b/vendor/github.com/mattn/go-colorable/noncolorable.go
index 95f2c6be257..05d6f74bf6b 100644
--- a/vendor/github.com/mattn/go-colorable/noncolorable.go
+++ b/vendor/github.com/mattn/go-colorable/noncolorable.go
@@ -18,18 +18,22 @@ func NewNonColorable(w io.Writer) io.Writer {
 // Write writes data on console
 func (w *NonColorable) Write(data []byte) (n int, err error) {
 	er := bytes.NewReader(data)
-	var bw [1]byte
+	var plaintext bytes.Buffer
 loop:
 	for {
 		c1, err := er.ReadByte()
 		if err != nil {
+			plaintext.WriteTo(w.out)
 			break loop
 		}
 		if c1 != 0x1b {
-			bw[0] = c1
-			w.out.Write(bw[:])
+			plaintext.WriteByte(c1)
 			continue
 		}
+		_, err = plaintext.WriteTo(w.out)
+		if err != nil {
+			break loop
+		}
 		c2, err := er.ReadByte()
 		if err != nil {
 			break loop
@@ -38,7 +42,6 @@ loop:
 			continue
 		}
 
-		var buf bytes.Buffer
 		for {
 			c, err := er.ReadByte()
 			if err != nil {
@@ -47,7 +50,6 @@ loop:
 			if ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '@' {
 				break
 			}
-			buf.Write([]byte(string(c)))
 		}
 	}
 
diff --git a/vendor/github.com/mattn/go-isatty/isatty_bsd.go b/vendor/github.com/mattn/go-isatty/isatty_bsd.go
index 711f288085a..39bbcf00f0c 100644
--- a/vendor/github.com/mattn/go-isatty/isatty_bsd.go
+++ b/vendor/github.com/mattn/go-isatty/isatty_bsd.go
@@ -1,3 +1,4 @@
+//go:build (darwin || freebsd || openbsd || netbsd || dragonfly) && !appengine
 // +build darwin freebsd openbsd netbsd dragonfly
 // +build !appengine
 
diff --git a/vendor/github.com/mattn/go-isatty/isatty_others.go b/vendor/github.com/mattn/go-isatty/isatty_others.go
index 3eba4cb34a2..31503226f6c 100644
--- a/vendor/github.com/mattn/go-isatty/isatty_others.go
+++ b/vendor/github.com/mattn/go-isatty/isatty_others.go
@@ -1,3 +1,4 @@
+//go:build appengine || js || nacl || wasm
 // +build appengine js nacl wasm
 
 package isatty
diff --git a/vendor/github.com/mattn/go-isatty/isatty_plan9.go b/vendor/github.com/mattn/go-isatty/isatty_plan9.go
index c5b6e0c084a..bae7f9bb3dc 100644
--- a/vendor/github.com/mattn/go-isatty/isatty_plan9.go
+++ b/vendor/github.com/mattn/go-isatty/isatty_plan9.go
@@ -1,3 +1,4 @@
+//go:build plan9
 // +build plan9
 
 package isatty
diff --git a/vendor/github.com/mattn/go-isatty/isatty_solaris.go b/vendor/github.com/mattn/go-isatty/isatty_solaris.go
index 30106707834..0c3acf2dc28 100644
--- a/vendor/github.com/mattn/go-isatty/isatty_solaris.go
+++ b/vendor/github.com/mattn/go-isatty/isatty_solaris.go
@@ -1,5 +1,5 @@
-// +build solaris
-// +build !appengine
+//go:build solaris && !appengine
+// +build solaris,!appengine
 
 package isatty
 
diff --git a/vendor/github.com/mattn/go-isatty/isatty_tcgets.go b/vendor/github.com/mattn/go-isatty/isatty_tcgets.go
index 4e7b850ecfb..67787657fb2 100644
--- a/vendor/github.com/mattn/go-isatty/isatty_tcgets.go
+++ b/vendor/github.com/mattn/go-isatty/isatty_tcgets.go
@@ -1,3 +1,4 @@
+//go:build (linux || aix || zos) && !appengine
 // +build linux aix zos
 // +build !appengine
 
diff --git a/vendor/github.com/mattn/go-isatty/isatty_windows.go b/vendor/github.com/mattn/go-isatty/isatty_windows.go
index 1fa86915405..8e3c99171bf 100644
--- a/vendor/github.com/mattn/go-isatty/isatty_windows.go
+++ b/vendor/github.com/mattn/go-isatty/isatty_windows.go
@@ -1,5 +1,5 @@
-// +build windows
-// +build !appengine
+//go:build windows && !appengine
+// +build windows,!appengine
 
 package isatty
 
@@ -76,7 +76,7 @@ func isCygwinPipeName(name string) bool {
 }
 
 // getFileNameByHandle use the undocomented ntdll NtQueryObject to get file full name from file handler
-// since GetFileInformationByHandleEx is not avilable under windows Vista and still some old fashion
+// since GetFileInformationByHandleEx is not available under windows Vista and still some old fashion
 // guys are using Windows XP, this is a workaround for those guys, it will also work on system from
 // Windows vista to 10
 // see https://stackoverflow.com/a/18792477 for details
diff --git a/vendor/modules.txt b/vendor/modules.txt
index f11d93bf5cc..db04bbb53a2 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -33,7 +33,7 @@ github.com/dop251/goja/ftoa/internal/fast
 github.com/dop251/goja/parser
 github.com/dop251/goja/token
 github.com/dop251/goja/unistring
-# github.com/fatih/color v1.12.0
+# github.com/fatih/color v1.13.0
 ## explicit; go 1.13
 github.com/fatih/color
 # github.com/go-sourcemap/sourcemap v2.1.4-0.20211119122758-180fcef48034+incompatible
@@ -78,7 +78,7 @@ github.com/jhump/protoreflect/internal/codec
 # github.com/josharian/intern v1.0.0
 ## explicit; go 1.5
 github.com/josharian/intern
-# github.com/klauspost/compress v1.13.6
+# github.com/klauspost/compress v1.15.1
 ## explicit; go 1.15
 github.com/klauspost/compress
 github.com/klauspost/compress/flate
@@ -94,10 +94,10 @@ github.com/mailru/easyjson
 github.com/mailru/easyjson/buffer
 github.com/mailru/easyjson/jlexer
 github.com/mailru/easyjson/jwriter
-# github.com/mattn/go-colorable v0.1.8
+# github.com/mattn/go-colorable v0.1.12
 ## explicit; go 1.13
 github.com/mattn/go-colorable
-# github.com/mattn/go-isatty v0.0.13
+# github.com/mattn/go-isatty v0.0.14
 ## explicit; go 1.12
 github.com/mattn/go-isatty
 # github.com/mccutchen/go-httpbin v1.1.2-0.20190116014521-c5cb2f4802fa