TUN-9467: bump coredns to solve CVE

* TUN-9467: bump coredns to solve CVE
2025-07-28 05:49:57 +00:00 · 2025-06-12 10:46:10 +00:00
parent f8d12c9d39
commit a408612f26
459 changed files with 30077 additions and 16165 deletions
--- a/vendor/github.com/klauspost/compress/flate/deflate.go
+++ b/vendor/github.com/klauspost/compress/flate/deflate.go
@@ -7,6 +7,7 @@ package flate

 import (
 	"encoding/binary"
+	"errors"
 	"fmt"
 	"io"
 	"math"
@@ -90,9 +91,8 @@ type advancedState struct {
 	ii uint16 // position of last match, intended to overflow to reset.

 	// input window: unprocessed data is window[index:windowEnd]
-	index          int
-	estBitsPerByte int
-	hashMatch      [maxMatchLength + minMatchLength]uint32
+	index     int
+	hashMatch [maxMatchLength + minMatchLength]uint32

 	// Input hash chains
 	// hashHead[hashValue] contains the largest inputIndex with the specified hash value
@@ -212,7 +212,7 @@ func (d *compressor) writeBlockSkip(tok *tokens, index int, eof bool) error {
 // Should only be used after a start/reset.
 func (d *compressor) fillWindow(b []byte) {
 	// Do not fill window if we are in store-only or huffman mode.
-	if d.level <= 0 {
+	if d.level <= 0 && d.level > -MinCustomWindowSize {
 		return
 	}
 	if d.fast != nil {
@@ -294,7 +294,6 @@ func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, of
 	}
 	offset = 0

-	cGain := 0
 	if d.chain < 100 {
 		for i := prevHead; tries > 0; tries-- {
 			if wEnd == win[i+length] {
@@ -322,10 +321,14 @@ func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, of
 		return
 	}

+	// Minimum gain to accept a match.
+	cGain := 4
+
 	// Some like it higher (CSV), some like it lower (JSON)
-	const baseCost = 6
+	const baseCost = 3
 	// Base is 4 bytes at with an additional cost.
 	// Matches must be better than this.
+
 	for i := prevHead; tries > 0; tries-- {
 		if wEnd == win[i+length] {
 			n := matchLen(win[i:i+minMatchLook], wPos)
@@ -333,7 +336,7 @@ func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, of
 				// Calculate gain. Estimate
 				newGain := d.h.bitLengthRaw(wPos[:n]) - int(offsetExtraBits[offsetCode(uint32(pos-i))]) - baseCost - int(lengthExtraBits[lengthCodes[(n-3)&255]])

-				//fmt.Println(n, "gain:", newGain, "prev:", cGain, "raw:", d.h.bitLengthRaw(wPos[:n]))
+				//fmt.Println("gain:", newGain, "prev:", cGain, "raw:", d.h.bitLengthRaw(wPos[:n]), "this-len:", n, "prev-len:", length)
 				if newGain > cGain {
 					length = n
 					offset = pos - i
@@ -490,27 +493,103 @@ func (d *compressor) deflateLazy() {
 		}

 		if prevLength >= minMatchLength && s.length <= prevLength {
-			// Check for better match at end...
+			// No better match, but check for better match at end...
 			//
-			// checkOff must be >=2 since we otherwise risk checking s.index
-			// Offset of 2 seems to yield best results.
+			// Skip forward a number of bytes.
+			// Offset of 2 seems to yield best results. 3 is sometimes better.
 			const checkOff = 2
-			prevIndex := s.index - 1
-			if prevIndex+prevLength+checkOff < s.maxInsertIndex {
-				end := lookahead
-				if lookahead > maxMatchLength {
-					end = maxMatchLength
-				}
-				end += prevIndex
-				idx := prevIndex + prevLength - (4 - checkOff)
-				h := hash4(d.window[idx:])
-				ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength + (4 - checkOff)
-				if ch2 > minIndex {
-					length := matchLen(d.window[prevIndex:end], d.window[ch2:])
-					// It seems like a pure length metric is best.
-					if length > prevLength {
-						prevLength = length
-						prevOffset = prevIndex - ch2
+
+			// Check all, except full length
+			if prevLength < maxMatchLength-checkOff {
+				prevIndex := s.index - 1
+				if prevIndex+prevLength < s.maxInsertIndex {
+					end := lookahead
+					if lookahead > maxMatchLength+checkOff {
+						end = maxMatchLength + checkOff
+					}
+					end += prevIndex
+
+					// Hash at match end.
+					h := hash4(d.window[prevIndex+prevLength:])
+					ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength
+					if prevIndex-ch2 != prevOffset && ch2 > minIndex+checkOff {
+						length := matchLen(d.window[prevIndex+checkOff:end], d.window[ch2+checkOff:])
+						// It seems like a pure length metric is best.
+						if length > prevLength {
+							prevLength = length
+							prevOffset = prevIndex - ch2
+
+							// Extend back...
+							for i := checkOff - 1; i >= 0; i-- {
+								if prevLength >= maxMatchLength || d.window[prevIndex+i] != d.window[ch2+i] {
+									// Emit tokens we "owe"
+									for j := 0; j <= i; j++ {
+										d.tokens.AddLiteral(d.window[prevIndex+j])
+										if d.tokens.n == maxFlateBlockTokens {
+											// The block includes the current character
+											if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
+												return
+											}
+											d.tokens.Reset()
+										}
+										s.index++
+										if s.index < s.maxInsertIndex {
+											h := hash4(d.window[s.index:])
+											ch := s.hashHead[h]
+											s.chainHead = int(ch)
+											s.hashPrev[s.index&windowMask] = ch
+											s.hashHead[h] = uint32(s.index + s.hashOffset)
+										}
+									}
+									break
+								} else {
+									prevLength++
+								}
+							}
+						} else if false {
+							// Check one further ahead.
+							// Only rarely better, disabled for now.
+							prevIndex++
+							h := hash4(d.window[prevIndex+prevLength:])
+							ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength
+							if prevIndex-ch2 != prevOffset && ch2 > minIndex+checkOff {
+								length := matchLen(d.window[prevIndex+checkOff:end], d.window[ch2+checkOff:])
+								// It seems like a pure length metric is best.
+								if length > prevLength+checkOff {
+									prevLength = length
+									prevOffset = prevIndex - ch2
+									prevIndex--
+
+									// Extend back...
+									for i := checkOff; i >= 0; i-- {
+										if prevLength >= maxMatchLength || d.window[prevIndex+i] != d.window[ch2+i-1] {
+											// Emit tokens we "owe"
+											for j := 0; j <= i; j++ {
+												d.tokens.AddLiteral(d.window[prevIndex+j])
+												if d.tokens.n == maxFlateBlockTokens {
+													// The block includes the current character
+													if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
+														return
+													}
+													d.tokens.Reset()
+												}
+												s.index++
+												if s.index < s.maxInsertIndex {
+													h := hash4(d.window[s.index:])
+													ch := s.hashHead[h]
+													s.chainHead = int(ch)
+													s.hashPrev[s.index&windowMask] = ch
+													s.hashHead[h] = uint32(s.index + s.hashOffset)
+												}
+											}
+											break
+										} else {
+											prevLength++
+										}
+									}
+								}
+							}
+						}
 					}
 				}
 			}
@@ -755,6 +834,12 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
 		d.initDeflate()
 		d.fill = (*compressor).fillDeflate
 		d.step = (*compressor).deflateLazy
+	case -level >= MinCustomWindowSize && -level <= MaxCustomWindowSize:
+		d.w.logNewTablePenalty = 7
+		d.fast = &fastEncL5Window{maxOffset: int32(-level), cur: maxStoreBlockSize}
+		d.window = make([]byte, maxStoreBlockSize)
+		d.fill = (*compressor).fillBlock
+		d.step = (*compressor).storeFast
 	default:
 		return fmt.Errorf("flate: invalid compression level %d: want value in range [-2, 9]", level)
 	}
@@ -776,7 +861,7 @@ func (d *compressor) reset(w io.Writer) {
 	}
 	switch d.compressionLevel.chain {
 	case 0:
-		// level was NoCompression or ConstantCompresssion.
+		// level was NoCompression or ConstantCompression.
 		d.windowEnd = 0
 	default:
 		s := d.state
@@ -851,6 +936,28 @@ func NewWriterDict(w io.Writer, level int, dict []byte) (*Writer, error) {
 	return zw, err
 }

+// MinCustomWindowSize is the minimum window size that can be sent to NewWriterWindow.
+const MinCustomWindowSize = 32
+
+// MaxCustomWindowSize is the maximum custom window that can be sent to NewWriterWindow.
+const MaxCustomWindowSize = windowSize
+
+// NewWriterWindow returns a new Writer compressing data with a custom window size.
+// windowSize must be from MinCustomWindowSize to MaxCustomWindowSize.
+func NewWriterWindow(w io.Writer, windowSize int) (*Writer, error) {
+	if windowSize < MinCustomWindowSize {
+		return nil, errors.New("flate: requested window size less than MinWindowSize")
+	}
+	if windowSize > MaxCustomWindowSize {
+		return nil, errors.New("flate: requested window size bigger than MaxCustomWindowSize")
+	}
+	var dw Writer
+	if err := dw.d.init(w, -windowSize); err != nil {
+		return nil, err
+	}
+	return &dw, nil
+}
+
 // A Writer takes data written to it and writes the compressed
 // form of that data to an underlying writer (see NewWriter).
 type Writer struct {
--- a/vendor/github.com/klauspost/compress/flate/fast_encoder.go
+++ b/vendor/github.com/klauspost/compress/flate/fast_encoder.go
@@ -6,9 +6,10 @@
 package flate

 import (
-	"encoding/binary"
 	"fmt"
 	"math/bits"
+
+	"github.com/klauspost/compress/internal/le"
 )

 type fastEnc interface {
@@ -59,11 +60,11 @@ const (
 )

 func load3232(b []byte, i int32) uint32 {
-	return binary.LittleEndian.Uint32(b[i:])
+	return le.Load32(b, i)
 }

 func load6432(b []byte, i int32) uint64 {
-	return binary.LittleEndian.Uint64(b[i:])
+	return le.Load64(b, i)
 }

 type tableEntry struct {
@@ -135,8 +136,8 @@ func hashLen(u uint64, length, mls uint8) uint32 {
 // matchlen will return the match length between offsets and t in src.
 // The maximum length returned is maxMatchLength - 4.
 // It is assumed that s > t, that t >=0 and s < len(src).
-func (e *fastGen) matchlen(s, t int32, src []byte) int32 {
-	if debugDecode {
+func (e *fastGen) matchlen(s, t int, src []byte) int32 {
+	if debugDeflate {
 		if t >= s {
 			panic(fmt.Sprint("t >=s:", t, s))
 		}
@@ -150,18 +151,34 @@ func (e *fastGen) matchlen(s, t int32, src []byte) int32 {
 			panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")"))
 		}
 	}
-	s1 := int(s) + maxMatchLength - 4
-	if s1 > len(src) {
-		s1 = len(src)
+	s1 := min(s+maxMatchLength-4, len(src))
+	left := s1 - s
+	n := int32(0)
+	for left >= 8 {
+		diff := le.Load64(src, s) ^ le.Load64(src, t)
+		if diff != 0 {
+			return n + int32(bits.TrailingZeros64(diff)>>3)
+		}
+		s += 8
+		t += 8
+		n += 8
+		left -= 8
 	}

-	// Extend the match to be as long as possible.
-	return int32(matchLen(src[s:s1], src[t:]))
+	a := src[s:s1]
+	b := src[t:]
+	for i := range a {
+		if a[i] != b[i] {
+			break
+		}
+		n++
+	}
+	return n
 }

 // matchlenLong will return the match length between offsets and t in src.
 // It is assumed that s > t, that t >=0 and s < len(src).
-func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 {
+func (e *fastGen) matchlenLong(s, t int, src []byte) int32 {
 	if debugDeflate {
 		if t >= s {
 			panic(fmt.Sprint("t >=s:", t, s))
@@ -177,7 +194,28 @@ func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 {
 		}
 	}
 	// Extend the match to be as long as possible.
-	return int32(matchLen(src[s:], src[t:]))
+	left := len(src) - s
+	n := int32(0)
+	for left >= 8 {
+		diff := le.Load64(src, s) ^ le.Load64(src, t)
+		if diff != 0 {
+			return n + int32(bits.TrailingZeros64(diff)>>3)
+		}
+		s += 8
+		t += 8
+		n += 8
+		left -= 8
+	}
+
+	a := src[s:]
+	b := src[t:]
+	for i := range a {
+		if a[i] != b[i] {
+			break
+		}
+		n++
+	}
+	return n
 }

 // Reset the encoding table.
@@ -192,25 +230,3 @@ func (e *fastGen) Reset() {
 	}
 	e.hist = e.hist[:0]
 }
-
-// matchLen returns the maximum length.
-// 'a' must be the shortest of the two.
-func matchLen(a, b []byte) int {
-	var checked int
-
-	for len(a) >= 8 {
-		if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 {
-			return checked + (bits.TrailingZeros64(diff) >> 3)
-		}
-		checked += 8
-		a = a[8:]
-		b = b[8:]
-	}
-	b = b[:len(a)]
-	for i := range a {
-		if a[i] != b[i] {
-			return i + checked
-		}
-	}
-	return len(a) + checked
-}
--- a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
+++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
@@ -5,10 +5,11 @@
 package flate

 import (
-	"encoding/binary"
 	"fmt"
 	"io"
 	"math"
+
+	"github.com/klauspost/compress/internal/le"
 )

 const (
@@ -34,11 +35,6 @@ const (
 	// Should preferably be a multiple of 6, since
 	// we accumulate 6 bytes between writes to the buffer.
 	bufferFlushSize = 246
-
-	// bufferSize is the actual output byte buffer size.
-	// It must have additional headroom for a flush
-	// which can contain up to 8 bytes.
-	bufferSize = bufferFlushSize + 8
 )

 // Minimum length code that emits bits.
@@ -443,7 +439,7 @@ func (w *huffmanBitWriter) writeOutBits() {
 	n := w.nbytes

 	// We over-write, but faster...
-	binary.LittleEndian.PutUint64(w.bytes[n:], bits)
+	le.Store64(w.bytes[n:], bits)
 	n += 6

 	if n >= bufferFlushSize {
@@ -859,7 +855,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 			bits |= c.code64() << (nbits & 63)
 			nbits += c.len()
 			if nbits >= 48 {
-				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+				le.Store64(w.bytes[nbytes:], bits)
 				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
 				bits >>= 48
 				nbits -= 48
@@ -887,7 +883,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 			bits |= c.code64() << (nbits & 63)
 			nbits += c.len()
 			if nbits >= 48 {
-				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+				le.Store64(w.bytes[nbytes:], bits)
 				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
 				bits >>= 48
 				nbits -= 48
@@ -910,7 +906,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 			bits |= uint64(extraLength) << (nbits & 63)
 			nbits += extraLengthBits
 			if nbits >= 48 {
-				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+				le.Store64(w.bytes[nbytes:], bits)
 				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
 				bits >>= 48
 				nbits -= 48
@@ -936,7 +932,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 			bits |= c.code64() << (nbits & 63)
 			nbits += c.len()
 			if nbits >= 48 {
-				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+				le.Store64(w.bytes[nbytes:], bits)
 				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
 				bits >>= 48
 				nbits -= 48
@@ -958,7 +954,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 			bits |= uint64((offset-(offsetComb>>8))&matchOffsetOnlyMask) << (nbits & 63)
 			nbits += uint8(offsetComb)
 			if nbits >= 48 {
-				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+				le.Store64(w.bytes[nbytes:], bits)
 				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
 				bits >>= 48
 				nbits -= 48
@@ -1112,7 +1108,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 		// We must have at least 48 bits free.
 		if nbits >= 8 {
 			n := nbits >> 3
-			binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+			le.Store64(w.bytes[nbytes:], bits)
 			bits >>= (n * 8) & 63
 			nbits -= n * 8
 			nbytes += n
@@ -1141,7 +1137,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 	// Remaining...
 	for _, t := range input {
 		if nbits >= 48 {
-			binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+			le.Store64(w.bytes[nbytes:], bits)
 			//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
 			bits >>= 48
 			nbits -= 48
--- a/vendor/github.com/klauspost/compress/flate/huffman_sortByFreq.go
+++ b/vendor/github.com/klauspost/compress/flate/huffman_sortByFreq.go
@@ -42,25 +42,6 @@ func quickSortByFreq(data []literalNode, a, b, maxDepth int) {
 	}
 }

-// siftDownByFreq implements the heap property on data[lo, hi).
-// first is an offset into the array where the root of the heap lies.
-func siftDownByFreq(data []literalNode, lo, hi, first int) {
-	root := lo
-	for {
-		child := 2*root + 1
-		if child >= hi {
-			break
-		}
-		if child+1 < hi && (data[first+child].freq == data[first+child+1].freq && data[first+child].literal < data[first+child+1].literal || data[first+child].freq < data[first+child+1].freq) {
-			child++
-		}
-		if data[first+root].freq == data[first+child].freq && data[first+root].literal > data[first+child].literal || data[first+root].freq > data[first+child].freq {
-			return
-		}
-		data[first+root], data[first+child] = data[first+child], data[first+root]
-		root = child
-	}
-}
 func doPivotByFreq(data []literalNode, lo, hi int) (midlo, midhi int) {
 	m := int(uint(lo+hi) >> 1) // Written like this to avoid integer overflow.
 	if hi-lo > 40 {
--- a/vendor/github.com/klauspost/compress/flate/inflate.go
+++ b/vendor/github.com/klauspost/compress/flate/inflate.go
@@ -120,8 +120,9 @@ func (h *huffmanDecoder) init(lengths []int) bool {
 	const sanity = false

 	if h.chunks == nil {
-		h.chunks = &[huffmanNumChunks]uint16{}
+		h.chunks = new([huffmanNumChunks]uint16)
 	}
+
 	if h.maxRead != 0 {
 		*h = huffmanDecoder{chunks: h.chunks, links: h.links}
 	}
@@ -175,6 +176,7 @@ func (h *huffmanDecoder) init(lengths []int) bool {
 	}

 	h.maxRead = min
+
 	chunks := h.chunks[:]
 	for i := range chunks {
 		chunks[i] = 0
@@ -202,8 +204,7 @@ func (h *huffmanDecoder) init(lengths []int) bool {
 			if cap(h.links[off]) < numLinks {
 				h.links[off] = make([]uint16, numLinks)
 			} else {
-				links := h.links[off][:0]
-				h.links[off] = links[:numLinks]
+				h.links[off] = h.links[off][:numLinks]
 			}
 		}
 	} else {
@@ -277,7 +278,7 @@ func (h *huffmanDecoder) init(lengths []int) bool {
 	return true
 }

-// The actual read interface needed by NewReader.
+// Reader is the actual read interface needed by NewReader.
 // If the passed in io.Reader does not also have ReadByte,
 // the NewReader will introduce its own buffering.
 type Reader interface {
@@ -285,6 +286,26 @@ type Reader interface {
 	io.ByteReader
 }

+type step uint8
+
+const (
+	copyData step = iota + 1
+	nextBlock
+	huffmanBytesBuffer
+	huffmanBytesReader
+	huffmanBufioReader
+	huffmanStringsReader
+	huffmanGenericReader
+)
+
+// flushMode tells decompressor when to return data
+type flushMode uint8
+
+const (
+	syncFlush    flushMode = iota // return data after sync flush block
+	partialFlush                  // return data after each block
+)
+
 // Decompress state.
 type decompressor struct {
 	// Input source.
@@ -303,7 +324,7 @@ type decompressor struct {

 	// Next step in the decompression,
 	// and decompression state.
-	step      func(*decompressor)
+	step      step
 	stepState int
 	err       error
 	toRead    []byte
@@ -319,6 +340,8 @@ type decompressor struct {

 	nb    uint
 	final bool
+
+	flushMode flushMode
 }

 func (f *decompressor) nextBlock() {
@@ -342,7 +365,7 @@ func (f *decompressor) nextBlock() {
 		// compressed, fixed Huffman tables
 		f.hl = &fixedHuffmanDecoder
 		f.hd = nil
-		f.huffmanBlockDecoder()()
+		f.huffmanBlockDecoder()
 		if debugDecode {
 			fmt.Println("predefinied huffman block")
 		}
@@ -353,7 +376,7 @@ func (f *decompressor) nextBlock() {
 		}
 		f.hl = &f.h1
 		f.hd = &f.h2
-		f.huffmanBlockDecoder()()
+		f.huffmanBlockDecoder()
 		if debugDecode {
 			fmt.Println("dynamic huffman block")
 		}
@@ -379,14 +402,16 @@ func (f *decompressor) Read(b []byte) (int, error) {
 		if f.err != nil {
 			return 0, f.err
 		}
-		f.step(f)
+
+		f.doStep()
+
 		if f.err != nil && len(f.toRead) == 0 {
 			f.toRead = f.dict.readFlush() // Flush what's left in case of error
 		}
 	}
 }

-// Support the io.WriteTo interface for io.Copy and friends.
+// WriteTo implements the io.WriteTo interface for io.Copy and friends.
 func (f *decompressor) WriteTo(w io.Writer) (int64, error) {
 	total := int64(0)
 	flushed := false
@@ -410,7 +435,7 @@ func (f *decompressor) WriteTo(w io.Writer) (int64, error) {
 			return total, f.err
 		}
 		if f.err == nil {
-			f.step(f)
+			f.doStep()
 		}
 		if len(f.toRead) == 0 && f.err != nil && !flushed {
 			f.toRead = f.dict.readFlush() // Flush what's left in case of error
@@ -603,7 +628,10 @@ func (f *decompressor) dataBlock() {
 	}

 	if n == 0 {
-		f.toRead = f.dict.readFlush()
+		if f.flushMode == syncFlush {
+			f.toRead = f.dict.readFlush()
+		}
+
 		f.finishBlock()
 		return
 	}
@@ -631,7 +659,7 @@ func (f *decompressor) copyData() {

 	if f.dict.availWrite() == 0 || f.copyLen > 0 {
 		f.toRead = f.dict.readFlush()
-		f.step = (*decompressor).copyData
+		f.step = copyData
 		return
 	}
 	f.finishBlock()
@@ -642,9 +670,34 @@ func (f *decompressor) finishBlock() {
 		if f.dict.availRead() > 0 {
 			f.toRead = f.dict.readFlush()
 		}
+
 		f.err = io.EOF
+	} else if f.flushMode == partialFlush && f.dict.availRead() > 0 {
+		f.toRead = f.dict.readFlush()
+	}
+
+	f.step = nextBlock
+}
+
+func (f *decompressor) doStep() {
+	switch f.step {
+	case copyData:
+		f.copyData()
+	case nextBlock:
+		f.nextBlock()
+	case huffmanBytesBuffer:
+		f.huffmanBytesBuffer()
+	case huffmanBytesReader:
+		f.huffmanBytesReader()
+	case huffmanBufioReader:
+		f.huffmanBufioReader()
+	case huffmanStringsReader:
+		f.huffmanStringsReader()
+	case huffmanGenericReader:
+		f.huffmanGenericReader()
+	default:
+		panic("BUG: unexpected step state")
 	}
-	f.step = (*decompressor).nextBlock
 }

 // noEOF returns err, unless err == io.EOF, in which case it returns io.ErrUnexpectedEOF.
@@ -747,12 +800,47 @@ func (f *decompressor) Reset(r io.Reader, dict []byte) error {
 		h1:       f.h1,
 		h2:       f.h2,
 		dict:     f.dict,
-		step:     (*decompressor).nextBlock,
+		step:     nextBlock,
 	}
 	f.dict.init(maxMatchOffset, dict)
 	return nil
 }

+type ReaderOpt func(*decompressor)
+
+// WithPartialBlock tells decompressor to return after each block,
+// so it can read data written with partial flush
+func WithPartialBlock() ReaderOpt {
+	return func(f *decompressor) {
+		f.flushMode = partialFlush
+	}
+}
+
+// WithDict initializes the reader with a preset dictionary
+func WithDict(dict []byte) ReaderOpt {
+	return func(f *decompressor) {
+		f.dict.init(maxMatchOffset, dict)
+	}
+}
+
+// NewReaderOpts returns new reader with provided options
+func NewReaderOpts(r io.Reader, opts ...ReaderOpt) io.ReadCloser {
+	fixedHuffmanDecoderInit()
+
+	var f decompressor
+	f.r = makeReader(r)
+	f.bits = new([maxNumLit + maxNumDist]int)
+	f.codebits = new([numCodes]int)
+	f.step = nextBlock
+	f.dict.init(maxMatchOffset, nil)
+
+	for _, opt := range opts {
+		opt(&f)
+	}
+
+	return &f
+}
+
 // NewReader returns a new ReadCloser that can be used
 // to read the uncompressed version of r.
 // If r does not also implement io.ByteReader,
@@ -762,15 +850,7 @@ func (f *decompressor) Reset(r io.Reader, dict []byte) error {
 //
 // The ReadCloser returned by NewReader also implements Resetter.
 func NewReader(r io.Reader) io.ReadCloser {
-	fixedHuffmanDecoderInit()
-
-	var f decompressor
-	f.r = makeReader(r)
-	f.bits = new([maxNumLit + maxNumDist]int)
-	f.codebits = new([numCodes]int)
-	f.step = (*decompressor).nextBlock
-	f.dict.init(maxMatchOffset, nil)
-	return &f
+	return NewReaderOpts(r)
 }

 // NewReaderDict is like NewReader but initializes the reader
@@ -781,13 +861,5 @@ func NewReader(r io.Reader) io.ReadCloser {
 //
 // The ReadCloser returned by NewReader also implements Resetter.
 func NewReaderDict(r io.Reader, dict []byte) io.ReadCloser {
-	fixedHuffmanDecoderInit()
-
-	var f decompressor
-	f.r = makeReader(r)
-	f.bits = new([maxNumLit + maxNumDist]int)
-	f.codebits = new([numCodes]int)
-	f.step = (*decompressor).nextBlock
-	f.dict.init(maxMatchOffset, dict)
-	return &f
+	return NewReaderOpts(r, WithDict(dict))
 }
--- a/vendor/github.com/klauspost/compress/flate/inflate_gen.go
+++ b/vendor/github.com/klauspost/compress/flate/inflate_gen.go
@@ -85,7 +85,7 @@ readLiteral:
 			dict.writeByte(byte(v))
 			if dict.availWrite() == 0 {
 				f.toRead = dict.readFlush()
-				f.step = (*decompressor).huffmanBytesBuffer
+				f.step = huffmanBytesBuffer
 				f.stepState = stateInit
 				f.b, f.nb = fb, fnb
 				return
@@ -251,7 +251,7 @@ copyHistory:

 		if dict.availWrite() == 0 || f.copyLen > 0 {
 			f.toRead = dict.readFlush()
-			f.step = (*decompressor).huffmanBytesBuffer // We need to continue this work
+			f.step = huffmanBytesBuffer // We need to continue this work
 			f.stepState = stateDict
 			f.b, f.nb = fb, fnb
 			return
@@ -336,7 +336,7 @@ readLiteral:
 			dict.writeByte(byte(v))
 			if dict.availWrite() == 0 {
 				f.toRead = dict.readFlush()
-				f.step = (*decompressor).huffmanBytesReader
+				f.step = huffmanBytesReader
 				f.stepState = stateInit
 				f.b, f.nb = fb, fnb
 				return
@@ -502,7 +502,7 @@ copyHistory:

 		if dict.availWrite() == 0 || f.copyLen > 0 {
 			f.toRead = dict.readFlush()
-			f.step = (*decompressor).huffmanBytesReader // We need to continue this work
+			f.step = huffmanBytesReader // We need to continue this work
 			f.stepState = stateDict
 			f.b, f.nb = fb, fnb
 			return
@@ -587,7 +587,7 @@ readLiteral:
 			dict.writeByte(byte(v))
 			if dict.availWrite() == 0 {
 				f.toRead = dict.readFlush()
-				f.step = (*decompressor).huffmanBufioReader
+				f.step = huffmanBufioReader
 				f.stepState = stateInit
 				f.b, f.nb = fb, fnb
 				return
@@ -753,7 +753,7 @@ copyHistory:

 		if dict.availWrite() == 0 || f.copyLen > 0 {
 			f.toRead = dict.readFlush()
-			f.step = (*decompressor).huffmanBufioReader // We need to continue this work
+			f.step = huffmanBufioReader // We need to continue this work
 			f.stepState = stateDict
 			f.b, f.nb = fb, fnb
 			return
@@ -838,7 +838,7 @@ readLiteral:
 			dict.writeByte(byte(v))
 			if dict.availWrite() == 0 {
 				f.toRead = dict.readFlush()
-				f.step = (*decompressor).huffmanStringsReader
+				f.step = huffmanStringsReader
 				f.stepState = stateInit
 				f.b, f.nb = fb, fnb
 				return
@@ -1004,7 +1004,7 @@ copyHistory:

 		if dict.availWrite() == 0 || f.copyLen > 0 {
 			f.toRead = dict.readFlush()
-			f.step = (*decompressor).huffmanStringsReader // We need to continue this work
+			f.step = huffmanStringsReader // We need to continue this work
 			f.stepState = stateDict
 			f.b, f.nb = fb, fnb
 			return
@@ -1089,7 +1089,7 @@ readLiteral:
 			dict.writeByte(byte(v))
 			if dict.availWrite() == 0 {
 				f.toRead = dict.readFlush()
-				f.step = (*decompressor).huffmanGenericReader
+				f.step = huffmanGenericReader
 				f.stepState = stateInit
 				f.b, f.nb = fb, fnb
 				return
@@ -1255,7 +1255,7 @@ copyHistory:

 		if dict.availWrite() == 0 || f.copyLen > 0 {
 			f.toRead = dict.readFlush()
-			f.step = (*decompressor).huffmanGenericReader // We need to continue this work
+			f.step = huffmanGenericReader // We need to continue this work
 			f.stepState = stateDict
 			f.b, f.nb = fb, fnb
 			return
@@ -1265,19 +1265,19 @@ copyHistory:
 	// Not reached
 }

-func (f *decompressor) huffmanBlockDecoder() func() {
+func (f *decompressor) huffmanBlockDecoder() {
 	switch f.r.(type) {
 	case *bytes.Buffer:
-		return f.huffmanBytesBuffer
+		f.huffmanBytesBuffer()
 	case *bytes.Reader:
-		return f.huffmanBytesReader
+		f.huffmanBytesReader()
 	case *bufio.Reader:
-		return f.huffmanBufioReader
+		f.huffmanBufioReader()
 	case *strings.Reader:
-		return f.huffmanStringsReader
+		f.huffmanStringsReader()
 	case Reader:
-		return f.huffmanGenericReader
+		f.huffmanGenericReader()
 	default:
-		return f.huffmanGenericReader
+		f.huffmanGenericReader()
 	}
 }
--- a/vendor/github.com/klauspost/compress/flate/level1.go
+++ b/vendor/github.com/klauspost/compress/flate/level1.go
@@ -1,9 +1,9 @@
 package flate

 import (
-	"encoding/binary"
 	"fmt"
-	"math/bits"
+
+	"github.com/klauspost/compress/internal/le"
 )

 // fastGen maintains the table for matches,
@@ -77,6 +77,7 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {

 		nextS := s
 		var candidate tableEntry
+		var t int32
 		for {
 			nextHash := hashLen(cv, tableBits, hashBytes)
 			candidate = e.table[nextHash]
@@ -88,9 +89,8 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 			now := load6432(src, nextS)
 			e.table[nextHash] = tableEntry{offset: s + e.cur}
 			nextHash = hashLen(now, tableBits, hashBytes)
-
-			offset := s - (candidate.offset - e.cur)
-			if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) {
+			t = candidate.offset - e.cur
+			if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) {
 				e.table[nextHash] = tableEntry{offset: nextS + e.cur}
 				break
 			}
@@ -103,8 +103,8 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 			now >>= 8
 			e.table[nextHash] = tableEntry{offset: s + e.cur}

-			offset = s - (candidate.offset - e.cur)
-			if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) {
+			t = candidate.offset - e.cur
+			if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) {
 				e.table[nextHash] = tableEntry{offset: nextS + e.cur}
 				break
 			}
@@ -120,36 +120,10 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 			// literal bytes prior to s.

 			// Extend the 4-byte match as long as possible.
-			t := candidate.offset - e.cur
-			var l = int32(4)
-			if false {
-				l = e.matchlenLong(s+4, t+4, src) + 4
-			} else {
-				// inlined:
-				a := src[s+4:]
-				b := src[t+4:]
-				for len(a) >= 8 {
-					if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 {
-						l += int32(bits.TrailingZeros64(diff) >> 3)
-						break
-					}
-					l += 8
-					a = a[8:]
-					b = b[8:]
-				}
-				if len(a) < 8 {
-					b = b[:len(a)]
-					for i := range a {
-						if a[i] != b[i] {
-							break
-						}
-						l++
-					}
-				}
-			}
+			l := e.matchlenLong(int(s+4), int(t+4), src) + 4

 			// Extend backwards
-			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+			for t > 0 && s > nextEmit && le.Load8(src, t-1) == le.Load8(src, s-1) {
 				s--
 				t--
 				l++
@@ -221,8 +195,8 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 			candidate = e.table[currHash]
 			e.table[currHash] = tableEntry{offset: o + 2}

-			offset := s - (candidate.offset - e.cur)
-			if offset > maxMatchOffset || uint32(x) != load3232(src, candidate.offset-e.cur) {
+			t = candidate.offset - e.cur
+			if s-t > maxMatchOffset || uint32(x) != load3232(src, t) {
 				cv = x >> 8
 				s++
 				break
--- a/vendor/github.com/klauspost/compress/flate/level2.go
+++ b/vendor/github.com/klauspost/compress/flate/level2.go
@@ -126,7 +126,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {

 			// Extend the 4-byte match as long as possible.
 			t := candidate.offset - e.cur
-			l := e.matchlenLong(s+4, t+4, src) + 4
+			l := e.matchlenLong(int(s+4), int(t+4), src) + 4

 			// Extend backwards
 			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
--- a/vendor/github.com/klauspost/compress/flate/level3.go
+++ b/vendor/github.com/klauspost/compress/flate/level3.go
@@ -135,7 +135,7 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 			// Extend the 4-byte match as long as possible.
 			//
 			t := candidate.offset - e.cur
-			l := e.matchlenLong(s+4, t+4, src) + 4
+			l := e.matchlenLong(int(s+4), int(t+4), src) + 4

 			// Extend backwards
 			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
--- a/vendor/github.com/klauspost/compress/flate/level4.go
+++ b/vendor/github.com/klauspost/compress/flate/level4.go
@@ -98,19 +98,19 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) {
 			e.bTable[nextHashL] = entry

 			t = lCandidate.offset - e.cur
-			if s-t < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.offset-e.cur) {
+			if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) {
 				// We got a long match. Use that.
 				break
 			}

 			t = sCandidate.offset - e.cur
-			if s-t < maxMatchOffset && uint32(cv) == load3232(src, sCandidate.offset-e.cur) {
+			if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) {
 				// Found a 4 match...
 				lCandidate = e.bTable[hash7(next, tableBits)]

 				// If the next long is a candidate, check if we should use that instead...
-				lOff := nextS - (lCandidate.offset - e.cur)
-				if lOff < maxMatchOffset && load3232(src, lCandidate.offset-e.cur) == uint32(next) {
+				lOff := lCandidate.offset - e.cur
+				if nextS-lOff < maxMatchOffset && load3232(src, lOff) == uint32(next) {
 					l1, l2 := matchLen(src[s+4:], src[t+4:]), matchLen(src[nextS+4:], src[nextS-lOff+4:])
 					if l2 > l1 {
 						s = nextS
@@ -127,7 +127,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) {
 		// them as literal bytes.

 		// Extend the 4-byte match as long as possible.
-		l := e.matchlenLong(s+4, t+4, src) + 4
+		l := e.matchlenLong(int(s+4), int(t+4), src) + 4

 		// Extend backwards
 		for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
--- a/vendor/github.com/klauspost/compress/flate/level5.go
+++ b/vendor/github.com/klauspost/compress/flate/level5.go
@@ -111,14 +111,326 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {

 			t = lCandidate.Cur.offset - e.cur
 			if s-t < maxMatchOffset {
-				if uint32(cv) == load3232(src, lCandidate.Cur.offset-e.cur) {
+				if uint32(cv) == load3232(src, t) {
 					// Store the next match
 					e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
 					eLong := &e.bTable[nextHashL]
 					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur

 					t2 := lCandidate.Prev.offset - e.cur
-					if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) {
+					if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, t2) {
+						l = e.matchlen(int(s+4), int(t+4), src) + 4
+						ml1 := e.matchlen(int(s+4), int(t2+4), src) + 4
+						if ml1 > l {
+							t = t2
+							l = ml1
+							break
+						}
+					}
+					break
+				}
+				t = lCandidate.Prev.offset - e.cur
+				if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) {
+					// Store the next match
+					e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
+					eLong := &e.bTable[nextHashL]
+					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
+					break
+				}
+			}
+
+			t = sCandidate.offset - e.cur
+			if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) {
+				// Found a 4 match...
+				l = e.matchlen(int(s+4), int(t+4), src) + 4
+				lCandidate = e.bTable[nextHashL]
+				// Store the next match
+
+				e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
+				eLong := &e.bTable[nextHashL]
+				eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
+
+				// If the next long is a candidate, use that...
+				t2 := lCandidate.Cur.offset - e.cur
+				if nextS-t2 < maxMatchOffset {
+					if load3232(src, t2) == uint32(next) {
+						ml := e.matchlen(int(nextS+4), int(t2+4), src) + 4
+						if ml > l {
+							t = t2
+							s = nextS
+							l = ml
+							break
+						}
+					}
+					// If the previous long is a candidate, use that...
+					t2 = lCandidate.Prev.offset - e.cur
+					if nextS-t2 < maxMatchOffset && load3232(src, t2) == uint32(next) {
+						ml := e.matchlen(int(nextS+4), int(t2+4), src) + 4
+						if ml > l {
+							t = t2
+							s = nextS
+							l = ml
+							break
+						}
+					}
+				}
+				break
+			}
+			cv = next
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+
+		if l == 0 {
+			// Extend the 4-byte match as long as possible.
+			l = e.matchlenLong(int(s+4), int(t+4), src) + 4
+		} else if l == maxMatchLength {
+			l += e.matchlenLong(int(s+l), int(t+l), src)
+		}
+
+		// Try to locate a better match by checking the end of best match...
+		if sAt := s + l; l < 30 && sAt < sLimit {
+			// Allow some bytes at the beginning to mismatch.
+			// Sweet spot is 2/3 bytes depending on input.
+			// 3 is only a little better when it is but sometimes a lot worse.
+			// The skipped bytes are tested in Extend backwards,
+			// and still picked up as part of the match if they do.
+			const skipBeginning = 2
+			eLong := e.bTable[hash7(load6432(src, sAt), tableBits)].Cur.offset
+			t2 := eLong - e.cur - l + skipBeginning
+			s2 := s + skipBeginning
+			off := s2 - t2
+			if t2 >= 0 && off < maxMatchOffset && off > 0 {
+				if l2 := e.matchlenLong(int(s2), int(t2), src); l2 > l {
+					t = t2
+					l = l2
+					s = s2
+				}
+			}
+		}
+
+		// Extend backwards
+		for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+			s--
+			t--
+			l++
+		}
+		if nextEmit < s {
+			if false {
+				emitLiteral(dst, src[nextEmit:s])
+			} else {
+				for _, v := range src[nextEmit:s] {
+					dst.tokens[dst.n] = token(v)
+					dst.litHist[v]++
+					dst.n++
+				}
+			}
+		}
+		if debugDeflate {
+			if t >= s {
+				panic(fmt.Sprintln("s-t", s, t))
+			}
+			if (s - t) > maxMatchOffset {
+				panic(fmt.Sprintln("mmo", s-t))
+			}
+			if l < baseMatchLength {
+				panic("bml")
+			}
+		}
+
+		dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+		s += l
+		nextEmit = s
+		if nextS >= s {
+			s = nextS + 1
+		}
+
+		if s >= sLimit {
+			goto emitRemainder
+		}
+
+		// Store every 3rd hash in-between.
+		if true {
+			const hashEvery = 3
+			i := s - l + 1
+			if i < s-1 {
+				cv := load6432(src, i)
+				t := tableEntry{offset: i + e.cur}
+				e.table[hashLen(cv, tableBits, hashShortBytes)] = t
+				eLong := &e.bTable[hash7(cv, tableBits)]
+				eLong.Cur, eLong.Prev = t, eLong.Cur
+
+				// Do an long at i+1
+				cv >>= 8
+				t = tableEntry{offset: t.offset + 1}
+				eLong = &e.bTable[hash7(cv, tableBits)]
+				eLong.Cur, eLong.Prev = t, eLong.Cur
+
+				// We only have enough bits for a short entry at i+2
+				cv >>= 8
+				t = tableEntry{offset: t.offset + 1}
+				e.table[hashLen(cv, tableBits, hashShortBytes)] = t
+
+				// Skip one - otherwise we risk hitting 's'
+				i += 4
+				for ; i < s-1; i += hashEvery {
+					cv := load6432(src, i)
+					t := tableEntry{offset: i + e.cur}
+					t2 := tableEntry{offset: t.offset + 1}
+					eLong := &e.bTable[hash7(cv, tableBits)]
+					eLong.Cur, eLong.Prev = t, eLong.Cur
+					e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2
+				}
+			}
+		}
+
+		// We could immediately start working at s now, but to improve
+		// compression we first update the hash table at s-1 and at s.
+		x := load6432(src, s-1)
+		o := e.cur + s - 1
+		prevHashS := hashLen(x, tableBits, hashShortBytes)
+		prevHashL := hash7(x, tableBits)
+		e.table[prevHashS] = tableEntry{offset: o}
+		eLong := &e.bTable[prevHashL]
+		eLong.Cur, eLong.Prev = tableEntry{offset: o}, eLong.Cur
+		cv = x >> 8
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals.
+		if dst.n == 0 {
+			return
+		}
+
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
+
+// fastEncL5Window is a level 5 encoder,
+// but with a custom window size.
+type fastEncL5Window struct {
+	hist      []byte
+	cur       int32
+	maxOffset int32
+	table     [tableSize]tableEntry
+	bTable    [tableSize]tableEntryPrev
+}
+
+func (e *fastEncL5Window) Encode(dst *tokens, src []byte) {
+	const (
+		inputMargin            = 12 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+		hashShortBytes         = 4
+	)
+	maxMatchOffset := e.maxOffset
+	if debugDeflate && e.cur < 0 {
+		panic(fmt.Sprint("e.cur < 0: ", e.cur))
+	}
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.bTable[:] {
+				e.bTable[i] = tableEntryPrev{}
+			}
+			e.cur = maxMatchOffset
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v <= minOff {
+				v = 0
+			} else {
+				v = v - e.cur + maxMatchOffset
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.bTable[:] {
+			v := e.bTable[i]
+			if v.Cur.offset <= minOff {
+				v.Cur.offset = 0
+				v.Prev.offset = 0
+			} else {
+				v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset
+				if v.Prev.offset <= minOff {
+					v.Prev.offset = 0
+				} else {
+					v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset
+				}
+			}
+			e.bTable[i] = v
+		}
+		e.cur = maxMatchOffset
+	}
+
+	s := e.addBlock(src)
+
+	// This check isn't in the Snappy implementation, but there, the caller
+	// instead of the callee handles this case.
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		return
+	}
+
+	// Override src
+	src = e.hist
+	nextEmit := s
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	cv := load6432(src, s)
+	for {
+		const skipLog = 6
+		const doEvery = 1
+
+		nextS := s
+		var l int32
+		var t int32
+		for {
+			nextHashS := hashLen(cv, tableBits, hashShortBytes)
+			nextHashL := hash7(cv, tableBits)
+
+			s = nextS
+			nextS = s + doEvery + (s-nextEmit)>>skipLog
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			// Fetch a short+long candidate
+			sCandidate := e.table[nextHashS]
+			lCandidate := e.bTable[nextHashL]
+			next := load6432(src, nextS)
+			entry := tableEntry{offset: s + e.cur}
+			e.table[nextHashS] = entry
+			eLong := &e.bTable[nextHashL]
+			eLong.Cur, eLong.Prev = entry, eLong.Cur
+
+			nextHashS = hashLen(next, tableBits, hashShortBytes)
+			nextHashL = hash7(next, tableBits)
+
+			t = lCandidate.Cur.offset - e.cur
+			if s-t < maxMatchOffset {
+				if uint32(cv) == load3232(src, t) {
+					// Store the next match
+					e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
+					eLong := &e.bTable[nextHashL]
+					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
+
+					t2 := lCandidate.Prev.offset - e.cur
+					if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, t2) {
 						l = e.matchlen(s+4, t+4, src) + 4
 						ml1 := e.matchlen(s+4, t2+4, src) + 4
 						if ml1 > l {
@@ -130,7 +442,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 					break
 				}
 				t = lCandidate.Prev.offset - e.cur
-				if s-t < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) {
+				if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) {
 					// Store the next match
 					e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
 					eLong := &e.bTable[nextHashL]
@@ -140,7 +452,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 			}

 			t = sCandidate.offset - e.cur
-			if s-t < maxMatchOffset && uint32(cv) == load3232(src, sCandidate.offset-e.cur) {
+			if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) {
 				// Found a 4 match...
 				l = e.matchlen(s+4, t+4, src) + 4
 				lCandidate = e.bTable[nextHashL]
@@ -153,7 +465,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 				// If the next long is a candidate, use that...
 				t2 := lCandidate.Cur.offset - e.cur
 				if nextS-t2 < maxMatchOffset {
-					if load3232(src, lCandidate.Cur.offset-e.cur) == uint32(next) {
+					if load3232(src, t2) == uint32(next) {
 						ml := e.matchlen(nextS+4, t2+4, src) + 4
 						if ml > l {
 							t = t2
@@ -164,7 +476,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 					}
 					// If the previous long is a candidate, use that...
 					t2 = lCandidate.Prev.offset - e.cur
-					if nextS-t2 < maxMatchOffset && load3232(src, lCandidate.Prev.offset-e.cur) == uint32(next) {
+					if nextS-t2 < maxMatchOffset && load3232(src, t2) == uint32(next) {
 						ml := e.matchlen(nextS+4, t2+4, src) + 4
 						if ml > l {
 							t = t2
@@ -308,3 +620,89 @@ emitRemainder:
 		emitLiteral(dst, src[nextEmit:])
 	}
 }
+
+// Reset the encoding table.
+func (e *fastEncL5Window) Reset() {
+	// We keep the same allocs, since we are compressing the same block sizes.
+	if cap(e.hist) < allocHistory {
+		e.hist = make([]byte, 0, allocHistory)
+	}
+
+	// We offset current position so everything will be out of reach.
+	// If we are above the buffer reset it will be cleared anyway since len(hist) == 0.
+	if e.cur <= int32(bufferReset) {
+		e.cur += e.maxOffset + int32(len(e.hist))
+	}
+	e.hist = e.hist[:0]
+}
+
+func (e *fastEncL5Window) addBlock(src []byte) int32 {
+	// check if we have space already
+	maxMatchOffset := e.maxOffset
+
+	if len(e.hist)+len(src) > cap(e.hist) {
+		if cap(e.hist) == 0 {
+			e.hist = make([]byte, 0, allocHistory)
+		} else {
+			if cap(e.hist) < int(maxMatchOffset*2) {
+				panic("unexpected buffer size")
+			}
+			// Move down
+			offset := int32(len(e.hist)) - maxMatchOffset
+			copy(e.hist[0:maxMatchOffset], e.hist[offset:])
+			e.cur += offset
+			e.hist = e.hist[:maxMatchOffset]
+		}
+	}
+	s := int32(len(e.hist))
+	e.hist = append(e.hist, src...)
+	return s
+}
+
+// matchlen will return the match length between offsets and t in src.
+// The maximum length returned is maxMatchLength - 4.
+// It is assumed that s > t, that t >=0 and s < len(src).
+func (e *fastEncL5Window) matchlen(s, t int32, src []byte) int32 {
+	if debugDecode {
+		if t >= s {
+			panic(fmt.Sprint("t >=s:", t, s))
+		}
+		if int(s) >= len(src) {
+			panic(fmt.Sprint("s >= len(src):", s, len(src)))
+		}
+		if t < 0 {
+			panic(fmt.Sprint("t < 0:", t))
+		}
+		if s-t > e.maxOffset {
+			panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")"))
+		}
+	}
+	s1 := int(s) + maxMatchLength - 4
+	if s1 > len(src) {
+		s1 = len(src)
+	}
+
+	// Extend the match to be as long as possible.
+	return int32(matchLen(src[s:s1], src[t:]))
+}
+
+// matchlenLong will return the match length between offsets and t in src.
+// It is assumed that s > t, that t >=0 and s < len(src).
+func (e *fastEncL5Window) matchlenLong(s, t int32, src []byte) int32 {
+	if debugDeflate {
+		if t >= s {
+			panic(fmt.Sprint("t >=s:", t, s))
+		}
+		if int(s) >= len(src) {
+			panic(fmt.Sprint("s >= len(src):", s, len(src)))
+		}
+		if t < 0 {
+			panic(fmt.Sprint("t < 0:", t))
+		}
+		if s-t > e.maxOffset {
+			panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")"))
+		}
+	}
+	// Extend the match to be as long as possible.
+	return int32(matchLen(src[s:], src[t:]))
+}
--- a/vendor/github.com/klauspost/compress/flate/level6.go
+++ b/vendor/github.com/klauspost/compress/flate/level6.go
@@ -113,7 +113,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {

 			t = lCandidate.Cur.offset - e.cur
 			if s-t < maxMatchOffset {
-				if uint32(cv) == load3232(src, lCandidate.Cur.offset-e.cur) {
+				if uint32(cv) == load3232(src, t) {
 					// Long candidate matches at least 4 bytes.

 					// Store the next match
@@ -123,9 +123,9 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {

 					// Check the previous long candidate as well.
 					t2 := lCandidate.Prev.offset - e.cur
-					if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) {
-						l = e.matchlen(s+4, t+4, src) + 4
-						ml1 := e.matchlen(s+4, t2+4, src) + 4
+					if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, t2) {
+						l = e.matchlen(int(s+4), int(t+4), src) + 4
+						ml1 := e.matchlen(int(s+4), int(t2+4), src) + 4
 						if ml1 > l {
 							t = t2
 							l = ml1
@@ -136,7 +136,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 				}
 				// Current value did not match, but check if previous long value does.
 				t = lCandidate.Prev.offset - e.cur
-				if s-t < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) {
+				if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) {
 					// Store the next match
 					e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
 					eLong := &e.bTable[nextHashL]
@@ -146,9 +146,9 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 			}

 			t = sCandidate.offset - e.cur
-			if s-t < maxMatchOffset && uint32(cv) == load3232(src, sCandidate.offset-e.cur) {
+			if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) {
 				// Found a 4 match...
-				l = e.matchlen(s+4, t+4, src) + 4
+				l = e.matchlen(int(s+4), int(t+4), src) + 4

 				// Look up next long candidate (at nextS)
 				lCandidate = e.bTable[nextHashL]
@@ -162,7 +162,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 				const repOff = 1
 				t2 := s - repeat + repOff
 				if load3232(src, t2) == uint32(cv>>(8*repOff)) {
-					ml := e.matchlen(s+4+repOff, t2+4, src) + 4
+					ml := e.matchlen(int(s+4+repOff), int(t2+4), src) + 4
 					if ml > l {
 						t = t2
 						l = ml
@@ -175,8 +175,8 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 				// If the next long is a candidate, use that...
 				t2 = lCandidate.Cur.offset - e.cur
 				if nextS-t2 < maxMatchOffset {
-					if load3232(src, lCandidate.Cur.offset-e.cur) == uint32(next) {
-						ml := e.matchlen(nextS+4, t2+4, src) + 4
+					if load3232(src, t2) == uint32(next) {
+						ml := e.matchlen(int(nextS+4), int(t2+4), src) + 4
 						if ml > l {
 							t = t2
 							s = nextS
@@ -186,8 +186,8 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 					}
 					// If the previous long is a candidate, use that...
 					t2 = lCandidate.Prev.offset - e.cur
-					if nextS-t2 < maxMatchOffset && load3232(src, lCandidate.Prev.offset-e.cur) == uint32(next) {
-						ml := e.matchlen(nextS+4, t2+4, src) + 4
+					if nextS-t2 < maxMatchOffset && load3232(src, t2) == uint32(next) {
+						ml := e.matchlen(int(nextS+4), int(t2+4), src) + 4
 						if ml > l {
 							t = t2
 							s = nextS
@@ -207,9 +207,9 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {

 		// Extend the 4-byte match as long as possible.
 		if l == 0 {
-			l = e.matchlenLong(s+4, t+4, src) + 4
+			l = e.matchlenLong(int(s+4), int(t+4), src) + 4
 		} else if l == maxMatchLength {
-			l += e.matchlenLong(s+l, t+l, src)
+			l += e.matchlenLong(int(s+l), int(t+l), src)
 		}

 		// Try to locate a better match by checking the end-of-match...
@@ -227,7 +227,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 			off := s2 - t2
 			if off < maxMatchOffset {
 				if off > 0 && t2 >= 0 {
-					if l2 := e.matchlenLong(s2, t2, src); l2 > l {
+					if l2 := e.matchlenLong(int(s2), int(t2), src); l2 > l {
 						t = t2
 						l = l2
 						s = s2
@@ -237,7 +237,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 				t2 = eLong.Prev.offset - e.cur - l + skipBeginning
 				off := s2 - t2
 				if off > 0 && off < maxMatchOffset && t2 >= 0 {
-					if l2 := e.matchlenLong(s2, t2, src); l2 > l {
+					if l2 := e.matchlenLong(int(s2), int(t2), src); l2 > l {
 						t = t2
 						l = l2
 						s = s2
--- a/vendor/github.com/klauspost/compress/flate/matchlen_generic.go
+++ b/vendor/github.com/klauspost/compress/flate/matchlen_generic.go
@@ -0,0 +1,34 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+
+package flate
+
+import (
+	"math/bits"
+
+	"github.com/klauspost/compress/internal/le"
+)
+
+// matchLen returns the maximum common prefix length of a and b.
+// a must be the shortest of the two.
+func matchLen(a, b []byte) (n int) {
+	left := len(a)
+	for left >= 8 {
+		diff := le.Load64(a, n) ^ le.Load64(b, n)
+		if diff != 0 {
+			return n + bits.TrailingZeros64(diff)>>3
+		}
+		n += 8
+		left -= 8
+	}
+
+	a = a[n:]
+	b = b[n:]
+	for i := range a {
+		if a[i] != b[i] {
+			break
+		}
+		n++
+	}
+	return n
+}
--- a/vendor/github.com/klauspost/compress/flate/stateless.go
+++ b/vendor/github.com/klauspost/compress/flate/stateless.go
@@ -4,6 +4,8 @@ import (
 	"io"
 	"math"
 	"sync"
+
+	"github.com/klauspost/compress/internal/le"
 )

 const (
@@ -86,11 +88,19 @@ func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error {
 		dict = dict[len(dict)-maxStatelessDict:]
 	}

+	// For subsequent loops, keep shallow dict reference to avoid alloc+copy.
+	var inDict []byte
+
 	for len(in) > 0 {
 		todo := in
-		if len(todo) > maxStatelessBlock-len(dict) {
+		if len(inDict) > 0 {
+			if len(todo) > maxStatelessBlock-maxStatelessDict {
+				todo = todo[:maxStatelessBlock-maxStatelessDict]
+			}
+		} else if len(todo) > maxStatelessBlock-len(dict) {
 			todo = todo[:maxStatelessBlock-len(dict)]
 		}
+		inOrg := in
 		in = in[len(todo):]
 		uncompressed := todo
 		if len(dict) > 0 {
@@ -102,7 +112,11 @@ func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error {
 			todo = combined
 		}
 		// Compress
-		statelessEnc(&dst, todo, int16(len(dict)))
+		if len(inDict) == 0 {
+			statelessEnc(&dst, todo, int16(len(dict)))
+		} else {
+			statelessEnc(&dst, inDict[:maxStatelessDict+len(todo)], maxStatelessDict)
+		}
 		isEof := eof && len(in) == 0

 		if dst.n == 0 {
@@ -119,7 +133,8 @@ func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error {
 		}
 		if len(in) > 0 {
 			// Retain a dict if we have more
-			dict = todo[len(todo)-maxStatelessDict:]
+			inDict = inOrg[len(uncompressed)-maxStatelessDict:]
+			dict = nil
 			dst.Reset()
 		}
 		if bw.err != nil {
@@ -139,18 +154,11 @@ func hashSL(u uint32) uint32 {
 }

 func load3216(b []byte, i int16) uint32 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:4]
-	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+	return le.Load32(b, i)
 }

 func load6416(b []byte, i int16) uint64 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:8]
-	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
-		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+	return le.Load64(b, i)
 }

 func statelessEnc(dst *tokens, src []byte, startAt int16) {