Commit 2e196b15 authored by Klaus Post's avatar Klaus Post Committed by Nigel Tao

compress/flate: level 1 (best speed) match across blocks

This change makes deflate level 1 (best speed) match across
block boundaries. This comes at a small speed penalty,
but improves compression on almost all output.

Sample numbers on various content types:

enwik9:            391052014 ->  382578469 bytes, 77.59 -> 74.28 MB/s
adresser.001:       57269799 ->   47756095 bytes, 287.84 -> 357.86 MB/s
10gb:             5233055166 -> 5198328382 bytes, 105.85 -> 96.99 MB/s
rawstudio-mint14: 3972329211 -> 3927423364 bytes, 100.07 -> 94.22 MB/s
sites:             165556800 ->  163178702 bytes, 72.31 -> 70.15 MB/s
objectfiles:       115962472 ->  111649524 bytes, 132.60 -> 128.05 MB/s
sharnd.out:        200015283 ->  200015283 bytes, 221.50 -> 218.83 MB/s

Change-Id: I62a139e5c06976e803439a4268acede5139b8cfc
Reviewed-on: https://go-review.googlesource.com/31640Reviewed-by: 's avatarJoe Tsai <thebrokentoaster@gmail.com>
Reviewed-by: 's avatarNigel Tao <nigeltao@golang.org>
parent f8a34443
......@@ -84,9 +84,10 @@ type compressor struct {
bulkHasher func([]byte, []uint32)
// compression algorithm
fill func(*compressor, []byte) int // copy data to window
step func(*compressor) // process window
sync bool // requesting flush
fill func(*compressor, []byte) int // copy data to window
step func(*compressor) // process window
sync bool // requesting flush
bestSpeed *deflateFast // Encoder for BestSpeed
// Input hash chains
// hashHead[hashValue] contains the largest inputIndex with the specified hash value
......@@ -346,12 +347,13 @@ func (d *compressor) encSpeed() {
d.err = d.w.err
}
d.windowEnd = 0
d.bestSpeed.reset()
return
}
}
// Encode the block.
d.tokens = encodeBestSpeed(d.tokens[:0], d.window[:d.windowEnd])
d.tokens = d.bestSpeed.encode(d.tokens[:0], d.window[:d.windowEnd])
// If we removed less than 1/16th, Huffman compress the block.
if len(d.tokens) > d.windowEnd-(d.windowEnd>>4) {
......@@ -584,6 +586,7 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
d.window = make([]byte, maxStoreBlockSize)
d.fill = (*compressor).fillStore
d.step = (*compressor).encSpeed
d.bestSpeed = newDeflateFast()
d.tokens = make([]token, maxStoreBlockSize)
case level == DefaultCompression:
level = 6
......@@ -609,6 +612,7 @@ func (d *compressor) reset(w io.Writer) {
case BestSpeed:
d.windowEnd = 0
d.tokens = d.tokens[:0]
d.bestSpeed.reset()
default:
d.chainHead = -1
for i := range d.hashHead {
......
......@@ -490,6 +490,7 @@ func TestWriterReset(t *testing.T) {
w.d.fill, wref.d.fill = nil, nil
w.d.step, wref.d.step = nil, nil
w.d.bulkHasher, wref.d.bulkHasher = nil, nil
w.d.bestSpeed, wref.d.bestSpeed = nil, nil
// hashMatch is always overwritten when used.
copy(w.d.hashMatch[:], wref.d.hashMatch[:])
if len(w.d.tokens) != 0 {
......@@ -681,3 +682,119 @@ func TestWriterPersistentError(t *testing.T) {
}
}
}
func TestBestSpeedMatch(t *testing.T) {
cases := []struct {
previous, current []byte
t, s, want int32
}{{
previous: []byte{0, 0, 0, 1, 2},
current: []byte{3, 4, 5, 0, 1, 2, 3, 4, 5},
t: -3,
s: 3,
want: 6,
}, {
previous: []byte{0, 0, 0, 1, 2},
current: []byte{2, 4, 5, 0, 1, 2, 3, 4, 5},
t: -3,
s: 3,
want: 3,
}, {
previous: []byte{0, 0, 0, 1, 1},
current: []byte{3, 4, 5, 0, 1, 2, 3, 4, 5},
t: -3,
s: 3,
want: 2,
}, {
previous: []byte{0, 0, 0, 1, 2},
current: []byte{2, 2, 2, 2, 1, 2, 3, 4, 5},
t: -1,
s: 0,
want: 4,
}, {
previous: []byte{0, 0, 0, 1, 2, 3, 4, 5, 2, 2},
current: []byte{2, 2, 2, 2, 1, 2, 3, 4, 5},
t: -7,
s: 4,
want: 5,
}, {
previous: []byte{9, 9, 9, 9, 9},
current: []byte{2, 2, 2, 2, 1, 2, 3, 4, 5},
t: -1,
s: 0,
want: 0,
}, {
previous: []byte{9, 9, 9, 9, 9},
current: []byte{9, 2, 2, 2, 1, 2, 3, 4, 5},
t: 0,
s: 1,
want: 0,
}, {
previous: []byte{},
current: []byte{9, 2, 2, 2, 1, 2, 3, 4, 5},
t: -5,
s: 1,
want: 0,
}, {
previous: []byte{},
current: []byte{9, 2, 2, 2, 1, 2, 3, 4, 5},
t: -1,
s: 1,
want: 0,
}, {
previous: []byte{},
current: []byte{2, 2, 2, 2, 1, 2, 3, 4, 5},
t: 0,
s: 1,
want: 3,
}, {
previous: []byte{3, 4, 5},
current: []byte{3, 4, 5},
t: -3,
s: 0,
want: 3,
}, {
previous: make([]byte, 1000),
current: make([]byte, 1000),
t: -1000,
s: 0,
want: maxMatchLength - 4,
}, {
previous: make([]byte, 200),
current: make([]byte, 500),
t: -200,
s: 0,
want: maxMatchLength - 4,
}, {
previous: make([]byte, 200),
current: make([]byte, 500),
t: 0,
s: 1,
want: maxMatchLength - 4,
}, {
previous: make([]byte, maxMatchLength-4),
current: make([]byte, 500),
t: -(maxMatchLength - 4),
s: 0,
want: maxMatchLength - 4,
}, {
previous: make([]byte, 200),
current: make([]byte, 500),
t: -200,
s: 400,
want: 100,
}, {
previous: make([]byte, 10),
current: make([]byte, 500),
t: 200,
s: 400,
want: 100,
}}
for i, c := range cases {
e := deflateFast{prev: c.previous}
got := e.matchLen(c.s, c.t, c.current)
if got != c.want {
t.Errorf("Test %d: match length, want %d, got %d", i, c.want, got)
}
}
}
......@@ -14,12 +14,12 @@ const (
tableShift = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32.
)
func load32(b []byte, i int) uint32 {
func load32(b []byte, i int32) uint32 {
b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
}
func load64(b []byte, i int) uint64 {
func load64(b []byte, i int32) uint64 {
b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
......@@ -38,31 +38,49 @@ const (
minNonLiteralBlockSize = 1 + 1 + inputMargin
)
func encodeBestSpeed(dst []token, src []byte) []token {
type tableEntry struct {
val uint32 // Value at destination
offset int32
}
// deflateFast maintains the table for matches,
// and the previous byte block for cross block matching.
type deflateFast struct {
table [tableSize]tableEntry
prev []byte // Previous block, zero length if unknown.
cur int32 // Current match offset.
}
func newDeflateFast() *deflateFast {
return &deflateFast{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}
}
// encode encodes a block given in src and encodes tokens
// to dst and returns the result.
func (e *deflateFast) encode(dst []token, src []byte) []token {
// Ensure that e.cur doesn't wrap.
if e.cur > 1<<30 {
*e = deflateFast{cur: maxStoreBlockSize, prev: e.prev[:0]}
}
// This check isn't in the Snappy implementation, but there, the caller
// instead of the callee handles this case.
if len(src) < minNonLiteralBlockSize {
e.cur += maxStoreBlockSize
e.prev = e.prev[:0]
return emitLiteral(dst, src)
}
// Initialize the hash table.
//
// The table element type is uint16, as s < sLimit and sLimit < len(src)
// and len(src) <= maxStoreBlockSize and maxStoreBlockSize == 65535.
var table [tableSize]uint16
// sLimit is when to stop looking for offset/length copies. The inputMargin
// lets us use a fast path for emitLiteral in the main loop, while we are
// looking for copies.
sLimit := len(src) - inputMargin
sLimit := int32(len(src) - inputMargin)
// nextEmit is where in src the next emitLiteral should start from.
nextEmit := 0
// The encoded form must start with a literal, as there are no previous
// bytes to copy, so we start looking for hash matches at s == 1.
s := 1
nextHash := hash(load32(src, s))
nextEmit := int32(0)
s := int32(0)
cv := load32(src, s)
nextHash := hash(cv)
for {
// Copied from the C++ snappy implementation:
......@@ -80,10 +98,10 @@ func encodeBestSpeed(dst []token, src []byte) []token {
// The "skip" variable keeps track of how many bytes there are since
// the last match; dividing it by 32 (ie. right-shifting by five) gives
// the number of bytes to move ahead for each iteration.
skip := 32
skip := int32(32)
nextS := s
candidate := 0
var candidate tableEntry
for {
s = nextS
bytesBetweenHashLookups := skip >> 5
......@@ -92,13 +110,19 @@ func encodeBestSpeed(dst []token, src []byte) []token {
if nextS > sLimit {
goto emitRemainder
}
candidate = int(table[nextHash&tableMask])
table[nextHash&tableMask] = uint16(s)
nextHash = hash(load32(src, nextS))
// TODO: < should be <=, and add a test for that.
if s-candidate < maxMatchOffset && load32(src, s) == load32(src, candidate) {
break
candidate = e.table[nextHash&tableMask]
now := load32(src, nextS)
e.table[nextHash&tableMask] = tableEntry{offset: s + e.cur, val: cv}
nextHash = hash(now)
offset := s - (candidate.offset - e.cur)
// TODO: >= should be >, and add a test for that.
if offset >= maxMatchOffset || cv != candidate.val {
// Out of range or not matched.
cv = now
continue
}
break
}
// A 4-byte match has been found. We'll later see if more than 4 bytes
......@@ -117,22 +141,16 @@ func encodeBestSpeed(dst []token, src []byte) []token {
for {
// Invariant: we have a 4-byte match at s, and no need to emit any
// literal bytes prior to s.
base := s
// Extend the 4-byte match as long as possible.
//
// This is an inlined version of Snappy's:
// s = extendMatch(src, candidate+4, s+4)
s += 4
s1 := base + maxMatchLength
if s1 > len(src) {
s1 = len(src)
}
for i := candidate + 4; s < s1 && src[i] == src[s]; i, s = i+1, s+1 {
}
t := candidate.offset - e.cur + 4
l := e.matchLen(s, t, src)
// matchToken is flate's equivalent of Snappy's emitCopy.
dst = append(dst, matchToken(uint32(s-base-baseMatchLength), uint32(base-candidate-baseMatchOffset)))
// matchToken is flate's equivalent of Snappy's emitCopy. (length,offset)
dst = append(dst, matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset)))
s += l
nextEmit = s
if s >= sLimit {
goto emitRemainder
......@@ -145,14 +163,18 @@ func encodeBestSpeed(dst []token, src []byte) []token {
// are faster as one load64 call (with some shifts) instead of
// three load32 calls.
x := load64(src, s-1)
prevHash := hash(uint32(x >> 0))
table[prevHash&tableMask] = uint16(s - 1)
currHash := hash(uint32(x >> 8))
candidate = int(table[currHash&tableMask])
table[currHash&tableMask] = uint16(s)
prevHash := hash(uint32(x))
e.table[prevHash&tableMask] = tableEntry{offset: e.cur + s - 1, val: uint32(x)}
x >>= 8
currHash := hash(uint32(x))
candidate = e.table[currHash&tableMask]
e.table[currHash&tableMask] = tableEntry{offset: e.cur + s, val: uint32(x)}
offset := s - (candidate.offset - e.cur)
// TODO: >= should be >, and add a test for that.
if s-candidate >= maxMatchOffset || uint32(x>>8) != load32(src, candidate) {
nextHash = hash(uint32(x >> 16))
if offset >= maxMatchOffset || uint32(x) != candidate.val {
cv = uint32(x >> 8)
nextHash = hash(cv)
s++
break
}
......@@ -160,15 +182,91 @@ func encodeBestSpeed(dst []token, src []byte) []token {
}
emitRemainder:
if nextEmit < len(src) {
if int(nextEmit) < len(src) {
dst = emitLiteral(dst, src[nextEmit:])
}
e.cur += int32(len(src))
e.prev = e.prev[:len(src)]
copy(e.prev, src)
return dst
}
func emitLiteral(dst []token, lit []byte) []token {
for _, v := range lit {
dst = append(dst, token(v))
dst = append(dst, literalToken(uint32(v)))
}
return dst
}
// matchLen returns the match length between src[s:] and src[t:].
// t can be negative to indicate the match is starting in e.prev.
// We assume that src[s-4:s] and src[t-4:t] already match.
func (e *deflateFast) matchLen(s, t int32, src []byte) int32 {
s1 := int(s) + maxMatchLength - 4
if s1 > len(src) {
s1 = len(src)
}
// If we are inside the current block
if t >= 0 {
b := src[t:]
a := src[s:s1]
b = b[:len(a)]
// Extend the match to be as long as possible.
for i := range a {
if a[i] != b[i] {
return int32(i)
}
}
return int32(len(a))
}
// We found a match in the previous block.
tp := int32(len(e.prev)) + t
if tp < 0 {
return 0
}
// Extend the match to be as long as possible.
a := src[s:s1]
b := e.prev[tp:]
if len(b) > len(a) {
b = b[:len(a)]
}
a = a[:len(b)]
for i := range b {
if a[i] != b[i] {
return int32(i)
}
}
// If we reached our limit, we matched everything we are
// allowed to in the previous block and we return.
n := int32(len(b))
if int(s+n) == s1 {
return n
}
// Continue looking for more matches in the current block.
a = src[s+n : s1]
b = src[:len(a)]
for i := range a {
if a[i] != b[i] {
return int32(i) + n
}
}
return int32(len(a)) + n
}
// Reset resets the encoding history.
// This ensures that no matches are made to the previous block.
func (e *deflateFast) reset() {
e.prev = e.prev[:0]
// Bump the offset, so all matches will fail distance check.
e.cur += maxMatchOffset
// Protect against e.cur wraparound.
if e.cur > 1<<30 {
*e = deflateFast{cur: maxStoreBlockSize, prev: e.prev[:0]}
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment