compress/flate: use a constant hash table size for Best Speed.

This makes compress/flate's version of Snappy diverge from the upstream golang/snappy version, but the latter has a goal of matching C++ snappy output byte-for-byte. Both C++ and the asm version of golang/snappy can use a smaller N for the O(N) zero-initialization of the hash table when the input is small, even if the pure Go golang/snappy algorithm cannot: "var table [tableSize]uint16" zeroes all tableSize elements. For this package, we don't have the match-C++-snappy goal, so we can use a different (constant) hash table size. This is a small win, in terms of throughput and output size, but it also enables us to re-use the (constant size) hash table between encodeBestSpeed calls, avoiding the cost of zero-initializing the hash table altogether. This will be implemented in follow-up commits. This package's benchmarks: name old speed new speed delta EncodeDigitsSpeed1e4-8 72.8MB/s ± 1% 73.5MB/s ± 1% +0.86% (p=0.000 n=10+10) EncodeDigitsSpeed1e5-8 77.5MB/s ± 1% 78.0MB/s ± 0% +0.69% (p=0.000 n=10+10) EncodeDigitsSpeed1e6-8 82.0MB/s ± 1% 82.7MB/s ± 1% +0.85% (p=0.000 n=10+9) EncodeTwainSpeed1e4-8 65.1MB/s ± 1% 65.6MB/s ± 0% +0.78% (p=0.000 n=10+9) EncodeTwainSpeed1e5-8 80.0MB/s ± 0% 80.6MB/s ± 1% +0.66% (p=0.000 n=9+10) EncodeTwainSpeed1e6-8 81.6MB/s ± 1% 82.1MB/s ± 1% +0.55% (p=0.017 n=10+10) Input size in bytes, output size (and time taken) before and after on some larger files: 1073741824 57269781 ( 3183ms) 57269781 ( 3177ms) adresser.001 1000000000 391052000 ( 11071ms) 391051996 ( 11067ms) enwik9 1911399616 378679516 ( 13450ms) 378679514 ( 13079ms) gob-stream 8558382592 3972329193 ( 99962ms) 3972329193 ( 91290ms) rawstudio-mint14.tar 200000000 200015265 ( 776ms) 200015265 ( 774ms) sharnd.out Thanks to Klaus Post for the original suggestion on cl/21021. Change-Id: Ia4c63a8d1b92c67e1765ec5c3c8c69d289d9a6ce Reviewed-on: https://go-review.googlesource.com/22604Reviewed-by: Russ Cox <rsc@golang.org>

compress/flate: use a constant hash table size for Best Speed.
This makes compress/flate's version of Snappy diverge from the upstream golang/snappy version, but the latter has a goal of matching C++ snappy output byte-for-byte. Both C++ and the asm version of golang/snappy can use a smaller N for the O(N) zero-initialization of the hash table when the input is small, even if the pure Go golang/snappy algorithm cannot: "var table [tableSize]uint16" zeroes all tableSize elements. For this package, we don't have the match-C++-snappy goal, so we can use a different (constant) hash table size. This is a small win, in terms of throughput and output size, but it also enables us to re-use the (constant size) hash table between encodeBestSpeed calls, avoiding the cost of zero-initializing the hash table altogether. This will be implemented in follow-up commits. This package's benchmarks: name old speed new speed delta EncodeDigitsSpeed1e4-8 72.8MB/s ± 1% 73.5MB/s ± 1% +0.86% (p=0.000 n=10+10) EncodeDigitsSpeed1e5-8 77.5MB/s ± 1% 78.0MB/s ± 0% +0.69% (p=0.000 n=10+10) EncodeDigitsSpeed1e6-8 82.0MB/s ± 1% 82.7MB/s ± 1% +0.85% (p=0.000 n=10+9) EncodeTwainSpeed1e4-8 65.1MB/s ± 1% 65.6MB/s ± 0% +0.78% (p=0.000 n=10+9) EncodeTwainSpeed1e5-8 80.0MB/s ± 0% 80.6MB/s ± 1% +0.66% (p=0.000 n=9+10) EncodeTwainSpeed1e6-8 81.6MB/s ± 1% 82.1MB/s ± 1% +0.55% (p=0.017 n=10+10) Input size in bytes, output size (and time taken) before and after on some larger files: 1073741824 57269781 ( 3183ms) 57269781 ( 3177ms) adresser.001 1000000000 391052000 ( 11071ms) 391051996 ( 11067ms) enwik9 1911399616 378679516 ( 13450ms) 378679514 ( 13079ms) gob-stream 8558382592 3972329193 ( 99962ms) 3972329193 ( 91290ms) rawstudio-mint14.tar 200000000 200015265 ( 776ms) 200015265 ( 774ms) sharnd.out Thanks to Klaus Post for the original suggestion on cl/21021. Change-Id: Ia4c63a8d1b92c67e1765ec5c3c8c69d289d9a6ce Reviewed-on: https://go-review.googlesource.com/22604Reviewed-by: Russ Cox <rsc@golang.org>
1fb4e4de · Nigel Tao · 5edcff01 · 1fb4e4de
Commit 1fb4e4de authored Apr 29, 2016 by Nigel Tao
Hide whitespace changes
Inline Side-by-side

Showing with 18 additions and 24 deletions

deflatefast.go src/compress/flate/deflatefast.go +18 -24

No files found.
--- a/src/compress/flate/deflatefast.go
+++ b/src/compress/flate/deflatefast.go
@@ -7,7 +7,14 @@ package flate
 // This encoding algorithm, which prioritizes speed over output size, is
 // based on Snappy's LZ77-style encoder: github.com/golang/snappy

-const maxOffset = 1 << logMaxOffsetSize // Maximum deflate offset.
+const (
+	maxOffset = 1 << logMaxOffsetSize // Maximum deflate offset.
+
+	tableBits  = 14             // Bits used in the table.
+	tableSize  = 1 << tableBits // Size of the table.
+	tableMask  = tableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
+	tableShift = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32.
+)

 func load32(b []byte, i int) uint32 {
 	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
@@ -20,8 +27,8 @@ func load64(b []byte, i int) uint64 {
 		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
 }

-func hash(u, shift uint32) uint32 {
-	return (u * 0x1e35a7bd) >> shift
+func hash(u uint32) uint32 {
+	return (u * 0x1e35a7bd) >> tableShift
 }

 // These constants are defined by the Snappy implementation so that its
@@ -40,24 +47,11 @@ func encodeBestSpeed(dst []token, src []byte) []token {
 		return emitLiteral(dst, src)
 	}

-	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
+	// Initialize the hash table.
+	//
 	// The table element type is uint16, as s < sLimit and sLimit < len(src)
 	// and len(src) <= maxStoreBlockSize and maxStoreBlockSize == 65535.
-	const (
-		maxTableSize = 1 << 14
-		// tableMask is redundant, but helps the compiler eliminate bounds
-		// checks.
-		tableMask = maxTableSize - 1
-	)
-	shift := uint32(32 - 8)
-	for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
-		shift--
-	}
-	// In Go, all array elements are zero-initialized, so there is no advantage
-	// to a smaller tableSize per se. However, it matches the C++ algorithm,
-	// and in the asm versions of this code, we can get away with zeroing only
-	// the first tableSize elements.
-	var table [maxTableSize]uint16
+	var table [tableSize]uint16

 	// sLimit is when to stop looking for offset/length copies. The inputMargin
 	// lets us use a fast path for emitLiteral in the main loop, while we are
@@ -70,7 +64,7 @@ func encodeBestSpeed(dst []token, src []byte) []token {
 	// The encoded form must start with a literal, as there are no previous
 	// bytes to copy, so we start looking for hash matches at s == 1.
 	s := 1
-	nextHash := hash(load32(src, s), shift)
+	nextHash := hash(load32(src, s))

 	for {
 		// Copied from the C++ snappy implementation:
@@ -102,7 +96,7 @@ func encodeBestSpeed(dst []token, src []byte) []token {
 			}
 			candidate = int(table[nextHash&tableMask])
 			table[nextHash&tableMask] = uint16(s)
-			nextHash = hash(load32(src, nextS), shift)
+			nextHash = hash(load32(src, nextS))
 			if s-candidate < maxOffset && load32(src, s) == load32(src, candidate) {
 				break
 			}
@@ -152,13 +146,13 @@ func encodeBestSpeed(dst []token, src []byte) []token {
 			// are faster as one load64 call (with some shifts) instead of
 			// three load32 calls.
 			x := load64(src, s-1)
-			prevHash := hash(uint32(x>>0), shift)
+			prevHash := hash(uint32(x >> 0))
 			table[prevHash&tableMask] = uint16(s - 1)
-			currHash := hash(uint32(x>>8), shift)
+			currHash := hash(uint32(x >> 8))
 			candidate = int(table[currHash&tableMask])
 			table[currHash&tableMask] = uint16(s)
 			if s-candidate >= maxOffset || uint32(x>>8) != load32(src, candidate) {
-				nextHash = hash(uint32(x>>16), shift)
+				nextHash = hash(uint32(x >> 16))
 				s++
 				break
 			}