Skip to content

Commit feffc6c

Browse files
committed
internal/prefix: use wide integer loads when reading
Rather than reading bits into the bit-buffer on a byte-by-byte basis, use uint64 loads to get a wide integer in a single operation. This optimization is only done on the Reader. The equivalent change is still to come for the Writer.

On the compress/flate benchmarks:

benchmark                                   old MB/s  new MB/s  speedup
BenchmarkDecode/Digits/Speed/1e4-4          102.85    112.07    1.09x
BenchmarkDecode/Digits/Speed/1e5-4          97.34     106.39    1.09x
BenchmarkDecode/Digits/Speed/1e6-4          100.97    110.31    1.09x
BenchmarkDecode/Digits/Default/1e4-4        100.51    108.91    1.08x
BenchmarkDecode/Digits/Default/1e5-4        97.16     105.93    1.09x
BenchmarkDecode/Digits/Default/1e6-4        97.60     105.35    1.08x
BenchmarkDecode/Digits/Compression/1e4-4    100.38    109.33    1.09x
BenchmarkDecode/Digits/Compression/1e5-4    97.86     105.72    1.08x
BenchmarkDecode/Digits/Compression/1e6-4    97.04     105.08    1.08x
BenchmarkDecode/Huffman/Speed/1e4-4         98.97     112.76    1.14x
BenchmarkDecode/Huffman/Speed/1e5-4         109.61    127.67    1.16x
BenchmarkDecode/Huffman/Speed/1e6-4         110.23    128.10    1.16x
BenchmarkDecode/Huffman/Default/1e4-4       99.20     110.47    1.11x
BenchmarkDecode/Huffman/Default/1e5-4       102.74    117.63    1.14x
BenchmarkDecode/Huffman/Default/1e6-4       104.49    120.32    1.15x
BenchmarkDecode/Huffman/Compression/1e4-4   98.83     113.05    1.14x
BenchmarkDecode/Huffman/Compression/1e5-4   102.10    117.41    1.15x
BenchmarkDecode/Huffman/Compression/1e6-4   104.28    120.21    1.15x
1 parent daa2593 commit feffc6c

File tree

1 file changed

+51
-24
lines changed

1 file changed

+51
-24
lines changed

internal/prefix/reader.go

Lines changed: 51 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@ package prefix
77
import (
88
"bufio"
99
"bytes"
10+
"encoding/binary"
1011
"io"
1112
"strings"
1213

@@ -30,9 +31,9 @@ type Reader struct {
3031
byteRd compress.ByteReader // Set if rd is a ByteReader
3132
bufRd compress.BufferedReader // Set if rd is a BufferedReader
3233

33-
bufBits uint64 // Buffer to hold some bits
34-
numBits uint // Number of valid bits in bufBits
35-
transform [256]byte // LUT to transform bit-ordering
34+
bufBits uint64 // Buffer to hold some bits
35+
numBits uint // Number of valid bits in bufBits
36+
bigEndian bool // Do we treat input bytes as big endian?
3637

3738
// These fields are only used if rd is a compress.BufferedReader.
3839
bufPeek []byte // Buffer for the Peek data
@@ -52,7 +53,8 @@ type Reader struct {
5253
// least-significant bits of a byte (such as for deflate and brotli).
5354
func (pr *Reader) Init(r io.Reader, bigEndian bool) {
5455
*pr = Reader{
55-
rd: r,
56+
rd: r,
57+
bigEndian: bigEndian,
5658

5759
bb: pr.bb,
5860
br: pr.br,
@@ -89,12 +91,6 @@ func (pr *Reader) Init(r io.Reader, bigEndian bool) {
8991
pr.bu.Reset(r)
9092
pr.rd, pr.bufRd = pr.bu, pr.bu
9193
}
92-
93-
if bigEndian {
94-
copy(pr.transform[:], internal.ReverseLUT[:])
95-
} else {
96-
copy(pr.transform[:], internal.IdentityLUT[:])
97-
}
9894
}
9995

10096
// BitsRead reports the total number of bits emitted from any Read method.
@@ -130,7 +126,11 @@ func (pr *Reader) Read(buf []byte) (cnt int, err error) {
130126
return 0, errUnaligned
131127
}
132128
for cnt = 0; len(buf) > cnt && pr.numBits > 0; cnt++ {
133-
buf[cnt] = pr.transform[byte(pr.bufBits)]
129+
if pr.bigEndian {
130+
buf[cnt] = internal.ReverseLUT[byte(pr.bufBits)]
131+
} else {
132+
buf[cnt] = byte(pr.bufBits)
133+
}
134134
pr.bufBits >>= 8
135135
pr.numBits -= 8
136136
}
@@ -259,9 +259,12 @@ func (pr *Reader) PullBits(nb uint) error {
259259
return err
260260
}
261261

262+
// Peek no more bytes than necessary.
263+
// The computation for cntPeek computes the minimum number of
264+
// bytes to Peek to fill nb bits.
262265
var err error
263-
cntPeek := 8 // Minimum Peek amount to make progress
264-
if pr.bufRd.Buffered() > cntPeek {
266+
cntPeek := int(nb+(-nb&7)) / 8
267+
if cntPeek < pr.bufRd.Buffered() {
265268
cntPeek = pr.bufRd.Buffered()
266269
}
267270
pr.bufPeek, err = pr.bufRd.Peek(cntPeek)
@@ -276,17 +279,38 @@ func (pr *Reader) PullBits(nb uint) error {
276279
return err
277280
}
278281
}
279-
cnt := int(64-pr.numBits) / 8
280-
if cnt > len(pr.bufPeek) {
281-
cnt = len(pr.bufPeek)
282-
}
283-
for _, c := range pr.bufPeek[:cnt] {
284-
pr.bufBits |= uint64(pr.transform[c]) << pr.numBits
285-
pr.numBits += 8
286-
}
287-
pr.bufPeek = pr.bufPeek[cnt:]
288-
if pr.numBits > 56 {
282+
283+
n := int(64-pr.numBits) / 8 // Number of bytes to copy to bit buffer
284+
if len(pr.bufPeek) >= 8 {
285+
// Starting with Go 1.7, the compiler should use a wide integer
286+
// load here if the architecture supports it.
287+
u := binary.LittleEndian.Uint64(pr.bufPeek)
288+
if pr.bigEndian {
289+
// Swap all the bits within each byte.
290+
u = (u&0xaaaaaaaaaaaaaaaa)>>1 | (u&0x5555555555555555)<<1
291+
u = (u&0xcccccccccccccccc)>>2 | (u&0x3333333333333333)<<2
292+
u = (u&0xf0f0f0f0f0f0f0f0)>>4 | (u&0x0f0f0f0f0f0f0f0f)<<4
293+
}
294+
295+
pr.bufBits |= u << pr.numBits
296+
pr.numBits += uint(n * 8)
297+
pr.bufPeek = pr.bufPeek[n:]
289298
break
299+
} else {
300+
if n > len(pr.bufPeek) {
301+
n = len(pr.bufPeek)
302+
}
303+
for _, c := range pr.bufPeek[:n] {
304+
if pr.bigEndian {
305+
c = internal.ReverseLUT[c]
306+
}
307+
pr.bufBits |= uint64(c) << pr.numBits
308+
pr.numBits += 8
309+
}
310+
pr.bufPeek = pr.bufPeek[n:]
311+
if pr.numBits > 56 {
312+
break
313+
}
290314
}
291315
}
292316
pr.fedBits = pr.numBits
@@ -299,7 +323,10 @@ func (pr *Reader) PullBits(nb uint) error {
299323
}
300324
return err
301325
}
302-
pr.bufBits |= uint64(pr.transform[c]) << pr.numBits
326+
if pr.bigEndian {
327+
c = internal.ReverseLUT[c]
328+
}
329+
pr.bufBits |= uint64(c) << pr.numBits
303330
pr.numBits += 8
304331
pr.Offset++
305332
}

0 commit comments

Comments
 (0)