Skip to content

Commit 5f7837f

Browse files
committed
REinsert amd64 optimizations
1 parent 7d8e822 commit 5f7837f

File tree

6 files changed

+849
-13
lines changed

6 files changed

+849
-13
lines changed

flate/asm_test.go

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
// Copyright 2015, Klaus Post, see LICENSE for details.
2+
3+
//+build amd64
4+
5+
package flate
6+
7+
import (
8+
"math/rand"
9+
"testing"
10+
)
11+
12+
func TestCRC(t *testing.T) {
13+
if !useSSE42 {
14+
t.Skip("Skipping CRC test, no SSE 4.2 available")
15+
}
16+
for _, x := range deflateTests {
17+
y := x.out
18+
if len(y) >= minMatchLength {
19+
t.Logf("In: %v, Out:0x%08x", y[0:minMatchLength], crc32sse(y[0:minMatchLength]))
20+
}
21+
}
22+
}
23+
24+
func TestCRCBulk(t *testing.T) {
25+
if !useSSE42 {
26+
t.Skip("Skipping CRC test, no SSE 4.2 available")
27+
}
28+
for _, x := range deflateTests {
29+
y := x.out
30+
y = append(y, y...)
31+
y = append(y, y...)
32+
y = append(y, y...)
33+
y = append(y, y...)
34+
y = append(y, y...)
35+
y = append(y, y...)
36+
if !testing.Short() {
37+
y = append(y, y...)
38+
y = append(y, y...)
39+
}
40+
y = append(y, 1)
41+
if len(y) >= minMatchLength {
42+
for j := len(y) - 1; j >= 4; j-- {
43+
44+
// Create copy, so we easier detect of-of-bound reads
45+
test := make([]byte, j)
46+
test2 := make([]byte, j)
47+
copy(test, y[:j])
48+
copy(test2, y[:j])
49+
50+
// We allocate one more than we need to test for unintentional overwrites
51+
dst := make([]uint32, j-3+1)
52+
ref := make([]uint32, j-3+1)
53+
for i := range dst {
54+
dst[i] = uint32(i + 100)
55+
ref[i] = uint32(i + 101)
56+
}
57+
// Last entry must NOT be overwritten.
58+
dst[j-3] = 0x1234
59+
ref[j-3] = 0x1234
60+
61+
// Do two encodes we can compare
62+
crc32sseAll(test, dst)
63+
crc32sseAll(test2, ref)
64+
65+
// Check all values
66+
for i, got := range dst {
67+
if i == j-3 {
68+
if dst[i] != 0x1234 {
69+
t.Fatalf("end of expected dst overwritten, was %08x", uint32(dst[i]))
70+
}
71+
continue
72+
}
73+
expect := crc32sse(y[i : i+4])
74+
if got != expect && got == uint32(i)+100 {
75+
t.Errorf("Len:%d Index:%d, expected 0x%08x but not modified", len(y), i, uint32(expect))
76+
} else if got != expect {
77+
t.Errorf("Len:%d Index:%d, got 0x%08x expected:0x%08x", len(y), i, uint32(got), uint32(expect))
78+
}
79+
expect = ref[i]
80+
if got != expect {
81+
t.Errorf("Len:%d Index:%d, got 0x%08x expected:0x%08x", len(y), i, got, expect)
82+
}
83+
}
84+
}
85+
}
86+
}
87+
}
88+
89+
func TestMatchLen(t *testing.T) {
90+
if !useSSE42 {
91+
t.Skip("Skipping Matchlen test, no SSE 4.2 available")
92+
}
93+
// Maximum length tested
94+
var maxLen = 512
95+
96+
// Skips per iteration
97+
is, js, ks := 3, 2, 1
98+
if testing.Short() {
99+
is, js, ks = 7, 5, 3
100+
}
101+
102+
a := make([]byte, maxLen)
103+
b := make([]byte, maxLen)
104+
bb := make([]byte, maxLen)
105+
rand.Seed(1)
106+
for i := range a {
107+
a[i] = byte(rand.Int63())
108+
b[i] = byte(rand.Int63())
109+
}
110+
111+
// Test different lengths
112+
for i := 0; i < maxLen; i += is {
113+
// Test different dst offsets.
114+
for j := 0; j < maxLen-1; j += js {
115+
copy(bb, b)
116+
// Test different src offsets
117+
for k := i - 1; k >= 0; k -= ks {
118+
copy(bb[j:], a[k:i])
119+
maxTest := maxLen - j
120+
if maxTest > maxLen-k {
121+
maxTest = maxLen - k
122+
}
123+
got := matchLenSSE4(a[k:], bb[j:], maxTest)
124+
expect := matchLenReference(a[k:], bb[j:], maxTest)
125+
if got > maxTest || got < 0 {
126+
t.Fatalf("unexpected result %d (len:%d, src offset: %d, dst offset:%d)", got, maxTest, k, j)
127+
}
128+
if got != expect {
129+
t.Fatalf("Mismatch, expected %d, got %d", expect, got)
130+
}
131+
}
132+
}
133+
}
134+
}
135+
136+
// matchLenReference is a reference matcher.
//
// It returns the number of leading bytes that are equal in a and b, examining
// at most max bytes. Both slices must hold at least max bytes; a shorter slice
// causes an index-out-of-range panic once the comparison runs past its end.
func matchLenReference(a, b []byte, max int) int {
	for i := 0; i < max; i++ {
		if a[i] != b[i] {
			return i
		}
	}
	return max
}
145+
146+
func TestHistogram(t *testing.T) {
147+
if !useSSE42 {
148+
t.Skip("Skipping Matchlen test, no SSE 4.2 available")
149+
}
150+
// Maximum length tested
151+
const maxLen = 65536
152+
var maxOff = 8
153+
154+
// Skips per iteration
155+
is, js := 5, 3
156+
if testing.Short() {
157+
is, js = 9, 1
158+
maxOff = 1
159+
}
160+
161+
a := make([]byte, maxLen+maxOff)
162+
rand.Seed(1)
163+
for i := range a {
164+
a[i] = byte(rand.Int63())
165+
}
166+
167+
// Test different lengths
168+
for i := 0; i <= maxLen; i += is {
169+
// Test different offsets
170+
for j := 0; j < maxOff; j += js {
171+
var got [256]int32
172+
var reference [256]int32
173+
174+
histogram(a[j:i+j], got[:])
175+
histogramReference(a[j:i+j], reference[:])
176+
for k := range got {
177+
if got[k] != reference[k] {
178+
t.Fatalf("mismatch at len:%d, offset:%d, value %d: (got) %d != %d (expected)", i, j, k, got[k], reference[k])
179+
}
180+
}
181+
}
182+
}
183+
}
184+
185+
// histogramReference is a reference histogram implementation.
//
// For every byte value v in b it increments h[v]. Counts are added to whatever
// h already holds, so h must be zeroed by the caller to get the histogram of b
// alone. It panics if h has fewer than 256 entries.
func histogramReference(b []byte, h []int32) {
	if len(h) < 256 {
		panic("Histogram too small")
	}
	for _, v := range b {
		h[v]++
	}
}

flate/crc32_amd64.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
//+build !noasm
2+
//+build !appengine
3+
4+
// Copyright 2015, Klaus Post, see LICENSE for details.
5+
6+
package flate
7+
8+
import (
9+
"github.com/klauspost/cpuid"
10+
)
11+
12+
// crc32sse returns a hash for the first 4 bytes of the slice.
// len(a) must be >= 4.
//
// The declaration has no Go body; the implementation is provided outside
// this file.
//
//go:noescape
func crc32sse(a []byte) uint32
16+
17+
// crc32sseAll calculates hashes for each 4-byte set in a and stores them
// in dst.
// dst must hold at least len(a) - 3 entries, one per 4-byte window.
// The size is not checked by the assembly.
//
// NOTE(review): this comment previously asked for len(a) - 4 entries;
// TestCRCBulk expects len(a) - 3 entries to be written — confirm against the
// assembly.
//
//go:noescape
func crc32sseAll(a []byte, dst []uint32)
22+
23+
// matchLenSSE4 returns the number of matching bytes in a and b
// up to length 'max'. Both slices must be at least 'max'
// bytes in size.
//
// TODO: drop the "SSE4" name, since it doesn't use any SSE instructions.
//
//go:noescape
func matchLenSSE4(a, b []byte, max int) int
31+
32+
// histogram accumulates a histogram of b in h.
// h must be at least 256 entries in length,
// and must be cleared before calling this function.
//
// The declaration has no Go body; the implementation is provided outside
// this file.
//
//go:noescape
func histogram(b []byte, h []int32)
37+
38+
// Detect SSE 4.2 feature.
39+
func init() {
40+
useSSE42 = cpuid.CPU.SSE42()
41+
}

0 commit comments

Comments
 (0)