Skip to content

Commit a92ca51

Browse files
wdvxdr1123 authored and odeke-em committed
cmd/compile: use LZCNT instruction for GOAMD64>=3
LZCNT is similar to BSR, but BSR(x) is undefined when x == 0, so using LZCNT can avoid a special case for zero input. Except for that case, LZCNTQ(x) == 63-BSRQ(x) and LZCNTL(x) == 31-BSRL(x).

According to https://www.agner.org/optimize/instruction_tables.pdf, LZCNT instructions are also much faster than BSR on AMD CPUs.

name              old time/op  new time/op  delta
LeadingZeros-8    0.91ns ± 1%  0.80ns ± 7%  -11.68%  (p=0.000 n=9+9)
LeadingZeros8-8   0.98ns ±15%  0.91ns ± 1%   -7.34%  (p=0.000 n=9+9)
LeadingZeros16-8  0.94ns ± 3%  0.92ns ± 2%   -2.36%  (p=0.001 n=10+10)
LeadingZeros32-8  0.89ns ± 1%  0.78ns ± 2%  -12.49%  (p=0.000 n=10+10)
LeadingZeros64-8  0.92ns ± 1%  0.78ns ± 1%  -14.48%  (p=0.000 n=10+10)

Change-Id: I125147fe3d6994a4cfe558432780408e9a27557a
Reviewed-on: https://go-review.googlesource.com/c/go/+/396794
Reviewed-by: Keith Randall <[email protected]>
Trust: Emmanuel Odeke <[email protected]>
Run-TryBot: Emmanuel Odeke <[email protected]>
TryBot-Result: Gopher Robot <[email protected]>
1 parent ba6df85 commit a92ca51

File tree

7 files changed

+158
-15
lines changed

7 files changed

+158
-15
lines changed

src/cmd/compile/internal/amd64/ssa.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1125,7 +1125,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
11251125
p.To.Type = obj.TYPE_REG
11261126
p.To.Reg = v.Reg()
11271127
case ssa.OpAMD64POPCNTQ, ssa.OpAMD64POPCNTL,
1128-
ssa.OpAMD64TZCNTQ, ssa.OpAMD64TZCNTL:
1128+
ssa.OpAMD64TZCNTQ, ssa.OpAMD64TZCNTL,
1129+
ssa.OpAMD64LZCNTQ, ssa.OpAMD64LZCNTL:
11291130
if v.Args[0].Reg() != v.Reg() {
11301131
// POPCNT/TZCNT/LZCNT have a false dependency on the destination register on Intel cpus.
11311132
// TZCNT/LZCNT problem affects pre-Skylake models. See discussion at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=62011#c7.

src/cmd/compile/internal/amd64/versions_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,7 @@ var featureToOpcodes = map[string][]string{
242242
"sse41": {"roundsd"},
243243
"fma": {"vfmadd231sd"},
244244
"movbe": {"movbeqq", "movbeq", "movbell", "movbel", "movbe"},
245+
"lzcnt": {"lzcntq", "lzcntl", "lzcnt"},
245246
}
246247

247248
// Test to use POPCNT instruction, if available

src/cmd/compile/internal/ssa/gen/AMD64.rules

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,10 +98,14 @@
9898
// However, for zero-extended values, we can cheat a bit, and calculate
9999
// BSR(x<<1 + 1), which is guaranteed to be non-zero, and which conveniently
100100
// places the index of the highest set bit where we want it.
101-
(BitLen64 <t> x) => (ADDQconst [1] (CMOVQEQ <t> (Select0 <t> (BSRQ x)) (MOVQconst <t> [-1]) (Select1 <types.TypeFlags> (BSRQ x))))
102-
(BitLen32 x) => (Select0 (BSRQ (LEAQ1 <typ.UInt64> [1] (MOVLQZX <typ.UInt64> x) (MOVLQZX <typ.UInt64> x))))
103-
(BitLen16 x) => (BSRL (LEAL1 <typ.UInt32> [1] (MOVWQZX <typ.UInt32> x) (MOVWQZX <typ.UInt32> x)))
104-
(BitLen8 x) => (BSRL (LEAL1 <typ.UInt32> [1] (MOVBQZX <typ.UInt32> x) (MOVBQZX <typ.UInt32> x)))
101+
// For GOAMD64>=3, BitLen can be calculated by OperandSize - LZCNT(x).
102+
(BitLen64 <t> x) && buildcfg.GOAMD64 < 3 => (ADDQconst [1] (CMOVQEQ <t> (Select0 <t> (BSRQ x)) (MOVQconst <t> [-1]) (Select1 <types.TypeFlags> (BSRQ x))))
103+
(BitLen32 x) && buildcfg.GOAMD64 < 3 => (Select0 (BSRQ (LEAQ1 <typ.UInt64> [1] (MOVLQZX <typ.UInt64> x) (MOVLQZX <typ.UInt64> x))))
104+
(BitLen16 x) && buildcfg.GOAMD64 < 3 => (BSRL (LEAL1 <typ.UInt32> [1] (MOVWQZX <typ.UInt32> x) (MOVWQZX <typ.UInt32> x)))
105+
(BitLen8 x) && buildcfg.GOAMD64 < 3 => (BSRL (LEAL1 <typ.UInt32> [1] (MOVBQZX <typ.UInt32> x) (MOVBQZX <typ.UInt32> x)))
106+
(BitLen64 <t> x) && buildcfg.GOAMD64 >= 3 => (NEGQ (ADDQconst <t> [-64] (LZCNTQ x)))
107+
// Use 64-bit version to allow const-fold remove unnecessary arithmetic.
108+
(BitLen(32|16|8) <t> x) && buildcfg.GOAMD64 >= 3 => (NEGQ (ADDQconst <t> [-32] (LZCNTL x)))
105109

106110
(Bswap(64|32) ...) => (BSWAP(Q|L) ...)
107111

src/cmd/compile/internal/ssa/gen/AMD64Ops.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -923,6 +923,11 @@ func init() {
923923
{name: "TZCNTQ", argLength: 1, reg: gp11, asm: "TZCNTQ", clobberFlags: true},
924924
{name: "TZCNTL", argLength: 1, reg: gp11, asm: "TZCNTL", clobberFlags: true},
925925

926+
// CPUID feature: LZCNT.
927+
// count the number of leading zero bits.
928+
{name: "LZCNTQ", argLength: 1, reg: gp11, asm: "LZCNTQ", typ: "UInt64", clobberFlags: true},
929+
{name: "LZCNTL", argLength: 1, reg: gp11, asm: "LZCNTL", typ: "UInt32", clobberFlags: true},
930+
926931
// CPUID feature: MOVBE
927932
// MOVBEWload does not satisfy zero extended, so only use MOVBEWstore
928933
{name: "MOVBEWstore", argLength: 3, reg: gpstore, asm: "MOVBEW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // swap and store 2 bytes in arg1 to arg0+auxint+aux. arg2=mem

src/cmd/compile/internal/ssa/opGen.go

Lines changed: 30 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/cmd/compile/internal/ssa/rewriteAMD64.go

Lines changed: 92 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

test/codegen/mathbits.go

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ import "math/bits"
1313
// ----------------------- //
1414

1515
func LeadingZeros(n uint) int {
16-
// amd64:"BSRQ"
16+
// amd64/v1,amd64/v2:"BSRQ"
17+
// amd64/v3:"LZCNTQ", -"BSRQ"
1718
// s390x:"FLOGR"
1819
// arm:"CLZ" arm64:"CLZ"
1920
// mips:"CLZ"
@@ -22,7 +23,8 @@ func LeadingZeros(n uint) int {
2223
}
2324

2425
func LeadingZeros64(n uint64) int {
25-
// amd64:"BSRQ"
26+
// amd64/v1,amd64/v2:"BSRQ"
27+
// amd64/v3:"LZCNTQ", -"BSRQ"
2628
// s390x:"FLOGR"
2729
// arm:"CLZ" arm64:"CLZ"
2830
// mips:"CLZ"
@@ -31,7 +33,8 @@ func LeadingZeros64(n uint64) int {
3133
}
3234

3335
func LeadingZeros32(n uint32) int {
34-
// amd64:"BSRQ","LEAQ",-"CMOVQEQ"
36+
// amd64/v1,amd64/v2:"BSRQ","LEAQ",-"CMOVQEQ"
37+
// amd64/v3: "LZCNTL",- "BSRL"
3538
// s390x:"FLOGR"
3639
// arm:"CLZ" arm64:"CLZW"
3740
// mips:"CLZ"
@@ -40,7 +43,8 @@ func LeadingZeros32(n uint32) int {
4043
}
4144

4245
func LeadingZeros16(n uint16) int {
43-
// amd64:"BSRL","LEAL",-"CMOVQEQ"
46+
// amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ"
47+
// amd64/v3: "LZCNTL",- "BSRL"
4448
// s390x:"FLOGR"
4549
// arm:"CLZ" arm64:"CLZ"
4650
// mips:"CLZ"
@@ -49,7 +53,8 @@ func LeadingZeros16(n uint16) int {
4953
}
5054

5155
func LeadingZeros8(n uint8) int {
52-
// amd64:"BSRL","LEAL",-"CMOVQEQ"
56+
// amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ"
57+
// amd64/v3: "LZCNTL",- "BSRL"
5358
// s390x:"FLOGR"
5459
// arm:"CLZ" arm64:"CLZ"
5560
// mips:"CLZ"
@@ -62,7 +67,8 @@ func LeadingZeros8(n uint8) int {
6267
// --------------- //
6368

6469
func Len(n uint) int {
65-
// amd64:"BSRQ"
70+
// amd64/v1,amd64/v2:"BSRQ"
71+
// amd64/v3: "LZCNTQ"
6672
// s390x:"FLOGR"
6773
// arm:"CLZ" arm64:"CLZ"
6874
// mips:"CLZ"
@@ -71,7 +77,8 @@ func Len(n uint) int {
7177
}
7278

7379
func Len64(n uint64) int {
74-
// amd64:"BSRQ"
80+
// amd64/v1,amd64/v2:"BSRQ"
81+
// amd64/v3: "LZCNTQ"
7582
// s390x:"FLOGR"
7683
// arm:"CLZ" arm64:"CLZ"
7784
// mips:"CLZ"
@@ -88,7 +95,8 @@ func SubFromLen64(n uint64) int {
8895
}
8996

9097
func Len32(n uint32) int {
91-
// amd64:"BSRQ","LEAQ",-"CMOVQEQ"
98+
// amd64/v1,amd64/v2:"BSRQ","LEAQ",-"CMOVQEQ"
99+
// amd64/v3: "LZCNTL"
92100
// s390x:"FLOGR"
93101
// arm:"CLZ" arm64:"CLZ"
94102
// mips:"CLZ"
@@ -99,7 +107,8 @@ func Len32(n uint32) int {
99107
}
100108

101109
func Len16(n uint16) int {
102-
// amd64:"BSRL","LEAL",-"CMOVQEQ"
110+
// amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ"
111+
// amd64/v3: "LZCNTL"
103112
// s390x:"FLOGR"
104113
// arm:"CLZ" arm64:"CLZ"
105114
// mips:"CLZ"
@@ -108,7 +117,8 @@ func Len16(n uint16) int {
108117
}
109118

110119
func Len8(n uint8) int {
111-
// amd64:"BSRL","LEAL",-"CMOVQEQ"
120+
// amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ"
121+
// amd64/v3: "LZCNTL"
112122
// s390x:"FLOGR"
113123
// arm:"CLZ" arm64:"CLZ"
114124
// mips:"CLZ"

0 commit comments

Comments
 (0)