Description
What version of Go are you using (go version
)?
$ go version go version go1.16beta1 darwin/arm64
Does this issue reproduce with the latest release?
Yes
What operating system and processor architecture are you using (go env
)?
go env
Output
$ go envGO111MODULE=""
GOARCH="arm64"
GOBIN=""
GOCACHE="/Users/frank/Library/Caches/go-build"
GOENV="/Users/frank/Library/Application Support/go/env"
GOEXE=""
GOFLAGS=""
GOHOSTARCH="arm64"
GOHOSTOS="darwin"
GOINSECURE=""
GOMODCACHE="/Users/frank/go/pkg/mod"
GONOPROXY=""
GONOSUMDB=""
GOOS="darwin"
GOPATH="/Users/frank/go"
GOPRIVATE=""
GOPROXY="https://proxy.golang.org,direct"
GOROOT="/usr/local/go"
GOSUMDB="sum.golang.org"
GOTMPDIR=""
GOTOOLDIR="/usr/local/go/pkg/tool/darwin_arm64"
GOVCS=""
GOVERSION="go1.16beta1"
GCCGO="gccgo"
AR="ar"
CC="clang"
CXX="clang++"
CGO_ENABLED="1"
GOMOD="/dev/null"
CGO_CFLAGS="-g -O2"
CGO_CPPFLAGS=""
CGO_CXXFLAGS="-g -O2"
CGO_FFLAGS="-g -O2"
CGO_LDFLAGS="-g -O2"
PKG_CONFIG="pkg-config"
GOGCCFLAGS="-fPIC -pthread -fno-caret-diagnostics -Qunused-arguments -fmessage-length=0 -fdebug-prefix-map=/var/folders/73/zsswhtt91kd0_dk_l4k7kntr0000gn/T/go-build3479900954=/tmp/go-build -gno-record-gcc-switches -fno-common"
What did you do?
Compile this code with:
go build -gcflags -S lomuto.go
https://play.golang.org/p/F8fPWbzvDRO
What did you expect to see?
with clang -O3 -S
I see a tight inner loop between LBB0_2 and LBB0_5 with very minimal instructions
; %bb.0:
ldrb w8, [x0, x2]
cmp x1, x2
b.hs LBB0_6
; %bb.1:
sub x9, x2, x1
add x10, x0, x1
b LBB0_3
LBB0_2: ; in Loop: Header=BB0_3 Depth=1
add x10, x10, #1 ; =1
subs x9, x9, #1 ; =1
b.eq LBB0_5
LBB0_3: ; =>This Inner Loop Header: Depth=1
ldrb w11, [x10]
cmp w11, w8
b.hs LBB0_2
; %bb.4: ; in Loop: Header=BB0_3 Depth=1
ldrb w12, [x0, x1]
strb w11, [x0, x1]
strb w12, [x10]
add x1, x1, #1 ; =1
b LBB0_2
LBB0_5:
ldrb w8, [x0, x2]
LBB0_6:
ldrb w9, [x0, x1]
strb w8, [x0, x1]
strb w9, [x0, x2]
mov x0, x1
ret
What did you see instead?
I see excessive register usage and many more instructions between address 64 and 124
0x001c 00028 (lomuto.go:13) MOVD "".hi+32(FP), R0
0x0020 00032 (lomuto.go:13) MOVD "".arr+8(FP), R1
0x0024 00036 (lomuto.go:13) CMP R1, R0
0x0028 00040 (lomuto.go:13) BHS 184
0x002c 00044 (lomuto.go:13) MOVD "".arr(FP), R2
0x0030 00048 (lomuto.go:13) MOVBU (R2)(R0), R3
0x0034 00052 (lomuto.go:15) MOVD "".lo+24(FP), R4
0x0038 00056 (lomuto.go:17) MOVD R4, R5
0x003c 00060 (lomuto.go:15) JMP 72
0x0040 00064 (lomuto.go:15) ADD $1, R4, R4
0x0044 00068 (lomuto.go:16) MOVD R7, R3
0x0048 00072 (lomuto.go:15) CMP R4, R0
0x004c 00076 (lomuto.go:15) BLS 128
0x0050 00080 (lomuto.go:16) MOVBU (R2)(R4), R6
0x0054 00084 (lomuto.go:16) MOVD R3, R7
0x0058 00088 (lomuto.go:16) MOVD R6, R8
0x005c 00092 (lomuto.go:16) CMPW R8, R3
0x0060 00096 (lomuto.go:16) BLS 64
0x0064 00100 (lomuto.go:17) CMP R5, R1
0x0068 00104 (lomuto.go:17) BLS 176
0x006c 00108 (lomuto.go:17) MOVBU (R2)(R5), R3
0x0070 00112 (lomuto.go:17) MOVB R6, (R2)(R5)
0x0074 00116 (lomuto.go:17) MOVB R3, (R2)(R4)
0x0078 00120 (lomuto.go:18) ADD $1, R5, R5
0x007c 00124 (lomuto.go:18) JMP 64
0x0080 00128 (lomuto.go:21) CMP R5, R1
0x0084 00132 (lomuto.go:21) BLS 168