Skip to content

Commit 69727af

Browse files
committed
Add support for the BOLT post-link binary optimizer
Using [bolt](https://github.com/llvm/llvm-project/tree/main/bolt) provides a fairly large speedup without any code or functionality changes. It provides roughly a 1% speedup on pyperformance, and a 4% improvement on the Pyston web macrobenchmarks. It is gated behind an `--enable-bolt` configure arg because not all toolchains and environments are supported. It has been tested on a Linux x86_64 toolchain, using llvm-bolt built from the LLVM 14.0.6 sources (their binary distribution of this version did not include bolt). Compared to a previous attempt (faster-cpython/ideas#224), this commit uses bolt's preferred "instrumentation" approach, and also adds some non-PIE flags which enable much better optimizations from bolt. The effects of this change are a bit more dependent on CPU microarchitecture than other changes, since it optimizes i-cache behavior, which seems to be a bit more variable between architectures. The 1%/4% numbers were collected on an Intel Skylake CPU; on an AMD Zen 3 CPU I got a slightly larger speedup (2%/4%), and on a c6i.xlarge EC2 instance I got a slightly lower speedup (1%/3%). The low speedup on pyperformance is not entirely unexpected, because BOLT improves i-cache behavior which is typically not tested by the small benchmarks in the pyperformance suite. This change uses the existing pgo profiling task (`python -m test --pgo`), though I was able to measure about a 1% macrobenchmark improvement by using the macrobenchmarks as the training task. I personally think that both the PGO and BOLT tasks should be updated to use macrobenchmarks, but for the sake of splitting up the work this PR uses the existing pgo task.
1 parent 9533b40 commit 69727af

File tree

5 files changed

+323
-1
lines changed

5 files changed

+323
-1
lines changed

Makefile.pre.in

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,6 @@ COVERAGE_REPORT=$(abs_builddir)/lcov-report
314314
# Option strings for the coverage tooling; both turn on lcov branch coverage
# through --rc lcov_branch_coverage=1.
# COVERAGE_REPORT_OPTIONS additionally sets a report title built from
# $(VERSION) and the output of running $(GITVERSION) through $(shell ...).
# It is a recursive (=) assignment, so that command runs when the variable
# is expanded rather than when this line is parsed.
COVERAGE_LCOV_OPTIONS=--rc lcov_branch_coverage=1
315315
COVERAGE_REPORT_OPTIONS=--rc lcov_branch_coverage=1 --branch-coverage --title "CPython $(VERSION) LCOV report [commit $(shell $(GITVERSION))]"
316316

317-
318317
# === Definitions added by makesetup ===
319318

320319

@@ -640,6 +639,15 @@ profile-opt: profile-run-stamp
640639
-rm -f profile-clean-stamp
641640
$(MAKE) @DEF_MAKE_RULE@ CFLAGS_NODIST="$(CFLAGS_NODIST) $(PGO_PROF_USE_FLAG)" LDFLAGS_NODIST="$(LDFLAGS_NODIST)"
642641

642+
# bolt-opt: post-link optimize $(BUILDPYTHON) with llvm-bolt.
#
# The prerequisite @PREBOLT_RULE@ is substituted by configure, as are
# @LLVM_BOLT@ and @MERGE_FDATA@. The recipe runs these steps in order:
#   1. Remove any *.fdata files from the current directory.
#   2. Run llvm-bolt with -instrument to write an instrumented copy,
#      $(BUILDPYTHON).bolt_inst. Its profile output is based on the absolute
#      path of $(BUILDPYTHON).bolt, and -instrumentation-file-append-pid is
#      passed so that each process gets its own file.
#   3. Run $(PROFILE_TASK) under the instrumented binary. The trailing
#      "|| true" means a failing profile run does not abort the build.
#   4. Merge the $(BUILDPYTHON).*.fdata files into $(BUILDPYTHON).fdata.
#   5. Run llvm-bolt again on the original binary with -data pointing at the
#      merged profile, writing the optimized binary to $(BUILDPYTHON).bolt.
#   6. Remove the *.fdata files again.
#   7. Move $(BUILDPYTHON).bolt over $(BUILDPYTHON).
#
# NOTE(review): "rm -f *.fdata" matches every .fdata file in the current
# directory, not only the ones this recipe creates -- confirm that is intended.
bolt-opt: @PREBOLT_RULE@
643+
rm -f *.fdata
644+
@LLVM_BOLT@ $(BUILDPYTHON) -instrument -instrumentation-file-append-pid -instrumentation-file=$(abspath $(BUILDPYTHON).bolt) -o $(BUILDPYTHON).bolt_inst
645+
./$(BUILDPYTHON).bolt_inst $(PROFILE_TASK) || true
646+
@MERGE_FDATA@ $(BUILDPYTHON).*.fdata > $(BUILDPYTHON).fdata
647+
@LLVM_BOLT@ $(BUILDPYTHON) -o $(BUILDPYTHON).bolt -data=$(BUILDPYTHON).fdata -update-debug-sections -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions=3 -icf=1 -inline-all -split-eh -reorder-functions-use-hot-size -peepholes=all -jump-tables=aggressive -inline-ap -indirect-call-promotion=all -dyno-stats -use-gnu-stack -frame-opt=hot
648+
rm -f *.fdata
649+
mv $(BUILDPYTHON).bolt $(BUILDPYTHON)
650+
643651
# Compile and run with gcov
644652
# Declare the coverage targets as phony so that a file or directory with the
# same name can never make them look up to date. This must use the
# special-target form ".PHONY: ..."; writing ".PHONY=..." only assigns a
# variable named .PHONY and marks nothing as phony.
.PHONY: coverage coverage-lcov coverage-report
645653
coverage:

Misc/no-pie-compile.specs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# GCC specs fragment added to the compile flags by configure.ac for
# --enable-bolt builds (-specs=Misc/no-pie-compile.specs).
# It adds -fno-PIE to the driver's own options unless one of -r, -fpie,
# -fPIE, -fpic, -fPIC or -fno-pic was given on the command line.
*self_spec:
2+
+ %{!r:%{!fpie:%{!fPIE:%{!fpic:%{!fPIC:%{!fno-pic:-fno-PIE}}}}}}

Misc/no-pie-link.specs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# GCC specs fragment added to the link flags by configure.ac for
# --enable-bolt builds (-specs=Misc/no-pie-link.specs).
# It adds "-fno-PIE -no-pie" to the driver's own options unless one of
# -shared, -r, -fPIE or -pie was given on the command line.
*self_spec:
2+
+ %{!shared:%{!r:%{!fPIE:%{!pie:-fno-PIE -no-pie}}}}

configure

Lines changed: 259 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

configure.ac

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1881,6 +1881,57 @@ if test "$Py_LTO" = 'true' ; then
18811881
LDFLAGS_NODIST="$LDFLAGS_NODIST $LTOFLAGS"
18821882
fi
18831883

1884+
# Enable bolt flags.
# Handle the --enable-bolt configure option. Py_BOLT starts out as 'false'
# and becomes 'true' when the option is given with any value other than "no".
# Every path reports its outcome through AC_MSG_RESULT, including the case
# where the option is not passed at all.
1885+
Py_BOLT='false'
1886+
AC_MSG_CHECKING(for --enable-bolt)
1887+
AC_ARG_ENABLE(bolt, AS_HELP_STRING(
1888+
[--enable-bolt],
1889+
[enable usage of the llvm-bolt post-link optimizer (default is no)]),
1890+
[
1891+
if test "$enableval" != no
1892+
then
1893+
Py_BOLT='true'
1894+
AC_MSG_RESULT(yes);
1895+
else
1896+
Py_BOLT='false'
1897+
AC_MSG_RESULT(no);
1898+
fi],
1899+
[AC_MSG_RESULT(no)])
1900+
1901+
# BOLT build setup, applied only when Py_BOLT is 'true':
#   - rewire the default make rule so that bolt-opt runs last,
#   - append the compiler and linker flags the BOLT build uses,
#   - locate llvm-bolt and merge-fdata, aborting configure if either is
#     missing or not executable.
# PREBOLT_RULE is substituted into the Makefile as the prerequisite of the
# bolt-opt rule.
AC_SUBST(PREBOLT_RULE)
1902+
if test "$Py_BOLT" = 'true' ; then
1903+
# Remember the rule that was the default until now, then make bolt-opt the
# new default so the previous rule is built first as its prerequisite.
PREBOLT_RULE="${DEF_MAKE_ALL_RULE}"
1904+
DEF_MAKE_ALL_RULE="bolt-opt"
1905+
DEF_MAKE_RULE="build_all"
1906+
1907+
# These flags are required for bolt to work:
1908+
CFLAGS_NODIST="$CFLAGS_NODIST -fno-reorder-blocks-and-partition"
1909+
LDFLAGS_NODIST="$LDFLAGS_NODIST -Wl,--emit-relocs"
1910+
1911+
# These flags are required to get good performance from bolt:
# NOTE(review): the -specs= paths below are relative (Misc/...); presumably
# an out-of-tree build would need them resolved against the source
# directory -- confirm.
1912+
CFLAGS_NODIST="$CFLAGS_NODIST -specs=Misc/no-pie-compile.specs"
1913+
LDFLAGS_NODIST="$LDFLAGS_NODIST -specs=Misc/no-pie-link.specs"
1914+
LDFLAGS_NOLTO="$LDFLAGS_NOLTO -specs=Misc/no-pie-link.specs"
1915+
1916+
# Locate llvm-bolt using ${llvm_path} as the search path and fail hard if it
# is missing or not executable.
# NOTE(review): llvm_path is set outside this section -- confirm it is
# non-empty for every compiler this option is meant to support.
AC_SUBST(LLVM_BOLT)
1917+
AC_PATH_TOOL(LLVM_BOLT, llvm-bolt, '', ${llvm_path})
1918+
if test -n "${LLVM_BOLT}" -a -x "${LLVM_BOLT}"
1919+
then
1920+
AC_MSG_RESULT("Found llvm-bolt")
1921+
else
1922+
AC_MSG_ERROR([llvm-bolt is required for a --enable-bolt build but could not be found.])
1923+
fi
1924+
1925+
# Same lookup and hard failure for merge-fdata, which the bolt-opt rule uses
# to combine the per-process profile files.
AC_SUBST(MERGE_FDATA)
1926+
AC_PATH_TOOL(MERGE_FDATA, merge-fdata, '', ${llvm_path})
1927+
if test -n "${MERGE_FDATA}" -a -x "${MERGE_FDATA}"
1928+
then
1929+
AC_MSG_RESULT("Found merge-fdata")
1930+
else
1931+
AC_MSG_ERROR([merge-fdata is required for a --enable-bolt build but could not be found.])
1932+
fi
1933+
fi
1934+
18841935
# Enable PGO flags.
18851936
AC_SUBST(PGO_PROF_GEN_FLAG)
18861937
AC_SUBST(PGO_PROF_USE_FLAG)

0 commit comments

Comments
 (0)