From 9262ab5c1fe11fe9ed729a778e9dd8641b9974f6 Mon Sep 17 00:00:00 2001 From: Brett Mastbergen Date: Fri, 13 Dec 2024 14:08:56 -0500 Subject: [PATCH 1/2] selftests/bpf: Fix pyperf180 compilation failure with clang18 jira LE-2125 commit-author Yonghong Song commit 100888fb6d8a185866b1520031ee7e3182b173de With latest clang18 (main branch of llvm-project repo), when building bpf selftests, [~/work/bpf-next (master)]$ make -C tools/testing/selftests/bpf LLVM=1 -j The following compilation error happens: fatal error: error in backend: Branch target out of insn range ... Stack dump: 0. Program arguments: clang -g -Wall -Werror -D__TARGET_ARCH_x86 -mlittle-endian -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf/tools/include -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf -I/home/yhs/work/bpf-next/tools/include/uapi -I/home/yhs/work/bpf-next/tools/testing/selftests/usr/include -idirafter /home/yhs/work/llvm-project/llvm/build.18/install/lib/clang/18/include -idirafter /usr/local/include -idirafter /usr/include -Wno-compare-distinct-pointer-types -DENABLE_ATOMICS_TESTS -O2 --target=bpf -c progs/pyperf180.c -mcpu=v3 -o /home/yhs/work/bpf-next/tools/testing/selftests/bpf/pyperf180.bpf.o 1. parser at end of file 2. Code generation ... The compilation failure only happens to cpu=v2 and cpu=v3. cpu=v4 is okay since cpu=v4 supports 32-bit branch target offset. The above failure is due to upstream llvm patch [1] where some inlining behavior are changed in clang18. To workaround the issue, previously all 180 loop iterations are fully unrolled. The bpf macro __BPF_CPU_VERSION__ (implemented in clang18 recently) is used to avoid unrolling changes if cpu=v4. If __BPF_CPU_VERSION__ is not available and the compiler is clang18, the unrollng amount is unconditionally reduced. [1] https://github.com/llvm/llvm-project/commit/1a2e77cf9e11dbf56b5720c607313a566eebb16e Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Link: https://lore.kernel.org/bpf/20231110193644.3130906-1-yonghong.song@linux.dev (cherry picked from commit 100888fb6d8a185866b1520031ee7e3182b173de) Signed-off-by: Brett Mastbergen --- tools/testing/selftests/bpf/progs/pyperf180.c | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/pyperf180.c b/tools/testing/selftests/bpf/progs/pyperf180.c index c39f559d3100e..42c4a8b62e360 100644 --- a/tools/testing/selftests/bpf/progs/pyperf180.c +++ b/tools/testing/selftests/bpf/progs/pyperf180.c @@ -1,4 +1,26 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2019 Facebook #define STACK_MAX_LEN 180 + +/* llvm upstream commit at clang18 + * https://github.com/llvm/llvm-project/commit/1a2e77cf9e11dbf56b5720c607313a566eebb16e + * changed inlining behavior and caused compilation failure as some branch + * target distance exceeded 16bit representation which is the maximum for + * cpu v1/v2/v3. Macro __BPF_CPU_VERSION__ is later implemented in clang18 + * to specify which cpu version is used for compilation. So a smaller + * unroll_count can be set if __BPF_CPU_VERSION__ is less than 4, which + * reduced some branch target distances and resolved the compilation failure. + * + * To capture the case where a developer/ci uses clang18 but the corresponding + * repo checkpoint does not have __BPF_CPU_VERSION__, a smaller unroll_count + * will be set as well to prevent potential compilation failures. + */ +#ifdef __BPF_CPU_VERSION__ +#if __BPF_CPU_VERSION__ < 4 +#define UNROLL_COUNT 90 +#endif +#elif __clang_major__ == 18 +#define UNROLL_COUNT 90 +#endif + #include "pyperf.h" From 4da4af68c411d08e2bc740e4e84255bf19c41796 Mon Sep 17 00:00:00 2001 From: Brett Mastbergen Date: Fri, 13 Dec 2024 15:14:00 -0500 Subject: [PATCH 2/2] selftests/bpf: Limit unroll_count for pyperf600 test jira LE-2125 commit-author Yonghong Song commit 8c89b5db7a2894e33417dd69680729e8f65f5709 LLVM commit [1] changed loop pragma behavior such that full loop unroll is always honored with user pragma. Previously, unroll count also depends on the unrolled code size. For pyperf600, without [1], the loop unroll count is 150. With [1], the loop unroll count is 600. The unroll count of 600 caused the program size close to 298k and this caused the following code is generated: 0: 7b 1a 00 ff 00 00 00 00 *(u64 *)(r10 - 256) = r1 ; uint64_t pid_tgid = bpf_get_current_pid_tgid(); 1: 85 00 00 00 0e 00 00 00 call 14 2: bf 06 00 00 00 00 00 00 r6 = r0 ; pid_t pid = (pid_t)(pid_tgid >> 32); 3: bf 61 00 00 00 00 00 00 r1 = r6 4: 77 01 00 00 20 00 00 00 r1 >>= 32 5: 63 1a fc ff 00 00 00 00 *(u32 *)(r10 - 4) = r1 6: bf a2 00 00 00 00 00 00 r2 = r10 7: 07 02 00 00 fc ff ff ff r2 += -4 ; PidData* pidData = bpf_map_lookup_elem(&pidmap, &pid); 8: 18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0 ll 10: 85 00 00 00 01 00 00 00 call 1 11: bf 08 00 00 00 00 00 00 r8 = r0 ; if (!pidData) 12: 15 08 15 e8 00 00 00 00 if r8 == 0 goto -6123 Note that insn 12 has a branch offset -6123 which is clearly illegal and will be rejected by the verifier. The negative offset is due to the branch range is greater than INT16_MAX. This patch changed the unroll count to be 150 to avoid above branch target insn out-of-range issue. Also the llvm is enhanced ([2]) to assert if the branch target insn is out of INT16 range. [1] https://reviews.llvm.org/D119148 [2] https://reviews.llvm.org/D123877 Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220419043230.2928530-1-yhs@fb.com (cherry picked from commit 8c89b5db7a2894e33417dd69680729e8f65f5709) Signed-off-by: Brett Mastbergen # Conflicts: # tools/testing/selftests/bpf/progs/pyperf.h --- tools/testing/selftests/bpf/progs/pyperf.h | 4 ++++ tools/testing/selftests/bpf/progs/pyperf600.c | 11 +++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/pyperf.h b/tools/testing/selftests/bpf/progs/pyperf.h index 2fb7adafb6b64..54c9845c953f6 100644 --- a/tools/testing/selftests/bpf/progs/pyperf.h +++ b/tools/testing/selftests/bpf/progs/pyperf.h @@ -231,8 +231,12 @@ int __on_event(struct bpf_raw_tracepoint_args *ctx) #ifdef NO_UNROLL #pragma clang loop unroll(disable) #else +#ifdef UNROLL_COUNT +#pragma clang loop unroll_count(UNROLL_COUNT) +#else #pragma clang loop unroll(full) #endif +#endif /* NO_UNROLL */ /* Unwind python stack */ for (int i = 0; i < STACK_MAX_LEN; ++i) { if (frame_ptr && get_frame_data(frame_ptr, pidData, &frame, &sym)) { diff --git a/tools/testing/selftests/bpf/progs/pyperf600.c b/tools/testing/selftests/bpf/progs/pyperf600.c index cb49b89e37cd4..ce1aa5189cc49 100644 --- a/tools/testing/selftests/bpf/progs/pyperf600.c +++ b/tools/testing/selftests/bpf/progs/pyperf600.c @@ -1,9 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2019 Facebook #define STACK_MAX_LEN 600 -/* clang will not unroll the loop 600 times. - * Instead it will unroll it to the amount it deemed - * appropriate, but the loop will still execute 600 times. - * Total program size is around 90k insns +/* Full unroll of 600 iterations will have total + * program size close to 298k insns and this may + * cause BPF_JMP insn out of 16-bit integer range. + * So limit the unroll size to 150 so the + * total program size is around 80k insns but + * the loop will still execute 600 times. */ +#define UNROLL_COUNT 150 #include "pyperf.h"