Skip to content

Commit b13547b

Browse files
committed
[LoongArch] Impl TTI hooks for LoongArch to support LoopDataPrefetch pass
Inspired by https://reviews.llvm.org/D146600, this commit adds some TTI hooks for LoongArch to make the LoopDataPrefetch pass really work, including: - `getCacheLineSize()`: 64 for loongarch64. - `getPrefetchDistance()`: After testing SPEC CPU 2017, the improvements brought by prefetching are most obvious when PrefetchDistance is set to 200 (results shown below), although the best choice differs between benchmarks. - `enableWritePrefetching()`: store prefetch is supported by LoongArch, so WritePrefetching is set to true by default. - `getMinPrefetchStride()` and `getMaxPrefetchIterationsAhead()` still use their default values (1 and UINT_MAX), so they are not overridden. After this commit, the test added by https://reviews.llvm.org/D146600 generates the llvm.prefetch intrinsic IR correctly. Results of spec2017rate benchmarks (testing data: ref, copies: 1): - For all C/C++ benchmarks, compared to O3+novec/lsx/lasx, prefetch brings about -1.58%/0.31%/0.07% performance improvement for int benchmarks and 3.26%/3.73%/3.78% improvement for floating point benchmarks. (Only O3+novec+prefetch regresses when testing intrate.) - However, prefetch results in a performance reduction for almost every Fortran benchmark compiled by flang. When all C/C++/Fortran benchmarks are considered together, performance with prefetch decreases by about 1% ~ 5%. FIXME: Keep the `loongarch-enable-loop-data-prefetch` option defaulting to false for now due to the negative effect on Fortran.
1 parent 2f5bfb4 commit b13547b

File tree

3 files changed

+37
-6
lines changed

3 files changed

+37
-6
lines changed

llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,4 +89,10 @@ LoongArchTTIImpl::getPopcntSupport(unsigned TyWidth) {
8989
return ST->hasExtLSX() ? TTI::PSK_FastHardware : TTI::PSK_Software;
9090
}
9191

92+
// Reports the cache line size used by prefetch-related passes; the value is
// 64 for loongarch64.
unsigned LoongArchTTIImpl::getCacheLineSize() const {
  constexpr unsigned CacheLineSize = 64;
  return CacheLineSize;
}
93+
94+
// Reports the prefetch distance used by the LoopDataPrefetch pass. The value
// 200 was picked empirically from SPEC CPU 2017 runs; the best setting varies
// between benchmarks.
unsigned LoongArchTTIImpl::getPrefetchDistance() const {
  constexpr unsigned PrefetchDistance = 200;
  return PrefetchDistance;
}
95+
96+
// LoongArch supports store prefetch, so prefetching for writes is always
// enabled.
bool LoongArchTTIImpl::enableWritePrefetching() const {
  constexpr bool SupportsStorePrefetch = true;
  return SupportsStorePrefetch;
}
97+
9298
// TODO: Implement more hooks to provide TTI machinery for LoongArch.

llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,10 @@ class LoongArchTTIImpl : public BasicTTIImplBase<LoongArchTTIImpl> {
4747
const char *getRegisterClassName(unsigned ClassID) const;
4848
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
4949

50+
unsigned getCacheLineSize() const override;
51+
unsigned getPrefetchDistance() const override;
52+
bool enableWritePrefetching() const override;
53+
5054
// TODO: Implement more hooks to provide TTI machinery for LoongArch.
5155
};
5256

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,38 @@
1-
;; Tag this 'XFAIL' because we need a few more TTIs and ISels.
2-
; XFAIL: *
3-
; RUN: opt --mtriple=loongarch64 -mattr=+d --passes=loop-data-prefetch -loongarch-enable-loop-data-prefetch -S < %s | FileCheck %s
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt --mtriple=loongarch64 -mattr=+d --passes=loop-data-prefetch -S < %s | FileCheck %s
43

54
define void @foo(ptr %a, ptr %b) {
5+
; CHECK-LABEL: define void @foo(
6+
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
7+
; CHECK-NEXT: [[ENTRY:.*]]:
8+
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
9+
; CHECK: [[FOR_BODY]]:
10+
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
11+
; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3
12+
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 200
13+
; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]]
14+
; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3
15+
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], 200
16+
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP3]]
17+
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDVARS_IV]]
18+
; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[SCEVGEP]], i32 0, i32 3, i32 1)
19+
; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX]], align 8
20+
; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP4]], 1.000000e+00
21+
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]]
22+
; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[SCEVGEP1]], i32 1, i32 3, i32 1)
23+
; CHECK-NEXT: store double [[ADD]], ptr [[ARRAYIDX2]], align 8
24+
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
25+
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1600
26+
; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]]
27+
; CHECK: [[FOR_END]]:
28+
; CHECK-NEXT: ret void
29+
;
630
entry:
731
br label %for.body
832

9-
; CHECK: for.body:
1033
for.body: ; preds = %for.body, %entry
1134
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
1235
%arrayidx = getelementptr inbounds double, ptr %b, i64 %indvars.iv
13-
; CHECK: call void @llvm.prefetch
1436
%0 = load double, ptr %arrayidx, align 8
1537
%add = fadd double %0, 1.000000e+00
1638
%arrayidx2 = getelementptr inbounds double, ptr %a, i64 %indvars.iv
@@ -19,7 +41,6 @@ for.body: ; preds = %for.body, %entry
1941
%exitcond = icmp eq i64 %indvars.iv.next, 1600
2042
br i1 %exitcond, label %for.end, label %for.body
2143

22-
; CHECK: for.end:
2344
for.end: ; preds = %for.body
2445
ret void
2546
}

0 commit comments

Comments
 (0)