-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[DSE] Optimizing shrinking of memory intrinsic #106425
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-debuginfo Author: None (goldsteinn) Changes
Currently for the following snippet: However, its typically better to emit this with a full 8 byte This is because a 8-byte memcpy can be lowered with a single Similiar is true for This patch changes the behavior to check if we are liable to be Patch is 23.11 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/106425.diff 4 Files Affected:
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 992139a95a43d3..9dd1822b3457f6 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -48,6 +48,7 @@
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
@@ -560,7 +561,8 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest,
static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
uint64_t &DeadSize, int64_t KillingStart,
- uint64_t KillingSize, bool IsOverwriteEnd) {
+ uint64_t KillingSize, bool IsOverwriteEnd,
+ const TargetTransformInfo &TTI) {
auto *DeadIntrinsic = cast<AnyMemIntrinsic>(DeadI);
Align PrefAlign = DeadIntrinsic->getDestAlign().valueOrOne();
@@ -612,6 +614,24 @@ static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
assert(DeadSize > ToRemoveSize && "Can't remove more than original size");
uint64_t NewSize = DeadSize - ToRemoveSize;
+
+ // Check that we aren't going to pessimize codegen by lowering the length. I.e
+ // a memcpy(dst, src, 8) is more efficient than memcpy(dst, src, 7).
+ // These checks are relatively conservative. We bail out if:
+ // 1) We are removing less than 1 store (measured by targets load/store Vec
+ // width).
+ // 2) We are saving a load/store (assuming loads/stores occur per pow2 block)
+ // 3) We aren't preventing this from going below inline thresh
+ // 4) We are shrinking by less than half of the initial size.
+ uint64_t PrefVecWidth =
+ TTI.getLoadStoreVecRegBitWidth(DeadIntrinsic->getDestAddressSpace()) / 8U;
+ uint64_t InlineThresh = TTI.getMaxMemIntrinsicInlineSizeThreshold();
+ if (ToRemoveSize < PrefVecWidth &&
+ popcount(DeadSize) < popcount(DeadSize - ToRemoveSize) &&
+ (DeadSize <= InlineThresh) == (DeadSize - ToRemoveSize <= InlineThresh) &&
+ ToRemoveSize < DeadSize / 2U)
+ return false;
+
if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(DeadI)) {
// When shortening an atomic memory intrinsic, the newly shortened
// length must remain an integer multiple of the element size.
@@ -654,7 +674,8 @@ static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
}
static bool tryToShortenEnd(Instruction *DeadI, OverlapIntervalsTy &IntervalMap,
- int64_t &DeadStart, uint64_t &DeadSize) {
+ int64_t &DeadStart, uint64_t &DeadSize,
+ const TargetTransformInfo &TTI) {
if (IntervalMap.empty() || !isShortenableAtTheEnd(DeadI))
return false;
@@ -672,7 +693,7 @@ static bool tryToShortenEnd(Instruction *DeadI, OverlapIntervalsTy &IntervalMap,
// be non negative due to preceding checks.
KillingSize >= DeadSize - (uint64_t)(KillingStart - DeadStart)) {
if (tryToShorten(DeadI, DeadStart, DeadSize, KillingStart, KillingSize,
- true)) {
+ true, TTI)) {
IntervalMap.erase(OII);
return true;
}
@@ -682,7 +703,8 @@ static bool tryToShortenEnd(Instruction *DeadI, OverlapIntervalsTy &IntervalMap,
static bool tryToShortenBegin(Instruction *DeadI,
OverlapIntervalsTy &IntervalMap,
- int64_t &DeadStart, uint64_t &DeadSize) {
+ int64_t &DeadStart, uint64_t &DeadSize,
+ const TargetTransformInfo &TTI) {
if (IntervalMap.empty() || !isShortenableAtTheBeginning(DeadI))
return false;
@@ -701,7 +723,7 @@ static bool tryToShortenBegin(Instruction *DeadI,
assert(KillingSize - (uint64_t)(DeadStart - KillingStart) < DeadSize &&
"Should have been handled as OW_Complete");
if (tryToShorten(DeadI, DeadStart, DeadSize, KillingStart, KillingSize,
- false)) {
+ false, TTI)) {
IntervalMap.erase(OII);
return true;
}
@@ -824,6 +846,7 @@ struct DSEState {
DominatorTree &DT;
PostDominatorTree &PDT;
const TargetLibraryInfo &TLI;
+ const TargetTransformInfo &TTI;
const DataLayout &DL;
const LoopInfo &LI;
@@ -868,9 +891,9 @@ struct DSEState {
DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
PostDominatorTree &PDT, const TargetLibraryInfo &TLI,
- const LoopInfo &LI)
+ const TargetTransformInfo &TTI, const LoopInfo &LI)
: F(F), AA(AA), EI(DT, &LI), BatchAA(AA, &EI), MSSA(MSSA), DT(DT),
- PDT(PDT), TLI(TLI), DL(F.getDataLayout()), LI(LI) {
+ PDT(PDT), TLI(TLI), TTI(TTI), DL(F.getDataLayout()), LI(LI) {
// Collect blocks with throwing instructions not modeled in MemorySSA and
// alloc-like objects.
unsigned PO = 0;
@@ -2066,10 +2089,10 @@ struct DSEState {
uint64_t DeadSize = Loc.Size.getValue();
GetPointerBaseWithConstantOffset(Ptr, DeadStart, DL);
OverlapIntervalsTy &IntervalMap = OI.second;
- Changed |= tryToShortenEnd(DeadI, IntervalMap, DeadStart, DeadSize);
+ Changed |= tryToShortenEnd(DeadI, IntervalMap, DeadStart, DeadSize, TTI);
if (IntervalMap.empty())
continue;
- Changed |= tryToShortenBegin(DeadI, IntervalMap, DeadStart, DeadSize);
+ Changed |= tryToShortenBegin(DeadI, IntervalMap, DeadStart, DeadSize, TTI);
}
return Changed;
}
@@ -2137,10 +2160,11 @@ struct DSEState {
static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
DominatorTree &DT, PostDominatorTree &PDT,
const TargetLibraryInfo &TLI,
+ const TargetTransformInfo &TTI,
const LoopInfo &LI) {
bool MadeChange = false;
- DSEState State(F, AA, MSSA, DT, PDT, TLI, LI);
+ DSEState State(F, AA, MSSA, DT, PDT, TLI, TTI, LI);
// For each store:
for (unsigned I = 0; I < State.MemDefs.size(); I++) {
MemoryDef *KillingDef = State.MemDefs[I];
@@ -2332,12 +2356,13 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
AliasAnalysis &AA = AM.getResult<AAManager>(F);
const TargetLibraryInfo &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
- bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI);
+ bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, TTI, LI);
#ifdef LLVM_ENABLE_STATS
if (AreStatisticsEnabled())
diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/dse/dse-after-memcpyopt-merge.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/dse/dse-after-memcpyopt-merge.ll
index 2c26cb8c84c7bd..afb30e07889ee9 100644
--- a/llvm/test/DebugInfo/Generic/assignment-tracking/dse/dse-after-memcpyopt-merge.ll
+++ b/llvm/test/DebugInfo/Generic/assignment-tracking/dse/dse-after-memcpyopt-merge.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt %s -S -passes=dse -o - | FileCheck %s --implicit-check-not="call void @llvm.dbg"
; RUN: opt --try-experimental-debuginfo-iterators %s -S -passes=dse -o - | FileCheck %s --implicit-check-not="call void @llvm.dbg"
@@ -15,25 +16,35 @@
;; fragment of the shortened store.
;;
; CHECK: #dbg_assign({{.*}}, ptr %g, !DIExpression(),
-; CHECK: #dbg_assign(float 0.000000e+00, ![[#]], !DIExpression(DW_OP_LLVM_fragment, 64, 32), ![[ID:[0-9]+]], ptr %arrayidx.i, !DIExpression(),
-; CHECK: #dbg_assign(float 0.000000e+00, ![[#]], !DIExpression(DW_OP_LLVM_fragment, 32, 32), ![[ID]], ptr %arrayidx3.i, !DIExpression(),
-; CHECK: #dbg_assign(float 0.000000e+00, ![[#]], !DIExpression(DW_OP_LLVM_fragment, 0, 32), ![[UniqueID1:[0-9]+]], ptr poison, !DIExpression(),
-; CHECK: #dbg_assign(float 0.000000e+00, ![[#]], !DIExpression(DW_OP_LLVM_fragment, 96, 32), ![[UniqueID2:[0-9]+]], ptr poison, !DIExpression(),
-; CHECK: call void @llvm.memset{{.*}}, !DIAssignID ![[ID]]
-; CHECK-DAG: ![[ID]] = distinct !DIAssignID()
-; CHECK-DAG: ![[UniqueID1]] = distinct !DIAssignID()
-; CHECK-DAG: ![[UniqueID2]] = distinct !DIAssignID()
%struct.v = type { [4 x float] }
$_ZN1vC2Ef = comdat any
define dso_local void @_Z1fv() local_unnamed_addr !dbg !7 {
+; CHECK-LABEL: define dso_local void @_Z1fv(
+; CHECK-SAME: ) local_unnamed_addr !dbg [[DBG8:![0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[G:%.*]] = alloca [[STRUCT_V:%.*]], align 4, !DIAssignID [[DIASSIGNID24:![0-9]+]]
+; CHECK-NEXT: #dbg_assign(i1 poison, [[META12:![0-9]+]], !DIExpression(), [[DIASSIGNID24]], ptr [[G]], !DIExpression(), [[META25:![0-9]+]])
+; CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [[STRUCT_V]], ptr [[G]], i64 0, i32 0, i64 2, !dbg [[DBG26:![0-9]+]]
+; CHECK-NEXT: #dbg_assign(float 0.000000e+00, [[META12]], !DIExpression(DW_OP_LLVM_fragment, 64, 32), [[META34:![0-9]+]], ptr [[ARRAYIDX_I]], !DIExpression(), [[META25]])
+; CHECK-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds [[STRUCT_V]], ptr [[G]], i64 0, i32 0, i64 1, !dbg [[DBG35:![0-9]+]]
+; CHECK-NEXT: #dbg_assign(float 0.000000e+00, [[META12]], !DIExpression(DW_OP_LLVM_fragment, 32, 32), [[META34]], ptr [[ARRAYIDX3_I]], !DIExpression(), [[META25]])
+; CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds [[STRUCT_V]], ptr [[G]], i64 0, i32 0, i64 0, !dbg [[DBG36:![0-9]+]]
+; CHECK-NEXT: #dbg_assign(float 0.000000e+00, [[META12]], !DIExpression(DW_OP_LLVM_fragment, 0, 32), [[META34]], ptr [[ARRAYIDX5_I]], !DIExpression(), [[META25]])
+; CHECK-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds [[STRUCT_V]], ptr [[G]], i64 0, i32 0, i64 3, !dbg [[DBG37:![0-9]+]]
+; CHECK-NEXT: #dbg_assign(float 0.000000e+00, [[META12]], !DIExpression(DW_OP_LLVM_fragment, 96, 32), [[META34]], ptr [[ARRAYIDX7_I]], !DIExpression(), [[META25]])
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast ptr [[ARRAYIDX5_I]] to ptr, !dbg [[DBG38:![0-9]+]]
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 16, i1 false), !dbg [[DBG39:![0-9]+]], !DIAssignID [[META34]]
+; CHECK-NEXT: call void @_Z3escP1v(ptr nonnull [[G]]), !dbg [[DBG38]]
+; CHECK-NEXT: ret void, !dbg [[DBG40:![0-9]+]]
+;
entry:
%g = alloca %struct.v, align 4, !DIAssignID !23
call void @llvm.dbg.assign(metadata i1 poison, metadata !11, metadata !DIExpression(), metadata !23, metadata ptr %g, metadata !DIExpression()), !dbg !24
- %arrayidx.i = getelementptr inbounds %struct.v, ptr %g, i64 0, i32 0, i64 2, !dbg !37
+ %arrayidx.i = getelementptr inbounds %struct.v, ptr %g, i64 0, i32 0, i64 2, !dbg !37
call void @llvm.dbg.assign(metadata float 0.000000e+00, metadata !11, metadata !DIExpression(DW_OP_LLVM_fragment, 64, 32), metadata !39, metadata ptr %arrayidx.i, metadata !DIExpression()), !dbg !24
%arrayidx3.i = getelementptr inbounds %struct.v, ptr %g, i64 0, i32 0, i64 1, !dbg !40
call void @llvm.dbg.assign(metadata float 0.000000e+00, metadata !11, metadata !DIExpression(DW_OP_LLVM_fragment, 32, 32), metadata !39, metadata ptr %arrayidx3.i, metadata !DIExpression()), !dbg !24
@@ -125,3 +136,41 @@ declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
!65 = !DISubroutineType(types: !66)
!66 = !{null, !30}
!1000 = !{i32 7, !"debug-info-assignment-tracking", i1 true}
+;.
+; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META2:![0-9]+]], splitDebugInlining: false, nameTableKind: None)
+; CHECK: [[META1]] = !DIFile(filename: "reduce.cpp", directory: {{.*}})
+; CHECK: [[META2]] = !{}
+; CHECK: [[DBG8]] = distinct !DISubprogram(name: "f", linkageName: "_Z1fv", scope: [[META1]], file: [[META1]], line: 12, type: [[META9:![0-9]+]], scopeLine: 12, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META11:![0-9]+]])
+; CHECK: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]])
+; CHECK: [[META10]] = !{null}
+; CHECK: [[META11]] = !{[[META12]]}
+; CHECK: [[META12]] = !DILocalVariable(name: "g", scope: [[DBG8]], file: [[META1]], line: 13, type: [[META13:![0-9]+]])
+; CHECK: [[META13]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "v", file: [[META1]], line: 1, size: 128, flags: DIFlagTypePassByValue | DIFlagNonTrivial, elements: [[META14:![0-9]+]], identifier: "_ZTS1v")
+; CHECK: [[META14]] = !{[[META15:![0-9]+]], [[META20:![0-9]+]]}
+; CHECK: [[META15]] = !DIDerivedType(tag: DW_TAG_member, name: "c", scope: [[META13]], file: [[META1]], line: 2, baseType: [[META16:![0-9]+]], size: 128)
+; CHECK: [[META16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: [[META17:![0-9]+]], size: 128, elements: [[META18:![0-9]+]])
+; CHECK: [[META17]] = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
+; CHECK: [[META18]] = !{[[META19:![0-9]+]]}
+; CHECK: [[META19]] = !DISubrange(count: 4)
+; CHECK: [[META20]] = !DISubprogram(name: "v", scope: [[META13]], file: [[META1]], line: 4, type: [[META21:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+; CHECK: [[META21]] = !DISubroutineType(types: [[META22:![0-9]+]])
+; CHECK: [[META22]] = !{null, [[META23:![0-9]+]], [[META17]]}
+; CHECK: [[META23]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[META13]], size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+; CHECK: [[DIASSIGNID24]] = distinct !DIAssignID()
+; CHECK: [[META25]] = !DILocation(line: 0, scope: [[DBG8]])
+; CHECK: [[DBG26]] = !DILocation(line: 5, column: 19, scope: [[META27:![0-9]+]], inlinedAt: [[META33:![0-9]+]])
+; CHECK: [[META27]] = distinct !DILexicalBlock(scope: [[META28:![0-9]+]], file: [[META1]], line: 4, column: 14)
+; CHECK: [[META28]] = distinct !DISubprogram(name: "v", linkageName: "_ZN1vC2Ef", scope: [[META13]], file: [[META1]], line: 4, type: [[META21]], scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], declaration: [[META20]], retainedNodes: [[META29:![0-9]+]])
+; CHECK: [[META29]] = !{[[META30:![0-9]+]], [[META32:![0-9]+]]}
+; CHECK: [[META30]] = !DILocalVariable(name: "this", arg: 1, scope: [[META28]], type: [[META31:![0-9]+]], flags: DIFlagArtificial | DIFlagObjectPointer)
+; CHECK: [[META31]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[META13]], size: 64)
+; CHECK: [[META32]] = !DILocalVariable(name: "d", arg: 2, scope: [[META28]], file: [[META1]], line: 4, type: [[META17]])
+; CHECK: [[META33]] = distinct !DILocation(line: 13, column: 5, scope: [[DBG8]])
+; CHECK: [[META34]] = distinct !DIAssignID()
+; CHECK: [[DBG35]] = !DILocation(line: 5, column: 12, scope: [[META27]], inlinedAt: [[META33]])
+; CHECK: [[DBG36]] = !DILocation(line: 5, column: 5, scope: [[META27]], inlinedAt: [[META33]])
+; CHECK: [[DBG37]] = !DILocation(line: 6, column: 5, scope: [[META27]], inlinedAt: [[META33]])
+; CHECK: [[DBG38]] = !DILocation(line: 14, column: 3, scope: [[DBG8]])
+; CHECK: [[DBG39]] = !DILocation(line: 5, column: 17, scope: [[META27]], inlinedAt: [[META33]])
+; CHECK: [[DBG40]] = !DILocation(line: 15, column: 1, scope: [[DBG8]])
+;.
diff --git a/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll b/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll
index bc1756f6ca9d1b..893084df00f58e 100644
--- a/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll
@@ -23,8 +23,8 @@ define void @write4to7_weird_element_type(ptr nocapture %p) {
; CHECK-LABEL: @write4to7_weird_element_type(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX0]], i64 4
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP1]], i8 0, i64 24, i1 false)
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX0]], i64 4
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 24, i1 false)
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 1
; CHECK-NEXT: store i32 1, ptr [[ARRAYIDX1]], align 4
; CHECK-NEXT: ret void
@@ -119,8 +119,7 @@ entry:
define void @write0to7(ptr nocapture %p) {
; CHECK-LABEL: @write0to7(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 8
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 24, i1 false)
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[P:%.*]], i8 0, i64 32, i1 false)
; CHECK-NEXT: store i64 1, ptr [[P]], align 8
; CHECK-NEXT: ret void
;
@@ -135,8 +134,7 @@ entry:
define void @write0to7_atomic(ptr nocapture %p) {
; CHECK-LABEL: @write0to7_atomic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 8
-; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 24, i32 4)
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 [[P:%.*]], i8 0, i64 32, i32 4)
; CHECK-NEXT: store atomic i64 1, ptr [[P]] unordered, align 8
; CHECK-NEXT: ret void
;
@@ -236,8 +234,7 @@ define void @write2to10(ptr nocapture %p) {
; CHECK-LABEL: @write2to10(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX0]], i64 4
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 28, i1 false)
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ARRAYIDX0]], i8 0, i64 32, i1 false)
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1
; CHECK-NEXT: store i64 1, ptr [[ARRAYIDX2]], align 8
; CHECK-NEXT: ret void
@@ -254,8 +251,7 @@ define void @write2to10_atomic(ptr nocapture %p) {
; CHECK-LABEL: @write2to10_atomic(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX0]], i64 4
-; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 28, i32 4)
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 [[ARRAYIDX0]], i8 0, i64 32, i32 4)
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1
; CHECK-NEXT: store atomic i64 1, ptr [[ARRAYIDX2]] unordered, align 8
; CHECK-NEXT: ret void
@@ -360,8 +356,7 @@ define void @ow_begin_align1(ptr nocapture %p) {
; CHECK-LABEL: @ow_begin_align1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 7
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP0]], i8 0, i64 25, i1 false)
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[P1]], i8 0, i64 32, i1 false)
; CHECK-NEXT: store i64 1, ptr [[P]], align 1
; CHECK-NEXT: ret void
;
@@ -376,8 +371,7 @@ define void @ow_end_align4(ptr nocapture %p) {
; CHECK-LABEL: @ow_end_align4(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 4
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 28, i1 false)
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[P1]], i8 0, i64 32, i1 f...
[truncated]
|
@llvm/pr-subscribers-llvm-transforms Author: None (goldsteinn) Changes
Currently for the following snippet: However, its typically better to emit this with a full 8 byte This is because a 8-byte memcpy can be lowered with a single Similiar is true for This patch changes the behavior to check if we are liable to be Patch is 23.11 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/106425.diff 4 Files Affected:
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 992139a95a43d3..9dd1822b3457f6 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -48,6 +48,7 @@
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
@@ -560,7 +561,8 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest,
static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
uint64_t &DeadSize, int64_t KillingStart,
- uint64_t KillingSize, bool IsOverwriteEnd) {
+ uint64_t KillingSize, bool IsOverwriteEnd,
+ const TargetTransformInfo &TTI) {
auto *DeadIntrinsic = cast<AnyMemIntrinsic>(DeadI);
Align PrefAlign = DeadIntrinsic->getDestAlign().valueOrOne();
@@ -612,6 +614,24 @@ static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
assert(DeadSize > ToRemoveSize && "Can't remove more than original size");
uint64_t NewSize = DeadSize - ToRemoveSize;
+
+ // Check that we aren't going to pessimize codegen by lowering the length. I.e
+ // a memcpy(dst, src, 8) is more efficient than memcpy(dst, src, 7).
+ // These checks are relatively conservative. We bail out if:
+ // 1) We are removing less than 1 store (measured by targets load/store Vec
+ // width).
+ // 2) We are saving a load/store (assuming loads/stores occur per pow2 block)
+ // 3) We aren't preventing this from going below inline thresh
+ // 4) We are shrinking by less than half of the initial size.
+ uint64_t PrefVecWidth =
+ TTI.getLoadStoreVecRegBitWidth(DeadIntrinsic->getDestAddressSpace()) / 8U;
+ uint64_t InlineThresh = TTI.getMaxMemIntrinsicInlineSizeThreshold();
+ if (ToRemoveSize < PrefVecWidth &&
+ popcount(DeadSize) < popcount(DeadSize - ToRemoveSize) &&
+ (DeadSize <= InlineThresh) == (DeadSize - ToRemoveSize <= InlineThresh) &&
+ ToRemoveSize < DeadSize / 2U)
+ return false;
+
if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(DeadI)) {
// When shortening an atomic memory intrinsic, the newly shortened
// length must remain an integer multiple of the element size.
@@ -654,7 +674,8 @@ static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
}
static bool tryToShortenEnd(Instruction *DeadI, OverlapIntervalsTy &IntervalMap,
- int64_t &DeadStart, uint64_t &DeadSize) {
+ int64_t &DeadStart, uint64_t &DeadSize,
+ const TargetTransformInfo &TTI) {
if (IntervalMap.empty() || !isShortenableAtTheEnd(DeadI))
return false;
@@ -672,7 +693,7 @@ static bool tryToShortenEnd(Instruction *DeadI, OverlapIntervalsTy &IntervalMap,
// be non negative due to preceding checks.
KillingSize >= DeadSize - (uint64_t)(KillingStart - DeadStart)) {
if (tryToShorten(DeadI, DeadStart, DeadSize, KillingStart, KillingSize,
- true)) {
+ true, TTI)) {
IntervalMap.erase(OII);
return true;
}
@@ -682,7 +703,8 @@ static bool tryToShortenEnd(Instruction *DeadI, OverlapIntervalsTy &IntervalMap,
static bool tryToShortenBegin(Instruction *DeadI,
OverlapIntervalsTy &IntervalMap,
- int64_t &DeadStart, uint64_t &DeadSize) {
+ int64_t &DeadStart, uint64_t &DeadSize,
+ const TargetTransformInfo &TTI) {
if (IntervalMap.empty() || !isShortenableAtTheBeginning(DeadI))
return false;
@@ -701,7 +723,7 @@ static bool tryToShortenBegin(Instruction *DeadI,
assert(KillingSize - (uint64_t)(DeadStart - KillingStart) < DeadSize &&
"Should have been handled as OW_Complete");
if (tryToShorten(DeadI, DeadStart, DeadSize, KillingStart, KillingSize,
- false)) {
+ false, TTI)) {
IntervalMap.erase(OII);
return true;
}
@@ -824,6 +846,7 @@ struct DSEState {
DominatorTree &DT;
PostDominatorTree &PDT;
const TargetLibraryInfo &TLI;
+ const TargetTransformInfo &TTI;
const DataLayout &DL;
const LoopInfo &LI;
@@ -868,9 +891,9 @@ struct DSEState {
DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
PostDominatorTree &PDT, const TargetLibraryInfo &TLI,
- const LoopInfo &LI)
+ const TargetTransformInfo &TTI, const LoopInfo &LI)
: F(F), AA(AA), EI(DT, &LI), BatchAA(AA, &EI), MSSA(MSSA), DT(DT),
- PDT(PDT), TLI(TLI), DL(F.getDataLayout()), LI(LI) {
+ PDT(PDT), TLI(TLI), TTI(TTI), DL(F.getDataLayout()), LI(LI) {
// Collect blocks with throwing instructions not modeled in MemorySSA and
// alloc-like objects.
unsigned PO = 0;
@@ -2066,10 +2089,10 @@ struct DSEState {
uint64_t DeadSize = Loc.Size.getValue();
GetPointerBaseWithConstantOffset(Ptr, DeadStart, DL);
OverlapIntervalsTy &IntervalMap = OI.second;
- Changed |= tryToShortenEnd(DeadI, IntervalMap, DeadStart, DeadSize);
+ Changed |= tryToShortenEnd(DeadI, IntervalMap, DeadStart, DeadSize, TTI);
if (IntervalMap.empty())
continue;
- Changed |= tryToShortenBegin(DeadI, IntervalMap, DeadStart, DeadSize);
+ Changed |= tryToShortenBegin(DeadI, IntervalMap, DeadStart, DeadSize, TTI);
}
return Changed;
}
@@ -2137,10 +2160,11 @@ struct DSEState {
static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
DominatorTree &DT, PostDominatorTree &PDT,
const TargetLibraryInfo &TLI,
+ const TargetTransformInfo &TTI,
const LoopInfo &LI) {
bool MadeChange = false;
- DSEState State(F, AA, MSSA, DT, PDT, TLI, LI);
+ DSEState State(F, AA, MSSA, DT, PDT, TLI, TTI, LI);
// For each store:
for (unsigned I = 0; I < State.MemDefs.size(); I++) {
MemoryDef *KillingDef = State.MemDefs[I];
@@ -2332,12 +2356,13 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
AliasAnalysis &AA = AM.getResult<AAManager>(F);
const TargetLibraryInfo &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
- bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI);
+ bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, TTI, LI);
#ifdef LLVM_ENABLE_STATS
if (AreStatisticsEnabled())
diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/dse/dse-after-memcpyopt-merge.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/dse/dse-after-memcpyopt-merge.ll
index 2c26cb8c84c7bd..afb30e07889ee9 100644
--- a/llvm/test/DebugInfo/Generic/assignment-tracking/dse/dse-after-memcpyopt-merge.ll
+++ b/llvm/test/DebugInfo/Generic/assignment-tracking/dse/dse-after-memcpyopt-merge.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt %s -S -passes=dse -o - | FileCheck %s --implicit-check-not="call void @llvm.dbg"
; RUN: opt --try-experimental-debuginfo-iterators %s -S -passes=dse -o - | FileCheck %s --implicit-check-not="call void @llvm.dbg"
@@ -15,25 +16,35 @@
;; fragment of the shortened store.
;;
; CHECK: #dbg_assign({{.*}}, ptr %g, !DIExpression(),
-; CHECK: #dbg_assign(float 0.000000e+00, ![[#]], !DIExpression(DW_OP_LLVM_fragment, 64, 32), ![[ID:[0-9]+]], ptr %arrayidx.i, !DIExpression(),
-; CHECK: #dbg_assign(float 0.000000e+00, ![[#]], !DIExpression(DW_OP_LLVM_fragment, 32, 32), ![[ID]], ptr %arrayidx3.i, !DIExpression(),
-; CHECK: #dbg_assign(float 0.000000e+00, ![[#]], !DIExpression(DW_OP_LLVM_fragment, 0, 32), ![[UniqueID1:[0-9]+]], ptr poison, !DIExpression(),
-; CHECK: #dbg_assign(float 0.000000e+00, ![[#]], !DIExpression(DW_OP_LLVM_fragment, 96, 32), ![[UniqueID2:[0-9]+]], ptr poison, !DIExpression(),
-; CHECK: call void @llvm.memset{{.*}}, !DIAssignID ![[ID]]
-; CHECK-DAG: ![[ID]] = distinct !DIAssignID()
-; CHECK-DAG: ![[UniqueID1]] = distinct !DIAssignID()
-; CHECK-DAG: ![[UniqueID2]] = distinct !DIAssignID()
%struct.v = type { [4 x float] }
$_ZN1vC2Ef = comdat any
define dso_local void @_Z1fv() local_unnamed_addr !dbg !7 {
+; CHECK-LABEL: define dso_local void @_Z1fv(
+; CHECK-SAME: ) local_unnamed_addr !dbg [[DBG8:![0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[G:%.*]] = alloca [[STRUCT_V:%.*]], align 4, !DIAssignID [[DIASSIGNID24:![0-9]+]]
+; CHECK-NEXT: #dbg_assign(i1 poison, [[META12:![0-9]+]], !DIExpression(), [[DIASSIGNID24]], ptr [[G]], !DIExpression(), [[META25:![0-9]+]])
+; CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [[STRUCT_V]], ptr [[G]], i64 0, i32 0, i64 2, !dbg [[DBG26:![0-9]+]]
+; CHECK-NEXT: #dbg_assign(float 0.000000e+00, [[META12]], !DIExpression(DW_OP_LLVM_fragment, 64, 32), [[META34:![0-9]+]], ptr [[ARRAYIDX_I]], !DIExpression(), [[META25]])
+; CHECK-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds [[STRUCT_V]], ptr [[G]], i64 0, i32 0, i64 1, !dbg [[DBG35:![0-9]+]]
+; CHECK-NEXT: #dbg_assign(float 0.000000e+00, [[META12]], !DIExpression(DW_OP_LLVM_fragment, 32, 32), [[META34]], ptr [[ARRAYIDX3_I]], !DIExpression(), [[META25]])
+; CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds [[STRUCT_V]], ptr [[G]], i64 0, i32 0, i64 0, !dbg [[DBG36:![0-9]+]]
+; CHECK-NEXT: #dbg_assign(float 0.000000e+00, [[META12]], !DIExpression(DW_OP_LLVM_fragment, 0, 32), [[META34]], ptr [[ARRAYIDX5_I]], !DIExpression(), [[META25]])
+; CHECK-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds [[STRUCT_V]], ptr [[G]], i64 0, i32 0, i64 3, !dbg [[DBG37:![0-9]+]]
+; CHECK-NEXT: #dbg_assign(float 0.000000e+00, [[META12]], !DIExpression(DW_OP_LLVM_fragment, 96, 32), [[META34]], ptr [[ARRAYIDX7_I]], !DIExpression(), [[META25]])
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast ptr [[ARRAYIDX5_I]] to ptr, !dbg [[DBG38:![0-9]+]]
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 16, i1 false), !dbg [[DBG39:![0-9]+]], !DIAssignID [[META34]]
+; CHECK-NEXT: call void @_Z3escP1v(ptr nonnull [[G]]), !dbg [[DBG38]]
+; CHECK-NEXT: ret void, !dbg [[DBG40:![0-9]+]]
+;
entry:
%g = alloca %struct.v, align 4, !DIAssignID !23
call void @llvm.dbg.assign(metadata i1 poison, metadata !11, metadata !DIExpression(), metadata !23, metadata ptr %g, metadata !DIExpression()), !dbg !24
- %arrayidx.i = getelementptr inbounds %struct.v, ptr %g, i64 0, i32 0, i64 2, !dbg !37
+ %arrayidx.i = getelementptr inbounds %struct.v, ptr %g, i64 0, i32 0, i64 2, !dbg !37
call void @llvm.dbg.assign(metadata float 0.000000e+00, metadata !11, metadata !DIExpression(DW_OP_LLVM_fragment, 64, 32), metadata !39, metadata ptr %arrayidx.i, metadata !DIExpression()), !dbg !24
%arrayidx3.i = getelementptr inbounds %struct.v, ptr %g, i64 0, i32 0, i64 1, !dbg !40
call void @llvm.dbg.assign(metadata float 0.000000e+00, metadata !11, metadata !DIExpression(DW_OP_LLVM_fragment, 32, 32), metadata !39, metadata ptr %arrayidx3.i, metadata !DIExpression()), !dbg !24
@@ -125,3 +136,41 @@ declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
!65 = !DISubroutineType(types: !66)
!66 = !{null, !30}
!1000 = !{i32 7, !"debug-info-assignment-tracking", i1 true}
+;.
+; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META2:![0-9]+]], splitDebugInlining: false, nameTableKind: None)
+; CHECK: [[META1]] = !DIFile(filename: "reduce.cpp", directory: {{.*}})
+; CHECK: [[META2]] = !{}
+; CHECK: [[DBG8]] = distinct !DISubprogram(name: "f", linkageName: "_Z1fv", scope: [[META1]], file: [[META1]], line: 12, type: [[META9:![0-9]+]], scopeLine: 12, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META11:![0-9]+]])
+; CHECK: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]])
+; CHECK: [[META10]] = !{null}
+; CHECK: [[META11]] = !{[[META12]]}
+; CHECK: [[META12]] = !DILocalVariable(name: "g", scope: [[DBG8]], file: [[META1]], line: 13, type: [[META13:![0-9]+]])
+; CHECK: [[META13]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "v", file: [[META1]], line: 1, size: 128, flags: DIFlagTypePassByValue | DIFlagNonTrivial, elements: [[META14:![0-9]+]], identifier: "_ZTS1v")
+; CHECK: [[META14]] = !{[[META15:![0-9]+]], [[META20:![0-9]+]]}
+; CHECK: [[META15]] = !DIDerivedType(tag: DW_TAG_member, name: "c", scope: [[META13]], file: [[META1]], line: 2, baseType: [[META16:![0-9]+]], size: 128)
+; CHECK: [[META16]] = !DICompositeType(tag: DW_TAG_array_type, baseType: [[META17:![0-9]+]], size: 128, elements: [[META18:![0-9]+]])
+; CHECK: [[META17]] = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
+; CHECK: [[META18]] = !{[[META19:![0-9]+]]}
+; CHECK: [[META19]] = !DISubrange(count: 4)
+; CHECK: [[META20]] = !DISubprogram(name: "v", scope: [[META13]], file: [[META1]], line: 4, type: [[META21:![0-9]+]], scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+; CHECK: [[META21]] = !DISubroutineType(types: [[META22:![0-9]+]])
+; CHECK: [[META22]] = !{null, [[META23:![0-9]+]], [[META17]]}
+; CHECK: [[META23]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[META13]], size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+; CHECK: [[DIASSIGNID24]] = distinct !DIAssignID()
+; CHECK: [[META25]] = !DILocation(line: 0, scope: [[DBG8]])
+; CHECK: [[DBG26]] = !DILocation(line: 5, column: 19, scope: [[META27:![0-9]+]], inlinedAt: [[META33:![0-9]+]])
+; CHECK: [[META27]] = distinct !DILexicalBlock(scope: [[META28:![0-9]+]], file: [[META1]], line: 4, column: 14)
+; CHECK: [[META28]] = distinct !DISubprogram(name: "v", linkageName: "_ZN1vC2Ef", scope: [[META13]], file: [[META1]], line: 4, type: [[META21]], scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], declaration: [[META20]], retainedNodes: [[META29:![0-9]+]])
+; CHECK: [[META29]] = !{[[META30:![0-9]+]], [[META32:![0-9]+]]}
+; CHECK: [[META30]] = !DILocalVariable(name: "this", arg: 1, scope: [[META28]], type: [[META31:![0-9]+]], flags: DIFlagArtificial | DIFlagObjectPointer)
+; CHECK: [[META31]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[META13]], size: 64)
+; CHECK: [[META32]] = !DILocalVariable(name: "d", arg: 2, scope: [[META28]], file: [[META1]], line: 4, type: [[META17]])
+; CHECK: [[META33]] = distinct !DILocation(line: 13, column: 5, scope: [[DBG8]])
+; CHECK: [[META34]] = distinct !DIAssignID()
+; CHECK: [[DBG35]] = !DILocation(line: 5, column: 12, scope: [[META27]], inlinedAt: [[META33]])
+; CHECK: [[DBG36]] = !DILocation(line: 5, column: 5, scope: [[META27]], inlinedAt: [[META33]])
+; CHECK: [[DBG37]] = !DILocation(line: 6, column: 5, scope: [[META27]], inlinedAt: [[META33]])
+; CHECK: [[DBG38]] = !DILocation(line: 14, column: 3, scope: [[DBG8]])
+; CHECK: [[DBG39]] = !DILocation(line: 5, column: 17, scope: [[META27]], inlinedAt: [[META33]])
+; CHECK: [[DBG40]] = !DILocation(line: 15, column: 1, scope: [[DBG8]])
+;.
diff --git a/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll b/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll
index bc1756f6ca9d1b..893084df00f58e 100644
--- a/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll
@@ -23,8 +23,8 @@ define void @write4to7_weird_element_type(ptr nocapture %p) {
; CHECK-LABEL: @write4to7_weird_element_type(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX0]], i64 4
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP1]], i8 0, i64 24, i1 false)
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX0]], i64 4
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 24, i1 false)
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 1
; CHECK-NEXT: store i32 1, ptr [[ARRAYIDX1]], align 4
; CHECK-NEXT: ret void
@@ -119,8 +119,7 @@ entry:
define void @write0to7(ptr nocapture %p) {
; CHECK-LABEL: @write0to7(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 8
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 24, i1 false)
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[P:%.*]], i8 0, i64 32, i1 false)
; CHECK-NEXT: store i64 1, ptr [[P]], align 8
; CHECK-NEXT: ret void
;
@@ -135,8 +134,7 @@ entry:
define void @write0to7_atomic(ptr nocapture %p) {
; CHECK-LABEL: @write0to7_atomic(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 8
-; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 24, i32 4)
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 [[P:%.*]], i8 0, i64 32, i32 4)
; CHECK-NEXT: store atomic i64 1, ptr [[P]] unordered, align 8
; CHECK-NEXT: ret void
;
@@ -236,8 +234,7 @@ define void @write2to10(ptr nocapture %p) {
; CHECK-LABEL: @write2to10(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX0]], i64 4
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 28, i1 false)
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ARRAYIDX0]], i8 0, i64 32, i1 false)
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1
; CHECK-NEXT: store i64 1, ptr [[ARRAYIDX2]], align 8
; CHECK-NEXT: ret void
@@ -254,8 +251,7 @@ define void @write2to10_atomic(ptr nocapture %p) {
; CHECK-LABEL: @write2to10_atomic(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX0]], i64 4
-; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 28, i32 4)
+; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 [[ARRAYIDX0]], i8 0, i64 32, i32 4)
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1
; CHECK-NEXT: store atomic i64 1, ptr [[ARRAYIDX2]] unordered, align 8
; CHECK-NEXT: ret void
@@ -360,8 +356,7 @@ define void @ow_begin_align1(ptr nocapture %p) {
; CHECK-LABEL: @ow_begin_align1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 7
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP0]], i8 0, i64 25, i1 false)
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[P1]], i8 0, i64 32, i1 false)
; CHECK-NEXT: store i64 1, ptr [[P]], align 1
; CHECK-NEXT: ret void
;
@@ -376,8 +371,7 @@ define void @ow_end_align4(ptr nocapture %p) {
; CHECK-LABEL: @ow_end_align4(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 4
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 28, i1 false)
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[P1]], i8 0, i64 32, i1 f...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the general idea here is reasonable, as this is very hard to undo in the backend.
Some thoughts on the implementation:
- Please make sure existing tests retain their intent, and add new ones for the heuristic.
- I think this should be framed more in terms of rounding up to a desirable size. For example, if we're dropping down from 32 to 15, it would be better to drop down to 16 instead.
- It looks like getLoadStoreVecRegBitWidth defaults to 128 and is not overridden on most targets. In the context of this patch, it will be misleading for any (sub-)targets without vectors. 128 bit is a good default assumption for a target that has vectors, but not for a scalar-only target.
Do you know how to test if a target supports vectors? I don't see any obvious knob. |
Might be worth to try
|
3023278
to
0605e04
Compare
Conflict seems to just be formatting, will rebase shortly. |
Did so, basically if scalar width >= vec width, assumed scalar width for width of loads/stores. |
b870b58
to
b555464
Compare
ping |
ping2 |
llvm/test/DebugInfo/Generic/assignment-tracking/dse/dse-after-memcpyopt-merge.ll
Outdated
Show resolved
Hide resolved
b555464
to
02f455b
Compare
Currently for the following snippet: `memcpy(dst, src, 8); dst[7] = 0` DSE will transform it to: `memcpy(dst, src, 7); dst[7] = 0` Likewise if we have: `memcpy(dst, src, 9); dst[7] = 0; dst[8] = 0` DSE will transform it to: `memcpy(dst, src, 7); dst[7] = 0` However, in both cases we would prefer to emit an 8-byte `memcpy` followed by any overwrite of the trailing byte(s). This patch attempts to optimize the new intrinsic length within the available range of the original size and the maximally shrunk size.
02f455b
to
54d72d1
Compare
Fairly certain the buildkite issue is unrelated (seems to be failing on all PRs). |
Seems like this causes regressions on opt benchmark. Will look into it. |
dse-after-memcpyopt-merge.ll
test; NFCCurrently for the following snippet:
memcpy(dst, src, 8); dst[7] = 0
DSE will transform it to:
memcpy(dst, src, 7); dst[7] = 0
Likewise if we have:
memcpy(dst, src, 9); dst[7] = 0; dst[8] = 0
DSE will transform it to:
memcpy(dst, src, 7); dst[7] = 0
However, in both cases we would prefer to emit an 8-byte
memcpy
followed by any overwrite of the trailing byte(s).
This patch attempts to optimize the new intrinsic length within the
available range of the original size and the maximally shrunk size.