diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index a304f7b056f5f..b673d376f1c31 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -48,6 +48,7 @@
 #include "llvm/Analysis/MustExecute.h"
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Argument.h"
 #include "llvm/IR/BasicBlock.h"
@@ -558,9 +559,10 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest,
   for_each(LinkedDVRAssigns, InsertAssignForOverlap);
 }
 
-static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
-                         uint64_t &DeadSize, int64_t KillingStart,
-                         uint64_t KillingSize, bool IsOverwriteEnd) {
+static bool tryToShorten(Instruction *DeadI, int64_t DeadStart,
+                         uint64_t DeadSize, int64_t KillingStart,
+                         uint64_t KillingSize, bool IsOverwriteEnd,
+                         const TargetTransformInfo &TTI) {
   auto *DeadIntrinsic = cast<AnyMemIntrinsic>(DeadI);
   Align PrefAlign = DeadIntrinsic->getDestAlign().valueOrOne();
 
@@ -583,11 +585,7 @@ static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
   // Compute start and size of the region to remove. Make sure 'PrefAlign' is
   // maintained on the remaining store.
   if (IsOverwriteEnd) {
-    // Calculate required adjustment for 'KillingStart' in order to keep
-    // remaining store size aligned on 'PerfAlign'.
-    uint64_t Off =
-        offsetToAlignment(uint64_t(KillingStart - DeadStart), PrefAlign);
-    ToRemoveStart = KillingStart + Off;
+    ToRemoveStart = KillingStart;
     if (DeadSize <= uint64_t(ToRemoveStart - DeadStart))
       return false;
     ToRemoveSize = DeadSize - uint64_t(ToRemoveStart - DeadStart);
@@ -612,6 +610,108 @@ static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
   assert(DeadSize > ToRemoveSize && "Can't remove more than original size");
 
   uint64_t NewSize = DeadSize - ToRemoveSize;
+
+  // Try to coerce the new memcpy/memset size to a "fast" value. This typically
+  // means some exact multiple of the register width of the loads/stores.
+
+  // If the scalar size >= vector size, assume the target will implement the
+  // memset/memcpy with scalar loads/stores.
+  TypeSize ScalarSize =
+      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_Scalar);
+  TypeSize VecSize =
+      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
+  uint64_t MemUnit = 0;
+  if (ScalarSize >= VecSize)
+    MemUnit = ScalarSize.getFixedValue();
+  // Otherwise assume the memset/memcpy will be lowered with vector registers.
+  else
+    MemUnit =
+        TTI.getLoadStoreVecRegBitWidth(DeadIntrinsic->getDestAddressSpace());
+
+  MemUnit /= 8U;
+
+  // Assume loads/stores are issued in power-of-2 sized blocks, and try to
+  // minimize the number of such blocks, e.g. if we have DeadSize = 15:
+  //   NewSize = 7  -> 8  (4 + 2 + 1) -> (8)
+  //   NewSize = 9  -> 9  (8 + 1)     == (8 + 1)
+  //   NewSize = 11 -> 12 (8 + 2 + 1) -> (8 + 4)
+  uint64_t Upper = DeadSize;
+  uint64_t Lower = NewSize;
+
+  uint64_t RoundLower = MemUnit * (Lower / MemUnit);
+
+  // We have some trailing loads/stores we can try to optimize.
+  if (RoundLower != Lower && Lower != 0 && (RoundLower + MemUnit) != 0) {
+    Upper = std::min(Upper, RoundLower + MemUnit - 1);
+    // Don't bust inlining of the intrinsic by doing this.
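+    // (If the original size can be expanded inline but a rounded-up size
+    // cannot, we would be trading a couple of stores for a library call.)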
+    uint64_t InlineThresh = TTI.getMaxMemIntrinsicInlineSizeThreshold();
+    if (Upper > InlineThresh && Lower <= InlineThresh)
+      Upper = InlineThresh;
+
+    // Replace Lower with the value in the range [Lower, Upper] that has the
+    // minimum popcount (selecting the minimum value as the tiebreaker when
+    // the popcount is the same). The idea is that this requires the minimum
+    // number of loads/stores and, within that, uses the presumably preferable
+    // minimum width.
+
+    // Get the highest bit that differs between Lower - 1 and Upper. Anything
+    // above this bit must be in the new value. Anything below it that's
+    // larger than Lower is fair game.
+    uint64_t Dif = (Lower - 1) ^ Upper;
+    uint64_t HighestBit = 63 - llvm::countl_zero(Dif);
+
+    // Make Lo/Hi masks from the highest differing bit. The Lo mask is used to
+    // find the value we can round up to the minimum power-of-2 chunk; the Hi
+    // mask is preserved.
+    uint64_t HighestP2 = static_cast<uint64_t>(1) << HighestBit;
+    uint64_t LoMask = HighestP2 - 1;
+    uint64_t HiMask = -HighestP2;
+
+    // Minimum power of 2 for the "tail".
+    uint64_t LoVal = Lower & LoMask;
+    if (LoVal)
+      LoVal = llvm::bit_ceil(LoVal);
+    // Preserved high bits to stay in range.
+    uint64_t HiVal = Lower & HiMask;
+    Lower = LoVal | HiVal;
+
+    // If we still have more than one tail store, see if we can just round up
+    // to the next MemUnit.
+    if (llvm::popcount(Lower % MemUnit) > 1 &&
+        DeadSize >= (RoundLower + MemUnit))
+      Lower = RoundLower + MemUnit;
+
+    uint64_t OptimizedNewSize = NewSize;
+    // If we are overwriting the beginning, make sure we don't mess up the
+    // alignment.
+    if (IsOverwriteEnd || isAligned(PrefAlign, DeadSize - Lower)) {
+      OptimizedNewSize = Lower;
+    } else {
+      // Our minimal value isn't properly aligned; see if we can increase the
+      // size of the tail loads/stores.
+      Lower = HiVal | HighestP2;
+      if (isAligned(PrefAlign, DeadSize - Lower))
+        OptimizedNewSize = Lower;
+      // If we can't adjust the size without messing up alignment, see if the
+      // new size is actually preferable.
+      // TODO: We should probably do better here than just giving up.
+      else if ((NewSize <= InlineThresh) == (DeadSize <= InlineThresh) &&
+               llvm::popcount(NewSize) > llvm::popcount(DeadSize) &&
+               DeadSize / MemUnit == NewSize / MemUnit)
+        return false;
+    }
+
+    // Adjust the new starting point for the memset/memcpy.
+    if (OptimizedNewSize != NewSize) {
+      if (!IsOverwriteEnd)
+        ToRemoveSize = DeadSize - OptimizedNewSize;
+      NewSize = OptimizedNewSize;
+    }
+
+    // If our optimal length is the original length, skip the transform.
+    if (NewSize == DeadSize)
+      return false;
+  }
+
   if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(DeadI)) {
     // When shortening an atomic memory intrinsic, the newly shortened
     // length must remain an integer multiple of the element size.
@@ -654,7 +754,8 @@ static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
 }
 
 static bool tryToShortenEnd(Instruction *DeadI, OverlapIntervalsTy &IntervalMap,
-                            int64_t &DeadStart, uint64_t &DeadSize) {
+                            int64_t &DeadStart, uint64_t &DeadSize,
+                            const TargetTransformInfo &TTI) {
   if (IntervalMap.empty() || !isShortenableAtTheEnd(DeadI))
     return false;
 
@@ -672,7 +773,7 @@ static bool tryToShortenEnd(Instruction *DeadI, OverlapIntervalsTy &IntervalMap,
       // be non negative due to preceding checks.
       KillingSize >= DeadSize - (uint64_t)(KillingStart - DeadStart)) {
     if (tryToShorten(DeadI, DeadStart, DeadSize, KillingStart, KillingSize,
-                     true)) {
+                     true, TTI)) {
       IntervalMap.erase(OII);
       return true;
     }
@@ -682,7 +783,8 @@ static bool tryToShortenEnd(Instruction *DeadI, OverlapIntervalsTy &IntervalMap,
 
 static bool tryToShortenBegin(Instruction *DeadI,
                               OverlapIntervalsTy &IntervalMap,
-                              int64_t &DeadStart, uint64_t &DeadSize) {
+                              int64_t &DeadStart, uint64_t &DeadSize,
+                              const TargetTransformInfo &TTI) {
   if (IntervalMap.empty() || !isShortenableAtTheBeginning(DeadI))
     return false;
 
@@ -701,7 +803,7 @@ static bool tryToShortenBegin(Instruction *DeadI,
     assert(KillingSize - (uint64_t)(DeadStart - KillingStart) < DeadSize &&
            "Should have been handled as OW_Complete");
     if (tryToShorten(DeadI, DeadStart, DeadSize, KillingStart, KillingSize,
-                     false)) {
+                     false, TTI)) {
       IntervalMap.erase(OII);
       return true;
     }
@@ -852,6 +954,7 @@ struct DSEState {
   DominatorTree &DT;
   PostDominatorTree &PDT;
   const TargetLibraryInfo &TLI;
+  const TargetTransformInfo &TTI;
   const DataLayout &DL;
   const LoopInfo &LI;
@@ -896,9 +999,9 @@ struct DSEState {
 
   DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
            PostDominatorTree &PDT, const TargetLibraryInfo &TLI,
-           const LoopInfo &LI)
+           const TargetTransformInfo &TTI, const LoopInfo &LI)
       : F(F), AA(AA), EI(DT, &LI), BatchAA(AA, &EI), MSSA(MSSA), DT(DT),
-        PDT(PDT), TLI(TLI), DL(F.getDataLayout()), LI(LI) {
+        PDT(PDT), TLI(TLI), TTI(TTI), DL(F.getDataLayout()), LI(LI) {
     // Collect blocks with throwing instructions not modeled in MemorySSA and
     // alloc-like objects.
     unsigned PO = 0;
@@ -2103,10 +2206,11 @@ struct DSEState {
       uint64_t DeadSize = Loc.Size.getValue();
       GetPointerBaseWithConstantOffset(Ptr, DeadStart, DL);
       OverlapIntervalsTy &IntervalMap = OI.second;
-      Changed |= tryToShortenEnd(DeadI, IntervalMap, DeadStart, DeadSize);
+      Changed |= tryToShortenEnd(DeadI, IntervalMap, DeadStart, DeadSize, TTI);
       if (IntervalMap.empty())
        continue;
-      Changed |= tryToShortenBegin(DeadI, IntervalMap, DeadStart, DeadSize);
+      Changed |=
+          tryToShortenBegin(DeadI, IntervalMap, DeadStart, DeadSize, TTI);
     }
     return Changed;
   }
@@ -2347,9 +2451,10 @@ bool DSEState::eliminateDeadDefs(const MemoryDefWrapper &KillingDefWrapper) {
 static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
                                 DominatorTree &DT, PostDominatorTree &PDT,
                                 const TargetLibraryInfo &TLI,
+                                const TargetTransformInfo &TTI,
                                 const LoopInfo &LI) {
   bool MadeChange = false;
-  DSEState State(F, AA, MSSA, DT, PDT, TLI, LI);
+  DSEState State(F, AA, MSSA, DT, PDT, TLI, TTI, LI);
   // For each store:
   for (unsigned I = 0; I < State.MemDefs.size(); I++) {
     MemoryDef *KillingDef = State.MemDefs[I];
@@ -2383,12 +2488,13 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
 PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
   AliasAnalysis &AA = AM.getResult<AAManager>(F);
   const TargetLibraryInfo &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+  const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
   DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
   MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
   PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
   LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
 
-  bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI);
+  bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, TTI, LI);
 
 #ifdef LLVM_ENABLE_STATS
   if (AreStatisticsEnabled())
diff --git a/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll b/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll
index bc1756f6ca9d1..135b4a18341e9 100644 --- a/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll +++ b/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=dse -S | FileCheck %s +; RUN: opt < %s -passes=dse -S | FileCheck %s --check-prefixes=CHECK,CHECK-MEM4 +; RUN: opt < %s -mtriple=x86_64-unknown-unknown -passes=dse -S | FileCheck %s --check-prefixes=CHECK,CHECK-MEM16 define void @write4to7(ptr nocapture %p) { ; CHECK-LABEL: @write4to7( @@ -23,8 +24,8 @@ define void @write4to7_weird_element_type(ptr nocapture %p) { ; CHECK-LABEL: @write4to7_weird_element_type( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX0]], i64 4 -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP1]], i8 0, i64 24, i1 false) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX0]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 24, i1 false) ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 1 ; CHECK-NEXT: store i32 1, ptr [[ARRAYIDX1]], align 4 ; CHECK-NEXT: ret void @@ -233,14 +234,22 @@ entry: } define void @write2to10(ptr nocapture %p) { -; CHECK-LABEL: @write2to10( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX0]], i64 4 -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 28, i1 false) -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1 -; CHECK-NEXT: store i64 1, ptr [[ARRAYIDX2]], align 8 -; CHECK-NEXT: ret void +; CHECK-MEM4-LABEL: @write2to10( +; CHECK-MEM4-NEXT: entry: +; CHECK-MEM4-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1 +; CHECK-MEM4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX0]], i64 4 +; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 28, i1 false) +; CHECK-MEM4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1 +; CHECK-MEM4-NEXT: store i64 1, ptr [[ARRAYIDX2]], align 8 +; CHECK-MEM4-NEXT: ret void +; +; CHECK-MEM16-LABEL: @write2to10( +; CHECK-MEM16-NEXT: entry: +; CHECK-MEM16-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1 +; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ARRAYIDX0]], i8 0, i64 32, i1 false) +; CHECK-MEM16-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1 +; CHECK-MEM16-NEXT: store i64 1, ptr [[ARRAYIDX2]], align 8 +; CHECK-MEM16-NEXT: ret void ; entry: %arrayidx0 = getelementptr inbounds i32, ptr %p, i64 1 @@ -251,14 +260,22 @@ entry: } define void @write2to10_atomic(ptr nocapture %p) { -; CHECK-LABEL: @write2to10_atomic( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX0]], i64 4 -; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 28, i32 4) -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1 -; CHECK-NEXT: store atomic i64 1, ptr [[ARRAYIDX2]] unordered, align 8 -; CHECK-NEXT: ret void +; CHECK-MEM4-LABEL: @write2to10_atomic( +; CHECK-MEM4-NEXT: entry: +; CHECK-MEM4-NEXT: 
[[ARRAYIDX0:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1 +; CHECK-MEM4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX0]], i64 4 +; CHECK-MEM4-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 28, i32 4) +; CHECK-MEM4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1 +; CHECK-MEM4-NEXT: store atomic i64 1, ptr [[ARRAYIDX2]] unordered, align 8 +; CHECK-MEM4-NEXT: ret void +; +; CHECK-MEM16-LABEL: @write2to10_atomic( +; CHECK-MEM16-NEXT: entry: +; CHECK-MEM16-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1 +; CHECK-MEM16-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 [[ARRAYIDX0]], i8 0, i64 32, i32 4) +; CHECK-MEM16-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1 +; CHECK-MEM16-NEXT: store atomic i64 1, ptr [[ARRAYIDX2]] unordered, align 8 +; CHECK-MEM16-NEXT: ret void ; entry: %arrayidx0 = getelementptr inbounds i32, ptr %p, i64 1 @@ -269,14 +286,23 @@ entry: } define void @write8To15AndThen0To7(ptr nocapture %P) { -; CHECK-LABEL: @write8To15AndThen0To7( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 16 -; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr align 8 [[TMP0]], i8 0, i64 16, i1 false) -; CHECK-NEXT: [[BASE64_1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 1 -; CHECK-NEXT: store i64 1, ptr [[BASE64_1]], align 4 -; CHECK-NEXT: store i64 2, ptr [[P]], align 4 -; CHECK-NEXT: ret void +; CHECK-MEM4-LABEL: @write8To15AndThen0To7( +; CHECK-MEM4-NEXT: entry: +; CHECK-MEM4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 16 +; CHECK-MEM4-NEXT: tail call void @llvm.memset.p0.i64(ptr align 8 [[TMP0]], i8 0, i64 16, i1 false) +; CHECK-MEM4-NEXT: [[BASE64_1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 1 +; CHECK-MEM4-NEXT: store i64 1, ptr [[BASE64_1]], align 4 +; CHECK-MEM4-NEXT: store i64 2, ptr [[P]], align 4 +; CHECK-MEM4-NEXT: ret void +; +; CHECK-MEM16-LABEL: @write8To15AndThen0To7( +; CHECK-MEM16-NEXT: entry: +; CHECK-MEM16-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 16 +; CHECK-MEM16-NEXT: tail call void @llvm.memset.p0.i64(ptr align 8 [[TMP0]], i8 0, i64 16, i1 false) +; CHECK-MEM16-NEXT: [[BASE64_1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 1 +; CHECK-MEM16-NEXT: store i64 1, ptr [[BASE64_1]], align 8 +; CHECK-MEM16-NEXT: store i64 2, ptr [[P]], align 8 +; CHECK-MEM16-NEXT: ret void ; entry: @@ -357,13 +383,20 @@ declare void @llvm.memset.p1.i64(ptr addrspace(1) nocapture, i8, i64, i1) nounwi declare void @llvm.memset.element.unordered.atomic.p0.i64(ptr nocapture, i8, i64, i32) nounwind define void @ow_begin_align1(ptr nocapture %p) { -; CHECK-LABEL: @ow_begin_align1( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 7 -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP0]], i8 0, i64 25, i1 false) -; CHECK-NEXT: store i64 1, ptr [[P]], align 1 -; CHECK-NEXT: ret void +; CHECK-MEM4-LABEL: @ow_begin_align1( +; CHECK-MEM4-NEXT: entry: +; CHECK-MEM4-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1 +; CHECK-MEM4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 7 +; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP0]], i8 0, i64 25, i1 false) +; CHECK-MEM4-NEXT: store i64 1, ptr [[P]], align 1 +; CHECK-MEM4-NEXT: ret void +; +; 
CHECK-MEM16-LABEL: @ow_begin_align1( +; CHECK-MEM16-NEXT: entry: +; CHECK-MEM16-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1 +; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[P1]], i8 0, i64 32, i1 false) +; CHECK-MEM16-NEXT: store i64 1, ptr [[P]], align 1 +; CHECK-MEM16-NEXT: ret void ; entry: %p1 = getelementptr inbounds i8, ptr %p, i64 1 @@ -373,13 +406,20 @@ entry: } define void @ow_end_align4(ptr nocapture %p) { -; CHECK-LABEL: @ow_end_align4( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 4 -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 28, i1 false) -; CHECK-NEXT: store i64 1, ptr [[P]], align 1 -; CHECK-NEXT: ret void +; CHECK-MEM4-LABEL: @ow_end_align4( +; CHECK-MEM4-NEXT: entry: +; CHECK-MEM4-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1 +; CHECK-MEM4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 4 +; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 28, i1 false) +; CHECK-MEM4-NEXT: store i64 1, ptr [[P]], align 1 +; CHECK-MEM4-NEXT: ret void +; +; CHECK-MEM16-LABEL: @ow_end_align4( +; CHECK-MEM16-NEXT: entry: +; CHECK-MEM16-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1 +; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[P1]], i8 0, i64 32, i1 false) +; CHECK-MEM16-NEXT: store i64 1, ptr [[P]], align 1 +; CHECK-MEM16-NEXT: ret void ; entry: %p1 = getelementptr inbounds i8, ptr %p, i64 1 @@ -402,3 +442,162 @@ entry: store i64 1, ptr %p, align 1 ret void } + +define void @memset_optimize_size_lo_33_to_x86_32_generic_28(ptr %p) { +; CHECK-MEM4-LABEL: @memset_optimize_size_lo_33_to_x86_32_generic_28( +; CHECK-MEM4-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 3 +; CHECK-MEM4-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0 +; CHECK-MEM4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 5 +; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP1]], i8 0, i64 28, i1 false) +; CHECK-MEM4-NEXT: store i64 0, ptr [[P1]], align 1 +; CHECK-MEM4-NEXT: ret void +; +; CHECK-MEM16-LABEL: @memset_optimize_size_lo_33_to_x86_32_generic_28( +; CHECK-MEM16-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 3 +; CHECK-MEM16-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0 +; CHECK-MEM16-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 1 +; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP1]], i8 0, i64 32, i1 false) +; CHECK-MEM16-NEXT: store i64 0, ptr [[P1]], align 1 +; CHECK-MEM16-NEXT: ret void +; + %p0 = getelementptr inbounds i8, ptr %p, i64 3 + %p1 = getelementptr inbounds i8, ptr %p, i64 0 + call void @llvm.memset.p0.i64(ptr align 1 %p0, i8 0, i64 33, i1 false) + store i64 0, ptr %p1, align 1 + ret void +} + +define void @memset_optimize_size_lo_33_misaligned_x86_fail_generic_save_unit(ptr %p) { +; CHECK-LABEL: @memset_optimize_size_lo_33_misaligned_x86_fail_generic_save_unit( +; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 3 +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 2 [[TMP1]], i8 0, i64 29, i1 false) +; CHECK-NEXT: store i64 0, ptr [[P1]], align 1 +; CHECK-NEXT: ret void +; + %p0 = getelementptr inbounds 
i8, ptr %p, i64 3 + %p1 = getelementptr inbounds i8, ptr %p, i64 0 + call void @llvm.memset.p0.i64(ptr align 2 %p0, i8 0, i64 33, i1 false) + store i64 0, ptr %p1, align 1 + ret void +} + +define void @memset_optimize_size_lo_32_x86_misaligned_fail_generic_save_unit2(ptr %p) { +; CHECK-MEM4-LABEL: @memset_optimize_size_lo_32_x86_misaligned_fail_generic_save_unit2( +; CHECK-MEM4-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4 +; CHECK-MEM4-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0 +; CHECK-MEM4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 4 +; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 2 [[TMP1]], i8 0, i64 28, i1 false) +; CHECK-MEM4-NEXT: store i64 0, ptr [[P1]], align 1 +; CHECK-MEM4-NEXT: ret void +; +; CHECK-MEM16-LABEL: @memset_optimize_size_lo_32_x86_misaligned_fail_generic_save_unit2( +; CHECK-MEM16-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4 +; CHECK-MEM16-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0 +; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 2 [[P0]], i8 0, i64 32, i1 false) +; CHECK-MEM16-NEXT: store i64 0, ptr [[P1]], align 1 +; CHECK-MEM16-NEXT: ret void +; + %p0 = getelementptr inbounds i8, ptr %p, i64 4 + %p1 = getelementptr inbounds i8, ptr %p, i64 0 + call void @llvm.memset.p0.i64(ptr align 2 %p0, i8 0, i64 32, i1 false) + store i64 0, ptr %p1, align 1 + ret void +} + +define void @memset_optimize_size_lo_34_to_32(ptr %p) { +; CHECK-MEM4-LABEL: @memset_optimize_size_lo_34_to_32( +; CHECK-MEM4-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4 +; CHECK-MEM4-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0 +; CHECK-MEM4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 4 +; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 2 [[TMP1]], i8 0, i64 30, i1 false) +; CHECK-MEM4-NEXT: store i64 0, ptr [[P1]], align 1 +; CHECK-MEM4-NEXT: ret void +; +; CHECK-MEM16-LABEL: @memset_optimize_size_lo_34_to_32( +; CHECK-MEM16-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4 +; CHECK-MEM16-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0 +; CHECK-MEM16-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 2 +; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 2 [[TMP1]], i8 0, i64 32, i1 false) +; CHECK-MEM16-NEXT: store i64 0, ptr [[P1]], align 1 +; CHECK-MEM16-NEXT: ret void +; + %p0 = getelementptr inbounds i8, ptr %p, i64 4 + %p1 = getelementptr inbounds i8, ptr %p, i64 0 + call void @llvm.memset.p0.i64(ptr align 2 %p0, i8 0, i64 34, i1 false) + store i64 0, ptr %p1, align 1 + ret void +} + +define void @memset_optimize_size_lo_34_x86_misaligned_fail_generic_save_unit(ptr %p) { +; CHECK-LABEL: @memset_optimize_size_lo_34_x86_misaligned_fail_generic_save_unit( +; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4 +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP1]], i8 0, i64 30, i1 false) +; CHECK-NEXT: store i64 0, ptr [[P1]], align 1 +; CHECK-NEXT: ret void +; + %p0 = getelementptr inbounds i8, ptr %p, i64 4 + %p1 = getelementptr inbounds i8, ptr %p, i64 0 + call void @llvm.memset.p0.i64(ptr align 4 %p0, i8 0, i64 34, i1 false) + store i64 0, ptr %p1, align 1 + ret void +} + +define void @memset_optimize_size_lo_34_to_32_no_align_okay(ptr %p) { +; CHECK-MEM4-LABEL: 
@memset_optimize_size_lo_34_to_32_no_align_okay( +; CHECK-MEM4-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4 +; CHECK-MEM4-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0 +; CHECK-MEM4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 4 +; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP1]], i8 0, i64 30, i1 false) +; CHECK-MEM4-NEXT: store i64 0, ptr [[P1]], align 1 +; CHECK-MEM4-NEXT: ret void +; +; CHECK-MEM16-LABEL: @memset_optimize_size_lo_34_to_32_no_align_okay( +; CHECK-MEM16-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4 +; CHECK-MEM16-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0 +; CHECK-MEM16-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 2 +; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP1]], i8 0, i64 32, i1 false) +; CHECK-MEM16-NEXT: store i64 0, ptr [[P1]], align 1 +; CHECK-MEM16-NEXT: ret void +; + %p0 = getelementptr inbounds i8, ptr %p, i64 4 + %p1 = getelementptr inbounds i8, ptr %p, i64 0 + call void @llvm.memset.p0.i64(ptr align 1 %p0, i8 0, i64 34, i1 false) + store i64 0, ptr %p1, align 1 + ret void +} + +define void @memset_optimize_size_lo_33_to_31_save_unit_no_change(ptr %p) { +; CHECK-LABEL: @memset_optimize_size_lo_33_to_31_save_unit_no_change( +; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1 +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 2 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 2 [[TMP1]], i8 0, i64 31, i1 false) +; CHECK-NEXT: store i32 0, ptr [[P1]], align 1 +; CHECK-NEXT: ret void +; + %p0 = getelementptr inbounds i8, ptr %p, i64 1 + %p1 = getelementptr inbounds i8, ptr %p, i64 0 + call void @llvm.memset.p0.i64(ptr align 2 %p0, i8 0, i64 33, i1 false) + store i32 0, ptr %p1, align 1 + ret void +} + +define void @memset_optimize_size_lo_36_to_32(ptr %p) { +; CHECK-LABEL: @memset_optimize_size_lo_36_to_32( +; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1 +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP1]], i8 0, i64 32, i1 false) +; CHECK-NEXT: store i64 0, ptr [[P1]], align 1 +; CHECK-NEXT: ret void +; + %p0 = getelementptr inbounds i8, ptr %p, i64 1 + %p1 = getelementptr inbounds i8, ptr %p, i64 0 + call void @llvm.memset.p0.i64(ptr align 4 %p0, i8 0, i64 36, i1 false) + store i64 0, ptr %p1, align 1 + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreEnd.ll b/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreEnd.ll index ac8eee7088ad8..4ad84f213c08d 100644 --- a/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreEnd.ll +++ b/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreEnd.ll @@ -1,9 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=dse -S | FileCheck %s +; RUN: opt < %s -passes=dse -S | FileCheck %s --check-prefixes=CHECK,CHECK-MEM4 +; RUN: opt < %s -mtriple=x86_64-unknown-unknown -passes=dse -S | FileCheck %s --check-prefixes=CHECK,CHECK-MEM16 + target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -%struct.vec2 = type { <4 x i32>, <4 x i32> } -%struct.vec2plusi = type { <4 
x i32>, <4 x i32>, i32 } +%struct.vec2 = type { +<4 x i32>, <4 x i32> +} + +%struct.vec2plusi = type { +<4 x i32>, <4 x i32>, i32 +} @glob1 = global %struct.vec2 zeroinitializer, align 16 @glob2 = global %struct.vec2plusi zeroinitializer, align 16 @@ -61,12 +68,19 @@ entry: } define void @write28to32(ptr nocapture %p) nounwind uwtable ssp { -; CHECK-LABEL: @write28to32( -; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[P:%.*]], i8 0, i64 28, i1 false) -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 7 -; CHECK-NEXT: store i32 1, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: ret void +; CHECK-MEM4-LABEL: @write28to32( +; CHECK-MEM4-NEXT: entry: +; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[P:%.*]], i8 0, i64 28, i1 false) +; CHECK-MEM4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 7 +; CHECK-MEM4-NEXT: store i32 1, ptr [[ARRAYIDX1]], align 4 +; CHECK-MEM4-NEXT: ret void +; +; CHECK-MEM16-LABEL: @write28to32( +; CHECK-MEM16-NEXT: entry: +; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[P:%.*]], i8 0, i64 32, i1 false) +; CHECK-MEM16-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 7 +; CHECK-MEM16-NEXT: store i32 1, ptr [[ARRAYIDX1]], align 4 +; CHECK-MEM16-NEXT: ret void ; entry: call void @llvm.memset.p0.i64(ptr align 4 %p, i8 0, i64 32, i1 false) @@ -76,12 +90,19 @@ entry: } define void @write28to32_atomic(ptr nocapture %p) nounwind uwtable ssp { -; CHECK-LABEL: @write28to32_atomic( -; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 [[P:%.*]], i8 0, i64 28, i32 4) -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 7 -; CHECK-NEXT: store atomic i32 1, ptr [[ARRAYIDX1]] unordered, align 4 -; CHECK-NEXT: ret void +; CHECK-MEM4-LABEL: @write28to32_atomic( +; CHECK-MEM4-NEXT: entry: +; CHECK-MEM4-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 [[P:%.*]], i8 0, i64 28, i32 4) +; CHECK-MEM4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 7 +; CHECK-MEM4-NEXT: store atomic i32 1, ptr [[ARRAYIDX1]] unordered, align 4 +; CHECK-MEM4-NEXT: ret void +; +; CHECK-MEM16-LABEL: @write28to32_atomic( +; CHECK-MEM16-NEXT: entry: +; CHECK-MEM16-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 [[P:%.*]], i8 0, i64 32, i32 4) +; CHECK-MEM16-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 7 +; CHECK-MEM16-NEXT: store atomic i32 1, ptr [[ARRAYIDX1]] unordered, align 4 +; CHECK-MEM16-NEXT: ret void ; entry: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 %p, i8 0, i64 32, i32 4) @@ -91,12 +112,19 @@ entry: } define void @dontwrite28to32memset(ptr nocapture %p) nounwind uwtable ssp { -; CHECK-LABEL: @dontwrite28to32memset( -; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[P:%.*]], i8 0, i64 32, i1 false) -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 7 -; CHECK-NEXT: store i32 1, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: ret void +; CHECK-MEM4-LABEL: @dontwrite28to32memset( +; CHECK-MEM4-NEXT: entry: +; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[P:%.*]], i8 0, i64 28, i1 false) +; CHECK-MEM4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 7 +; CHECK-MEM4-NEXT: store i32 1, ptr [[ARRAYIDX1]], align 4 +; CHECK-MEM4-NEXT: ret void +; +; CHECK-MEM16-LABEL: @dontwrite28to32memset( +; CHECK-MEM16-NEXT: 
entry: +; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[P:%.*]], i8 0, i64 32, i1 false) +; CHECK-MEM16-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 7 +; CHECK-MEM16-NEXT: store i32 1, ptr [[ARRAYIDX1]], align 4 +; CHECK-MEM16-NEXT: ret void ; entry: call void @llvm.memset.p0.i64(ptr align 16 %p, i8 0, i64 32, i1 false) @@ -106,12 +134,19 @@ entry: } define void @dontwrite28to32memset_atomic(ptr nocapture %p) nounwind uwtable ssp { -; CHECK-LABEL: @dontwrite28to32memset_atomic( -; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 16 [[P:%.*]], i8 0, i64 32, i32 4) -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 7 -; CHECK-NEXT: store atomic i32 1, ptr [[ARRAYIDX1]] unordered, align 4 -; CHECK-NEXT: ret void +; CHECK-MEM4-LABEL: @dontwrite28to32memset_atomic( +; CHECK-MEM4-NEXT: entry: +; CHECK-MEM4-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 16 [[P:%.*]], i8 0, i64 28, i32 4) +; CHECK-MEM4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 7 +; CHECK-MEM4-NEXT: store atomic i32 1, ptr [[ARRAYIDX1]] unordered, align 4 +; CHECK-MEM4-NEXT: ret void +; +; CHECK-MEM16-LABEL: @dontwrite28to32memset_atomic( +; CHECK-MEM16-NEXT: entry: +; CHECK-MEM16-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 16 [[P:%.*]], i8 0, i64 32, i32 4) +; CHECK-MEM16-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 7 +; CHECK-MEM16-NEXT: store atomic i32 1, ptr [[ARRAYIDX1]] unordered, align 4 +; CHECK-MEM16-NEXT: ret void ; entry: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 16 %p, i8 0, i64 32, i32 4) @@ -197,12 +232,19 @@ entry: } define void @dontwrite28to32memcpy(ptr nocapture %p) nounwind uwtable ssp { -; CHECK-LABEL: @dontwrite28to32memcpy( -; CHECK-NEXT: entry: -; CHECK-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[P:%.*]], ptr align 16 @glob1, i64 32, i1 false) -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_VEC2:%.*]], ptr [[P]], i64 0, i32 0, i64 7 -; CHECK-NEXT: store i32 1, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: ret void +; CHECK-MEM4-LABEL: @dontwrite28to32memcpy( +; CHECK-MEM4-NEXT: entry: +; CHECK-MEM4-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[P:%.*]], ptr align 16 @glob1, i64 28, i1 false) +; CHECK-MEM4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_VEC2:%.*]], ptr [[P]], i64 0, i32 0, i64 7 +; CHECK-MEM4-NEXT: store i32 1, ptr [[ARRAYIDX1]], align 4 +; CHECK-MEM4-NEXT: ret void +; +; CHECK-MEM16-LABEL: @dontwrite28to32memcpy( +; CHECK-MEM16-NEXT: entry: +; CHECK-MEM16-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[P:%.*]], ptr align 16 @glob1, i64 32, i1 false) +; CHECK-MEM16-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_VEC2:%.*]], ptr [[P]], i64 0, i32 0, i64 7 +; CHECK-MEM16-NEXT: store i32 1, ptr [[ARRAYIDX1]], align 4 +; CHECK-MEM16-NEXT: ret void ; entry: tail call void @llvm.memcpy.p0.p0.i64(ptr align 16 %p, ptr align 16 @glob1, i64 32, i1 false) @@ -212,12 +254,19 @@ entry: } define void @dontwrite28to32memcpy_atomic(ptr nocapture %p) nounwind uwtable ssp { -; CHECK-LABEL: @dontwrite28to32memcpy_atomic( -; CHECK-NEXT: entry: -; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0.p0.i64(ptr align 16 [[P:%.*]], ptr align 16 @glob1, i64 32, i32 4) -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_VEC2:%.*]], ptr [[P]], i64 0, i32 0, i64 7 -; CHECK-NEXT: 
store atomic i32 1, ptr [[ARRAYIDX1]] unordered, align 4 -; CHECK-NEXT: ret void +; CHECK-MEM4-LABEL: @dontwrite28to32memcpy_atomic( +; CHECK-MEM4-NEXT: entry: +; CHECK-MEM4-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0.p0.i64(ptr align 16 [[P:%.*]], ptr align 16 @glob1, i64 28, i32 4) +; CHECK-MEM4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_VEC2:%.*]], ptr [[P]], i64 0, i32 0, i64 7 +; CHECK-MEM4-NEXT: store atomic i32 1, ptr [[ARRAYIDX1]] unordered, align 4 +; CHECK-MEM4-NEXT: ret void +; +; CHECK-MEM16-LABEL: @dontwrite28to32memcpy_atomic( +; CHECK-MEM16-NEXT: entry: +; CHECK-MEM16-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0.p0.i64(ptr align 16 [[P:%.*]], ptr align 16 @glob1, i64 32, i32 4) +; CHECK-MEM16-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_VEC2:%.*]], ptr [[P]], i64 0, i32 0, i64 7 +; CHECK-MEM16-NEXT: store atomic i32 1, ptr [[ARRAYIDX1]] unordered, align 4 +; CHECK-MEM16-NEXT: ret void ; entry: tail call void @llvm.memcpy.element.unordered.atomic.p0.p0.i64(ptr align 16 %p, ptr align 16 @glob1, i64 32, i32 4) @@ -231,7 +280,9 @@ declare void @llvm.memcpy.element.unordered.atomic.p0.p0.i64(ptr nocapture, ptr declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i1) nounwind declare void @llvm.memset.element.unordered.atomic.p0.i64(ptr nocapture, i8, i64, i32) nounwind -%struct.trapframe = type { i64, i64, i64 } +%struct.trapframe = type { +i64, i64, i64 +} ; bugzilla 11455 - make sure negative GEP's don't break this optimisation define void @cpu_lwp_fork(ptr %md_regs, i64 %pcb_rsp0) nounwind uwtable noinline ssp { @@ -259,8 +310,8 @@ define void @write16To23AndThen24To31(ptr nocapture %P, i64 %n64, i32 %n32, i16 ; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr align 8 [[P:%.*]], i8 0, i64 16, i1 false) ; CHECK-NEXT: [[BASE64_2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 2 ; CHECK-NEXT: [[BASE64_3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 3 -; CHECK-NEXT: store i64 3, ptr [[BASE64_2]] -; CHECK-NEXT: store i64 3, ptr [[BASE64_3]] +; CHECK-NEXT: store i64 3, ptr [[BASE64_2]], align 8 +; CHECK-NEXT: store i64 3, ptr [[BASE64_3]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -342,13 +393,21 @@ entry: } define void @ow_end_align1(ptr nocapture %p) { -; CHECK-LABEL: @ow_end_align1( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1 -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[P1]], i8 0, i64 27, i1 false) -; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 27 -; CHECK-NEXT: store i64 1, ptr [[P2]], align 1 -; CHECK-NEXT: ret void +; CHECK-MEM4-LABEL: @ow_end_align1( +; CHECK-MEM4-NEXT: entry: +; CHECK-MEM4-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1 +; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[P1]], i8 0, i64 28, i1 false) +; CHECK-MEM4-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 27 +; CHECK-MEM4-NEXT: store i64 1, ptr [[P2]], align 1 +; CHECK-MEM4-NEXT: ret void +; +; CHECK-MEM16-LABEL: @ow_end_align1( +; CHECK-MEM16-NEXT: entry: +; CHECK-MEM16-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1 +; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[P1]], i8 0, i64 32, i1 false) +; CHECK-MEM16-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 27 +; CHECK-MEM16-NEXT: store i64 1, ptr [[P2]], align 1 +; CHECK-MEM16-NEXT: ret void ; entry: %p1 = getelementptr inbounds i8, ptr %p, i64 1 @@ -359,13 +418,21 @@ entry: } define void 
@ow_end_align4(ptr nocapture %p) { -; CHECK-LABEL: @ow_end_align4( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1 -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[P1]], i8 0, i64 28, i1 false) -; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 27 -; CHECK-NEXT: store i64 1, ptr [[P2]], align 1 -; CHECK-NEXT: ret void +; CHECK-MEM4-LABEL: @ow_end_align4( +; CHECK-MEM4-NEXT: entry: +; CHECK-MEM4-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1 +; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[P1]], i8 0, i64 28, i1 false) +; CHECK-MEM4-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 27 +; CHECK-MEM4-NEXT: store i64 1, ptr [[P2]], align 1 +; CHECK-MEM4-NEXT: ret void +; +; CHECK-MEM16-LABEL: @ow_end_align4( +; CHECK-MEM16-NEXT: entry: +; CHECK-MEM16-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1 +; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[P1]], i8 0, i64 32, i1 false) +; CHECK-MEM16-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 27 +; CHECK-MEM16-NEXT: store i64 1, ptr [[P2]], align 1 +; CHECK-MEM16-NEXT: ret void ; entry: %p1 = getelementptr inbounds i8, ptr %p, i64 1 @@ -376,13 +443,21 @@ entry: } define void @ow_end_align8(ptr nocapture %p) { -; CHECK-LABEL: @ow_end_align8( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1 -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[P1]], i8 0, i64 32, i1 false) -; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 27 -; CHECK-NEXT: store i64 1, ptr [[P2]], align 1 -; CHECK-NEXT: ret void +; CHECK-MEM4-LABEL: @ow_end_align8( +; CHECK-MEM4-NEXT: entry: +; CHECK-MEM4-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1 +; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[P1]], i8 0, i64 28, i1 false) +; CHECK-MEM4-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 27 +; CHECK-MEM4-NEXT: store i64 1, ptr [[P2]], align 1 +; CHECK-MEM4-NEXT: ret void +; +; CHECK-MEM16-LABEL: @ow_end_align8( +; CHECK-MEM16-NEXT: entry: +; CHECK-MEM16-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1 +; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[P1]], i8 0, i64 32, i1 false) +; CHECK-MEM16-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 27 +; CHECK-MEM16-NEXT: store i64 1, ptr [[P2]], align 1 +; CHECK-MEM16-NEXT: ret void ; entry: %p1 = getelementptr inbounds i8, ptr %p, i64 1 @@ -392,3 +467,98 @@ entry: ret void } +define void @memset_optimize_size_hi_31_to_24(ptr %p) { +; CHECK-LABEL: @memset_optimize_size_hi_31_to_24( +; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 0 +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 23 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[P0]], i8 0, i64 24, i1 false) +; CHECK-NEXT: store i64 0, ptr [[P1]], align 1 +; CHECK-NEXT: ret void +; + %p0 = getelementptr inbounds i8, ptr %p, i64 0 + %p1 = getelementptr inbounds i8, ptr %p, i64 23 + call void @llvm.memset.p0.i64(ptr align 1 %p0, i8 0, i64 31, i1 false) + store i64 0, ptr %p1, align 1 + ret void +} + +define void @memset_optimize_size_hi_32_no_change_x86_change_generic(ptr %p) { +; CHECK-MEM4-LABEL: @memset_optimize_size_hi_32_no_change_x86_change_generic( +; CHECK-MEM4-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 0 +; CHECK-MEM4-NEXT: [[P1:%.*]] = getelementptr inbounds i8, 
ptr [[P]], i64 28 +; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[P0]], i8 0, i64 28, i1 false) +; CHECK-MEM4-NEXT: store i64 0, ptr [[P1]], align 1 +; CHECK-MEM4-NEXT: ret void +; +; CHECK-MEM16-LABEL: @memset_optimize_size_hi_32_no_change_x86_change_generic( +; CHECK-MEM16-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 0 +; CHECK-MEM16-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 28 +; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[P0]], i8 0, i64 32, i1 false) +; CHECK-MEM16-NEXT: store i64 0, ptr [[P1]], align 1 +; CHECK-MEM16-NEXT: ret void +; + %p0 = getelementptr inbounds i8, ptr %p, i64 0 + %p1 = getelementptr inbounds i8, ptr %p, i64 28 + call void @llvm.memset.p0.i64(ptr align 1 %p0, i8 0, i64 32, i1 false) + store i64 0, ptr %p1, align 1 + ret void +} + +define void @memset_optimize_size_hi_28_to_24(ptr %p) { +; CHECK-MEM4-LABEL: @memset_optimize_size_hi_28_to_24( +; CHECK-MEM4-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 0 +; CHECK-MEM4-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 21 +; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[P0]], i8 0, i64 21, i1 false) +; CHECK-MEM4-NEXT: store i64 0, ptr [[P1]], align 1 +; CHECK-MEM4-NEXT: ret void +; +; CHECK-MEM16-LABEL: @memset_optimize_size_hi_28_to_24( +; CHECK-MEM16-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 0 +; CHECK-MEM16-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 21 +; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[P0]], i8 0, i64 24, i1 false) +; CHECK-MEM16-NEXT: store i64 0, ptr [[P1]], align 1 +; CHECK-MEM16-NEXT: ret void +; + %p0 = getelementptr inbounds i8, ptr %p, i64 0 + %p1 = getelementptr inbounds i8, ptr %p, i64 21 + call void @llvm.memset.p0.i64(ptr align 8 %p0, i8 0, i64 28, i1 false) + store i64 0, ptr %p1, align 1 + ret void +} + +define void @memset_optimize_size_hi_31_to_28(ptr %p) { +; CHECK-LABEL: @memset_optimize_size_hi_31_to_28( +; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 0 +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 27 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 2 [[P0]], i8 0, i64 28, i1 false) +; CHECK-NEXT: store i64 0, ptr [[P1]], align 1 +; CHECK-NEXT: ret void +; + %p0 = getelementptr inbounds i8, ptr %p, i64 0 + %p1 = getelementptr inbounds i8, ptr %p, i64 27 + call void @llvm.memset.p0.i64(ptr align 2 %p0, i8 0, i64 31, i1 false) + store i64 0, ptr %p1, align 1 + ret void +} + +define void @memset_optimize_size_hi_33_to_x86_32_generic_28(ptr %p) { +; CHECK-MEM4-LABEL: @memset_optimize_size_hi_33_to_x86_32_generic_28( +; CHECK-MEM4-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 0 +; CHECK-MEM4-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 27 +; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[P0]], i8 0, i64 28, i1 false) +; CHECK-MEM4-NEXT: store i64 0, ptr [[P1]], align 1 +; CHECK-MEM4-NEXT: ret void +; +; CHECK-MEM16-LABEL: @memset_optimize_size_hi_33_to_x86_32_generic_28( +; CHECK-MEM16-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 0 +; CHECK-MEM16-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 27 +; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[P0]], i8 0, i64 32, i1 false) +; CHECK-MEM16-NEXT: store i64 0, ptr [[P1]], align 1 +; CHECK-MEM16-NEXT: ret void +; + %p0 = getelementptr inbounds i8, ptr %p, i64 0 + %p1 = getelementptr inbounds i8, ptr %p, i64 27 + call 
void @llvm.memset.p0.i64(ptr align 4 %p0, i8 0, i64 33, i1 false) + store i64 0, ptr %p1, align 1 + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/stores-of-existing-values.ll b/llvm/test/Transforms/DeadStoreElimination/stores-of-existing-values.ll index c9a0943de8cd9..2d04179eeb6e0 100644 --- a/llvm/test/Transforms/DeadStoreElimination/stores-of-existing-values.ll +++ b/llvm/test/Transforms/DeadStoreElimination/stores-of-existing-values.ll @@ -549,8 +549,8 @@ define void @test12_memset_later_store_exceeds_memset(ptr %ptr) { define void @test12_memset_later_store_before_memset(ptr %ptr) { ; CHECK-LABEL: @test12_memset_later_store_before_memset( ; CHECK-NEXT: [[PTR_1:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[PTR_1]], i64 7 -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP1]], i8 0, i64 3, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[PTR_1]], i64 6 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP1]], i8 0, i64 4, i1 false) ; CHECK-NEXT: store i64 0, ptr [[PTR]], align 8 ; CHECK-NEXT: ret void ;
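
As an aside, here is a minimal standalone sketch of the size-coercion logic added to tryToShorten() above, with the alignment fixup and the final give-up heuristic omitted, and with MemUnit and the inline threshold passed as plain parameters instead of being queried from TTI. The helper name coerceSize and the main() driver are hypothetical, for illustration only:

```cpp
#include <algorithm>
#include <bit>
#include <cstdint>
#include <cstdio>

// Pick a length in [NewSize, DeadSize] with minimal popcount (fewest
// power-of-2 load/store chunks), preferring the smallest such value.
// MemUnit is the assumed load/store register width in bytes; InlineThresh
// is the largest size the target still expands inline.
static uint64_t coerceSize(uint64_t NewSize, uint64_t DeadSize,
                           uint64_t MemUnit, uint64_t InlineThresh) {
  uint64_t Lower = NewSize;
  uint64_t RoundLower = MemUnit * (Lower / MemUnit);
  // Already a whole number of MemUnit chunks (or degenerate): nothing to do.
  if (RoundLower == Lower || Lower == 0 || RoundLower + MemUnit == 0)
    return NewSize;
  // Never grow past the next MemUnit boundary or the original size...
  uint64_t Upper = std::min(DeadSize, RoundLower + MemUnit - 1);
  // ...and never grow a size that was inline-expandable past the threshold.
  if (Upper > InlineThresh && Lower <= InlineThresh)
    Upper = InlineThresh;
  // Highest bit free to change while staying in [Lower, Upper]; bits above
  // it are preserved, bits below it are rounded up to one power-of-2 chunk.
  uint64_t HighestBit = 63 - std::countl_zero((Lower - 1) ^ Upper);
  uint64_t HighestP2 = uint64_t(1) << HighestBit;
  uint64_t LoVal = Lower & (HighestP2 - 1);
  if (LoVal)
    LoVal = std::bit_ceil(LoVal);
  Lower = (Lower & ~(HighestP2 - 1)) | LoVal;
  // If the tail still needs more than one chunk, round up to a full MemUnit.
  if (std::popcount(Lower % MemUnit) > 1 && DeadSize >= RoundLower + MemUnit)
    Lower = RoundLower + MemUnit;
  // The caller skips the transform when the result equals DeadSize.
  return Lower;
}

int main() {
  // The worked example from the patch comment: DeadSize = 15, MemUnit = 4.
  for (uint64_t NewSize : {7, 9, 11})
    std::printf("%llu -> %llu\n", (unsigned long long)NewSize,
                (unsigned long long)coerceSize(NewSize, 15, 4, 1 << 20));
}
```

Compiled as C++20, this prints 7 -> 8, 9 -> 9 and 11 -> 12, matching the worked example in the comment in tryToShorten().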