 #include "llvm/Analysis/MustExecute.h"
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Argument.h"
 #include "llvm/IR/BasicBlock.h"
@@ -558,9 +559,10 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest,
   for_each(LinkedDVRAssigns, InsertAssignForOverlap);
 }
 
-static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
-                         uint64_t &DeadSize, int64_t KillingStart,
-                         uint64_t KillingSize, bool IsOverwriteEnd) {
+static bool tryToShorten(Instruction *DeadI, int64_t DeadStart,
+                         uint64_t DeadSize, int64_t KillingStart,
+                         uint64_t KillingSize, bool IsOverwriteEnd,
+                         const TargetTransformInfo &TTI) {
   auto *DeadIntrinsic = cast<AnyMemIntrinsic>(DeadI);
   Align PrefAlign = DeadIntrinsic->getDestAlign().valueOrOne();
 
@@ -583,11 +585,7 @@ static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
   // Compute start and size of the region to remove. Make sure 'PrefAlign' is
   // maintained on the remaining store.
   if (IsOverwriteEnd) {
-    // Calculate required adjustment for 'KillingStart' in order to keep
-    // remaining store size aligned on 'PerfAlign'.
-    uint64_t Off =
-        offsetToAlignment(uint64_t(KillingStart - DeadStart), PrefAlign);
-    ToRemoveStart = KillingStart + Off;
+    ToRemoveStart = KillingStart;
     if (DeadSize <= uint64_t(ToRemoveStart - DeadStart))
       return false;
     ToRemoveSize = DeadSize - uint64_t(ToRemoveStart - DeadStart);
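As an illustration of the behavior the deleted lines implemented (a standalone sketch with assumed values, not code from this patch): the old path rounded the split point up so the kept prefix stayed a multiple of PrefAlign, while the new path removes exactly the overwritten tail and leaves the choice of a profitable remaining size to the coercion logic added in the next hunk.

// Editorial sketch, not part of the patch. Hypothetical values:
// DeadStart = 0, KillingStart = 10, PrefAlign = 8.
// offsetToAlignment(V, A) == (A - V % A) % A.
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t DeadStart = 0;     // start of the dead (shortened) store
  const int64_t KillingStart = 10; // start of the overwriting store
  const uint64_t PrefAlign = 8;    // preferred destination alignment

  uint64_t Overlap = uint64_t(KillingStart - DeadStart);       // 10
  uint64_t Off = (PrefAlign - Overlap % PrefAlign) % PrefAlign; // 6
  // Removed behavior: round the split point up so the kept prefix stays a
  // multiple of PrefAlign (keep 16 bytes instead of 10).
  int64_t OldToRemoveStart = KillingStart + int64_t(Off); // 16
  // New behavior: remove exactly the overwritten tail.
  int64_t NewToRemoveStart = KillingStart;                // 10
  std::printf("old=%lld new=%lld\n", (long long)OldToRemoveStart,
              (long long)NewToRemoveStart);
  return 0;
}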
@@ -612,6 +610,108 @@ static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
   assert(DeadSize > ToRemoveSize && "Can't remove more than original size");
 
   uint64_t NewSize = DeadSize - ToRemoveSize;
+
+  // Try to coerce the new memcpy/memset size to a "fast" value. This typically
+  // means some exact multiple of the register width of the loads/stores.
+
+  // If scalar size >= vec size, assume target will use scalars for implementing
+  // memset/memcpy.
+  TypeSize ScalarSize =
+      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_Scalar);
+  TypeSize VecSize =
+      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
+  uint64_t MemUnit = 0;
+  if (ScalarSize >= VecSize)
+    MemUnit = ScalarSize.getFixedValue();
+  // Otherwise assume memset/memcpy will be lowered with vector registers.
+  else
+    MemUnit =
+        TTI.getLoadStoreVecRegBitWidth(DeadIntrinsic->getDestAddressSpace());
+
+  MemUnit /= 8U;
+
+  // Assume loads/stores are issued by power of 2 regions. Try to minimize the
+  // number of power of 2 blocks.
+  // i.e. if we have DeadSize = 15
+  // NewSize = 7  -> 8  (4 + 2 + 1) -> (8)
+  // NewSize = 9  -> 9  (8 + 1)     == (8 + 1)
+  // NewSize = 11 -> 12 (8 + 2 + 1) -> (8 + 4)
+  uint64_t Upper = DeadSize;
+  uint64_t Lower = NewSize;
+
+  uint64_t RoundLower = MemUnit * (Lower / MemUnit);
+
+  // We have some trailing loads/stores we can try to optimize.
+  if (RoundLower != Lower && Lower != 0 && (RoundLower + MemUnit) != 0) {
+    Upper = std::min(Upper, RoundLower + MemUnit - 1);
+    // Don't break inline expansion of the intrinsic by doing this.
+    uint64_t InlineThresh = TTI.getMaxMemIntrinsicInlineSizeThreshold();
+    if (Upper > InlineThresh && Lower <= InlineThresh)
+      Upper = InlineThresh;
+
+    // Replace Lower with the value in range [Lower, Upper] that has the
+    // minimum popcount (selecting the minimum value as a tiebreaker when the
+    // popcount is the same). The idea is that this requires the minimum number
+    // of loads/stores and, within that, uses the presumably preferable minimum
+    // width.
+
+    // Get the highest bit that differs between Lower and Upper. Anything above
+    // this bit must be in the new value. Anything below it that's larger than
+    // Lower is fair game.
+    uint64_t Dif = (Lower - 1) ^ Upper;
+    uint64_t HighestBit = 63 - llvm::countl_zero(Dif);
+
+    // Make Lo/Hi masks from the highest differing bit. The Lo mask is used to
+    // find the value we can round up to the minimum power-of-2 chunk; the Hi
+    // mask is preserved.
+    uint64_t HighestP2 = static_cast<uint64_t>(1) << HighestBit;
+    uint64_t LoMask = HighestP2 - 1;
+    uint64_t HiMask = -HighestP2;
+
+    // Minimum power of 2 for the "tail".
+    uint64_t LoVal = Lower & LoMask;
+    if (LoVal)
+      LoVal = llvm::bit_ceil(LoVal);
+    // Preserve the high bits to stay in range.
+    uint64_t HiVal = Lower & HiMask;
+    Lower = LoVal | HiVal;
+
+    // If we have more than one tail store, see if we can just round up to the
+    // next MemUnit.
+    if (llvm::popcount(Lower % MemUnit) > 1 &&
+        DeadSize >= (RoundLower + MemUnit))
+      Lower = RoundLower + MemUnit;
+
+    uint64_t OptimizedNewSize = NewSize;
+    // If we are overwriting the beginning, make sure we don't mess up the
+    // alignment.
+    if (IsOverwriteEnd || isAligned(PrefAlign, DeadSize - Lower)) {
+      OptimizedNewSize = Lower;
+    } else {
+      // Our minimal value isn't properly aligned; see if we can increase the
+      // size of the tail loads/stores.
+      Lower = HiVal | HighestP2;
+      if (isAligned(PrefAlign, DeadSize - Lower))
+        OptimizedNewSize = Lower;
+      // If we can't adjust size without messing up alignment, see if the new
+      // size is actually preferable.
+      // TODO: We should probably do better here than just giving up.
+      else if ((NewSize <= InlineThresh) == (DeadSize <= InlineThresh) &&
+               llvm::popcount(NewSize) > llvm::popcount(DeadSize) &&
+               DeadSize / MemUnit == NewSize / MemUnit)
+        return false;
+    }
+
+    // Adjust the new starting point for the memset/memcpy.
+    if (OptimizedNewSize != NewSize) {
+      if (!IsOverwriteEnd)
+        ToRemoveSize = DeadSize - OptimizedNewSize;
+      NewSize = OptimizedNewSize;
+    }
+
+    // Our optimal length is the original length; skip the transform.
+    if (NewSize == DeadSize)
+      return false;
+  }
+
 
   if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(DeadI)) {
     // When shortening an atomic memory intrinsic, the newly shortened
     // length must remain an integer multiple of the element size.
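To make the size-coercion hunk above easier to follow, the following standalone sketch (not code from this patch) reproduces just the popcount-minimization step with MemUnit assumed to be 8; the inline-size-threshold clamp and the PrefAlign fallback are left out, and coerceNewSize is a hypothetical helper name. The asserts check the DeadSize = 15 examples from the comment block.

// Editorial sketch: the popcount-minimization step in isolation.
// Assumptions: MemUnit == 8, no inline-threshold clamp, no alignment fallback.
#include <algorithm>
#include <bit>
#include <cassert>
#include <cstdint>

static uint64_t coerceNewSize(uint64_t DeadSize, uint64_t NewSize,
                              uint64_t MemUnit) {
  uint64_t Upper = DeadSize, Lower = NewSize;
  uint64_t RoundLower = MemUnit * (Lower / MemUnit);
  if (RoundLower == Lower || Lower == 0 || RoundLower + MemUnit == 0)
    return NewSize; // no trailing partial unit to optimize
  // Never grow into the next MemUnit block.
  Upper = std::min(Upper, RoundLower + MemUnit - 1);
  // Highest bit that may differ between Lower and the chosen value.
  uint64_t Dif = (Lower - 1) ^ Upper;
  uint64_t HighestBit = 63 - std::countl_zero(Dif);
  uint64_t HighestP2 = uint64_t(1) << HighestBit;
  uint64_t LoVal = Lower & (HighestP2 - 1);
  if (LoVal)
    LoVal = std::bit_ceil(LoVal); // round the tail up to one power-of-2 chunk
  uint64_t Result = (Lower & -HighestP2) | LoVal; // keep the high bits
  // With more than one tail store left, prefer one whole extra MemUnit.
  if (std::popcount(Result % MemUnit) > 1 && DeadSize >= RoundLower + MemUnit)
    Result = RoundLower + MemUnit;
  return Result;
}

int main() {
  // The DeadSize = 15 examples from the comment block above.
  assert(coerceNewSize(15, 7, 8) == 8);   // 4 + 2 + 1 becomes a single 8
  assert(coerceNewSize(15, 9, 8) == 9);   // 8 + 1 is already minimal
  assert(coerceNewSize(15, 11, 8) == 12); // 8 + 2 + 1 becomes 8 + 4
  return 0;
}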
@@ -654,7 +754,8 @@ static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
 }
 
 static bool tryToShortenEnd(Instruction *DeadI, OverlapIntervalsTy &IntervalMap,
-                            int64_t &DeadStart, uint64_t &DeadSize) {
+                            int64_t &DeadStart, uint64_t &DeadSize,
+                            const TargetTransformInfo &TTI) {
   if (IntervalMap.empty() || !isShortenableAtTheEnd(DeadI))
     return false;
 
@@ -672,7 +773,7 @@ static bool tryToShortenEnd(Instruction *DeadI, OverlapIntervalsTy &IntervalMap,
       // be non negative due to preceding checks.
       KillingSize >= DeadSize - (uint64_t)(KillingStart - DeadStart)) {
     if (tryToShorten(DeadI, DeadStart, DeadSize, KillingStart, KillingSize,
-                     true)) {
+                     true, TTI)) {
       IntervalMap.erase(OII);
       return true;
     }
@@ -682,7 +783,8 @@ static bool tryToShortenEnd(Instruction *DeadI, OverlapIntervalsTy &IntervalMap,
 
 static bool tryToShortenBegin(Instruction *DeadI,
                               OverlapIntervalsTy &IntervalMap,
-                              int64_t &DeadStart, uint64_t &DeadSize) {
+                              int64_t &DeadStart, uint64_t &DeadSize,
+                              const TargetTransformInfo &TTI) {
   if (IntervalMap.empty() || !isShortenableAtTheBeginning(DeadI))
     return false;
 
@@ -701,7 +803,7 @@ static bool tryToShortenBegin(Instruction *DeadI,
     assert(KillingSize - (uint64_t)(DeadStart - KillingStart) < DeadSize &&
            "Should have been handled as OW_Complete");
     if (tryToShorten(DeadI, DeadStart, DeadSize, KillingStart, KillingSize,
-                     false)) {
+                     false, TTI)) {
       IntervalMap.erase(OII);
       return true;
     }
@@ -852,6 +954,7 @@ struct DSEState {
   DominatorTree &DT;
   PostDominatorTree &PDT;
   const TargetLibraryInfo &TLI;
+  const TargetTransformInfo &TTI;
   const DataLayout &DL;
   const LoopInfo &LI;
 
@@ -896,9 +999,9 @@ struct DSEState {
 
   DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
            PostDominatorTree &PDT, const TargetLibraryInfo &TLI,
-           const LoopInfo &LI)
+           const TargetTransformInfo &TTI, const LoopInfo &LI)
       : F(F), AA(AA), EI(DT, &LI), BatchAA(AA, &EI), MSSA(MSSA), DT(DT),
-        PDT(PDT), TLI(TLI), DL(F.getDataLayout()), LI(LI) {
+        PDT(PDT), TLI(TLI), TTI(TTI), DL(F.getDataLayout()), LI(LI) {
     // Collect blocks with throwing instructions not modeled in MemorySSA and
     // alloc-like objects.
     unsigned PO = 0;
@@ -2103,10 +2206,10 @@ struct DSEState {
       uint64_t DeadSize = Loc.Size.getValue();
       GetPointerBaseWithConstantOffset(Ptr, DeadStart, DL);
       OverlapIntervalsTy &IntervalMap = OI.second;
-      Changed |= tryToShortenEnd(DeadI, IntervalMap, DeadStart, DeadSize);
+      Changed |= tryToShortenEnd(DeadI, IntervalMap, DeadStart, DeadSize, TTI);
       if (IntervalMap.empty())
         continue;
-      Changed |= tryToShortenBegin(DeadI, IntervalMap, DeadStart, DeadSize);
+      Changed |= tryToShortenBegin(DeadI, IntervalMap, DeadStart, DeadSize, TTI);
     }
     return Changed;
   }
@@ -2347,9 +2450,10 @@ bool DSEState::eliminateDeadDefs(const MemoryDefWrapper &KillingDefWrapper) {
 static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
                                 DominatorTree &DT, PostDominatorTree &PDT,
                                 const TargetLibraryInfo &TLI,
+                                const TargetTransformInfo &TTI,
                                 const LoopInfo &LI) {
   bool MadeChange = false;
-  DSEState State(F, AA, MSSA, DT, PDT, TLI, LI);
+  DSEState State(F, AA, MSSA, DT, PDT, TLI, TTI, LI);
   // For each store:
   for (unsigned I = 0; I < State.MemDefs.size(); I++) {
     MemoryDef *KillingDef = State.MemDefs[I];
@@ -2383,12 +2487,13 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
 PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
   AliasAnalysis &AA = AM.getResult<AAManager>(F);
   const TargetLibraryInfo &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+  const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
   DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
   MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
   PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
   LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
 
-  bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI);
+  bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, TTI, LI);
 
 #ifdef LLVM_ENABLE_STATS
   if (AreStatisticsEnabled())