@@ -140,6 +140,7 @@ cl::opt<bool> MemProfRequireDefinitionForPromotion(
 } // namespace llvm
 
 extern cl::opt<bool> MemProfReportHintedSizes;
+extern cl::opt<unsigned> MinClonedColdBytePercent;
 
 namespace {
 /// CRTP base for graphs built from either IR or ThinLTO summary index.
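Note: the hunk above only adds an extern declaration; the cl::opt itself is defined elsewhere in the MemProf support code and is not part of this diff. As a minimal sketch of what such a definition typically looks like (the flag string and description below are assumptions, not taken from the patch; a default of 100 keeps the heuristic disabled, matching the >= 100 / < 100 checks later in the diff):

// Illustrative only -- requires llvm/Support/CommandLine.h. The flag name and
// description are assumptions; only the "100 disables it" default is implied
// by the checks in this patch.
cl::opt<unsigned> MinClonedColdBytePercent(
    "memprof-cloning-cold-threshold", cl::init(100), cl::Hidden,
    cl::desc("Min percent of cold bytes at an allocation required to hint it "
             "cold during cloning when its contexts are mixed"));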
@@ -617,6 +618,11 @@ class CallsiteContextGraph {
     static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
   }
 
+  /// Get the AllocationType assigned to the given allocation instruction clone.
+  AllocationType getAllocationCallType(const CallInfo &Call) const {
+    return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
+  }
+
   /// Update non-allocation call to invoke (possibly cloned) function
   /// CalleeFunc.
   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
@@ -711,7 +717,8 @@ class CallsiteContextGraph {
 
   /// Map from each contextID to the profiled full contexts and their total
   /// sizes (there may be more than one due to context trimming),
-  /// optionally populated when requested (via MemProfReportHintedSizes).
+  /// optionally populated when requested (via MemProfReportHintedSizes or
+  /// MinClonedColdBytePercent).
   DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
 
   /// Identifies the context node created for a stack id when adding the MIB
@@ -773,6 +780,7 @@ class ModuleCallsiteContextGraph
   uint64_t getLastStackId(Instruction *Call);
   std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
   void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
+  AllocationType getAllocationCallType(const CallInfo &Call) const;
   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
   CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                        Instruction *>::FuncInfo
@@ -852,6 +860,7 @@ class IndexCallsiteContextGraph
   uint64_t getLastStackId(IndexCall &Call);
   std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
   void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
+  AllocationType getAllocationCallType(const CallInfo &Call) const;
   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
   CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                        IndexCall>::FuncInfo
@@ -1201,8 +1210,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
 
   ContextIdToAllocationType[++LastContextId] = AllocType;
 
-  if (MemProfReportHintedSizes) {
-    assert(!ContextSizeInfo.empty());
+  if (!ContextSizeInfo.empty()) {
     auto &Entry = ContextIdToContextSizeInfos[LastContextId];
     Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
   }
@@ -2043,14 +2051,15 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
       CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
           EmptyContext;
       unsigned I = 0;
-      assert(!MemProfReportHintedSizes ||
-             AN.ContextSizeInfos.size() == AN.MIBs.size());
+      assert(
+          (!MemProfReportHintedSizes && MinClonedColdBytePercent >= 100) ||
+          AN.ContextSizeInfos.size() == AN.MIBs.size());
       // Now add all of the MIBs and their stack nodes.
       for (auto &MIB : AN.MIBs) {
         CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
             StackContext(&MIB);
         std::vector<ContextTotalSize> ContextSizeInfo;
-        if (MemProfReportHintedSizes) {
+        if (!AN.ContextSizeInfos.empty()) {
           for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
             ContextSizeInfo.push_back({FullStackId, TotalSize});
         }
@@ -2825,6 +2834,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
     if (!Node->IsAllocation)
       continue;
     DenseSet<uint32_t> ContextIds = Node->getContextIds();
+    auto AllocTypeFromCall = getAllocationCallType(Node->Call);
     std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
     std::sort(SortedIds.begin(), SortedIds.end());
     for (auto Id : SortedIds) {
@@ -2837,7 +2847,11 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
            << getAllocTypeString((uint8_t)TypeI->second)
            << " full allocation context " << Info.FullStackId
            << " with total size " << Info.TotalSize << " is "
-           << getAllocTypeString(Node->AllocTypes) << " after cloning\n";
+           << getAllocTypeString(Node->AllocTypes) << " after cloning";
+        if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
+          OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
+             << " due to cold byte percent";
+        OS << "\n";
       }
     }
   }
@@ -3487,6 +3501,23 @@ void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
   AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
 }
 
+AllocationType
+ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
+  const auto *CB = cast<CallBase>(Call.call());
+  if (!CB->getAttributes().hasFnAttr("memprof"))
+    return AllocationType::None;
+  return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
+             ? AllocationType::Cold
+             : AllocationType::NotCold;
+}
+
+AllocationType
+IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
+  const auto *AI = Call.call().dyn_cast<AllocInfo *>();
+  assert(AI->Versions.size() > Call.cloneNo());
+  return (AllocationType)AI->Versions[Call.cloneNo()];
+}
+
 void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
                                             FuncInfo CalleeFunc) {
   if (CalleeFunc.cloneNo() > 0)
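For context, the module-level getAllocationCallType above decodes the "memprof" string attribute that the existing hinting path attaches to allocation calls. Below is a rough sketch of that convention written as free helpers, assuming the CallBase/Attribute APIs and the AllocationType enum already available in this file; the writer side is an assumption about code outside this patch, not a copy of it.

// Sketch only, not part of the patch.
static void writeMemProfHint(llvm::CallBase &CB, AllocationType AT) {
  // Hinted allocation calls carry "memprof"="cold" or "memprof"="notcold".
  CB.addFnAttr(llvm::Attribute::get(CB.getContext(), "memprof",
                                    AT == AllocationType::Cold ? "cold"
                                                               : "notcold"));
}

static AllocationType readMemProfHint(const llvm::CallBase &CB) {
  // Allocations that were never hinted carry no "memprof" attribute at all.
  if (!CB.getAttributes().hasFnAttr("memprof"))
    return AllocationType::None;
  return CB.getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
             ? AllocationType::Cold
             : AllocationType::NotCold;
}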
@@ -4017,6 +4048,9 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
     }
   }
 
+  uint8_t BothTypes =
+      (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
+
   auto UpdateCalls = [&](ContextNode *Node,
                          DenseSet<const ContextNode *> &Visited,
                          auto &&UpdateCalls) {
@@ -4036,7 +4070,31 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
       return;
 
     if (Node->IsAllocation) {
-      updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
+      auto AT = allocTypeToUse(Node->AllocTypes);
+      // If the allocation type is ambiguous, and more aggressive hinting
+      // has been enabled via the MinClonedColdBytePercent flag, see if this
+      // allocation should be hinted cold anyway because its fraction cold bytes
+      // allocated is at least the given threshold.
+      if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
+          !ContextIdToContextSizeInfos.empty()) {
+        uint64_t TotalCold = 0;
+        uint64_t Total = 0;
+        for (auto Id : Node->getContextIds()) {
+          auto TypeI = ContextIdToAllocationType.find(Id);
+          assert(TypeI != ContextIdToAllocationType.end());
+          auto CSI = ContextIdToContextSizeInfos.find(Id);
+          if (CSI != ContextIdToContextSizeInfos.end()) {
+            for (auto &Info : CSI->second) {
+              Total += Info.TotalSize;
+              if (TypeI->second == AllocationType::Cold)
+                TotalCold += Info.TotalSize;
+            }
+          }
+        }
+        if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
+          AT = AllocationType::Cold;
+      }
+      updateAllocationCall(Node->Call, AT);
       assert(Node->MatchingCalls.empty());
       return;
     }
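The threshold test in this hunk, TotalCold * 100 >= Total * MinClonedColdBytePercent, is the integer form of TotalCold / Total >= MinClonedColdBytePercent / 100, which avoids floating point and rounding. A standalone sketch with made-up byte counts and an assumed threshold of 80:

// Standalone illustration; the byte counts and the 80% threshold are example
// values, not taken from the patch.
#include <cstdint>
#include <cstdio>

int main() {
  std::uint64_t Total = 1000;    // bytes over all contexts reaching the node
  std::uint64_t TotalCold = 820; // bytes from contexts profiled as cold
  unsigned MinClonedColdBytePercent = 80; // value of the threshold flag

  // Same comparison as the patch: 820 * 100 = 82000 >= 1000 * 80 = 80000,
  // so this ambiguous allocation would be hinted cold.
  bool HintCold = TotalCold * 100 >= Total * MinClonedColdBytePercent;
  std::printf("hint cold: %s\n", HintCold ? "yes" : "no"); // prints "yes"
  return 0;
}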
@@ -4419,7 +4477,11 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
           // will still be none type or should have gotten the default NotCold.
           // Skip that after calling clone helper since that does some sanity
           // checks that confirm we haven't decided yet that we need cloning.
-          if (AllocNode.Versions.size() == 1) {
+          // We might have a single version that is cold due to the
+          // MinClonedColdBytePercent heuristic, make sure we don't skip in that
+          // case.
+          if (AllocNode.Versions.size() == 1 &&
+              (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
             assert((AllocationType)AllocNode.Versions[0] ==
                        AllocationType::NotCold ||
                    (AllocationType)AllocNode.Versions[0] ==