Skip to content

Commit c7451ff

Browse files
[MemProf] Supporting hinting mostly-cold allocations after cloning (#120633)
Optionally unconditionally hint allocations as cold or not cold during the cloning step if the percentage of bytes allocated is at least that of the given threshold. This is similar to PR120301 which supports this during matching, but enables the same behavior during cloning, to reduce the false positives that can be addressed by cloning at the cost of carrying the additional size metadata/summary.
1 parent d2b8acc commit c7451ff

File tree

4 files changed

+116
-12
lines changed

4 files changed

+116
-12
lines changed

llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp

Lines changed: 71 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ cl::opt<bool> MemProfRequireDefinitionForPromotion(
140140
} // namespace llvm
141141

142142
extern cl::opt<bool> MemProfReportHintedSizes;
143+
extern cl::opt<unsigned> MinClonedColdBytePercent;
143144

144145
namespace {
145146
/// CRTP base for graphs built from either IR or ThinLTO summary index.
@@ -617,6 +618,11 @@ class CallsiteContextGraph {
617618
static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
618619
}
619620

621+
/// Get the AllocationType assigned to the given allocation instruction clone.
622+
AllocationType getAllocationCallType(const CallInfo &Call) const {
623+
return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
624+
}
625+
620626
/// Update non-allocation call to invoke (possibly cloned) function
621627
/// CalleeFunc.
622628
void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
@@ -711,7 +717,8 @@ class CallsiteContextGraph {
711717

712718
/// Map from each contextID to the profiled full contexts and their total
713719
/// sizes (there may be more than one due to context trimming),
714-
/// optionally populated when requested (via MemProfReportHintedSizes).
720+
/// optionally populated when requested (via MemProfReportHintedSizes or
721+
/// MinClonedColdBytePercent).
715722
DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
716723

717724
/// Identifies the context node created for a stack id when adding the MIB
@@ -773,6 +780,7 @@ class ModuleCallsiteContextGraph
773780
uint64_t getLastStackId(Instruction *Call);
774781
std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
775782
void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
783+
AllocationType getAllocationCallType(const CallInfo &Call) const;
776784
void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
777785
CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
778786
Instruction *>::FuncInfo
@@ -852,6 +860,7 @@ class IndexCallsiteContextGraph
852860
uint64_t getLastStackId(IndexCall &Call);
853861
std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
854862
void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
863+
AllocationType getAllocationCallType(const CallInfo &Call) const;
855864
void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
856865
CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
857866
IndexCall>::FuncInfo
@@ -1201,8 +1210,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
12011210

12021211
ContextIdToAllocationType[++LastContextId] = AllocType;
12031212

1204-
if (MemProfReportHintedSizes) {
1205-
assert(!ContextSizeInfo.empty());
1213+
if (!ContextSizeInfo.empty()) {
12061214
auto &Entry = ContextIdToContextSizeInfos[LastContextId];
12071215
Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
12081216
}
@@ -2043,14 +2051,15 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
20432051
CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
20442052
EmptyContext;
20452053
unsigned I = 0;
2046-
assert(!MemProfReportHintedSizes ||
2047-
AN.ContextSizeInfos.size() == AN.MIBs.size());
2054+
assert(
2055+
(!MemProfReportHintedSizes && MinClonedColdBytePercent >= 100) ||
2056+
AN.ContextSizeInfos.size() == AN.MIBs.size());
20482057
// Now add all of the MIBs and their stack nodes.
20492058
for (auto &MIB : AN.MIBs) {
20502059
CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
20512060
StackContext(&MIB);
20522061
std::vector<ContextTotalSize> ContextSizeInfo;
2053-
if (MemProfReportHintedSizes) {
2062+
if (!AN.ContextSizeInfos.empty()) {
20542063
for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
20552064
ContextSizeInfo.push_back({FullStackId, TotalSize});
20562065
}
@@ -2825,6 +2834,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
28252834
if (!Node->IsAllocation)
28262835
continue;
28272836
DenseSet<uint32_t> ContextIds = Node->getContextIds();
2837+
auto AllocTypeFromCall = getAllocationCallType(Node->Call);
28282838
std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
28292839
std::sort(SortedIds.begin(), SortedIds.end());
28302840
for (auto Id : SortedIds) {
@@ -2837,7 +2847,11 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
28372847
<< getAllocTypeString((uint8_t)TypeI->second)
28382848
<< " full allocation context " << Info.FullStackId
28392849
<< " with total size " << Info.TotalSize << " is "
2840-
<< getAllocTypeString(Node->AllocTypes) << " after cloning\n";
2850+
<< getAllocTypeString(Node->AllocTypes) << " after cloning";
2851+
if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
2852+
OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
2853+
<< " due to cold byte percent";
2854+
OS << "\n";
28412855
}
28422856
}
28432857
}
@@ -3487,6 +3501,23 @@ void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
34873501
AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
34883502
}
34893503

3504+
AllocationType
3505+
ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
3506+
const auto *CB = cast<CallBase>(Call.call());
3507+
if (!CB->getAttributes().hasFnAttr("memprof"))
3508+
return AllocationType::None;
3509+
return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3510+
? AllocationType::Cold
3511+
: AllocationType::NotCold;
3512+
}
3513+
3514+
AllocationType
3515+
IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
3516+
const auto *AI = Call.call().dyn_cast<AllocInfo *>();
3517+
assert(AI->Versions.size() > Call.cloneNo());
3518+
return (AllocationType)AI->Versions[Call.cloneNo()];
3519+
}
3520+
34903521
void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
34913522
FuncInfo CalleeFunc) {
34923523
if (CalleeFunc.cloneNo() > 0)
@@ -4017,6 +4048,9 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
40174048
}
40184049
}
40194050

4051+
uint8_t BothTypes =
4052+
(uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
4053+
40204054
auto UpdateCalls = [&](ContextNode *Node,
40214055
DenseSet<const ContextNode *> &Visited,
40224056
auto &&UpdateCalls) {
@@ -4036,7 +4070,31 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
40364070
return;
40374071

40384072
if (Node->IsAllocation) {
4039-
updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
4073+
auto AT = allocTypeToUse(Node->AllocTypes);
4074+
// If the allocation type is ambiguous, and more aggressive hinting
4075+
// has been enabled via the MinClonedColdBytePercent flag, see if this
4076+
// allocation should be hinted cold anyway because its fraction cold bytes
4077+
// allocated is at least the given threshold.
4078+
if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
4079+
!ContextIdToContextSizeInfos.empty()) {
4080+
uint64_t TotalCold = 0;
4081+
uint64_t Total = 0;
4082+
for (auto Id : Node->getContextIds()) {
4083+
auto TypeI = ContextIdToAllocationType.find(Id);
4084+
assert(TypeI != ContextIdToAllocationType.end());
4085+
auto CSI = ContextIdToContextSizeInfos.find(Id);
4086+
if (CSI != ContextIdToContextSizeInfos.end()) {
4087+
for (auto &Info : CSI->second) {
4088+
Total += Info.TotalSize;
4089+
if (TypeI->second == AllocationType::Cold)
4090+
TotalCold += Info.TotalSize;
4091+
}
4092+
}
4093+
}
4094+
if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
4095+
AT = AllocationType::Cold;
4096+
}
4097+
updateAllocationCall(Node->Call, AT);
40404098
assert(Node->MatchingCalls.empty());
40414099
return;
40424100
}
@@ -4419,7 +4477,11 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
44194477
// will still be none type or should have gotten the default NotCold.
44204478
// Skip that after calling clone helper since that does some sanity
44214479
// checks that confirm we haven't decided yet that we need cloning.
4422-
if (AllocNode.Versions.size() == 1) {
4480+
// We might have a single version that is cold due to the
4481+
// MinClonedColdBytePercent heuristic, make sure we don't skip in that
4482+
// case.
4483+
if (AllocNode.Versions.size() == 1 &&
4484+
(AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
44234485
assert((AllocationType)AllocNode.Versions[0] ==
44244486
AllocationType::NotCold ||
44254487
(AllocationType)AllocNode.Versions[0] ==

llvm/lib/Transforms/Instrumentation/MemProfiler.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,10 @@ static cl::opt<bool>
176176
cl::desc("Salvage stale MemProf profile"),
177177
cl::init(false), cl::Hidden);
178178

179+
cl::opt<unsigned> MinClonedColdBytePercent(
180+
"memprof-cloning-cold-threshold", cl::init(100), cl::Hidden,
181+
cl::desc("Min percent of cold bytes to hint alloc cold during cloning"));
182+
179183
extern cl::opt<bool> MemProfReportHintedSizes;
180184

181185
static cl::opt<unsigned> MinMatchedColdBytePercent(
@@ -755,7 +759,7 @@ static AllocationType addCallStack(CallStackTrie &AllocTrie,
755759
AllocInfo->Info.getAllocCount(),
756760
AllocInfo->Info.getTotalLifetime());
757761
std::vector<ContextTotalSize> ContextSizeInfo;
758-
if (MemProfReportHintedSizes) {
762+
if (MemProfReportHintedSizes || MinClonedColdBytePercent < 100) {
759763
auto TotalSize = AllocInfo->Info.getTotalSize();
760764
assert(TotalSize);
761765
assert(FullStackId != 0);
@@ -1126,7 +1130,8 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
11261130
InlinedCallStack)) {
11271131
NumOfMemProfMatchedAllocContexts++;
11281132
uint64_t FullStackId = 0;
1129-
if (ClPrintMemProfMatchInfo || MemProfReportHintedSizes)
1133+
if (ClPrintMemProfMatchInfo || MemProfReportHintedSizes ||
1134+
MinClonedColdBytePercent < 100)
11301135
FullStackId = computeFullStackId(AllocInfo->CallStack);
11311136
auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId);
11321137
TotalSize += AllocInfo->Info.getTotalSize();

llvm/test/ThinLTO/X86/memprof-missing-callsite.ll

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
;; Test callsite context graph generation for simple call graph with
22
;; two memprof contexts and no inlining, where one callsite required for
3-
;; cloning is missing (e.g. unmatched).
3+
;; cloning is missing (e.g. unmatched). Use this to test aggressive hinting
4+
;; threshold.
45
;;
56
;; Original code looks like:
67
;;
@@ -30,6 +31,30 @@
3031
; RUN: --check-prefix=SIZESUNHINTED
3132
; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --implicit-check-not "\"memprof\"=\"cold\""
3233

34+
;; Check that we do hint with a sufficient -memprof-cloning-cold-threshold.
35+
; RUN: opt -thinlto-bc -memprof-report-hinted-sizes %s >%t.o
36+
; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
37+
; RUN: -supports-hot-cold-new \
38+
; RUN: -r=%t.o,main,plx \
39+
; RUN: -r=%t.o,_Znam, \
40+
; RUN: -memprof-report-hinted-sizes -memprof-cloning-cold-threshold=80 \
41+
; RUN: -pass-remarks=memprof-context-disambiguation -save-temps \
42+
; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=REMARKSHINTED \
43+
; RUN: --check-prefix=SIZESHINTED
44+
; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IRHINTED
45+
46+
;; Check again that we hint with a sufficient -memprof-cloning-cold-threshold,
47+
;; even if we don't specify -memprof-report-hinted-sizes.
48+
; RUN: opt -thinlto-bc -memprof-report-hinted-sizes %s >%t.o
49+
; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
50+
; RUN: -supports-hot-cold-new \
51+
; RUN: -r=%t.o,main,plx \
52+
; RUN: -r=%t.o,_Znam, \
53+
; RUN: -memprof-cloning-cold-threshold=80 \
54+
; RUN: -pass-remarks=memprof-context-disambiguation -save-temps \
55+
; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=REMARKSHINTED
56+
; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IRHINTED
57+
3358
source_filename = "memprof-missing-callsite.ll"
3459
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
3560
target triple = "x86_64-unknown-linux-gnu"
@@ -68,3 +93,11 @@ attributes #0 = { noinline optnone }
6893
; SIZESUNHINTED: NotCold full allocation context 123 with total size 100 is NotColdCold after cloning
6994
; SIZESUNHINTED: Cold full allocation context 456 with total size 200 is NotColdCold after cloning
7095
; SIZESUNHINTED: Cold full allocation context 789 with total size 300 is NotColdCold after cloning
96+
97+
; SIZESHINTED: NotCold full allocation context 123 with total size 100 is NotColdCold after cloning marked Cold due to cold byte percent
98+
; SIZESHINTED: Cold full allocation context 456 with total size 200 is NotColdCold after cloning marked Cold due to cold byte percent
99+
; SIZESHINTED: Cold full allocation context 789 with total size 300 is NotColdCold after cloning marked Cold due to cold byte percent
100+
101+
; REMARKSHINTED: call in clone _Z3foov marked with memprof allocation attribute cold
102+
103+
; IRHINTED: "memprof"="cold"

llvm/test/Transforms/PGOProfile/memprof.ll

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,10 @@
6969
;; Check that we hint additional allocations with a threshold < 100%
7070
; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -pgo-warn-missing-function -S -memprof-report-hinted-sizes -memprof-matching-cold-threshold=60 2>&1 | FileCheck %s --check-prefixes=TOTALSIZESSINGLE,TOTALSIZESTHRESH60
7171

72+
;; Make sure that the -memprof-cloning-cold-threshold flag is enough to cause
73+
;; the size metadata to be generated for the LTO link.
74+
; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -pgo-warn-missing-function -S -memprof-cloning-cold-threshold=80 2>&1 | FileCheck %s --check-prefixes=TOTALSIZES
75+
7276
;; Make sure we emit a random hotness seed if requested.
7377
; RUN: llvm-profdata merge -memprof-random-hotness %S/Inputs/memprof.memprofraw --profiled-binary %S/Inputs/memprof.exe -o %t.memprofdatarand 2>&1 | FileCheck %s --check-prefix=RAND
7478
; RAND: random hotness seed =

0 commit comments

Comments
 (0)