[MemProf] Supporting hinting mostly-cold allocations after cloning #120633

Merged

80 changes: 71 additions & 9 deletions llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -140,6 +140,7 @@ cl::opt<bool> MemProfRequireDefinitionForPromotion(
} // namespace llvm

extern cl::opt<bool> MemProfReportHintedSizes;
extern cl::opt<unsigned> MinClonedColdBytePercent;

namespace {
/// CRTP base for graphs built from either IR or ThinLTO summary index.
@@ -617,6 +618,11 @@ class CallsiteContextGraph {
static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
}

/// Get the AllocationType assigned to the given allocation instruction clone.
AllocationType getAllocationCallType(const CallInfo &Call) const {
return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
}

/// Update non-allocation call to invoke (possibly cloned) function
/// CalleeFunc.
void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
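For readers less familiar with the pattern named at the top of this file, here is a minimal CRTP sketch of how the base forwards getAllocationCallType to the IR- or index-based derived graph without virtual dispatch. Names and bodies are illustrative stand-ins, not the pass's real types:

```cpp
#include <cstdint>

// Illustrative-only sketch of the CRTP dispatch used by CallsiteContextGraph.
enum class AllocationType : uint8_t { None = 0, NotCold = 1, Cold = 2 };

template <typename DerivedCCG> class GraphBase {
public:
  // Statically forwards to the derived graph; no vtable involved.
  AllocationType getAllocationCallType() const {
    return static_cast<const DerivedCCG *>(this)->getAllocationCallTypeImpl();
  }
};

// The IR-based flavor would inspect the call's "memprof" attribute here;
// the ThinLTO summary flavor would read the clone's recorded version.
class ModuleGraph : public GraphBase<ModuleGraph> {
public:
  AllocationType getAllocationCallTypeImpl() const {
    return AllocationType::NotCold; // placeholder for attribute inspection
  }
};
```

The real code keeps the same method name in base and derived classes; the Impl suffix here just makes the forwarding explicit.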
@@ -711,7 +717,8 @@ class CallsiteContextGraph {

/// Map from each contextID to the profiled full contexts and their total
/// sizes (there may be more than one due to context trimming),
-/// optionally populated when requested (via MemProfReportHintedSizes).
+/// optionally populated when requested (via MemProfReportHintedSizes or
+/// MinClonedColdBytePercent).
DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;

/// Identifies the context node created for a stack id when adding the MIB
@@ -773,6 +780,7 @@ class ModuleCallsiteContextGraph
uint64_t getLastStackId(Instruction *Call);
std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
AllocationType getAllocationCallType(const CallInfo &Call) const;
void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
Instruction *>::FuncInfo
@@ -852,6 +860,7 @@ class IndexCallsiteContextGraph
uint64_t getLastStackId(IndexCall &Call);
std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
AllocationType getAllocationCallType(const CallInfo &Call) const;
void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
IndexCall>::FuncInfo
@@ -1201,8 +1210,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(

ContextIdToAllocationType[++LastContextId] = AllocType;

-if (MemProfReportHintedSizes) {
-assert(!ContextSizeInfo.empty());
+if (!ContextSizeInfo.empty()) {
auto &Entry = ContextIdToContextSizeInfos[LastContextId];
Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
}
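Note that the flag check disappears here: once any context size info was recorded, whether for size reporting or for the cloning threshold, it is attached to the new context id, so this code no longer needs to know which flag requested the sizes.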
@@ -2043,14 +2051,15 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
EmptyContext;
unsigned I = 0;
-assert(!MemProfReportHintedSizes ||
-AN.ContextSizeInfos.size() == AN.MIBs.size());
+assert(
+(!MemProfReportHintedSizes && MinClonedColdBytePercent >= 100) ||
+AN.ContextSizeInfos.size() == AN.MIBs.size());
// Now add all of the MIBs and their stack nodes.
for (auto &MIB : AN.MIBs) {
CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
StackContext(&MIB);
std::vector<ContextTotalSize> ContextSizeInfo;
-if (MemProfReportHintedSizes) {
+if (!AN.ContextSizeInfos.empty()) {
for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
ContextSizeInfo.push_back({FullStackId, TotalSize});
}
@@ -2825,6 +2834,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
if (!Node->IsAllocation)
continue;
DenseSet<uint32_t> ContextIds = Node->getContextIds();
auto AllocTypeFromCall = getAllocationCallType(Node->Call);
std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
std::sort(SortedIds.begin(), SortedIds.end());
for (auto Id : SortedIds) {
@@ -2837,7 +2847,11 @@
<< getAllocTypeString((uint8_t)TypeI->second)
<< " full allocation context " << Info.FullStackId
<< " with total size " << Info.TotalSize << " is "
<< getAllocTypeString(Node->AllocTypes) << " after cloning\n";
<< getAllocTypeString(Node->AllocTypes) << " after cloning";
if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
<< " due to cold byte percent";
OS << "\n";
}
}
}
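With the hinted-size report now consulting the attribute on the (possibly cloned) call itself, ambiguous allocations that were force-hinted get an extra suffix; the SIZESHINTED checks in the test below show the resulting form, e.g. `... is NotColdCold after cloning marked Cold due to cold byte percent`.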
@@ -3487,6 +3501,23 @@ void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
}

AllocationType
ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
const auto *CB = cast<CallBase>(Call.call());
if (!CB->getAttributes().hasFnAttr("memprof"))
return AllocationType::None;
return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
? AllocationType::Cold
: AllocationType::NotCold;
}

AllocationType
IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
const auto *AI = Call.call().dyn_cast<AllocInfo *>();
assert(AI->Versions.size() > Call.cloneNo());
return (AllocationType)AI->Versions[Call.cloneNo()];
}

void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
FuncInfo CalleeFunc) {
if (CalleeFunc.cloneNo() > 0)
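The two implementations answer the same question from different sources: the module flavor inspects the "memprof" string attribute already placed on the call instruction, while the index flavor reads the allocation type recorded for this clone in the summary's Versions array.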
@@ -4017,6 +4048,9 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
}
}

uint8_t BothTypes =
(uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;

auto UpdateCalls = [&](ContextNode *Node,
DenseSet<const ContextNode *> &Visited,
auto &&UpdateCalls) {
@@ -4036,7 +4070,31 @@
return;

if (Node->IsAllocation) {
-updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
+auto AT = allocTypeToUse(Node->AllocTypes);
+// If the allocation type is ambiguous, and more aggressive hinting
+// has been enabled via the MinClonedColdBytePercent flag, see if this
+// allocation should be hinted cold anyway because its fraction of cold
+// bytes allocated is at least the given threshold.
+if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
+!ContextIdToContextSizeInfos.empty()) {
+uint64_t TotalCold = 0;
+uint64_t Total = 0;
+for (auto Id : Node->getContextIds()) {
+auto TypeI = ContextIdToAllocationType.find(Id);
+assert(TypeI != ContextIdToAllocationType.end());
+auto CSI = ContextIdToContextSizeInfos.find(Id);
+if (CSI != ContextIdToContextSizeInfos.end()) {
+for (auto &Info : CSI->second) {
+Total += Info.TotalSize;
+if (TypeI->second == AllocationType::Cold)
+TotalCold += Info.TotalSize;
+}
+}
+}
+if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
+AT = AllocationType::Cold;
+}
+updateAllocationCall(Node->Call, AT);
assert(Node->MatchingCalls.empty());
return;
}
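Taken together, the BothTypes mask introduced above and this size accumulation amount to a single integer-percent test. Below is a minimal, self-contained sketch of that test under stated assumptions: the enum values mirror LLVM's bitmask-style AllocationType, and the flat (type, size) vector stands in for walking the node's context ids.

```cpp
#include <cstdint>
#include <utility>
#include <vector>

// Stand-ins for the pass's types; bit values assumed to mirror LLVM's
// AllocationType, where NotCold and Cold occupy distinct bits.
enum class AllocationType : uint8_t { None = 0, NotCold = 1, Cold = 2 };

// Decide whether an ambiguous allocation should be hinted cold, given the
// (type, byte-size) pairs of its profiled contexts. Mirrors the patch's
// comparison: TotalCold * 100 >= Total * Percent expresses
// "cold fraction >= Percent%" without floating point.
static bool shouldHintCold(
    uint8_t AllocTypes,
    const std::vector<std::pair<AllocationType, uint64_t>> &Contexts,
    unsigned MinClonedColdBytePercent) {
  const uint8_t BothTypes =
      (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
  // Only ambiguous allocations qualify, and only when the flag was
  // lowered below its disabling default of 100.
  if (AllocTypes != BothTypes || MinClonedColdBytePercent >= 100)
    return false;
  uint64_t TotalCold = 0, Total = 0;
  for (const auto &[Type, Size] : Contexts) {
    Total += Size;
    if (Type == AllocationType::Cold)
      TotalCold += Size;
  }
  return TotalCold * 100 >= Total * MinClonedColdBytePercent;
}
```

For instance, an allocation that is 60% cold by bytes stays NotCold at the default threshold of 100 but is hinted cold once the flag is lowered to 60 or below.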
@@ -4419,7 +4477,11 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
// will still be none type or should have gotten the default NotCold.
// Skip that after calling clone helper since that does some sanity
// checks that confirm we haven't decided yet that we need cloning.
-if (AllocNode.Versions.size() == 1) {
+// We might have a single version that is cold due to the
+// MinClonedColdBytePercent heuristic; make sure we don't skip in that
+// case.
+if (AllocNode.Versions.size() == 1 &&
+(AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
assert((AllocationType)AllocNode.Versions[0] ==
AllocationType::NotCold ||
(AllocationType)AllocNode.Versions[0] ==
9 changes: 7 additions & 2 deletions llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
@@ -176,6 +176,10 @@ static cl::opt<bool>
cl::desc("Salvage stale MemProf profile"),
cl::init(false), cl::Hidden);

cl::opt<unsigned> MinClonedColdBytePercent(
"memprof-cloning-cold-threshold", cl::init(100), cl::Hidden,
cl::desc("Min percent of cold bytes to hint alloc cold during cloning"));

extern cl::opt<bool> MemProfReportHintedSizes;

static cl::opt<unsigned> MinMatchedColdBytePercent(
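This definition is what the `extern cl::opt<unsigned> MinClonedColdBytePercent` declaration in MemProfContextDisambiguation.cpp binds to; the default of 100 keeps the new behavior off unless a lower threshold is explicitly requested.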
@@ -755,7 +759,7 @@ static AllocationType addCallStack(CallStackTrie &AllocTrie,
AllocInfo->Info.getAllocCount(),
AllocInfo->Info.getTotalLifetime());
std::vector<ContextTotalSize> ContextSizeInfo;
-if (MemProfReportHintedSizes) {
+if (MemProfReportHintedSizes || MinClonedColdBytePercent < 100) {
auto TotalSize = AllocInfo->Info.getTotalSize();
assert(TotalSize);
assert(FullStackId != 0);
@@ -1126,7 +1130,8 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
InlinedCallStack)) {
NumOfMemProfMatchedAllocContexts++;
uint64_t FullStackId = 0;
-if (ClPrintMemProfMatchInfo || MemProfReportHintedSizes)
+if (ClPrintMemProfMatchInfo || MemProfReportHintedSizes ||
+MinClonedColdBytePercent < 100)
FullStackId = computeFullStackId(AllocInfo->CallStack);
auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId);
TotalSize += AllocInfo->Info.getTotalSize();
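Since the full stack id is what keys the per-context size records, it must now also be computed when only the cloning threshold is set and size reporting itself is off.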
35 changes: 34 additions & 1 deletion llvm/test/ThinLTO/X86/memprof-missing-callsite.ll
@@ -1,6 +1,7 @@
;; Test callsite context graph generation for simple call graph with
;; two memprof contexts and no inlining, where one callsite required for
-;; cloning is missing (e.g. unmatched).
+;; cloning is missing (e.g. unmatched). Use this to test the aggressive
+;; hinting threshold.
;;
;; Original code looks like:
;;
@@ -30,6 +31,30 @@
; RUN: --check-prefix=SIZESUNHINTED
; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --implicit-check-not "\"memprof\"=\"cold\""

;; Check that we do hint with a sufficient -memprof-cloning-cold-threshold.
; RUN: opt -thinlto-bc -memprof-report-hinted-sizes %s >%t.o
; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
; RUN: -supports-hot-cold-new \
; RUN: -r=%t.o,main,plx \
; RUN: -r=%t.o,_Znam, \
; RUN: -memprof-report-hinted-sizes -memprof-cloning-cold-threshold=80 \
; RUN: -pass-remarks=memprof-context-disambiguation -save-temps \
; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=REMARKSHINTED \
; RUN: --check-prefix=SIZESHINTED
; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IRHINTED

;; Check again that we hint with a sufficient -memprof-cloning-cold-threshold,
;; even if we don't specify -memprof-report-hinted-sizes.
; RUN: opt -thinlto-bc -memprof-report-hinted-sizes %s >%t.o
; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
; RUN: -supports-hot-cold-new \
; RUN: -r=%t.o,main,plx \
; RUN: -r=%t.o,_Znam, \
; RUN: -memprof-cloning-cold-threshold=80 \
; RUN: -pass-remarks=memprof-context-disambiguation -save-temps \
; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=REMARKSHINTED
; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IRHINTED

source_filename = "memprof-missing-callsite.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -68,3 +93,11 @@ attributes #0 = { noinline optnone }
; SIZESUNHINTED: NotCold full allocation context 123 with total size 100 is NotColdCold after cloning
; SIZESUNHINTED: Cold full allocation context 456 with total size 200 is NotColdCold after cloning
; SIZESUNHINTED: Cold full allocation context 789 with total size 300 is NotColdCold after cloning

; SIZESHINTED: NotCold full allocation context 123 with total size 100 is NotColdCold after cloning marked Cold due to cold byte percent
; SIZESHINTED: Cold full allocation context 456 with total size 200 is NotColdCold after cloning marked Cold due to cold byte percent
; SIZESHINTED: Cold full allocation context 789 with total size 300 is NotColdCold after cloning marked Cold due to cold byte percent

; REMARKSHINTED: call in clone _Z3foov marked with memprof allocation attribute cold

; IRHINTED: "memprof"="cold"
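The expected hinting follows from the sizes in the checks above: the three contexts total 600 bytes, of which 500 (contexts 456 and 789) are cold, and 500 * 100 = 50000 >= 600 * 80 = 48000, so the ambiguous allocation clears the 80% threshold and every context is reported as marked Cold.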
4 changes: 4 additions & 0 deletions llvm/test/Transforms/PGOProfile/memprof.ll
@@ -69,6 +69,10 @@
;; Check that we hint additional allocations with a threshold < 100%
; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -pgo-warn-missing-function -S -memprof-report-hinted-sizes -memprof-matching-cold-threshold=60 2>&1 | FileCheck %s --check-prefixes=TOTALSIZESSINGLE,TOTALSIZESTHRESH60

;; Make sure that the -memprof-cloning-cold-threshold flag is enough to cause
;; the size metadata to be generated for the LTO link.
; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -pgo-warn-missing-function -S -memprof-cloning-cold-threshold=80 2>&1 | FileCheck %s --check-prefixes=TOTALSIZES

;; Make sure we emit a random hotness seed if requested.
; RUN: llvm-profdata merge -memprof-random-hotness %S/Inputs/memprof.memprofraw --profiled-binary %S/Inputs/memprof.exe -o %t.memprofdatarand 2>&1 | FileCheck %s --check-prefix=RAND
; RAND: random hotness seed =