From 6196acfdb264209a19228dd05bb821280d36945a Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Thu, 19 Dec 2024 12:14:31 -0800 Subject: [PATCH 1/2] [MemProf] Supporting hinting mostly-cold allocations after cloning Optionally unconditionally hint allocations as cold or not cold during the cloning step if the percentage of bytes allocated is at least that of the given threshold. This is similar to PR120301 which supports this during matching, but enables the same behavior during cloning, to reduce the false positives that can be addressed by cloning at the cost of carrying the additional size metadata/summary. --- .../IPO/MemProfContextDisambiguation.cpp | 77 ++++++++++++++++--- .../Instrumentation/MemProfiler.cpp | 9 ++- .../ThinLTO/X86/memprof-missing-callsite.ll | 32 +++++++- llvm/test/Transforms/PGOProfile/memprof.ll | 4 + 4 files changed, 110 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index cd4b9f557fb49..59c183b105c24 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -140,6 +140,7 @@ cl::opt MemProfRequireDefinitionForPromotion( } // namespace llvm extern cl::opt MemProfReportHintedSizes; +extern cl::opt MinClonedColdBytePercent; namespace { /// CRTP base for graphs built from either IR or ThinLTO summary index. @@ -617,6 +618,11 @@ class CallsiteContextGraph { static_cast(this)->updateAllocationCall(Call, AllocType); } + /// Get the AllocationType assigned to the given allocation instruction clone. + AllocationType getAllocationCallType(const CallInfo &Call) const { + return static_cast(this)->getAllocationCallType(Call); + } + /// Update non-allocation call to invoke (possibly cloned) function /// CalleeFunc. void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) { @@ -711,7 +717,8 @@ class CallsiteContextGraph { /// Map from each contextID to the profiled full contexts and their total /// sizes (there may be more than one due to context trimming), - /// optionally populated when requested (via MemProfReportHintedSizes). + /// optionally populated when requested (via MemProfReportHintedSizes or + /// MinClonedColdBytePercent). DenseMap> ContextIdToContextSizeInfos; /// Identifies the context node created for a stack id when adding the MIB @@ -773,6 +780,7 @@ class ModuleCallsiteContextGraph uint64_t getLastStackId(Instruction *Call); std::vector getStackIdsWithContextNodesForCall(Instruction *Call); void updateAllocationCall(CallInfo &Call, AllocationType AllocType); + AllocationType getAllocationCallType(const CallInfo &Call) const; void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); CallsiteContextGraph::FuncInfo @@ -852,6 +860,7 @@ class IndexCallsiteContextGraph uint64_t getLastStackId(IndexCall &Call); std::vector getStackIdsWithContextNodesForCall(IndexCall &Call); void updateAllocationCall(CallInfo &Call, AllocationType AllocType); + AllocationType getAllocationCallType(const CallInfo &Call) const; void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); CallsiteContextGraph::FuncInfo @@ -1201,8 +1210,7 @@ void CallsiteContextGraph::addStackNodesForMIB( ContextIdToAllocationType[++LastContextId] = AllocType; - if (MemProfReportHintedSizes) { - assert(!ContextSizeInfo.empty()); + if (!ContextSizeInfo.empty()) { auto &Entry = ContextIdToContextSizeInfos[LastContextId]; Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end()); } @@ -2043,14 +2051,15 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph( CallStack::const_iterator> EmptyContext; unsigned I = 0; - assert(!MemProfReportHintedSizes || - AN.ContextSizeInfos.size() == AN.MIBs.size()); + assert( + (!MemProfReportHintedSizes && MinClonedColdBytePercent >= 100) || + AN.ContextSizeInfos.size() == AN.MIBs.size()); // Now add all of the MIBs and their stack nodes. for (auto &MIB : AN.MIBs) { CallStack::const_iterator> StackContext(&MIB); std::vector ContextSizeInfo; - if (MemProfReportHintedSizes) { + if (!AN.ContextSizeInfos.empty()) { for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I]) ContextSizeInfo.push_back({FullStackId, TotalSize}); } @@ -2825,6 +2834,7 @@ void CallsiteContextGraph::printTotalSizes( if (!Node->IsAllocation) continue; DenseSet ContextIds = Node->getContextIds(); + auto AllocTypeFromCall = getAllocationCallType(Node->Call); std::vector SortedIds(ContextIds.begin(), ContextIds.end()); std::sort(SortedIds.begin(), SortedIds.end()); for (auto Id : SortedIds) { @@ -2837,7 +2847,11 @@ void CallsiteContextGraph::printTotalSizes( << getAllocTypeString((uint8_t)TypeI->second) << " full allocation context " << Info.FullStackId << " with total size " << Info.TotalSize << " is " - << getAllocTypeString(Node->AllocTypes) << " after cloning\n"; + << getAllocTypeString(Node->AllocTypes) << " after cloning"; + if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall) + OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall) + << " due to cold byte percent"; + OS << "\n"; } } } @@ -3487,6 +3501,23 @@ void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call, AI->Versions[Call.cloneNo()] = (uint8_t)AllocType; } +AllocationType +ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const { + const auto *CB = cast(Call.call()); + if (!CB->getAttributes().hasFnAttr("memprof")) + return AllocationType::None; + return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold" + ? AllocationType::Cold + : AllocationType::NotCold; +} + +AllocationType +IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const { + const auto *AI = Call.call().dyn_cast(); + assert(AI->Versions.size() > Call.cloneNo()); + return (AllocationType)AI->Versions[Call.cloneNo()]; +} + void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) { if (CalleeFunc.cloneNo() > 0) @@ -4017,6 +4048,9 @@ bool CallsiteContextGraph::assignFunctions() { } } + uint8_t BothTypes = + (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; + auto UpdateCalls = [&](ContextNode *Node, DenseSet &Visited, auto &&UpdateCalls) { @@ -4036,7 +4070,31 @@ bool CallsiteContextGraph::assignFunctions() { return; if (Node->IsAllocation) { - updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes)); + auto AT = allocTypeToUse(Node->AllocTypes); + // If the allocation type is ambiguous, and more aggressive hinting + // has been enabled via the MinClonedColdBytePercent flag, see if this + // allocation should be hinted cold anyway because its fraction cold bytes + // allocated is at least the given threshold. + if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 && + !ContextIdToContextSizeInfos.empty()) { + uint64_t TotalCold = 0; + uint64_t Total = 0; + for (auto Id : Node->getContextIds()) { + auto TypeI = ContextIdToAllocationType.find(Id); + assert(TypeI != ContextIdToAllocationType.end()); + auto CSI = ContextIdToContextSizeInfos.find(Id); + if (CSI != ContextIdToContextSizeInfos.end()) { + for (auto &Info : CSI->second) { + Total += Info.TotalSize; + if (TypeI->second == AllocationType::Cold) + TotalCold += Info.TotalSize; + } + } + } + if (TotalCold * 100 >= Total * MinClonedColdBytePercent) + AT = AllocationType::Cold; + } + updateAllocationCall(Node->Call, AT); assert(Node->MatchingCalls.empty()); return; } @@ -4419,7 +4477,8 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { // will still be none type or should have gotten the default NotCold. // Skip that after calling clone helper since that does some sanity // checks that confirm we haven't decided yet that we need cloning. - if (AllocNode.Versions.size() == 1) { + if (AllocNode.Versions.size() == 1 && + (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) { assert((AllocationType)AllocNode.Versions[0] == AllocationType::NotCold || (AllocationType)AllocNode.Versions[0] == diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp index db8999911b7f9..9831b37fb4770 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp @@ -176,6 +176,10 @@ static cl::opt cl::desc("Salvage stale MemProf profile"), cl::init(false), cl::Hidden); +cl::opt MinClonedColdBytePercent( + "memprof-cloning-cold-threshold", cl::init(100), cl::Hidden, + cl::desc("Min percent of cold bytes to hint alloc cold during cloning")); + extern cl::opt MemProfReportHintedSizes; static cl::opt MinMatchedColdBytePercent( @@ -755,7 +759,7 @@ static AllocationType addCallStack(CallStackTrie &AllocTrie, AllocInfo->Info.getAllocCount(), AllocInfo->Info.getTotalLifetime()); std::vector ContextSizeInfo; - if (MemProfReportHintedSizes) { + if (MemProfReportHintedSizes || MinClonedColdBytePercent < 100) { auto TotalSize = AllocInfo->Info.getTotalSize(); assert(TotalSize); assert(FullStackId != 0); @@ -1126,7 +1130,8 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader, InlinedCallStack)) { NumOfMemProfMatchedAllocContexts++; uint64_t FullStackId = 0; - if (ClPrintMemProfMatchInfo || MemProfReportHintedSizes) + if (ClPrintMemProfMatchInfo || MemProfReportHintedSizes || + MinClonedColdBytePercent < 100) FullStackId = computeFullStackId(AllocInfo->CallStack); auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId); TotalSize += AllocInfo->Info.getTotalSize(); diff --git a/llvm/test/ThinLTO/X86/memprof-missing-callsite.ll b/llvm/test/ThinLTO/X86/memprof-missing-callsite.ll index e7fbb3c0bad2c..a8002a74de965 100644 --- a/llvm/test/ThinLTO/X86/memprof-missing-callsite.ll +++ b/llvm/test/ThinLTO/X86/memprof-missing-callsite.ll @@ -1,6 +1,7 @@ ;; Test callsite context graph generation for simple call graph with ;; two memprof contexts and no inlining, where one callsite required for -;; cloning is missing (e.g. unmatched). +;; cloning is missing (e.g. unmatched). Use this to test aggressive hinting +;; threshold. ;; ;; Original code looks like: ;; @@ -30,6 +31,27 @@ ; RUN: --check-prefix=SIZESUNHINTED ; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --implicit-check-not "\"memprof\"=\"cold\"" +; RUN: opt -thinlto-bc -memprof-report-hinted-sizes %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -supports-hot-cold-new \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-report-hinted-sizes -memprof-cloning-cold-threshold=80 \ +; RUN: -pass-remarks=memprof-context-disambiguation -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=REMARKSHINTED \ +; RUN: --check-prefix=SIZESHINTED +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IRHINTED + +; RUN: opt -thinlto-bc -memprof-report-hinted-sizes %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -supports-hot-cold-new \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-cloning-cold-threshold=80 \ +; RUN: -pass-remarks=memprof-context-disambiguation -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=REMARKSHINTED +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IRHINTED + source_filename = "memprof-missing-callsite.ll" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -68,3 +90,11 @@ attributes #0 = { noinline optnone } ; SIZESUNHINTED: NotCold full allocation context 123 with total size 100 is NotColdCold after cloning ; SIZESUNHINTED: Cold full allocation context 456 with total size 200 is NotColdCold after cloning ; SIZESUNHINTED: Cold full allocation context 789 with total size 300 is NotColdCold after cloning + +; SIZESHINTED: NotCold full allocation context 123 with total size 100 is NotColdCold after cloning marked Cold due to cold byte percent +; SIZESHINTED: Cold full allocation context 456 with total size 200 is NotColdCold after cloning marked Cold due to cold byte percent +; SIZESHINTED: Cold full allocation context 789 with total size 300 is NotColdCold after cloning marked Cold due to cold byte percent + +; REMARKSHINTED: call in clone _Z3foov marked with memprof allocation attribute cold + +; IRHINTED: "memprof"="cold" diff --git a/llvm/test/Transforms/PGOProfile/memprof.ll b/llvm/test/Transforms/PGOProfile/memprof.ll index 7e47c8ded4e4a..c0e44cccbf16f 100644 --- a/llvm/test/Transforms/PGOProfile/memprof.ll +++ b/llvm/test/Transforms/PGOProfile/memprof.ll @@ -69,6 +69,10 @@ ;; Check that we hint additional allocations with a threshold < 100% ; RUN: opt < %s -passes='memprof-use' -pgo-warn-missing-function -S -memprof-report-hinted-sizes -memprof-matching-cold-threshold=60 2>&1 | FileCheck %s --check-prefixes=TOTALSIZESSINGLE,TOTALSIZESTHRESH60 +;; Make sure that the -memprof-cloning-cold-threshold flag is enough to cause +;; the size metadata to be generated for the LTO link. +; RUN: opt < %s -passes='memprof-use' -pgo-warn-missing-function -S -memprof-cloning-cold-threshold=80 2>&1 | FileCheck %s --check-prefixes=TOTALSIZES + ;; Make sure we emit a random hotness seed if requested. ; RUN: llvm-profdata merge -memprof-random-hotness %S/Inputs/memprof.memprofraw --profiled-binary %S/Inputs/memprof.exe -o %t.memprofdatarand 2>&1 | FileCheck %s --check-prefix=RAND ; RAND: random hotness seed = From abbf611519b8deef7e50d826655ed6a39eabac95 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Thu, 19 Dec 2024 12:28:22 -0800 Subject: [PATCH 2/2] Add a few more comments --- llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp | 3 +++ llvm/test/ThinLTO/X86/memprof-missing-callsite.ll | 3 +++ 2 files changed, 6 insertions(+) diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 59c183b105c24..1bf7ff468d782 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -4477,6 +4477,9 @@ bool MemProfContextDisambiguation::applyImport(Module &M) { // will still be none type or should have gotten the default NotCold. // Skip that after calling clone helper since that does some sanity // checks that confirm we haven't decided yet that we need cloning. + // We might have a single version that is cold due to the + // MinClonedColdBytePercent heuristic, make sure we don't skip in that + // case. if (AllocNode.Versions.size() == 1 && (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) { assert((AllocationType)AllocNode.Versions[0] == diff --git a/llvm/test/ThinLTO/X86/memprof-missing-callsite.ll b/llvm/test/ThinLTO/X86/memprof-missing-callsite.ll index a8002a74de965..5ce786c63ac1a 100644 --- a/llvm/test/ThinLTO/X86/memprof-missing-callsite.ll +++ b/llvm/test/ThinLTO/X86/memprof-missing-callsite.ll @@ -31,6 +31,7 @@ ; RUN: --check-prefix=SIZESUNHINTED ; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --implicit-check-not "\"memprof\"=\"cold\"" +;; Check that we do hint with a sufficient -memprof-cloning-cold-threshold. ; RUN: opt -thinlto-bc -memprof-report-hinted-sizes %s >%t.o ; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ ; RUN: -supports-hot-cold-new \ @@ -42,6 +43,8 @@ ; RUN: --check-prefix=SIZESHINTED ; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IRHINTED +;; Check again that we hint with a sufficient -memprof-cloning-cold-threshold, +;; even if we don't specify -memprof-report-hinted-sizes. ; RUN: opt -thinlto-bc -memprof-report-hinted-sizes %s >%t.o ; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ ; RUN: -supports-hot-cold-new \