| //===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "CSPreInliner.h" |
| #include "ProfiledBinary.h" |
| #include "llvm/ADT/SCCIterator.h" |
| #include "llvm/ADT/Statistic.h" |
| #include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" |
| #include "llvm/Transforms/IPO/SampleProfile.h" |
| #include <cstdint> |
| #include <queue> |
| |
| #define DEBUG_TYPE "cs-preinliner" |
| |
| using namespace llvm; |
| using namespace sampleprof; |
| |
| STATISTIC(PreInlNumCSInlined, |
| "Number of functions inlined with context sensitive profile"); |
| STATISTIC(PreInlNumCSNotInlined, |
| "Number of functions not inlined with context sensitive profile"); |
| STATISTIC(PreInlNumCSInlinedHitMinLimit, |
| "Number of functions with FDO inline stopped due to min size limit"); |
| STATISTIC(PreInlNumCSInlinedHitMaxLimit, |
| "Number of functions with FDO inline stopped due to max size limit"); |
| STATISTIC( |
| PreInlNumCSInlinedHitGrowthLimit, |
| "Number of functions with FDO inline stopped due to growth size limit"); |
| |
| // The switches specify inline thresholds used in SampleProfileLoader inlining. |
| // TODO: the actual threshold to be tuned here because the size here is based |
| // on machine code not LLVM IR. |
| namespace llvm { |
| cl::opt<bool> EnableCSPreInliner( |
| "csspgo-preinliner", cl::Hidden, cl::init(true), |
| cl::desc("Run a global pre-inliner to merge context profile based on " |
| "estimated global top-down inline decisions")); |
| |
| cl::opt<bool> UseContextCostForPreInliner( |
| "use-context-cost-for-preinliner", cl::Hidden, cl::init(true), |
| cl::desc("Use context-sensitive byte size cost for preinliner decisions")); |
| } // namespace llvm |
| |
| static cl::opt<bool> SamplePreInlineReplay( |
| "csspgo-replay-preinline", cl::Hidden, cl::init(false), |
| cl::desc( |
| "Replay previous inlining and adjust context profile accordingly")); |
| |
| static cl::opt<int> CSPreinlMultiplierForPrevInl( |
| "csspgo-preinliner-multiplier-for-previous-inlining", cl::Hidden, |
| cl::init(100), |
| cl::desc( |
| "Multiplier to bump up callsite threshold for previous inlining.")); |
| |
| CSPreInliner::CSPreInliner(SampleContextTracker &Tracker, |
| ProfiledBinary &Binary, ProfileSummary *Summary) |
| : UseContextCost(UseContextCostForPreInliner), |
| // TODO: Pass in a guid-to-name map in order for |
| // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5 codes |
| // as their profile context. |
| ContextTracker(Tracker), Binary(Binary), Summary(Summary) { |
| // Set default preinliner hot/cold call site threshold tuned with CSSPGO. |
| // for good performance with reasonable profile size. |
| if (!SampleHotCallSiteThreshold.getNumOccurrences()) |
| SampleHotCallSiteThreshold = 1500; |
| if (!SampleColdCallSiteThreshold.getNumOccurrences()) |
| SampleColdCallSiteThreshold = 0; |
| if (!ProfileInlineLimitMax.getNumOccurrences()) |
| ProfileInlineLimitMax = 50000; |
| } |
| |
| std::vector<StringRef> CSPreInliner::buildTopDownOrder() { |
| std::vector<StringRef> Order; |
| // Trim cold edges to get a more stable call graph. This allows for a more |
| // stable top-down order which in turns helps the stablity of the generated |
| // profile from run to run. |
| uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold( |
| (Summary->getDetailedSummary())); |
| ProfiledCallGraph ProfiledCG(ContextTracker, ColdCountThreshold); |
| |
| // Now that we have a profiled call graph, construct top-down order |
| // by building up SCC and reversing SCC order. |
| scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG); |
| while (!I.isAtEnd()) { |
| auto Range = *I; |
| if (SortProfiledSCC) { |
| // Sort nodes in one SCC based on callsite hotness. |
| scc_member_iterator<ProfiledCallGraph *> SI(*I); |
| Range = *SI; |
| } |
| for (auto *Node : Range) { |
| if (Node != ProfiledCG.getEntryNode()) |
| Order.push_back(Node->Name); |
| } |
| ++I; |
| } |
| std::reverse(Order.begin(), Order.end()); |
| |
| return Order; |
| } |
| |
| bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue, |
| const FunctionSamples *CallerSamples) { |
| assert(CallerSamples && "Expect non-null caller samples"); |
| |
| // Ideally we want to consider everything a function calls, but as far as |
| // context profile is concerned, only those frames that are children of |
| // current one in the trie is relavent. So we walk the trie instead of call |
| // targets from function profile. |
| ContextTrieNode *CallerNode = |
| ContextTracker.getContextNodeForProfile(CallerSamples); |
| |
| bool HasNewCandidate = false; |
| for (auto &Child : CallerNode->getAllChildContext()) { |
| ContextTrieNode *CalleeNode = &Child.second; |
| FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples(); |
| if (!CalleeSamples) |
| continue; |
| |
| // Call site count is more reliable, so we look up the corresponding call |
| // target profile in caller's context profile to retrieve call site count. |
| uint64_t CalleeEntryCount = CalleeSamples->getHeadSamplesEstimate(); |
| uint64_t CallsiteCount = 0; |
| LineLocation Callsite = CalleeNode->getCallSiteLoc(); |
| if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) { |
| SampleRecord::CallTargetMap &TargetCounts = CallTargets.get(); |
| auto It = TargetCounts.find(CalleeSamples->getName()); |
| if (It != TargetCounts.end()) |
| CallsiteCount = It->second; |
| } |
| |
| // TODO: call site and callee entry count should be mostly consistent, add |
| // check for that. |
| HasNewCandidate = true; |
| uint32_t CalleeSize = getFuncSize(CalleeNode); |
| CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount), |
| CalleeSize); |
| } |
| |
| return HasNewCandidate; |
| } |
| |
| uint32_t CSPreInliner::getFuncSize(const ContextTrieNode *ContextNode) { |
| if (UseContextCost) |
| return Binary.getFuncSizeForContext(ContextNode); |
| |
| return ContextNode->getFunctionSamples()->getBodySamples().size(); |
| } |
| |
| bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) { |
| bool WasInlined = |
| Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined); |
| // If replay inline is requested, simply follow the inline decision of the |
| // profiled binary. |
| if (SamplePreInlineReplay) |
| return WasInlined; |
| |
| unsigned int SampleThreshold = SampleColdCallSiteThreshold; |
| uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold( |
| (Summary->getDetailedSummary())); |
| |
| if (Candidate.CallsiteCount <= ColdCountThreshold) |
| SampleThreshold = SampleColdCallSiteThreshold; |
| else { |
| // Linearly adjust threshold based on normalized hotness, i.e, a value in |
| // [0,1]. Use 10% cutoff instead of the max count as the normalization |
| // upperbound for stability. |
| double NormalizationUpperBound = |
| ProfileSummaryBuilder::getEntryForPercentile( |
| Summary->getDetailedSummary(), 100000 /* 10% */) |
| .MinCount; |
| double NormalizationLowerBound = ColdCountThreshold; |
| double NormalizedHotness = |
| (Candidate.CallsiteCount - NormalizationLowerBound) / |
| (NormalizationUpperBound - NormalizationLowerBound); |
| if (NormalizedHotness > 1.0) |
| NormalizedHotness = 1.0; |
| // Add 1 to ensure hot callsites get a non-zero threshold, which could |
| // happen when SampleColdCallSiteThreshold is 0. This is when we do not |
| // want any inlining for cold callsites. |
| SampleThreshold = SampleHotCallSiteThreshold * NormalizedHotness * 100 + |
| SampleColdCallSiteThreshold + 1; |
| // Bump up the threshold to favor previous compiler inline decision. The |
| // compiler has more insight and knowledge about functions based on their IR |
| // and attribures and should be able to make a more reasonable inline |
| // decision. |
| if (WasInlined) |
| SampleThreshold *= CSPreinlMultiplierForPrevInl; |
| } |
| |
| return (Candidate.SizeCost < SampleThreshold); |
| } |
| |
| void CSPreInliner::processFunction(const StringRef Name) { |
| FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name); |
| if (!FSamples) |
| return; |
| |
| unsigned FuncSize = |
| getFuncSize(ContextTracker.getContextNodeForProfile(FSamples)); |
| unsigned FuncFinalSize = FuncSize; |
| unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit; |
| SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax); |
| SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin); |
| |
| LLVM_DEBUG(dbgs() << "Process " << Name |
| << " for context-sensitive pre-inlining (pre-inline size: " |
| << FuncSize << ", size limit: " << SizeLimit << ")\n"); |
| |
| ProfiledCandidateQueue CQueue; |
| getInlineCandidates(CQueue, FSamples); |
| |
| while (!CQueue.empty() && FuncFinalSize < SizeLimit) { |
| ProfiledInlineCandidate Candidate = CQueue.top(); |
| CQueue.pop(); |
| bool ShouldInline = false; |
| if ((ShouldInline = shouldInline(Candidate))) { |
| // We mark context as inlined as the corresponding context profile |
| // won't be merged into that function's base profile. |
| ++PreInlNumCSInlined; |
| ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples); |
| Candidate.CalleeSamples->getContext().setAttribute( |
| ContextShouldBeInlined); |
| FuncFinalSize += Candidate.SizeCost; |
| getInlineCandidates(CQueue, Candidate.CalleeSamples); |
| } else { |
| ++PreInlNumCSNotInlined; |
| } |
| LLVM_DEBUG( |
| dbgs() << (ShouldInline ? " Inlined" : " Outlined") |
| << " context profile for: " |
| << ContextTracker.getContextString(*Candidate.CalleeSamples) |
| << " (callee size: " << Candidate.SizeCost |
| << ", call count:" << Candidate.CallsiteCount << ")\n"); |
| } |
| |
| if (!CQueue.empty()) { |
| if (SizeLimit == (unsigned)ProfileInlineLimitMax) |
| ++PreInlNumCSInlinedHitMaxLimit; |
| else if (SizeLimit == (unsigned)ProfileInlineLimitMin) |
| ++PreInlNumCSInlinedHitMinLimit; |
| else |
| ++PreInlNumCSInlinedHitGrowthLimit; |
| } |
| |
| LLVM_DEBUG({ |
| if (!CQueue.empty()) |
| dbgs() << " Inline candidates ignored due to size limit (inliner " |
| "original size: " |
| << FuncSize << ", inliner final size: " << FuncFinalSize |
| << ", size limit: " << SizeLimit << ")\n"; |
| |
| while (!CQueue.empty()) { |
| ProfiledInlineCandidate Candidate = CQueue.top(); |
| CQueue.pop(); |
| bool WasInlined = |
| Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined); |
| dbgs() << " " |
| << ContextTracker.getContextString(*Candidate.CalleeSamples) |
| << " (candidate size:" << Candidate.SizeCost |
| << ", call count: " << Candidate.CallsiteCount << ", previously " |
| << (WasInlined ? "inlined)\n" : "not inlined)\n"); |
| } |
| }); |
| } |
| |
| void CSPreInliner::run() { |
| #ifndef NDEBUG |
| auto printProfileNames = [](SampleContextTracker &ContextTracker, |
| bool IsInput) { |
| uint32_t Size = 0; |
| for (auto *Node : ContextTracker) { |
| FunctionSamples *FSamples = Node->getFunctionSamples(); |
| if (FSamples) { |
| Size++; |
| dbgs() << " [" << ContextTracker.getContextString(Node) << "] " |
| << FSamples->getTotalSamples() << ":" |
| << FSamples->getHeadSamples() << "\n"; |
| } |
| } |
| dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles (" |
| << Size << " total):\n"; |
| }; |
| #endif |
| |
| LLVM_DEBUG(printProfileNames(ContextTracker, true)); |
| |
| // Execute global pre-inliner to estimate a global top-down inline |
| // decision and merge profiles accordingly. This helps with profile |
| // merge for ThinLTO otherwise we won't be able to merge profiles back |
| // to base profile across module/thin-backend boundaries. |
| // It also helps better compress context profile to control profile |
| // size, as we now only need context profile for functions going to |
| // be inlined. |
| for (StringRef FuncName : buildTopDownOrder()) { |
| processFunction(FuncName); |
| } |
| |
| // Not inlined context profiles are merged into its base, so we can |
| // trim out such profiles from the output. |
| for (auto *Node : ContextTracker) { |
| FunctionSamples *FProfile = Node->getFunctionSamples(); |
| if (FProfile && |
| (Node->getParentContext() != &ContextTracker.getRootContext() && |
| !FProfile->getContext().hasState(InlinedContext))) { |
| Node->setFunctionSamples(nullptr); |
| } |
| } |
| FunctionSamples::ProfileIsPreInlined = true; |
| |
| LLVM_DEBUG(printProfileNames(ContextTracker, false)); |
| } |