| //===- AMDGPULDSUtils.cpp -------------------------------------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // AMDGPU LDS related helper utility functions. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "AMDGPULDSUtils.h" |
| #include "AMDGPU.h" |
| #include "Utils/AMDGPUBaseInfo.h" |
| #include "llvm/ADT/DepthFirstIterator.h" |
| #include "llvm/ADT/SetVector.h" |
| #include "llvm/Analysis/CallGraph.h" |
| #include "llvm/IR/Constants.h" |
| #include "llvm/IR/ReplaceConstant.h" |
| |
| using namespace llvm; |
| |
| namespace llvm { |
| |
| namespace AMDGPU { |
| |
| // An helper class for collecting all reachable callees for each kernel defined |
| // within the module. |
| class CollectReachableCallees { |
| Module &M; |
| CallGraph CG; |
| SmallPtrSet<CallGraphNode *, 8> AddressTakenFunctions; |
| |
| // Collect all address taken functions within the module. |
| void collectAddressTakenFunctions() { |
| auto *ECNode = CG.getExternalCallingNode(); |
| |
| for (auto GI = ECNode->begin(), GE = ECNode->end(); GI != GE; ++GI) { |
| auto *CGN = GI->second; |
| auto *F = CGN->getFunction(); |
| if (!F || F->isDeclaration() || AMDGPU::isKernelCC(F)) |
| continue; |
| AddressTakenFunctions.insert(CGN); |
| } |
| } |
| |
| // For given kernel, collect all its reachable non-kernel functions. |
| SmallPtrSet<Function *, 8> collectReachableCallees(Function *K) { |
| SmallPtrSet<Function *, 8> ReachableCallees; |
| |
| // Call graph node which represents this kernel. |
| auto *KCGN = CG[K]; |
| |
| // Go through all call graph nodes reachable from the node representing this |
| // kernel, visit all their call sites, if the call site is direct, add |
| // corresponding callee to reachable callee set, if it is indirect, resolve |
| // the indirect call site to potential reachable callees, add them to |
| // reachable callee set, and repeat the process for the newly added |
| // potential callee nodes. |
| // |
| // FIXME: Need to handle bit-casted function pointers. |
| // |
| SmallVector<CallGraphNode *, 8> CGNStack(df_begin(KCGN), df_end(KCGN)); |
| SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes; |
| while (!CGNStack.empty()) { |
| auto *CGN = CGNStack.pop_back_val(); |
| |
| if (!VisitedCGNodes.insert(CGN).second) |
| continue; |
| |
| // Ignore call graph node which does not have associated function or |
| // associated function is not a definition. |
| if (!CGN->getFunction() || CGN->getFunction()->isDeclaration()) |
| continue; |
| |
| for (auto GI = CGN->begin(), GE = CGN->end(); GI != GE; ++GI) { |
| auto *RCB = cast<CallBase>(GI->first.getValue()); |
| auto *RCGN = GI->second; |
| |
| if (auto *DCallee = RCGN->getFunction()) { |
| ReachableCallees.insert(DCallee); |
| } else if (RCB->isIndirectCall()) { |
| auto *RCBFTy = RCB->getFunctionType(); |
| for (auto *ACGN : AddressTakenFunctions) { |
| auto *ACallee = ACGN->getFunction(); |
| if (ACallee->getFunctionType() == RCBFTy) { |
| ReachableCallees.insert(ACallee); |
| CGNStack.append(df_begin(ACGN), df_end(ACGN)); |
| } |
| } |
| } |
| } |
| } |
| |
| return ReachableCallees; |
| } |
| |
| public: |
| explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) { |
| // Collect address taken functions. |
| collectAddressTakenFunctions(); |
| } |
| |
| void collectReachableCallees( |
| DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) { |
| // Collect reachable callee set for each kernel defined in the module. |
| for (Function &F : M.functions()) { |
| if (!AMDGPU::isKernelCC(&F)) |
| continue; |
| Function *K = &F; |
| KernelToCallees[K] = collectReachableCallees(K); |
| } |
| } |
| }; |
| |
| void collectReachableCallees( |
| Module &M, |
| DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) { |
| CollectReachableCallees CRC{M}; |
| CRC.collectReachableCallees(KernelToCallees); |
| } |
| |
| SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV) { |
| SmallPtrSet<Function *, 8> LDSAccessors; |
| SmallVector<User *, 8> UserStack(GV->users()); |
| SmallPtrSet<User *, 8> VisitedUsers; |
| |
| while (!UserStack.empty()) { |
| auto *U = UserStack.pop_back_val(); |
| |
| // `U` is already visited? continue to next one. |
| if (!VisitedUsers.insert(U).second) |
| continue; |
| |
| // `U` is a global variable which is initialized with LDS. Ignore LDS. |
| if (isa<GlobalValue>(U)) |
| return SmallPtrSet<Function *, 8>(); |
| |
| // Recursively explore constant users. |
| if (isa<Constant>(U)) { |
| append_range(UserStack, U->users()); |
| continue; |
| } |
| |
| // `U` should be an instruction, if it belongs to a non-kernel function F, |
| // then collect F. |
| Function *F = cast<Instruction>(U)->getFunction(); |
| if (!AMDGPU::isKernelCC(F)) |
| LDSAccessors.insert(F); |
| } |
| |
| return LDSAccessors; |
| } |
| |
| DenseMap<Function *, SmallPtrSet<Instruction *, 8>> |
| getFunctionToInstsMap(User *U, bool CollectKernelInsts) { |
| DenseMap<Function *, SmallPtrSet<Instruction *, 8>> FunctionToInsts; |
| SmallVector<User *, 8> UserStack; |
| SmallPtrSet<User *, 8> VisitedUsers; |
| |
| UserStack.push_back(U); |
| |
| while (!UserStack.empty()) { |
| auto *UU = UserStack.pop_back_val(); |
| |
| if (!VisitedUsers.insert(UU).second) |
| continue; |
| |
| if (isa<GlobalValue>(UU)) |
| continue; |
| |
| if (isa<Constant>(UU)) { |
| append_range(UserStack, UU->users()); |
| continue; |
| } |
| |
| auto *I = cast<Instruction>(UU); |
| Function *F = I->getFunction(); |
| if (CollectKernelInsts) { |
| if (!AMDGPU::isKernelCC(F)) { |
| continue; |
| } |
| } else { |
| if (AMDGPU::isKernelCC(F)) { |
| continue; |
| } |
| } |
| |
| FunctionToInsts.insert(std::make_pair(F, SmallPtrSet<Instruction *, 8>())); |
| FunctionToInsts[F].insert(I); |
| } |
| |
| return FunctionToInsts; |
| } |
| |
| bool isKernelCC(const Function *Func) { |
| return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv()); |
| } |
| |
| Align getAlign(DataLayout const &DL, const GlobalVariable *GV) { |
| return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL), |
| GV->getValueType()); |
| } |
| |
| static void collectFunctionUses(User *U, const Function *F, |
| SetVector<Instruction *> &InstUsers) { |
| SmallVector<User *> Stack{U}; |
| |
| while (!Stack.empty()) { |
| U = Stack.pop_back_val(); |
| |
| if (auto *I = dyn_cast<Instruction>(U)) { |
| if (I->getFunction() == F) |
| InstUsers.insert(I); |
| continue; |
| } |
| |
| if (!isa<ConstantExpr>(U)) |
| continue; |
| |
| append_range(Stack, U->users()); |
| } |
| } |
| |
| void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F) { |
| SetVector<Instruction *> InstUsers; |
| |
| collectFunctionUses(C, F, InstUsers); |
| for (Instruction *I : InstUsers) { |
| convertConstantExprsToInstructions(I, C); |
| } |
| } |
| |
| bool hasUserInstruction(const GlobalValue *GV) { |
| SmallPtrSet<const User *, 8> Visited; |
| SmallVector<const User *, 16> Stack(GV->users()); |
| |
| while (!Stack.empty()) { |
| const User *U = Stack.pop_back_val(); |
| |
| if (!Visited.insert(U).second) |
| continue; |
| |
| if (isa<Instruction>(U)) |
| return true; |
| |
| append_range(Stack, U->users()); |
| } |
| |
| return false; |
| } |
| |
| bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) { |
| // We are not interested in kernel LDS lowering for module LDS itself. |
| if (F && GV.getName() == "llvm.amdgcn.module.lds") |
| return false; |
| |
| bool Ret = false; |
| SmallPtrSet<const User *, 8> Visited; |
| SmallVector<const User *, 16> Stack(GV.users()); |
| SmallPtrSet<const GlobalValue *, 8> GlobalUsers; |
| |
| assert(!F || isKernelCC(F)); |
| |
| while (!Stack.empty()) { |
| const User *V = Stack.pop_back_val(); |
| Visited.insert(V); |
| |
| if (auto *G = dyn_cast<GlobalValue>(V)) { |
| StringRef GName = G->getName(); |
| if (F && GName != "llvm.used" && GName != "llvm.compiler.used") { |
| // For kernel LDS lowering, if G is not a compiler.used list, then we |
| // cannot lower the lds GV since we cannot replace the use of GV within |
| // G. |
| return false; |
| } |
| GlobalUsers.insert(G); |
| continue; |
| } |
| |
| if (auto *I = dyn_cast<Instruction>(V)) { |
| const Function *UF = I->getFunction(); |
| if (UF == F) { |
| // Used from this kernel, we want to put it into the structure. |
| Ret = true; |
| } else if (!F) { |
| // For module LDS lowering, lowering is required if the user instruction |
| // is from non-kernel function. |
| Ret |= !isKernelCC(UF); |
| } |
| continue; |
| } |
| |
| // User V should be a constant, recursively visit users of V. |
| assert(isa<Constant>(V) && "Expected a constant."); |
| append_range(Stack, V->users()); |
| } |
| |
| if (!F && !Ret) { |
| // For module LDS lowering, we have not yet decided if we should lower GV or |
| // not. Explore all global users of GV, and check if atleast one of these |
| // global users appear as an use within an instruction (possibly nested use |
| // via constant expression), if so, then conservately lower LDS. |
| for (auto *G : GlobalUsers) |
| Ret |= hasUserInstruction(G); |
| } |
| |
| return Ret; |
| } |
| |
| std::vector<GlobalVariable *> findVariablesToLower(Module &M, |
| const Function *F) { |
| std::vector<llvm::GlobalVariable *> LocalVars; |
| for (auto &GV : M.globals()) { |
| if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) { |
| continue; |
| } |
| if (!GV.hasInitializer()) { |
| // addrspace(3) without initializer implies cuda/hip extern __shared__ |
| // the semantics for such a variable appears to be that all extern |
| // __shared__ variables alias one another, in which case this transform |
| // is not required |
| continue; |
| } |
| if (!isa<UndefValue>(GV.getInitializer())) { |
| // Initializers are unimplemented for local address space. |
| // Leave such variables in place for consistent error reporting. |
| continue; |
| } |
| if (GV.isConstant()) { |
| // A constant undef variable can't be written to, and any load is |
| // undef, so it should be eliminated by the optimizer. It could be |
| // dropped by the back end if not. This pass skips over it. |
| continue; |
| } |
| if (!shouldLowerLDSToStruct(GV, F)) { |
| continue; |
| } |
| LocalVars.push_back(&GV); |
| } |
| return LocalVars; |
| } |
| |
| SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) { |
| SmallPtrSet<GlobalValue *, 32> UsedList; |
| |
| SmallVector<GlobalValue *, 32> TmpVec; |
| collectUsedGlobalVariables(M, TmpVec, true); |
| UsedList.insert(TmpVec.begin(), TmpVec.end()); |
| |
| TmpVec.clear(); |
| collectUsedGlobalVariables(M, TmpVec, false); |
| UsedList.insert(TmpVec.begin(), TmpVec.end()); |
| |
| return UsedList; |
| } |
| |
| } // end namespace AMDGPU |
| |
| } // end namespace llvm |