blob: 1a8096f647d8475b6d981f20deb08a3d3d937302 [file] [log] [blame]
//===----- HipStdPar.cpp - HIP C++ Standard Parallelism Support Passes ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file implements two passes that enable HIP C++ Standard Parallelism
// Support:
//
// 1. AcceleratorCodeSelection (required): Given that only algorithms are
// accelerated, and that the accelerated implementation exists in the form of
// a compute kernel, we assume that only the kernel, and all functions
// reachable from it, constitute code that the user expects the accelerator
// to execute. Thus, we identify the set of all functions reachable from
// kernels, and then remove all unreachable ones. This last part is necessary
// because it is possible for code that the user did not expect to execute on
// an accelerator to contain constructs that cannot be handled by the target
// BE, which cannot be provably demonstrated to be dead code in general, and
// thus can lead to mis-compilation. The degenerate case of this is when a
// Module contains no kernels (the parent TU had no algorithm invocations fit
// for acceleration), which we handle by completely emptying said module.
// **NOTE**: The above does not handle indirectly reachable functions i.e.
// it is possible to obtain a case where the target of an indirect
// call is otherwise unreachable and thus is removed; this
// restriction is aligned with the current `-hipstdpar` limitations
// and will be relaxed in the future.
//
// 2. AllocationInterposition (required only when on-demand paging is
// unsupported): Some accelerators or operating systems might not support
// transparent on-demand paging. Thus, they would only be able to access
// memory that is allocated by an accelerator-aware mechanism. For such cases
// the user can opt into enabling allocation / deallocation interposition,
// whereby we replace calls to known allocation / deallocation functions with
// calls to runtime implemented equivalents that forward the requests to
// accelerator-aware interfaces. We also support freeing system allocated
// memory that ends up in one of the runtime equivalents, since this can
// happen if e.g. a library that was compiled without interposition returns
// an allocation that can be validly passed to `free`.
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/HipStdPar/HipStdPar.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <cassert>
#include <string>
#include <utility>
using namespace llvm;
/// Removes \p ToErase from its parent after redirecting every remaining use of
/// it to a poison value of the same type, so that no user is left referencing
/// a deleted entity.
template<typename T>
static inline void eraseFromModule(T &ToErase) {
  auto *Replacement = PoisonValue::get(ToErase.getType());

  ToErase.replaceAllUsesWith(Replacement);
  ToErase.eraseFromParent();
}
/// Checks whether the global variable \p G can be kept in accelerator code.
///
/// Globals that are not thread_local are always accepted. A thread_local
/// global is accepted only if, after dropping its droppable uses, it is no
/// longer used. Otherwise an error diagnostic is emitted and false is
/// returned.
///
/// The diagnostic is attached to the first instruction found while walking the
/// transitive users of \p G. `isConstantUsed()` also reports a use when the
/// only (transitive) users are other global values, in which case no
/// instruction exists; the error is then reported on the context without a
/// source location instead of dereferencing a null instruction.
static inline bool checkIfSupported(GlobalVariable &G) {
  if (!G.isThreadLocal())
    return true;

  G.dropDroppableUses();

  if (!G.isConstantUsed())
    return true;

  std::string W;
  raw_string_ostream OS(W);

  OS << "Accelerator does not support the thread_local variable "
     << G.getName();

  // Depth-first walk over the users of G (and the users of any constant that
  // wraps it) until an instruction is found to anchor the diagnostic on.
  Instruction *I = nullptr;
  SmallVector<User *> Tmp(G.user_begin(), G.user_end());
  SmallPtrSet<User *, 5> Visited;
  while (!I && !Tmp.empty()) {
    User *U = Tmp.pop_back_val();

    if (!Visited.insert(U).second)
      continue; // Already expanded through another path.

    if (auto *Inst = dyn_cast<Instruction>(U))
      I = Inst;
    else
      Tmp.append(U->user_begin(), U->user_end());
  }

  if (!I) {
    // No instruction uses the variable (e.g. it is only referenced from other
    // globals); there is no function to attribute the diagnostic to.
    G.getContext().emitError(W);
    return false;
  }

  G.getContext().diagnose(
    DiagnosticInfoUnsupported(*I->getParent()->getParent(), W,
                              I->getDebugLoc(), DS_Error));

  return false;
}
/// Empties \p M: erases every function, then every global variable, then every
/// alias, and finally every ifunc.
static inline void clearModule(Module &M) { // TODO: simplify.
  // Early-increment iteration makes it safe to erase the element being
  // visited.
  for (auto &Fn : make_early_inc_range(M.functions()))
    eraseFromModule(Fn);

  for (auto &Var : make_early_inc_range(M.globals()))
    eraseFromModule(Var);

  for (auto &Alias : make_early_inc_range(M.aliases()))
    eraseFromModule(Alias);

  for (auto &IFunc : make_early_inc_range(M.ifuncs()))
    eraseFromModule(IFunc);
}
/// Validates and adjusts the global variables of \p M.
///
/// If any global fails `checkIfSupported`, the whole module is cleared and
/// processing stops. Otherwise, every global that is not thread_local, not
/// constant, lives in the default globals address space and has external
/// linkage is turned into an externally initialized, extern-weak declaration
/// (its initializer is dropped).
static inline void maybeHandleGlobals(Module &M) {
  const unsigned DefaultGlobalsAS =
      M.getDataLayout().getDefaultGlobalsAddressSpace();

  // TODO: should we handle these in the FE?
  for (GlobalVariable &Var : M.globals()) {
    if (!checkIfSupported(Var)) {
      clearModule(M);
      return;
    }

    const bool ShouldRewrite =
        !Var.isThreadLocal() && !Var.isConstant() &&
        Var.getAddressSpace() == DefaultGlobalsAS &&
        Var.getLinkage() == GlobalVariable::ExternalLinkage;
    if (!ShouldRewrite)
      continue;

    Var.setLinkage(GlobalVariable::ExternalWeakLinkage);
    Var.setInitializer(nullptr);
    Var.setExternallyInitialized(true);
  }
}
template<unsigned N>
static inline void removeUnreachableFunctions(
const SmallPtrSet<const Function *, N>& Reachable, Module &M) {
removeFromUsedLists(M, [&](Constant *C) {
if (auto F = dyn_cast<Function>(C))
return !Reachable.contains(F);
return false;
});
SmallVector<std::reference_wrapper<Function>> ToRemove;
copy_if(M, std::back_inserter(ToRemove), [&](auto &&F) {
return !F.isIntrinsic() && !Reachable.contains(&F);
});
for_each(ToRemove, eraseFromModule<Function>);
}
/// Returns true iff \p F is non-null and uses the AMDGPU_KERNEL calling
/// convention.
static inline bool isAcceleratorExecutionRoot(const Function *F) {
  return F && F->getCallingConv() == CallingConv::AMDGPU_KERNEL;
}
static inline bool checkIfSupported(const Function *F, const CallBase *CB) {
const auto Dx = F->getName().rfind("__hipstdpar_unsupported");
if (Dx == StringRef::npos)
return true;
const auto N = F->getName().substr(0, Dx);
std::string W;
raw_string_ostream OS(W);
if (N == "__ASM")
OS << "Accelerator does not support the ASM block:\n"
<< cast<ConstantDataArray>(CB->getArgOperand(0))->getAsCString();
else
OS << "Accelerator does not support the " << N << " function.";
auto Caller = CB->getParent()->getParent();
Caller->getContext().diagnose(
DiagnosticInfoUnsupported(*Caller, W, CB->getDebugLoc(), DS_Error));
return false;
}
/// Keeps only the code that is reachable from accelerator execution roots.
///
/// Starting from every function for which `isAcceleratorExecutionRoot` holds,
/// the call graph is traversed to build the set of reachable functions. If an
/// unsupported callee is met, a diagnostic is emitted by `checkIfSupported`
/// and the pass stops without modifying the module. Otherwise the module is
/// either emptied (no reachable function) or stripped of its unreachable
/// functions, after which the globals are processed.
///
/// Always returns `PreservedAnalyses::none()`.
PreservedAnalyses
HipStdParAcceleratorCodeSelectionPass::run(Module &M,
                                           ModuleAnalysisManager &MAM) {
  auto &CGA = MAM.getResult<CallGraphAnalysis>(M);

  SmallPtrSet<const Function *, 32> Reachable;
  for (auto &&CGN : CGA) {
    if (!isAcceleratorExecutionRoot(CGN.first))
      continue;

    Reachable.insert(CGN.first);

    // Worklist-driven traversal of everything callable from this root.
    SmallVector<const Function *> Tmp({CGN.first});
    do {
      auto F = Tmp.pop_back_val();

      for (auto &&N : *CGA[F]) {
        if (!N.second)
          continue;

        const Function *Callee = N.second->getFunction();
        if (!Callee)
          continue;
        if (Reachable.contains(Callee))
          continue;

        // Reference edges (e.g. callback functions) carry no call
        // instruction, so the optional handle must not be dereferenced
        // blindly. Such edges have no call site to diagnose, hence the
        // support check only applies to real call edges.
        const CallBase *CB = nullptr;
        if (N.first)
          CB = dyn_cast_or_null<CallBase>(static_cast<Value *>(*N.first));

        if (CB && !checkIfSupported(Callee, CB))
          return PreservedAnalyses::none();

        Reachable.insert(Callee);
        Tmp.push_back(Callee);
      }
    } while (!std::empty(Tmp));
  }

  if (std::empty(Reachable))
    clearModule(M);
  else
    removeUnreachableFunctions(Reachable, M);

  maybeHandleGlobals(M);

  return PreservedAnalyses::none();
}
// Table of {original name, replacement name} pairs used by the allocation
// interposition pass: every function whose name matches the first element of
// an entry has its uses redirected to the function named by the second
// element. The keys cover the C allocation / deallocation entry points, the
// Itanium-mangled `operator new` / `operator delete` overloads, and the
// `__builtin_` and `__libc_` prefixed variants. Several keys intentionally
// share a replacement (e.g. the array and non-array forms of operator new).
static constexpr std::pair<StringLiteral, StringLiteral> ReplaceMap[]{
{"aligned_alloc", "__hipstdpar_aligned_alloc"},
{"calloc", "__hipstdpar_calloc"},
{"free", "__hipstdpar_free"},
{"malloc", "__hipstdpar_malloc"},
{"memalign", "__hipstdpar_aligned_alloc"},
{"posix_memalign", "__hipstdpar_posix_aligned_alloc"},
{"realloc", "__hipstdpar_realloc"},
{"reallocarray", "__hipstdpar_realloc_array"},
{"_ZdaPv", "__hipstdpar_operator_delete"},
{"_ZdaPvm", "__hipstdpar_operator_delete_sized"},
{"_ZdaPvSt11align_val_t", "__hipstdpar_operator_delete_aligned"},
{"_ZdaPvmSt11align_val_t", "__hipstdpar_operator_delete_aligned_sized"},
{"_ZdlPv", "__hipstdpar_operator_delete"},
{"_ZdlPvm", "__hipstdpar_operator_delete_sized"},
{"_ZdlPvSt11align_val_t", "__hipstdpar_operator_delete_aligned"},
{"_ZdlPvmSt11align_val_t", "__hipstdpar_operator_delete_aligned_sized"},
{"_Znam", "__hipstdpar_operator_new"},
{"_ZnamRKSt9nothrow_t", "__hipstdpar_operator_new_nothrow"},
{"_ZnamSt11align_val_t", "__hipstdpar_operator_new_aligned"},
{"_ZnamSt11align_val_tRKSt9nothrow_t",
"__hipstdpar_operator_new_aligned_nothrow"},
{"_Znwm", "__hipstdpar_operator_new"},
{"_ZnwmRKSt9nothrow_t", "__hipstdpar_operator_new_nothrow"},
{"_ZnwmSt11align_val_t", "__hipstdpar_operator_new_aligned"},
{"_ZnwmSt11align_val_tRKSt9nothrow_t",
"__hipstdpar_operator_new_aligned_nothrow"},
{"__builtin_calloc", "__hipstdpar_calloc"},
{"__builtin_free", "__hipstdpar_free"},
{"__builtin_malloc", "__hipstdpar_malloc"},
{"__builtin_operator_delete", "__hipstdpar_operator_delete"},
{"__builtin_operator_new", "__hipstdpar_operator_new"},
{"__builtin_realloc", "__hipstdpar_realloc"},
{"__libc_calloc", "__hipstdpar_calloc"},
{"__libc_free", "__hipstdpar_free"},
{"__libc_malloc", "__hipstdpar_malloc"},
{"__libc_memalign", "__hipstdpar_aligned_alloc"},
{"__libc_realloc", "__hipstdpar_realloc"}
};
/// Redirects known allocation / deallocation functions to their
/// `__hipstdpar_` counterparts.
///
/// For every named function of \p M that appears as a key in `ReplaceMap`,
/// all uses are replaced with the mapped replacement function if the module
/// contains it; otherwise a warning diagnostic is emitted and the function is
/// left untouched. Finally, if `__hipstdpar_hidden_free` exists, its uses are
/// rewired to `__libc_free` (declared on demand with the same type and
/// attributes) and it is erased from the module.
///
/// Always returns `PreservedAnalyses::none()`.
PreservedAnalyses
HipStdParAllocationInterpositionPass::run(Module &M, ModuleAnalysisManager&) {
  SmallDenseMap<StringRef, StringRef> AllocReplacements(std::cbegin(ReplaceMap),
                                                        std::cend(ReplaceMap));

  for (Function &Fn : M) {
    if (!Fn.hasName())
      continue;

    const auto Entry = AllocReplacements.find(Fn.getName());
    if (Entry == AllocReplacements.end())
      continue;

    const StringRef Target = Entry->second;
    if (Function *Replacement = M.getFunction(Target)) {
      Fn.replaceAllUsesWith(Replacement);
      continue;
    }

    // The replacement is not available in this module: warn and move on.
    std::string W;
    raw_string_ostream OS(W);

    OS << "cannot be interposed, missing: " << Target
       << ". Tried to run the allocation interposition pass without the "
       << "replacement functions available.";

    Fn.getContext().diagnose(DiagnosticInfoUnsupported(Fn, W,
                                                       Fn.getSubprogram(),
                                                       DS_Warning));
  }

  if (Function *HiddenFree = M.getFunction("__hipstdpar_hidden_free")) {
    auto LibcFree = M.getOrInsertFunction("__libc_free",
                                          HiddenFree->getFunctionType(),
                                          HiddenFree->getAttributes());
    HiddenFree->replaceAllUsesWith(LibcFree.getCallee());

    eraseFromModule(*HiddenFree);
  }

  return PreservedAnalyses::none();
}