lib/Transforms/HipStdPar/HipStdPar.cpp - llvm-project/llvm - Git at Google

 //===----- HipStdPar.cpp - HIP C++ Standard Parallelism Support Passes ----===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 // This file implements two passes that enable HIP C++ Standard Parallelism
 // Support:
 //
 // 1. AcceleratorCodeSelection (required): Given that only algorithms are
 //    accelerated, and that the accelerated implementation exists in the form of
 //    a compute kernel, we assume that only the kernel, and all functions
 //    reachable from it, constitute code that the user expects the accelerator
 //    to execute. Thus, we identify the set of all functions reachable from
 //    kernels, and then remove all unreachable ones. This last part is necessary
 //    because it is possible for code that the user did not expect to execute on
 //    an accelerator to contain constructs that cannot be handled by the target
 //    BE, which cannot be provably demonstrated to be dead code in general, and
 //    thus can lead to mis-compilation. The degenerate case of this is when a
 //    Module contains no kernels (the parent TU had no algorithm invocations fit
 //    for acceleration), which we handle by completely emptying said module.
 //    **NOTE**: The above does not handle indirectly reachable functions i.e.
 //              it is possible to obtain a case where the target of an indirect
 //              call is otherwise unreachable and thus is removed; this
 //              restriction is aligned with the current `-hipstdpar` limitations
 //              and will be relaxed in the future.
 //
 // 2. AllocationInterposition (required only when on-demand paging is
 //    unsupported): Some accelerators or operating systems might not support
 //    transparent on-demand paging. Thus, they would only be able to access
 //    memory that is allocated by an accelerator-aware mechanism. For such cases
 //    the user can opt into enabling allocation / deallocation interposition,
 //    whereby we replace calls to known allocation / deallocation functions with
 //    calls to runtime implemented equivalents that forward the requests to
 //    accelerator-aware interfaces. We also support freeing system allocated
 //    memory that ends up in one of the runtime equivalents, since this can
 //    happen if e.g. a library that was compiled without interposition returns
 //    an allocation that can be validly passed to `free`.
 //===----------------------------------------------------------------------===//

 #include "llvm/Transforms/HipStdPar/HipStdPar.h"

 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"

 #include <cassert>
 #include <string>
 #include <utility>

 using namespace llvm;

 // Removes \p ToErase from its parent. Every remaining use is first redirected
 // to a poison value of the same type, so no user is left referring to the
 // destroyed object.
 template<typename T>
 static inline void eraseFromModule(T &ToErase) {
   auto *Replacement = PoisonValue::get(ToErase.getType());

   ToErase.replaceAllUsesWith(Replacement);
   ToErase.eraseFromParent();
 }

 // Checks whether the global variable \p G can be kept in accelerator code.
 //
 // Returns true when \p G is not thread_local, or when isConstantUsed() reports
 // false once its droppable uses have been dropped. Otherwise an error
 // diagnostic is emitted, attached to the first instruction reached while
 // walking the users of \p G, and false is returned.
 static inline bool checkIfSupported(GlobalVariable &G) {
   if (!G.isThreadLocal())
     return true;

   G.dropDroppableUses();

   if (!G.isConstantUsed())
     return true;

   std::string Msg;
   raw_string_ostream MsgOS(Msg);

   MsgOS << "Accelerator does not support the thread_local variable "
     << G.getName();

   // Walk the users of G, looking through non-instruction users, until an
   // instruction is reached; it supplies the function and debug location for
   // the diagnostic.
   Instruction *UserInst = nullptr;
   SmallVector<User *> Worklist(G.user_begin(), G.user_end());
   SmallPtrSet<User *, 5> Seen;
   do {
     User *Cur = Worklist.pop_back_val();

     // insert() tells us whether Cur is new; skip what was already processed.
     if (!Seen.insert(Cur).second)
       continue;

     UserInst = dyn_cast<Instruction>(Cur);
     if (!UserInst)
       Worklist.append(Cur->user_begin(), Cur->user_end());
   } while (!UserInst && !Worklist.empty());

   assert(UserInst &&
          "thread_local global should have at least one non-constant use.");

   G.getContext().diagnose(DiagnosticInfoUnsupported(
     *UserInst->getFunction(), Msg, UserInst->getDebugLoc(), DS_Error));

   return false;
 }

 // Empties \p M: functions, global variables, aliases and ifuncs are erased, in
 // that order. Each loop re-queries the module and removes the current front
 // element, so no iterator is held across an erasure.
 static inline void clearModule(Module &M) { // TODO: simplify.
   while (!M.empty())
     eraseFromModule(*M.begin());

   while (!M.global_empty())
     eraseFromModule(*M.global_begin());

   while (!M.alias_empty())
     eraseFromModule(*M.alias_begin());

   while (!M.ifunc_empty())
     eraseFromModule(*M.ifunc_begin());
 }

 // Post-processes the global variables of \p M.
 //
 // If any global is rejected by checkIfSupported() the whole module is cleared
 // and processing stops. Otherwise every global that is not thread_local, not
 // constant, lives in the default globals address space and has external
 // linkage is turned into an initializer-less, externally initialized,
 // extern_weak declaration.
 static inline void maybeHandleGlobals(Module &M) {
   const unsigned DefaultAS =
     M.getDataLayout().getDefaultGlobalsAddressSpace();

   for (GlobalVariable &GV : M.globals()) { // TODO: should we handle these in the FE?
     if (!checkIfSupported(GV)) {
       clearModule(M);
       return;
     }

     const bool ShouldRewrite = !GV.isThreadLocal() && !GV.isConstant() &&
                                GV.getAddressSpace() == DefaultAS &&
                                GV.hasExternalLinkage();
     if (!ShouldRewrite)
       continue;

     GV.setLinkage(GlobalVariable::ExternalWeakLinkage);
     GV.setInitializer(nullptr);
     GV.setExternallyInitialized(true);
   }
 }

 template<unsigned N>
 static inline void removeUnreachableFunctions(
   const SmallPtrSet<const Function *, N>& Reachable, Module &M) {
   removeFromUsedLists(M, [&](Constant *C) {
     if (auto F = dyn_cast<Function>(C))
       return !Reachable.contains(F);

     return false;
   });

   SmallVector<std::reference_wrapper<Function>> ToRemove;
   copy_if(M, std::back_inserter(ToRemove), [&](auto &&F) {
     return !F.isIntrinsic() && !Reachable.contains(&F);
   });

   for_each(ToRemove, eraseFromModule<Function>);
 }

 // Returns true iff \p F is non-null and uses the AMDGPU_KERNEL calling
 // convention.
 static inline bool isAcceleratorExecutionRoot(const Function *F) {
   return F && F->getCallingConv() == CallingConv::AMDGPU_KERNEL;
 }

 static inline bool checkIfSupported(const Function *F, const CallBase *CB) {
   const auto Dx = F->getName().rfind("__hipstdpar_unsupported");

   if (Dx == StringRef::npos)
     return true;

   const auto N = F->getName().substr(0, Dx);

   std::string W;
   raw_string_ostream OS(W);

   if (N == "__ASM")
     OS << "Accelerator does not support the ASM block:\n"
       << cast<ConstantDataArray>(CB->getArgOperand(0))->getAsCString();
   else
     OS << "Accelerator does not support the " << N << " function.";

   auto Caller = CB->getParent()->getParent();

   Caller->getContext().diagnose(
     DiagnosticInfoUnsupported(*Caller, W, CB->getDebugLoc(), DS_Error));

   return false;
 }

 // Entry point of the accelerator code selection pass.
 //
 // Starting from every function accepted by isAcceleratorExecutionRoot(), the
 // call graph is walked and every callee reached through an edge that names a
 // function is recorded as reachable. If checkIfSupported() rejects a callee,
 // the pass returns immediately without pruning anything. Otherwise the module
 // is cleared when nothing is reachable, or stripped of its unreachable
 // functions, after which maybeHandleGlobals() processes the globals.
 //
 // \returns PreservedAnalyses::none() in all cases.
 PreservedAnalyses
   HipStdParAcceleratorCodeSelectionPass::run(Module &M,
                                              ModuleAnalysisManager &MAM) {
   auto &CGA = MAM.getResult<CallGraphAnalysis>(M);

   SmallPtrSet<const Function *, 32> Reachable;
   for (auto &&CGN : CGA) {
     if (!isAcceleratorExecutionRoot(CGN.first))
       continue;

     Reachable.insert(CGN.first);

     SmallVector<const Function *> Tmp({CGN.first});
     do {
       const Function *F = Tmp.pop_back_val();

       for (auto &&N : *CGA[F]) {
         if (!N.second)
           continue;

         const Function *Callee = N.second->getFunction();
         if (!Callee)
           continue;
         if (Reachable.contains(Callee))
           continue;

         // The call-site handle is optional: edges that are not tied to a call
         // instruction leave it empty, so it must not be unwrapped blindly.
         const Value *Site = nullptr;
         if (N.first)
           Site = *N.first;
         const CallBase *CB = Site ? dyn_cast<CallBase>(Site) : nullptr;

         if (!checkIfSupported(Callee, CB))
           return PreservedAnalyses::none();

         Reachable.insert(Callee);
         Tmp.push_back(Callee);
       }
     } while (!std::empty(Tmp));
   }

   if (std::empty(Reachable))
     clearModule(M);
   else
     removeUnreachableFunctions(Reachable, M);

   maybeHandleGlobals(M);

   return PreservedAnalyses::none();
 }

 // Table of (function name, replacement name) pairs. The first member is the
 // symbol name of an allocation / deallocation function, the second is the name
 // of the `__hipstdpar_` function that takes its place. Several source names
 // may share one replacement.
 static constexpr std::pair<StringLiteral, StringLiteral> ReplaceMap[]{
   // C library allocation functions.
   {"aligned_alloc",             "__hipstdpar_aligned_alloc"},
   {"calloc",                    "__hipstdpar_calloc"},
   {"free",                      "__hipstdpar_free"},
   {"malloc",                    "__hipstdpar_malloc"},
   {"memalign",                  "__hipstdpar_aligned_alloc"},
   {"posix_memalign",            "__hipstdpar_posix_aligned_alloc"},
   {"realloc",                   "__hipstdpar_realloc"},
   {"reallocarray",              "__hipstdpar_realloc_array"},
   // Itanium-mangled operator delete[] overloads.
   {"_ZdaPv",                    "__hipstdpar_operator_delete"},
   {"_ZdaPvm",                   "__hipstdpar_operator_delete_sized"},
   {"_ZdaPvSt11align_val_t",     "__hipstdpar_operator_delete_aligned"},
   {"_ZdaPvmSt11align_val_t",    "__hipstdpar_operator_delete_aligned_sized"},
   // Itanium-mangled operator delete overloads.
   {"_ZdlPv",                    "__hipstdpar_operator_delete"},
   {"_ZdlPvm",                   "__hipstdpar_operator_delete_sized"},
   {"_ZdlPvSt11align_val_t",     "__hipstdpar_operator_delete_aligned"},
   {"_ZdlPvmSt11align_val_t",    "__hipstdpar_operator_delete_aligned_sized"},
   // Itanium-mangled operator new[] overloads.
   {"_Znam",                     "__hipstdpar_operator_new"},
   {"_ZnamRKSt9nothrow_t",       "__hipstdpar_operator_new_nothrow"},
   {"_ZnamSt11align_val_t",      "__hipstdpar_operator_new_aligned"},
   {"_ZnamSt11align_val_tRKSt9nothrow_t",
                                 "__hipstdpar_operator_new_aligned_nothrow"},

   // Itanium-mangled operator new overloads.
   {"_Znwm",                     "__hipstdpar_operator_new"},
   {"_ZnwmRKSt9nothrow_t",       "__hipstdpar_operator_new_nothrow"},
   {"_ZnwmSt11align_val_t",      "__hipstdpar_operator_new_aligned"},
   {"_ZnwmSt11align_val_tRKSt9nothrow_t",
                                 "__hipstdpar_operator_new_aligned_nothrow"},
   // `__builtin_`-prefixed names.
   {"__builtin_calloc",          "__hipstdpar_calloc"},
   {"__builtin_free",            "__hipstdpar_free"},
   {"__builtin_malloc",          "__hipstdpar_malloc"},
   {"__builtin_operator_delete", "__hipstdpar_operator_delete"},
   {"__builtin_operator_new",    "__hipstdpar_operator_new"},
   {"__builtin_realloc",         "__hipstdpar_realloc"},
   // `__libc_`-prefixed names.
   {"__libc_calloc",             "__hipstdpar_calloc"},
   {"__libc_free",               "__hipstdpar_free"},
   {"__libc_malloc",             "__hipstdpar_malloc"},
   {"__libc_memalign",           "__hipstdpar_aligned_alloc"},
   {"__libc_realloc",            "__hipstdpar_realloc"}
 };

 // Entry point of the allocation interposition pass.
 //
 // Every function of \p M whose name is a key of ReplaceMap has all of its uses
 // redirected to the mapped replacement, provided the replacement exists in the
 // module; otherwise a warning diagnostic is emitted for that function. Lastly,
 // if `__hipstdpar_hidden_free` is present, its uses are redirected to
 // `__libc_free` (declared on demand with the same type and attributes) and it
 // is erased.
 //
 // \returns PreservedAnalyses::none() in all cases.
 PreservedAnalyses
 HipStdParAllocationInterpositionPass::run(Module &M, ModuleAnalysisManager&) {
   SmallDenseMap<StringRef, StringRef> AllocReplacements(std::cbegin(ReplaceMap),
                                                         std::cend(ReplaceMap));

   for (Function &Fn : M) {
     if (!Fn.hasName())
       continue;

     const auto Entry = AllocReplacements.find(Fn.getName());
     if (Entry == AllocReplacements.end())
       continue;

     const StringRef Target = Entry->second;

     if (Function *Replacement = M.getFunction(Target)) {
       Fn.replaceAllUsesWith(Replacement);
       continue;
     }

     // The replacement is not in the module: report it and leave Fn untouched.
     std::string Msg;
     raw_string_ostream MsgOS(Msg);

     MsgOS << "cannot be interposed, missing: " << Target
       << ". Tried to run the allocation interposition pass without the "
       << "replacement functions available.";

     Fn.getContext().diagnose(
       DiagnosticInfoUnsupported(Fn, Msg, Fn.getSubprogram(), DS_Warning));
   }

   if (Function *HiddenFree = M.getFunction("__hipstdpar_hidden_free")) {
     FunctionCallee LibcFree = M.getOrInsertFunction(
       "__libc_free", HiddenFree->getFunctionType(),
       HiddenFree->getAttributes());

     HiddenFree->replaceAllUsesWith(LibcFree.getCallee());
     eraseFromModule(*HiddenFree);
   }

   return PreservedAnalyses::none();
 }
	//===----- HipStdPar.cpp - HIP C++ Standard Parallelism Support Passes ----===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	// This file implements two passes that enable HIP C++ Standard Parallelism
	// Support:
	//
	// 1. AcceleratorCodeSelection (required): Given that only algorithms are
	// accelerated, and that the accelerated implementation exists in the form of
	// a compute kernel, we assume that only the kernel, and all functions
	// reachable from it, constitute code that the user expects the accelerator
	// to execute. Thus, we identify the set of all functions reachable from
	// kernels, and then remove all unreachable ones. This last part is necessary
	// because it is possible for code that the user did not expect to execute on
	// an accelerator to contain constructs that cannot be handled by the target
	// BE, which cannot be provably demonstrated to be dead code in general, and
	// thus can lead to mis-compilation. The degenerate case of this is when a
	// Module contains no kernels (the parent TU had no algorithm invocations fit
	// for acceleration), which we handle by completely emptying said module.
	// NOTE: The above does not handle indirectly reachable functions i.e.
	// it is possible to obtain a case where the target of an indirect
	// call is otherwise unreachable and thus is removed; this
	// restriction is aligned with the current `-hipstdpar` limitations
	// and will be relaxed in the future.
	//
	// 2. AllocationInterposition (required only when on-demand paging is
	// unsupported): Some accelerators or operating systems might not support
	// transparent on-demand paging. Thus, they would only be able to access
	// memory that is allocated by an accelerator-aware mechanism. For such cases
	// the user can opt into enabling allocation / deallocation interposition,
	// whereby we replace calls to known allocation / deallocation functions with
	// calls to runtime implemented equivalents that forward the requests to
	// accelerator-aware interfaces. We also support freeing system allocated
	// memory that ends up in one of the runtime equivalents, since this can
	// happen if e.g. a library that was compiled without interposition returns
	// an allocation that can be validly passed to `free`.
	//===----------------------------------------------------------------------===//

	#include "llvm/Transforms/HipStdPar/HipStdPar.h"

	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/Analysis/CallGraph.h"
	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DebugInfoMetadata.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/Module.h"
	#include "llvm/Transforms/Utils/ModuleUtils.h"

	#include <cassert>
	#include <string>
	#include <utility>

	using namespace llvm;

	// Removes \p ToErase from its parent. Every remaining use is first redirected
	// to a poison value of the same type, so no user is left referring to the
	// destroyed object.
	template<typename T>
	static inline void eraseFromModule(T &ToErase) {
	  auto *Replacement = PoisonValue::get(ToErase.getType());

	  ToErase.replaceAllUsesWith(Replacement);
	  ToErase.eraseFromParent();
	}

	// Checks whether the global variable \p G can be kept in accelerator code.
	//
	// Returns true when \p G is not thread_local, or when isConstantUsed() reports
	// false once its droppable uses have been dropped. Otherwise an error
	// diagnostic is emitted, attached to the first instruction reached while
	// walking the users of \p G, and false is returned.
	static inline bool checkIfSupported(GlobalVariable &G) {
	  if (!G.isThreadLocal())
	    return true;

	  G.dropDroppableUses();

	  if (!G.isConstantUsed())
	    return true;

	  std::string Msg;
	  raw_string_ostream MsgOS(Msg);

	  MsgOS << "Accelerator does not support the thread_local variable "
	    << G.getName();

	  // Walk the users of G, looking through non-instruction users, until an
	  // instruction is reached; it supplies the function and debug location for
	  // the diagnostic.
	  Instruction *UserInst = nullptr;
	  SmallVector<User *> Worklist(G.user_begin(), G.user_end());
	  SmallPtrSet<User *, 5> Seen;
	  do {
	    User *Cur = Worklist.pop_back_val();

	    // insert() tells us whether Cur is new; skip what was already processed.
	    if (!Seen.insert(Cur).second)
	      continue;

	    UserInst = dyn_cast<Instruction>(Cur);
	    if (!UserInst)
	      Worklist.append(Cur->user_begin(), Cur->user_end());
	  } while (!UserInst && !Worklist.empty());

	  assert(UserInst &&
	         "thread_local global should have at least one non-constant use.");

	  G.getContext().diagnose(DiagnosticInfoUnsupported(
	    *UserInst->getFunction(), Msg, UserInst->getDebugLoc(), DS_Error));

	  return false;
	}

	// Empties \p M: functions, global variables, aliases and ifuncs are erased, in
	// that order. Each loop re-queries the module and removes the current front
	// element, so no iterator is held across an erasure.
	static inline void clearModule(Module &M) { // TODO: simplify.
	  while (!M.empty())
	    eraseFromModule(*M.begin());

	  while (!M.global_empty())
	    eraseFromModule(*M.global_begin());

	  while (!M.alias_empty())
	    eraseFromModule(*M.alias_begin());

	  while (!M.ifunc_empty())
	    eraseFromModule(*M.ifunc_begin());
	}

	// Post-processes the global variables of \p M.
	//
	// If any global is rejected by checkIfSupported() the whole module is cleared
	// and processing stops. Otherwise every global that is not thread_local, not
	// constant, lives in the default globals address space and has external
	// linkage is turned into an initializer-less, externally initialized,
	// extern_weak declaration.
	static inline void maybeHandleGlobals(Module &M) {
	  const unsigned DefaultAS =
	    M.getDataLayout().getDefaultGlobalsAddressSpace();

	  for (GlobalVariable &GV : M.globals()) { // TODO: should we handle these in the FE?
	    if (!checkIfSupported(GV)) {
	      clearModule(M);
	      return;
	    }

	    const bool ShouldRewrite = !GV.isThreadLocal() && !GV.isConstant() &&
	                               GV.getAddressSpace() == DefaultAS &&
	                               GV.hasExternalLinkage();
	    if (!ShouldRewrite)
	      continue;

	    GV.setLinkage(GlobalVariable::ExternalWeakLinkage);
	    GV.setInitializer(nullptr);
	    GV.setExternallyInitialized(true);
	  }
	}

	template<unsigned N>
	static inline void removeUnreachableFunctions(
	const SmallPtrSet<const Function *, N>& Reachable, Module &M) {
	removeFromUsedLists(M, [&](Constant *C) {
	if (auto F = dyn_cast<Function>(C))
	return !Reachable.contains(F);

	return false;
	});

	SmallVector<std::reference_wrapper<Function>> ToRemove;
	copy_if(M, std::back_inserter(ToRemove), [&](auto &&F) {
	return !F.isIntrinsic() && !Reachable.contains(&F);
	});

	for_each(ToRemove, eraseFromModule<Function>);
	}

	// Returns true iff \p F is non-null and uses the AMDGPU_KERNEL calling
	// convention.
	static inline bool isAcceleratorExecutionRoot(const Function *F) {
	  return F && F->getCallingConv() == CallingConv::AMDGPU_KERNEL;
	}

	static inline bool checkIfSupported(const Function F, const CallBase CB) {
	const auto Dx = F->getName().rfind("__hipstdpar_unsupported");

	if (Dx == StringRef::npos)
	return true;

	const auto N = F->getName().substr(0, Dx);

	std::string W;
	raw_string_ostream OS(W);

	if (N == "__ASM")
	OS << "Accelerator does not support the ASM block:\n"
	<< cast<ConstantDataArray>(CB->getArgOperand(0))->getAsCString();
	else
	OS << "Accelerator does not support the " << N << " function.";

	auto Caller = CB->getParent()->getParent();

	Caller->getContext().diagnose(
	DiagnosticInfoUnsupported(*Caller, W, CB->getDebugLoc(), DS_Error));

	return false;
	}

	// Entry point of the accelerator code selection pass.
	//
	// Starting from every function accepted by isAcceleratorExecutionRoot(), the
	// call graph is walked and every callee reached through an edge that names a
	// function is recorded as reachable. If checkIfSupported() rejects a callee,
	// the pass returns immediately without pruning anything. Otherwise the module
	// is cleared when nothing is reachable, or stripped of its unreachable
	// functions, after which maybeHandleGlobals() processes the globals.
	//
	// \returns PreservedAnalyses::none() in all cases.
	PreservedAnalyses
	HipStdParAcceleratorCodeSelectionPass::run(Module &M,
	                                           ModuleAnalysisManager &MAM) {
	  auto &CGA = MAM.getResult<CallGraphAnalysis>(M);

	  SmallPtrSet<const Function *, 32> Reachable;
	  for (auto &&CGN : CGA) {
	    if (!isAcceleratorExecutionRoot(CGN.first))
	      continue;

	    Reachable.insert(CGN.first);

	    SmallVector<const Function *> Tmp({CGN.first});
	    do {
	      const Function *F = Tmp.pop_back_val();

	      for (auto &&N : *CGA[F]) {
	        if (!N.second)
	          continue;

	        const Function *Callee = N.second->getFunction();
	        if (!Callee)
	          continue;
	        if (Reachable.contains(Callee))
	          continue;

	        // The call-site handle is optional: edges that are not tied to a call
	        // instruction leave it empty, so it must not be unwrapped blindly.
	        const Value *Site = nullptr;
	        if (N.first)
	          Site = *N.first;
	        const CallBase *CB = Site ? dyn_cast<CallBase>(Site) : nullptr;

	        if (!checkIfSupported(Callee, CB))
	          return PreservedAnalyses::none();

	        Reachable.insert(Callee);
	        Tmp.push_back(Callee);
	      }
	    } while (!std::empty(Tmp));
	  }

	  if (std::empty(Reachable))
	    clearModule(M);
	  else
	    removeUnreachableFunctions(Reachable, M);

	  maybeHandleGlobals(M);

	  return PreservedAnalyses::none();
	}

	// Table of (function name, replacement name) pairs. The first member is the
	// symbol name of an allocation / deallocation function, the second is the name
	// of the `__hipstdpar_` function that takes its place. Several source names
	// may share one replacement.
	static constexpr std::pair<StringLiteral, StringLiteral> ReplaceMap[]{
	// C library allocation functions.
	{"aligned_alloc", "__hipstdpar_aligned_alloc"},
	{"calloc", "__hipstdpar_calloc"},
	{"free", "__hipstdpar_free"},
	{"malloc", "__hipstdpar_malloc"},
	{"memalign", "__hipstdpar_aligned_alloc"},
	{"posix_memalign", "__hipstdpar_posix_aligned_alloc"},
	{"realloc", "__hipstdpar_realloc"},
	{"reallocarray", "__hipstdpar_realloc_array"},
	// Itanium-mangled operator delete[] overloads.
	{"_ZdaPv", "__hipstdpar_operator_delete"},
	{"_ZdaPvm", "__hipstdpar_operator_delete_sized"},
	{"_ZdaPvSt11align_val_t", "__hipstdpar_operator_delete_aligned"},
	{"_ZdaPvmSt11align_val_t", "__hipstdpar_operator_delete_aligned_sized"},
	// Itanium-mangled operator delete overloads.
	{"_ZdlPv", "__hipstdpar_operator_delete"},
	{"_ZdlPvm", "__hipstdpar_operator_delete_sized"},
	{"_ZdlPvSt11align_val_t", "__hipstdpar_operator_delete_aligned"},
	{"_ZdlPvmSt11align_val_t", "__hipstdpar_operator_delete_aligned_sized"},
	// Itanium-mangled operator new[] overloads.
	{"_Znam", "__hipstdpar_operator_new"},
	{"_ZnamRKSt9nothrow_t", "__hipstdpar_operator_new_nothrow"},
	{"_ZnamSt11align_val_t", "__hipstdpar_operator_new_aligned"},
	{"_ZnamSt11align_val_tRKSt9nothrow_t",
	"__hipstdpar_operator_new_aligned_nothrow"},

	// Itanium-mangled operator new overloads.
	{"_Znwm", "__hipstdpar_operator_new"},
	{"_ZnwmRKSt9nothrow_t", "__hipstdpar_operator_new_nothrow"},
	{"_ZnwmSt11align_val_t", "__hipstdpar_operator_new_aligned"},
	{"_ZnwmSt11align_val_tRKSt9nothrow_t",
	"__hipstdpar_operator_new_aligned_nothrow"},
	// `__builtin_`-prefixed names.
	{"__builtin_calloc", "__hipstdpar_calloc"},
	{"__builtin_free", "__hipstdpar_free"},
	{"__builtin_malloc", "__hipstdpar_malloc"},
	{"__builtin_operator_delete", "__hipstdpar_operator_delete"},
	{"__builtin_operator_new", "__hipstdpar_operator_new"},
	{"__builtin_realloc", "__hipstdpar_realloc"},
	// `__libc_`-prefixed names.
	{"__libc_calloc", "__hipstdpar_calloc"},
	{"__libc_free", "__hipstdpar_free"},
	{"__libc_malloc", "__hipstdpar_malloc"},
	{"__libc_memalign", "__hipstdpar_aligned_alloc"},
	{"__libc_realloc", "__hipstdpar_realloc"}
	};

	// Entry point of the allocation interposition pass.
	//
	// Every function of \p M whose name is a key of ReplaceMap has all of its uses
	// redirected to the mapped replacement, provided the replacement exists in the
	// module; otherwise a warning diagnostic is emitted for that function. Lastly,
	// if `__hipstdpar_hidden_free` is present, its uses are redirected to
	// `__libc_free` (declared on demand with the same type and attributes) and it
	// is erased.
	//
	// \returns PreservedAnalyses::none() in all cases.
	PreservedAnalyses
	HipStdParAllocationInterpositionPass::run(Module &M, ModuleAnalysisManager&) {
	  SmallDenseMap<StringRef, StringRef> AllocReplacements(std::cbegin(ReplaceMap),
	                                                        std::cend(ReplaceMap));

	  for (Function &Fn : M) {
	    if (!Fn.hasName())
	      continue;

	    const auto Entry = AllocReplacements.find(Fn.getName());
	    if (Entry == AllocReplacements.end())
	      continue;

	    const StringRef Target = Entry->second;

	    if (Function *Replacement = M.getFunction(Target)) {
	      Fn.replaceAllUsesWith(Replacement);
	      continue;
	    }

	    // The replacement is not in the module: report it and leave Fn untouched.
	    std::string Msg;
	    raw_string_ostream MsgOS(Msg);

	    MsgOS << "cannot be interposed, missing: " << Target
	      << ". Tried to run the allocation interposition pass without the "
	      << "replacement functions available.";

	    Fn.getContext().diagnose(
	      DiagnosticInfoUnsupported(Fn, Msg, Fn.getSubprogram(), DS_Warning));
	  }

	  if (Function *HiddenFree = M.getFunction("__hipstdpar_hidden_free")) {
	    FunctionCallee LibcFree = M.getOrInsertFunction(
	      "__libc_free", HiddenFree->getFunctionType(),
	      HiddenFree->getAttributes());

	    HiddenFree->replaceAllUsesWith(LibcFree.getCallee());
	    eraseFromModule(*HiddenFree);
	  }

	  return PreservedAnalyses::none();
	}