//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a generalized class for OpenMP runtime code generation
// specialized by GPU targets NVPTX and AMDGCN.
//
//===----------------------------------------------------------------------===//
#include "CGOpenMPRuntimeGPU.h"
#include "CodeGenFunction.h"
#include "clang/AST/Attr.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Basic/Cuda.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/MathExtras.h"
using namespace clang;
using namespace CodeGen;
using namespace llvm::omp;
namespace {
/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
llvm::FunctionCallee EnterCallee = nullptr;
ArrayRef<llvm::Value *> EnterArgs;
llvm::FunctionCallee ExitCallee = nullptr;
ArrayRef<llvm::Value *> ExitArgs;
bool Conditional = false;
llvm::BasicBlock *ContBlock = nullptr;
public:
NVPTXActionTy(llvm::FunctionCallee EnterCallee,
ArrayRef<llvm::Value *> EnterArgs,
llvm::FunctionCallee ExitCallee,
ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
: EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
ExitArgs(ExitArgs), Conditional(Conditional) {}
void Enter(CodeGenFunction &CGF) override {
llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
if (Conditional) {
llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
ContBlock = CGF.createBasicBlock("omp_if.end");
// Generate the branch (If-stmt)
CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
CGF.EmitBlock(ThenBlock);
}
}
void Done(CodeGenFunction &CGF) {
// Emit the rest of blocks/branches
CGF.EmitBranch(ContBlock);
CGF.EmitBlock(ContBlock, true);
}
void Exit(CodeGenFunction &CGF) override {
CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
}
};
/// A class to track the execution mode when codegening directives within
/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
/// to the target region and used by containing directives such as 'parallel'
/// to emit optimized code.
class ExecutionRuntimeModesRAII {
private:
CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
CGOpenMPRuntimeGPU::EM_Unknown;
CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
bool SavedRuntimeMode = false;
bool *RuntimeMode = nullptr;
public:
/// Constructor for Non-SPMD mode.
ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode)
: ExecMode(ExecMode) {
SavedExecMode = ExecMode;
ExecMode = CGOpenMPRuntimeGPU::EM_NonSPMD;
}
/// Constructor for SPMD mode.
ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
bool &RuntimeMode, bool FullRuntimeMode)
: ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
SavedExecMode = ExecMode;
SavedRuntimeMode = RuntimeMode;
ExecMode = CGOpenMPRuntimeGPU::EM_SPMD;
RuntimeMode = FullRuntimeMode;
}
~ExecutionRuntimeModesRAII() {
ExecMode = SavedExecMode;
if (RuntimeMode)
*RuntimeMode = SavedRuntimeMode;
}
};
/// GPU Configuration: This information can be derived from CUDA registers;
/// however, providing compile-time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
/// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
/// specific Grid Values like GV_Warp_Size, GV_Slot_Size
/// Global memory alignment for performance.
GlobalMemoryAlignment = 128,
/// Maximal size of the shared memory buffer.
SharedMemorySize = 128,
};
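/// Return the canonical declaration referenced by a privatization expression,
/// stripping parentheses, array subscripts and OpenMP array sections; e.g. for
/// `a[i][j]` or `a[0:n]` the declaration of `a` is returned, and for a member
/// expression the member declaration is returned.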
static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
RefExpr = RefExpr->IgnoreParens();
if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
Base = TempASE->getBase()->IgnoreParenImpCasts();
RefExpr = Base;
} else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
Base = TempOASE->getBase()->IgnoreParenImpCasts();
while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
Base = TempASE->getBase()->IgnoreParenImpCasts();
RefExpr = Base;
}
RefExpr = RefExpr->IgnoreParenImpCasts();
if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
const auto *ME = cast<MemberExpr>(RefExpr);
return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
}
static RecordDecl *buildRecordForGlobalizedVars(
ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
&MappedDeclsFields, int BufSize) {
using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
return nullptr;
SmallVector<VarsDataTy, 4> GlobalizedVars;
for (const ValueDecl *D : EscapedDecls)
GlobalizedVars.emplace_back(
CharUnits::fromQuantity(std::max(
C.getDeclAlign(D).getQuantity(),
static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
D);
for (const ValueDecl *D : EscapedDeclsForTeams)
GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
return L.first > R.first;
});
// Build struct _globalized_locals_ty {
// /* globalized vars */[WarpSize] align (max(decl_align,
// GlobalMemoryAlignment))
// /* globalized vars */ for EscapedDeclsForTeams
// };
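// For example, an escaped parallel-level `double x` with BufSize == WarpSize
// == 32 roughly becomes a field `double x[32]` aligned to 128 bytes, while a
// teams-level variable keeps a single field with its original alignment.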
RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
GlobalizedRD->startDefinition();
llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
for (const auto &Pair : GlobalizedVars) {
const ValueDecl *VD = Pair.second;
QualType Type = VD->getType();
if (Type->isLValueReferenceType())
Type = C.getPointerType(Type.getNonReferenceType());
else
Type = Type.getNonReferenceType();
SourceLocation Loc = VD->getLocation();
FieldDecl *Field;
if (SingleEscaped.count(VD)) {
Field = FieldDecl::Create(
C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
C.getTrivialTypeSourceInfo(Type, SourceLocation()),
/*BW=*/nullptr, /*Mutable=*/false,
/*InitStyle=*/ICIS_NoInit);
Field->setAccess(AS_public);
if (VD->hasAttrs()) {
for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
E(VD->getAttrs().end());
I != E; ++I)
Field->addAttr(*I);
}
} else {
llvm::APInt ArraySize(32, BufSize);
Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
0);
Field = FieldDecl::Create(
C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
C.getTrivialTypeSourceInfo(Type, SourceLocation()),
/*BW=*/nullptr, /*Mutable=*/false,
/*InitStyle=*/ICIS_NoInit);
Field->setAccess(AS_public);
llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
static_cast<CharUnits::QuantityType>(
GlobalMemoryAlignment)));
Field->addAttr(AlignedAttr::CreateImplicit(
C, /*IsAlignmentExpr=*/true,
IntegerLiteral::Create(C, Align,
C.getIntTypeForBitwidth(32, /*Signed=*/0),
SourceLocation()),
{}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
}
GlobalizedRD->addDecl(Field);
MappedDeclsFields.try_emplace(VD, Field);
}
GlobalizedRD->completeDefinition();
return GlobalizedRD;
}
/// Get the list of variables that can escape their declaration context.
class CheckVarsEscapingDeclContext final
: public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
CodeGenFunction &CGF;
llvm::SetVector<const ValueDecl *> EscapedDecls;
llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
RecordDecl *GlobalizedRD = nullptr;
llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
bool AllEscaped = false;
bool IsForCombinedParallelRegion = false;
void markAsEscaped(const ValueDecl *VD) {
// Do not globalize declare target variables.
if (!isa<VarDecl>(VD) ||
OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
return;
VD = cast<ValueDecl>(VD->getCanonicalDecl());
// Use user-specified allocation.
if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
return;
// Variables captured by value must be globalized.
if (auto *CSI = CGF.CapturedStmtInfo) {
if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
// Check if we need to capture the variable that was already captured by
// value in the outer region.
if (!IsForCombinedParallelRegion) {
if (!FD->hasAttrs())
return;
const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
if (!Attr)
return;
if (((Attr->getCaptureKind() != OMPC_map) &&
!isOpenMPPrivate(Attr->getCaptureKind())) ||
((Attr->getCaptureKind() == OMPC_map) &&
!FD->getType()->isAnyPointerType()))
return;
}
if (!FD->getType()->isReferenceType()) {
assert(!VD->getType()->isVariablyModifiedType() &&
"Parameter captured by value with variably modified type");
EscapedParameters.insert(VD);
} else if (!IsForCombinedParallelRegion) {
return;
}
}
}
if ((!CGF.CapturedStmtInfo ||
(IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
VD->getType()->isReferenceType())
// Do not globalize variables with reference type.
return;
if (VD->getType()->isVariablyModifiedType())
EscapedVariableLengthDecls.insert(VD);
else
EscapedDecls.insert(VD);
}
void VisitValueDecl(const ValueDecl *VD) {
if (VD->getType()->isLValueReferenceType())
markAsEscaped(VD);
if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
const bool SavedAllEscaped = AllEscaped;
AllEscaped = VD->getType()->isLValueReferenceType();
Visit(VarD->getInit());
AllEscaped = SavedAllEscaped;
}
}
}
void VisitOpenMPCapturedStmt(const CapturedStmt *S,
ArrayRef<OMPClause *> Clauses,
bool IsCombinedParallelRegion) {
if (!S)
return;
for (const CapturedStmt::Capture &C : S->captures()) {
if (C.capturesVariable() && !C.capturesVariableByCopy()) {
const ValueDecl *VD = C.getCapturedVar();
bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
if (IsCombinedParallelRegion) {
// Check if the variable is privatized in the combined construct and
// those private copies must be shared in the inner parallel
// directive.
IsForCombinedParallelRegion = false;
for (const OMPClause *C : Clauses) {
if (!isOpenMPPrivate(C->getClauseKind()) ||
C->getClauseKind() == OMPC_reduction ||
C->getClauseKind() == OMPC_linear ||
C->getClauseKind() == OMPC_private)
continue;
ArrayRef<const Expr *> Vars;
if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
Vars = PC->getVarRefs();
else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
Vars = PC->getVarRefs();
else
llvm_unreachable("Unexpected clause.");
for (const auto *E : Vars) {
const Decl *D =
cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
if (D == VD->getCanonicalDecl()) {
IsForCombinedParallelRegion = true;
break;
}
}
if (IsForCombinedParallelRegion)
break;
}
}
markAsEscaped(VD);
if (isa<OMPCapturedExprDecl>(VD))
VisitValueDecl(VD);
IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
}
}
}
void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
assert(!GlobalizedRD &&
"Record for globalized variables is built already.");
ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
if (IsInTTDRegion)
EscapedDeclsForTeams = EscapedDecls.getArrayRef();
else
EscapedDeclsForParallel = EscapedDecls.getArrayRef();
GlobalizedRD = ::buildRecordForGlobalizedVars(
CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
MappedDeclsFields, WarpSize);
}
public:
CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
ArrayRef<const ValueDecl *> TeamsReductions)
: CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
}
virtual ~CheckVarsEscapingDeclContext() = default;
void VisitDeclStmt(const DeclStmt *S) {
if (!S)
return;
for (const Decl *D : S->decls())
if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
VisitValueDecl(VD);
}
void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
if (!D)
return;
if (!D->hasAssociatedStmt())
return;
if (const auto *S =
dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
// Do not analyze directives that do not actually require capturing,
// like `omp for` or `omp simd` directives.
llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
VisitStmt(S->getCapturedStmt());
return;
}
VisitOpenMPCapturedStmt(
S, D->clauses(),
CaptureRegions.back() == OMPD_parallel &&
isOpenMPDistributeDirective(D->getDirectiveKind()));
}
}
void VisitCapturedStmt(const CapturedStmt *S) {
if (!S)
return;
for (const CapturedStmt::Capture &C : S->captures()) {
if (C.capturesVariable() && !C.capturesVariableByCopy()) {
const ValueDecl *VD = C.getCapturedVar();
markAsEscaped(VD);
if (isa<OMPCapturedExprDecl>(VD))
VisitValueDecl(VD);
}
}
}
void VisitLambdaExpr(const LambdaExpr *E) {
if (!E)
return;
for (const LambdaCapture &C : E->captures()) {
if (C.capturesVariable()) {
if (C.getCaptureKind() == LCK_ByRef) {
const ValueDecl *VD = C.getCapturedVar();
markAsEscaped(VD);
if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
VisitValueDecl(VD);
}
}
}
}
void VisitBlockExpr(const BlockExpr *E) {
if (!E)
return;
for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
if (C.isByRef()) {
const VarDecl *VD = C.getVariable();
markAsEscaped(VD);
if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
VisitValueDecl(VD);
}
}
}
void VisitCallExpr(const CallExpr *E) {
if (!E)
return;
for (const Expr *Arg : E->arguments()) {
if (!Arg)
continue;
if (Arg->isLValue()) {
const bool SavedAllEscaped = AllEscaped;
AllEscaped = true;
Visit(Arg);
AllEscaped = SavedAllEscaped;
} else {
Visit(Arg);
}
}
Visit(E->getCallee());
}
void VisitDeclRefExpr(const DeclRefExpr *E) {
if (!E)
return;
const ValueDecl *VD = E->getDecl();
if (AllEscaped)
markAsEscaped(VD);
if (isa<OMPCapturedExprDecl>(VD))
VisitValueDecl(VD);
else if (const auto *VarD = dyn_cast<VarDecl>(VD))
if (VarD->isInitCapture())
VisitValueDecl(VD);
}
void VisitUnaryOperator(const UnaryOperator *E) {
if (!E)
return;
if (E->getOpcode() == UO_AddrOf) {
const bool SavedAllEscaped = AllEscaped;
AllEscaped = true;
Visit(E->getSubExpr());
AllEscaped = SavedAllEscaped;
} else {
Visit(E->getSubExpr());
}
}
void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
if (!E)
return;
if (E->getCastKind() == CK_ArrayToPointerDecay) {
const bool SavedAllEscaped = AllEscaped;
AllEscaped = true;
Visit(E->getSubExpr());
AllEscaped = SavedAllEscaped;
} else {
Visit(E->getSubExpr());
}
}
void VisitExpr(const Expr *E) {
if (!E)
return;
bool SavedAllEscaped = AllEscaped;
if (!E->isLValue())
AllEscaped = false;
for (const Stmt *Child : E->children())
if (Child)
Visit(Child);
AllEscaped = SavedAllEscaped;
}
void VisitStmt(const Stmt *S) {
if (!S)
return;
for (const Stmt *Child : S->children())
if (Child)
Visit(Child);
}
/// Returns the record that holds all the escaped local variables and is used
/// instead of their original storage.
const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
if (!GlobalizedRD)
buildRecordForGlobalizedVars(IsInTTDRegion);
return GlobalizedRD;
}
/// Returns the field in the globalized record for the escaped variable.
const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
assert(GlobalizedRD &&
"Record for globalized variables must be generated already.");
auto I = MappedDeclsFields.find(VD);
if (I == MappedDeclsFields.end())
return nullptr;
return I->getSecond();
}
/// Returns the list of the escaped local variables/parameters.
ArrayRef<const ValueDecl *> getEscapedDecls() const {
return EscapedDecls.getArrayRef();
}
/// Returns the set of escaped local variables that are actually parameters
/// passed by value.
const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
return EscapedParameters;
}
/// Returns the list of the escaped variables with the variably modified
/// types.
ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
return EscapedVariableLengthDecls.getArrayRef();
}
};
} // anonymous namespace
/// Get the id of the warp in the block.
/// The warp size is taken from the target's grid values (32 on all current
/// NVPTX devices), which lets us generate more efficient code.
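/// For example, with a warp size of 32 (5 lane-id bits), thread 70 belongs to
/// warp 70 >> 5 == 2.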
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
CGBuilderTy &Bld = CGF.Builder;
unsigned LaneIDBits =
llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}
/// Get the id of the current lane in the warp.
/// The warp size is taken from the target's grid values (32 on all current
/// NVPTX devices), which lets us generate more efficient code.
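/// For example, with a warp size of 32 (mask 0x1f), thread 70 has lane id
/// 70 & 0x1f == 6.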
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
CGBuilderTy &Bld = CGF.Builder;
unsigned LaneIDBits =
llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
"nvptx_lane_id");
}
CGOpenMPRuntimeGPU::ExecutionMode
CGOpenMPRuntimeGPU::getExecutionMode() const {
return CurrentExecutionMode;
}
static CGOpenMPRuntimeGPU::DataSharingMode
getDataSharingMode(CodeGenModule &CGM) {
return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
: CGOpenMPRuntimeGPU::Generic;
}
/// Check for inner (nested) SPMD construct, if any
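/// For example, `#pragma omp target` whose only child is `#pragma omp teams`
/// wrapping a `#pragma omp parallel for` is treated as an SPMD region.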
static bool hasNestedSPMDDirective(ASTContext &Ctx,
const OMPExecutableDirective &D) {
const auto *CS = D.getInnermostCapturedStmt();
const auto *Body =
CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
if (const auto *NestedDir =
dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
switch (D.getDirectiveKind()) {
case OMPD_target:
if (isOpenMPParallelDirective(DKind))
return true;
if (DKind == OMPD_teams) {
Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
/*IgnoreCaptured=*/true);
if (!Body)
return false;
ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
if (const auto *NND =
dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
DKind = NND->getDirectiveKind();
if (isOpenMPParallelDirective(DKind))
return true;
}
}
return false;
case OMPD_target_teams:
return isOpenMPParallelDirective(DKind);
case OMPD_target_simd:
case OMPD_target_parallel:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
case OMPD_target_teams_distribute:
case OMPD_target_teams_distribute_simd:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
case OMPD_parallel:
case OMPD_for:
case OMPD_parallel_for:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_for_simd:
case OMPD_parallel_for_simd:
case OMPD_cancel:
case OMPD_cancellation_point:
case OMPD_ordered:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_task:
case OMPD_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_taskgroup:
case OMPD_atomic:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_teams:
case OMPD_target_data:
case OMPD_target_exit_data:
case OMPD_target_enter_data:
case OMPD_distribute:
case OMPD_distribute_simd:
case OMPD_distribute_parallel_for:
case OMPD_distribute_parallel_for_simd:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd:
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_target_update:
case OMPD_declare_simd:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_master_taskloop:
case OMPD_master_taskloop_simd:
case OMPD_parallel_master_taskloop:
case OMPD_parallel_master_taskloop_simd:
case OMPD_requires:
case OMPD_unknown:
default:
llvm_unreachable("Unexpected directive.");
}
}
return false;
}
static bool supportsSPMDExecutionMode(ASTContext &Ctx,
const OMPExecutableDirective &D) {
OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
switch (DirectiveKind) {
case OMPD_target:
case OMPD_target_teams:
return hasNestedSPMDDirective(Ctx, D);
case OMPD_target_parallel:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
case OMPD_target_simd:
case OMPD_target_teams_distribute_simd:
return true;
case OMPD_target_teams_distribute:
return false;
case OMPD_parallel:
case OMPD_for:
case OMPD_parallel_for:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_for_simd:
case OMPD_parallel_for_simd:
case OMPD_cancel:
case OMPD_cancellation_point:
case OMPD_ordered:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_task:
case OMPD_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_taskgroup:
case OMPD_atomic:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_teams:
case OMPD_target_data:
case OMPD_target_exit_data:
case OMPD_target_enter_data:
case OMPD_distribute:
case OMPD_distribute_simd:
case OMPD_distribute_parallel_for:
case OMPD_distribute_parallel_for_simd:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd:
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_target_update:
case OMPD_declare_simd:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_master_taskloop:
case OMPD_master_taskloop_simd:
case OMPD_parallel_master_taskloop:
case OMPD_parallel_master_taskloop_simd:
case OMPD_requires:
case OMPD_unknown:
default:
break;
}
llvm_unreachable(
"Unknown programming model for OpenMP directive on NVPTX target.");
}
/// Check if the loop-based directive either has no schedule clause (and no
/// ordered clause) or uses static scheduling.
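/// For example, a loop with no schedule clause or with `schedule(static)` and
/// no `ordered` clause qualifies; `schedule(dynamic)` or `ordered` does not.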
static bool hasStaticScheduling(const OMPExecutableDirective &D) {
assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
isOpenMPLoopDirective(D.getDirectiveKind()) &&
"Expected loop-based directive.");
return !D.hasClausesOfKind<OMPOrderedClause>() &&
(!D.hasClausesOfKind<OMPScheduleClause>() ||
llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
[](const OMPScheduleClause *C) {
return C->getScheduleKind() == OMPC_SCHEDULE_static;
}));
}
/// Check for inner (nested) lightweight runtime construct, if any
static bool hasNestedLightweightDirective(ASTContext &Ctx,
const OMPExecutableDirective &D) {
assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
const auto *CS = D.getInnermostCapturedStmt();
const auto *Body =
CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
if (const auto *NestedDir =
dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
switch (D.getDirectiveKind()) {
case OMPD_target:
if (isOpenMPParallelDirective(DKind) &&
isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
hasStaticScheduling(*NestedDir))
return true;
if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
return true;
if (DKind == OMPD_parallel) {
Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
/*IgnoreCaptured=*/true);
if (!Body)
return false;
ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
if (const auto *NND =
dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
DKind = NND->getDirectiveKind();
if (isOpenMPWorksharingDirective(DKind) &&
isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
return true;
}
} else if (DKind == OMPD_teams) {
Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
/*IgnoreCaptured=*/true);
if (!Body)
return false;
ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
if (const auto *NND =
dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
DKind = NND->getDirectiveKind();
if (isOpenMPParallelDirective(DKind) &&
isOpenMPWorksharingDirective(DKind) &&
isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
return true;
if (DKind == OMPD_parallel) {
Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
/*IgnoreCaptured=*/true);
if (!Body)
return false;
ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
if (const auto *NND =
dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
DKind = NND->getDirectiveKind();
if (isOpenMPWorksharingDirective(DKind) &&
isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
return true;
}
}
}
}
return false;
case OMPD_target_teams:
if (isOpenMPParallelDirective(DKind) &&
isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
hasStaticScheduling(*NestedDir))
return true;
if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
return true;
if (DKind == OMPD_parallel) {
Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
/*IgnoreCaptured=*/true);
if (!Body)
return false;
ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
if (const auto *NND =
dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
DKind = NND->getDirectiveKind();
if (isOpenMPWorksharingDirective(DKind) &&
isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
return true;
}
}
return false;
case OMPD_target_parallel:
if (DKind == OMPD_simd)
return true;
return isOpenMPWorksharingDirective(DKind) &&
isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
case OMPD_target_teams_distribute:
case OMPD_target_simd:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
case OMPD_target_teams_distribute_simd:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
case OMPD_parallel:
case OMPD_for:
case OMPD_parallel_for:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_for_simd:
case OMPD_parallel_for_simd:
case OMPD_cancel:
case OMPD_cancellation_point:
case OMPD_ordered:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_task:
case OMPD_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_taskgroup:
case OMPD_atomic:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_teams:
case OMPD_target_data:
case OMPD_target_exit_data:
case OMPD_target_enter_data:
case OMPD_distribute:
case OMPD_distribute_simd:
case OMPD_distribute_parallel_for:
case OMPD_distribute_parallel_for_simd:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd:
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_target_update:
case OMPD_declare_simd:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_master_taskloop:
case OMPD_master_taskloop_simd:
case OMPD_parallel_master_taskloop:
case OMPD_parallel_master_taskloop_simd:
case OMPD_requires:
case OMPD_unknown:
default:
llvm_unreachable("Unexpected directive.");
}
}
return false;
}
/// Checks if the construct supports the lightweight runtime. It must be an
/// SPMD construct with an inner loop-based construct that uses static
/// scheduling.
static bool supportsLightweightRuntime(ASTContext &Ctx,
const OMPExecutableDirective &D) {
if (!supportsSPMDExecutionMode(Ctx, D))
return false;
OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
switch (DirectiveKind) {
case OMPD_target:
case OMPD_target_teams:
case OMPD_target_parallel:
return hasNestedLightweightDirective(Ctx, D);
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
// (Last|First)-privates must be shared in parallel region.
return hasStaticScheduling(D);
case OMPD_target_simd:
case OMPD_target_teams_distribute_simd:
return true;
case OMPD_target_teams_distribute:
return false;
case OMPD_parallel:
case OMPD_for:
case OMPD_parallel_for:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_for_simd:
case OMPD_parallel_for_simd:
case OMPD_cancel:
case OMPD_cancellation_point:
case OMPD_ordered:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_task:
case OMPD_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_taskgroup:
case OMPD_atomic:
case OMPD_flush:
case OMPD_depobj:
case OMPD_scan:
case OMPD_teams:
case OMPD_target_data:
case OMPD_target_exit_data:
case OMPD_target_enter_data:
case OMPD_distribute:
case OMPD_distribute_simd:
case OMPD_distribute_parallel_for:
case OMPD_distribute_parallel_for_simd:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd:
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_target_update:
case OMPD_declare_simd:
case OMPD_declare_variant:
case OMPD_begin_declare_variant:
case OMPD_end_declare_variant:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_declare_reduction:
case OMPD_declare_mapper:
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_master_taskloop:
case OMPD_master_taskloop_simd:
case OMPD_parallel_master_taskloop:
case OMPD_parallel_master_taskloop_simd:
case OMPD_requires:
case OMPD_unknown:
default:
break;
}
llvm_unreachable(
"Unknown programming model for OpenMP directive on NVPTX target.");
}
void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
StringRef ParentName,
llvm::Function *&OutlinedFn,
llvm::Constant *&OutlinedFnID,
bool IsOffloadEntry,
const RegionCodeGenTy &CodeGen) {
ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
EntryFunctionState EST;
WrapperFunctionsMap.clear();
// Emit target region as a standalone region.
class NVPTXPrePostActionTy : public PrePostActionTy {
CGOpenMPRuntimeGPU::EntryFunctionState &EST;
public:
NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
: EST(EST) {}
void Enter(CodeGenFunction &CGF) override {
auto &RT =
static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
RT.emitKernelInit(CGF, EST, /* IsSPMD */ false);
// Skip target region initialization.
RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
}
void Exit(CodeGenFunction &CGF) override {
auto &RT =
static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
RT.clearLocThreadIdInsertPt(CGF);
RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
}
} Action(EST);
CodeGen.setAction(Action);
IsInTTDRegion = true;
emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
IsOffloadEntry, CodeGen);
IsInTTDRegion = false;
}
void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF,
EntryFunctionState &EST, bool IsSPMD) {
CGBuilderTy &Bld = CGF.Builder;
Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD, requiresFullRuntime()));
IsInTargetMasterThreadRegion = IsSPMD;
if (!IsSPMD)
emitGenericVarsProlog(CGF, EST.Loc);
}
void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
EntryFunctionState &EST,
bool IsSPMD) {
if (!IsSPMD)
emitGenericVarsEpilog(CGF);
CGBuilderTy &Bld = CGF.Builder;
OMPBuilder.createTargetDeinit(Bld, IsSPMD, requiresFullRuntime());
}
void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
StringRef ParentName,
llvm::Function *&OutlinedFn,
llvm::Constant *&OutlinedFnID,
bool IsOffloadEntry,
const RegionCodeGenTy &CodeGen) {
ExecutionRuntimeModesRAII ModeRAII(
CurrentExecutionMode, RequiresFullRuntime,
CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
!supportsLightweightRuntime(CGM.getContext(), D));
EntryFunctionState EST;
// Emit target region as a standalone region.
class NVPTXPrePostActionTy : public PrePostActionTy {
CGOpenMPRuntimeGPU &RT;
CGOpenMPRuntimeGPU::EntryFunctionState &EST;
public:
NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
CGOpenMPRuntimeGPU::EntryFunctionState &EST)
: RT(RT), EST(EST) {}
void Enter(CodeGenFunction &CGF) override {
RT.emitKernelInit(CGF, EST, /* IsSPMD */ true);
// Skip target region initialization.
RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
}
void Exit(CodeGenFunction &CGF) override {
RT.clearLocThreadIdInsertPt(CGF);
RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
}
} Action(*this, EST);
CodeGen.setAction(Action);
IsInTTDRegion = true;
emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
IsOffloadEntry, CodeGen);
IsInTTDRegion = false;
}
// Create a unique global variable to indicate the execution mode of this target
// region. The execution mode is either 'generic', or 'spmd' depending on the
// target directive. This variable is picked up by the offload library to setup
// the device appropriately before kernel launch. If the execution mode is
// 'generic', the runtime reserves one warp for the master, otherwise, all
// warps participate in parallel work.
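// For example, a kernel `foo` gets a companion global roughly of the form
// `@foo_exec_mode = weak constant i8 <OMP_TGT_EXEC_MODE_SPMD|GENERIC>`.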
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
bool Mode) {
auto *GVMode = new llvm::GlobalVariable(
CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
llvm::GlobalValue::WeakAnyLinkage,
llvm::ConstantInt::get(CGM.Int8Ty, Mode ? OMP_TGT_EXEC_MODE_SPMD
: OMP_TGT_EXEC_MODE_GENERIC),
Twine(Name, "_exec_mode"));
CGM.addCompilerUsedGlobal(GVMode);
}
void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID,
llvm::Constant *Addr,
uint64_t Size, int32_t,
llvm::GlobalValue::LinkageTypes) {
// TODO: Add support for global variables on the device after declare target
// support.
if (!isa<llvm::Function>(Addr))
return;
llvm::Module &M = CGM.getModule();
llvm::LLVMContext &Ctx = CGM.getLLVMContext();
// Get "nvvm.annotations" metadata node
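// Each operand marks one kernel entry point, roughly !{<fn>, !"kernel", i32 1}.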
llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
llvm::Metadata *MDVals[] = {
llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
llvm::ConstantAsMetadata::get(
llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
// Append metadata to nvvm.annotations
MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
}
void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
const OMPExecutableDirective &D, StringRef ParentName,
llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
if (!IsOffloadEntry) // Nothing to do.
return;
assert(!ParentName.empty() && "Invalid target region parent name!");
bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
if (Mode)
emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
CodeGen);
else
emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
CodeGen);
setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}
namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Enum for accessing the reserved_2 field of the ident_t struct.
enum ModeFlagsTy : unsigned {
/// Bit set to 1 when in SPMD mode.
KMP_IDENT_SPMD_MODE = 0x01,
/// Bit set to 1 when a simplified runtime is used.
KMP_IDENT_SIMPLE_RT_MODE = 0x02,
LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
};
/// Special Undefined mode: the combination of non-SPMD mode and the simple
/// runtime.
static const ModeFlagsTy UndefinedMode =
(~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
} // anonymous namespace
unsigned CGOpenMPRuntimeGPU::getDefaultLocationReserved2Flags() const {
switch (getExecutionMode()) {
case EM_SPMD:
if (requiresFullRuntime())
return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
case EM_NonSPMD:
assert(requiresFullRuntime() && "Expected full runtime.");
return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
case EM_Unknown:
return UndefinedMode;
}
llvm_unreachable("Unknown flags are requested.");
}
CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
: CGOpenMPRuntime(CGM, "_", "$") {
if (!CGM.getLangOpts().OpenMPIsDevice)
llvm_unreachable("OpenMP can only handle device code.");
llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
if (CGM.getLangOpts().OpenMPTargetNewRuntime) {
OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
"__omp_rtl_debug_kind");
OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
"__omp_rtl_assume_teams_oversubscription");
OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
"__omp_rtl_assume_threads_oversubscription");
}
}
void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
ProcBindKind ProcBind,
SourceLocation Loc) {
// Do nothing in case of SPMD mode and L0 parallel.
if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
return;
CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
}
void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
llvm::Value *NumThreads,
SourceLocation Loc) {
// Do nothing in case of SPMD mode and L0 parallel.
if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
return;
CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
}
void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
const Expr *NumTeams,
const Expr *ThreadLimit,
SourceLocation Loc) {}
llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
// Emit target region as a standalone region.
class NVPTXPrePostActionTy : public PrePostActionTy {
bool &IsInParallelRegion;
bool PrevIsInParallelRegion;
public:
NVPTXPrePostActionTy(bool &IsInParallelRegion)
: IsInParallelRegion(IsInParallelRegion) {}
void Enter(CodeGenFunction &CGF) override {
PrevIsInParallelRegion = IsInParallelRegion;
IsInParallelRegion = true;
}
void Exit(CodeGenFunction &CGF) override {
IsInParallelRegion = PrevIsInParallelRegion;
}
} Action(IsInParallelRegion);
CodeGen.setAction(Action);
bool PrevIsInTTDRegion = IsInTTDRegion;
IsInTTDRegion = false;
bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
IsInTargetMasterThreadRegion = false;
auto *OutlinedFun =
cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
D, ThreadIDVar, InnermostKind, CodeGen));
IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
IsInTTDRegion = PrevIsInTTDRegion;
if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD &&
!IsInParallelRegion) {
llvm::Function *WrapperFun =
createParallelDataSharingWrapper(OutlinedFun, D);
WrapperFunctionsMap[OutlinedFun] = WrapperFun;
}
return OutlinedFun;
}
/// Get list of lastprivate variables from the teams distribute ... or
/// teams {distribute ...} directives.
static void
getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
"expected teams directive.");
const OMPExecutableDirective *Dir = &D;
if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
Ctx,
D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
/*IgnoreCaptured=*/true))) {
Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
Dir = nullptr;
}
}
if (!Dir)
return;
for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
for (const Expr *E : C->getVarRefs())
Vars.push_back(getPrivateItem(E));
}
}
/// Get list of reduction variables from the teams ... directives.
static void
getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
"expected teams directive.");
for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
for (const Expr *E : C->privates())
Vars.push_back(getPrivateItem(E));
}
}
llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
SourceLocation Loc = D.getBeginLoc();
const RecordDecl *GlobalizedRD = nullptr;
llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
// Globalize team reductions variable unconditionally in all modes.
if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
if (!LastPrivatesReductions.empty()) {
GlobalizedRD = ::buildRecordForGlobalizedVars(
CGM.getContext(), llvm::None, LastPrivatesReductions,
MappedDeclsFields, WarpSize);
}
} else if (!LastPrivatesReductions.empty()) {
assert(!TeamAndReductions.first &&
"Previous team declaration is not expected.");
TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
std::swap(TeamAndReductions.second, LastPrivatesReductions);
}
// Emit target region as a standalone region.
class NVPTXPrePostActionTy : public PrePostActionTy {
SourceLocation &Loc;
const RecordDecl *GlobalizedRD;
llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
&MappedDeclsFields;
public:
NVPTXPrePostActionTy(
SourceLocation &Loc, const RecordDecl *GlobalizedRD,
llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
&MappedDeclsFields)
: Loc(Loc), GlobalizedRD(GlobalizedRD),
MappedDeclsFields(MappedDeclsFields) {}
void Enter(CodeGenFunction &CGF) override {
auto &Rt =
static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
if (GlobalizedRD) {
auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
I->getSecond().MappedParams =
std::make_unique<CodeGenFunction::OMPMapVars>();
DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
for (const auto &Pair : MappedDeclsFields) {
assert(Pair.getFirst()->isCanonicalDecl() &&
"Expected canonical declaration");
Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
}
}
Rt.emitGenericVarsProlog(CGF, Loc);
}
void Exit(CodeGenFunction &CGF) override {
static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
.emitGenericVarsEpilog(CGF);
}
} Action(Loc, GlobalizedRD, MappedDeclsFields);
CodeGen.setAction(Action);
llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
D, ThreadIDVar, InnermostKind, CodeGen);
return OutlinedFun;
}
void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
SourceLocation Loc,
bool WithSPMDCheck) {
if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
return;
CGBuilderTy &Bld = CGF.Builder;
const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
if (I == FunctionGlobalizedDecls.end())
return;
for (auto &Rec : I->getSecond().LocalVarData) {
const auto *VD = cast<VarDecl>(Rec.first);
bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
QualType VarTy = VD->getType();
// Get the local allocation of a firstprivate variable before sharing
llvm::Value *ParValue;
if (EscapedParam) {
LValue ParLVal =
CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
}
// Allocate space for the variable to be globalized
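// For example, for an `int x` this becomes roughly
//   %x = call i8* @__kmpc_alloc_shared(i64 4)
// followed by the cast to `i32*` named "x_on_stack" below.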
llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
llvm::Instruction *VoidPtr =
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
// Cast the void pointer and get the address of the globalized variable.
llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
Rec.second.GlobalizedVal = VoidPtr;
// Assign the local allocation to the newly globalized location.
if (EscapedParam) {
CGF.EmitStoreOfScalar(ParValue, VarAddr);
I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
}
if (auto *DI = CGF.getDebugInfo())
VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
}
for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) {
// Use actual memory size of the VLA object including the padding
// for alignment purposes.
llvm::Value *Size = CGF.getTypeSize(VD->getType());
CharUnits Align = CGM.getContext().getDeclAlign(VD);
Size = Bld.CreateNUWAdd(
Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
llvm::Value *AlignVal =
llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
Size = Bld.CreateUDiv(Size, AlignVal);
Size = Bld.CreateNUWMul(Size, AlignVal);
// Allocate space for this VLA object to be globalized.
llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
llvm::Instruction *VoidPtr =
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(
std::pair<llvm::Value *, llvm::Value *>(
{VoidPtr, CGF.getTypeSize(VD->getType())}));
LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(),
CGM.getContext().getDeclAlign(VD),
AlignmentSource::Decl);
I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
Base.getAddress(CGF));
}
I->getSecond().MappedParams->apply(CGF);
}
void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
bool WithSPMDCheck) {
if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
return;
const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
if (I != FunctionGlobalizedDecls.end()) {
// Deallocate the memory for each globalized VLA object
for (auto AddrSizePair :
llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
{AddrSizePair.first, AddrSizePair.second});
}
// Deallocate the memory for each globalized value
for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
const auto *VD = cast<VarDecl>(Rec.first);
I->getSecond().MappedParams->restore(CGF);
llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
CGF.getTypeSize(VD->getType())};
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
FreeArgs);
}
}
}
void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
const OMPExecutableDirective &D,
SourceLocation Loc,
llvm::Function *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars) {
if (!CGF.HaveInsertPoint())
return;
Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
/*Name=*/".zero.addr");
CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
OutlinedFnArgs.push_back(ZeroAddr.getPointer());
OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
}
void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
SourceLocation Loc,
llvm::Function *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars,
const Expr *IfCond) {
if (!CGF.HaveInsertPoint())
return;
auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars,
IfCond](CodeGenFunction &CGF, PrePostActionTy &Action) {
CGBuilderTy &Bld = CGF.Builder;
llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
if (WFn)
ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
// Create a private scope that will globalize the arguments
// passed from the outside of the target region.
// TODO: Is that needed?
CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
"captured_vars_addrs");
// There's something to share.
if (!CapturedVars.empty()) {
// Prepare for parallel region. Indicate the outlined function.
ASTContext &Ctx = CGF.getContext();
unsigned Idx = 0;
for (llvm::Value *V : CapturedVars) {
Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
llvm::Value *PtrV;
if (V->getType()->isIntegerTy())
PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
else
PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
Ctx.getPointerType(Ctx.VoidPtrTy));
++Idx;
}
}
llvm::Value *IfCondVal = nullptr;
if (IfCond)
IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
/* isSigned */ false);
else
IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
assert(IfCondVal && "Expected a value");
llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
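// Arguments of __kmpc_parallel_51: location, global thread id, if-condition,
// num_threads (-1 means "not specified"), proc_bind (-1 likewise), outlined
// function, wrapper function (or null), captured-variable array, and its size.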
llvm::Value *Args[] = {
RTLoc,
getThreadID(CGF, Loc),
IfCondVal,
llvm::ConstantInt::get(CGF.Int32Ty, -1),
llvm::ConstantInt::get(CGF.Int32Ty, -1),
FnPtr,
ID,
Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
CGF.VoidPtrPtrTy),
llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_parallel_51),
Args);
};
RegionCodeGenTy RCG(ParallelGen);
RCG(CGF);
}
void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
// Always emit simple barriers!
if (!CGF.HaveInsertPoint())
return;
// Build call __kmpc_barrier_simple_spmd(nullptr, 0);
// This function does not use parameters, so we can emit just default values.
llvm::Value *Args[] = {
llvm::ConstantPointerNull::get(
cast<llvm::PointerType>(getIdentTyPointerTy())),
llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
Args);
}
void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
SourceLocation Loc,
OpenMPDirectiveKind Kind, bool,
bool) {
// Always emit simple barriers!
if (!CGF.HaveInsertPoint())
return;
// Build call __kmpc_cancel_barrier(loc, thread_id);
unsigned Flags = getDefaultFlagsForBarriers(Kind);
llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
getThreadID(CGF, Loc)};
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_barrier),
Args);
}
void CGOpenMPRuntimeGPU::emitCriticalRegion(
CodeGenFunction &CGF, StringRef CriticalName,
const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
const Expr *Hint) {
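// The emitted code serializes the team; roughly:
//   for (counter = 0; counter < num_threads; ++counter) {
//     if (thread_id == counter)
//       <critical body>;
//     __kmpc_syncwarp(mask);
//   }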
llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
// Get the mask of active threads in the warp.
llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
// Fetch team-local id of the thread.
llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
// Get the width of the team.
llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
// Initialize the counter variable for the loop.
QualType Int32Ty =
CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
/*isInit=*/true);
// Block checks if loop counter exceeds upper bound.
CGF.EmitBlock(LoopBB);
llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
// Block tests which single thread should execute the region, and which
// threads should go straight to the synchronisation point.
CGF.EmitBlock(TestBB);
CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
llvm::Value *CmpThreadToCounter =
CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
// Block emits the body of the critical region.
CGF.EmitBlock(BodyBB);
// Output the critical statement.
CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
Hint);
// After the body surrounded by the critical region, the single executing
// thread will jump to the synchronisation point.
// Block waits for all threads in current team to finish then increments the
// counter variable and returns to the loop.
CGF.EmitBlock(SyncBB);
// Reconverge active threads in the warp.
(void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_syncwarp),
Mask);
llvm::Value *IncCounterVal =
CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
CGF.EmitBranch(LoopBB);
// Block that is reached when all threads in the team complete the region.
CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
}
/// Cast value to the specified type.
static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
QualType ValTy, QualType CastTy,
SourceLocation Loc) {
assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
"Cast type must be sized.");
assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
"Val type must be sized.");
llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
if (ValTy == CastTy)
return Val;
if (CGF.getContext().getTypeSizeInChars(ValTy) ==
CGF.getContext().getTypeSizeInChars(CastTy))
return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
if (CastTy->isIntegerType() && ValTy->isIntegerType())
return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
CastTy->hasSignedIntegerRepresentation());
Address CastItem = CGF.CreateMemTemp(CastTy);
Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
LValueBaseInfo(AlignmentSource::Type),
TBAAAccessInfo());
return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
LValueBaseInfo(AlignmentSource::Type),
TBAAAccessInfo());
}
/// This function creates calls to one of two shuffle functions to copy
/// variables between lanes in a warp.
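/// For example, a 4-byte float is bitcast to i32, shuffled with
/// __kmpc_shuffle_int32, and then cast back; wider elements use
/// __kmpc_shuffle_int64.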
static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
llvm::Value *Elem,
QualType ElemType,
llvm::Value *Offset,
SourceLocation Loc) {
CodeGenModule &CGM = CGF.CGM;
CGBuilderTy &Bld = CGF.Builder;
CGOpenMPRuntimeGPU &RT =
*(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
assert(Size.getQuantity() <= 8 &&
"Unsupported bitwidth in shuffle instruction.");
RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
? OMPRTL___kmpc_shuffle_int32
: OMPRTL___kmpc_shuffle_int64;
// Cast all types to 32- or 64-bit values before calling shuffle routines.
QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
llvm::Value *WarpSize =
Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
{ElemCast, Offset, WarpSize});
return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
}
static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
Address DestAddr, QualType ElemType,
llvm::Value *Offset, SourceLocation Loc) {
CGBuilderTy &Bld = CGF.Builder;
CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
// Create the loop over the big-sized data.
// ptr = (void*)Elem;
// ptrEnd = (void*) Elem + 1;
// Step = 8;
// while (ptr + Step < ptrEnd)
// shuffle((int64_t)*ptr);
// Step = 4;
// while (ptr + Step < ptrEnd)
// shuffle((int32_t)*ptr);
// ...
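// For example (an illustrative sketch of the loop below): a 7-byte element
// is moved as one 4-byte, one 2-byte and one 1-byte shuffle, while a
// 24-byte element is moved by an emitted runtime loop of three 8-byte
// shuffles.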
Address ElemPtr = DestAddr;
Address Ptr = SrcAddr;
Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy);
for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
if (Size < CharUnits::fromQuantity(IntSize))
continue;
QualType IntType = CGF.getContext().getIntTypeForBitwidth(
CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
/*Signed=*/1);
llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
ElemPtr =
Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
if (Size.getQuantity() / IntSize > 1) {
llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
CGF.EmitBlock(PreCondBB);
llvm::PHINode *PhiSrc =
Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
llvm::PHINode *PhiDest =
Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
Ptr = Address(PhiSrc, Ptr.getAlignment());
ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
llvm::Value *PtrDiff = Bld.CreatePtrDiff(
PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
Ptr.getPointer(), CGF.VoidPtrTy));
Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
ThenBB, ExitBB);
CGF.EmitBlock(ThenBB);
llvm::Value *Res = createRuntimeShuffleFunction(
CGF,
CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
LValueBaseInfo(AlignmentSource::Type),
TBAAAccessInfo()),
IntType, Offset, Loc);
CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
LValueBaseInfo(AlignmentSource::Type),
TBAAAccessInfo());
Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
CGF.EmitBranch(PreCondBB);
CGF.EmitBlock(ExitBB);
} else {
llvm::Value *Res = createRuntimeShuffleFunction(
CGF,
CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
LValueBaseInfo(AlignmentSource::Type),
TBAAAccessInfo()),
IntType, Offset, Loc);
CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
LValueBaseInfo(AlignmentSource::Type),
TBAAAccessInfo());
Ptr = Bld.CreateConstGEP(Ptr, 1);
ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
}
Size = Size % IntSize;
}
}
namespace {
enum CopyAction : unsigned {
// RemoteLaneToThread: Copy over a Reduce list from a remote lane in
// the warp using shuffle instructions.
RemoteLaneToThread,
// ThreadCopy: Make a copy of a Reduce list on the thread's stack.
ThreadCopy,
// ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
ThreadToScratchpad,
// ScratchpadToThread: Copy from a scratchpad array in global memory
// containing team-reduced data to a thread's stack.
ScratchpadToThread,
};
} // namespace
struct CopyOptionsTy {
llvm::Value *RemoteLaneOffset;
llvm::Value *ScratchpadIndex;
llvm::Value *ScratchpadWidth;
};
/// Emit instructions to copy a Reduce list, which contains partially
/// aggregated values, in the specified direction.
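/// A sketch of the RemoteLaneToThread case for element i (illustrative
/// pseudocode only):
///   SrcElemPtr = SrcReduceList[i];
///   RemoteElem = alloca(<type of element i>);
///   *RemoteElem = shuffle(*SrcElemPtr, RemoteLaneOffset);
///   DestReduceList[i] = (void *)RemoteElem;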
static void emitReductionListCopy(
CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
CodeGenModule &CGM = CGF.CGM;
ASTContext &C = CGM.getContext();
CGBuilderTy &Bld = CGF.Builder;
llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
// Iterate, element-by-element, through the source Reduce list and
// make a copy.
unsigned Idx = 0;
unsigned Size = Privates.size();
for (const Expr *Private : Privates) {
Address SrcElementAddr = Address::invalid();
Address DestElementAddr = Address::invalid();
Address DestElementPtrAddr = Address::invalid();
// Should we shuffle in an element from a remote lane?
bool ShuffleInElement = false;
// Set to true to update the pointer in the dest Reduce list to a
// newly created element.
bool UpdateDestListPtr = false;
// Increment the src or dest pointer to the scratchpad, for each
// new element.
bool IncrScratchpadSrc = false;
bool IncrScratchpadDest = false;
switch (Action) {
case RemoteLaneToThread: {
// Step 1.1: Get the address for the src element in the Reduce list.
Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
SrcElementAddr = CGF.EmitLoadOfPointer(
SrcElementPtrAddr,
C.getPointerType(Private->getType())->castAs<PointerType>());
// Step 1.2: Create a temporary to store the element in the destination
// Reduce list.
DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
DestElementAddr =
CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
ShuffleInElement = true;
UpdateDestListPtr = true;
break;
}
case ThreadCopy: {
// Step 1.1: Get the address for the src element in the Reduce list.
Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
SrcElementAddr = CGF.EmitLoadOfPointer(
SrcElementPtrAddr,
C.getPointerType(Private->getType())->castAs<PointerType>());
// Step 1.2: Get the address for dest element. The destination
// element has already been created on the thread's stack.
DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
DestElementAddr = CGF.EmitLoadOfPointer(
DestElementPtrAddr,
C.getPointerType(Private->getType())->castAs<PointerType>());
break;
}
case ThreadToScratchpad: {
// Step 1.1: Get the address for the src element in the Reduce list.
Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
SrcElementAddr = CGF.EmitLoadOfPointer(
SrcElementPtrAddr,
C.getPointerType(Private->getType())->castAs<PointerType>());
// Step 1.2: Get the address for dest element:
// address = base + index * ElementSizeInChars.
llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
llvm::Value *CurrentOffset =
Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
llvm::Value *ScratchPadElemAbsolutePtrVal =
Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
ScratchPadElemAbsolutePtrVal =
Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
C.getTypeAlignInChars(Private->getType()));
IncrScratchpadDest = true;
break;
}
case ScratchpadToThread: {
// Step 1.1: Get the address for the src element in the scratchpad.
// address = base + index * ElementSizeInChars.
llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
llvm::Value *CurrentOffset =
Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
llvm::Value *ScratchPadElemAbsolutePtrVal =
Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
ScratchPadElemAbsolutePtrVal =
Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
C.getTypeAlignInChars(Private->getType()));
IncrScratchpadSrc = true;
// Step 1.2: Create a temporary to store the element in the destination
// Reduce list.
DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
DestElementAddr =
CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
UpdateDestListPtr = true;
break;
}
}
// Regardless of the src and dest of the copy, we emit the load of the src
// element, as it is required in all copy directions.
SrcElementAddr = Bld.CreateElementBitCast(
SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
SrcElementAddr.getElementType());
// Now that all active lanes have read the element in the
// Reduce list, shuffle over the value from the remote lane.
if (ShuffleInElement) {
shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
RemoteLaneOffset, Private->getExprLoc());
} else {
switch (CGF.getEvaluationKind(Private->getType())) {
case TEK_Scalar: {
llvm::Value *Elem = CGF.EmitLoadOfScalar(
SrcElementAddr, /*Volatile=*/false, Private->getType(),
Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
TBAAAccessInfo());
// Store the source element value to the dest element address.
CGF.EmitStoreOfScalar(
Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
break;
}
case TEK_Complex: {
CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
Private->getExprLoc());
CGF.EmitStoreOfComplex(
Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
/*isInit=*/false);
break;
}
case TEK_Aggregate:
CGF.EmitAggregateCopy(
CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
Private->getType(), AggValueSlot::DoesNotOverlap);
break;
}
}
// Step 3.1: Modify reference in dest Reduce list as needed.
// Modifying the reference in Reduce list to point to the newly
// created element. The element is live in the current function
// scope and that of functions it invokes (i.e., reduce_function).
// RemoteReduceData[i] = (void*)&RemoteElem
if (UpdateDestListPtr) {
CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
DestElementAddr.getPointer(), CGF.VoidPtrTy),
DestElementPtrAddr, /*Volatile=*/false,
C.VoidPtrTy);
}
// Step 4.1: Increment SrcBase/DestBase so that it points to the starting
// address of the next element in scratchpad memory, unless we're currently
// processing the last one. Memory alignment is also taken care of here.
if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
llvm::Value *ScratchpadBasePtr =
IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
ScratchpadBasePtr = Bld.CreateNUWAdd(
ScratchpadBasePtr,
Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
// Take care of global memory alignment for performance
ScratchpadBasePtr = Bld.CreateNUWSub(
ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
ScratchpadBasePtr = Bld.CreateUDiv(
ScratchpadBasePtr,
llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
ScratchpadBasePtr = Bld.CreateNUWAdd(
ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
ScratchpadBasePtr = Bld.CreateNUWMul(
ScratchpadBasePtr,
llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
if (IncrScratchpadDest)
DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
else /* IncrScratchpadSrc = true */
SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
}
++Idx;
}
}
/// This function emits a helper that gathers Reduce lists from the first
/// lane of every active warp to lanes in the first warp.
///
/// void inter_warp_copy_func(void* reduce_data, num_warps)
/// shared smem[warp_size];
/// For all data entries D in reduce_data:
/// sync
/// If (I am the first lane in each warp)
/// Copy my local D to smem[warp_id]
/// sync
/// if (I am the first warp)
/// Copy smem[thread_id] to my local D
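///
/// Because the shared transfer medium is an array of 32-bit slots, each entry
/// D wider than four bytes is moved in 32-bit chunks. A sketch for a
/// double-precision D (illustrative only):
///   for (cnt = 0; cnt < 2; ++cnt)
///     sync; lane 0 of each warp stores chunk cnt of D to smem[warp_id]
///     sync; warp 0 loads chunk cnt from smem[thread_id] into its local D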
static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
ArrayRef<const Expr *> Privates,
QualType ReductionArrayTy,
SourceLocation Loc) {
ASTContext &C = CGM.getContext();
llvm::Module &M = CGM.getModule();
// ReduceList: thread local Reduce list.
// At the stage of the computation when this function is called, partially
// aggregated values reside in the first lane of every active warp.
ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
C.VoidPtrTy, ImplicitParamDecl::Other);
// NumWarps: number of warps active in the parallel region. This could
// be smaller than 32 (max warps in a CTA) for partial block reduction.
ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
C.getIntTypeForBitwidth(32, /* Signed */ true),
ImplicitParamDecl::Other);
FunctionArgList Args;
Args.push_back(&ReduceListArg);
Args.push_back(&NumWarpsArg);
const CGFunctionInfo &CGFI =
CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
llvm::GlobalValue::InternalLinkage,
"_omp_reduction_inter_warp_copy_func", &M);
CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Fn->setDoesNotRecurse();
CodeGenFunction CGF(CGM);
CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
CGBuilderTy &Bld = CGF.Builder;
// This array is used as a medium to transfer, one reduce element at a time,
// the data from the first lane of every warp to lanes in the first warp
// in order to perform the final step of a reduction in a parallel region
// (reduction across warps). The array is placed in NVPTX __shared__ memory
// for reduced latency, as well as to have a distinct copy for concurrently
// executing target regions. The array is declared with weak linkage so
// as to be shared across compilation units.
StringRef TransferMediumName =
"__openmp_nvptx_data_transfer_temporary_storage";
llvm::GlobalVariable *TransferMedium =
M.getGlobalVariable(TransferMediumName);
unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
if (!TransferMedium) {
auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
TransferMedium = new llvm::GlobalVariable(
M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
llvm::UndefValue::get(Ty), TransferMediumName,
/*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
SharedAddressSpace);
CGM.addCompilerUsedGlobal(TransferMedium);
}
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
// Get the CUDA thread id of the current OpenMP thread on the GPU.
llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
// nvptx_lane_id = nvptx_id % warpsize
llvm::Value *LaneID = getNVPTXLaneID(CGF);
// nvptx_warp_id = nvptx_id / warpsize
llvm::Value *WarpID = getNVPTXWarpID(CGF);
Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
Address LocalReduceList(
Bld.CreatePointerBitCastOrAddrSpaceCast(
CGF.EmitLoadOfScalar(
AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
CGF.getPointerAlign());
unsigned Idx = 0;
for (const Expr *Private : Privates) {
//
// Warp master copies reduce element to transfer medium in __shared__
// memory.
//
unsigned RealTySize =
C.getTypeSizeInChars(Private->getType())
.alignTo(C.getTypeAlignInChars(Private->getType()))
.getQuantity();
for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
unsigned NumIters = RealTySize / TySize;
if (NumIters == 0)
continue;
QualType CType = C.getIntTypeForBitwidth(
C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
CharUnits Align = CharUnits::fromQuantity(TySize);
llvm::Value *Cnt = nullptr;
Address CntAddr = Address::invalid();
llvm::BasicBlock *PrecondBB = nullptr;
llvm::BasicBlock *ExitBB = nullptr;
if (NumIters > 1) {
CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
/*Volatile=*/false, C.IntTy);
PrecondBB = CGF.createBasicBlock("precond");
ExitBB = CGF.createBasicBlock("exit");
llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
// There is no need to emit a line number for an unconditional branch.
(void)ApplyDebugLocation::CreateEmpty(CGF);
CGF.EmitBlock(PrecondBB);
Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
llvm::Value *Cmp =
Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
CGF.EmitBlock(BodyBB);
}
// kmpc_barrier.
CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
/*EmitChecks=*/false,
/*ForceSimpleCall=*/true);
llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
// if (lane_id == 0)
llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
CGF.EmitBlock(ThenBB);
// Reduce element = LocalReduceList[i]
Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
// elemptr = ((CopyType*)(elemptrptr)) + I
Address ElemPtr = Address(ElemPtrPtr, Align);
ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
if (NumIters > 1) {
ElemPtr = Address(Bld.CreateGEP(ElemPtr.getElementType(),
ElemPtr.getPointer(), Cnt),
ElemPtr.getAlignment());
}
// Get pointer to location in transfer medium.
// MediumPtr = &medium[warp_id]
llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
TransferMedium->getValueType(), TransferMedium,
{llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
Address MediumPtr(MediumPtrVal, Align);
// Casting to actual data type.
// MediumPtr = (CopyType*)MediumPtrAddr;
MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);
// elem = *elemptr
// *MediumPtr = elem
llvm::Value *Elem = CGF.EmitLoadOfScalar(
ElemPtr, /*Volatile=*/false, CType, Loc,
LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
// Store the source element value to the dest element address.
CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
LValueBaseInfo(AlignmentSource::Type),
TBAAAccessInfo());
Bld.CreateBr(MergeBB);
CGF.EmitBlock(ElseBB);
Bld.CreateBr(MergeBB);
CGF.EmitBlock(MergeBB);
// kmpc_barrier.
CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
/*EmitChecks=*/false,
/*ForceSimpleCall=*/true);
//
// Warp 0 copies reduce element from transfer medium.
//
llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
// Up to 32 threads in warp 0 are active.
llvm::Value *IsActiveThread =
Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
CGF.EmitBlock(W0ThenBB);
// SrcMediumPtr = &medium[tid]
llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
TransferMedium->getValueType(), TransferMedium,
{llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
Address SrcMediumPtr(SrcMediumPtrVal, Align);
// SrcMediumVal = *SrcMediumPtr;
SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);
// TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
Address TargetElemPtr = Address(TargetElemPtrVal, Align);
TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
if (NumIters > 1) {
TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getElementType(),
TargetElemPtr.getPointer(), Cnt),
TargetElemPtr.getAlignment());
}
// *TargetElemPtr = SrcMediumVal;
llvm::Value *SrcMediumValue =
CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
CType);
Bld.CreateBr(W0MergeBB);
CGF.EmitBlock(W0ElseBB);
Bld.CreateBr(W0MergeBB);
CGF.EmitBlock(W0MergeBB);
if (NumIters > 1) {
Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
CGF.EmitBranch(PrecondBB);
(void)ApplyDebugLocation::CreateEmpty(CGF);
CGF.EmitBlock(ExitBB);
}
RealTySize %= TySize;
}
++Idx;
}
CGF.FinishFunction();
return Fn;
}
/// Emit a helper that reduces data across two OpenMP threads (lanes)
/// in the same warp. It uses shuffle instructions to copy over data from
/// a remote lane's stack. The reduction algorithm performed is specified
/// by the fourth parameter.
///
/// Algorithm Versions.
/// Full Warp Reduce (argument value 0):
/// This algorithm assumes that all 32 lanes are active and gathers
/// data from these 32 lanes, producing a single resultant value.
/// Contiguous Partial Warp Reduce (argument value 1):
/// This algorithm assumes that only a *contiguous* subset of lanes
/// are active. This happens for the last warp in a parallel region
/// when the user-specified num_threads is not an integer multiple of
/// 32. This contiguous subset always starts with the zeroth lane.
/// Partial Warp Reduce (argument value 2):
/// This algorithm gathers data from any number of lanes at any position.
/// All reduced values are stored in the lowest possible lane. The set
/// of problems every algorithm addresses is a super set of those
/// addressable by algorithms with a lower version number. Overhead
/// increases as algorithm version increases.
///
/// Terminology
/// Reduce element:
/// Reduce element refers to the individual data field with primitive
/// data types to be combined and reduced across threads.
/// Reduce list:
/// Reduce list refers to a collection of local, thread-private
/// reduce elements.
/// Remote Reduce list:
/// Remote Reduce list refers to a collection of remote (relative to
/// the current thread) reduce elements.
///
/// We distinguish between three states of threads that are important to
/// the implementation of this function.
/// Alive threads:
/// Threads in a warp executing the SIMT instruction, as distinguished from
/// threads that are inactive due to divergent control flow.
/// Active threads:
/// The minimal set of threads that has to be alive upon entry to this
/// function. The computation is correct iff active threads are alive.
/// Some threads are alive but they are not active because they do not
/// contribute to the computation in any useful manner. Turning them off
/// may introduce control flow overheads without any tangible benefits.
/// Effective threads:
/// In order to comply with the argument requirements of the shuffle
/// function, we must keep all lanes holding data alive. But at most
/// half of them perform value aggregation; we refer to this half of
/// threads as effective. The other half simply hands off its data.
///
/// Procedure
/// Value shuffle:
/// In this step active threads transfer data from higher lane positions
/// in the warp to lower lane positions, creating Remote Reduce list.
/// Value aggregation:
/// In this step, effective threads combine their thread local Reduce list
/// with Remote Reduce list and store the result in the thread local
/// Reduce list.
/// Value copy:
/// In this step, we deal with the contiguity assumption made by algorithm 1
/// (Contiguous Partial Warp Reduce). When we have an odd number of lanes
/// active, say 2k+1, only k threads will be effective and therefore k
/// new values will be produced. However, the Reduce list owned by the
/// (2k+1)th thread is ignored in the value aggregation. Therefore
/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
/// that the contiguity assumption still holds.
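///
/// A worked example for the value copy step (illustrative): with five active
/// lanes and a remote offset of two, lanes 0 and 1 aggregate the values
/// shuffled over from lanes 2 and 3, and lane 4's Reduce list is copied down
/// to lane 2 so that the three surviving values remain contiguous for the
/// next round.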
static llvm::Function *emitShuffleAndReduceFunction(
CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
ASTContext &C = CGM.getContext();
// Thread local Reduce list used to host the values of data to be reduced.
ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
C.VoidPtrTy, ImplicitParamDecl::Other);
// Current lane id; could be logical.
ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
ImplicitParamDecl::Other);
// Offset of the remote source lane relative to the current lane.
ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
C.ShortTy, ImplicitParamDecl::Other);
// Algorithm version. This is expected to be known at compile time.
ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
C.ShortTy, ImplicitParamDecl::Other);
FunctionArgList Args;
Args.push_back(&ReduceListArg);
Args.push_back(&LaneIDArg);
Args.push_back(&RemoteLaneOffsetArg);
Args.push_back(&AlgoVerArg);
const CGFunctionInfo &CGFI =
CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
auto *Fn = llvm::Function::Create(
CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
"_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Fn->setDoesNotRecurse();
CodeGenFunction CGF(CGM);
CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
CGBuilderTy &Bld = CGF.Builder;
Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
Address LocalReduceList(
Bld.CreatePointerBitCastOrAddrSpaceCast(
CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
C.VoidPtrTy, SourceLocation()),
CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
CGF.getPointerAlign());
Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
// Create a local thread-private variable to host the Reduce list
// from a remote lane.
Address RemoteReduceList =
CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
// This loop iterates through the list of reduce elements and copies,
// element by element, from a remote lane in the warp to RemoteReduceList,
// hosted on the thread's stack.
emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
LocalReduceList, RemoteReduceList,
{/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
/*ScratchpadIndex=*/nullptr,
/*ScratchpadWidth=*/nullptr});
// The actions to be performed on the Remote Reduce list are dependent
// on the algorithm version.
//
// if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
// LaneId % 2 == 0 && Offset > 0):
// do the reduction value aggregation
//
// The thread local variable Reduce list is mutated in place to host the
// reduced data, which is the aggregated value produced from local and
// remote lanes.
//
// Note that AlgoVer is expected to be a constant integer known at compile
// time.
// When AlgoVer==0, the first conjunction evaluates to true, making
// the entire predicate true at compile time.
// When AlgoVer==1, only the second part of the second conjunction needs to
// be evaluated at runtime. The other conjunctions evaluate to false at
// compile time.
// When AlgoVer==2, only the second part of the third conjunction needs to
// be evaluated at runtime. The other conjunctions evaluate to false at
// compile time.
llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
llvm::Value *CondAlgo1 = Bld.CreateAnd(
Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
llvm::Value *CondAlgo2 = Bld.CreateAnd(
Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
CondAlgo2 = Bld.CreateAnd(
CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
CGF.EmitBlock(ThenBB);
// reduce_function(LocalReduceList, RemoteReduceList)
llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
LocalReduceList.getPointer(), CGF.VoidPtrTy);
llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
RemoteReduceList.getPointer(), CGF.VoidPtrTy);
CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
Bld.CreateBr(MergeBB);
CGF.EmitBlock(ElseBB);
Bld.CreateBr(MergeBB);
CGF.EmitBlock(MergeBB);
// if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
// Reduce list.
Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
llvm::Value *CondCopy = Bld.CreateAnd(
Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
CGF.EmitBlock(CpyThenBB);
emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
RemoteReduceList, LocalReduceList);
Bld.CreateBr(CpyMergeBB);
CGF.EmitBlock(CpyElseBB);
Bld.CreateBr(CpyMergeBB);
CGF.EmitBlock(CpyMergeBB);
CGF.FinishFunction();
return Fn;
}
/// This function emits a helper that copies all the reduction variables from
/// the team into the provided global buffer for the reduction variables.
///
/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
/// For all data entries D in reduce_data:
/// Copy local D to buffer.D[Idx]
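///
/// The global buffer is laid out as one array per reduction variable, indexed
/// by the team slot. A sketch assuming two reduction variables a and b and a
/// hypothetical slot count num_slots:
///   struct { a_ty a[num_slots]; b_ty b[num_slots]; } *buffer;
///   buffer->a[Idx] = local a; buffer->b[Idx] = local b;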
static llvm::Value *emitListToGlobalCopyFunction(
CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
QualType ReductionArrayTy, SourceLocation Loc,
const RecordDecl *TeamReductionRec,
const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
&VarFieldMap) {
ASTContext &C = CGM.getContext();
// Buffer: global reduction buffer.
ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
C.VoidPtrTy, ImplicitParamDecl::Other);
// Idx: index of the buffer.
ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
ImplicitParamDecl::Other);
// ReduceList: thread local Reduce list.
ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
C.VoidPtrTy, ImplicitParamDecl::Other);
FunctionArgList Args;
Args.push_back(&BufferArg);
Args.push_back(&IdxArg);
Args.push_back(&ReduceListArg);
const CGFunctionInfo &CGFI =
CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
auto *Fn = llvm::Function::Create(
CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
"_omp_reduction_list_to_global_copy_func", &CGM.getModule());
CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Fn->setDoesNotRecurse();
CodeGenFunction CGF(CGM);
CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
CGBuilderTy &Bld = CGF.Builder;
Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
Address LocalReduceList(
Bld.CreatePointerBitCastOrAddrSpaceCast(
CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
C.VoidPtrTy, Loc),
CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
CGF.getPointerAlign());
QualType StaticTy = C.getRecordType(TeamReductionRec);
llvm::Type *LLVMReductionsBufferTy =
CGM.getTypes().ConvertTypeForMem(StaticTy);
llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
LLVMReductionsBufferTy->getPointerTo());
llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
/*Volatile=*/false, C.IntTy,
Loc)};
unsigned Idx = 0;
for (const Expr *Private : Privates) {
// Reduce element = LocalReduceList[i]
Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
// elemptr = ((CopyType*)(elemptrptr)) + I
ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
Address ElemPtr =
Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
// Global = Buffer.VD[Idx];
const FieldDecl *FD = VarFieldMap.lookup(VD);
LValue GlobLVal = CGF.EmitLValueForField(
CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
Address GlobAddr = GlobLVal.getAddress(CGF);
llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment()));
switch (CGF.getEvaluationKind(Private->getType())) {
case TEK_Scalar: {
llvm::Value *V = CGF.EmitLoadOfScalar(
ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
CGF.EmitStoreOfScalar(V, GlobLVal);
break;
}
case TEK_Complex: {
CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
break;
}
case TEK_Aggregate:
CGF.EmitAggregateCopy(GlobLVal,
CGF.MakeAddrLValue(ElemPtr, Private->getType()),
Private->getType(), AggValueSlot::DoesNotOverlap);
break;
}
++Idx;
}
CGF.FinishFunction();
return Fn;
}
/// This function emits a helper that reduces all the reduction variables from
/// the team into the provided global buffer for the reduction variables.
///
/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
/// void *GlobPtrs[];
/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
/// ...
/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
/// reduce_function(GlobPtrs, reduce_data);
static llvm::Value *emitListToGlobalReduceFunction(
CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
QualType ReductionArrayTy, SourceLocation Loc,