blob: b7b800216288aadc0715e5841579291129ef8494 [file] [log] [blame]
//===----- data_sharing.cu - NVPTX OpenMP debug utilities -------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of data sharing environments/
//
//===----------------------------------------------------------------------===//
#include "omptarget-nvptx.h"
#include "target_impl.h"
#include <stdio.h>
// Warp ID in the CUDA block
INLINE static unsigned getWarpId() { return threadIdx.x / WARPSIZE; }
// Lane ID in the CUDA warp.
INLINE static unsigned getLaneId() { return threadIdx.x % WARPSIZE; }
// Return true if this is the first active thread in the warp.
INLINE static bool IsWarpMasterActiveThread() {
unsigned long long Mask = __ACTIVEMASK();
unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE);
unsigned long long Sh = Mask << ShNum;
// Truncate Sh to the 32 lower bits
return (unsigned)Sh == 0;
}
// Return true if this is the master thread.
INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock();
}
/// Return the provided size aligned to the size of a pointer.
INLINE static size_t AlignVal(size_t Val) {
const size_t Align = (size_t)sizeof(void *);
if (Val & (Align - 1)) {
Val += Align;
Val &= ~(Align - 1);
}
return Val;
}
#define DSFLAG 0
#define DSFLAG_INIT 0
#define DSPRINT(_flag, _str, _args...) \
{ \
if (_flag) { \
/*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x, _args);*/ \
} \
}
#define DSPRINT0(_flag, _str) \
{ \
if (_flag) { \
/*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x);*/ \
} \
}
// Initialize the shared data structures. This is expected to be called for the
// master thread and warp masters. \param RootS: A pointer to the root of the
// data sharing stack. \param InitialDataSize: The initial size of the data in
// the slot.
EXTERN void
__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS,
size_t InitialDataSize) {
ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
DSPRINT0(DSFLAG_INIT,
"Entering __kmpc_initialize_data_sharing_environment\n");
unsigned WID = getWarpId();
DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID);
omptarget_nvptx_TeamDescr *teamDescr =
&omptarget_nvptx_threadPrivateContext->TeamContext();
__kmpc_data_sharing_slot *RootS =
teamDescr->RootS(WID, IsMasterThread(isSPMDMode()));
DataSharingState.SlotPtr[WID] = RootS;
DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
// We don't need to initialize the frame and active threads.
DSPRINT(DSFLAG_INIT, "Initial data size: %08x \n", (unsigned)InitialDataSize);
DSPRINT(DSFLAG_INIT, "Root slot at: %016llx \n", (unsigned long long)RootS);
DSPRINT(DSFLAG_INIT, "Root slot data-end at: %016llx \n",
(unsigned long long)RootS->DataEnd);
DSPRINT(DSFLAG_INIT, "Root slot next at: %016llx \n",
(unsigned long long)RootS->Next);
DSPRINT(DSFLAG_INIT, "Shared slot ptr at: %016llx \n",
(unsigned long long)DataSharingState.SlotPtr[WID]);
DSPRINT(DSFLAG_INIT, "Shared stack ptr at: %016llx \n",
(unsigned long long)DataSharingState.StackPtr[WID]);
DSPRINT0(DSFLAG_INIT, "Exiting __kmpc_initialize_data_sharing_environment\n");
}
EXTERN void *__kmpc_data_sharing_environment_begin(
__kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
void **SavedSharedFrame, int32_t *SavedActiveThreads,
size_t SharingDataSize, size_t SharingDefaultDataSize,
int16_t IsOMPRuntimeInitialized) {
DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_begin\n");
// If the runtime has been elided, used __shared__ memory for master-worker
// data sharing.
if (!IsOMPRuntimeInitialized)
return (void *)&DataSharingState;
DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize);
DSPRINT(DSFLAG, "Default Data Size %016llx\n",
(unsigned long long)SharingDefaultDataSize);
unsigned WID = getWarpId();
unsigned CurActiveThreads = __ACTIVEMASK();
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
void *&StackP = DataSharingState.StackPtr[WID];
void * volatile &FrameP = DataSharingState.FramePtr[WID];
int32_t &ActiveT = DataSharingState.ActiveThreads[WID];
DSPRINT0(DSFLAG, "Save current slot/stack values.\n");
// Save the current values.
*SavedSharedSlot = SlotP;
*SavedSharedStack = StackP;
*SavedSharedFrame = FrameP;
*SavedActiveThreads = ActiveT;
DSPRINT(DSFLAG, "Warp ID: %u\n", WID);
DSPRINT(DSFLAG, "Saved slot ptr at: %016llx \n", (unsigned long long)SlotP);
DSPRINT(DSFLAG, "Saved stack ptr at: %016llx \n", (unsigned long long)StackP);
DSPRINT(DSFLAG, "Saved frame ptr at: %016llx \n", (long long)FrameP);
DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT);
// Only the warp active master needs to grow the stack.
if (IsWarpMasterActiveThread()) {
// Save the current active threads.
ActiveT = CurActiveThreads;
// Make sure we use aligned sizes to avoid rematerialization of data.
SharingDataSize = AlignVal(SharingDataSize);
// FIXME: The default data size can be assumed to be aligned?
SharingDefaultDataSize = AlignVal(SharingDefaultDataSize);
// Check if we have room for the data in the current slot.
const uintptr_t CurrentStartAddress = (uintptr_t)StackP;
const uintptr_t CurrentEndAddress = (uintptr_t)SlotP->DataEnd;
const uintptr_t RequiredEndAddress =
CurrentStartAddress + (uintptr_t)SharingDataSize;
DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize);
DSPRINT(DSFLAG, "Default Data Size %016llx\n",
(unsigned long long)SharingDefaultDataSize);
DSPRINT(DSFLAG, "Current Start Address %016llx\n",
(unsigned long long)CurrentStartAddress);
DSPRINT(DSFLAG, "Current End Address %016llx\n",
(unsigned long long)CurrentEndAddress);
DSPRINT(DSFLAG, "Required End Address %016llx\n",
(unsigned long long)RequiredEndAddress);
DSPRINT(DSFLAG, "Active Threads %08x\n", (unsigned)ActiveT);
// If we require a new slot, allocate it and initialize it (or attempt to
// reuse one). Also, set the shared stack and slot pointers to the new
// place. If we do not need to grow the stack, just adapt the stack and
// frame pointers.
if (CurrentEndAddress < RequiredEndAddress) {
size_t NewSize = (SharingDataSize > SharingDefaultDataSize)
? SharingDataSize
: SharingDefaultDataSize;
__kmpc_data_sharing_slot *NewSlot = 0;
// Attempt to reuse an existing slot.
if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) {
uintptr_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
(uintptr_t)(&ExistingSlot->Data[0]);
if (ExistingSlotSize >= NewSize) {
DSPRINT(DSFLAG, "Reusing stack slot %016llx\n",
(unsigned long long)ExistingSlot);
NewSlot = ExistingSlot;
} else {
DSPRINT(DSFLAG, "Cleaning up -failed reuse - %016llx\n",
(unsigned long long)SlotP->Next);
free(ExistingSlot);
}
}
if (!NewSlot) {
NewSlot = (__kmpc_data_sharing_slot *)malloc(
sizeof(__kmpc_data_sharing_slot) + NewSize);
DSPRINT(DSFLAG, "New slot allocated %016llx (data size=%016llx)\n",
(unsigned long long)NewSlot, NewSize);
}
NewSlot->Next = 0;
NewSlot->DataEnd = &NewSlot->Data[NewSize];
SlotP->Next = NewSlot;
SlotP = NewSlot;
StackP = &NewSlot->Data[SharingDataSize];
FrameP = &NewSlot->Data[0];
} else {
// Clean up any old slot that we may still have. The slot producers, do
// not eliminate them because that may be used to return data.
if (SlotP->Next) {
DSPRINT(DSFLAG, "Cleaning up - old not required - %016llx\n",
(unsigned long long)SlotP->Next);
free(SlotP->Next);
SlotP->Next = 0;
}
FrameP = StackP;
StackP = (void *)RequiredEndAddress;
}
}
// FIXME: Need to see the impact of doing it here.
__threadfence_block();
DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_begin\n");
// All the threads in this warp get the frame they should work with.
return FrameP;
}
EXTERN void __kmpc_data_sharing_environment_end(
__kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
void **SavedSharedFrame, int32_t *SavedActiveThreads,
int32_t IsEntryPoint) {
DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n");
unsigned WID = getWarpId();
if (IsEntryPoint) {
if (IsWarpMasterActiveThread()) {
DSPRINT0(DSFLAG, "Doing clean up\n");
// The master thread cleans the saved slot, because this is an environment
// only for the master.
__kmpc_data_sharing_slot *S = IsMasterThread(isSPMDMode())
? *SavedSharedSlot
: DataSharingState.SlotPtr[WID];
if (S->Next) {
free(S->Next);
S->Next = 0;
}
}
DSPRINT0(DSFLAG, "Exiting Exiting __kmpc_data_sharing_environment_end\n");
return;
}
int32_t CurActive = __ACTIVEMASK();
// Only the warp master can restore the stack and frame information, and only
// if there are no other threads left behind in this environment (i.e. the
// warp diverged and returns in different places). This only works if we
// assume that threads will converge right after the call site that started
// the environment.
if (IsWarpMasterActiveThread()) {
int32_t &ActiveT = DataSharingState.ActiveThreads[WID];
DSPRINT0(DSFLAG, "Before restoring the stack\n");
// Zero the bits in the mask. If it is still different from zero, then we
// have other threads that will return after the current ones.
ActiveT &= ~CurActive;
DSPRINT(DSFLAG, "Active threads: %08x; New mask: %08x\n",
(unsigned)CurActive, (unsigned)ActiveT);
if (!ActiveT) {
// No other active threads? Great, lets restore the stack.
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
void *&StackP = DataSharingState.StackPtr[WID];
void * volatile &FrameP = DataSharingState.FramePtr[WID];
SlotP = *SavedSharedSlot;
StackP = *SavedSharedStack;
FrameP = *SavedSharedFrame;
ActiveT = *SavedActiveThreads;
DSPRINT(DSFLAG, "Restored slot ptr at: %016llx \n",
(unsigned long long)SlotP);
DSPRINT(DSFLAG, "Restored stack ptr at: %016llx \n",
(unsigned long long)StackP);
DSPRINT(DSFLAG, "Restored frame ptr at: %016llx \n",
(unsigned long long)FrameP);
DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT);
}
}
// FIXME: Need to see the impact of doing it here.
__threadfence_block();
DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_end\n");
return;
}
EXTERN void *
__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
int16_t IsOMPRuntimeInitialized) {
DSPRINT0(DSFLAG, "Entering __kmpc_get_data_sharing_environment_frame\n");
// If the runtime has been elided, use __shared__ memory for master-worker
// data sharing. We're reusing the statically allocated data structure
// that is used for standard data sharing.
if (!IsOMPRuntimeInitialized)
return (void *)&DataSharingState;
// Get the frame used by the requested thread.
unsigned SourceWID = SourceThreadID / WARPSIZE;
DSPRINT(DSFLAG, "Source warp: %u\n", SourceWID);
void * volatile P = DataSharingState.FramePtr[SourceWID];
DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
return P;
}
////////////////////////////////////////////////////////////////////////////////
// Runtime functions for trunk data sharing scheme.
////////////////////////////////////////////////////////////////////////////////
INLINE static void data_sharing_init_stack_common() {
ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
omptarget_nvptx_TeamDescr *teamDescr =
&omptarget_nvptx_threadPrivateContext->TeamContext();
for (int WID = 0; WID < WARPSIZE; WID++) {
__kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID);
DataSharingState.SlotPtr[WID] = RootS;
DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
}
}
// Initialize data sharing data structure. This function needs to be called
// once at the beginning of a data sharing context (coincides with the kernel
// initialization). This function is called only by the MASTER thread of each
// team in non-SPMD mode.
EXTERN void __kmpc_data_sharing_init_stack() {
ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
// This function initializes the stack pointer with the pointer to the
// statically allocated shared memory slots. The size of a shared memory
// slot is pre-determined to be 256 bytes.
data_sharing_init_stack_common();
omptarget_nvptx_globalArgs.Init();
}
// Initialize data sharing data structure. This function needs to be called
// once at the beginning of a data sharing context (coincides with the kernel
// initialization). This function is called in SPMD mode only.
EXTERN void __kmpc_data_sharing_init_stack_spmd() {
ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
// This function initializes the stack pointer with the pointer to the
// statically allocated shared memory slots. The size of a shared memory
// slot is pre-determined to be 256 bytes.
if (threadIdx.x == 0)
data_sharing_init_stack_common();
__threadfence_block();
}
INLINE static void* data_sharing_push_stack_common(size_t PushSize) {
ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
// Only warp active master threads manage the stack.
bool IsWarpMaster = (GetThreadIdInBlock() % WARPSIZE) == 0;
// Add worst-case padding to DataSize so that future stack allocations are
// correctly aligned.
const size_t Alignment = 8;
PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;
// Frame pointer must be visible to all workers in the same warp.
const unsigned WID = getWarpId();
void *FrameP = 0;
int32_t CurActive = __ACTIVEMASK();
if (IsWarpMaster) {
// SlotP will point to either the shared memory slot or an existing
// global memory slot.
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
void *&StackP = DataSharingState.StackPtr[WID];
// Check if we have room for the data in the current slot.
const uintptr_t StartAddress = (uintptr_t)StackP;
const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd;
const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize;
// If we requested more data than there is room for in the rest
// of the slot then we need to either re-use the next slot, if one exists,
// or create a new slot.
if (EndAddress < RequestedEndAddress) {
__kmpc_data_sharing_slot *NewSlot = 0;
size_t NewSize = PushSize;
// Allocate at least the default size for each type of slot.
// Master is a special case and even though there is only one thread,
// it can share more things with the workers. For uniformity, it uses
// the full size of a worker warp slot.
size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size;
if (DefaultSlotSize > NewSize)
NewSize = DefaultSlotSize;
NewSlot = (__kmpc_data_sharing_slot *) SafeMalloc(
sizeof(__kmpc_data_sharing_slot) + NewSize,
"Global memory slot allocation.");
NewSlot->Next = 0;
NewSlot->Prev = SlotP;
NewSlot->PrevSlotStackPtr = StackP;
NewSlot->DataEnd = &NewSlot->Data[0] + NewSize;
// Make previous slot point to the newly allocated slot.
SlotP->Next = NewSlot;
// The current slot becomes the new slot.
SlotP = NewSlot;
// The stack pointer always points to the next free stack frame.
StackP = &NewSlot->Data[0] + PushSize;
// The frame pointer always points to the beginning of the frame.
FrameP = DataSharingState.FramePtr[WID] = &NewSlot->Data[0];
} else {
// Add the data chunk to the current slot. The frame pointer is set to
// point to the start of the new frame held in StackP.
FrameP = DataSharingState.FramePtr[WID] = StackP;
// Reset stack pointer to the requested address.
StackP = (void *)RequestedEndAddress;
}
}
// Get address from lane 0.
int *FP = (int *)&FrameP;
FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0);
if (sizeof(FrameP) == 8)
FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0);
return FrameP;
}
EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
int16_t UseSharedMemory) {
return data_sharing_push_stack_common(DataSize);
}
// Called at the time of the kernel initialization. This is used to initilize
// the list of references to shared variables and to pre-allocate global storage
// for holding the globalized variables.
//
// By default the globalized variables are stored in global memory. If the
// UseSharedMemory is set to true, the runtime will attempt to use shared memory
// as long as the size requested fits the pre-allocated size.
EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize,
int16_t UseSharedMemory) {
// Compute the total memory footprint of the requested data.
// The master thread requires a stack only for itself. A worker
// thread (which at this point is a warp master) will require
// space for the variables of each thread in the warp,
// i.e. one DataSize chunk per warp lane.
// TODO: change WARPSIZE to the number of active threads in the warp.
size_t PushSize = (isRuntimeUninitialized() || IsMasterThread(isSPMDMode()))
? DataSize
: WARPSIZE * DataSize;
// Compute the start address of the frame of each thread in the warp.
uintptr_t FrameStartAddress =
(uintptr_t) data_sharing_push_stack_common(PushSize);
FrameStartAddress += (uintptr_t) (getLaneId() * DataSize);
return (void *)FrameStartAddress;
}
// Pop the stack and free any memory which can be reclaimed.
//
// When the pop operation removes the last global memory slot,
// reclaim all outstanding global memory slots since it is
// likely we have reached the end of the kernel.
EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
__threadfence_block();
if (GetThreadIdInBlock() % WARPSIZE == 0) {
unsigned WID = getWarpId();
// Current slot
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
// Pointer to next available stack.
void *&StackP = DataSharingState.StackPtr[WID];
// Pop the frame.
StackP = FrameStart;
// If the current slot is empty, we need to free the slot after the
// pop.
bool SlotEmpty = (StackP == &SlotP->Data[0]);
if (SlotEmpty && SlotP->Prev) {
// Before removing the slot we need to reset StackP.
StackP = SlotP->PrevSlotStackPtr;
// Remove the slot.
SlotP = SlotP->Prev;
SafeFree(SlotP->Next, "Free slot.");
SlotP->Next = 0;
}
}
}
// Begin a data sharing context. Maintain a list of references to shared
// variables. This list of references to shared variables will be passed
// to one or more threads.
// In L0 data sharing this is called by master thread.
// In L1 data sharing this is called by active warp master thread.
EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
omptarget_nvptx_globalArgs.EnsureSize(nArgs);
*GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
}
// End a data sharing context. There is no need to have a list of refs
// to shared variables because the context in which those variables were
// shared has now ended. This should clean-up the list of references only
// without affecting the actual global storage of the variables.
// In L0 data sharing this is called by master thread.
// In L1 data sharing this is called by active warp master thread.
EXTERN void __kmpc_end_sharing_variables() {
omptarget_nvptx_globalArgs.DeInit();
}
// This function will return a list of references to global variables. This
// is how the workers will get a reference to the globalized variable. The
// members of this list will be passed to the outlined parallel function
// preserving the order.
// Called by all workers.
EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
*GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
}
// This function is used to init static memory manager. This manager is used to
// manage statically allocated global memory. This memory is allocated by the
// compiler and used to correctly implement globalization of the variables in
// target, teams and distribute regions.
EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
const void *buf, size_t size,
int16_t is_shared,
const void **frame) {
if (is_shared) {
*frame = buf;
return;
}
if (isSPMDExecutionMode) {
if (GetThreadIdInBlock() == 0) {
*frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
}
// FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
__SYNCTHREADS();
return;
}
ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
"Must be called only in the target master thread.");
*frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
__threadfence();
}
EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
int16_t is_shared) {
if (is_shared)
return;
if (isSPMDExecutionMode) {
// FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
__SYNCTHREADS();
if (GetThreadIdInBlock() == 0) {
omptarget_nvptx_simpleMemoryManager.Release();
}
return;
}
__threadfence();
ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
"Must be called only in the target master thread.");
omptarget_nvptx_simpleMemoryManager.Release();
}