| //===----- data_sharing.cu - OpenMP GPU data sharing ------------- CUDA -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file contains the implementation of data sharing environments |
| // |
| //===----------------------------------------------------------------------===// |
| #pragma omp declare target |
| |
| #include "common/omptarget.h" |
| #include "target_impl.h" |
| |
| // Return true if this is the master thread. |
| INLINE static bool IsMasterThread(bool isSPMDExecutionMode) { |
| return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock(); |
| } |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // Runtime functions for trunk data sharing scheme. |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| INLINE static void data_sharing_init_stack_common() { |
| ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); |
| omptarget_nvptx_TeamDescr *teamDescr = |
| &omptarget_nvptx_threadPrivateContext->TeamContext(); |
| |
| for (int WID = 0; WID < DS_Max_Warp_Number; WID++) { |
| __kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID); |
| DataSharingState.SlotPtr[WID] = RootS; |
| DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; |
| } |
| } |
| |
| // Initialize data sharing data structure. This function needs to be called |
| // once at the beginning of a data sharing context (coincides with the kernel |
| // initialization). This function is called only by the MASTER thread of each |
| // team in non-SPMD mode. |
| EXTERN void __kmpc_data_sharing_init_stack() { |
| ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); |
| // This function initializes the stack pointer with the pointer to the |
| // statically allocated shared memory slots. The size of a shared memory |
| // slot is pre-determined to be 256 bytes. |
| data_sharing_init_stack_common(); |
| omptarget_nvptx_globalArgs.Init(); |
| } |
| |
| // Initialize data sharing data structure. This function needs to be called |
| // once at the beginning of a data sharing context (coincides with the kernel |
| // initialization). This function is called in SPMD mode only. |
| EXTERN void __kmpc_data_sharing_init_stack_spmd() { |
| ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); |
| // This function initializes the stack pointer with the pointer to the |
| // statically allocated shared memory slots. The size of a shared memory |
| // slot is pre-determined to be 256 bytes. |
| if (GetThreadIdInBlock() == 0) |
| data_sharing_init_stack_common(); |
| |
| __kmpc_impl_threadfence_block(); |
| } |
| |
| INLINE static void *data_sharing_push_stack_common(size_t PushSize) { |
| ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); |
| |
| // Only warp active master threads manage the stack. |
| bool IsWarpMaster = (GetThreadIdInBlock() % WARPSIZE) == 0; |
| |
| // Add worst-case padding to DataSize so that future stack allocations are |
| // correctly aligned. |
| const size_t Alignment = 8; |
| PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment; |
| |
| // Frame pointer must be visible to all workers in the same warp. |
| const unsigned WID = GetWarpId(); |
| void *FrameP = 0; |
| __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask(); |
| |
| if (IsWarpMaster) { |
| // SlotP will point to either the shared memory slot or an existing |
| // global memory slot. |
| __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; |
| void *&StackP = DataSharingState.StackPtr[WID]; |
| |
| // Check if we have room for the data in the current slot. |
| const uintptr_t StartAddress = (uintptr_t)StackP; |
| const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd; |
| const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize; |
| |
| // If we requested more data than there is room for in the rest |
| // of the slot then we need to either re-use the next slot, if one exists, |
| // or create a new slot. |
| if (EndAddress < RequestedEndAddress) { |
| __kmpc_data_sharing_slot *NewSlot = 0; |
| size_t NewSize = PushSize; |
| |
| // Allocate at least the default size for each type of slot. |
| // Master is a special case and even though there is only one thread, |
| // it can share more things with the workers. For uniformity, it uses |
| // the full size of a worker warp slot. |
| size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size; |
| if (DefaultSlotSize > NewSize) |
| NewSize = DefaultSlotSize; |
| NewSlot = (__kmpc_data_sharing_slot *)SafeMalloc( |
| sizeof(__kmpc_data_sharing_slot) + NewSize, |
| "Global memory slot allocation."); |
| |
| NewSlot->Next = 0; |
| NewSlot->Prev = SlotP; |
| NewSlot->PrevSlotStackPtr = StackP; |
| NewSlot->DataEnd = &NewSlot->Data[0] + NewSize; |
| |
| // Make previous slot point to the newly allocated slot. |
| SlotP->Next = NewSlot; |
| // The current slot becomes the new slot. |
| SlotP = NewSlot; |
| // The stack pointer always points to the next free stack frame. |
| StackP = &NewSlot->Data[0] + PushSize; |
| // The frame pointer always points to the beginning of the frame. |
| FrameP = DataSharingState.FramePtr[WID] = &NewSlot->Data[0]; |
| } else { |
| // Add the data chunk to the current slot. The frame pointer is set to |
| // point to the start of the new frame held in StackP. |
| FrameP = DataSharingState.FramePtr[WID] = StackP; |
| // Reset stack pointer to the requested address. |
| StackP = (void *)RequestedEndAddress; |
| } |
| } |
| // Get address from lane 0. |
| int *FP = (int *)&FrameP; |
| FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0); |
| if (sizeof(FrameP) == 8) |
| FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0); |
| |
| return FrameP; |
| } |
| |
| EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize, |
| int16_t UseSharedMemory) { |
| return data_sharing_push_stack_common(DataSize); |
| } |
| |
| // Called at the time of the kernel initialization. This is used to initilize |
| // the list of references to shared variables and to pre-allocate global storage |
| // for holding the globalized variables. |
| // |
| // By default the globalized variables are stored in global memory. If the |
| // UseSharedMemory is set to true, the runtime will attempt to use shared memory |
| // as long as the size requested fits the pre-allocated size. |
| EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize, |
| int16_t UseSharedMemory) { |
| // Compute the total memory footprint of the requested data. |
| // The master thread requires a stack only for itself. A worker |
| // thread (which at this point is a warp master) will require |
| // space for the variables of each thread in the warp, |
| // i.e. one DataSize chunk per warp lane. |
| // TODO: change WARPSIZE to the number of active threads in the warp. |
| size_t PushSize = (isRuntimeUninitialized() || IsMasterThread(isSPMDMode())) |
| ? DataSize |
| : WARPSIZE * DataSize; |
| |
| // Compute the start address of the frame of each thread in the warp. |
| uintptr_t FrameStartAddress = |
| (uintptr_t)data_sharing_push_stack_common(PushSize); |
| FrameStartAddress += (uintptr_t)(GetLaneId() * DataSize); |
| return (void *)FrameStartAddress; |
| } |
| |
| // Pop the stack and free any memory which can be reclaimed. |
| // |
| // When the pop operation removes the last global memory slot, |
| // reclaim all outstanding global memory slots since it is |
| // likely we have reached the end of the kernel. |
| EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) { |
| ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); |
| |
| __kmpc_impl_threadfence_block(); |
| |
| if (GetThreadIdInBlock() % WARPSIZE == 0) { |
| unsigned WID = GetWarpId(); |
| |
| // Current slot |
| __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; |
| |
| // Pointer to next available stack. |
| void *&StackP = DataSharingState.StackPtr[WID]; |
| |
| // Pop the frame. |
| StackP = FrameStart; |
| |
| // If the current slot is empty, we need to free the slot after the |
| // pop. |
| bool SlotEmpty = (StackP == &SlotP->Data[0]); |
| |
| if (SlotEmpty && SlotP->Prev) { |
| // Before removing the slot we need to reset StackP. |
| StackP = SlotP->PrevSlotStackPtr; |
| |
| // Remove the slot. |
| SlotP = SlotP->Prev; |
| SafeFree(SlotP->Next, "Free slot."); |
| SlotP->Next = 0; |
| } |
| } |
| } |
| |
| // Begin a data sharing context. Maintain a list of references to shared |
| // variables. This list of references to shared variables will be passed |
| // to one or more threads. |
| // In L0 data sharing this is called by master thread. |
| // In L1 data sharing this is called by active warp master thread. |
| EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) { |
| omptarget_nvptx_globalArgs.EnsureSize(nArgs); |
| *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs(); |
| } |
| |
| // End a data sharing context. There is no need to have a list of refs |
| // to shared variables because the context in which those variables were |
| // shared has now ended. This should clean-up the list of references only |
| // without affecting the actual global storage of the variables. |
| // In L0 data sharing this is called by master thread. |
| // In L1 data sharing this is called by active warp master thread. |
| EXTERN void __kmpc_end_sharing_variables() { |
| omptarget_nvptx_globalArgs.DeInit(); |
| } |
| |
| // This function will return a list of references to global variables. This |
| // is how the workers will get a reference to the globalized variable. The |
| // members of this list will be passed to the outlined parallel function |
| // preserving the order. |
| // Called by all workers. |
| EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) { |
| *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs(); |
| } |
| |
| // This function is used to init static memory manager. This manager is used to |
| // manage statically allocated global memory. This memory is allocated by the |
| // compiler and used to correctly implement globalization of the variables in |
| // target, teams and distribute regions. |
| EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, |
| const void *buf, size_t size, |
| int16_t is_shared, |
| const void **frame) { |
| if (is_shared) { |
| *frame = buf; |
| return; |
| } |
| if (isSPMDExecutionMode) { |
| if (GetThreadIdInBlock() == 0) { |
| *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); |
| } |
| __kmpc_impl_syncthreads(); |
| return; |
| } |
| ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), |
| "Must be called only in the target master thread."); |
| *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); |
| __kmpc_impl_threadfence(); |
| } |
| |
| EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, |
| int16_t is_shared) { |
| if (is_shared) |
| return; |
| if (isSPMDExecutionMode) { |
| __kmpc_impl_syncthreads(); |
| if (GetThreadIdInBlock() == 0) { |
| omptarget_nvptx_simpleMemoryManager.Release(); |
| } |
| return; |
| } |
| __kmpc_impl_threadfence(); |
| ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), |
| "Must be called only in the target master thread."); |
| omptarget_nvptx_simpleMemoryManager.Release(); |
| } |
| |
| #pragma omp end declare target |