libomptarget/deviceRTLs/nvptx/src/data_sharing.cu - openmp - Git at Google

 //===----- data_sharing.cu - NVPTX OpenMP debug utilities -------- CUDA -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file contains the implementation of data sharing environments/
 //
 //===----------------------------------------------------------------------===//
 #include "omptarget-nvptx.h"
 #include "target_impl.h"
 #include <stdio.h>

 // Warp ID in the CUDA block
 INLINE static unsigned getWarpId() { return threadIdx.x / WARPSIZE; }
 // Lane ID in the CUDA warp.
 INLINE static unsigned getLaneId() { return threadIdx.x % WARPSIZE; }

 // Return true if this is the first active thread in the warp.
 INLINE static bool IsWarpMasterActiveThread() {
   unsigned long long Mask = __ACTIVEMASK();
   unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE);
   unsigned long long Sh = Mask << ShNum;
   // Truncate Sh to the 32 lower bits
   return (unsigned)Sh == 0;
 }
 // Return true if this is the master thread.
 INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
   return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock();
 }

 /// Return the provided size aligned to the size of a pointer.
 INLINE static size_t AlignVal(size_t Val) {
   const size_t Align = (size_t)sizeof(void *);
   if (Val & (Align - 1)) {
     Val += Align;
     Val &= ~(Align - 1);
   }
   return Val;
 }

 #define DSFLAG 0
 #define DSFLAG_INIT 0
 #define DSPRINT(_flag, _str, _args...)                                         \
   {                                                                            \
     if (_flag) {                                                               \
       /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x, _args);*/          \
     }                                                                          \
   }
 #define DSPRINT0(_flag, _str)                                                  \
   {                                                                            \
     if (_flag) {                                                               \
       /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x);*/                 \
     }                                                                          \
   }

 // Initialize the shared data structures. This is expected to be called for the
 // master thread and warp masters. \param RootS: A pointer to the root of the
 // data sharing stack. \param InitialDataSize: The initial size of the data in
 // the slot.
 EXTERN void
 __kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS,
                                            size_t InitialDataSize) {
   ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
   DSPRINT0(DSFLAG_INIT,
            "Entering __kmpc_initialize_data_sharing_environment\n");

   unsigned WID = getWarpId();
   DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID);

   omptarget_nvptx_TeamDescr *teamDescr =
       &omptarget_nvptx_threadPrivateContext->TeamContext();
   __kmpc_data_sharing_slot *RootS =
       teamDescr->RootS(WID, IsMasterThread(isSPMDMode()));

   DataSharingState.SlotPtr[WID] = RootS;
   DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];

   // We don't need to initialize the frame and active threads.

   DSPRINT(DSFLAG_INIT, "Initial data size: %08x \n", (unsigned)InitialDataSize);
   DSPRINT(DSFLAG_INIT, "Root slot at: %016llx \n", (unsigned long long)RootS);
   DSPRINT(DSFLAG_INIT, "Root slot data-end at: %016llx \n",
           (unsigned long long)RootS->DataEnd);
   DSPRINT(DSFLAG_INIT, "Root slot next at: %016llx \n",
           (unsigned long long)RootS->Next);
   DSPRINT(DSFLAG_INIT, "Shared slot ptr at: %016llx \n",
           (unsigned long long)DataSharingState.SlotPtr[WID]);
   DSPRINT(DSFLAG_INIT, "Shared stack ptr at: %016llx \n",
           (unsigned long long)DataSharingState.StackPtr[WID]);

   DSPRINT0(DSFLAG_INIT, "Exiting __kmpc_initialize_data_sharing_environment\n");
 }

 EXTERN void *__kmpc_data_sharing_environment_begin(
     __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
     void **SavedSharedFrame, int32_t *SavedActiveThreads,
     size_t SharingDataSize, size_t SharingDefaultDataSize,
     int16_t IsOMPRuntimeInitialized) {

   DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_begin\n");

   // If the runtime has been elided, used __shared__ memory for master-worker
   // data sharing.
   if (!IsOMPRuntimeInitialized)
     return (void *)&DataSharingState;

   DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize);
   DSPRINT(DSFLAG, "Default Data Size %016llx\n",
           (unsigned long long)SharingDefaultDataSize);

   unsigned WID = getWarpId();
   unsigned CurActiveThreads = __ACTIVEMASK();

   __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
   void *&StackP = DataSharingState.StackPtr[WID];
   void * volatile &FrameP = DataSharingState.FramePtr[WID];
   int32_t &ActiveT = DataSharingState.ActiveThreads[WID];

   DSPRINT0(DSFLAG, "Save current slot/stack values.\n");
   // Save the current values.
   *SavedSharedSlot = SlotP;
   *SavedSharedStack = StackP;
   *SavedSharedFrame = FrameP;
   *SavedActiveThreads = ActiveT;

   DSPRINT(DSFLAG, "Warp ID: %u\n", WID);
   DSPRINT(DSFLAG, "Saved slot ptr at: %016llx \n", (unsigned long long)SlotP);
   DSPRINT(DSFLAG, "Saved stack ptr at: %016llx \n", (unsigned long long)StackP);
   DSPRINT(DSFLAG, "Saved frame ptr at: %016llx \n", (long long)FrameP);
   DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT);

   // Only the warp active master needs to grow the stack.
   if (IsWarpMasterActiveThread()) {
     // Save the current active threads.
     ActiveT = CurActiveThreads;

     // Make sure we use aligned sizes to avoid rematerialization of data.
     SharingDataSize = AlignVal(SharingDataSize);
     // FIXME: The default data size can be assumed to be aligned?
     SharingDefaultDataSize = AlignVal(SharingDefaultDataSize);

     // Check if we have room for the data in the current slot.
     const uintptr_t CurrentStartAddress = (uintptr_t)StackP;
     const uintptr_t CurrentEndAddress = (uintptr_t)SlotP->DataEnd;
     const uintptr_t RequiredEndAddress =
         CurrentStartAddress + (uintptr_t)SharingDataSize;

     DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize);
     DSPRINT(DSFLAG, "Default Data Size %016llx\n",
             (unsigned long long)SharingDefaultDataSize);
     DSPRINT(DSFLAG, "Current Start Address %016llx\n",
             (unsigned long long)CurrentStartAddress);
     DSPRINT(DSFLAG, "Current End Address %016llx\n",
             (unsigned long long)CurrentEndAddress);
     DSPRINT(DSFLAG, "Required End Address %016llx\n",
             (unsigned long long)RequiredEndAddress);
     DSPRINT(DSFLAG, "Active Threads %08x\n", (unsigned)ActiveT);

     // If we require a new slot, allocate it and initialize it (or attempt to
     // reuse one). Also, set the shared stack and slot pointers to the new
     // place. If we do not need to grow the stack, just adapt the stack and
     // frame pointers.
     if (CurrentEndAddress < RequiredEndAddress) {
       size_t NewSize = (SharingDataSize > SharingDefaultDataSize)
                            ? SharingDataSize
                            : SharingDefaultDataSize;
       __kmpc_data_sharing_slot *NewSlot = 0;

       // Attempt to reuse an existing slot.
       if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) {
         uintptr_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
                                      (uintptr_t)(&ExistingSlot->Data[0]);
         if (ExistingSlotSize >= NewSize) {
           DSPRINT(DSFLAG, "Reusing stack slot %016llx\n",
                   (unsigned long long)ExistingSlot);
           NewSlot = ExistingSlot;
         } else {
           DSPRINT(DSFLAG, "Cleaning up -failed reuse - %016llx\n",
                   (unsigned long long)SlotP->Next);
           free(ExistingSlot);
         }
       }

       if (!NewSlot) {
         NewSlot = (__kmpc_data_sharing_slot *)malloc(
             sizeof(__kmpc_data_sharing_slot) + NewSize);
         DSPRINT(DSFLAG, "New slot allocated %016llx (data size=%016llx)\n",
                 (unsigned long long)NewSlot, NewSize);
       }

       NewSlot->Next = 0;
       NewSlot->DataEnd = &NewSlot->Data[NewSize];

       SlotP->Next = NewSlot;
       SlotP = NewSlot;
       StackP = &NewSlot->Data[SharingDataSize];
       FrameP = &NewSlot->Data[0];
     } else {

       // Clean up any old slot that we may still have. The slot producers, do
       // not eliminate them because that may be used to return data.
       if (SlotP->Next) {
         DSPRINT(DSFLAG, "Cleaning up - old not required - %016llx\n",
                 (unsigned long long)SlotP->Next);
         free(SlotP->Next);
         SlotP->Next = 0;
       }

       FrameP = StackP;
       StackP = (void *)RequiredEndAddress;
     }
   }

   // FIXME: Need to see the impact of doing it here.
   __threadfence_block();

   DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_begin\n");

   // All the threads in this warp get the frame they should work with.
   return FrameP;
 }

 EXTERN void __kmpc_data_sharing_environment_end(
     __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
     void **SavedSharedFrame, int32_t *SavedActiveThreads,
     int32_t IsEntryPoint) {

   DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n");

   unsigned WID = getWarpId();

   if (IsEntryPoint) {
     if (IsWarpMasterActiveThread()) {
       DSPRINT0(DSFLAG, "Doing clean up\n");

       // The master thread cleans the saved slot, because this is an environment
       // only for the master.
       __kmpc_data_sharing_slot *S = IsMasterThread(isSPMDMode())
                                         ? *SavedSharedSlot
                                         : DataSharingState.SlotPtr[WID];

       if (S->Next) {
         free(S->Next);
         S->Next = 0;
       }
     }

     DSPRINT0(DSFLAG, "Exiting Exiting __kmpc_data_sharing_environment_end\n");
     return;
   }

   int32_t CurActive = __ACTIVEMASK();

   // Only the warp master can restore the stack and frame information, and only
   // if there are no other threads left behind in this environment (i.e. the
   // warp diverged and returns in different places). This only works if we
   // assume that threads will converge right after the call site that started
   // the environment.
   if (IsWarpMasterActiveThread()) {
     int32_t &ActiveT = DataSharingState.ActiveThreads[WID];

     DSPRINT0(DSFLAG, "Before restoring the stack\n");
     // Zero the bits in the mask. If it is still different from zero, then we
     // have other threads that will return after the current ones.
     ActiveT &= ~CurActive;

     DSPRINT(DSFLAG, "Active threads: %08x; New mask: %08x\n",
             (unsigned)CurActive, (unsigned)ActiveT);

     if (!ActiveT) {
       // No other active threads? Great, lets restore the stack.

       __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
       void *&StackP = DataSharingState.StackPtr[WID];
       void * volatile &FrameP = DataSharingState.FramePtr[WID];

       SlotP = *SavedSharedSlot;
       StackP = *SavedSharedStack;
       FrameP = *SavedSharedFrame;
       ActiveT = *SavedActiveThreads;

       DSPRINT(DSFLAG, "Restored slot ptr at: %016llx \n",
               (unsigned long long)SlotP);
       DSPRINT(DSFLAG, "Restored stack ptr at: %016llx \n",
               (unsigned long long)StackP);
       DSPRINT(DSFLAG, "Restored frame ptr at: %016llx \n",
               (unsigned long long)FrameP);
       DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT);
     }
   }

   // FIXME: Need to see the impact of doing it here.
   __threadfence_block();

   DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_end\n");
   return;
 }

 EXTERN void *
 __kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
                                           int16_t IsOMPRuntimeInitialized) {
   DSPRINT0(DSFLAG, "Entering __kmpc_get_data_sharing_environment_frame\n");

   // If the runtime has been elided, use __shared__ memory for master-worker
   // data sharing.  We're reusing the statically allocated data structure
   // that is used for standard data sharing.
   if (!IsOMPRuntimeInitialized)
     return (void *)&DataSharingState;

   // Get the frame used by the requested thread.

   unsigned SourceWID = SourceThreadID / WARPSIZE;

   DSPRINT(DSFLAG, "Source  warp: %u\n", SourceWID);

   void * volatile P = DataSharingState.FramePtr[SourceWID];
   DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
   return P;
 }

 ////////////////////////////////////////////////////////////////////////////////
 // Runtime functions for trunk data sharing scheme.
 ////////////////////////////////////////////////////////////////////////////////

 INLINE static void data_sharing_init_stack_common() {
   ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
   omptarget_nvptx_TeamDescr *teamDescr =
       &omptarget_nvptx_threadPrivateContext->TeamContext();

   for (int WID = 0; WID < WARPSIZE; WID++) {
     __kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID);
     DataSharingState.SlotPtr[WID] = RootS;
     DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
   }
 }

 // Initialize data sharing data structure. This function needs to be called
 // once at the beginning of a data sharing context (coincides with the kernel
 // initialization). This function is called only by the MASTER thread of each
 // team in non-SPMD mode.
 EXTERN void __kmpc_data_sharing_init_stack() {
   ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
   // This function initializes the stack pointer with the pointer to the
   // statically allocated shared memory slots. The size of a shared memory
   // slot is pre-determined to be 256 bytes.
   data_sharing_init_stack_common();
   omptarget_nvptx_globalArgs.Init();
 }

 // Initialize data sharing data structure. This function needs to be called
 // once at the beginning of a data sharing context (coincides with the kernel
 // initialization). This function is called in SPMD mode only.
 EXTERN void __kmpc_data_sharing_init_stack_spmd() {
   ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
   // This function initializes the stack pointer with the pointer to the
   // statically allocated shared memory slots. The size of a shared memory
   // slot is pre-determined to be 256 bytes.
   if (threadIdx.x == 0)
     data_sharing_init_stack_common();

   __threadfence_block();
 }

 INLINE static void* data_sharing_push_stack_common(size_t PushSize) {
   ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");

   // Only warp active master threads manage the stack.
   bool IsWarpMaster = (GetThreadIdInBlock() % WARPSIZE) == 0;

   // Add worst-case padding to DataSize so that future stack allocations are
   // correctly aligned.
   const size_t Alignment = 8;
   PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;

   // Frame pointer must be visible to all workers in the same warp.
   const unsigned WID = getWarpId();
   void *FrameP = 0;
   int32_t CurActive = __ACTIVEMASK();

   if (IsWarpMaster) {
     // SlotP will point to either the shared memory slot or an existing
     // global memory slot.
     __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
     void *&StackP = DataSharingState.StackPtr[WID];

     // Check if we have room for the data in the current slot.
     const uintptr_t StartAddress = (uintptr_t)StackP;
     const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd;
     const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize;

     // If we requested more data than there is room for in the rest
     // of the slot then we need to either re-use the next slot, if one exists,
     // or create a new slot.
     if (EndAddress < RequestedEndAddress) {
       __kmpc_data_sharing_slot *NewSlot = 0;
       size_t NewSize = PushSize;

       // Allocate at least the default size for each type of slot.
       // Master is a special case and even though there is only one thread,
       // it can share more things with the workers. For uniformity, it uses
       // the full size of a worker warp slot.
       size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size;
       if (DefaultSlotSize > NewSize)
         NewSize = DefaultSlotSize;
       NewSlot = (__kmpc_data_sharing_slot *) SafeMalloc(
           sizeof(__kmpc_data_sharing_slot) + NewSize,
           "Global memory slot allocation.");

       NewSlot->Next = 0;
       NewSlot->Prev = SlotP;
       NewSlot->PrevSlotStackPtr = StackP;
       NewSlot->DataEnd = &NewSlot->Data[0] + NewSize;

       // Make previous slot point to the newly allocated slot.
       SlotP->Next = NewSlot;
       // The current slot becomes the new slot.
       SlotP = NewSlot;
       // The stack pointer always points to the next free stack frame.
       StackP = &NewSlot->Data[0] + PushSize;
       // The frame pointer always points to the beginning of the frame.
       FrameP = DataSharingState.FramePtr[WID] = &NewSlot->Data[0];
     } else {
       // Add the data chunk to the current slot. The frame pointer is set to
       // point to the start of the new frame held in StackP.
       FrameP = DataSharingState.FramePtr[WID] = StackP;
       // Reset stack pointer to the requested address.
       StackP = (void *)RequestedEndAddress;
     }
   }
   // Get address from lane 0.
   int *FP = (int *)&FrameP;
   FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0);
   if (sizeof(FrameP) == 8)
     FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0);

   return FrameP;
 }

 EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
                                                       int16_t UseSharedMemory) {
   return data_sharing_push_stack_common(DataSize);
 }

 // Called at the time of the kernel initialization. This is used to initilize
 // the list of references to shared variables and to pre-allocate global storage
 // for holding the globalized variables.
 //
 // By default the globalized variables are stored in global memory. If the
 // UseSharedMemory is set to true, the runtime will attempt to use shared memory
 // as long as the size requested fits the pre-allocated size.
 EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize,
                                             int16_t UseSharedMemory) {
   // Compute the total memory footprint of the requested data.
   // The master thread requires a stack only for itself. A worker
   // thread (which at this point is a warp master) will require
   // space for the variables of each thread in the warp,
   // i.e. one DataSize chunk per warp lane.
   // TODO: change WARPSIZE to the number of active threads in the warp.
   size_t PushSize = (isRuntimeUninitialized() || IsMasterThread(isSPMDMode()))
                         ? DataSize
                         : WARPSIZE * DataSize;

   // Compute the start address of the frame of each thread in the warp.
   uintptr_t FrameStartAddress =
       (uintptr_t) data_sharing_push_stack_common(PushSize);
   FrameStartAddress += (uintptr_t) (getLaneId() * DataSize);
   return (void *)FrameStartAddress;
 }

 // Pop the stack and free any memory which can be reclaimed.
 //
 // When the pop operation removes the last global memory slot,
 // reclaim all outstanding global memory slots since it is
 // likely we have reached the end of the kernel.
 EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
   ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");

   __threadfence_block();

   if (GetThreadIdInBlock() % WARPSIZE == 0) {
     unsigned WID = getWarpId();

     // Current slot
     __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];

     // Pointer to next available stack.
     void *&StackP = DataSharingState.StackPtr[WID];

     // Pop the frame.
     StackP = FrameStart;

     // If the current slot is empty, we need to free the slot after the
     // pop.
     bool SlotEmpty = (StackP == &SlotP->Data[0]);

     if (SlotEmpty && SlotP->Prev) {
       // Before removing the slot we need to reset StackP.
       StackP = SlotP->PrevSlotStackPtr;

       // Remove the slot.
       SlotP = SlotP->Prev;
       SafeFree(SlotP->Next, "Free slot.");
       SlotP->Next = 0;
     }
   }
 }

 // Begin a data sharing context. Maintain a list of references to shared
 // variables. This list of references to shared variables will be passed
 // to one or more threads.
 // In L0 data sharing this is called by master thread.
 // In L1 data sharing this is called by active warp master thread.
 EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
   omptarget_nvptx_globalArgs.EnsureSize(nArgs);
   *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
 }

 // End a data sharing context. There is no need to have a list of refs
 // to shared variables because the context in which those variables were
 // shared has now ended. This should clean-up the list of references only
 // without affecting the actual global storage of the variables.
 // In L0 data sharing this is called by master thread.
 // In L1 data sharing this is called by active warp master thread.
 EXTERN void __kmpc_end_sharing_variables() {
   omptarget_nvptx_globalArgs.DeInit();
 }

 // This function will return a list of references to global variables. This
 // is how the workers will get a reference to the globalized variable. The
 // members of this list will be passed to the outlined parallel function
 // preserving the order.
 // Called by all workers.
 EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
   *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
 }

 // This function is used to init static memory manager. This manager is used to
 // manage statically allocated global memory. This memory is allocated by the
 // compiler and used to correctly implement globalization of the variables in
 // target, teams and distribute regions.
 EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
                                           const void *buf, size_t size,
                                           int16_t is_shared,
                                           const void **frame) {
   if (is_shared) {
     *frame = buf;
     return;
   }
   if (isSPMDExecutionMode) {
     if (GetThreadIdInBlock() == 0) {
       *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
     }
     // FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
     __SYNCTHREADS();
     return;
   }
   ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
           "Must be called only in the target master thread.");
   *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
   __threadfence();
 }

 EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
                                               int16_t is_shared) {
   if (is_shared)
     return;
   if (isSPMDExecutionMode) {
     // FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
     __SYNCTHREADS();
     if (GetThreadIdInBlock() == 0) {
       omptarget_nvptx_simpleMemoryManager.Release();
     }
     return;
   }
   __threadfence();
   ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
           "Must be called only in the target master thread.");
   omptarget_nvptx_simpleMemoryManager.Release();
 }