[OpenMP][libomptarget] Enable multiple frames per global memory slot

Summary: To save on calls to malloc, this patch enables the re-use of pre-allocated global memory slots.

Reviewers: ABataev, grokos, carlo.bertolli, caomhin

Reviewed By: grokos

Subscribers: guansong, openmp-commits

Differential Revision: https://reviews.llvm.org/D44470

git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@327637 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
index cd73a6b..41976f6 100644
--- a/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -341,7 +341,17 @@
   __kmpc_data_sharing_slot *RootS = teamDescr->RootS(WID);
 
   DataSharingState.SlotPtr[WID] = RootS;
-  DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
+  DataSharingState.TailPtr[WID] = RootS;
+
+  // Initialize the stack pointer to be equal to the end of
+  // the shared memory slot. This way we ensure that the global
+  // version of the stack will be used.
+  // TODO: remove this:
+  DataSharingState.StackPtr[WID] = RootS->DataEnd;
+
+  // TODO: When the use of shared memory is enabled we will have to
+  // initialize this with the start of the Data region like so:
+  // DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
 
   // We initialize the list of references to arguments here.
   omptarget_nvptx_globalArgs.Init();
@@ -355,12 +365,8 @@
 // UseSharedMemory is set to true, the runtime will attempt to use shared memory
 // as long as the size requested fits the pre-allocated size.
 //
-// TODO: allow more than one push per slot to save on calls to malloc.
-// Currently there is only one slot for each push so the data size in the slot
-// is the same size as the size being requested.
-//
 // Called by: master, TODO: call by workers
-EXTERN void* __kmpc_data_sharing_push_stack(size_t size,
+EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
     int16_t UseSharedMemory) {
   // TODO: Add shared memory support. For now, use global memory only for
   // storing the data sharing slots so ignore the pre-allocated
@@ -374,39 +380,85 @@
     // global memory slot.
     __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
     __kmpc_data_sharing_slot *&TailSlotP = DataSharingState.TailPtr[WID];
+    void *&StackP = DataSharingState.StackPtr[WID];
+    void *FrameP = 0;
 
-    // The slot for holding the data we are pushing.
-    __kmpc_data_sharing_slot *NewSlot = 0;
-    size_t NewSize = size;
+    // Check if we have room for the data in the current slot.
+    const uintptr_t StartAddress = (uintptr_t)StackP;
+    const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd;
+    const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)DataSize;
 
-    // Check if there is a next slot.
-    if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) {
-      // Attempt to re-use an existing slot provided the data fits in the slot.
-      // The leftover data space will not be used.
-      ptrdiff_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
-                                   (uintptr_t)(&ExistingSlot->Data[0]);
-      if (ExistingSlotSize >= NewSize)
-        NewSlot = ExistingSlot;
-      else
-        free(ExistingSlot);
+    // If we requested more data than there is room for in the rest
+    // of the slot then we need to either re-use the next slot, if one exists,
+    // or create a new slot.
+    if (EndAddress < RequestedEndAddress) {
+      size_t NewSize = DataSize;
+
+      // The new or reused slot for holding the data being pushed.
+      __kmpc_data_sharing_slot *NewSlot = 0;
+
+      // Check if there is a next slot.
+      if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) {
+        // Attempt to reuse an existing slot provided the data fits in the slot.
+        // The leftover data space will not be used.
+        ptrdiff_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
+                                     (uintptr_t)(&ExistingSlot->Data[0]);
+
+        // Try to add the data in the next available slot. Search for a slot
+        // with enough space.
+        while (ExistingSlotSize < NewSize) {
+          SlotP->Next = ExistingSlot->Next;
+          SlotP->Next->Prev = ExistingSlot->Prev;
+          free(ExistingSlot);
+          ExistingSlot = SlotP->Next;
+          if (!ExistingSlot)
+            break;
+          ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
+                             (uintptr_t)(&ExistingSlot->Data[0]);
+        }
+
+        // Check if a slot has been found.
+        if (ExistingSlotSize >= NewSize) {
+          NewSlot = ExistingSlot;
+          NewSlot->PrevSlotStackPtr = StackP;
+        }
+      }
+
+      if (!NewSlot) {
+        // Allocate at least the default size.
+        // TODO: generalize this for workers which need a larger data slot
+        // i.e. using DS_Worker_Warp_Slot_Size.
+        if (DS_Slot_Size > DataSize)
+          NewSize = DS_Slot_Size;
+        NewSlot = (__kmpc_data_sharing_slot *)malloc(
+            sizeof(__kmpc_data_sharing_slot) + NewSize);
+        NewSlot->Next = 0;
+        NewSlot->Prev = SlotP;
+        NewSlot->PrevSlotStackPtr = StackP;
+        NewSlot->DataEnd = &NewSlot->Data[NewSize];
+
+        // Newly allocated slots are also tail slots.
+        TailSlotP = NewSlot;
+
+        // Make previous slot point to the newly allocated slot.
+        SlotP->Next = NewSlot;
+      }
+
+      // The current slot becomes the new slot.
+      SlotP = NewSlot;
+      // The stack pointer always points to the next free stack frame.
+      StackP = &NewSlot->Data[DataSize];
+      // The frame pointer always points to the beginning of the frame.
+      FrameP = &NewSlot->Data[0];
+    } else {
+      // Add the data chunk to the current slot. The frame pointer is set to
+      // point to the start of the new frame held in StackP.
+      FrameP = StackP;
+      // Reset stack pointer to the requested address.
+      StackP = (void *)RequestedEndAddress;
     }
 
-    if (!NewSlot) {
-      NewSlot = (__kmpc_data_sharing_slot *)malloc(
-          sizeof(__kmpc_data_sharing_slot) + NewSize);
-      NewSlot->Next = 0;
-      NewSlot->Prev = SlotP;
-
-      // This is the last slot, save it.
-      TailSlotP = NewSlot;
-    }
-
-    NewSlot->DataEnd = &NewSlot->Data[NewSize];
-
-    SlotP->Next = NewSlot;
-    SlotP = NewSlot;
-
-    return (void*)&SlotP->Data[0];
+    return FrameP;
   }
 
   // TODO: add memory fence here when this function can be called by
@@ -422,26 +474,43 @@
 // When the pop operation removes the last global memory slot,
 // reclaim all outstanding global memory slots since it is
 // likely we have reached the end of the kernel.
-EXTERN void __kmpc_data_sharing_pop_stack(void *a) {
+EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
   if (IsMasterThread()) {
     unsigned WID = getWarpId();
 
-    __kmpc_data_sharing_slot *S = DataSharingState.SlotPtr[WID];
+    __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
+    void *&StackP = DataSharingState.StackPtr[WID];
 
-    if (S->Prev)
-      S = S->Prev;
-
-    // If this will "pop" the last global memory node then it is likely
-    // that we are at the end of the data sharing region and we can
-    // de-allocate any existing global memory slots.
-    if (!S->Prev) {
-      __kmpc_data_sharing_slot *Tail = DataSharingState.TailPtr[WID];
-
-      while(Tail && Tail->Prev) {
-        Tail = Tail->Prev;
-        free(Tail->Next);
-        Tail->Next=0;
+    // If we try to pop the last frame of the current slot we need to
+    // move to the previous slot if there is one.
+    const uintptr_t StartAddress = (uintptr_t)FrameStart;
+    if (StartAddress == (uintptr_t)&SlotP->Data[0]) {
+      if (SlotP->Prev) {
+        // The new stack pointer is the end of the data field of the
+        // previous slot. This will allow the stack pointer to be
+        // used in the computation of the remaining data space in
+        // the current slot.
+        StackP = SlotP->PrevSlotStackPtr;
+        // Reset SlotP to previous slot.
+        SlotP = SlotP->Prev;
       }
+
+      // If this will "pop" the last global memory node then it is likely
+      // that we are at the end of the data sharing region and we can
+      // de-allocate any existing global memory slots.
+      if (!SlotP->Prev) {
+        __kmpc_data_sharing_slot *Tail = DataSharingState.TailPtr[WID];
+
+        while(Tail && Tail->Prev) {
+          Tail = Tail->Prev;
+          free(Tail->Next);
+          Tail->Next=0;
+        }
+      }
+    } else {
+      // This is not the last frame popped from this slot.
+      // Reset StackP
+      StackP = FrameStart;
     }
 
     return;
diff --git a/libomptarget/deviceRTLs/nvptx/src/interface.h b/libomptarget/deviceRTLs/nvptx/src/interface.h
index 34e33d1..84f6ec6 100644
--- a/libomptarget/deviceRTLs/nvptx/src/interface.h
+++ b/libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -497,6 +497,7 @@
 struct __kmpc_data_sharing_slot {
   __kmpc_data_sharing_slot *Next;
   __kmpc_data_sharing_slot *Prev;
+  void *PrevSlotStackPtr;
   void *DataEnd;
   char Data[];
 };
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index 4276f02..8f4f1cd 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -129,6 +129,7 @@
 struct __kmpc_data_sharing_worker_slot_static {
   __kmpc_data_sharing_slot *Next;
   __kmpc_data_sharing_slot *Prev;
+  void *PrevSlotStackPtr;
   void *DataEnd;
   char Data[DS_Worker_Warp_Slot_Size];
 };
@@ -137,6 +138,7 @@
 struct __kmpc_data_sharing_master_slot_static {
   __kmpc_data_sharing_slot *Next;
   __kmpc_data_sharing_slot *Prev;
+  void *PrevSlotStackPtr;
   void *DataEnd;
   char Data[DS_Slot_Size];
 };
@@ -267,6 +269,7 @@
       // We currently do not have a next slot.
       master_rootS[0].Next = 0;
       master_rootS[0].Prev = 0;
+      master_rootS[0].PrevSlotStackPtr = 0;
       return (__kmpc_data_sharing_slot *)&master_rootS[0];
     }
     // Initialize the pointer to the end of the slot given the size of the data
@@ -276,6 +279,7 @@
     // We currently do not have a next slot.
     worker_rootS[wid].Next = 0;
     worker_rootS[wid].Prev = 0;
+    worker_rootS[wid].PrevSlotStackPtr = 0;
     return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
   }