[OpenMP][libomptarget] Add global memory data sharing support for master-worker sharing.

Summary:
This patch adds support for the sharing of variables from the master thread of a team to the worker threads of the team.
The runtime uses a stack structure implemented as a doubly-linked list of slots with each slot having the exact same size as the size requested. This implementation leverages existing data structures. The runtime functions are added as separate functions to avoid interfering with the current interface. 

Limitations to be addressed in future patches:
- This current patch only employs global memory. In a future patch we will enable to usage for shared memory as an optimization.
- Allow the allocation of several requested sizes in the same slot.

Reviewers: ABataev, grokos, caomhin, carlo.bertolli

Reviewed By: grokos

Subscribers: Hahnfeld, guansong, openmp-commits

Differential Revision: https://reviews.llvm.org/D44260

git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@327440 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
index 65ba1f5..e2a38e3 100644
--- a/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -322,3 +322,162 @@
   DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
   return P;
 }
+
+////////////////////////////////////////////////////////////////////////////////
+// Runtime functions for trunk data sharing scheme.
+////////////////////////////////////////////////////////////////////////////////
+
+// Initialize data sharing data structure. This function needs to be called
+// once at the beginning of a data sharing context (coincides with the kernel
+// initialization).
+EXTERN void __kmpc_data_sharing_init_stack() {
+  // This function initializes the stack pointer with the pointer to the
+  // statically allocated shared memory slots. The size of a shared memory
+  // slot is pre-determined to be 256 bytes.
+  unsigned WID = getWarpId();
+  omptarget_nvptx_TeamDescr *teamDescr =
+      &omptarget_nvptx_threadPrivateContext->TeamContext();
+  __kmpc_data_sharing_slot *RootS = teamDescr->RootS(WID);
+
+  DataSharingState.SlotPtr[WID] = RootS;
+  DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
+
+  // We initialize the list of references to arguments here.
+  omptarget_nvptx_globalArgs.Init();
+}
+
+// Called at the time of the kernel initialization. This is used to initilize
+// the list of references to shared variables and to pre-allocate global storage
+// for holding the globalized variables.
+//
+// By default the globalized variables are stored in global memory. If the
+// UseSharedMemory is set to true, the runtime will attempt to use shared memory
+// as long as the size requested fits the pre-allocated size.
+//
+// TODO: allow more than one push per slot to save on calls to malloc.
+// Currently there is only one slot for each push so the data size in the slot
+// is the same size as the size being requested.
+//
+// Called by: master, TODO: call by workers
+EXTERN void* __kmpc_data_sharing_push_stack(size_t size,
+    int16_t UseSharedMemory) {
+  // TODO: Add shared memory support. For now, use global memory only for
+  // storing the data sharing slots so ignore the pre-allocated
+  // shared memory slot.
+
+  // Use global memory for storing the stack.
+  if (IsMasterThread()) {
+    unsigned WID = getWarpId();
+
+    // SlotP will point to either the shared memory slot or an existing
+    // global memory slot.
+    __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
+    __kmpc_data_sharing_slot *&TailSlotP = DataSharingState.TailPtr[WID];
+
+    // The slot for holding the data we are pushing.
+    __kmpc_data_sharing_slot *NewSlot = 0;
+    size_t NewSize = size;
+
+    // Check if there is a next slot.
+    if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) {
+      // Attempt to re-use an existing slot provided the data fits in the slot.
+      // The leftover data space will not be used.
+      ptrdiff_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
+                                   (uintptr_t)(&ExistingSlot->Data[0]);
+      if (ExistingSlotSize >= NewSize)
+        NewSlot = ExistingSlot;
+      else
+        free(ExistingSlot);
+    }
+
+    if (!NewSlot) {
+      NewSlot = (__kmpc_data_sharing_slot *)malloc(
+          sizeof(__kmpc_data_sharing_slot) + NewSize);
+      NewSlot->Next = 0;
+      NewSlot->Prev = SlotP;
+
+      // This is the last slot, save it.
+      TailSlotP = NewSlot;
+    }
+
+    NewSlot->DataEnd = &NewSlot->Data[NewSize];
+
+    SlotP->Next = NewSlot;
+    SlotP = NewSlot;
+
+    return (void*)&SlotP->Data[0];
+  }
+
+  // TODO: add memory fence here when this function can be called by
+  // worker threads also. For now, this function is only called by the
+  // master thread of each team.
+
+  // TODO: implement sharing across workers.
+  return 0;
+}
+
+// Pop the stack and free any memory which can be reclaimed.
+//
+// When the pop operation removes the last global memory slot,
+// reclaim all outstanding global memory slots since it is
+// likely we have reached the end of the kernel.
+EXTERN void __kmpc_data_sharing_pop_stack(void *a) {
+  if (IsMasterThread()) {
+    unsigned WID = getWarpId();
+
+    __kmpc_data_sharing_slot *S = DataSharingState.SlotPtr[WID];
+
+    if (S->Prev)
+      S = S->Prev;
+
+    // If this will "pop" the last global memory node then it is likely
+    // that we are at the end of the data sharing region and we can
+    // de-allocate any existing global memory slots.
+    if (!S->Prev) {
+      __kmpc_data_sharing_slot *Tail = DataSharingState.TailPtr[WID];
+
+      while(Tail && Tail->Prev) {
+        Tail = Tail->Prev;
+        free(Tail->Next);
+        Tail->Next=0;
+      }
+    }
+
+    return;
+  }
+
+  // TODO: add memory fence here when this function can be called by
+  // worker threads also. For now, this function is only called by the
+  // master thread of each team.
+
+  // TODO: implement sharing across workers.
+}
+
+// Begin a data sharing context. Maintain a list of references to shared
+// variables. This list of references to shared variables will be passed
+// to one or more threads.
+// In L0 data sharing this is called by master thread.
+// In L1 data sharing this is called by active warp master thread.
+EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
+  omptarget_nvptx_globalArgs.EnsureSize(nArgs);
+  *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
+}
+
+// End a data sharing context. There is no need to have a list of refs
+// to shared variables because the context in which those variables were
+// shared has now ended. This should clean-up the list of references only
+// without affecting the actual global storage of the variables.
+// In L0 data sharing this is called by master thread.
+// In L1 data sharing this is called by active warp master thread.
+EXTERN void __kmpc_end_sharing_variables() {
+  omptarget_nvptx_globalArgs.DeInit();
+}
+
+// This function will return a list of references to global variables. This
+// is how the workers will get a reference to the globalized variable. The
+// members of this list will be passed to the outlined parallel function
+// preserving the order.
+// Called by all workers.
+EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
+  *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
+}
diff --git a/libomptarget/deviceRTLs/nvptx/src/interface.h b/libomptarget/deviceRTLs/nvptx/src/interface.h
index f314443..34e33d1 100644
--- a/libomptarget/deviceRTLs/nvptx/src/interface.h
+++ b/libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -483,11 +483,20 @@
                                           int32_t *LaneId, int32_t *NumLanes);
 EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer);
 
+
+EXTERN void __kmpc_data_sharing_init_stack();
+EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory);
+EXTERN void __kmpc_data_sharing_pop_stack(void *a);
+EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
+EXTERN void __kmpc_end_sharing_variables();
+EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs);
+
 // The slot used for data sharing by the master and worker threads. We use a
 // complete (default size version and an incomplete one so that we allow sizes
 // greater than the default).
 struct __kmpc_data_sharing_slot {
   __kmpc_data_sharing_slot *Next;
+  __kmpc_data_sharing_slot *Prev;
   void *DataEnd;
   char Data[];
 };
diff --git a/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
index adedc0b..33303e7 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
@@ -46,3 +46,8 @@
 // Scratchpad for teams reduction.
 ////////////////////////////////////////////////////////////////////////////////
 __device__ __shared__ void *ReductionScratchpadPtr;
+
+////////////////////////////////////////////////////////////////////////////////
+// Data sharing related variables.
+////////////////////////////////////////////////////////////////////////////////
+__device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index 9ceebfc..4276f02 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -62,6 +62,46 @@
 #define __ACTIVEMASK() __ballot(1)
 #endif
 
+// arguments needed for L0 parallelism only.
+class omptarget_nvptx_SharedArgs {
+public:
+  // All these methods must be called by the master thread only.
+  INLINE void Init() {
+    args  = buffer;
+    nArgs = MAX_SHARED_ARGS;
+  }
+  INLINE void DeInit() {
+    // Free any memory allocated for outlined parallel function with a large
+    // number of arguments.
+    if (nArgs > MAX_SHARED_ARGS) {
+      SafeFree(args, (char *)"new extended args");
+      Init();
+    }
+  }
+  INLINE void EnsureSize(size_t size) {
+    if (size > nArgs) {
+      if (nArgs > MAX_SHARED_ARGS) {
+        SafeFree(args, (char *)"new extended args");
+      }
+      args = (void **) SafeMalloc(size * sizeof(void *),
+                                  (char *)"new extended args");
+      nArgs = size;
+    }
+  }
+  // Called by all threads.
+  INLINE void **GetArgs() { return args; };
+private:
+  // buffer of pre-allocated arguments.
+  void *buffer[MAX_SHARED_ARGS];
+  // pointer to arguments buffer.
+  // starts off as a pointer to 'buffer' but can be dynamically allocated.
+  void **args;
+  // starts off as MAX_SHARED_ARGS but can increase in size.
+  uint32_t nArgs;
+};
+
+extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
+
 // Data sharing related quantities, need to match what is used in the compiler.
 enum DATA_SHARING_SIZES {
   // The maximum number of workers in a kernel.
@@ -80,6 +120,7 @@
 struct DataSharingStateTy {
   __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
   void *StackPtr[DS_Max_Warp_Number];
+  __kmpc_data_sharing_slot *TailPtr[DS_Max_Warp_Number];
   void *FramePtr[DS_Max_Warp_Number];
   int32_t ActiveThreads[DS_Max_Warp_Number];
 };
@@ -87,6 +128,7 @@
 // size of 4*32 bytes.
 struct __kmpc_data_sharing_worker_slot_static {
   __kmpc_data_sharing_slot *Next;
+  __kmpc_data_sharing_slot *Prev;
   void *DataEnd;
   char Data[DS_Worker_Warp_Slot_Size];
 };
@@ -94,6 +136,7 @@
 // size of 4 bytes.
 struct __kmpc_data_sharing_master_slot_static {
   __kmpc_data_sharing_slot *Next;
+  __kmpc_data_sharing_slot *Prev;
   void *DataEnd;
   char Data[DS_Slot_Size];
 };
@@ -223,6 +266,7 @@
       master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size;
       // We currently do not have a next slot.
       master_rootS[0].Next = 0;
+      master_rootS[0].Prev = 0;
       return (__kmpc_data_sharing_slot *)&master_rootS[0];
     }
     // Initialize the pointer to the end of the slot given the size of the data
@@ -231,6 +275,7 @@
         &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
     // We currently do not have a next slot.
     worker_rootS[wid].Next = 0;
+    worker_rootS[wid].Prev = 0;
     return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
   }
 
diff --git a/libomptarget/deviceRTLs/nvptx/src/option.h b/libomptarget/deviceRTLs/nvptx/src/option.h
index c5f5d9c..43172ad 100644
--- a/libomptarget/deviceRTLs/nvptx/src/option.h
+++ b/libomptarget/deviceRTLs/nvptx/src/option.h
@@ -28,6 +28,10 @@
 // region to synchronize with each other.
 #define L1_BARRIER (1)
 
+// Maximum number of preallocated arguments to an outlined parallel/simd function.
+// Anything more requires dynamic memory allocation.
+#define MAX_SHARED_ARGS 20
+
 // Maximum number of omp state objects per SM allocated statically in global
 // memory.
 #if __CUDA_ARCH__ >= 600