[OPENMP][NVPTX]Fixed initialization of the data-sharing interface.

Summary:
Avoid using the atomic loop to wait for the completion of the
data-sharing interface initialization; instead, use __shfl_sync for
communication within the warp to signal the other threads in the warp
that the initialization has completed.

Reviewers: gtbercea, kkwli0, grokos

Subscribers: guansong, jfb, caomhin, openmp-commits

Differential Revision: https://reviews.llvm.org/D56100

git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@350129 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
index f69daa1..9bd5cab 100644
--- a/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -390,8 +390,9 @@
   PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;
 
   // Frame pointer must be visible to all workers in the same warp.
-  unsigned WID = getWarpId();
-  void *volatile &FrameP = DataSharingState.FramePtr[WID];
+  const unsigned WID = getWarpId();
+  void *FrameP = 0;
+  const int32_t CurActive = getActiveThreadsMask();
 
   if (IsWarpMaster) {
     // SlotP will point to either the shared memory slot or an existing
@@ -434,17 +435,19 @@
       // The stack pointer always points to the next free stack frame.
       StackP = &NewSlot->Data[0] + PushSize;
       // The frame pointer always points to the beginning of the frame.
-      FrameP = &NewSlot->Data[0];
+      FrameP = DataSharingState.FramePtr[WID] = &NewSlot->Data[0];
     } else {
       // Add the data chunk to the current slot. The frame pointer is set to
       // point to the start of the new frame held in StackP.
-      FrameP = StackP;
+      FrameP = DataSharingState.FramePtr[WID] = StackP;
       // Reset stack pointer to the requested address.
       StackP = (void *)RequestedEndAddress;
     }
-  } else {
-    while (!FrameP);
   }
+  // Get address from lane 0.
+  ((int *)&FrameP)[0] = __SHFL_SYNC(CurActive, ((int *)&FrameP)[0], 0);
+  if (sizeof(FrameP) == 8)
+    ((int *)&FrameP)[1] = __SHFL_SYNC(CurActive, ((int *)&FrameP)[1], 0);
 
   return FrameP;
 }