[OpenMP] Remove implicit data sharing using device shared memory from libomptarget

Summary:
This patch reverts the changes to libomptarget that were coupled with the changes to Clang code gen for data sharing using shared memory. A similar patch exists for Clang: D43625

Shared memory is meant to be used as an optimization on top of a more general scheme. So far we didn't have a global memory implementation ready so shared memory was a solution which applied to the current level of OpenMP complexity supported by trunk on GPU devices (due to the missing NVPTX backend patch this functionality has never been exercised). Now that we have a global memory solution this patch is "in the way" and needs to be removed (for now). This patch (or an equivalent version of it) will be put out for review once the global memory scheme is in place.


Reviewers: ABataev, grokos, carlo.bertolli, caomhin

Reviewed By: grokos

Subscribers: Hahnfeld, guansong, openmp-commits

Differential Revision: https://reviews.llvm.org/D43626

git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@326950 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/libomptarget/deviceRTLs/nvptx/src/interface.h b/libomptarget/deviceRTLs/nvptx/src/interface.h
index 4005353..f314443 100644
--- a/libomptarget/deviceRTLs/nvptx/src/interface.h
+++ b/libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -470,9 +470,8 @@
                                     int16_t RequiresDataSharing);
 EXTERN void __kmpc_spmd_kernel_deinit();
 EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
-                                           void ***SharedArgs, int32_t nArgs,
                                            int16_t IsOMPRuntimeInitialized);
-EXTERN bool __kmpc_kernel_parallel(void **WorkFn, void ***SharedArgs,
+EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
                                    int16_t IsOMPRuntimeInitialized);
 EXTERN void __kmpc_kernel_end_parallel();
 EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
diff --git a/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
index 8ff8f7b..adedc0b 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
@@ -46,8 +46,3 @@
 // Scratchpad for teams reduction.
 ////////////////////////////////////////////////////////////////////////////////
 __device__ __shared__ void *ReductionScratchpadPtr;
-
-////////////////////////////////////////////////////////////////////////////////
-// Data sharing related variables.
-////////////////////////////////////////////////////////////////////////////////
-__device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_sharedArgs;
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
index fca2aa0..5c5c88b 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -54,9 +54,6 @@
   PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n",
         OMPTARGET_NVPTX_VERSION);
 
-  // init parallel work arguments
-  omptarget_nvptx_sharedArgs.Init();
-
   if (!RequiresOMPRuntime) {
     // If OMP runtime is not required don't initialize OMP state.
     setExecutionParameters(Generic, RuntimeUninitialized);
@@ -110,9 +107,6 @@
   }
   // Done with work.  Kill the workers.
   omptarget_nvptx_workFn = 0;
-
-  // Deinit parallel work arguments
-  omptarget_nvptx_sharedArgs.DeInit();
 }
 
 EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index 43b8dd2..3ee32f9 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -62,46 +62,6 @@
 #define __ACTIVEMASK() __ballot(1)
 #endif
 
-// arguments needed for L0 parallelism only.
-class omptarget_nvptx_SharedArgs {
-public:
-  // All these methods must be called by the master thread only.
-  INLINE void Init() {
-    args  = buffer;
-    nArgs = MAX_SHARED_ARGS;
-  }
-  INLINE void DeInit() {
-    // Free any memory allocated for outlined parallel function with a large
-    // number of arguments.
-    if (nArgs > MAX_SHARED_ARGS) {
-      SafeFree(args, (char *)"new extended args");
-      Init();
-    }
-  }
-  INLINE void EnsureSize(int size) {
-    if (size > nArgs) {
-      if (nArgs > MAX_SHARED_ARGS) {
-        SafeFree(args, (char *)"new extended args");
-      }
-      args = (void **) SafeMalloc(size * sizeof(void *),
-                                  (char *)"new extended args");
-      nArgs = size;
-    }
-  }
-  // Called by all threads.
-  INLINE void **GetArgs() { return args; };
-private:
-  // buffer of pre-allocated arguments.
-  void *buffer[MAX_SHARED_ARGS];
-  // pointer to arguments buffer.
-  // starts off as a pointer to 'buffer' but can be dynamically allocated.
-  void **args;
-  // starts off as MAX_SHARED_ARGS but can increase in size.
-  uint32_t nArgs;
-};
-
-extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_sharedArgs;
-
 // Data sharing related quantities, need to match what is used in the compiler.
 enum DATA_SHARING_SIZES {
   // The maximum number of workers in a kernel.
diff --git a/libomptarget/deviceRTLs/nvptx/src/option.h b/libomptarget/deviceRTLs/nvptx/src/option.h
index 43172ad..c5f5d9c 100644
--- a/libomptarget/deviceRTLs/nvptx/src/option.h
+++ b/libomptarget/deviceRTLs/nvptx/src/option.h
@@ -28,10 +28,6 @@
 // region to synchronize with each other.
 #define L1_BARRIER (1)
 
-// Maximum number of preallocated arguments to an outlined parallel/simd function.
-// Anything more requires dynamic memory allocation.
-#define MAX_SHARED_ARGS 20
-
 // Maximum number of omp state objects per SM allocated statically in global
 // memory.
 #if __CUDA_ARCH__ >= 600
diff --git a/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/libomptarget/deviceRTLs/nvptx/src/parallel.cu
index 0b9ebd4..d454628 100644
--- a/libomptarget/deviceRTLs/nvptx/src/parallel.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -214,16 +214,10 @@
 //
 // This routine is always called by the team master..
 EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
-                                           void ***SharedArgs, int32_t nArgs,
                                            int16_t IsOMPRuntimeInitialized) {
   PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");
   omptarget_nvptx_workFn = WorkFn;
 
-  if (nArgs > 0) {
-    omptarget_nvptx_sharedArgs.EnsureSize(nArgs);
-    *SharedArgs = omptarget_nvptx_sharedArgs.GetArgs();
-  }
-
   if (!IsOMPRuntimeInitialized)
     return;
 
@@ -323,13 +317,11 @@
 //
 // Only the worker threads call this routine.
 EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
-                                   void ***SharedArgs,
                                    int16_t IsOMPRuntimeInitialized) {
   PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");
 
   // Work function and arguments for L1 parallel region.
   *WorkFn = omptarget_nvptx_workFn;
-  *SharedArgs = omptarget_nvptx_sharedArgs.GetArgs();
 
   if (!IsOMPRuntimeInitialized)
     return true;