[OpenMP] Add support for dynamic shared memory in new RTL

This patch adds support for using dynamic shared memory in the new
device runtime. The new function `__kmpc_get_dynamic_shared` will return a
pointer to the buffer of dynamic shared memory. Currently the amount of memory
allocated is set by an environment variable.

In the future this amount will be added to the amount used for the smart stack
which will be configured in a similar way.

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D110006
diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst
index b8b8da8..2d8bd024 100644
--- a/openmp/docs/design/Runtimes.rst
+++ b/openmp/docs/design/Runtimes.rst
@@ -32,6 +32,7 @@
     * ``LIBOMPTARGET_INFO=<Num>``
     * ``LIBOMPTARGET_HEAP_SIZE=<Num>``
     * ``LIBOMPTARGET_STACK_SIZE=<Num>``
+    * ``LIBOMPTARGET_SHARED_MEMORY_SIZE=<Num>``
 
 LIBOMPTARGET_DEBUG
 """"""""""""""""""
@@ -338,6 +339,14 @@
 for some applications that allocate too much memory either through the user or
 globalization.
 
+LIBOMPTARGET_SHARED_MEMORY_SIZE
+"""""""""""""""""""""""""""""""
+
+This environment variable sets the amount of dynamic shared memory in bytes used 
+by the kernel once it is launched. A pointer to the dynamic memory buffer can 
+currently only be accessed using the ``__kmpc_get_dynamic_shared`` device 
+runtime call.
+
 .. toctree::
    :hidden:
    :maxdepth: 1
diff --git a/openmp/libomptarget/DeviceRTL/include/Configuration.h b/openmp/libomptarget/DeviceRTL/include/Configuration.h
index 11aa548..97e9449 100644
--- a/openmp/libomptarget/DeviceRTL/include/Configuration.h
+++ b/openmp/libomptarget/DeviceRTL/include/Configuration.h
@@ -31,6 +31,9 @@
 /// Return the user choosen debug level.
 uint32_t getDebugLevel();
 
+/// Return the amount of dynamic shared memory that was allocated at launch.
+uint64_t getDynamicMemorySize();
+
 bool isDebugMode(DebugLevel Level);
 
 } // namespace config
diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h
index 9ed396d..1a8471a 100644
--- a/openmp/libomptarget/DeviceRTL/include/Interface.h
+++ b/openmp/libomptarget/DeviceRTL/include/Interface.h
@@ -174,6 +174,10 @@
 /// allocated by __kmpc_alloc_shared by the same thread.
 void __kmpc_free_shared(void *Ptr, uint64_t Bytes);
 
+/// Get a pointer to the memory buffer containing dynamically allocated shared
+/// memory configured at launch.
+void *__kmpc_get_dynamic_shared();
+
 /// Allocate sufficient space for \p NumArgs sequential `void*` and store the
 /// allocation address in \p GlobalArgs.
 ///
diff --git a/openmp/libomptarget/DeviceRTL/include/State.h b/openmp/libomptarget/DeviceRTL/include/State.h
index 63e0923..c860bd1 100644
--- a/openmp/libomptarget/DeviceRTL/include/State.h
+++ b/openmp/libomptarget/DeviceRTL/include/State.h
@@ -188,6 +188,9 @@
 /// Alloca \p Size bytes in global memory, if possible, for \p Reason.
 void *allocGlobal(uint64_t Size, const char *Reason);
 
+/// Return a pointer to the dynamic shared memory buffer.
+void *getDynamicBuffer();
+
 /// Free \p Ptr, alloated via allocGlobal, for \p Reason.
 void freeGlobal(void *Ptr, const char *Reason);
 
diff --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
index 4e48599..dc30707 100644
--- a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
@@ -21,6 +21,7 @@
   uint32_t DebugLevel;
   uint32_t NumDevices;
   uint32_t DeviceNum;
+  uint64_t DynamicMemSize;
 };
 
 #pragma omp declare target
@@ -43,6 +44,10 @@
   return omptarget_device_environment.DeviceNum;
 }
 
+uint64_t config::getDynamicMemorySize() {
+  return omptarget_device_environment.DynamicMemSize;
+}
+
 bool config::isDebugMode(config::DebugLevel Level) {
   return config::getDebugLevel() > Level;
 }
diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp
index 2e00a6e..e336155 100644
--- a/openmp/libomptarget/DeviceRTL/src/State.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -25,6 +25,13 @@
 ///
 ///{
 
+/// Add worst-case padding so that future allocations are properly aligned.
+constexpr const uint32_t Alignment = 8;
+
+/// External symbol to access dynamic shared memory.
+extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
+#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
+
 namespace {
 
 /// Fallback implementations are missing to trigger a link time error.
@@ -57,9 +64,6 @@
 #pragma omp end declare variant
 ///}
 
-/// Add worst-case padding so that future allocations are properly aligned.
-constexpr const uint32_t Alignment = 8;
-
 /// A "smart" stack in shared memory.
 ///
 /// The stack exposes a malloc/free interface but works like a stack internally.
@@ -147,6 +151,8 @@
 
 } // namespace
 
+void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
+
 void *memory::allocShared(uint64_t Bytes, const char *Reason) {
   return SharedMemorySmartStack.push(Bytes);
 }
@@ -497,6 +503,10 @@
   memory::freeShared(Ptr, Bytes, "Frontend free shared");
 }
 
+__attribute__((noinline)) void *__kmpc_get_dynamic_shared() {
+  return memory::getDynamicBuffer();
+}
+
 /// Allocate storage in shared memory to communicate arguments from the main
 /// thread to the workers in generic mode. If we exceed
 /// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
index f3a810c..aaff0d3 100644
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -103,6 +103,7 @@
   int32_t debug_level;
   uint32_t num_devices;
   uint32_t device_num;
+  uint64_t dynamic_shared_size;
 };
 
 namespace {
@@ -346,6 +347,8 @@
   int EnvTeamThreadLimit;
   // OpenMP requires flags
   int64_t RequiresFlags;
+  // Amount of dynamic shared memory to use at launch.
+  uint64_t DynamicMemorySize;
 
   static constexpr const int HardTeamLimit = 1U << 16U; // 64k
   static constexpr const int HardThreadLimit = 1024;
@@ -499,7 +502,8 @@
 
   DeviceRTLTy()
       : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
-        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED) {
+        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
+        DynamicMemorySize(0) {
 
     DP("Start initializing CUDA\n");
 
@@ -540,6 +544,11 @@
       EnvNumTeams = std::stoi(EnvStr);
       DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
     }
+    if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
+      // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
+      DynamicMemorySize = std::stoi(EnvStr);
+      DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE", DynamicMemorySize);
+    }
 
     StreamManager =
         std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
@@ -904,7 +913,7 @@
       // TODO: The device ID used here is not the real device ID used by OpenMP.
       omptarget_device_environmentTy DeviceEnv{
           0, static_cast<uint32_t>(NumberOfDevices),
-          static_cast<uint32_t>(DeviceId)};
+          static_cast<uint32_t>(DeviceId), DynamicMemorySize};
 
 #ifdef OMPTARGET_DEBUG
       if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
@@ -1190,7 +1199,7 @@
     Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                          /* gridDimZ */ 1, CudaThreadsPerBlock,
                          /* blockDimY */ 1, /* blockDimZ */ 1,
-                         /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
+                         DynamicMemorySize, Stream, &Args[0], nullptr);
     if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
       return OFFLOAD_FAIL;
 
diff --git a/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c b/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c
new file mode 100644
index 0000000..9a74ed3
--- /dev/null
+++ b/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c
@@ -0,0 +1,31 @@
+// RUN: %libomptarget-compile-nvptx64-nvidia-cuda -fopenmp-target-new-runtime
+// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=4 \
+// RUN:   %libomptarget-run-nvptx64-nvidia-cuda | %fcheck-nvptx64-nvidia-cuda
+// REQUIRES: nvptx64-nvidia-cuda
+
+#include <omp.h>
+#include <stdio.h>
+
+void *get_dynamic_shared() { return NULL; }
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+extern void *__kmpc_get_dynamic_shared();
+void *get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
+#pragma omp end declare variant
+
+int main() {
+  int x;
+#pragma omp target parallel map(from : x)
+  {
+    int *buf = get_dynamic_shared();
+#pragma omp barrier
+    if (omp_get_thread_num() == 0)
+      *buf = 1;
+#pragma omp barrier
+    if (omp_get_thread_num() == 1)
+      x = *buf;
+  }
+
+  // CHECK: PASS
+  if (x == 1)
+    printf("PASS\n");
+}