[OpenMP][libomptarget] Add support for close map modifier

Summary:
This patch adds support for the close map modifier.

The close map modifier will overwrite the unified shared memory requirement and create a device copy of the data.

Reviewers: ABataev, Hahnfeld, caomhin, grokos, jdoerfert, AlexEichenberger

Reviewed By: Hahnfeld, AlexEichenberger

Subscribers: guansong, openmp-commits

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D65340

git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@368488 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/libomptarget/include/omptarget.h b/libomptarget/include/omptarget.h
index ff6e85c..826d8ed 100644
--- a/libomptarget/include/omptarget.h
+++ b/libomptarget/include/omptarget.h
@@ -47,6 +47,8 @@
   OMP_TGT_MAPTYPE_LITERAL         = 0x100,
   // mapping is implicit
   OMP_TGT_MAPTYPE_IMPLICIT        = 0x200,
+  // copy data to device
+  OMP_TGT_MAPTYPE_CLOSE           = 0x400,
   // member of struct, member given by [16 MSBs] - 1
   OMP_TGT_MAPTYPE_MEMBER_OF       = 0xffff000000000000
 };
diff --git a/libomptarget/src/device.cpp b/libomptarget/src/device.cpp
index 718419d..cf7e0fe 100644
--- a/libomptarget/src/device.cpp
+++ b/libomptarget/src/device.cpp
@@ -158,7 +158,7 @@
 // to do an illegal mapping.
 void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase,
     int64_t Size, bool &IsNew, bool &IsHostPtr, bool IsImplicit,
-    bool UpdateRefCount) {
+    bool UpdateRefCount, bool HasCloseModifier) {
   void *rc = NULL;
   IsHostPtr = false;
   DataMapMtx.lock();
@@ -192,9 +192,9 @@
     // privatized use host address. Any explicitly mapped variables also use
     // host address where correctness is not impeded. In all other cases
     // maps are respected.
-    // TODO: In addition to the mapping rules above, when the close map
-    // modifier is implemented, foce the mapping of the variable to the device.
-    if (RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
+    // In addition to the mapping rules above, the close map
+    // modifier forces the mapping of the variable to the device.
+    if (RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && !HasCloseModifier) {
       DP("Return HstPtrBegin " DPxMOD " Size=%ld RefCount=%s\n",
          DPxPTR((uintptr_t)HstPtrBegin), Size, (UpdateRefCount ? " updated" : ""));
       IsHostPtr = true;
@@ -204,8 +204,8 @@
       IsNew = true;
       uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size, HstPtrBegin);
       DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", "
-          "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase),
-          DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp));
+         "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase),
+         DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp));
       HostDataToTargetMap.push_front(HostDataToTargetTy((uintptr_t)HstPtrBase,
           (uintptr_t)HstPtrBegin, (uintptr_t)HstPtrBegin + Size, tp));
       rc = (void *)tp;
@@ -269,8 +269,9 @@
   return NULL;
 }
 
-int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete) {
-  if (RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY)
+int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete,
+                            bool HasCloseModifier) {
+  if (RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && !HasCloseModifier)
     return OFFLOAD_SUCCESS;
   // Check if the pointer is contained in any sub-nodes.
   int rc;
diff --git a/libomptarget/src/device.h b/libomptarget/src/device.h
index cf79e94..d33512b 100644
--- a/libomptarget/src/device.h
+++ b/libomptarget/src/device.h
@@ -132,11 +132,13 @@
   long getMapEntryRefCnt(void *HstPtrBegin);
   LookupResult lookupMapping(void *HstPtrBegin, int64_t Size);
   void *getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, int64_t Size,
-      bool &IsNew, bool &IsHostPtr, bool IsImplicit, bool UpdateRefCount = true);
+      bool &IsNew, bool &IsHostPtr, bool IsImplicit, bool UpdateRefCount = true,
+      bool HasCloseModifier = false);
   void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size);
   void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
       bool UpdateRefCount, bool &IsHostPtr);
-  int deallocTgtPtr(void *TgtPtrBegin, int64_t Size, bool ForceDelete);
+  int deallocTgtPtr(void *TgtPtrBegin, int64_t Size, bool ForceDelete,
+                    bool HasCloseModifier = false);
   int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
   int disassociatePtr(void *HstPtrBegin);
 
diff --git a/libomptarget/src/omptarget.cpp b/libomptarget/src/omptarget.cpp
index 2f896a1..2feb7c8 100644
--- a/libomptarget/src/omptarget.cpp
+++ b/libomptarget/src/omptarget.cpp
@@ -244,6 +244,9 @@
     bool IsNew, Pointer_IsNew;
     bool IsHostPtr = false;
     bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
+    // Force the creation of a device side copy of the data when:
+    // a close map modifier was associated with a map that contained a to.
+    bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE;
     // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
     // have reached this point via __tgt_target_data_begin and not __tgt_target
     // then no argument is marked as TARGET_PARAM ("omp target data map" is not
@@ -254,7 +257,8 @@
       DP("Has a pointer entry: \n");
       // base is address of pointer.
       Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBase, HstPtrBase,
-          sizeof(void *), Pointer_IsNew, IsHostPtr, IsImplicit, UpdateRef);
+          sizeof(void *), Pointer_IsNew, IsHostPtr, IsImplicit, UpdateRef,
+          HasCloseModifier);
       if (!Pointer_TgtPtrBegin) {
         DP("Call to getOrAllocTgtPtr returned null pointer (device failure or "
             "illegal mapping).\n");
@@ -270,7 +274,7 @@
     }
 
     void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase,
-        data_size, IsNew, IsHostPtr, IsImplicit, UpdateRef);
+        data_size, IsNew, IsHostPtr, IsImplicit, UpdateRef, HasCloseModifier);
     if (!TgtPtrBegin && data_size) {
       // If data_size==0, then the argument could be a zero-length pointer to
       // NULL, so getOrAlloc() returning NULL is not an error.
@@ -290,7 +294,8 @@
 
     if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
       bool copy = false;
-      if (!(RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY)) {
+      if (!(RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) ||
+          HasCloseModifier) {
         if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) {
           copy = true;
         } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) {
@@ -370,6 +375,7 @@
     bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
         (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ);
     bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE;
+    bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE;
 
     // If PTR_AND_OBJ, HstPtrBegin is address of pointee
     void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, data_size, IsLast,
@@ -390,7 +396,8 @@
       if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
         bool Always = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS;
         bool CopyMember = false;
-        if (!(RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY)) {
+        if (!(RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) ||
+            HasCloseModifier) {
           if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
               !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
             // Copy data only if the "parent" struct has RefCount==1.
@@ -455,7 +462,8 @@
 
       // Deallocate map
       if (DelEntry) {
-        int rt = Device.deallocTgtPtr(HstPtrBegin, data_size, ForceDelete);
+        int rt = Device.deallocTgtPtr(HstPtrBegin, data_size, ForceDelete,
+                                      HasCloseModifier);
         if (rt != OFFLOAD_SUCCESS) {
           DP("Deallocating data from device failed.\n");
           return OFFLOAD_FAIL;
diff --git a/libomptarget/test/unified_shared_memory/close_enter_exit.c b/libomptarget/test/unified_shared_memory/close_enter_exit.c
new file mode 100644
index 0000000..4cedbae
--- /dev/null
+++ b/libomptarget/test/unified_shared_memory/close_enter_exit.c
@@ -0,0 +1,95 @@
+// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu
+
+// UNSUPPORTED: clang-6, clang-7, clang-8, clang-9
+
+#include <omp.h>
+#include <stdio.h>
+
+#pragma omp requires unified_shared_memory
+
+#define N 1024
+
+int main(int argc, char *argv[]) {
+  int fails;
+  void *host_alloc = 0, *device_alloc = 0;
+  int *a = (int *)malloc(N * sizeof(int));
+
+  // Init
+  for (int i = 0; i < N; ++i) {
+    a[i] = 10;
+  }
+  host_alloc = &a[0];
+
+  //
+  // map + target no close
+  //
+#pragma omp target data map(tofrom : a[ : N]) map(tofrom : device_alloc)
+  {
+#pragma omp target map(tofrom : device_alloc)
+    { device_alloc = &a[0]; }
+  }
+
+  // CHECK: a used from unified memory.
+  if (device_alloc == host_alloc)
+    printf("a used from unified memory.\n");
+
+  //
+  // map + target with close
+  //
+  device_alloc = 0;
+#pragma omp target data map(close, tofrom : a[ : N]) map(tofrom : device_alloc)
+  {
+#pragma omp target map(tofrom : device_alloc)
+    { device_alloc = &a[0]; }
+  }
+  // CHECK: a copied to device.
+  if (device_alloc != host_alloc)
+    printf("a copied to device.\n");
+
+  //
+  // map + use_device_ptr no close
+  //
+  device_alloc = 0;
+#pragma omp target data map(tofrom : a[ : N]) use_device_ptr(a)
+  { device_alloc = &a[0]; }
+
+  // CHECK: a used from unified memory with use_device_ptr.
+  if (device_alloc == host_alloc)
+    printf("a used from unified memory with use_device_ptr.\n");
+
+  //
+  // map + use_device_ptr close
+  //
+  device_alloc = 0;
+#pragma omp target data map(close, tofrom : a[ : N]) use_device_ptr(a)
+  { device_alloc = &a[0]; }
+
+  // CHECK: a used from device memory with use_device_ptr.
+  if (device_alloc != host_alloc)
+    printf("a used from device memory with use_device_ptr.\n");
+
+  //
+  // map enter/exit + close
+  //
+  device_alloc = 0;
+#pragma omp target enter data map(close, to : a[ : N])
+
+#pragma omp target map(from : device_alloc)
+  { device_alloc = &a[0]; }
+
+#pragma omp target exit data map(from : a[ : N])
+
+  // CHECK: a has been mapped to the device.
+  if (device_alloc != host_alloc)
+    printf("a has been mapped to the device.\n");
+
+  free(a);
+
+  // CHECK: Done!
+  printf("Done!\n");
+
+  return 0;
+}
diff --git a/libomptarget/test/unified_shared_memory/close_manual.c b/libomptarget/test/unified_shared_memory/close_manual.c
new file mode 100644
index 0000000..0417b8b
--- /dev/null
+++ b/libomptarget/test/unified_shared_memory/close_manual.c
@@ -0,0 +1,86 @@
+// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu
+
+#include <omp.h>
+#include <stdio.h>
+
+// ---------------------------------------------------------------------------
+// Various definitions copied from OpenMP RTL
+
+extern void __tgt_register_requires(int64_t);
+
+extern void __tgt_target_data_begin(int64_t device_id, int32_t arg_num,
+                                    void **args_base, void **args,
+                                    int64_t *arg_sizes, int64_t *arg_types);
+
+extern void __tgt_target_data_end(int64_t device_id, int32_t arg_num,
+                                  void **args_base, void **args,
+                                  int64_t *arg_sizes, int64_t *arg_types);
+
+// End of definitions copied from OpenMP RTL.
+// ---------------------------------------------------------------------------
+
+#pragma omp requires unified_shared_memory
+
+#define N 1024
+
+int main(int argc, char *argv[]) {
+  int fails;
+  void *host_alloc = 0, *device_alloc = 0;
+  int *a = (int *)malloc(N * sizeof(int));
+
+  // Manual registration of requires flags for Clang versions
+  // that do not support requires.
+  __tgt_register_requires(8);
+
+  // Init
+  for (int i = 0; i < N; ++i) {
+    a[i] = 10;
+  }
+  host_alloc = &a[0];
+
+// Dummy target region that ensures the runtime library is loaded when
+// the target data begin/end functions are manually called below.
+#pragma omp target
+  {}
+
+  // Manual calls
+  int device_id = omp_get_default_device();
+  int arg_num = 1;
+  void **args_base = (void **)&a;
+  void **args = (void **)&a;
+  int64_t arg_sizes[arg_num];
+
+  arg_sizes[0] = sizeof(int) * N;
+
+  int64_t arg_types[arg_num];
+
+  // Ox400 enables the CLOSE map type in the runtime:
+  // OMP_TGT_MAPTYPE_CLOSE = 0x400
+  // OMP_TGT_MAPTYPE_TO    = 0x001
+  arg_types[0] = 0x400 | 0x001;
+
+  device_alloc = host_alloc;
+
+  __tgt_target_data_begin(device_id, arg_num, args_base, args, arg_sizes,
+                          arg_types);
+
+#pragma omp target data use_device_ptr(a)
+  { device_alloc = a; }
+
+  __tgt_target_data_end(device_id, arg_num, args_base, args, arg_sizes,
+                        arg_types);
+
+  // CHECK: a was copied to the device
+  if (device_alloc != host_alloc)
+    printf("a was copied to the device\n");
+
+  free(a);
+
+  // CHECK: Done!
+  printf("Done!\n");
+
+  return 0;
+}
diff --git a/libomptarget/test/unified_shared_memory/close_modifier.c b/libomptarget/test/unified_shared_memory/close_modifier.c
new file mode 100644
index 0000000..b319c6b
--- /dev/null
+++ b/libomptarget/test/unified_shared_memory/close_modifier.c
@@ -0,0 +1,135 @@
+// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu
+
+// UNSUPPORTED: clang-6, clang-7, clang-8, clang-9
+
+#include <omp.h>
+#include <stdio.h>
+
+#pragma omp requires unified_shared_memory
+
+#define N 1024
+
+int main(int argc, char *argv[]) {
+  int fails;
+  void *host_alloc, *device_alloc;
+  void *host_data, *device_data;
+  int *alloc = (int *)malloc(N * sizeof(int));
+  int data[N];
+
+  for (int i = 0; i < N; ++i) {
+    alloc[i] = 10;
+    data[i] = 1;
+  }
+
+  host_data = &data[0];
+  host_alloc = &alloc[0];
+
+//
+// Test that updates on the device are not visible to host
+// when only a TO mapping is used.
+//
+#pragma omp target map(tofrom                                                  \
+                       : device_data, device_alloc) map(close, to              \
+                                                        : alloc[:N], data      \
+                                                        [:N])
+  {
+    device_data = &data[0];
+    device_alloc = &alloc[0];
+
+    for (int i = 0; i < N; i++) {
+      alloc[i] += 1;
+      data[i] += 1;
+    }
+  }
+
+  // CHECK: Address of alloc on device different from host address.
+  if (device_alloc != host_alloc)
+    printf("Address of alloc on device different from host address.\n");
+
+  // CHECK: Address of data on device different from host address.
+  if (device_data != host_data)
+    printf("Address of data on device different from host address.\n");
+
+  // On the host, check that the arrays have been updated.
+  // CHECK: Alloc host values not updated: Succeeded
+  fails = 0;
+  for (int i = 0; i < N; i++) {
+    if (alloc[i] != 10)
+      fails++;
+  }
+  printf("Alloc host values not updated: %s\n",
+         (fails == 0) ? "Succeeded" : "Failed");
+
+  // CHECK: Data host values not updated: Succeeded
+  fails = 0;
+  for (int i = 0; i < N; i++) {
+    if (data[i] != 1)
+      fails++;
+  }
+  printf("Data host values not updated: %s\n",
+         (fails == 0) ? "Succeeded" : "Failed");
+
+  //
+  // Test that updates on the device are visible on host
+  // when a from is used.
+  //
+
+  for (int i = 0; i < N; i++) {
+    alloc[i] += 1;
+    data[i] += 1;
+  }
+
+#pragma omp target map(close, tofrom : alloc[:N], data[:N])
+  {
+    // CHECK: Alloc device values are correct: Succeeded
+    fails = 0;
+    for (int i = 0; i < N; i++) {
+      if (alloc[i] != 11)
+        fails++;
+    }
+    printf("Alloc device values are correct: %s\n",
+           (fails == 0) ? "Succeeded" : "Failed");
+    // CHECK: Data device values are correct: Succeeded
+    fails = 0;
+    for (int i = 0; i < N; i++) {
+      if (data[i] != 2)
+        fails++;
+    }
+    printf("Data device values are correct: %s\n",
+           (fails == 0) ? "Succeeded" : "Failed");
+
+    // Update values on the device
+    for (int i = 0; i < N; i++) {
+      alloc[i] += 1;
+      data[i] += 1;
+    }
+  }
+
+  // CHECK: Alloc host values updated: Succeeded
+  fails = 0;
+  for (int i = 0; i < N; i++) {
+    if (alloc[i] != 12)
+      fails++;
+  }
+  printf("Alloc host values updated: %s\n",
+         (fails == 0) ? "Succeeded" : "Failed");
+
+  // CHECK: Data host values updated: Succeeded
+  fails = 0;
+  for (int i = 0; i < N; i++) {
+    if (data[i] != 3)
+      fails++;
+  }
+  printf("Data host values updated: %s\n",
+         (fails == 0) ? "Succeeded" : "Failed");
+
+  free(alloc);
+
+  // CHECK: Done!
+  printf("Done!\n");
+
+  return 0;
+}