[OpenMP][libomptarget] Add support for unified memory for regular maps

Summary:
This patch adds support for using unified memory in the case of regular maps that happen when a target region is offloaded to the device.

For cases where only a single version of the data is required then the host address can be used. When variables need to be privatized in any way or globalized, then the copy to the device is still required for correctness.

Reviewers: ABataev, jdoerfert, Hahnfeld, AlexEichenberger, caomhin, grokos

Reviewed By: Hahnfeld

Subscribers: mgorny, guansong, openmp-commits

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D65001

git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@368192 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/libomptarget/src/api.cpp b/libomptarget/src/api.cpp
index 430425a..9eeef4e 100644
--- a/libomptarget/src/api.cpp
+++ b/libomptarget/src/api.cpp
@@ -113,7 +113,15 @@
 
   DeviceTy& Device = Devices[device_num];
   bool IsLast; // not used
-  int rc = (Device.getTgtPtrBegin(ptr, 0, IsLast, false) != NULL);
+  bool IsHostPtr;
+  void *TgtPtr = Device.getTgtPtrBegin(ptr, 0, IsLast, false, IsHostPtr);
+  int rc = (TgtPtr != NULL);
+  // Under unified memory the host pointer can be returned by the
+  // getTgtPtrBegin() function which means that there is no device
+  // corresponding point for ptr. This function should return false
+  // in that situation.
+  if (Device.RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY)
+    rc = !IsHostPtr;
   DP("Call to omp_target_is_present returns %d\n", rc);
   return rc;
 }
diff --git a/libomptarget/src/device.cpp b/libomptarget/src/device.cpp
index 5ecba57..0bdcc50 100644
--- a/libomptarget/src/device.cpp
+++ b/libomptarget/src/device.cpp
@@ -157,12 +157,17 @@
 // If NULL is returned, then either data allocation failed or the user tried
 // to do an illegal mapping.
 void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase,
-    int64_t Size, bool &IsNew, bool IsImplicit, bool UpdateRefCount) {
+    int64_t Size, bool &IsNew, bool &IsHostPtr, bool IsImplicit,
+    bool UpdateRefCount) {
   void *rc = NULL;
+  IsHostPtr = false;
   DataMapMtx.lock();
   LookupResult lr = lookupMapping(HstPtrBegin, Size);
 
   // Check if the pointer is contained.
+  // If a variable is mapped to the device manually by the user - which would
+  // lead to the IsContained flag to be true - then we must ensure that the
+  // device address is returned even under unified memory conditions.
   if (lr.Flags.IsContained ||
       ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && IsImplicit)) {
     auto &HT = *lr.Entry;
@@ -183,15 +188,28 @@
     // Explicit extension of mapped data - not allowed.
     DP("Explicit extension of mapping is not allowed.\n");
   } else if (Size) {
-    // If it is not contained and Size > 0 we should create a new entry for it.
-    IsNew = true;
-    uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size, HstPtrBegin);
-    DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", "
-        "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase),
-        DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp));
-    HostDataToTargetMap.push_front(HostDataToTargetTy((uintptr_t)HstPtrBase,
-        (uintptr_t)HstPtrBegin, (uintptr_t)HstPtrBegin + Size, tp));
-    rc = (void *)tp;
+    // If unified shared memory is active, implicitly mapped variables that are not
+    // privatized use host address. Any explicitly mapped variables also use
+    // host address where correctness is not impeded. In all other cases
+    // maps are respected.
+    // TODO: In addition to the mapping rules above, when the close map
+    // modifier is implemented, foce the mapping of the variable to the device.
+    if (RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
+      DP("Return HstPtrBegin " DPxMOD " Size=%ld RefCount=%s\n",
+         DPxPTR((uintptr_t)HstPtrBegin), Size, (UpdateRefCount ? " updated" : ""));
+      IsHostPtr = true;
+      rc = HstPtrBegin;
+    } else {
+      // If it is not contained and Size > 0 we should create a new entry for it.
+      IsNew = true;
+      uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size, HstPtrBegin);
+      DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", "
+          "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase),
+          DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp));
+      HostDataToTargetMap.push_front(HostDataToTargetTy((uintptr_t)HstPtrBase,
+          (uintptr_t)HstPtrBegin, (uintptr_t)HstPtrBegin + Size, tp));
+      rc = (void *)tp;
+    }
   }
 
   DataMapMtx.unlock();
@@ -202,8 +220,10 @@
 // Return the target pointer begin (where the data will be moved).
 // Decrement the reference counter if called from target_data_end.
 void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
-    bool UpdateRefCount) {
+    bool UpdateRefCount, bool &IsHostPtr) {
   void *rc = NULL;
+  IsHostPtr = false;
+  IsLast = false;
   DataMapMtx.lock();
   LookupResult lr = lookupMapping(HstPtrBegin, Size);
 
@@ -221,8 +241,14 @@
         (CONSIDERED_INF(HT.RefCount)) ? "INF" :
             std::to_string(HT.RefCount).c_str());
     rc = (void *)tp;
-  } else {
-    IsLast = false;
+  } else if (RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
+    // If the value isn't found in the mapping and unified shared memory
+    // is on then it means we have stumbled upon a value which we need to
+    // use directly from the host.
+    DP("Get HstPtrBegin " DPxMOD " Size=%ld RefCount=%s\n",
+       DPxPTR((uintptr_t)HstPtrBegin), Size, (UpdateRefCount ? " updated" : ""));
+    IsHostPtr = true;
+    rc = HstPtrBegin;
   }
 
   DataMapMtx.unlock();
@@ -244,6 +270,8 @@
 }
 
 int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete) {
+  if (RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY)
+    return OFFLOAD_SUCCESS;
   // Check if the pointer is contained in any sub-nodes.
   int rc;
   DataMapMtx.lock();
diff --git a/libomptarget/src/device.h b/libomptarget/src/device.h
index ded84e3..2115076 100644
--- a/libomptarget/src/device.h
+++ b/libomptarget/src/device.h
@@ -137,10 +137,10 @@
   long getMapEntryRefCnt(void *HstPtrBegin);
   LookupResult lookupMapping(void *HstPtrBegin, int64_t Size);
   void *getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, int64_t Size,
-      bool &IsNew, bool IsImplicit, bool UpdateRefCount = true);
+      bool &IsNew, bool &IsHostPtr, bool IsImplicit, bool UpdateRefCount = true);
   void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size);
   void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
-      bool UpdateRefCount);
+      bool UpdateRefCount, bool &IsHostPtr);
   int deallocTgtPtr(void *TgtPtrBegin, int64_t Size, bool ForceDelete);
   int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
   int disassociatePtr(void *HstPtrBegin);
diff --git a/libomptarget/src/omptarget.cpp b/libomptarget/src/omptarget.cpp
index c41bf31..2813f28 100644
--- a/libomptarget/src/omptarget.cpp
+++ b/libomptarget/src/omptarget.cpp
@@ -242,6 +242,7 @@
     // Address of pointer on the host and device, respectively.
     void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin;
     bool IsNew, Pointer_IsNew;
+    bool IsHostPtr = false;
     bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
     // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
     // have reached this point via __tgt_target_data_begin and not __tgt_target
@@ -253,7 +254,7 @@
       DP("Has a pointer entry: \n");
       // base is address of pointer.
       Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBase, HstPtrBase,
-          sizeof(void *), Pointer_IsNew, IsImplicit, UpdateRef);
+          sizeof(void *), Pointer_IsNew, IsHostPtr, IsImplicit, UpdateRef);
       if (!Pointer_TgtPtrBegin) {
         DP("Call to getOrAllocTgtPtr returned null pointer (device failure or "
             "illegal mapping).\n");
@@ -269,7 +270,7 @@
     }
 
     void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase,
-        data_size, IsNew, IsImplicit, UpdateRef);
+        data_size, IsNew, IsHostPtr, IsImplicit, UpdateRef);
     if (!TgtPtrBegin && data_size) {
       // If data_size==0, then the argument could be a zero-length pointer to
       // NULL, so getOrAlloc() returning NULL is not an error.
@@ -289,19 +290,21 @@
 
     if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
       bool copy = false;
-      if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) {
-        copy = true;
-      } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) {
-        // Copy data only if the "parent" struct has RefCount==1.
-        int32_t parent_idx = member_of(arg_types[i]);
-        long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
-        assert(parent_rc > 0 && "parent struct not found");
-        if (parent_rc == 1) {
+      if (!(Device.RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY)) {
+        if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) {
           copy = true;
+        } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) {
+          // Copy data only if the "parent" struct has RefCount==1.
+          int32_t parent_idx = member_of(arg_types[i]);
+          long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
+          assert(parent_rc > 0 && "parent struct not found");
+          if (parent_rc == 1) {
+            copy = true;
+          }
         }
       }
 
-      if (copy) {
+      if (copy && !IsHostPtr) {
         DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
             data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
         int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size);
@@ -312,7 +315,7 @@
       }
     }
 
-    if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
+    if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
       DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin));
       uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
@@ -363,14 +366,14 @@
       }
     }
 
-    bool IsLast;
+    bool IsLast, IsHostPtr;
     bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
         (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ);
     bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE;
 
     // If PTR_AND_OBJ, HstPtrBegin is address of pointee
     void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, data_size, IsLast,
-        UpdateRef);
+        UpdateRef, IsHostPtr);
     DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
         " - is%s last\n", data_size, DPxPTR(TgtPtrBegin),
         (IsLast ? "" : " not"));
@@ -387,18 +390,22 @@
       if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
         bool Always = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS;
         bool CopyMember = false;
-        if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
-            !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
-          // Copy data only if the "parent" struct has RefCount==1.
-          int32_t parent_idx = member_of(arg_types[i]);
-          long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
-          assert(parent_rc > 0 && "parent struct not found");
-          if (parent_rc == 1) {
-            CopyMember = true;
+        if (!(Device.RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY)) {
+          if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
+              !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
+            // Copy data only if the "parent" struct has RefCount==1.
+            int32_t parent_idx = member_of(arg_types[i]);
+            long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
+            assert(parent_rc > 0 && "parent struct not found");
+            if (parent_rc == 1) {
+              CopyMember = true;
+            }
           }
         }
 
-        if (DelEntry || Always || CopyMember) {
+        if ((DelEntry || Always || CopyMember) &&
+            !(Device.RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
+              TgtPtrBegin == HstPtrBegin)) {
           DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
               data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
           int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size);
@@ -471,14 +478,21 @@
 
     void *HstPtrBegin = args[i];
     int64_t MapSize = arg_sizes[i];
-    bool IsLast;
+    bool IsLast, IsHostPtr;
     void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, MapSize, IsLast,
-        false);
+        false, IsHostPtr);
     if (!TgtPtrBegin) {
       DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
       continue;
     }
 
+    if (Device.RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
+        TgtPtrBegin == HstPtrBegin) {
+      DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
+         DPxPTR(HstPtrBegin));
+      continue;
+    }
+
     if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
       DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
           arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
@@ -514,6 +528,7 @@
         DP("Copying data to device failed.\n");
         return OFFLOAD_FAIL;
       }
+
       uintptr_t lb = (uintptr_t) HstPtrBegin;
       uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize;
       Device.ShadowMtx.lock();
@@ -640,19 +655,26 @@
         void *HstPtrVal = args[i];
         void *HstPtrBegin = args_base[i];
         void *HstPtrBase = args[idx];
-        bool IsLast; // unused.
+        bool IsLast, IsHostPtr; // unused.
         void *TgtPtrBase =
             (void *)((intptr_t)tgt_args[tgtIdx] + tgt_offsets[tgtIdx]);
         DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
         uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
         void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
         void *Pointer_TgtPtrBegin =
-            Device.getTgtPtrBegin(HstPtrVal, arg_sizes[i], IsLast, false);
+            Device.getTgtPtrBegin(HstPtrVal, arg_sizes[i], IsLast, false,
+                                  IsHostPtr);
         if (!Pointer_TgtPtrBegin) {
           DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
              DPxPTR(HstPtrVal));
           continue;
         }
+        if (Device.RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
+            TgtPtrBegin == HstPtrBegin) {
+          DP("Unified memory is active, no need to map lambda captured"
+             "variable (" DPxMOD ")\n", DPxPTR(HstPtrVal));
+          continue;
+        }
         DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
            DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin));
         int rt = Device.data_submit(TgtPtrBegin, &Pointer_TgtPtrBegin,
@@ -668,7 +690,7 @@
     void *HstPtrBase = args_base[i];
     void *TgtPtrBegin;
     ptrdiff_t TgtBaseOffset;
-    bool IsLast; // unused.
+    bool IsLast, IsHostPtr; // unused.
     if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) {
       DP("Forwarding first-private value " DPxMOD " to the target construct\n",
           DPxPTR(HstPtrBase));
@@ -705,14 +727,14 @@
       }
     } else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
       TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *), IsLast,
-          false);
+          false, IsHostPtr);
       TgtBaseOffset = 0; // no offset for ptrs.
       DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to "
          "object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase),
          DPxPTR(HstPtrBase));
     } else {
       TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast,
-          false);
+          false, IsHostPtr);
       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
 #ifdef OMPTARGET_DEBUG
       void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
diff --git a/libomptarget/test/offloading/requires.c b/libomptarget/test/offloading/requires.c
index 7f014d3..079ce5c 100644
--- a/libomptarget/test/offloading/requires.c
+++ b/libomptarget/test/offloading/requires.c
@@ -43,4 +43,4 @@
   {}
 
   return 0;
-}
\ No newline at end of file
+}
diff --git a/libomptarget/test/unified_shared_memory/api.c b/libomptarget/test/unified_shared_memory/api.c
new file mode 100644
index 0000000..b0a71ad
--- /dev/null
+++ b/libomptarget/test/unified_shared_memory/api.c
@@ -0,0 +1,164 @@
+// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu
+
+#include <stdio.h>
+#include <omp.h>
+
+// ---------------------------------------------------------------------------
+// Various definitions copied from OpenMP RTL
+
+extern void __tgt_register_requires(int64_t);
+
+// End of definitions copied from OpenMP RTL.
+// ---------------------------------------------------------------------------
+
+#pragma omp requires unified_shared_memory
+
+#define N 1024
+
+void init(int A[], int B[], int C[]) {
+  for (int i = 0; i < N; ++i) {
+    A[i] = 0;
+    B[i] = 1;
+    C[i] = i;
+  }
+}
+
+int main(int argc, char *argv[]) {
+  const int device = omp_get_default_device();
+
+  // Manual registration of requires flags for Clang versions
+  // that do not support requires.
+  __tgt_register_requires(8);
+
+  // CHECK: Initial device: -10
+  printf("Initial device: %d\n", omp_get_initial_device());
+
+  //
+  // Target alloc & target memcpy
+  //
+  int A[N], B[N], C[N];
+
+  // Init
+  init(A, B, C);
+
+  int *pA, *pB, *pC;
+
+  // map ptrs
+  pA = &A[0];
+  pB = &B[0];
+  pC = &C[0];
+
+  int *d_A = (int *)omp_target_alloc(N * sizeof(int), device);
+  int *d_B = (int *)omp_target_alloc(N * sizeof(int), device);
+  int *d_C = (int *)omp_target_alloc(N * sizeof(int), device);
+
+  // CHECK: omp_target_alloc succeeded
+  printf("omp_target_alloc %s\n", d_A && d_B && d_C ? "succeeded" : "failed");
+
+  omp_target_memcpy(d_B, pB, N * sizeof(int), 0, 0, device,
+                    omp_get_initial_device());
+  omp_target_memcpy(d_C, pC, N * sizeof(int), 0, 0, device,
+                    omp_get_initial_device());
+
+#pragma omp target is_device_ptr(d_A, d_B, d_C) device(device)
+  {
+#pragma omp parallel for schedule(static, 1)
+    for (int i = 0; i < N; i++) {
+      d_A[i] = d_B[i] + d_C[i] + 1;
+    }
+  }
+
+  omp_target_memcpy(pA, d_A, N * sizeof(int), 0, 0, omp_get_initial_device(),
+                    device);
+
+  // CHECK: Test omp_target_memcpy: Succeeded
+  int fail = 0;
+  for (int i = 0; i < N; ++i) {
+    if (A[i] != i + 2)
+      fail++;
+  }
+  if (fail) {
+    printf("Test omp_target_memcpy: Failed\n");
+  } else {
+    printf("Test omp_target_memcpy: Succeeded\n");
+  }
+
+  //
+  // target_is_present and target_associate/disassociate_ptr
+  //
+  init(A, B, C);
+
+  // CHECK: B is not present, associating it...
+  // CHECK: omp_target_associate_ptr B succeeded
+  if (!omp_target_is_present(B, device)) {
+    printf("B is not present, associating it...\n");
+    int rc = omp_target_associate_ptr(B, d_B, N * sizeof(int), 0, device);
+    printf("omp_target_associate_ptr B %s\n", !rc ? "succeeded" : "failed");
+  }
+
+  // CHECK: C is not present, associating it...
+  // CHECK: omp_target_associate_ptr C succeeded
+  if (!omp_target_is_present(C, device)) {
+    printf("C is not present, associating it...\n");
+    int rc = omp_target_associate_ptr(C, d_C, N * sizeof(int), 0, device);
+    printf("omp_target_associate_ptr C %s\n", !rc ? "succeeded" : "failed");
+  }
+
+// CHECK: Inside target data: A is not present
+// CHECK: Inside target data: B is present
+// CHECK: Inside target data: C is present
+#pragma omp target data map(from : B, C) device(device)
+  {
+    printf("Inside target data: A is%s present\n",
+           omp_target_is_present(A, device) ? "" : " not");
+    printf("Inside target data: B is%s present\n",
+           omp_target_is_present(B, device) ? "" : " not");
+    printf("Inside target data: C is%s present\n",
+           omp_target_is_present(C, device) ? "" : " not");
+
+#pragma omp target map(from : A) device(device)
+    {
+#pragma omp parallel for schedule(static, 1)
+      for (int i = 0; i < N; i++)
+        A[i] = B[i] + C[i] + 1;
+    }
+  }
+
+  // CHECK: B is present, disassociating it...
+  // CHECK: omp_target_disassociate_ptr B succeeded
+  // CHECK: C is present, disassociating it...
+  // CHECK: omp_target_disassociate_ptr C succeeded
+  if (omp_target_is_present(B, device)) {
+    printf("B is present, disassociating it...\n");
+    int rc = omp_target_disassociate_ptr(B, device);
+    printf("omp_target_disassociate_ptr B %s\n", !rc ? "succeeded" : "failed");
+  }
+  if (omp_target_is_present(C, device)) {
+    printf("C is present, disassociating it...\n");
+    int rc = omp_target_disassociate_ptr(C, device);
+    printf("omp_target_disassociate_ptr C %s\n", !rc ? "succeeded" : "failed");
+  }
+
+  // CHECK: Test omp_target_associate_ptr: Succeeded
+  fail = 0;
+  for (int i = 0; i < N; ++i) {
+    if (A[i] != i + 2)
+      fail++;
+  }
+  if (fail) {
+    printf("Test omp_target_associate_ptr: Failed\n");
+  } else {
+    printf("Test omp_target_associate_ptr: Succeeded\n");
+  }
+
+  omp_target_free(d_A, device);
+  omp_target_free(d_B, device);
+  omp_target_free(d_C, device);
+
+  printf("Done!\n");
+
+  return 0;
+}
diff --git a/libomptarget/test/unified_shared_memory/shared_update.c b/libomptarget/test/unified_shared_memory/shared_update.c
new file mode 100644
index 0000000..8036bc2
--- /dev/null
+++ b/libomptarget/test/unified_shared_memory/shared_update.c
@@ -0,0 +1,114 @@
+// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu
+
+#include <stdio.h>
+#include <omp.h>
+
+// ---------------------------------------------------------------------------
+// Various definitions copied from OpenMP RTL
+
+extern void __tgt_register_requires(int64_t);
+
+// End of definitions copied from OpenMP RTL.
+// ---------------------------------------------------------------------------
+
+#pragma omp requires unified_shared_memory
+
+#define N 1024
+
+int main(int argc, char *argv[]) {
+  int fails;
+  void *host_alloc, *device_alloc;
+  void *host_data, *device_data;
+  int *alloc = (int *)malloc(N * sizeof(int));
+  int data[N];
+
+  // Manual registration of requires flags for Clang versions
+  // that do not support requires.
+  __tgt_register_requires(8);
+
+  for (int i = 0; i < N; ++i) {
+    alloc[i] = 10;
+    data[i] = 1;
+  }
+
+  host_data = &data[0];
+  host_alloc = &alloc[0];
+
+// implicit mapping of data
+#pragma omp target map(tofrom : device_data, device_alloc)
+  {
+    device_data = &data[0];
+    device_alloc = &alloc[0];
+
+    for (int i = 0; i < N; i++) {
+      alloc[i] += 1;
+      data[i] += 1;
+    }
+  }
+
+  // CHECK: Address of alloc on device matches host address.
+  if (device_alloc == host_alloc)
+    printf("Address of alloc on device matches host address.\n");
+
+  // CHECK: Address of data on device matches host address.
+  if (device_data == host_data)
+    printf("Address of data on device matches host address.\n");
+
+  // On the host, check that the arrays have been updated.
+  // CHECK: Alloc device values updated: Succeeded
+  fails = 0;
+  for (int i = 0; i < N; i++) {
+    if (alloc[i] != 11)
+      fails++;
+  }
+  printf("Alloc device values updated: %s\n",
+         (fails == 0) ? "Succeeded" : "Failed");
+
+  // CHECK: Data device values updated: Succeeded
+  fails = 0;
+  for (int i = 0; i < N; i++) {
+    if (data[i] != 2)
+      fails++;
+  }
+  printf("Data device values updated: %s\n",
+         (fails == 0) ? "Succeeded" : "Failed");
+
+  //
+  // Test that updates on the host snd on the device are both visible.
+  //
+
+  // Update on the host.
+  for (int i = 0; i < N; ++i) {
+    alloc[i] += 1;
+    data[i] += 1;
+  }
+
+#pragma omp target
+  {
+    // CHECK: Alloc host values updated: Succeeded
+    fails = 0;
+    for (int i = 0; i < N; i++) {
+      if (alloc[i] != 12)
+        fails++;
+    }
+    printf("Alloc host values updated: %s\n",
+           (fails == 0) ? "Succeeded" : "Failed");
+    // CHECK: Data host values updated: Succeeded
+    fails = 0;
+    for (int i = 0; i < N; i++) {
+      if (data[i] != 3)
+        fails++;
+    }
+    printf("Data host values updated: %s\n",
+           (fails == 0) ? "Succeeded" : "Failed");
+  }
+
+  free(alloc);
+
+  printf("Done!\n");
+
+  return 0;
+}