[libomptarget][test] Add support for APU testing feature. (#82054)

Add test and support for `// REQUIRES: apu` for the category of tests
which exercise APU specific behavior.

Note: when running on an actual APU, you may have to set `IS_APU` as shown
below if the architecture ID alone is not enough to determine whether the
underlying device is an APU:

```
IS_APU=1 ninja check-openmp
```
GitOrigin-RevId: 12ade6fc606c269101b9b13e52093c868e1a4924
diff --git a/libomptarget/test/lit.cfg b/libomptarget/test/lit.cfg
index fc1d436..565556e 100644
--- a/libomptarget/test/lit.cfg
+++ b/libomptarget/test/lit.cfg
@@ -34,6 +34,11 @@
 if 'HSA_ENABLE_SDMA' in os.environ:
     config.environment['HSA_ENABLE_SDMA'] = os.environ['HSA_ENABLE_SDMA']
 
+# An architecture ID such as gfx942 does not say whether the device is an APU,
+# so forward IS_APU from the caller's environment for APU-specific tests.
+if 'IS_APU' in os.environ:
+    config.environment['IS_APU'] = os.environ['IS_APU']
+
 # set default environment variables for test
 if 'CHECK_OPENMP_ENV' in os.environ:
     test_env = os.environ['CHECK_OPENMP_ENV'].split()
@@ -111,6 +116,7 @@
 # For CUDA, this is the case with compute capability 70 (Volta) or higher.
 # For all other targets, we currently assume it is.
 supports_unified_shared_memory = True
+supports_apu = False
 if config.libomptarget_current_target.startswith('nvptx'):
   try:
     cuda_arch = int(config.cuda_test_arch[:3])
@@ -126,8 +132,15 @@
             config.amdgpu_test_arch.startswith("gfx940") or
             config.amdgpu_test_arch.startswith("gfx942")):
        supports_unified_shared_memory = False
+    # check if AMD architecture is an APU (an unset IS_APU is treated as '0'):
+    if (config.amdgpu_test_arch.startswith("gfx940") or
+        (config.amdgpu_test_arch.startswith("gfx942") and
+         evaluate_bool_env(config.environment.get('IS_APU', '0')))):
+       supports_apu = True
 if supports_unified_shared_memory:
    config.available_features.add('unified_shared_memory')
+if supports_apu:
+   config.available_features.add('apu')
 
 # Setup environment to find dynamic library at runtime
 if config.operating_system == 'Windows':
diff --git a/libomptarget/test/mapping/auto_zero_copy_apu.cpp b/libomptarget/test/mapping/auto_zero_copy_apu.cpp
new file mode 100644
index 0000000..48360e4
--- /dev/null
+++ b/libomptarget/test/mapping/auto_zero_copy_apu.cpp
@@ -0,0 +1,57 @@
+// clang-format off
+// RUN: %libomptarget-compilexx-generic
+// RUN: env HSA_XNACK=1 LIBOMPTARGET_INFO=30 %libomptarget-run-generic 2>&1 \
+// RUN: | %fcheck-generic -check-prefix=INFO_ZERO -check-prefix=CHECK
+
+// RUN: %libomptarget-compilexx-generic
+// RUN: env HSA_XNACK=0 LIBOMPTARGET_INFO=30 %libomptarget-run-generic 2>&1 \
+// RUN: | %fcheck-generic -check-prefix=INFO_COPY -check-prefix=CHECK
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+// REQUIRES: apu
+
+// clang-format on
+
+#include <cstdio>
+
+int main() {
+  int n = 1024; // NOTE(review): Size=4096 checks presumably assume 4-byte int
+
+  // Host data: a is mapped tofrom and b is mapped to by the target region.
+  int *a = new int[n]; // dynamically allocated, written in the target region
+  int k = 3;           // read inside the target region, no explicit map
+  int b[n];            // local array, read inside the target region
+
+  for (int i = 0; i < n; i++)
+    b[i] = i;
+
+    // clang-format off
+  // INFO_ZERO: Return HstPtrBegin 0x{{.*}} Size=4096 for unified shared memory
+  // INFO_ZERO: Return HstPtrBegin 0x{{.*}} Size=4096 for unified shared memory
+
+  // INFO_COPY: Creating new map entry with HstPtrBase=0x{{.*}}, HstPtrBegin=0x{{.*}}, TgtAllocBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096,
+  // INFO_COPY: Creating new map entry with HstPtrBase=0x{{.*}}, HstPtrBegin=0x{{.*}}, TgtAllocBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096,
+  // INFO_COPY: Mapping exists with HstPtrBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096, DynRefCount=1 (update suppressed)
+  // INFO_COPY: Mapping exists with HstPtrBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096, DynRefCount=1 (update suppressed)
+// clang-format on
+#pragma omp target teams distribute parallel for map(tofrom : a[ : n])         \
+    map(to : b[ : n])
+  for (int i = 0; i < n; i++)
+    a[i] = i + b[i] + k;
+
+  int err = 0; // number of mismatching elements, also the exit status
+  for (int i = 0; i < n; i++)
+    if (a[i] != i + b[i] + k)
+      err++;
+
+  // CHECK: PASS
+  if (err == 0)
+    printf("PASS\n");
+  return err;
+}