[OpenMP][OMPT] Add OMPT callback for device data exchange 'Device-to-Device' (#81991)

Since there is no `ompt_target_data_transfer_tofrom_device` value (in the
`ompt_target_data_op_t` enum), nor anything else that conveys the meaning
of an inter-device data exchange, we decided to indicate a Device-to-Device
transfer by using: optype == ompt_target_data_transfer_from_device (=3).

Hence, a Device-to-Device transfer may be identified, e.g., by checking for:
(optype == 3) &&
(src_device_num < omp_get_num_devices()) &&
(dest_device_num < omp_get_num_devices())

Fixes: #66478
GitOrigin-RevId: e521752c04a479e3751003645a728667f3199d24
diff --git a/libomptarget/include/OpenMP/OMPT/Interface.h b/libomptarget/include/OpenMP/OMPT/Interface.h
index 13eca73..327fadf 100644
--- a/libomptarget/include/OpenMP/OMPT/Interface.h
+++ b/libomptarget/include/OpenMP/OMPT/Interface.h
@@ -54,12 +54,14 @@
                           void **TgtPtrBegin, size_t Size, void *Code);
 
   /// Top-level function for invoking callback before data submit
-  void beginTargetDataSubmit(int64_t DeviceId, void *HstPtrBegin,
-                             void *TgtPtrBegin, size_t Size, void *Code);
+  void beginTargetDataSubmit(int64_t SrcDeviceId, void *SrcPtrBegin,
+                             int64_t DstDeviceId, void *DstPtrBegin,
+                             size_t Size, void *Code);
 
   /// Top-level function for invoking callback after data submit
-  void endTargetDataSubmit(int64_t DeviceId, void *HstPtrBegin,
-                           void *TgtPtrBegin, size_t Size, void *Code);
+  void endTargetDataSubmit(int64_t SrcDeviceId, void *SrcPtrBegin,
+                           int64_t DstDeviceId, void *DstPtrBegin, size_t Size,
+                           void *Code);
 
   /// Top-level function for invoking callback before device data deallocation
   void beginTargetDataDelete(int64_t DeviceId, void *TgtPtrBegin, void *Code);
@@ -68,12 +70,14 @@
   void endTargetDataDelete(int64_t DeviceId, void *TgtPtrBegin, void *Code);
 
   /// Top-level function for invoking callback before data retrieve
-  void beginTargetDataRetrieve(int64_t DeviceId, void *HstPtrBegin,
-                               void *TgtPtrBegin, size_t Size, void *Code);
+  void beginTargetDataRetrieve(int64_t SrcDeviceId, void *SrcPtrBegin,
+                               int64_t DstDeviceId, void *DstPtrBegin,
+                               size_t Size, void *Code);
 
   /// Top-level function for invoking callback after data retrieve
-  void endTargetDataRetrieve(int64_t DeviceId, void *HstPtrBegin,
-                             void *TgtPtrBegin, size_t Size, void *Code);
+  void endTargetDataRetrieve(int64_t SrcDeviceId, void *SrcPtrBegin,
+                             int64_t DstDeviceId, void *DstPtrBegin,
+                             size_t Size, void *Code);
 
   /// Top-level function for invoking callback before kernel dispatch
   void beginTargetSubmit(unsigned int NumTeams = 1);
diff --git a/libomptarget/src/OpenMP/OMPT/Callback.cpp b/libomptarget/src/OpenMP/OMPT/Callback.cpp
index 66435d2..f285843 100644
--- a/libomptarget/src/OpenMP/OMPT/Callback.cpp
+++ b/libomptarget/src/OpenMP/OMPT/Callback.cpp
@@ -119,41 +119,38 @@
   endTargetDataOperation();
 }
 
-void Interface::beginTargetDataSubmit(int64_t DeviceId, void *TgtPtrBegin,
-                                      void *HstPtrBegin, size_t Size,
-                                      void *Code) {
+void Interface::beginTargetDataSubmit(int64_t SrcDeviceId, void *SrcPtrBegin,
+                                      int64_t DstDeviceId, void *DstPtrBegin,
+                                      size_t Size, void *Code) {
   beginTargetDataOperation();
   if (ompt_callback_target_data_op_emi_fn) {
     // HostOpId will be set by the tool. Invoke the tool supplied data op EMI
     // callback
     ompt_callback_target_data_op_emi_fn(
         ompt_scope_begin, TargetTaskData, &TargetData, &HostOpId,
-        ompt_target_data_transfer_to_device, HstPtrBegin,
-        /*SrcDeviceNum=*/omp_get_initial_device(), TgtPtrBegin, DeviceId, Size,
-        Code);
+        ompt_target_data_transfer_to_device, SrcPtrBegin, SrcDeviceId,
+        DstPtrBegin, DstDeviceId, Size, Code);
   } else if (ompt_callback_target_data_op_fn) {
     // HostOpId is set by the runtime
     HostOpId = createOpId();
     // Invoke the tool supplied data op callback
     ompt_callback_target_data_op_fn(
         TargetData.value, HostOpId, ompt_target_data_transfer_to_device,
-        HstPtrBegin, /*SrcDeviceNum=*/omp_get_initial_device(), TgtPtrBegin,
-        DeviceId, Size, Code);
+        SrcPtrBegin, SrcDeviceId, DstPtrBegin, DstDeviceId, Size, Code);
   }
 }
 
-void Interface::endTargetDataSubmit(int64_t DeviceId, void *TgtPtrBegin,
-                                    void *HstPtrBegin, size_t Size,
-                                    void *Code) {
+void Interface::endTargetDataSubmit(int64_t SrcDeviceId, void *SrcPtrBegin,
+                                    int64_t DstDeviceId, void *DstPtrBegin,
+                                    size_t Size, void *Code) {
   // Only EMI callback handles end scope
   if (ompt_callback_target_data_op_emi_fn) {
     // HostOpId will be set by the tool. Invoke the tool supplied data op EMI
     // callback
     ompt_callback_target_data_op_emi_fn(
         ompt_scope_end, TargetTaskData, &TargetData, &HostOpId,
-        ompt_target_data_transfer_to_device, HstPtrBegin,
-        /*SrcDeviceNum=*/omp_get_initial_device(), TgtPtrBegin, DeviceId, Size,
-        Code);
+        ompt_target_data_transfer_to_device, SrcPtrBegin, SrcDeviceId,
+        DstPtrBegin, DstDeviceId, Size, Code);
   }
   endTargetDataOperation();
 }
@@ -193,41 +190,38 @@
   endTargetDataOperation();
 }
 
-void Interface::beginTargetDataRetrieve(int64_t DeviceId, void *HstPtrBegin,
-                                        void *TgtPtrBegin, size_t Size,
-                                        void *Code) {
+void Interface::beginTargetDataRetrieve(int64_t SrcDeviceId, void *SrcPtrBegin,
+                                        int64_t DstDeviceId, void *DstPtrBegin,
+                                        size_t Size, void *Code) {
   beginTargetDataOperation();
   if (ompt_callback_target_data_op_emi_fn) {
     // HostOpId will be set by the tool. Invoke the tool supplied data op EMI
     // callback
     ompt_callback_target_data_op_emi_fn(
         ompt_scope_begin, TargetTaskData, &TargetData, &HostOpId,
-        ompt_target_data_transfer_from_device, TgtPtrBegin, DeviceId,
-        HstPtrBegin,
-        /*TgtDeviceNum=*/omp_get_initial_device(), Size, Code);
+        ompt_target_data_transfer_from_device, SrcPtrBegin, SrcDeviceId,
+        DstPtrBegin, DstDeviceId, Size, Code);
   } else if (ompt_callback_target_data_op_fn) {
     // HostOpId is set by the runtime
     HostOpId = createOpId();
     // Invoke the tool supplied data op callback
     ompt_callback_target_data_op_fn(
         TargetData.value, HostOpId, ompt_target_data_transfer_from_device,
-        TgtPtrBegin, DeviceId, HstPtrBegin,
-        /*TgtDeviceNum=*/omp_get_initial_device(), Size, Code);
+        SrcPtrBegin, SrcDeviceId, DstPtrBegin, DstDeviceId, Size, Code);
   }
 }
 
-void Interface::endTargetDataRetrieve(int64_t DeviceId, void *HstPtrBegin,
-                                      void *TgtPtrBegin, size_t Size,
-                                      void *Code) {
+void Interface::endTargetDataRetrieve(int64_t SrcDeviceId, void *SrcPtrBegin,
+                                      int64_t DstDeviceId, void *DstPtrBegin,
+                                      size_t Size, void *Code) {
   // Only EMI callback handles end scope
   if (ompt_callback_target_data_op_emi_fn) {
     // HostOpId will be set by the tool. Invoke the tool supplied data op EMI
     // callback
     ompt_callback_target_data_op_emi_fn(
         ompt_scope_end, TargetTaskData, &TargetData, &HostOpId,
-        ompt_target_data_transfer_from_device, TgtPtrBegin, DeviceId,
-        HstPtrBegin,
-        /*TgtDeviceNum=*/omp_get_initial_device(), Size, Code);
+        ompt_target_data_transfer_from_device, SrcPtrBegin, SrcDeviceId,
+        DstPtrBegin, DstDeviceId, Size, Code);
   }
   endTargetDataOperation();
 }
diff --git a/libomptarget/src/device.cpp b/libomptarget/src/device.cpp
index 5fe3f50..3345277 100644
--- a/libomptarget/src/device.cpp
+++ b/libomptarget/src/device.cpp
@@ -151,7 +151,7 @@
   OMPT_IF_BUILT(
       InterfaceRAII TargetDataSubmitRAII(
           RegionInterface.getCallbacks<ompt_target_data_transfer_to_device>(),
-          DeviceID, TgtPtrBegin, HstPtrBegin, Size,
+          omp_get_initial_device(), HstPtrBegin, DeviceID, TgtPtrBegin, Size,
           /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
 
   if (!AsyncInfo || !RTL->data_submit_async || !RTL->synchronize)
@@ -173,7 +173,7 @@
   OMPT_IF_BUILT(
       InterfaceRAII TargetDataRetrieveRAII(
           RegionInterface.getCallbacks<ompt_target_data_transfer_from_device>(),
-          DeviceID, HstPtrBegin, TgtPtrBegin, Size,
+          DeviceID, TgtPtrBegin, omp_get_initial_device(), HstPtrBegin, Size,
           /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
 
   if (!RTL->data_retrieve_async || !RTL->synchronize)
@@ -185,6 +185,17 @@
 // Copy data from current device to destination device directly
 int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
                                int64_t Size, AsyncInfoTy &AsyncInfo) {
+  /// RAII to establish tool anchors before and after data exchange
+  /// Note: Despite the fact that this is a data exchange, we use 'from_device'
+  ///       operation enum (w.r.t. ompt_target_data_op_t) as there is currently
+  ///       no better alternative. It is still possible to distinguish this
+  ///       scenario from a real data retrieve by checking if both involved
+  ///       device numbers are less than omp_get_num_devices().
+  OMPT_IF_BUILT(
+      InterfaceRAII TargetDataExchangeRAII(
+          RegionInterface.getCallbacks<ompt_target_data_transfer_from_device>(),
+          RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr, Size,
+          /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
   if (!AsyncInfo || !RTL->data_exchange_async || !RTL->synchronize) {
     assert(RTL->data_exchange && "RTL->data_exchange is nullptr");
     return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr,
diff --git a/libomptarget/test/ompt/callbacks.h b/libomptarget/test/ompt/callbacks.h
index 1f9b7c1..95437d9 100644
--- a/libomptarget/test/ompt/callbacks.h
+++ b/libomptarget/test/ompt/callbacks.h
@@ -81,11 +81,14 @@
   assert(codeptr_ra != 0 && "Unexpected null codeptr");
   if (endpoint == ompt_scope_begin)
     *host_op_id = next_op_id++;
+  // target_task_data may be null, avoid dereferencing it
+  uint64_t target_task_data_value =
+      (target_task_data) ? target_task_data->value : 0;
   printf("  Callback DataOp EMI: endpoint=%d optype=%d target_task_data=%p "
          "(0x%lx) target_data=%p (0x%lx) host_op_id=%p (0x%lx) src=%p "
          "src_device_num=%d "
          "dest=%p dest_device_num=%d bytes=%lu code=%p\n",
-         endpoint, optype, target_task_data, target_task_data->value,
+         endpoint, optype, target_task_data, target_task_data_value,
          target_data, target_data->value, host_op_id, *host_op_id, src_addr,
          src_device_num, dest_addr, dest_device_num, bytes, codeptr_ra);
 }
diff --git a/libomptarget/test/ompt/target_memcpy.c b/libomptarget/test/ompt/target_memcpy.c
index 444f4b7..80a8d6a 100644
--- a/libomptarget/test/ompt/target_memcpy.c
+++ b/libomptarget/test/ompt/target_memcpy.c
@@ -33,6 +33,10 @@
   if (omp_target_memcpy(dev_ptr, &host_var1, sizeof(int), 0, 0, dev, host))
     abort();
 
+  // D2D transfer
+  if (omp_target_memcpy(dev_ptr, dev_ptr, sizeof(int), 0, 0, dev, dev))
+    abort();
+
   // D2H transfer
   if (omp_target_memcpy(&host_var2, dev_ptr, sizeof(int), 0, 0, host, dev))
     abort();
@@ -46,16 +50,25 @@
 
 // clang-format off
 /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1
+/// CHECK-SAME: src_device_num=[[HOST:[0-9]+]]
+/// CHECK-SAME: dest_device_num=[[DEVICE:[0-9]+]]
 /// CHECK-NOT: code=(nil)
-/// CHECK: code=[[CODE1:.*]]
+/// CHECK: code=[[CODE1:0x[0-f]+]]
 /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2
+/// CHECK-SAME: src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]]
 /// CHECK-NOT: code=(nil)
 /// CHECK-NOT: code=[[CODE1]]
-/// CHECK: code=[[CODE2:.*]]
+/// CHECK: code=[[CODE2:0x[0-f]+]]
 /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
+/// CHECK-SAME: src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[DEVICE]]
 /// CHECK-NOT: code=(nil)
 /// CHECK-NOT: code=[[CODE2]]
-/// CHECK: code=[[CODE3:.*]]
-/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
+/// CHECK: code=[[CODE3:0x[0-f]+]]
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3
+/// CHECK-SAME: src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[HOST]]
 /// CHECK-NOT: code=(nil)
 /// CHECK-NOT: code=[[CODE3]]
+/// CHECK: code=[[CODE4:0x[0-f]+]]
+/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4
+/// CHECK-NOT: code=(nil)
+/// CHECK-NOT: code=[[CODE4]]
diff --git a/libomptarget/test/ompt/target_memcpy_emi.c b/libomptarget/test/ompt/target_memcpy_emi.c
new file mode 100644
index 0000000..5347f38
--- /dev/null
+++ b/libomptarget/test/ompt/target_memcpy_emi.c
@@ -0,0 +1,85 @@
+// RUN: %libomptarget-compile-run-and-check-generic
+// REQUIRES: ompt
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+/*
+ * Verify all three data transfer directions: H2D, D2D and D2H
+ */
+
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "callbacks.h"
+#include "register_emi.h"
+
+int main(void) {
+  int NumDevices = omp_get_num_devices();
+  assert(NumDevices > 0 && "No device(s) present.");
+  int Device = omp_get_default_device();
+  int Host = omp_get_initial_device();
+  // Note: A zero value denotes OFFLOAD_SUCCESS
+  int Status;
+
+  printf("Allocating Memory on Device\n");
+  int *DevPtr = (int *)omp_target_alloc(sizeof(int), Device);
+  assert(DevPtr && "Could not allocate memory on device.");
+  int *HstPtr = (int *)malloc(sizeof(int));
+  *HstPtr = 42;
+
+  printf("Testing: Host to Device\n");
+  Status = omp_target_memcpy(DevPtr, HstPtr, sizeof(int), 0, 0, Device, Host);
+  assert(Status == 0 && "H2D memory copy operation failed.\n");
+
+  printf("Testing: Device to Device\n");
+  Status = omp_target_memcpy(DevPtr, DevPtr, sizeof(int), 0, 0, Device, Device);
+  assert(Status == 0 && "D2D memory copy operation failed.\n");
+
+  printf("Testing: Device to Host\n");
+  Status = omp_target_memcpy(HstPtr, DevPtr, sizeof(int), 0, 0, Host, Device);
+  assert(Status == 0 && "D2H memory copy operation failed.\n");
+
+  printf("Checking Correctness\n");
+  assert(*HstPtr == 42);
+
+  printf("Freeing Memory on Device\n");
+  free(HstPtr);
+  omp_target_free(DevPtr, Device);
+
+  return 0;
+}
+
+// clang-format off
+
+/// CHECK: Callback Init:
+
+/// CHECK: Allocating Memory on Device
+/// CHECK: Callback DataOp EMI: endpoint=1 optype=1
+/// CHECK-SAME: src_device_num=[[HOST:[0-9]+]]
+/// CHECK-SAME: dest_device_num=[[DEVICE:[0-9]+]]
+/// CHECK: Callback DataOp EMI: endpoint=2 optype=1 {{.+}} src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]]
+
+/// CHECK: Testing: Host to Device
+/// CHECK: Callback DataOp EMI: endpoint=1 optype=2 {{.+}} src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]]
+/// CHECK: Callback DataOp EMI: endpoint=2 optype=2 {{.+}} src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]]
+
+/// CHECK: Testing: Device to Device
+/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[DEVICE]]
+/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[DEVICE]]
+
+/// CHECK: Testing: Device to Host
+/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[HOST]]
+/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[HOST]]
+
+/// CHECK: Checking Correctness
+
+/// CHECK: Freeing Memory on Device
+/// CHECK: Callback DataOp EMI: endpoint=1 optype=4 {{.+}} src_device_num=[[DEVICE]]
+/// CHECK: Callback DataOp EMI: endpoint=2 optype=4 {{.+}} src_device_num=[[DEVICE]]
+
+/// CHECK: Callback Fini:
+
+// clang-format on