[openmp][amdgpu] Make plugin robust to presence of explicit implicit arguments

OpenMP (compiler) does not currently request any implicit kernel
arguments. OpenMP (runtime) allocates and initialises a reasonable guess at
the implicit kernel arguments anyway.

This change makes the plugin check the number of explicit arguments, instead
of the number of all arguments, and writes the pointer to the hostcall buffer
to both its current location and the offset expected once implicit arguments
are added to the metadata by D113538.

This is intended to keep things running while fixing the oversight in the
compiler (in D113538). Once that patch lands, and a following one marks
OpenMP kernels that use printf such that the backend emits an args element
with the right type (instead of hidden_none), the over-allocation can be
removed and the hardcoded 8*(e+3) offset (where e is the number of explicit
arguments) replaced with one read from the .offset of the corresponding
metadata element.

Reviewed By: estewart08

Differential Revision: https://reviews.llvm.org/D114274

GitOrigin-RevId: ae5348a38eb1668cd9042d9a5207dc32bc4edb87
diff --git a/libomptarget/plugins/amdgpu/impl/internal.h b/libomptarget/plugins/amdgpu/impl/internal.h
index bdac98c..fe974d3 100644
--- a/libomptarget/plugins/amdgpu/impl/internal.h
+++ b/libomptarget/plugins/amdgpu/impl/internal.h
@@ -54,7 +54,8 @@
   uint32_t sgpr_spill_count;
   uint32_t vgpr_spill_count;
   uint32_t kernel_segment_size;
-  uint32_t num_args;
+  uint32_t explicit_argument_count;
+  uint32_t implicit_argument_count;
 } atl_kernel_info_t;
 
 typedef struct atl_symbol_info_s {
diff --git a/libomptarget/plugins/amdgpu/impl/system.cpp b/libomptarget/plugins/amdgpu/impl/system.cpp
index 4d661dc..6dd464e 100644
--- a/libomptarget/plugins/amdgpu/impl/system.cpp
+++ b/libomptarget/plugins/amdgpu/impl/system.cpp
@@ -381,7 +381,7 @@
       return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
     }
 
-    atl_kernel_info_t info = {0, 0, 0, 0, 0, 0, 0, 0, 0};
+    atl_kernel_info_t info = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
 
     uint64_t sgpr_count, vgpr_count, sgpr_spill_count, vgpr_spill_count;
     msgpack_errors += map_lookup_uint64_t(element, ".sgpr_count", &sgpr_count);
@@ -446,8 +446,6 @@
         return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
       }
 
-      info.num_args = argsSize;
-
       for (size_t i = 0; i < argsSize; ++i) {
         KernelArgMD lcArg;
 
@@ -476,8 +474,10 @@
         // check if the arg is a hidden/implicit arg
         // this logic assumes that all hidden args are 8-byte aligned
         if (!isImplicit(lcArg.valueKind_)) {
+          info.explicit_argument_count++;
           kernel_explicit_args_size += lcArg.size_;
         } else {
+          info.implicit_argument_count++;
           hasHiddenArgs = true;
         }
         kernel_explicit_args_size += padding;
diff --git a/libomptarget/plugins/amdgpu/src/rtl.cpp b/libomptarget/plugins/amdgpu/src/rtl.cpp
index 71321be..45d9476 100644
--- a/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -2071,7 +2071,7 @@
   const uint32_t sgpr_spill_count = KernelInfoEntry.sgpr_spill_count;
   const uint32_t vgpr_spill_count = KernelInfoEntry.vgpr_spill_count;
 
-  assert(arg_num == (int)KernelInfoEntry.num_args);
+  assert(arg_num == (int)KernelInfoEntry.explicit_argument_count);
 
   /*
    * Set limit based on ThreadsPerGroup and GroupsPerDevice
@@ -2173,14 +2173,31 @@
         // under a multiple reader lock, not a writer lock.
         static pthread_mutex_t hostcall_init_lock = PTHREAD_MUTEX_INITIALIZER;
         pthread_mutex_lock(&hostcall_init_lock);
-        impl_args->hostcall_ptr = hostrpc_assign_buffer(
+        unsigned long buffer = hostrpc_assign_buffer(
             DeviceInfo.HSAAgents[device_id], queue, device_id);
         pthread_mutex_unlock(&hostcall_init_lock);
-        if (!impl_args->hostcall_ptr) {
+        if (!buffer) {
           DP("hostrpc_assign_buffer failed, gpu would dereference null and "
              "error\n");
           return OFFLOAD_FAIL;
         }
+
+        if (KernelInfoEntry.implicit_argument_count >= 4) {
+          // Initialise pointer for implicit_argument_count != 0 ABI
+          // Guess that the right implicit argument is at offset 24 after
+          // the explicit arguments. In the future, should be able to read
+          // the offset from msgpack. Clang is not annotating it at present.
+          uint64_t Offset =
+              sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3);
+          if ((Offset + 8) > (ArgPool->kernarg_segment_size)) {
+            DP("Bad offset of hostcall, exceeds kernarg segment size\n");
+          } else {
+            memcpy(static_cast<char *>(kernarg) + Offset, &buffer, 8);
+          }
+        }
+
+        // initialise pointer for implicit_argument_count == 0 ABI
+        impl_args->hostcall_ptr = buffer;
       }
 
       packet->kernarg_address = kernarg;