[libomptarget][devicertl] Drop templated atomic functions

[libomptarget][devicertl] Drop templated atomic functions

The five __kmpc_atomic templates are instantiated a total of seven times.
This change replaces the template with explictly typed functions, which
have the same prototype for amdgcn and nvptx, and implements them with
the same code presently in use.

Rolls in the accepted but not yet landed D95085.

The unsigned long long type can be replaced with uint64_t when replacing
the cuda function. Until then, clang warns on casting a pointer to one to
a pointer to the other.

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D95093

GitOrigin-RevId: 9b19ecb8f1ec7acbcfd6f0e4f3cbd6902570105d
diff --git a/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
index 6e8a651..228d3f6 100644
--- a/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
+++ b/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
@@ -126,29 +126,17 @@
 DEVICE unsigned GetLaneId();
 
 // Atomics
-template <typename T> INLINE T __kmpc_atomic_add(T *address, T val) {
-  return __atomic_fetch_add(address, val, __ATOMIC_SEQ_CST);
-}
+DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
+DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
+DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
+DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
+DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
 
-INLINE uint32_t __kmpc_atomic_inc(uint32_t *address, uint32_t max) {
-  return __builtin_amdgcn_atomic_inc32(address, max, __ATOMIC_SEQ_CST, "");
-}
-
-template <typename T> INLINE T __kmpc_atomic_max(T *address, T val) {
-  return __atomic_fetch_max(address, val, __ATOMIC_SEQ_CST);
-}
-
-template <typename T> INLINE T __kmpc_atomic_exchange(T *address, T val) {
-  T r;
-  __atomic_exchange(address, &val, &r, __ATOMIC_SEQ_CST);
-  return r;
-}
-
-template <typename T> INLINE T __kmpc_atomic_cas(T *address, T compare, T val) {
-  (void)__atomic_compare_exchange(address, &compare, &val, false,
-                                  __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
-  return compare;
-}
+static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *,
+                                                 unsigned long long);
+DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *,
+                                            unsigned long long);
 
 // Locks
 DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock);
diff --git a/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
index 7388a29..35828cd 100644
--- a/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ b/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@@ -132,11 +132,13 @@
 } // namespace
 
 DEVICE int GetNumberOfBlocksInKernel() {
-  return get_grid_dim(__builtin_amdgcn_grid_size_x(), __builtin_amdgcn_workgroup_size_x());
+  return get_grid_dim(__builtin_amdgcn_grid_size_x(),
+                      __builtin_amdgcn_workgroup_size_x());
 }
 
 DEVICE int GetNumberOfThreadsInBlock() {
-  return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), __builtin_amdgcn_grid_size_x(),
+  return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(),
+                           __builtin_amdgcn_grid_size_x(),
                            __builtin_amdgcn_workgroup_size_x());
 }
 
@@ -149,6 +151,40 @@
   return GetNumberOfThreadsInBlock();
 }
 
+// Atomics
+DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
+  return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
+}
+DEVICE uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
+  return __builtin_amdgcn_atomic_inc32(Address, max, __ATOMIC_SEQ_CST, "");
+}
+DEVICE uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
+  return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST);
+}
+
+DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
+  uint32_t R;
+  __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
+  return R;
+}
+DEVICE uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare,
+                                  uint32_t Val) {
+  (void)__atomic_compare_exchange(Address, &Compare, &Val, false,
+                                  __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+  return Compare;
+}
+
+DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
+                                                 unsigned long long Val) {
+  unsigned long long R;
+  __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
+  return R;
+}
+DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *Address,
+                                            unsigned long long Val) {
+  return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
+}
+
 // Stub implementations
 DEVICE void *__kmpc_impl_malloc(size_t) { return nullptr; }
 DEVICE void __kmpc_impl_free(void *) {}
diff --git a/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
index 75945e3..2bf1952 100644
--- a/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -140,6 +140,41 @@
 DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
 DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
 
+// Forward declaration of atomics. Although they're template functions, we
+// already have definitions for different types in CUDA internal headers with
+// the right mangled names.
+template <typename T> DEVICE T atomicAdd(T *address, T val);
+template <typename T> DEVICE T atomicInc(T *address, T val);
+template <typename T> DEVICE T atomicMax(T *address, T val);
+template <typename T> DEVICE T atomicExch(T *address, T val);
+template <typename T> DEVICE T atomicCAS(T *address, T compare, T val);
+
+DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
+  return atomicAdd(Address, Val);
+}
+DEVICE uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
+  return atomicInc(Address, Val);
+}
+DEVICE uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
+  return atomicMax(Address, Val);
+}
+DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
+  return atomicExch(Address, Val);
+}
+DEVICE uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare,
+                                  uint32_t Val) {
+  return atomicCAS(Address, Compare, Val);
+}
+
+DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
+                                                 unsigned long long Val) {
+  return atomicExch(Address, Val);
+}
+DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *Address,
+                                            unsigned long long Val) {
+  return atomicAdd(Address, Val);
+}
+
 #define __OMP_SPIN 1000
 #define UNSET 0u
 #define SET 1u
diff --git a/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/libomptarget/deviceRTLs/nvptx/src/target_impl.h
index 1d7b649..1828fcf 100644
--- a/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -130,35 +130,18 @@
 DEVICE unsigned GetWarpId();
 DEVICE unsigned GetLaneId();
 
-// Forward declaration of atomics. Although they're template functions, we
-// already have definitions for different types in CUDA internal headers with
-// the right mangled names.
-template <typename T> DEVICE T atomicAdd(T *address, T val);
-template <typename T> DEVICE T atomicInc(T *address, T val);
-template <typename T> DEVICE T atomicMax(T *address, T val);
-template <typename T> DEVICE T atomicExch(T *address, T val);
-template <typename T> DEVICE T atomicCAS(T *address, T compare, T val);
-
 // Atomics
-template <typename T> INLINE T __kmpc_atomic_add(T *address, T val) {
-  return atomicAdd(address, val);
-}
+DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
+DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
+DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
+DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
+DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
 
-template <typename T> INLINE T __kmpc_atomic_inc(T *address, T val) {
-  return atomicInc(address, val);
-}
-
-template <typename T> INLINE T __kmpc_atomic_max(T *address, T val) {
-  return atomicMax(address, val);
-}
-
-template <typename T> INLINE T __kmpc_atomic_exchange(T *address, T val) {
-  return atomicExch(address, val);
-}
-
-template <typename T> INLINE T __kmpc_atomic_cas(T *address, T compare, T val) {
-  return atomicCAS(address, compare, val);
-}
+static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *,
+                                                 unsigned long long);
+DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *,
+                                            unsigned long long);
 
 // Locks
 DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock);