[libomptarget] Fix devicertl build

[libomptarget] Fix devicertl build

The target-specific functions declared in target_interface.h are extern "C",
but most of the nvptx implementations were defined with C++ name mangling.
That happened to work as a quirk of the DEVICE macro expanding to nothing,
except for shuffle.h, which only forward-declared the functions with C++
linkage.

Also implements GetWarpSize, which is used by shuffle.h, and includes
target_interface.h in the nvptx target_impl.cu to help catch future divergence
between the interface and the implementation.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D98651

GitOrigin-RevId: bcb3f0f867b27179f9cab49d2ef41fe7769112c0
diff --git a/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
index 7241a39..56ecab0 100644
--- a/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ b/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@@ -130,6 +130,7 @@
 }
 
 EXTERN unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
+EXTERN unsigned GetWarpSize() { return WARPSIZE; }
 EXTERN unsigned GetLaneId() {
   return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
 }
diff --git a/libomptarget/deviceRTLs/common/include/target/shuffle.h b/libomptarget/deviceRTLs/common/include/target/shuffle.h
index e05315c..e274aa9 100644
--- a/libomptarget/deviceRTLs/common/include/target/shuffle.h
+++ b/libomptarget/deviceRTLs/common/include/target/shuffle.h
@@ -33,10 +33,12 @@
 /// Forward declarations
 ///
 ///{
+extern "C" {
 unsigned GetLaneId();
 unsigned GetWarpSize();
 void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
 uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
+}
 ///}
 
 /// Fallback implementations of the shuffle sync idiom.
diff --git a/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
index 959452a..b856b41 100644
--- a/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -13,64 +13,65 @@
 
 #include "common/debug.h"
 #include "target_impl.h"
+#include "target_interface.h"
 
-DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
+EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
   asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
 }
 
-DEVICE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
+EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
   uint64_t val;
   asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
   return val;
 }
 
-DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
+EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
   __kmpc_impl_lanemask_t res;
   asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res));
   return res;
 }
 
-DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
+EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
   __kmpc_impl_lanemask_t res;
   asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res));
   return res;
 }
 
-DEVICE uint32_t __kmpc_impl_smid() {
+EXTERN uint32_t __kmpc_impl_smid() {
   uint32_t id;
   asm("mov.u32 %0, %%smid;" : "=r"(id));
   return id;
 }
 
-DEVICE double __kmpc_impl_get_wtick() {
+EXTERN double __kmpc_impl_get_wtick() {
   // Timer precision is 1ns
   return ((double)1E-9);
 }
 
-DEVICE double __kmpc_impl_get_wtime() {
+EXTERN double __kmpc_impl_get_wtime() {
   unsigned long long nsecs;
   asm("mov.u64  %0, %%globaltimer;" : "=l"(nsecs));
   return (double)nsecs * __kmpc_impl_get_wtick();
 }
 
-DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
+EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
   unsigned int Mask;
   asm volatile("activemask.b32 %0;" : "=r"(Mask));
   return Mask;
 }
 
-DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }
+EXTERN void __kmpc_impl_syncthreads() { __syncthreads(); }
 
-DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
+EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
   __nvvm_bar_warp_sync(Mask);
 }
 
 // NVPTX specific kernel initialization
-DEVICE void __kmpc_impl_target_init() { /* nvptx needs no extra setup */
+EXTERN void __kmpc_impl_target_init() { /* nvptx needs no extra setup */
 }
 
 // Barrier until num_threads arrive.
-DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {
+EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
   // The named barrier for active parallel threads of a team in an L1 parallel
   // region to synchronize with each other.
   int barrier = 1;
@@ -80,19 +81,20 @@
                : "memory");
 }
 
-DEVICE void __kmpc_impl_threadfence() { __nvvm_membar_gl(); }
-DEVICE void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); }
-DEVICE void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); }
+EXTERN void __kmpc_impl_threadfence() { __nvvm_membar_gl(); }
+EXTERN void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); }
+EXTERN void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); }
 
 // Calls to the NVPTX layer (assuming 1D layout)
-DEVICE int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); }
-DEVICE int GetBlockIdInKernel() { return __nvvm_read_ptx_sreg_ctaid_x(); }
-DEVICE int GetNumberOfBlocksInKernel() {
+EXTERN int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); }
+EXTERN int GetBlockIdInKernel() { return __nvvm_read_ptx_sreg_ctaid_x(); }
+EXTERN int GetNumberOfBlocksInKernel() {
   return __nvvm_read_ptx_sreg_nctaid_x();
 }
-DEVICE int GetNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); }
-DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
-DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
+EXTERN int GetNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); }
+EXTERN unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
+EXTERN unsigned GetWarpSize() { return WARPSIZE; }
+EXTERN unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
 
 // Atomics
 DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
@@ -135,15 +137,15 @@
 #define UNSET 0u
 #define SET 1u
 
-DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock) {
+EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock) {
   __kmpc_impl_unset_lock(lock);
 }
 
-DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock) {
+EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock) {
   __kmpc_impl_unset_lock(lock);
 }
 
-DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock) {
+EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock) {
   // TODO: not sure spinning is a good idea here..
   while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) {
     int32_t start = __nvvm_read_ptx_sreg_clock();
@@ -158,15 +160,15 @@
   } // wait for 0 to be the read value
 }
 
-DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock) {
+EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock) {
   (void)__kmpc_atomic_exchange(lock, UNSET);
 }
 
-DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) {
+EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock) {
   return __kmpc_atomic_add(lock, 0u);
 }
 
-DEVICE void *__kmpc_impl_malloc(size_t x) { return malloc(x); }
-DEVICE void __kmpc_impl_free(void *x) { free(x); }
+EXTERN void *__kmpc_impl_malloc(size_t x) { return malloc(x); }
+EXTERN void __kmpc_impl_free(void *x) { free(x); }
 
 #pragma omp end declare target
diff --git a/libomptarget/deviceRTLs/target_interface.h b/libomptarget/deviceRTLs/target_interface.h
index 058021c..92fca47 100644
--- a/libomptarget/deviceRTLs/target_interface.h
+++ b/libomptarget/deviceRTLs/target_interface.h
@@ -21,19 +21,20 @@
 EXTERN int GetNumberOfBlocksInKernel();
 EXTERN int GetNumberOfThreadsInBlock();
 EXTERN unsigned GetWarpId();
+EXTERN unsigned GetWarpSize();
 EXTERN unsigned GetLaneId();
 
 // Atomics
-extern DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
-extern DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
-extern DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
-extern DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
-extern DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
+DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
+DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
+DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
+DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
+DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
 static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
-extern DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *,
-                                                        unsigned long long);
-extern DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *,
-                                                   unsigned long long);
+DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *,
+                                                 unsigned long long);
+DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *,
+                                            unsigned long long);
 
 // Locks
 EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock);