[OpenMP][DeviceRTL] Extract shuffle idiom and port it to declare variant

The shuffle idiom is implemented differently in our supported targets.
To reduce the "target_impl" file we now move the shuffle idiom into its
own self-contained header that provides the implementation for AMDGPU
and NVPTX. A fallback can be added later on.

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D95752

GitOrigin-RevId: 66ba494b4974017ba6e42deed138b9fb9ad50af7
diff --git a/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt b/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
index 0e890b1..2c278f7 100644
--- a/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
+++ b/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
@@ -68,6 +68,7 @@
   ${devicertl_base_directory}/common/src/parallel.cu
   ${devicertl_base_directory}/common/src/reduction.cu
   ${devicertl_base_directory}/common/src/support.cu
+  ${devicertl_base_directory}/common/src/shuffle.cpp
   ${devicertl_base_directory}/common/src/sync.cu
   ${devicertl_base_directory}/common/src/task.cu)
 
@@ -112,6 +113,7 @@
     -O${optimization_level}
     ${CUDA_DEBUG}
     -I${CMAKE_CURRENT_SOURCE_DIR}/src
+    -I${devicertl_base_directory}/common/include
     -I${devicertl_base_directory})
 
   set(bc1_files)
diff --git a/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
index 51eaf71..7241a39 100644
--- a/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ b/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@@ -52,22 +52,6 @@
   return __builtin_amdgcn_read_exec();
 }
 
-EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t var,
-                                     int32_t srcLane) {
-  int width = WARPSIZE;
-  int self = GetLaneId();
-  int index = srcLane + (self & ~(width - 1));
-  return __builtin_amdgcn_ds_bpermute(index << 2, var);
-}
-
-EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var,
-                                          uint32_t laneDelta, int32_t width) {
-  int self = GetLaneId();
-  int index = self + laneDelta;
-  index = (int)(laneDelta + (self & (width - 1))) >= width ? self : index;
-  return __builtin_amdgcn_ds_bpermute(index << 2, var);
-}
-
 uint32_t __kmpc_L1_Barrier [[clang::loader_uninitialized]];
 #pragma allocate(__kmpc_L1_Barrier) allocator(omp_pteam_mem_alloc)
 
diff --git a/libomptarget/deviceRTLs/common/include/target/shuffle.h b/libomptarget/deviceRTLs/common/include/target/shuffle.h
new file mode 100644
index 0000000..26e29e8
--- /dev/null
+++ b/libomptarget/deviceRTLs/common/include/target/shuffle.h
@@ -0,0 +1,137 @@
+//===- shuffle.h - OpenMP variants of the shuffle idiom for all targets -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Shuffle function implementations for all supported targets.
+//
+// Note: We unify the mask type to uint64_t instead of __kmpc_impl_lanemask_t.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LIBOMPTARGET_DEVICERTL_SHUFFLE_H
+#define LIBOMPTARGET_DEVICERTL_SHUFFLE_H
+
+#include <assert.h>
+#include <inttypes.h>
+
+#pragma omp declare target
+
+/// External shuffle API
+///
+///{
+
+extern "C" {
+/// Shuffle-down of the 32-bit value \p val; \p delta and \p size are forwarded
+/// as the lane delta and width of __kmpc_impl_shfl_down_sync.
+int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
+/// Shuffle-down of the 64-bit value \p val, performed as two 32-bit
+/// shuffle-downs on the unpacked parts of \p val.
+int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
+}
+
+///}
+
+/// Forward declarations
+///
+///{
+unsigned GetLaneId();
+unsigned GetWarpSize();
+void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
+uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
+///}
+
+/// Fallback implementations of the shuffle sync idiom.
+///
+/// These base versions are used when none of the declare variants below
+/// matches. No generic implementation exists, so they only abort.
+///
+///{
+
+/// Base version of the indexed shuffle; always aborts.
+inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
+                                     int32_t SrcLane) {
+  assert(false &&
+         "Fallback version of __kmpc_impl_shfl_sync is not available!");
+  // Still abort when assert is compiled out (NDEBUG) rather than flowing off
+  // the end of a non-void function.
+  __builtin_trap();
+}
+
+/// Base version of the shuffle-down; always aborts.
+inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
+                                          uint32_t Delta, int32_t Width) {
+  assert(false &&
+         "Fallback version of __kmpc_impl_shfl_down_sync is not available!");
+  // Still abort when assert is compiled out (NDEBUG).
+  __builtin_trap();
+}
+
+///}
+
+/// AMDGCN implementations of the shuffle sync idiom.
+///
+///{
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+/// Read \p Var from lane \p SrcLane within the calling lane's warp-sized
+/// group. \p Mask is not used by this implementation.
+inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
+                                     int32_t SrcLane) {
+  int Width = GetWarpSize();
+  int Self = GetLaneId();
+  // For a power-of-two Width, Self & ~(Width - 1) is the first lane of the
+  // group containing Self; SrcLane is an offset into that group.
+  int Index = SrcLane + (Self & ~(Width - 1));
+  // The lane index is multiplied by 4 (<< 2) for ds_bpermute.
+  return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
+}
+
+/// Read \p Var from the lane \p LaneDelta above the calling lane. A lane
+/// whose source would fall outside its \p Width-sized group reads its own
+/// \p Var instead. \p Mask is not used by this implementation.
+inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
+                                          uint32_t LaneDelta, int32_t Width) {
+  int Self = GetLaneId();
+  int Index = Self + LaneDelta;
+  // Self & (Width - 1) is the position within the group (power-of-two Width).
+  Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
+  return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
+}
+
+#pragma omp end declare variant
+///}
+
+/// NVPTX implementations of the shuffle and shuffle sync idiom.
+///
+///{
+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+/// Read \p Var from lane \p SrcLane. \p Mask is the participant mask handed
+/// to the sync builtin, which takes a 32-bit mask.
+inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
+                                     int32_t SrcLane) {
+  return __nvvm_shfl_sync_idx_i32(static_cast<uint32_t>(Mask), Var, SrcLane,
+                                  0x1f);
+}
+
+/// Shuffle-down of \p Var by \p Delta lanes for the given \p Width.
+/// \p Mask is the participant mask handed to the sync builtin.
+inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
+                                          uint32_t Delta, int32_t Width) {
+  // Last builtin operand: (warp size - Width) shifted into bits 8 and up,
+  // 0x1f in the low bits.
+  int32_t T = ((GetWarpSize() - Width) << 8) | 0x1f;
+  return __nvvm_shfl_sync_down_i32(static_cast<uint32_t>(Mask), Var, Delta, T);
+}
+
+#pragma omp end declare variant
+///}
+
+#pragma omp end declare target
+
+#endif
diff --git a/libomptarget/deviceRTLs/common/src/data_sharing.cu b/libomptarget/deviceRTLs/common/src/data_sharing.cu
index d3bff1c..e80211d 100644
--- a/libomptarget/deviceRTLs/common/src/data_sharing.cu
+++ b/libomptarget/deviceRTLs/common/src/data_sharing.cu
@@ -12,6 +12,7 @@
 #pragma omp declare target
 
 #include "common/omptarget.h"
+#include "target/shuffle.h"
 #include "target_impl.h"
 
 // Return true if this is the master thread.
diff --git a/libomptarget/deviceRTLs/common/src/loop.cu b/libomptarget/deviceRTLs/common/src/loop.cu
index c203bc9..b1fce8e 100644
--- a/libomptarget/deviceRTLs/common/src/loop.cu
+++ b/libomptarget/deviceRTLs/common/src/loop.cu
@@ -14,6 +14,7 @@
 #pragma omp declare target
 
 #include "common/omptarget.h"
+#include "target/shuffle.h"
 #include "target_impl.h"
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/libomptarget/deviceRTLs/common/src/reduction.cu b/libomptarget/deviceRTLs/common/src/reduction.cu
index 7b1bb3c..9daa78d 100644
--- a/libomptarget/deviceRTLs/common/src/reduction.cu
+++ b/libomptarget/deviceRTLs/common/src/reduction.cu
@@ -12,6 +12,7 @@
 #pragma omp declare target
 
 #include "common/omptarget.h"
+#include "target/shuffle.h"
 #include "target_impl.h"
 
 EXTERN
@@ -20,18 +21,6 @@
 EXTERN
 void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {}
 
-EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
-  return __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, val, delta, size);
-}
-
-EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
-  uint32_t lo, hi;
-  __kmpc_impl_unpack(val, lo, hi);
-  hi = __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, hi, delta, size);
-  lo = __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, lo, delta, size);
-  return __kmpc_impl_pack(lo, hi);
-}
-
 INLINE static void gpu_regular_warp_reduce(void *reduce_data,
                                            kmp_ShuffleReductFctPtr shflFct) {
   for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) {
diff --git a/libomptarget/deviceRTLs/common/src/shuffle.cpp b/libomptarget/deviceRTLs/common/src/shuffle.cpp
new file mode 100644
index 0000000..9cb49c7
--- /dev/null
+++ b/libomptarget/deviceRTLs/common/src/shuffle.cpp
@@ -0,0 +1,38 @@
+//===--- shuffle.cpp - Implementation of the external shuffle idiom API -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Definitions of the external shuffle API declared in target/shuffle.h.
+//
+//===----------------------------------------------------------------------===//
+
+#include "target/shuffle.h"
+
+#pragma omp declare target
+
+/// Lane mask with every bit set, passed to the shuffle primitives.
+static constexpr uint64_t AllLanes = ~uint64_t(0);
+
+/// Shuffle-down of a 32-bit value using the all-lanes mask.
+int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
+  const int32_t Shuffled =
+      __kmpc_impl_shfl_down_sync(AllLanes, val, delta, size);
+  return Shuffled;
+}
+
+/// Shuffle-down of a 64-bit value: unpack it into two 32-bit parts, shuffle
+/// each part with the all-lanes mask, and pack the results again.
+int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
+  uint32_t Low;
+  uint32_t High;
+  __kmpc_impl_unpack(val, Low, High);
+  High = __kmpc_impl_shfl_down_sync(AllLanes, High, delta, size);
+  Low = __kmpc_impl_shfl_down_sync(AllLanes, Low, delta, size);
+  return __kmpc_impl_pack(Low, High);
+}
+
+#pragma omp end declare target
diff --git a/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
index d4ff23a..26d7706 100644
--- a/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -128,6 +128,7 @@
   ${devicertl_common_directory}/src/support.cu
   ${devicertl_common_directory}/src/sync.cu
   ${devicertl_common_directory}/src/task.cu
+  ${devicertl_common_directory}/src/shuffle.cpp
   src/target_impl.cu
 )
 
@@ -140,6 +141,7 @@
              -Xclang -target-feature -Xclang +ptx61
              -D__CUDACC__
              -I${devicertl_base_directory}
+             -I${devicertl_common_directory}/include
              -I${devicertl_nvptx_directory}/src)
 
 if(${LIBOMPTARGET_NVPTX_DEBUG})
diff --git a/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
index 3ed8c8b..959452a 100644
--- a/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -59,18 +59,6 @@
   return Mask;
 }
 
-DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
-                                     int32_t SrcLane) {
-  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
-}
-
-DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
-                                          int32_t Var, uint32_t Delta,
-                                          int32_t Width) {
-  int32_t T = ((WARPSIZE - Width) << 8) | 0x1f;
-  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
-}
-
 DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }
 
 DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
diff --git a/libomptarget/deviceRTLs/target_interface.h b/libomptarget/deviceRTLs/target_interface.h
index 6b5477e..058021c 100644
--- a/libomptarget/deviceRTLs/target_interface.h
+++ b/libomptarget/deviceRTLs/target_interface.h
@@ -57,12 +57,6 @@
 
 EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask();
 
-EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
-                                     int32_t SrcLane);
-EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
-                                          int32_t Var, uint32_t Delta,
-                                          int32_t Width);
-
 EXTERN void __kmpc_impl_syncthreads();
 EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask);