Factor architecture dependent code out of loop.cu

Summary:
[libomptarget] Factor architecture dependent code out of loop.cu

Related to the patch series starting D64217. Added subscribers to said series as reviewers. This effort is smaller in scope.

This patch factors out just enough architecture dependent code from loop.cu to allow the same source to be used with amdgcn, given a different target_impl.h. Testing is that the same bitcode (modulo variable names) is generated for libomptarget before and after the refactor, for nvptx and the out of tree amdgcn.

Reviewers: jdoerfert, ABataev, bollu, jfb, tra, grokos, Hahnfeld, guansong, xtian, gregrodgers, ronlieb, hfinkel, gtbercea, guraypp, arpith-jacob

Reviewed By: jdoerfert, ABataev

Subscribers: dexonsmith, openmp-commits

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D65836

git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@368751 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/libomptarget/deviceRTLs/nvptx/src/loop.cu b/libomptarget/deviceRTLs/nvptx/src/loop.cu
index c255137..ff9fac3 100644
--- a/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "omptarget-nvptx.h"
+#include "target_impl.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////
@@ -381,20 +382,18 @@
 
   INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) {
     int lo, hi;
-    asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
+    __kmpc_impl_unpack(val, lo, hi);
     hi = __SHFL_SYNC(active, hi, leader);
     lo = __SHFL_SYNC(active, lo, leader);
-    asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
-    return val;
+    return __kmpc_impl_pack(lo, hi);
   }
 
   INLINE static uint64_t NextIter() {
-    unsigned int active = __ACTIVEMASK();
-    int leader = __ffs(active) - 1;
-    int change = __popc(active);
-    unsigned lane_mask_lt;
-    asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt));
-    unsigned int rank = __popc(active & lane_mask_lt);
+    __kmpc_impl_lanemask_t active = __ACTIVEMASK();
+    int leader = __kmpc_impl_ffs(active) - 1;
+    int change = __kmpc_impl_popc(active);
+    __kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt();
+    unsigned int rank = __kmpc_impl_popc(active & lane_mask_lt);
     uint64_t warp_res;
     if (rank == 0) {
       warp_res = atomicAdd(
diff --git a/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/libomptarget/deviceRTLs/nvptx/src/target_impl.h
new file mode 100644
index 0000000..6104695
--- /dev/null
+++ b/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -0,0 +1,41 @@
+//===------------ target_impl.h - NVPTX OpenMP GPU options ------- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Definitions of target specific functions
+//
+//===----------------------------------------------------------------------===//
+#ifndef _TARGET_IMPL_H_
+#define _TARGET_IMPL_H_
+
+#include <stdint.h>
+
+#include "option.h"
+
+INLINE void __kmpc_impl_unpack(int64_t val, int32_t &lo, int32_t &hi) {
+  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
+}
+
+INLINE int64_t __kmpc_impl_pack(int32_t lo, int32_t hi) {
+  int64_t val;
+  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
+  return val;
+}
+
+typedef uint32_t __kmpc_impl_lanemask_t;
+
+INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
+  __kmpc_impl_lanemask_t res;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res));
+  return res;
+}
+
+INLINE int __kmpc_impl_ffs(uint32_t x) { return __ffs(x); }
+
+INLINE int __kmpc_impl_popc(uint32_t x) { return __popc(x); }
+
+#endif