[Libomptarget] Remove remaining inline assembly from the device RTL (#79922)

Summary:
Recent patches have added some missing intrinsic functions for NVPTX. This
patch gets rid of all the remaining uses of inline assembly. The one
change that wasn't directly replaced with a built-in was the `pack` and
`unpack` implementations. However, the generic C implementation compiles
to equivalent SASS when run through PTXAS.
GitOrigin-RevId: 6aed6cc40ec0006bb43f1ec4b2ec87702392ad6e
diff --git a/libomptarget/DeviceRTL/CMakeLists.txt b/libomptarget/DeviceRTL/CMakeLists.txt
index 1ce3e1e..2509f12 100644
--- a/libomptarget/DeviceRTL/CMakeLists.txt
+++ b/libomptarget/DeviceRTL/CMakeLists.txt
@@ -293,7 +293,7 @@
   if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
     compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)
   elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
-    compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx61)
+    compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63)
   else()
     libomptarget_error_say("Unknown GPU architecture '${gpu_arch}'")
   endif()
diff --git a/libomptarget/DeviceRTL/src/Mapping.cpp b/libomptarget/DeviceRTL/src/Mapping.cpp
index 822b8dc..31dd805 100644
--- a/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -154,23 +154,11 @@
 
 const llvm::omp::GV &getGridValue() { return llvm::omp::NVPTXGridValues; }
 
-LaneMaskTy activemask() {
-  unsigned int Mask;
-  asm("activemask.b32 %0;" : "=r"(Mask));
-  return Mask;
-}
+LaneMaskTy activemask() { return __nvvm_activemask(); }
 
-LaneMaskTy lanemaskLT() {
-  __kmpc_impl_lanemask_t Res;
-  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(Res));
-  return Res;
-}
+LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); }
 
-LaneMaskTy lanemaskGT() {
-  __kmpc_impl_lanemask_t Res;
-  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(Res));
-  return Res;
-}
+LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); }
 
 uint32_t getThreadIdInBlock(int32_t Dim) {
   switch (Dim) {
diff --git a/libomptarget/DeviceRTL/src/Misc.cpp b/libomptarget/DeviceRTL/src/Misc.cpp
index 87d5687..c24af94 100644
--- a/libomptarget/DeviceRTL/src/Misc.cpp
+++ b/libomptarget/DeviceRTL/src/Misc.cpp
@@ -62,9 +62,8 @@
 }
 
 double getWTime() {
-  unsigned long long nsecs;
-  asm volatile("mov.u64  %0, %%globaltimer;" : "=l"(nsecs));
-  return (double)nsecs * getWTick();
+  uint64_t nsecs = __nvvm_read_ptx_sreg_globaltimer();
+  return static_cast<double>(nsecs) * getWTick();
 }
 
 #pragma omp end declare variant
diff --git a/libomptarget/DeviceRTL/src/Synchronization.cpp b/libomptarget/DeviceRTL/src/Synchronization.cpp
index 57f98a3..80ba87b 100644
--- a/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -345,10 +345,7 @@
   // The named barrier for active parallel threads of a team in an L1 parallel
   // region to synchronize with each other.
   constexpr int BarrierNo = 7;
-  asm volatile("barrier.sync %0, %1;"
-               :
-               : "r"(BarrierNo), "r"(NumThreads)
-               : "memory");
+  __nvvm_barrier_sync_cnt(BarrierNo, NumThreads);
 }
 
 void fenceTeam(atomic::OrderingTy) { __nvvm_membar_cta(); }
@@ -361,7 +358,7 @@
 
 void syncThreads(atomic::OrderingTy Ordering) {
   constexpr int BarrierNo = 8;
-  asm volatile("barrier.sync %0;" : : "r"(BarrierNo) : "memory");
+  __nvvm_barrier_sync(BarrierNo);
 }
 
 void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); }
diff --git a/libomptarget/DeviceRTL/src/Utils.cpp b/libomptarget/DeviceRTL/src/Utils.cpp
index 7da4da4..d07ac0f 100644
--- a/libomptarget/DeviceRTL/src/Utils.cpp
+++ b/libomptarget/DeviceRTL/src/Utils.cpp
@@ -22,52 +22,17 @@
 namespace impl {
 
 bool isSharedMemPtr(const void *Ptr) { return false; }
-void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits);
-uint64_t Pack(uint32_t LowBits, uint32_t HighBits);
-
-/// AMDGCN Implementation
-///
-///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
 
 void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
   static_assert(sizeof(unsigned long) == 8, "");
-  *LowBits = (uint32_t)(Val & 0x00000000FFFFFFFFUL);
-  *HighBits = (uint32_t)((Val & 0xFFFFFFFF00000000UL) >> 32);
+  *LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
+  *HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32);
 }
 
 uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
   return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
 }
 
-#pragma omp end declare variant
-///}
-
-/// NVPTX Implementation
-///
-///{
-#pragma omp begin declare variant match(                                       \
-        device = {arch(nvptx, nvptx64)},                                       \
-            implementation = {extension(match_any)})
-
-void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
-  uint32_t LowBitsLocal, HighBitsLocal;
-  asm("mov.b64 {%0,%1}, %2;"
-      : "=r"(LowBitsLocal), "=r"(HighBitsLocal)
-      : "l"(Val));
-  *LowBits = LowBitsLocal;
-  *HighBits = HighBitsLocal;
-}
-
-uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
-  uint64_t Val;
-  asm("mov.b64 %0, {%1,%2};" : "=l"(Val) : "r"(LowBits), "r"(HighBits));
-  return Val;
-}
-
-#pragma omp end declare variant
-///}
-
 int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
 int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
                     int32_t Width);