[Libomptarget] Remove remaining inline assembly from the device RTL (#79922)
Summary:
Recent patches have added some missing intrinsic functions for NVPTX. This
patch gets rid of all the remaining uses of inline assembly. The one
change that wasn't directly replaced with a built-in was the `pack` and
`unpack` implementations. However, the generic C implementation yields
equivalent SASS output when run through PTXAS.
GitOrigin-RevId: 6aed6cc40ec0006bb43f1ec4b2ec87702392ad6e
diff --git a/libomptarget/DeviceRTL/CMakeLists.txt b/libomptarget/DeviceRTL/CMakeLists.txt
index 1ce3e1e..2509f12 100644
--- a/libomptarget/DeviceRTL/CMakeLists.txt
+++ b/libomptarget/DeviceRTL/CMakeLists.txt
@@ -293,7 +293,7 @@
if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)
elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
- compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx61)
+ compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63)
else()
libomptarget_error_say("Unknown GPU architecture '${gpu_arch}'")
endif()
diff --git a/libomptarget/DeviceRTL/src/Mapping.cpp b/libomptarget/DeviceRTL/src/Mapping.cpp
index 822b8dc..31dd805 100644
--- a/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -154,23 +154,11 @@
const llvm::omp::GV &getGridValue() { return llvm::omp::NVPTXGridValues; }
-LaneMaskTy activemask() {
- unsigned int Mask;
- asm("activemask.b32 %0;" : "=r"(Mask));
- return Mask;
-}
+LaneMaskTy activemask() { return __nvvm_activemask(); }
-LaneMaskTy lanemaskLT() {
- __kmpc_impl_lanemask_t Res;
- asm("mov.u32 %0, %%lanemask_lt;" : "=r"(Res));
- return Res;
-}
+LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); }
-LaneMaskTy lanemaskGT() {
- __kmpc_impl_lanemask_t Res;
- asm("mov.u32 %0, %%lanemask_gt;" : "=r"(Res));
- return Res;
-}
+LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); }
uint32_t getThreadIdInBlock(int32_t Dim) {
switch (Dim) {
diff --git a/libomptarget/DeviceRTL/src/Misc.cpp b/libomptarget/DeviceRTL/src/Misc.cpp
index 87d5687..c24af94 100644
--- a/libomptarget/DeviceRTL/src/Misc.cpp
+++ b/libomptarget/DeviceRTL/src/Misc.cpp
@@ -62,9 +62,8 @@
}
double getWTime() {
- unsigned long long nsecs;
- asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(nsecs));
- return (double)nsecs * getWTick();
+ uint64_t nsecs = __nvvm_read_ptx_sreg_globaltimer();
+ return static_cast<double>(nsecs) * getWTick();
}
#pragma omp end declare variant
diff --git a/libomptarget/DeviceRTL/src/Synchronization.cpp b/libomptarget/DeviceRTL/src/Synchronization.cpp
index 57f98a3..80ba87b 100644
--- a/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -345,10 +345,7 @@
// The named barrier for active parallel threads of a team in an L1 parallel
// region to synchronize with each other.
constexpr int BarrierNo = 7;
- asm volatile("barrier.sync %0, %1;"
- :
- : "r"(BarrierNo), "r"(NumThreads)
- : "memory");
+ __nvvm_barrier_sync_cnt(BarrierNo, NumThreads);
}
void fenceTeam(atomic::OrderingTy) { __nvvm_membar_cta(); }
@@ -361,7 +358,7 @@
void syncThreads(atomic::OrderingTy Ordering) {
constexpr int BarrierNo = 8;
- asm volatile("barrier.sync %0;" : : "r"(BarrierNo) : "memory");
+ __nvvm_barrier_sync(BarrierNo);
}
void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); }
diff --git a/libomptarget/DeviceRTL/src/Utils.cpp b/libomptarget/DeviceRTL/src/Utils.cpp
index 7da4da4..d07ac0f 100644
--- a/libomptarget/DeviceRTL/src/Utils.cpp
+++ b/libomptarget/DeviceRTL/src/Utils.cpp
@@ -22,52 +22,17 @@
namespace impl {
bool isSharedMemPtr(const void *Ptr) { return false; }
-void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits);
-uint64_t Pack(uint32_t LowBits, uint32_t HighBits);
-
-/// AMDGCN Implementation
-///
-///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
static_assert(sizeof(unsigned long) == 8, "");
- *LowBits = (uint32_t)(Val & 0x00000000FFFFFFFFUL);
- *HighBits = (uint32_t)((Val & 0xFFFFFFFF00000000UL) >> 32);
+ *LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
+ *HighBits = static_cast<uint32_t>((Val & 0xFFFFFFFF00000000UL) >> 32);
}
uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits;
}
-#pragma omp end declare variant
-///}
-
-/// NVPTX Implementation
-///
-///{
-#pragma omp begin declare variant match( \
- device = {arch(nvptx, nvptx64)}, \
- implementation = {extension(match_any)})
-
-void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
- uint32_t LowBitsLocal, HighBitsLocal;
- asm("mov.b64 {%0,%1}, %2;"
- : "=r"(LowBitsLocal), "=r"(HighBitsLocal)
- : "l"(Val));
- *LowBits = LowBitsLocal;
- *HighBits = HighBitsLocal;
-}
-
-uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
- uint64_t Val;
- asm("mov.b64 %0, {%1,%2};" : "=l"(Val) : "r"(LowBits), "r"(HighBits));
- return Val;
-}
-
-#pragma omp end declare variant
-///}
-
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
int32_t Width);