[Libomptarget] Use NVPTX lane id intrinsic in DeviceRTL (#84928)

Summary:
We are currently taking the lower 5 bites of the thread ID as the warp
ID. This doesn't work in non-1D grids and is also slower than just using
the dedicated hardware register.
GitOrigin-RevId: 9f69d3cf88905df5006f93dce536b7e73c0b1735
diff --git a/libomptarget/DeviceRTL/src/Mapping.cpp b/libomptarget/DeviceRTL/src/Mapping.cpp
index 31dd805..b2028a8 100644
--- a/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -172,10 +172,7 @@
   UNREACHABLE("Dim outside range!");
 }
 
-uint32_t getThreadIdInWarp() {
-  return impl::getThreadIdInBlock(mapping::DIM_X) &
-         (mapping::getWarpSize() - 1);
-}
+uint32_t getThreadIdInWarp() { return __nvvm_read_ptx_sreg_laneid(); }
 
 uint32_t getBlockIdInKernel(int32_t Dim) {
   switch (Dim) {