[libc] Use __builtin_ffsll for RPC lane mask (#85000)

src/__support/GPU/utils.h doesn't compile on 32-bit platforms because
__builtin_ffsl takes a long, which is only 32 bits wide there. Use
__builtin_ffsll, which takes a long long that is guaranteed to be at
least 64 bits wide.

GitOrigin-RevId: c6a93fe80b3cf30ff82d06e959c1177798c858ae
diff --git a/src/__support/GPU/utils.h b/src/__support/GPU/utils.h
index 93022e8..cb04a35 100644
--- a/src/__support/GPU/utils.h
+++ b/src/__support/GPU/utils.h
@@ -23,7 +23,7 @@
 namespace gpu {
 /// Get the first active thread inside the lane.
 LIBC_INLINE uint64_t get_first_lane_id(uint64_t lane_mask) {
-  return __builtin_ffsl(lane_mask) - 1;
+  return __builtin_ffsll(lane_mask) - 1;
 }
 
 /// Conditional that is only true for a single thread in a lane.