[Clang] Fix GPU match any truncating 64-bit lane mask

Summary:
The match mask is a lane mask, so it needs all 64 bits on wave64 targets;
storing it in a uint32_t truncated the upper lanes. At some point we
should introduce __lanemask_t for this.
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 0fb3916..d308cc9 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -261,7 +261,7 @@
 // Returns a bitmask marking all lanes that have the same value of __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x) {
-  uint32_t __match_mask = 0;
+  uint64_t __match_mask = 0;
 
   bool __done = 0;
   while (__gpu_ballot(__lane_mask, !__done)) {