[test-suite, CUDA] Run test kernel with just one thread.

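For some reason this test is flaky on sm_60+, and the intermittent failures
are unrelated to what the test is checking. Launching the kernel with a single
block of a single thread (<<<1, 1>>>) instead of 32 blocks of 32 threads
(<<<32, 32>>>) should reduce the failure rate.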
diff --git a/External/CUDA/new.cu b/External/CUDA/new.cu
index 23fc530..95ef760 100644
--- a/External/CUDA/new.cu
+++ b/External/CUDA/new.cu
@@ -58,7 +58,7 @@
 }
 
 int main() {
-  kernel<<<32, 32>>>();
+  kernel<<<1, 1>>>();
   cudaError_t err = cudaDeviceSynchronize();
   if (err != cudaSuccess) {
     printf("CUDA error %d\n", (int)err);