[OpenMP] Directly use user's grid and block size in kernel language mode (#70612)

In kernel language mode, use user's grid and blocks size directly. No
validity
check, which means if user's values are too large, the launch will fail,
similar
to what CUDA and HIP are doing right now.

GitOrigin-RevId: 3768039913be32666a316a2b5c12739c423dbc61
diff --git a/libomptarget/plugins-nextgen/common/include/PluginInterface.h b/libomptarget/plugins-nextgen/common/include/PluginInterface.h
index 716b0ad..28484ae 100644
--- a/libomptarget/plugins-nextgen/common/include/PluginInterface.h
+++ b/libomptarget/plugins-nextgen/common/include/PluginInterface.h
@@ -397,6 +397,9 @@
 
   /// The prototype kernel launch environment.
   KernelLaunchEnvironmentTy KernelLaunchEnvironment;
+
+  /// If the kernel is a bare kernel.
+  bool IsBareKernel = false;
 };
 
 /// Class representing a map of host pinned allocations. We track these pinned
diff --git a/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
index 1d96468..1c9777d 100644
--- a/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
@@ -436,6 +436,7 @@
        Name, ErrStr.data());
     assert(KernelEnvironment.Configuration.ReductionDataSize == 0 &&
            "Default initialization failed.");
+    IsBareKernel = true;
   }
 
   // Max = Config.Max > 0 ? min(Config.Max, Device.Max) : Device.Max;
@@ -594,6 +595,10 @@
                                         uint32_t ThreadLimitClause[3]) const {
   assert(ThreadLimitClause[1] == 0 && ThreadLimitClause[2] == 0 &&
          "Multi dimensional launch not supported yet.");
+
+  if (IsBareKernel && ThreadLimitClause[0] > 0)
+    return ThreadLimitClause[0];
+
   if (ThreadLimitClause[0] > 0 && isGenericMode())
     ThreadLimitClause[0] += GenericDevice.getWarpSize();
 
@@ -610,6 +615,9 @@
   assert(NumTeamsClause[1] == 0 && NumTeamsClause[2] == 0 &&
          "Multi dimensional launch not supported yet.");
 
+  if (IsBareKernel && NumTeamsClause[0] > 0)
+    return NumTeamsClause[0];
+
   if (NumTeamsClause[0] > 0) {
     // TODO: We need to honor any value and consequently allow more than the
     // block limit. For this we might need to start multiple kernels or let the
diff --git a/libomptarget/test/offloading/ompx_bare.c b/libomptarget/test/offloading/ompx_bare.c
new file mode 100644
index 0000000..fb3810b
--- /dev/null
+++ b/libomptarget/test/offloading/ompx_bare.c
@@ -0,0 +1,38 @@
+// RUN: %libomptarget-compile-generic
+// RUN: env LIBOMPTARGET_INFO=63 %libomptarget-run-generic 2>&1 | %fcheck-generic
+//
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+
+#include <assert.h>
+#include <ompx.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int main(int argc, char *argv[]) {
+  const int num_blocks = 64;
+  const int block_size = 64;
+  const int N = num_blocks * block_size;
+  int *data = (int *)malloc(N * sizeof(int));
+
+  // CHECK: "PluginInterface" device 0 info: Launching kernel __omp_offloading_{{.*}} with 64 blocks and 64 threads in SPMD mode
+
+#pragma omp target teams ompx_bare num_teams(num_blocks) thread_limit(block_size) map(from: data[0:N])
+  {
+    int bid = ompx_block_id_x();
+    int bdim = ompx_block_dim_x();
+    int tid = ompx_thread_id_x();
+    int idx = bid * bdim + tid;
+    data[idx] = idx;
+  }
+
+  for (int i = 0; i < N; ++i)
+    assert(data[i] == i);
+
+  // CHECK: PASS
+  printf("PASS\n");
+
+  return 0;
+}