[OpenMP] Improve default block count selection for low block counts

If a combined loop has insufficient parallelism (= low trip count), we
might end up with too few teams/blocks. To counter that we can reduce
the number of threads per team. This patch implements a heuristic for
doing so and exposes a new environment variable to control the minimum
number of threads to be employed in this case.
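
The heuristic can be sketched as follows. This is a simplified, standalone
illustration; the helper name pickLaunchConfig and its in/out parameters are
made up for the sketch, the actual logic lives in
GenericKernelTy::getNumBlocks:

  #include <algorithm>
  #include <cstdint>

  // Sketch only: pick a block/thread configuration for a combined loop with
  // LoopTripCount iterations, preferring DefaultNumBlocks blocks but never
  // lowering the thread count below MinThreads.
  static void pickLaunchConfig(uint64_t LoopTripCount,
                               uint64_t DefaultNumBlocks, uint32_t MinThreads,
                               uint32_t &NumThreads, uint64_t &NumBlocks) {
    // Only ever lower the thread count; honor a user-given thread limit.
    MinThreads = std::min(MinThreads, NumThreads);

    if (LoopTripCount >= DefaultNumBlocks * NumThreads) {
      // Enough parallelism for both teams and threads; keep the thread count.
    } else if (LoopTripCount >= DefaultNumBlocks * MinThreads) {
      // Enough parallelism for teams if we shrink the thread count; round the
      // per-team thread count up to the next power of two ("full warps").
      uint64_t ThreadsForDefaultBlocks =
          (LoopTripCount + DefaultNumBlocks - 1) / DefaultNumBlocks;
      uint64_t P2 = 1;
      while (P2 < ThreadsForDefaultBlocks)
        P2 *= 2;
      NumThreads = static_cast<uint32_t>(std::min<uint64_t>(NumThreads, P2));
    } else {
      // Not enough parallelism even at the minimum; clamp to the minimum.
      NumThreads = std::min(NumThreads, MinThreads);
    }

    // One loop iteration per thread, rounded up, but never more blocks than
    // the device prefers to run.
    NumBlocks = (LoopTripCount + NumThreads - 1) / NumThreads;
    NumBlocks = std::min<uint64_t>(NumBlocks, DefaultNumBlocks);
  }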

Issue reported by:
Felipe Cabarcas Jaramillo <cabarcas@udel.edu> (@fel-cab).

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D152014

GitOrigin-RevId: 6629a96a8ce5f07c72bd4931180a3ca9fc535cbb
diff --git a/docs/design/Runtimes.rst b/docs/design/Runtimes.rst
index 6004129..978ef88 100644
--- a/docs/design/Runtimes.rst
+++ b/docs/design/Runtimes.rst
@@ -720,6 +720,7 @@
     * ``LIBOMPTARGET_JIT_REPLACEMENT_MODULE=<in:Filename> (LLVM-IR file)``
     * ``LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE=<out:Filename> (LLVM-IR file)``
     * ``LIBOMPTARGET_JIT_POST_OPT_IR_MODULE=<out:Filename> (LLVM-IR file)``
+    * ``LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT=<Num> (default: 32)``
 
 LIBOMPTARGET_DEBUG
 """"""""""""""""""
@@ -1108,7 +1109,7 @@
 
 
 LIBOMPTARGET_JIT_POST_OPT_IR_MODULE
-""""""""""""""""""""""""""""""""""
+"""""""""""""""""""""""""""""""""""
 
 This environment variable can be used to extract the embedded device code after
 the device JIT runs additional IR optimizations on it (see
@@ -1118,6 +1119,18 @@
 :ref:`LIBOMPTARGET_JIT_REPLACEMENT_MODULE`.
 
 
+LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT
+"""""""""""""""""""""""""""""""""""""""""""
+
+This environment variable defines a lower bound for the number of threads if a
+combined kernel, e.g., `target teams distribute parallel for`, has insufficient
+parallelism. If the trip count of the loop is lower than the maximal number of
+threads times the number of teams (also known as blocks) the device prefers
+(see also :ref:`LIBOMPTARGET_AMDGPU_TEAMS_PER_CU`), we reduce the thread count
+to increase the outer (team/block) parallelism. The thread count is never
+reduced below the value of this environment variable, however.
+
+
 
 .. _libomptarget_plugin:
 
diff --git a/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
index 8899f45..c86b2eb 100644
--- a/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/JSON.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/MemoryBuffer.h"
 
 #include <cstdint>
@@ -301,7 +302,7 @@
 uint64_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
                                        uint32_t NumTeamsClause[3],
                                        uint64_t LoopTripCount,
-                                       uint32_t NumThreads) const {
+                                       uint32_t &NumThreads) const {
   assert(NumTeamsClause[1] == 0 && NumTeamsClause[2] == 0 &&
          "Multi dimensional launch not supported yet.");
 
@@ -312,14 +313,50 @@
     return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit());
   }
 
+  uint64_t DefaultNumBlocks = getDefaultNumBlocks(GenericDevice);
   uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max();
   if (LoopTripCount > 0) {
     if (isSPMDMode()) {
       // We have a combined construct, i.e. `target teams distribute
       // parallel for [simd]`. We launch so many teams so that each thread
-      // will execute one iteration of the loop. round up to the nearest
-      // integer
-      TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
+      // will execute one iteration of the loop; rounded up to the nearest
+      // integer. However, if that results in too few teams, we artificially
+      // reduce the thread count per team to increase the outer parallelism.
+      auto MinThreads = GenericDevice.getMinThreadsForLowTripCountLoop();
+      MinThreads = std::min(MinThreads, NumThreads);
+
+      // Honor the thread_limit clause; only lower the number of threads.
+      auto OldNumThreads = NumThreads;
+      if (LoopTripCount >= DefaultNumBlocks * NumThreads) {
+        // Enough parallelism for teams and threads.
+        TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
+        assert(TripCountNumBlocks >= DefaultNumBlocks &&
+               "Expected sufficient outer parallelism.");
+      } else if (LoopTripCount >= DefaultNumBlocks * MinThreads) {
+        // Enough parallelism for teams, limit threads.
+
+        // This case is hard; for now, we force "full warps":
+        // First, compute a thread count assuming DefaultNumBlocks.
+        auto NumThreadsDefaultBlocks =
+            (LoopTripCount + DefaultNumBlocks - 1) / DefaultNumBlocks;
+        // Now get a power of two that is larger or equal.
+        auto NumThreadsDefaultBlocksP2 =
+            llvm::PowerOf2Ceil(NumThreadsDefaultBlocks);
+        // Do not increase a thread limit given by the user.
+        NumThreads = std::min(NumThreads, uint32_t(NumThreadsDefaultBlocksP2));
+        assert(NumThreads >= MinThreads &&
+               "Expected sufficient inner parallelism.");
+        TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
+      } else {
+        // Not enough parallelism for teams and threads, limit both.
+        NumThreads = std::min(NumThreads, MinThreads);
+        TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
+      }
+
+      assert(NumThreads * TripCountNumBlocks >= LoopTripCount &&
+             "Expected sufficient parallelism");
+      assert(OldNumThreads >= NumThreads &&
+             "Number of threads cannot be increased!");
     } else {
       assert((isGenericMode() || isGenericSPMDMode()) &&
              "Unexpected execution mode!");
@@ -339,8 +376,7 @@
     }
   }
   // If the loops are long running we rather reuse blocks than spawn too many.
-  uint32_t PreferredNumBlocks = std::min(uint32_t(TripCountNumBlocks),
-                                         getDefaultNumBlocks(GenericDevice));
+  uint32_t PreferredNumBlocks = std::min(TripCountNumBlocks, DefaultNumBlocks);
   return std::min(PreferredNumBlocks, GenericDevice.getBlockLimit());
 }
 
diff --git a/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
index 542d518..189406a 100644
--- a/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -313,9 +313,11 @@
   /// user-defined threads and block clauses.
   uint32_t getNumThreads(GenericDeviceTy &GenericDevice,
                          uint32_t ThreadLimitClause[3]) const;
+
+  /// The number of threads \p NumThreads may be lowered by this method.
   uint64_t getNumBlocks(GenericDeviceTy &GenericDevice,
                         uint32_t BlockLimitClause[3], uint64_t LoopTripCount,
-                        uint32_t NumThreads) const;
+                        uint32_t &NumThreads) const;
 
   /// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode.
   bool isGenericSPMDMode() const {
@@ -740,6 +742,14 @@
     return std::move(MB);
   }
 
+  /// The minimum number of threads we use for a low-trip-count combined loop.
+  /// Instead of using more threads per team we increase the outer
+  /// (block/team) parallelism.
+  /// @see OMPX_MinThreadsForLowTripCount
+  virtual uint32_t getMinThreadsForLowTripCountLoop() {
+    return OMPX_MinThreadsForLowTripCount;
+  }
+
 private:
   /// Register offload entry for global variable.
   Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage,
@@ -783,6 +793,12 @@
   UInt64Envar OMPX_TargetStackSize;
   UInt64Envar OMPX_TargetHeapSize;
 
+  /// Environment flag to set the minimum number of threads we use for a
+  /// low-trip-count combined loop. Instead of using more threads per team we
+  /// increase the outer (block/team) parallelism.
+  UInt32Envar OMPX_MinThreadsForLowTripCount =
+      UInt32Envar("LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT", 32);
+
 protected:
   /// Return the execution mode used for kernel \p Name.
   Expected<OMPTgtExecModeFlags> getExecutionModeForKernel(StringRef Name,
diff --git a/libomptarget/test/offloading/small_trip_count.c b/libomptarget/test/offloading/small_trip_count.c
new file mode 100644
index 0000000..f502a68
--- /dev/null
+++ b/libomptarget/test/offloading/small_trip_count.c
@@ -0,0 +1,41 @@
+// clang-format off
+// RUN: %libomptarget-compile-generic
+// RUN: env LIBOMPTARGET_INFO=16 \
+// RUN:   %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=DEFAULT
+// RUN: env LIBOMPTARGET_INFO=16 LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT=8 \
+// RUN:   %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=EIGHT
+
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#define N 128
+
+__attribute__((optnone)) void optnone() {}
+
+int main() {
+  // DEFAULT: Launching kernel {{.+_main_.+}} with 4 blocks and 32 threads in SPMD mode
+  // EIGHT: Launching kernel {{.+_main_.+}} with 16 blocks and 8 threads in SPMD mode
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+  // DEFAULT: Launching kernel {{.+_main_.+}} with 4 blocks and 32 threads in SPMD mode
+  // EIGHT: Launching kernel {{.+_main_.+}} with 16 blocks and 8 threads in SPMD mode
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < N - 1; ++i) {
+    optnone();
+  }
+  // DEFAULT: Launching kernel {{.+_main_.+}} with 5 blocks and 32 threads in SPMD mode
+  // EIGHT: Launching kernel {{.+_main_.+}} with 17 blocks and 8 threads in SPMD mode
+#pragma omp target teams distribute parallel for simd
+  for (int i = 0; i < N + 1; ++i) {
+    optnone();
+  }
+  // DEFAULT: Launching kernel {{.+_main_.+}} with 32 blocks and 4 threads in SPMD mode
+  // EIGHT: Launching kernel {{.+_main_.+}} with 32 blocks and 4 threads in SPMD mode
+#pragma omp target teams distribute parallel for simd thread_limit(4)
+  for (int i = 0; i < N; ++i) {
+    optnone();
+  }
+}
+
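
As a worked example of the heuristic and of the CHECK lines in the test above:
with the trip count of 128 used there, and assuming the device prefers more
than 16 blocks (as GPUs do), the heuristic takes the last branch ("limit
both"), so the thread count is clamped to the minimum and the block count
follows from one iteration per thread. A tiny standalone computation (not part
of the patch) reproducing the DEFAULT and EIGHT configurations:

  #include <cstdint>
  #include <cstdio>

  int main() {
    const uint64_t TripCount = 128; // trip count of the first test kernel
    // Default minimum (32) vs. LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT=8.
    for (uint32_t MinThreads : {32u, 8u}) {
      uint32_t Threads = MinThreads; // clamped to the minimum in this regime
      uint64_t Blocks = (TripCount + Threads - 1) / Threads; // round up
      std::printf("%llu blocks x %u threads\n", (unsigned long long)Blocks,
                  Threads);
    }
    // Prints "4 blocks x 32 threads" and "16 blocks x 8 threads", matching
    // the DEFAULT and EIGHT check lines in the test.
    return 0;
  }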