[OpenMP] Fixed a crash in hidden helper thread

It is reported that after enabling hidden helper thread, the program
can hit the assertion `new_gtid < __kmp_threads_capacity` sometimes. The root
cause is explained as follows. Let's say the default `__kmp_threads_capacity` is
`N`. If hidden helper thread is enabled, `__kmp_threads_capacity` will be offset
to `N+8` by default. If the number of threads we need exceeds `N+8`, e.g. via
`num_threads` clause, we need to expand `__kmp_threads`. In
`__kmp_expand_threads`, the expansion starts from `__kmp_threads_capacity`, and
repeatedly doubling it until the new capacity meets the requirement. Let's
assume the new requirement is `Y`.  If `Y` happens to meet the constraint
`(N+8)*2^X=Y` where `X` is the number of iterations, the new capacity is not
enough because we have 8 slots for hidden helper threads.

Here is an example.
```
#include <vector>

int main(int argc, char *argv[]) {
  constexpr const size_t N = 1344;
  std::vector<int> data(N);

#pragma omp parallel for
  for (unsigned i = 0; i < N; ++i) {
    data[i] = i;
  }

#pragma omp parallel for num_threads(N)
  for (unsigned i = 0; i < N; ++i) {
    data[i] += i;
  }

  return 0;
}
```
My CPU is 20C40T, then `__kmp_threads_capacity` is 160. After offset,
`__kmp_threads_capacity` becomes 168. `1344 = (160+8)*2^3`, then the assertions
hit.

Reviewed By: protze.joachim

Differential Revision: https://reviews.llvm.org/D98838

GitOrigin-RevId: 2df65f87c1ea81008768e14522e5d9277234ba70
diff --git a/runtime/src/kmp_runtime.cpp b/runtime/src/kmp_runtime.cpp
index 8f42a9d..8ebbd03 100644
--- a/runtime/src/kmp_runtime.cpp
+++ b/runtime/src/kmp_runtime.cpp
@@ -854,6 +854,12 @@
   if (TCR_PTR(__kmp_threads[0]) == NULL) {
     --capacity;
   }
+  // If it is not for initializing the hidden helper team, we need to take
+  // __kmp_hidden_helper_threads_num out of the capacity because it is included
+  // in __kmp_threads_capacity.
+  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
+    capacity -= __kmp_hidden_helper_threads_num;
+  }
   if (__kmp_nth + new_nthreads -
           (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
       capacity) {
@@ -3607,6 +3613,13 @@
     --capacity;
   }
 
+  // If it is not for initializing the hidden helper team, we need to take
+  // __kmp_hidden_helper_threads_num out of the capacity because it is included
+  // in __kmp_threads_capacity.
+  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
+    capacity -= __kmp_hidden_helper_threads_num;
+  }
+
   /* see if there are too many threads */
   if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
     if (__kmp_tp_cached) {
@@ -3639,7 +3652,7 @@
     /* find an available thread slot */
     // Don't reassign the zero slot since we need that to only be used by
     // initial thread. Slots for hidden helper threads should also be skipped.
-    if (initial_thread && __kmp_threads[0] == NULL) {
+    if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
       gtid = 0;
     } else {
       for (gtid = __kmp_hidden_helper_threads_num + 1;
diff --git a/runtime/src/kmp_settings.cpp b/runtime/src/kmp_settings.cpp
index 35c15ee..dd23348 100644
--- a/runtime/src/kmp_settings.cpp
+++ b/runtime/src/kmp_settings.cpp
@@ -504,9 +504,10 @@
     nth = (4 * __kmp_xproc);
 
   // If hidden helper task is enabled, we initialize the thread capacity with
-  // extra
-  // __kmp_hidden_helper_threads_num.
-  nth += __kmp_hidden_helper_threads_num;
+  // extra __kmp_hidden_helper_threads_num.
+  if (__kmp_enable_hidden_helper) {
+    nth += __kmp_hidden_helper_threads_num;
+  }
 
   if (nth > __kmp_max_nth)
     nth = __kmp_max_nth;
diff --git a/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp b/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp
new file mode 100644
index 0000000..776aee9
--- /dev/null
+++ b/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp
@@ -0,0 +1,45 @@
+// RUN: %libomp-cxx-compile-and-run
+
+#include <omp.h>
+
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <thread>
+#include <vector>
+
+void dummy_root() {
+  // omp_get_max_threads() will do middle initialization
+  int nthreads = omp_get_max_threads();
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+}
+
+int main(int argc, char *argv[]) {
+  const int N = std::min(std::max(std::max(32, 4 * omp_get_max_threads()),
+                                  4 * omp_get_num_procs()),
+                         std::numeric_limits<int>::max());
+
+  std::vector<int> data(N);
+
+  // Create a new thread to initialize the OpenMP RTL. The new thread will not
+  // be taken as the "initial thread".
+  std::thread root(dummy_root);
+
+#pragma omp parallel for num_threads(N)
+  for (unsigned i = 0; i < N; ++i) {
+    data[i] = i;
+  }
+
+#pragma omp parallel for num_threads(N + 1)
+  for (unsigned i = 0; i < N; ++i) {
+    data[i] += i;
+  }
+
+  for (unsigned i = 0; i < N; ++i) {
+    assert(data[i] == 2 * i);
+  }
+
+  root.join();
+
+  return 0;
+}
diff --git a/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp b/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp
new file mode 100644
index 0000000..a9d394f
--- /dev/null
+++ b/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp
@@ -0,0 +1,31 @@
+// RUN: %libomp-cxx-compile-and-run
+
+#include <omp.h>
+
+#include <algorithm>
+#include <cassert>
+#include <vector>
+
+int main(int argc, char *argv[]) {
+  const int N = std::min(std::max(std::max(32, 4 * omp_get_max_threads()),
+                                  4 * omp_get_num_procs()),
+                         std::numeric_limits<int>::max());
+
+  std::vector<int> data(N);
+
+#pragma omp parallel for num_threads(N)
+  for (unsigned i = 0; i < N; ++i) {
+    data[i] = i;
+  }
+
+#pragma omp parallel for num_threads(N + 1)
+  for (unsigned i = 0; i < N; ++i) {
+    data[i] += i;
+  }
+
+  for (unsigned i = 0; i < N; ++i) {
+    assert(data[i] == 2 * i);
+  }
+
+  return 0;
+}