[OpenMP] runtime support for efficient partitioning of collapsed triangular loops (#83939)

This PR adds OMP runtime support for more efficient partitioning of
certain types of collapsed loops that can be used by compilers that
support loop collapsing (i.e. MSVC) to achieve more optimal thread load
balancing.

In particular, this PR addresses double nested upper and lower isosceles
triangular loops of the following types

1. lower triangular 'less_than'
   for (int i=0; i<N; i++)
     for (int j=0; j<i; j++)
2. lower triangular 'less_than_equal'
   for (int i=0; i<N; j++)
     for (int j=0; j<=i; j++)
3. upper triangular
   for (int i=0; i<N; i++)
     for (int j=i; j<N; j++)

Includes tests for the three supported loop types.

---------

Co-authored-by: Vadim Paretsky <b-vadipa@microsoft.com>
GitOrigin-RevId: fcd2d483251605f1b6cdace0ce5baf5dfd31b880
diff --git a/runtime/src/kmp_collapse.cpp b/runtime/src/kmp_collapse.cpp
index 2c410ca..569d2c1 100644
--- a/runtime/src/kmp_collapse.cpp
+++ b/runtime/src/kmp_collapse.cpp
@@ -1272,6 +1272,304 @@
   }
 }
 
+/**************************************************************************
+ * Identify nested loop structure - loops come in the canonical form
+ * Lower triangle matrix: i = 0; i <= N; i++        {0,0}:{N,0}
+ *                        j = 0; j <= 0/-1+1*i; j++ {0,0}:{0/-1,1}
+ * Upper Triangle matrix
+ *                        i = 0;     i <= N; i++    {0,0}:{N,0}
+ *                        j = 0+1*i; j <= N; j++    {0,1}:{N,0}
+ * ************************************************************************/
+nested_loop_type_t
+kmp_identify_nested_loop_structure(/*in*/ bounds_info_t *original_bounds_nest,
+                                   /*in*/ kmp_index_t n) {
+  // only 2-level nested loops are supported
+  if (n != 2) {
+    return nested_loop_type_unkown;
+  }
+  // loops must be canonical
+  KMP_ASSERT(
+      (original_bounds_nest[0].comparison == comparison_t::comp_less_or_eq) &&
+      (original_bounds_nest[1].comparison == comparison_t::comp_less_or_eq));
+  // check outer loop bounds: for triangular need to be {0,0}:{N,0}
+  kmp_uint64 outer_lb0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                        original_bounds_nest[0].lb0_u64);
+  kmp_uint64 outer_ub0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                        original_bounds_nest[0].ub0_u64);
+  kmp_uint64 outer_lb1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                        original_bounds_nest[0].lb1_u64);
+  kmp_uint64 outer_ub1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                        original_bounds_nest[0].ub1_u64);
+  if (outer_lb0_u64 != 0 || outer_lb1_u64 != 0 || outer_ub1_u64 != 0) {
+    return nested_loop_type_unkown;
+  }
+  // check inner bounds to determine triangle type
+  kmp_uint64 inner_lb0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+                                        original_bounds_nest[1].lb0_u64);
+  kmp_uint64 inner_ub0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+                                        original_bounds_nest[1].ub0_u64);
+  kmp_uint64 inner_lb1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+                                        original_bounds_nest[1].lb1_u64);
+  kmp_uint64 inner_ub1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+                                        original_bounds_nest[1].ub1_u64);
+  // lower triangle loop inner bounds need to be {0,0}:{0/-1,1}
+  if (inner_lb0_u64 == 0 && inner_lb1_u64 == 0 &&
+      (inner_ub0_u64 == 0 || inner_ub0_u64 == -1) && inner_ub1_u64 == 1) {
+    return nested_loop_type_lower_triangular_matrix;
+  }
+  // upper triangle loop inner bounds need to be {0,1}:{N,0}
+  if (inner_lb0_u64 == 0 && inner_lb1_u64 == 1 &&
+      inner_ub0_u64 == outer_ub0_u64 && inner_ub1_u64 == 0) {
+    return nested_loop_type_upper_triangular_matrix;
+  }
+  return nested_loop_type_unkown;
+}
+
+/**************************************************************************
+ * SQRT Approximation: https://math.mit.edu/~stevenj/18.335/newton-sqrt.pdf
+ * Start point is x so the result is always > sqrt(x)
+ * The method has uniform convergence, PRECISION is set to 0.1
+ * ************************************************************************/
+#define level_of_precision 0.1
+double sqrt_newton_approx(/*in*/ kmp_uint64 x) {
+  double sqrt_old = 0.;
+  double sqrt_new = (double)x;
+  do {
+    sqrt_old = sqrt_new;
+    sqrt_new = (sqrt_old + x / sqrt_old) / 2;
+  } while ((sqrt_old - sqrt_new) > level_of_precision);
+  return sqrt_new;
+}
+
+/**************************************************************************
+ *  Handle lower triangle matrix in the canonical form
+ *  i = 0; i <= N; i++          {0,0}:{N,0}
+ *  j = 0; j <= 0/-1 + 1*i; j++ {0,0}:{0/-1,1}
+ * ************************************************************************/
+void kmp_handle_lower_triangle_matrix(
+    /*in*/ kmp_uint32 nth,
+    /*in*/ kmp_uint32 tid,
+    /*in */ kmp_index_t n,
+    /*in/out*/ bounds_info_t *original_bounds_nest,
+    /*out*/ bounds_info_t *chunk_bounds_nest) {
+
+  // transfer loop types from the original loop to the chunks
+  for (kmp_index_t i = 0; i < n; ++i) {
+    chunk_bounds_nest[i] = original_bounds_nest[i];
+  }
+  // cleanup iv variables
+  kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                    original_bounds_nest[0].ub0_u64);
+  kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                    original_bounds_nest[0].lb0_u64);
+  kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+                                    original_bounds_nest[1].ub0_u64);
+  // calculate the chunk's lower and upper bounds
+  // the total number of iterations in the loop is the sum of the arithmetic
+  // progression from the outer lower to outer upper bound (inclusive since the
+  // loop is canonical) note that less_than inner loops (inner_ub0 = -1)
+  // effectively make the progression 1-based making N = (outer_ub0 - inner_lb0
+  // + 1) -> N - 1
+  kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1) + inner_ub0;
+  kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
+  // the current thread's number of iterations:
+  // each thread gets an equal number of iterations: total number of iterations
+  // divided by the number of threads plus, if there's a remainder,
+  // the first threads with the number up to the remainder get an additional
+  // iteration each to cover it
+  kmp_uint64 iter_current =
+      iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
+  // cumulative number of iterations executed by all the previous threads:
+  // threads with the tid below the remainder will have (iter_total/nth+1)
+  // elements, and so will all threads before them so the cumulative number of
+  // iterations executed by the all previous will be the current thread's number
+  // of iterations multiplied by the number of previous threads which is equal
+  // to the current thread's tid; threads with the number equal or above the
+  // remainder will have (iter_total/nth) elements so the cumulative number of
+  // iterations previously executed is its number of iterations multipled by the
+  // number of previous threads which is again equal to the current thread's tid
+  // PLUS all the remainder iterations that will have been executed by the
+  // previous threads
+  kmp_uint64 iter_before_current =
+      tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
+  // cumulative number of iterations executed with the current thread is
+  // the cumulative number executed before it plus its own
+  kmp_uint64 iter_with_current = iter_before_current + iter_current;
+  // calculate the outer loop lower bound (lbo) which is the max outer iv value
+  // that gives the number of iterations that is equal or just below the total
+  // number of iterations executed by the previous threads, for less_than
+  // (1-based) inner loops (inner_ub0 == -1) it will be i.e.
+  // lbo*(lbo-1)/2<=iter_before_current => lbo^2-lbo-2*iter_before_current<=0
+  // for less_than_equal (0-based) inner loops (inner_ub == 0) it will be:
+  // i.e. lbo*(lbo+1)/2<=iter_before_current =>
+  // lbo^2+lbo-2*iter_before_current<=0 both cases can be handled similarily
+  // using a parameter to control the equation sign
+  kmp_int64 inner_adjustment = 1 + 2 * inner_ub0;
+  kmp_uint64 lower_bound_outer =
+      (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
+                                      8 * iter_before_current) +
+                   inner_adjustment) /
+          2 -
+      inner_adjustment;
+  // calculate the inner loop lower bound which is the remaining number of
+  // iterations required to hit the total number of iterations executed by the
+  // previous threads giving the starting point of this thread
+  kmp_uint64 lower_bound_inner =
+      iter_before_current -
+      ((lower_bound_outer + inner_adjustment) * lower_bound_outer) / 2;
+  // calculate the outer loop upper bound using the same approach as for the
+  // inner bound except using the total number of iterations executed with the
+  // current thread
+  kmp_uint64 upper_bound_outer =
+      (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
+                                      8 * iter_with_current) +
+                   inner_adjustment) /
+          2 -
+      inner_adjustment;
+  // calculate the inner loop upper bound which is the remaining number of
+  // iterations required to hit the total number of iterations executed after
+  // the current thread giving the starting point of the next thread
+  kmp_uint64 upper_bound_inner =
+      iter_with_current -
+      ((upper_bound_outer + inner_adjustment) * upper_bound_outer) / 2;
+  // adjust the upper bounds down by 1 element to point at the last iteration of
+  // the current thread the first iteration of the next thread
+  if (upper_bound_inner == 0) {
+    // {n,0} => {n-1,n-1}
+    upper_bound_outer -= 1;
+    upper_bound_inner = upper_bound_outer;
+  } else {
+    // {n,m} => {n,m-1} (m!=0)
+    upper_bound_inner -= 1;
+  }
+
+  // assign the values, zeroing out lb1 and ub1 values since the iteration space
+  // is now one-dimensional
+  chunk_bounds_nest[0].lb0_u64 = lower_bound_outer;
+  chunk_bounds_nest[1].lb0_u64 = lower_bound_inner;
+  chunk_bounds_nest[0].ub0_u64 = upper_bound_outer;
+  chunk_bounds_nest[1].ub0_u64 = upper_bound_inner;
+  chunk_bounds_nest[0].lb1_u64 = 0;
+  chunk_bounds_nest[0].ub1_u64 = 0;
+  chunk_bounds_nest[1].lb1_u64 = 0;
+  chunk_bounds_nest[1].ub1_u64 = 0;
+
+#if 0
+  printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
+         tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
+         chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
+#endif
+}
+
+/**************************************************************************
+ *  Handle upper triangle matrix in the canonical form
+ *  i = 0; i <= N; i++     {0,0}:{N,0}
+ *  j = 0+1*i; j <= N; j++ {0,1}:{N,0}
+ * ************************************************************************/
+void kmp_handle_upper_triangle_matrix(
+    /*in*/ kmp_uint32 nth,
+    /*in*/ kmp_uint32 tid,
+    /*in */ kmp_index_t n,
+    /*in/out*/ bounds_info_t *original_bounds_nest,
+    /*out*/ bounds_info_t *chunk_bounds_nest) {
+
+  // transfer loop types from the original loop to the chunks
+  for (kmp_index_t i = 0; i < n; ++i) {
+    chunk_bounds_nest[i] = original_bounds_nest[i];
+  }
+  // cleanup iv variables
+  kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                    original_bounds_nest[0].ub0_u64);
+  kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                    original_bounds_nest[0].lb0_u64);
+  kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+                                    original_bounds_nest[1].ub0_u64);
+  // calculate the chunk's lower and upper bounds
+  // the total number of iterations in the loop is the sum of the arithmetic
+  // progression from the outer lower to outer upper bound (inclusive since the
+  // loop is canonical) note that less_than inner loops (inner_ub0 = -1)
+  // effectively make the progression 1-based making N = (outer_ub0 - inner_lb0
+  // + 1) -> N - 1
+  kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1);
+  kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
+  // the current thread's number of iterations:
+  // each thread gets an equal number of iterations: total number of iterations
+  // divided by the number of threads plus, if there's a remainder,
+  // the first threads with the number up to the remainder get an additional
+  // iteration each to cover it
+  kmp_uint64 iter_current =
+      iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
+  // cumulative number of iterations executed by all the previous threads:
+  // threads with the tid below the remainder will have (iter_total/nth+1)
+  // elements, and so will all threads before them so the cumulative number of
+  // iterations executed by the all previous will be the current thread's number
+  // of iterations multiplied by the number of previous threads which is equal
+  // to the current thread's tid; threads with the number equal or above the
+  // remainder will have (iter_total/nth) elements so the cumulative number of
+  // iterations previously executed is its number of iterations multipled by the
+  // number of previous threads which is again equal to the current thread's tid
+  // PLUS all the remainder iterations that will have been executed by the
+  // previous threads
+  kmp_uint64 iter_before_current =
+      tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
+  // cumulative number of iterations executed with the current thread is
+  // the cumulative number executed before it plus its own
+  kmp_uint64 iter_with_current = iter_before_current + iter_current;
+  // calculate the outer loop lower bound (lbo) which is the max outer iv value
+  // that gives the number of iterations that is equal or just below the total
+  // number of iterations executed by the previous threads, for less_than
+  // (1-based) inner loops (inner_ub0 == -1) it will be i.e.
+  // lbo*(lbo-1)/2<=iter_before_current => lbo^2-lbo-2*iter_before_current<=0
+  // for less_than_equal (0-based) inner loops (inner_ub == 0) it will be:
+  // i.e. lbo*(lbo+1)/2<=iter_before_current =>
+  // lbo^2+lbo-2*iter_before_current<=0 both cases can be handled similarily
+  // using a parameter to control the equatio sign
+  kmp_uint64 lower_bound_outer =
+      (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 - 1;
+  ;
+  // calculate the inner loop lower bound which is the remaining number of
+  // iterations required to hit the total number of iterations executed by the
+  // previous threads giving the starting point of this thread
+  kmp_uint64 lower_bound_inner =
+      iter_before_current - ((lower_bound_outer + 1) * lower_bound_outer) / 2;
+  // calculate the outer loop upper bound using the same approach as for the
+  // inner bound except using the total number of iterations executed with the
+  // current thread
+  kmp_uint64 upper_bound_outer =
+      (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_with_current) + 1) / 2 - 1;
+  // calculate the inner loop upper bound which is the remaining number of
+  // iterations required to hit the total number of iterations executed after
+  // the current thread giving the starting point of the next thread
+  kmp_uint64 upper_bound_inner =
+      iter_with_current - ((upper_bound_outer + 1) * upper_bound_outer) / 2;
+  // adjust the upper bounds down by 1 element to point at the last iteration of
+  // the current thread the first iteration of the next thread
+  if (upper_bound_inner == 0) {
+    // {n,0} => {n-1,n-1}
+    upper_bound_outer -= 1;
+    upper_bound_inner = upper_bound_outer;
+  } else {
+    // {n,m} => {n,m-1} (m!=0)
+    upper_bound_inner -= 1;
+  }
+
+  // assign the values, zeroing out lb1 and ub1 values since the iteration space
+  // is now one-dimensional
+  chunk_bounds_nest[0].lb0_u64 = (outer_iters - 1) - upper_bound_outer;
+  chunk_bounds_nest[1].lb0_u64 = (outer_iters - 1) - upper_bound_inner;
+  chunk_bounds_nest[0].ub0_u64 = (outer_iters - 1) - lower_bound_outer;
+  chunk_bounds_nest[1].ub0_u64 = (outer_iters - 1) - lower_bound_inner;
+  chunk_bounds_nest[0].lb1_u64 = 0;
+  chunk_bounds_nest[0].ub1_u64 = 0;
+  chunk_bounds_nest[1].lb1_u64 = 0;
+  chunk_bounds_nest[1].ub1_u64 = 0;
+
+#if 0
+  printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
+         tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
+         chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
+#endif
+}
 //----------Init API for non-rectangular loops--------------------------------
 
 // Init API for collapsed loops (static, no chunks defined).
@@ -1334,6 +1632,19 @@
 
   KMP_DEBUG_ASSERT(tid < nth);
 
+  // Handle special cases
+  nested_loop_type_t loop_type =
+      kmp_identify_nested_loop_structure(original_bounds_nest, n);
+  if (loop_type == nested_loop_type_lower_triangular_matrix) {
+    kmp_handle_lower_triangle_matrix(nth, tid, n, original_bounds_nest,
+                                     chunk_bounds_nest);
+    return TRUE;
+  } else if (loop_type == nested_loop_type_upper_triangular_matrix) {
+    kmp_handle_upper_triangle_matrix(nth, tid, n, original_bounds_nest,
+                                     chunk_bounds_nest);
+    return TRUE;
+  }
+
   CollapseAllocator<kmp_uint64> original_ivs_start(n);
 
   if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
diff --git a/runtime/src/kmp_collapse.h b/runtime/src/kmp_collapse.h
index e487018..1044478 100644
--- a/runtime/src/kmp_collapse.h
+++ b/runtime/src/kmp_collapse.h
@@ -45,6 +45,13 @@
   loop_type_int64 = 7
 };
 
+// Defining loop types to handle special cases
+enum nested_loop_type_t : kmp_int32 {
+  nested_loop_type_unkown = 0,
+  nested_loop_type_lower_triangular_matrix = 1,
+  nested_loop_type_upper_triangular_matrix = 2
+};
+
 /*!
  @ingroup WORK_SHARING
  * Describes the structure for rectangular nested loops.
@@ -124,14 +131,14 @@
 // It's represented in kmp_uint64, but each dimention is calculated in
 // that loop IV type. Also dimentions have to be converted to those types
 // when used in generated code.
-typedef kmp_uint64* kmp_point_t;
+typedef kmp_uint64 *kmp_point_t;
 
 // Array: Number of loop iterations on each nesting level to achieve some point,
 // in expanded space or in original space.
 // OMPTODO: move from using iterations to using offsets (iterations multiplied
 // by steps). For those we need to be careful with the types, as step can be
 // negative, but it'll remove multiplications and divisions in several places.
-typedef kmp_loop_nest_iv_t* kmp_iterations_t;
+typedef kmp_loop_nest_iv_t *kmp_iterations_t;
 
 // Internal struct with additional info:
 template <typename T> struct bounds_info_internalXX_template {
diff --git a/runtime/test/worksharing/for/omp_for_collapse_LowerTriangularLess.c b/runtime/test/worksharing/for/omp_for_collapse_LowerTriangularLess.c
new file mode 100644
index 0000000..9d74206
--- /dev/null
+++ b/runtime/test/worksharing/for/omp_for_collapse_LowerTriangularLess.c
@@ -0,0 +1,124 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "omp.h"
+
+#ifndef MAX_BOUND
+#define MAX_BOUND 64
+#endif
+#ifndef _MSC_VER
+#define NO_EFFICIENCY_CHECK
+#endif
+
+/* To ensure Correctness, only valid iterations are executed and are executed
+   only once. Stores the number of times an iteration is executed. */
+unsigned *execution_count = NULL;
+/* Stores the number of iterations executed by each thread. */
+unsigned *iterations_per_thread = NULL;
+
+unsigned *Alloc(unsigned bound1, unsigned bound2) {
+  return (unsigned *)(malloc(bound1 * bound2 * sizeof(unsigned)));
+}
+
+void ZeroOut(unsigned *p, unsigned bound1, unsigned bound2) {
+  memset(p, 0, bound1 * bound2 * sizeof(unsigned));
+}
+
+void Free(unsigned *p) { free((void *)p); }
+
+unsigned *Index(unsigned *p, unsigned i, unsigned j, unsigned bound2) {
+  return &p[i * bound2 + j];
+}
+
+int test(unsigned upper_bound) {
+
+  unsigned total_iterations = upper_bound * (upper_bound - 1) / 2;
+  unsigned num_threads = omp_get_max_threads();
+  unsigned lower_per_chunk = total_iterations / num_threads;
+  unsigned upper_per_chunk =
+      lower_per_chunk + ((total_iterations % num_threads) ? 1 : 0);
+  int i, j;
+
+  omp_set_num_threads(num_threads);
+
+  ZeroOut(execution_count, upper_bound, upper_bound);
+  ZeroOut(iterations_per_thread, num_threads, 1);
+
+#ifdef VERBOSE
+  fprintf(stderr,
+          "INFO: Using %6d threads for %6d outer iterations with %6d [%6d:%6d] "
+          "chunks "
+          "loop type lower triangle <,< - ",
+          num_threads, upper_bound, total_iterations, lower_per_chunk,
+          upper_per_chunk);
+#endif
+
+#pragma omp parallel shared(iterations_per_thread, execution_count)
+  { /* begin of parallel */
+    /* Lower triangular execution_count matrix */
+#pragma omp for schedule(static) collapse(2)
+    for (i = 0; i < upper_bound; i++) {
+      for (j = 0; j < i; j++) {
+        (*Index(iterations_per_thread, omp_get_thread_num(), 0, 1))++;
+        (*Index(execution_count, i, j, upper_bound))++;
+      }
+    } /* end of for*/
+  } /* end of parallel */
+
+  /* check the execution_count array */
+  for (i = 0; i < upper_bound; i++) {
+    for (j = 0; j < i; j++) {
+      unsigned value = *Index(execution_count, i, j, upper_bound);
+      /* iteration with j<=i are valid, but should have been executed only once
+       */
+      if (value != 1) {
+        fprintf(stderr, "ERROR: valid iteration [%i,%i] executed %i times.\n",
+                i, j, value);
+        return 0;
+      }
+    }
+    for (j = i; j < upper_bound; j++) {
+      unsigned value = *Index(execution_count, i, j, upper_bound);
+      /* iteration with j>=i are invalid and should not have been executed
+       */
+      if (value > 0) {
+        fprintf(stderr, "ERROR: invalid iteration [%i,%i] executed %i times.\n",
+                i, j, value);
+        return 0;
+      }
+    }
+  }
+
+#ifndef NO_EFFICIENCY_CHECK
+  /* Ensure the number of iterations executed by each thread is within bounds */
+  for (i = 0; i < num_threads; i++) {
+    unsigned value = *Index(iterations_per_thread, i, 0, 1);
+    if (value < lower_per_chunk || value > upper_per_chunk) {
+      fprintf(stderr,
+              "ERROR: Inefficient Collapse thread %d of %d assigned %i "
+              "iterations; must be between %d and %d\n",
+              i, num_threads, value, lower_per_chunk, upper_per_chunk);
+      return 0;
+    }
+  }
+#endif
+#ifdef VERBOSE
+  fprintf(stderr, "PASSED\r\n");
+#endif
+  return 1;
+}
+
+int main() {
+
+  execution_count = Alloc(MAX_BOUND, MAX_BOUND);
+  iterations_per_thread = Alloc(omp_get_max_threads(), 1);
+
+  for (unsigned j = 0; j < MAX_BOUND; j++) {
+    if (!test(j))
+      return 1;
+  }
+  Free(execution_count);
+  Free(iterations_per_thread);
+  return 0;
+}
diff --git a/runtime/test/worksharing/for/omp_for_collapse_LowerTriangularLessEqual.c b/runtime/test/worksharing/for/omp_for_collapse_LowerTriangularLessEqual.c
new file mode 100644
index 0000000..154ee0f
--- /dev/null
+++ b/runtime/test/worksharing/for/omp_for_collapse_LowerTriangularLessEqual.c
@@ -0,0 +1,124 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "omp.h"
+
+#ifndef MAX_BOUND
+#define MAX_BOUND 64
+#endif
+#ifndef _MSC_VER
+#define NO_EFFICIENCY_CHECK
+#endif
+
+/* To ensure Correctness, only valid iterations are executed and are executed
+   only once. Stores the number of times an iteration is executed. */
+unsigned *execution_count = NULL;
+/* Stores the number of iterations executed by each thread. */
+unsigned *iterations_per_thread = NULL;
+
+unsigned *Alloc(unsigned bound1, unsigned bound2) {
+  return (unsigned *)(malloc(bound1 * bound2 * sizeof(unsigned)));
+}
+
+void ZeroOut(unsigned *p, unsigned bound1, unsigned bound2) {
+  memset(p, 0, bound1 * bound2 * sizeof(unsigned));
+}
+
+void Free(unsigned *p) { free((void *)p); }
+
+unsigned *Index(unsigned *p, unsigned i, unsigned j, unsigned bound2) {
+  return &p[i * bound2 + j];
+}
+
+int test(int upper_bound) {
+
+  unsigned total_iterations = upper_bound * (upper_bound + 1) / 2;
+  unsigned num_threads = omp_get_max_threads();
+  unsigned lower_per_chunk = total_iterations / num_threads;
+  unsigned upper_per_chunk =
+      lower_per_chunk + ((total_iterations % num_threads) ? 1 : 0);
+  int i, j;
+
+  omp_set_num_threads(num_threads);
+
+  ZeroOut(execution_count, upper_bound, upper_bound);
+  ZeroOut(iterations_per_thread, num_threads, 1);
+
+#ifdef VERBOSE
+  fprintf(stderr,
+          "INFO: Using %6d threads for %6d outer iterations with %6d [%6d:%6d] "
+          "chunks "
+          "loop type lower triangle <,<= - ",
+          num_threads, upper_bound, total_iterations, lower_per_chunk,
+          upper_per_chunk);
+#endif
+
+#pragma omp parallel shared(iterations_per_thread, execution_count)
+  { /* begin of parallel */
+    /* Lower triangular execution_count matrix */
+#pragma omp for schedule(static) collapse(2)
+    for (i = 0; i < upper_bound; i++) {
+      for (j = 0; j <= i; j++) {
+        (*Index(iterations_per_thread, omp_get_thread_num(), 0, 1))++;
+        (*Index(execution_count, i, j, upper_bound))++;
+      }
+    } /* end of for*/
+  } /* end of parallel */
+
+  /* check the execution_count array */
+  for (i = 0; i < upper_bound; i++) {
+    for (j = 0; j <= i; j++) {
+      unsigned value = *Index(execution_count, i, j, upper_bound);
+      /* iteration with j<=i are valid, but should have been executed only once
+       */
+      if (value != 1) {
+        fprintf(stderr, "ERROR: valid iteration [%i,%i] executed %i times.\n",
+                i, j, value);
+        return 0;
+      }
+    }
+    for (j = i + 1; j < upper_bound; j++) {
+      unsigned value = *Index(execution_count, i, j, upper_bound);
+      /* iteration with j>=i are invalid and should not have been executed
+       */
+      if (value > 0) {
+        fprintf(stderr, "ERROR: invalid iteration [%i,%i] executed %i times.\n",
+                i, j, value);
+        return 0;
+      }
+    }
+  }
+
+#ifndef NO_EFFICIENCY_CHECK
+  /* Ensure the number of iterations executed by each thread is within bounds */
+  for (i = 0; i < num_threads; i++) {
+    unsigned value = *Index(iterations_per_thread, i, 0, 1);
+    if (value < lower_per_chunk || value > upper_per_chunk) {
+      fprintf(stderr,
+              "ERROR: Inefficient Collapse thread %d of %d assigned %i "
+              "iterations; must be between %d and %d\n",
+              i, num_threads, value, lower_per_chunk, upper_per_chunk);
+      return 0;
+    }
+  }
+#endif
+#ifdef VERBOSE
+  fprintf(stderr, "PASSED\r\n");
+#endif
+  return 1;
+}
+
+int main() {
+
+  execution_count = Alloc(MAX_BOUND, MAX_BOUND);
+  iterations_per_thread = Alloc(omp_get_max_threads(), 1);
+
+  for (unsigned j = 0; j < MAX_BOUND; j++) {
+    if (!test(j))
+      return 1;
+  }
+  Free(execution_count);
+  Free(iterations_per_thread);
+  return 0;
+}
diff --git a/runtime/test/worksharing/for/omp_for_collapse_UpperTriangular.c b/runtime/test/worksharing/for/omp_for_collapse_UpperTriangular.c
new file mode 100644
index 0000000..4524100
--- /dev/null
+++ b/runtime/test/worksharing/for/omp_for_collapse_UpperTriangular.c
@@ -0,0 +1,124 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "omp.h"
+
+#ifndef MAX_BOUND
+#define MAX_BOUND 64
+#endif
+#ifndef _MSC_VER
+#define NO_EFFICIENCY_CHECK
+#endif
+
+/* To ensure Correctness, only valid iterations are executed and are executed
+   only once. Stores the number of times an iteration is executed. */
+unsigned *execution_count = NULL;
+/* Stores the number of iterations executed by each thread. */
+unsigned *iterations_per_thread = NULL;
+
+unsigned *Alloc(unsigned bound1, unsigned bound2) {
+  return (unsigned *)(malloc(bound1 * bound2 * sizeof(unsigned)));
+}
+
+void ZeroOut(unsigned *p, unsigned bound1, unsigned bound2) {
+  memset(p, 0, bound1 * bound2 * sizeof(unsigned));
+}
+
+void Free(unsigned *p) { free((void *)p); }
+
+unsigned *Index(unsigned *p, unsigned i, unsigned j, unsigned bound2) {
+  return &p[i * bound2 + j];
+}
+
+int test(unsigned upper_bound) {
+
+  unsigned total_iterations = upper_bound * (upper_bound + 1) / 2;
+  unsigned num_threads = omp_get_max_threads();
+  unsigned lower_per_chunk = total_iterations / num_threads;
+  unsigned upper_per_chunk =
+      lower_per_chunk + ((total_iterations % num_threads) ? 1 : 0);
+  int i, j;
+
+  omp_set_num_threads(num_threads);
+
+  ZeroOut(execution_count, upper_bound, upper_bound);
+  ZeroOut(iterations_per_thread, num_threads, 1);
+
+#ifdef VERBOSE
+  fprintf(stderr,
+          "INFO: Using %6d threads for %6d outer iterations with %6d [%6d:%6d] "
+          "chunks "
+          "loop type upper triangle <,< - ",
+          num_threads, upper_bound, total_iterations, lower_per_chunk,
+          upper_per_chunk);
+#endif
+
+#pragma omp parallel shared(iterations_per_thread, execution_count)
+  { /* begin of parallel */
+    /* Lower triangular execution_count matrix */
+#pragma omp for schedule(static) collapse(2)
+    for (i = 0; i < upper_bound; i++) {
+      for (j = i; j < upper_bound; j++) {
+        (*Index(iterations_per_thread, omp_get_thread_num(), 0, 1))++;
+        (*Index(execution_count, i, j, upper_bound))++;
+      }
+    } /* end of for*/
+  } /* end of parallel */
+
+  /* check the execution_count array */
+  for (i = 0; i < upper_bound; i++) {
+    for (j = i; j < upper_bound; j++) {
+      unsigned value = *Index(execution_count, i, j, upper_bound);
+      /* iteration with j<=i are valid, but should have been executed only once
+       */
+      if (value != 1) {
+        fprintf(stderr, "ERROR: valid iteration [%i,%i] executed %i times.\n",
+                i, j, value);
+        return 0;
+      }
+    }
+    for (j = 0; j < i; j++) {
+      unsigned value = *Index(execution_count, i, j, upper_bound);
+      /* iteration with j>=i are invalid and should not have been executed
+       */
+      if (value > 0) {
+        fprintf(stderr, "ERROR: invalid iteration [%i,%i] executed %i times.\n",
+                i, j, value);
+        return 0;
+      }
+    }
+  }
+
+#ifndef NO_EFFICIENCY_CHECK
+  /* Ensure the number of iterations executed by each thread is within bounds */
+  for (i = 0; i < num_threads; i++) {
+    unsigned value = *Index(iterations_per_thread, i, 0, 1);
+    if (value < lower_per_chunk || value > upper_per_chunk) {
+      fprintf(stderr,
+              "ERROR: Inefficient Collapse thread %d of %d assigned %i "
+              "iterations; must be between %d and %d\n",
+              i, num_threads, value, lower_per_chunk, upper_per_chunk);
+      return 0;
+    }
+  }
+#endif
+#ifdef VERBOSE
+  fprintf(stderr, "PASSED\r\n");
+#endif
+  return 1;
+}
+
+int main() {
+
+  execution_count = Alloc(MAX_BOUND, MAX_BOUND);
+  iterations_per_thread = Alloc(omp_get_max_threads(), 1);
+
+  for (unsigned j = 0; j < MAX_BOUND; j++) {
+    if (!test(j))
+      return 1;
+  }
+  Free(execution_count);
+  Free(iterations_per_thread);
+  return 0;
+}