[OpenMP][host runtime] Add support for teams affinity

This patch implements teams affinity on the host.
The default is spread. A user can specify either spread, close, or
primary using KMP_TEAMS_PROC_BIND environment variable. Unlike
OMP_PROC_BIND, KMP_TEAMS_PROC_BIND is only a single value and is not a
list of values. The values follow the same semantics under the OpenMP
specification for parallel regions except T is the number of teams in
a league instead of the number of threads in a parallel region.

Differential Revision: https://reviews.llvm.org/D109921

GitOrigin-RevId: 50b68a3d030543daf97794d68682cc698964ca26
diff --git a/runtime/src/kmp.h b/runtime/src/kmp.h
index a4d8ece..8537dca 100644
--- a/runtime/src/kmp.h
+++ b/runtime/src/kmp.h
@@ -849,6 +849,7 @@
 } kmp_nested_proc_bind_t;
 
 extern kmp_nested_proc_bind_t __kmp_nested_proc_bind;
+extern kmp_proc_bind_t __kmp_teams_proc_bind;
 
 extern int __kmp_display_affinity;
 extern char *__kmp_affinity_format;
diff --git a/runtime/src/kmp_global.cpp b/runtime/src/kmp_global.cpp
index b6babbe..4aea5a2 100644
--- a/runtime/src/kmp_global.cpp
+++ b/runtime/src/kmp_global.cpp
@@ -280,6 +280,7 @@
 #endif /* KMP_AFFINITY_SUPPORTED */
 
 kmp_nested_proc_bind_t __kmp_nested_proc_bind = {NULL, 0, 0};
+kmp_proc_bind_t __kmp_teams_proc_bind = proc_bind_spread;
 int __kmp_affinity_num_places = 0;
 int __kmp_display_affinity = FALSE;
 char *__kmp_affinity_format = NULL;
diff --git a/runtime/src/kmp_runtime.cpp b/runtime/src/kmp_runtime.cpp
index 05272a9..4505d26 100644
--- a/runtime/src/kmp_runtime.cpp
+++ b/runtime/src/kmp_runtime.cpp
@@ -914,7 +914,8 @@
    assured that there are enough threads available, because we checked on that
    earlier within critical section forkjoin */
 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
-                                    kmp_info_t *master_th, int master_gtid) {
+                                    kmp_info_t *master_th, int master_gtid,
+                                    int fork_teams_workers) {
   int i;
   int use_hot_team;
 
@@ -1003,7 +1004,12 @@
     }
 
 #if KMP_AFFINITY_SUPPORTED
-    __kmp_partition_places(team);
+    // Do not partition the places list for teams construct workers who
+    // haven't actually been forked to do real work yet. This partitioning
+    // will take place in the parallel region nested within the teams construct.
+    if (!fork_teams_workers) {
+      __kmp_partition_places(team);
+    }
 #endif
   }
 
@@ -1597,6 +1603,41 @@
       }
 #endif
 
+      // Figure out the proc_bind policy for the nested parallel within teams
+      kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
+      // proc_bind_default means don't update
+      kmp_proc_bind_t proc_bind_icv = proc_bind_default;
+      if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
+        proc_bind = proc_bind_false;
+      } else {
+        // No proc_bind clause specified; use current proc-bind-var
+        if (proc_bind == proc_bind_default) {
+          proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
+        }
+        /* else: The proc_bind policy was specified explicitly on parallel
+           clause.
+           This overrides proc-bind-var for this parallel region, but does not
+           change proc-bind-var. */
+        // Figure the value of proc-bind-var for the child threads.
+        if ((level + 1 < __kmp_nested_proc_bind.used) &&
+            (__kmp_nested_proc_bind.bind_types[level + 1] !=
+             master_th->th.th_current_task->td_icvs.proc_bind)) {
+          proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
+        }
+      }
+      KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
+      // Need to change the bind-var ICV to correct value for each implicit task
+      if (proc_bind_icv != proc_bind_default &&
+          master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
+        kmp_info_t **other_threads = parent_team->t.t_threads;
+        for (i = 0; i < master_th->th.th_team_nproc; ++i) {
+          other_threads[i]->th.th_current_task->td_icvs.proc_bind =
+              proc_bind_icv;
+        }
+      }
+      // Reset for next parallel region
+      master_th->th.th_set_proc_bind = proc_bind_default;
+
 #if USE_ITT_BUILD && USE_ITT_NOTIFY
       if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
            KMP_ITT_DEBUG) &&
@@ -1613,6 +1654,9 @@
         parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
       }
 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
+#if KMP_AFFINITY_SUPPORTED
+      __kmp_partition_places(parent_team);
+#endif
 
       KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
                     "master_th=%p, gtid=%d\n",
@@ -1953,16 +1997,21 @@
 
     // Figure out the proc_bind_policy for the new team.
     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
-    kmp_proc_bind_t proc_bind_icv =
-        proc_bind_default; // proc_bind_default means don't update
+    // proc_bind_default means don't update
+    kmp_proc_bind_t proc_bind_icv = proc_bind_default;
     if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
       proc_bind = proc_bind_false;
     } else {
+      // No proc_bind clause specified; use current proc-bind-var for this
+      // parallel region
       if (proc_bind == proc_bind_default) {
-        // No proc_bind clause specified; use current proc-bind-var for this
-        // parallel region
         proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
       }
+      // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
+      if (master_th->th.th_teams_microtask &&
+          microtask == (microtask_t)__kmp_teams_master) {
+        proc_bind = __kmp_teams_proc_bind;
+      }
       /* else: The proc_bind policy was specified explicitly on parallel clause.
          This overrides proc-bind-var for this parallel region, but does not
          change proc-bind-var. */
@@ -1970,7 +2019,11 @@
       if ((level + 1 < __kmp_nested_proc_bind.used) &&
           (__kmp_nested_proc_bind.bind_types[level + 1] !=
            master_th->th.th_current_task->td_icvs.proc_bind)) {
-        proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
+        // Do not modify the proc bind icv for the two teams construct forks
+        // They just let the proc bind icv pass through
+        if (!master_th->th.th_teams_microtask ||
+            !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
+          proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
       }
     }
 
@@ -2142,7 +2195,7 @@
     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
       root->r.r_active = TRUE;
 
-    __kmp_fork_team_threads(root, team, master_th, gtid);
+    __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
     __kmp_setup_icv_copy(team, nthreads,
                          &master_th->th.th_current_task->td_icvs, loc);
 
@@ -2411,6 +2464,14 @@
   } // active_level == 1
 #endif /* USE_ITT_BUILD */
 
+#if KMP_AFFINITY_SUPPORTED
+  if (!exit_teams) {
+    // Restore master thread's partition.
+    master_th->th.th_first_place = team->t.t_first_place;
+    master_th->th.th_last_place = team->t.t_last_place;
+  }
+#endif // KMP_AFFINITY_SUPPORTED
+
   if (master_th->th.th_teams_microtask && !exit_teams &&
       team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
       team->t.t_level == master_th->th.th_teams_level + 1) {
@@ -2518,11 +2579,6 @@
                 master_th, team));
   __kmp_pop_current_task_from_thread(master_th);
 
-#if KMP_AFFINITY_SUPPORTED
-  // Restore master thread's partition.
-  master_th->th.th_first_place = team->t.t_first_place;
-  master_th->th.th_last_place = team->t.t_last_place;
-#endif // KMP_AFFINITY_SUPPORTED
   master_th->th.th_def_allocator = team->t.t_def_allocator;
 
 #if OMPD_SUPPORT
@@ -5016,6 +5072,7 @@
   kmp_team_t *team;
   int use_hot_team = !root->r.r_active;
   int level = 0;
+  int do_place_partition = 1;
 
   KA_TRACE(20, ("__kmp_allocate_team: called\n"));
   KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
@@ -5037,6 +5094,12 @@
         ++level; // not increment if #teams==1, or for outer fork of the teams;
         // increment otherwise
       }
+      // Do not perform the place partition if inner fork of the teams
+      // Wait until nested parallel region encountered inside teams construct
+      if ((master->th.th_teams_size.nteams == 1 &&
+           master->th.th_teams_level >= team->t.t_level) ||
+          (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
+        do_place_partition = 0;
     }
     hot_teams = master->th.th_hot_teams;
     if (level < __kmp_hot_teams_max_level && hot_teams &&
@@ -5074,6 +5137,10 @@
       __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
     }
 
+    // If not doing the place partition, then reset the team's proc bind
+    // to indicate that partitioning of all threads still needs to take place
+    if (do_place_partition == 0)
+      team->t.t_proc_bind = proc_bind_default;
     // Has the number of threads changed?
     /* Let's assume the most common case is that the number of threads is
        unchanged, and put that case first. */
@@ -5103,16 +5170,20 @@
       if ((team->t.t_size_changed == 0) &&
           (team->t.t_proc_bind == new_proc_bind)) {
         if (new_proc_bind == proc_bind_spread) {
-          __kmp_partition_places(
-              team, 1); // add flag to update only master for spread
+          if (do_place_partition) {
+            // add flag to update only master for spread
+            __kmp_partition_places(team, 1);
+          }
         }
         KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
                        "proc_bind = %d, partition = [%d,%d]\n",
                        team->t.t_id, new_proc_bind, team->t.t_first_place,
                        team->t.t_last_place));
       } else {
-        KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
-        __kmp_partition_places(team);
+        if (do_place_partition) {
+          KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
+          __kmp_partition_places(team);
+        }
       }
 #else
       KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
@@ -5189,10 +5260,12 @@
       }
 #endif
 
-      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
+      if (do_place_partition) {
+        KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
 #if KMP_AFFINITY_SUPPORTED
-      __kmp_partition_places(team);
+        __kmp_partition_places(team);
 #endif
+      }
     } else { // team->t.t_nproc < new_nproc
 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
       kmp_affin_mask_t *old_mask;
@@ -5328,10 +5401,12 @@
       }
 #endif
 
-      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
+      if (do_place_partition) {
+        KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
 #if KMP_AFFINITY_SUPPORTED
-      __kmp_partition_places(team);
+        __kmp_partition_places(team);
 #endif
+      }
     } // Check changes in number of threads
 
     kmp_info_t *master = team->t.t_threads[0];
diff --git a/runtime/src/kmp_settings.cpp b/runtime/src/kmp_settings.cpp
index 00830b5..aa0641f 100644
--- a/runtime/src/kmp_settings.cpp
+++ b/runtime/src/kmp_settings.cpp
@@ -3207,6 +3207,47 @@
   }
 } // __kmp_stg_print_topology_method
 
+// KMP_TEAMS_PROC_BIND
+struct kmp_proc_bind_info_t {
+  const char *name;
+  kmp_proc_bind_t proc_bind;
+};
+static kmp_proc_bind_info_t proc_bind_table[] = {
+    {"spread", proc_bind_spread},
+    {"true", proc_bind_spread},
+    {"close", proc_bind_close},
+    // teams-bind = false means "replicate the primary thread's affinity"
+    {"false", proc_bind_primary},
+    {"primary", proc_bind_primary}};
+static void __kmp_stg_parse_teams_proc_bind(char const *name, char const *value,
+                                            void *data) {
+  int valid;
+  const char *end;
+  valid = 0;
+  for (size_t i = 0; i < sizeof(proc_bind_table) / sizeof(proc_bind_table[0]);
+       ++i) {
+    if (__kmp_match_str(proc_bind_table[i].name, value, &end)) {
+      __kmp_teams_proc_bind = proc_bind_table[i].proc_bind;
+      valid = 1;
+      break;
+    }
+  }
+  if (!valid) {
+    KMP_WARNING(StgInvalidValue, name, value);
+  }
+}
+static void __kmp_stg_print_teams_proc_bind(kmp_str_buf_t *buffer,
+                                            char const *name, void *data) {
+  const char *value = KMP_I18N_STR(NotDefined);
+  for (size_t i = 0; i < sizeof(proc_bind_table) / sizeof(proc_bind_table[0]);
+       ++i) {
+    if (__kmp_teams_proc_bind == proc_bind_table[i].proc_bind) {
+      value = proc_bind_table[i].name;
+      break;
+    }
+  }
+  __kmp_stg_print_str(buffer, name, value);
+}
 #endif /* KMP_AFFINITY_SUPPORTED */
 
 // OMP_PROC_BIND / bind-var is functional on all 4.0 builds, including OS X*
@@ -5312,6 +5353,8 @@
 #endif /* KMP_GOMP_COMPAT */
     {"OMP_PROC_BIND", __kmp_stg_parse_proc_bind, __kmp_stg_print_proc_bind,
      NULL, 0, 0},
+    {"KMP_TEAMS_PROC_BIND", __kmp_stg_parse_teams_proc_bind,
+     __kmp_stg_print_teams_proc_bind, NULL, 0, 0},
     {"OMP_PLACES", __kmp_stg_parse_places, __kmp_stg_print_places, NULL, 0, 0},
     {"KMP_TOPOLOGY_METHOD", __kmp_stg_parse_topology_method,
      __kmp_stg_print_topology_method, NULL, 0, 0},
diff --git a/runtime/test/affinity/libomp_test_topology.h b/runtime/test/affinity/libomp_test_topology.h
index 4a84742..410103d 100644
--- a/runtime/test/affinity/libomp_test_topology.h
+++ b/runtime/test/affinity/libomp_test_topology.h
@@ -8,6 +8,7 @@
 #include <errno.h>
 #include <ctype.h>
 #include <omp.h>
+#include <stdarg.h>
 
 typedef enum topology_obj_type_t {
   TOPOLOGY_OBJ_THREAD,
@@ -18,6 +19,8 @@
 
 typedef struct place_list_t {
   int num_places;
+  int current_place;
+  int *place_nums;
   affinity_mask_t **masks;
 } place_list_t;
 
@@ -147,6 +150,7 @@
 static place_list_t *topology_alloc_type_places(topology_obj_type_t type) {
   char buf[1024];
   int i, cpu, num_places, num_unique;
+  int *place_nums;
   int num_cpus = topology_get_num_cpus();
   place_list_t *places = (place_list_t *)malloc(sizeof(place_list_t));
   affinity_mask_t **masks =
@@ -184,8 +188,13 @@
     if (mask)
       masks[num_unique++] = mask;
   }
+  place_nums = (int *)malloc(sizeof(int) * num_unique);
+  for (i = 0; i < num_unique; ++i)
+    place_nums[i] = i;
   places->num_places = num_unique;
   places->masks = masks;
+  places->place_nums = place_nums;
+  places->current_place = -1;
   return places;
 }
 
@@ -195,6 +204,7 @@
   place_list_t *places = (place_list_t *)malloc(sizeof(place_list_t));
   affinity_mask_t **masks =
       (affinity_mask_t **)malloc(sizeof(affinity_mask_t *) * num_places);
+  int *place_nums = (int *)malloc(sizeof(int) * num_places);
   for (place = 0; place < num_places; ++place) {
     int num_procs = omp_get_place_num_procs(place);
     int *ids = (int *)malloc(sizeof(int) * num_procs);
@@ -203,9 +213,45 @@
     for (i = 0; i < num_procs; ++i)
       affinity_mask_set(mask, ids[i]);
     masks[place] = mask;
+    place_nums[place] = place;
   }
   places->num_places = num_places;
+  places->place_nums = place_nums;
   places->masks = masks;
+  places->current_place = omp_get_place_num();
+  return places;
+}
+
+static place_list_t *topology_alloc_openmp_partition() {
+  int p, i;
+  int num_places = omp_get_partition_num_places();
+  place_list_t *places = (place_list_t *)malloc(sizeof(place_list_t));
+  int *place_nums = (int *)malloc(sizeof(int) * num_places);
+  affinity_mask_t **masks =
+      (affinity_mask_t **)malloc(sizeof(affinity_mask_t *) * num_places);
+  omp_get_partition_place_nums(place_nums);
+  for (p = 0; p < num_places; ++p) {
+    int place = place_nums[p];
+    int num_procs = omp_get_place_num_procs(place);
+    int *ids = (int *)malloc(sizeof(int) * num_procs);
+    if (num_procs == 0) {
+      fprintf(stderr, "place %d has 0 procs?\n", place);
+      exit(EXIT_FAILURE);
+    }
+    omp_get_place_proc_ids(place, ids);
+    affinity_mask_t *mask = affinity_mask_alloc();
+    for (i = 0; i < num_procs; ++i)
+      affinity_mask_set(mask, ids[i]);
+    if (affinity_mask_count(mask) == 0) {
+      fprintf(stderr, "place %d has 0 procs set?\n", place);
+      exit(EXIT_FAILURE);
+    }
+    masks[p] = mask;
+  }
+  places->num_places = num_places;
+  places->place_nums = place_nums;
+  places->masks = masks;
+  places->current_place = omp_get_place_num();
   return places;
 }
 
@@ -216,6 +262,7 @@
   for (i = 0; i < places->num_places; ++i)
     affinity_mask_free(places->masks[i]);
   free(places->masks);
+  free(places->place_nums);
   free(places);
 }
 
@@ -224,8 +271,306 @@
   char buf[1024];
   for (i = 0; i < p->num_places; ++i) {
     affinity_mask_snprintf(buf, sizeof(buf), p->masks[i]);
-    printf("Place %d: %s\n", i, buf);
+    printf("Place %d: %s\n", p->place_nums[i], buf);
   }
 }
 
+// Print out an error message, possibly with two problem place lists,
+// and then exit with failure
+static void proc_bind_die(omp_proc_bind_t proc_bind, int T, int P,
+                          const char *format, ...) {
+  va_list args;
+  va_start(args, format);
+  const char *pb;
+  switch (proc_bind) {
+  case omp_proc_bind_false:
+    pb = "False";
+    break;
+  case omp_proc_bind_true:
+    pb = "True";
+    break;
+  case omp_proc_bind_master:
+    pb = "Master (Primary)";
+    break;
+  case omp_proc_bind_close:
+    pb = "Close";
+    break;
+  case omp_proc_bind_spread:
+    pb = "Spread";
+    break;
+  default:
+    pb = "(Unknown Proc Bind Type)";
+    break;
+  }
+  if (proc_bind == omp_proc_bind_spread || proc_bind == omp_proc_bind_close) {
+    if (T <= P) {
+      fprintf(stderr, "%s : (T(%d) <= P(%d)) : ", pb, T, P);
+    } else {
+      fprintf(stderr, "%s : (T(%d) > P(%d)) : ", pb, T, P);
+    }
+  } else {
+    fprintf(stderr, "%s : T = %d, P = %d : ", pb, T, P);
+  }
+  vfprintf(stderr, format, args);
+  va_end(args);
+
+  exit(EXIT_FAILURE);
+}
+
+// Return 1 on failure, 0 on success.
+static void proc_bind_check(omp_proc_bind_t proc_bind,
+                            const place_list_t *parent, place_list_t **children,
+                            int nchildren) {
+  place_list_t *partition;
+  int T, i, j, place, low, high, first, last, count, current_place, num_places;
+  const int *place_nums;
+  int P = parent->num_places;
+
+  // Find the correct T (there could be null entries in children)
+  place_list_t **partitions =
+      (place_list_t **)malloc(sizeof(place_list_t *) * nchildren);
+  T = 0;
+  for (i = 0; i < nchildren; ++i)
+    if (children[i])
+      partitions[T++] = children[i];
+  // Only able to check spread, close, master (primary)
+  if (proc_bind != omp_proc_bind_spread && proc_bind != omp_proc_bind_close &&
+      proc_bind != omp_proc_bind_master)
+    proc_bind_die(proc_bind, T, P, NULL, NULL,
+                  "Cannot check this proc bind type\n");
+
+  if (proc_bind == omp_proc_bind_spread) {
+    if (T <= P) {
+      // Run through each subpartition
+      for (i = 0; i < T; ++i) {
+        partition = partitions[i];
+        place_nums = partition->place_nums;
+        num_places = partition->num_places;
+        current_place = partition->current_place;
+        // Correct count?
+        low = P / T;
+        high = P / T + (P % T ? 1 : 0);
+        if (num_places != low && num_places != high) {
+          proc_bind_die(proc_bind, T, P,
+                        "Incorrect number of places for thread %d: %d. "
+                        "Expecting between %d and %d\n",
+                        i, num_places, low, high);
+        }
+        // Consecutive places?
+        for (j = 1; j < num_places; ++j) {
+          if (place_nums[j] != (place_nums[j - 1] + 1) % P) {
+            proc_bind_die(proc_bind, T, P,
+                          "Not consecutive places: %d, %d in partition\n",
+                          place_nums[j - 1], place_nums[j]);
+          }
+        }
+        first = place_nums[0];
+        last = place_nums[num_places - 1];
+        // Primary thread executes on place of the parent thread?
+        if (i == 0) {
+          if (current_place != parent->current_place) {
+            proc_bind_die(
+                proc_bind, T, P,
+                "Primary thread not on same place (%d) as parent thread (%d)\n",
+                current_place, parent->current_place);
+          }
+        } else {
+          // Thread's current place is first place within it's partition?
+          if (current_place != first) {
+            proc_bind_die(proc_bind, T, P,
+                          "Thread's current place (%d) is not the first place "
+                          "in its partition [%d, %d]\n",
+                          current_place, first, last);
+          }
+        }
+        // Partitions don't have intersections?
+        int f1 = first;
+        int l1 = last;
+        for (j = 0; j < i; ++j) {
+          int f2 = partitions[j]->place_nums[0];
+          int l2 = partitions[j]->place_nums[partitions[j]->num_places - 1];
+          if (f1 > l1 && f2 > l2) {
+            proc_bind_die(proc_bind, T, P,
+                          "partitions intersect. [%d, %d] and [%d, %d]\n", f1,
+                          l1, f2, l2);
+          }
+          if (f1 > l1 && f2 <= l2)
+            if (f1 < l2 || l1 > f2) {
+              proc_bind_die(proc_bind, T, P,
+                            "partitions intersect. [%d, %d] and [%d, %d]\n", f1,
+                            l1, f2, l2);
+            }
+          if (f1 <= l1 && f2 > l2)
+            if (f2 < l1 || l2 > f1) {
+              proc_bind_die(proc_bind, T, P,
+                            "partitions intersect. [%d, %d] and [%d, %d]\n", f1,
+                            l1, f2, l2);
+            }
+          if (f1 <= l1 && f2 <= l2)
+            if (!(f2 > l1 || l2 < f1)) {
+              proc_bind_die(proc_bind, T, P,
+                            "partitions intersect. [%d, %d] and [%d, %d]\n", f1,
+                            l1, f2, l2);
+            }
+        }
+      }
+    } else {
+      // T > P
+      // Each partition has only one place?
+      for (i = 0; i < T; ++i) {
+        if (partitions[i]->num_places != 1) {
+          proc_bind_die(
+              proc_bind, T, P,
+              "Incorrect number of places for thread %d: %d. Expecting 1\n", i,
+              partitions[i]->num_places);
+        }
+      }
+      // Correct number of consecutive threads per partition?
+      low = T / P;
+      high = T / P + (T % P ? 1 : 0);
+      for (i = 1, count = 1; i < T; ++i) {
+        if (partitions[i]->place_nums[0] == partitions[i - 1]->place_nums[0]) {
+          count++;
+          if (count > high) {
+            proc_bind_die(
+                proc_bind, T, P,
+                "Too many threads have place %d for their partition\n",
+                partitions[i]->place_nums[0]);
+          }
+        } else {
+          if (count < low) {
+            proc_bind_die(
+                proc_bind, T, P,
+                "Not enough threads have place %d for their partition\n",
+                partitions[i]->place_nums[0]);
+          }
+          count = 1;
+        }
+      }
+      // Primary thread executes on place of the parent thread?
+      current_place = partitions[0]->place_nums[0];
+      if (parent->current_place != -1 &&
+          current_place != parent->current_place) {
+        proc_bind_die(
+            proc_bind, T, P,
+            "Primary thread not on same place (%d) as parent thread (%d)\n",
+            current_place, parent->current_place);
+      }
+    }
+  } else if (proc_bind == omp_proc_bind_close ||
+             proc_bind == omp_proc_bind_master) {
+    // Check that each subpartition is the same as the parent
+    for (i = 0; i < T; ++i) {
+      partition = partitions[i];
+      place_nums = partition->place_nums;
+      num_places = partition->num_places;
+      current_place = partition->current_place;
+      if (parent->num_places != num_places) {
+        proc_bind_die(proc_bind, T, P,
+                      "Number of places in subpartition (%d) does not match "
+                      "parent (%d)\n",
+                      num_places, parent->num_places);
+      }
+      for (j = 0; j < num_places; ++j) {
+        if (parent->place_nums[j] != place_nums[j]) {
+          proc_bind_die(proc_bind, T, P,
+                        "Subpartition place (%d) does not match "
+                        "parent partition place (%d)\n",
+                        place_nums[j], parent->place_nums[j]);
+        }
+      }
+    }
+    // Find index into place_nums of current place for parent
+    for (j = 0; j < parent->num_places; ++j)
+      if (parent->place_nums[j] == parent->current_place)
+        break;
+    if (proc_bind == omp_proc_bind_close) {
+      if (T <= P) {
+        // close T <= P
+        // check place assignment for each thread
+        for (i = 0; i < T; ++i) {
+          partition = partitions[i];
+          current_place = partition->current_place;
+          if (current_place != parent->place_nums[j]) {
+            proc_bind_die(
+                proc_bind, T, P,
+                "Thread %d's current place (%d) is incorrect. expected %d\n", i,
+                current_place, parent->place_nums[j]);
+          }
+          j = (j + 1) % parent->num_places;
+        }
+      } else {
+        // close T > P
+        // check place assignment for each thread
+        low = T / P;
+        high = T / P + (T % P ? 1 : 0);
+        count = 1;
+        if (partitions[0]->current_place != parent->current_place) {
+          proc_bind_die(
+              proc_bind, T, P,
+              "Primary thread's place (%d) is not parent thread's place (%d)\n",
+              partitions[0]->current_place, parent->current_place);
+        }
+        for (i = 1; i < T; ++i) {
+          current_place = partitions[i]->current_place;
+          if (current_place == parent->place_nums[j]) {
+            count++;
+            if (count > high) {
+              proc_bind_die(
+                  proc_bind, T, P,
+                  "Too many threads have place %d for their current place\n",
+                  current_place);
+            }
+          } else {
+            if (count < low) {
+              proc_bind_die(
+                  proc_bind, T, P,
+                  "Not enough threads have place %d for their current place\n",
+                  parent->place_nums[j]);
+            }
+            j = (j + 1) % parent->num_places;
+            if (current_place != parent->place_nums[j]) {
+              proc_bind_die(
+                  proc_bind, T, P,
+                  "Thread %d's place (%d) is not corret. Expected %d\n", i,
+                  partitions[i]->current_place, parent->place_nums[j]);
+            }
+            count = 1;
+          }
+        }
+      }
+    } else {
+      // proc_bind_primary
+      // Every thread should be assigned to the primary thread's place
+      for (i = 0; i < T; ++i) {
+        if (partitions[i]->current_place != parent->current_place) {
+          proc_bind_die(
+              proc_bind, T, P,
+              "Thread %d's place (%d) is not the primary thread's place (%d)\n",
+              i, partitions[i]->current_place, parent->current_place);
+        }
+      }
+    }
+  }
+
+  // Check that each partition's current place is within the partition
+  for (i = 0; i < T; ++i) {
+    current_place = partitions[i]->current_place;
+    num_places = partitions[i]->num_places;
+    first = partitions[i]->place_nums[0];
+    last = partitions[i]->place_nums[num_places - 1];
+    for (j = 0; j < num_places; ++j)
+      if (partitions[i]->place_nums[j] == current_place)
+        break;
+    if (j == num_places) {
+      proc_bind_die(proc_bind, T, P,
+                    "Thread %d's current place (%d) is not within its "
+                    "partition [%d, %d]\n",
+                    i, current_place, first, last);
+    }
+  }
+
+  free(partitions);
+}
+
 #endif
diff --git a/runtime/test/affinity/teams-affinity.c b/runtime/test/affinity/teams-affinity.c
new file mode 100644
index 0000000..0ca7475
--- /dev/null
+++ b/runtime/test/affinity/teams-affinity.c
@@ -0,0 +1,119 @@
+// RUN: %libomp-compile && env OMP_PLACES=cores OMP_TEAMS_THREAD_LIMIT=1 KMP_TEAMS_THREAD_LIMIT=256 %libomp-run
+// RUN: %libomp-compile && env OMP_PLACES=cores OMP_TEAMS_THREAD_LIMIT=1 KMP_TEAMS_THREAD_LIMIT=256 KMP_HOT_TEAMS_MAX_LEVEL=2 %libomp-run
+// RUN: %libomp-compile && env OMP_PLACES=cores OMP_TEAMS_THREAD_LIMIT=1 KMP_TEAMS_THREAD_LIMIT=256 KMP_TEAMS_PROC_BIND=close %libomp-run
+// RUN: %libomp-compile && env OMP_PLACES=cores OMP_TEAMS_THREAD_LIMIT=1 KMP_TEAMS_THREAD_LIMIT=256 KMP_TEAMS_PROC_BIND=close KMP_HOT_TEAMS_MAX_LEVEL=2 %libomp-run
+// RUN: %libomp-compile && env OMP_PLACES=cores OMP_TEAMS_THREAD_LIMIT=1 KMP_TEAMS_THREAD_LIMIT=256 KMP_TEAMS_PROC_BIND=primary %libomp-run
+// RUN: %libomp-compile && env OMP_PLACES=cores OMP_TEAMS_THREAD_LIMIT=1 KMP_TEAMS_THREAD_LIMIT=256 KMP_TEAMS_PROC_BIND=primary KMP_HOT_TEAMS_MAX_LEVEL=2 %libomp-run
+// REQUIRES: linux
+// UNSUPPORTED: clang-5, clang-6, clang-7, clang-8, clang-9, clang-10
+// UNSUPPORTED: gcc-5, gcc-6, gcc-7, gcc-8
+// UNSUPPORTED: icc
+//
+// KMP_TEAMS_THREAD_LIMIT limits the number of total teams
+// OMP_TEAMS_THREAD_LIMIT limits the number of threads per team
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "libomp_test_affinity.h"
+#include "libomp_test_topology.h"
+
+#define _STR(X) #X
+#define STR(X) _STR(X)
+
+#ifndef MAX_NTEAMS
+#define MAX_NTEAMS 256
+#endif
+
+static void set_default_max_nteams() {
+  // Do not overwrite if already in environment
+  setenv("KMP_TEAMS_THREAD_LIMIT", STR(MAX_NTEAMS), 0);
+}
+
+static int get_max_nteams() {
+  int max_nteams;
+  const char *value = getenv("KMP_TEAMS_THREAD_LIMIT");
+  if (!value) {
+    fprintf(stderr, "KMP_TEAMS_THREAD_LIMIT must be set!\n");
+    exit(EXIT_FAILURE);
+  }
+  max_nteams = atoi(value);
+  if (max_nteams <= 0)
+    max_nteams = 1;
+  if (max_nteams > MAX_NTEAMS)
+    max_nteams = MAX_NTEAMS;
+  return max_nteams;
+}
+
+// Return the value in KMP_TEAMS_PROC_BIND
+static omp_proc_bind_t get_teams_proc_bind() {
+  // defaults to spread
+  omp_proc_bind_t proc_bind = omp_proc_bind_spread;
+  const char *value = getenv("KMP_TEAMS_PROC_BIND");
+  if (value) {
+    if (strcmp(value, "spread") == 0) {
+      proc_bind = omp_proc_bind_spread;
+    } else if (strcmp(value, "close") == 0) {
+      proc_bind = omp_proc_bind_close;
+    } else if (strcmp(value, "primary") == 0 || strcmp(value, "master") == 0) {
+      proc_bind = omp_proc_bind_master;
+    } else {
+      fprintf(stderr,
+              "KMP_TEAMS_PROC_BIND should be one of spread, close, primary");
+      exit(EXIT_FAILURE);
+    }
+  }
+  return proc_bind;
+}
+
+int main(int argc, char **argv) {
+  int i, nteams, max_nteams, factor;
+  place_list_t **teams_places;
+  place_list_t *place_list;
+  omp_proc_bind_t teams_proc_bind;
+
+  // Set a default for the max number of teams if it is not already set
+  set_default_max_nteams();
+  place_list = topology_alloc_openmp_places();
+  max_nteams = get_max_nteams();
+  // Further limit the number of teams twice the number of OMP_PLACES
+  if (max_nteams > 2 * place_list->num_places)
+    max_nteams = 2 * place_list->num_places;
+  teams_places = (place_list_t **)malloc(sizeof(place_list_t *) * max_nteams);
+  for (i = 0; i < max_nteams; ++i)
+    teams_places[i] = NULL;
+  teams_proc_bind = get_teams_proc_bind();
+
+  // factor inversely controls the number of test cases.
+  // the larger the factor, the more test cases will be performed.
+  if (teams_proc_bind == omp_proc_bind_master) {
+    factor = 2;
+  } else {
+    factor = 8;
+  }
+
+  for (nteams = 1; nteams <= max_nteams;
+       nteams = nteams * factor / (factor - 1) + 1) {
+    // Check the same value twice to make sure hot teams are ok
+    int j;
+    for (j = 0; j < 2; ++j) {
+      // Gather the proc bind partitions from each team
+      #pragma omp teams num_teams(nteams)
+      teams_places[omp_get_team_num()] = topology_alloc_openmp_partition();
+
+      // Check all the partitions with the parent partition
+      proc_bind_check(teams_proc_bind, place_list, teams_places, nteams);
+
+      // Free the proc bind partitions
+      for (i = 0; i < nteams; ++i)
+        topology_free_places(teams_places[i]);
+    }
+  }
+
+  free(teams_places);
+  topology_free_places(place_list);
+  return EXIT_SUCCESS;
+}