[OpenMP][libomp] Improve Windows Processor Group handling within topology

The current implementation of Windows Processor Groups has
a separate topology method to handle them. This patch deprecates
that specific method and uses the regular CPUID topology
method by default and inserts the Windows Processor Group objects
in the topology manually.

Notes:
* The preference for processor groups is lowered to a value less than
  socket so that the user will see sockets in the KMP_AFFINITY=verbose
  output instead of processor groups when sockets=processor groups.
* The topology's capacity is modified to handle additional topology layers
  without the need for reallocation.
* If a user asks for a granularity setting that is "above" the processor
  group layer, then the granularity is adjusted "down" to the processor
  group since this is the coarsest layer available for threads.

Differential Revision: https://reviews.llvm.org/D112273

GitOrigin-RevId: 286094af9bc0cfc89d00d8d645e8eaca3d78cd91
diff --git a/runtime/src/i18n/en_US.txt b/runtime/src/i18n/en_US.txt
index 5a54e11..351da54 100644
--- a/runtime/src/i18n/en_US.txt
+++ b/runtime/src/i18n/en_US.txt
@@ -469,6 +469,8 @@
 AffHWSubsetEqvLayers         "KMP_HW_SUBSET ignored: %1$s, %2$s: layers are equivalent, please only specify one."
 AffHWSubsetOutOfOrder        "KMP_HW_SUBSET ignored: %1$s layer should come after %2$s."
 AffEqualTopologyTypes        "%1$s: topology layer \"%2$s\" is equivalent to \"%3$s\"."
+AffGranTooCoarseProcGroup    "%1$s: granularity=%2$s is too coarse, setting granularity=group."
+StgDeprecatedValue           "%1$s: \"%2$s\" value is deprecated. Please use \"%3$s\" instead."
 
 # --------------------------------------------------------------------------------------------------
 -*- HINTS -*-
diff --git a/runtime/src/kmp_affinity.cpp b/runtime/src/kmp_affinity.cpp
index 9521cc4..a46a7bb 100644
--- a/runtime/src/kmp_affinity.cpp
+++ b/runtime/src/kmp_affinity.cpp
@@ -198,14 +198,82 @@
 ////////////////////////////////////////////////////////////////////////////////
 // kmp_topology_t methods
 
+// Add a layer to the topology based on the ids. Assume the topology
+// is perfectly nested (i.e., so no object has more than one parent)
+void kmp_topology_t::_insert_layer(kmp_hw_t type, const int *ids) {
+  // Figure out where the layer should go by comparing the ids of the current
+  // layers with the new ids
+  int target_layer;
+  int previous_id = kmp_hw_thread_t::UNKNOWN_ID;
+  int previous_new_id = kmp_hw_thread_t::UNKNOWN_ID;
+
+  // Start from the highest layer and work down to find target layer
+  // If new layer is equal to another layer then put the new layer above
+  for (target_layer = 0; target_layer < depth; ++target_layer) {
+    bool layers_equal = true;
+    bool strictly_above_target_layer = false;
+    for (int i = 0; i < num_hw_threads; ++i) {
+      int id = hw_threads[i].ids[target_layer];
+      int new_id = ids[i];
+      if (id != previous_id && new_id == previous_new_id) {
+        // Found the layer we are strictly above
+        strictly_above_target_layer = true;
+        layers_equal = false;
+        break;
+      } else if (id == previous_id && new_id != previous_new_id) {
+        // Found a layer we are below. Move to next layer and check.
+        layers_equal = false;
+        break;
+      }
+      previous_id = id;
+      previous_new_id = new_id;
+    }
+    if (strictly_above_target_layer || layers_equal)
+      break;
+  }
+
+  // Found the layer we are above. Now move everything to accommodate the new
+  // layer. And put the new ids and type into the topology.
+  for (int i = depth - 1, j = depth; i >= target_layer; --i, --j)
+    types[j] = types[i];
+  types[target_layer] = type;
+  for (int k = 0; k < num_hw_threads; ++k) {
+    for (int i = depth - 1, j = depth; i >= target_layer; --i, --j)
+      hw_threads[k].ids[j] = hw_threads[k].ids[i];
+    hw_threads[k].ids[target_layer] = ids[k];
+  }
+  equivalent[type] = type;
+  depth++;
+}
+
+#if KMP_GROUP_AFFINITY
+// Insert the Windows Processor Group structure into the topology
+void kmp_topology_t::_insert_windows_proc_groups() {
+  // Do not insert the processor group structure for a single group
+  if (__kmp_num_proc_groups == 1)
+    return;
+  kmp_affin_mask_t *mask;
+  int *ids = (int *)__kmp_allocate(sizeof(int) * num_hw_threads);
+  KMP_CPU_ALLOC(mask);
+  for (int i = 0; i < num_hw_threads; ++i) {
+    KMP_CPU_ZERO(mask);
+    KMP_CPU_SET(hw_threads[i].os_id, mask);
+    ids[i] = __kmp_get_proc_group(mask);
+  }
+  KMP_CPU_FREE(mask);
+  _insert_layer(KMP_HW_PROC_GROUP, ids);
+  __kmp_free(ids);
+}
+#endif
+
 // Remove layers that don't add information to the topology.
 // This is done by having the layer take on the id = UNKNOWN_ID (-1)
 void kmp_topology_t::_remove_radix1_layers() {
   int preference[KMP_HW_LAST];
   int top_index1, top_index2;
   // Set up preference associative array
-  preference[KMP_HW_PROC_GROUP] = 110;
-  preference[KMP_HW_SOCKET] = 100;
+  preference[KMP_HW_SOCKET] = 110;
+  preference[KMP_HW_PROC_GROUP] = 100;
   preference[KMP_HW_CORE] = 95;
   preference[KMP_HW_THREAD] = 90;
   preference[KMP_HW_NUMA] = 85;
@@ -440,7 +508,7 @@
   kmp_topology_t *retval;
   // Allocate all data in one large allocation
   size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc +
-                sizeof(int) * ndepth * 3;
+                sizeof(int) * (size_t)KMP_HW_LAST * 3;
   char *bytes = (char *)__kmp_allocate(size);
   retval = (kmp_topology_t *)bytes;
   if (nproc > 0) {
@@ -453,8 +521,8 @@
   int *arr =
       (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc);
   retval->types = (kmp_hw_t *)arr;
-  retval->ratio = arr + ndepth;
-  retval->count = arr + 2 * ndepth;
+  retval->ratio = arr + (size_t)KMP_HW_LAST;
+  retval->count = arr + 2 * (size_t)KMP_HW_LAST;
   KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; }
   for (int i = 0; i < ndepth; ++i) {
     retval->types[i] = types[i];
@@ -651,6 +719,9 @@
 }
 
 void kmp_topology_t::canonicalize() {
+#if KMP_GROUP_AFFINITY
+  _insert_windows_proc_groups();
+#endif
   _remove_radix1_layers();
   _gather_enumeration_information();
   _discover_uniformity();
@@ -699,6 +770,25 @@
                   __kmp_hw_get_catalog_string(gran_type));
       __kmp_affinity_gran = gran_type;
     }
+#if KMP_GROUP_AFFINITY
+    // If more than one processor group exists, and the level of
+    // granularity specified by the user is too coarse, then the
+    // granularity must be adjusted "down" to processor group affinity
+    // because threads can only exist within one processor group.
+    // For example, if a user sets granularity=socket and there are two
+    // processor groups that cover a socket, then the runtime must
+    // restrict the granularity down to the processor group level.
+    if (__kmp_num_proc_groups > 1) {
+      int gran_depth = __kmp_topology->get_level(gran_type);
+      int proc_group_depth = __kmp_topology->get_level(KMP_HW_PROC_GROUP);
+      if (gran_depth >= 0 && proc_group_depth >= 0 &&
+          gran_depth < proc_group_depth) {
+        KMP_WARNING(AffGranTooCoarseProcGroup, "KMP_AFFINITY",
+                    __kmp_hw_get_catalog_string(__kmp_affinity_gran));
+        __kmp_affinity_gran = gran_type = KMP_HW_PROC_GROUP;
+      }
+    }
+#endif
     __kmp_affinity_gran_levels = 0;
     for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i)
       __kmp_affinity_gran_levels++;
diff --git a/runtime/src/kmp_affinity.h b/runtime/src/kmp_affinity.h
index 5b1569b..76ba38b 100644
--- a/runtime/src/kmp_affinity.h
+++ b/runtime/src/kmp_affinity.h
@@ -638,7 +638,9 @@
 
   int depth;
 
-  // The following arrays are all 'depth' long
+  // The following arrays are all 'depth' long and have been
+  // allocated to hold up to KMP_HW_LAST number of objects if
+  // needed so layers can be added without reallocation of any array
 
   // Orderd array of the types in the topology
   kmp_hw_t *types;
@@ -671,6 +673,14 @@
   // Flags describing the topology
   flags_t flags;
 
+  // Insert a new topology layer after allocation
+  void _insert_layer(kmp_hw_t type, const int *ids);
+
+#if KMP_GROUP_AFFINITY
+  // Insert topology information about Windows Processor groups
+  void _insert_windows_proc_groups();
+#endif
+
   // Count each item & get the num x's per y
   // e.g., get the number of cores and the number of threads per core
   // for each (x, y) in (KMP_HW_* , KMP_HW_*)
diff --git a/runtime/src/kmp_settings.cpp b/runtime/src/kmp_settings.cpp
index a8c08f8..8f7cee2 100644
--- a/runtime/src/kmp_settings.cpp
+++ b/runtime/src/kmp_settings.cpp
@@ -3136,6 +3136,7 @@
   }
 #if KMP_GROUP_AFFINITY
   else if (__kmp_str_match("group", 1, value)) {
+    KMP_WARNING(StgDeprecatedValue, name, value, "all");
     __kmp_affinity_top_method = affinity_top_method_group;
   }
 #endif /* KMP_GROUP_AFFINITY */
@@ -6029,65 +6030,27 @@
       // Handle the Win 64 group affinity stuff if there are multiple
       // processor groups, or if the user requested it, and OMP 4.0
       // affinity is not in effect.
-      if (((__kmp_num_proc_groups > 1) &&
-           (__kmp_affinity_type == affinity_default) &&
-           (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default)) ||
-          (__kmp_affinity_top_method == affinity_top_method_group)) {
+      if (__kmp_num_proc_groups > 1 &&
+          __kmp_affinity_type == affinity_default &&
+          __kmp_nested_proc_bind.bind_types[0] == proc_bind_default) {
+        // Do not respect the initial processor affinity mask if it is assigned
+        // exactly one Windows Processor Group since this is interpreted as the
+        // default OS assignment. Not respecting the mask allows the runtime to
+        // use all the logical processors in all groups.
         if (__kmp_affinity_respect_mask == affinity_respect_mask_default &&
             exactly_one_group) {
           __kmp_affinity_respect_mask = FALSE;
         }
+        // Use compact affinity with anticipation of pinning to at least the
+        // group granularity since threads can only be bound to one group.
         if (__kmp_affinity_type == affinity_default) {
           __kmp_affinity_type = affinity_compact;
           __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel;
         }
-        if (__kmp_affinity_top_method == affinity_top_method_default) {
-          if (__kmp_affinity_gran == KMP_HW_UNKNOWN) {
-            __kmp_affinity_top_method = affinity_top_method_group;
-            __kmp_affinity_gran = KMP_HW_PROC_GROUP;
-          } else if (__kmp_affinity_gran == KMP_HW_PROC_GROUP) {
-            __kmp_affinity_top_method = affinity_top_method_group;
-          } else {
-            __kmp_affinity_top_method = affinity_top_method_all;
-          }
-        } else if (__kmp_affinity_top_method == affinity_top_method_group) {
-          if (__kmp_affinity_gran == KMP_HW_UNKNOWN) {
-            __kmp_affinity_gran = KMP_HW_PROC_GROUP;
-          } else if ((__kmp_affinity_gran != KMP_HW_PROC_GROUP) &&
-                     (__kmp_affinity_gran != KMP_HW_THREAD)) {
-            const char *str = __kmp_hw_get_keyword(__kmp_affinity_gran);
-            KMP_WARNING(AffGranTopGroup, var, str);
-            __kmp_affinity_gran = KMP_HW_THREAD;
-          }
-        } else {
-          if (__kmp_affinity_gran == KMP_HW_UNKNOWN) {
-            __kmp_affinity_gran = KMP_HW_CORE;
-          } else if (__kmp_affinity_gran == KMP_HW_PROC_GROUP) {
-            const char *str = NULL;
-            switch (__kmp_affinity_type) {
-            case affinity_physical:
-              str = "physical";
-              break;
-            case affinity_logical:
-              str = "logical";
-              break;
-            case affinity_compact:
-              str = "compact";
-              break;
-            case affinity_scatter:
-              str = "scatter";
-              break;
-            case affinity_explicit:
-              str = "explicit";
-              break;
-            // No MIC on windows, so no affinity_balanced case
-            default:
-              KMP_DEBUG_ASSERT(0);
-            }
-            KMP_WARNING(AffGranGroupType, var, str);
-            __kmp_affinity_gran = KMP_HW_CORE;
-          }
-        }
+        if (__kmp_affinity_top_method == affinity_top_method_default)
+          __kmp_affinity_top_method = affinity_top_method_all;
+        if (__kmp_affinity_gran == KMP_HW_UNKNOWN)
+          __kmp_affinity_gran = KMP_HW_PROC_GROUP;
       } else
 
 #endif /* KMP_GROUP_AFFINITY */