| /* |
| * kmp_dispatch_hier.h -- hierarchical scheduling methods and data structures |
| */ |
| |
| //===----------------------------------------------------------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #ifndef KMP_DISPATCH_HIER_H |
| #define KMP_DISPATCH_HIER_H |
| #include "kmp.h" |
| #include "kmp_dispatch.h" |
| |
| // Layer type for scheduling hierarchy |
| enum kmp_hier_layer_e { |
| LAYER_THREAD = -1, |
| LAYER_L1, |
| LAYER_L2, |
| LAYER_L3, |
| LAYER_NUMA, |
| LAYER_LOOP, |
| LAYER_LAST |
| }; |
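| |
| // Note: the enum values are ordered from the smallest topology unit |
| // (LAYER_THREAD) up to the whole loop (LAYER_LOOP); kmp_hier_sched_env_t's |
| // sort() below relies on this ordering to arrange layers bottom-up. |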
| |
| // Convert hierarchy type (LAYER_L1, LAYER_L2, etc.) to C-style string |
| static inline const char *__kmp_get_hier_str(kmp_hier_layer_e type) { |
| switch (type) { |
| case kmp_hier_layer_e::LAYER_THREAD: |
| return "THREAD"; |
| case kmp_hier_layer_e::LAYER_L1: |
| return "L1"; |
| case kmp_hier_layer_e::LAYER_L2: |
| return "L2"; |
| case kmp_hier_layer_e::LAYER_L3: |
| return "L3"; |
| case kmp_hier_layer_e::LAYER_NUMA: |
| return "NUMA"; |
| case kmp_hier_layer_e::LAYER_LOOP: |
| return "WHOLE_LOOP"; |
| case kmp_hier_layer_e::LAYER_LAST: |
| return "LAST"; |
| } |
| KMP_ASSERT(0); |
| // Appease compilers, should never get here |
| return "ERROR"; |
| } |
| |
| // Structure to store values parsed from OMP_SCHEDULE for scheduling hierarchy |
| typedef struct kmp_hier_sched_env_t { |
| int size; |
| int capacity; |
| enum sched_type *scheds; |
| kmp_int32 *small_chunks; |
| kmp_int64 *large_chunks; |
| kmp_hier_layer_e *layers; |
| // Append a level to the hierarchy |
| void append(enum sched_type sched, kmp_int32 chunk, kmp_hier_layer_e layer) { |
| if (capacity == 0) { |
| scheds = (enum sched_type *)__kmp_allocate(sizeof(enum sched_type) * |
| kmp_hier_layer_e::LAYER_LAST); |
| small_chunks = (kmp_int32 *)__kmp_allocate(sizeof(kmp_int32) * |
| kmp_hier_layer_e::LAYER_LAST); |
| large_chunks = (kmp_int64 *)__kmp_allocate(sizeof(kmp_int64) * |
| kmp_hier_layer_e::LAYER_LAST); |
| layers = (kmp_hier_layer_e *)__kmp_allocate(sizeof(kmp_hier_layer_e) * |
| kmp_hier_layer_e::LAYER_LAST); |
| capacity = kmp_hier_layer_e::LAYER_LAST; |
| } |
| int current_size = size; |
| KMP_DEBUG_ASSERT(current_size < kmp_hier_layer_e::LAYER_LAST); |
| scheds[current_size] = sched; |
| layers[current_size] = layer; |
| small_chunks[current_size] = chunk; |
| large_chunks[current_size] = (kmp_int64)chunk; |
| size++; |
| } |
| // Sort the hierarchy using selection sort; size will always be small |
| // (less than LAYER_LAST), so an O(n log n) algorithm is unnecessary |
| void sort() { |
| if (size <= 1) |
| return; |
| for (int i = 0; i < size; ++i) { |
| int switch_index = i; |
| for (int j = i + 1; j < size; ++j) { |
| if (layers[j] < layers[switch_index]) |
| switch_index = j; |
| } |
| if (switch_index != i) { |
| kmp_hier_layer_e temp1 = layers[i]; |
| enum sched_type temp2 = scheds[i]; |
| kmp_int32 temp3 = small_chunks[i]; |
| kmp_int64 temp4 = large_chunks[i]; |
| layers[i] = layers[switch_index]; |
| scheds[i] = scheds[switch_index]; |
| small_chunks[i] = small_chunks[switch_index]; |
| large_chunks[i] = large_chunks[switch_index]; |
| layers[switch_index] = temp1; |
| scheds[switch_index] = temp2; |
| small_chunks[switch_index] = temp3; |
| large_chunks[switch_index] = temp4; |
| } |
| } |
| } |
| // Free all memory |
| void deallocate() { |
| if (capacity > 0) { |
| __kmp_free(scheds); |
| __kmp_free(layers); |
| __kmp_free(small_chunks); |
| __kmp_free(large_chunks); |
| scheds = NULL; |
| layers = NULL; |
| small_chunks = NULL; |
| large_chunks = NULL; |
| } |
| size = 0; |
| capacity = 0; |
| } |
| } kmp_hier_sched_env_t; |
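| |
| // A minimal usage sketch (hypothetical values; the real population happens |
| // while parsing OMP_SCHEDULE): |
| // kmp_hier_sched_env_t env = {0, 0, NULL, NULL, NULL, NULL}; |
| // env.append(kmp_sch_dynamic_chunked, 4, kmp_hier_layer_e::LAYER_L2); |
| // env.append(kmp_sch_static, 1, kmp_hier_layer_e::LAYER_L1); |
| // env.sort(); // now ordered bottom-up: the L1 entry first, then L2 |
| // ... // read env.layers, env.scheds, env.small_chunks |
| // env.deallocate(); |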
| |
| extern int __kmp_dispatch_hand_threading; |
| extern kmp_hier_sched_env_t __kmp_hier_scheds; |
| |
| // Sizes of layer arrays bounded by max number of detected L1s, L2s, etc. |
| extern int __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LAST + 1]; |
| extern int __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LAST + 1]; |
| |
| extern int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type); |
| extern int __kmp_dispatch_get_id(int gtid, kmp_hier_layer_e type); |
| extern int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, |
| kmp_hier_layer_e t2); |
| extern void __kmp_dispatch_free_hierarchies(kmp_team_t *team); |
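| |
| // For example, on a hypothetical machine with 2 threads per core (L1) and |
| // 4 cores per L2, __kmp_dispatch_get_t1_per_t2(LAYER_THREAD, LAYER_L1) |
| // returns 2 and __kmp_dispatch_get_t1_per_t2(LAYER_L1, LAYER_L2) returns 4; |
| // actual values come from the detected topology. |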
| |
| template <typename T> struct kmp_hier_shared_bdata_t { |
| typedef typename traits_t<T>::signed_t ST; |
| volatile kmp_uint64 val[2]; |
| kmp_int32 status[2]; |
| T lb[2]; |
| T ub[2]; |
| ST st[2]; |
| dispatch_shared_info_template<T> sh[2]; |
| void zero() { |
| val[0] = val[1] = 0; |
| status[0] = status[1] = 0; |
| lb[0] = lb[1] = 0; |
| ub[0] = ub[1] = 0; |
| st[0] = st[1] = 0; |
| sh[0].u.s.iteration = sh[1].u.s.iteration = 0; |
| } |
| void set_next_hand_thread(T nlb, T nub, ST nst, kmp_int32 nstatus, |
| kmp_uint64 index) { |
| lb[1 - index] = nlb; |
| ub[1 - index] = nub; |
| st[1 - index] = nst; |
| status[1 - index] = nstatus; |
| } |
| void set_next(T nlb, T nub, ST nst, kmp_int32 nstatus, kmp_uint64 index) { |
| lb[1 - index] = nlb; |
| ub[1 - index] = nub; |
| st[1 - index] = nst; |
| status[1 - index] = nstatus; |
| sh[1 - index].u.s.iteration = 0; |
| } |
| |
| kmp_int32 get_next_status(kmp_uint64 index) const { |
| return status[1 - index]; |
| } |
| T get_next_lb(kmp_uint64 index) const { return lb[1 - index]; } |
| T get_next_ub(kmp_uint64 index) const { return ub[1 - index]; } |
| ST get_next_st(kmp_uint64 index) const { return st[1 - index]; } |
| dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) { |
| return &(sh[1 - index]); |
| } |
| |
| kmp_int32 get_curr_status(kmp_uint64 index) const { return status[index]; } |
| T get_curr_lb(kmp_uint64 index) const { return lb[index]; } |
| T get_curr_ub(kmp_uint64 index) const { return ub[index]; } |
| ST get_curr_st(kmp_uint64 index) const { return st[index]; } |
| dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) { |
| return &(sh[index]); |
| } |
| }; |
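| |
| // Note on the [2] arrays above: they implement double buffering. A thread's |
| // private index (kmp_hier_private_bdata_t::index) selects the "current" |
| // buffer while 1 - index is the "next" buffer. A unit's primary thread |
| // writes the next chunk's bounds into the next buffer via set_next() while |
| // the current buffer is still being read; each barrier() flips the index. |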
| |
| /* |
| * In the barrier implementations, num_active is the number of threads that are |
| * attached to the kmp_hier_top_unit_t structure in the scheduling hierarchy. |
| * bdata is the shared barrier data that resides on the kmp_hier_top_unit_t |
| * structure. tdata is the thread private data that resides on the thread |
| * data structure. |
| * |
| * The reset_shared() method is used to initialize the barrier data on the |
| * kmp_hier_top_unit_t hierarchy structure |
| * |
| * The reset_private() method is used to initialize the barrier data on the |
| * thread's private dispatch buffer structure |
| * |
| * The barrier() method takes an id, which is the thread's id within the |
| * kmp_hier_top_unit_t structure, and implements the barrier. All threads wait |
| * inside barrier() until all fellow threads attached to that |
| * kmp_hier_top_unit_t structure have arrived. |
| */ |
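| |
| // A sketch of the expected call sequence (using the hierarchy types defined |
| // further below): |
| // core_barrier_impl<T>::reset_shared(num_active, &unit->hier_barrier); |
| // core_barrier_impl<T>::reset_private(num_active, tdata); // each thread |
| // ... |
| // core_barrier_impl<T>::barrier(id, &unit->hier_barrier, tdata); |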
| |
| // Core barrier implementation |
| // Can be used in a unit with 2 to 8 threads |
| template <typename T> class core_barrier_impl { |
| static inline kmp_uint64 get_wait_val(int num_active) { |
| kmp_uint64 wait_val = 0LL; |
| switch (num_active) { |
| case 2: |
| wait_val = 0x0101LL; |
| break; |
| case 3: |
| wait_val = 0x010101LL; |
| break; |
| case 4: |
| wait_val = 0x01010101LL; |
| break; |
| case 5: |
| wait_val = 0x0101010101LL; |
| break; |
| case 6: |
| wait_val = 0x010101010101LL; |
| break; |
| case 7: |
| wait_val = 0x01010101010101LL; |
| break; |
| case 8: |
| wait_val = 0x0101010101010101LL; |
| break; |
| default: |
| // don't use the core_barrier_impl for more than 8 threads |
| KMP_ASSERT(0); |
| } |
| return wait_val; |
| } |
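| |
| // The wait value packs one 0x01 byte per active thread into a 64-bit word. |
| // In barrier() below, each arriving thread stores 0x01 into its own byte of |
| // the shared val word, so the word equals wait_val exactly when all threads |
| // have arrived. E.g., with num_active == 4, threads 0..3 set bytes 0..3 and |
| // the word becomes 0x01010101. On alternate epochs the threads store 0x00 |
| // and wait for the word to return to zero. |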
| |
| public: |
| static void reset_private(kmp_int32 num_active, |
| kmp_hier_private_bdata_t *tdata); |
| static void reset_shared(kmp_int32 num_active, |
| kmp_hier_shared_bdata_t<T> *bdata); |
| static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata, |
| kmp_hier_private_bdata_t *tdata); |
| }; |
| |
| template <typename T> |
| void core_barrier_impl<T>::reset_private(kmp_int32 num_active, |
| kmp_hier_private_bdata_t *tdata) { |
| tdata->num_active = num_active; |
| tdata->index = 0; |
| tdata->wait_val[0] = tdata->wait_val[1] = get_wait_val(num_active); |
| } |
| template <typename T> |
| void core_barrier_impl<T>::reset_shared(kmp_int32 num_active, |
| kmp_hier_shared_bdata_t<T> *bdata) { |
| bdata->val[0] = bdata->val[1] = 0LL; |
| bdata->status[0] = bdata->status[1] = 0LL; |
| } |
| template <typename T> |
| void core_barrier_impl<T>::barrier(kmp_int32 id, |
| kmp_hier_shared_bdata_t<T> *bdata, |
| kmp_hier_private_bdata_t *tdata) { |
| kmp_uint64 current_index = tdata->index; |
| kmp_uint64 next_index = 1 - current_index; |
| kmp_uint64 current_wait_value = tdata->wait_val[current_index]; |
| kmp_uint64 next_wait_value = |
| (current_wait_value ? 0 : get_wait_val(tdata->num_active)); |
| KD_TRACE(10, ("core_barrier_impl::barrier(): T#%d current_index:%llu " |
| "next_index:%llu curr_wait:%llu next_wait:%llu\n", |
| __kmp_get_gtid(), current_index, next_index, current_wait_value, |
| next_wait_value)); |
| char v = (current_wait_value ? '\1' : '\0'); |
| (RCAST(volatile char *, &(bdata->val[current_index])))[id] = v; |
| __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value, |
| __kmp_eq<kmp_uint64> USE_ITT_BUILD_ARG(NULL)); |
| tdata->wait_val[current_index] = next_wait_value; |
| tdata->index = next_index; |
| } |
| |
| // Counter barrier implementation |
| // Can be used in a unit with arbitrary number of active threads |
| template <typename T> class counter_barrier_impl { |
| public: |
| static void reset_private(kmp_int32 num_active, |
| kmp_hier_private_bdata_t *tdata); |
| static void reset_shared(kmp_int32 num_active, |
| kmp_hier_shared_bdata_t<T> *bdata); |
| static void barrier(kmp_int32 id, kmp_hier_shared_bdata_t<T> *bdata, |
| kmp_hier_private_bdata_t *tdata); |
| }; |
| |
| template <typename T> |
| void counter_barrier_impl<T>::reset_private(kmp_int32 num_active, |
| kmp_hier_private_bdata_t *tdata) { |
| tdata->num_active = num_active; |
| tdata->index = 0; |
| tdata->wait_val[0] = tdata->wait_val[1] = (kmp_uint64)num_active; |
| } |
| template <typename T> |
| void counter_barrier_impl<T>::reset_shared(kmp_int32 num_active, |
| kmp_hier_shared_bdata_t<T> *bdata) { |
| bdata->val[0] = bdata->val[1] = 0LL; |
| bdata->status[0] = bdata->status[1] = 0LL; |
| } |
| template <typename T> |
| void counter_barrier_impl<T>::barrier(kmp_int32 id, |
| kmp_hier_shared_bdata_t<T> *bdata, |
| kmp_hier_private_bdata_t *tdata) { |
| volatile kmp_int64 *val; |
| kmp_uint64 current_index = tdata->index; |
| kmp_uint64 next_index = 1 - current_index; |
| kmp_uint64 current_wait_value = tdata->wait_val[current_index]; |
| kmp_uint64 next_wait_value = current_wait_value + tdata->num_active; |
| |
| KD_TRACE(10, ("counter_barrier_impl::barrier(): T#%d current_index:%llu " |
| "next_index:%llu curr_wait:%llu next_wait:%llu\n", |
| __kmp_get_gtid(), current_index, next_index, current_wait_value, |
| next_wait_value)); |
| val = RCAST(volatile kmp_int64 *, &(bdata->val[current_index])); |
| KMP_TEST_THEN_INC64(val); |
| __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value, |
| __kmp_ge<kmp_uint64> USE_ITT_BUILD_ARG(NULL)); |
| tdata->wait_val[current_index] = next_wait_value; |
| tdata->index = next_index; |
| } |
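| |
| // Unlike the core barrier, the counter barrier never resets the shared |
| // counter between epochs: each thread atomically increments val and waits |
| // until the counter reaches its current wait value, and the per-thread wait |
| // value simply grows by num_active for the next epoch (note the __kmp_ge |
| // comparison above tolerates threads from the next epoch arriving early). |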
| |
| // Data associated with topology unit within a layer |
| // For example, one kmp_hier_top_unit_t corresponds to one L1 cache |
| template <typename T> struct kmp_hier_top_unit_t { |
| typedef typename traits_t<T>::signed_t ST; |
| typedef typename traits_t<T>::unsigned_t UT; |
| kmp_int32 active; // number of topology units that communicate with this unit |
| // chunk information (lower/upper bound, stride, etc.) |
| dispatch_private_info_template<T> hier_pr; |
| kmp_hier_top_unit_t<T> *hier_parent; // pointer to parent unit |
| kmp_hier_shared_bdata_t<T> hier_barrier; // shared barrier data for this unit |
| |
| kmp_int32 get_hier_id() const { return hier_pr.hier_id; } |
| void reset_shared_barrier() { |
| KMP_DEBUG_ASSERT(active > 0); |
| if (active == 1) |
| return; |
| hier_barrier.zero(); |
| if (active >= 2 && active <= 8) { |
| core_barrier_impl<T>::reset_shared(active, &hier_barrier); |
| } else { |
| counter_barrier_impl<T>::reset_shared(active, &hier_barrier); |
| } |
| } |
| void reset_private_barrier(kmp_hier_private_bdata_t *tdata) { |
| KMP_DEBUG_ASSERT(tdata); |
| KMP_DEBUG_ASSERT(active > 0); |
| if (active == 1) |
| return; |
| if (active >= 2 && active <= 8) { |
| core_barrier_impl<T>::reset_private(active, tdata); |
| } else { |
| counter_barrier_impl<T>::reset_private(active, tdata); |
| } |
| } |
| void barrier(kmp_int32 id, kmp_hier_private_bdata_t *tdata) { |
| KMP_DEBUG_ASSERT(tdata); |
| KMP_DEBUG_ASSERT(active > 0); |
| KMP_DEBUG_ASSERT(id >= 0 && id < active); |
| if (active == 1) { |
| tdata->index = 1 - tdata->index; |
| return; |
| } |
| if (active >= 2 && active <= 8) { |
| core_barrier_impl<T>::barrier(id, &hier_barrier, tdata); |
| } else { |
| counter_barrier_impl<T>::barrier(id, &hier_barrier, tdata); |
| } |
| } |
| |
| kmp_int32 get_next_status(kmp_uint64 index) const { |
| return hier_barrier.get_next_status(index); |
| } |
| T get_next_lb(kmp_uint64 index) const { |
| return hier_barrier.get_next_lb(index); |
| } |
| T get_next_ub(kmp_uint64 index) const { |
| return hier_barrier.get_next_ub(index); |
| } |
| ST get_next_st(kmp_uint64 index) const { |
| return hier_barrier.get_next_st(index); |
| } |
| dispatch_shared_info_template<T> volatile *get_next_sh(kmp_uint64 index) { |
| return hier_barrier.get_next_sh(index); |
| } |
| |
| kmp_int32 get_curr_status(kmp_uint64 index) const { |
| return hier_barrier.get_curr_status(index); |
| } |
| T get_curr_lb(kmp_uint64 index) const { |
| return hier_barrier.get_curr_lb(index); |
| } |
| T get_curr_ub(kmp_uint64 index) const { |
| return hier_barrier.get_curr_ub(index); |
| } |
| ST get_curr_st(kmp_uint64 index) const { |
| return hier_barrier.get_curr_st(index); |
| } |
| dispatch_shared_info_template<T> volatile *get_curr_sh(kmp_uint64 index) { |
| return hier_barrier.get_curr_sh(index); |
| } |
| |
| void set_next_hand_thread(T lb, T ub, ST st, kmp_int32 status, |
| kmp_uint64 index) { |
| hier_barrier.set_next_hand_thread(lb, ub, st, status, index); |
| } |
| void set_next(T lb, T ub, ST st, kmp_int32 status, kmp_uint64 index) { |
| hier_barrier.set_next(lb, ub, st, status, index); |
| } |
| dispatch_private_info_template<T> *get_my_pr() { return &hier_pr; } |
| kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; } |
| dispatch_private_info_template<T> *get_parent_pr() { |
| return &(hier_parent->hier_pr); |
| } |
| |
| kmp_int32 is_active() const { return active; } |
| kmp_int32 get_num_active() const { return active; } |
| #ifdef KMP_DEBUG |
| void print() { |
| KD_TRACE( |
| 10, |
| (" kmp_hier_top_unit_t: active:%d pr:%p lb:%d ub:%d st:%d tc:%d\n", |
| active, &hier_pr, hier_pr.u.p.lb, hier_pr.u.p.ub, hier_pr.u.p.st, |
| hier_pr.u.p.tc)); |
| } |
| #endif |
| }; |
| |
| // Information regarding a single layer within the scheduling hierarchy |
| template <typename T> struct kmp_hier_layer_info_t { |
| int num_active; // number of threads active in this level |
| kmp_hier_layer_e type; // LAYER_L1, LAYER_L2, etc. |
| enum sched_type sched; // static, dynamic, guided, etc. |
| typename traits_t<T>::signed_t chunk; // chunk size associated with schedule |
| int length; // length of the kmp_hier_top_unit_t array |
| |
| #ifdef KMP_DEBUG |
| // Print this layer's information |
| void print() { |
| const char *t = __kmp_get_hier_str(type); |
| KD_TRACE( |
| 10, |
| (" kmp_hier_layer_info_t: num_active:%d type:%s sched:%d chunk:%d " |
| "length:%d\n", |
| num_active, t, sched, chunk, length)); |
| } |
| #endif |
| }; |
| |
| /* |
| * Structure to implement entire hierarchy |
| * |
| * The hierarchy is kept as an array of arrays to represent the different |
| * layers. Layer 0 is the lowest layer and layer num_layers - 1 is the |
| * highest layer. |
| * Example: |
| * [ 2 ] -> [ L3 | L3 ] |
| * [ 1 ] -> [ L2 | L2 | L2 | L2 ] |
| * [ 0 ] -> [ L1 | L1 | L1 | L1 | L1 | L1 | L1 | L1 ] |
| * There is also an array of layer_info_t which has information regarding |
| * each layer |
| */ |
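| |
| // In the example above, info[0].length == 8 (L1 units), info[1].length == 4 |
| // (L2 units), and info[2].length == 2 (L3 units); each kmp_hier_top_unit_t |
| // in layer i points to its parent unit in layer i + 1. |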
| template <typename T> struct kmp_hier_t { |
| public: |
| typedef typename traits_t<T>::unsigned_t UT; |
| typedef typename traits_t<T>::signed_t ST; |
| |
| private: |
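| // Walk upward through the hierarchy to obtain the next set of iterations |
| // for the unit at hier_level. The return status is used as follows |
| // (legend inferred from the call sites below): |
| // 0 - no more iterations are available anywhere in the hierarchy |
| // 1 - a new set of iterations was obtained |
| // 2 - a fresh chunk was claimed from the parent but no iterations were |
| // extracted from it yet, so the caller should retry |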
| int next_recurse(ident_t *loc, int gtid, kmp_hier_top_unit_t<T> *current, |
| kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st, |
| kmp_int32 previous_id, int hier_level) { |
| int status; |
| kmp_info_t *th = __kmp_threads[gtid]; |
| auto parent = current->get_parent(); |
| bool last_layer = (hier_level == get_num_layers() - 1); |
| KMP_DEBUG_ASSERT(th); |
| kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[hier_level]); |
| KMP_DEBUG_ASSERT(current); |
| KMP_DEBUG_ASSERT(hier_level >= 0); |
| KMP_DEBUG_ASSERT(hier_level < get_num_layers()); |
| KMP_DEBUG_ASSERT(tdata); |
| KMP_DEBUG_ASSERT(parent || last_layer); |
| |
| KD_TRACE( |
| 1, ("kmp_hier_t.next_recurse(): T#%d (%d) called\n", gtid, hier_level)); |
| |
| T hier_id = (T)current->get_hier_id(); |
| // Attempt to grab next iteration range for this level |
| if (previous_id == 0) { |
| KD_TRACE(1, ("kmp_hier_t.next_recurse(): T#%d (%d) is primary of unit\n", |
| gtid, hier_level)); |
| kmp_int32 contains_last; |
| T my_lb, my_ub; |
| ST my_st; |
| T nproc; |
| dispatch_shared_info_template<T> volatile *my_sh; |
| dispatch_private_info_template<T> *my_pr; |
| if (last_layer) { |
| // last layer below the very top uses the single shared buffer |
| // from the team struct. |
| KD_TRACE(10, |
| ("kmp_hier_t.next_recurse(): T#%d (%d) using top level sh\n", |
| gtid, hier_level)); |
| my_sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( |
| th->th.th_dispatch->th_dispatch_sh_current); |
| nproc = (T)get_top_level_nproc(); |
| } else { |
| // middle layers use the shared buffer inside the kmp_hier_top_unit_t |
| // structure |
| KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) using hier sh\n", |
| gtid, hier_level)); |
| my_sh = |
| parent->get_curr_sh(th->th.th_hier_bar_data[hier_level + 1].index); |
| nproc = (T)parent->get_num_active(); |
| } |
| my_pr = current->get_my_pr(); |
| KMP_DEBUG_ASSERT(my_sh); |
| KMP_DEBUG_ASSERT(my_pr); |
| enum sched_type schedule = get_sched(hier_level); |
| ST chunk = (ST)get_chunk(hier_level); |
| status = __kmp_dispatch_next_algorithm<T>(gtid, my_pr, my_sh, |
| &contains_last, &my_lb, &my_ub, |
| &my_st, nproc, hier_id); |
| KD_TRACE( |
| 10, |
| ("kmp_hier_t.next_recurse(): T#%d (%d) next_pr_sh() returned %d\n", |
| gtid, hier_level, status)); |
| // When no iterations are found (status == 0) and this is not the last |
| // layer, attempt to go up the hierarchy for more iterations |
| if (status == 0 && !last_layer) { |
| kmp_int32 hid; |
| __kmp_type_convert(hier_id, &hid); |
| status = next_recurse(loc, gtid, parent, &contains_last, &my_lb, &my_ub, |
| &my_st, hid, hier_level + 1); |
| KD_TRACE( |
| 10, |
| ("kmp_hier_t.next_recurse(): T#%d (%d) hier_next() returned %d\n", |
| gtid, hier_level, status)); |
| if (status == 1) { |
| kmp_hier_private_bdata_t *upper_tdata = |
| &(th->th.th_hier_bar_data[hier_level + 1]); |
| my_sh = parent->get_curr_sh(upper_tdata->index); |
| KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) about to init\n", |
| gtid, hier_level)); |
| __kmp_dispatch_init_algorithm(loc, gtid, my_pr, schedule, |
| parent->get_curr_lb(upper_tdata->index), |
| parent->get_curr_ub(upper_tdata->index), |
| parent->get_curr_st(upper_tdata->index), |
| #if USE_ITT_BUILD |
| NULL, |
| #endif |
| chunk, nproc, hier_id); |
| status = __kmp_dispatch_next_algorithm<T>( |
| gtid, my_pr, my_sh, &contains_last, &my_lb, &my_ub, &my_st, nproc, |
| hier_id); |
| if (!status) { |
| KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) status not 1 " |
| "setting to 2!\n", |
| gtid, hier_level)); |
| status = 2; |
| } |
| } |
| } |
| current->set_next(my_lb, my_ub, my_st, status, tdata->index); |
| // Propagate whether a unit holds the actual global last iteration |
| // The contains_last attribute is sent downwards from the top to the |
| // bottom of the hierarchy via the contains_last flag inside the |
| // private dispatch buffers in the hierarchy's middle layers |
| if (contains_last) { |
| // If the next_algorithm() method returns 1 for p_last and it is the |
| // last layer or our parent contains the last serial chunk, then the |
| // chunk must contain the last serial iteration. |
| if (last_layer || parent->hier_pr.flags.contains_last) { |
| KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) Setting this pr " |
| "to contain last.\n", |
| gtid, hier_level)); |
| current->hier_pr.flags.contains_last = contains_last; |
| } |
| if (!current->hier_pr.flags.contains_last) |
| contains_last = FALSE; |
| } |
| if (p_last) |
| *p_last = contains_last; |
| } // if primary thread of this unit |
| if (hier_level > 0 || !__kmp_dispatch_hand_threading) { |
| KD_TRACE(10, |
| ("kmp_hier_t.next_recurse(): T#%d (%d) going into barrier.\n", |
| gtid, hier_level)); |
| current->barrier(previous_id, tdata); |
| KD_TRACE(10, |
| ("kmp_hier_t.next_recurse(): T#%d (%d) released and exit %d\n", |
| gtid, hier_level, current->get_curr_status(tdata->index))); |
| } else { |
| KMP_DEBUG_ASSERT(previous_id == 0); |
| return status; |
| } |
| return current->get_curr_status(tdata->index); |
| } |
| |
| public: |
| int top_level_nproc; |
| int num_layers; |
| bool valid; |
| int type_size; |
| kmp_hier_layer_info_t<T> *info; |
| kmp_hier_top_unit_t<T> **layers; |
| // Deallocate all memory from this hierarchy |
| void deallocate() { |
| for (int i = 0; i < num_layers; ++i) |
| if (layers[i] != NULL) { |
| __kmp_free(layers[i]); |
| } |
| if (layers != NULL) { |
| __kmp_free(layers); |
| layers = NULL; |
| } |
| if (info != NULL) { |
| __kmp_free(info); |
| info = NULL; |
| } |
| num_layers = 0; |
| valid = false; |
| } |
| // Returns true if reallocation is needed, false otherwise |
| bool need_to_reallocate(int n, const kmp_hier_layer_e *new_layers, |
| const enum sched_type *new_scheds, |
| const ST *new_chunks) const { |
| if (!valid || layers == NULL || info == NULL || |
| traits_t<T>::type_size != type_size || n != num_layers) |
| return true; |
| for (int i = 0; i < n; ++i) { |
| if (info[i].type != new_layers[i]) |
| return true; |
| if (info[i].sched != new_scheds[i]) |
| return true; |
| if (info[i].chunk != new_chunks[i]) |
| return true; |
| } |
| return false; |
| } |
| // A single thread should call this function while the other threads wait. |
| // It creates a new scheduling hierarchy consisting of new_layers, |
| // new_scheds, and new_chunks. These should come pre-sorted according to |
| // kmp_hier_layer_e value. This function will try to avoid reallocation |
| // if it can. |
| void allocate_hier(int n, const kmp_hier_layer_e *new_layers, |
| const enum sched_type *new_scheds, const ST *new_chunks) { |
| top_level_nproc = 0; |
| if (!need_to_reallocate(n, new_layers, new_scheds, new_chunks)) { |
| KD_TRACE( |
| 10, |
| ("kmp_hier_t<T>::allocate_hier: T#0 do not need to reallocate\n")); |
| for (int i = 0; i < n; ++i) { |
| info[i].num_active = 0; |
| for (int j = 0; j < get_length(i); ++j) |
| layers[i][j].active = 0; |
| } |
| return; |
| } |
| KD_TRACE(10, ("kmp_hier_t<T>::allocate_hier: T#0 full alloc\n")); |
| deallocate(); |
| type_size = traits_t<T>::type_size; |
| num_layers = n; |
| info = (kmp_hier_layer_info_t<T> *)__kmp_allocate( |
| sizeof(kmp_hier_layer_info_t<T>) * n); |
| layers = (kmp_hier_top_unit_t<T> **)__kmp_allocate( |
| sizeof(kmp_hier_top_unit_t<T> *) * n); |
| for (int i = 0; i < n; ++i) { |
| int max = 0; |
| kmp_hier_layer_e layer = new_layers[i]; |
| info[i].num_active = 0; |
| info[i].type = layer; |
| info[i].sched = new_scheds[i]; |
| info[i].chunk = new_chunks[i]; |
| max = __kmp_hier_max_units[layer + 1]; |
| if (max == 0) { |
| valid = false; |
| KMP_WARNING(HierSchedInvalid, __kmp_get_hier_str(layer)); |
| deallocate(); |
| return; |
| } |
| info[i].length = max; |
| layers[i] = (kmp_hier_top_unit_t<T> *)__kmp_allocate( |
| sizeof(kmp_hier_top_unit_t<T>) * max); |
| for (int j = 0; j < max; ++j) { |
| layers[i][j].active = 0; |
| layers[i][j].hier_pr.flags.use_hier = TRUE; |
| } |
| } |
| valid = true; |
| } |
| // loc - source file location |
| // gtid - global thread identifier |
| // pr - this thread's private dispatch buffer (corresponding with gtid) |
| // p_last (return value) - pointer to flag indicating whether this set of |
| // iterations contains the last iteration |
| // p_lb (return value) - lower bound for this chunk of iterations |
| // p_ub (return value) - upper bound for this chunk of iterations |
| // p_st (return value) - stride for this chunk of iterations |
| // |
| // Returns 1 if there are more iterations to perform, 0 otherwise |
| int next(ident_t *loc, int gtid, dispatch_private_info_template<T> *pr, |
| kmp_int32 *p_last, T *p_lb, T *p_ub, ST *p_st) { |
| int status; |
| kmp_int32 contains_last = 0; |
| kmp_info_t *th = __kmp_threads[gtid]; |
| kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[0]); |
| auto parent = pr->get_parent(); |
| KMP_DEBUG_ASSERT(parent); |
| KMP_DEBUG_ASSERT(th); |
| KMP_DEBUG_ASSERT(tdata); |
| KMP_DEBUG_ASSERT(parent); |
| T nproc = (T)parent->get_num_active(); |
| T unit_id = (T)pr->get_hier_id(); |
| KD_TRACE( |
| 10, |
| ("kmp_hier_t.next(): T#%d THREAD LEVEL nproc:%d unit_id:%d called\n", |
| gtid, nproc, unit_id)); |
| // Hand-threading implementation |
| // Each iteration is performed by all threads of the lowest-layer unit |
| // (typically cores/tiles): |
| // e.g., threads 0,1,2,3 all execute iteration 0 |
| // threads 0,1,2,3 all execute iteration 1 |
| // threads 4,5,6,7 all execute iteration 2 |
| // threads 4,5,6,7 all execute iteration 3 |
| // ... etc. |
| if (__kmp_dispatch_hand_threading) { |
| KD_TRACE(10, |
| ("kmp_hier_t.next(): T#%d THREAD LEVEL using hand threading\n", |
| gtid)); |
| if (unit_id == 0) { |
| // For hand threading, the sh buffer on the lowest level is only ever |
| // modified and read by the primary thread on that level. Because of |
| // this, we can always use the first sh buffer. |
| auto sh = &(parent->hier_barrier.sh[0]); |
| KMP_DEBUG_ASSERT(sh); |
| status = __kmp_dispatch_next_algorithm<T>( |
| gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id); |
| if (!status) { |
| bool done = false; |
| while (!done) { |
| done = true; |
| kmp_int32 uid; |
| __kmp_type_convert(unit_id, &uid); |
| status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub, |
| p_st, uid, 0); |
| if (status == 1) { |
| __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule, |
| parent->get_next_lb(tdata->index), |
| parent->get_next_ub(tdata->index), |
| parent->get_next_st(tdata->index), |
| #if USE_ITT_BUILD |
| NULL, |
| #endif |
| pr->u.p.parm1, nproc, unit_id); |
| sh->u.s.iteration = 0; |
| status = __kmp_dispatch_next_algorithm<T>( |
| gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, |
| unit_id); |
| if (!status) { |
| KD_TRACE(10, |
| ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 " |
| "after next_pr_sh()" |
| "trying again.\n", |
| gtid)); |
| done = false; |
| } |
| } else if (status == 2) { |
| KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 " |
| "trying again.\n", |
| gtid)); |
| done = false; |
| } |
| } |
| } |
| parent->set_next_hand_thread(*p_lb, *p_ub, *p_st, status, tdata->index); |
| } // if primary thread of lowest unit level |
| parent->barrier(pr->get_hier_id(), tdata); |
| if (unit_id != 0) { |
| *p_lb = parent->get_curr_lb(tdata->index); |
| *p_ub = parent->get_curr_ub(tdata->index); |
| *p_st = parent->get_curr_st(tdata->index); |
| status = parent->get_curr_status(tdata->index); |
| } |
| } else { |
| // Normal implementation |
| // Each thread grabs an iteration chunk and executes it (no cooperation) |
| auto sh = parent->get_curr_sh(tdata->index); |
| KMP_DEBUG_ASSERT(sh); |
| status = __kmp_dispatch_next_algorithm<T>( |
| gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id); |
| KD_TRACE(10, |
| ("kmp_hier_t.next(): T#%d THREAD LEVEL next_algorithm status:%d " |
| "contains_last:%d p_lb:%d p_ub:%d p_st:%d\n", |
| gtid, status, contains_last, *p_lb, *p_ub, *p_st)); |
| if (!status) { |
| bool done = false; |
| while (!done) { |
| done = true; |
| kmp_int32 uid; |
| __kmp_type_convert(unit_id, &uid); |
| status = next_recurse(loc, gtid, parent, &contains_last, p_lb, p_ub, |
| p_st, uid, 0); |
| if (status == 1) { |
| sh = parent->get_curr_sh(tdata->index); |
| __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule, |
| parent->get_curr_lb(tdata->index), |
| parent->get_curr_ub(tdata->index), |
| parent->get_curr_st(tdata->index), |
| #if USE_ITT_BUILD |
| NULL, |
| #endif |
| pr->u.p.parm1, nproc, unit_id); |
| status = __kmp_dispatch_next_algorithm<T>( |
| gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id); |
| if (!status) { |
| KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 " |
| "after next_pr_sh()" |
| "trying again.\n", |
| gtid)); |
| done = false; |
| } |
| } else if (status == 2) { |
| KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 " |
| "trying again.\n", |
| gtid)); |
| done = false; |
| } |
| } |
| } |
| } |
| if (contains_last && !parent->hier_pr.flags.contains_last) { |
| KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL resetting " |
| "contains_last to FALSE\n", |
| gtid)); |
| contains_last = FALSE; |
| } |
| if (p_last) |
| *p_last = contains_last; |
| KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL exit status %d\n", gtid, |
| status)); |
| return status; |
| } |
| // These functions probe the layer info structure |
| // Returns the type of topology unit given level |
| kmp_hier_layer_e get_type(int level) const { |
| KMP_DEBUG_ASSERT(level >= 0); |
| KMP_DEBUG_ASSERT(level < num_layers); |
| return info[level].type; |
| } |
| // Returns the schedule type at given level |
| enum sched_type get_sched(int level) const { |
| KMP_DEBUG_ASSERT(level >= 0); |
| KMP_DEBUG_ASSERT(level < num_layers); |
| return info[level].sched; |
| } |
| // Returns the chunk size at given level |
| ST get_chunk(int level) const { |
| KMP_DEBUG_ASSERT(level >= 0); |
| KMP_DEBUG_ASSERT(level < num_layers); |
| return info[level].chunk; |
| } |
| // Returns the number of active threads at given level |
| int get_num_active(int level) const { |
| KMP_DEBUG_ASSERT(level >= 0); |
| KMP_DEBUG_ASSERT(level < num_layers); |
| return info[level].num_active; |
| } |
| // Returns the length of topology unit array at given level |
| int get_length(int level) const { |
| KMP_DEBUG_ASSERT(level >= 0); |
| KMP_DEBUG_ASSERT(level < num_layers); |
| return info[level].length; |
| } |
| // Returns the topology unit given the level and index |
| kmp_hier_top_unit_t<T> *get_unit(int level, int index) { |
| KMP_DEBUG_ASSERT(level >= 0); |
| KMP_DEBUG_ASSERT(level < num_layers); |
| KMP_DEBUG_ASSERT(index >= 0); |
| KMP_DEBUG_ASSERT(index < get_length(level)); |
| return &(layers[level][index]); |
| } |
| // Returns the number of layers in the hierarchy |
| int get_num_layers() const { return num_layers; } |
| // Returns the number of threads in the top layer |
| // This is necessary because no topology unit is stored for the very top |
| // level, yet the scheduling algorithms still need this information |
| int get_top_level_nproc() const { return top_level_nproc; } |
| // Return whether this hierarchy is valid or not |
| bool is_valid() const { return valid; } |
| #ifdef KMP_DEBUG |
| // Print the hierarchy |
| void print() { |
| KD_TRACE(10, ("kmp_hier_t:\n")); |
| for (int i = num_layers - 1; i >= 0; --i) { |
| KD_TRACE(10, ("Info[%d] = ", i)); |
| info[i].print(); |
| } |
| for (int i = num_layers - 1; i >= 0; --i) { |
| KD_TRACE(10, ("Layer[%d] =\n", i)); |
| for (int j = 0; j < info[i].length; ++j) { |
| layers[i][j].print(); |
| } |
| } |
| } |
| #endif |
| }; |
| |
| template <typename T> |
| void __kmp_dispatch_init_hierarchy(ident_t *loc, int n, |
| kmp_hier_layer_e *new_layers, |
| enum sched_type *new_scheds, |
| typename traits_t<T>::signed_t *new_chunks, |
| T lb, T ub, |
| typename traits_t<T>::signed_t st) { |
| int tid, gtid, num_hw_threads, num_threads_per_layer1, active; |
| unsigned int my_buffer_index; |
| kmp_info_t *th; |
| kmp_team_t *team; |
| dispatch_private_info_template<T> *pr; |
| dispatch_shared_info_template<T> volatile *sh; |
| gtid = __kmp_entry_gtid(); |
| tid = __kmp_tid_from_gtid(gtid); |
| #ifdef KMP_DEBUG |
| KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d called: %d layer(s)\n", |
| gtid, n)); |
| for (int i = 0; i < n; ++i) { |
| const char *layer = __kmp_get_hier_str(new_layers[i]); |
| KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d: new_layers[%d] = %s, " |
| "new_scheds[%d] = %d, new_chunks[%d] = %u\n", |
| gtid, i, layer, i, (int)new_scheds[i], i, new_chunks[i])); |
| } |
| #endif // KMP_DEBUG |
| KMP_DEBUG_ASSERT(n > 0); |
| KMP_DEBUG_ASSERT(new_layers); |
| KMP_DEBUG_ASSERT(new_scheds); |
| KMP_DEBUG_ASSERT(new_chunks); |
| if (!TCR_4(__kmp_init_parallel)) |
| __kmp_parallel_initialize(); |
| __kmp_resume_if_soft_paused(); |
| |
| th = __kmp_threads[gtid]; |
| team = th->th.th_team; |
| active = !team->t.t_serialized; |
| th->th.th_ident = loc; |
| num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1]; |
| KMP_DEBUG_ASSERT(th->th.th_dispatch == |
| &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]); |
| my_buffer_index = th->th.th_dispatch->th_disp_index; |
| pr = reinterpret_cast<dispatch_private_info_template<T> *>( |
| &th->th.th_dispatch |
| ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); |
| sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>( |
| &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); |
| if (!active) { |
| KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d not active parallel. " |
| "Using normal dispatch functions.\n", |
| gtid)); |
| KMP_DEBUG_ASSERT(pr); |
| pr->flags.use_hier = FALSE; |
| pr->flags.contains_last = FALSE; |
| return; |
| } |
| KMP_DEBUG_ASSERT(pr); |
| KMP_DEBUG_ASSERT(sh); |
| pr->flags.use_hier = TRUE; |
| pr->u.p.tc = 0; |
| // Have primary thread allocate the hierarchy |
| if (__kmp_tid_from_gtid(gtid) == 0) { |
| KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d pr:%p sh:%p allocating " |
| "hierarchy\n", |
| gtid, pr, sh)); |
| if (sh->hier == NULL) { |
| sh->hier = (kmp_hier_t<T> *)__kmp_allocate(sizeof(kmp_hier_t<T>)); |
| } |
| sh->hier->allocate_hier(n, new_layers, new_scheds, new_chunks); |
| sh->u.s.iteration = 0; |
| } |
| __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); |
| // Check to make sure the hierarchy is valid |
| kmp_hier_t<T> *hier = sh->hier; |
| if (!sh->hier->is_valid()) { |
| pr->flags.use_hier = FALSE; |
| return; |
| } |
| // Have threads allocate their thread-private barrier data if it hasn't |
| // already been allocated |
| if (th->th.th_hier_bar_data == NULL) { |
| th->th.th_hier_bar_data = (kmp_hier_private_bdata_t *)__kmp_allocate( |
| sizeof(kmp_hier_private_bdata_t) * kmp_hier_layer_e::LAYER_LAST); |
| } |
| // Have threads "register" themselves by modifying the active count for each |
| // level they are involved in. The active count will act as nthreads for that |
| // level regarding the scheduling algorithms |
| for (int i = 0; i < n; ++i) { |
| int index = __kmp_dispatch_get_index(tid, hier->get_type(i)); |
| kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index); |
| // Setup the thread's private dispatch buffer's hierarchy pointers |
| if (i == 0) |
| pr->hier_parent = my_unit; |
| // If this unit is already active, then increment active count and wait |
| if (my_unit->is_active()) { |
| KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) " |
| "is already active (%d)\n", |
| gtid, my_unit, my_unit->active)); |
| KMP_TEST_THEN_INC32(&(my_unit->active)); |
| break; |
| } |
| // Flag that this unit is active |
| if (KMP_COMPARE_AND_STORE_ACQ32(&(my_unit->active), 0, 1)) { |
| // Do not setup parent pointer for top level unit since it has no parent |
| if (i < n - 1) { |
| // Setup middle layer pointers to parents |
| my_unit->get_my_pr()->hier_id = |
| index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i), |
| hier->get_type(i + 1)); |
| int parent_index = __kmp_dispatch_get_index(tid, hier->get_type(i + 1)); |
| my_unit->hier_parent = hier->get_unit(i + 1, parent_index); |
| } else { |
| // Setup top layer information (no parent pointers are set) |
| my_unit->get_my_pr()->hier_id = |
| index % __kmp_dispatch_get_t1_per_t2(hier->get_type(i), |
| kmp_hier_layer_e::LAYER_LOOP); |
| KMP_TEST_THEN_INC32(&(hier->top_level_nproc)); |
| my_unit->hier_parent = nullptr; |
| } |
| // Set trip count to 0 so that next() operation will initially climb up |
| // the hierarchy to get more iterations (early exit in next() for tc == 0) |
| my_unit->get_my_pr()->u.p.tc = 0; |
| // Increment this layer's number of active units |
| KMP_TEST_THEN_INC32(&(hier->info[i].num_active)); |
| KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) " |
| "incrementing num_active\n", |
| gtid, my_unit)); |
| } else { |
| KMP_TEST_THEN_INC32(&(my_unit->active)); |
| break; |
| } |
| } |
| // Set this thread's id |
| num_threads_per_layer1 = __kmp_dispatch_get_t1_per_t2( |
| kmp_hier_layer_e::LAYER_THREAD, hier->get_type(0)); |
| pr->hier_id = tid % num_threads_per_layer1; |
| // For oversubscribed threads, increment their index within the lowest unit |
| // This is done to prevent having two or more threads with id 0, id 1, etc. |
| if (tid >= num_hw_threads) |
| pr->hier_id += ((tid / num_hw_threads) * num_threads_per_layer1); |
| KD_TRACE( |
| 10, ("__kmp_dispatch_init_hierarchy: T#%d setting lowest hier_id to %d\n", |
| gtid, pr->hier_id)); |
| |
| pr->flags.contains_last = FALSE; |
| __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); |
| |
| // Now that the number of active threads at each level is determined, |
| // the barrier data for each unit can be initialized and the last layer's |
| // loop information can be initialized. |
| int prev_id = pr->get_hier_id(); |
| for (int i = 0; i < n; ++i) { |
| if (prev_id != 0) |
| break; |
| int index = __kmp_dispatch_get_index(tid, hier->get_type(i)); |
| kmp_hier_top_unit_t<T> *my_unit = hier->get_unit(i, index); |
| // Only primary threads of this unit within the hierarchy do initialization |
| KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d (%d) prev_id is 0\n", |
| gtid, i)); |
| my_unit->reset_shared_barrier(); |
| my_unit->hier_pr.flags.contains_last = FALSE; |
| // For the last layer, initialize the private buffers with the entire loop |
| // information so that the next call to next_algorithm() properly grabs |
| // the first chunk of iterations |
| if (i == n - 1) { |
| __kmp_dispatch_init_algorithm<T>( |
| loc, gtid, my_unit->get_my_pr(), hier->get_sched(i), lb, ub, st, |
| #if USE_ITT_BUILD |
| NULL, |
| #endif |
| hier->get_chunk(i), hier->get_num_active(i), my_unit->get_hier_id()); |
| } |
| prev_id = my_unit->get_hier_id(); |
| } |
| // Initialize each layer of the thread's private barrier data |
| kmp_hier_top_unit_t<T> *unit = pr->hier_parent; |
| for (int i = 0; i < n && unit; ++i, unit = unit->get_parent()) { |
| kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[i]); |
| unit->reset_private_barrier(tdata); |
| } |
| __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); |
| |
| #ifdef KMP_DEBUG |
| if (__kmp_tid_from_gtid(gtid) == 0) { |
| for (int i = 0; i < n; ++i) { |
| KD_TRACE(10, |
| ("__kmp_dispatch_init_hierarchy: T#%d active count[%d] = %d\n", |
| gtid, i, hier->get_num_active(i))); |
| } |
| hier->print(); |
| } |
| __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); |
| #endif // KMP_DEBUG |
| } |
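| |
| // A minimal sketch of requesting a two-layer hierarchy (hypothetical values; |
| // in practice the runtime derives these from __kmp_hier_scheds): |
| // kmp_hier_layer_e layers[2] = {kmp_hier_layer_e::LAYER_L1, |
| // kmp_hier_layer_e::LAYER_NUMA}; |
| // enum sched_type scheds[2] = {kmp_sch_static, kmp_sch_dynamic_chunked}; |
| // kmp_int32 chunks[2] = {1, 4}; |
| // __kmp_dispatch_init_hierarchy<kmp_int32>(loc, 2, layers, scheds, chunks, |
| // lb, ub, st); |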
| #endif // KMP_DISPATCH_HIER_H |