| /* |
| * kmp_runtime.cpp -- KPTS runtime support library |
| */ |
| |
| //===----------------------------------------------------------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "kmp.h" |
| #include "kmp_affinity.h" |
| #include "kmp_atomic.h" |
| #include "kmp_environment.h" |
| #include "kmp_error.h" |
| #include "kmp_i18n.h" |
| #include "kmp_io.h" |
| #include "kmp_itt.h" |
| #include "kmp_settings.h" |
| #include "kmp_stats.h" |
| #include "kmp_str.h" |
| #include "kmp_wait_release.h" |
| #include "kmp_wrapper_getpid.h" |
| #include "kmp_dispatch.h" |
| #include "kmp_utils.h" |
| #if KMP_USE_HIER_SCHED |
| #include "kmp_dispatch_hier.h" |
| #endif |
| |
| #if OMPT_SUPPORT |
| #include "ompt-specific.h" |
| #endif |
| #if OMPD_SUPPORT |
| #include "ompd-specific.h" |
| #endif |
| |
| #if OMP_PROFILING_SUPPORT |
| #include "llvm/Support/TimeProfiler.h" |
| static char *ProfileTraceFile = nullptr; |
| #endif |
| |
| /* these are temporary issues to be dealt with */ |
| #define KMP_USE_PRCTL 0 |
| |
| #if KMP_OS_WINDOWS |
| #include <process.h> |
| #endif |
| |
| #ifndef KMP_USE_SHM |
| // Windows and WASI do not need these include files as they don't use shared |
| // memory. |
| #else |
| #include <sys/mman.h> |
| #include <sys/stat.h> |
| #include <fcntl.h> |
| #define SHM_SIZE 1024 |
| #endif |
| |
| #if defined(KMP_GOMP_COMPAT) |
| char const __kmp_version_alt_comp[] = |
| KMP_VERSION_PREFIX "alternative compiler support: yes"; |
| #endif /* defined(KMP_GOMP_COMPAT) */ |
| |
| char const __kmp_version_omp_api[] = |
| KMP_VERSION_PREFIX "API version: 5.0 (201611)"; |
| |
| #ifdef KMP_DEBUG |
| char const __kmp_version_lock[] = |
| KMP_VERSION_PREFIX "lock type: run time selectable"; |
| #endif /* KMP_DEBUG */ |
| |
| #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y)) |
| |
| /* ------------------------------------------------------------------------ */ |
| |
| #if KMP_USE_MONITOR |
| kmp_info_t __kmp_monitor; |
| #endif |
| |
| /* Forward declarations */ |
| |
| void __kmp_cleanup(void); |
| |
| static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid, |
| int gtid); |
| static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, |
| kmp_internal_control_t *new_icvs, |
| ident_t *loc); |
| #if KMP_AFFINITY_SUPPORTED |
| static void __kmp_partition_places(kmp_team_t *team, |
| int update_master_only = 0); |
| #endif |
| static void __kmp_do_serial_initialize(void); |
| void __kmp_fork_barrier(int gtid, int tid); |
| void __kmp_join_barrier(int gtid); |
| void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, |
| kmp_internal_control_t *new_icvs, ident_t *loc); |
| |
| #ifdef USE_LOAD_BALANCE |
| static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc); |
| #endif |
| |
| static int __kmp_expand_threads(int nNeed); |
| #if KMP_OS_WINDOWS |
| static int __kmp_unregister_root_other_thread(int gtid); |
| #endif |
| static void __kmp_reap_thread(kmp_info_t *thread, int is_root); |
| kmp_info_t *__kmp_thread_pool_insert_pt = NULL; |
| |
| void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads, |
| int new_nthreads); |
| void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads); |
| |
| static kmp_nested_nthreads_t *__kmp_override_nested_nth(kmp_info_t *thr, |
| int level) { |
| kmp_nested_nthreads_t *new_nested_nth = |
| (kmp_nested_nthreads_t *)KMP_INTERNAL_MALLOC( |
| sizeof(kmp_nested_nthreads_t)); |
| int new_size = level + thr->th.th_set_nested_nth_sz; |
| new_nested_nth->nth = (int *)KMP_INTERNAL_MALLOC(new_size * sizeof(int)); |
| for (int i = 0; i < level + 1; ++i) |
| new_nested_nth->nth[i] = 0; |
| for (int i = level + 1, j = 1; i < new_size; ++i, ++j) |
| new_nested_nth->nth[i] = thr->th.th_set_nested_nth[j]; |
| new_nested_nth->size = new_nested_nth->used = new_size; |
| return new_nested_nth; |
| } |
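| // Editorial note (illustrative, not part of the upstream logic): given a |
| // thread whose th_set_nested_nth holds {8, 4, 2} (so th_set_nested_nth_sz is |
| // 3) and a call with level == 1, new_size is 1 + 3 == 4; entries 0..1 are |
| // zeroed and entries 2..3 receive th_set_nested_nth[1..2], yielding |
| // nth == {0, 0, 4, 2} with size == used == 4. |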
| |
| /* Calculate the identifier of the current thread */ |
| /* Fast (and somewhat portable) way to get a unique identifier for the |
| executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */ |
| int __kmp_get_global_thread_id() { |
| int i; |
| kmp_info_t **other_threads; |
| size_t stack_data; |
| char *stack_addr; |
| size_t stack_size; |
| char *stack_base; |
| |
| KA_TRACE( |
| 1000, |
| ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n", |
| __kmp_nth, __kmp_all_nth)); |
| |
| /* JPH - To handle the case where __kmpc_end(0) is called immediately prior |
| to a parallel region, this returns KMP_GTID_DNE to force serial_initialize |
| by the caller. KMP_GTID_DNE then has to be handled at all call sites, or |
| else __kmp_init_gtid must be guaranteed, for this to work. */ |
| |
| if (!TCR_4(__kmp_init_gtid)) |
| return KMP_GTID_DNE; |
| |
| #ifdef KMP_TDATA_GTID |
| if (TCR_4(__kmp_gtid_mode) >= 3) { |
| KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n")); |
| return __kmp_gtid; |
| } |
| #endif |
| if (TCR_4(__kmp_gtid_mode) >= 2) { |
| KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n")); |
| return __kmp_gtid_get_specific(); |
| } |
| KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n")); |
| |
| stack_addr = (char *)&stack_data; |
| other_threads = __kmp_threads; |
| |
| /* ATT: The code below is a source of potential bugs due to unsynchronized |
| access to __kmp_threads array. For example: |
| 1. Current thread loads other_threads[i] to thr and checks it, it is |
| non-NULL. |
| 2. Current thread is suspended by OS. |
| 3. Another thread unregisters and finishes (debug versions of free() |
| may fill memory with something like 0xEF). |
| 4. Current thread is resumed. |
| 5. Current thread reads junk from *thr. |
| TODO: Fix it. --ln */ |
| |
| for (i = 0; i < __kmp_threads_capacity; i++) { |
| |
| kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]); |
| if (!thr) |
| continue; |
| |
| stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize); |
| stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase); |
| |
| /* stack grows down -- search through all of the active threads */ |
| |
| if (stack_addr <= stack_base) { |
| size_t stack_diff = stack_base - stack_addr; |
| |
| if (stack_diff <= stack_size) { |
| /* The only way we can be closer than the allocated stack size is if we |
| are running on this thread. */ |
| // __kmp_gtid_get_specific can return a negative value because this |
| // function can be called from a thread destructor. However, before the |
| // thread destructor is called, the value of the corresponding |
| // thread-specific data will have been reset to NULL. |
| KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 || |
| __kmp_gtid_get_specific() == i); |
| return i; |
| } |
| } |
| } |
| |
| /* use thread-specific data to try to determine our gtid */ |
| KA_TRACE(1000, |
| ("*** __kmp_get_global_thread_id: internal alg. failed to find " |
| "thread, using TLS\n")); |
| i = __kmp_gtid_get_specific(); |
| |
| /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */ |
| |
| /* if we haven't been assigned a gtid, return the (negative) code */ |
| if (i < 0) |
| return i; |
| |
| // other_threads[i] can be nullptr at this point because the corresponding |
| // thread could have already been destructed. This can happen when this |
| // function is called from a library shutdown routine. |
| if (!TCR_SYNC_PTR(other_threads[i])) |
| return i; |
| |
| /* dynamically updated stack window for uber threads to avoid get_specific |
| call */ |
| if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) { |
| KMP_FATAL(StackOverflow, i); |
| } |
| |
| stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; |
| if (stack_addr > stack_base) { |
| TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr); |
| TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, |
| other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - |
| stack_base); |
| } else { |
| TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, |
| stack_base - stack_addr); |
| } |
| |
| /* Reprint stack bounds for ubermaster since they have been refined */ |
| if (__kmp_storage_map) { |
| char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; |
| char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize; |
| __kmp_print_storage_map_gtid(i, stack_beg, stack_end, |
| other_threads[i]->th.th_info.ds.ds_stacksize, |
| "th_%d stack (refinement)", i); |
| } |
| return i; |
| } |
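| // Editorial sketch of the lookup order above (addresses are hypothetical): |
| // with __kmp_gtid_mode >= 3 the gtid comes from the __kmp_gtid thread-local |
| // variable; with mode 2 it comes from keyed TLS via __kmp_gtid_get_specific(); |
| // otherwise the runtime walks __kmp_threads[] and matches the address of a |
| // local variable against each thread's [stackbase - stacksize, stackbase] |
| // window. For example, a local at 0x7ffd1000 matches a thread with stackbase |
| // 0x7ffd8000 and stacksize 0x100000 because the difference (0x7000) does not |
| // exceed the stack size. |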
| |
| int __kmp_get_global_thread_id_reg() { |
| int gtid; |
| |
| if (!__kmp_init_serial) { |
| gtid = KMP_GTID_DNE; |
| } else |
| #ifdef KMP_TDATA_GTID |
| if (TCR_4(__kmp_gtid_mode) >= 3) { |
| KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n")); |
| gtid = __kmp_gtid; |
| } else |
| #endif |
| if (TCR_4(__kmp_gtid_mode) >= 2) { |
| KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n")); |
| gtid = __kmp_gtid_get_specific(); |
| } else { |
| KA_TRACE(1000, |
| ("*** __kmp_get_global_thread_id_reg: using internal alg.\n")); |
| gtid = __kmp_get_global_thread_id(); |
| } |
| |
| /* we must be a new uber master sibling thread */ |
| if (gtid == KMP_GTID_DNE) { |
| KA_TRACE(10, |
| ("__kmp_get_global_thread_id_reg: Encountered new root thread. " |
| "Registering a new gtid.\n")); |
| __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); |
| if (!__kmp_init_serial) { |
| __kmp_do_serial_initialize(); |
| gtid = __kmp_gtid_get_specific(); |
| } else { |
| gtid = __kmp_register_root(FALSE); |
| } |
| __kmp_release_bootstrap_lock(&__kmp_initz_lock); |
| /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */ |
| } |
| |
| KMP_DEBUG_ASSERT(gtid >= 0); |
| |
| return gtid; |
| } |
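| // Editorial note: when the lookup above yields KMP_GTID_DNE, the caller is a |
| // thread the runtime has not seen before (a new root). Under __kmp_initz_lock |
| // it either performs serial initialization (which registers it) or calls |
| // __kmp_register_root(FALSE) directly, so a negative gtid is never returned |
| // to the caller (see the KMP_DEBUG_ASSERT above). |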
| |
| /* caller must hold forkjoin_lock */ |
| void __kmp_check_stack_overlap(kmp_info_t *th) { |
| int f; |
| char *stack_beg = NULL; |
| char *stack_end = NULL; |
| int gtid; |
| |
| KA_TRACE(10, ("__kmp_check_stack_overlap: called\n")); |
| if (__kmp_storage_map) { |
| stack_end = (char *)th->th.th_info.ds.ds_stackbase; |
| stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; |
| |
| gtid = __kmp_gtid_from_thread(th); |
| |
| if (gtid == KMP_GTID_MONITOR) { |
| __kmp_print_storage_map_gtid( |
| gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, |
| "th_%s stack (%s)", "mon", |
| (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual"); |
| } else { |
| __kmp_print_storage_map_gtid( |
| gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, |
| "th_%d stack (%s)", gtid, |
| (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual"); |
| } |
| } |
| |
| /* No point in checking ubermaster threads since they use refinement and |
| * cannot overlap */ |
| gtid = __kmp_gtid_from_thread(th); |
| if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { |
| KA_TRACE(10, |
| ("__kmp_check_stack_overlap: performing extensive checking\n")); |
| if (stack_beg == NULL) { |
| stack_end = (char *)th->th.th_info.ds.ds_stackbase; |
| stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; |
| } |
| |
| for (f = 0; f < __kmp_threads_capacity; f++) { |
| kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); |
| |
| if (f_th && f_th != th) { |
| char *other_stack_end = |
| (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); |
| char *other_stack_beg = |
| other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); |
| if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || |
| (stack_end > other_stack_beg && stack_end < other_stack_end)) { |
| |
| /* Print the other stack values before the abort */ |
| if (__kmp_storage_map) |
| __kmp_print_storage_map_gtid( |
| -1, other_stack_beg, other_stack_end, |
| (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), |
| "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); |
| |
| __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), |
| __kmp_msg_null); |
| } |
| } |
| } |
| } |
| KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); |
| } |
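| // Editorial illustration of the overlap test above (addresses hypothetical): |
| // a current stack [0x1000, 0x3000) and another thread's stack |
| // [0x2000, 0x4000) are reported as overlapping because stack_end (0x3000) |
| // falls strictly between the other thread's stack bounds. |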
| |
| /* ------------------------------------------------------------------------ */ |
| |
| void __kmp_infinite_loop(void) { |
| static int done = FALSE; |
| |
| while (!done) { |
| KMP_YIELD(TRUE); |
| } |
| } |
| |
| #define MAX_MESSAGE 512 |
| |
| void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, |
| char const *format, ...) { |
| char buffer[MAX_MESSAGE]; |
| va_list ap; |
| |
| va_start(ap, format); |
| KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, |
| p2, (unsigned long)size, format); |
| __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); |
| __kmp_vprintf(kmp_err, buffer, ap); |
| #if KMP_PRINT_DATA_PLACEMENT |
| int node; |
| if (gtid >= 0) { |
| if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { |
| if (__kmp_storage_map_verbose) { |
| node = __kmp_get_host_node(p1); |
| if (node < 0) /* doesn't work, so don't try this next time */ |
| __kmp_storage_map_verbose = FALSE; |
| else { |
| char *last; |
| int lastNode; |
| int localProc = __kmp_get_cpu_from_gtid(gtid); |
| |
| const int page_size = KMP_GET_PAGE_SIZE(); |
| |
| p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); |
| p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); |
| if (localProc >= 0) |
| __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, |
| localProc >> 1); |
| else |
| __kmp_printf_no_lock(" GTID %d\n", gtid); |
| #if KMP_USE_PRCTL |
| /* The more elaborate format is disabled for now because of the prctl |
| * hanging bug. */ |
| do { |
| last = (char *)p1; |
| lastNode = node; |
| /* This loop collates adjacent pages with the same host node. */ |
| do { |
| p1 = (char *)p1 + page_size; |
| } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode); |
| __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1, |
| lastNode); |
| } while (p1 <= p2); |
| #else |
| __kmp_printf_no_lock(" %p-%p memNode %d\n", p1, |
| (char *)p1 + (page_size - 1), |
| __kmp_get_host_node(p1)); |
| if (p1 < p2) { |
| __kmp_printf_no_lock(" %p-%p memNode %d\n", p2, |
| (char *)p2 + (page_size - 1), |
| __kmp_get_host_node(p2)); |
| } |
| #endif |
| } |
| } |
| } else |
| __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning)); |
| } |
| #endif /* KMP_PRINT_DATA_PLACEMENT */ |
| __kmp_release_bootstrap_lock(&__kmp_stdio_lock); |
| |
| va_end(ap); |
| } |
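| // Editorial example of the output format above (pointer values hypothetical): |
| // a call such as the one in __kmp_print_thread_storage_map, |
| //   __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), |
| //                                "th_%d", gtid); |
| // prints a line of the form |
| //   OMP storage map: 0x7f... 0x7f...     NNNN th_0 |
| // i.e. the two pointers, the size right-aligned in eight columns, and the |
| // caller's formatted description, per the "%p %p%8lu %s" format string. |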
| |
| void __kmp_warn(char const *format, ...) { |
| char buffer[MAX_MESSAGE]; |
| va_list ap; |
| |
| if (__kmp_generate_warnings == kmp_warnings_off) { |
| return; |
| } |
| |
| va_start(ap, format); |
| |
| KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format); |
| __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); |
| __kmp_vprintf(kmp_err, buffer, ap); |
| __kmp_release_bootstrap_lock(&__kmp_stdio_lock); |
| |
| va_end(ap); |
| } |
| |
| void __kmp_abort_process() { |
| // Later threads may stall here, but that's ok because abort() will kill them. |
| __kmp_acquire_bootstrap_lock(&__kmp_exit_lock); |
| |
| if (__kmp_debug_buf) { |
| __kmp_dump_debug_buffer(); |
| } |
| |
| #if KMP_OS_WINDOWS |
| // Let other threads know of abnormal termination and prevent deadlock |
| // if abort happened during library initialization or shutdown |
| __kmp_global.g.g_abort = SIGABRT; |
| |
| /* On Windows* OS, abort() by default causes a pop-up error box, which stalls |
| nightly testing. Unfortunately, we cannot reliably suppress the pop-up error |
| boxes. _set_abort_behavior() works well, but this function is not |
| available in VS7 (this is not a problem for the DLL, but it is a problem for |
| the static OpenMP RTL). SetErrorMode (and so, the timelimit utility) does |
| not help, at least in some versions of the MS C RTL. |
| |
| It seems the following sequence is the only way to simulate abort() and |
| avoid the pop-up error box. */ |
| raise(SIGABRT); |
| _exit(3); // Just in case, if signal ignored, exit anyway. |
| #else |
| __kmp_unregister_library(); |
| abort(); |
| #endif |
| |
| __kmp_infinite_loop(); |
| __kmp_release_bootstrap_lock(&__kmp_exit_lock); |
| |
| } // __kmp_abort_process |
| |
| void __kmp_abort_thread(void) { |
| // TODO: Eliminate the g_abort global variable and this function. |
| // In case of abort, just call abort(); it will kill all the threads. |
| __kmp_infinite_loop(); |
| } // __kmp_abort_thread |
| |
| /* Print out the storage map for the major kmp_info_t thread data structures |
| that are allocated together. */ |
| |
| static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) { |
| __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", |
| gtid); |
| |
| __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team, |
| sizeof(kmp_desc_t), "th_%d.th_info", gtid); |
| |
| __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head, |
| sizeof(kmp_local_t), "th_%d.th_local", gtid); |
| |
| __kmp_print_storage_map_gtid( |
| gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier], |
| sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid); |
| |
| __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier], |
| &thr->th.th_bar[bs_plain_barrier + 1], |
| sizeof(kmp_balign_t), "th_%d.th_bar[plain]", |
| gtid); |
| |
| __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier], |
| &thr->th.th_bar[bs_forkjoin_barrier + 1], |
| sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", |
| gtid); |
| |
| #if KMP_FAST_REDUCTION_BARRIER |
| __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier], |
| &thr->th.th_bar[bs_reduction_barrier + 1], |
| sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", |
| gtid); |
| #endif // KMP_FAST_REDUCTION_BARRIER |
| } |
| |
| /* Print out the storage map for the major kmp_team_t team data structures |
| that are allocated together. */ |
| |
| static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team, |
| int team_id, int num_thr) { |
| int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2; |
| __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d", |
| header, team_id); |
| |
| __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0], |
| &team->t.t_bar[bs_last_barrier], |
| sizeof(kmp_balign_team_t) * bs_last_barrier, |
| "%s_%d.t_bar", header, team_id); |
| |
| __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier], |
| &team->t.t_bar[bs_plain_barrier + 1], |
| sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", |
| header, team_id); |
| |
| __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier], |
| &team->t.t_bar[bs_forkjoin_barrier + 1], |
| sizeof(kmp_balign_team_t), |
| "%s_%d.t_bar[forkjoin]", header, team_id); |
| |
| #if KMP_FAST_REDUCTION_BARRIER |
| __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier], |
| &team->t.t_bar[bs_reduction_barrier + 1], |
| sizeof(kmp_balign_team_t), |
| "%s_%d.t_bar[reduction]", header, team_id); |
| #endif // KMP_FAST_REDUCTION_BARRIER |
| |
| __kmp_print_storage_map_gtid( |
| -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr], |
| sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id); |
| |
| __kmp_print_storage_map_gtid( |
| -1, &team->t.t_threads[0], &team->t.t_threads[num_thr], |
| sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id); |
| |
| __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0], |
| &team->t.t_disp_buffer[num_disp_buff], |
| sizeof(dispatch_shared_info_t) * num_disp_buff, |
| "%s_%d.t_disp_buffer", header, team_id); |
| } |
| |
| static void __kmp_init_allocator() { |
| __kmp_init_memkind(); |
| __kmp_init_target_mem(); |
| } |
| static void __kmp_fini_allocator() { __kmp_fini_memkind(); } |
| |
| /* ------------------------------------------------------------------------ */ |
| |
| #if ENABLE_LIBOMPTARGET |
| static void __kmp_init_omptarget() { |
| __kmp_init_target_task(); |
| } |
| #endif |
| |
| /* ------------------------------------------------------------------------ */ |
| |
| #if KMP_DYNAMIC_LIB |
| #if KMP_OS_WINDOWS |
| |
| BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) { |
| //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock ); |
| |
| switch (fdwReason) { |
| |
| case DLL_PROCESS_ATTACH: |
| KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n")); |
| |
| return TRUE; |
| |
| case DLL_PROCESS_DETACH: |
| KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific())); |
| |
| // According to Windows* documentation for DllMain entry point: |
| // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference: |
| // lpReserved == NULL when FreeLibrary() is called, |
| // lpReserved != NULL when the process is terminated. |
| // When FreeLibrary() is called, worker threads remain alive. So the |
| // runtime's state is consistent and executing proper shutdown is OK. |
| // When the process is terminated, worker threads have exited or been |
| // forcefully terminated by the OS and only the shutdown thread remains. |
| // This can leave the runtime in an inconsistent state. |
| // Hence, only attempt proper cleanup when FreeLibrary() is called. |
| // Otherwise, rely on OS to reclaim resources. |
| if (lpReserved == NULL) |
| __kmp_internal_end_library(__kmp_gtid_get_specific()); |
| |
| return TRUE; |
| |
| case DLL_THREAD_ATTACH: |
| KA_TRACE(10, ("DllMain: THREAD_ATTACH\n")); |
| |
| /* if we wanted to register new sibling threads every time, we would call |
| * __kmp_get_gtid() here */ |
| return TRUE; |
| |
| case DLL_THREAD_DETACH: |
| KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific())); |
| |
| __kmp_internal_end_thread(__kmp_gtid_get_specific()); |
| return TRUE; |
| } |
| |
| return TRUE; |
| } |
| |
| #endif /* KMP_OS_WINDOWS */ |
| #endif /* KMP_DYNAMIC_LIB */ |
| |
| /* __kmp_parallel_deo -- Wait until it's our turn. */ |
| void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { |
| int gtid = *gtid_ref; |
| #ifdef BUILD_PARALLEL_ORDERED |
| kmp_team_t *team = __kmp_team_from_gtid(gtid); |
| #endif /* BUILD_PARALLEL_ORDERED */ |
| |
| if (__kmp_env_consistency_check) { |
| if (__kmp_threads[gtid]->th.th_root->r.r_active) |
| #if KMP_USE_DYNAMIC_LOCK |
| __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0); |
| #else |
| __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL); |
| #endif |
| } |
| #ifdef BUILD_PARALLEL_ORDERED |
| if (!team->t.t_serialized) { |
| KMP_MB(); |
| KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ, |
| NULL); |
| KMP_MB(); |
| } |
| #endif /* BUILD_PARALLEL_ORDERED */ |
| } |
| |
| /* __kmp_parallel_dxo -- Signal the next task. */ |
| void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { |
| int gtid = *gtid_ref; |
| #ifdef BUILD_PARALLEL_ORDERED |
| int tid = __kmp_tid_from_gtid(gtid); |
| kmp_team_t *team = __kmp_team_from_gtid(gtid); |
| #endif /* BUILD_PARALLEL_ORDERED */ |
| |
| if (__kmp_env_consistency_check) { |
| if (__kmp_threads[gtid]->th.th_root->r.r_active) |
| __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref); |
| } |
| #ifdef BUILD_PARALLEL_ORDERED |
| if (!team->t.t_serialized) { |
| KMP_MB(); /* Flush all pending memory write invalidates. */ |
| |
| /* use the tid of the next thread in this team */ |
| /* TODO replace with general release procedure */ |
| team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc); |
| |
| KMP_MB(); /* Flush all pending memory write invalidates. */ |
| } |
| #endif /* BUILD_PARALLEL_ORDERED */ |
| } |
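| // Editorial illustration of the deo/dxo pairing above: t_ordered.dt.t_value |
| // acts as a ticket. In a team of 4 threads, thread tid == 2 spins in |
| // __kmp_parallel_deo until t_value == 2, executes its ordered chunk, and in |
| // __kmp_parallel_dxo stores (2 + 1) % 4 == 3, releasing thread 3; thread 3's |
| // dxo then stores 0, wrapping the ticket back to the first thread. |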
| |
| /* ------------------------------------------------------------------------ */ |
| /* The BARRIER for a SINGLE process section is always explicit */ |
| |
| int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) { |
| int status; |
| kmp_info_t *th; |
| kmp_team_t *team; |
| |
| if (!TCR_4(__kmp_init_parallel)) |
| __kmp_parallel_initialize(); |
| __kmp_resume_if_soft_paused(); |
| |
| th = __kmp_threads[gtid]; |
| team = th->th.th_team; |
| status = 0; |
| |
| th->th.th_ident = id_ref; |
| |
| if (team->t.t_serialized) { |
| status = 1; |
| } else { |
| kmp_int32 old_this = th->th.th_local.this_construct; |
| |
| ++th->th.th_local.this_construct; |
| /* try to advance the team's construct count to this thread's count -- |
| success means this thread got the single block */ |
| /* TODO: Should this be acquire or release? */ |
| if (team->t.t_construct == old_this) { |
| status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this, |
| th->th.th_local.this_construct); |
| } |
| #if USE_ITT_BUILD |
| if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && |
| KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && |
| team->t.t_active_level == 1) { |
| // Only report metadata by primary thread of active team at level 1 |
| __kmp_itt_metadata_single(id_ref); |
| } |
| #endif /* USE_ITT_BUILD */ |
| } |
| |
| if (__kmp_env_consistency_check) { |
| if (status && push_ws) { |
| __kmp_push_workshare(gtid, ct_psingle, id_ref); |
| } else { |
| __kmp_check_workshare(gtid, ct_psingle, id_ref); |
| } |
| } |
| #if USE_ITT_BUILD |
| if (status) { |
| __kmp_itt_single_start(gtid); |
| } |
| #endif /* USE_ITT_BUILD */ |
| return status; |
| } |
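| // Editorial note on the single-winner protocol above: each thread bumps its |
| // private this_construct counter and, if the team-wide t_construct still |
| // equals the thread's previous value, attempts an acquire compare-and-store. |
| // For example, with t_construct == 5, every thread arriving with |
| // old_this == 5 races to store 6; exactly one CAS succeeds, and that thread |
| // (status == 1) executes the single block while the others fall through |
| // with status == 0. |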
| |
| void __kmp_exit_single(int gtid) { |
| #if USE_ITT_BUILD |
| __kmp_itt_single_end(gtid); |
| #endif /* USE_ITT_BUILD */ |
| if (__kmp_env_consistency_check) |
| __kmp_pop_workshare(gtid, ct_psingle, NULL); |
| } |
| |
| /* determine if we can go parallel or must use a serialized parallel region, |
| * and how many threads we can use |
| * set_nthreads is the number of threads requested for the team |
| * returns 1 if we should serialize or only use one thread, |
| * otherwise the number of threads to use |
| * The forkjoin lock is held by the caller. */ |
| static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, |
| int master_tid, int set_nthreads, |
| int enter_teams) { |
| int capacity; |
| int new_nthreads; |
| KMP_DEBUG_ASSERT(__kmp_init_serial); |
| KMP_DEBUG_ASSERT(root && parent_team); |
| kmp_info_t *this_thr = parent_team->t.t_threads[master_tid]; |
| |
| // If dyn-var is set, dynamically adjust the number of desired threads, |
| // according to the method specified by dynamic_mode. |
| new_nthreads = set_nthreads; |
| if (!get__dynamic_2(parent_team, master_tid)) { |
| ; |
| } |
| #ifdef USE_LOAD_BALANCE |
| else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) { |
| new_nthreads = __kmp_load_balance_nproc(root, set_nthreads); |
| if (new_nthreads == 1) { |
| KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " |
| "reservation to 1 thread\n", |
| master_tid)); |
| return 1; |
| } |
| if (new_nthreads < set_nthreads) { |
| KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " |
| "reservation to %d threads\n", |
| master_tid, new_nthreads)); |
| } |
| } |
| #endif /* USE_LOAD_BALANCE */ |
| else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) { |
| new_nthreads = __kmp_avail_proc - __kmp_nth + |
| (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); |
| if (new_nthreads <= 1) { |
| KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " |
| "reservation to 1 thread\n", |
| master_tid)); |
| return 1; |
| } |
| if (new_nthreads < set_nthreads) { |
| KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " |
| "reservation to %d threads\n", |
| master_tid, new_nthreads)); |
| } else { |
| new_nthreads = set_nthreads; |
| } |
| } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) { |
| if (set_nthreads > 2) { |
| new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]); |
| new_nthreads = (new_nthreads % set_nthreads) + 1; |
| if (new_nthreads == 1) { |
| KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " |
| "reservation to 1 thread\n", |
| master_tid)); |
| return 1; |
| } |
| if (new_nthreads < set_nthreads) { |
| KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " |
| "reservation to %d threads\n", |
| master_tid, new_nthreads)); |
| } |
| } |
| } else { |
| KMP_ASSERT(0); |
| } |
| |
| // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT. |
| if (__kmp_nth + new_nthreads - |
| (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > |
| __kmp_max_nth) { |
| int tl_nthreads = __kmp_max_nth - __kmp_nth + |
| (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); |
| if (tl_nthreads <= 0) { |
| tl_nthreads = 1; |
| } |
| |
| // If dyn-var is false, emit a 1-time warning. |
| if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { |
| __kmp_reserve_warn = 1; |
| __kmp_msg(kmp_ms_warning, |
| KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), |
| KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); |
| } |
| if (tl_nthreads == 1) { |
| KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT " |
| "reduced reservation to 1 thread\n", |
| master_tid)); |
| return 1; |
| } |
| KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced " |
| "reservation to %d threads\n", |
| master_tid, tl_nthreads)); |
| new_nthreads = tl_nthreads; |
| } |
| |
| // Respect OMP_THREAD_LIMIT |
| int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads; |
| int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit; |
| if (cg_nthreads + new_nthreads - |
| (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > |
| max_cg_threads) { |
| int tl_nthreads = max_cg_threads - cg_nthreads + |
| (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); |
| if (tl_nthreads <= 0) { |
| tl_nthreads = 1; |
| } |
| |
| // If dyn-var is false, emit a 1-time warning. |
| if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { |
| __kmp_reserve_warn = 1; |
| __kmp_msg(kmp_ms_warning, |
| KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), |
| KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); |
| } |
| if (tl_nthreads == 1) { |
| KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT " |
| "reduced reservation to 1 thread\n", |
| master_tid)); |
| return 1; |
| } |
| KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced " |
| "reservation to %d threads\n", |
| master_tid, tl_nthreads)); |
| new_nthreads = tl_nthreads; |
| } |
| |
| // Check if the threads array is large enough, or needs expanding. |
| // See comment in __kmp_register_root() about the adjustment if |
| // __kmp_threads[0] == NULL. |
| capacity = __kmp_threads_capacity; |
| if (TCR_PTR(__kmp_threads[0]) == NULL) { |
| --capacity; |
| } |
| // If it is not for initializing the hidden helper team, we need to take |
| // __kmp_hidden_helper_threads_num out of the capacity because it is included |
| // in __kmp_threads_capacity. |
| if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { |
| capacity -= __kmp_hidden_helper_threads_num; |
| } |
| if (__kmp_nth + new_nthreads - |
| (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > |
| capacity) { |
| // Expand the threads array. |
| int slotsRequired = __kmp_nth + new_nthreads - |
| (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) - |
| capacity; |
| int slotsAdded = __kmp_expand_threads(slotsRequired); |
| if (slotsAdded < slotsRequired) { |
| // The threads array was not expanded enough. |
| new_nthreads -= (slotsRequired - slotsAdded); |
| KMP_ASSERT(new_nthreads >= 1); |
| |
| // If dyn-var is false, emit a 1-time warning. |
| if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { |
| __kmp_reserve_warn = 1; |
| if (__kmp_tp_cached) { |
| __kmp_msg(kmp_ms_warning, |
| KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), |
| KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), |
| KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); |
| } else { |
| __kmp_msg(kmp_ms_warning, |
| KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), |
| KMP_HNT(SystemLimitOnThreads), __kmp_msg_null); |
| } |
| } |
| } |
| } |
| |
| #ifdef KMP_DEBUG |
| if (new_nthreads == 1) { |
| KC_TRACE(10, |
| ("__kmp_reserve_threads: T#%d serializing team after reclaiming " |
| "dead roots and rechecking; requested %d threads\n", |
| __kmp_get_gtid(), set_nthreads)); |
| } else { |
| KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested" |
| " %d threads\n", |
| __kmp_get_gtid(), new_nthreads, set_nthreads)); |
| } |
| #endif // KMP_DEBUG |
| |
| if (this_thr->th.th_nt_strict && new_nthreads < set_nthreads) { |
| __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev, |
| this_thr->th.th_nt_msg); |
| } |
| return new_nthreads; |
| } |
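| // Editorial worked example for the dynamic_thread_limit branch above (numbers |
| // hypothetical): with __kmp_avail_proc == 16, __kmp_nth == 10, and an |
| // inactive root whose hot team has 4 threads, the reservation becomes |
| // 16 - 10 + 4 == 10 threads; a request for 8 is honored unchanged, while a |
| // request for 12 is trimmed to 10 (subject to the later device, contention |
| // group, and capacity checks). |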
| |
| /* Allocate threads from the thread pool and assign them to the new team. We |
| are assured that there are enough threads available, because we checked on |
| that earlier while holding the forkjoin lock. */ |
| static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, |
| kmp_info_t *master_th, int master_gtid, |
| int fork_teams_workers) { |
| int i; |
| int use_hot_team; |
| |
| KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc)); |
| KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid()); |
| KMP_MB(); |
| |
| /* first, let's setup the primary thread */ |
| master_th->th.th_info.ds.ds_tid = 0; |
| master_th->th.th_team = team; |
| master_th->th.th_team_nproc = team->t.t_nproc; |
| master_th->th.th_team_master = master_th; |
| master_th->th.th_team_serialized = FALSE; |
| master_th->th.th_dispatch = &team->t.t_dispatch[0]; |
| |
| /* make sure we are not the optimized hot team */ |
| #if KMP_NESTED_HOT_TEAMS |
| use_hot_team = 0; |
| kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams; |
| if (hot_teams) { // hot teams array is not allocated if |
| // KMP_HOT_TEAMS_MAX_LEVEL=0 |
| int level = team->t.t_active_level - 1; // index in array of hot teams |
| if (master_th->th.th_teams_microtask) { // are we inside the teams? |
| if (master_th->th.th_teams_size.nteams > 1) { |
| ++level; // level was not increased in teams construct for |
| // team_of_masters |
| } |
| if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && |
| master_th->th.th_teams_level == team->t.t_level) { |
| ++level; // level was not increased in teams construct for |
| // team_of_workers before the parallel |
| } // team->t.t_level will be increased inside parallel |
| } |
| if (level < __kmp_hot_teams_max_level) { |
| if (hot_teams[level].hot_team) { |
| // hot team has already been allocated for given level |
| KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team); |
| use_hot_team = 1; // the team is ready to use |
| } else { |
| use_hot_team = 0; // AC: threads are not allocated yet |
| hot_teams[level].hot_team = team; // remember new hot team |
| hot_teams[level].hot_team_nth = team->t.t_nproc; |
| } |
| } else { |
| use_hot_team = 0; |
| } |
| } |
| #else |
| use_hot_team = team == root->r.r_hot_team; |
| #endif |
| if (!use_hot_team) { |
| |
| /* install the primary thread */ |
| team->t.t_threads[0] = master_th; |
| __kmp_initialize_info(master_th, team, 0, master_gtid); |
| |
| /* now, install the worker threads */ |
| for (i = 1; i < team->t.t_nproc; i++) { |
| |
| /* fork or reallocate a new thread and install it in team */ |
| kmp_info_t *thr = __kmp_allocate_thread(root, team, i); |
| team->t.t_threads[i] = thr; |
| KMP_DEBUG_ASSERT(thr); |
| KMP_DEBUG_ASSERT(thr->th.th_team == team); |
| /* align team and thread arrived states */ |
| KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived " |
| "T#%d(%d:%d) join =%llu, plain=%llu\n", |
| __kmp_gtid_from_tid(0, team), team->t.t_id, 0, |
| __kmp_gtid_from_tid(i, team), team->t.t_id, i, |
| team->t.t_bar[bs_forkjoin_barrier].b_arrived, |
| team->t.t_bar[bs_plain_barrier].b_arrived)); |
| thr->th.th_teams_microtask = master_th->th.th_teams_microtask; |
| thr->th.th_teams_level = master_th->th.th_teams_level; |
| thr->th.th_teams_size = master_th->th.th_teams_size; |
| { // Initialize threads' barrier data. |
| int b; |
| kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar; |
| for (b = 0; b < bs_last_barrier; ++b) { |
| balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; |
| KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); |
| #if USE_DEBUGGER |
| balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; |
| #endif |
| } |
| } |
| } |
| |
| #if KMP_AFFINITY_SUPPORTED |
| // Do not partition the places list for teams construct workers who |
| // haven't actually been forked to do real work yet. This partitioning |
| // will take place in the parallel region nested within the teams construct. |
| if (!fork_teams_workers) { |
| __kmp_partition_places(team); |
| } |
| #endif |
| |
| if (team->t.t_nproc > 1 && |
| __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { |
| team->t.b->update_num_threads(team->t.t_nproc); |
| __kmp_add_threads_to_team(team, team->t.t_nproc); |
| } |
| } |
| |
| // Take care of primary thread's task state |
| if (__kmp_tasking_mode != tskm_immediate_exec) { |
| if (use_hot_team) { |
| KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team->t.t_parent, master_th); |
| KA_TRACE( |
| 20, |
| ("__kmp_fork_team_threads: Primary T#%d pushing task_team %p / team " |
| "%p, new task_team %p / team %p\n", |
| __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, |
| team->t.t_parent, team->t.t_task_team[master_th->th.th_task_state], |
| team)); |
| |
| // Store primary thread's current task state on new team |
| KMP_CHECK_UPDATE(team->t.t_primary_task_state, |
| master_th->th.th_task_state); |
| |
| // Restore primary thread's task state to hot team's state |
| // by using thread 1's task state |
| if (team->t.t_nproc > 1) { |
| KMP_DEBUG_ASSERT(team->t.t_threads[1]->th.th_task_state == 0 || |
| team->t.t_threads[1]->th.th_task_state == 1); |
| KMP_CHECK_UPDATE(master_th->th.th_task_state, |
| team->t.t_threads[1]->th.th_task_state); |
| } else { |
| master_th->th.th_task_state = 0; |
| } |
| } else { |
| // Store primary thread's current task_state on new team |
| KMP_CHECK_UPDATE(team->t.t_primary_task_state, |
| master_th->th.th_task_state); |
| // Are not using hot team, so set task state to 0. |
| master_th->th.th_task_state = 0; |
| } |
| } |
| |
| if (__kmp_display_affinity && team->t.t_display_affinity != 1) { |
| for (i = 0; i < team->t.t_nproc; i++) { |
| kmp_info_t *thr = team->t.t_threads[i]; |
| if (thr->th.th_prev_num_threads != team->t.t_nproc || |
| thr->th.th_prev_level != team->t.t_level) { |
| team->t.t_display_affinity = 1; |
| break; |
| } |
| } |
| } |
| |
| KMP_MB(); |
| } |
| |
| #if KMP_ARCH_X86 || KMP_ARCH_X86_64 |
| // Propagate any changes to the floating point control registers out to the team |
| // We try to avoid unnecessary writes to the relevant cache line in the team |
| // structure, so we don't make changes unless they are needed. |
| inline static void propagateFPControl(kmp_team_t *team) { |
| if (__kmp_inherit_fp_control) { |
| kmp_int16 x87_fpu_control_word; |
| kmp_uint32 mxcsr; |
| |
| // Get primary thread's values of FPU control flags (both X87 and vector) |
| __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); |
| __kmp_store_mxcsr(&mxcsr); |
| mxcsr &= KMP_X86_MXCSR_MASK; |
| |
| // There is no point looking at t_fp_control_saved here. |
| // If it is TRUE, we still have to update the values if they are different |
| // from those we now have. If it is FALSE we didn't save anything yet, but |
| // our objective is the same. We have to ensure that the values in the team |
| // are the same as those we have. |
| // So, this code achieves what we need whether or not t_fp_control_saved is |
| // true. By checking whether the value needs updating we avoid unnecessary |
| // writes that would put the cache-line into a written state, causing all |
| // threads in the team to have to read it again. |
| KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word); |
| KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr); |
| // Although we don't use this value, other code in the runtime wants to know |
| // whether it should restore them. So we must ensure it is correct. |
| KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE); |
| } else { |
| // Similarly here. Don't write to this cache-line in the team structure |
| // unless we have to. |
| KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE); |
| } |
| } |
| |
| // Do the opposite, setting the hardware registers to the updated values from |
| // the team. |
| inline static void updateHWFPControl(kmp_team_t *team) { |
| if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) { |
| // Only reset the fp control regs if they have been changed in the |
| // parallel region that we are exiting. |
| kmp_int16 x87_fpu_control_word; |
| kmp_uint32 mxcsr; |
| __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); |
| __kmp_store_mxcsr(&mxcsr); |
| mxcsr &= KMP_X86_MXCSR_MASK; |
| |
| if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) { |
| __kmp_clear_x87_fpu_status_word(); |
| __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word); |
| } |
| |
| if (team->t.t_mxcsr != mxcsr) { |
| __kmp_load_mxcsr(&team->t.t_mxcsr); |
| } |
| } |
| } |
| #else |
| #define propagateFPControl(x) ((void)0) |
| #define updateHWFPControl(x) ((void)0) |
| #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ |
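| // Editorial note on the pair above: propagateFPControl() snapshots the |
| // primary thread's x87 control word and (masked) MXCSR into the team so that |
| // workers can inherit them, while updateHWFPControl() reloads a thread's |
| // hardware registers from the team only when the current values differ from |
| // the saved ones, so a thread whose MXCSR is unchanged performs no register |
| // writes at all. |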
| |
| static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, |
| int realloc); // forward declaration |
| |
| /* Run a parallel region that has been serialized, so it runs only in a team |
| of the single primary thread. */ |
| void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { |
| kmp_info_t *this_thr; |
| kmp_team_t *serial_team; |
| |
| KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid)); |
| |
| /* Skip all this code for autopar serialized loops since it results in |
| unacceptable overhead */ |
| if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR)) |
| return; |
| |
| if (!TCR_4(__kmp_init_parallel)) |
| __kmp_parallel_initialize(); |
| __kmp_resume_if_soft_paused(); |
| |
| this_thr = __kmp_threads[global_tid]; |
| serial_team = this_thr->th.th_serial_team; |
| |
| /* utilize the serialized team held by this thread */ |
| KMP_DEBUG_ASSERT(serial_team); |
| KMP_MB(); |
| |
| kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind; |
| if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { |
| proc_bind = proc_bind_false; |
| } else if (proc_bind == proc_bind_default) { |
| // No proc_bind clause was specified, so use the current value |
| // of proc-bind-var for this parallel region. |
| proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind; |
| } |
| // Reset for next parallel region |
| this_thr->th.th_set_proc_bind = proc_bind_default; |
| |
| // Reset num_threads for next parallel region |
| this_thr->th.th_set_nproc = 0; |
| |
| #if OMPT_SUPPORT |
| ompt_data_t ompt_parallel_data = ompt_data_none; |
| void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid); |
| if (ompt_enabled.enabled && |
| this_thr->th.ompt_thread_info.state != ompt_state_overhead) { |
| |
| ompt_task_info_t *parent_task_info; |
| parent_task_info = OMPT_CUR_TASK_INFO(this_thr); |
| |
| parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); |
| if (ompt_enabled.ompt_callback_parallel_begin) { |
| int team_size = 1; |
| |
| ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( |
| &(parent_task_info->task_data), &(parent_task_info->frame), |
| &ompt_parallel_data, team_size, |
| ompt_parallel_invoker_program | ompt_parallel_team, codeptr); |
| } |
| } |
| #endif // OMPT_SUPPORT |
| |
| if (this_thr->th.th_team != serial_team) { |
| // Nested level will be an index in the nested nthreads array |
| int level = this_thr->th.th_team->t.t_level; |
| |
| if (serial_team->t.t_serialized) { |
| /* this serial team was already used |
| TODO increase performance by making these locks more specific */ |
| kmp_team_t *new_team; |
| |
| __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); |
| |
| new_team = |
| __kmp_allocate_team(this_thr->th.th_root, 1, 1, |
| #if OMPT_SUPPORT |
| ompt_parallel_data, |
| #endif |
| proc_bind, &this_thr->th.th_current_task->td_icvs, |
| 0 USE_NESTED_HOT_ARG(NULL)); |
| __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); |
| KMP_ASSERT(new_team); |
| |
| /* setup new serialized team and install it */ |
| new_team->t.t_threads[0] = this_thr; |
| new_team->t.t_parent = this_thr->th.th_team; |
| serial_team = new_team; |
| this_thr->th.th_serial_team = serial_team; |
| |
| KF_TRACE( |
| 10, |
| ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n", |
| global_tid, serial_team)); |
| |
| /* TODO the above breaks the requirement that if we run out of resources, |
| then we can still guarantee that serialized teams are ok, since we may |
| need to allocate a new one */ |
| } else { |
| KF_TRACE( |
| 10, |
| ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n", |
| global_tid, serial_team)); |
| } |
| |
| /* we have to initialize this serial team */ |
| KMP_DEBUG_ASSERT(serial_team->t.t_threads); |
| KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); |
| KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team); |
| serial_team->t.t_ident = loc; |
| serial_team->t.t_serialized = 1; |
| serial_team->t.t_nproc = 1; |
| serial_team->t.t_parent = this_thr->th.th_team; |
| if (this_thr->th.th_team->t.t_nested_nth) |
| serial_team->t.t_nested_nth = this_thr->th.th_team->t.t_nested_nth; |
| else |
| serial_team->t.t_nested_nth = &__kmp_nested_nth; |
| // Save previous team's task state on serial team structure |
| serial_team->t.t_primary_task_state = this_thr->th.th_task_state; |
| serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched; |
| this_thr->th.th_team = serial_team; |
| serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; |
| |
| KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid, |
| this_thr->th.th_current_task)); |
| KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1); |
| this_thr->th.th_current_task->td_flags.executing = 0; |
| |
| __kmp_push_current_task_to_thread(this_thr, serial_team, 0); |
| |
| /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an |
| implicit task for each serialized task represented by |
| team->t.t_serialized? */ |
| copy_icvs(&this_thr->th.th_current_task->td_icvs, |
| &this_thr->th.th_current_task->td_parent->td_icvs); |
| |
| // Thread value exists in the nested nthreads array for the next nested |
| // level |
| kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth; |
| if (this_thr->th.th_team->t.t_nested_nth) |
| nested_nth = this_thr->th.th_team->t.t_nested_nth; |
| if (nested_nth->used && (level + 1 < nested_nth->used)) { |
| this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1]; |
| } |
| |
| if (__kmp_nested_proc_bind.used && |
| (level + 1 < __kmp_nested_proc_bind.used)) { |
| this_thr->th.th_current_task->td_icvs.proc_bind = |
| __kmp_nested_proc_bind.bind_types[level + 1]; |
| } |
| |
| #if USE_DEBUGGER |
| serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger. |
| #endif |
| this_thr->th.th_info.ds.ds_tid = 0; |
| |
| /* set thread cache values */ |
| this_thr->th.th_team_nproc = 1; |
| this_thr->th.th_team_master = this_thr; |
| this_thr->th.th_team_serialized = 1; |
| this_thr->th.th_task_team = NULL; |
| this_thr->th.th_task_state = 0; |
| |
| serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1; |
| serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level; |
| serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save |
| |
| propagateFPControl(serial_team); |
| |
| /* check if we need to allocate dispatch buffers stack */ |
| KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); |
| if (!serial_team->t.t_dispatch->th_disp_buffer) { |
| serial_team->t.t_dispatch->th_disp_buffer = |
| (dispatch_private_info_t *)__kmp_allocate( |
| sizeof(dispatch_private_info_t)); |
| } |
| this_thr->th.th_dispatch = serial_team->t.t_dispatch; |
| |
| KMP_MB(); |
| |
| } else { |
| /* this serialized team is already being used, |
| * that's fine, just add another nested level */ |
| KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team); |
| KMP_DEBUG_ASSERT(serial_team->t.t_threads); |
| KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); |
| ++serial_team->t.t_serialized; |
| this_thr->th.th_team_serialized = serial_team->t.t_serialized; |
| |
| // Nested level will be an index in the nested nthreads array |
| int level = this_thr->th.th_team->t.t_level; |
| // Thread value exists in the nested nthreads array for the next nested |
| // level |
| |
| kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth; |
| if (serial_team->t.t_nested_nth) |
| nested_nth = serial_team->t.t_nested_nth; |
| if (nested_nth->used && (level + 1 < nested_nth->used)) { |
| this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1]; |
| } |
| |
| serial_team->t.t_level++; |
| KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level " |
| "of serial team %p to %d\n", |
| global_tid, serial_team, serial_team->t.t_level)); |
| |
| /* allocate/push dispatch buffers stack */ |
| KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); |
| { |
| dispatch_private_info_t *disp_buffer = |
| (dispatch_private_info_t *)__kmp_allocate( |
| sizeof(dispatch_private_info_t)); |
| disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer; |
| serial_team->t.t_dispatch->th_disp_buffer = disp_buffer; |
| } |
| this_thr->th.th_dispatch = serial_team->t.t_dispatch; |
| |
| /* allocate/push task team stack */ |
| __kmp_push_task_team_node(this_thr, serial_team); |
| |
| KMP_MB(); |
| } |
| KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq); |
| |
| // Perform the display affinity functionality for |
| // serialized parallel regions |
| if (__kmp_display_affinity) { |
| if (this_thr->th.th_prev_level != serial_team->t.t_level || |
| this_thr->th.th_prev_num_threads != 1) { |
| // NULL means use the affinity-format-var ICV |
| __kmp_aux_display_affinity(global_tid, NULL); |
| this_thr->th.th_prev_level = serial_team->t.t_level; |
| this_thr->th.th_prev_num_threads = 1; |
| } |
| } |
| |
| if (__kmp_env_consistency_check) |
| __kmp_push_parallel(global_tid, NULL); |
| #if OMPT_SUPPORT |
| serial_team->t.ompt_team_info.master_return_address = codeptr; |
| if (ompt_enabled.enabled && |
| this_thr->th.ompt_thread_info.state != ompt_state_overhead) { |
| OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = |
| OMPT_GET_FRAME_ADDRESS(0); |
| |
| ompt_lw_taskteam_t lw_taskteam; |
| __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid, |
| &ompt_parallel_data, codeptr); |
| |
| __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1); |
| // Don't use lw_taskteam after linking. Content was swapped. |
| |
| /* OMPT implicit task begin */ |
| if (ompt_enabled.ompt_callback_implicit_task) { |
| ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( |
| ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr), |
| OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), |
| ompt_task_implicit); // TODO: Can this be ompt_task_initial? |
| OMPT_CUR_TASK_INFO(this_thr)->thread_num = |
| __kmp_tid_from_gtid(global_tid); |
| } |
| |
| /* OMPT state */ |
| this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; |
| OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = |
| OMPT_GET_FRAME_ADDRESS(0); |
| } |
| #endif |
| } |
| |
| // Test if this fork is for a team closely nested in a teams construct |
| static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th, |
| microtask_t microtask, int level, |
| int teams_level, kmp_va_list ap) { |
| return (master_th->th.th_teams_microtask && ap && |
| microtask != (microtask_t)__kmp_teams_master && level == teams_level); |
| } |
| |
| // Test if this fork is for the teams construct, i.e. to form the outer league |
| // of teams |
| static inline bool __kmp_is_entering_teams(int active_level, int level, |
| int teams_level, kmp_va_list ap) { |
| return ((ap == NULL && active_level == 0) || |
| (ap && teams_level > 0 && teams_level == level)); |
| } |
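| // Editorial illustration of the two predicates above (OpenMP source shown |
| // only as a comment): |
| //   #pragma omp teams num_teams(2)        // this fork forms the league: |
| //   {                                     //   __kmp_is_entering_teams() |
| //     #pragma omp parallel num_threads(4) // fork nested directly in teams: |
| //     { /* ... */ }                       //   __kmp_is_fork_in_teams() |
| //   } |
| // i.e. the first helper recognizes the parallel region closely nested inside |
| // a team, and the second recognizes the fork that creates the outer league. |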
| |
| // AC: This is the start of a parallel region that is nested inside a teams |
| // construct. The team is actual (hot); all workers are ready at the fork |
| // barrier. No lock is needed to initialize the team a bit, then free the |
| // workers. |
| static inline int |
| __kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team, |
| kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root, |
| enum fork_context_e call_context, microtask_t microtask, |
| launch_t invoker, int master_set_numthreads, int level, |
| #if OMPT_SUPPORT |
| ompt_data_t ompt_parallel_data, void *return_address, |
| #endif |
| kmp_va_list ap) { |
| void **argv; |
| int i; |
| |
| parent_team->t.t_ident = loc; |
| __kmp_alloc_argv_entries(argc, parent_team, TRUE); |
| parent_team->t.t_argc = argc; |
| argv = (void **)parent_team->t.t_argv; |
| for (i = argc - 1; i >= 0; --i) { |
| *argv++ = va_arg(kmp_va_deref(ap), void *); |
| } |
| // Increment our nested depth levels, but do not increase the serialization |
| if (parent_team == master_th->th.th_serial_team) { |
| // AC: we are in serialized parallel |
| __kmpc_serialized_parallel(loc, gtid); |
| KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1); |
| |
| if (call_context == fork_context_gnu) { |
| // AC: need to decrement t_serialized for enquiry functions to work |
| // correctly, will restore at join time |
| parent_team->t.t_serialized--; |
| return TRUE; |
| } |
| |
| #if OMPD_SUPPORT |
| parent_team->t.t_pkfn = microtask; |
| #endif |
| |
| #if OMPT_SUPPORT |
| void *dummy; |
| void **exit_frame_p; |
| ompt_data_t *implicit_task_data; |
| ompt_lw_taskteam_t lw_taskteam; |
| |
| if (ompt_enabled.enabled) { |
| __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, |
| &ompt_parallel_data, return_address); |
| exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr); |
| |
| __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); |
| // Don't use lw_taskteam after linking. Content was swapped. |
| |
| /* OMPT implicit task begin */ |
| implicit_task_data = OMPT_CUR_TASK_DATA(master_th); |
| if (ompt_enabled.ompt_callback_implicit_task) { |
| OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid); |
| ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( |
| ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data, |
| 1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); |
| } |
| |
| /* OMPT state */ |
| master_th->th.ompt_thread_info.state = ompt_state_work_parallel; |
| } else { |
| exit_frame_p = &dummy; |
| } |
| #endif |
| |
| // AC: need to decrement t_serialized for enquiry functions to work |
| // correctly, will restore at join time |
| parent_team->t.t_serialized--; |
| |
| { |
| KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); |
| KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); |
| __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv |
| #if OMPT_SUPPORT |
| , |
| exit_frame_p |
| #endif |
| ); |
| } |
| |
| #if OMPT_SUPPORT |
| if (ompt_enabled.enabled) { |
| *exit_frame_p = NULL; |
| OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none; |
| if (ompt_enabled.ompt_callback_implicit_task) { |
| ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( |
| ompt_scope_end, NULL, implicit_task_data, 1, |
| OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); |
| } |
| ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); |
| __ompt_lw_taskteam_unlink(master_th); |
| if (ompt_enabled.ompt_callback_parallel_end) { |
| ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( |
| &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th), |
| OMPT_INVOKER(call_context) | ompt_parallel_team, return_address); |
| } |
| master_th->th.ompt_thread_info.state = ompt_state_overhead; |
| } |
| #endif |
| return TRUE; |
| } |
| |
| parent_team->t.t_pkfn = microtask; |
| parent_team->t.t_invoke = invoker; |
| KMP_ATOMIC_INC(&root->r.r_in_parallel); |
| parent_team->t.t_active_level++; |
| parent_team->t.t_level++; |
| parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save |
| |
| // If the number of threads allocated to the team is less than the thread |
| // limit, update the thread limit here. th_teams_size.nth is specific to this |
| // team nested in a teams construct; the team is fully created, and we're |
| // about to do the actual fork. Best to do this here so that the subsequent |
| // uses below and in the join have the correct value. |
| master_th->th.th_teams_size.nth = parent_team->t.t_nproc; |
| |
| #if OMPT_SUPPORT |
| if (ompt_enabled.enabled) { |
| ompt_lw_taskteam_t lw_taskteam; |
| __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data, |
| return_address); |
| __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true); |
| } |
| #endif |
| |
| /* Change number of threads in the team if requested */ |
| if (master_set_numthreads) { // The parallel has num_threads clause |
| if (master_set_numthreads <= master_th->th.th_teams_size.nth) { |
// AC: can only reduce the number of threads dynamically, can't increase it
| kmp_info_t **other_threads = parent_team->t.t_threads; |
| // NOTE: if using distributed barrier, we need to run this code block |
| // even when the team size appears not to have changed from the max. |
| int old_proc = master_th->th.th_teams_size.nth; |
| if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { |
| __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads); |
| __kmp_add_threads_to_team(parent_team, master_set_numthreads); |
| } |
| parent_team->t.t_nproc = master_set_numthreads; |
| for (i = 0; i < master_set_numthreads; ++i) { |
| other_threads[i]->th.th_team_nproc = master_set_numthreads; |
| } |
| } |
// Keep extra threads hot in the team for a possible next parallel region
| master_th->th.th_set_nproc = 0; |
| } |
| |
| #if USE_DEBUGGER |
| if (__kmp_debugging) { // Let debugger override number of threads. |
| int nth = __kmp_omp_num_threads(loc); |
| if (nth > 0) { // 0 means debugger doesn't want to change num threads |
| master_set_numthreads = nth; |
| } |
| } |
| #endif |
| |
| // Figure out the proc_bind policy for the nested parallel within teams |
| kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; |
| // proc_bind_default means don't update |
| kmp_proc_bind_t proc_bind_icv = proc_bind_default; |
| if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { |
| proc_bind = proc_bind_false; |
| } else { |
| // No proc_bind clause specified; use current proc-bind-var |
| if (proc_bind == proc_bind_default) { |
| proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; |
| } |
| /* else: The proc_bind policy was specified explicitly on parallel clause. |
| This overrides proc-bind-var for this parallel region, but does not |
| change proc-bind-var. */ |
| // Figure the value of proc-bind-var for the child threads. |
| if ((level + 1 < __kmp_nested_proc_bind.used) && |
| (__kmp_nested_proc_bind.bind_types[level + 1] != |
| master_th->th.th_current_task->td_icvs.proc_bind)) { |
| proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; |
| } |
| } |
| KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind); |
// Need to change the bind-var ICV to the correct value for each implicit task
| if (proc_bind_icv != proc_bind_default && |
| master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) { |
| kmp_info_t **other_threads = parent_team->t.t_threads; |
| for (i = 0; i < master_th->th.th_team_nproc; ++i) { |
| other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv; |
| } |
| } |
| // Reset for next parallel region |
| master_th->th.th_set_proc_bind = proc_bind_default; |
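// Informal summary of the proc_bind resolution above: proc_bind_false in
// proc-bind-var forces proc_bind_false; otherwise an explicit proc_bind
// clause (th_set_proc_bind) is used as-is; otherwise the current
// proc-bind-var ICV applies. Independently, the next entry of the
// OMP_PROC_BIND list (__kmp_nested_proc_bind), if it differs, becomes the
// child threads' bind-var. E.g. (user-code sketch):
//   #pragma omp parallel proc_bind(spread) // overrides proc-bind-var here only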
| |
| #if USE_ITT_BUILD && USE_ITT_NOTIFY |
| if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) || |
| KMP_ITT_DEBUG) && |
| __kmp_forkjoin_frames_mode == 3 && |
| parent_team->t.t_active_level == 1 // only report frames at level 1 |
| && master_th->th.th_teams_size.nteams == 1) { |
| kmp_uint64 tmp_time = __itt_get_timestamp(); |
| master_th->th.th_frame_time = tmp_time; |
| parent_team->t.t_region_time = tmp_time; |
| } |
| if (__itt_stack_caller_create_ptr) { |
| KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); |
| // create new stack stitching id before entering fork barrier |
| parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); |
| } |
| #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ |
| #if KMP_AFFINITY_SUPPORTED |
| __kmp_partition_places(parent_team); |
| #endif |
| |
| KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, " |
| "master_th=%p, gtid=%d\n", |
| root, parent_team, master_th, gtid)); |
| __kmp_internal_fork(loc, gtid, parent_team); |
| KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, " |
| "master_th=%p, gtid=%d\n", |
| root, parent_team, master_th, gtid)); |
| |
| if (call_context == fork_context_gnu) |
| return TRUE; |
| |
| /* Invoke microtask for PRIMARY thread */ |
| KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid, |
| parent_team->t.t_id, parent_team->t.t_pkfn)); |
| |
| if (!parent_team->t.t_invoke(gtid)) { |
| KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); |
| } |
| KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid, |
| parent_team->t.t_id, parent_team->t.t_pkfn)); |
| KMP_MB(); /* Flush all pending memory write invalidates. */ |
| |
| KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid)); |
| |
| return TRUE; |
| } |
| |
| // Create a serialized parallel region |
| static inline int |
| __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context, |
| kmp_int32 argc, microtask_t microtask, launch_t invoker, |
| kmp_info_t *master_th, kmp_team_t *parent_team, |
| #if OMPT_SUPPORT |
| ompt_data_t *ompt_parallel_data, void **return_address, |
| ompt_data_t **parent_task_data, |
| #endif |
| kmp_va_list ap) { |
| kmp_team_t *team; |
| int i; |
| void **argv; |
| |
| /* josh todo: hypothetical question: what do we do for OS X*? */ |
| #if KMP_OS_LINUX && \ |
| (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) |
| SimpleVLA<void *> args(argc); |
| #else |
| void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); |
| #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ |
| KMP_ARCH_AARCH64) */ |
| |
| KA_TRACE( |
| 20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid)); |
| |
| __kmpc_serialized_parallel(loc, gtid); |
| |
| #if OMPD_SUPPORT |
| master_th->th.th_serial_team->t.t_pkfn = microtask; |
| #endif |
| |
| if (call_context == fork_context_intel) { |
| /* TODO this sucks, use the compiler itself to pass args! :) */ |
| master_th->th.th_serial_team->t.t_ident = loc; |
| if (!ap) { |
| // revert change made in __kmpc_serialized_parallel() |
| master_th->th.th_serial_team->t.t_level--; |
| // Get args from parent team for teams construct |
| |
| #if OMPT_SUPPORT |
| void *dummy; |
| void **exit_frame_p; |
| ompt_task_info_t *task_info; |
| ompt_lw_taskteam_t lw_taskteam; |
| |
| if (ompt_enabled.enabled) { |
| __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, |
| ompt_parallel_data, *return_address); |
| |
| __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); |
// Don't use lw_taskteam after linking. Content was swapped.
| task_info = OMPT_CUR_TASK_INFO(master_th); |
| exit_frame_p = &(task_info->frame.exit_frame.ptr); |
| if (ompt_enabled.ompt_callback_implicit_task) { |
| OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid); |
| ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( |
| ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), |
| &(task_info->task_data), 1, |
| OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); |
| } |
| |
| /* OMPT state */ |
| master_th->th.ompt_thread_info.state = ompt_state_work_parallel; |
| } else { |
| exit_frame_p = &dummy; |
| } |
| #endif |
| |
| { |
| KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); |
| KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); |
| __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv |
| #if OMPT_SUPPORT |
| , |
| exit_frame_p |
| #endif |
| ); |
| } |
| |
| #if OMPT_SUPPORT |
| if (ompt_enabled.enabled) { |
| *exit_frame_p = NULL; |
| if (ompt_enabled.ompt_callback_implicit_task) { |
| ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( |
| ompt_scope_end, NULL, &(task_info->task_data), 1, |
| OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); |
| } |
| *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); |
| __ompt_lw_taskteam_unlink(master_th); |
| if (ompt_enabled.ompt_callback_parallel_end) { |
| ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( |
| ompt_parallel_data, *parent_task_data, |
| OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address); |
| } |
| master_th->th.ompt_thread_info.state = ompt_state_overhead; |
| } |
| #endif |
| } else if (microtask == (microtask_t)__kmp_teams_master) { |
| KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team); |
| team = master_th->th.th_team; |
| // team->t.t_pkfn = microtask; |
| team->t.t_invoke = invoker; |
| __kmp_alloc_argv_entries(argc, team, TRUE); |
| team->t.t_argc = argc; |
| argv = (void **)team->t.t_argv; |
| for (i = argc - 1; i >= 0; --i) |
| *argv++ = va_arg(kmp_va_deref(ap), void *); |
| // AC: revert change made in __kmpc_serialized_parallel() |
| // because initial code in teams should have level=0 |
| team->t.t_level--; |
| // AC: call special invoker for outer "parallel" of teams construct |
| invoker(gtid); |
| #if OMPT_SUPPORT |
| if (ompt_enabled.enabled) { |
| ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th); |
| if (ompt_enabled.ompt_callback_implicit_task) { |
| ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( |
| ompt_scope_end, NULL, &(task_info->task_data), 0, |
| OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial); |
| } |
| if (ompt_enabled.ompt_callback_parallel_end) { |
| ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( |
| ompt_parallel_data, *parent_task_data, |
| OMPT_INVOKER(call_context) | ompt_parallel_league, |
| *return_address); |
| } |
| master_th->th.ompt_thread_info.state = ompt_state_overhead; |
| } |
| #endif |
| } else { |
| argv = args; |
| for (i = argc - 1; i >= 0; --i) |
| *argv++ = va_arg(kmp_va_deref(ap), void *); |
| KMP_MB(); |
| |
| #if OMPT_SUPPORT |
| void *dummy; |
| void **exit_frame_p; |
| ompt_task_info_t *task_info; |
| ompt_lw_taskteam_t lw_taskteam; |
| ompt_data_t *implicit_task_data; |
| |
| if (ompt_enabled.enabled) { |
| __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, |
| ompt_parallel_data, *return_address); |
| __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); |
// Don't use lw_taskteam after linking. Content was swapped.
| task_info = OMPT_CUR_TASK_INFO(master_th); |
| exit_frame_p = &(task_info->frame.exit_frame.ptr); |
| |
| /* OMPT implicit task begin */ |
| implicit_task_data = OMPT_CUR_TASK_DATA(master_th); |
| if (ompt_enabled.ompt_callback_implicit_task) { |
| ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( |
| ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), |
| implicit_task_data, 1, __kmp_tid_from_gtid(gtid), |
| ompt_task_implicit); |
| OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid); |
| } |
| |
| /* OMPT state */ |
| master_th->th.ompt_thread_info.state = ompt_state_work_parallel; |
| } else { |
| exit_frame_p = &dummy; |
| } |
| #endif |
| |
| { |
| KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); |
| KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); |
| __kmp_invoke_microtask(microtask, gtid, 0, argc, args |
| #if OMPT_SUPPORT |
| , |
| exit_frame_p |
| #endif |
| ); |
| } |
| |
| #if OMPT_SUPPORT |
| if (ompt_enabled.enabled) { |
| *exit_frame_p = NULL; |
| if (ompt_enabled.ompt_callback_implicit_task) { |
| ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( |
| ompt_scope_end, NULL, &(task_info->task_data), 1, |
| OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); |
| } |
| |
| *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); |
| __ompt_lw_taskteam_unlink(master_th); |
| if (ompt_enabled.ompt_callback_parallel_end) { |
| ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( |
| ompt_parallel_data, *parent_task_data, |
| OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address); |
| } |
| master_th->th.ompt_thread_info.state = ompt_state_overhead; |
| } |
| #endif |
| } |
| } else if (call_context == fork_context_gnu) { |
| #if OMPT_SUPPORT |
| if (ompt_enabled.enabled) { |
| ompt_lw_taskteam_t lwt; |
| __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data, |
| *return_address); |
| |
| lwt.ompt_task_info.frame.exit_frame = ompt_data_none; |
| __ompt_lw_taskteam_link(&lwt, master_th, 1); |
| } |
// Don't use lw_taskteam after linking. Content was swapped.
| #endif |
| |
| // we were called from GNU native code |
| KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid)); |
| return FALSE; |
| } else { |
| KMP_ASSERT2(call_context < fork_context_last, |
| "__kmp_serial_fork_call: unknown fork_context parameter"); |
| } |
| |
| KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid)); |
| KMP_MB(); |
| return FALSE; |
| } |
| |
| /* most of the work for a fork */ |
| /* return true if we really went parallel, false if serialized */ |
| int __kmp_fork_call(ident_t *loc, int gtid, |
| enum fork_context_e call_context, // Intel, GNU, ... |
| kmp_int32 argc, microtask_t microtask, launch_t invoker, |
| kmp_va_list ap) { |
| void **argv; |
| int i; |
| int master_tid; |
| int master_this_cons; |
| kmp_team_t *team; |
| kmp_team_t *parent_team; |
| kmp_info_t *master_th; |
| kmp_root_t *root; |
| int nthreads; |
| int master_active; |
| int master_set_numthreads; |
| int task_thread_limit = 0; |
| int level; |
| int active_level; |
| int teams_level; |
| #if KMP_NESTED_HOT_TEAMS |
| kmp_hot_team_ptr_t **p_hot_teams; |
| #endif |
| { // KMP_TIME_BLOCK |
| KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call); |
| KMP_COUNT_VALUE(OMP_PARALLEL_args, argc); |
| |
| KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid)); |
| if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) { |
| /* Some systems prefer the stack for the root thread(s) to start with */ |
| /* some gap from the parent stack to prevent false sharing. */ |
| void *dummy = KMP_ALLOCA(__kmp_stkpadding); |
| /* These 2 lines below are so this does not get optimized out */ |
| if (__kmp_stkpadding > KMP_MAX_STKPADDING) |
| __kmp_stkpadding += (short)((kmp_int64)dummy); |
| } |
| |
| /* initialize if needed */ |
| KMP_DEBUG_ASSERT( |
| __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown |
| if (!TCR_4(__kmp_init_parallel)) |
| __kmp_parallel_initialize(); |
| __kmp_resume_if_soft_paused(); |
| |
| /* setup current data */ |
| // AC: potentially unsafe, not in sync with library shutdown, |
| // __kmp_threads can be freed |
| master_th = __kmp_threads[gtid]; |
| |
| parent_team = master_th->th.th_team; |
| master_tid = master_th->th.th_info.ds.ds_tid; |
| master_this_cons = master_th->th.th_local.this_construct; |
| root = master_th->th.th_root; |
| master_active = root->r.r_active; |
| master_set_numthreads = master_th->th.th_set_nproc; |
| task_thread_limit = |
| master_th->th.th_current_task->td_icvs.task_thread_limit; |
| |
| #if OMPT_SUPPORT |
| ompt_data_t ompt_parallel_data = ompt_data_none; |
| ompt_data_t *parent_task_data = NULL; |
| ompt_frame_t *ompt_frame = NULL; |
| void *return_address = NULL; |
| |
| if (ompt_enabled.enabled) { |
| __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame, |
| NULL, NULL); |
| return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); |
| } |
| #endif |
| |
| // Assign affinity to root thread if it hasn't happened yet |
| __kmp_assign_root_init_mask(); |
| |
| // Nested level will be an index in the nested nthreads array |
| level = parent_team->t.t_level; |
| // used to launch non-serial teams even if nested is not allowed |
| active_level = parent_team->t.t_active_level; |
| // needed to check nesting inside the teams |
| teams_level = master_th->th.th_teams_level; |
| #if KMP_NESTED_HOT_TEAMS |
| p_hot_teams = &master_th->th.th_hot_teams; |
| if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) { |
| *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate( |
| sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level); |
| (*p_hot_teams)[0].hot_team = root->r.r_hot_team; |
// it is either the actual hot team or not needed (when active_level > 0)
| (*p_hot_teams)[0].hot_team_nth = 1; |
| } |
| #endif |
| |
| #if OMPT_SUPPORT |
| if (ompt_enabled.enabled) { |
| if (ompt_enabled.ompt_callback_parallel_begin) { |
| int team_size = master_set_numthreads |
| ? master_set_numthreads |
| : get__nproc_2(parent_team, master_tid); |
| int flags = OMPT_INVOKER(call_context) | |
| ((microtask == (microtask_t)__kmp_teams_master) |
| ? ompt_parallel_league |
| : ompt_parallel_team); |
| ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( |
| parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags, |
| return_address); |
| } |
| master_th->th.ompt_thread_info.state = ompt_state_overhead; |
| } |
| #endif |
| |
| master_th->th.th_ident = loc; |
| |
| // Parallel closely nested in teams construct: |
| if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) { |
| return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root, |
| call_context, microtask, invoker, |
| master_set_numthreads, level, |
| #if OMPT_SUPPORT |
| ompt_parallel_data, return_address, |
| #endif |
| ap); |
| } // End parallel closely nested in teams construct |
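// A user-code sketch of the nesting detected above (hypothetical example):
// a parallel construct immediately inside a teams construct, for which the
// existing teams-level team is reused rather than allocating a new one:
//   #pragma omp teams
//   #pragma omp parallel
//   { /* ... */ }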
| |
| // Need this to happen before we determine the number of threads, not while |
| // we are allocating the team |
| //__kmp_push_current_task_to_thread(master_th, parent_team, 0); |
| |
| KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(parent_team, master_th); |
| |
| // Determine the number of threads |
| int enter_teams = |
| __kmp_is_entering_teams(active_level, level, teams_level, ap); |
| if ((!enter_teams && |
| (parent_team->t.t_active_level >= |
| master_th->th.th_current_task->td_icvs.max_active_levels)) || |
| (__kmp_library == library_serial)) { |
| KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid)); |
| nthreads = 1; |
| } else { |
| nthreads = master_set_numthreads |
| ? master_set_numthreads |
| // TODO: get nproc directly from current task |
| : get__nproc_2(parent_team, master_tid); |
// Use the thread_limit set for the current target task if one exists;
// otherwise go with the deduced nthreads
| nthreads = task_thread_limit > 0 && task_thread_limit < nthreads |
| ? task_thread_limit |
| : nthreads; |
// Check whether we need to take the forkjoin lock (no need for a serialized
// parallel region outside of a teams construct).
| if (nthreads > 1) { |
| /* determine how many new threads we can use */ |
| __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); |
/* AC: If we execute teams from a parallel region (on the host), then the
teams should be created, but each can only have 1 thread if nesting is
disabled. If teams is called from a serial region, then the teams and
their threads should be created regardless of the nesting setting. */
| nthreads = __kmp_reserve_threads(root, parent_team, master_tid, |
| nthreads, enter_teams); |
| if (nthreads == 1) { |
| // Free lock for single thread execution here; for multi-thread |
| // execution it will be freed later after team of threads created |
| // and initialized |
| __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); |
| } |
| } |
| } |
| KMP_DEBUG_ASSERT(nthreads > 0); |
| |
| // If we temporarily changed the set number of threads then restore it now |
| master_th->th.th_set_nproc = 0; |
| |
| if (nthreads == 1) { |
| return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask, |
| invoker, master_th, parent_team, |
| #if OMPT_SUPPORT |
| &ompt_parallel_data, &return_address, |
| &parent_task_data, |
| #endif |
| ap); |
| } // if (nthreads == 1) |
| |
// GEH: only modify the executing flag when not serialized; the serialized
// case is handled in __kmpc_serialized_parallel
| KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, " |
| "curtask=%p, curtask_max_aclevel=%d\n", |
| parent_team->t.t_active_level, master_th, |
| master_th->th.th_current_task, |
| master_th->th.th_current_task->td_icvs.max_active_levels)); |
| // TODO: GEH - cannot do this assertion because root thread not set up as |
| // executing |
| // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); |
| master_th->th.th_current_task->td_flags.executing = 0; |
| |
| if (!master_th->th.th_teams_microtask || level > teams_level) { |
| /* Increment our nested depth level */ |
| KMP_ATOMIC_INC(&root->r.r_in_parallel); |
| } |
| |
| // See if we need to make a copy of the ICVs. |
| int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; |
| kmp_nested_nthreads_t *nested_nth = NULL; |
| if (!master_th->th.th_set_nested_nth && |
| (level + 1 < parent_team->t.t_nested_nth->used) && |
| (parent_team->t.t_nested_nth->nth[level + 1] != nthreads_icv)) { |
| nthreads_icv = parent_team->t.t_nested_nth->nth[level + 1]; |
| } else if (master_th->th.th_set_nested_nth) { |
| nested_nth = __kmp_override_nested_nth(master_th, level); |
| if ((level + 1 < nested_nth->used) && |
| (nested_nth->nth[level + 1] != nthreads_icv)) |
| nthreads_icv = nested_nth->nth[level + 1]; |
| else |
| nthreads_icv = 0; // don't update |
| } else { |
| nthreads_icv = 0; // don't update |
| } |
| |
| // Figure out the proc_bind_policy for the new team. |
| kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; |
| // proc_bind_default means don't update |
| kmp_proc_bind_t proc_bind_icv = proc_bind_default; |
| if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { |
| proc_bind = proc_bind_false; |
| } else { |
| // No proc_bind clause specified; use current proc-bind-var for this |
| // parallel region |
| if (proc_bind == proc_bind_default) { |
| proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; |
| } |
| // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND |
| if (master_th->th.th_teams_microtask && |
| microtask == (microtask_t)__kmp_teams_master) { |
| proc_bind = __kmp_teams_proc_bind; |
| } |
| /* else: The proc_bind policy was specified explicitly on parallel clause. |
| This overrides proc-bind-var for this parallel region, but does not |
| change proc-bind-var. */ |
| // Figure the value of proc-bind-var for the child threads. |
| if ((level + 1 < __kmp_nested_proc_bind.used) && |
| (__kmp_nested_proc_bind.bind_types[level + 1] != |
| master_th->th.th_current_task->td_icvs.proc_bind)) { |
// Do not modify the proc-bind ICV for the two teams-construct forks;
// they just let the proc-bind ICV pass through
| if (!master_th->th.th_teams_microtask || |
| !(microtask == (microtask_t)__kmp_teams_master || ap == NULL)) |
| proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; |
| } |
| } |
| |
| // Reset for next parallel region |
| master_th->th.th_set_proc_bind = proc_bind_default; |
| |
| if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) { |
| kmp_internal_control_t new_icvs; |
| copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); |
| new_icvs.next = NULL; |
| if (nthreads_icv > 0) { |
| new_icvs.nproc = nthreads_icv; |
| } |
| if (proc_bind_icv != proc_bind_default) { |
| new_icvs.proc_bind = proc_bind_icv; |
| } |
| |
| /* allocate a new parallel team */ |
| KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); |
| team = __kmp_allocate_team(root, nthreads, nthreads, |
| #if OMPT_SUPPORT |
| ompt_parallel_data, |
| #endif |
| proc_bind, &new_icvs, |
| argc USE_NESTED_HOT_ARG(master_th)); |
| if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) |
| copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs); |
| } else { |
| /* allocate a new parallel team */ |
| KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); |
| team = __kmp_allocate_team(root, nthreads, nthreads, |
| #if OMPT_SUPPORT |
| ompt_parallel_data, |
| #endif |
| proc_bind, |
| &master_th->th.th_current_task->td_icvs, |
| argc USE_NESTED_HOT_ARG(master_th)); |
| if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) |
| copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, |
| &master_th->th.th_current_task->td_icvs); |
| } |
| KF_TRACE( |
| 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); |
| |
| /* setup the new team */ |
| KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); |
| KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); |
| KMP_CHECK_UPDATE(team->t.t_ident, loc); |
| KMP_CHECK_UPDATE(team->t.t_parent, parent_team); |
| KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); |
| #if OMPT_SUPPORT |
| KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, |
| return_address); |
| #endif |
| KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe |
| // TODO: parent_team->t.t_level == INT_MAX ??? |
| if (!master_th->th.th_teams_microtask || level > teams_level) { |
| int new_level = parent_team->t.t_level + 1; |
| KMP_CHECK_UPDATE(team->t.t_level, new_level); |
| new_level = parent_team->t.t_active_level + 1; |
| KMP_CHECK_UPDATE(team->t.t_active_level, new_level); |
| } else { |
| // AC: Do not increase parallel level at start of the teams construct |
| int new_level = parent_team->t.t_level; |
| KMP_CHECK_UPDATE(team->t.t_level, new_level); |
| new_level = parent_team->t.t_active_level; |
| KMP_CHECK_UPDATE(team->t.t_active_level, new_level); |
| } |
| kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); |
| // set primary thread's schedule as new run-time schedule |
| KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); |
| |
| KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); |
| KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); |
| |
// Check if the hot team has a potentially outdated nested-nth list and, if
// so, free it
| if (team->t.t_nested_nth && |
| team->t.t_nested_nth != parent_team->t.t_nested_nth) { |
| KMP_INTERNAL_FREE(team->t.t_nested_nth->nth); |
| KMP_INTERNAL_FREE(team->t.t_nested_nth); |
| team->t.t_nested_nth = NULL; |
| } |
| team->t.t_nested_nth = parent_team->t.t_nested_nth; |
| if (master_th->th.th_set_nested_nth) { |
| if (!nested_nth) |
| nested_nth = __kmp_override_nested_nth(master_th, level); |
| team->t.t_nested_nth = nested_nth; |
| KMP_INTERNAL_FREE(master_th->th.th_set_nested_nth); |
| master_th->th.th_set_nested_nth = NULL; |
| master_th->th.th_set_nested_nth_sz = 0; |
| master_th->th.th_nt_strict = false; |
| } |
| |
| // Update the floating point rounding in the team if required. |
| propagateFPControl(team); |
| #if OMPD_SUPPORT |
| if (ompd_state & OMPD_ENABLE_BP) |
| ompd_bp_parallel_begin(); |
| #endif |
| |
| KA_TRACE( |
| 20, |
| ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", |
| gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, |
| team->t.t_nproc)); |
| KMP_DEBUG_ASSERT(team != root->r.r_hot_team || |
| (team->t.t_master_tid == 0 && |
| (team->t.t_parent == root->r.r_root_team || |
| team->t.t_parent->t.t_serialized))); |
| KMP_MB(); |
| |
/* now, set up the arguments */
| argv = (void **)team->t.t_argv; |
| if (ap) { |
| for (i = argc - 1; i >= 0; --i) { |
| void *new_argv = va_arg(kmp_va_deref(ap), void *); |
| KMP_CHECK_UPDATE(*argv, new_argv); |
| argv++; |
| } |
| } else { |
| for (i = 0; i < argc; ++i) { |
| // Get args from parent team for teams construct |
| KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); |
| } |
| } |
| |
| /* now actually fork the threads */ |
| KMP_CHECK_UPDATE(team->t.t_master_active, master_active); |
| if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong |
| root->r.r_active = TRUE; |
| |
| __kmp_fork_team_threads(root, team, master_th, gtid, !ap); |
| __kmp_setup_icv_copy(team, nthreads, |
| &master_th->th.th_current_task->td_icvs, loc); |
| |
| #if OMPT_SUPPORT |
| master_th->th.ompt_thread_info.state = ompt_state_work_parallel; |
| #endif |
| |
| __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); |
| |
| #if USE_ITT_BUILD |
| if (team->t.t_active_level == 1 // only report frames at level 1 |
| && !master_th->th.th_teams_microtask) { // not in teams construct |
| #if USE_ITT_NOTIFY |
| if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && |
| (__kmp_forkjoin_frames_mode == 3 || |
| __kmp_forkjoin_frames_mode == 1)) { |
| kmp_uint64 tmp_time = 0; |
| if (__itt_get_timestamp_ptr) |
| tmp_time = __itt_get_timestamp(); |
| // Internal fork - report frame begin |
| master_th->th.th_frame_time = tmp_time; |
| if (__kmp_forkjoin_frames_mode == 3) |
| team->t.t_region_time = tmp_time; |
| } else |
// only one notification scheme (either "submit" or "forking/joined", not
// both)
| #endif /* USE_ITT_NOTIFY */ |
| if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && |
| __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { |
| // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. |
| __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); |
| } |
| } |
| #endif /* USE_ITT_BUILD */ |
| |
| /* now go on and do the work */ |
| KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); |
| KMP_MB(); |
| KF_TRACE(10, |
| ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", |
| root, team, master_th, gtid)); |
| |
| #if USE_ITT_BUILD |
| if (__itt_stack_caller_create_ptr) { |
| // create new stack stitching id before entering fork barrier |
| if (!enter_teams) { |
| KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL); |
| team->t.t_stack_id = __kmp_itt_stack_caller_create(); |
| } else if (parent_team->t.t_serialized) { |
| // keep stack stitching id in the serialized parent_team; |
| // current team will be used for parallel inside the teams; |
| // if parent_team is active, then it already keeps stack stitching id |
| // for the league of teams |
| KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); |
| parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); |
| } |
| } |
| #endif /* USE_ITT_BUILD */ |
| |
| // AC: skip __kmp_internal_fork at teams construct, let only primary |
| // threads execute |
| if (ap) { |
| __kmp_internal_fork(loc, gtid, team); |
| KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " |
| "master_th=%p, gtid=%d\n", |
| root, team, master_th, gtid)); |
| } |
| |
| if (call_context == fork_context_gnu) { |
| KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); |
| return TRUE; |
| } |
| |
| /* Invoke microtask for PRIMARY thread */ |
| KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, |
| team->t.t_id, team->t.t_pkfn)); |
| } // END of timer KMP_fork_call block |
| |
| #if KMP_STATS_ENABLED |
| // If beginning a teams construct, then change thread state |
| stats_state_e previous_state = KMP_GET_THREAD_STATE(); |
| if (!ap) { |
| KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION); |
| } |
| #endif |
| |
| if (!team->t.t_invoke(gtid)) { |
| KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); |
| } |
| |
| #if KMP_STATS_ENABLED |
// If this was the beginning of a teams construct, then reset the thread state
| if (!ap) { |
| KMP_SET_THREAD_STATE(previous_state); |
| } |
| #endif |
| |
| KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, |
| team->t.t_id, team->t.t_pkfn)); |
| KMP_MB(); /* Flush all pending memory write invalidates. */ |
| |
| KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); |
| #if OMPT_SUPPORT |
| if (ompt_enabled.enabled) { |
| master_th->th.ompt_thread_info.state = ompt_state_overhead; |
| } |
| #endif |
| |
| return TRUE; |
| } |
| |
| #if OMPT_SUPPORT |
| static inline void __kmp_join_restore_state(kmp_info_t *thread, |
| kmp_team_t *team) { |
| // restore state outside the region |
| thread->th.ompt_thread_info.state = |
| ((team->t.t_serialized) ? ompt_state_work_serial |
| : ompt_state_work_parallel); |
| } |
| |
| static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, |
| kmp_team_t *team, ompt_data_t *parallel_data, |
| int flags, void *codeptr) { |
| ompt_task_info_t *task_info = __ompt_get_task_info_object(0); |
| if (ompt_enabled.ompt_callback_parallel_end) { |
| ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( |
| parallel_data, &(task_info->task_data), flags, codeptr); |
| } |
| |
| task_info->frame.enter_frame = ompt_data_none; |
| __kmp_join_restore_state(thread, team); |
| } |
| #endif |
| |
| void __kmp_join_call(ident_t *loc, int gtid |
| #if OMPT_SUPPORT |
| , |
| enum fork_context_e fork_context |
| #endif |
| , |
| int exit_teams) { |
| KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); |
| kmp_team_t *team; |
| kmp_team_t *parent_team; |
| kmp_info_t *master_th; |
| kmp_root_t *root; |
| int master_active; |
| |
| KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); |
| |
| /* setup current data */ |
| master_th = __kmp_threads[gtid]; |
| root = master_th->th.th_root; |
| team = master_th->th.th_team; |
| parent_team = team->t.t_parent; |
| |
| master_th->th.th_ident = loc; |
| |
| #if OMPT_SUPPORT |
| void *team_microtask = (void *)team->t.t_pkfn; |
// For the GOMP interface with a serialized parallel region, we need
// __kmpc_end_serialized_parallel to call the hooks for the OMPT
// end-implicit-task and end-parallel events.
| if (ompt_enabled.enabled && |
| !(team->t.t_serialized && fork_context == fork_context_gnu)) { |
| master_th->th.ompt_thread_info.state = ompt_state_overhead; |
| } |
| #endif |
| |
| #if KMP_DEBUG |
| if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { |
| KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " |
| "th_task_team = %p\n", |
| __kmp_gtid_from_thread(master_th), team, |
| team->t.t_task_team[master_th->th.th_task_state], |
| master_th->th.th_task_team)); |
| KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, master_th); |
| } |
| #endif |
| |
| if (team->t.t_serialized) { |
| if (master_th->th.th_teams_microtask) { |
| // We are in teams construct |
| int level = team->t.t_level; |
| int tlevel = master_th->th.th_teams_level; |
| if (level == tlevel) { |
| // AC: we haven't incremented it earlier at start of teams construct, |
| // so do it here - at the end of teams construct |
| team->t.t_level++; |
| } else if (level == tlevel + 1) { |
| // AC: we are exiting parallel inside teams, need to increment |
| // serialization in order to restore it in the next call to |
| // __kmpc_end_serialized_parallel |
| team->t.t_serialized++; |
| } |
| } |
| __kmpc_end_serialized_parallel(loc, gtid); |
| |
| #if OMPT_SUPPORT |
| if (ompt_enabled.enabled) { |
| if (fork_context == fork_context_gnu) { |
| __ompt_lw_taskteam_unlink(master_th); |
| } |
| __kmp_join_restore_state(master_th, parent_team); |
| } |
| #endif |
| |
| return; |
| } |
| |
| master_active = team->t.t_master_active; |
| |
| if (!exit_teams) { |
// AC: No barrier for internal teams at exit from the teams construct,
// but there is a barrier for the external team (league).
| __kmp_internal_join(loc, gtid, team); |
| #if USE_ITT_BUILD |
| if (__itt_stack_caller_create_ptr) { |
| KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL); |
| // destroy the stack stitching id after join barrier |
| __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id); |
| team->t.t_stack_id = NULL; |
| } |
| #endif |
| } else { |
| master_th->th.th_task_state = |
| 0; // AC: no tasking in teams (out of any parallel) |
| #if USE_ITT_BUILD |
| if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) { |
| KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL); |
| // destroy the stack stitching id on exit from the teams construct |
| // if parent_team is active, then the id will be destroyed later on |
| // by master of the league of teams |
| __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id); |
| parent_team->t.t_stack_id = NULL; |
| } |
| #endif |
| } |
| |
| KMP_MB(); |
| |
| #if OMPT_SUPPORT |
| ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); |
| void *codeptr = team->t.ompt_team_info.master_return_address; |
| #endif |
| |
| #if USE_ITT_BUILD |
| // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. |
| if (team->t.t_active_level == 1 && |
| (!master_th->th.th_teams_microtask || /* not in teams construct */ |
| master_th->th.th_teams_size.nteams == 1)) { |
| master_th->th.th_ident = loc; |
| // only one notification scheme (either "submit" or "forking/joined", not |
| // both) |
| if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && |
| __kmp_forkjoin_frames_mode == 3) |
| __kmp_itt_frame_submit(gtid, team->t.t_region_time, |
| master_th->th.th_frame_time, 0, loc, |
| master_th->th.th_team_nproc, 1); |
| else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && |
| !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) |
| __kmp_itt_region_joined(gtid); |
| } // active_level == 1 |
| #endif /* USE_ITT_BUILD */ |
| |
| #if KMP_AFFINITY_SUPPORTED |
| if (!exit_teams) { |
| // Restore master thread's partition. |
| master_th->th.th_first_place = team->t.t_first_place; |
| master_th->th.th_last_place = team->t.t_last_place; |
| } |
| #endif // KMP_AFFINITY_SUPPORTED |
| |
| if (master_th->th.th_teams_microtask && !exit_teams && |
| team->t.t_pkfn != (microtask_t)__kmp_teams_master && |
| team->t.t_level == master_th->th.th_teams_level + 1) { |
// AC: We need to leave the team structure intact at the end of a parallel
// region inside the teams construct, so that the next parallel region reuses
// the same (hot) team; only adjust the nesting levels
| #if OMPT_SUPPORT |
| ompt_data_t ompt_parallel_data = ompt_data_none; |
| if (ompt_enabled.enabled) { |
| ompt_task_info_t *task_info = __ompt_get_task_info_object(0); |
| if (ompt_enabled.ompt_callback_implicit_task) { |
| int ompt_team_size = team->t.t_nproc; |
| ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( |
| ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, |
| OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); |
| } |
| task_info->frame.exit_frame = ompt_data_none; |
| task_info->task_data = ompt_data_none; |
| ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); |
| __ompt_lw_taskteam_unlink(master_th); |
| } |
| #endif |
| /* Decrement our nested depth level */ |
| team->t.t_level--; |
| team->t.t_active_level--; |
| KMP_ATOMIC_DEC(&root->r.r_in_parallel); |
| |
| // Restore number of threads in the team if needed. This code relies on |
| // the proper adjustment of th_teams_size.nth after the fork in |
| // __kmp_teams_master on each teams primary thread in the case that |
| // __kmp_reserve_threads reduced it. |
| if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { |
| int old_num = master_th->th.th_team_nproc; |
| int new_num = master_th->th.th_teams_size.nth; |
| kmp_info_t **other_threads = team->t.t_threads; |
| team->t.t_nproc = new_num; |
| for (int i = 0; i < old_num; ++i) { |
| other_threads[i]->th.th_team_nproc = new_num; |
| } |
// Adjust the states of the threads that were unused in the team
| for (int i = old_num; i < new_num; ++i) { |
| // Re-initialize thread's barrier data. |
| KMP_DEBUG_ASSERT(other_threads[i]); |
| kmp_balign_t *balign = other_threads[i]->th.th_bar; |
| for (int b = 0; b < bs_last_barrier; ++b) { |
| balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; |
| KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); |
| #if USE_DEBUGGER |
| balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; |
| #endif |
| } |
| if (__kmp_tasking_mode != tskm_immediate_exec) { |
| // Synchronize thread's task state |
| other_threads[i]->th.th_task_state = master_th->th.th_task_state; |
| } |
| } |
| } |
| |
| #if OMPT_SUPPORT |
| if (ompt_enabled.enabled) { |
| __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data, |
| OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr); |
| } |
| #endif |
| |
| return; |
| } |
| |
| /* do cleanup and restore the parent team */ |
| master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; |
| master_th->th.th_local.this_construct = team->t.t_master_this_cons; |
| |
| master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; |
| |
| /* jc: The following lock has instructions with REL and ACQ semantics, |
| separating the parallel user code called in this parallel region |
| from the serial user code called after this function returns. */ |
| __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); |
| |
| if (!master_th->th.th_teams_microtask || |
| team->t.t_level > master_th->th.th_teams_level) { |
| /* Decrement our nested depth level */ |
| KMP_ATOMIC_DEC(&root->r.r_in_parallel); |
| } |
| KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); |
| |
| #if OMPT_SUPPORT |
| if (ompt_enabled.enabled) { |
| ompt_task_info_t *task_info = __ompt_get_task_info_object(0); |
| if (ompt_enabled.ompt_callback_implicit_task) { |
| int flags = (team_microtask == (void *)__kmp_teams_master) |
| ? ompt_task_initial |
| : ompt_task_implicit; |
| int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc; |
| ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( |
| ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, |
| OMPT_CUR_TASK_INFO(master_th)->thread_num, flags); |
| } |
| task_info->frame.exit_frame = ompt_data_none; |
| task_info->task_data = ompt_data_none; |
| } |
| #endif |
| |
| KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, |
| master_th, team)); |
| __kmp_pop_current_task_from_thread(master_th); |
| |
| master_th->th.th_def_allocator = team->t.t_def_allocator; |
| |
| #if OMPD_SUPPORT |
| if (ompd_state & OMPD_ENABLE_BP) |
| ompd_bp_parallel_end(); |
| #endif |
| updateHWFPControl(team); |
| |
| if (root->r.r_active != master_active) |
| root->r.r_active = master_active; |
| |
| __kmp_free_team(root, team USE_NESTED_HOT_ARG( |
| master_th)); // this will free worker threads |
| |
/* This race was fun to find. Make sure the following is in the critical
   region; otherwise assertions may fail occasionally since the old team may
   be reallocated and the hierarchy appears inconsistent. It is actually safe
   to run and won't cause any bugs, but will cause those assertion failures.
   It's only one deref & assign, so we might as well put it in the critical
   region. */
| master_th->th.th_team = parent_team; |
| master_th->th.th_team_nproc = parent_team->t.t_nproc; |
| master_th->th.th_team_master = parent_team->t.t_threads[0]; |
| master_th->th.th_team_serialized = parent_team->t.t_serialized; |
| |
| /* restore serialized team, if need be */ |
| if (parent_team->t.t_serialized && |
| parent_team != master_th->th.th_serial_team && |
| parent_team != root->r.r_root_team) { |
| __kmp_free_team(root, |
| master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); |
| master_th->th.th_serial_team = parent_team; |
| } |
| |
| if (__kmp_tasking_mode != tskm_immediate_exec) { |
| // Restore primary thread's task state from team structure |
| KMP_DEBUG_ASSERT(team->t.t_primary_task_state == 0 || |
| team->t.t_primary_task_state == 1); |
| master_th->th.th_task_state = (kmp_uint8)team->t.t_primary_task_state; |
| |
| // Copy the task team from the parent team to the primary thread |
| master_th->th.th_task_team = |
| parent_team->t.t_task_team[master_th->th.th_task_state]; |
| KA_TRACE(20, |
| ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n", |
| __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, |
| parent_team)); |
| } |
| |
| // TODO: GEH - cannot do this assertion because root thread not set up as |
| // executing |
| // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); |
| master_th->th.th_current_task->td_flags.executing = 1; |
| |
| __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); |
| |
| #if KMP_AFFINITY_SUPPORTED |
| if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) { |
| __kmp_reset_root_init_mask(gtid); |
| } |
| #endif |
| #if OMPT_SUPPORT |
| int flags = |
| OMPT_INVOKER(fork_context) | |
| ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league |
| : ompt_parallel_team); |
| if (ompt_enabled.enabled) { |
| __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags, |
| codeptr); |
| } |
| #endif |
| |
| KMP_MB(); |
| KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); |
| } |
| |
| /* Check whether we should push an internal control record onto the |
| serial team stack. If so, do it. */ |
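/* Informally: a record is pushed only when the thread is executing inside a
   nested, serialized parallel region (t_serialized > 1) and no record exists
   yet for that nesting level, so that ICV changes made there can be rolled
   back when the serialized region ends. */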
| void __kmp_save_internal_controls(kmp_info_t *thread) { |
| |
| if (thread->th.th_team != thread->th.th_serial_team) { |
| return; |
| } |
| if (thread->th.th_team->t.t_serialized > 1) { |
| int push = 0; |
| |
| if (thread->th.th_team->t.t_control_stack_top == NULL) { |
| push = 1; |
| } else { |
| if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != |
| thread->th.th_team->t.t_serialized) { |
| push = 1; |
| } |
| } |
| if (push) { /* push a record on the serial team's stack */ |
| kmp_internal_control_t *control = |
| (kmp_internal_control_t *)__kmp_allocate( |
| sizeof(kmp_internal_control_t)); |
| |
| copy_icvs(control, &thread->th.th_current_task->td_icvs); |
| |
| control->serial_nesting_level = thread->th.th_team->t.t_serialized; |
| |
| control->next = thread->th.th_team->t.t_control_stack_top; |
| thread->th.th_team->t.t_control_stack_top = control; |
| } |
| } |
| } |
| |
| /* Changes set_nproc */ |
| void __kmp_set_num_threads(int new_nth, int gtid) { |
| kmp_info_t *thread; |
| kmp_root_t *root; |
| |
| KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); |
| KMP_DEBUG_ASSERT(__kmp_init_serial); |
| |
| if (new_nth < 1) |
| new_nth = 1; |
| else if (new_nth > __kmp_max_nth) |
| new_nth = __kmp_max_nth; |
| |
| KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); |
| thread = __kmp_threads[gtid]; |
| if (thread->th.th_current_task->td_icvs.nproc == new_nth) |
| return; // nothing to do |
| |
| __kmp_save_internal_controls(thread); |
| |
| set__nproc(thread, new_nth); |
| |
| // If this omp_set_num_threads() call will cause the hot team size to be |
| // reduced (in the absence of a num_threads clause), then reduce it now, |
| // rather than waiting for the next parallel region. |
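// E.g. (user-code sketch, hypothetical): after
//   omp_set_num_threads(2);
// on an idle root whose hot team still holds 8 threads, the 6 extra threads
// are released below instead of lingering until the next parallel region.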
| root = thread->th.th_root; |
| if (__kmp_init_parallel && (!root->r.r_active) && |
| (root->r.r_hot_team->t.t_nproc > new_nth) |
| #if KMP_NESTED_HOT_TEAMS |
| && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode |
| #endif |
| ) { |
| kmp_team_t *hot_team = root->r.r_hot_team; |
| int f; |
| |
| __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); |
| |
| if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { |
| __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth); |
| } |
| // Release the extra threads we don't need any more. |
| for (f = new_nth; f < hot_team->t.t_nproc; f++) { |
| KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); |
| if (__kmp_tasking_mode != tskm_immediate_exec) { |
// When decreasing the team size, threads no longer in the team should
// unreference the task team.
| hot_team->t.t_threads[f]->th.th_task_team = NULL; |
| } |
| __kmp_free_thread(hot_team->t.t_threads[f]); |
| hot_team->t.t_threads[f] = NULL; |
| } |
| hot_team->t.t_nproc = new_nth; |
| #if KMP_NESTED_HOT_TEAMS |
| if (thread->th.th_hot_teams) { |
| KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); |
| thread->th.th_hot_teams[0].hot_team_nth = new_nth; |
| } |
| #endif |
| |
| if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { |
| hot_team->t.b->update_num_threads(new_nth); |
| __kmp_add_threads_to_team(hot_team, new_nth); |
| } |
| |
| __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); |
| |
| // Update the t_nproc field in the threads that are still active. |
| for (f = 0; f < new_nth; f++) { |
| KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); |
| hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; |
| } |
// Special flag to indicate that omp_set_num_threads() was called
| hot_team->t.t_size_changed = -1; |
| } |
| |