blob: 2c8d9304c46bc25885d47908421fbff432779b1b [file] [log] [blame]
/*
* kmp_runtime.cpp -- KPTS runtime support library
*/
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_atomic.h"
#include "kmp_environment.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_itt.h"
#include "kmp_settings.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_wait_release.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_dispatch.h"
#include "kmp_utils.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif
#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif
#if OMPD_SUPPORT
#include "ompd-specific.h"
#endif
#if OMP_PROFILING_SUPPORT
#include "llvm/Support/TimeProfiler.h"
static char *ProfileTraceFile = nullptr;
#endif
/* these are temporary issues to be dealt with */
#define KMP_USE_PRCTL 0
#if KMP_OS_WINDOWS
#include <process.h>
#endif
#ifndef KMP_USE_SHM
// Windows and WASI do not need these include files as they don't use shared
// memory.
#else
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#define SHM_SIZE 1024
#endif
#if defined(KMP_GOMP_COMPAT)
char const __kmp_version_alt_comp[] =
KMP_VERSION_PREFIX "alternative compiler support: yes";
#endif /* defined(KMP_GOMP_COMPAT) */
char const __kmp_version_omp_api[] =
KMP_VERSION_PREFIX "API version: 5.0 (201611)";
#ifdef KMP_DEBUG
char const __kmp_version_lock[] =
KMP_VERSION_PREFIX "lock type: run time selectable";
#endif /* KMP_DEBUG */
#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
/* ------------------------------------------------------------------------ */
#if KMP_USE_MONITOR
kmp_info_t __kmp_monitor;
#endif
/* Forward declarations */
void __kmp_cleanup(void);
static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
int gtid);
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
kmp_internal_control_t *new_icvs,
ident_t *loc);
#if KMP_AFFINITY_SUPPORTED
static void __kmp_partition_places(kmp_team_t *team,
int update_master_only = 0);
#endif
static void __kmp_do_serial_initialize(void);
void __kmp_fork_barrier(int gtid, int tid);
void __kmp_join_barrier(int gtid);
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
kmp_internal_control_t *new_icvs, ident_t *loc);
#ifdef USE_LOAD_BALANCE
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
#endif
static int __kmp_expand_threads(int nNeed);
#if KMP_OS_WINDOWS
static int __kmp_unregister_root_other_thread(int gtid);
#endif
static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
int new_nthreads);
void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
// Build a replacement nested-nthreads list for a thread entering a nested
// region at the given level: entries [0, level] carry no override (zero), and
// the thread's th_set_nested_nth values (from index 1 onward) are appended
// after them. The caller owns the returned allocation.
static kmp_nested_nthreads_t *__kmp_override_nested_nth(kmp_info_t *thr,
                                                        int level) {
  kmp_nested_nthreads_t *result =
      (kmp_nested_nthreads_t *)KMP_INTERNAL_MALLOC(
          sizeof(kmp_nested_nthreads_t));
  int total = level + thr->th.th_set_nested_nth_sz;
  result->nth = (int *)KMP_INTERNAL_MALLOC(total * sizeof(int));
  // Levels up to and including the current one: no per-level setting.
  int idx = 0;
  while (idx <= level) {
    result->nth[idx] = 0;
    ++idx;
  }
  // Append the thread's per-level settings, skipping source element 0.
  for (int src = 1; idx < total; ++idx, ++src)
    result->nth[idx] = thr->th.th_set_nested_nth[src];
  result->size = result->used = total;
  return result;
}
/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique identifier of executing
thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
// Resolve the calling thread's gtid using the fastest mechanism enabled:
// a native TLS variable (gtid_mode >= 3), keyed thread-specific data
// (gtid_mode >= 2), or — as the fallback "internal algorithm" — a search for
// which registered thread's stack contains an address from this frame.
// Returns KMP_GTID_DNE if the gtid machinery is not yet initialized.
int __kmp_get_global_thread_id() {
  int i;
  kmp_info_t **other_threads;
  size_t stack_data; // local used only to obtain a current-stack address
  char *stack_addr;
  size_t stack_size;
  char *stack_base;
  KA_TRACE(
      1000,
      ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
       __kmp_nth, __kmp_all_nth));
  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
     a parallel region, made it return KMP_GTID_DNE to force serial_initialize
     by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
     __kmp_init_gtid for this to work. */
  if (!TCR_4(__kmp_init_gtid))
    return KMP_GTID_DNE;
#ifdef KMP_TDATA_GTID
  if (TCR_4(__kmp_gtid_mode) >= 3) {
    // Fast path: gtid is cached in a dedicated thread-local variable.
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
    return __kmp_gtid;
  }
#endif
  if (TCR_4(__kmp_gtid_mode) >= 2) {
    // Keyed TLS (e.g. pthread_getspecific) path.
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
    return __kmp_gtid_get_specific();
  }
  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
  // Fallback: locate ourselves by stack address. Take the address of a local
  // as "somewhere on the current stack" and compare against each registered
  // thread's recorded stack window.
  stack_addr = (char *)&stack_data;
  other_threads = __kmp_threads;
  /* ATT: The code below is a source of potential bugs due to unsynchronized
     access to __kmp_threads array. For example:
     1. Current thread loads other_threads[i] to thr and checks it, it is
        non-NULL.
     2. Current thread is suspended by OS.
     3. Another thread unregisters and finishes (debug versions of free()
        may fill memory with something like 0xEF).
     4. Current thread is resumed.
     5. Current thread reads junk from *thr.
     TODO: Fix it. --ln */
  for (i = 0; i < __kmp_threads_capacity; i++) {
    kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
    if (!thr)
      continue;
    stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
    stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
    /* stack grows down -- search through all of the active threads */
    if (stack_addr <= stack_base) {
      size_t stack_diff = stack_base - stack_addr;
      if (stack_diff <= stack_size) {
        /* The only way we can be closer than the allocated */
        /* stack size is if we are running on this thread. */
        // __kmp_gtid_get_specific can return negative value because this
        // function can be called by thread destructor. However, before the
        // thread destructor is called, the value of the corresponding
        // thread-specific data will be reset to NULL.
        KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
                         __kmp_gtid_get_specific() == i);
        return i;
      }
    }
  }
  /* get specific to try and determine our gtid */
  KA_TRACE(1000,
           ("*** __kmp_get_global_thread_id: internal alg. failed to find "
            "thread, using TLS\n"));
  i = __kmp_gtid_get_specific();
  /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
  /* if we havn't been assigned a gtid, then return code */
  if (i < 0)
    return i;
  // other_threads[i] can be nullptr at this point because the corresponding
  // thread could have already been destructed. It can happen when this function
  // is called in end library routine.
  if (!TCR_SYNC_PTR(other_threads[i]))
    return i;
  /* dynamically updated stack window for uber threads to avoid get_specific
     call */
  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
    // Window is fixed (not growable) yet the search above missed us: the
    // stack must have outgrown its recorded bounds.
    KMP_FATAL(StackOverflow, i);
  }
  // Grow the recorded stack window so the next internal-algorithm lookup
  // succeeds without a get_specific call.
  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
  if (stack_addr > stack_base) {
    // Current address is above the recorded base: raise the base and extend
    // the size by the same amount (stack grows down from the base).
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
                stack_base);
  } else {
    // Current address is below the recorded window: extend the size downward.
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            stack_base - stack_addr);
  }
  /* Reprint stack bounds for ubermaster since they have been refined */
  if (__kmp_storage_map) {
    char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
    char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
                                 other_threads[i]->th.th_info.ds.ds_stacksize,
                                 "th_%d stack (refinement)", i);
  }
  return i;
}
// Like __kmp_get_global_thread_id(), but registers the calling thread as a
// new root if it has no gtid yet (KMP_GTID_DNE), performing serial
// initialization first when necessary. Never returns a negative gtid.
int __kmp_get_global_thread_id_reg() {
  int gtid;
  if (!__kmp_init_serial) {
    gtid = KMP_GTID_DNE;
  } else
#ifdef KMP_TDATA_GTID
      if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
    gtid = __kmp_gtid;
  } else
#endif
      if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
    gtid = __kmp_gtid_get_specific();
  } else {
    KA_TRACE(1000,
             ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
    gtid = __kmp_get_global_thread_id();
  }
  /* we must be a new uber master sibling thread */
  if (gtid == KMP_GTID_DNE) {
    KA_TRACE(10,
             ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
              "Registering a new gtid.\n"));
    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
    // Re-check under the lock: another thread may have completed serial
    // initialization (which also registers this thread's root) meanwhile.
    if (!__kmp_init_serial) {
      __kmp_do_serial_initialize();
      gtid = __kmp_gtid_get_specific();
    } else {
      gtid = __kmp_register_root(FALSE);
    }
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
  }
  KMP_DEBUG_ASSERT(gtid >= 0);
  return gtid;
}
/* caller must hold forkjoin_lock */
/* caller must hold forkjoin_lock */
// Optionally print the given thread's stack bounds (KMP_STORAGE_MAP), then —
// when KMP_ENV_CHECKS is on and the thread is not an uber/root thread — scan
// all registered threads for overlapping stack ranges and abort with a fatal
// StackOverlap message if any overlap is found.
void __kmp_check_stack_overlap(kmp_info_t *th) {
  int f;
  char *stack_beg = NULL;
  char *stack_end = NULL;
  int gtid;
  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
  if (__kmp_storage_map) {
    stack_end = (char *)th->th.th_info.ds.ds_stackbase;
    stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
    gtid = __kmp_gtid_from_thread(th);
    if (gtid == KMP_GTID_MONITOR) {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%s stack (%s)", "mon",
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    } else {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%d stack (%s)", gtid,
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    }
  }
  /* No point in checking ubermaster threads since they use refinement and
   * cannot overlap */
  gtid = __kmp_gtid_from_thread(th);
  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
    KA_TRACE(10,
             ("__kmp_check_stack_overlap: performing extensive checking\n"));
    if (stack_beg == NULL) {
      // Bounds were not computed above (storage map off) — compute them now.
      stack_end = (char *)th->th.th_info.ds.ds_stackbase;
      stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
    }
    for (f = 0; f < __kmp_threads_capacity; f++) {
      kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
      if (f_th && f_th != th) {
        char *other_stack_end =
            (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
        char *other_stack_beg =
            other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
        // Overlap iff either end of our stack falls strictly inside the
        // other thread's stack range.
        if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
            (stack_end > other_stack_beg && stack_end < other_stack_end)) {
          /* Print the other stack values before the abort */
          if (__kmp_storage_map)
            __kmp_print_storage_map_gtid(
                -1, other_stack_beg, other_stack_end,
                (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
                "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
          __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
                      __kmp_msg_null);
        }
      }
    }
  }
  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
}
/* ------------------------------------------------------------------------ */
// Park the calling thread indefinitely (used on abort paths), yielding on
// every iteration so it does not monopolize a core. `done` is never set by
// this function; the loop is intentionally endless.
void __kmp_infinite_loop(void) {
  static int done = FALSE;

  for (;;) {
    if (done)
      break;
    KMP_YIELD(TRUE);
  }
}
#define MAX_MESSAGE 512
// Print one "OMP storage map:" line describing the object spanning [p1, p2)
// of the given size. The caller's `format` is spliced into a fixed template
// and the remaining variadic arguments are consumed by __kmp_vprintf. When
// KMP_PRINT_DATA_PLACEMENT is enabled and verbose mode is on, additionally
// report the NUMA node of each page in the range.
void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
                                  char const *format, ...) {
  char buffer[MAX_MESSAGE];
  va_list ap;
  va_start(ap, format);
  // Build the final format string first; `ap` still holds the arguments that
  // the embedded `format` expects.
  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
               p2, (unsigned long)size, format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
#if KMP_PRINT_DATA_PLACEMENT
  int node;
  if (gtid >= 0) {
    // Only report placement for a well-formed range whose extent matches the
    // declared size; otherwise print a one-line warning.
    if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
      if (__kmp_storage_map_verbose) {
        node = __kmp_get_host_node(p1);
        if (node < 0) /* doesn't work, so don't try this next time */
          __kmp_storage_map_verbose = FALSE;
        else {
          char *last;
          int lastNode;
          int localProc = __kmp_get_cpu_from_gtid(gtid);
          const int page_size = KMP_GET_PAGE_SIZE();
          // Round p1 down and p2 down to page boundaries before probing.
          p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
          p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
          if (localProc >= 0)
            __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
                                 localProc >> 1);
          else
            __kmp_printf_no_lock("  GTID %d\n", gtid);
#if KMP_USE_PRCTL
          /* The more elaborate format is disabled for now because of the prctl
           * hanging bug. */
          do {
            last = p1;
            lastNode = node;
            /* This loop collates adjacent pages with the same host node. */
            do {
              (char *)p1 += page_size;
            } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
            __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
                                 lastNode);
          } while (p1 <= p2);
#else
          // Simple form: report only the first and (if distinct) last page.
          __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
                               (char *)p1 + (page_size - 1),
                               __kmp_get_host_node(p1));
          if (p1 < p2) {
            __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
                                 (char *)p2 + (page_size - 1),
                                 __kmp_get_host_node(p2));
          }
#endif
        }
      }
    } else
      __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
  }
#endif /* KMP_PRINT_DATA_PLACEMENT */
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
  va_end(ap);
}
// Emit an "OMP warning:" line to the runtime error stream, unless warning
// generation has been disabled.
void __kmp_warn(char const *format, ...) {
  if (__kmp_generate_warnings == kmp_warnings_off)
    return;

  char buffer[MAX_MESSAGE];
  va_list ap;

  va_start(ap, format);
  // Splice the caller's format into the prefixed template; the caller's
  // variadic arguments are then consumed by __kmp_vprintf.
  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
  va_end(ap);
}
// Terminate the whole process abnormally. Dumps the debug buffer if enabled,
// then raises SIGABRT (Windows, avoiding the abort() pop-up box) or calls
// abort() after unregistering the library (other OSes). Does not return.
void __kmp_abort_process() {
  // Later threads may stall here, but that's ok because abort() will kill them.
  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
  if (__kmp_debug_buf) {
    __kmp_dump_debug_buffer();
  }
#if KMP_OS_WINDOWS
  // Let other threads know of abnormal termination and prevent deadlock
  // if abort happened during library initialization or shutdown
  __kmp_global.g.g_abort = SIGABRT;
  /* On Windows* OS by default abort() causes pop-up error box, which stalls
     nightly testing. Unfortunately, we cannot reliably suppress pop-up error
     boxes. _set_abort_behavior() works well, but this function is not
     available in VS7 (this is not problem for DLL, but it is a problem for
     static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
     help, at least in some versions of MS C RTL.
     It seems following sequence is the only way to simulate abort() and
     avoid pop-up error box. */
  raise(SIGABRT);
  _exit(3); // Just in case, if signal ignored, exit anyway.
#else
  __kmp_unregister_library();
  abort();
#endif
  // Unreachable on all paths above; kept as a safety net.
  __kmp_infinite_loop();
  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
} // __kmp_abort_process
// Abort path for a single thread: rather than tearing anything down here,
// simply park the thread forever.
// TODO: Eliminate g_abort global variable and this function.
// In case of abort just call abort(), it will kill all the threads.
void __kmp_abort_thread(void) { __kmp_infinite_loop(); } // __kmp_abort_thread
/* Print out the storage map for the major kmp_info_t thread data structures
that are allocated together. */
/* Print out the storage map for the major kmp_info_t thread data structures
   that are allocated together. */
// Emits one storage-map line for the whole kmp_info_t, then one per major
// sub-structure (th_info, th_local, the barrier array as a whole, and each
// individual barrier slot).
static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
                               gtid);
  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
                               sizeof(kmp_desc_t), "th_%d.th_info", gtid);
  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
                               sizeof(kmp_local_t), "th_%d.th_local", gtid);
  __kmp_print_storage_map_gtid(
      gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
      sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
                               &thr->th.th_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
                               gtid);
  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
                               &thr->th.th_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
                               gtid);
#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
                               &thr->th.th_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
                               gtid);
#endif // KMP_FAST_REDUCTION_BARRIER
}
/* Print out the storage map for the major kmp_team_t team data structures
that are allocated together. */
/* Print out the storage map for the major kmp_team_t team data structures
   that are allocated together. */
// `header` prefixes every line (e.g. "team"); `num_thr` sizes the per-thread
// arrays (t_dispatch / t_threads). Dispatch buffers use the configured count
// for real teams, or 2 when the team can hold at most one thread.
static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
                                         int team_id, int num_thr) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
                               header, team_id);
  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
                               &team->t.t_bar[bs_last_barrier],
                               sizeof(kmp_balign_team_t) * bs_last_barrier,
                               "%s_%d.t_bar", header, team_id);
  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
                               &team->t.t_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
                               header, team_id);
  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
                               &team->t.t_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[forkjoin]", header, team_id);
#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
                               &team->t.t_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[reduction]", header, team_id);
#endif // KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(
      -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
      sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
  __kmp_print_storage_map_gtid(
      -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
      sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
                               &team->t.t_disp_buffer[num_disp_buff],
                               sizeof(dispatch_shared_info_t) * num_disp_buff,
                               "%s_%d.t_disp_buffer", header, team_id);
}
// One-time initialization of the runtime's allocator backends: memkind
// support first, then target/offload memory management.
static void __kmp_init_allocator() {
  __kmp_init_memkind();
  __kmp_init_target_mem();
}
// Tear down the memkind allocator backend (counterpart to
// __kmp_init_allocator; target memory has no fini step here).
static void __kmp_fini_allocator() {
  __kmp_fini_memkind();
}
/* ------------------------------------------------------------------------ */
#if ENABLE_LIBOMPTARGET
// Hook libomptarget support: initialize target-task handling.
static void __kmp_init_omptarget() { __kmp_init_target_task(); }
#endif
/* ------------------------------------------------------------------------ */
#if KMP_DYNAMIC_LIB
#if KMP_OS_WINDOWS
// Windows DLL entry point: performs runtime shutdown on FreeLibrary() and
// per-thread cleanup on thread detach. Always reports success to the loader.
BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
  switch (fdwReason) {
  case DLL_PROCESS_ATTACH:
    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
    return TRUE;
  case DLL_PROCESS_DETACH:
    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
    // According to Windows* documentation for DllMain entry point:
    // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
    //   lpReserved == NULL when FreeLibrary() is called,
    //   lpReserved != NULL when the process is terminated.
    // When FreeLibrary() is called, worker threads remain alive. So the
    // runtime's state is consistent and executing proper shutdown is OK.
    // When the process is terminated, worker threads have exited or been
    // forcefully terminated by the OS and only the shutdown thread remains.
    // This can leave the runtime in an inconsistent state.
    // Hence, only attempt proper cleanup when FreeLibrary() is called.
    // Otherwise, rely on OS to reclaim resources.
    if (lpReserved == NULL)
      __kmp_internal_end_library(__kmp_gtid_get_specific());
    return TRUE;
  case DLL_THREAD_ATTACH:
    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
    /* if we want to register new siblings all the time here call
     * __kmp_get_gtid(); */
    return TRUE;
  case DLL_THREAD_DETACH:
    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
    // Release this thread's runtime resources before the OS reclaims it.
    __kmp_internal_end_thread(__kmp_gtid_get_specific());
    return TRUE;
  }
  return TRUE;
}
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_DYNAMIC_LIB */
/* __kmp_parallel_deo -- Wait until it's our turn. */
/* __kmp_parallel_deo -- Wait until it's our turn. */
// Entry side of the ordered-section protocol: optionally records the ordered
// construct for consistency checking, then (when parallel-ordered support is
// built in and the team is not serialized) spins until the team's ordered
// ticket equals this thread's tid.
void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */
  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
#endif
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB();
    // Block until t_ordered's ticket value equals our tid (released by the
    // previous thread in __kmp_parallel_dxo).
    KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
             NULL);
    KMP_MB();
  }
#endif /* BUILD_PARALLEL_ORDERED */
}
/* __kmp_parallel_dxo -- Signal the next task. */
/* __kmp_parallel_dxo -- Signal the next task. */
// Exit side of the ordered-section protocol: pops the consistency-check
// record, then advances the team's ordered ticket to the next tid (modulo the
// team size), releasing the thread waiting in __kmp_parallel_deo.
void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */
  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
      __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB(); /* Flush all pending memory write invalidates.  */
    /* use the tid of the next thread in this team */
    /* TODO replace with general release procedure */
    team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
    KMP_MB(); /* Flush all pending memory write invalidates.  */
  }
#endif /* BUILD_PARALLEL_ORDERED */
}
/* ------------------------------------------------------------------------ */
/* The BARRIER for a SINGLE process section is always explicit */
/* The BARRIER for a SINGLE process section is always explicit */
// Decide whether the calling thread executes the SINGLE block. Returns
// nonzero for the winner (the thread that successfully bumps the team's
// construct counter via compare-and-swap, or any thread of a serialized
// team), zero otherwise. `push_ws` controls whether the winner pushes a
// workshare record for consistency checking.
int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
  int status;
  kmp_info_t *th;
  kmp_team_t *team;
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  status = 0;
  th->th.th_ident = id_ref;
  if (team->t.t_serialized) {
    // Serialized team: the (only) thread always wins.
    status = 1;
  } else {
    kmp_int32 old_this = th->th.th_local.this_construct;
    ++th->th.th_local.this_construct;
    /* try to set team count to thread count--success means thread got the
       single block */
    /* TODO: Should this be acquire or release? */
    if (team->t.t_construct == old_this) {
      status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
                                              th->th.th_local.this_construct);
    }
#if USE_ITT_BUILD
    if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
        KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
        team->t.t_active_level == 1) {
      // Only report metadata by primary thread of active team at level 1
      __kmp_itt_metadata_single(id_ref);
    }
#endif /* USE_ITT_BUILD */
  }
  if (__kmp_env_consistency_check) {
    if (status && push_ws) {
      __kmp_push_workshare(gtid, ct_psingle, id_ref);
    } else {
      __kmp_check_workshare(gtid, ct_psingle, id_ref);
    }
  }
#if USE_ITT_BUILD
  if (status) {
    __kmp_itt_single_start(gtid);
  }
#endif /* USE_ITT_BUILD */
  return status;
}
// Mark the end of a SINGLE region for the executing thread: close the ITT
// frame and pop the workshare from the consistency-check stack.
void __kmp_exit_single(int gtid) {
#if USE_ITT_BUILD
  __kmp_itt_single_end(gtid);
#endif /* USE_ITT_BUILD */
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(gtid, ct_psingle, NULL);
}
/* determine if we can go parallel or must use a serialized parallel region and
* how many threads we can use
* set_nproc is the number of threads requested for the team
* returns 0 if we should serialize or only use one thread,
* otherwise the number of threads to use
* The forkjoin lock is held by the caller. */
// Clamp the requested team size through, in order: the active dynamic-
// adjustment mode (load balance / thread limit / random), then
// KMP_DEVICE_THREAD_LIMIT (__kmp_max_nth), then OMP_THREAD_LIMIT (contention
// group limit), and finally the capacity of the __kmp_threads array
// (expanding it if possible). Returns the number of threads to use, or 1 to
// indicate the region should serialize. Caller holds the forkjoin lock.
static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
                                 int master_tid, int set_nthreads,
                                 int enter_teams) {
  int capacity;
  int new_nthreads;
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  KMP_DEBUG_ASSERT(root && parent_team);
  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
  // If dyn-var is set, dynamically adjust the number of desired threads,
  // according to the method specified by dynamic_mode.
  new_nthreads = set_nthreads;
  if (!get__dynamic_2(parent_team, master_tid)) {
    // dyn-var false: take the request as-is (empty branch is intentional so
    // the else-if chain below only runs when dynamic adjustment is enabled).
    ;
  }
#ifdef USE_LOAD_BALANCE
  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
    new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
    if (new_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    }
  }
#endif /* USE_LOAD_BALANCE */
  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
    // Available processors minus threads already running, crediting back this
    // root's own contribution (1 if active, else its hot team's size).
    new_nthreads = __kmp_avail_proc - __kmp_nth +
                   (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (new_nthreads <= 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    } else {
      // Never grow beyond the request.
      new_nthreads = set_nthreads;
    }
  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
    if (set_nthreads > 2) {
      // Pick a pseudo-random size in [1, set_nthreads].
      new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
      new_nthreads = (new_nthreads % set_nthreads) + 1;
      if (new_nthreads == 1) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to 1 thread\n",
                      master_tid));
        return 1;
      }
      if (new_nthreads < set_nthreads) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to %d threads\n",
                      master_tid, new_nthreads));
      }
    }
  } else {
    // Unknown dynamic mode — should be unreachable.
    KMP_ASSERT(0);
  }
  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      __kmp_max_nth) {
    int tl_nthreads = __kmp_max_nth - __kmp_nth +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }
    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }
  // Respect OMP_THREAD_LIMIT
  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
  if (cg_nthreads + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      max_cg_threads) {
    int tl_nthreads = max_cg_threads - cg_nthreads +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }
    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }
  // Check if the threads array is large enough, or needs expanding.
  // See comment in __kmp_register_root() about the adjustment if
  // __kmp_threads[0] == NULL.
  capacity = __kmp_threads_capacity;
  if (TCR_PTR(__kmp_threads[0]) == NULL) {
    --capacity;
  }
  // If it is not for initializing the hidden helper team, we need to take
  // __kmp_hidden_helper_threads_num out of the capacity because it is included
  // in __kmp_threads_capacity.
  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
    capacity -= __kmp_hidden_helper_threads_num;
  }
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      capacity) {
    // Expand the threads array.
    int slotsRequired = __kmp_nth + new_nthreads -
                        (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
                        capacity;
    int slotsAdded = __kmp_expand_threads(slotsRequired);
    if (slotsAdded < slotsRequired) {
      // The threads array was not expanded enough.
      new_nthreads -= (slotsRequired - slotsAdded);
      KMP_ASSERT(new_nthreads >= 1);
      // If dyn-var is false, emit a 1-time warning.
      if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
        __kmp_reserve_warn = 1;
        if (__kmp_tp_cached) {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
                    KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
        } else {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
        }
      }
    }
  }
#ifdef KMP_DEBUG
  if (new_nthreads == 1) {
    KC_TRACE(10,
             ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
              "dead roots and rechecking; requested %d threads\n",
              __kmp_get_gtid(), set_nthreads));
  } else {
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
                  " %d threads\n",
                  __kmp_get_gtid(), new_nthreads, set_nthreads));
  }
#endif // KMP_DEBUG
  // OpenMP 6.0 strict num_threads: error out if the request was reduced.
  if (this_thr->th.th_nt_strict && new_nthreads < set_nthreads) {
    __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev,
                 this_thr->th.th_nt_msg);
  }
  return new_nthreads;
}
/* Allocate threads from the thread pool and assign them to the new team. We are
assured that there are enough threads available, because we checked on that
earlier within critical section forkjoin */
static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
kmp_info_t *master_th, int master_gtid,
int fork_teams_workers) {
int i;
int use_hot_team;
KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
KMP_MB();
/* first, let's setup the primary thread */
master_th->th.th_info.ds.ds_tid = 0;
master_th->th.th_team = team;
master_th->th.th_team_nproc = team->t.t_nproc;
master_th->th.th_team_master = master_th;
master_th->th.th_team_serialized = FALSE;
master_th->th.th_dispatch = &team->t.t_dispatch[0];
/* make sure we are not the optimized hot team */
#if KMP_NESTED_HOT_TEAMS
use_hot_team = 0;
kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
if (hot_teams) { // hot teams array is not allocated if
// KMP_HOT_TEAMS_MAX_LEVEL=0
int level = team->t.t_active_level - 1; // index in array of hot teams
if (master_th->th.th_teams_microtask) { // are we inside the teams?
if (master_th->th.th_teams_size.nteams > 1) {
++level; // level was not increased in teams construct for
// team_of_masters
}
if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
master_th->th.th_teams_level == team->t.t_level) {
++level; // level was not increased in teams construct for
// team_of_workers before the parallel
} // team->t.t_level will be increased inside parallel
}
if (level < __kmp_hot_teams_max_level) {
if (hot_teams[level].hot_team) {
// hot team has already been allocated for given level
KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
use_hot_team = 1; // the team is ready to use
} else {
use_hot_team = 0; // AC: threads are not allocated yet
hot_teams[level].hot_team = team; // remember new hot team
hot_teams[level].hot_team_nth = team->t.t_nproc;
}
} else {
use_hot_team = 0;
}
}
#else
use_hot_team = team == root->r.r_hot_team;
#endif
if (!use_hot_team) {
/* install the primary thread */
team->t.t_threads[0] = master_th;
__kmp_initialize_info(master_th, team, 0, master_gtid);
/* now, install the worker threads */
for (i = 1; i < team->t.t_nproc; i++) {
/* fork or reallocate a new thread and install it in team */
kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
team->t.t_threads[i] = thr;
KMP_DEBUG_ASSERT(thr);
KMP_DEBUG_ASSERT(thr->th.th_team == team);
/* align team and thread arrived states */
KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
"T#%d(%d:%d) join =%llu, plain=%llu\n",
__kmp_gtid_from_tid(0, team), team->t.t_id, 0,
__kmp_gtid_from_tid(i, team), team->t.t_id, i,
team->t.t_bar[bs_forkjoin_barrier].b_arrived,
team->t.t_bar[bs_plain_barrier].b_arrived));
thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
thr->th.th_teams_level = master_th->th.th_teams_level;
thr->th.th_teams_size = master_th->th.th_teams_size;
{ // Initialize threads' barrier data.
int b;
kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
for (b = 0; b < bs_last_barrier; ++b) {
balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
}
}
}
#if KMP_AFFINITY_SUPPORTED
// Do not partition the places list for teams construct workers who
// haven't actually been forked to do real work yet. This partitioning
// will take place in the parallel region nested within the teams construct.
if (!fork_teams_workers) {
__kmp_partition_places(team);
}
#endif
if (team->t.t_nproc > 1 &&
__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
team->t.b->update_num_threads(team->t.t_nproc);
__kmp_add_threads_to_team(team, team->t.t_nproc);
}
}
// Take care of primary thread's task state
if (__kmp_tasking_mode != tskm_immediate_exec) {
if (use_hot_team) {
KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team->t.t_parent, master_th);
KA_TRACE(
20,
("__kmp_fork_team_threads: Primary T#%d pushing task_team %p / team "
"%p, new task_team %p / team %p\n",
__kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
team->t.t_parent, team->t.t_task_team[master_th->th.th_task_state],
team));
// Store primary thread's current task state on new team
KMP_CHECK_UPDATE(team->t.t_primary_task_state,
master_th->th.th_task_state);
// Restore primary thread's task state to hot team's state
// by using thread 1's task state
if (team->t.t_nproc > 1) {
KMP_DEBUG_ASSERT(team->t.t_threads[1]->th.th_task_state == 0 ||
team->t.t_threads[1]->th.th_task_state == 1);
KMP_CHECK_UPDATE(master_th->th.th_task_state,
team->t.t_threads[1]->th.th_task_state);
} else {
master_th->th.th_task_state = 0;
}
} else {
// Store primary thread's current task_state on new team
KMP_CHECK_UPDATE(team->t.t_primary_task_state,
master_th->th.th_task_state);
// Are not using hot team, so set task state to 0.
master_th->th.th_task_state = 0;
}
}
if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
for (i = 0; i < team->t.t_nproc; i++) {
kmp_info_t *thr = team->t.t_threads[i];
if (thr->th.th_prev_num_threads != team->t.t_nproc ||
thr->th.th_prev_level != team->t.t_level) {
team->t.t_display_affinity = 1;
break;
}
}
}
KMP_MB();
}
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the team
// We try to avoid unnecessary writes to the relevant cache line in the team
// structure, so we don't make changes unless they are needed.
inline static void propagateFPControl(kmp_team_t *team) {
  // When FP-control inheritance is disabled, just record that nothing was
  // saved. KMP_CHECK_UPDATE writes only on change, so an already-FALSE flag
  // does not dirty the team cache-line.
  if (!__kmp_inherit_fp_control) {
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
    return;
  }

  // Read the primary thread's current FP control state (both x87 and SSE).
  kmp_int16 fpu_ctrl;
  kmp_uint32 mxcsr_val;
  __kmp_store_x87_fpu_control_word(&fpu_ctrl);
  __kmp_store_mxcsr(&mxcsr_val);
  mxcsr_val &= KMP_X86_MXCSR_MASK;

  // Whether or not t_fp_control_saved was TRUE before, the team's copies must
  // end up equal to the values just read. KMP_CHECK_UPDATE performs the store
  // only when the value differs, avoiding needless writes that would force
  // every thread in the team to re-read the cache-line.
  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, fpu_ctrl);
  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr_val);
  // Other runtime code consults this flag to decide whether to restore the
  // registers later, so it must be kept accurate.
  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
}
// Do the opposite of propagateFPControl: load the hardware registers from the
// values saved in the team structure.
inline static void updateHWFPControl(kmp_team_t *team) {
  // Nothing to restore unless inheritance is on and the team actually saved
  // FP control values for the parallel region we are exiting.
  if (!(__kmp_inherit_fp_control && team->t.t_fp_control_saved))
    return;

  // Read the registers' current contents so we only reload ones that differ
  // from the team's saved values.
  kmp_int16 cur_fpu_ctrl;
  kmp_uint32 cur_mxcsr;
  __kmp_store_x87_fpu_control_word(&cur_fpu_ctrl);
  __kmp_store_mxcsr(&cur_mxcsr);
  cur_mxcsr &= KMP_X86_MXCSR_MASK;

  if (team->t.t_x87_fpu_control_word != cur_fpu_ctrl) {
    __kmp_clear_x87_fpu_status_word();
    __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
  }

  if (team->t.t_mxcsr != cur_mxcsr) {
    __kmp_load_mxcsr(&team->t.t_mxcsr);
  }
}
#else
#define propagateFPControl(x) ((void)0)
#define updateHWFPControl(x) ((void)0)
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
int realloc); // forward declaration
/* Run a parallel region that has been serialized, so runs only in a team of the
   single primary thread.

   Installs (or re-enters) the thread's serial team: resolves proc-bind, swaps
   the thread onto its serial team (allocating a replacement serial team when
   the current one is already in use), or — for an already-active serial team —
   just bumps the serialization/nesting counters. Also emits OMPT
   parallel-begin / implicit-task-begin events when tools are enabled. */
void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));

  /* Skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

  /* utilize the serialized team held by this thread */
  KMP_DEBUG_ASSERT(serial_team);
  KMP_MB();

  // Resolve proc-bind for this region: binding disabled (proc_bind_false)
  // wins; otherwise an explicit clause value overrides proc-bind-var.
  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
    proc_bind = proc_bind_false;
  } else if (proc_bind == proc_bind_default) {
    // No proc_bind clause was specified, so use the current value
    // of proc-bind-var for this parallel region.
    proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
  }
  // Reset for next parallel region
  this_thr->th.th_set_proc_bind = proc_bind_default;
  // Reset num_threads for next parallel region
  this_thr->th.th_set_nproc = 0;

#if OMPT_SUPPORT
  ompt_data_t ompt_parallel_data = ompt_data_none;
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {

    ompt_task_info_t *parent_task_info;
    parent_task_info = OMPT_CUR_TASK_INFO(this_thr);

    parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    if (ompt_enabled.ompt_callback_parallel_begin) {
      int team_size = 1; // serialized region: team of one

      ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
          &(parent_task_info->task_data), &(parent_task_info->frame),
          &ompt_parallel_data, team_size,
          ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
    }
  }
#endif // OMPT_SUPPORT

  if (this_thr->th.th_team != serial_team) {
    // Thread is not yet running on its serial team; install it.
    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;

    if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO increase performance by making this locks more specific */
      kmp_team_t *new_team;

      __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

      new_team =
          __kmp_allocate_team(this_thr->th.th_root, 1, 1,
#if OMPT_SUPPORT
                              ompt_parallel_data,
#endif
                              proc_bind, &this_thr->th.th_current_task->td_icvs,
                              0 USE_NESTED_HOT_ARG(NULL));
      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
      KMP_ASSERT(new_team);

      /* setup new serialized team and install it */
      new_team->t.t_threads[0] = this_thr;
      new_team->t.t_parent = this_thr->th.th_team;
      serial_team = new_team;
      this_thr->th.th_serial_team = serial_team;

      KF_TRACE(
          10,
          ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
           global_tid, serial_team));

      /* TODO the above breaks the requirement that if we run out of resources,
         then we can still guarantee that serialized teams are ok, since we may
         need to allocate a new one */
    } else {
      KF_TRACE(
          10,
          ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
           global_tid, serial_team));
    }

    /* we have to initialize this serial team */
    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
    KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
    serial_team->t.t_ident = loc;
    serial_team->t.t_serialized = 1;
    serial_team->t.t_nproc = 1;
    serial_team->t.t_parent = this_thr->th.th_team;
    // Inherit the parent team's nested-nthreads list, falling back to the
    // global default.
    if (this_thr->th.th_team->t.t_nested_nth)
      serial_team->t.t_nested_nth = this_thr->th.th_team->t.t_nested_nth;
    else
      serial_team->t.t_nested_nth = &__kmp_nested_nth;
    // Save previous team's task state on serial team structure
    serial_team->t.t_primary_task_state = this_thr->th.th_task_state;
    serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
    this_thr->th.th_team = serial_team;
    serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;

    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
                  this_thr->th.th_current_task));
    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
    this_thr->th.th_current_task->td_flags.executing = 0;

    __kmp_push_current_task_to_thread(this_thr, serial_team, 0);

    /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
       implicit task for each serialized task represented by
       team->t.t_serialized? */
    copy_icvs(&this_thr->th.th_current_task->td_icvs,
              &this_thr->th.th_current_task->td_parent->td_icvs);

    // Thread value exists in the nested nthreads array for the next nested
    // level
    kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
    if (this_thr->th.th_team->t.t_nested_nth)
      nested_nth = this_thr->th.th_team->t.t_nested_nth;
    if (nested_nth->used && (level + 1 < nested_nth->used)) {
      this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
    }

    // Propagate the next level's bind-var value into the implicit task's ICVs.
    if (__kmp_nested_proc_bind.used &&
        (level + 1 < __kmp_nested_proc_bind.used)) {
      this_thr->th.th_current_task->td_icvs.proc_bind =
          __kmp_nested_proc_bind.bind_types[level + 1];
    }

#if USE_DEBUGGER
    serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
#endif
    this_thr->th.th_info.ds.ds_tid = 0;

    /* set thread cache values */
    this_thr->th.th_team_nproc = 1;
    this_thr->th.th_team_master = this_thr;
    this_thr->th.th_team_serialized = 1;
    this_thr->th.th_task_team = NULL;
    this_thr->th.th_task_state = 0;

    serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
    serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
    serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save

    propagateFPControl(serial_team);

    /* check if we need to allocate dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    if (!serial_team->t.t_dispatch->th_disp_buffer) {
      serial_team->t.t_dispatch->th_disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    KMP_MB();
  } else {
    /* this serialized team is already being used,
     * that's fine, just add another nested level */
    KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
    ++serial_team->t.t_serialized;
    this_thr->th.th_team_serialized = serial_team->t.t_serialized;

    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;
    // Thread value exists in the nested nthreads array for the next nested
    // level
    kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
    if (serial_team->t.t_nested_nth)
      nested_nth = serial_team->t.t_nested_nth;
    if (nested_nth->used && (level + 1 < nested_nth->used)) {
      this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
    }

    serial_team->t.t_level++;
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
                  "of serial team %p to %d\n",
                  global_tid, serial_team, serial_team->t.t_level));

    /* allocate/push dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    {
      // New buffer is pushed onto the head of the per-team dispatch stack;
      // popped again at join time.
      dispatch_private_info_t *disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
      disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
      serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    /* allocate/push task team stack */
    __kmp_push_task_team_node(this_thr, serial_team);

    KMP_MB();
  }
  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);

  // Perform the display affinity functionality for
  // serialized parallel regions
  if (__kmp_display_affinity) {
    if (this_thr->th.th_prev_level != serial_team->t.t_level ||
        this_thr->th.th_prev_num_threads != 1) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(global_tid, NULL);
      this_thr->th.th_prev_level = serial_team->t.t_level;
      this_thr->th.th_prev_num_threads = 1;
    }
  }

  if (__kmp_env_consistency_check)
    __kmp_push_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  serial_team->t.ompt_team_info.master_return_address = codeptr;
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);

    ompt_lw_taskteam_t lw_taskteam;
    __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
                            &ompt_parallel_data, codeptr);

    __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking. content was swaped

    /* OMPT implicit task begin */
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
      OMPT_CUR_TASK_INFO(this_thr)->thread_num =
          __kmp_tid_from_gtid(global_tid);
    }

    /* OMPT state */
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
  }
#endif
}
// Test if this fork is for a team closely nested in a teams construct
static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
                                          microtask_t microtask, int level,
                                          int teams_level, kmp_va_list ap) {
  // Must be inside a teams region and carrying explicit varargs.
  if (!master_th->th.th_teams_microtask || !ap)
    return false;
  // The teams-master outlined function itself is not a nested parallel.
  if (microtask == (microtask_t)__kmp_teams_master)
    return false;
  // Closely nested means the fork occurs at the teams construct's own level.
  return level == teams_level;
}
// Test if this fork is for the teams construct, i.e. to form the outer league
// of teams
static inline bool __kmp_is_entering_teams(int active_level, int level,
                                           int teams_level, kmp_va_list ap) {
  if (ap == NULL) {
    // No varargs: entering teams only when no parallel region is active yet.
    return active_level == 0;
  }
  // With varargs: entering teams when we are exactly at a teams level.
  return teams_level > 0 && teams_level == level;
}
// AC: This is start of parallel that is nested inside teams construct.
// The team is actual (hot), all workers are ready at the fork barrier.
// No lock needed to initialize the team a bit, then free workers.
// Always returns TRUE (every exit path does), meaning the fork was handled
// here.
static inline int
__kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
                    kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
                    enum fork_context_e call_context, microtask_t microtask,
                    launch_t invoker, int master_set_numthreads, int level,
#if OMPT_SUPPORT
                    ompt_data_t ompt_parallel_data, void *return_address,
#endif
                    kmp_va_list ap) {
  void **argv;
  int i;

  // Copy the microtask arguments out of the va_list into the parent team.
  parent_team->t.t_ident = loc;
  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
  parent_team->t.t_argc = argc;
  argv = (void **)parent_team->t.t_argv;
  for (i = argc - 1; i >= 0; --i) {
    *argv++ = va_arg(kmp_va_deref(ap), void *);
  }

  // Increment our nested depth levels, but not increase the serialization
  if (parent_team == master_th->th.th_serial_team) {
    // AC: we are in serialized parallel
    __kmpc_serialized_parallel(loc, gtid);
    KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);

    if (call_context == fork_context_gnu) {
      // AC: need to decrement t_serialized for enquiry functions to work
      // correctly, will restore at join time
      parent_team->t.t_serialized--;
      return TRUE;
    }

#if OMPD_SUPPORT
    parent_team->t.t_pkfn = microtask;
#endif

#if OMPT_SUPPORT
    void *dummy;
    void **exit_frame_p;
    ompt_data_t *implicit_task_data;
    ompt_lw_taskteam_t lw_taskteam;

    if (ompt_enabled.enabled) {
      __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                              &ompt_parallel_data, return_address);
      exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);

      __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
      // Don't use lw_taskteam after linking. Content was swapped.

      /* OMPT implicit task begin */
      implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
      if (ompt_enabled.ompt_callback_implicit_task) {
        OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
        ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
            ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
            1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
      }

      /* OMPT state */
      master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
    } else {
      // No tool attached: exit_frame_p still needs a valid target to write.
      exit_frame_p = &dummy;
    }
#endif

    // AC: need to decrement t_serialized for enquiry functions to work
    // correctly, will restore at join time
    parent_team->t.t_serialized--;

    {
      KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
      KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
      __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
#if OMPT_SUPPORT
                             ,
                             exit_frame_p
#endif
      );
    }

#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      *exit_frame_p = NULL;
      OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
      if (ompt_enabled.ompt_callback_implicit_task) {
        ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
            ompt_scope_end, NULL, implicit_task_data, 1,
            OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
      }
      ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
      __ompt_lw_taskteam_unlink(master_th);
      if (ompt_enabled.ompt_callback_parallel_end) {
        ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
            &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
            OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
      }
      master_th->th.ompt_thread_info.state = ompt_state_overhead;
    }
#endif
    return TRUE;
  }

  // Non-serialized path: reuse the hot parent team for the nested parallel.
  parent_team->t.t_pkfn = microtask;
  parent_team->t.t_invoke = invoker;
  KMP_ATOMIC_INC(&root->r.r_in_parallel);
  parent_team->t.t_active_level++;
  parent_team->t.t_level++;
  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save

  // If the threads allocated to the team are less than the thread limit, update
  // the thread limit here. th_teams_size.nth is specific to this team nested
  // in a teams construct, the team is fully created, and we're about to do
  // the actual fork. Best to do this here so that the subsequent uses below
  // and in the join have the correct value.
  master_th->th.th_teams_size.nth = parent_team->t.t_nproc;

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
    ompt_lw_taskteam_t lw_taskteam;
    __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
                            return_address);
    __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
  }
#endif

  /* Change number of threads in the team if requested */
  if (master_set_numthreads) { // The parallel has num_threads clause
    if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
      // AC: only can reduce number of threads dynamically, can't increase
      kmp_info_t **other_threads = parent_team->t.t_threads;
      // NOTE: if using distributed barrier, we need to run this code block
      // even when the team size appears not to have changed from the max.
      int old_proc = master_th->th.th_teams_size.nth;
      if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
        __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
        __kmp_add_threads_to_team(parent_team, master_set_numthreads);
      }
      parent_team->t.t_nproc = master_set_numthreads;
      for (i = 0; i < master_set_numthreads; ++i) {
        other_threads[i]->th.th_team_nproc = master_set_numthreads;
      }
    }
    // Keep extra threads hot in the team for possible next parallels
    master_th->th.th_set_nproc = 0;
  }

#if USE_DEBUGGER
  if (__kmp_debugging) { // Let debugger override number of threads.
    int nth = __kmp_omp_num_threads(loc);
    if (nth > 0) { // 0 means debugger doesn't want to change num threads
      master_set_numthreads = nth;
    }
  }
#endif

  // Figure out the proc_bind policy for the nested parallel within teams
  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
  // proc_bind_default means don't update
  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
    proc_bind = proc_bind_false;
  } else {
    // No proc_bind clause specified; use current proc-bind-var
    if (proc_bind == proc_bind_default) {
      proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
    }
    /* else: The proc_bind policy was specified explicitly on parallel clause.
       This overrides proc-bind-var for this parallel region, but does not
       change proc-bind-var. */
    // Figure the value of proc-bind-var for the child threads.
    if ((level + 1 < __kmp_nested_proc_bind.used) &&
        (__kmp_nested_proc_bind.bind_types[level + 1] !=
         master_th->th.th_current_task->td_icvs.proc_bind)) {
      proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
    }
  }
  KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
  // Need to change the bind-var ICV to correct value for each implicit task
  if (proc_bind_icv != proc_bind_default &&
      master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
    kmp_info_t **other_threads = parent_team->t.t_threads;
    for (i = 0; i < master_th->th.th_team_nproc; ++i) {
      other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
    }
  }
  // Reset for next parallel region
  master_th->th.th_set_proc_bind = proc_bind_default;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
       KMP_ITT_DEBUG) &&
      __kmp_forkjoin_frames_mode == 3 &&
      parent_team->t.t_active_level == 1 // only report frames at level 1
      && master_th->th.th_teams_size.nteams == 1) {
    // Record the region start time for ITT frame reporting.
    kmp_uint64 tmp_time = __itt_get_timestamp();
    master_th->th.th_frame_time = tmp_time;
    parent_team->t.t_region_time = tmp_time;
  }
  if (__itt_stack_caller_create_ptr) {
    KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
    // create new stack stitching id before entering fork barrier
    parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
  }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
#if KMP_AFFINITY_SUPPORTED
  __kmp_partition_places(parent_team);
#endif

  KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
                "master_th=%p, gtid=%d\n",
                root, parent_team, master_th, gtid));
  __kmp_internal_fork(loc, gtid, parent_team);
  KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
                "master_th=%p, gtid=%d\n",
                root, parent_team, master_th, gtid));

  if (call_context == fork_context_gnu)
    return TRUE;

  /* Invoke microtask for PRIMARY thread */
  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
                parent_team->t.t_id, parent_team->t.t_pkfn));

  if (!parent_team->t.t_invoke(gtid)) {
    KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
  }
  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
                parent_team->t.t_id, parent_team->t.t_pkfn));
  KMP_MB(); /* Flush all pending memory write invalidates. */

  KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));

  return TRUE;
}
// Create a serialized parallel region: the region runs on the calling
// (primary) thread only. Dispatches on call_context / microtask:
//   - Intel entry, no varargs: args come from the parent team (teams nesting);
//   - Intel entry, teams master: run the special teams invoker;
//   - Intel entry, regular: copy varargs locally and invoke the microtask;
//   - GNU entry: only set up OMPT bookkeeping, caller runs the body.
// Always returns FALSE — the caller did not go parallel.
static inline int
__kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
                       kmp_int32 argc, microtask_t microtask, launch_t invoker,
                       kmp_info_t *master_th, kmp_team_t *parent_team,
#if OMPT_SUPPORT
                       ompt_data_t *ompt_parallel_data, void **return_address,
                       ompt_data_t **parent_task_data,
#endif
                       kmp_va_list ap) {
  kmp_team_t *team;
  int i;
  void **argv;

/* josh todo: hypothetical question: what do we do for OS X*? */
#if KMP_OS_LINUX &&                                                            \
    (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
  SimpleVLA<void *> args(argc);
#else
  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
          KMP_ARCH_AARCH64) */

  KA_TRACE(
      20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));

  __kmpc_serialized_parallel(loc, gtid);

#if OMPD_SUPPORT
  master_th->th.th_serial_team->t.t_pkfn = microtask;
#endif

  if (call_context == fork_context_intel) {
    /* TODO this sucks, use the compiler itself to pass args! :) */
    master_th->th.th_serial_team->t.t_ident = loc;
    if (!ap) {
      // No varargs: parallel nested under teams; args live in the parent team.
      // revert change made in __kmpc_serialized_parallel()
      master_th->th.th_serial_team->t.t_level--;
// Get args from parent team for teams construct

#if OMPT_SUPPORT
      void *dummy;
      void **exit_frame_p;
      ompt_task_info_t *task_info;
      ompt_lw_taskteam_t lw_taskteam;

      if (ompt_enabled.enabled) {
        __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                                ompt_parallel_data, *return_address);

        __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
        // don't use lw_taskteam after linking. content was swaped
        task_info = OMPT_CUR_TASK_INFO(master_th);
        exit_frame_p = &(task_info->frame.exit_frame.ptr);
        if (ompt_enabled.ompt_callback_implicit_task) {
          OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
          ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
              ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
              &(task_info->task_data), 1,
              OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
        }

        /* OMPT state */
        master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
      } else {
        // No tool attached: exit_frame_p still needs a valid target to write.
        exit_frame_p = &dummy;
      }
#endif

      {
        KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
        KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
        __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
#if OMPT_SUPPORT
                               ,
                               exit_frame_p
#endif
        );
      }

#if OMPT_SUPPORT
      if (ompt_enabled.enabled) {
        *exit_frame_p = NULL;
        if (ompt_enabled.ompt_callback_implicit_task) {
          ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
              ompt_scope_end, NULL, &(task_info->task_data), 1,
              OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
        }
        *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
        __ompt_lw_taskteam_unlink(master_th);
        if (ompt_enabled.ompt_callback_parallel_end) {
          ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
              ompt_parallel_data, *parent_task_data,
              OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
        }
        master_th->th.ompt_thread_info.state = ompt_state_overhead;
      }
#endif
    } else if (microtask == (microtask_t)__kmp_teams_master) {
      // Serialized teams construct: run the teams-master via the invoker.
      KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
      team = master_th->th.th_team;
      // team->t.t_pkfn = microtask;
      team->t.t_invoke = invoker;
      __kmp_alloc_argv_entries(argc, team, TRUE);
      team->t.t_argc = argc;
      argv = (void **)team->t.t_argv;
      for (i = argc - 1; i >= 0; --i)
        *argv++ = va_arg(kmp_va_deref(ap), void *);
      // AC: revert change made in __kmpc_serialized_parallel()
      //     because initial code in teams should have level=0
      team->t.t_level--;
      // AC: call special invoker for outer "parallel" of teams construct
      invoker(gtid);
#if OMPT_SUPPORT
      if (ompt_enabled.enabled) {
        ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
        if (ompt_enabled.ompt_callback_implicit_task) {
          ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
              ompt_scope_end, NULL, &(task_info->task_data), 0,
              OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
        }
        if (ompt_enabled.ompt_callback_parallel_end) {
          ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
              ompt_parallel_data, *parent_task_data,
              OMPT_INVOKER(call_context) | ompt_parallel_league,
              *return_address);
        }
        master_th->th.ompt_thread_info.state = ompt_state_overhead;
      }
#endif
    } else {
      // Regular serialized parallel: copy varargs into a local buffer and
      // invoke the microtask directly on this thread.
      argv = args;
      for (i = argc - 1; i >= 0; --i)
        *argv++ = va_arg(kmp_va_deref(ap), void *);
      KMP_MB();

#if OMPT_SUPPORT
      void *dummy;
      void **exit_frame_p;
      ompt_task_info_t *task_info;
      ompt_lw_taskteam_t lw_taskteam;
      ompt_data_t *implicit_task_data;

      if (ompt_enabled.enabled) {
        __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                                ompt_parallel_data, *return_address);
        __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
        // don't use lw_taskteam after linking. content was swaped
        task_info = OMPT_CUR_TASK_INFO(master_th);
        exit_frame_p = &(task_info->frame.exit_frame.ptr);

        /* OMPT implicit task begin */
        implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
        if (ompt_enabled.ompt_callback_implicit_task) {
          ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
              ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
              implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
              ompt_task_implicit);
          OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
        }

        /* OMPT state */
        master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
      } else {
        exit_frame_p = &dummy;
      }
#endif

      {
        KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
        KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
        __kmp_invoke_microtask(microtask, gtid, 0, argc, args
#if OMPT_SUPPORT
                               ,
                               exit_frame_p
#endif
        );
      }

#if OMPT_SUPPORT
      if (ompt_enabled.enabled) {
        *exit_frame_p = NULL;
        if (ompt_enabled.ompt_callback_implicit_task) {
          ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
              ompt_scope_end, NULL, &(task_info->task_data), 1,
              OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
        }

        *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
        __ompt_lw_taskteam_unlink(master_th);
        if (ompt_enabled.ompt_callback_parallel_end) {
          ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
              ompt_parallel_data, *parent_task_data,
              OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
        }
        master_th->th.ompt_thread_info.state = ompt_state_overhead;
      }
#endif
    }
  } else if (call_context == fork_context_gnu) {
#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      ompt_lw_taskteam_t lwt;
      __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
                              *return_address);

      lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
      __ompt_lw_taskteam_link(&lwt, master_th, 1);
    }
// don't use lw_taskteam after linking. content was swaped
#endif

    // we were called from GNU native code
    KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
    return FALSE;
  } else {
    KMP_ASSERT2(call_context < fork_context_last,
                "__kmp_serial_fork_call: unknown fork_context parameter");
  }

  KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
  KMP_MB();
  return FALSE;
}
/* most of the work for a fork */
/* return true if we really went parallel, false if serialized */
int __kmp_fork_call(ident_t *loc, int gtid,
enum fork_context_e call_context, // Intel, GNU, ...
kmp_int32 argc, microtask_t microtask, launch_t invoker,
kmp_va_list ap) {
void **argv;
int i;
int master_tid;
int master_this_cons;
kmp_team_t *team;
kmp_team_t *parent_team;
kmp_info_t *master_th;
kmp_root_t *root;
int nthreads;
int master_active;
int master_set_numthreads;
int task_thread_limit = 0;
int level;
int active_level;
int teams_level;
#if KMP_NESTED_HOT_TEAMS
kmp_hot_team_ptr_t **p_hot_teams;
#endif
{ // KMP_TIME_BLOCK
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
/* Some systems prefer the stack for the root thread(s) to start with */
/* some gap from the parent stack to prevent false sharing. */
void *dummy = KMP_ALLOCA(__kmp_stkpadding);
/* These 2 lines below are so this does not get optimized out */
if (__kmp_stkpadding > KMP_MAX_STKPADDING)
__kmp_stkpadding += (short)((kmp_int64)dummy);
}
/* initialize if needed */
KMP_DEBUG_ASSERT(
__kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
if (!TCR_4(__kmp_init_parallel))
__kmp_parallel_initialize();
__kmp_resume_if_soft_paused();
/* setup current data */
// AC: potentially unsafe, not in sync with library shutdown,
// __kmp_threads can be freed
master_th = __kmp_threads[gtid];
parent_team = master_th->th.th_team;
master_tid = master_th->th.th_info.ds.ds_tid;
master_this_cons = master_th->th.th_local.this_construct;
root = master_th->th.th_root;
master_active = root->r.r_active;
master_set_numthreads = master_th->th.th_set_nproc;
task_thread_limit =
master_th->th.th_current_task->td_icvs.task_thread_limit;
#if OMPT_SUPPORT
ompt_data_t ompt_parallel_data = ompt_data_none;
ompt_data_t *parent_task_data = NULL;
ompt_frame_t *ompt_frame = NULL;
void *return_address = NULL;
if (ompt_enabled.enabled) {
__ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
NULL, NULL);
return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
}
#endif
// Assign affinity to root thread if it hasn't happened yet
__kmp_assign_root_init_mask();
// Nested level will be an index in the nested nthreads array
level = parent_team->t.t_level;
// used to launch non-serial teams even if nested is not allowed
active_level = parent_team->t.t_active_level;
// needed to check nesting inside the teams
teams_level = master_th->th.th_teams_level;
#if KMP_NESTED_HOT_TEAMS
p_hot_teams = &master_th->th.th_hot_teams;
if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
*p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
(*p_hot_teams)[0].hot_team = root->r.r_hot_team;
// it is either actual or not needed (when active_level > 0)
(*p_hot_teams)[0].hot_team_nth = 1;
}
#endif
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
if (ompt_enabled.ompt_callback_parallel_begin) {
int team_size = master_set_numthreads
? master_set_numthreads
: get__nproc_2(parent_team, master_tid);
int flags = OMPT_INVOKER(call_context) |
((microtask == (microtask_t)__kmp_teams_master)
? ompt_parallel_league
: ompt_parallel_team);
ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
return_address);
}
master_th->th.ompt_thread_info.state = ompt_state_overhead;
}
#endif
master_th->th.th_ident = loc;
// Parallel closely nested in teams construct:
if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
call_context, microtask, invoker,
master_set_numthreads, level,
#if OMPT_SUPPORT
ompt_parallel_data, return_address,
#endif
ap);
} // End parallel closely nested in teams construct
// Need this to happen before we determine the number of threads, not while
// we are allocating the team
//__kmp_push_current_task_to_thread(master_th, parent_team, 0);
KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(parent_team, master_th);
// Determine the number of threads
int enter_teams =
__kmp_is_entering_teams(active_level, level, teams_level, ap);
if ((!enter_teams &&
(parent_team->t.t_active_level >=
master_th->th.th_current_task->td_icvs.max_active_levels)) ||
(__kmp_library == library_serial)) {
KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
nthreads = 1;
} else {
nthreads = master_set_numthreads
? master_set_numthreads
// TODO: get nproc directly from current task
: get__nproc_2(parent_team, master_tid);
// Use the thread_limit set for the current target task if exists, else go
// with the deduced nthreads
nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
? task_thread_limit
: nthreads;
// Check if we need to take forkjoin lock? (no need for serialized
// parallel out of teams construct).
if (nthreads > 1) {
/* determine how many new threads we can use */
__kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
/* AC: If we execute teams from parallel region (on host), then teams
should be created but each can only have 1 thread if nesting is
disabled. If teams called from serial region, then teams and their
threads should be created regardless of the nesting setting. */
nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
nthreads, enter_teams);
if (nthreads == 1) {
// Free lock for single thread execution here; for multi-thread
// execution it will be freed later after team of threads created
// and initialized
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
}
}
}
KMP_DEBUG_ASSERT(nthreads > 0);
// If we temporarily changed the set number of threads then restore it now
master_th->th.th_set_nproc = 0;
if (nthreads == 1) {
return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
invoker, master_th, parent_team,
#if OMPT_SUPPORT
&ompt_parallel_data, &return_address,
&parent_task_data,
#endif
ap);
} // if (nthreads == 1)
// GEH: only modify the executing flag in the case when not serialized
// serialized case is handled in kmpc_serialized_parallel
KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
"curtask=%p, curtask_max_aclevel=%d\n",
parent_team->t.t_active_level, master_th,
master_th->th.th_current_task,
master_th->th.th_current_task->td_icvs.max_active_levels));
// TODO: GEH - cannot do this assertion because root thread not set up as
// executing
// KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
master_th->th.th_current_task->td_flags.executing = 0;
if (!master_th->th.th_teams_microtask || level > teams_level) {
/* Increment our nested depth level */
KMP_ATOMIC_INC(&root->r.r_in_parallel);
}
// See if we need to make a copy of the ICVs.
int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
kmp_nested_nthreads_t *nested_nth = NULL;
if (!master_th->th.th_set_nested_nth &&
(level + 1 < parent_team->t.t_nested_nth->used) &&
(parent_team->t.t_nested_nth->nth[level + 1] != nthreads_icv)) {
nthreads_icv = parent_team->t.t_nested_nth->nth[level + 1];
} else if (master_th->th.th_set_nested_nth) {
nested_nth = __kmp_override_nested_nth(master_th, level);
if ((level + 1 < nested_nth->used) &&
(nested_nth->nth[level + 1] != nthreads_icv))
nthreads_icv = nested_nth->nth[level + 1];
else
nthreads_icv = 0; // don't update
} else {
nthreads_icv = 0; // don't update
}
// Figure out the proc_bind_policy for the new team.
kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
// proc_bind_default means don't update
kmp_proc_bind_t proc_bind_icv = proc_bind_default;
if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
proc_bind = proc_bind_false;
} else {
// No proc_bind clause specified; use current proc-bind-var for this
// parallel region
if (proc_bind == proc_bind_default) {
proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
}
// Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
if (master_th->th.th_teams_microtask &&
microtask == (microtask_t)__kmp_teams_master) {
proc_bind = __kmp_teams_proc_bind;
}
/* else: The proc_bind policy was specified explicitly on parallel clause.
This overrides proc-bind-var for this parallel region, but does not
change proc-bind-var. */
// Figure the value of proc-bind-var for the child threads.
if ((level + 1 < __kmp_nested_proc_bind.used) &&
(__kmp_nested_proc_bind.bind_types[level + 1] !=
master_th->th.th_current_task->td_icvs.proc_bind)) {
// Do not modify the proc bind icv for the two teams construct forks
// They just let the proc bind icv pass through
if (!master_th->th.th_teams_microtask ||
!(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
}
}
// Reset for next parallel region
master_th->th.th_set_proc_bind = proc_bind_default;
if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
kmp_internal_control_t new_icvs;
copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
new_icvs.next = NULL;
if (nthreads_icv > 0) {
new_icvs.nproc = nthreads_icv;
}
if (proc_bind_icv != proc_bind_default) {
new_icvs.proc_bind = proc_bind_icv;
}
/* allocate a new parallel team */
KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
team = __kmp_allocate_team(root, nthreads, nthreads,
#if OMPT_SUPPORT
ompt_parallel_data,
#endif
proc_bind, &new_icvs,
argc USE_NESTED_HOT_ARG(master_th));
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
} else {
/* allocate a new parallel team */
KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
team = __kmp_allocate_team(root, nthreads, nthreads,
#if OMPT_SUPPORT
ompt_parallel_data,
#endif
proc_bind,
&master_th->th.th_current_task->td_icvs,
argc USE_NESTED_HOT_ARG(master_th));
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
&master_th->th.th_current_task->td_icvs);
}
KF_TRACE(
10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
/* setup the new team */
KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
KMP_CHECK_UPDATE(team->t.t_ident, loc);
KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
#if OMPT_SUPPORT
KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
return_address);
#endif
KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
// TODO: parent_team->t.t_level == INT_MAX ???
if (!master_th->th.th_teams_microtask || level > teams_level) {
int new_level = parent_team->t.t_level + 1;
KMP_CHECK_UPDATE(team->t.t_level, new_level);
new_level = parent_team->t.t_active_level + 1;
KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
} else {
// AC: Do not increase parallel level at start of the teams construct
int new_level = parent_team->t.t_level;
KMP_CHECK_UPDATE(team->t.t_level, new_level);
new_level = parent_team->t.t_active_level;
KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
}
kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
// set primary thread's schedule as new run-time schedule
KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
// Check if hot team has potentially outdated list, and if so, free it
if (team->t.t_nested_nth &&
team->t.t_nested_nth != parent_team->t.t_nested_nth) {
KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
KMP_INTERNAL_FREE(team->t.t_nested_nth);
team->t.t_nested_nth = NULL;
}
team->t.t_nested_nth = parent_team->t.t_nested_nth;
if (master_th->th.th_set_nested_nth) {
if (!nested_nth)
nested_nth = __kmp_override_nested_nth(master_th, level);
team->t.t_nested_nth = nested_nth;
KMP_INTERNAL_FREE(master_th->th.th_set_nested_nth);
master_th->th.th_set_nested_nth = NULL;
master_th->th.th_set_nested_nth_sz = 0;
master_th->th.th_nt_strict = false;
}
// Update the floating point rounding in the team if required.
propagateFPControl(team);
#if OMPD_SUPPORT
if (ompd_state & OMPD_ENABLE_BP)
ompd_bp_parallel_begin();
#endif
KA_TRACE(
20,
("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
team->t.t_nproc));
KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
(team->t.t_master_tid == 0 &&
(team->t.t_parent == root->r.r_root_team ||
team->t.t_parent->t.t_serialized)));
KMP_MB();
/* now, setup the arguments */
argv = (void **)team->t.t_argv;
if (ap) {
for (i = argc - 1; i >= 0; --i) {
void *new_argv = va_arg(kmp_va_deref(ap), void *);
KMP_CHECK_UPDATE(*argv, new_argv);
argv++;
}
} else {
for (i = 0; i < argc; ++i) {
// Get args from parent team for teams construct
KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
}
}
/* now actually fork the threads */
KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
root->r.r_active = TRUE;
__kmp_fork_team_threads(root, team, master_th, gtid, !ap);
__kmp_setup_icv_copy(team, nthreads,
&master_th->th.th_current_task->td_icvs, loc);
#if OMPT_SUPPORT
master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
#endif
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
#if USE_ITT_BUILD
if (team->t.t_active_level == 1 // only report frames at level 1
&& !master_th->th.th_teams_microtask) { // not in teams construct
#if USE_ITT_NOTIFY
if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
(__kmp_forkjoin_frames_mode == 3 ||
__kmp_forkjoin_frames_mode == 1)) {
kmp_uint64 tmp_time = 0;
if (__itt_get_timestamp_ptr)
tmp_time = __itt_get_timestamp();
// Internal fork - report frame begin
master_th->th.th_frame_time = tmp_time;
if (__kmp_forkjoin_frames_mode == 3)
team->t.t_region_time = tmp_time;
} else
// only one notification scheme (either "submit" or "forking/joined", not both)
#endif /* USE_ITT_NOTIFY */
if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
__kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
// Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
__kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
}
}
#endif /* USE_ITT_BUILD */
/* now go on and do the work */
KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
KMP_MB();
KF_TRACE(10,
("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
root, team, master_th, gtid));
#if USE_ITT_BUILD
if (__itt_stack_caller_create_ptr) {
// create new stack stitching id before entering fork barrier
if (!enter_teams) {
KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
team->t.t_stack_id = __kmp_itt_stack_caller_create();
} else if (parent_team->t.t_serialized) {
// keep stack stitching id in the serialized parent_team;
// current team will be used for parallel inside the teams;
// if parent_team is active, then it already keeps stack stitching id
// for the league of teams
KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
}
}
#endif /* USE_ITT_BUILD */
// AC: skip __kmp_internal_fork at teams construct, let only primary
// threads execute
if (ap) {
__kmp_internal_fork(loc, gtid, team);
KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
"master_th=%p, gtid=%d\n",
root, team, master_th, gtid));
}
if (call_context == fork_context_gnu) {
KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
return TRUE;
}
/* Invoke microtask for PRIMARY thread */
KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
team->t.t_id, team->t.t_pkfn));
} // END of timer KMP_fork_call block
#if KMP_STATS_ENABLED
// If beginning a teams construct, then change thread state
stats_state_e previous_state = KMP_GET_THREAD_STATE();
if (!ap) {
KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
}
#endif
if (!team->t.t_invoke(gtid)) {
KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
}
#if KMP_STATS_ENABLED
// If was beginning of a teams construct, then reset thread state
if (!ap) {
KMP_SET_THREAD_STATE(previous_state);
}
#endif
KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
team->t.t_id, team->t.t_pkfn));
KMP_MB(); /* Flush all pending memory write invalidates. */
KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
#if OMPT_SUPPORT
if (ompt_enabled.enabled) {
master_th->th.ompt_thread_info.state = ompt_state_overhead;
}
#endif
return TRUE;
}
#if OMPT_SUPPORT
// After leaving a parallel region, reset the thread's OMPT state to reflect
// the team it is returning to: a serialized team reports "work_serial",
// a real (multi-thread) team reports "work_parallel".
static inline void __kmp_join_restore_state(kmp_info_t *thread,
                                            kmp_team_t *team) {
  if (team->t.t_serialized)
    thread->th.ompt_thread_info.state = ompt_state_work_serial;
  else
    thread->th.ompt_thread_info.state = ompt_state_work_parallel;
}
static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
kmp_team_t *team, ompt_data_t *parallel_data,
int flags, void *codeptr) {
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
if (ompt_enabled.ompt_callback_parallel_end) {
ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
parallel_data, &(task_info->task_data), flags, codeptr);
}
task_info->frame.enter_frame = ompt_data_none;
__kmp_join_restore_state(thread, team);
}
#endif
/* Join the current parallel region on the primary thread: run the join
   barrier (unless exiting a teams construct), fire OMPT/ITT/OMPD
   end-of-region notifications, free the team, and restore the primary
   thread's bookkeeping to the parent team.

   loc          -- source location identifier for the join
   gtid         -- global thread id of the calling (primary) thread
   fork_context -- (OMPT builds only) which entry point forked the region;
                   affects GOMP-specific OMPT event sequencing
   exit_teams   -- nonzero when leaving a teams construct (skips the join
                   barrier for internal teams) */
void __kmp_join_call(ident_t *loc, int gtid
#if OMPT_SUPPORT
                     ,
                     enum fork_context_e fork_context
#endif
                     ,
                     int exit_teams) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
  kmp_team_t *team;
  kmp_team_t *parent_team;
  kmp_info_t *master_th;
  kmp_root_t *root;
  int master_active;
  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
  /* setup current data */
  master_th = __kmp_threads[gtid];
  root = master_th->th.th_root;
  team = master_th->th.th_team;
  parent_team = team->t.t_parent;
  master_th->th.th_ident = loc;
#if OMPT_SUPPORT
  void *team_microtask = (void *)team->t.t_pkfn;
  // For GOMP interface with serialized parallel, need the
  // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
  // and end-parallel events.
  if (ompt_enabled.enabled &&
      !(team->t.t_serialized && fork_context == fork_context_gnu)) {
    master_th->th.ompt_thread_info.state = ompt_state_overhead;
  }
#endif
#if KMP_DEBUG
  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
    KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
                  "th_task_team = %p\n",
                  __kmp_gtid_from_thread(master_th), team,
                  team->t.t_task_team[master_th->th.th_task_state],
                  master_th->th.th_task_team));
    KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, master_th);
  }
#endif
  // Serialized parallel region: no real team of workers to join.
  if (team->t.t_serialized) {
    if (master_th->th.th_teams_microtask) {
      // We are in teams construct
      int level = team->t.t_level;
      int tlevel = master_th->th.th_teams_level;
      if (level == tlevel) {
        // AC: we haven't incremented it earlier at start of teams construct,
        // so do it here - at the end of teams construct
        team->t.t_level++;
      } else if (level == tlevel + 1) {
        // AC: we are exiting parallel inside teams, need to increment
        // serialization in order to restore it in the next call to
        // __kmpc_end_serialized_parallel
        team->t.t_serialized++;
      }
    }
    __kmpc_end_serialized_parallel(loc, gtid);
#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      if (fork_context == fork_context_gnu) {
        __ompt_lw_taskteam_unlink(master_th);
      }
      __kmp_join_restore_state(master_th, parent_team);
    }
#endif
    return;
  }
  // Remember whether the root was marked active before this region; it is
  // restored after the team is freed (see root->r.r_active below).
  master_active = team->t.t_master_active;
  if (!exit_teams) {
    // AC: No barrier for internal teams at exit from teams construct.
    // But there is barrier for external team (league).
    __kmp_internal_join(loc, gtid, team);
#if USE_ITT_BUILD
    if (__itt_stack_caller_create_ptr) {
      KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
      // destroy the stack stitching id after join barrier
      __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
      team->t.t_stack_id = NULL;
    }
#endif
  } else {
    master_th->th.th_task_state =
        0; // AC: no tasking in teams (out of any parallel)
#if USE_ITT_BUILD
    if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
      KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
      // destroy the stack stitching id on exit from the teams construct
      // if parent_team is active, then the id will be destroyed later on
      // by master of the league of teams
      __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
      parent_team->t.t_stack_id = NULL;
    }
#endif
  }
  KMP_MB();
#if OMPT_SUPPORT
  // Capture OMPT data before the team structure may be recycled below.
  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
  void *codeptr = team->t.ompt_team_info.master_return_address;
#endif
#if USE_ITT_BUILD
  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
  if (team->t.t_active_level == 1 &&
      (!master_th->th.th_teams_microtask || /* not in teams construct */
       master_th->th.th_teams_size.nteams == 1)) {
    master_th->th.th_ident = loc;
    // only one notification scheme (either "submit" or "forking/joined", not
    // both)
    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
        __kmp_forkjoin_frames_mode == 3)
      __kmp_itt_frame_submit(gtid, team->t.t_region_time,
                             master_th->th.th_frame_time, 0, loc,
                             master_th->th.th_team_nproc, 1);
    else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
             !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
      __kmp_itt_region_joined(gtid);
  } // active_level == 1
#endif /* USE_ITT_BUILD */
#if KMP_AFFINITY_SUPPORTED
  if (!exit_teams) {
    // Restore master thread's partition.
    master_th->th.th_first_place = team->t.t_first_place;
    master_th->th.th_last_place = team->t.t_last_place;
  }
#endif // KMP_AFFINITY_SUPPORTED
  if (master_th->th.th_teams_microtask && !exit_teams &&
      team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
      team->t.t_level == master_th->th.th_teams_level + 1) {
  // AC: We need to leave the team structure intact at the end of parallel
  // inside the teams construct, so that at the next parallel same (hot) team
  // works, only adjust nesting levels
#if OMPT_SUPPORT
    ompt_data_t ompt_parallel_data = ompt_data_none;
    if (ompt_enabled.enabled) {
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
      if (ompt_enabled.ompt_callback_implicit_task) {
        int ompt_team_size = team->t.t_nproc;
        ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
            ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
            OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
      }
      task_info->frame.exit_frame = ompt_data_none;
      task_info->task_data = ompt_data_none;
      ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
      __ompt_lw_taskteam_unlink(master_th);
    }
#endif
    /* Decrement our nested depth level */
    team->t.t_level--;
    team->t.t_active_level--;
    KMP_ATOMIC_DEC(&root->r.r_in_parallel);
    // Restore number of threads in the team if needed. This code relies on
    // the proper adjustment of th_teams_size.nth after the fork in
    // __kmp_teams_master on each teams primary thread in the case that
    // __kmp_reserve_threads reduced it.
    if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
      int old_num = master_th->th.th_team_nproc;
      int new_num = master_th->th.th_teams_size.nth;
      kmp_info_t **other_threads = team->t.t_threads;
      team->t.t_nproc = new_num;
      for (int i = 0; i < old_num; ++i) {
        other_threads[i]->th.th_team_nproc = new_num;
      }
      // Adjust states of non-used threads of the team
      for (int i = old_num; i < new_num; ++i) {
        // Re-initialize thread's barrier data.
        KMP_DEBUG_ASSERT(other_threads[i]);
        kmp_balign_t *balign = other_threads[i]->th.th_bar;
        for (int b = 0; b < bs_last_barrier; ++b) {
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }
        if (__kmp_tasking_mode != tskm_immediate_exec) {
          // Synchronize thread's task state
          other_threads[i]->th.th_task_state = master_th->th.th_task_state;
        }
      }
    }
#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
                      OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
    }
#endif
    return;
  }
  /* do cleanup and restore the parent team */
  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
  /* jc: The following lock has instructions with REL and ACQ semantics,
     separating the parallel user code called in this parallel region
     from the serial user code called after this function returns. */
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
  if (!master_th->th.th_teams_microtask ||
      team->t.t_level > master_th->th.th_teams_level) {
    /* Decrement our nested depth level */
    KMP_ATOMIC_DEC(&root->r.r_in_parallel);
  }
  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    if (ompt_enabled.ompt_callback_implicit_task) {
      int flags = (team_microtask == (void *)__kmp_teams_master)
                      ? ompt_task_initial
                      : ompt_task_implicit;
      int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
          OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
    }
    task_info->frame.exit_frame = ompt_data_none;
    task_info->task_data = ompt_data_none;
  }
#endif
  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
                master_th, team));
  __kmp_pop_current_task_from_thread(master_th);
  master_th->th.th_def_allocator = team->t.t_def_allocator;
#if OMPD_SUPPORT
  if (ompd_state & OMPD_ENABLE_BP)
    ompd_bp_parallel_end();
#endif
  updateHWFPControl(team);
  // Restore the root's pre-region active state; write only if it changed.
  if (root->r.r_active != master_active)
    root->r.r_active = master_active;
  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
                            master_th)); // this will free worker threads
  /* this race was fun to find. make sure the following is in the critical
     region otherwise assertions may fail occasionally since the old team may be
     reallocated and the hierarchy appears inconsistent. it is actually safe to
     run and won't cause any bugs, but will cause those assertion failures. it's
     only one deref&assign so might as well put this in the critical region */
  master_th->th.th_team = parent_team;
  master_th->th.th_team_nproc = parent_team->t.t_nproc;
  master_th->th.th_team_master = parent_team->t.t_threads[0];
  master_th->th.th_team_serialized = parent_team->t.t_serialized;
  /* restore serialized team, if need be */
  if (parent_team->t.t_serialized &&
      parent_team != master_th->th.th_serial_team &&
      parent_team != root->r.r_root_team) {
    __kmp_free_team(root,
                    master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
    master_th->th.th_serial_team = parent_team;
  }
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    // Restore primary thread's task state from team structure
    KMP_DEBUG_ASSERT(team->t.t_primary_task_state == 0 ||
                     team->t.t_primary_task_state == 1);
    master_th->th.th_task_state = (kmp_uint8)team->t.t_primary_task_state;
    // Copy the task team from the parent team to the primary thread
    master_th->th.th_task_team =
        parent_team->t.t_task_team[master_th->th.th_task_state];
    KA_TRACE(20,
             ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
              __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
              parent_team));
  }
  // TODO: GEH - cannot do this assertion because root thread not set up as
  // executing
  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
  master_th->th.th_current_task->td_flags.executing = 1;
  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
#if KMP_AFFINITY_SUPPORTED
  if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
    __kmp_reset_root_init_mask(gtid);
  }
#endif
#if OMPT_SUPPORT
  int flags =
      OMPT_INVOKER(fork_context) |
      ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
                                                      : ompt_parallel_team);
  if (ompt_enabled.enabled) {
    __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
                    codeptr);
  }
#endif
  KMP_MB();
  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
}
/* Check whether we should push an internal control record onto the
serial team stack. If so, do it. */
void __kmp_save_internal_controls(kmp_info_t *thread) {
  kmp_team_t *serial_team = thread->th.th_serial_team;
  // Only applies while the thread is running on its serial team.
  if (thread->th.th_team != serial_team)
    return;
  // Nothing to save at the outermost serialized level.
  if (serial_team->t.t_serialized <= 1)
    return;
  // Push at most one record per serialization level: skip if the top of the
  // stack already belongs to the current level.
  kmp_internal_control_t *top = serial_team->t.t_control_stack_top;
  if (top != NULL && top->serial_nesting_level == serial_team->t.t_serialized)
    return;
  /* push a record on the serial team's stack */
  kmp_internal_control_t *control = (kmp_internal_control_t *)__kmp_allocate(
      sizeof(kmp_internal_control_t));
  copy_icvs(control, &thread->th.th_current_task->td_icvs);
  control->serial_nesting_level = serial_team->t.t_serialized;
  control->next = serial_team->t.t_control_stack_top;
  serial_team->t.t_control_stack_top = control;
}
/* Changes set_nproc */
void __kmp_set_num_threads(int new_nth, int gtid) {
kmp_info_t *thread;
kmp_root_t *root;
KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
KMP_DEBUG_ASSERT(__kmp_init_serial);
if (new_nth < 1)
new_nth = 1;
else if (new_nth > __kmp_max_nth)
new_nth = __kmp_max_nth;
KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
thread = __kmp_threads[gtid];
if (thread->th.th_current_task->td_icvs.nproc == new_nth)
return; // nothing to do
__kmp_save_internal_controls(thread);
set__nproc(thread, new_nth);
// If this omp_set_num_threads() call will cause the hot team size to be
// reduced (in the absence of a num_threads clause), then reduce it now,
// rather than waiting for the next parallel region.
root = thread->th.th_root;
if (__kmp_init_parallel && (!root->r.r_active) &&
(root->r.r_hot_team->t.t_nproc > new_nth)
#if KMP_NESTED_HOT_TEAMS
&& __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
#endif
) {
kmp_team_t *hot_team = root->r.r_hot_team;
int f;
__kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
__kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
}
// Release the extra threads we don't need any more.
for (f = new_nth; f < hot_team->t.t_nproc; f++) {
KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
if (__kmp_tasking_mode != tskm_immediate_exec) {
// When decreasing team size, threads no longer in the team should unref
// task team.
hot_team->t.t_threads[f]->th.th_task_team = NULL;
}
__kmp_free_thread(hot_team->t.t_threads[f]);
hot_team->t.t_threads[f] = NULL;
}
hot_team->t.t_nproc = new_nth;
#if KMP_NESTED_HOT_TEAMS
if (thread->th.th_hot_teams) {
KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
thread->th.th_hot_teams[0].hot_team_nth = new_nth;
}
#endif
if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
hot_team->t.b->update_num_threads(new_nth);
__kmp_add_threads_to_team(hot_team, new_nth);
}
__kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
// Update the t_nproc field in the threads that are still active.
for (f = 0; f < new_nth; f++) {
KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
}
// Special flag in case omp_set_num_threads() call
hot_team->t.t_size_changed = -1;
}