/*
* kmp_runtime.c -- KPTS runtime support library
* $Revision: 43473 $
* $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $
*/
//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
#include "kmp.h"
#include "kmp_atomic.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_environment.h"
#include "kmp_itt.h"
#include "kmp_str.h"
#include "kmp_settings.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_error.h"
#include "kmp_stats.h"
#include "kmp_wait_release.h"
/* these are temporary issues to be dealt with */
#define KMP_USE_PRCTL 0
#define KMP_USE_POOLED_ALLOC 0
#if KMP_OS_WINDOWS
#include <process.h>
#endif
#if defined(KMP_GOMP_COMPAT)
char const __kmp_version_alt_comp[] = KMP_VERSION_PREFIX "alternative compiler support: yes";
#endif /* defined(KMP_GOMP_COMPAT) */
char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
#if OMP_40_ENABLED
"4.0 (201307)";
#else
"3.1 (201107)";
#endif
#ifdef KMP_DEBUG
char const __kmp_version_lock[] = KMP_VERSION_PREFIX "lock type: run time selectable";
#endif /* KMP_DEBUG */
#define KMP_MIN( x, y ) ( (x) < (y) ? (x) : (y) )
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
kmp_info_t __kmp_monitor;
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
/* Forward declarations */
void __kmp_cleanup( void );
static void __kmp_initialize_info( kmp_info_t *, kmp_team_t *, int tid, int gtid );
static void __kmp_initialize_team( kmp_team_t * team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t * loc );
static void __kmp_partition_places( kmp_team_t *team );
static void __kmp_do_serial_initialize( void );
void __kmp_fork_barrier( int gtid, int tid );
void __kmp_join_barrier( int gtid );
void __kmp_setup_icv_copy( kmp_team_t *team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t *loc );
#ifdef USE_LOAD_BALANCE
static int __kmp_load_balance_nproc( kmp_root_t * root, int set_nproc );
#endif
static int __kmp_expand_threads(int nWish, int nNeed);
static int __kmp_unregister_root_other_thread( int gtid );
static void __kmp_unregister_library( void ); // called by __kmp_internal_end()
static void __kmp_reap_thread( kmp_info_t * thread, int is_root );
static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique */
/* identifier of executing thread. */
/* returns KMP_GTID_DNE if we haven't been assigned a gtid */
int
__kmp_get_global_thread_id( )
{
int i;
kmp_info_t **other_threads;
size_t stack_data;
char *stack_addr;
size_t stack_size;
char *stack_base;
KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
__kmp_nth, __kmp_all_nth ));
/* JPH - to handle the case where __kmpc_end(0) is called immediately prior to a
parallel region, made it return KMP_GTID_DNE to force serial_initialize by
caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
__kmp_init_gtid for this to work. */
if ( !TCR_4(__kmp_init_gtid) ) return KMP_GTID_DNE;
#ifdef KMP_TDATA_GTID
if ( TCR_4(__kmp_gtid_mode) >= 3) {
KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using TDATA\n" ));
return __kmp_gtid;
}
#endif
if ( TCR_4(__kmp_gtid_mode) >= 2) {
KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using keyed TLS\n" ));
return __kmp_gtid_get_specific();
}
KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using internal alg.\n" ));
stack_addr = (char*) & stack_data;
other_threads = __kmp_threads;
/*
ATT: The code below is a source of potential bugs due to unsynchronized access to
__kmp_threads array. For example:
1. Current thread loads other_threads[i] into thr and checks that it is non-NULL.
2. Current thread is suspended by OS.
3. Another thread unregisters and finishes (debug versions of free() may fill memory
with something like 0xEF).
4. Current thread is resumed.
5. Current thread reads junk from *thr.
TODO: Fix it.
--ln
*/
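/*
   Stack-window membership test used in the loop below: each registered thread
   records its stack base (the highest address, since the stack grows down) and
   its stack size. The address of the local variable 'stack_data' therefore
   belongs to thread i's stack iff 0 <= stack_base - stack_addr <= stack_size.
   E.g. with stack_base = 0x7f0000010000 and stack_size = 0x10000, any address
   in [0x7f0000000000, 0x7f0000010000] is attributed to that thread.
*/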
for( i = 0 ; i < __kmp_threads_capacity ; i++ ) {
kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
if( !thr ) continue;
stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
/* stack grows down -- search through all of the active threads */
if( stack_addr <= stack_base ) {
size_t stack_diff = stack_base - stack_addr;
if( stack_diff <= stack_size ) {
/* The only way we can be closer than the allocated */
/* stack size is if we are running on this thread. */
KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == i );
return i;
}
}
}
/* get specific to try and determine our gtid */
KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: internal alg. failed to find "
"thread, using TLS\n" ));
i = __kmp_gtid_get_specific();
/*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
/* if we haven't been assigned a gtid, then return the error code */
if( i<0 ) return i;
/* dynamically updated stack window for uber threads to avoid get_specific call */
if( ! TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow) ) {
KMP_FATAL( StackOverflow, i );
}
stack_base = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
if( stack_addr > stack_base ) {
TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - stack_base);
} else {
TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, stack_base - stack_addr);
}
/* Reprint stack bounds for ubermaster since they have been refined */
if ( __kmp_storage_map ) {
char *stack_end = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
__kmp_print_storage_map_gtid( i, stack_beg, stack_end,
other_threads[i]->th.th_info.ds.ds_stacksize,
"th_%d stack (refinement)", i );
}
return i;
}
int
__kmp_get_global_thread_id_reg( )
{
int gtid;
if ( !__kmp_init_serial ) {
gtid = KMP_GTID_DNE;
} else
#ifdef KMP_TDATA_GTID
if ( TCR_4(__kmp_gtid_mode) >= 3 ) {
KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using TDATA\n" ));
gtid = __kmp_gtid;
} else
#endif
if ( TCR_4(__kmp_gtid_mode) >= 2 ) {
KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using keyed TLS\n" ));
gtid = __kmp_gtid_get_specific();
} else {
KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using internal alg.\n" ));
gtid = __kmp_get_global_thread_id();
}
/* we must be a new uber master sibling thread */
if( gtid == KMP_GTID_DNE ) {
KA_TRACE( 10, ( "__kmp_get_global_thread_id_reg: Encountered new root thread. "
"Registering a new gtid.\n" ));
__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
if( !__kmp_init_serial ) {
__kmp_do_serial_initialize();
gtid = __kmp_gtid_get_specific();
} else {
gtid = __kmp_register_root(FALSE);
}
__kmp_release_bootstrap_lock( &__kmp_initz_lock );
/*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
}
KMP_DEBUG_ASSERT( gtid >=0 );
return gtid;
}
/* caller must hold forkjoin_lock */
void
__kmp_check_stack_overlap( kmp_info_t *th )
{
int f;
char *stack_beg = NULL;
char *stack_end = NULL;
int gtid;
KA_TRACE(10,("__kmp_check_stack_overlap: called\n"));
if ( __kmp_storage_map ) {
stack_end = (char *) th->th.th_info.ds.ds_stackbase;
stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
gtid = __kmp_gtid_from_thread( th );
if (gtid == KMP_GTID_MONITOR) {
__kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
"th_%s stack (%s)", "mon",
( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
} else {
__kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
"th_%d stack (%s)", gtid,
( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
}
}
/* No point in checking ubermaster threads since they use refinement and cannot overlap */
if ( __kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid = __kmp_gtid_from_thread( th )))
{
KA_TRACE(10,("__kmp_check_stack_overlap: performing extensive checking\n"));
if ( stack_beg == NULL ) {
stack_end = (char *) th->th.th_info.ds.ds_stackbase;
stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
}
for( f=0 ; f < __kmp_threads_capacity ; f++ ) {
kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
if( f_th && f_th != th ) {
char *other_stack_end = (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
char *other_stack_beg = other_stack_end -
(size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
if((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
(stack_end > other_stack_beg && stack_end < other_stack_end)) {
/* Print the other stack values before the abort */
if ( __kmp_storage_map )
__kmp_print_storage_map_gtid( -1, other_stack_beg, other_stack_end,
(size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
"th_%d stack (overlapped)",
__kmp_gtid_from_thread( f_th ) );
__kmp_msg( kmp_ms_fatal, KMP_MSG( StackOverlap ), KMP_HNT( ChangeStackLimit ), __kmp_msg_null );
}
}
}
}
KA_TRACE(10,("__kmp_check_stack_overlap: returning\n"));
}
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
void
__kmp_infinite_loop( void )
{
static int done = FALSE;
while (! done) {
KMP_YIELD( 1 );
}
}
#define MAX_MESSAGE 512
void
__kmp_print_storage_map_gtid( int gtid, void *p1, void *p2, size_t size, char const *format, ...) {
char buffer[MAX_MESSAGE];
int node;
va_list ap;
va_start( ap, format);
sprintf( buffer, "OMP storage map: %p %p%8lu %s\n", p1, p2, (unsigned long) size, format );
__kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
__kmp_vprintf( kmp_err, buffer, ap );
#if KMP_PRINT_DATA_PLACEMENT
if(gtid >= 0) {
if(p1 <= p2 && (char*)p2 - (char*)p1 == size) {
if( __kmp_storage_map_verbose ) {
node = __kmp_get_host_node(p1);
if(node < 0) /* doesn't work, so don't try this next time */
__kmp_storage_map_verbose = FALSE;
else {
char *last;
int lastNode;
int localProc = __kmp_get_cpu_from_gtid(gtid);
p1 = (void *)( (size_t)p1 & ~((size_t)PAGE_SIZE - 1) );
p2 = (void *)( ((size_t) p2 - 1) & ~((size_t)PAGE_SIZE - 1) );
if(localProc >= 0)
__kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, localProc>>1);
else
__kmp_printf_no_lock(" GTID %d\n", gtid);
# if KMP_USE_PRCTL
/* The more elaborate format is disabled for now because of the prctl hanging bug. */
do {
last = p1;
lastNode = node;
/* This loop collates adjacent pages with the same host node. */
do {
p1 = (char*)p1 + PAGE_SIZE;
} while(p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
__kmp_printf_no_lock(" %p-%p memNode %d\n", last,
(char*)p1 - 1, lastNode);
} while(p1 <= p2);
# else
__kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
(char*)p1 + (PAGE_SIZE - 1), __kmp_get_host_node(p1));
if(p1 < p2) {
__kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
(char*)p2 + (PAGE_SIZE - 1), __kmp_get_host_node(p2));
}
# endif
}
}
} else
__kmp_printf_no_lock(" %s\n", KMP_I18N_STR( StorageMapWarning ) );
}
#endif /* KMP_PRINT_DATA_PLACEMENT */
__kmp_release_bootstrap_lock( & __kmp_stdio_lock );
}
void
__kmp_warn( char const * format, ... )
{
char buffer[MAX_MESSAGE];
va_list ap;
if ( __kmp_generate_warnings == kmp_warnings_off ) {
return;
}
va_start( ap, format );
snprintf( buffer, sizeof(buffer) , "OMP warning: %s\n", format );
__kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
__kmp_vprintf( kmp_err, buffer, ap );
__kmp_release_bootstrap_lock( & __kmp_stdio_lock );
va_end( ap );
}
void
__kmp_abort_process()
{
// Later threads may stall here, but that's ok because abort() will kill them.
__kmp_acquire_bootstrap_lock( & __kmp_exit_lock );
if ( __kmp_debug_buf ) {
__kmp_dump_debug_buffer();
}; // if
if ( KMP_OS_WINDOWS ) {
// Let other threads know of abnormal termination and prevent deadlock
// if abort happened during library initialization or shutdown
__kmp_global.g.g_abort = SIGABRT;
/*
On Windows* OS, abort() by default causes a pop-up error box, which stalls nightly testing.
Unfortunately, we cannot reliably suppress pop-up error boxes. _set_abort_behavior()
works well, but that function is not available in VS7 (this is not a problem for the DLL, but
it is a problem for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility) does
not help, at least in some versions of the MS C RTL.
It seems the following sequence is the only way to simulate abort() and avoid the pop-up error
box.
*/
raise( SIGABRT );
_exit( 3 ); // Just in case, if signal ignored, exit anyway.
} else {
abort();
}; // if
__kmp_infinite_loop();
__kmp_release_bootstrap_lock( & __kmp_exit_lock );
} // __kmp_abort_process
void
__kmp_abort_thread( void )
{
// TODO: Eliminate g_abort global variable and this function.
// In case of abort just call abort(), it will kill all the threads.
__kmp_infinite_loop();
} // __kmp_abort_thread
/* ------------------------------------------------------------------------ */
/*
* Print out the storage map for the major kmp_info_t thread data structures
* that are allocated together.
*/
static void
__kmp_print_thread_storage_map( kmp_info_t *thr, int gtid )
{
__kmp_print_storage_map_gtid( gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", gtid );
__kmp_print_storage_map_gtid( gtid, &thr->th.th_info, &thr->th.th_team, sizeof(kmp_desc_t),
"th_%d.th_info", gtid );
__kmp_print_storage_map_gtid( gtid, &thr->th.th_local, &thr->th.th_pri_head, sizeof(kmp_local_t),
"th_%d.th_local", gtid );
__kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid );
__kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_plain_barrier],
&thr->th.th_bar[bs_plain_barrier+1],
sizeof(kmp_balign_t), "th_%d.th_bar[plain]", gtid);
__kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_forkjoin_barrier],
&thr->th.th_bar[bs_forkjoin_barrier+1],
sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", gtid);
#if KMP_FAST_REDUCTION_BARRIER
__kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_reduction_barrier],
&thr->th.th_bar[bs_reduction_barrier+1],
sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", gtid);
#endif // KMP_FAST_REDUCTION_BARRIER
}
/*
* Print out the storage map for the major kmp_team_t team data structures
* that are allocated together.
*/
static void
__kmp_print_team_storage_map( const char *header, kmp_team_t *team, int team_id, int num_thr )
{
int num_disp_buff = team->t.t_max_nproc > 1 ? KMP_MAX_DISP_BUF : 2;
__kmp_print_storage_map_gtid( -1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
header, team_id );
__kmp_print_storage_map_gtid( -1, &team->t.t_bar[0], &team->t.t_bar[bs_last_barrier],
sizeof(kmp_balign_team_t) * bs_last_barrier, "%s_%d.t_bar", header, team_id );
__kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_plain_barrier], &team->t.t_bar[bs_plain_barrier+1],
sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", header, team_id );
__kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_forkjoin_barrier], &team->t.t_bar[bs_forkjoin_barrier+1],
sizeof(kmp_balign_team_t), "%s_%d.t_bar[forkjoin]", header, team_id );
#if KMP_FAST_REDUCTION_BARRIER
__kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_reduction_barrier], &team->t.t_bar[bs_reduction_barrier+1],
sizeof(kmp_balign_team_t), "%s_%d.t_bar[reduction]", header, team_id );
#endif // KMP_FAST_REDUCTION_BARRIER
__kmp_print_storage_map_gtid( -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id );
__kmp_print_storage_map_gtid( -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id );
__kmp_print_storage_map_gtid( -1, &team->t.t_disp_buffer[0], &team->t.t_disp_buffer[num_disp_buff],
sizeof(dispatch_shared_info_t) * num_disp_buff, "%s_%d.t_disp_buffer",
header, team_id );
/*
__kmp_print_storage_map_gtid( -1, &team->t.t_set_nproc[0], &team->t.t_set_nproc[num_thr],
sizeof(int) * num_thr, "%s_%d.t_set_nproc", header, team_id );
__kmp_print_storage_map_gtid( -1, &team->t.t_set_dynamic[0], &team->t.t_set_dynamic[num_thr],
sizeof(int) * num_thr, "%s_%d.t_set_dynamic", header, team_id );
__kmp_print_storage_map_gtid( -1, &team->t.t_set_nested[0], &team->t.t_set_nested[num_thr],
sizeof(int) * num_thr, "%s_%d.t_set_nested", header, team_id );
__kmp_print_storage_map_gtid( -1, &team->t.t_set_blocktime[0], &team->t.t_set_blocktime[num_thr],
sizeof(int) * num_thr, "%s_%d.t_set_nproc", header, team_id );
__kmp_print_storage_map_gtid( -1, &team->t.t_set_bt_intervals[0], &team->t.t_set_bt_intervals[num_thr],
sizeof(int) * num_thr, "%s_%d.t_set_dynamic", header, team_id );
__kmp_print_storage_map_gtid( -1, &team->t.t_set_bt_set[0], &team->t.t_set_bt_set[num_thr],
sizeof(int) * num_thr, "%s_%d.t_set_nested", header, team_id );
//__kmp_print_storage_map_gtid( -1, &team->t.t_set_max_active_levels[0], &team->t.t_set_max_active_levels[num_thr],
// sizeof(int) * num_thr, "%s_%d.t_set_max_active_levels", header, team_id );
__kmp_print_storage_map_gtid( -1, &team->t.t_set_sched[0], &team->t.t_set_sched[num_thr],
sizeof(kmp_r_sched_t) * num_thr, "%s_%d.t_set_sched", header, team_id );
#if OMP_40_ENABLED
__kmp_print_storage_map_gtid( -1, &team->t.t_set_proc_bind[0], &team->t.t_set_proc_bind[num_thr],
sizeof(kmp_proc_bind_t) * num_thr, "%s_%d.t_set_proc_bind", header, team_id );
#endif
*/
__kmp_print_storage_map_gtid( -1, &team->t.t_taskq, &team->t.t_copypriv_data,
sizeof(kmp_taskq_t), "%s_%d.t_taskq", header, team_id );
}
static void __kmp_init_allocator() {}
static void __kmp_fini_allocator() {}
static void __kmp_fini_allocator_thread() {}
/* ------------------------------------------------------------------------ */
#ifdef GUIDEDLL_EXPORTS
# if KMP_OS_WINDOWS
static void
__kmp_reset_lock( kmp_bootstrap_lock_t* lck ) {
// TODO: Change to __kmp_break_bootstrap_lock().
__kmp_init_bootstrap_lock( lck ); // make the lock released
}
static void
__kmp_reset_locks_on_process_detach( int gtid_req ) {
int i;
int thread_count;
// PROCESS_DETACH is expected to be called by a thread
// that executes ProcessExit() or FreeLibrary().
// OS terminates other threads (except the one calling ProcessExit or FreeLibrary).
// So it might be safe to access __kmp_threads[] without taking the forkjoin_lock.
// However, in fact some threads may still be alive here, although they are about to be terminated.
// The threads in the array with ds_thread==0 are the most suspicious.
// So it may actually be unsafe to access __kmp_threads[].
// TODO: does it make sense to check __kmp_roots[] ?
// Let's check that there are no other alive threads registered with the OMP lib.
while( 1 ) {
thread_count = 0;
for( i = 0; i < __kmp_threads_capacity; ++i ) {
if( !__kmp_threads ) continue;
kmp_info_t* th = __kmp_threads[ i ];
if( th == NULL ) continue;
int gtid = th->th.th_info.ds.ds_gtid;
if( gtid == gtid_req ) continue;
if( gtid < 0 ) continue;
DWORD exit_val;
int alive = __kmp_is_thread_alive( th, &exit_val );
if( alive ) {
++thread_count;
}
}
if( thread_count == 0 ) break; // success
}
// Assume that I'm alone.
// Now it is probably safe to check and reset the locks.
// __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
__kmp_reset_lock( &__kmp_forkjoin_lock );
#ifdef KMP_DEBUG
__kmp_reset_lock( &__kmp_stdio_lock );
#endif // KMP_DEBUG
}
BOOL WINAPI
DllMain( HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved ) {
//__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
switch( fdwReason ) {
case DLL_PROCESS_ATTACH:
KA_TRACE( 10, ("DllMain: PROCESS_ATTACH\n" ));
return TRUE;
case DLL_PROCESS_DETACH:
KA_TRACE( 10, ("DllMain: PROCESS_DETACH T#%d\n",
__kmp_gtid_get_specific() ));
if( lpReserved != NULL )
{
// lpReserved is used for telling the difference:
// lpReserved == NULL when FreeLibrary() was called,
// lpReserved != NULL when the process terminates.
// When FreeLibrary() is called, worker threads remain alive.
// So they will release the forkjoin lock by themselves.
// When the process terminates, worker threads disappear triggering
// the problem of unreleased forkjoin lock as described below.
// A worker thread can take the forkjoin lock
// in __kmp_suspend_template()->__kmp_rml_decrease_load_before_sleep().
// The problem comes up if that worker thread becomes dead
// before it releases the forkjoin lock.
// The forkjoin lock remains taken, while the thread
// executing DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below
// will try to take the forkjoin lock and will always fail,
// so that the application will never finish [normally].
// This scenario is possible if __kmpc_end() has not been executed.
// This is not a corner case; common scenarios include:
// - the main function was compiled by an alternative compiler;
// - the main function was compiled by icl but without /Qopenmp (application with plugins);
// - application terminates by calling C exit(), Fortran CALL EXIT() or Fortran STOP.
// - alive foreign thread prevented __kmpc_end from doing cleanup.
// This is a hack to work around the problem.
// TODO: !!! to figure out something better.
__kmp_reset_locks_on_process_detach( __kmp_gtid_get_specific() );
}
__kmp_internal_end_library( __kmp_gtid_get_specific() );
return TRUE;
case DLL_THREAD_ATTACH:
KA_TRACE( 10, ("DllMain: THREAD_ATTACH\n" ));
/* if we wanted to register new siblings all the time here call
* __kmp_get_gtid(); */
return TRUE;
case DLL_THREAD_DETACH:
KA_TRACE( 10, ("DllMain: THREAD_DETACH T#%d\n",
__kmp_gtid_get_specific() ));
__kmp_internal_end_thread( __kmp_gtid_get_specific() );
return TRUE;
}
return TRUE;
}
# endif /* KMP_OS_WINDOWS */
#endif /* GUIDEDLL_EXPORTS */
/* ------------------------------------------------------------------------ */
/* Change the library type to "status" and return the old type */
/* called from within initialization routines where __kmp_initz_lock is held */
int
__kmp_change_library( int status )
{
int old_status;
old_status = __kmp_yield_init & 1; // check whether KMP_LIBRARY=throughput (even init count)
if (status) {
__kmp_yield_init |= 1; // throughput => turnaround (odd init count)
}
else {
__kmp_yield_init &= ~1; // turnaround => throughput (even init count)
}
return old_status; // return previous setting of whether KMP_LIBRARY=throughput
}
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
/* __kmp_parallel_deo --
* Wait until it's our turn.
*/
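/*
 * Ordered sections are serialized by a token kept in team->t.t_ordered.dt.t_value:
 * __kmp_parallel_deo() spins via KMP_WAIT_YIELD until the token equals the caller's
 * team-local tid, and __kmp_parallel_dxo() passes the token on by storing
 * (tid + 1) % t_nproc. With 4 threads the token cycles 0 -> 1 -> 2 -> 3 -> 0.
 */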
void
__kmp_parallel_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
kmp_team_t *team = __kmp_team_from_gtid( gtid );
#endif /* BUILD_PARALLEL_ORDERED */
if( __kmp_env_consistency_check ) {
if( __kmp_threads[gtid]->th.th_root->r.r_active )
__kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL );
}
#ifdef BUILD_PARALLEL_ORDERED
if( !team->t.t_serialized ) {
kmp_uint32 spins;
KMP_MB();
KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid( gtid ), KMP_EQ, NULL);
KMP_MB();
}
#endif /* BUILD_PARALLEL_ORDERED */
}
/* __kmp_parallel_dxo --
* Signal the next task.
*/
void
__kmp_parallel_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
int tid = __kmp_tid_from_gtid( gtid );
kmp_team_t *team = __kmp_team_from_gtid( gtid );
#endif /* BUILD_PARALLEL_ORDERED */
if( __kmp_env_consistency_check ) {
if( __kmp_threads[gtid]->th.th_root->r.r_active )
__kmp_pop_sync( gtid, ct_ordered_in_parallel, loc_ref );
}
#ifdef BUILD_PARALLEL_ORDERED
if ( ! team->t.t_serialized ) {
KMP_MB(); /* Flush all pending memory write invalidates. */
/* use the tid of the next thread in this team */
/* TODO: replace with general release procedure */
team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc );
KMP_MB(); /* Flush all pending memory write invalidates. */
}
#endif /* BUILD_PARALLEL_ORDERED */
}
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
/* The BARRIER for a SINGLE process section is always explicit */
int
__kmp_enter_single( int gtid, ident_t *id_ref, int push_ws )
{
int status;
kmp_info_t *th;
kmp_team_t *team;
if( ! TCR_4(__kmp_init_parallel) )
__kmp_parallel_initialize();
th = __kmp_threads[ gtid ];
team = th->th.th_team;
status = 0;
th->th.th_ident = id_ref;
if ( team->t.t_serialized ) {
status = 1;
} else {
kmp_int32 old_this = th->th.th_local.this_construct;
++th->th.th_local.this_construct;
/* try to set team count to thread count--success means thread got the
single block
*/
/* TODO: Should this be acquire or release? */
status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
th->th.th_local.this_construct);
}
if( __kmp_env_consistency_check ) {
if (status && push_ws) {
__kmp_push_workshare( gtid, ct_psingle, id_ref );
} else {
__kmp_check_workshare( gtid, ct_psingle, id_ref );
}
}
#if USE_ITT_BUILD
if ( status ) {
__kmp_itt_single_start( gtid );
}
if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) {
__kmp_itt_metadata_single();
}
#endif /* USE_ITT_BUILD */
return status;
}
void
__kmp_exit_single( int gtid )
{
#if USE_ITT_BUILD
__kmp_itt_single_end( gtid );
#endif /* USE_ITT_BUILD */
if( __kmp_env_consistency_check )
__kmp_pop_workshare( gtid, ct_psingle, NULL );
}
/*
* determine if we can go parallel or must use a serialized parallel region and
* how many threads we can use
* set_nproc is the number of threads requested for the team
* returns 0 if we should serialize or only use one thread,
* otherwise the number of threads to use
* The forkjoin lock is held by the caller.
*/
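/*
 * The adjustment pipeline below: (1) serialize immediately if only one thread was
 * requested, if nesting is disabled while already inside a parallel region, or if
 * KMP_LIBRARY=serial; (2) if dyn-var is set, shrink the request according to
 * dynamic_mode (load balance, thread limit, or random); (3) cap the result so the
 * total thread count respects KMP_ALL_THREADS / OMP_THREAD_LIMIT; (4) cap it again
 * by the capacity of the __kmp_threads array after attempting to expand it.
 */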
static int
__kmp_reserve_threads( kmp_root_t *root, kmp_team_t *parent_team,
int master_tid, int set_nthreads
#if OMP_40_ENABLED
, int enter_teams
#endif /* OMP_40_ENABLED */
)
{
int capacity;
int new_nthreads;
int use_rml_to_adjust_nth;
KMP_DEBUG_ASSERT( __kmp_init_serial );
KMP_DEBUG_ASSERT( root && parent_team );
//
// Initial check to see if we should use a serialized team.
//
if ( set_nthreads == 1 ) {
KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d reserving 1 thread; requested %d threads\n",
__kmp_get_gtid(), set_nthreads ));
return 1;
}
if ( ( !get__nested_2(parent_team,master_tid) && (root->r.r_in_parallel
#if OMP_40_ENABLED
&& !enter_teams
#endif /* OMP_40_ENABLED */
) ) || ( __kmp_library == library_serial ) ) {
KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team; requested %d threads\n",
__kmp_get_gtid(), set_nthreads ));
return 1;
}
//
// If dyn-var is set, dynamically adjust the number of desired threads,
// according to the method specified by dynamic_mode.
//
new_nthreads = set_nthreads;
use_rml_to_adjust_nth = FALSE;
if ( ! get__dynamic_2( parent_team, master_tid ) ) {
;
}
#ifdef USE_LOAD_BALANCE
else if ( __kmp_global.g.g_dynamic_mode == dynamic_load_balance ) {
new_nthreads = __kmp_load_balance_nproc( root, set_nthreads );
if ( new_nthreads == 1 ) {
KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to 1 thread\n",
master_tid ));
return 1;
}
if ( new_nthreads < set_nthreads ) {
KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to %d threads\n",
master_tid, new_nthreads ));
}
}
#endif /* USE_LOAD_BALANCE */
else if ( __kmp_global.g.g_dynamic_mode == dynamic_thread_limit ) {
new_nthreads = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
: root->r.r_hot_team->t.t_nproc);
if ( new_nthreads <= 1 ) {
KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to 1 thread\n",
master_tid ));
return 1;
}
if ( new_nthreads < set_nthreads ) {
KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to %d threads\n",
master_tid, new_nthreads ));
}
else {
new_nthreads = set_nthreads;
}
}
else if ( __kmp_global.g.g_dynamic_mode == dynamic_random ) {
if ( set_nthreads > 2 ) {
new_nthreads = __kmp_get_random( parent_team->t.t_threads[master_tid] );
new_nthreads = ( new_nthreads % set_nthreads ) + 1;
if ( new_nthreads == 1 ) {
KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to 1 thread\n",
master_tid ));
return 1;
}
if ( new_nthreads < set_nthreads ) {
KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to %d threads\n",
master_tid, new_nthreads ));
}
}
}
else {
KMP_ASSERT( 0 );
}
//
// Respect KMP_ALL_THREADS, KMP_MAX_THREADS, OMP_THREAD_LIMIT.
//
if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
root->r.r_hot_team->t.t_nproc ) > __kmp_max_nth ) {
int tl_nthreads = __kmp_max_nth - __kmp_nth + ( root->r.r_active ? 1 :
root->r.r_hot_team->t.t_nproc );
if ( tl_nthreads <= 0 ) {
tl_nthreads = 1;
}
//
// If dyn-var is false, emit a 1-time warning.
//
if ( ! get__dynamic_2( parent_team, master_tid )
&& ( ! __kmp_reserve_warn ) ) {
__kmp_reserve_warn = 1;
__kmp_msg(
kmp_ms_warning,
KMP_MSG( CantFormThrTeam, set_nthreads, tl_nthreads ),
KMP_HNT( Unset_ALL_THREADS ),
__kmp_msg_null
);
}
if ( tl_nthreads == 1 ) {
KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to 1 thread\n",
master_tid ));
return 1;
}
KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to %d threads\n",
master_tid, tl_nthreads ));
new_nthreads = tl_nthreads;
}
//
// Check if the threads array is large enough, or needs expanding.
//
// See comment in __kmp_register_root() about the adjustment if
// __kmp_threads[0] == NULL.
//
capacity = __kmp_threads_capacity;
if ( TCR_PTR(__kmp_threads[0]) == NULL ) {
--capacity;
}
if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
root->r.r_hot_team->t.t_nproc ) > capacity ) {
//
// Expand the threads array.
//
int slotsRequired = __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
root->r.r_hot_team->t.t_nproc ) - capacity;
int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
if ( slotsAdded < slotsRequired ) {
//
// The threads array was not expanded enough.
//
new_nthreads -= ( slotsRequired - slotsAdded );
KMP_ASSERT( new_nthreads >= 1 );
//
// If dyn-var is false, emit a 1-time warning.
//
if ( ! get__dynamic_2( parent_team, master_tid )
&& ( ! __kmp_reserve_warn ) ) {
__kmp_reserve_warn = 1;
if ( __kmp_tp_cached ) {
__kmp_msg(
kmp_ms_warning,
KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
KMP_HNT( PossibleSystemLimitOnThreads ),
__kmp_msg_null
);
}
else {
__kmp_msg(
kmp_ms_warning,
KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
KMP_HNT( SystemLimitOnThreads ),
__kmp_msg_null
);
}
}
}
}
if ( new_nthreads == 1 ) {
KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team after reclaiming dead roots and rechecking; requested %d threads\n",
__kmp_get_gtid(), set_nthreads ) );
return 1;
}
KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d allocating %d threads; requested %d threads\n",
__kmp_get_gtid(), new_nthreads, set_nthreads ));
return new_nthreads;
}
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
/* allocate threads from the thread pool and assign them to the new team */
/* we are assured that there are enough threads available, because we
* checked on that earlier within critical section forkjoin */
static void
__kmp_fork_team_threads( kmp_root_t *root, kmp_team_t *team,
kmp_info_t *master_th, int master_gtid )
{
int i;
int use_hot_team;
KA_TRACE( 10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc ) );
KMP_DEBUG_ASSERT( master_gtid == __kmp_get_gtid() );
KMP_MB();
/* first, let's setup the master thread */
master_th->th.th_info.ds.ds_tid = 0;
master_th->th.th_team = team;
master_th->th.th_team_nproc = team->t.t_nproc;
master_th->th.th_team_master = master_th;
master_th->th.th_team_serialized = FALSE;
master_th->th.th_dispatch = & team->t.t_dispatch[ 0 ];
/* make sure we are not the optimized hot team */
#if KMP_NESTED_HOT_TEAMS
use_hot_team = 0;
kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
if( hot_teams ) { // hot teams array is not allocated if KMP_HOT_TEAMS_MAX_LEVEL=0
int level = team->t.t_active_level - 1; // index in array of hot teams
if( master_th->th.th_teams_microtask ) { // are we inside the teams?
if( master_th->th.th_teams_size.nteams > 1 ) {
++level; // level was not increased in teams construct for team_of_masters
}
if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
master_th->th.th_teams_level == team->t.t_level ) {
++level; // level was not increased in teams construct for team_of_workers before the parallel
} // team->t.t_level will be increased inside parallel
}
if( level < __kmp_hot_teams_max_level ) {
if( hot_teams[level].hot_team ) {
// hot team has already been allocated for given level
KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
use_hot_team = 1; // the team is ready to use
} else {
use_hot_team = 0; // AC: threads are not allocated yet
hot_teams[level].hot_team = team; // remember new hot team
hot_teams[level].hot_team_nth = team->t.t_nproc;
}
} else {
use_hot_team = 0;
}
}
#else
use_hot_team = team == root->r.r_hot_team;
#endif
if ( !use_hot_team ) {
/* install the master thread */
team->t.t_threads[ 0 ] = master_th;
__kmp_initialize_info( master_th, team, 0, master_gtid );
/* now, install the worker threads */
for ( i=1 ; i < team->t.t_nproc ; i++ ) {
/* fork or reallocate a new thread and install it in team */
kmp_info_t *thr = __kmp_allocate_thread( root, team, i );
team->t.t_threads[ i ] = thr;
KMP_DEBUG_ASSERT( thr );
KMP_DEBUG_ASSERT( thr->th.th_team == team );
/* align team and thread arrived states */
KA_TRACE( 20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived T#%d(%d:%d) join =%u, plain=%u\n",
__kmp_gtid_from_tid( 0, team ), team->t.t_id, 0,
__kmp_gtid_from_tid( i, team ), team->t.t_id, i,
team->t.t_bar[ bs_forkjoin_barrier ].b_arrived,
team->t.t_bar[ bs_plain_barrier ].b_arrived ) );
#if OMP_40_ENABLED
thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
thr->th.th_teams_level = master_th->th.th_teams_level;
thr->th.th_teams_size = master_th->th.th_teams_size;
#endif
{ // Initialize threads' barrier data.
int b;
kmp_balign_t * balign = team->t.t_threads[ i ]->th.th_bar;
for ( b = 0; b < bs_last_barrier; ++ b ) {
balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
}; // for b
}
}
#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
__kmp_partition_places( team );
#endif
}
KMP_MB();
}
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
//
// Propagate any changes to the floating point control registers out to the team.
// We try to avoid unnecessary writes to the relevant cache line in the team structure,
// so we don't make changes unless they are needed.
//
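// propagateFPControl() copies the master thread's x87 FPU control word and MXCSR into
// the team structure at fork time; its counterpart updateHWFPControl() below reloads
// those saved values into the hardware registers whenever they differ from the
// thread's current state.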
inline static void
propagateFPControl(kmp_team_t * team)
{
if ( __kmp_inherit_fp_control ) {
kmp_int16 x87_fpu_control_word;
kmp_uint32 mxcsr;
// Get master values of FPU control flags (both X87 and vector)
__kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
__kmp_store_mxcsr( &mxcsr );
mxcsr &= KMP_X86_MXCSR_MASK;
// There is no point looking at t_fp_control_saved here.
// If it is TRUE, we still have to update the values if they are different from those we now have.
// If it is FALSE we didn't save anything yet, but our objective is the same. We have to ensure
// that the values in the team are the same as those we have.
// So, this code achieves what we need whether or not t_fp_control_saved is true.
// By checking whether the value needs updating we avoid unnecessary writes that would put the
// cache-line into a written state, causing all threads in the team to have to read it again.
if ( team->t.t_x87_fpu_control_word != x87_fpu_control_word ) {
team->t.t_x87_fpu_control_word = x87_fpu_control_word;
}
if ( team->t.t_mxcsr != mxcsr ) {
team->t.t_mxcsr = mxcsr;
}
// Although we don't use this value, other code in the runtime wants to know whether it should restore them.
// So we must ensure it is correct.
if (!team->t.t_fp_control_saved) {
team->t.t_fp_control_saved = TRUE;
}
}
else {
// Similarly here. Don't write to this cache-line in the team structure unless we have to.
if (team->t.t_fp_control_saved)
team->t.t_fp_control_saved = FALSE;
}
}
// Do the opposite, setting the hardware registers to the updated values from the team.
inline static void
updateHWFPControl(kmp_team_t * team)
{
if ( __kmp_inherit_fp_control && team->t.t_fp_control_saved ) {
//
// Only reset the fp control regs if they have been changed in the team
// during the parallel region that we are exiting.
//
kmp_int16 x87_fpu_control_word;
kmp_uint32 mxcsr;
__kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
__kmp_store_mxcsr( &mxcsr );
mxcsr &= KMP_X86_MXCSR_MASK;
if ( team->t.t_x87_fpu_control_word != x87_fpu_control_word ) {
__kmp_clear_x87_fpu_status_word();
__kmp_load_x87_fpu_control_word( &team->t.t_x87_fpu_control_word );
}
if ( team->t.t_mxcsr != mxcsr ) {
__kmp_load_mxcsr( &team->t.t_mxcsr );
}
}
}
#else
# define propagateFPControl(x) ((void)0)
# define updateHWFPControl(x) ((void)0)
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
static void
__kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc ); // forward declaration
/*
* Run a parallel region that has been serialized, so it runs in a team consisting only of the single master thread.
*/
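/*
 * Two cases below: if this thread is not yet running inside its serial team, a
 * (possibly freshly allocated) serial team is installed and initialized with
 * t_nproc == 1; if the serial team is already in use, only t_serialized and
 * t_level are incremented and another dispatch buffer is pushed, one per level
 * of serialized nesting.
 */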
void
__kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
{
kmp_info_t *this_thr;
kmp_team_t *serial_team;
KC_TRACE( 10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid ) );
/* Skip all this code for autopar serialized loops since it results in
unacceptable overhead */
if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
return;
if( ! TCR_4( __kmp_init_parallel ) )
__kmp_parallel_initialize();
this_thr = __kmp_threads[ global_tid ];
serial_team = this_thr->th.th_serial_team;
/* utilize the serialized team held by this thread */
KMP_DEBUG_ASSERT( serial_team );
KMP_MB();
if ( __kmp_tasking_mode != tskm_immediate_exec ) {
KMP_DEBUG_ASSERT( this_thr->th.th_task_team == this_thr->th.th_team->t.t_task_team );
KMP_DEBUG_ASSERT( serial_team->t.t_task_team == NULL );
KA_TRACE( 20, ( "__kmpc_serialized_parallel: T#%d pushing task_team %p / team %p, new task_team = NULL\n",
global_tid, this_thr->th.th_task_team, this_thr->th.th_team ) );
this_thr->th.th_task_team = NULL;
}
#if OMP_40_ENABLED
kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
if ( this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
proc_bind = proc_bind_false;
}
else if ( proc_bind == proc_bind_default ) {
//
// No proc_bind clause was specified, so use the current value
// of proc-bind-var for this parallel region.
//
proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
}
//
// Reset for next parallel region
//
this_thr->th.th_set_proc_bind = proc_bind_default;
#endif /* OMP_40_ENABLED */
if( this_thr->th.th_team != serial_team ) {
// Nested level will be an index in the nested nthreads array
int level = this_thr->th.th_team->t.t_level;
if( serial_team->t.t_serialized ) {
/* this serial team was already used
* TODO increase performance by making these locks more specific */
kmp_team_t *new_team;
int tid = this_thr->th.th_info.ds.ds_tid;
__kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
#if OMP_40_ENABLED
proc_bind,
#endif
& this_thr->th.th_current_task->td_icvs,
0 USE_NESTED_HOT_ARG(NULL) );
__kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
KMP_ASSERT( new_team );
/* setup new serialized team and install it */
new_team->t.t_threads[0] = this_thr;
new_team->t.t_parent = this_thr->th.th_team;
serial_team = new_team;
this_thr->th.th_serial_team = serial_team;
KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
global_tid, serial_team ) );
/* TODO the above breaks the requirement that if we run out of
* resources, then we can still guarantee that serialized teams
* are ok, since we may need to allocate a new one */
} else {
KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
global_tid, serial_team ) );
}
/* we have to initialize this serial team */
KMP_DEBUG_ASSERT( serial_team->t.t_threads );
KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
KMP_DEBUG_ASSERT( this_thr->th.th_team != serial_team );
serial_team->t.t_ident = loc;
serial_team->t.t_serialized = 1;
serial_team->t.t_nproc = 1;
serial_team->t.t_parent = this_thr->th.th_team;
serial_team->t.t_sched = this_thr->th.th_team->t.t_sched;
this_thr->th.th_team = serial_team;
serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#d curtask=%p\n",
global_tid, this_thr->th.th_current_task ) );
KMP_ASSERT( this_thr->th.th_current_task->td_flags.executing == 1 );
this_thr->th.th_current_task->td_flags.executing = 0;
__kmp_push_current_task_to_thread( this_thr, serial_team, 0 );
/* TODO: GEH: do the ICVs work for nested serialized teams? Don't we need an implicit task for
each serialized task represented by team->t.t_serialized? */
copy_icvs(
& this_thr->th.th_current_task->td_icvs,
& this_thr->th.th_current_task->td_parent->td_icvs );
// Thread value exists in the nested nthreads array for the next nested level
if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
}
#if OMP_40_ENABLED
if ( __kmp_nested_proc_bind.used && ( level + 1 < __kmp_nested_proc_bind.used ) ) {
this_thr->th.th_current_task->td_icvs.proc_bind
= __kmp_nested_proc_bind.bind_types[ level + 1 ];
}
#endif /* OMP_40_ENABLED */
this_thr->th.th_info.ds.ds_tid = 0;
/* set thread cache values */
this_thr->th.th_team_nproc = 1;
this_thr->th.th_team_master = this_thr;
this_thr->th.th_team_serialized = 1;
serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
propagateFPControl (serial_team);
/* check if we need to allocate dispatch buffers stack */
KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
if ( !serial_team->t.t_dispatch->th_disp_buffer ) {
serial_team->t.t_dispatch->th_disp_buffer = (dispatch_private_info_t *)
__kmp_allocate( sizeof( dispatch_private_info_t ) );
}
this_thr->th.th_dispatch = serial_team->t.t_dispatch;
KMP_MB();
} else {
/* this serialized team is already being used,
* that's fine, just add another nested level */
KMP_DEBUG_ASSERT( this_thr->th.th_team == serial_team );
KMP_DEBUG_ASSERT( serial_team->t.t_threads );
KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
++ serial_team->t.t_serialized;
this_thr->th.th_team_serialized = serial_team->t.t_serialized;
// Nested level will be an index in the nested nthreads array
int level = this_thr->th.th_team->t.t_level;
// Thread value exists in the nested nthreads array for the next nested level
if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
}
serial_team->t.t_level++;
KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d increasing nesting level of serial team %p to %d\n",
global_tid, serial_team, serial_team->t.t_level ) );
/* allocate/push dispatch buffers stack */
KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
{
dispatch_private_info_t * disp_buffer = (dispatch_private_info_t *)
__kmp_allocate( sizeof( dispatch_private_info_t ) );
disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
}
this_thr->th.th_dispatch = serial_team->t.t_dispatch;
KMP_MB();
}
if ( __kmp_env_consistency_check )
__kmp_push_parallel( global_tid, NULL );
#if USE_ITT_BUILD
// Mark the start of the "parallel" region for VTune. Only one frame notification scheme is used at the moment.
if ( ( __itt_frame_begin_v3_ptr && __kmp_forkjoin_frames && ! __kmp_forkjoin_frames_mode ) || KMP_ITT_DEBUG )
{
this_thr->th.th_ident = loc;
// 0 - no barriers; 1 - serialized parallel
__kmp_itt_region_forking( global_tid, this_thr->th.th_team_nproc, 0, 1 );
}
// Save the start of the "parallel" region for VTune. This also marks the beginning of the join barrier.
if( ( ( __kmp_forkjoin_frames_mode == 1 || __kmp_forkjoin_frames_mode == 3 ) &&
__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr ) || KMP_ITT_DEBUG )
{
this_thr->th.th_ident = loc;
#if USE_ITT_NOTIFY
if( this_thr->th.th_team->t.t_level == 1 ) {
serial_team->t.t_region_time = this_thr->th.th_frame_time_serialized = __itt_get_timestamp();
}
#endif
}
#endif /* USE_ITT_BUILD */
}
/* most of the work for a fork */
/* return true if we really went parallel, false if serialized */
int
__kmp_fork_call(
ident_t * loc,
int gtid,
enum fork_context_e call_context, // Intel, GNU, ...
kmp_int32 argc,
microtask_t microtask,
launch_t invoker,
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
va_list * ap
#else
va_list ap
#endif
)
{
void **argv;
int i;
int master_tid;
int master_this_cons;
kmp_team_t *team;
kmp_team_t *parent_team;
kmp_info_t *master_th;
kmp_root_t *root;
int nthreads;
int master_active;
int master_set_numthreads;
int level;
#if OMP_40_ENABLED
int active_level;
int teams_level;
#endif
#if KMP_NESTED_HOT_TEAMS
kmp_hot_team_ptr_t **p_hot_teams;
#endif
{ // KMP_TIME_BLOCK
KMP_TIME_BLOCK(KMP_fork_call);
KA_TRACE( 20, ("__kmp_fork_call: enter T#%d\n", gtid ));
if ( __kmp_stkpadding > 0 && __kmp_root[gtid] != NULL ) {
/* Some systems prefer the stack for the root thread(s) to start with */
/* some gap from the parent stack to prevent false sharing. */
void *dummy = alloca(__kmp_stkpadding);
/* These 2 lines below are so this does not get optimized out */
if ( __kmp_stkpadding > KMP_MAX_STKPADDING )
__kmp_stkpadding += (short)((kmp_int64)dummy);
}
/* initialize if needed */
KMP_DEBUG_ASSERT( __kmp_init_serial ); // AC: potentially unsafe, not in sync with shutdown
if( ! TCR_4(__kmp_init_parallel) )
__kmp_parallel_initialize();
/* setup current data */
master_th = __kmp_threads[ gtid ]; // AC: potentially unsafe, not in sync with shutdown
parent_team = master_th->th.th_team;
master_tid = master_th->th.th_info.ds.ds_tid;
master_this_cons = master_th->th.th_local.this_construct;
root = master_th->th.th_root;
master_active = root->r.r_active;
master_set_numthreads = master_th->th.th_set_nproc;
// Nested level will be an index in the nested nthreads array
level = parent_team->t.t_level;
#if OMP_40_ENABLED
active_level = parent_team->t.t_active_level; // is used to launch non-serial teams even if nested is not allowed
teams_level = master_th->th.th_teams_level; // needed to check nesting inside the teams
#endif
#if KMP_NESTED_HOT_TEAMS
p_hot_teams = &master_th->th.th_hot_teams;
if( *p_hot_teams == NULL && __kmp_hot_teams_max_level > 0 ) {
*p_hot_teams = (kmp_hot_team_ptr_t*)__kmp_allocate(
sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
(*p_hot_teams)[0].hot_team = root->r.r_hot_team;
(*p_hot_teams)[0].hot_team_nth = 1; // it is either actual or not needed (when active_level > 0)
}
#endif
master_th->th.th_ident = loc;
#if OMP_40_ENABLED
if ( master_th->th.th_teams_microtask &&
ap && microtask != (microtask_t)__kmp_teams_master && level == teams_level ) {
// AC: This is the start of a parallel region nested inside a teams construct.
// The team is actual (hot), all workers are ready at the fork barrier.
// No lock needed to initialize the team a bit, then free workers.
parent_team->t.t_ident = loc;
parent_team->t.t_argc = argc;
argv = (void**)parent_team->t.t_argv;
for( i=argc-1; i >= 0; --i )
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
*argv++ = va_arg( *ap, void * );
#else
*argv++ = va_arg( ap, void * );
#endif
/* Increment our nested depth level, but do not increase the serialization */
if ( parent_team == master_th->th.th_serial_team ) {
// AC: we are in serialized parallel
__kmpc_serialized_parallel(loc, gtid);
KMP_DEBUG_ASSERT( parent_team->t.t_serialized > 1 );
parent_team->t.t_serialized--; // AC: need this so that enquiry functions
// work correctly; will restore at join time
KMP_TIME_BLOCK(OMP_work);
__kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv );
return TRUE;
}
parent_team->t.t_pkfn = microtask;
parent_team->t.t_invoke = invoker;
KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
parent_team->t.t_active_level ++;
parent_team->t.t_level ++;
/* Change number of threads in the team if requested */
if ( master_set_numthreads ) { // The parallel has num_threads clause
if ( master_set_numthreads < master_th->th.th_teams_size.nth ) {
// AC: can only reduce the number of threads dynamically, cannot increase it
kmp_info_t **other_threads = parent_team->t.t_threads;
parent_team->t.t_nproc = master_set_numthreads;
for ( i = 0; i < master_set_numthreads; ++i ) {
other_threads[i]->th.th_team_nproc = master_set_numthreads;
}
// Keep extra threads hot in the team for possible next parallels
}
master_th->th.th_set_nproc = 0;
}
KF_TRACE( 10, ( "__kmp_fork_call: before internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
__kmp_internal_fork( loc, gtid, parent_team );
KF_TRACE( 10, ( "__kmp_fork_call: after internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
/* Invoke microtask for MASTER thread */
KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
{
KMP_TIME_BLOCK(OMP_work);
if (! parent_team->t.t_invoke( gtid )) {
KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
}
}
KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
KMP_MB(); /* Flush all pending memory write invalidates. */
KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
return TRUE;
} // Parallel closely nested in teams construct
#endif /* OMP_40_ENABLED */
#if KMP_DEBUG
if ( __kmp_tasking_mode != tskm_immediate_exec ) {
KMP_DEBUG_ASSERT( master_th->th.th_task_team == parent_team->t.t_task_team );
}
#endif
/* determine how many new threads we can use */
__kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
if ( parent_team->t.t_active_level >= master_th->th.th_current_task->td_icvs.max_active_levels ) {
nthreads = 1;
} else {
nthreads = master_set_numthreads ?
master_set_numthreads : get__nproc_2( parent_team, master_tid ); // TODO: get nproc directly from current task
nthreads = __kmp_reserve_threads(root, parent_team, master_tid, nthreads
#if OMP_40_ENABLED
/* AC: If we execute teams from a parallel region (on the host), then the teams should be created,
but each can have only 1 thread if nesting is disabled. If teams are called from a serial region,
then the teams and their threads should be created regardless of the nesting setting. */
, ((ap==NULL && active_level==0) ||
(ap && teams_level>0 && teams_level==level))
#endif /* OMP_40_ENABLED */
);
}
KMP_DEBUG_ASSERT( nthreads > 0 );
/* If we temporarily changed the set number of threads then restore it now */
master_th->th.th_set_nproc = 0;
/* create a serialized parallel region? */
if ( nthreads == 1 ) {
/* josh todo: hypothetical question: what do we do for OS X*? */
#if KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
void * args[ argc ];
#else
void * * args = (void**) alloca( argc * sizeof( void * ) );
#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) */
__kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
KA_TRACE( 20, ("__kmp_fork_call: T#%d serializing parallel region\n", gtid ));
__kmpc_serialized_parallel(loc, gtid);
if ( call_context == fork_context_intel ) {
/* TODO this sucks, use the compiler itself to pass args! :) */
master_th->th.th_serial_team->t.t_ident = loc;
#if OMP_40_ENABLED
if ( !ap ) {
// revert change made in __kmpc_serialized_parallel()
master_th->th.th_serial_team->t.t_level--;
// Get args from parent team for teams construct
{
KMP_TIME_BLOCK(OMP_work);
__kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv );
}
} else if ( microtask == (microtask_t)__kmp_teams_master ) {
KMP_DEBUG_ASSERT( master_th->th.th_team == master_th->th.th_serial_team );
team = master_th->th.th_team;
//team->t.t_pkfn = microtask;
team->t.t_invoke = invoker;
__kmp_alloc_argv_entries( argc, team, TRUE );
team->t.t_argc = argc;
argv = (void**) team->t.t_argv;
if ( ap ) {
for( i=argc-1; i >= 0; --i )
// TODO: revert workaround for Intel(R) 64 tracker #96
# if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
*argv++ = va_arg( *ap, void * );
# else
*argv++ = va_arg( ap, void * );
# endif
} else {
for( i=0; i < argc; ++i )
// Get args from parent team for teams construct
argv[i] = parent_team->t.t_argv[i];
}
// AC: revert change made in __kmpc_serialized_parallel()
// because initial code in teams should have level=0
team->t.t_level--;
// AC: call special invoker for outer "parallel" of the teams construct
{
KMP_TIME_BLOCK(OMP_work);
invoker(gtid);
}
} else {
#endif /* OMP_40_ENABLED */
argv = args;
for( i=argc-1; i >= 0; --i )
// TODO: revert workaround for Intel(R) 64 tracker #96
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
*argv++ = va_arg( *ap, void * );
#else
*argv++ = va_arg( ap, void * );
#endif
KMP_MB();
{
KMP_TIME_BLOCK(OMP_work);
__kmp_invoke_microtask( microtask, gtid, 0, argc, args );
}
#if OMP_40_ENABLED
}
#endif /* OMP_40_ENABLED */
}
else if ( call_context == fork_context_gnu ) {
// we were called from GNU native code
KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
return FALSE;
}
else {
KMP_ASSERT2( call_context < fork_context_last, "__kmp_fork_call: unknown fork_context parameter" );
}
KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
KMP_MB();
return FALSE;
}
// GEH: only modify the executing flag in the case when not serialized
// serialized case is handled in kmpc_serialized_parallel
KF_TRACE( 10, ( "__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, curtask=%p, curtask_max_aclevel=%d\n",
parent_team->t.t_active_level, master_th, master_th->th.th_current_task,
master_th->th.th_current_task->td_icvs.max_active_levels ) );
// TODO: GEH - cannot do this assertion because root thread not set up as executing
// KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
master_th->th.th_current_task->td_flags.executing = 0;
#if OMP_40_ENABLED
if ( !master_th->th.th_teams_microtask || level > teams_level )
#endif /* OMP_40_ENABLED */
{
/* Increment our nested depth level */
KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
}
// See if we need to make a copy of the ICVs.
int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
if ((level+1 < __kmp_nested_nth.used) && (__kmp_nested_nth.nth[level+1] != nthreads_icv)) {
nthreads_icv = __kmp_nested_nth.nth[level+1];
}
else {
nthreads_icv = 0; // don't update
}
#if OMP_40_ENABLED
// Figure out the proc_bind_policy for the new team.
kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
kmp_proc_bind_t proc_bind_icv = proc_bind_default; // proc_bind_default means don't update
if ( master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
proc_bind = proc_bind_false;
}
else {
if (proc_bind == proc_bind_default) {
// No proc_bind clause specified; use current proc-bind-var for this parallel region
proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
}
/* else: The proc_bind policy was specified explicitly on parallel clause. This
overrides proc-bind-var for this parallel region, but does not change proc-bind-var. */
// Figure the value of proc-bind-var for the child threads.
if ((level+1 < __kmp_nested_proc_bind.used)
&& (__kmp_nested_proc_bind.bind_types[level+1] != master_th->th.th_current_task->td_icvs.proc_bind)) {
proc_bind_icv = __kmp_nested_proc_bind.bind_types[level+1];
}
}
// Reset for next parallel region
master_th->th.th_set_proc_bind = proc_bind_default;
#endif /* OMP_40_ENABLED */
if ((nthreads_icv > 0)
#if OMP_40_ENABLED
|| (proc_bind_icv != proc_bind_default)
#endif /* OMP_40_ENABLED */
) {
kmp_internal_control_t new_icvs;
copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
new_icvs.next = NULL;
if (nthreads_icv > 0) {
new_icvs.nproc = nthreads_icv;
}
#if OMP_40_ENABLED
if (proc_bind_icv != proc_bind_default) {
new_icvs.proc_bind = proc_bind_icv;
}
#endif /* OMP_40_ENABLED */
/* allocate a new parallel team */
KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
team = __kmp_allocate_team(root, nthreads, nthreads,
#if OMP_40_ENABLED
proc_bind,
#endif
&new_icvs, argc USE_NESTED_HOT_ARG(master_th) );
} else {
/* allocate a new parallel team */
KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
team = __kmp_allocate_team(root, nthreads, nthreads,
#if OMP_40_ENABLED
proc_bind,
#endif
&master_th->th.th_current_task->td_icvs, argc
USE_NESTED_HOT_ARG(master_th) );
}
KF_TRACE( 10, ( "__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team ) );
/* setup the new team */
team->t.t_master_tid = master_tid;
team->t.t_master_this_cons = master_this_cons;
team->t.t_ident = loc;
team->t.t_parent = parent_team;
TCW_SYNC_PTR(team->t.t_pkfn, microtask);
team->t.t_invoke = invoker; /* TODO move this to root, maybe */
// TODO: parent_team->t.t_level == INT_MAX ???
#if OMP_40_ENABLED
if ( !master_th->th.th_teams_microtask || level > teams_level ) {
#endif /* OMP_40_ENABLED */
team->t.t_level = parent_team->t.t_level + 1;
team->t.t_active_level = parent_team->t.t_active_level + 1;
#if OMP_40_ENABLED
} else {
// AC: Do not increase parallel level at start of the teams construct
team->t.t_level = parent_team->t.t_level;
team->t.t_active_level = parent_team->t.t_active_level;
}
#endif /* OMP_40_ENABLED */
team->t.t_sched = get__sched_2(parent_team, master_tid); // set master's schedule as new run-time schedule
// Update the floating point rounding in the team if required.
propagateFPControl(team);
if ( __kmp_tasking_mode != tskm_immediate_exec ) {
// Set master's task team to the team's task team. Unless this is a hot team, it should be NULL.
KMP_DEBUG_ASSERT( master_th->th.th_task_team == parent_team->t.t_task_team );
KA_TRACE( 20, ( "__kmp_fork_call: Master T#%d pushing task_team %p / team %p, new task_team %p / team %p\n",
__kmp_gtid_from_thread( master_th ), master_th->th.th_task_team,
parent_team, team->t.t_task_team, team ) );
master_th->th.th_task_team = team->t.t_task_team;
#if !KMP_NESTED_HOT_TEAMS
KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || (team == root->r.r_hot_team));
#endif
}
KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, team->t.t_nproc ));
KMP_DEBUG_ASSERT( team != root->r.r_hot_team ||
( team->t.t_master_tid == 0 &&
( team->t.t_parent == root->r.r_root_team || team->t.t_parent->t.t_serialized ) ));
KMP_MB();
/* now, setup the arguments */
argv = (void**)team->t.t_argv;
#if OMP_40_ENABLED
if ( ap ) {
#endif /* OMP_40_ENABLED */
for ( i=argc-1; i >= 0; --i )
// TODO: revert workaround for Intel(R) 64 tracker #96
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
*argv++ = va_arg( *ap, void * );
#else
*argv++ = va_arg( ap, void * );
#endif
#if OMP_40_ENABLED
} else {
for ( i=0; i < argc; ++i )
// Get args from parent team for teams construct
argv[i] = team->t.t_parent->t.t_argv[i];
}
#endif /* OMP_40_ENABLED */
/* now actually fork the threads */
team->t.t_master_active = master_active;
if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
root->r.r_active = TRUE;
__kmp_fork_team_threads( root, team, master_th, gtid );
__kmp_setup_icv_copy( team, nthreads, &master_th->th.th_current_task->td_icvs, loc );
__kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
#if USE_ITT_BUILD
// Mark start of "parallel" region for VTune. Only use one of frame notification scheme at the moment.
if ((__itt_frame_begin_v3_ptr && __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) || KMP_ITT_DEBUG)
# if OMP_40_ENABLED
if (!master_th->th.th_teams_microtask || microtask == (microtask_t)__kmp_teams_master)
// Either not in teams or the outer fork of the teams construct
# endif /* OMP_40_ENABLED */
{
__kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
}
kmp_uint64 tmp_time = 0;
#if USE_ITT_NOTIFY
if ( __itt_get_timestamp_ptr )
tmp_time = __itt_get_timestamp();
#endif
if ((__itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode==3)|| KMP_ITT_DEBUG)
# if OMP_40_ENABLED
if (!master_th->th.th_teams_microtask || microtask == (microtask_t)__kmp_teams_master)
// Either not in teams or the outer fork of the teams construct
# endif /* OMP_40_ENABLED */
team->t.t_region_time = tmp_time;
// Internal fork - report frame begin
if ((__kmp_forkjoin_frames_mode == 1 || __kmp_forkjoin_frames_mode == 3) && __itt_frame_submit_v3_ptr ) {
if (!(team->t.t_active_level > 1)) {
master_th->th.th_frame_time = tmp_time;
}
}
#endif /* USE_ITT_BUILD */
/* now go on and do the work */
KMP_DEBUG_ASSERT( team == __kmp_threads[gtid]->th.th_team );
KMP_MB();
KF_TRACE(10, ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
root, team, master_th, gtid));
#if USE_ITT_BUILD
if ( __itt_stack_caller_create_ptr ) {
team->t.t_stack_id = __kmp_itt_stack_caller_create(); // create new stack stitching id before entering fork barrier
}
#endif /* USE_ITT_BUILD */
#if OMP_40_ENABLED
if ( ap ) // AC: skip __kmp_internal_fork at teams construct, let only master threads execute
#endif /* OMP_40_ENABLED */
{
__kmp_internal_fork( loc, gtid, team );
KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, master_th=%p, gtid=%d\n",
root, team, master_th, gtid));
}
if (call_context == fork_context_gnu) {
KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
return TRUE;
}
/* Invoke microtask for MASTER thread */
KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
gtid, team->t.t_id, team->t.t_pkfn ) );
} // END of timer KMP_fork_call block
{
//KMP_TIME_BLOCK(OMP_work);
KMP_TIME_BLOCK(USER_master_invoke);
if (! team->t.t_invoke( gtid )) {
KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
}
}
KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
gtid, team->t.t_id, team->t.t_pkfn ) );
KMP_MB(); /* Flush all pending memory write invalidates. */
KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
return TRUE;
}
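/* __kmp_join_call: tear down the parallel region started by __kmp_fork_call.
Runs the join barrier (skipped for internal teams when exiting a teams
construct), restores the master thread's state to the parent team, and
releases the team via __kmp_free_team. */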
void
__kmp_join_call(ident_t *loc, int gtid
#if OMP_40_ENABLED
, int exit_teams
#endif /* OMP_40_ENABLED */
)
{
KMP_TIME_BLOCK(KMP_join_call);
kmp_team_t *team;
kmp_team_t *parent_team;
kmp_info_t *master_th;
kmp_root_t *root;
int master_active;
int i;
KA_TRACE( 20, ("__kmp_join_call: enter T#%d\n", gtid ));
/* setup current data */
master_th = __kmp_threads[ gtid ];
root = master_th->th.th_root;
team = master_th->th.th_team;
parent_team = team->t.t_parent;
master_th->th.th_ident = loc;
#if KMP_DEBUG
if ( __kmp_tasking_mode != tskm_immediate_exec ) {
KA_TRACE( 20, ( "__kmp_join_call: T#%d, old team = %p old task_team = %p, th_task_team = %p\n",
__kmp_gtid_from_thread( master_th ), team,
team->t.t_task_team, master_th->th.th_task_team) );
KMP_DEBUG_ASSERT( master_th->th.th_task_team == team->t.t_task_team );
}
#endif
if( team->t.t_serialized ) {
#if OMP_40_ENABLED
if ( master_th->th.th_teams_microtask ) {
// We are in teams construct
int level = team->t.t_level;
int tlevel = master_th->th.th_teams_level;
if ( level == tlevel ) {
// AC: we haven't incremented it earlier at the start of the teams construct,
// so do it here, at the end of the teams construct
team->t.t_level++;
} else if ( level == tlevel + 1 ) {
// AC: we are exiting parallel inside teams, need to increment serialization
// in order to restore it in the next call to __kmpc_end_serialized_parallel
team->t.t_serialized++;
}
}
#endif /* OMP_40_ENABLED */
__kmpc_end_serialized_parallel( loc, gtid );
return;
}
master_active = team->t.t_master_active;
#if OMP_40_ENABLED
if (!exit_teams)
#endif /* OMP_40_ENABLED */
{
// AC: No barrier for internal teams at exit from the teams construct,
// but there is a barrier for the external team (league).
__kmp_internal_join( loc, gtid, team );
}
KMP_MB();
#if USE_ITT_BUILD
if ( __itt_stack_caller_create_ptr ) {
__kmp_itt_stack_caller_destroy( (__itt_caller)team->t.t_stack_id ); // destroy the stack stitching id after join barrier
}
// Mark end of "parallel" region for VTune. Only use one of frame notification scheme at the moment.
if ( ( __itt_frame_end_v3_ptr && __kmp_forkjoin_frames && ! __kmp_forkjoin_frames_mode ) || KMP_ITT_DEBUG )
# if OMP_40_ENABLED
if ( !master_th->th.th_teams_microtask /* not in teams */ ||
( !exit_teams && team->t.t_level == master_th->th.th_teams_level ) )
// Either not in teams or exiting teams region
// (teams is a frame and no other frames inside the teams)
# endif /* OMP_40_ENABLED */
{
master_th->th.th_ident = loc;
__kmp_itt_region_joined( gtid );
}
if ( ( __itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode == 3 ) || KMP_ITT_DEBUG )
# if OMP_40_ENABLED
if ( !master_th->th.th_teams_microtask /* not in teams */ ||
( !exit_teams && team->t.t_level == master_th->th.th_teams_level ) )
// Either not in teams or exiting teams region
// (teams is a frame and no other frames inside the teams)
# endif /* OMP_40_ENABLED */
{
master_th->th.th_ident = loc;
__kmp_itt_frame_submit( gtid, team->t.t_region_time, master_th->th.th_frame_time, 0, loc, master_th->th.th_team_nproc, 1 );
}
#endif /* USE_ITT_BUILD */
#if OMP_40_ENABLED
if ( master_th->th.th_teams_microtask &&
!exit_teams &&
team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
team->t.t_level == master_th->th.th_teams_level + 1 ) {
// AC: We need to leave the team structure intact at the end of a parallel
// region inside the teams construct, so that the same (hot) team is reused
// by the next parallel region; only adjust the nesting levels
/* Decrement our nested depth level */
team->t.t_level --;
team->t.t_active_level --;
KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
/* Restore number of threads in the team if needed */
if ( master_th->th.th_team_nproc < master_th->th.th_teams_size.nth ) {
int old_num = master_th->th.th_team_nproc;
int new_num = master_th->th.th_teams_size.nth;
kmp_info_t **other_threads = team->t.t_threads;
team->t.t_nproc = new_num;
for ( i = 0; i < old_num; ++i ) {
other_threads[i]->th.th_team_nproc = new_num;
}
// Adjust states of non-used threads of the team
for ( i = old_num; i < new_num; ++i ) {
// Re-initialize thread's barrier data.
int b;
kmp_balign_t * balign = other_threads[i]->th.th_bar;
for ( b = 0; b < bs_last_barrier; ++ b ) {
balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
KMP_DEBUG_ASSERT(balign[ b ].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
}
// Synchronize thread's task state
other_threads[i]->th.th_task_state = master_th->th.th_task_state;
}
}
return;
}
#endif /* OMP_40_ENABLED */
/* do cleanup and restore the parent team */
master_th->th.th_info .ds.ds_tid = team->t.t_master_tid;
master_th->th.th_local.this_construct = team->t.t_master_this_cons;
master_th->th.th_dispatch =
& parent_team->t.t_dispatch[ team->t.t_master_tid ];
/* jc: The following lock has instructions with REL and ACQ semantics,
separating the parallel user code called in this parallel region
from the serial user code called after this function returns.
*/
__kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
#if OMP_40_ENABLED
if ( !master_th->th.th_teams_microtask || team->t.t_level > master_th->th.th_teams_level )
#endif /* OMP_40_ENABLED */
{
/* Decrement our nested depth level */
KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
}
KMP_DEBUG_ASSERT( root->r.r_in_parallel >= 0 );
KF_TRACE( 10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n",
0, master_th, team ) );
__kmp_pop_current_task_from_thread( master_th );
#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
//
// Restore master thread's partition.
//
master_th->th.th_first_place = team->t.t_first_place;
master_th->th.th_last_place = team->t.t_last_place;
#endif /* OMP_40_ENABLED */
updateHWFPControl (team);
if ( root->r.r_active != master_active )
root->r.r_active = master_active;
__kmp_free_team( root, team USE_NESTED_HOT_ARG(master_th) ); // this will free worker threads
/* This race was fun to find. Make sure the following is in the critical
* region; otherwise assertions may fail occasionally since the old team
* may be reallocated and the hierarchy appears inconsistent. It is
* actually safe to run and won't cause any bugs, but will cause those
* assertion failures. It's only one deref&assign so might as well put this
* in the critical region. */
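/* Point the master thread back at the parent team and refresh its cached
copies of the team size, team master, and serialization state. */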
master_th->th.th_team = parent_team;
master_th->th.th_team_nproc = parent_team->t.t_nproc;
master_th->th.th_team_master = parent_team->t.t_threads[0];
master_th->th.th_team_serialized = parent_team->t.t_serialized;
/* restore serialized team, if need be */
if( parent_team->t.t_serialized &&
parent_team != master_th->th.th_serial_team &&
parent_team != root->r.r_root_team ) {
__kmp_free_team( root, master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL) );
master_th->th.th_serial_team = parent_team;
}
if ( __kmp_tasking_mode != tskm_immediate_exec ) {
//
// Copy the task team from the new child / old parent team
// to the thread. If non-NULL, copy the state flag also.
//
if ( ( master_th->th.th_task_team = parent_team->t.t_task_team ) != NULL ) {
master_th->th.th_task_state = master_th->th.th_task_team->tt.tt_state;
}
KA_TRACE( 20, ( "__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
__kmp_gtid_from_thread( master_th ), master_th->th.th_task_team,
parent_team ) );
}
// TODO: GEH - cannot do this assertion because root thread not set up as executing
// KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
master_th->th.th_current_task->td_flags.executing = 1;
__kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
KMP_MB();
KA_TRACE( 20, ("__kmp_join_call: exit T#%d\n", gtid ));
}
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
/* Check whether we should push an internal control record onto the
serial team stack. If so, do it. */
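/* The ICV setters below (__kmp_set_num_threads, __kmp_set_max_active_levels,
__kmp_set_schedule) call this before modifying an ICV. */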
void
__kmp_save_internal_controls ( kmp_info_t * thread )
{
if ( thread->th.th_team != thread->th.th_serial_team ) {
return;
}
if (thread->th.th_team->t.t_serialized > 1) {
int push = 0;
if (thread->th.th_team->t.t_control_stack_top == NULL) {
push = 1;
} else {
if ( thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
thread->th.th_team->t.t_serialized ) {
push = 1;
}
}
if (push) { /* push a record on the serial team's stack */
kmp_internal_control_t * control = (kmp_internal_control_t *) __kmp_allocate(sizeof(kmp_internal_control_t));
copy_icvs( control, & thread->th.th_current_task->td_icvs );
control->serial_nesting_level = thread->th.th_team->t.t_serialized;
control->next = thread->th.th_team->t.t_control_stack_top;
thread->th.th_team->t.t_control_stack_top = control;
}
}
}
/* Changes set_nproc */
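/* Illustrative call pattern (a sketch; the actual API entry points live
outside this file): omp_set_num_threads(n) is assumed to reach this routine
roughly as
__kmp_set_num_threads( n, __kmp_entry_gtid() );
so the new nproc ICV applies to the calling thread's subsequent parallel
regions. */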
void
__kmp_set_num_threads( int new_nth, int gtid )
{
kmp_info_t *thread;
kmp_root_t *root;
KF_TRACE( 10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth ));
KMP_DEBUG_ASSERT( __kmp_init_serial );
if (new_nth < 1)
new_nth = 1;
else if (new_nth > __kmp_max_nth)
new_nth = __kmp_max_nth;
thread = __kmp_threads[gtid];
__kmp_save_internal_controls( thread );
set__nproc( thread, new_nth );
//
// If this omp_set_num_threads() call will cause the hot team size to be
// reduced (in the absence of a num_threads clause), then reduce it now,
// rather than waiting for the next parallel region.
//
root = thread->th.th_root;
if ( __kmp_init_parallel && ( ! root->r.r_active )
&& ( root->r.r_hot_team->t.t_nproc > new_nth )
#if KMP_NESTED_HOT_TEAMS
&& __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
#endif
) {
kmp_team_t *hot_team = root->r.r_hot_team;
int f;
__kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
if ( __kmp_tasking_mode != tskm_immediate_exec ) {
kmp_task_team_t *task_team = hot_team->t.t_task_team;
if ( ( task_team != NULL ) && TCR_SYNC_4(task_team->tt.tt_active) ) {
//
// Signal the worker threads (esp. the extra ones) to stop
// looking for tasks while spin waiting. The task teams
// are reference counted and will be deallocated by the
// last worker thread.
//
KMP_DEBUG_ASSERT( hot_team->t.t_nproc > 1 );
TCW_SYNC_4( task_team->tt.tt_active, FALSE );
KMP_MB();
KA_TRACE( 20, ( "__kmp_set_num_threads: setting task_team %p to NULL\n",
&hot_team->t.t_task_team ) );
hot_team->t.t_task_team = NULL;
}
else {
KMP_DEBUG_ASSERT( task_team == NULL );
}
}
//
// Release the extra threads we don't need any more.
//
for ( f = new_nth; f < hot_team->t.t_nproc; f++ ) {
KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
__kmp_free_thread( hot_team->t.t_threads[f] );
hot_team->t.t_threads[f] = NULL;
}
hot_team->t.t_nproc = new_nth;
#if KMP_NESTED_HOT_TEAMS
if( thread->th.th_hot_teams ) {
KMP_DEBUG_ASSERT( hot_team == thread->th.th_hot_teams[0].hot_team );
thread->th.th_hot_teams[0].hot_team_nth = new_nth;
}
#endif
__kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
//
// Update the t_nproc field in the threads that are still active.
//
for( f=0 ; f < new_nth; f++ ) {
KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
}
// Special flag to mark that the hot team size was changed by an omp_set_num_threads() call
hot_team->t.t_size_changed = -1;
}
}
/* Changes max_active_levels */
void
__kmp_set_max_active_levels( int gtid, int max_active_levels )
{
kmp_info_t *thread;
KF_TRACE( 10, ( "__kmp_set_max_active_levels: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
KMP_DEBUG_ASSERT( __kmp_init_serial );
// validate max_active_levels
if( max_active_levels < 0 ) {
KMP_WARNING( ActiveLevelsNegative, max_active_levels );
// We ignore this call if the user has specified a negative value.
// The current setting won't be changed. The last valid setting will be used.
// A warning will be issued (if warnings are allowed as controlled by the KMP_WARNINGS env var).
KF_TRACE( 10, ( "__kmp_set_max_active_levels: the call is ignored: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
return;
}
if( max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT ) {
// it's OK, the max_active_levels is within the valid range: [ 0; KMP_MAX_ACTIVE_LEVELS_LIMIT ]
// We allow a zero value. (implementation defined behavior)
} else {
KMP_WARNING( ActiveLevelsExceedLimit, max_active_levels, KMP_MAX_ACTIVE_LEVELS_LIMIT );
max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
// Current upper limit is MAX_INT. (implementation defined behavior)
// If the input exceeds the upper limit, we correct the input to be the upper limit. (implementation defined behavior)
// Actually, control should never reach here as long as the upper limit is MAX_INT.
}
KF_TRACE( 10, ( "__kmp_set_max_active_levels: after validation: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
thread = __kmp_threads[ gtid ];
__kmp_save_internal_controls( thread );
set__max_active_levels( thread, max_active_levels );
}
/* Gets max_active_levels */
int
__kmp_get_max_active_levels( int gtid )
{
kmp_info_t *thread;
KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d\n", gtid ) );
KMP_DEBUG_ASSERT( __kmp_init_serial );
thread = __kmp_threads[ gtid ];
KMP_DEBUG_ASSERT( thread->th.th_current_task );
KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d, curtask=%p, curtask_maxaclevel=%d\n",
gtid, thread->th.th_current_task, thread->th.th_current_task->td_icvs.max_active_levels ) );
return thread->th.th_current_task->td_icvs.max_active_levels;
}
/* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
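/* Example (a sketch, assuming __kmp_sch_map maps kmp_sched_static to
kmp_sch_static_chunked): a call such as
__kmp_set_schedule( gtid, kmp_sched_static, 100 );
is expected to store kmp_sch_static_chunked with chunk 100 in the calling
thread's sched ICV, while a chunk below KMP_DEFAULT_CHUNK keeps the
unchunked kmp_sch_static. */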
void
__kmp_set_schedule( int gtid, kmp_sched_t kind, int chunk )
{
kmp_info_t *thread;
// kmp_team_t *team;
KF_TRACE( 10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", gtid, (int)kind, chunk ));
KMP_DEBUG_ASSERT( __kmp_init_serial );
// Check if the kind parameter is valid, correct if needed.
// Valid parameters should fit in one of two intervals - standard or extended:
// <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
// 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
if ( kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
( kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std ) )
{
// TODO: Hint needs attention in case we change the default schedule.
__kmp_msg(
kmp_ms_warning,
KMP_MSG( ScheduleKindOutOfRange, kind ),
KMP_HNT( DefaultScheduleKindUsed, "static, no chunk" ),
__kmp_msg_null
);
kind = kmp_sched_default;
chunk = 0; // ignore chunk value in case of bad kind
}
thread = __kmp_threads[ gtid ];
__kmp_save_internal_controls( thread );
if ( kind < kmp_sched_upper_std ) {
if ( kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK ) {
// differentiate static chunked vs. unchunked:
// an invalid chunk indicates the unchunked schedule (which is the default)
thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
} else {
thread->th.th_current_task->td_icvs.sched.r_sched_type = __kmp_sch_map[ kind - kmp_sched_lower - 1 ];
}
} else {
// __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
thread->th.th_current_task->td_icvs.sched.r_sched_type =
__kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
}
if ( kind == kmp_sched_auto ) {
// ignore parameter chunk for schedule auto
thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
} else {
thread->th.th_current_task->td_icvs.sched.chunk = chunk;
}
}
/* Gets def_sched_var ICV values */
void
__kmp_get_schedule( int gtid, kmp_sched_t * kind, int * chunk )
{
kmp_info_t *thread;
enum sched_type th_type;
int i;
KF_TRACE( 10, ("__kmp_get_schedule: thread %d\n", gtid ));
KMP_DEBUG_ASSERT( __kmp_init_serial );
thread = __kmp_threads[ gtid ];
//th_type = thread->th.th_team->t.t_set_sched[ thread->th.th_info.ds.ds_tid ].r_sched_type;
th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
switch ( th_type ) {
case kmp_sch_static:
case kmp_sch_static_greedy:
case kmp_sch_static_balanced:
*kind = kmp_sched_static;
*chunk = 0; // chunk was not set, try to show this fact via zero value
return;
case kmp_sch_static_chunked:
*kind = kmp_sched_static;
break;
case kmp_sch_dynamic_chunked:
*kind = kmp_sched_dynamic;
break;
case kmp_sch_guided_chunked:
case kmp_sch_guided_iterative_chunked:
case kmp_sch_guided_analytical_chunked:
*kind = kmp_sched_guided;
break;
case kmp_sch_auto:
*kind = kmp_sched_auto;
break;
case kmp_sch_trapezoidal:
*kind = kmp_sched_trapezoidal;
break;
/*
case kmp_sch_static_steal:
*kind = kmp_sched_static_steal;
break;
*/
default:
KMP_FATAL( UnknownSchedulingType, th_type );
}
//*chunk = thread->th.th_team->t.t_set_sched[ thread->th.th_info.ds.ds_tid ].chunk;
*chunk = thread->th.th_current_task->td_icvs.sched.chunk;
}
int
__kmp_get_ancestor_thread_num( int gtid, int level ) {
int ii, dd;
kmp_team_t *team;
kmp_info_t *thr;
KF_TRACE( 10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level ));
KMP_DEBUG_ASSERT( __kmp_init_serial );
// validate level
if( level == 0 ) return 0;
if( level < 0 ) return -1;
thr = __kmp_threads[ gtid ];
team = thr->th.th_team;
ii = team->t.t_level;
if( level > ii ) return -1;
#if OMP_40_ENABLED
if( thr->th.th_teams_microtask ) {
// AC: we are in a teams region where multiple nested teams have the same level
int tlevel = thr->th.th_teams_level; // the level of the teams construct
if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
KMP_DEBUG_ASSERT( ii >= tlevel );
// AC: As we need to pass by the teams league, we need to artificially increase ii
if ( ii == tlevel ) {
ii += 2; // three teams have same level
} else {
ii ++; // two teams have same level
}
}
}
#endif
if( ii == level ) return __kmp_tid_from_gtid( gtid );
dd = team->t.t_serialized;
level++;
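// Walk up the nesting hierarchy: consume the serialized levels (t_serialized)
// of the current team before moving to its parent, until the requested level
// is reached.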
while( ii > level )
{
for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
{
}
if( ( team->t.t_serialized ) && ( !dd ) ) {
team = team->t.t_parent;
continue;
}
if( ii > level ) {
team = team->t.t_parent;
dd = team->t.t_serialized;
ii--;
}
}
return ( dd > 1 ) ? ( 0 ) : ( team->t.t_master_tid );
}
int
__kmp_get_team_size( int gtid, int level ) {
int ii, dd;
kmp_team_t *team;
kmp_info_t *thr;
KF_TRACE( 10, ("__kmp_get_team_size: thread %d %d\n", gtid, level ));
KMP_DEBUG_ASSERT( __kmp_init_serial );
// validate level
if( level == 0 ) return 1;
if( level < 0 ) return -1;
thr = __kmp_threads[ gtid ];
team = thr->th.th_team;
ii = team->t.t_level;
if( level > ii ) return -1;
#if OMP_40_ENABLED
if( thr->th.th_teams_microtask ) {
// AC: we are in a teams region where multiple nested teams have the same level
int tlevel = thr->th.th_teams_level; // the level of the teams construct
if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
KMP_DEBUG_ASSERT( ii >= tlevel );
// AC: As we need to pass by the teams league, we need to artificially increase ii
if ( ii == tlevel ) {
ii += 2; // three teams have same level
} else {
ii ++; // two teams have same level
}
}
}
#endif
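// Walk up the nesting hierarchy as in __kmp_get_ancestor_thread_num: consume
// the serialized levels of each team before moving to its parent, until the
// requested level is reached.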
while( ii > level )
{
for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
{
}
if( team->t.t_serialized && ( !dd ) ) {
team = team->t.t_parent;
continue;
}
if( ii > level ) {
team = team->t.t_parent;
ii--;
}
}
return team->t.t_nproc;
}
kmp_r_sched_t
__kmp_get_schedule_global() {
// This routine was created because the pairs (__kmp_sched, __kmp_chunk) and (__kmp_static, __kmp_guided)
// may be changed by kmp_set_defaults independently, so the updated schedule can be obtained here.
kmp_r_sched_t r_sched;
// create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, __kmp_guided
// __kmp_sched should keep its original value, so that the user can set KMP_SCHEDULE multiple times
// and thus have different run-time schedules in different roots (even in OMP 2.5)
if ( __kmp_sched == kmp_sch_static ) {
r_sched.r_sched_type = __kmp_static; // replace STATIC with more detailed schedule (balanced or greedy)
} else if ( __kmp_sched == kmp_sch_guided_chunked ) {
r_sched.r_sched_type = __kmp_guided; // replace GUIDED with more detailed schedule (iterative or analytical)
} else {
r_sched.r_sched_type = __kmp_sched; // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
}
if ( __kmp_chunk < KMP_DEFAULT_CHUNK ) { // __kmp_chunk may be wrong here (if it was never set)
r_sched.chunk = KMP_DEFAULT_CHUNK;
} else {
r_sched.chunk = __kmp_chunk;
}
return r_sched;
}
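// __kmp_get_schedule_global() is used below by __kmp_get_global_icvs() and
// __kmp_initialize_root() to seed the run-time schedule {sched,chunk} pair.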
/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
/*
* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
* at least argc number of *t_argv entries for the requested team.
*/
static void
__kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc )
{
KMP_DEBUG_ASSERT( team );
if( !realloc || argc > team->t.t_max_argc ) {
KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: needed entries=%d, current entries=%d\n",
team->t.t_id, argc, ( realloc ) ? team->t.t_max_argc : 0 ));
/* if previously allocated heap space for args, free them */
if ( realloc && team->t.t_argv != &team->t.t_inline_argv[0] )
__kmp_free( (void *) team->t.t_argv );
if ( argc <= KMP_INLINE_ARGV_ENTRIES ) {
/* use unused space in the cache line for arguments */
team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: inline allocate %d argv entries\n",
team->t.t_id, team->t.t_max_argc ));
team->t.t_argv = &team->t.t_inline_argv[0];
if ( __kmp_storage_map ) {
__kmp_print_storage_map_gtid( -1, &team->t.t_inline_argv[0],
&team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
(sizeof(void *) * KMP_INLINE_ARGV_ENTRIES),
"team_%d.t_inline_argv",
team->t.t_id );
}
} else {
/* allocate space for arguments in the heap */
team->t.t_max_argc = ( argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1 )) ?
KMP_MIN_MALLOC_ARGV_ENTRIES : 2 * argc;
KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: dynamic allocate %d argv entries\n",
team->t.t_id, team->t.t_max_argc ));
team->t.t_argv = (void**) __kmp_page_allocate( sizeof(void*) * team->t.t_max_argc );
if ( __kmp_storage_map ) {
__kmp_print_storage_map_gtid( -1, &team->t.t_argv[0], &team->t.t_argv[team->t.t_max_argc],
sizeof(void *) * team->t.t_max_argc, "team_%d.t_argv",
team->t.t_id );
}
}
}
}
static void
__kmp_allocate_team_arrays(kmp_team_t *team, int max_nth)
{
int i;
int num_disp_buff = max_nth > 1 ? KMP_MAX_DISP_BUF : 2;
#if KMP_USE_POOLED_ALLOC
// AC: TODO: fix bug here: size of t_disp_buffer should not be multiplied by max_nth!
char *ptr = __kmp_allocate(max_nth *
( sizeof(kmp_info_t*) + sizeof(dispatch_shared_info_t)*num_disp_buff
+ sizeof(kmp_disp_t) + sizeof(int)*6
//+ sizeof(int)
+ sizeof(kmp_r_sched_t)
+ sizeof(kmp_taskdata_t) ) );
team->t.t_threads = (kmp_info_t**) ptr; ptr += sizeof(kmp_info_t*) * max_nth;
team->t.t_disp_buffer = (dispatch_shared_info_t*) ptr;
ptr += sizeof(dispatch_shared_info_t) * num_disp_buff;
team->t.t_dispatch = (kmp_disp_t*) ptr; ptr += sizeof(kmp_disp_t) * max_nth;
team->t.t_set_nproc = (int*) ptr; ptr += sizeof(int) * max_nth;
team->t.t_set_dynamic = (int*) ptr; ptr += sizeof(int) * max_nth;
team->t.t_set_nested = (int*) ptr; ptr += sizeof(int) * max_nth;
team->t.t_set_blocktime = (int*) ptr; ptr += sizeof(int) * max_nth;
team->t.t_set_bt_intervals = (int*) ptr; ptr += sizeof(int) * max_nth;
team->t.t_set_bt_set = (int*) ptr;
ptr += sizeof(int) * max_nth;
//team->t.t_set_max_active_levels = (int*) ptr; ptr += sizeof(int) * max_nth;
team->t.t_set_sched = (kmp_r_sched_t*) ptr;
ptr += sizeof(kmp_r_sched_t) * max_nth;
team->t.t_implicit_task_taskdata = (kmp_taskdata_t*) ptr;
ptr += sizeof(kmp_taskdata_t) * max_nth;
#else
team->t.t_threads = (kmp_info_t**) __kmp_allocate( sizeof(kmp_info_t*) * max_nth );
team->t.t_disp_buffer = (dispatch_shared_info_t*)
__kmp_allocate( sizeof(dispatch_shared_info_t) * num_disp_buff );
team->t.t_dispatch = (kmp_disp_t*) __kmp_allocate( sizeof(kmp_disp_t) * max_nth );
//team->t.t_set_max_active_levels = (int*) __kmp_allocate( sizeof(int) * max_nth );
//team->t.t_set_sched = (kmp_r_sched_t*) __kmp_allocate( sizeof(kmp_r_sched_t) * max_nth );
team->t.t_implicit_task_taskdata = (kmp_taskdata_t*) __kmp_allocate( sizeof(kmp_taskdata_t) * max_nth );
#endif
team->t.t_max_nproc = max_nth;
/* setup dispatch buffers */
for(i = 0 ; i < num_disp_buff; ++i)
team->t.t_disp_buffer[i].buffer_index = i;
}
static void
__kmp_free_team_arrays(kmp_team_t *team) {
/* Note: this does not free the threads in t_threads (__kmp_free_threads) */
int i;
for ( i = 0; i < team->t.t_max_nproc; ++ i ) {
if ( team->t.t_dispatch[ i ].th_disp_buffer != NULL ) {
__kmp_free( team->t.t_dispatch[ i ].th_disp_buffer );
team->t.t_dispatch[ i ].th_disp_buffer = NULL;
}; // if
}; // for
__kmp_free(team->t.t_threads);
#if !KMP_USE_POOLED_ALLOC
__kmp_free(team->t.t_disp_buffer);
__kmp_free(team->t.t_dispatch);
//__kmp_free(team->t.t_set_max_active_levels);
//__kmp_free(team->t.t_set_sched);
__kmp_free(team->t.t_implicit_task_taskdata);
#endif
team->t.t_threads = NULL;
team->t.t_disp_buffer = NULL;
team->t.t_dispatch = NULL;
//team->t.t_set_sched = 0;
//team->t.t_set_max_active_levels = 0;
team->t.t_implicit_task_taskdata = 0;
}
static void
__kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
kmp_info_t **oldThreads = team->t.t_threads;
#if !KMP_USE_POOLED_ALLOC
__kmp_free(team->t.t_disp_buffer);
__kmp_free(team->t.t_dispatch);
//__kmp_free(team->t.t_set_max_active_levels);
//__kmp_free(team->t.t_set_sched);
__kmp_free(team->t.t_implicit_task_taskdata);
#endif
__kmp_allocate_team_arrays(team, max_nth);
memcpy(team->t.t_threads, oldThreads, team->t.t_nproc * sizeof (kmp_info_t*));
__kmp_free(oldThreads);
}
static kmp_internal_control_t
__kmp_get_global_icvs( void ) {
kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
#if OMP_40_ENABLED
KMP_DEBUG_ASSERT( __kmp_nested_proc_bind.used > 0 );
#endif /* OMP_40_ENABLED */
kmp_internal_control_t g_icvs = {
0, //int serial_nesting_level; //corresponds to the value of the th_team_serialized field
(kmp_int8)__kmp_dflt_nested, //int nested; //internal control for nested parallelism (per thread)
(kmp_int8)__kmp_global.g.g_dynamic, //internal control for dynamic adjustment of threads (per thread)
(kmp_int8)__kmp_env_blocktime, //int bt_set; //internal control for whether blocktime is explicitly set
__kmp_dflt_blocktime, //int blocktime; //internal control for blocktime
__kmp_bt_intervals, //int bt_intervals; //internal control for blocktime intervals
__kmp_dflt_team_nth, //int nproc; //internal control for # of threads for next parallel region (per thread)
// (use a max ub on value if __kmp_parallel_initialize not called yet)
__kmp_dflt_max_active_levels, //int max_active_levels; //internal control for max_active_levels
r_sched, //kmp_r_sched_t sched; //internal control for runtime schedule {sched,chunk} pair
#if OMP_40_ENABLED
__kmp_nested_proc_bind.bind_types[0],
#endif /* OMP_40_ENABLED */
NULL //struct kmp_internal_control *next;
};
return g_icvs;
}
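// Snapshot of the global ICVs; used by __kmp_initialize_root() below when
// allocating the root team and the hot team.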
static kmp_internal_control_t
__kmp_get_x_global_icvs( const kmp_team_t *team ) {
kmp_internal_control_t gx_icvs;
gx_icvs.serial_nesting_level = 0; // probably should be team->t.t_serialized, as in __kmp_save_internal_controls
copy_icvs( & gx_icvs, & team->t.t_threads[0]->th.th_current_task->td_icvs );
gx_icvs.next = NULL;
return gx_icvs;
}
static void
__kmp_initialize_root( kmp_root_t *root )
{
int f;
kmp_team_t *root_team;
kmp_team_t *hot_team;
size_t disp_size, dispatch_size, bar_size;
int hot_team_max_nth;
kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
KMP_DEBUG_ASSERT( root );
KMP_ASSERT( ! root->r.r_begin );
/* setup the root state structure */
__kmp_init_lock( &root->r.r_begin_lock );
root->r.r_begin = FALSE;
root->r.r_active = FALSE;
root->r.r_in_parallel = 0;
root->r.r_blocktime = __kmp_dflt_blocktime;
root->r.r_nested = __kmp_dflt_nested;
/* setup the root team for this task */
/* allocate the root team structure */
KF_TRACE( 10, ( "__kmp_initialize_root: before root_team\n" ) );
root_team =
__kmp_allocate_team(
root,
1, // new_nproc
1, // max_nproc
#if OMP_40_ENABLED
__kmp_nested_proc_bind.bind_types[0],
#endif
&r_icvs,
0 // argc
USE_NESTED_HOT_ARG(NULL) // master thread is unknown
);
KF_TRACE( 10, ( "__kmp_initialize_root: after root_team = %p\n", root_team ) );
root->r.r_root_team = root_team;
root_team->t.t_control_stack_top = NULL;
/* initialize root team */
root_team->t.t_threads[0] = NULL;
root_team->t.t_nproc = 1;
root_team->t.t_serialized = 1;
// TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
root_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
root_team->t.t_sched.chunk = r_sched.chunk;
KA_TRACE( 20, ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
/* setup the hot team for this task */
/* allocate the hot team structure */
KF_TRACE( 10, ( "__kmp_initialize_root: before hot_team\n" ) );
hot_team =
__kmp_allocate_team(
root,
1, // new_nproc
__kmp_dflt_team_nth_ub * 2, // max_nproc
#if OMP_40_ENABLED
__kmp_nested_proc_bind.bind_types[0],
#endif
&r_icvs,
0 // argc
USE_NESTED_HOT_ARG(NULL) // master thread is unknown
);
KF_TRACE( 10, ( "__kmp_initialize_root: after hot_team = %p\n", hot_team ) );
root->r.r_hot_team = hot_team;
root_team->t.t_control_stack_top = NULL;
/* first-time initialization */
hot_team->t.t_parent = root_team;
/* initialize hot team */
hot_team_max_nth = hot_team->t.t_max_nproc;
for ( f = 0; f < hot_team_max_nth; ++ f ) {
hot_team->t.t_threads[ f ] = NULL;
}; // for
hot_team->t.t_nproc = 1;
// TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
hot_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
hot_team->t.t_sched.chunk = r_sched.chunk;
hot_team->t.t_size_changed = 0;
}
#ifdef KMP_DEBUG
typedef struct kmp_team_list_item {
kmp_team_p const * entry;
struct kmp_team_list_item * next;
} kmp_team_list_item_t;
typedef kmp_team_list_item_t * kmp_team_list_t;
static void
__kmp_print_structure_team_accum( // Add team to list of teams.
kmp_team_list_t list, // List of teams.
kmp_team_p const * team // Team to add.
) {
// List must terminate with item where both entry and next are NULL.
// Team is added to the list only once.
// List is sorted in ascending order by team id.
// Team id is *not* a key.
kmp_team_list_t l;
KMP_DEBUG_ASSERT( list != NULL );
if ( team == NULL ) {
return;
}; // if
__kmp_print_structure_team_accum( list, team->t.t_parent );