/*
* kmp_tasking.cpp -- OpenMP 3.0 tasking support.
*/
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_wait_release.h"
#include "kmp_taskdeps.h"
#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif
#include "tsan_annotations.h"
/* forward declaration */
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
kmp_info_t *this_thr);
static void __kmp_alloc_task_deque(kmp_info_t *thread,
kmp_thread_data_t *thread_data);
static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
kmp_task_team_t *task_team);
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
#ifdef BUILD_TIED_TASK_STACK
// __kmp_trace_task_stack: print the tied tasks from the task stack in order
// from top to bottom
//
// gtid: global thread identifier for thread containing stack
// thread_data: thread data for task team thread containing stack
// threshold: value above which the trace statement triggers
// location: string identifying call site of this function (for trace)
static void __kmp_trace_task_stack(kmp_int32 gtid,
kmp_thread_data_t *thread_data,
int threshold, char *location) {
kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
kmp_taskdata_t **stack_top = task_stack->ts_top;
kmp_int32 entries = task_stack->ts_entries;
kmp_taskdata_t *tied_task;
KA_TRACE(
threshold,
("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
"first_block = %p, stack_top = %p \n",
location, gtid, entries, task_stack->ts_first_block, stack_top));
KMP_DEBUG_ASSERT(stack_top != NULL);
KMP_DEBUG_ASSERT(entries > 0);
while (entries != 0) {
KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
// fix up ts_top if we need to pop from previous block
if ((entries & TASK_STACK_INDEX_MASK) == 0) {
kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
stack_block = stack_block->sb_prev;
stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
}
// finish bookkeeping
stack_top--;
entries--;
tied_task = *stack_top;
KMP_DEBUG_ASSERT(tied_task != NULL);
KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
KA_TRACE(threshold,
("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
"stack_top=%p, tied_task=%p\n",
location, gtid, entries, stack_top, tied_task));
}
KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
KA_TRACE(threshold,
("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
location, gtid));
}
// __kmp_init_task_stack: initialize the task stack for the first time
// after a thread_data structure is created.
// It should not be necessary to do this again (assuming the stack works).
//
// gtid: global thread identifier of calling thread
// thread_data: thread data for task team thread containing stack
static void __kmp_init_task_stack(kmp_int32 gtid,
kmp_thread_data_t *thread_data) {
kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
kmp_stack_block_t *first_block;
// set up the first block of the stack
first_block = &task_stack->ts_first_block;
task_stack->ts_top = (kmp_taskdata_t **)first_block;
memset((void *)first_block, '\0',
TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
// initialize the stack to be empty
task_stack->ts_entries = TASK_STACK_EMPTY;
first_block->sb_next = NULL;
first_block->sb_prev = NULL;
}
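// Illustrative sketch (not part of the runtime) of the layout that the task
// stack routines in this file maintain. The first block lives inline in
// kmp_task_stack_t; overflow blocks of TASK_STACK_BLOCK_SIZE entries are
// heap-allocated and doubly linked through sb_next/sb_prev, and
// TASK_STACK_INDEX_MASK detects when ts_top crosses a block boundary:
//
//   kmp_task_stack_t
//   +----------------------------+      +----------------------------+
//   | ts_first_block:            |      | heap-allocated block:      |
//   |   sb_block[0..BLOCK_SIZE)  |      |   sb_block[0..BLOCK_SIZE)  |
//   |   sb_next  ----------------+----> |   sb_prev (back link)      |
//   +----------------------------+      +----------------------------+
//
//   ts_top points at the next free slot; ts_entries counts pushed tasks.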
// __kmp_free_task_stack: free the task stack when thread_data is destroyed.
//
// gtid: global thread identifier for calling thread
// thread_data: thread info for thread containing stack
static void __kmp_free_task_stack(kmp_int32 gtid,
kmp_thread_data_t *thread_data) {
kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
// free from the second block of the stack
while (stack_block != NULL) {
kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
stack_block->sb_next = NULL;
stack_block->sb_prev = NULL;
if (stack_block != &task_stack->ts_first_block) {
__kmp_thread_free(__kmp_threads[gtid],
stack_block); // free the block, if not the first
}
stack_block = next_block;
}
// initialize the stack to be empty
task_stack->ts_entries = 0;
task_stack->ts_top = NULL;
}
// __kmp_push_task_stack: Push the tied task onto the task stack.
// Grow the stack if necessary by allocating another block.
//
// gtid: global thread identifier for calling thread
// thread: thread info for thread containing stack
// tied_task: the task to push on the stack
static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
kmp_taskdata_t *tied_task) {
// GEH - need to consider what to do if tt_threads_data not allocated yet
kmp_thread_data_t *thread_data =
&thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
return; // Don't push anything on stack if team or team tasks are serialized
}
KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
KA_TRACE(20,
("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
gtid, thread, tied_task));
// Store entry
*(task_stack->ts_top) = tied_task;
// Do bookkeeping for next push
task_stack->ts_top++;
task_stack->ts_entries++;
if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
// Find beginning of this task block
kmp_stack_block_t *stack_block =
(kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
// Check if we already have a block
if (stack_block->sb_next !=
NULL) { // reset ts_top to beginning of next block
task_stack->ts_top = &stack_block->sb_next->sb_block[0];
} else { // Alloc new block and link it up
kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
thread, sizeof(kmp_stack_block_t));
task_stack->ts_top = &new_block->sb_block[0];
stack_block->sb_next = new_block;
new_block->sb_prev = stack_block;
new_block->sb_next = NULL;
KA_TRACE(
30,
("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
gtid, tied_task, new_block));
}
}
KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
tied_task));
}
// __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return
// the task, just check to make sure it matches the ending task passed in.
//
// gtid: global thread identifier for the calling thread
// thread: thread info structure containing stack
// ending_task: the task that is ending (should match popped task)
static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
kmp_taskdata_t *ending_task) {
// GEH - need to consider what to do if tt_threads_data not allocated yet
kmp_thread_data_t *thread_data =
&thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
kmp_taskdata_t *tied_task;
if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
// Don't pop anything from stack if team or team tasks are serialized
return;
}
KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
thread));
// fix up ts_top if we need to pop from previous block
if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
stack_block = stack_block->sb_prev;
task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
}
// finish bookkeeping
task_stack->ts_top--;
task_stack->ts_entries--;
tied_task = *(task_stack->ts_top);
KMP_DEBUG_ASSERT(tied_task != NULL);
KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
tied_task));
return;
}
#endif /* BUILD_TIED_TASK_STACK */
// returns true if the new task is allowed to execute, false otherwise;
// checks the Task Scheduling Constraint (if requested) and
// mutexinoutset dependencies if any
static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
const kmp_taskdata_t *tasknew,
const kmp_taskdata_t *taskcurr) {
if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
// Check if the candidate obeys the Task Scheduling Constraints (TSC):
// only a descendant of all deferred tied tasks can be scheduled; checking
// the last one is enough, as it in turn is a descendant of all the others
kmp_taskdata_t *current = taskcurr->td_last_tied;
KMP_DEBUG_ASSERT(current != NULL);
// check if the task is not suspended on barrier
if (current->td_flags.tasktype == TASK_EXPLICIT ||
current->td_taskwait_thread > 0) { // <= 0 on barrier
kmp_int32 level = current->td_level;
kmp_taskdata_t *parent = tasknew->td_parent;
while (parent != current && parent->td_level > level) {
// check generation up to the level of the current task
parent = parent->td_parent;
KMP_DEBUG_ASSERT(parent != NULL);
}
if (parent != current)
return false;
}
}
// Check mutexinoutset dependencies, acquire locks
kmp_depnode_t *node = tasknew->td_depnode;
if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
continue;
// could not get the lock, release previous locks
for (int j = i - 1; j >= 0; --j)
__kmp_release_lock(node->dn.mtx_locks[j], gtid);
return false;
}
// negative num_locks means all locks acquired successfully
node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
}
return true;
}
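// Illustrative sketch of the TSC check above (not part of the runtime).
// Suppose this task tree, where T2 is taskcurr->td_last_tied (the last
// deferred tied task):
//
//   T0 (implicit, level 0)
//   |- T1 (level 1)
//   |   `- T2 (level 2)   <- taskcurr->td_last_tied
//   `- T3 (level 1)       <- tasknew->td_parent
//
// The ancestor walk starts at T3; since T3's level is not greater than T2's,
// the loop exits with parent != current and a tied tasknew is rejected.
// A tied child of T2 would reach parent == current and be allowed.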
// __kmp_realloc_task_deque:
// Re-allocates a task deque for a particular thread, copies the content from
// the old deque, and adjusts the necessary data structures relating to the
// deque. This operation must be done with the deque_lock held.
static void __kmp_realloc_task_deque(kmp_info_t *thread,
kmp_thread_data_t *thread_data) {
kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
kmp_int32 new_size = 2 * size;
KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
"%d] for thread_data %p\n",
__kmp_gtid_from_thread(thread), size, new_size, thread_data));
kmp_taskdata_t **new_deque =
(kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
int i, j;
for (i = thread_data->td.td_deque_head, j = 0; j < size;
i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
new_deque[j] = thread_data->td.td_deque[i];
__kmp_free(thread_data->td.td_deque);
thread_data->td.td_deque_head = 0;
thread_data->td.td_deque_tail = size;
thread_data->td.td_deque = new_deque;
thread_data->td.td_deque_size = new_size;
}
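// Illustrative sketch (not part of the runtime; the sizes are assumptions
// chosen for the example) of how the copy loop above unwraps a full, wrapped
// ring buffer into the new deque. With size = 4 and head = tail = 2:
//
//   old deque: [ t2 t3 | t0 t1 ]        head = 2 wraps via TASK_DEQUE_MASK
//   new deque: [ t0 t1 t2 t3 . . . . ]  head = 0, tail = 4, size = 8
//
// After the copy the tasks are contiguous again, so head/tail arithmetic
// restarts from index 0.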
// __kmp_push_task: Add a task to the thread's deque
static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
kmp_info_t *thread = __kmp_threads[gtid];
kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
// We don't need to map to the shadow gtid if this is already a hidden
// helper thread
if (taskdata->td_flags.hidden_helper && !KMP_HIDDEN_HELPER_THREAD(gtid)) {
gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
thread = __kmp_threads[gtid];
}
kmp_task_team_t *task_team = thread->th.th_task_team;
kmp_int32 tid = __kmp_tid_from_gtid(gtid);
kmp_thread_data_t *thread_data;
KA_TRACE(20,
("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
// untied task needs to increment counter so that the task structure is not
// freed prematurely
kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
KMP_DEBUG_USE_VAR(counter);
KA_TRACE(
20,
("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
gtid, counter, taskdata));
}
// The first check avoids building task_team thread data if serialized
if (UNLIKELY(taskdata->td_flags.task_serial)) {
KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
"TASK_NOT_PUSHED for task %p\n",
gtid, taskdata));
return TASK_NOT_PUSHED;
}
// Now that serialized tasks have returned, we can assume that we are not in
// immediate exec mode
KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
__kmp_enable_tasking(task_team, thread);
}
KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
// Find tasking deque specific to encountering thread
thread_data = &task_team->tt.tt_threads_data[tid];
// No lock needed since only the owner can allocate. If the task is
// hidden_helper, we don't need it either because we have initialized the
// deque for hidden helper thread data.
if (UNLIKELY(thread_data->td.td_deque == NULL)) {
__kmp_alloc_task_deque(thread, thread_data);
}
int locked = 0;
// Check if deque is full
if (TCR_4(thread_data->td.td_deque_ntasks) >=
TASK_DEQUE_SIZE(thread_data->td)) {
if (__kmp_enable_task_throttling &&
__kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
thread->th.th_current_task)) {
KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
"TASK_NOT_PUSHED for task %p\n",
gtid, taskdata));
return TASK_NOT_PUSHED;
} else {
__kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
locked = 1;
if (TCR_4(thread_data->td.td_deque_ntasks) >=
TASK_DEQUE_SIZE(thread_data->td)) {
// expand deque to push the task which is not allowed to execute
__kmp_realloc_task_deque(thread, thread_data);
}
}
}
// Lock the deque for the task push operation
if (!locked) {
__kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
// Need to recheck as we can get a proxy task from a thread outside of OpenMP
if (TCR_4(thread_data->td.td_deque_ntasks) >=
TASK_DEQUE_SIZE(thread_data->td)) {
if (__kmp_enable_task_throttling &&
__kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
thread->th.th_current_task)) {
__kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
"returning TASK_NOT_PUSHED for task %p\n",
gtid, taskdata));
return TASK_NOT_PUSHED;
} else {
// expand deque to push the task which is not allowed to execute
__kmp_realloc_task_deque(thread, thread_data);
}
}
}
// Must have room since only the calling thread can add tasks to this deque
KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
TASK_DEQUE_SIZE(thread_data->td));
thread_data->td.td_deque[thread_data->td.td_deque_tail] =
taskdata; // Push taskdata
// Wrap index.
thread_data->td.td_deque_tail =
(thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
TCW_4(thread_data->td.td_deque_ntasks,
TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
KMP_FSYNC_RELEASING(taskdata); // releasing child
KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
"task=%p ntasks=%d head=%u tail=%u\n",
gtid, taskdata, thread_data->td.td_deque_ntasks,
thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
__kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
// Signal one worker thread to execute the task
if (taskdata->td_flags.hidden_helper) {
// Wake hidden helper threads up if they're sleeping
__kmp_hidden_helper_worker_thread_signal();
}
return TASK_SUCCESSFULLY_PUSHED;
}
// __kmp_pop_current_task_from_thread: reset the calling thread's current task
// to its parent when a team ends
//
// this_thr: thread structure to set current_task in.
void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
"this_thread=%p, curtask=%p, "
"curtask_parent=%p\n",
0, this_thr, this_thr->th.th_current_task,
this_thr->th.th_current_task->td_parent));
this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
"this_thread=%p, curtask=%p, "
"curtask_parent=%p\n",
0, this_thr, this_thr->th.th_current_task,
this_thr->th.th_current_task->td_parent));
}
// __kmp_push_current_task_to_thread: set up the current task in the given
// thread for a new team
//
// this_thr: thread structure to set up
// team: team for implicit task data
// tid: thread within team to set up
void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
int tid) {
// the thread's current task is the parent of the just-created implicit
// tasks of the new team
KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
"curtask=%p "
"parent_task=%p\n",
tid, this_thr, this_thr->th.th_current_task,
team->t.t_implicit_task_taskdata[tid].td_parent));
KMP_DEBUG_ASSERT(this_thr != NULL);
if (tid == 0) {
if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
team->t.t_implicit_task_taskdata[0].td_parent =
this_thr->th.th_current_task;
this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
}
} else {
team->t.t_implicit_task_taskdata[tid].td_parent =
team->t.t_implicit_task_taskdata[0].td_parent;
this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
}
KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
"curtask=%p "
"parent_task=%p\n",
tid, this_thr, this_thr->th.th_current_task,
team->t.t_implicit_task_taskdata[tid].td_parent));
}
// __kmp_task_start: bookkeeping for a task starting execution
//
// GTID: global thread id of calling thread
// task: task starting execution
// current_task: task suspending
static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
kmp_taskdata_t *current_task) {
kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
kmp_info_t *thread = __kmp_threads[gtid];
KA_TRACE(10,
("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
gtid, taskdata, current_task));
KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
// mark currently executing task as suspended
// TODO: GEH - make sure root team implicit task is initialized properly.
// KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
current_task->td_flags.executing = 0;
// Add task to stack if tied
#ifdef BUILD_TIED_TASK_STACK
if (taskdata->td_flags.tiedness == TASK_TIED) {
__kmp_push_task_stack(gtid, thread, taskdata);
}
#endif /* BUILD_TIED_TASK_STACK */
// mark starting task as executing and as current task
thread->th.th_current_task = taskdata;
KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
taskdata->td_flags.tiedness == TASK_UNTIED);
KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
taskdata->td_flags.tiedness == TASK_UNTIED);
taskdata->td_flags.started = 1;
taskdata->td_flags.executing = 1;
KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
// GEH TODO: shouldn't we pass some sort of location identifier here?
// APT: yes, we will pass location here.
// need to store current thread state (in a thread or taskdata structure)
// before setting work_state, otherwise wrong state is set after end of task
KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
return;
}
#if OMPT_SUPPORT
//------------------------------------------------------------------------------
// __ompt_task_init:
// Initialize OMPT fields maintained by a task. This will only be called after
// ompt_start_tool, so we already know whether ompt is enabled or not.
static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
// The calls to __ompt_task_init already have the ompt_enabled condition.
task->ompt_task_info.task_data.value = 0;
task->ompt_task_info.frame.exit_frame = ompt_data_none;
task->ompt_task_info.frame.enter_frame = ompt_data_none;
task->ompt_task_info.frame.exit_frame_flags =
ompt_frame_runtime | ompt_frame_framepointer;
task->ompt_task_info.frame.enter_frame_flags =
ompt_frame_runtime | ompt_frame_framepointer;
}
// __ompt_task_start:
// Build and trigger task-begin event
static inline void __ompt_task_start(kmp_task_t *task,
kmp_taskdata_t *current_task,
kmp_int32 gtid) {
kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
ompt_task_status_t status = ompt_task_switch;
if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
status = ompt_task_yield;
__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
}
/* let OMPT know that we're about to run this task */
if (ompt_enabled.ompt_callback_task_schedule) {
ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
&(current_task->ompt_task_info.task_data), status,
&(taskdata->ompt_task_info.task_data));
}
taskdata->ompt_task_info.scheduling_parent = current_task;
}
// __ompt_task_finish:
// Build and trigger final task-schedule event
static inline void __ompt_task_finish(kmp_task_t *task,
kmp_taskdata_t *resumed_task,
ompt_task_status_t status) {
if (ompt_enabled.ompt_callback_task_schedule) {
kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
status = ompt_task_cancel;
}
/* let OMPT know that we're returning to the callee task */
ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
&(taskdata->ompt_task_info.task_data), status,
(resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
}
}
#endif
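// Illustrative tool-side view (a sketch, not part of the runtime): a tool
// that registered ompt_callback_task_schedule observes each switch reported
// above as (prior_task_data, status, next_task_data), where
// __ompt_task_start reports ompt_task_switch or ompt_task_yield and
// __ompt_task_finish reports ompt_task_complete, ompt_task_detach, or
// ompt_task_cancel for the finishing task.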
template <bool ompt>
static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
kmp_task_t *task,
void *frame_address,
void *return_address) {
kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
"current_task=%p\n",
gtid, loc_ref, taskdata, current_task));
if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
// untied task needs to increment counter so that the task structure is not
// freed prematurely
kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
KMP_DEBUG_USE_VAR(counter);
KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
"incremented for task %p\n",
gtid, counter, taskdata));
}
taskdata->td_flags.task_serial =
1; // Execute this task immediately, not deferred.
__kmp_task_start(gtid, task, current_task);
#if OMPT_SUPPORT
if (ompt) {
if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
current_task->ompt_task_info.frame.enter_frame.ptr =
taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
current_task->ompt_task_info.frame.enter_frame_flags =
taskdata->ompt_task_info.frame.exit_frame_flags =
ompt_frame_application | ompt_frame_framepointer;
}
if (ompt_enabled.ompt_callback_task_create) {
ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
ompt_callbacks.ompt_callback(ompt_callback_task_create)(
&(parent_info->task_data), &(parent_info->frame),
&(taskdata->ompt_task_info.task_data),
ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
return_address);
}
__ompt_task_start(task, current_task, gtid);
}
#endif // OMPT_SUPPORT
KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
loc_ref, taskdata));
}
#if OMPT_SUPPORT
OMPT_NOINLINE
static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
kmp_task_t *task,
void *frame_address,
void *return_address) {
__kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
return_address);
}
#endif // OMPT_SUPPORT
// __kmpc_omp_task_begin_if0: report that a given serialized task has started
// execution
//
// loc_ref: source location information; points to beginning of task block.
// gtid: global thread number.
// task: task thunk for the started task.
void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
kmp_task_t *task) {
#if OMPT_SUPPORT
if (UNLIKELY(ompt_enabled.enabled)) {
OMPT_STORE_RETURN_ADDRESS(gtid);
__kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
OMPT_GET_FRAME_ADDRESS(1),
OMPT_LOAD_RETURN_ADDRESS(gtid));
return;
}
#endif
__kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
}
#ifdef TASK_UNUSED
// __kmpc_omp_task_begin: report that a given task has started execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
KA_TRACE(
10,
("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
__kmp_task_start(gtid, task, current_task);
KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
loc_ref, KMP_TASK_TO_TASKDATA(task)));
return;
}
#endif // TASK_UNUSED
// __kmp_free_task: free the current task space and the space for shareds
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
kmp_info_t *thread) {
KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
taskdata));
// Check to make sure all flags and counters have the correct values
KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
taskdata->td_flags.task_serial == 1);
KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
taskdata->td_flags.freed = 1;
ANNOTATE_HAPPENS_BEFORE(taskdata);
// deallocate the taskdata and shared variable blocks associated with this task
#if USE_FAST_MEMORY
__kmp_fast_free(thread, taskdata);
#else /* ! USE_FAST_MEMORY */
__kmp_thread_free(thread, taskdata);
#endif
KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
}
// __kmp_free_task_and_ancestors: free the current task and ancestors without
// children
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
kmp_taskdata_t *taskdata,
kmp_info_t *thread) {
// Proxy tasks must always be allowed to free their parents
// because they can be run in background even in serial mode.
kmp_int32 team_serial =
(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
!taskdata->td_flags.proxy;
KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
KMP_DEBUG_ASSERT(children >= 0);
// Now, go up the ancestor tree to see if any ancestors can now be freed.
while (children == 0) {
kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
"and freeing itself\n",
gtid, taskdata));
// --- Deallocate my ancestor task ---
__kmp_free_task(gtid, taskdata, thread);
taskdata = parent_taskdata;
if (team_serial)
return;
// Stop checking ancestors at implicit task instead of walking up ancestor
// tree to avoid premature deallocation of ancestors.
if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
if (taskdata->td_dephash) { // do we need to cleanup dephash?
int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
kmp_tasking_flags_t flags_old = taskdata->td_flags;
if (children == 0 && flags_old.complete == 1) {
kmp_tasking_flags_t flags_new = flags_old;
flags_new.complete = 0;
if (KMP_COMPARE_AND_STORE_ACQ32(
RCAST(kmp_int32 *, &taskdata->td_flags),
*RCAST(kmp_int32 *, &flags_old),
*RCAST(kmp_int32 *, &flags_new))) {
KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
"dephash of implicit task %p\n",
gtid, taskdata));
// cleanup dephash of finished implicit task
__kmp_dephash_free_entries(thread, taskdata->td_dephash);
}
}
}
return;
}
// Predecrement simulated by "- 1" calculation
children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
KMP_DEBUG_ASSERT(children >= 0);
}
KA_TRACE(
20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
"not freeing it yet\n",
gtid, taskdata, children));
}
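// Illustrative lifetime of td_allocated_child_tasks (a sketch, matching the
// increments in __kmp_task_alloc and the decrements above):
//   __kmp_task_alloc(T)  : T.count starts at 1 (counts T itself);
//                          parent.count += 1 if the parent is explicit
//   __kmp_free_task_and_ancestors(T):
//     T.count -= 1; once it reaches 0, T is freed and the walk continues
//     with T's parent, stopping at an implicit task (or returning
//     immediately after the first free when team_serial is set).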
// __kmp_task_finish: bookkeeping to do when a task finishes execution
//
// gtid: global thread ID for calling thread
// task: task to be finished
// resumed_task: task to be resumed. (may be NULL if task is serialized)
//
// template<ompt>: effectively ompt_enabled.enabled != 0;
// the version with ompt=false is inlined, allowing all OMPT code to be
// optimized away in this case
template <bool ompt>
static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
kmp_taskdata_t *resumed_task) {
kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
kmp_info_t *thread = __kmp_threads[gtid];
kmp_task_team_t *task_team =
thread->th.th_task_team; // might be NULL for serial teams...
kmp_int32 children = 0;
KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
"task %p\n",
gtid, taskdata, resumed_task));
KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
// Pop task from stack if tied
#ifdef BUILD_TIED_TASK_STACK
if (taskdata->td_flags.tiedness == TASK_TIED) {
__kmp_pop_task_stack(gtid, thread, taskdata);
}
#endif /* BUILD_TIED_TASK_STACK */
if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
// untied task needs to check the counter so that the task structure is not
// freed prematurely
kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
KA_TRACE(
20,
("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
gtid, counter, taskdata));
if (counter > 0) {
// untied task is not done; it may be continued by another thread, so do
// not free it now
if (resumed_task == NULL) {
KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
resumed_task = taskdata->td_parent; // In a serialized task, the resumed
// task is the parent
}
thread->th.th_current_task = resumed_task; // restore current_task
resumed_task->td_flags.executing = 1; // resume previous task
KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
"resuming task %p\n",
gtid, taskdata, resumed_task));
return;
}
}
// bookkeeping for resuming task:
// GEH - note tasking_ser => task_serial
KMP_DEBUG_ASSERT(
(taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
taskdata->td_flags.task_serial);
if (taskdata->td_flags.task_serial) {
if (resumed_task == NULL) {
resumed_task = taskdata->td_parent; // In a serialized task, the resumed
// task is the parent
}
} else {
KMP_DEBUG_ASSERT(resumed_task !=
NULL); // verify that resumed task is passed as argument
}
/* If the task's destructor thunk flag has been set, we need to invoke the
destructor thunk that has been generated by the compiler. The code is
placed here, since at this point other tasks might have been released
hence overlapping the destructor invocations with some other work in the
released tasks. The OpenMP spec is not specific on when the destructors
are invoked, so we should be free to choose. */
if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
kmp_routine_entry_t destr_thunk = task->data1.destructors;
KMP_ASSERT(destr_thunk);
destr_thunk(gtid, task);
}
KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
bool detach = false;
if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
if (taskdata->td_allow_completion_event.type ==
KMP_EVENT_ALLOW_COMPLETION) {
// event hasn't been fulfilled yet. Try to detach task.
__kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
if (taskdata->td_allow_completion_event.type ==
KMP_EVENT_ALLOW_COMPLETION) {
// task finished execution
KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
taskdata->td_flags.executing = 0; // suspend the finishing task
#if OMPT_SUPPORT
// For a detached task that has not completed, report task detach here;
// the later omp_fulfill_event call signals completion.
// Locking is necessary to avoid a race with ompt_task_late_fulfill.
if (ompt)
__ompt_task_finish(task, resumed_task, ompt_task_detach);
#endif
// no access to taskdata after this point!
// __kmp_fulfill_event might free taskdata at any time from now
taskdata->td_flags.proxy = TASK_PROXY; // proxify!
detach = true;
}
__kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
}
}
if (!detach) {
taskdata->td_flags.complete = 1; // mark the task as completed
#if OMPT_SUPPORT
// This is not a detached task, we are done here
if (ompt)
__ompt_task_finish(task, resumed_task, ompt_task_complete);
#endif
// Only need to keep track of count if team parallel and tasking not
// serialized, or task is detachable and event has already been fulfilled
if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
taskdata->td_flags.detachable == TASK_DETACHABLE ||
taskdata->td_flags.hidden_helper) {
// Predecrement simulated by "- 1" calculation
children =
KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
KMP_DEBUG_ASSERT(children >= 0);
if (taskdata->td_taskgroup)
KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
__kmp_release_deps(gtid, taskdata);
} else if (task_team && task_team->tt.tt_found_proxy_tasks) {
// if we found proxy tasks there could exist a dependency chain
// with the proxy task as origin
__kmp_release_deps(gtid, taskdata);
}
// td_flags.executing must be marked as 0 after __kmp_release_deps has been
// called. Otherwise, if a task is executed immediately from the
// release_deps code, the flag will be reset to 1 again by this same
// function
KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
taskdata->td_flags.executing = 0; // suspend the finishing task
}
KA_TRACE(
20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
gtid, taskdata, children));
// Free this task and then ancestor tasks if they have no children.
// Restore th_current_task first as suggested by John:
// johnmc: if an asynchronous inquiry peers into the runtime system
// it doesn't see the freed task as the current task.
thread->th.th_current_task = resumed_task;
if (!detach)
__kmp_free_task_and_ancestors(gtid, taskdata, thread);
// TODO: GEH - make sure root team implicit task is initialized properly.
// KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
resumed_task->td_flags.executing = 1; // resume previous task
KA_TRACE(
10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
gtid, taskdata, resumed_task));
return;
}
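// Illustrative user-level view of the detach path handled above (a sketch;
// omp_event_handle_t and omp_fulfill_event are the OpenMP 5.0 API, while
// start_async_operation is a hypothetical placeholder):
//
//   omp_event_handle_t ev;
//   #pragma omp task detach(ev)
//   { start_async_operation(ev); } // body ends; the task stays incomplete
//                                  // and __kmp_task_finish proxifies it
//   ...
//   omp_fulfill_event(ev);         // later, possibly from another thread:
//                                  // completes the detached task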
template <bool ompt>
static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
kmp_int32 gtid,
kmp_task_t *task) {
KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
KMP_DEBUG_ASSERT(gtid >= 0);
// this routine will provide task to resume
__kmp_task_finish<ompt>(gtid, task, NULL);
KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
#if OMPT_SUPPORT
if (ompt) {
ompt_frame_t *ompt_frame;
__ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
ompt_frame->enter_frame = ompt_data_none;
ompt_frame->enter_frame_flags =
ompt_frame_runtime | ompt_frame_framepointer;
}
#endif
return;
}
#if OMPT_SUPPORT
OMPT_NOINLINE
void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
kmp_task_t *task) {
__kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
}
#endif // OMPT_SUPPORT
// __kmpc_omp_task_complete_if0: report that a task has completed execution
//
// loc_ref: source location information; points to end of task block.
// gtid: global thread number.
// task: task thunk for the completed task.
void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
kmp_task_t *task) {
#if OMPT_SUPPORT
if (UNLIKELY(ompt_enabled.enabled)) {
__kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
return;
}
#endif
__kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
}
#ifdef TASK_UNUSED
// __kmpc_omp_task_complete: report that a task has completed execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
kmp_task_t *task) {
KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
loc_ref, KMP_TASK_TO_TASKDATA(task)));
__kmp_task_finish<false>(gtid, task,
NULL); // Not sure how to find task to resume
KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
loc_ref, KMP_TASK_TO_TASKDATA(task)));
return;
}
#endif // TASK_UNUSED
// __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
// task for a given thread
//
// loc_ref: reference to source location of parallel region
// this_thr: thread data structure corresponding to implicit task
// team: team for this_thr
// tid: thread id of given thread within team
// set_curr_task: TRUE if need to push current task to thread
// NOTE: Routine does not set up the implicit task ICVs. This is assumed to
// have already been done elsewhere.
// TODO: Get better loc_ref. Value passed in may be NULL
void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
kmp_team_t *team, int tid, int set_curr_task) {
kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
KF_TRACE(
10,
("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
task->td_task_id = KMP_GEN_TASK_ID();
task->td_team = team;
// task->td_parent = NULL; // fix for CQ230101 (broken parent task info
// in debugger)
task->td_ident = loc_ref;
task->td_taskwait_ident = NULL;
task->td_taskwait_counter = 0;
task->td_taskwait_thread = 0;
task->td_flags.tiedness = TASK_TIED;
task->td_flags.tasktype = TASK_IMPLICIT;
task->td_flags.proxy = TASK_FULL;
// All implicit tasks are executed immediately, not deferred
task->td_flags.task_serial = 1;
task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
task->td_flags.started = 1;
task->td_flags.executing = 1;
task->td_flags.complete = 0;
task->td_flags.freed = 0;
task->td_depnode = NULL;
task->td_last_tied = task;
task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
if (set_curr_task) { // only do this init first time thread is created
KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
// Not used: don't need to deallocate implicit task
KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
task->td_taskgroup = NULL; // An implicit task does not have taskgroup
task->td_dephash = NULL;
__kmp_push_current_task_to_thread(this_thr, team, tid);
} else {
KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
}
#if OMPT_SUPPORT
if (UNLIKELY(ompt_enabled.enabled))
__ompt_task_init(task, tid);
#endif
KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
team, task));
}
// __kmp_finish_implicit_task: Release resources associated with implicit tasks
// at the end of parallel regions. Some resources are kept for reuse in the next
// parallel region.
//
// thread: thread data structure corresponding to implicit task
void __kmp_finish_implicit_task(kmp_info_t *thread) {
kmp_taskdata_t *task = thread->th.th_current_task;
if (task->td_dephash) {
int children;
task->td_flags.complete = 1;
children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
kmp_tasking_flags_t flags_old = task->td_flags;
if (children == 0 && flags_old.complete == 1) {
kmp_tasking_flags_t flags_new = flags_old;
flags_new.complete = 0;
if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
*RCAST(kmp_int32 *, &flags_old),
*RCAST(kmp_int32 *, &flags_new))) {
KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
"dephash of implicit task %p\n",
thread->th.th_info.ds.ds_gtid, task));
__kmp_dephash_free_entries(thread, task->td_dephash);
}
}
}
}
// __kmp_free_implicit_task: Release resources associated with implicit tasks
// when these are destroyed
//
// thread: thread data structure corresponding to implicit task
void __kmp_free_implicit_task(kmp_info_t *thread) {
kmp_taskdata_t *task = thread->th.th_current_task;
if (task && task->td_dephash) {
__kmp_dephash_free(thread, task->td_dephash);
task->td_dephash = NULL;
}
}
// Round up a size to a multiple of val, where val is a power of two: used to
// insert padding between structures co-allocated using a single malloc() call
static size_t __kmp_round_up_to_val(size_t size, size_t val) {
if (size & (val - 1)) {
size &= ~(val - 1);
if (size <= KMP_SIZE_T_MAX - val) {
size += val; // Round up if there is no overflow.
}
}
return size;
} // __kmp_round_up_to_val
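// Illustrative behavior of __kmp_round_up_to_val (a sketch; the values are
// assumptions chosen for the example), with val = sizeof(void *) = 8:
//   __kmp_round_up_to_val(24, 8) == 24  // already a multiple, unchanged
//   __kmp_round_up_to_val(25, 8) == 32  // masked down to 24, then += 8
//   __kmp_round_up_to_val(31, 8) == 32
// The KMP_SIZE_T_MAX check skips the increment when it would overflow.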
// __kmp_task_alloc: Allocate the taskdata and task data structures for a task
//
// loc_ref: source location information
// gtid: global thread number.
// flags: include tiedness & task type (explicit vs. implicit) of the ''new''
// task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
// sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
// private vars accessed in task.
// sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
// in task.
// task_entry: Pointer to task code entry point generated by compiler.
// returns: a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
kmp_tasking_flags_t *flags,
size_t sizeof_kmp_task_t, size_t sizeof_shareds,
kmp_routine_entry_t task_entry) {
kmp_task_t *task;
kmp_taskdata_t *taskdata;
kmp_info_t *thread = __kmp_threads[gtid];
kmp_info_t *encountering_thread = thread;
kmp_team_t *team = thread->th.th_team;
kmp_taskdata_t *parent_task = thread->th.th_current_task;
size_t shareds_offset;
if (UNLIKELY(!TCR_4(__kmp_init_middle)))
__kmp_middle_initialize();
if (flags->hidden_helper) {
if (__kmp_enable_hidden_helper) {
if (!TCR_4(__kmp_init_hidden_helper))
__kmp_hidden_helper_initialize();
// For a hidden helper task encountered by a regular thread, we will push
// the task to the (gtid%__kmp_hidden_helper_threads_num)-th hidden helper
// thread.
if (!KMP_HIDDEN_HELPER_THREAD(gtid)) {
thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
// We don't change the parent-child relation for hidden helper task as
// we need that to do per-task-region synchronization.
}
} else {
// If the hidden helper task is not enabled, reset the flag to FALSE.
flags->hidden_helper = FALSE;
}
}
KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
"sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
sizeof_shareds, task_entry));
if (parent_task->td_flags.final) {
flags->final = 1;
}
if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
// An encountered untied task causes the TSC algorithm to check the entire
// deque of the victim thread. If no untied task is encountered, checking
// the head of the deque is enough.
KMP_CHECK_UPDATE(
encountering_thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
}
// Detachable tasks are not proxy tasks yet but could be in the future.
// Doing the tasking setup when that happens is too late.
if (UNLIKELY(flags->proxy == TASK_PROXY ||
flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
if (flags->proxy == TASK_PROXY) {
flags->tiedness = TASK_UNTIED;
flags->merged_if0 = 1;
}
/* Are we running in a serialized parallel region or tskm_immediate_exec
mode? We need tasking support enabled. */
if ((encountering_thread->th.th_task_team) == NULL) {
/* This should only happen if the team is serialized;
set up a task team and propagate it to the thread */
KMP_DEBUG_ASSERT(team->t.t_serialized);
KA_TRACE(30,
("T#%d creating task team in __kmp_task_alloc for proxy task\n",
gtid));
__kmp_task_team_setup(
encountering_thread, team,
1); // 1 indicates setup the current team regardless of nthreads
encountering_thread->th.th_task_team =
team->t.t_task_team[encountering_thread->th.th_task_state];
}
kmp_task_team_t *task_team = encountering_thread->th.th_task_team;
/* tasking must be enabled now as the task might not be pushed */
if (!KMP_TASKING_ENABLED(task_team)) {
KA_TRACE(
30,
("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
__kmp_enable_tasking(task_team, encountering_thread);
kmp_int32 tid = encountering_thread->th.th_info.ds.ds_tid;
kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
// No lock needed since only owner can allocate
if (thread_data->td.td_deque == NULL) {
__kmp_alloc_task_deque(encountering_thread, thread_data);
}
}
if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
task_team->tt.tt_found_proxy_tasks == FALSE)
TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
if (flags->hidden_helper &&
task_team->tt.tt_hidden_helper_task_encountered == FALSE)
TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
}
// Calculate shared structure offset including padding after kmp_task_t struct
// to align pointers in shared struct
shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));
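// Illustrative layout of the single allocation performed below (a sketch;
// the offsets follow from the shareds_offset computation above):
//
//   +----------------+-----------------------------+---------+---------+
//   | kmp_taskdata_t | kmp_task_t (+ private vars) | padding | shareds |
//   +----------------+-----------------------------+---------+---------+
//   ^ taskdata       ^ task                        ^ taskdata + shareds_offset
//
// The padding comes from rounding shareds_offset up to sizeof(void *) so
// that the pointers in the shareds block are properly aligned.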
// Allocate a kmp_taskdata_t block and a kmp_task_t block.
KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
shareds_offset));
KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
sizeof_shareds));
// Avoid double allocation here by combining shareds with taskdata
#if USE_FAST_MEMORY
taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(
encountering_thread, shareds_offset + sizeof_shareds);
#else /* ! USE_FAST_MEMORY */
taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(
encountering_thread, shareds_offset + sizeof_shareds);
#endif /* USE_FAST_MEMORY */
ANNOTATE_HAPPENS_AFTER(taskdata);
task = KMP_TASKDATA_TO_TASK(taskdata);
// Make sure task & taskdata are aligned appropriately
#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
#else
KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
#endif
if (sizeof_shareds > 0) {
// Avoid double allocation here by combining shareds with taskdata
task->shareds = &((char *)taskdata)[shareds_offset];
// Make sure shareds struct is aligned to pointer size
KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
0);
} else {
task->shareds = NULL;
}
task->routine = task_entry;
task->part_id = 0; // AC: Always start with 0 part id
taskdata->td_task_id = KMP_GEN_TASK_ID();
taskdata->td_team = thread->th.th_team;
taskdata->td_alloc_thread = encountering_thread;
taskdata->td_parent = parent_task;
taskdata->td_level = parent_task->td_level + 1; // increment nesting level
KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
taskdata->td_ident = loc_ref;
taskdata->td_taskwait_ident = NULL;
taskdata->td_taskwait_counter = 0;
taskdata->td_taskwait_thread = 0;
KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
// avoid copying icvs for proxy tasks
if (flags->proxy == TASK_FULL)
copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
taskdata->td_flags.tiedness = flags->tiedness;
taskdata->td_flags.final = flags->final;
taskdata->td_flags.merged_if0 = flags->merged_if0;
taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
taskdata->td_flags.proxy = flags->proxy;
taskdata->td_flags.detachable = flags->detachable;
taskdata->td_flags.hidden_helper = flags->hidden_helper;
taskdata->encountering_gtid = gtid;
taskdata->td_task_team = thread->th.th_task_team;
taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
taskdata->td_flags.tasktype = TASK_EXPLICIT;
// GEH - TODO: fix this to copy parent task's value of tasking_ser flag
taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
// GEH - TODO: fix this to copy parent task's value of team_serial flag
taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
// GEH - Note we serialize the task if the team is serialized to make sure
// implicit parallel region tasks are not left until program termination to
// execute. Also, it helps locality to execute immediately.
taskdata->td_flags.task_serial =
(parent_task->td_flags.final || taskdata->td_flags.team_serial ||
taskdata->td_flags.tasking_ser || flags->merged_if0);
taskdata->td_flags.started = 0;
taskdata->td_flags.executing = 0;
taskdata->td_flags.complete = 0;
taskdata->td_flags.freed = 0;
taskdata->td_flags.native = flags->native;
KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
// start at one because counts current task and children
KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
taskdata->td_taskgroup =
parent_task->td_taskgroup; // task inherits taskgroup from the parent task
taskdata->td_dephash = NULL;
taskdata->td_depnode = NULL;
if (flags->tiedness == TASK_UNTIED)
taskdata->td_last_tied = NULL; // will be set when the task is scheduled
else
taskdata->td_last_tied = taskdata;
taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
#if OMPT_SUPPORT
if (UNLIKELY(ompt_enabled.enabled))
__ompt_task_init(taskdata, gtid);
#endif
// Only need to keep track of child task counts if team parallel and tasking
// not serialized or if it is a proxy or detachable or hidden helper task
if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE ||
flags->hidden_helper ||
!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
if (parent_task->td_taskgroup)
KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
// Only need to keep track of allocated child tasks for explicit tasks,
// since implicit tasks are never deallocated
if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
}
}
if (flags->hidden_helper) {
taskdata->td_flags.task_serial = FALSE;
// Increment the number of hidden helper tasks to be executed
KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
}
KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
gtid, taskdata, taskdata->td_parent));
ANNOTATE_HAPPENS_BEFORE(task);
return task;
}
kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
kmp_int32 flags, size_t sizeof_kmp_task_t,
size_t sizeof_shareds,
kmp_routine_entry_t task_entry) {
kmp_task_t *retval;
kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
__kmp_assert_valid_gtid(gtid);
input_flags->native = FALSE;
// __kmp_task_alloc() sets up all other runtime flags
KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
"sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
input_flags->proxy ? "proxy" : "",
input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
sizeof_shareds, task_entry));
retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
sizeof_shareds, task_entry);
KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
return retval;
}
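// Illustrative compiler-side usage of the entry point above (a sketch; the
// flags encoding assumes tiedness occupies bit 0 of kmp_tasking_flags_t,
// matching the cast above, and loc/privates_t/my_task_entry are hypothetical):
//
//   kmp_task_t *t = __kmpc_omp_task_alloc(
//       &loc, gtid, /*flags=*/1 /* tied */,
//       sizeof(kmp_task_t) + sizeof(privates_t), // task struct + privates
//       /*sizeof_shareds=*/2 * sizeof(void *),   // two shared pointers
//       &my_task_entry);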
kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
kmp_int32 flags,
size_t sizeof_kmp_task_t,
size_t sizeof_shareds,
kmp_routine_entry_t task_entry,
kmp_int64 device_id) {
if (__kmp_enable_hidden_helper) {
auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
input_flags.hidden_helper = TRUE;
}
return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
sizeof_shareds, task_entry);
}
/*!
@ingroup TASKING
@param loc_ref location of the original task directive
@param gtid Global Thread ID of encountering thread
@param new_task task thunk allocated by __kmpc_omp_task_alloc() for the ''new
task''
@param naffins Number of affinity items
@param affin_list List of affinity items
@return Returns non-zero if registering affinity information was not successful.
Returns 0 if registration was successful
This entry registers the affinity information attached to a task with the task
thunk structure kmp_taskdata_t.
*/
kmp_int32
__kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
kmp_task_t *new_task, kmp_int32 naffins,
kmp_task_affinity_info_t *affin_list) {
return 0;
}
// __kmp_invoke_task: invoke the specified task
//
// gtid: global thread ID of caller
// task: the task to invoke
// current_task: the task to resume after task invocation
static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
kmp_taskdata_t *current_task) {
kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
kmp_info_t *thread;
int discard = 0 /* false */;
KA_TRACE(
30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
gtid, taskdata, current_task));
KMP_DEBUG_ASSERT(task);
if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY &&
taskdata->td_flags.complete == 1)) {
// This is a proxy task that was already completed but it needs to run
// its bottom-half finish
KA_TRACE(
30,
("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
gtid, taskdata));
__kmp_bottom_half_finish_proxy(gtid, task);
KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
"proxy task %p, resuming task %p\n",
gtid, taskdata, current_task));
return;
}
#if OMPT_SUPPORT
// For untied tasks, the first task executed only calls __kmpc_omp_task and
// does not execute code.
ompt_thread_info_t oldInfo;
if (UNLIKELY(ompt_enabled.enabled)) {
// Store the threads states and restore them after the task
thread = __kmp_threads[gtid];
oldInfo = thread->th.ompt_thread_info;
thread->th.ompt_thread_info.wait_id = 0;
thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
? ompt_state_work_serial
: ompt_state_work_parallel;
taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
}
#endif
// Decrement the counter of hidden helper tasks to be executed
if (taskdata->td_flags.hidden_helper) {
// Hidden helper tasks can only be executed by hidden helper threads
KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
}
// Proxy tasks are not handled by the runtime
if (taskdata->td_flags.proxy != TASK_PROXY) {
ANNOTATE_HAPPENS_AFTER(task);
__kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
}
// TODO: cancel tasks if the parallel region has also been cancelled
// TODO: check if this sequence can be hoisted above __kmp_task_start
// if cancellation has been enabled for this run ...
if (UNLIKELY(__kmp_omp_cancellation)) {
thread = __kmp_threads[gtid];
kmp_team_t *this_team = thread->th.th_team;
kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
if ((taskgroup && taskgroup->cancel_request) ||
(this_team->t.t_cancel_request == cancel_parallel)) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
ompt_data_t *task_data;
if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
__ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
ompt_callbacks.ompt_callback(ompt_callback_cancel)(
task_data,
((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
: ompt_cancel_parallel) |
ompt_cancel_discarded_task,
NULL);
}
#endif
KMP_COUNT_BLOCK(TASK_cancelled);
// this task belongs to a task group and we need to cancel it
discard = 1 /* true */;
}
}
// Invoke the task routine and pass in relevant data.
// Thunks generated by gcc take a different argument list.
if (!discard) {
if (taskdata->td_flags.tiedness == TASK_UNTIED) {
taskdata->td_last_tied = current_task->td_last_tied;
KMP_DEBUG_ASSERT(taskdata->td_last_tied);
}
#if KMP_STATS_ENABLED
KMP_COUNT_BLOCK(TASK_executed);
switch (KMP_GET_THREAD_STATE()) {
case FORK_JOIN_BARRIER:
KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
break;
case PLAIN_BARRIER:
KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
break;
case TASKYIELD:
KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
break;
case TASKWAIT:
KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
break;
case TASKGROUP:
KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
break;
default:
KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
break;
}
#endif // KMP_STATS_ENABLED
// OMPT task begin
#if OMPT_SUPPORT
if (UNLIKELY(ompt_enabled.enabled))
__ompt_task_start(task, current_task, gtid);
#endif
#if USE_ITT_BUILD && USE_ITT_NOTIFY
kmp_uint64 cur_time;
kmp_int32 kmp_itt_count_task =
__kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
current_task->td_flags.tasktype == TASK_IMPLICIT;
if (kmp_itt_count_task) {
thread = __kmp_threads[gtid];
// Time outer level explicit task on barrier for adjusting imbalance time
if (thread->th.th_bar_arrive_time)
cur_time = __itt_get_timestamp();
else
kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
}
KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
#endif
#ifdef KMP_GOMP_COMPAT
if (taskdata->td_flags.native) {
((void (*)(void *))(*(task->routine)))(task->shareds);
} else
#endif /* KMP_GOMP_COMPAT */
{
(*(task->routine))(gtid, task);
}
KMP_POP_PARTITIONED_TIMER();
#if USE_ITT_BUILD && USE_ITT_NOTIFY
if (kmp_itt_count_task) {
// Barrier imbalance - adjust arrive time with the task duration
thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
}
KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
#endif
}
// Proxy tasks are not handled by the runtime
if (taskdata->td_flags.proxy != TASK_PROXY) {
ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
#if OMPT_SUPPORT
if (UNLIKELY(ompt_enabled.enabled)) {
thread->th.ompt_thread_info = oldInfo;
if (taskdata->td_flags.tiedness == TASK_TIED) {
taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
}
__kmp_task_finish<true>(gtid, task, current_task);
} else
#endif
__kmp_task_finish<false>(gtid, task, current_task);
}
KA_TRACE(
30,
("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
gtid, taskdata, current_task));
return;
}
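// Illustrative sketch (not part of the runtime): the call through
// task->routine above follows the (gtid, task) convention, so a
// compiler-generated task entry has roughly the shape below. The shareds
// layout and the body are hypothetical; only the signature comes from the
// invocation in __kmp_invoke_task.
//
//   kmp_int32 task_entry(kmp_int32 gtid, kmp_task_t *task) {
//     struct my_shareds { int *counter; } *sh =
//         (struct my_shareds *)task->shareds; // captured variables
//     ++(*sh->counter); // outlined task body
//     return 0;
//   }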
// __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
//
// loc_ref: location of original task pragma (ignored)
// gtid: Global Thread ID of encountering thread
// new_task: task thunk allocated by __kmp_omp_task_alloc() for the "new task"
// Returns:
// TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
// be resumed later.
// TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
// resumed later.
kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
kmp_task_t *new_task) {
kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
loc_ref, new_taskdata));
#if OMPT_SUPPORT
kmp_taskdata_t *parent;
if (UNLIKELY(ompt_enabled.enabled)) {
parent = new_taskdata->td_parent;
if (ompt_enabled.ompt_callback_task_create) {
ompt_data_t task_data = ompt_data_none;
ompt_callbacks.ompt_callback(ompt_callback_task_create)(
parent ? &(parent->ompt_task_info.task_data) : &task_data,
parent ? &(parent->ompt_task_info.frame) : NULL,
&(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
OMPT_GET_RETURN_ADDRESS(0));
}
}
#endif
/* Should we execute the new task or queue it? For now, let's just always try
to queue it. If the queue fills up, then we'll execute it. */
if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
{ // Execute this task immediately
kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
new_taskdata->td_flags.task_serial = 1;
__kmp_invoke_task(gtid, new_task, current_task);
}
KA_TRACE(
10,
("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
"loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
gtid, loc_ref, new_taskdata));
ANNOTATE_HAPPENS_BEFORE(new_task);
#if OMPT_SUPPORT
if (UNLIKELY(ompt_enabled.enabled)) {
parent->ompt_task_info.frame.enter_frame = ompt_data_none;
}
#endif
return TASK_CURRENT_NOT_QUEUED;
}
// __kmp_omp_task: Schedule a non-thread-switchable task for execution
//
// gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmp_omp_task_alloc()
// serialize_immediate: if TRUE then if the task is executed immediately its
// execution will be serialized
// Returns:
// TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
// be resumed later.
// TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
// resumed later.
kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
bool serialize_immediate) {
kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
/* Should we execute the new task or queue it? For now, let's just always try
to queue it. If the queue fills up, then we'll execute it. */
if (new_taskdata->td_flags.proxy == TASK_PROXY ||
__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
{ // Execute this task immediately
kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
if (serialize_immediate)
new_taskdata->td_flags.task_serial = 1;
__kmp_invoke_task(gtid, new_task, current_task);
}
ANNOTATE_HAPPENS_BEFORE(new_task);
return TASK_CURRENT_NOT_QUEUED;
}
// __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
// non-thread-switchable task from the parent thread only!
//
// loc_ref: location of original task pragma (ignored)
// gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmp_omp_task_alloc()
// Returns:
// TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
// be resumed later.
// TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
// resumed later.
kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
kmp_task_t *new_task) {
kmp_int32 res;
KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
#if KMP_DEBUG || OMPT_SUPPORT
kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
#endif
KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
new_taskdata));
__kmp_assert_valid_gtid(gtid);
#if OMPT_SUPPORT
kmp_taskdata_t *parent = NULL;
if (UNLIKELY(ompt_enabled.enabled)) {
if (!new_taskdata->td_flags.started) {
OMPT_STORE_RETURN_ADDRESS(gtid);
parent = new_taskdata->td_parent;
if (!parent->ompt_task_info.frame.enter_frame.ptr) {
parent->ompt_task_info.frame.enter_frame.ptr =
OMPT_GET_FRAME_ADDRESS(0);
}
if (ompt_enabled.ompt_callback_task_create) {
ompt_data_t task_data = ompt_data_none;
ompt_callbacks.ompt_callback(ompt_callback_task_create)(
parent ? &(parent->ompt_task_info.task_data) : &task_data,
parent ? &(parent->ompt_task_info.frame) : NULL,
&(new_taskdata->ompt_task_info.task_data),
ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
OMPT_LOAD_RETURN_ADDRESS(gtid));
}
} else {
// We are scheduling the continuation of an UNTIED task.
// Scheduling back to the parent task.
__ompt_task_finish(new_task,
new_taskdata->ompt_task_info.scheduling_parent,
ompt_task_switch);
new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
}
}
#endif
res = __kmp_omp_task(gtid, new_task, true);
KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
"TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
gtid, loc_ref, new_taskdata));
#if OMPT_SUPPORT
if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
parent->ompt_task_info.frame.enter_frame = ompt_data_none;
}
#endif
return res;
}
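// Illustrative sketch (hypothetical lowering, not part of the runtime): a
// compiler typically turns
//   #pragma omp task
//   work();
// into an allocation followed by a call to this entry point, roughly:
//   kmp_task_t *t = __kmpc_omp_task_alloc(&loc, gtid, /*flags=*/1,
//                                         sizeof_kmp_task_t, sizeof_shareds,
//                                         &task_entry);
//   __kmpc_omp_task(&loc, gtid, t);
// The flags and size arguments here are placeholders for values the compiler
// computes; see __kmpc_omp_task_alloc for the actual contract.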
// __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
// a taskloop task with the correct OMPT return address
//
// loc_ref: location of original task pragma (ignored)
// gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmp_omp_task_alloc()
// codeptr_ra: return address for OMPT callback
// Returns:
// TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
// be resumed later.
// TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
// resumed later.
kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
kmp_task_t *new_task, void *codeptr_ra) {
kmp_int32 res;
KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
#if KMP_DEBUG || OMPT_SUPPORT
kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
#endif
KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
new_taskdata));
#if OMPT_SUPPORT
kmp_taskdata_t *parent = NULL;
if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
parent = new_taskdata->td_parent;
if (!parent->ompt_task_info.frame.enter_frame.ptr)
parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
if (ompt_enabled.ompt_callback_task_create) {
ompt_data_t task_data = ompt_data_none;
ompt_callbacks.ompt_callback(ompt_callback_task_create)(
parent ? &(parent->ompt_task_info.task_data) : &task_data,
parent ? &(parent->ompt_task_info.frame) : NULL,
&(new_taskdata->ompt_task_info.task_data),
ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
codeptr_ra);
}
}
#endif
res = __kmp_omp_task(gtid, new_task, true);
KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
"TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
gtid, loc_ref, new_taskdata));
#if OMPT_SUPPORT
if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
parent->ompt_task_info.frame.enter_frame = ompt_data_none;
}
#endif
return res;
}
template <bool ompt>
static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
void *frame_address,
void *return_address) {
kmp_taskdata_t *taskdata;
kmp_info_t *thread;
int thread_finished = FALSE;
KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
KMP_DEBUG_ASSERT(gtid >= 0);
if (__kmp_tasking_mode != tskm_immediate_exec) {
thread = __kmp_threads[gtid];
taskdata = thread->th.th_current_task;
#if OMPT_SUPPORT && OMPT_OPTIONAL
ompt_data_t *my_task_data;
ompt_data_t *my_parallel_data;
if (ompt) {
my_task_data = &(taskdata->ompt_task_info.task_data);
my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
if (ompt_enabled.ompt_callback_sync_region) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
my_task_data, return_address);
}
if (ompt_enabled.ompt_callback_sync_region_wait) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
my_task_data, return_address);
}
}
#endif // OMPT_SUPPORT && OMPT_OPTIONAL
// Debugger: The taskwait is active. Store the location and the thread that
// encountered the taskwait.
#if USE_ITT_BUILD
// Note: These values are used by ITT events as well.
#endif /* USE_ITT_BUILD */
taskdata->td_taskwait_counter += 1;
taskdata->td_taskwait_ident = loc_ref;
taskdata->td_taskwait_thread = gtid + 1;
#if USE_ITT_BUILD
void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
#endif /* USE_ITT_NOTIFY */
#endif /* USE_ITT_BUILD */
bool must_wait =
!taskdata->td_flags.team_serial && !taskdata->td_flags.final;
must_wait = must_wait || (thread->th.th_task_team != NULL &&
thread->th.th_task_team->tt.tt_found_proxy_tasks);
// If hidden helper thread is encountered, we must enable wait here.
must_wait =
must_wait ||
(__kmp_enable_hidden_helper && thread->th.th_task_team != NULL &&
thread->th.th_task_team->tt.tt_hidden_helper_task_encountered);
if (must_wait) {
kmp_flag_32<false, false> flag(
RCAST(std::atomic<kmp_uint32> *,
&(taskdata->td_incomplete_child_tasks)),
0U);
while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
flag.execute_tasks(thread, gtid, FALSE,
&thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
__kmp_task_stealing_constraint);
}
}
#if USE_ITT_BUILD
KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
#endif /* USE_ITT_BUILD */
// Debugger: The taskwait is completed. Location remains, but thread is
// negated.
taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
#if OMPT_SUPPORT && OMPT_OPTIONAL
if (ompt) {
if (ompt_enabled.ompt_callback_sync_region_wait) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
my_task_data, return_address);
}
if (ompt_enabled.ompt_callback_sync_region) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
my_task_data, return_address);
}
taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
}
#endif // OMPT_SUPPORT && OMPT_OPTIONAL
ANNOTATE_HAPPENS_AFTER(taskdata);
}
KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
"returning TASK_CURRENT_NOT_QUEUED\n",
gtid, taskdata));
return TASK_CURRENT_NOT_QUEUED;
}
#if OMPT_SUPPORT && OMPT_OPTIONAL
OMPT_NOINLINE
static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
void *frame_address,
void *return_address) {
return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
return_address);
}
#endif // OMPT_SUPPORT && OMPT_OPTIONAL
// __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
// complete
kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
if (UNLIKELY(ompt_enabled.enabled)) {
OMPT_STORE_RETURN_ADDRESS(gtid);
return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
OMPT_LOAD_RETURN_ADDRESS(gtid));
}
#endif
return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
}
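// Illustrative sketch (hypothetical lowering, not part of the runtime):
//   #pragma omp taskwait
// is lowered to a single call to this entry point:
//   __kmpc_omp_taskwait(&loc, gtid);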
// __kmpc_omp_taskyield: switch to a different task
kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
kmp_taskdata_t *taskdata;
kmp_info_t *thread;
int thread_finished = FALSE;
KMP_COUNT_BLOCK(OMP_TASKYIELD);
KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
gtid, loc_ref, end_part));
__kmp_assert_valid_gtid(gtid);
if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
thread = __kmp_threads[gtid];
taskdata = thread->th.th_current_task;
// Should we model this as a task wait or not?
// Debugger: The taskwait is active. Store the location and the thread that
// encountered the taskwait.
#if USE_ITT_BUILD
// Note: These values are used by ITT events as well.
#endif /* USE_ITT_BUILD */
taskdata->td_taskwait_counter += 1;
taskdata->td_taskwait_ident = loc_ref;
taskdata->td_taskwait_thread = gtid + 1;
#if USE_ITT_BUILD
void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
#endif /* USE_ITT_NOTIFY */
#endif /* USE_ITT_BUILD */
if (!taskdata->td_flags.team_serial) {
kmp_task_team_t *task_team = thread->th.th_task_team;
if (task_team != NULL) {
if (KMP_TASKING_ENABLED(task_team)) {
#if OMPT_SUPPORT
if (UNLIKELY(ompt_enabled.enabled))
thread->th.ompt_thread_info.ompt_task_yielded = 1;
#endif
__kmp_execute_tasks_32(
thread, gtid, (kmp_flag_32<> *)NULL, FALSE,
&thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
__kmp_task_stealing_constraint);
#if OMPT_SUPPORT
if (UNLIKELY(ompt_enabled.enabled))
thread->th.ompt_thread_info.ompt_task_yielded = 0;
#endif
}
}
}
#if USE_ITT_BUILD
KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
#endif /* USE_ITT_BUILD */
// Debugger: The taskwait is completed. Location remains, but thread is
// negated.
taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
}
KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
"returning TASK_CURRENT_NOT_QUEUED\n",
gtid, taskdata));
return TASK_CURRENT_NOT_QUEUED;
}
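// Illustrative sketch (hypothetical lowering, not part of the runtime):
//   #pragma omp taskyield
// is lowered to a call such as
//   __kmpc_omp_taskyield(&loc, gtid, /*end_part=*/0);
// Note that end_part is only traced above; it is not otherwise used here.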
// Task Reduction implementation
//
// Note: the initial implementation didn't take into account the possibility
// of specifying omp_orig for the initializer of a UDR (user-defined
// reduction). The corrected implementation takes the omp_orig object into
// account. The compiler is free to use the old implementation if omp_orig
// is not specified.
/*!
@ingroup BASIC_TYPES
@{
*/
/*!
Flags for special info per task reduction item.
*/
typedef struct kmp_taskred_flags {
/*! 1 - use lazy alloc/init (e.g. big objects, #tasks < #threads) */
unsigned lazy_priv : 1;
unsigned reserved31 : 31;
} kmp_taskred_flags_t;
/*!
Internal struct for reduction data item related info set up by compiler.
*/
typedef struct kmp_task_red_input {
void *reduce_shar; /**< item shared between tasks to reduce into */
size_t reduce_size; /**< size of data item in bytes */
// three compiler-generated routines (init, fini are optional):
void *reduce_init; /**< data initialization routine (single parameter) */
void *reduce_fini; /**< data finalization routine */
void *reduce_comb; /**< data combiner routine */
kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
} kmp_task_red_input_t;
/*!
Internal struct for reduction data item related info saved by the library.
*/
typedef struct kmp_taskred_data {
void *reduce_shar; /**< item shared between tasks to reduce into */
size_t reduce_size; /**< size of data item */
kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
void *reduce_priv; /**< array of thread specific items */
void *reduce_pend; /**< end of private data for faster comparison op */
// three compiler-generated routines (init, fini are optional):
void *reduce_comb; /**< data combiner routine */
void *reduce_init; /**< data initialization routine (two parameters) */
void *reduce_fini; /**< data finalization routine */
void *reduce_orig; /**< original item (can be used in UDR initializer) */
} kmp_taskred_data_t;
/*!
Internal struct for reduction data item related info set up by compiler.
New interface: added reduce_orig field to provide omp_orig for UDR initializer.
*/
typedef struct kmp_taskred_input {
void *reduce_shar; /**< item shared between tasks to reduce into */
void *reduce_orig; /**< original reduction item used for initialization */
size_t reduce_size; /**< size of data item */
// three compiler-generated routines (init, fini are optional):
void *reduce_init; /**< data initialization routine (two parameters) */
void *reduce_fini; /**< data finalization routine */
void *reduce_comb; /**< data combiner routine */
kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
} kmp_taskred_input_t;
/*!
@}
*/
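// Illustrative sketch (hypothetical compiler-generated glue, not part of the
// runtime): for an int reduction such as "reduction(task, +: x)", an input
// record for the new interface could be filled roughly as below. The routine
// names are made up; the field meanings come from the struct definitions
// above, and the routine signatures match how __kmp_call_init and
// __kmp_task_reduction_fini invoke them.
//
//   static void red_init(void *priv, void *orig) { *(int *)priv = 0; }
//   static void red_comb(void *shar, void *priv) {
//     *(int *)shar += *(int *)priv;
//   }
//   kmp_taskred_input_t in;
//   memset(&in, 0, sizeof(in));   // also clears flags.reserved31
//   in.reduce_shar = &x;          // shared item to reduce into
//   in.reduce_orig = &x;          // omp_orig for the initializer
//   in.reduce_size = sizeof(int);
//   in.reduce_init = (void *)red_init;
//   in.reduce_fini = NULL;        // ints need no finalizer
//   in.reduce_comb = (void *)red_comb;
//   in.flags.lazy_priv = 0;       // eager per-thread storage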
template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
template <>
void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
kmp_task_red_input_t &src) {
item.reduce_orig = NULL;
}
template <>
void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
kmp_taskred_input_t &src) {
if (src.reduce_orig != NULL) {
item.reduce_orig = src.reduce_orig;
} else {
item.reduce_orig = src.reduce_shar;
} // non-NULL reduce_orig means new interface used
}
template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j);
template <>
void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
size_t offset) {
((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
}
template <>
void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
size_t offset) {
((void (*)(void *, void *))item.reduce_init)(
(char *)(item.reduce_priv) + offset, item.reduce_orig);
}
template <typename T>
void *__kmp_task_reduction_init(int gtid, int num, T *data) {
__kmp_assert_valid_gtid(gtid);
kmp_info_t *thread = __kmp_threads[gtid];
kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
kmp_uint32 nth = thread->th.th_team_nproc;
kmp_taskred_data_t *arr;
// check input data just in case
KMP_ASSERT(tg != NULL);
KMP_ASSERT(data != NULL);
KMP_ASSERT(num > 0);
if (nth == 1) {
KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
gtid, tg));
return (void *)tg;
}
KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
gtid, tg, num));
arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
thread, num * sizeof(kmp_taskred_data_t));
for (int i = 0; i < num; ++i) {
size_t size = data[i].reduce_size - 1;
// round the size up to cache line per thread-specific item
size += CACHE_LINE - size % CACHE_LINE;
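// e.g., with CACHE_LINE == 64 and reduce_size == 100:
//   size = 99 + (64 - 99 % 64) = 128 (two cache lines);
// the initial "- 1" keeps exact multiples (e.g., 64) from growing further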
KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
arr[i].reduce_shar = data[i].reduce_shar;
arr[i].reduce_size = size;
arr[i].flags = data[i].flags;
arr[i].reduce_comb = data[i].reduce_comb;
arr[i].reduce_init = data[i].reduce_init;
arr[i].reduce_fini = data[i].reduce_fini;
__kmp_assign_orig<T>(arr[i], data[i]);
if (!arr[i].flags.lazy_priv) {
// allocate cache-line aligned block and fill it with zeros
arr[i].reduce_priv = __kmp_allocate(nth * size);
arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
if (arr[i].reduce_init != NULL) {
// initialize all thread-specific items
for (size_t j = 0; j < nth; ++j) {
__kmp_call_init<T>(arr[i], j * size);
}
}
} else {
// only allocate space for pointers now,
// objects will be lazily allocated/initialized if/when requested
// note that __kmp_allocate zeroes the allocated memory
arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
}
}
tg->reduce_data = (void *)arr;
tg->reduce_num_data = num;
return (void *)tg;
}
/*!
@ingroup TASKING
@param gtid Global thread ID
@param num Number of data items to reduce
@param data Array of data for reduction
@return The taskgroup identifier
Initialize task reduction for the taskgroup.
Note: this entry assumes the optional compiler-generated initializer routine
has a single parameter - a pointer to the object to be initialized. That means
the reduction either does not use the omp_orig object, or omp_orig is
accessible without help from the runtime library.
*/
void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
}
/*!
@ingroup TASKING
@param gtid Global thread ID
@param num Number of data items to reduce
@param data Array of data for reduction
@return The taskgroup identifier
Initialize task reduction for the taskgroup.
Note: this entry assumes the optional compiler-generated initializer routine
has two parameters: a pointer to the object to be initialized and a pointer
to omp_orig.
*/
void *__kmpc_taskred_init(int gtid, int num, void *data) {
return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
}
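// Continuing the sketch after the struct definitions above (hypothetical
// glue, not the runtime itself): the taskgroup's reduction would then be set
// up with a single call,
//   void *tg = __kmpc_taskred_init(gtid, /*num=*/1, &in);
// and participating tasks later locate their private copies via
// __kmpc_task_reduction_get_th_data (below).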
// Copy task reduction data (except for shared pointers).
template <typename T>
void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
kmp_taskgroup_t *tg, void *reduce_data) {
kmp_taskred_data_t *arr;
KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
" from data %p\n",
thr, tg, reduce_data));
arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
thr, num * sizeof(kmp_taskred_data_t));
// threads will share private copies, thunk routines, sizes, flags, etc.:
KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
for (int i = 0; i < num; ++i) {
arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
}
tg->reduce_data = (void *)arr;
tg->reduce_num_data = num;
}
/*!
@ingroup TASKING
@param gtid Global thread ID
@param tskgrp The taskgroup ID (optional)
@param data Shared location of the item
@return The pointer to per-thread data
Get the thread-specific location of a data item.
*/
void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
__kmp_assert_valid_gtid(gtid);
kmp_info_t *thread = __kmp_threads[gtid];
kmp_int32 nth = thread->th.th_team_nproc;
if (nth == 1)
return data; // nothing to do
kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
if (tg == NULL)
tg = thread->th.th_current_task->td_taskgroup;
KMP_ASSERT(tg != NULL);
kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data);
kmp_int32 num = tg->reduce_num_data;
kmp_int32 tid = thread->th.th_info.ds.ds_tid;
KMP_ASSERT(data != NULL);
while (tg != NULL) {
for (int i = 0; i < num; ++i) {
if (!arr[i].flags.lazy_priv) {
if (data == arr[i].reduce_shar ||
(data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
} else {
// check shared location first
void **p_priv = (void **)(arr[i].reduce_priv);
if (data == arr[i].reduce_shar)
goto found;
// check if we get some thread specific location as parameter
for (int j = 0; j < nth; ++j)
if (data == p_priv[j])
goto found;
continue; // not found, continue search
found:
if (p_priv[tid] == NULL) {
// allocate thread specific object lazily
p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
if (arr[i].reduce_init != NULL) {
if (arr[i].reduce_orig != NULL) { // new interface
((void (*)(void *, void *))arr[i].reduce_init)(
p_priv[tid], arr[i].reduce_orig);
} else { // old interface (single parameter)
((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
}
}
}
return p_priv[tid];
}
}
tg = tg->parent;
arr = (kmp_taskred_data_t *)(tg->reduce_data);
num = tg->reduce_num_data;
}
KMP_ASSERT2(0, "Unknown task reduction item");
return NULL; // ERROR, this line never executed
}
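// Illustrative sketch (hypothetical task body, not part of the runtime): a
// task participating in the reduction sketched earlier would fetch its
// thread-specific copy as
//   int *p = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &x);
//   *p += 1; // accumulate into the private copy
// where tg is the value returned by the reduction init entry point, or NULL
// to use the innermost taskgroup (see the NULL check above).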
// Finalize task reduction.
// Called from __kmpc_end_taskgroup()
static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
kmp_int32 nth = th->th.th_team_nproc;
KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
kmp_int32 num = tg->reduce_num_data;
for (int i = 0; i < num; ++i) {
void *sh_data = arr[i].reduce_shar;
void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
void (*f_comb)(void *, void *) =
(void (*)(void *, void *))(arr[i].reduce_comb);
if (!arr[i].flags.lazy_priv) {
void *pr_data = arr[i].reduce_priv;
size_t size = arr[i].reduce_size;
for (int j = 0; j < nth; ++j) {
void *priv_data = (char *)pr_data + j * size;
f_comb(sh_data, priv_data); // combine results
if (f_fini)
f_fini(priv_data); // finalize if needed
}
} else {
void **pr_data = (void **)(arr[i].reduce_priv);
for (int j = 0; j < nth; ++j) {
if (pr_data[j] != NULL) {
f_comb(sh_data, pr_data[j]); // combine results
if (f_fini)
f_fini(pr_data[j]); // finalize if needed
__kmp_free(pr_data[j]);
}
}
}
__kmp_free(arr[i].reduce_priv);
}
__kmp_thread_free(th, arr);
tg->reduce_data = NULL;
tg->reduce_num_data = 0;
}
// Clean up task reduction data for a parallel or worksharing construct;
// do not touch task-private data that other threads are still working with.
// Called from __kmpc_end_taskgroup()
static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
__kmp_thread_free(th, tg->reduce_data);
tg->reduce_data = NULL;
tg->reduce_num_data = 0;
}
template <typename T>
void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
int num, T *data) {
__kmp_assert_valid_gtid(gtid);
kmp_info_t *thr = __kmp_threads[gtid];
kmp_int32 nth = thr->th.th_team_nproc;
__kmpc_taskgroup(loc, gtid); // form new taskgroup first
if (nth == 1) {
KA_TRACE(10,
("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
gtid, thr->th.th_current_task->td_taskgroup));
return (void *)thr->th.th_current_task->td_taskgroup;
}
kmp_team_t *team = thr->th.th_team;
void *reduce_data;
kmp_taskgroup_t *tg;
reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
if (reduce_data == NULL &&
__kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
(void *)1)) {
// single thread enters this block to initialize common reduction data
KMP_DEBUG_ASSERT(reduce_data == NULL);
// first initialize own data, then make a copy other threads can use
tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
// fini counters should be 0 at this point
KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
} else {
while (
(reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
(void *)1) { // wait for task reduction initialization
KMP_CPU_PAUSE();
}
KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
tg = thr->th.th_current_task->td_taskgroup;
__kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
}
return tg;
}
/*!
@ingroup TASKING
@param loc Source location info
@param gtid Global thread ID
@param is_ws Is 1 if the reduction is for worksharing, 0 otherwise
@param num Number of data items to reduce
@param data Array of data for reduction
@return The taskgroup identifier
Initialize task reduction for a parallel or worksharing construct.
Note: this entry assumes the optional compiler-generated initializer routine
has a single parameter - a pointer to the object to be initialized. That means
the reduction either does not use the omp_orig object, or omp_orig is
accessible without help from the runtime library.
*/
void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
int num, void *data) {
return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
(kmp_task_red_input_t *)data);
}
/*!
@ingroup TASKING
@param loc Source location info
@param gtid Global thread ID
@param is_ws Is 1 if the reduction is for worksharing, 0 otherwise
@param num Number of data items to reduce
@param data Array of data for reduction
@return The taskgroup identifier
Initialize task reduction for a parallel or worksharing construct.
Note: this entry assumes the optional compiler-generated initializer routine
has two parameters: a pointer to the object to be initialized and a pointer
to omp_orig.
*/
void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
void *data) {
return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
(kmp_taskred_input_t *)data);
}
/*!
@ingroup TASKING
@param loc Source location info
@param gtid Global thread ID
@param is_ws Is 1 if the reduction is for worksharing, 0 otherwise
Finalize task reduction for a parallel or worksharing construct.
*/
void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
__kmpc_end_taskgroup(loc, gtid);
}
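// Illustrative sketch (hypothetical lowering, not part of the runtime): for
//   #pragma omp parallel reduction(task, +: x)
// each thread would bracket the region roughly as
//   void *tg = __kmpc_taskred_modifier_init(&loc, gtid, /*is_ws=*/0,
//                                           /*num=*/1, &in);
//   ... region body; tasks use __kmpc_task_reduction_get_th_data ...
//   __kmpc_task_reduction_modifier_fini(&loc, gtid, /*is_ws=*/0);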
// __kmpc_taskgroup: Start a new taskgroup
void __kmpc_taskgroup(ident_t *loc, int gtid) {
__kmp_assert_valid_gtid(gtid);
kmp_info_t *thread = __kmp_threads[gtid];
kmp_taskdata_t *taskdata = thread->th.th_current_task;
kmp_taskgroup_t *tg_new =
(kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
tg_new->parent = taskdata->td_taskgroup;
tg_new->reduce_data = NULL;
tg_new->reduce_num_data = 0;
taskdata->td_taskgroup = tg_new;
#if OMPT_SUPPORT && OMPT_OPTIONAL
if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
if (!codeptr)
codeptr = OMPT_GET_RETURN_ADDRESS(0);
kmp_team_t *team = thread->th.th_team;
ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
// FIXME: I think this is wrong for lwt!
ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
&(my_task_data), codeptr);
}
#endif
}
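// Illustrative sketch (hypothetical lowering, not part of the runtime):
//   #pragma omp taskgroup
//   { ... }
// becomes, roughly,
//   __kmpc_taskgroup(&loc, gtid);
//   ... structured block; child tasks are counted in tg_new->count ...
//   __kmpc_end_taskgroup(&loc, gtid);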
// __kmpc_end_taskgroup: Wait until all tasks generated by the current task
// and its descendants are complete
void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
__kmp_assert_valid_gtid(gtid);
kmp_info_t *thread = __kmp_threads[gtid];
kmp_taskdata_t *taskdata = thread->th.th_current_task;
kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
int thread_finished = FALSE;
#if OMPT_SUPPORT && OMPT_OPTIONAL
kmp_team_t *team;
ompt_data_t my_task_data;
ompt_data_t my_parallel_data;
void *codeptr;
if (UNLIKELY(ompt_enabled.enabled)) {
team = thread->th.th_team;
my_task_data = taskdata->ompt_task_info.task_data;
// FIXME: I think this is wrong for lwt!
my_parallel_data = team->t.ompt_team_info.parallel_data;
codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
if (!codeptr)
codeptr = OMPT_GET_RETURN_ADDRESS(0);
}
#endif
KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
KMP_DEBUG_ASSERT(taskgroup != NULL);
KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
if (__kmp_tasking_mode != tskm_immediate_exec) {
// mark the task as waiting (not on a barrier)
taskdata->td_taskwait_counter += 1;
taskdata->td_taskwait_ident = loc;
taskdata->td_taskwait_thread = gtid + 1;
#if USE_ITT_BUILD
// For ITT the taskgroup wait is similar to taskwait until we need to
// distinguish them
void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
#endif /* USE_ITT_NOTIFY */
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
&(my_task_data), codeptr);
}
#endif
if (!taskdata->td_flags.team_serial ||
(thread->th.th_task_team != NULL &&
thread->th.th_task_team->tt.tt_found_proxy_tasks)) {
kmp_flag_32<false, false> flag(
RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
flag.execute_tasks(thread, gtid, FALSE,
&thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
__kmp_task_stealing_constraint);
}
}
taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
#if OMPT_SUPPORT && OMPT_OPTIONAL
if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
&(my_task_data), codeptr);
}
#endif
#if USE_ITT_BUILD
KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
#endif /* USE_ITT_BUILD */
}
KMP_DEBUG_ASSERT(taskgroup->count == 0);
if (taskgroup->reduce_data != NULL) { // need to reduce?
int cnt;
void *reduce_data;
kmp_team_t *t = thread->th.th_team;
kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
// check if <priv> data of the first reduction variable shared for the team
void *priv0 = arr[0].reduce_priv;
if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
// finishing task reduction on parallel
cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
if (cnt == thread->th.th_team_nproc - 1) {
// we are the last thread passing __kmpc_reduction_modifier_fini()
// finalize task reduction:
__kmp_task_reduction_fini(thread, taskgroup);
// cleanup fields in the team structure:
// TODO: is relaxed store enough here (whole barrier should follow)?
__kmp_thread_free(thread, reduce_data);
KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
} else {
// we are not the last thread passing __kmpc_reduction_modifier_fini(),