libomptarget/DeviceRTL/src/Parallelism.cpp - llvm-project/openmp - Git at Google

 //===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // Parallel implementation in the GPU. Here is the pattern:
 //
 //    while (not finished) {
 //
 //    if (master) {
 //      sequential code, decide which par loop to do, or if finished
 //     __kmpc_kernel_prepare_parallel() // exec by master only
 //    }
 //    syncthreads // A
 //    __kmpc_kernel_parallel() // exec by all
 //    if (this thread is included in the parallel) {
 //      switch () for all parallel loops
 //      __kmpc_kernel_end_parallel() // exec only by threads in parallel
 //    }
 //
 //
 //    The reason we don't exec end_parallel for the threads not included
 //    in the parallel loop is that for each barrier in the parallel
 //    region, these non-included threads will cycle through the
 //    syncthread A. Thus they must preserve their current threadId that
 //    is larger than thread in team.
 //
 //    To make a long story short...
 //
 //===----------------------------------------------------------------------===//

 #include "Debug.h"
 #include "Interface.h"
 #include "Mapping.h"
 #include "State.h"
 #include "Synchronization.h"
 #include "Types.h"
 #include "Utils.h"

 using namespace ompx;

 #pragma omp begin declare target device_type(nohost)

 namespace {

 /// Compute how many threads should execute a parallel region.
 ///
 /// \param NumThreadsClause Value of the `num_threads` clause; -1 means that
 ///                         `icv::NThreads` is consulted instead.
 /// \returns The chosen thread count. It starts at
 ///          `mapping::getMaxTeamThreads()` and is lowered to a non-zero
 ///          request that is smaller than that maximum. Outside SPMD mode the
 ///          value is additionally rounded down to a multiple of the warp
 ///          size, or collapsed to 1 if it is below one warp.
 uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
   // An explicit clause value takes precedence over the ICV.
   const uint32_t Requested =
       NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
   const uint32_t MaxThreads = mapping::getMaxTeamThreads();

   // A request of 0 means "no preference"; otherwise it may only shrink the
   // team, never grow it past the maximum.
   const bool HonorRequest = Requested != 0 && Requested < MaxThreads;
   const uint32_t Limit = HonorRequest ? Requested : MaxThreads;

   // SPMD mode accepts any thread count as is.
   if (mapping::isSPMDMode())
     return Limit;

   // Generic mode: OpenMP permits using fewer threads than requested, so round
   // down to a whole number of warps (the mask assumes a power-of-two warp
   // size), or fall back to a single thread.
   const uint32_t WarpSize = mapping::getWarpSize();
   if (Limit < WarpSize)
     return 1;
   return Limit & ~(WarpSize - 1);
 }

 /// Invoke the outlined parallel function `fn`, selecting the call shape by the
 /// number of arguments.
 ///
 /// The per-arity `case` labels are not written in this file; they are
 /// textually included from "generated_microtask_cases.gen". According to the
 /// original comment up to 32 arguments are supported (the generated file is
 /// not visible here, so that bound is taken from the comment).
 ///
 /// \param global_tid First thread identifier made available to the generated
 ///                   cases (presumably forwarded to `fn` -- TODO confirm
 ///                   against the generated file).
 /// \param bound_tid  Second thread identifier made available to the generated
 ///                   cases (same caveat as `global_tid`).
 /// \param fn         Outlined function to call.
 /// \param args       Array holding the arguments for `fn`.
 /// \param nargs      Number of arguments; selects the generated case. Values
 ///                   without a matching case print a message and trap.
 [[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
                                               int32_t bound_tid, void *fn,
                                               void **args, int64_t nargs) {
   switch (nargs) {
 #include "generated_microtask_cases.gen"
   default:
     // No generated case matched `nargs`: report the problem and abort.
     PRINT("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
     __builtin_trap();
   }
 }

 } // namespace

 extern "C" {

 /// Execute the outlined parallel region `fn` on the SPMD path.
 ///
 /// All threads of the block call this function together (it contains aligned
 /// barriers). The thread with `TId == 0` is the one flagged as active for the
 /// RAII objects that set up the team state; the other threads observe the
 /// result after the barrier. Threads selected by `PTeamSize` then run `fn`.
 ///
 /// \param ident       Source location descriptor, forwarded to the state
 ///                    helpers and assertions.
 /// \param num_threads Value of the `num_threads` clause, handed to
 ///                    determineNumberOfThreads().
 /// \param fn          Outlined parallel region function.
 /// \param args        Argument array passed on to invokeMicrotask().
 /// \param nargs       Number of entries in `args`.
 [[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
                                                    int32_t num_threads,
                                                    void *fn, void **args,
                                                    const int64_t nargs) {
   uint32_t TId = mapping::getThreadIdInBlock();
   uint32_t NumThreads = determineNumberOfThreads(num_threads);
   // A parallel team size of 0 encodes "all threads of the team participate";
   // see the `!PTeamSize || TId < PTeamSize` check below.
   uint32_t PTeamSize =
       NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads;
   // Avoid the race between the read of the `icv::Level` above and the write
   // below by synchronizing all threads here.
   // NOTE(review): this function itself does not read `icv::Level` before this
   // point; "above" presumably refers to the reads in __kmpc_parallel_51, which
   // calls this function -- confirm.
   synchronize::threadsAligned(atomic::seq_cst);
   {
     // Note that the order here is important. `icv::Level` has to be updated
     // last or the other updates will cause a thread specific state to be
     // created.
     state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                           1u, TId == 0, ident,
                                           /*ForceTeamState=*/true);
     state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident,
                                      /*ForceTeamState=*/true);
     state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
                                /*ForceTeamState=*/true);

     // Synchronize all threads after the main thread (TId == 0) set up the
     // team state properly.
     synchronize::threadsAligned(atomic::acq_rel);

     // Every thread checks that it now observes the values installed above.
     state::ParallelTeamSize.assert_eq(PTeamSize, ident,
                                       /*ForceTeamState=*/true);
     icv::ActiveLevel.assert_eq(1u, ident, /*ForceTeamState=*/true);
     icv::Level.assert_eq(1u, ident, /*ForceTeamState=*/true);

     // Ensure we synchronize before we run user code to avoid invalidating the
     // assumptions above.
     synchronize::threadsAligned(atomic::relaxed);

     // Only threads inside the parallel team run the region; with
     // PTeamSize == 0 that is every thread.
     if (!PTeamSize || TId < PTeamSize)
       invokeMicrotask(TId, 0, fn, args, nargs);

     // Synchronize all threads at the end of a parallel region.
     synchronize::threadsAligned(atomic::seq_cst);
   }

   // Synchronize all threads to make sure every thread exits the scope above;
   // otherwise the following assertions and the assumption in
   // __kmpc_target_deinit may not hold.
   synchronize::threadsAligned(atomic::acq_rel);

   // After the RAII scope the values asserted here are the ones that were
   // passed as the "old" values when the RAII objects were created.
   state::ParallelTeamSize.assert_eq(1u, ident, /*ForceTeamState=*/true);
   icv::ActiveLevel.assert_eq(0u, ident, /*ForceTeamState=*/true);
   icv::Level.assert_eq(0u, ident, /*ForceTeamState=*/true);

   // Ensure we synchronize to create an aligned region around the assumptions.
   synchronize::threadsAligned(atomic::relaxed);

   return;
 }

 /// Entry point for a `parallel` construct on the device.
 ///
 /// The function distinguishes four situations, in this order:
 ///  1. serialized execution (false if-clause, existing thread state, or a
 ///     nested region): the calling thread runs `fn` itself;
 ///  2. SPMD mode: delegated to __kmpc_parallel_spmd();
 ///  3. a region with a single thread: the calling thread runs `fn` itself with
 ///     only `icv::Level` raised;
 ///  4. otherwise: the arguments are copied into a shared buffer, the team
 ///     state (including `wrapper_fn` as `state::ParallelRegionFn`) is
 ///     published, and two barriers hand the work to the workers and wait for
 ///     them.
 ///
 /// \param ident       Source location descriptor, forwarded to state helpers.
 /// \param (unnamed)   Second parameter; ignored by this implementation.
 /// \param if_expr     Value of the if-clause; 0 forces serialized execution.
 /// \param num_threads Value of the `num_threads` clause, handed to
 ///                    determineNumberOfThreads().
 /// \param proc_bind   Not used in this function body.
 /// \param fn          Outlined function, called directly on the paths where
 ///                    the calling thread executes the region itself.
 /// \param wrapper_fn  Function published through `state::ParallelRegionFn`
 ///                    for the workers (read in __kmpc_kernel_parallel()).
 /// \param args        Argument array for the region.
 /// \param nargs       Number of entries in `args`.
 [[clang::always_inline]] void
 __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                    int32_t num_threads, int proc_bind, void *fn,
                    void *wrapper_fn, void **args, int64_t nargs) {
   uint32_t TId = mapping::getThreadIdInBlock();

   // Assert the parallelism level is zero if disabled by the user.
   ASSERT((config::mayUseNestedParallelism() || icv::Level == 0),
          "nested parallelism while disabled");

   // Handle the serialized case first, same for SPMD/non-SPMD:
   // 1) if-clause(0)
   // 2) parallel in task or other thread state inducing construct
   // 3) nested parallel regions
   if (OMP_UNLIKELY(!if_expr || state::HasThreadState ||
                    (config::mayUseNestedParallelism() && icv::Level))) {
     // The level increment happens while DERAII is alive, i.e. inside the data
     // environment object created for this serialized region.
     state::DateEnvironmentRAII DERAII(ident);
     ++icv::Level;
     invokeMicrotask(TId, 0, fn, args, nargs);
     return;
   }

   // From this point forward we know that there is no thread state used.
   ASSERT(state::HasThreadState == false, nullptr);

   uint32_t NumThreads = determineNumberOfThreads(num_threads);
   uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
   // A parallel team size of 0 encodes "the whole team participates".
   uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;
   if (mapping::isSPMDMode()) {
     // This was moved to its own routine so it could be called directly
     // in certain situations to avoid resource consumption of unused
     // logic in parallel_51.
     __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs);

     return;
   }

   // We do *not* create a new data environment because all threads in the team
   // that are active are now running this parallel region. They share the
   // TeamState, which has an increase level-var and potentially active-level
   // set, but they do not have individual ThreadStates yet. If they ever
   // modify the ICVs beyond this point a ThreadStates will be allocated.

   // With a single thread the region is not active: run it on this thread and
   // only raise the level for the duration of the call.
   bool IsActiveParallelRegion = NumThreads > 1;
   if (!IsActiveParallelRegion) {
     state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
     invokeMicrotask(TId, 0, fn, args, nargs);
     return;
   }

   // Copy the argument pointers into the buffer obtained from
   // __kmpc_begin_sharing_variables(). Counts 1..16 use the fallthrough chain
   // below, which copies entries nargs-1 down to 0 without a loop; any other
   // count is handled by the loop in the `default` label.
   void **GlobalArgs = nullptr;
   if (nargs) {
     __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
     switch (nargs) {
     default:
       for (int I = 0; I < nargs; I++)
         GlobalArgs[I] = args[I];
       break;
     case 16:
       GlobalArgs[15] = args[15];
       [[fallthrough]];
     case 15:
       GlobalArgs[14] = args[14];
       [[fallthrough]];
     case 14:
       GlobalArgs[13] = args[13];
       [[fallthrough]];
     case 13:
       GlobalArgs[12] = args[12];
       [[fallthrough]];
     case 12:
       GlobalArgs[11] = args[11];
       [[fallthrough]];
     case 11:
       GlobalArgs[10] = args[10];
       [[fallthrough]];
     case 10:
       GlobalArgs[9] = args[9];
       [[fallthrough]];
     case 9:
       GlobalArgs[8] = args[8];
       [[fallthrough]];
     case 8:
       GlobalArgs[7] = args[7];
       [[fallthrough]];
     case 7:
       GlobalArgs[6] = args[6];
       [[fallthrough]];
     case 6:
       GlobalArgs[5] = args[5];
       [[fallthrough]];
     case 5:
       GlobalArgs[4] = args[4];
       [[fallthrough]];
     case 4:
       GlobalArgs[3] = args[3];
       [[fallthrough]];
     case 3:
       GlobalArgs[2] = args[2];
       [[fallthrough]];
     case 2:
       GlobalArgs[1] = args[1];
       [[fallthrough]];
     case 1:
       GlobalArgs[0] = args[0];
       [[fallthrough]];
     case 0:
       break;
     }
   }

   {
     // Note that the order here is important. `icv::Level` has to be updated
     // last or the other updates will cause a thread specific state to be
     // created.
     state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
                                           1u, true, ident,
                                           /*ForceTeamState=*/true);
     state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
                                           (void *)nullptr, true, ident,
                                           /*ForceTeamState=*/true);
     state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
                                      /*ForceTeamState=*/true);
     state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident,
                                /*ForceTeamState=*/true);

     // Master signals work to activate workers.
     synchronize::threads(atomic::seq_cst);
     // Master waits for workers to signal.
     synchronize::threads(atomic::seq_cst);
   }

   // Matches the __kmpc_begin_sharing_variables() call made above when there
   // were arguments to share.
   if (nargs)
     __kmpc_end_sharing_variables();
 }

 /// Fetch the currently published parallel region function for a worker.
 ///
 /// \param WorkFn [out] Receives the value of `state::ParallelRegionFn`.
 /// \returns false when no function is published (the termination signal from
 ///          the master); otherwise whether the calling thread's id lies
 ///          below `state::getEffectivePTeamSize()`, i.e. whether it takes
 ///          part in the region.
 [[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
   // Hand the level-1 parallel region function to the caller.
   *WorkFn = state::ParallelRegionFn;

   if (*WorkFn) {
     // There is work: this thread participates only if it is inside the team.
     const uint32_t ThreadId = mapping::getThreadIdInBlock();
     return ThreadId < state::getEffectivePTeamSize();
   }

   // A null function is the master's request to terminate.
   return false;
 }

 /// Clean up after a worker finished its part of a parallel region.
 ///
 /// Resets the state of the calling thread via state::resetStateForThread()
 /// so that ICV modifications made during this region do not leak into the
 /// next one. Must not be reached in SPMD mode (asserted).
 [[clang::noinline]] void __kmpc_kernel_end_parallel() {
   ASSERT(!mapping::isSPMDMode(), nullptr);
   // Drop whatever per-thread state this thread accumulated in the region.
   state::resetStateForThread(mapping::getThreadIdInBlock());
   // The mode check is deliberately repeated after the reset call.
   ASSERT(!mapping::isSPMDMode(), nullptr);
 }

 /// Return the current parallel nesting level as reported by omp_get_level().
 /// Both parameters are ignored.
 uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) {
   return static_cast<uint16_t>(omp_get_level());
 }

 /// Return the thread number as reported by omp_get_thread_num(). The source
 /// location parameter is ignored.
 int32_t __kmpc_global_thread_num(IdentTy *) {
   const int32_t ThreadNum = omp_get_thread_num();
   return ThreadNum;
 }

 /// Runtime entry point that is accepted but does nothing in this
 /// implementation; all parameters are ignored.
 void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
                            int32_t thread_limit) {
   // Intentionally empty.
 }

 /// Runtime entry point that is accepted but does nothing in this
 /// implementation; all parameters are ignored.
 void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {
   // Intentionally empty.
 }
 }

 #pragma omp end declare target
	//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// Parallel implementation in the GPU. Here is the pattern:
	//
	// while (not finished) {
	//
	// if (master) {
	// sequential code, decide which par loop to do, or if finished
	// __kmpc_kernel_prepare_parallel() // exec by master only
	// }
	// syncthreads // A
	// __kmpc_kernel_parallel() // exec by all
	// if (this thread is included in the parallel) {
	// switch () for all parallel loops
	// __kmpc_kernel_end_parallel() // exec only by threads in parallel
	// }
	//
	//
	// The reason we don't exec end_parallel for the threads not included
	// in the parallel loop is that for each barrier in the parallel
	// region, these non-included threads will cycle through the
	// syncthread A. Thus they must preserve their current threadId that
	// is larger than thread in team.
	//
	// To make a long story short...
	//
	//===----------------------------------------------------------------------===//

	#include "Debug.h"
	#include "Interface.h"
	#include "Mapping.h"
	#include "State.h"
	#include "Synchronization.h"
	#include "Types.h"
	#include "Utils.h"

	using namespace ompx;

	#pragma omp begin declare target device_type(nohost)

	namespace {

	/// Compute how many threads should execute a parallel region.
	///
	/// \param NumThreadsClause Value of the `num_threads` clause; -1 means that
	///                         `icv::NThreads` is consulted instead.
	/// \returns The chosen thread count: the maximum team size, lowered to a
	///          smaller non-zero request, and (outside SPMD mode) rounded down
	///          to a multiple of the warp size or collapsed to 1.
	uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
	  // The clause, when present, overrides the ICV.
	  const uint32_t Wanted =
	      NumThreadsClause != -1 ? NumThreadsClause : icv::NThreads;
	  const uint32_t TeamMax = mapping::getMaxTeamThreads();

	  // Zero means "no request"; a request can only reduce the count.
	  const uint32_t Count = (Wanted != 0 && Wanted < TeamMax) ? Wanted : TeamMax;

	  // SPMD mode allows any number of threads.
	  if (mapping::isSPMDMode())
	    return Count;

	  // Generic mode: round down to whole warps, which OpenMP permits; below
	  // one warp only a single thread is used.
	  const uint32_t Warp = mapping::getWarpSize();
	  if (Count < Warp)
	    return 1;
	  return Count & ~(Warp - 1);
	}

	/// Invoke the outlined parallel function `fn`, selecting the call shape by
	/// the number of arguments `nargs`.
	///
	/// The `case` labels are textually included from
	/// "generated_microtask_cases.gen"; per the original comment up to 32
	/// arguments are supported (the generated file is not visible here).
	/// `global_tid` and `bound_tid` are available to the generated cases
	/// (presumably forwarded to `fn` -- TODO confirm). Any `nargs` without a
	/// generated case prints a message and traps.
	[[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
	int32_t bound_tid, void *fn,
	void **args, int64_t nargs) {
	switch (nargs) {
	#include "generated_microtask_cases.gen"
	default:
	// No generated case matched `nargs`: report the problem and abort.
	PRINT("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
	__builtin_trap();
	}
	}

	} // namespace

	extern "C" {

	/// Execute the outlined parallel region `fn` on the SPMD path.
	///
	/// All threads of the block call this function together (it contains aligned
	/// barriers). The thread with `TId == 0` is the one flagged as active for the
	/// RAII objects that set up the team state; the other threads observe the
	/// result after the barrier. Threads selected by `PTeamSize` then run `fn`.
	///
	/// \param ident       Source location descriptor, forwarded to the state
	///                    helpers and assertions.
	/// \param num_threads Value of the `num_threads` clause, handed to
	///                    determineNumberOfThreads().
	/// \param fn          Outlined parallel region function.
	/// \param args        Argument array passed on to invokeMicrotask().
	/// \param nargs       Number of entries in `args`.
	[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
	                                                   int32_t num_threads,
	                                                   void *fn, void **args,
	                                                   const int64_t nargs) {
	  uint32_t TId = mapping::getThreadIdInBlock();
	  uint32_t NumThreads = determineNumberOfThreads(num_threads);
	  // A parallel team size of 0 encodes "all threads of the team participate";
	  // see the `!PTeamSize || TId < PTeamSize` check below.
	  uint32_t PTeamSize =
	      NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads;
	  // Avoid the race between the read of the `icv::Level` above and the write
	  // below by synchronizing all threads here.
	  synchronize::threadsAligned(atomic::seq_cst);
	  {
	    // Note that the order here is important. `icv::Level` has to be updated
	    // last or the other updates will cause a thread specific state to be
	    // created.
	    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
	                                          1u, TId == 0, ident,
	                                          /*ForceTeamState=*/true);
	    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident,
	                                     /*ForceTeamState=*/true);
	    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident,
	                               /*ForceTeamState=*/true);

	    // Synchronize all threads after the main thread (TId == 0) set up the
	    // team state properly.
	    synchronize::threadsAligned(atomic::acq_rel);

	    // Every thread checks that it now observes the values installed above.
	    state::ParallelTeamSize.assert_eq(PTeamSize, ident,
	                                      /*ForceTeamState=*/true);
	    icv::ActiveLevel.assert_eq(1u, ident, /*ForceTeamState=*/true);
	    icv::Level.assert_eq(1u, ident, /*ForceTeamState=*/true);

	    // Ensure we synchronize before we run user code to avoid invalidating the
	    // assumptions above.
	    synchronize::threadsAligned(atomic::relaxed);

	    // Only threads inside the parallel team run the region; with
	    // PTeamSize == 0 that is every thread.
	    if (!PTeamSize || TId < PTeamSize)
	      invokeMicrotask(TId, 0, fn, args, nargs);

	    // Synchronize all threads at the end of a parallel region.
	    synchronize::threadsAligned(atomic::seq_cst);
	  }

	  // Synchronize all threads to make sure every thread exits the scope above;
	  // otherwise the following assertions and the assumption in
	  // __kmpc_target_deinit may not hold.
	  synchronize::threadsAligned(atomic::acq_rel);

	  // After the RAII scope the values asserted here are the ones that were
	  // passed as the "old" values when the RAII objects were created.
	  state::ParallelTeamSize.assert_eq(1u, ident, /*ForceTeamState=*/true);
	  icv::ActiveLevel.assert_eq(0u, ident, /*ForceTeamState=*/true);
	  icv::Level.assert_eq(0u, ident, /*ForceTeamState=*/true);

	  // Ensure we synchronize to create an aligned region around the assumptions.
	  synchronize::threadsAligned(atomic::relaxed);

	  return;
	}

	/// Entry point for a `parallel` construct on the device.
	///
	/// The function distinguishes four situations, in this order:
	///  1. serialized execution (false if-clause, existing thread state, or a
	///     nested region): the calling thread runs `fn` itself;
	///  2. SPMD mode: delegated to __kmpc_parallel_spmd();
	///  3. a region with a single thread: the calling thread runs `fn` itself with
	///     only `icv::Level` raised;
	///  4. otherwise: the arguments are copied into a shared buffer, the team
	///     state (including `wrapper_fn` as `state::ParallelRegionFn`) is
	///     published, and two barriers hand the work to the workers and wait for
	///     them.
	///
	/// \param ident       Source location descriptor, forwarded to state helpers.
	/// \param (unnamed)   Second parameter; ignored by this implementation.
	/// \param if_expr     Value of the if-clause; 0 forces serialized execution.
	/// \param num_threads Value of the `num_threads` clause, handed to
	///                    determineNumberOfThreads().
	/// \param proc_bind   Not used in this function body.
	/// \param fn          Outlined function, called directly on the paths where
	///                    the calling thread executes the region itself.
	/// \param wrapper_fn  Function published through `state::ParallelRegionFn`
	///                    for the workers (read in __kmpc_kernel_parallel()).
	/// \param args        Argument array for the region.
	/// \param nargs       Number of entries in `args`.
	[[clang::always_inline]] void
	__kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
	                   int32_t num_threads, int proc_bind, void *fn,
	                   void *wrapper_fn, void **args, int64_t nargs) {
	  uint32_t TId = mapping::getThreadIdInBlock();

	  // Assert the parallelism level is zero if disabled by the user.
	  ASSERT((config::mayUseNestedParallelism() || icv::Level == 0),
	         "nested parallelism while disabled");

	  // Handle the serialized case first, same for SPMD/non-SPMD:
	  // 1) if-clause(0)
	  // 2) parallel in task or other thread state inducing construct
	  // 3) nested parallel regions
	  if (OMP_UNLIKELY(!if_expr || state::HasThreadState ||
	                   (config::mayUseNestedParallelism() && icv::Level))) {
	    // The level increment happens while DERAII is alive, i.e. inside the data
	    // environment object created for this serialized region.
	    state::DateEnvironmentRAII DERAII(ident);
	    ++icv::Level;
	    invokeMicrotask(TId, 0, fn, args, nargs);
	    return;
	  }

	  // From this point forward we know that there is no thread state used.
	  ASSERT(state::HasThreadState == false, nullptr);

	  uint32_t NumThreads = determineNumberOfThreads(num_threads);
	  uint32_t MaxTeamThreads = mapping::getMaxTeamThreads();
	  // A parallel team size of 0 encodes "the whole team participates".
	  uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads;
	  if (mapping::isSPMDMode()) {
	    // This was moved to its own routine so it could be called directly
	    // in certain situations to avoid resource consumption of unused
	    // logic in parallel_51.
	    __kmpc_parallel_spmd(ident, num_threads, fn, args, nargs);

	    return;
	  }

	  // We do *not* create a new data environment because all threads in the team
	  // that are active are now running this parallel region. They share the
	  // TeamState, which has an increase level-var and potentially active-level
	  // set, but they do not have individual ThreadStates yet. If they ever
	  // modify the ICVs beyond this point a ThreadStates will be allocated.

	  // With a single thread the region is not active: run it on this thread and
	  // only raise the level for the duration of the call.
	  bool IsActiveParallelRegion = NumThreads > 1;
	  if (!IsActiveParallelRegion) {
	    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident);
	    invokeMicrotask(TId, 0, fn, args, nargs);
	    return;
	  }

	  // Copy the argument pointers into the buffer obtained from
	  // __kmpc_begin_sharing_variables(). Counts 1..16 use the fallthrough chain
	  // below, which copies entries nargs-1 down to 0 without a loop; any other
	  // count is handled by the loop in the `default` label.
	  void **GlobalArgs = nullptr;
	  if (nargs) {
	    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
	    switch (nargs) {
	    default:
	      for (int I = 0; I < nargs; I++)
	        GlobalArgs[I] = args[I];
	      break;
	    case 16:
	      GlobalArgs[15] = args[15];
	      [[fallthrough]];
	    case 15:
	      GlobalArgs[14] = args[14];
	      [[fallthrough]];
	    case 14:
	      GlobalArgs[13] = args[13];
	      [[fallthrough]];
	    case 13:
	      GlobalArgs[12] = args[12];
	      [[fallthrough]];
	    case 12:
	      GlobalArgs[11] = args[11];
	      [[fallthrough]];
	    case 11:
	      GlobalArgs[10] = args[10];
	      [[fallthrough]];
	    case 10:
	      GlobalArgs[9] = args[9];
	      [[fallthrough]];
	    case 9:
	      GlobalArgs[8] = args[8];
	      [[fallthrough]];
	    case 8:
	      GlobalArgs[7] = args[7];
	      [[fallthrough]];
	    case 7:
	      GlobalArgs[6] = args[6];
	      [[fallthrough]];
	    case 6:
	      GlobalArgs[5] = args[5];
	      [[fallthrough]];
	    case 5:
	      GlobalArgs[4] = args[4];
	      [[fallthrough]];
	    case 4:
	      GlobalArgs[3] = args[3];
	      [[fallthrough]];
	    case 3:
	      GlobalArgs[2] = args[2];
	      [[fallthrough]];
	    case 2:
	      GlobalArgs[1] = args[1];
	      [[fallthrough]];
	    case 1:
	      GlobalArgs[0] = args[0];
	      [[fallthrough]];
	    case 0:
	      break;
	    }
	  }

	  {
	    // Note that the order here is important. `icv::Level` has to be updated
	    // last or the other updates will cause a thread specific state to be
	    // created.
	    state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize,
	                                          1u, true, ident,
	                                          /*ForceTeamState=*/true);
	    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
	                                          (void *)nullptr, true, ident,
	                                          /*ForceTeamState=*/true);
	    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
	                                     /*ForceTeamState=*/true);
	    state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident,
	                               /*ForceTeamState=*/true);

	    // Master signals work to activate workers.
	    synchronize::threads(atomic::seq_cst);
	    // Master waits for workers to signal.
	    synchronize::threads(atomic::seq_cst);
	  }

	  // Matches the __kmpc_begin_sharing_variables() call made above when there
	  // were arguments to share.
	  if (nargs)
	    __kmpc_end_sharing_variables();
	}

	/// Fetch the currently published parallel region function for a worker.
	///
	/// \param WorkFn [out] Receives the value of `state::ParallelRegionFn`.
	/// \returns false when no function is published (the termination signal from
	///          the master); otherwise whether the calling thread's id lies
	///          below `state::getEffectivePTeamSize()`.
	[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
	  // Hand the level-1 parallel region function to the caller.
	  *WorkFn = state::ParallelRegionFn;

	  if (*WorkFn) {
	    // There is work: this thread participates only if it is inside the team.
	    const uint32_t ThreadId = mapping::getThreadIdInBlock();
	    return ThreadId < state::getEffectivePTeamSize();
	  }

	  // A null function is the master's request to terminate.
	  return false;
	}

	/// Clean up after a worker finished its part of a parallel region.
	///
	/// Resets the state of the calling thread via state::resetStateForThread()
	/// so that ICV modifications made during this region do not leak into the
	/// next one. Must not be reached in SPMD mode (asserted).
	[[clang::noinline]] void __kmpc_kernel_end_parallel() {
	  ASSERT(!mapping::isSPMDMode(), nullptr);
	  // Drop whatever per-thread state this thread accumulated in the region.
	  state::resetStateForThread(mapping::getThreadIdInBlock());
	  // The mode check is deliberately repeated after the reset call.
	  ASSERT(!mapping::isSPMDMode(), nullptr);
	}

	/// Return the current parallel nesting level as reported by omp_get_level().
	/// Both parameters are ignored.
	uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) {
	  return static_cast<uint16_t>(omp_get_level());
	}

	/// Return the thread number as reported by omp_get_thread_num(). The source
	/// location parameter is ignored.
	int32_t __kmpc_global_thread_num(IdentTy *) {
	  const int32_t ThreadNum = omp_get_thread_num();
	  return ThreadNum;
	}

	/// Runtime entry point that is accepted but does nothing in this
	/// implementation; all parameters are ignored.
	void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
	                           int32_t thread_limit) {
	  // Intentionally empty.
	}

	/// Runtime entry point that is accepted but does nothing in this
	/// implementation; all parameters are ignored.
	void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {
	  // Intentionally empty.
	}
	}

	#pragma omp end declare target