blob: 557703632c6258fff9fb92c42cf1990e90287d8c [file] [log] [blame]
//===-------- interface.cpp - Target independent OpenMP target RTL --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implementation of the interface to be used by Clang during the codegen of a
// target region.
//
//===----------------------------------------------------------------------===//
#include "OpenMP/OMPT/Interface.h"
#include "OpenMP/OMPT/Callback.h"
#include "PluginManager.h"
#include "private.h"
#include "Shared/EnvironmentVar.h"
#include "Shared/Profile.h"
#include "Utils/ExponentialBackoff.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#ifdef OMPT_SUPPORT
using namespace llvm::omp::target::ompt;
#endif
////////////////////////////////////////////////////////////////////////////////
/// adds requires flags
EXTERN void __tgt_register_requires(int64_t Flags) {
MESSAGE("The %s function has been removed. Old OpenMP requirements will not "
"be handled",
__PRETTY_FUNCTION__);
}
EXTERN void __tgt_rtl_init() { initRuntime(); }
EXTERN void __tgt_rtl_deinit() { deinitRuntime(); }
////////////////////////////////////////////////////////////////////////////////
/// adds a target shared library to the target execution image
EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) {
initRuntime();
if (PM->delayRegisterLib(Desc))
return;
PM->registerLib(Desc);
}
////////////////////////////////////////////////////////////////////////////////
/// Initialize all available devices without registering any image
EXTERN void __tgt_init_all_rtls() {
assert(PM && "Runtime not initialized");
PM->initAllPlugins();
}
////////////////////////////////////////////////////////////////////////////////
/// unloads a target shared library
EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) {
PM->unregisterLib(Desc);
deinitRuntime();
}
template <typename TargetAsyncInfoTy>
static inline void
targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
map_var_info_t *ArgNames, void **ArgMappers,
TargetDataFuncPtrTy TargetDataFunction, const char *RegionTypeMsg,
const char *RegionName) {
assert(PM && "Runtime not initialized");
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
"TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy",
"NumArgs=" + std::to_string(ArgNum), Loc);
DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
RegionName, DeviceId, ArgNum);
if (checkDeviceAndCtors(DeviceId, Loc)) {
DP("Not offloading to device %" PRId64 "\n", DeviceId);
return;
}
if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
printKernelArguments(Loc, DeviceId, ArgNum, ArgSizes, ArgTypes, ArgNames,
RegionTypeMsg);
#ifdef OMPTARGET_DEBUG
for (int I = 0; I < ArgNum; ++I) {
DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
", Type=0x%" PRIx64 ", Name=%s\n",
I, DPxPTR(ArgsBase[I]), DPxPTR(Args[I]), ArgSizes[I], ArgTypes[I],
(ArgNames) ? getNameFromMapping(ArgNames[I]).c_str() : "unknown");
}
#endif
auto DeviceOrErr = PM->getDevice(DeviceId);
if (!DeviceOrErr)
FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr);
AsyncInfoTy &AsyncInfo = TargetAsyncInfo;
/// RAII to establish tool anchors before and after data begin / end / update
OMPT_IF_BUILT(assert((TargetDataFunction == targetDataBegin ||
TargetDataFunction == targetDataEnd ||
TargetDataFunction == targetDataUpdate) &&
"Encountered unexpected TargetDataFunction during "
"execution of targetData");
auto CallbackFunctions =
(TargetDataFunction == targetDataBegin)
? RegionInterface.getCallbacks<ompt_target_enter_data>()
: (TargetDataFunction == targetDataEnd)
? RegionInterface.getCallbacks<ompt_target_exit_data>()
: RegionInterface.getCallbacks<ompt_target_update>();
InterfaceRAII TargetDataRAII(CallbackFunctions, DeviceId,
OMPT_GET_RETURN_ADDRESS);)
int Rc = OFFLOAD_SUCCESS;
Rc = TargetDataFunction(Loc, *DeviceOrErr, ArgNum, ArgsBase, Args, ArgSizes,
ArgTypes, ArgNames, ArgMappers, AsyncInfo,
false /*FromMapper=*/);
if (Rc == OFFLOAD_SUCCESS)
Rc = AsyncInfo.synchronize();
handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
}
/// creates host-to-target data mapping, stores it in the
/// libomptarget.so internal structure (an entry in a stack of data maps)
/// and passes the data to the device.
EXTERN void __tgt_target_data_begin_mapper(ident_t *Loc, int64_t DeviceId,
int32_t ArgNum, void **ArgsBase,
void **Args, int64_t *ArgSizes,
int64_t *ArgTypes,
map_var_info_t *ArgNames,
void **ArgMappers) {
OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
targetData<AsyncInfoTy>(Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes,
ArgTypes, ArgNames, ArgMappers, targetDataBegin,
"Entering OpenMP data region with being_mapper",
"begin");
}
EXTERN void __tgt_target_data_begin_nowait_mapper(
ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
void *NoAliasDepList) {
OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
targetData<TaskAsyncInfoWrapperTy>(
Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames,
ArgMappers, targetDataBegin,
"Entering OpenMP data region with being_nowait_mapper", "begin");
}
/// passes data from the target, releases target memory and destroys
/// the host-target mapping (top entry from the stack of data maps)
/// created by the last __tgt_target_data_begin.
EXTERN void __tgt_target_data_end_mapper(ident_t *Loc, int64_t DeviceId,
int32_t ArgNum, void **ArgsBase,
void **Args, int64_t *ArgSizes,
int64_t *ArgTypes,
map_var_info_t *ArgNames,
void **ArgMappers) {
OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
targetData<AsyncInfoTy>(Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes,
ArgTypes, ArgNames, ArgMappers, targetDataEnd,
"Exiting OpenMP data region with end_mapper", "end");
}
EXTERN void __tgt_target_data_end_nowait_mapper(
ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
void *NoAliasDepList) {
OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
targetData<TaskAsyncInfoWrapperTy>(
Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames,
ArgMappers, targetDataEnd,
"Exiting OpenMP data region with end_nowait_mapper", "end");
}
EXTERN void __tgt_target_data_update_mapper(ident_t *Loc, int64_t DeviceId,
int32_t ArgNum, void **ArgsBase,
void **Args, int64_t *ArgSizes,
int64_t *ArgTypes,
map_var_info_t *ArgNames,
void **ArgMappers) {
OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
targetData<AsyncInfoTy>(
Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames,
ArgMappers, targetDataUpdate,
"Updating data within the OpenMP data region with update_mapper",
"update");
}
EXTERN void __tgt_target_data_update_nowait_mapper(
ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
void *NoAliasDepList) {
OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
targetData<TaskAsyncInfoWrapperTy>(
Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames,
ArgMappers, targetDataUpdate,
"Updating data within the OpenMP data region with update_nowait_mapper",
"update");
}
static KernelArgsTy *upgradeKernelArgs(KernelArgsTy *KernelArgs,
KernelArgsTy &LocalKernelArgs,
int32_t NumTeams, int32_t ThreadLimit) {
if (KernelArgs->Version > OMP_KERNEL_ARG_VERSION)
DP("Unexpected ABI version: %u\n", KernelArgs->Version);
uint32_t UpgradedVersion = KernelArgs->Version;
if (KernelArgs->Version < OMP_KERNEL_ARG_VERSION) {
// The upgraded version will be based on the kernel launch environment.
if (KernelArgs->Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
UpgradedVersion = OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR - 1;
else
UpgradedVersion = OMP_KERNEL_ARG_VERSION;
}
if (UpgradedVersion != KernelArgs->Version) {
LocalKernelArgs.Version = UpgradedVersion;
LocalKernelArgs.NumArgs = KernelArgs->NumArgs;
LocalKernelArgs.ArgBasePtrs = KernelArgs->ArgBasePtrs;
LocalKernelArgs.ArgPtrs = KernelArgs->ArgPtrs;
LocalKernelArgs.ArgSizes = KernelArgs->ArgSizes;
LocalKernelArgs.ArgTypes = KernelArgs->ArgTypes;
LocalKernelArgs.ArgNames = KernelArgs->ArgNames;
LocalKernelArgs.ArgMappers = KernelArgs->ArgMappers;
LocalKernelArgs.Tripcount = KernelArgs->Tripcount;
LocalKernelArgs.Flags = KernelArgs->Flags;
LocalKernelArgs.DynCGroupMem = 0;
LocalKernelArgs.NumTeams[0] = NumTeams;
LocalKernelArgs.NumTeams[1] = 0;
LocalKernelArgs.NumTeams[2] = 0;
LocalKernelArgs.ThreadLimit[0] = ThreadLimit;
LocalKernelArgs.ThreadLimit[1] = 0;
LocalKernelArgs.ThreadLimit[2] = 0;
return &LocalKernelArgs;
}
return KernelArgs;
}
template <typename TargetAsyncInfoTy>
static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
int32_t ThreadLimit, void *HostPtr,
KernelArgsTy *KernelArgs) {
assert(PM && "Runtime not initialized");
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
"Target AsyncInfoTy must be convertible to AsyncInfoTy.");
DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
"\n",
DeviceId, DPxPTR(HostPtr));
if (checkDeviceAndCtors(DeviceId, Loc)) {
DP("Not offloading to device %" PRId64 "\n", DeviceId);
return OMP_TGT_FAIL;
}
bool IsTeams = NumTeams != -1;
if (!IsTeams)
KernelArgs->NumTeams[0] = NumTeams = 1;
// Auto-upgrade kernel args version 1 to 2.
KernelArgsTy LocalKernelArgs;
KernelArgs =
upgradeKernelArgs(KernelArgs, LocalKernelArgs, NumTeams, ThreadLimit);
assert(KernelArgs->NumTeams[0] == static_cast<uint32_t>(NumTeams) &&
!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] &&
"OpenMP interface should not use multiple dimensions");
assert(KernelArgs->ThreadLimit[0] == static_cast<uint32_t>(ThreadLimit) &&
!KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
"OpenMP interface should not use multiple dimensions");
TIMESCOPE_WITH_DETAILS_AND_IDENT(
"Runtime: target exe",
"NumTeams=" + std::to_string(NumTeams) +
";NumArgs=" + std::to_string(KernelArgs->NumArgs),
Loc);
if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs,
KernelArgs->ArgSizes, KernelArgs->ArgTypes,
KernelArgs->ArgNames, "Entering OpenMP kernel");
#ifdef OMPTARGET_DEBUG
for (uint32_t I = 0; I < KernelArgs->NumArgs; ++I) {
DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
", Type=0x%" PRIx64 ", Name=%s\n",
I, DPxPTR(KernelArgs->ArgBasePtrs[I]), DPxPTR(KernelArgs->ArgPtrs[I]),
KernelArgs->ArgSizes[I], KernelArgs->ArgTypes[I],
(KernelArgs->ArgNames)
? getNameFromMapping(KernelArgs->ArgNames[I]).c_str()
: "unknown");
}
#endif
auto DeviceOrErr = PM->getDevice(DeviceId);
if (!DeviceOrErr)
FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr);
AsyncInfoTy &AsyncInfo = TargetAsyncInfo;
/// RAII to establish tool anchors before and after target region
OMPT_IF_BUILT(InterfaceRAII TargetRAII(
RegionInterface.getCallbacks<ompt_target>(), DeviceId,
/*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
int Rc = OFFLOAD_SUCCESS;
Rc = target(Loc, *DeviceOrErr, HostPtr, *KernelArgs, AsyncInfo);
{ // required to show syncronization
TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: syncronize", "", Loc);
if (Rc == OFFLOAD_SUCCESS)
Rc = AsyncInfo.synchronize();
handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
}
return OMP_TGT_SUCCESS;
}
/// Implements a kernel entry that executes the target region on the specified
/// device.
///
/// \param Loc Source location associated with this target region.
/// \param DeviceId The device to execute this region, -1 indicated the default.
/// \param NumTeams Number of teams to launch the region with, -1 indicates a
/// non-teams region and 0 indicates it was unspecified.
/// \param ThreadLimit Limit to the number of threads to use in the kernel
/// launch, 0 indicates it was unspecified.
/// \param HostPtr The pointer to the host function registered with the kernel.
/// \param Args All arguments to this kernel launch (see struct definition).
EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
int32_t ThreadLimit, void *HostPtr,
KernelArgsTy *KernelArgs) {
OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
if (KernelArgs->Flags.NoWait)
return targetKernel<TaskAsyncInfoWrapperTy>(
Loc, DeviceId, NumTeams, ThreadLimit, HostPtr, KernelArgs);
return targetKernel<AsyncInfoTy>(Loc, DeviceId, NumTeams, ThreadLimit,
HostPtr, KernelArgs);
}
/// Activates the record replay mechanism.
/// \param DeviceId The device identifier to execute the target region.
/// \param MemorySize The number of bytes to be (pre-)allocated
/// by the bump allocator
/// /param IsRecord Activates the record replay mechanism in
/// 'record' mode or 'replay' mode.
/// /param SaveOutput Store the device memory after kernel
/// execution on persistent storage
EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
void *VAddr, bool IsRecord,
bool SaveOutput,
uint64_t &ReqPtrArgOffset) {
assert(PM && "Runtime not initialized");
OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
auto DeviceOrErr = PM->getDevice(DeviceId);
if (!DeviceOrErr)
FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
[[maybe_unused]] int Rc = target_activate_rr(
*DeviceOrErr, MemorySize, VAddr, IsRecord, SaveOutput, ReqPtrArgOffset);
assert(Rc == OFFLOAD_SUCCESS &&
"__tgt_activate_record_replay unexpected failure!");
return OMP_TGT_SUCCESS;
}
/// Implements a target kernel entry that replays a pre-recorded kernel.
/// \param Loc Source location associated with this target region (unused).
/// \param DeviceId The device identifier to execute the target region.
/// \param HostPtr A pointer to an address that uniquely identifies the kernel.
/// \param DeviceMemory A pointer to an array storing device memory data to move
/// prior to kernel execution.
/// \param DeviceMemorySize The size of the above device memory data in bytes.
/// \param TgtArgs An array of pointers of the pre-recorded target kernel
/// arguments.
/// \param TgtOffsets An array of pointers of the pre-recorded target kernel
/// argument offsets.
/// \param NumArgs The number of kernel arguments.
/// \param NumTeams Number of teams to launch the target region with.
/// \param ThreadLimit Limit to the number of threads to use in kernel
/// execution.
/// \param LoopTripCount The pre-recorded value of the loop tripcount, if any.
/// \return OMP_TGT_SUCCESS on success, OMP_TGT_FAIL on failure.
EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId,
void *HostPtr, void *DeviceMemory,
int64_t DeviceMemorySize, void **TgtArgs,
ptrdiff_t *TgtOffsets, int32_t NumArgs,
int32_t NumTeams, int32_t ThreadLimit,
uint64_t LoopTripCount) {
assert(PM && "Runtime not initialized");
OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
if (checkDeviceAndCtors(DeviceId, Loc)) {
DP("Not offloading to device %" PRId64 "\n", DeviceId);
return OMP_TGT_FAIL;
}
auto DeviceOrErr = PM->getDevice(DeviceId);
if (!DeviceOrErr)
FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
/// RAII to establish tool anchors before and after target region
OMPT_IF_BUILT(InterfaceRAII TargetRAII(
RegionInterface.getCallbacks<ompt_target>(), DeviceId,
/*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
AsyncInfoTy AsyncInfo(*DeviceOrErr);
int Rc = target_replay(Loc, *DeviceOrErr, HostPtr, DeviceMemory,
DeviceMemorySize, TgtArgs, TgtOffsets, NumArgs,
NumTeams, ThreadLimit, LoopTripCount, AsyncInfo);
if (Rc == OFFLOAD_SUCCESS)
Rc = AsyncInfo.synchronize();
handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
assert(Rc == OFFLOAD_SUCCESS &&
"__tgt_target_kernel_replay unexpected failure!");
return OMP_TGT_SUCCESS;
}
// Get the current number of components for a user-defined mapper.
EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle;
int64_t Size = MapperComponentsPtr->Components.size();
DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n",
DPxPTR(RtMapperHandle), Size);
return Size;
}
// Push back one component for a user-defined mapper.
EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base,
void *Begin, int64_t Size, int64_t Type,
void *Name) {
DP("__tgt_push_mapper_component(Handle=" DPxMOD
") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
", Type=0x%" PRIx64 ", Name=%s).\n",
DPxPTR(RtMapperHandle), DPxPTR(Base), DPxPTR(Begin), Size, Type,
(Name) ? getNameFromMapping(Name).c_str() : "unknown");
auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle;
MapperComponentsPtr->Components.push_back(
MapComponentInfoTy(Base, Begin, Size, Type, Name));
}
EXTERN void __tgt_set_info_flag(uint32_t NewInfoLevel) {
assert(PM && "Runtime not initialized");
std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
InfoLevel.store(NewInfoLevel);
for (auto &R : PM->pluginAdaptors())
R.set_info_flag(NewInfoLevel);
}
EXTERN int __tgt_print_device_info(int64_t DeviceId) {
assert(PM && "Runtime not initialized");
auto DeviceOrErr = PM->getDevice(DeviceId);
if (!DeviceOrErr)
FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
return DeviceOrErr->printDeviceInfo();
}
EXTERN void __tgt_target_nowait_query(void **AsyncHandle) {
assert(PM && "Runtime not initialized");
OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
if (!AsyncHandle || !*AsyncHandle) {
FATAL_MESSAGE0(
1, "Receive an invalid async handle from the current OpenMP task. Is "
"this a target nowait region?\n");
}
// Exponential backoff tries to optimally decide if a thread should just query
// for the device operations (work/spin wait on them) or block until they are
// completed (use device side blocking mechanism). This allows the runtime to
// adapt itself when there are a lot of long-running target regions in-flight.
static thread_local utils::ExponentialBackoff QueryCounter(
Int64Envar("OMPTARGET_QUERY_COUNT_MAX", 10),
Int64Envar("OMPTARGET_QUERY_COUNT_THRESHOLD", 5),
Envar<float>("OMPTARGET_QUERY_COUNT_BACKOFF_FACTOR", 0.5f));
auto *AsyncInfo = (AsyncInfoTy *)*AsyncHandle;
// If the thread is actively waiting on too many target nowait regions, we
// should use the blocking sync type.
if (QueryCounter.isAboveThreshold())
AsyncInfo->SyncType = AsyncInfoTy::SyncTy::BLOCKING;
if (AsyncInfo->synchronize())
FATAL_MESSAGE0(1, "Error while querying the async queue for completion.\n");
// If there are device operations still pending, return immediately without
// deallocating the handle and increase the current thread query count.
if (!AsyncInfo->isDone()) {
QueryCounter.increment();
return;
}
// When a thread successfully completes a target nowait region, we
// exponentially backoff its query counter by the query factor.
QueryCounter.decrement();
// Delete the handle and unset it from the OpenMP task data.
delete AsyncInfo;
*AsyncHandle = nullptr;
}