| //===-------- State.h - OpenMP State & ICV interface ------------- C++ -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #ifndef OMPTARGET_STATE_H |
| #define OMPTARGET_STATE_H |
| |
| #include "Shared/Environment.h" |
| |
| #include "Debug.h" |
| #include "Mapping.h" |
| #include "Types.h" |
| #include "Utils.h" |
| |
| // Forward declaration. |
| struct KernelEnvironmentTy; |
| |
| #pragma omp begin declare target device_type(nohost) |
| |
| namespace ompx { |
| |
| namespace memory { |
| |
/// Allocate \p Size bytes in shared memory, if possible, for \p Reason.
| /// |
| /// Note: See the restrictions on __kmpc_alloc_shared for proper usage. |
| void *allocShared(uint64_t Size, const char *Reason); |
| |
/// Free \p Ptr, allocated via allocShared, for \p Reason.
| /// |
| /// Note: See the restrictions on __kmpc_free_shared for proper usage. |
| void freeShared(void *Ptr, uint64_t Bytes, const char *Reason); |
| |
/// Allocate \p Size bytes in global memory, if possible, for \p Reason.
| void *allocGlobal(uint64_t Size, const char *Reason); |
| |
| /// Return a pointer to the dynamic shared memory buffer. |
| void *getDynamicBuffer(); |
| |
/// Free \p Ptr, allocated via allocGlobal, for \p Reason.
| void freeGlobal(void *Ptr, const char *Reason); |
| |
| } // namespace memory |
| |
| namespace state { |
| |
| inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE; |
| |
/// Storage for the per-team (and, lazily, per-thread) OpenMP internal
/// control variables (ICVs) tracked by the device runtime.
struct ICVStateTy {
  /// The `nthreads-var` ICV.
  uint32_t NThreadsVar;
  /// The `level-var` ICV.
  uint32_t LevelVar;
  /// The `active-level-var` ICV.
  uint32_t ActiveLevelVar;
  /// Unused slot; presumably layout/alignment padding — confirm before
  /// removing or reusing.
  uint32_t Padding0Val;
  /// The `max-active-levels-var` ICV.
  uint32_t MaxActiveLevelsVar;
  /// The `run-sched-var` ICV (schedule kind).
  uint32_t RunSchedVar;
  /// The chunk size associated with the `run-sched-var` ICV.
  uint32_t RunSchedChunkVar;

  /// Member-wise equality comparison with \p Other.
  bool operator==(const ICVStateTy &Other) const;

  /// Assert that all members match \p Other (debugging aid).
  void assertEqual(const ICVStateTy &Other) const;
};
| |
/// Team-wide state; a single instance lives in team-shared storage (see the
/// allocate directive on the TeamState global below).
struct TeamStateTy {
  /// (Re)initialize the team state for a kernel; \p IsSPMD selects SPMD vs.
  /// generic execution mode.
  void init(bool IsSPMD);

  /// Member-wise equality comparison.
  bool operator==(const TeamStateTy &) const;

  /// Assert that this state matches \p Other (debugging aid).
  void assertEqual(TeamStateTy &Other) const;

  /// ICVs
  ///
  /// Preallocated storage for ICV values that are used if the threads have not
  /// set a custom default. The latter is supported but unlikely and slow(er).
  ///
  ///{
  ICVStateTy ICVState;
  ///}

  /// Size of the current parallel team.
  uint32_t ParallelTeamSize;
  /// Non-zero once a thread in the team may have private ICV state (checked
  /// by lookupImpl / lookupForModify32Impl before touching ThreadStates).
  uint32_t HasThreadState;
  /// The parallel region function pointer (accessed via VK_ParallelRegionFn).
  ParallelRegionFnTy ParallelRegionFnVar;
};
| |
| extern TeamStateTy TeamState; |
| #pragma omp allocate(TeamState) allocator(omp_pteam_mem_alloc) |
| |
| struct ThreadStateTy { |
| |
| /// ICVs have preallocated storage in the TeamStateTy which is used if a |
| /// thread has not set a custom value. The latter is supported but unlikely. |
| /// When it happens we will allocate dynamic memory to hold the values of all |
| /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an |
| /// ICV struct to hold them all. This is slower than alternatives but allows |
| /// users to pay only for what they use. |
| /// |
| state::ICVStateTy ICVState; |
| |
| ThreadStateTy *PreviousThreadState; |
| |
| void init() { |
| ICVState = TeamState.ICVState; |
| PreviousThreadState = nullptr; |
| } |
| |
| void init(ThreadStateTy *PreviousTS) { |
| ICVState = PreviousTS ? PreviousTS->ICVState : TeamState.ICVState; |
| PreviousThreadState = PreviousTS; |
| } |
| }; |
| |
| extern ThreadStateTy **ThreadStates; |
| #pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc) |
| |
| /// Initialize the state machinery. Must be called by all threads. |
| void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, |
| KernelLaunchEnvironmentTy &KernelLaunchEnvironment); |
| |
| /// Return the kernel and kernel launch environment associated with the current |
| /// kernel. The former is static and contains compile time information that |
| /// holds for all instances of the kernel. The latter is dynamic and provides |
| /// per-launch information. |
| KernelEnvironmentTy &getKernelEnvironment(); |
| KernelLaunchEnvironmentTy &getKernelLaunchEnvironment(); |
| |
/// Identifiers for the individual state and ICV values accessible through the
/// lookup32/lookupPtr interfaces below.
enum ValueKind {
  VK_NThreads,
  VK_Level,
  VK_ActiveLevel,
  VK_MaxActiveLevels,
  VK_RunSched,
  // --- Values above are exposed via namespace `icv`, values below via
  // --- namespace `state` (see the inline Value/PtrValue objects below).
  VK_RunSchedChunk,
  VK_ParallelRegionFn,
  VK_ParallelTeamSize,
  VK_HasThreadState,
};
| |
/// Enter a new data environment (paired with exitDataEnvironment below).
| void enterDataEnvironment(IdentTy *Ident); |
| |
/// Exit the innermost data environment entered via enterDataEnvironment.
| void exitDataEnvironment(); |
| |
| /// TODO |
| struct DateEnvironmentRAII { |
| DateEnvironmentRAII(IdentTy *Ident) { enterDataEnvironment(Ident); } |
| ~DateEnvironmentRAII() { exitDataEnvironment(); } |
| }; |
| |
/// Reset the thread-specific state for thread \p TId (see ThreadStates).
| void resetStateForThread(uint32_t TId); |
| |
/// Return a mutable reference to the storage of the ICV member \p Var for a
/// modifying access by the current thread.
///
/// The fast (likely) path returns the team-wide storage directly: when
/// \p ForceTeamState is set, when thread states are disabled by the
/// configuration, or when no thread in the team has private state. Otherwise
/// the calling thread's private ThreadStateTy is used; it is allocated lazily
/// in global memory and initialized from the team state on first use.
inline uint32_t &lookupForModify32Impl(uint32_t state::ICVStateTy::*Var,
                                       IdentTy *Ident, bool ForceTeamState) {
  if (OMP_LIKELY(ForceTeamState || !config::mayUseThreadStates() ||
                 !TeamState.HasThreadState))
    return TeamState.ICVState.*Var;
  uint32_t TId = mapping::getThreadIdInBlock();
  if (OMP_UNLIKELY(!ThreadStates[TId])) {
    // This thread has no private state yet: allocate it in global memory and
    // seed it from the team-wide ICVs.
    ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(memory::allocGlobal(
        sizeof(ThreadStateTy), "ICV modification outside data environment"));
    ASSERT(ThreadStates[TId] != nullptr, "Nullptr returned by malloc!");
    TeamState.HasThreadState = true;
    ThreadStates[TId]->init();
  }
  return ThreadStates[TId]->ICVState.*Var;
}
| |
| inline uint32_t &lookupImpl(uint32_t state::ICVStateTy::*Var, |
| bool ForceTeamState) { |
| auto TId = mapping::getThreadIdInBlock(); |
| if (OMP_UNLIKELY(!ForceTeamState && config::mayUseThreadStates() && |
| TeamState.HasThreadState && ThreadStates[TId])) |
| return ThreadStates[TId]->ICVState.*Var; |
| return TeamState.ICVState.*Var; |
| } |
| |
| [[gnu::always_inline, gnu::flatten]] inline uint32_t & |
| lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) { |
| switch (Kind) { |
| case state::VK_NThreads: |
| if (IsReadonly) |
| return lookupImpl(&ICVStateTy::NThreadsVar, ForceTeamState); |
| return lookupForModify32Impl(&ICVStateTy::NThreadsVar, Ident, |
| ForceTeamState); |
| case state::VK_Level: |
| if (IsReadonly) |
| return lookupImpl(&ICVStateTy::LevelVar, ForceTeamState); |
| return lookupForModify32Impl(&ICVStateTy::LevelVar, Ident, ForceTeamState); |
| case state::VK_ActiveLevel: |
| if (IsReadonly) |
| return lookupImpl(&ICVStateTy::ActiveLevelVar, ForceTeamState); |
| return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar, Ident, |
| ForceTeamState); |
| case state::VK_MaxActiveLevels: |
| if (IsReadonly) |
| return lookupImpl(&ICVStateTy::MaxActiveLevelsVar, ForceTeamState); |
| return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar, Ident, |
| ForceTeamState); |
| case state::VK_RunSched: |
| if (IsReadonly) |
| return lookupImpl(&ICVStateTy::RunSchedVar, ForceTeamState); |
| return lookupForModify32Impl(&ICVStateTy::RunSchedVar, Ident, |
| ForceTeamState); |
| case state::VK_RunSchedChunk: |
| if (IsReadonly) |
| return lookupImpl(&ICVStateTy::RunSchedChunkVar, ForceTeamState); |
| return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar, Ident, |
| ForceTeamState); |
| case state::VK_ParallelTeamSize: |
| return TeamState.ParallelTeamSize; |
| case state::VK_HasThreadState: |
| return TeamState.HasThreadState; |
| default: |
| break; |
| } |
| __builtin_unreachable(); |
| } |
| |
| [[gnu::always_inline, gnu::flatten]] inline void *& |
| lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) { |
| switch (Kind) { |
| case state::VK_ParallelRegionFn: |
| return TeamState.ParallelRegionFnVar; |
| default: |
| break; |
| } |
| __builtin_unreachable(); |
| } |
| |
| /// A class without actual state used to provide a nice interface to lookup and |
| /// update ICV values we can declare in global scope. |
template <typename Ty, ValueKind Kind> struct Value {
  /// Read the current value via the (cheap) read-only lookup.
  [[gnu::flatten, gnu::always_inline]] operator Ty() {
    return lookup(/*IsReadonly=*/true, /*IdentTy=*/nullptr,
                  /*ForceTeamState=*/false);
  }

  /// Assign \p Other; uses the modifying lookup, which may create
  /// thread-local ICV storage (see lookup32).
  [[gnu::flatten, gnu::always_inline]] Value &operator=(const Ty &Other) {
    set(Other, /*IdentTy=*/nullptr);
    return *this;
  }

  /// Pre-increment the stored value (modifying lookup).
  [[gnu::flatten, gnu::always_inline]] Value &operator++() {
    inc(1, /*IdentTy=*/nullptr);
    return *this;
  }

  /// Pre-decrement the stored value (modifying lookup).
  [[gnu::flatten, gnu::always_inline]] Value &operator--() {
    inc(-1, /*IdentTy=*/nullptr);
    return *this;
  }

  /// Assert the current value equals \p V (debugging aid; read-only lookup).
  [[gnu::flatten, gnu::always_inline]] void
  assert_eq(const Ty &V, IdentTy *Ident = nullptr,
            bool ForceTeamState = false) {
    ASSERT(lookup(/*IsReadonly=*/true, Ident, ForceTeamState) == V, nullptr);
  }

private:
  /// Resolve the storage backing this value; dispatches on Kind via lookup32.
  [[gnu::flatten, gnu::always_inline]] Ty &
  lookup(bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
    Ty &t = lookup32(Kind, IsReadonly, Ident, ForceTeamState);
    return t;
  }

  /// Add \p UpdateVal (may be negative) to the stored value in place.
  [[gnu::flatten, gnu::always_inline]] Ty &inc(int UpdateVal, IdentTy *Ident) {
    return (lookup(/*IsReadonly=*/false, Ident, /*ForceTeamState=*/false) +=
            UpdateVal);
  }

  /// Overwrite the stored value with \p UpdateVal.
  [[gnu::flatten, gnu::always_inline]] Ty &set(Ty UpdateVal, IdentTy *Ident) {
    return (lookup(/*IsReadonly=*/false, Ident, /*ForceTeamState=*/false) =
            UpdateVal);
  }

  /// ValueRAII needs lookup() to temporarily replace the stored value.
  template <typename VTy, typename Ty2> friend struct ValueRAII;
};
| |
/// A lookup class without actual state used to provide
/// a nice interface to lookup and update pointer-typed state values
/// we can declare in global scope.
template <typename Ty, ValueKind Kind> struct PtrValue {
  /// Read the current pointer value.
  [[gnu::flatten, gnu::always_inline]] operator Ty() {
    return lookup(/*IsReadonly=*/true, /*IdentTy=*/nullptr,
                  /*ForceTeamState=*/false);
  }

  /// Store \p Other as the new pointer value.
  [[gnu::flatten, gnu::always_inline]] PtrValue &operator=(const Ty Other) {
    set(Other);
    return *this;
  }

private:
  /// Resolve the storage backing this value; dispatches on Kind via
  /// lookupPtr (the IdentTy argument is unused for pointer values).
  Ty &lookup(bool IsReadonly, IdentTy *, bool ForceTeamState) {
    return lookupPtr(Kind, IsReadonly, ForceTeamState);
  }

  /// Overwrite the stored pointer with \p UpdateVal.
  Ty &set(Ty UpdateVal) {
    return (lookup(/*IsReadonly=*/false, /*IdentTy=*/nullptr,
                   /*ForceTeamState=*/false) = UpdateVal);
  }

  /// ValueRAII needs lookup() to temporarily replace the stored value.
  template <typename VTy, typename Ty2> friend struct ValueRAII;
};
| |
/// RAII helper that temporarily replaces a Value/PtrValue.
///
/// If \p Active, the constructor asserts the current value equals
/// \p OldValue, writes \p NewValue, and the destructor restores \p OldValue.
/// If not \p Active, construction and destruction do nothing; Ptr is set to
/// utils::UndefPtr, which is never dereferenced on that path.
template <typename VTy, typename Ty> struct ValueRAII {
  ValueRAII(VTy &V, Ty NewValue, Ty OldValue, bool Active, IdentTy *Ident,
            bool ForceTeamState = false)
      : Ptr(Active ? &V.lookup(/*IsReadonly=*/false, Ident, ForceTeamState)
                   : (Ty *)utils::UndefPtr),
        Val(OldValue), Active(Active) {
    if (!Active)
      return;
    ASSERT(*Ptr == OldValue, "ValueRAII initialization with wrong old value!");
    *Ptr = NewValue;
  }
  ~ValueRAII() {
    // Restore the previous value on scope exit if we replaced it.
    if (Active)
      *Ptr = Val;
  }

private:
  Ty *Ptr;     // Storage location, or utils::UndefPtr when inactive.
  Ty Val;      // The old value, restored by the destructor.
  bool Active; // Whether this object actually replaced a value.
};
| |
/// The chunk size of the `run-sched-var` ICV (VK_RunSchedChunk).
| inline state::Value<uint32_t, state::VK_RunSchedChunk> RunSchedChunk; |
| |
/// The size of the current parallel team (VK_ParallelTeamSize).
| inline state::Value<uint32_t, state::VK_ParallelTeamSize> ParallelTeamSize; |
| |
/// Flag indicating a thread in the team may have private ICV state
/// (VK_HasThreadState).
| inline state::Value<uint32_t, state::VK_HasThreadState> HasThreadState; |
| |
/// The parallel region function pointer (VK_ParallelRegionFn).
| inline state::PtrValue<ParallelRegionFnTy, state::VK_ParallelRegionFn> |
| ParallelRegionFn; |
| |
| void runAndCheckState(void(Func(void))); |
| |
| void assumeInitialState(bool IsSPMD); |
| |
| /// Return the value of the ParallelTeamSize ICV. |
| int getEffectivePTeamSize(); |
| |
| } // namespace state |
| |
| namespace icv { |
| |
/// The `nthreads-var` ICV.
| inline state::Value<uint32_t, state::VK_NThreads> NThreads; |
| |
/// The `level-var` ICV.
| inline state::Value<uint32_t, state::VK_Level> Level; |
| |
| /// The `active-level` describes which of the parallel level counted with the |
| /// `level-var` is active. There can only be one. |
| /// |
/// active-level-var is 1, if ActiveLevelVar is not 0, otherwise it is 0.
| inline state::Value<uint32_t, state::VK_ActiveLevel> ActiveLevel; |
| |
/// The `max-active-levels-var` ICV.
| inline state::Value<uint32_t, state::VK_MaxActiveLevels> MaxActiveLevels; |
| |
/// The `run-sched-var` ICV.
| inline state::Value<uint32_t, state::VK_RunSched> RunSched; |
| |
| } // namespace icv |
| |
| } // namespace ompx |
| |
| #pragma omp end declare target |
| |
| #endif |