[libomptarget] Refactor DeviceRTL prior to AMDGPU bringup
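Subset of D111993. Fix typos (SHARD, maping, atomi, destoryLock, getgetThreadIdInWarp) and rename atomic::read to atomic::load.
Also make the 32-bit atomic::store return void and add its implementation, remove the 64-bit atomic::store declaration, obtain the AMDGPU grid values via getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>(), and replace UINT64_C with UL literals (guarded by a static_assert) in Unpack.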
Reviewed By: tianshilei1992
Differential Revision: https://reviews.llvm.org/D111999
GitOrigin-RevId: 7272982e1dfe89d492de4aac3e23d9cdcb1198fa
diff --git a/libomptarget/DeviceRTL/include/Synchronization.h b/libomptarget/DeviceRTL/include/Synchronization.h
index ace624e..7097056 100644
--- a/libomptarget/DeviceRTL/include/Synchronization.h
+++ b/libomptarget/DeviceRTL/include/Synchronization.h
@@ -44,14 +44,11 @@
namespace atomic {
-/// Atomically read \p Addr with \p Ordering semantics.
-uint32_t read(uint32_t *Addr, int Ordering);
+/// Atomically load \p Addr with \p Ordering semantics.
+uint32_t load(uint32_t *Addr, int Ordering);
/// Atomically store \p V to \p Addr with \p Ordering semantics.
-uint32_t store(uint32_t *Addr, uint32_t V, int Ordering);
-
-/// Atomically store \p V to \p Addr with \p Ordering semantics.
-uint64_t store(uint64_t *Addr, uint64_t V, int Ordering);
+void store(uint32_t *Addr, uint32_t V, int Ordering);
/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
uint32_t inc(uint32_t *Addr, uint32_t V, int Ordering);
diff --git a/libomptarget/DeviceRTL/src/Mapping.cpp b/libomptarget/DeviceRTL/src/Mapping.cpp
index c204185..740cc7b 100644
--- a/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -29,7 +29,7 @@
#pragma omp begin declare variant match(device = {arch(amdgcn)})
constexpr const llvm::omp::GV &getGridValue() {
- return llvm::omp::AMDGPUGridValues;
+ return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
}
uint32_t getGridDim(uint32_t n, uint16_t d) {
diff --git a/libomptarget/DeviceRTL/src/Reduction.cpp b/libomptarget/DeviceRTL/src/Reduction.cpp
index a06ac23..05efe95 100644
--- a/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -209,7 +209,7 @@
// to the number of slots in the buffer.
bool IsMaster = (ThreadId == 0);
while (IsMaster) {
- Bound = atomic::read((uint32_t *)&IterCnt, __ATOMIC_SEQ_CST);
+ Bound = atomic::load((uint32_t *)&IterCnt, __ATOMIC_SEQ_CST);
if (TeamId < Bound + num_of_records)
break;
}
diff --git a/libomptarget/DeviceRTL/src/Synchronization.cpp b/libomptarget/DeviceRTL/src/Synchronization.cpp
index c9a1ac6..17a91de 100644
--- a/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -31,10 +31,14 @@
/// NOTE: This function needs to be implemented by every target.
uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering);
-uint32_t atomicRead(uint32_t *Address, int Ordering) {
+uint32_t atomicLoad(uint32_t *Address, int Ordering) {
return __atomic_fetch_add(Address, 0U, __ATOMIC_SEQ_CST);
}
+void atomicStore(uint32_t *Address, uint32_t Val, int Ordering) {
+ __atomic_store_n(Address, Val, Ordering);
+}
+
uint32_t atomicAdd(uint32_t *Address, uint32_t Val, int Ordering) {
return __atomic_fetch_add(Address, Val, Ordering);
}
@@ -68,7 +72,7 @@
return __builtin_amdgcn_atomic_inc32(Address, Val, Ordering, "");
}
-uint32_t SHARD(namedBarrierTracker);
+uint32_t SHARED(namedBarrierTracker);
void namedBarrierInit() {
// Don't have global ctors, and shared memory is not zero init
@@ -79,7 +83,7 @@
uint32_t NumThreads = omp_get_num_threads();
// assert(NumThreads % 32 == 0);
- uint32_t WarpSize = maping::getWarpSize();
+ uint32_t WarpSize = mapping::getWarpSize();
uint32_t NumWaves = NumThreads / WarpSize;
fence::team(__ATOMIC_ACQUIRE);
@@ -115,7 +119,7 @@
// more waves still to go, spin until generation counter changes
do {
__builtin_amdgcn_s_sleep(0);
- load = atomi::load(&namedBarrierTracker, __ATOMIC_RELAXED);
+ load = atomic::load(&namedBarrierTracker, __ATOMIC_RELAXED);
} while ((load & 0xffff0000u) == generation);
}
}
@@ -192,7 +196,7 @@
void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
-void destoryLock(omp_lock_t *Lock) { unsetLock(Lock); }
+void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
void setLock(omp_lock_t *Lock) {
// TODO: not sure spinning is a good idea here..
@@ -229,8 +233,12 @@
void fence::system(int Ordering) { impl::fenceSystem(Ordering); }
-uint32_t atomic::read(uint32_t *Addr, int Ordering) {
- return impl::atomicRead(Addr, Ordering);
+uint32_t atomic::load(uint32_t *Addr, int Ordering) {
+ return impl::atomicLoad(Addr, Ordering);
+}
+
+void atomic::store(uint32_t *Addr, uint32_t V, int Ordering) {
+ impl::atomicStore(Addr, V, Ordering);
}
uint32_t atomic::inc(uint32_t *Addr, uint32_t V, int Ordering) {
@@ -300,7 +308,7 @@
void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); }
-void omp_destroy_lock(omp_lock_t *Lock) { impl::destoryLock(Lock); }
+void omp_destroy_lock(omp_lock_t *Lock) { impl::destroyLock(Lock); }
void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); }
diff --git a/libomptarget/DeviceRTL/src/Utils.cpp b/libomptarget/DeviceRTL/src/Utils.cpp
index f11c54e..3f65f21 100644
--- a/libomptarget/DeviceRTL/src/Utils.cpp
+++ b/libomptarget/DeviceRTL/src/Utils.cpp
@@ -35,8 +35,9 @@
#pragma omp begin declare variant match(device = {arch(amdgcn)})
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
- *LowBits = (uint32_t)(Val & UINT64_C(0x00000000FFFFFFFF));
- *HighBits = (uint32_t)((Val & UINT64_C(0xFFFFFFFF00000000)) >> 32);
+ static_assert(sizeof(unsigned long) == 8, "");
+ *LowBits = (uint32_t)(Val & 0x00000000FFFFFFFFUL);
+ *HighBits = (uint32_t)((Val & 0xFFFFFFFF00000000UL) >> 32);
}
uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
@@ -75,7 +76,7 @@
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
int Width = mapping::getWarpSize();
- int Self = mapping::getgetThreadIdInWarp();
+ int Self = mapping::getThreadIdInWarp();
int Index = SrcLane + (Self & ~(Width - 1));
return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
}