[libomptarget] Refactor DeviceRTL prior to AMDGPU bringup

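Subset of D111993. Fix typos, rename atomic::read to atomic::load, and
replace the two value-returning atomic::store declarations with a single
void 32-bit atomic::store together with its implementation. Also obtain
the AMDGPU grid values via getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>()
and drop the UINT64_C macros from Unpack.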

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D111999

GitOrigin-RevId: 7272982e1dfe89d492de4aac3e23d9cdcb1198fa
diff --git a/libomptarget/DeviceRTL/include/Synchronization.h b/libomptarget/DeviceRTL/include/Synchronization.h
index ace624e..7097056 100644
--- a/libomptarget/DeviceRTL/include/Synchronization.h
+++ b/libomptarget/DeviceRTL/include/Synchronization.h
@@ -44,14 +44,11 @@
 
 namespace atomic {
 
-/// Atomically read \p Addr with \p Ordering semantics.
-uint32_t read(uint32_t *Addr, int Ordering);
+/// Atomically load \p Addr with \p Ordering semantics.
+uint32_t load(uint32_t *Addr, int Ordering);
 
 /// Atomically store \p V to \p Addr with \p Ordering semantics.
-uint32_t store(uint32_t *Addr, uint32_t V, int Ordering);
-
-/// Atomically store \p V to \p Addr with \p Ordering semantics.
-uint64_t store(uint64_t *Addr, uint64_t V, int Ordering);
+void store(uint32_t *Addr, uint32_t V, int Ordering);
 
 /// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
 uint32_t inc(uint32_t *Addr, uint32_t V, int Ordering);
diff --git a/libomptarget/DeviceRTL/src/Mapping.cpp b/libomptarget/DeviceRTL/src/Mapping.cpp
index c204185..740cc7b 100644
--- a/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -29,7 +29,7 @@
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
 
 constexpr const llvm::omp::GV &getGridValue() {
-  return llvm::omp::AMDGPUGridValues;
+  return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
 }
 
 uint32_t getGridDim(uint32_t n, uint16_t d) {
diff --git a/libomptarget/DeviceRTL/src/Reduction.cpp b/libomptarget/DeviceRTL/src/Reduction.cpp
index a06ac23..05efe95 100644
--- a/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -209,7 +209,7 @@
   // to the number of slots in the buffer.
   bool IsMaster = (ThreadId == 0);
   while (IsMaster) {
-    Bound = atomic::read((uint32_t *)&IterCnt, __ATOMIC_SEQ_CST);
+    Bound = atomic::load((uint32_t *)&IterCnt, __ATOMIC_SEQ_CST);
     if (TeamId < Bound + num_of_records)
       break;
   }
diff --git a/libomptarget/DeviceRTL/src/Synchronization.cpp b/libomptarget/DeviceRTL/src/Synchronization.cpp
index c9a1ac6..17a91de 100644
--- a/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -31,10 +31,14 @@
 /// NOTE: This function needs to be implemented by every target.
 uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering);
 
-uint32_t atomicRead(uint32_t *Address, int Ordering) {
+uint32_t atomicLoad(uint32_t *Address, int Ordering) {
   return __atomic_fetch_add(Address, 0U, __ATOMIC_SEQ_CST);
 }
 
+void atomicStore(uint32_t *Address, uint32_t Val, int Ordering) {
+  __atomic_store_n(Address, Val, Ordering);
+}
+
 uint32_t atomicAdd(uint32_t *Address, uint32_t Val, int Ordering) {
   return __atomic_fetch_add(Address, Val, Ordering);
 }
@@ -68,7 +72,7 @@
   return __builtin_amdgcn_atomic_inc32(Address, Val, Ordering, "");
 }
 
-uint32_t SHARD(namedBarrierTracker);
+uint32_t SHARED(namedBarrierTracker);
 
 void namedBarrierInit() {
   // Don't have global ctors, and shared memory is not zero init
@@ -79,7 +83,7 @@
   uint32_t NumThreads = omp_get_num_threads();
   // assert(NumThreads % 32 == 0);
 
-  uint32_t WarpSize = maping::getWarpSize();
+  uint32_t WarpSize = mapping::getWarpSize();
   uint32_t NumWaves = NumThreads / WarpSize;
 
   fence::team(__ATOMIC_ACQUIRE);
@@ -115,7 +119,7 @@
       // more waves still to go, spin until generation counter changes
       do {
         __builtin_amdgcn_s_sleep(0);
-        load = atomi::load(&namedBarrierTracker, __ATOMIC_RELAXED);
+        load = atomic::load(&namedBarrierTracker, __ATOMIC_RELAXED);
       } while ((load & 0xffff0000u) == generation);
     }
   }
@@ -192,7 +196,7 @@
 
 void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
 
-void destoryLock(omp_lock_t *Lock) { unsetLock(Lock); }
+void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
 
 void setLock(omp_lock_t *Lock) {
   // TODO: not sure spinning is a good idea here..
@@ -229,8 +233,12 @@
 
 void fence::system(int Ordering) { impl::fenceSystem(Ordering); }
 
-uint32_t atomic::read(uint32_t *Addr, int Ordering) {
-  return impl::atomicRead(Addr, Ordering);
+uint32_t atomic::load(uint32_t *Addr, int Ordering) {
+  return impl::atomicLoad(Addr, Ordering);
+}
+
+void atomic::store(uint32_t *Addr, uint32_t V, int Ordering) {
+  impl::atomicStore(Addr, V, Ordering);
 }
 
 uint32_t atomic::inc(uint32_t *Addr, uint32_t V, int Ordering) {
@@ -300,7 +308,7 @@
 
 void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); }
 
-void omp_destroy_lock(omp_lock_t *Lock) { impl::destoryLock(Lock); }
+void omp_destroy_lock(omp_lock_t *Lock) { impl::destroyLock(Lock); }
 
 void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); }
 
diff --git a/libomptarget/DeviceRTL/src/Utils.cpp b/libomptarget/DeviceRTL/src/Utils.cpp
index f11c54e..3f65f21 100644
--- a/libomptarget/DeviceRTL/src/Utils.cpp
+++ b/libomptarget/DeviceRTL/src/Utils.cpp
@@ -35,8 +35,9 @@
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
 
 void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
-  *LowBits = (uint32_t)(Val & UINT64_C(0x00000000FFFFFFFF));
-  *HighBits = (uint32_t)((Val & UINT64_C(0xFFFFFFFF00000000)) >> 32);
+  static_assert(sizeof(unsigned long) == 8, "");
+  *LowBits = (uint32_t)(Val & 0x00000000FFFFFFFFUL);
+  *HighBits = (uint32_t)((Val & 0xFFFFFFFF00000000UL) >> 32);
 }
 
 uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
@@ -75,7 +76,7 @@
 
 int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
   int Width = mapping::getWarpSize();
-  int Self = mapping::getgetThreadIdInWarp();
+  int Self = mapping::getThreadIdInWarp();
   int Index = SrcLane + (Self & ~(Width - 1));
   return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
 }