diff --git a/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip b/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip
index abcc67e..2261505 100644
--- a/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip
+++ b/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip
@@ -18,15 +18,15 @@
 
 #include "common/debug.h"
 
-static DEVICE void warn() {
+static void warn() {
   PRINT0(LD_ALL, "Locks are not supported in this thread mapping model");
 }
 
-DEVICE void __kmpc_impl_init_lock(omp_lock_t *) { warn(); }
-DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *) { warn(); }
-DEVICE void __kmpc_impl_set_lock(omp_lock_t *) { warn(); }
-DEVICE void __kmpc_impl_unset_lock(omp_lock_t *) { warn(); }
-DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) {
+void __kmpc_impl_init_lock(omp_lock_t *) { warn(); }
+void __kmpc_impl_destroy_lock(omp_lock_t *) { warn(); }
+void __kmpc_impl_set_lock(omp_lock_t *) { warn(); }
+void __kmpc_impl_unset_lock(omp_lock_t *) { warn(); }
+int __kmpc_impl_test_lock(omp_lock_t *lock) {
   warn();
   return 0;
 }
diff --git a/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
index b8d112c..197b7d7 100644
--- a/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
+++ b/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
@@ -25,9 +25,8 @@
 #define PRId64 "ld"
 #define PRIu64 "lu"
 
-#define DEVICE
-#define INLINE inline DEVICE
-#define NOINLINE __attribute__((noinline)) DEVICE
+#define INLINE inline
+#define NOINLINE __attribute__((noinline))
 #define ALIGN(N) __attribute__((aligned(N)))
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
index 56ecab0..63a7091 100644
--- a/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ b/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@@ -107,12 +107,12 @@
 }
 
 namespace {
-DEVICE uint32_t get_grid_dim(uint32_t n, uint16_t d) {
+uint32_t get_grid_dim(uint32_t n, uint16_t d) {
   uint32_t q = n / d;
   return q + (n > q * d);
 }
-DEVICE uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size,
-                                  uint16_t group_size) {
+uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size,
+                           uint16_t group_size) {
   uint32_t r = grid_size - group_id * group_size;
   return (r < group_size) ? r : group_size;
 }
@@ -140,36 +140,35 @@
 }
 
 // Atomics
-DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
+uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
   return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
 }
-DEVICE uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
+uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
   return __builtin_amdgcn_atomic_inc32(Address, Val, __ATOMIC_SEQ_CST, "");
 }
-DEVICE uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
+uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
   return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST);
 }
 
-DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
+uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
   uint32_t R;
   __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
   return R;
 }
-DEVICE uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare,
-                                  uint32_t Val) {
+uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, uint32_t Val) {
   (void)__atomic_compare_exchange(Address, &Compare, &Val, false,
                                   __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
   return Compare;
 }
 
-DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
-                                                 unsigned long long Val) {
+unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
+                                          unsigned long long Val) {
   unsigned long long R;
   __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
   return R;
 }
-DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *Address,
-                                            unsigned long long Val) {
+unsigned long long __kmpc_atomic_add(unsigned long long *Address,
+                                     unsigned long long Val) {
   return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
 }
 
diff --git a/libomptarget/deviceRTLs/common/device_environment.h b/libomptarget/deviceRTLs/common/device_environment.h
index 68a7757..d1629f8 100644
--- a/libomptarget/deviceRTLs/common/device_environment.h
+++ b/libomptarget/deviceRTLs/common/device_environment.h
@@ -19,6 +19,6 @@
   int32_t debug_level;
 };
 
-extern DEVICE omptarget_device_environmentTy omptarget_device_environment;
+extern omptarget_device_environmentTy omptarget_device_environment;
 
 #endif
diff --git a/libomptarget/deviceRTLs/common/omptarget.h b/libomptarget/deviceRTLs/common/omptarget.h
index dde5652..d774b5d 100644
--- a/libomptarget/deviceRTLs/common/omptarget.h
+++ b/libomptarget/deviceRTLs/common/omptarget.h
@@ -73,8 +73,7 @@
   uint32_t nArgs;
 };
 
-extern DEVICE
-    omptarget_nvptx_SharedArgs EXTERN_SHARED(omptarget_nvptx_globalArgs);
+extern omptarget_nvptx_SharedArgs EXTERN_SHARED(omptarget_nvptx_globalArgs);
 
 // Worker slot type which is initialized with the default worker slot
 // size of 4*32 bytes.
@@ -96,7 +95,7 @@
   __kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number];
 };
 
-extern DEVICE DataSharingStateTy EXTERN_SHARED(DataSharingState);
+extern DataSharingStateTy EXTERN_SHARED(DataSharingState);
 
 ////////////////////////////////////////////////////////////////////////////////
 // task ICV and (implicit & explicit) task state
@@ -294,25 +293,23 @@
 // global data tables
 ////////////////////////////////////////////////////////////////////////////////
 
-extern DEVICE omptarget_nvptx_SimpleMemoryManager
-    omptarget_nvptx_simpleMemoryManager;
-extern DEVICE uint32_t EXTERN_SHARED(usedMemIdx);
-extern DEVICE uint32_t EXTERN_SHARED(usedSlotIdx);
+extern omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager;
+extern uint32_t EXTERN_SHARED(usedMemIdx);
+extern uint32_t EXTERN_SHARED(usedSlotIdx);
 #if _OPENMP
-extern DEVICE uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
+extern uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
 #pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
 #else
-extern DEVICE
-    uint8_t EXTERN_SHARED(parallelLevel)[MAX_THREADS_PER_TEAM / WARPSIZE];
+extern uint8_t EXTERN_SHARED(parallelLevel)[MAX_THREADS_PER_TEAM / WARPSIZE];
 #endif
-extern DEVICE uint16_t EXTERN_SHARED(threadLimit);
-extern DEVICE uint16_t EXTERN_SHARED(threadsInTeam);
-extern DEVICE uint16_t EXTERN_SHARED(nThreads);
-extern DEVICE omptarget_nvptx_ThreadPrivateContext *
+extern uint16_t EXTERN_SHARED(threadLimit);
+extern uint16_t EXTERN_SHARED(threadsInTeam);
+extern uint16_t EXTERN_SHARED(nThreads);
+extern omptarget_nvptx_ThreadPrivateContext *
     EXTERN_SHARED(omptarget_nvptx_threadPrivateContext);
 
-extern DEVICE uint32_t EXTERN_SHARED(execution_param);
-extern DEVICE void *EXTERN_SHARED(ReductionScratchpadPtr);
+extern uint32_t EXTERN_SHARED(execution_param);
+extern void *EXTERN_SHARED(ReductionScratchpadPtr);
 
 ////////////////////////////////////////////////////////////////////////////////
 // work function (outlined parallel/simd functions) and arguments.
@@ -320,8 +317,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 
 typedef void *omptarget_nvptx_WorkFn;
-extern volatile DEVICE
-    omptarget_nvptx_WorkFn EXTERN_SHARED(omptarget_nvptx_workFn);
+extern volatile omptarget_nvptx_WorkFn EXTERN_SHARED(omptarget_nvptx_workFn);
 
 ////////////////////////////////////////////////////////////////////////////////
 // get private data structures
diff --git a/libomptarget/deviceRTLs/common/src/omp_data.cu b/libomptarget/deviceRTLs/common/src/omp_data.cu
index 4736d07..e54e0f0 100644
--- a/libomptarget/deviceRTLs/common/src/omp_data.cu
+++ b/libomptarget/deviceRTLs/common/src/omp_data.cu
@@ -19,30 +19,30 @@
 // global device environment
 ////////////////////////////////////////////////////////////////////////////////
 
-DEVICE omptarget_device_environmentTy omptarget_device_environment;
+omptarget_device_environmentTy omptarget_device_environment;
 
 ////////////////////////////////////////////////////////////////////////////////
 // global data holding OpenMP state information
 ////////////////////////////////////////////////////////////////////////////////
 
 // OpenMP will try to call its ctor if we don't add the attribute explicitly
-[[clang::loader_uninitialized]] DEVICE
-    omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
-        omptarget_nvptx_device_State[MAX_SM];
+[[clang::loader_uninitialized]] omptarget_nvptx_Queue<
+    omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
+    omptarget_nvptx_device_State[MAX_SM];
 
-DEVICE omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager;
-DEVICE uint32_t SHARED(usedMemIdx);
-DEVICE uint32_t SHARED(usedSlotIdx);
+omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager;
+uint32_t SHARED(usedMemIdx);
+uint32_t SHARED(usedSlotIdx);
 
 // SHARED doesn't work with array so we add the attribute explicitly.
-[[clang::loader_uninitialized]] DEVICE uint8_t
+[[clang::loader_uninitialized]] uint8_t
     parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
 #pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
-DEVICE uint16_t SHARED(threadLimit);
-DEVICE uint16_t SHARED(threadsInTeam);
-DEVICE uint16_t SHARED(nThreads);
+uint16_t SHARED(threadLimit);
+uint16_t SHARED(threadsInTeam);
+uint16_t SHARED(nThreads);
 // Pointer to this team's OpenMP state object
-DEVICE omptarget_nvptx_ThreadPrivateContext *
+omptarget_nvptx_ThreadPrivateContext *
     SHARED(omptarget_nvptx_threadPrivateContext);
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -50,26 +50,26 @@
 // communicate with the workers.  Since it is in shared memory, there is one
 // copy of these variables for each kernel, instance, and team.
 ////////////////////////////////////////////////////////////////////////////////
-volatile DEVICE omptarget_nvptx_WorkFn SHARED(omptarget_nvptx_workFn);
+volatile omptarget_nvptx_WorkFn SHARED(omptarget_nvptx_workFn);
 
 ////////////////////////////////////////////////////////////////////////////////
 // OpenMP kernel execution parameters
 ////////////////////////////////////////////////////////////////////////////////
-DEVICE uint32_t SHARED(execution_param);
+uint32_t SHARED(execution_param);
 
 ////////////////////////////////////////////////////////////////////////////////
 // Data sharing state
 ////////////////////////////////////////////////////////////////////////////////
-DEVICE DataSharingStateTy SHARED(DataSharingState);
+DataSharingStateTy SHARED(DataSharingState);
 
 ////////////////////////////////////////////////////////////////////////////////
 // Scratchpad for teams reduction.
 ////////////////////////////////////////////////////////////////////////////////
-DEVICE void *SHARED(ReductionScratchpadPtr);
+void *SHARED(ReductionScratchpadPtr);
 
 ////////////////////////////////////////////////////////////////////////////////
 // Data sharing related variables.
 ////////////////////////////////////////////////////////////////////////////////
-DEVICE omptarget_nvptx_SharedArgs SHARED(omptarget_nvptx_globalArgs);
+omptarget_nvptx_SharedArgs SHARED(omptarget_nvptx_globalArgs);
 
 #pragma omp end declare target
diff --git a/libomptarget/deviceRTLs/common/src/omptarget.cu b/libomptarget/deviceRTLs/common/src/omptarget.cu
index 39b7b5c..e19d67a 100644
--- a/libomptarget/deviceRTLs/common/src/omptarget.cu
+++ b/libomptarget/deviceRTLs/common/src/omptarget.cu
@@ -18,9 +18,9 @@
 // global data tables
 ////////////////////////////////////////////////////////////////////////////////
 
-extern DEVICE
-    omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
-        omptarget_nvptx_device_State[MAX_SM];
+extern omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext,
+                             OMP_STATE_COUNT>
+    omptarget_nvptx_device_State[MAX_SM];
 
 ////////////////////////////////////////////////////////////////////////////////
 // init entry points
diff --git a/libomptarget/deviceRTLs/common/src/reduction.cu b/libomptarget/deviceRTLs/common/src/reduction.cu
index 9daa78d..4054a6e 100644
--- a/libomptarget/deviceRTLs/common/src/reduction.cu
+++ b/libomptarget/deviceRTLs/common/src/reduction.cu
@@ -174,8 +174,8 @@
 
 INLINE static uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
 
-DEVICE static volatile uint32_t IterCnt = 0;
-DEVICE static volatile uint32_t Cnt = 0;
+static volatile uint32_t IterCnt = 0;
+static volatile uint32_t Cnt = 0;
 EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
     kmp_Ident *loc, int32_t global_tid, void *global_buffer,
     int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
diff --git a/libomptarget/deviceRTLs/common/src/support.cu b/libomptarget/deviceRTLs/common/src/support.cu
index ca0ce20..cd17a7a 100644
--- a/libomptarget/deviceRTLs/common/src/support.cu
+++ b/libomptarget/deviceRTLs/common/src/support.cu
@@ -19,20 +19,20 @@
 // Execution Parameters
 ////////////////////////////////////////////////////////////////////////////////
 
-DEVICE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) {
+void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) {
   execution_param = EMode;
   execution_param |= RMode;
 }
 
-DEVICE bool isGenericMode() { return (execution_param & ModeMask) == Generic; }
+bool isGenericMode() { return (execution_param & ModeMask) == Generic; }
 
-DEVICE bool isSPMDMode() { return (execution_param & ModeMask) == Spmd; }
+bool isSPMDMode() { return (execution_param & ModeMask) == Spmd; }
 
-DEVICE bool isRuntimeUninitialized() {
+bool isRuntimeUninitialized() {
   return (execution_param & RuntimeMask) == RuntimeUninitialized;
 }
 
-DEVICE bool isRuntimeInitialized() {
+bool isRuntimeInitialized() {
   return (execution_param & RuntimeMask) == RuntimeInitialized;
 }
 
@@ -40,7 +40,7 @@
 // Execution Modes based on location parameter fields
 ////////////////////////////////////////////////////////////////////////////////
 
-DEVICE bool checkSPMDMode(kmp_Ident *loc) {
+bool checkSPMDMode(kmp_Ident *loc) {
   if (!loc)
     return isSPMDMode();
 
@@ -58,9 +58,9 @@
   return isSPMDMode();
 }
 
-DEVICE bool checkGenericMode(kmp_Ident *loc) { return !checkSPMDMode(loc); }
+bool checkGenericMode(kmp_Ident *loc) { return !checkSPMDMode(loc); }
 
-DEVICE bool checkRuntimeUninitialized(kmp_Ident *loc) {
+bool checkRuntimeUninitialized(kmp_Ident *loc) {
   if (!loc)
     return isRuntimeUninitialized();
 
@@ -83,7 +83,7 @@
   return isRuntimeUninitialized();
 }
 
-DEVICE bool checkRuntimeInitialized(kmp_Ident *loc) {
+bool checkRuntimeInitialized(kmp_Ident *loc) {
   return !checkRuntimeUninitialized(loc);
 }
 
@@ -105,13 +105,13 @@
 //      If NumThreads is 1024, master id is 992.
 //
 // Called in Generic Execution Mode only.
-DEVICE int GetMasterThreadID() {
+int GetMasterThreadID() {
   return (GetNumberOfThreadsInBlock() - 1) & ~(WARPSIZE - 1);
 }
 
 // The last warp is reserved for the master; other warps are workers.
 // Called in Generic Execution Mode only.
-DEVICE int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); }
+int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); }
 
 ////////////////////////////////////////////////////////////////////////////////
 // get thread id in team
@@ -120,7 +120,7 @@
 // or a serial region by the master.  If the master (whose CUDA thread
 // id is GetMasterThreadID()) calls this routine, we return 0 because
 // it is a shadow for the first worker.
-DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) {
+int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) {
   // Implemented using control flow (predication) instead of with a modulo
   // operation.
   int tid = GetThreadIdInBlock();
@@ -136,7 +136,7 @@
 //
 ////////////////////////////////////////////////////////////////////////////////
 
-DEVICE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) {
+int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) {
   // omp_thread_num
   int rc;
   if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) {
@@ -152,7 +152,7 @@
   return rc;
 }
 
-DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode) {
+int GetNumberOfOmpThreads(bool isSPMDExecutionMode) {
   // omp_num_threads
   int rc;
   int Level = parallelLevel[GetWarpId()];
@@ -170,12 +170,12 @@
 ////////////////////////////////////////////////////////////////////////////////
 // Team id linked to OpenMP
 
-DEVICE int GetOmpTeamId() {
+int GetOmpTeamId() {
   // omp_team_num
   return GetBlockIdInKernel(); // assume 1 block per team
 }
 
-DEVICE int GetNumberOfOmpTeams() {
+int GetNumberOfOmpTeams() {
   // omp_num_teams
   return GetNumberOfBlocksInKernel(); // assume 1 block per team
 }
@@ -183,12 +183,12 @@
 ////////////////////////////////////////////////////////////////////////////////
 // Masters
 
-DEVICE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
+int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
 
 ////////////////////////////////////////////////////////////////////////////////
 // Parallel level
 
-DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
+void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
   __kmpc_impl_syncwarp(Mask);
   __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
   unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt);
@@ -200,7 +200,7 @@
   __kmpc_impl_syncwarp(Mask);
 }
 
-DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
+void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
   __kmpc_impl_syncwarp(Mask);
   __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
   unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt);
@@ -216,13 +216,13 @@
 // get OpenMP number of procs
 
 // Get the number of processors in the device.
-DEVICE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) {
+int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) {
   if (!isSPMDExecutionMode)
     return GetNumberOfWorkersInTeam();
   return GetNumberOfThreadsInBlock();
 }
 
-DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) {
+int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) {
   return GetNumberOfProcsInDevice(isSPMDExecutionMode);
 }
 
@@ -230,8 +230,8 @@
 // Memory
 ////////////////////////////////////////////////////////////////////////////////
 
-DEVICE unsigned long PadBytes(unsigned long size,
-                              unsigned long alignment) // must be a power of 2
+unsigned long PadBytes(unsigned long size,
+                       unsigned long alignment) // must be a power of 2
 {
   // compute the necessary padding to satisfy alignment constraint
   ASSERT(LT_FUSSY, (alignment & (alignment - 1)) == 0,
@@ -239,7 +239,7 @@
   return (~(unsigned long)size + 1) & (alignment - 1);
 }
 
-DEVICE void *SafeMalloc(size_t size, const char *msg) // check if success
+void *SafeMalloc(size_t size, const char *msg) // check if success
 {
   void *ptr = __kmpc_impl_malloc(size);
   PRINT(LD_MEM, "malloc data of size %llu for %s: 0x%llx\n",
@@ -247,7 +247,7 @@
   return ptr;
 }
 
-DEVICE void *SafeFree(void *ptr, const char *msg) {
+void *SafeFree(void *ptr, const char *msg) {
   PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", (unsigned long long)ptr, msg);
   __kmpc_impl_free(ptr);
   return NULL;
@@ -257,11 +257,11 @@
 // Teams Reduction Scratchpad Helpers
 ////////////////////////////////////////////////////////////////////////////////
 
-DEVICE unsigned int *GetTeamsReductionTimestamp() {
+unsigned int *GetTeamsReductionTimestamp() {
   return static_cast<unsigned int *>(ReductionScratchpadPtr);
 }
 
-DEVICE char *GetTeamsReductionScratchpad() {
+char *GetTeamsReductionScratchpad() {
   return static_cast<char *>(ReductionScratchpadPtr) + 256;
 }
 
diff --git a/libomptarget/deviceRTLs/common/support.h b/libomptarget/deviceRTLs/common/support.h
index 4d260de..eae5954 100644
--- a/libomptarget/deviceRTLs/common/support.h
+++ b/libomptarget/deviceRTLs/common/support.h
@@ -31,59 +31,59 @@
   RuntimeMask = 0x02u,
 };
 
-DEVICE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode);
-DEVICE bool isGenericMode();
-DEVICE bool isSPMDMode();
-DEVICE bool isRuntimeUninitialized();
-DEVICE bool isRuntimeInitialized();
+void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode);
+bool isGenericMode();
+bool isSPMDMode();
+bool isRuntimeUninitialized();
+bool isRuntimeInitialized();
 
 ////////////////////////////////////////////////////////////////////////////////
 // Execution Modes based on location parameter fields
 ////////////////////////////////////////////////////////////////////////////////
 
-DEVICE bool checkSPMDMode(kmp_Ident *loc);
-DEVICE bool checkGenericMode(kmp_Ident *loc);
-DEVICE bool checkRuntimeUninitialized(kmp_Ident *loc);
-DEVICE bool checkRuntimeInitialized(kmp_Ident *loc);
+bool checkSPMDMode(kmp_Ident *loc);
+bool checkGenericMode(kmp_Ident *loc);
+bool checkRuntimeUninitialized(kmp_Ident *loc);
+bool checkRuntimeInitialized(kmp_Ident *loc);
 
 ////////////////////////////////////////////////////////////////////////////////
 // get info from machine
 ////////////////////////////////////////////////////////////////////////////////
 
-// get global ids to locate tread/team info (constant regardless of OMP)
+// get global ids to locate thread/team info (constant regardless of OMP)
-DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode);
-DEVICE int GetMasterThreadID();
-DEVICE int GetNumberOfWorkersInTeam();
+int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode);
+int GetMasterThreadID();
+int GetNumberOfWorkersInTeam();
 
 // get OpenMP thread and team ids
-DEVICE int GetOmpThreadId(int threadId,
-                          bool isSPMDExecutionMode); // omp_thread_num
-DEVICE int GetOmpTeamId();                           // omp_team_num
+int GetOmpThreadId(int threadId,
+                   bool isSPMDExecutionMode); // omp_thread_num
+int GetOmpTeamId();                           // omp_team_num
 
 // get OpenMP number of threads and team
-DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
-DEVICE int GetNumberOfOmpTeams();                           // omp_num_teams
+int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
+int GetNumberOfOmpTeams();                           // omp_num_teams
 
 // get OpenMP number of procs
-DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
-DEVICE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
+int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
+int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
 
 // masters
-DEVICE int IsTeamMaster(int ompThreadId);
+int IsTeamMaster(int ompThreadId);
 
 // Parallel level
-DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
-DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
+void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
+void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
 
 ////////////////////////////////////////////////////////////////////////////////
 // Memory
 ////////////////////////////////////////////////////////////////////////////////
 
 // safe alloc and free
-DEVICE void *SafeMalloc(size_t size, const char *msg); // check if success
-DEVICE void *SafeFree(void *ptr, const char *msg);
+void *SafeMalloc(size_t size, const char *msg); // check if success
+void *SafeFree(void *ptr, const char *msg);
-// pad to a alignment (power of 2 only)
+// pad to an alignment (power of 2 only)
-DEVICE unsigned long PadBytes(unsigned long size, unsigned long alignment);
+unsigned long PadBytes(unsigned long size, unsigned long alignment);
 #define ADD_BYTES(_addr, _bytes)                                               \
   ((void *)((char *)((void *)(_addr)) + (_bytes)))
 #define SUB_BYTES(_addr, _bytes)                                               \
@@ -92,7 +92,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 // Teams Reduction Scratchpad Helpers
 ////////////////////////////////////////////////////////////////////////////////
-DEVICE unsigned int *GetTeamsReductionTimestamp();
-DEVICE char *GetTeamsReductionScratchpad();
+unsigned int *GetTeamsReductionTimestamp();
+char *GetTeamsReductionScratchpad();
 
 #endif
diff --git a/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
index b856b41..eafa734 100644
--- a/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -97,39 +97,38 @@
 EXTERN unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
 
 // Atomics
-DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
+uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
   return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
 }
-DEVICE uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
+uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
   return __nvvm_atom_inc_gen_ui(Address, Val);
 }
 
-DEVICE uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
+uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
   return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST);
 }
 
-DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
+uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
   uint32_t R;
   __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
   return R;
 }
 
-DEVICE uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare,
-                                  uint32_t Val) {
+uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, uint32_t Val) {
   (void)__atomic_compare_exchange(Address, &Compare, &Val, false,
                                   __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
   return Compare;
 }
 
-DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
-                                                 unsigned long long Val) {
+unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
+                                          unsigned long long Val) {
   unsigned long long R;
   __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
   return R;
 }
 
-DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *Address,
-                                            unsigned long long Val) {
+unsigned long long __kmpc_atomic_add(unsigned long long *Address,
+                                     unsigned long long Val) {
   return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
 }
 
diff --git a/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/libomptarget/deviceRTLs/nvptx/src/target_impl.h
index e92ada5..d0d7127 100644
--- a/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -18,8 +18,7 @@
 
 #include "nvptx_interface.h"
 
-#define DEVICE
-#define INLINE inline __attribute__((always_inline)) DEVICE
+#define INLINE inline __attribute__((always_inline))
 #define NOINLINE __attribute__((noinline))
 #define ALIGN(N) __attribute__((aligned(N)))
 
diff --git a/libomptarget/deviceRTLs/target_interface.h b/libomptarget/deviceRTLs/target_interface.h
index 92fca47..c5141c9 100644
--- a/libomptarget/deviceRTLs/target_interface.h
+++ b/libomptarget/deviceRTLs/target_interface.h
@@ -25,16 +25,15 @@
 EXTERN unsigned GetLaneId();
 
 // Atomics
-DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
-DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
-DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
-DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
-DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
+uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
+uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
+uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
+uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
+uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
 static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
-DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *,
-                                                 unsigned long long);
-DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *,
-                                            unsigned long long);
+unsigned long long __kmpc_atomic_exchange(unsigned long long *,
+                                          unsigned long long);
+unsigned long long __kmpc_atomic_add(unsigned long long *, unsigned long long);
 
 // Locks
 EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock);
