| //===---------- target_impl.cu - NVPTX OpenMP GPU options ------- CUDA -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
// Definitions of target-specific functions
| // |
| //===----------------------------------------------------------------------===// |
| #pragma omp declare target |
| |
| #include "target_impl.h" |
| #include "common/debug.h" |
| |
| #include <cuda.h> |
| |
// Forward declarations of CUDA primitives which will eventually be
// transformed into LLVM intrinsics.
| extern "C" { |
| unsigned int __activemask(); |
| unsigned int __ballot(unsigned); |
// The default width argument is based on NVIDIA's website
// https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/
int __shfl_sync(unsigned mask, int val, int src_lane, int width = WARPSIZE);
int __shfl(int val, int src_lane, int width = WARPSIZE);
int __shfl_down(int var, unsigned delta, int width);
int __shfl_down_sync(unsigned mask, int var, unsigned delta, int width);
| void __syncwarp(int mask); |
| } |
| |
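// __kmpc_impl_unpack splits a 64-bit value into its low and high 32-bit
// halves, and __kmpc_impl_pack reassembles them, using the PTX mov.b64 form
// that maps a 64-bit register onto a pair of 32-bit registers.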
| DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { |
| asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); |
| } |
| |
| DEVICE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) { |
| uint64_t val; |
| asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi)); |
| return val; |
| } |
| |
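// %lanemask_lt is a 32-bit mask with a bit set for each lane of the warp
// whose id is lower than the current lane's.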
| DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() { |
| __kmpc_impl_lanemask_t res; |
| asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res)); |
| return res; |
| } |
| |
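// %lanemask_gt is the complementary mask: a bit is set for each lane of the
// warp whose id is greater than the current lane's.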
| DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() { |
| __kmpc_impl_lanemask_t res; |
| asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res)); |
| return res; |
| } |
| |
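// %smid holds the id of the streaming multiprocessor the thread is running
// on.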
| DEVICE uint32_t __kmpc_impl_smid() { |
| uint32_t id; |
| asm("mov.u32 %0, %%smid;" : "=r"(id)); |
| return id; |
| } |
| |
| DEVICE double __kmpc_impl_get_wtick() { |
| // Timer precision is 1ns |
| return ((double)1E-9); |
| } |
| |
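// %globaltimer is a 64-bit global timer whose unit is nanoseconds, matching
// the 1ns tick reported by __kmpc_impl_get_wtick().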
| DEVICE double __kmpc_impl_get_wtime() { |
| unsigned long long nsecs; |
| asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs)); |
| return (double)nsecs * __kmpc_impl_get_wtick(); |
| } |
| |
// In CUDA 9.0, the __ballot(1) idiom from CUDA 8.0 is replaced with
// __activemask().
| DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { |
| #if CUDA_VERSION >= 9000 |
| return __activemask(); |
| #else |
| return __ballot(1); |
| #endif |
| } |
| |
// In CUDA 9.0, the *_sync() versions take an extra 'mask' argument.
| DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, |
| int32_t SrcLane) { |
| #if CUDA_VERSION >= 9000 |
| return __shfl_sync(Mask, Var, SrcLane); |
| #else |
| return __shfl(Var, SrcLane); |
| #endif // CUDA_VERSION |
| } |
| |
| DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask, |
| int32_t Var, uint32_t Delta, |
| int32_t Width) { |
| #if CUDA_VERSION >= 9000 |
| return __shfl_down_sync(Mask, Var, Delta, Width); |
| #else |
| return __shfl_down(Var, Delta, Width); |
| #endif // CUDA_VERSION |
| } |
| |
| DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); } |
| |
| DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) { |
| #if CUDA_VERSION >= 9000 |
| __syncwarp(Mask); |
| #else |
  // In CUDA < 9.0 (pre-Volta targets) threads in a warp execute in lockstep,
  // so no explicit warp synchronization is needed.
| #endif // CUDA_VERSION |
| } |
| |
| // NVPTX specific kernel initialization |
DEVICE void __kmpc_impl_target_init() { /* nvptx needs no extra setup */ }
| |
| // Barrier until num_threads arrive. |
| DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) { |
  // Named barrier used by the active parallel threads of a team in an L1
  // parallel region to synchronize with each other.
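  // Barrier #0 is the one __syncthreads lowers to (bar.sync 0), so this
  // routine claims named barrier #1 for the team's workers.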
| int barrier = 1; |
| asm volatile("bar.sync %0, %1;" |
| : |
| : "r"(barrier), "r"(num_threads) |
| : "memory"); |
| } |
| |
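// Memory fences; these map to CUDA's __threadfence(), __threadfence_block()
// and __threadfence_system(), respectively.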
| DEVICE void __kmpc_impl_threadfence() { __nvvm_membar_gl(); } |
| DEVICE void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); } |
| DEVICE void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); } |
| |
| // Calls to the NVPTX layer (assuming 1D layout) |
| DEVICE int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); } |
| DEVICE int GetBlockIdInKernel() { return __nvvm_read_ptx_sreg_ctaid_x(); } |
| DEVICE int GetNumberOfBlocksInKernel() { |
| return __nvvm_read_ptx_sreg_nctaid_x(); |
| } |
| DEVICE int GetNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); } |
| DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } |
| DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); } |
| |
| // Atomics |
| DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) { |
| return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST); |
| } |
| DEVICE uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) { |
| return __nvvm_atom_inc_gen_ui(Address, Val); |
| } |
| |
| DEVICE uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) { |
| return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST); |
| } |
| |
| DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) { |
| uint32_t R; |
| __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST); |
| return R; |
| } |
| |
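// Returns the value *Address held before the operation: on failure
// __atomic_compare_exchange writes the observed value back into Compare, and
// on success Compare already equals it.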
| DEVICE uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, |
| uint32_t Val) { |
| (void)__atomic_compare_exchange(Address, &Compare, &Val, false, |
| __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); |
| return Compare; |
| } |
| |
| DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *Address, |
| unsigned long long Val) { |
| unsigned long long R; |
| __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST); |
| return R; |
| } |
| |
| DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *Address, |
| unsigned long long Val) { |
| return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST); |
| } |
| |
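// A simple spin lock: the lock word is UNSET (0) when free and SET (1) when
// held; __OMP_SPIN scales the backoff delay, measured in clock cycles.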
| #define __OMP_SPIN 1000 |
| #define UNSET 0u |
| #define SET 1u |
| |
| DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock) { |
| __kmpc_impl_unset_lock(lock); |
| } |
| |
| DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock) { |
| __kmpc_impl_unset_lock(lock); |
| } |
| |
| DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock) { |
  // TODO: it is unclear whether spinning is the best strategy here.
| while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) { |
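    // Back off before retrying the CAS, for a cycle count proportional to
    // the block id so that contending blocks retry at staggered times; the
    // cycle computation below handles clock counter wrap-around.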
| int32_t start = __nvvm_read_ptx_sreg_clock(); |
| int32_t now; |
| for (;;) { |
| now = __nvvm_read_ptx_sreg_clock(); |
| int32_t cycles = now > start ? now - start : now + (0xffffffff - start); |
| if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) { |
| break; |
| } |
| } |
  } // Retry until the CAS reads UNSET.
| } |
| |
| DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock) { |
| (void)__kmpc_atomic_exchange(lock, UNSET); |
| } |
| |
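// An atomic add of zero is an atomic read of the lock word.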
| DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) { |
| return __kmpc_atomic_add(lock, 0u); |
| } |
| |
| DEVICE void *__kmpc_impl_malloc(size_t x) { return malloc(x); } |
| DEVICE void __kmpc_impl_free(void *x) { free(x); } |
| |
| #pragma omp end declare target |