| //===------------ target_impl.h - NVPTX OpenMP GPU options ------- CUDA -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // Definitions of target specific functions |
| // |
| //===----------------------------------------------------------------------===// |
| #ifndef _TARGET_IMPL_H_ |
| #define _TARGET_IMPL_H_ |
| |
| #include <stdint.h> |
| |
| #include "option.h" |
| |
// Split a 64-bit value into its low and high 32-bit halves. Uses the PTX
// mov.b64 unpack form, which writes the low word to \p lo and the high
// word to \p hi.
INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
}
| |
// Combine two 32-bit halves into a single 64-bit value (inverse of
// __kmpc_impl_unpack). \p lo becomes the low word, \p hi the high word.
INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
  uint64_t val;
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
  return val;
}
| |
// Lane mask with all 32 bits set: selects every lane of a full warp.
static const __kmpc_impl_lanemask_t __kmpc_impl_all_lanes =
    UINT32_C(0xffffffff);
| |
// Return a mask of the lanes in this warp with IDs lower than the calling
// lane's, read from the PTX special register %lanemask_lt.
INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
  __kmpc_impl_lanemask_t res;
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res));
  return res;
}
| |
// Return a mask of the lanes in this warp with IDs higher than the calling
// lane's, read from the PTX special register %lanemask_gt.
INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
  __kmpc_impl_lanemask_t res;
  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res));
  return res;
}
| |
// Find-first-set: delegate to the CUDA __ffs intrinsic (1-based bit index;
// presumably 0 when no bit is set — per the device intrinsic's contract).
INLINE uint32_t __kmpc_impl_ffs(uint32_t x) {
  uint32_t const Pos = __ffs(x);
  return Pos;
}
| |
// Population count: delegate to the CUDA __popc intrinsic.
INLINE uint32_t __kmpc_impl_popc(uint32_t x) {
  uint32_t const Count = __popc(x);
  return Count;
}
| |
| #ifndef CUDA_VERSION |
| #error CUDA_VERSION macro is undefined, something wrong with cuda. |
| #endif |
| |
| // In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask(). |
| |
// Return the mask of currently active lanes in the calling warp.
// CUDA >= 9.0 provides __activemask(); older toolkits emulate it with
// __ballot(1), which sets a bit for every active lane voting "true".
INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
#if CUDA_VERSION >= 9000
  return __activemask();
#else
  return __ballot(1);
#endif
}
| |
| // In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. |
| |
// Broadcast \p Var from lane \p SrcLane to the lanes in \p Mask.
// CUDA >= 9.0 requires the *_sync form that takes the participating-lane
// mask; the pre-9.0 __shfl has no mask parameter, so Mask is ignored there.
INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
                                     int32_t SrcLane) {
#if CUDA_VERSION >= 9000
  return __shfl_sync(Mask, Var, SrcLane);
#else
  return __shfl(Var, SrcLane);
#endif // CUDA_VERSION
}
| |
// Shuffle \p Var down the warp by \p Delta lanes within groups of
// \p Width lanes. As with __kmpc_impl_shfl_sync, the Mask argument only
// reaches the hardware on CUDA >= 9.0 (*_sync variant).
INLINE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
                                          int32_t Var, uint32_t Delta,
                                          int32_t Width) {
#if CUDA_VERSION >= 9000
  return __shfl_down_sync(Mask, Var, Delta, Width);
#else
  return __shfl_down(Var, Delta, Width);
#endif // CUDA_VERSION
}
| |
// Block-wide barrier for all threads of the thread block.
INLINE void __kmpc_impl_syncthreads() {
  // Use original __syncthreads if compiled by nvcc or clang >= 9.0.
  // Older clang versions fall back to the equivalent raw PTX barrier
  // (bar.sync 0) with a memory clobber so the compiler does not reorder
  // memory accesses across it.
#if !defined(__clang__) || __clang_major__ >= 9
  __syncthreads();
#else
  asm volatile("bar.sync %0;" : : "r"(0) : "memory");
#endif // __clang__
}
| |
// Synchronize the lanes named in \p Mask within the calling warp.
// Only meaningful on CUDA >= 9.0 (independent thread scheduling); before
// that, warps execute in lockstep and no explicit sync is required.
INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
#if CUDA_VERSION >= 9000
  __syncwarp(Mask);
#else
  // In Cuda < 9.0 no need to sync threads in warps.
#endif // CUDA_VERSION
}
| |
| #endif |