// libomptarget/deviceRTLs/common/include/target/shuffle.h - llvm-project/openmp - Git at Google

 //===- shuffle.h - OpenMP variants of the shuffle idiom for all targets -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // Shuffle function implementations for all supported targets.
 //
 // Note: We unify the mask type to uint64_t instead of __kmpc_impl_lanemask_t.
 //
 //===----------------------------------------------------------------------===//

 #ifndef LIBOMPTARGET_DEVICERTL_SHUFFLE_H
 #define LIBOMPTARGET_DEVICERTL_SHUFFLE_H

 #include <stdint.h>

 #pragma omp declare target

 /// External shuffle API
 ///
 ///{

 /// Prototypes of the externally visible shuffle entry points. They are given
 /// C linkage; no definition for either function appears in this header.
 extern "C" {
 /// Shuffle entry point for a 32-bit value.
 /// NOTE(review): `delta` and `size` are presumably the lane offset and the
 /// logical group width -- confirm against the definition.
 int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
 /// Shuffle entry point for a 64-bit value; same parameter list as the 32-bit
 /// form apart from the value type.
 int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
 }

 ///}

 /// Forward declarations
 ///
 ///{
 /// Helpers declared here with C linkage so the inline variants further down
 /// can call them; none of them is defined in this header.
 extern "C" {
 /// Lane id of the caller; read by the AMDGCN shuffle variants in this file.
 unsigned GetLaneId();
 /// Warp size query; called by the AMDGCN indexed shuffle and by the NVPTX
 /// shuffle-down variant in this file.
 unsigned __kmpc_get_warp_size();
 /// Not called anywhere in this header.
 /// NOTE(review): presumably splits `val` into its low and high 32-bit halves
 /// and stores them through `lo` / `hi` -- confirm against the definition.
 void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
 /// Not called anywhere in this header.
 /// NOTE(review): presumably the inverse of __kmpc_impl_unpack -- confirm.
 uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
 }
 ///}

 /// Fallback implementations of the shuffle sync idiom.
 /// Unavailable at present (would error at link time if used).
 ///
 ///{

 /// Base declaration of the indexed shuffle. It has no body here; the
 /// `declare variant` regions later in this file provide inline definitions
 /// for amdgcn and for nvptx/nvptx64.
 int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var, int32_t SrcLane);

 /// Base declaration of the shuffle-down operation. As with the indexed form,
 /// only the per-architecture variants later in this file carry a body.
 int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var, uint32_t Delta,
                                    int32_t Width);

 ///}

 /// AMDGCN implementations of the shuffle sync idiom.
 ///
 ///{
 #pragma omp begin declare variant match(device = {arch(amdgcn)})

 inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
                                      int32_t SrcLane) {
   int Width = __kmpc_get_warp_size();
   int Self = GetLaneId();
   int Index = SrcLane + (Self & ~(Width - 1));
   return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
 }

 inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
                                           uint32_t LaneDelta, int32_t Width) {
   int Self = GetLaneId();
   int Index = Self + LaneDelta;
   Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
   return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
 }

 #pragma omp end declare variant
 ///}

 /// NVPTX implementations of the shuffle and shuffle sync idiom.
 ///
 ///{
 #pragma omp begin declare variant match(                                       \
     device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})

 /// NVPTX indexed shuffle: forwards directly to __nvvm_shfl_sync_idx_i32.
 ///
 /// \param Mask    Passed through as the builtin's first operand.
 /// \param Var     Value to shuffle.
 /// \param SrcLane Lane operand passed through to the builtin.
 /// \returns The builtin's result.
 inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
                                      int32_t SrcLane) {
   // Fixed final operand of the builtin.
   // NOTE(review): presumably the clamp value covering lanes 0..31 -- confirm
   // against the PTX shfl.sync description.
   const int32_t LastOperand = 0x1f;
   return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, LastOperand);
 }

 /// NVPTX shuffle-down: forwards to __nvvm_shfl_sync_down_i32 with a control
 /// word derived from \p Width.
 ///
 /// \param Mask  Passed through as the builtin's first operand.
 /// \param Var   Value to shuffle.
 /// \param Delta Lane distance passed through to the builtin.
 /// \param Width Segment width; subtracted from the warp size to build the
 ///              control word.
 /// \returns The builtin's result.
 inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
                                           uint32_t Delta, int32_t Width) {
   const unsigned WarpSize = __kmpc_get_warp_size();
   // (warp size - Width) goes above bit 8, 0x1f fills the low bits.
   // NOTE(review): presumably the segment-mask / clamp packing of the PTX
   // shfl `c` operand -- confirm against the PTX ISA.
   const unsigned UpperBits = (WarpSize - Width) << 8;
   const int32_t Control = UpperBits | 0x1f;
   return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, Control);
 }

 #pragma omp end declare variant
 ///}

 #pragma omp end declare target

 #endif
	//===- shuffle.h - OpenMP variants of the shuffle idiom for all targets -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// Shuffle function implementations for all supported targets.
	//
	// Note: We unify the mask type to uint64_t instead of __kmpc_impl_lanemask_t.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LIBOMPTARGET_DEVICERTL_SHUFFLE_H
	#define LIBOMPTARGET_DEVICERTL_SHUFFLE_H

	#include <stdint.h>

	#pragma omp declare target

	/// External shuffle API
	///
	///{

	/// Prototypes of the externally visible shuffle entry points. They are given
	/// C linkage; no definition for either function appears in this header.
	extern "C" {
	/// Shuffle entry point for a 32-bit value.
	/// NOTE(review): `delta` and `size` are presumably the lane offset and the
	/// logical group width -- confirm against the definition.
	int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
	/// Shuffle entry point for a 64-bit value; same parameter list as the 32-bit
	/// form apart from the value type.
	int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
	}

	///}

	/// Forward declarations
	///
	///{
	/// Helpers declared here with C linkage so the inline variants further down
	/// can call them; none of them is defined in this header.
	extern "C" {
	/// Lane id of the caller; read by the AMDGCN shuffle variants in this file.
	unsigned GetLaneId();
	/// Warp size query; called by the AMDGCN indexed shuffle and by the NVPTX
	/// shuffle-down variant in this file.
	unsigned __kmpc_get_warp_size();
	/// Not called anywhere in this header.
	/// NOTE(review): presumably splits `val` into its low and high 32-bit halves
	/// and stores them through `lo` / `hi` -- confirm against the definition.
	void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
	/// Not called anywhere in this header.
	/// NOTE(review): presumably the inverse of __kmpc_impl_unpack -- confirm.
	uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
	}
	///}

	/// Fallback implementations of the shuffle sync idiom.
	/// Unavailable at present (would error at link time if used).
	///
	///{

	/// Base declaration of the indexed shuffle. It has no body here; the
	/// `declare variant` regions later in this file provide inline definitions
	/// for amdgcn and for nvptx/nvptx64.
	int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var, int32_t SrcLane);

	/// Base declaration of the shuffle-down operation. As with the indexed form,
	/// only the per-architecture variants later in this file carry a body.
	int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var, uint32_t Delta,
	int32_t Width);

	///}

	/// AMDGCN implementations of the shuffle sync idiom.
	///
	///{
	#pragma omp begin declare variant match(device = {arch(amdgcn)})

	inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
	int32_t SrcLane) {
	int Width = __kmpc_get_warp_size();
	int Self = GetLaneId();
	int Index = SrcLane + (Self & ~(Width - 1));
	return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
	}

	inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
	uint32_t LaneDelta, int32_t Width) {
	int Self = GetLaneId();
	int Index = Self + LaneDelta;
	Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
	return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
	}

	#pragma omp end declare variant
	///}

	/// NVPTX implementations of the shuffle and shuffle sync idiom.
	///
	///{
	#pragma omp begin declare variant match( \
	device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})

	/// NVPTX indexed shuffle: forwards directly to __nvvm_shfl_sync_idx_i32.
	///
	/// \param Mask    Passed through as the builtin's first operand.
	/// \param Var     Value to shuffle.
	/// \param SrcLane Lane operand passed through to the builtin.
	/// \returns The builtin's result.
	inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
	                                     int32_t SrcLane) {
	  // Fixed final operand of the builtin.
	  // NOTE(review): presumably the clamp value covering lanes 0..31 -- confirm
	  // against the PTX shfl.sync description.
	  const int32_t LastOperand = 0x1f;
	  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, LastOperand);
	}

	/// NVPTX shuffle-down: forwards to __nvvm_shfl_sync_down_i32 with a control
	/// word derived from \p Width.
	///
	/// \param Mask  Passed through as the builtin's first operand.
	/// \param Var   Value to shuffle.
	/// \param Delta Lane distance passed through to the builtin.
	/// \param Width Segment width; subtracted from the warp size to build the
	///              control word.
	/// \returns The builtin's result.
	inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
	                                          uint32_t Delta, int32_t Width) {
	  // (warp size - Width) goes above bit 8 and is OR-ed with 0x1f in the low
	  // bits. The operator must be a plain bitwise OR; a backslash before it is
	  // not valid in the middle of an expression.
	  // NOTE(review): presumably the segment-mask / clamp packing of the PTX
	  // shfl `c` operand -- confirm against the PTX ISA.
	  int32_t T = ((__kmpc_get_warp_size() - Width) << 8) | 0x1f;
	  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
	}

	#pragma omp end declare variant
	///}

	#pragma omp end declare target

	#endif