libomptarget/deviceRTLs/amdgcn/src/target_impl.h - llvm-project/openmp - Git at Google

 //===------- target_impl.h - AMDGCN OpenMP GPU implementation ----- HIP -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // Declarations and definitions of target specific functions and constants
 //
 //===----------------------------------------------------------------------===//
 #ifndef OMPTARGET_AMDGCN_TARGET_IMPL_H
 #define OMPTARGET_AMDGCN_TARGET_IMPL_H

 #ifndef __AMDGCN__
 #error "amdgcn target_impl.h expects to be compiled under __AMDGCN__"
 #endif

 #include "amdgcn_interface.h"

 #include <assert.h>
 #include <inttypes.h>
 #include <stddef.h>
 #include <stdint.h>

 // Attribute shorthands used by the declarations in this header.
 // DEVICE expands to the `device` attribute.
 #define DEVICE __attribute__((device))
 // INLINE is the `inline` keyword combined with DEVICE.
 #define INLINE inline DEVICE
 // NOINLINE is the `noinline` attribute combined with DEVICE.
 #define NOINLINE __attribute__((noinline)) DEVICE
 // SHARED expands to the `shared` attribute.
 #define SHARED __attribute__((shared))
 // ALIGN(N) requests N-byte alignment through the `aligned` attribute.
 #define ALIGN(N) __attribute__((aligned(N)))

 ////////////////////////////////////////////////////////////////////////////////
 // Kernel options
 ////////////////////////////////////////////////////////////////////////////////

 ////////////////////////////////////////////////////////////////////////////////
 // Maximum number of threads per team. This value must match the absolute
 // limit hardwired in the host RTL.
 #define MAX_THREADS_PER_TEAM 1024

 // Warp size used by this target; it scales DS_Worker_Warp_Slot_Size below.
 // NOTE(review): presumably the 64-lane AMDGCN wavefront width - confirm.
 #define WARPSIZE 64

 // Maximum number of preallocated arguments to an outlined parallel/simd
 // function. Anything more requires dynamic memory allocation.
 #define MAX_SHARED_ARGS 20

 // Maximum number of omp state objects per SM allocated statically in global
 // memory.
 #define OMP_STATE_COUNT 32
 // NOTE(review): presumably the maximum number of SMs the state objects above
 // are sized for - not used in this header, confirm against its users.
 #define MAX_SM 64

 // NOTE(review): not used in this header; presumably a marker value for
 // active parallel levels - confirm against its users.
 #define OMP_ACTIVE_PARALLEL_LEVEL 128

 // Data sharing related quantities, need to match what is used in the compiler.
 // Sizing constants for data sharing. The values are fixed literals except
 // DS_Worker_Warp_Slot_Size, which is derived from WARPSIZE and DS_Slot_Size.
 enum DATA_SHARING_SIZES {
   // The maximum number of workers in a kernel.
   DS_Max_Worker_Threads = 960,
   // The size reserved for data in a shared memory slot.
   DS_Slot_Size = 256,
   // The slot size that should be reserved for a working warp: one
   // DS_Slot_Size-sized region per WARPSIZE entry (64 * 256 here).
   DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
   // The maximum number of warps in use.
   DS_Max_Warp_Number = 16,
 };

 // Lane mask constant with every bit set: the bitwise complement of zero in
 // the underlying type __kmpc_impl_lanemask_t.
 enum : __kmpc_impl_lanemask_t {
   __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
 };

 // Split a 64-bit value into its two 32-bit halves.
 //   val - the value to split
 //   lo  - receives bits 0..31 of val
 //   hi  - receives bits 32..63 of val
 // lo is written before hi.
 INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
   const uint64_t low_half = val & UINT64_C(0x00000000FFFFFFFF);
   // Shifting an unsigned 64-bit value right by 32 leaves only the top half.
   const uint64_t high_half = val >> 32;
   lo = (uint32_t)low_half;
   hi = (uint32_t)high_half;
 }

 // Combine two 32-bit halves into one 64-bit value.
 //   lo - becomes bits 0..31 of the result
 //   hi - becomes bits 32..63 of the result
 INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
   uint64_t packed = hi;
   packed <<= 32;
   packed |= lo;
   return packed;
 }

 // Target-specific queries. They are only declared in this header; the
 // definitions live elsewhere.
 // NOTE(review): the per-function notes below are inferred from the names
 // alone - confirm against the definitions.
 // Presumably the mask of lanes numbered below the calling lane.
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt();
 // Presumably the mask of lanes numbered above the calling lane.
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt();
 // Presumably an identifier of the SM/compute unit running the caller.
 DEVICE uint32_t __kmpc_impl_smid();
 // Presumably the timer resolution backing omp_get_wtick.
 DEVICE double __kmpc_impl_get_wtick();
 // Presumably the elapsed wall-clock time backing omp_get_wtime.
 DEVICE double __kmpc_impl_get_wtime();

 // Find-first-set via __builtin_ffsl: returns one plus the index of the least
 // significant set bit of x, or zero when x is zero.
 INLINE uint64_t __kmpc_impl_ffs(uint64_t x) {
   const int position = __builtin_ffsl(x);
   return position;
 }
 // Population count via __builtin_popcountl: returns the number of set bits
 // in x.
 INLINE uint64_t __kmpc_impl_popc(uint64_t x) {
   const int set_bits = __builtin_popcountl(x);
   return set_bits;
 }

 // Warp-level primitives. They are only declared in this header; the
 // definitions live elsewhere.
 // NOTE(review): the descriptions below are inferred from the names and the
 // parameter names - confirm against the definitions.

 // Presumably the mask of lanes currently active in the calling warp.
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask();

 // Presumably returns Var as held by lane SrcLane. The lane-mask parameter is
 // unnamed in this declaration.
 DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var,
                                      int32_t SrcLane);

 // Presumably returns Var as held by the lane Delta positions above the
 // caller, within groups of Width lanes. The lane-mask parameter is unnamed
 // in this declaration.
 DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t Var,
                                           uint32_t Delta, int32_t Width);

 // Thread barrier: forwards directly to the AMDGCN s_barrier builtin.
 INLINE void __kmpc_impl_syncthreads() {
   __builtin_amdgcn_s_barrier();
 }

 // Warp-level synchronization hook. The body is intentionally empty and the
 // lane-mask argument is unnamed and ignored.
 INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) {
   // AMDGCN doesn't need to sync threads in a warp
 }

 // AMDGCN-specific kernel initialization. Declared here; defined elsewhere.
 DEVICE void __kmpc_impl_target_init();

 // Equivalent to PTX `bar.sync 1`: a barrier that waits until num_threads
 // threads have arrived. Declared here; defined elsewhere.
 DEVICE void __kmpc_impl_named_sync(uint32_t num_threads);

 // Memory fences. Each one issues __builtin_amdgcn_fence with sequentially
 // consistent ordering; they differ only in the synchronization-scope string.

 // Fence with the "agent" scope.
 INLINE void __kmpc_impl_threadfence() {
   __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
 }

 // Fence with the "workgroup" scope.
 INLINE void __kmpc_impl_threadfence_block() {
   __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
 }

 // Fence with the empty scope string.
 // NOTE(review): presumably the empty string selects the widest (system)
 // scope - confirm against the AMDGPU memory model documentation.
 INLINE void __kmpc_impl_threadfence_system() {
   __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
 }

 // Calls to the AMDGCN layer (assuming 1D layout)
 // Returns the x-dimension work-item id reported by the AMDGCN builtin,
 // converted to int.
 INLINE int GetThreadIdInBlock() {
   const int workitem_x = __builtin_amdgcn_workitem_id_x();
   return workitem_x;
 }
 // Returns the x-dimension workgroup id reported by the AMDGCN builtin,
 // converted to int.
 INLINE int GetBlockIdInKernel() {
   const int workgroup_x = __builtin_amdgcn_workgroup_id_x();
   return workgroup_x;
 }
 // Launch-geometry queries. They are only declared in this header; the
 // definitions live elsewhere.
 // NOTE(review): meanings are inferred from the names - confirm against the
 // definitions.
 // Presumably the number of blocks (workgroups) in the kernel launch.
 DEVICE int GetNumberOfBlocksInKernel();
 // Presumably the number of threads (work-items) in the calling block.
 DEVICE int GetNumberOfThreadsInBlock();
 // Presumably the index of the calling thread's warp within its block.
 DEVICE unsigned GetWarpId();
 // Presumably the index of the calling thread within its warp.
 DEVICE unsigned GetLaneId();

 // Atomics
 // Atomically adds val to *address with sequentially consistent ordering.
 // Returns the value *address held immediately before the addition.
 template <typename T> INLINE T __kmpc_atomic_add(T *address, T val) {
   const T previous = __atomic_fetch_add(address, val, __ATOMIC_SEQ_CST);
   return previous;
 }

 // Thin wrapper over __builtin_amdgcn_atomic_inc32, called with sequentially
 // consistent ordering and an empty scope string. Returns the builtin's
 // result unchanged.
 // NOTE(review): presumably a wrapping increment bounded by max that yields
 // the prior value, like CUDA atomicInc - confirm against the builtin's docs.
 INLINE uint32_t __kmpc_atomic_inc(uint32_t *address, uint32_t max) {
   const uint32_t result =
       __builtin_amdgcn_atomic_inc32(address, max, __ATOMIC_SEQ_CST, "");
   return result;
 }

 // Atomically stores max(*address, val) into *address with sequentially
 // consistent ordering. Returns the value *address held beforehand.
 template <typename T> INLINE T __kmpc_atomic_max(T *address, T val) {
   const T previous = __atomic_fetch_max(address, val, __ATOMIC_SEQ_CST);
   return previous;
 }

 // Atomically replaces *address with val using sequentially consistent
 // ordering. Returns the value that was replaced.
 template <typename T> INLINE T __kmpc_atomic_exchange(T *address, T val) {
   T previous;
   // The generic builtin reads the new value from &val and writes the old
   // contents of *address into &previous.
   __atomic_exchange(address, &val, &previous, __ATOMIC_SEQ_CST);
   return previous;
 }

 // Strong atomic compare-and-swap: if *address equals compare, val is stored
 // into *address. Ordering is sequentially consistent on success and relaxed
 // on failure.
 // Returns the value observed in *address. That equals compare when the swap
 // happened, because on failure the builtin overwrites the expected value
 // with the current contents.
 template <typename T> INLINE T __kmpc_atomic_cas(T *address, T compare, T val) {
   T observed = compare;
   T desired = val;
   (void)__atomic_compare_exchange(address, &observed, &desired,
                                   /*weak=*/false, __ATOMIC_SEQ_CST,
                                   __ATOMIC_RELAXED);
   return observed;
 }

 // Locks
 // Lock operations on an omp_lock_t. They are only declared in this header;
 // the definitions live elsewhere.
 // NOTE(review): presumably these back the omp_*_lock API entry points of the
 // same names - confirm against the definitions.
 DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock);
 DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock);
 DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock);
 DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock);
 // The only lock operation that returns a value (int).
 // NOTE(review): presumably nonzero when the lock was acquired - confirm.
 DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock);

 // Memory
 // Device-side allocation hooks. They are only declared in this header; the
 // definitions live elsewhere.
 // NOTE(review): presumably malloc/free-like semantics, with x the byte count
 // for malloc and a previously returned pointer for free - confirm.
 DEVICE void *__kmpc_impl_malloc(size_t x);
 DEVICE void __kmpc_impl_free(void *x);

 // DEVICE versions of part of libc

 // Assertion-failure handler: ignores all four (unnamed) arguments and
 // executes a trap.
 // NOTE(review): the parameter list looks like the libc __assert_fail
 // signature (assertion text, file, line, function) - confirm.
 INLINE void __assert_fail(const char *, const char *, unsigned int,
                           const char *) {
   __builtin_trap();
 }
 // printf is only declared here; EXTERN and the definition come from
 // elsewhere.
 EXTERN int printf(const char *, ...);

 #endif
	//===------- target_impl.h - AMDGCN OpenMP GPU implementation ----- HIP -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// Declarations and definitions of target specific functions and constants
	//
	//===----------------------------------------------------------------------===//
	#ifndef OMPTARGET_AMDGCN_TARGET_IMPL_H
	#define OMPTARGET_AMDGCN_TARGET_IMPL_H

	#ifndef __AMDGCN__
	#error "amdgcn target_impl.h expects to be compiled under __AMDGCN__"
	#endif

	#include "amdgcn_interface.h"

	#include <assert.h>
	#include <inttypes.h>
	#include <stddef.h>
	#include <stdint.h>

	// Attribute shorthands used by the declarations in this header.
	// DEVICE expands to the `device` attribute.
	#define DEVICE __attribute__((device))
	// INLINE is the `inline` keyword combined with DEVICE.
	#define INLINE inline DEVICE
	// NOINLINE is the `noinline` attribute combined with DEVICE.
	#define NOINLINE __attribute__((noinline)) DEVICE
	// SHARED expands to the `shared` attribute.
	#define SHARED __attribute__((shared))
	// ALIGN(N) requests N-byte alignment through the `aligned` attribute.
	#define ALIGN(N) __attribute__((aligned(N)))

	////////////////////////////////////////////////////////////////////////////////
	// Kernel options
	////////////////////////////////////////////////////////////////////////////////

	////////////////////////////////////////////////////////////////////////////////
	// Maximum number of threads per team. This value must match the absolute
	// limit hardwired in the host RTL.
	#define MAX_THREADS_PER_TEAM 1024

	// Warp size used by this target; it scales DS_Worker_Warp_Slot_Size below.
	// NOTE(review): presumably the 64-lane AMDGCN wavefront width - confirm.
	#define WARPSIZE 64

	// Maximum number of preallocated arguments to an outlined parallel/simd
	// function. Anything more requires dynamic memory allocation.
	#define MAX_SHARED_ARGS 20

	// Maximum number of omp state objects per SM allocated statically in global
	// memory.
	#define OMP_STATE_COUNT 32
	// NOTE(review): presumably the maximum number of SMs the state objects above
	// are sized for - not used in this header, confirm against its users.
	#define MAX_SM 64

	// NOTE(review): not used in this header; presumably a marker value for
	// active parallel levels - confirm against its users.
	#define OMP_ACTIVE_PARALLEL_LEVEL 128

	// Data sharing related quantities, need to match what is used in the compiler.
	// Sizing constants for data sharing. The values are fixed literals except
	// DS_Worker_Warp_Slot_Size, which is derived from WARPSIZE and DS_Slot_Size.
	enum DATA_SHARING_SIZES {
	// The maximum number of workers in a kernel.
	DS_Max_Worker_Threads = 960,
	// The size reserved for data in a shared memory slot.
	DS_Slot_Size = 256,
	// The slot size that should be reserved for a working warp: one
	// DS_Slot_Size-sized region per WARPSIZE entry (64 * 256 here).
	DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
	// The maximum number of warps in use.
	DS_Max_Warp_Number = 16,
	};

	// Lane mask constant with every bit set: the bitwise complement of zero in
	// the underlying type __kmpc_impl_lanemask_t.
	enum : __kmpc_impl_lanemask_t {
	__kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
	};

	// Split a 64-bit value into its two 32-bit halves.
	//   val - the value to split
	//   lo  - receives bits 0..31 of val
	//   hi  - receives bits 32..63 of val
	// lo is written before hi.
	INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
	  const uint64_t low_half = val & UINT64_C(0x00000000FFFFFFFF);
	  // Shifting an unsigned 64-bit value right by 32 leaves only the top half.
	  const uint64_t high_half = val >> 32;
	  lo = (uint32_t)low_half;
	  hi = (uint32_t)high_half;
	}

	// Combine two 32-bit halves into one 64-bit value.
	//   lo - becomes bits 0..31 of the result
	//   hi - becomes bits 32..63 of the result
	INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
	  // Widen hi before shifting so the upper half is not lost; the halves are
	  // joined with a plain bitwise OR (the stray backslash before the operator
	  // was a markup escape, not part of the code).
	  return (((uint64_t)hi) << 32) | (uint64_t)lo;
	}

	// Target-specific queries. They are only declared in this header; the
	// definitions live elsewhere.
	// NOTE(review): the per-function notes below are inferred from the names
	// alone - confirm against the definitions.
	// Presumably the mask of lanes numbered below the calling lane.
	DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt();
	// Presumably the mask of lanes numbered above the calling lane.
	DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt();
	// Presumably an identifier of the SM/compute unit running the caller.
	DEVICE uint32_t __kmpc_impl_smid();
	// Presumably the timer resolution backing omp_get_wtick.
	DEVICE double __kmpc_impl_get_wtick();
	// Presumably the elapsed wall-clock time backing omp_get_wtime.
	DEVICE double __kmpc_impl_get_wtime();

	// Find-first-set via __builtin_ffsl: returns one plus the index of the least
	// significant set bit of x, or zero when x is zero.
	INLINE uint64_t __kmpc_impl_ffs(uint64_t x) {
	  const int position = __builtin_ffsl(x);
	  return position;
	}
	// Population count via __builtin_popcountl: returns the number of set bits
	// in x.
	INLINE uint64_t __kmpc_impl_popc(uint64_t x) {
	  const int set_bits = __builtin_popcountl(x);
	  return set_bits;
	}

	// Warp-level primitives. They are only declared in this header; the
	// definitions live elsewhere.
	// NOTE(review): the descriptions below are inferred from the names and the
	// parameter names - confirm against the definitions.

	// Presumably the mask of lanes currently active in the calling warp.
	DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask();

	// Presumably returns Var as held by lane SrcLane. The lane-mask parameter is
	// unnamed in this declaration.
	DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var,
	int32_t SrcLane);

	// Presumably returns Var as held by the lane Delta positions above the
	// caller, within groups of Width lanes. The lane-mask parameter is unnamed
	// in this declaration.
	DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t Var,
	uint32_t Delta, int32_t Width);

	// Thread barrier: forwards directly to the AMDGCN s_barrier builtin.
	INLINE void __kmpc_impl_syncthreads() {
	  __builtin_amdgcn_s_barrier();
	}

	// Warp-level synchronization hook. The body is intentionally empty and the
	// lane-mask argument is unnamed and ignored.
	INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) {
	// AMDGCN doesn't need to sync threads in a warp
	}

	// AMDGCN-specific kernel initialization. Declared here; defined elsewhere.
	DEVICE void __kmpc_impl_target_init();

	// Equivalent to PTX `bar.sync 1`: a barrier that waits until num_threads
	// threads have arrived. Declared here; defined elsewhere.
	DEVICE void __kmpc_impl_named_sync(uint32_t num_threads);

	// Memory fences. Each one issues __builtin_amdgcn_fence with sequentially
	// consistent ordering; they differ only in the synchronization-scope string.

	// Fence with the "agent" scope.
	INLINE void __kmpc_impl_threadfence() {
	__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
	}

	// Fence with the "workgroup" scope.
	INLINE void __kmpc_impl_threadfence_block() {
	__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
	}

	// Fence with the empty scope string.
	// NOTE(review): presumably the empty string selects the widest (system)
	// scope - confirm against the AMDGPU memory model documentation.
	INLINE void __kmpc_impl_threadfence_system() {
	__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
	}

	// Calls to the AMDGCN layer (assuming 1D layout)
	// Returns the x-dimension work-item id reported by the AMDGCN builtin,
	// converted to int.
	INLINE int GetThreadIdInBlock() {
	  const int workitem_x = __builtin_amdgcn_workitem_id_x();
	  return workitem_x;
	}
	// Returns the x-dimension workgroup id reported by the AMDGCN builtin,
	// converted to int.
	INLINE int GetBlockIdInKernel() {
	  const int workgroup_x = __builtin_amdgcn_workgroup_id_x();
	  return workgroup_x;
	}
	// Launch-geometry queries. They are only declared in this header; the
	// definitions live elsewhere.
	// NOTE(review): meanings are inferred from the names - confirm against the
	// definitions.
	// Presumably the number of blocks (workgroups) in the kernel launch.
	DEVICE int GetNumberOfBlocksInKernel();
	// Presumably the number of threads (work-items) in the calling block.
	DEVICE int GetNumberOfThreadsInBlock();
	// Presumably the index of the calling thread's warp within its block.
	DEVICE unsigned GetWarpId();
	// Presumably the index of the calling thread within its warp.
	DEVICE unsigned GetLaneId();

	// Atomics
	// Atomically adds val to *address with sequentially consistent ordering.
	// Returns the value *address held immediately before the addition.
	template <typename T> INLINE T __kmpc_atomic_add(T *address, T val) {
	  const T previous = __atomic_fetch_add(address, val, __ATOMIC_SEQ_CST);
	  return previous;
	}

	// Thin wrapper over __builtin_amdgcn_atomic_inc32, called with sequentially
	// consistent ordering and an empty scope string. Returns the builtin's
	// result unchanged.
	// NOTE(review): presumably a wrapping increment bounded by max that yields
	// the prior value, like CUDA atomicInc - confirm against the builtin's docs.
	INLINE uint32_t __kmpc_atomic_inc(uint32_t *address, uint32_t max) {
	  const uint32_t result =
	      __builtin_amdgcn_atomic_inc32(address, max, __ATOMIC_SEQ_CST, "");
	  return result;
	}

	// Atomically stores max(*address, val) into *address with sequentially
	// consistent ordering. Returns the value *address held beforehand.
	template <typename T> INLINE T __kmpc_atomic_max(T *address, T val) {
	  const T previous = __atomic_fetch_max(address, val, __ATOMIC_SEQ_CST);
	  return previous;
	}

	// Atomically replaces *address with val using sequentially consistent
	// ordering. Returns the value that was replaced.
	template <typename T> INLINE T __kmpc_atomic_exchange(T *address, T val) {
	  T previous;
	  // The generic builtin reads the new value from &val and writes the old
	  // contents of *address into &previous.
	  __atomic_exchange(address, &val, &previous, __ATOMIC_SEQ_CST);
	  return previous;
	}

	// Strong atomic compare-and-swap: if *address equals compare, val is stored
	// into *address. Ordering is sequentially consistent on success and relaxed
	// on failure.
	// Returns the value observed in *address. That equals compare when the swap
	// happened, because on failure the builtin overwrites the expected value
	// with the current contents.
	template <typename T> INLINE T __kmpc_atomic_cas(T *address, T compare, T val) {
	  T observed = compare;
	  T desired = val;
	  (void)__atomic_compare_exchange(address, &observed, &desired,
	                                  /*weak=*/false, __ATOMIC_SEQ_CST,
	                                  __ATOMIC_RELAXED);
	  return observed;
	}

	// Locks
	// Lock operations on an omp_lock_t. They are only declared in this header;
	// the definitions live elsewhere.
	// NOTE(review): presumably these back the omp_*_lock API entry points of the
	// same names - confirm against the definitions.
	DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock);
	DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock);
	DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock);
	DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock);
	// The only lock operation that returns a value (int).
	// NOTE(review): presumably nonzero when the lock was acquired - confirm.
	DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock);

	// Memory
	// Device-side allocation hooks. They are only declared in this header; the
	// definitions live elsewhere.
	// NOTE(review): presumably malloc/free-like semantics, with x the byte count
	// for malloc and a previously returned pointer for free - confirm.
	DEVICE void *__kmpc_impl_malloc(size_t x);
	DEVICE void __kmpc_impl_free(void *x);

	// DEVICE versions of part of libc

	// Assertion-failure handler with the libc __assert_fail parameter list
	// (assertion text, file name, line number, function name). All four
	// arguments are unnamed and ignored; the handler simply traps.
	// The first two parameters are `const char *`: the pointer declarators had
	// been dropped, which left them as by-value `const char` and changed the
	// signature.
	INLINE void __assert_fail(const char *, const char *, unsigned int,
	                          const char *) {
	  __builtin_trap();
	}
	// printf is only declared here; EXTERN and the definition come from
	// elsewhere.
	EXTERN int printf(const char *, ...);

	#endif