// blob: 6e8a651bd886dcea93270d6ca9cd038cb50b371c [file] [log] [blame]
//===------- target_impl.h - AMDGCN OpenMP GPU implementation ----- HIP -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Declarations and definitions of target specific functions and constants
//
//===----------------------------------------------------------------------===//
#ifndef OMPTARGET_AMDGCN_TARGET_IMPL_H
#define OMPTARGET_AMDGCN_TARGET_IMPL_H
#ifndef __AMDGCN__
#error "amdgcn target_impl.h expects to be compiled under __AMDGCN__"
#endif
#include "amdgcn_interface.h"
#include <assert.h>
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
// Function/variable attribute shorthands used throughout this header.
// DEVICE applies the `device` attribute to a declaration.
#define DEVICE __attribute__((device))
// INLINE combines the `inline` keyword with DEVICE.
#define INLINE inline DEVICE
// NOINLINE combines the `noinline` attribute with DEVICE.
#define NOINLINE __attribute__((noinline)) DEVICE
// SHARED applies the `shared` attribute to a variable declaration.
#define SHARED __attribute__((shared))
// ALIGN(N) applies the `aligned(N)` attribute, requesting alignment N.
#define ALIGN(N) __attribute__((aligned(N)))
////////////////////////////////////////////////////////////////////////////////
// Kernel options
////////////////////////////////////////////////////////////////////////////////
// Maximum number of threads per team. This definition must match the absolute
// limit hardwired in the host RTL.
#define MAX_THREADS_PER_TEAM 1024
// Warp size assumed by this target implementation.
// NOTE(review): presumably the AMDGCN wavefront width - confirm.
#define WARPSIZE 64
// Maximum number of preallocated arguments to an outlined parallel/simd
// function. Anything more requires dynamic memory allocation.
#define MAX_SHARED_ARGS 20
// Maximum number of omp state objects per SM allocated statically in global
// memory.
#define OMP_STATE_COUNT 32
// NOTE(review): presumably the number of SMs that the statically allocated
// state above is sized for - confirm against the users of this macro.
#define MAX_SM 64
// NOTE(review): the meaning of this value is not visible in this header -
// confirm against the code that consumes it before changing it.
#define OMP_ACTIVE_PARALLEL_LEVEL 128
// Quantities used by the data sharing scheme. They have to stay in agreement
// with the values used in the compiler.
enum DATA_SHARING_SIZES {
  // Upper bound on the number of workers in a kernel.
  DS_Max_Worker_Threads = 960,
  // Size reserved for data in one shared memory slot.
  DS_Slot_Size = 256,
  // Slot size to reserve for a working warp: one DS_Slot_Size per WARPSIZE
  // entry.
  DS_Worker_Warp_Slot_Size = DS_Slot_Size * WARPSIZE,
  // Upper bound on the number of warps in use.
  DS_Max_Warp_Number = 16
};
// Lane mask constant in which every bit of __kmpc_impl_lanemask_t is set.
enum : __kmpc_impl_lanemask_t {
  __kmpc_impl_all_lanes = ~static_cast<__kmpc_impl_lanemask_t>(0)
};
// Splits the 64-bit value `val` into two 32-bit halves.
//   val - value to split.
//   lo  - receives bits 0..31 of `val`.
//   hi  - receives bits 32..63 of `val`.
INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
  const uint64_t Low32Mask = UINT64_C(0xFFFFFFFF);
  lo = static_cast<uint32_t>(val & Low32Mask);
  hi = static_cast<uint32_t>((val >> 32) & Low32Mask);
}
// Combines two 32-bit halves into one 64-bit value: `hi` occupies bits 32..63
// of the result and `lo` occupies bits 0..31.
INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
  uint64_t Packed = hi;
  Packed <<= 32;
  Packed |= lo;
  return Packed;
}
// Declarations only; the definitions are not part of this header.
// NOTE(review): presumably the mask of lanes whose id is lower than the
// calling lane's - confirm against the definition.
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt();
// NOTE(review): presumably the mask of lanes whose id is greater than the
// calling lane's - confirm against the definition.
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt();
// NOTE(review): presumably an identifier of the SM executing the caller -
// confirm against the definition.
DEVICE uint32_t __kmpc_impl_smid();
// Timer queries returning double.
// NOTE(review): presumably the timer resolution (wtick) and the elapsed wall
// clock time (wtime) - confirm units against the definitions.
DEVICE double __kmpc_impl_get_wtick();
DEVICE double __kmpc_impl_get_wtime();
// Find-first-set on a 64-bit value: forwards `x` to __builtin_ffsl, which
// yields one plus the index of the least significant set bit, or zero when
// `x` is zero.
INLINE uint64_t __kmpc_impl_ffs(uint64_t x) {
  const int FirstSetBit = __builtin_ffsl(x);
  return FirstSetBit;
}
// Population count on a 64-bit value: forwards `x` to __builtin_popcountl and
// returns the number of set bits it reports.
INLINE uint64_t __kmpc_impl_popc(uint64_t x) {
  const int SetBitCount = __builtin_popcountl(x);
  return SetBitCount;
}
// Declarations only; the definitions are not part of this header.
// NOTE(review): presumably returns the mask of lanes that are currently
// active - confirm against the definition.
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask();
// Warp shuffle taking a lane mask (unnamed in this declaration), the value
// `Var` and a source lane index `SrcLane`.
// NOTE(review): presumably returns `Var` as held by lane `SrcLane` - confirm.
DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var,
int32_t SrcLane);
// Warp shuffle-down taking a lane mask (unnamed in this declaration), the
// value `Var`, a lane offset `Delta` and a `Width`.
// NOTE(review): presumably returns `Var` from the lane `Delta` positions
// above the caller within a group of `Width` lanes - confirm.
DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t Var,
uint32_t Delta, int32_t Width);
// Thread barrier implemented with the AMDGCN s_barrier builtin.
// NOTE(review): presumably synchronizes all threads of the calling block
// (workgroup) - confirm against the AMDGPU documentation.
INLINE void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); }
// Warp-level synchronization hook. Intentionally empty: the lane mask
// argument is unnamed and ignored.
INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) {
// AMDGCN doesn't need to sync threads in a warp, so there is nothing to do.
}
// AMDGCN specific kernel initialization. Declaration only; the definition is
// not part of this header.
DEVICE void __kmpc_impl_target_init();
// Equivalent to ptx bar.sync 1. Barrier until num_threads arrive.
// Declaration only; the definition is not part of this header.
DEVICE void __kmpc_impl_named_sync(uint32_t num_threads);
// Issues a sequentially consistent memory fence with the "agent"
// synchronization scope.
INLINE void __kmpc_impl_threadfence() {
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
}
// Issues a sequentially consistent memory fence with the "workgroup"
// synchronization scope.
INLINE void __kmpc_impl_threadfence_block() {
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
}
// Issues a sequentially consistent memory fence with an empty
// synchronization scope string.
// NOTE(review): the empty scope is presumably LLVM's default, system-wide
// scope - confirm against the AMDGPU memory model documentation.
INLINE void __kmpc_impl_threadfence_system() {
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
}
// Calls to the AMDGCN layer (assuming 1D layout)
// Returns the x component of the calling work-item's id, as reported by the
// AMDGCN builtin, converted to int. Only the x dimension is consulted.
INLINE int GetThreadIdInBlock() {
  return static_cast<int>(__builtin_amdgcn_workitem_id_x());
}
// Returns the x component of the calling workgroup's id, as reported by the
// AMDGCN builtin, converted to int. Only the x dimension is consulted.
INLINE int GetBlockIdInKernel() {
  return static_cast<int>(__builtin_amdgcn_workgroup_id_x());
}
// Launch geometry and lane/warp queries. Declarations only; the definitions
// are not part of this header.
// NOTE(review): presumably the number of blocks (workgroups) in the running
// kernel - confirm against the definition.
DEVICE int GetNumberOfBlocksInKernel();
// NOTE(review): presumably the number of threads in the calling block -
// confirm against the definition.
DEVICE int GetNumberOfThreadsInBlock();
// NOTE(review): presumably the index of the calling thread's warp within its
// block - confirm against the definition.
DEVICE unsigned GetWarpId();
// NOTE(review): presumably the index of the calling thread within its warp -
// confirm against the definition.
DEVICE unsigned GetLaneId();
// Atomics
// Atomically adds `val` to `*address` with sequentially consistent ordering
// and returns the value `*address` held before the addition.
template <typename T>
INLINE T __kmpc_atomic_add(T *address, T val) {
  const T Previous = __atomic_fetch_add(address, val, __ATOMIC_SEQ_CST);
  return Previous;
}
// Atomic increment of `*address` bounded by `max`, performed by the AMDGCN
// atomic_inc32 builtin with sequentially consistent ordering and an empty
// synchronization scope string. Returns whatever the builtin returns.
// NOTE(review): presumably the previous value, with wrap-around to zero once
// it reaches `max` - confirm against the builtin's documentation.
INLINE uint32_t __kmpc_atomic_inc(uint32_t *address, uint32_t max) {
  const uint32_t Result =
      __builtin_amdgcn_atomic_inc32(address, max, __ATOMIC_SEQ_CST, "");
  return Result;
}
// Atomically replaces `*address` with the larger of its current value and
// `val`, using sequentially consistent ordering, and returns the value
// `*address` held before the operation.
template <typename T>
INLINE T __kmpc_atomic_max(T *address, T val) {
  const T Previous = __atomic_fetch_max(address, val, __ATOMIC_SEQ_CST);
  return Previous;
}
// Atomically stores `val` into `*address` with sequentially consistent
// ordering and returns the value that was stored there before.
template <typename T>
INLINE T __kmpc_atomic_exchange(T *address, T val) {
  T Previous;
  // The generic builtin reads the new value through a pointer and writes the
  // old one through another.
  __atomic_exchange(address, &val, &Previous, __ATOMIC_SEQ_CST);
  return Previous;
}
// Strong atomic compare-and-swap: stores `val` into `*address` if it currently
// equals `compare`. Ordering is sequentially consistent on success and relaxed
// on failure. Returns the value observed at `*address`, which equals `compare`
// exactly when the swap took place.
template <typename T>
INLINE T __kmpc_atomic_cas(T *address, T compare, T val) {
  T Observed = compare;
  // On failure the builtin overwrites Observed with the current contents of
  // *address; on success Observed is left untouched. The boolean result is
  // therefore redundant and deliberately discarded.
  (void)__atomic_compare_exchange(address, &Observed, &val, /*weak=*/false,
                                  __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
  return Observed;
}
// Locks
// Lock operations on an omp_lock_t. Declarations only; the definitions are
// not part of this header.
// NOTE(review): presumably these back the corresponding omp_*_lock API
// routines (init, destroy, set, unset, test) - confirm against the callers.
DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock);
DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock);
DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock);
DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock);
// Unlike the other lock routines, this one returns an int.
// NOTE(review): presumably non-zero when the lock was acquired - confirm.
DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock);
// Memory
// Device-side allocation routines. Declarations only; the definitions are not
// part of this header.
// NOTE(review): `x` is presumably the requested size in bytes - confirm.
DEVICE void *__kmpc_impl_malloc(size_t x);
// NOTE(review): `x` is presumably a pointer previously returned by
// __kmpc_impl_malloc - confirm.
DEVICE void __kmpc_impl_free(void *x);
// DEVICE versions of part of libc
// Device-side __assert_fail: ignores all four arguments (they are unnamed)
// and executes a trap.
// NOTE(review): the parameter list presumably mirrors the libc
// __assert_fail(assertion, file, line, function) hook - confirm.
INLINE void __assert_fail(const char *, const char *, unsigned int,
const char *) {
__builtin_trap();
}
// Declaration of a variadic printf taking a format string.
// NOTE(review): EXTERN is not defined in this header; it is presumably
// provided by amdgcn_interface.h - confirm.
EXTERN int printf(const char *, ...);
#endif