//===------------ target_impl.h - NVPTX OpenMP GPU options ------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Definitions of target-specific functions
//
//===----------------------------------------------------------------------===//
#ifndef _TARGET_IMPL_H_
#define _TARGET_IMPL_H_
#include <assert.h>
#include <cuda.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include "nvptx_interface.h"
#define DEVICE __device__
#define INLINE __forceinline__ DEVICE
#define NOINLINE __noinline__ DEVICE
#define SHARED __shared__
#define ALIGN(N) __align__(N)
////////////////////////////////////////////////////////////////////////////////
// Kernel options
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// The following def must match the absolute limit hardwired in the host RTL:
// the max number of threads per team.
#define MAX_THREADS_PER_TEAM 1024
#define WARPSIZE 32
// Maximum number of preallocated arguments to an outlined parallel/simd
// function. Anything more requires dynamic memory allocation.
#define MAX_SHARED_ARGS 20
// Maximum number of omp state objects per SM allocated statically in global
// memory.
#if __CUDA_ARCH__ >= 600
#define OMP_STATE_COUNT 32
#else
#define OMP_STATE_COUNT 16
#endif
#if !defined(MAX_SM)
#if __CUDA_ARCH__ >= 900
#error unsupported compute capability, define MAX_SM via the LIBOMPTARGET_NVPTX_MAX_SM CMake option
#elif __CUDA_ARCH__ >= 800
// GA100 design has a maximum of 128 SMs but the A100 product only has 108 SMs
// GA102 design has a maximum of 84 SMs
#define MAX_SM 108
#elif __CUDA_ARCH__ >= 700
#define MAX_SM 84
#elif __CUDA_ARCH__ >= 600
#define MAX_SM 56
#else
#define MAX_SM 16
#endif
#endif
#define OMP_ACTIVE_PARALLEL_LEVEL 128
// Data sharing related quantities, need to match what is used in the compiler.
enum DATA_SHARING_SIZES {
  // The maximum number of workers in a kernel.
  DS_Max_Worker_Threads = 992,
  // The size reserved for data in a shared memory slot.
  DS_Slot_Size = 256,
  // The slot size that should be reserved for a working warp.
  DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
  // The maximum number of warps in use.
  DS_Max_Warp_Number = 32,
  // The size of the preallocated shared memory buffer per team.
  DS_Shared_Memory_Size = 128,
};
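// Sanity check (illustrative addition, follows from the constants above): each
// working warp reserves WARPSIZE * DS_Slot_Size = 32 * 256 = 8192 bytes.
static_assert(DS_Worker_Warp_Slot_Size == 8192,
              "DS_Worker_Warp_Slot_Size should equal WARPSIZE * DS_Slot_Size");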
enum : __kmpc_impl_lanemask_t {
  __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
};
DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
DEVICE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
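// Illustrative semantics (a sketch, not necessarily the actual implementation):
// unpack splits a 64-bit value into its 32-bit halves, and pack is the inverse:
//   lo == (uint32_t)val, hi == (uint32_t)(val >> 32)
//   __kmpc_impl_pack(lo, hi) == ((uint64_t)hi << 32) | lo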
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt();
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt();
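// Illustrative semantics (sketch): for the calling lane N, lanemask_lt() has
// bits [0, N) set and lanemask_gt() has bits (N, 31] set; e.g. lane 3 yields
// lanemask_lt() == 0x7 and lanemask_gt() == 0xFFFFFFF0.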
DEVICE uint32_t __kmpc_impl_smid();
DEVICE double __kmpc_impl_get_wtick();
DEVICE double __kmpc_impl_get_wtime();
INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); }
INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __builtin_popcount(x); }
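// Examples: __kmpc_impl_ffs(0) == 0 and __kmpc_impl_ffs(8) == 4 (1-based index
// of the least significant set bit); __kmpc_impl_popc(0xb) == 3 (number of set
// bits).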
#ifndef CUDA_VERSION
#error CUDA_VERSION macro is undefined, check the CUDA installation.
#endif
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask();
DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
                                     int32_t SrcLane);
DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
                                          int32_t Var, uint32_t Delta,
                                          int32_t Width);
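// Illustrative sketch (example only; ExampleWarpReduceSum is a hypothetical
// helper, not part of this interface): a full-warp sum reduction built on the
// shuffle wrapper above. After the loop, lane 0 holds the warp-wide total.
INLINE int32_t ExampleWarpReduceSum(int32_t Val) {
  for (uint32_t Delta = WARPSIZE / 2; Delta > 0; Delta /= 2)
    Val += __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, Val, Delta,
                                      WARPSIZE);
  return Val;
}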
DEVICE void __kmpc_impl_syncthreads();
DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask);
// NVPTX-specific kernel initialization.
DEVICE void __kmpc_impl_target_init();
// Barrier until num_threads arrive.
DEVICE void __kmpc_impl_named_sync(uint32_t num_threads);
DEVICE void __kmpc_impl_threadfence();
DEVICE void __kmpc_impl_threadfence_block();
DEVICE void __kmpc_impl_threadfence_system();
// Calls to the NVPTX layer (assuming 1D layout)
DEVICE int GetThreadIdInBlock();
DEVICE int GetBlockIdInKernel();
DEVICE int GetNumberOfBlocksInKernel();
DEVICE int GetNumberOfThreadsInBlock();
DEVICE unsigned GetWarpId();
DEVICE unsigned GetLaneId();
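// Illustrative relations (assuming the 1D layout noted above):
//   GetWarpId() == GetThreadIdInBlock() / WARPSIZE
//   GetLaneId() == GetThreadIdInBlock() & (WARPSIZE - 1)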
// Atomics
template <typename T> INLINE T __kmpc_atomic_add(T *address, T val) {
  return atomicAdd(address, val);
}
template <typename T> INLINE T __kmpc_atomic_inc(T *address, T val) {
  return atomicInc(address, val);
}
template <typename T> INLINE T __kmpc_atomic_max(T *address, T val) {
  return atomicMax(address, val);
}
template <typename T> INLINE T __kmpc_atomic_exchange(T *address, T val) {
  return atomicExch(address, val);
}
template <typename T> INLINE T __kmpc_atomic_cas(T *address, T compare, T val) {
  return atomicCAS(address, compare, val);
}
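// Illustrative usage sketch (example only): the wrappers above forward to the
// CUDA atomic intrinsics, so a per-team counter can hand out unique slots:
//   SHARED uint32_t Counter; // assumed zero-initialized before first use
//   uint32_t Slot = __kmpc_atomic_add(&Counter, 1u);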
// Locks
DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock);
DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock);
DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock);
DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock);
DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock);
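// Illustrative usage sketch (example only, not prescribed by this header):
//   omp_lock_t Lock;               // e.g. in global memory, initialized once
//   __kmpc_impl_init_lock(&Lock);
//   __kmpc_impl_set_lock(&Lock);   // blocks until the lock is acquired
//   /* ... critical section ... */
//   __kmpc_impl_unset_lock(&Lock);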
// Memory
DEVICE void *__kmpc_impl_malloc(size_t);
DEVICE void __kmpc_impl_free(void *);
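// Illustrative pairing (example only): device-side allocations must be freed
// on the device as well, and a failed allocation is assumed to return null:
//   void *Buf = __kmpc_impl_malloc(Bytes);
//   if (Buf) { /* ... use Buf ... */ __kmpc_impl_free(Buf); }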
#endif