[libomptarget] Build cuda plugin without cuda installed locally

[libomptarget] Build cuda plugin without cuda installed locally

Compiles a new file, `plugins/cuda/dynamic_cuda/cuda.cpp`, to an object file that exposes the same symbols that the plugin presently uses from libcuda. The object file contains dlopen of libcuda and cached dlsym calls. Also provides a cuda.h containing the subset that is used.

This lets the cmake file choose between the system cuda and a dlopen shim, with no changes to rtl.cpp.

The corresponding change to amdgpu is postponed until after a refactor of the plugin to reduce the size of the hsa.h stub required

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D95155

GitOrigin-RevId: 47e95e87a3e4f738635ff965616d4e2d96bf838a
diff --git a/libomptarget/include/dlwrap.h b/libomptarget/include/dlwrap.h
new file mode 100644
index 0000000..9e078b3
--- /dev/null
+++ b/libomptarget/include/dlwrap.h
@@ -0,0 +1,277 @@
+//===------- dlwrap.h - Convenience wrapper around dlopen/dlsym  -- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The openmp plugins depend on extern libraries. These can be used via:
+//  - bitcode file statically linked
+//  - (relocatable) object file statically linked
+//  - static library
+//  - dynamic library, linked at build time
+//  - dynamic library, loaded at application run time by dlopen
+//
+// This file factors out most boilerplate for using a dlopened library.
+// - Function symbols are generated that are statically linked against
+// - The dlopen can be done implicitly when initializing the library
+// - dlsym lookups are done once and cached
+// - The abstraction is very thin to permit varied uses of the library
+//
+// Given int foo(char, double, void*);, writing DLWRAP(foo, 3) will expand to:
+// int foo(char x0, double x1, void* x2) {
+//   constexpr size_t index = id();
+//   void * dlsymResult = *pointer(index);
+//   return ((int (*)(char, double, void*))dlsymResult)(x0, x1, x2);
+// }
+//
+// Multiple calls to DLWRAP(symbol_name, arity) with bespoke
+// initialization code that can use the thin abstraction:
+// namespace dlwrap {
+//   static size_t size();
+//   static const char *symbol(size_t);
+//   static void **pointer(size_t);
+// }
+// will compile to an object file that only exposes the symbols that the
+// dynamic library would do, with the right function types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef DLWRAP_H_INCLUDED
+#define DLWRAP_H_INCLUDED
+
+#include <array>
+#include <cstddef>
+#include <tuple>
+#include <type_traits>
+
+// Where symbol is a function, these expand to some book keeping and an
+// implementation of that function
+#define DLWRAP(SYMBOL, ARITY) DLWRAP_IMPL(SYMBOL, ARITY)
+#define DLWRAP_INTERNAL(SYMBOL, ARITY) DLWRAP_INTERNAL_IMPL(SYMBOL, ARITY)
+
+// For example, given a prototype:
+// int foo(char, double);
+//
+// DLWRAP(foo, 2) expands to:
+//
+// namespace dlwrap {
+// struct foo_Trait : public dlwrap::trait<decltype(&foo)> {
+//   using T = dlwrap::trait<decltype(&foo)>;
+//   static T::FunctionType get() {
+//     constexpr size_t Index = getIndex();
+//     void *P = *dlwrap::pointer(Index);
+//     return reinterpret_cast<T::FunctionType>(P);
+//   }
+// };
+// }
+// int foo(char x0, double x1) { return dlwrap::foo_Trait::get()(x0, x1); }
+//
+// DLWRAP_INTERNAL is similar, except the function it expands to is:
+// static int dlwrap_foo(char x0, double x1) { ... }
+// so that the function pointer call can be wrapped in library-specific code
+
+// DLWRAP_FINALIZE() expands to definitions of:
+#define DLWRAP_FINALIZE() DLWRAP_FINALIZE_IMPL()
+namespace dlwrap {
+static size_t size(); // number of symbols registered via DLWRAP*
+static const char *symbol(size_t); // get symbol name in [0, size())
+static void **pointer(size_t); // get pointer to function pointer in [0, size())
+} // namespace dlwrap
+
+// Implementation details follow.
+
+namespace dlwrap {
+
+// Extract return / argument types from address of function symbol
+template <typename F> struct trait; // only defined for R (*)(Ts...) below
+template <typename R, typename... Ts> struct trait<R (*)(Ts...)> {
+  constexpr static const size_t nargs = sizeof...(Ts); // parameter count
+  typedef R ReturnType;
+  template <size_t i> struct arg { // arg<i>::type is the i-th parameter type
+    typedef typename std::tuple_element<i, std::tuple<Ts...>>::type type;
+  };
+  // Pointer-to-function type with the same signature as the wrapped symbol
+  typedef R (*FunctionType)(Ts...);
+};
+
+namespace type {
+// Book keeping is by type specialization
+// count<S>::N is the number of DLWRAP_INC() uses at __LINE__ values <= S
+template <size_t S> struct count {
+  static constexpr size_t N = count<S - 1>::N; // inherit from previous line
+};
+// Base case: nothing has been counted before the first line
+template <> struct count<0> { static constexpr size_t N = 0; };
+
+// Get a constexpr size_t ID, starts at zero
+#define DLWRAP_ID() (dlwrap::type::count<__LINE__>::N)
+// DLWRAP_INC specializes count<__LINE__>, so use it at most once per line
+// Increment value returned by DLWRAP_ID
+#define DLWRAP_INC()                                                           \
+  template <> struct dlwrap::type::count<__LINE__> {                           \
+    static constexpr size_t N = 1 + dlwrap::type::count<__LINE__ - 1>::N;      \
+  }
+// symbol<ID>::call() yields the stringized name registered under ID
+template <size_t N> struct symbol;
+#define DLWRAP_SYMBOL(SYMBOL, ID)                                              \
+  template <> struct dlwrap::type::symbol<ID> {                                \
+    static constexpr const char *call() { return #SYMBOL; }                    \
+  }
+} // namespace type
+// Collect symbol<Is>::call() for each index in Is into an array of N names
+template <size_t N, size_t... Is>
+constexpr std::array<const char *, N> static getSymbolArray(
+    std::index_sequence<Is...>) {
+  return {{dlwrap::type::symbol<Is>::call()...}};
+}
+
+} // namespace dlwrap
+// Select DLWRAP_INSTANTIATE_<ARITY> and hand it the trait of SYM_USE as T
+#define DLWRAP_INSTANTIATE(SYM_USE, SYM_DEF, ARITY)                            \
+  DLWRAP_INSTANTIATE_##ARITY(SYM_USE, SYM_DEF,                                 \
+                             dlwrap::trait<decltype(&SYM_USE)>)
+// Use after all DLWRAP*() calls: DLWRAP_ID() here is the total symbol count
+#define DLWRAP_FINALIZE_IMPL()                                                 \
+  static size_t dlwrap::size() { return DLWRAP_ID(); }                         \
+  static const char *dlwrap::symbol(size_t i) {                                \
+    static constexpr const std::array<const char *, DLWRAP_ID()>               \
+        dlwrap_symbols = getSymbolArray<DLWRAP_ID()>(                          \
+            std::make_index_sequence<DLWRAP_ID()>());                          \
+    return dlwrap_symbols[i];                                                  \
+  }                                                                            \
+  static void **dlwrap::pointer(size_t i) {                                    \
+    static std::array<void *, DLWRAP_ID()> dlwrap_pointers;                    \
+    return &dlwrap_pointers.data()[i];                                         \
+  }
+// Register SYMBOL under the next id and define SYMBOL##_Trait (ARITY unused)
+#define DLWRAP_COMMON(SYMBOL, ARITY)                                           \
+  DLWRAP_INC();                                                                \
+  DLWRAP_SYMBOL(SYMBOL, DLWRAP_ID() - 1);                                      \
+  namespace dlwrap {                                                           \
+  struct SYMBOL##_Trait : public dlwrap::trait<decltype(&SYMBOL)> {            \
+    using T = dlwrap::trait<decltype(&SYMBOL)>;                                \
+    static T::FunctionType get() {                                             \
+      constexpr size_t Index = DLWRAP_ID() - 1;                                \
+      void *P = *dlwrap::pointer(Index);                                       \
+      return reinterpret_cast<T::FunctionType>(P);                             \
+    }                                                                          \
+  };                                                                           \
+  }
+// Exported wrapper named SYMBOL that forwards to the looked-up pointer
+#define DLWRAP_IMPL(SYMBOL, ARITY)                                             \
+  DLWRAP_COMMON(SYMBOL, ARITY);                                                \
+  DLWRAP_INSTANTIATE(SYMBOL, SYMBOL, ARITY)
+// Same forwarding body, but the wrapper is static and named dlwrap_SYMBOL
+#define DLWRAP_INTERNAL_IMPL(SYMBOL, ARITY)                                    \
+  DLWRAP_COMMON(SYMBOL, ARITY);                                                \
+  static DLWRAP_INSTANTIATE(SYMBOL, dlwrap_##SYMBOL, ARITY)
+// DLWRAP_INSTANTIATE_N defines SYM_DEF taking N arguments typed from trait T
+#define DLWRAP_INSTANTIATE_0(SYM_USE, SYM_DEF, T)                              \
+  T::ReturnType SYM_DEF() { return dlwrap::SYM_USE##_Trait::get()(); }
+#define DLWRAP_INSTANTIATE_1(SYM_USE, SYM_DEF, T)                              \
+  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0) {                \
+    return dlwrap::SYM_USE##_Trait::get()(x0);                                 \
+  }
+#define DLWRAP_INSTANTIATE_2(SYM_USE, SYM_DEF, T)                              \
+  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0,                  \
+                        typename T::template arg<1>::type x1) {                \
+    return dlwrap::SYM_USE##_Trait::get()(x0, x1);                             \
+  }
+#define DLWRAP_INSTANTIATE_3(SYM_USE, SYM_DEF, T)                              \
+  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0,                  \
+                        typename T::template arg<1>::type x1,                  \
+                        typename T::template arg<2>::type x2) {                \
+    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2);                         \
+  }
+#define DLWRAP_INSTANTIATE_4(SYM_USE, SYM_DEF, T)                              \
+  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0,                  \
+                        typename T::template arg<1>::type x1,                  \
+                        typename T::template arg<2>::type x2,                  \
+                        typename T::template arg<3>::type x3) {                \
+    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3);                     \
+  }
+#define DLWRAP_INSTANTIATE_5(SYM_USE, SYM_DEF, T)                              \
+  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0,                  \
+                        typename T::template arg<1>::type x1,                  \
+                        typename T::template arg<2>::type x2,                  \
+                        typename T::template arg<3>::type x3,                  \
+                        typename T::template arg<4>::type x4) {                \
+    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4);                 \
+  }
+#define DLWRAP_INSTANTIATE_6(SYM_USE, SYM_DEF, T)                              \
+  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0,                  \
+                        typename T::template arg<1>::type x1,                  \
+                        typename T::template arg<2>::type x2,                  \
+                        typename T::template arg<3>::type x3,                  \
+                        typename T::template arg<4>::type x4,                  \
+                        typename T::template arg<5>::type x5) {                \
+    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5);             \
+  }
+
+#define DLWRAP_INSTANTIATE_7(SYM_USE, SYM_DEF, T)                              \
+  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0,                  \
+                        typename T::template arg<1>::type x1,                  \
+                        typename T::template arg<2>::type x2,                  \
+                        typename T::template arg<3>::type x3,                  \
+                        typename T::template arg<4>::type x4,                  \
+                        typename T::template arg<5>::type x5,                  \
+                        typename T::template arg<6>::type x6) {                \
+    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6);         \
+  }
+
+#define DLWRAP_INSTANTIATE_8(SYM_USE, SYM_DEF, T)                              \
+  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0,                  \
+                        typename T::template arg<1>::type x1,                  \
+                        typename T::template arg<2>::type x2,                  \
+                        typename T::template arg<3>::type x3,                  \
+                        typename T::template arg<4>::type x4,                  \
+                        typename T::template arg<5>::type x5,                  \
+                        typename T::template arg<6>::type x6,                  \
+                        typename T::template arg<7>::type x7) {                \
+    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7);     \
+  }
+#define DLWRAP_INSTANTIATE_9(SYM_USE, SYM_DEF, T)                              \
+  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0,                  \
+                        typename T::template arg<1>::type x1,                  \
+                        typename T::template arg<2>::type x2,                  \
+                        typename T::template arg<3>::type x3,                  \
+                        typename T::template arg<4>::type x4,                  \
+                        typename T::template arg<5>::type x5,                  \
+                        typename T::template arg<6>::type x6,                  \
+                        typename T::template arg<7>::type x7,                  \
+                        typename T::template arg<8>::type x8) {                \
+    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8); \
+  }
+#define DLWRAP_INSTANTIATE_10(SYM_USE, SYM_DEF, T)                             \
+  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0,                  \
+                        typename T::template arg<1>::type x1,                  \
+                        typename T::template arg<2>::type x2,                  \
+                        typename T::template arg<3>::type x3,                  \
+                        typename T::template arg<4>::type x4,                  \
+                        typename T::template arg<5>::type x5,                  \
+                        typename T::template arg<6>::type x6,                  \
+                        typename T::template arg<7>::type x7,                  \
+                        typename T::template arg<8>::type x8,                  \
+                        typename T::template arg<9>::type x9) {                \
+    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8,  \
+                                          x9);                                 \
+  }
+#define DLWRAP_INSTANTIATE_11(SYM_USE, SYM_DEF, T)                             \
+  T::ReturnType SYM_DEF(typename T::template arg<0>::type x0,                  \
+                        typename T::template arg<1>::type x1,                  \
+                        typename T::template arg<2>::type x2,                  \
+                        typename T::template arg<3>::type x3,                  \
+                        typename T::template arg<4>::type x4,                  \
+                        typename T::template arg<5>::type x5,                  \
+                        typename T::template arg<6>::type x6,                  \
+                        typename T::template arg<7>::type x7,                  \
+                        typename T::template arg<8>::type x8,                  \
+                        typename T::template arg<9>::type x9,                  \
+                        typename T::template arg<10>::type x10) {              \
+    return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8,  \
+                                          x9, x10);                            \
+  }
+
+#endif
diff --git a/libomptarget/plugins/cuda/CMakeLists.txt b/libomptarget/plugins/cuda/CMakeLists.txt
index 9388715..e5b2edf 100644
--- a/libomptarget/plugins/cuda/CMakeLists.txt
+++ b/libomptarget/plugins/cuda/CMakeLists.txt
@@ -15,12 +15,6 @@
 elseif (NOT LIBOMPTARGET_DEP_LIBELF_FOUND)
   libomptarget_say("Not building CUDA offloading plugin: libelf dependency not found.")
   return()
-elseif(NOT LIBOMPTARGET_DEP_CUDA_FOUND)
-  libomptarget_say("Not building CUDA offloading plugin: CUDA not found in system.")
-  return()
-elseif(NOT LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND)
-  libomptarget_say("Not building CUDA offloading plugin: CUDA Driver API not found in system.")
-  return()
 endif()
 
 libomptarget_say("Building CUDA offloading plugin.")
@@ -28,10 +22,22 @@
 # Define the suffix for the runtime messaging dumps.
 add_definitions(-DTARGET_NAME=CUDA)
 
-include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS})
 include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS})
 
-add_library(omptarget.rtl.cuda SHARED src/rtl.cpp)
+option(LIBOMPTARGET_DLOPEN_LIBCUDA "Build with dlopened libcuda" OFF)
+# Link libcuda directly only if CUDA was found and dlopen was not requested.
+if (LIBOMPTARGET_DEP_CUDA_FOUND AND LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND
+      AND NOT LIBOMPTARGET_DLOPEN_LIBCUDA)
+  libomptarget_say("Building CUDA plugin linked against libcuda")
+  include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS})
+  add_library(omptarget.rtl.cuda SHARED src/rtl.cpp)
+  set (LIBOMPTARGET_DEP_LIBRARIES ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES})
+else()
+  libomptarget_say("Building CUDA plugin for dlopened libcuda")
+  include_directories(dynamic_cuda)
+  add_library(omptarget.rtl.cuda SHARED src/rtl.cpp dynamic_cuda/cuda.cpp)
+  set (LIBOMPTARGET_DEP_LIBRARIES ${CMAKE_DL_LIBS})
+endif()
 
 # Install plugin under the lib destination folder.
 install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
@@ -39,7 +45,7 @@
 target_link_libraries(omptarget.rtl.cuda
   elf_common
   MemoryManager
-  ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES}
+  ${LIBOMPTARGET_DEP_LIBRARIES}
   ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
   "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
   "-Wl,-z,defs")
diff --git a/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp b/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
new file mode 100644
index 0000000..cc7bc42
--- /dev/null
+++ b/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
@@ -0,0 +1,99 @@
+//===--- cuda/dynamic_cuda/cuda.cpp ------------------------------ C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implement subset of cuda api by calling into cuda library via dlopen
+// Does the dlopen/dlsym calls as part of the call to cuInit
+//
+//===----------------------------------------------------------------------===//
+
+#include "cuda.h"
+#include "Debug.h"
+#include "dlwrap.h"
+
+#include <dlfcn.h>
+// cuInit is wrapped as static dlwrap_cuInit; the exported cuInit is below
+DLWRAP_INTERNAL(cuInit, 1);
+// Exported wrappers; the number is the parameter count of the cuda.h prototype
+DLWRAP(cuCtxGetDevice, 1);
+DLWRAP(cuDeviceGet, 2);
+DLWRAP(cuDeviceGetAttribute, 3);
+DLWRAP(cuDeviceGetCount, 1);
+DLWRAP(cuFuncGetAttribute, 3);
+
+DLWRAP(cuGetErrorString, 2);
+DLWRAP(cuLaunchKernel, 11);
+
+DLWRAP(cuMemAlloc, 2);
+DLWRAP(cuMemcpyDtoDAsync, 4);
+
+DLWRAP(cuMemcpyDtoH, 3);
+DLWRAP(cuMemcpyDtoHAsync, 4);
+DLWRAP(cuMemcpyHtoD, 3);
+DLWRAP(cuMemcpyHtoDAsync, 4);
+
+DLWRAP(cuMemFree, 1);
+DLWRAP(cuModuleGetFunction, 3);
+DLWRAP(cuModuleGetGlobal, 4);
+
+DLWRAP(cuModuleUnload, 1);
+DLWRAP(cuStreamCreate, 2);
+DLWRAP(cuStreamDestroy, 1);
+DLWRAP(cuStreamSynchronize, 1);
+DLWRAP(cuCtxSetCurrent, 1);
+DLWRAP(cuDevicePrimaryCtxRelease, 1);
+DLWRAP(cuDevicePrimaryCtxGetState, 3);
+DLWRAP(cuDevicePrimaryCtxSetFlags, 2);
+DLWRAP(cuDevicePrimaryCtxRetain, 2);
+DLWRAP(cuModuleLoadDataEx, 5);
+
+DLWRAP(cuDeviceCanAccessPeer, 3);
+DLWRAP(cuCtxEnablePeerAccess, 2);
+DLWRAP(cuMemcpyPeerAsync, 6);
+
+DLWRAP_FINALIZE();
+// Library name handed to dlopen; can be overridden on the compiler command line
+#ifndef DYNAMIC_CUDA_PATH
+#define DYNAMIC_CUDA_PATH "libcuda.so"
+#endif
+// NOTE(review): presumably consumed by the DP macro from Debug.h - confirm
+#define TARGET_NAME CUDA
+#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
+// Load the CUDA driver library and fill every dlwrap pointer slot.
+static bool checkForCUDA() {
+  // return true if dlopen succeeded and all functions found
+  // (on failure, slots filled so far are left as they are)
+  const char *CudaLib = DYNAMIC_CUDA_PATH;
+  void *DynlibHandle = dlopen(CudaLib, RTLD_NOW);
+  if (!DynlibHandle) {
+    DP("Unable to load library '%s': %s!\n", CudaLib, dlerror());
+    return false;
+  }
+  // Resolve each registered symbol name and cache its address
+  for (size_t I = 0; I < dlwrap::size(); I++) {
+    const char *Sym = dlwrap::symbol(I);
+
+    void *P = dlsym(DynlibHandle, Sym);
+    if (P == nullptr) {
+      DP("Unable to find '%s' in '%s'!\n", Sym, CudaLib);
+      return false;
+    }
+
+    *dlwrap::pointer(I) = P;
+  }
+  // DynlibHandle is not closed in this function, on success or on failure
+  return true;
+}
+// Exported cuInit: resolve libcuda first, then forward to the real cuInit.
+CUresult cuInit(unsigned X) {
+  // Note: Called exactly once from cuda rtl.cpp in a global constructor so
+  // does not need to handle being called repeatedly or concurrently
+  if (!checkForCUDA()) {
+    return CUDA_ERROR_INVALID_VALUE; // library or one of its symbols missing
+  }
+  return dlwrap_cuInit(X);
+}
diff --git a/libomptarget/plugins/cuda/dynamic_cuda/cuda.h b/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
new file mode 100644
index 0000000..832c269
--- /dev/null
+++ b/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
@@ -0,0 +1,104 @@
+//===--- cuda/dynamic_cuda/cuda.h --------------------------------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The parts of the cuda api that are presently in use by the openmp cuda plugin
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef DYNAMIC_CUDA_CUDA_H_INCLUDED
+#define DYNAMIC_CUDA_CUDA_H_INCLUDED
+
+#include <cstddef>
+#include <cstdint>
+// NOTE(review): these presumably mirror the real cuda.h ABI - confirm widths
+typedef int CUdevice;
+typedef uintptr_t CUdeviceptr;
+typedef struct CUmod_st *CUmodule;
+typedef struct CUctx_st *CUcontext;
+typedef struct CUfunc_st *CUfunction;
+typedef struct CUstream_st *CUstream;
+// Enumerations declare a subset only; NOTE(review): values must match cuda.h
+typedef enum cudaError_enum {
+  CUDA_SUCCESS = 0,
+  CUDA_ERROR_INVALID_VALUE = 1,
+} CUresult;
+
+typedef enum CUstream_flags_enum {
+  CU_STREAM_DEFAULT = 0x0,
+  CU_STREAM_NON_BLOCKING = 0x1,
+} CUstream_flags;
+
+typedef enum CUdevice_attribute_enum {
+  CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,
+  CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,
+  CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,
+} CUdevice_attribute;
+
+typedef enum CUfunction_attribute_enum {
+  CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
+} CUfunction_attribute;
+
+typedef enum CUctx_flags_enum {
+  CU_CTX_SCHED_BLOCKING_SYNC = 0x04,
+  CU_CTX_SCHED_MASK = 0x07,
+} CUctx_flags;
+// These expand before DLWRAP stringizes its argument: dlsym sees the _v2 name
+#define cuMemFree cuMemFree_v2
+#define cuMemAlloc cuMemAlloc_v2
+#define cuMemcpyDtoH cuMemcpyDtoH_v2
+#define cuMemcpyHtoD cuMemcpyHtoD_v2
+#define cuStreamDestroy cuStreamDestroy_v2
+#define cuModuleGetGlobal cuModuleGetGlobal_v2
+#define cuMemcpyDtoHAsync cuMemcpyDtoHAsync_v2
+#define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
+#define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2
+#define cuDevicePrimaryCtxRelease cuDevicePrimaryCtxRelease_v2
+#define cuDevicePrimaryCtxSetFlags cuDevicePrimaryCtxSetFlags_v2
+// Prototypes of the driver entry points that cuda.cpp wraps with DLWRAP
+CUresult cuCtxGetDevice(CUdevice *);
+CUresult cuDeviceGet(CUdevice *, int);
+CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, CUdevice);
+CUresult cuDeviceGetCount(int *);
+CUresult cuFuncGetAttribute(int *, CUfunction_attribute, CUfunction);
+
+CUresult cuGetErrorString(CUresult, const char **);
+CUresult cuInit(unsigned);
+CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned,
+                        unsigned, unsigned, unsigned, CUstream, void **,
+                        void **);
+
+CUresult cuMemAlloc(CUdeviceptr *, size_t);
+CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream);
+
+CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t);
+CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream);
+CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t);
+CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);
+
+CUresult cuMemFree(CUdeviceptr);
+CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *);
+CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *);
+
+CUresult cuModuleUnload(CUmodule);
+CUresult cuStreamCreate(CUstream *, unsigned);
+CUresult cuStreamDestroy(CUstream);
+CUresult cuStreamSynchronize(CUstream);
+CUresult cuCtxSetCurrent(CUcontext);
+CUresult cuDevicePrimaryCtxRelease(CUdevice);
+CUresult cuDevicePrimaryCtxGetState(CUdevice, unsigned *, int *);
+CUresult cuDevicePrimaryCtxSetFlags(CUdevice, unsigned);
+CUresult cuDevicePrimaryCtxRetain(CUcontext *, CUdevice);
+CUresult cuModuleLoadDataEx(CUmodule *, const void *, unsigned, void *,
+                            void **);
+
+CUresult cuDeviceCanAccessPeer(int *, CUdevice, CUdevice);
+CUresult cuCtxEnablePeerAccess(CUcontext, unsigned);
+CUresult cuMemcpyPeerAsync(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext,
+                           size_t, CUstream);
+
+#endif