/* final/tools/GPURuntime/GPUJIT.c - polly - Git at Google */

 /******************** GPUJIT.c - GPUJIT Execution Engine **********************/
 /*                                                                            */
 /*                     The LLVM Compiler Infrastructure                       */
 /*                                                                            */
 /* This file is dual licensed under the MIT and the University of Illinois    */
 /* Open Source License. See LICENSE.TXT for details.                          */
 /*                                                                            */
 /******************************************************************************/
 /*                                                                            */
 /*  This file implements GPUJIT, a ptx string execution engine for GPU.       */
 /*                                                                            */
 /******************************************************************************/

 #include "GPUJIT.h"
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <dlfcn.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <string.h>

 /* Non-zero when POLLY_DEBUG is set (see polly_initContext()); gates
  * debug_print(). */
 static int DebugMode;
 /* Non-zero unless POLLY_NOCACHE is set (see polly_initContext()). */
 static int CacheMode;

 /* printf-style tracing helper: forwards its arguments to stderr when
  * DebugMode is non-zero and does nothing otherwise. */
 static void debug_print(const char *format, ...) {
   if (DebugMode) {
     va_list ap;
     va_start(ap, format);
     vfprintf(stderr, format, ap);
     va_end(ap);
   }
 }
 /* Trace entry into the enclosing function. */
 #define dump_function() debug_print("-> %s\n", __func__)

 /* Define Polly's GPGPU data types. */
 /* Wraps the CUDA driver context handle created in polly_initContext(). */
 struct PollyGPUContextT {
   CUcontext Cuda;
 };

 /* A loaded kernel as produced by polly_getKernel():
  *   Cuda       - function handle obtained via cuModuleGetFunction.
  *   CudaModule - module the function was loaded from; unloaded by
  *                freeKernel().
  *   PTXString  - the PTX buffer pointer the kernel was built from; used as
  *                the cache key in polly_getKernel().
  */
 struct PollyGPUFunctionT {
   CUfunction Cuda;
   CUmodule CudaModule;
   const char *PTXString;
 };

 /* Wraps a device memory address returned by cuMemAlloc. */
 struct PollyGPUDevicePtrT {
   CUdeviceptr Cuda;
 };

 /* Dynamic library handles for the CUDA and CUDA runtime library. */
 /* Both are opened with dlopen() in initialDeviceAPILibraries(). */
 static void *HandleCuda;
 static void *HandleCudaRT;

 /* Type-defines of function pointer to CUDA driver APIs. */
 /* Each entry below pairs a function type with a file-local pointer of that
  * type; the pointers are filled in by initialDeviceAPIs() through dlsym(). */
 typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t);
 static CuMemAllocFcnTy *CuMemAllocFcnPtr;

 typedef CUresult CUDAAPI CuLaunchKernelFcnTy(
     CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
     unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY,
     unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream,
     void **kernelParams, void **extra);
 static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr;

 typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t);
 static CuMemcpyDtoHFcnTy *CuMemcpyDtoHFcnPtr;

 typedef CUresult CUDAAPI CuMemcpyHtoDFcnTy(CUdeviceptr, const void *, size_t);
 static CuMemcpyHtoDFcnTy *CuMemcpyHtoDFcnPtr;

 typedef CUresult CUDAAPI CuMemFreeFcnTy(CUdeviceptr);
 static CuMemFreeFcnTy *CuMemFreeFcnPtr;

 typedef CUresult CUDAAPI CuModuleUnloadFcnTy(CUmodule);
 static CuModuleUnloadFcnTy *CuModuleUnloadFcnPtr;

 typedef CUresult CUDAAPI CuCtxDestroyFcnTy(CUcontext);
 static CuCtxDestroyFcnTy *CuCtxDestroyFcnPtr;

 typedef CUresult CUDAAPI CuInitFcnTy(unsigned int);
 static CuInitFcnTy *CuInitFcnPtr;

 typedef CUresult CUDAAPI CuDeviceGetCountFcnTy(int *);
 static CuDeviceGetCountFcnTy *CuDeviceGetCountFcnPtr;

 typedef CUresult CUDAAPI CuCtxCreateFcnTy(CUcontext *, unsigned int, CUdevice);
 static CuCtxCreateFcnTy *CuCtxCreateFcnPtr;

 typedef CUresult CUDAAPI CuDeviceGetFcnTy(CUdevice *, int);
 static CuDeviceGetFcnTy *CuDeviceGetFcnPtr;

 typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *,
                                                  unsigned int, CUjit_option *,
                                                  void **);
 static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;

 typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *module,
                                                const void *image);
 static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr;

 typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule,
                                                   const char *);
 static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr;

 typedef CUresult CUDAAPI CuDeviceComputeCapabilityFcnTy(int *, int *, CUdevice);
 static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr;

 typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
 static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;

 typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState state,
                                             CUjitInputType type, void *data,
                                             size_t size, const char *name,
                                             unsigned int numOptions,
                                             CUjit_option *options,
                                             void **optionValues);
 static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr;

 typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int numOptions,
                                            CUjit_option *options,
                                            void **optionValues,
                                            CUlinkState *stateOut);
 static CuLinkCreateFcnTy *CuLinkCreateFcnPtr;

 typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState state, void **cubinOut,
                                              size_t *sizeOut);
 static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr;

 typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState state);
 static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;

 /* Type-defines of function pointer to CUDA runtime APIs. */
 typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
 static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;

 /* Look up FuncName in the shared library referred to by Handle.
  *
  * Returns the symbol's address, or 0 (after printing the dlerror() text to
  * stdout) when dlerror() reports a failure for the lookup.
  */
 static void *getAPIHandle(void *Handle, const char *FuncName) {
   /* Discard any stale error so the check below reflects only this dlsym(). */
   dlerror();

   void *Symbol = dlsym(Handle, FuncName);
   char *Message = dlerror();
   if (Message == 0)
     return Symbol;

   fprintf(stdout, "Load CUDA driver API failed: %s. \n", Message);
   return 0;
 }

 /* Open the CUDA driver (libcuda.so) and CUDA runtime (libcudart.so)
  * libraries and store their handles in HandleCuda / HandleCudaRT.
  *
  * Returns 1 on success and 0 when either library cannot be opened; the
  * dlerror() text is printed to stdout in that case. On failure no library
  * handle is left open.
  */
 static int initialDeviceAPILibraries() {
   HandleCuda = dlopen("libcuda.so", RTLD_LAZY);
   if (!HandleCuda) {
     printf("Cannot open library: %s. \n", dlerror());
     return 0;
   }

   HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY);
   if (!HandleCudaRT) {
     /* Report first: dlclose() below may overwrite the dlerror() state. */
     printf("Cannot open library: %s. \n", dlerror());
     /* Do not leak the driver library handle opened above. */
     dlclose(HandleCuda);
     HandleCuda = 0;
     return 0;
   }

   return 1;
 }

 static int initialDeviceAPIs() {
   if (initialDeviceAPILibraries() == 0)
     return 0;

   /* Get function pointer to CUDA Driver APIs.
    *
    * Note that compilers conforming to the ISO C standard are required to
    * generate a warning if a conversion from a void * pointer to a function
    * pointer is attempted as in the following statements. The warning
    * of this kind of cast may not be emitted by clang and new versions of gcc
    * as it is valid on POSIX 2008.
    */
   CuLaunchKernelFcnPtr =
       (CuLaunchKernelFcnTy *)getAPIHandle(HandleCuda, "cuLaunchKernel");

   CuMemAllocFcnPtr =
       (CuMemAllocFcnTy *)getAPIHandle(HandleCuda, "cuMemAlloc_v2");

   CuMemFreeFcnPtr = (CuMemFreeFcnTy *)getAPIHandle(HandleCuda, "cuMemFree_v2");

   CuMemcpyDtoHFcnPtr =
       (CuMemcpyDtoHFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyDtoH_v2");

   CuMemcpyHtoDFcnPtr =
       (CuMemcpyHtoDFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyHtoD_v2");

   CuModuleUnloadFcnPtr =
       (CuModuleUnloadFcnTy *)getAPIHandle(HandleCuda, "cuModuleUnload");

   CuCtxDestroyFcnPtr =
       (CuCtxDestroyFcnTy *)getAPIHandle(HandleCuda, "cuCtxDestroy");

   CuInitFcnPtr = (CuInitFcnTy *)getAPIHandle(HandleCuda, "cuInit");

   CuDeviceGetCountFcnPtr =
       (CuDeviceGetCountFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetCount");

   CuDeviceGetFcnPtr =
       (CuDeviceGetFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGet");

   CuCtxCreateFcnPtr =
       (CuCtxCreateFcnTy *)getAPIHandle(HandleCuda, "cuCtxCreate_v2");

   CuModuleLoadDataExFcnPtr =
       (CuModuleLoadDataExFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadDataEx");

   CuModuleLoadDataFcnPtr =
       (CuModuleLoadDataFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadData");

   CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandle(
       HandleCuda, "cuModuleGetFunction");

   CuDeviceComputeCapabilityFcnPtr =
       (CuDeviceComputeCapabilityFcnTy *)getAPIHandle(
           HandleCuda, "cuDeviceComputeCapability");

   CuDeviceGetNameFcnPtr =
       (CuDeviceGetNameFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetName");

   CuLinkAddDataFcnPtr =
       (CuLinkAddDataFcnTy *)getAPIHandle(HandleCuda, "cuLinkAddData");

   CuLinkCreateFcnPtr =
       (CuLinkCreateFcnTy *)getAPIHandle(HandleCuda, "cuLinkCreate");

   CuLinkCompleteFcnPtr =
       (CuLinkCompleteFcnTy *)getAPIHandle(HandleCuda, "cuLinkComplete");

   CuLinkDestroyFcnPtr =
       (CuLinkDestroyFcnTy *)getAPIHandle(HandleCuda, "cuLinkDestroy");

   /* Get function pointer to CUDA Runtime APIs. */
   CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle(
       HandleCudaRT, "cudaThreadSynchronize");

   return 1;
 }

 /* Create (or reuse) the CUDA context used by Polly's GPU runtime.
  *
  * POLLY_DEBUG is read on every call to set DebugMode. If this thread already
  * has a cached context it is returned directly. Otherwise the CUDA libraries
  * are loaded, the driver is initialized and a context is created on device
  * 0. POLLY_NOCACHE is then consulted: unless it is set, CacheMode is enabled
  * and the new context is cached for later calls from the same thread.
  *
  * Any failure prints a message to stdout and terminates with exit(-1).
  * Returns the heap-allocated context.
  */
 PollyGPUContext *polly_initContext() {
   DebugMode = getenv("POLLY_DEBUG") != 0;

   dump_function();
   PollyGPUContext *Context;
   CUdevice Device;

   int Major = 0, Minor = 0, DeviceID = 0;
   /* Start empty so the debug print below is well-defined even if the name
    * query fails. */
   char DeviceName[256] = "";
   int DeviceCount = 0;

   static __thread PollyGPUContext *CurrentContext = NULL;

   if (CurrentContext)
     return CurrentContext;

   /* Get API handles. */
   if (initialDeviceAPIs() == 0) {
     fprintf(stdout, "Getting the \"handle\" for the CUDA driver API failed.\n");
     exit(-1);
   }

   if (CuInitFcnPtr(0) != CUDA_SUCCESS) {
     fprintf(stdout, "Initializing the CUDA driver API failed.\n");
     exit(-1);
   }

   /* Get number of devices that supports CUDA. */
   if (CuDeviceGetCountFcnPtr(&DeviceCount) != CUDA_SUCCESS ||
       DeviceCount == 0) {
     fprintf(stdout, "There is no device supporting CUDA.\n");
     exit(-1);
   }

   if (CuDeviceGetFcnPtr(&Device, DeviceID) != CUDA_SUCCESS) {
     fprintf(stdout, "Getting a handle for CUDA device %d failed.\n", DeviceID);
     exit(-1);
   }

   /* Get compute capabilities and the device name. These are informational
    * only, so their results are not treated as fatal. */
   CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, Device);
   CuDeviceGetNameFcnPtr(DeviceName, (int)sizeof(DeviceName), Device);
   debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName);

   /* Create context on the device. */
   Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
   if (Context == 0) {
     fprintf(stdout, "Allocate memory for Polly GPU context failed.\n");
     exit(-1);
   }
   if (CuCtxCreateFcnPtr(&(Context->Cuda), 0, Device) != CUDA_SUCCESS) {
     fprintf(stdout, "Creating a CUDA context on the device failed.\n");
     exit(-1);
   }

   CacheMode = getenv("POLLY_NOCACHE") == 0;

   if (CacheMode)
     CurrentContext = Context;

   return Context;
 }

 /* Unload the kernel's CUDA module (if any) and free the kernel object.
  * A null Kernel is accepted and ignored. */
 static void freeKernel(PollyGPUFunction *Kernel) {
   /* Check for null before touching any field. */
   if (!Kernel)
     return;

   if (Kernel->CudaModule)
     CuModuleUnloadFcnPtr(Kernel->CudaModule);

   free(Kernel);
 }

 /* Number of kernels remembered per thread by polly_getKernel(). */
 #define KERNEL_CACHE_SIZE 10

 /* Build (or fetch from the per-thread cache) the kernel KernelName from the
  * PTX text in PTXBuffer.
  *
  * The PTX is linked with the cuLink* API, the result is loaded as a module
  * and the function KernelName is looked up in it. Any failure prints a
  * message to stdout and terminates the process with exit(-1).
  *
  * The cache lookup compares only the PTXBuffer pointer; KernelName is not
  * part of the key. When CacheMode is non-zero the new kernel is stored in
  * the next cache slot (round-robin) and a kernel previously held in that
  * slot is freed.
  *
  * Returns a heap-allocated PollyGPUFunction; release it with
  * polly_freeKernel().
  */
 PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
                                   const char *KernelName) {
   dump_function();

   static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
   static __thread int NextCacheItem = 0;

   for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
     // We exploit here the property that all Polly-ACC kernels are allocated
     // as global constants, hence a pointer comparison is sufficient to
     // determine equality.
     if (KernelCache[i] && KernelCache[i]->PTXString == PTXBuffer) {
       debug_print("  -> using cached kernel\n");
       return KernelCache[i];
     }
   }

   PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));

   if (Function == 0) {
     fprintf(stdout, "Allocate memory for Polly GPU function failed.\n");
     exit(-1);
   }

   CUresult Res;
   CUlinkState LState;
   CUjit_option Options[6];
   void *OptionVals[6];
   float Walltime = 0;
   // Both logs start zero-filled: they are printed on the error paths below
   // even if the linker never wrote to them.
   char ErrorLog[8192] = "", InfoLog[8192] = "";
   unsigned long LogSize = sizeof(ErrorLog);
   void *CuOut;
   size_t OutSize;

   // Setup linker options
   // Return walltime from JIT compilation
   Options[0] = CU_JIT_WALL_TIME;
   OptionVals[0] = (void *)&Walltime;
   // Pass a buffer for info messages
   Options[1] = CU_JIT_INFO_LOG_BUFFER;
   OptionVals[1] = (void *)InfoLog;
   // Pass the size of the info buffer
   Options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
   OptionVals[2] = (void *)LogSize;
   // Pass a buffer for error message
   Options[3] = CU_JIT_ERROR_LOG_BUFFER;
   OptionVals[3] = (void *)ErrorLog;
   // Pass the size of the error buffer
   Options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
   OptionVals[4] = (void *)LogSize;
   // Make the linker verbose
   Options[5] = CU_JIT_LOG_VERBOSE;
   OptionVals[5] = (void *)1;

   Res = CuLinkCreateFcnPtr(6, Options, OptionVals, &LState);
   if (Res != CUDA_SUCCESS) {
     fprintf(stdout, "Creating the ptx linker failed.\n%s\n", ErrorLog);
     exit(-1);
   }

   Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)PTXBuffer,
                             strlen(PTXBuffer) + 1, 0, 0, 0, 0);
   if (Res != CUDA_SUCCESS) {
     fprintf(stdout, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
     exit(-1);
   }

   Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize);
   if (Res != CUDA_SUCCESS) {
     fprintf(stdout, "Complete ptx linker step failed.\n");
     fprintf(stdout, "\n%s\n", ErrorLog);
     exit(-1);
   }

   debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime,
               InfoLog);

   Res = CuModuleLoadDataFcnPtr(&(Function->CudaModule), CuOut);
   if (Res != CUDA_SUCCESS) {
     fprintf(stdout, "Loading ptx assembly text failed.\n");
     exit(-1);
   }

   Res = CuModuleGetFunctionFcnPtr(&(Function->Cuda), Function->CudaModule,
                                   KernelName);
   if (Res != CUDA_SUCCESS) {
     fprintf(stdout, "Loading kernel function failed.\n");
     exit(-1);
   }

   // The link state is only destroyed after the module has been loaded from
   // the linker output.
   CuLinkDestroyFcnPtr(LState);

   Function->PTXString = PTXBuffer;

   if (CacheMode) {
     if (KernelCache[NextCacheItem])
       freeKernel(KernelCache[NextCacheItem]);

     KernelCache[NextCacheItem] = Function;

     NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
   }

   return Function;
 }

 /* Release a kernel obtained from polly_getKernel().
  *
  * While CacheMode is non-zero this does nothing; otherwise the kernel is
  * handed to freeKernel().
  */
 void polly_freeKernel(PollyGPUFunction *Kernel) {
   dump_function();

   if (!CacheMode)
     freeKernel(Kernel);
 }

 /* Copy MemSize bytes from HostData to the device allocation DevData.
  *
  * On failure a message is printed to stdout and the process exits with -1,
  * matching polly_copyFromDeviceToHost().
  */
 void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
                                 long MemSize) {
   dump_function();

   CUdeviceptr CuDevData = DevData->Cuda;
   if (CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize) != CUDA_SUCCESS) {
     fprintf(stdout, "Copying data from host memory to device failed.\n");
     exit(-1);
   }
 }

 /* Copy MemSize bytes from the device allocation DevData into HostData.
  *
  * On failure a message is printed to stdout and the process exits with -1.
  */
 void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
                                 long MemSize) {
   dump_function();

   CUresult Status = CuMemcpyDtoHFcnPtr(HostData, DevData->Cuda, MemSize);
   if (Status == CUDA_SUCCESS)
     return;

   fprintf(stdout, "Copying results from device to host memory failed.\n");
   exit(-1);
 }

 /* Launch Kernel on a GridDimX x GridDimY x 1 grid of
  * BlockDimX x BlockDimY x BlockDimZ thread blocks.
  *
  * Parameters is forwarded as the kernelParams argument of cuLaunchKernel.
  * The launch uses stream 0, no "extra" options and no dynamic shared
  * memory. On failure a message is printed to stdout and the process exits
  * with -1.
  */
 void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
                         unsigned int GridDimY, unsigned int BlockDimX,
                         unsigned int BlockDimY, unsigned int BlockDimZ,
                         void **Parameters) {
   dump_function();

   unsigned GridDimZ = 1;
   /* sharedMemBytes is a byte count, not a CUsharedconfig value: request no
    * dynamic shared memory explicitly. */
   unsigned int SharedMemBytes = 0;
   CUstream Stream = 0;
   void **Extra = 0;

   CUresult Res;
   Res = CuLaunchKernelFcnPtr(Kernel->Cuda, GridDimX, GridDimY, GridDimZ,
                              BlockDimX, BlockDimY, BlockDimZ, SharedMemBytes,
                              Stream, Parameters, Extra);
   if (Res != CUDA_SUCCESS) {
     fprintf(stdout, "Launching CUDA kernel failed.\n");
     exit(-1);
   }
 }

 /* Free the device memory referenced by Allocation and then the Allocation
  * wrapper itself. The result of the device free is not inspected. */
 void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
   dump_function();

   CUdeviceptr DevicePtr = Allocation->Cuda;
   CuMemFreeFcnPtr(DevicePtr);

   free(Allocation);
 }

 /* Allocate MemSize bytes of device memory.
  *
  * Returns a heap-allocated wrapper holding the device address; release it
  * with polly_freeDeviceMemory(). If either the host-side wrapper or the
  * device memory cannot be allocated, a message is printed to stdout and
  * the process exits with -1.
  */
 PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
   dump_function();

   PollyGPUDevicePtr *Result = malloc(sizeof(PollyGPUDevicePtr));
   if (!Result) {
     fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n");
     exit(-1);
   }

   if (CuMemAllocFcnPtr(&(Result->Cuda), MemSize) != CUDA_SUCCESS) {
     fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n");
     exit(-1);
   }

   return Result;
 }

 /* Return the device address stored in Allocation, converted to void *. */
 void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) {
   dump_function();

   CUdeviceptr DevicePtr = Allocation->Cuda;
   return (void *)DevicePtr;
 }

 /* Destroy Context and close the CUDA libraries.
  *
  * While CacheMode is non-zero this does nothing. Otherwise the CUDA context
  * (if one was created) is destroyed, the Context object is freed and both
  * library handles are closed. A null Context is accepted.
  */
 void polly_freeContext(PollyGPUContext *Context) {
   dump_function();

   if (CacheMode)
     return;

   if (Context) {
     if (Context->Cuda)
       CuCtxDestroyFcnPtr(Context->Cuda);
     /* Free the wrapper even when no CUDA context is attached to it. */
     free(Context);
   }

   dlclose(HandleCuda);
   dlclose(HandleCudaRT);
 }
	/****************** GPUJIT.c - GPUJIT Execution Engine ********************/
	/* */
	/* The LLVM Compiler Infrastructure */
	/* */
	/* This file is dual licensed under the MIT and the University of Illinois */
	/* Open Source License. See LICENSE.TXT for details. */
	/* */
	/******************************************************************************/
	/* */
	/* This file implements GPUJIT, a ptx string execution engine for GPU. */
	/* */
	/******************************************************************************/

	#include "GPUJIT.h"
	#include <cuda.h>
	#include <cuda_runtime.h>
	#include <dlfcn.h>
	#include <stdarg.h>
	#include <stdio.h>
	#include <string.h>

	/* Non-zero when POLLY_DEBUG is set (see polly_initContext()); gates
	 * debug_print(). */
	static int DebugMode;
	/* Non-zero unless POLLY_NOCACHE is set (see polly_initContext()). */
	static int CacheMode;

	/* printf-style tracing helper: forwards its arguments to stderr when
	 * DebugMode is non-zero and does nothing otherwise. */
	static void debug_print(const char *format, ...) {
	  if (DebugMode) {
	    va_list ap;
	    va_start(ap, format);
	    vfprintf(stderr, format, ap);
	    va_end(ap);
	  }
	}
	/* Trace entry into the enclosing function. */
	#define dump_function() debug_print("-> %s\n", __func__)

	/* Define Polly's GPGPU data types. */
	/* Wraps the CUDA driver context handle created in polly_initContext(). */
	struct PollyGPUContextT {
	CUcontext Cuda;
	};

	/* A loaded kernel as produced by polly_getKernel():
	 *   Cuda       - function handle obtained via cuModuleGetFunction.
	 *   CudaModule - module the function was loaded from; unloaded by
	 *                freeKernel().
	 *   PTXString  - the PTX buffer pointer the kernel was built from; used as
	 *                the cache key in polly_getKernel().
	 */
	struct PollyGPUFunctionT {
	CUfunction Cuda;
	CUmodule CudaModule;
	const char *PTXString;
	};

	/* Wraps a device memory address returned by cuMemAlloc. */
	struct PollyGPUDevicePtrT {
	CUdeviceptr Cuda;
	};

	/* Dynamic library handles for the CUDA and CUDA runtime library. */
	/* Both are opened with dlopen() in initialDeviceAPILibraries(). */
	static void *HandleCuda;
	static void *HandleCudaRT;

	/* Type-defines of function pointer to CUDA driver APIs. */
	/* Each entry below pairs a function type with a file-local pointer of that
	 * type; the pointers are filled in by initialDeviceAPIs() through dlsym(). */
	typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t);
	static CuMemAllocFcnTy *CuMemAllocFcnPtr;

	/* kernelParams and extra are arrays of pointers (void **). */
	typedef CUresult CUDAAPI CuLaunchKernelFcnTy(
	    CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
	    unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY,
	    unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream,
	    void **kernelParams, void **extra);
	static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr;

	typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t);
	static CuMemcpyDtoHFcnTy *CuMemcpyDtoHFcnPtr;

	typedef CUresult CUDAAPI CuMemcpyHtoDFcnTy(CUdeviceptr, const void *, size_t);
	static CuMemcpyHtoDFcnTy *CuMemcpyHtoDFcnPtr;

	typedef CUresult CUDAAPI CuMemFreeFcnTy(CUdeviceptr);
	static CuMemFreeFcnTy *CuMemFreeFcnPtr;

	typedef CUresult CUDAAPI CuModuleUnloadFcnTy(CUmodule);
	static CuModuleUnloadFcnTy *CuModuleUnloadFcnPtr;

	typedef CUresult CUDAAPI CuCtxDestroyFcnTy(CUcontext);
	static CuCtxDestroyFcnTy *CuCtxDestroyFcnPtr;

	typedef CUresult CUDAAPI CuInitFcnTy(unsigned int);
	static CuInitFcnTy *CuInitFcnPtr;

	typedef CUresult CUDAAPI CuDeviceGetCountFcnTy(int *);
	static CuDeviceGetCountFcnTy *CuDeviceGetCountFcnPtr;

	typedef CUresult CUDAAPI CuCtxCreateFcnTy(CUcontext *, unsigned int, CUdevice);
	static CuCtxCreateFcnTy *CuCtxCreateFcnPtr;

	typedef CUresult CUDAAPI CuDeviceGetFcnTy(CUdevice *, int);
	static CuDeviceGetFcnTy *CuDeviceGetFcnPtr;

	typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *,
	                                                 unsigned int, CUjit_option *,
	                                                 void **);
	static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;

	typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *module,
	                                               const void *image);
	static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr;

	typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule,
	                                                  const char *);
	static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr;

	typedef CUresult CUDAAPI CuDeviceComputeCapabilityFcnTy(int *, int *, CUdevice);
	static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr;

	typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
	static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;

	typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState state,
	                                            CUjitInputType type, void *data,
	                                            size_t size, const char *name,
	                                            unsigned int numOptions,
	                                            CUjit_option *options,
	                                            void **optionValues);
	static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr;

	typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int numOptions,
	                                           CUjit_option *options,
	                                           void **optionValues,
	                                           CUlinkState *stateOut);
	static CuLinkCreateFcnTy *CuLinkCreateFcnPtr;

	typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState state, void **cubinOut,
	                                             size_t *sizeOut);
	static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr;

	typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState state);
	static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;

	/* Type-defines of function pointer to CUDA runtime APIs. */
	typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
	static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;

	/* Look up FuncName in the shared library referred to by Handle.
	 *
	 * Returns the symbol's address, or 0 (after printing the dlerror() text to
	 * stdout) when dlerror() reports a failure for the lookup.
	 */
	static void *getAPIHandle(void *Handle, const char *FuncName) {
	  char *Err;
	  void *FuncPtr;
	  /* Discard any stale error so the check below reflects only this dlsym(). */
	  dlerror();
	  FuncPtr = dlsym(Handle, FuncName);
	  if ((Err = dlerror()) != 0) {
	    fprintf(stdout, "Load CUDA driver API failed: %s. \n", Err);
	    return 0;
	  }
	  return FuncPtr;
	}

	/* Open the CUDA driver (libcuda.so) and CUDA runtime (libcudart.so)
	 * libraries and store their handles in HandleCuda / HandleCudaRT.
	 *
	 * Returns 1 on success and 0 when either library cannot be opened; the
	 * dlerror() text is printed to stdout in that case. On failure no library
	 * handle is left open.
	 */
	static int initialDeviceAPILibraries() {
	  HandleCuda = dlopen("libcuda.so", RTLD_LAZY);
	  if (!HandleCuda) {
	    printf("Cannot open library: %s. \n", dlerror());
	    return 0;
	  }

	  HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY);
	  if (!HandleCudaRT) {
	    /* Report first: dlclose() below may overwrite the dlerror() state. */
	    printf("Cannot open library: %s. \n", dlerror());
	    /* Do not leak the driver library handle opened above. */
	    dlclose(HandleCuda);
	    HandleCuda = 0;
	    return 0;
	  }

	  return 1;
	}

	static int initialDeviceAPIs() {
	if (initialDeviceAPILibraries() == 0)
	return 0;

	/* Get function pointer to CUDA Driver APIs.
	*
	* Note that compilers conforming to the ISO C standard are required to
	* generate a warning if a conversion from a void * pointer to a function
	* pointer is attempted as in the following statements. The warning
	* of this kind of cast may not be emitted by clang and new versions of gcc
	* as it is valid on POSIX 2008.
	*/
	CuLaunchKernelFcnPtr =
	(CuLaunchKernelFcnTy *)getAPIHandle(HandleCuda, "cuLaunchKernel");

	CuMemAllocFcnPtr =
	(CuMemAllocFcnTy *)getAPIHandle(HandleCuda, "cuMemAlloc_v2");

	CuMemFreeFcnPtr = (CuMemFreeFcnTy *)getAPIHandle(HandleCuda, "cuMemFree_v2");

	CuMemcpyDtoHFcnPtr =
	(CuMemcpyDtoHFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyDtoH_v2");

	CuMemcpyHtoDFcnPtr =
	(CuMemcpyHtoDFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyHtoD_v2");

	CuModuleUnloadFcnPtr =
	(CuModuleUnloadFcnTy *)getAPIHandle(HandleCuda, "cuModuleUnload");

	CuCtxDestroyFcnPtr =
	(CuCtxDestroyFcnTy *)getAPIHandle(HandleCuda, "cuCtxDestroy");

	CuInitFcnPtr = (CuInitFcnTy *)getAPIHandle(HandleCuda, "cuInit");

	CuDeviceGetCountFcnPtr =
	(CuDeviceGetCountFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetCount");

	CuDeviceGetFcnPtr =
	(CuDeviceGetFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGet");

	CuCtxCreateFcnPtr =
	(CuCtxCreateFcnTy *)getAPIHandle(HandleCuda, "cuCtxCreate_v2");

	CuModuleLoadDataExFcnPtr =
	(CuModuleLoadDataExFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadDataEx");

	CuModuleLoadDataFcnPtr =
	(CuModuleLoadDataFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadData");

	CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandle(
	HandleCuda, "cuModuleGetFunction");

	CuDeviceComputeCapabilityFcnPtr =
	(CuDeviceComputeCapabilityFcnTy *)getAPIHandle(
	HandleCuda, "cuDeviceComputeCapability");

	CuDeviceGetNameFcnPtr =
	(CuDeviceGetNameFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetName");

	CuLinkAddDataFcnPtr =
	(CuLinkAddDataFcnTy *)getAPIHandle(HandleCuda, "cuLinkAddData");

	CuLinkCreateFcnPtr =
	(CuLinkCreateFcnTy *)getAPIHandle(HandleCuda, "cuLinkCreate");

	CuLinkCompleteFcnPtr =
	(CuLinkCompleteFcnTy *)getAPIHandle(HandleCuda, "cuLinkComplete");

	CuLinkDestroyFcnPtr =
	(CuLinkDestroyFcnTy *)getAPIHandle(HandleCuda, "cuLinkDestroy");

	/* Get function pointer to CUDA Runtime APIs. */
	CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle(
	HandleCudaRT, "cudaThreadSynchronize");

	return 1;
	}

	/* Create (or reuse) the CUDA context used by Polly's GPU runtime.
	 *
	 * POLLY_DEBUG is read on every call to set DebugMode. If this thread already
	 * has a cached context it is returned directly. Otherwise the CUDA libraries
	 * are loaded, the driver is initialized and a context is created on device
	 * 0. POLLY_NOCACHE is then consulted: unless it is set, CacheMode is enabled
	 * and the new context is cached for later calls from the same thread.
	 *
	 * Any failure prints a message to stdout and terminates with exit(-1).
	 * Returns the heap-allocated context.
	 */
	PollyGPUContext *polly_initContext() {
	  DebugMode = getenv("POLLY_DEBUG") != 0;

	  dump_function();
	  PollyGPUContext *Context;
	  CUdevice Device;

	  int Major = 0, Minor = 0, DeviceID = 0;
	  /* Start empty so the debug print below is well-defined even if the name
	   * query fails. */
	  char DeviceName[256] = "";
	  int DeviceCount = 0;

	  static __thread PollyGPUContext *CurrentContext = NULL;

	  if (CurrentContext)
	    return CurrentContext;

	  /* Get API handles. */
	  if (initialDeviceAPIs() == 0) {
	    fprintf(stdout, "Getting the \"handle\" for the CUDA driver API failed.\n");
	    exit(-1);
	  }

	  if (CuInitFcnPtr(0) != CUDA_SUCCESS) {
	    fprintf(stdout, "Initializing the CUDA driver API failed.\n");
	    exit(-1);
	  }

	  /* Get number of devices that supports CUDA. */
	  if (CuDeviceGetCountFcnPtr(&DeviceCount) != CUDA_SUCCESS ||
	      DeviceCount == 0) {
	    fprintf(stdout, "There is no device supporting CUDA.\n");
	    exit(-1);
	  }

	  if (CuDeviceGetFcnPtr(&Device, DeviceID) != CUDA_SUCCESS) {
	    fprintf(stdout, "Getting a handle for CUDA device %d failed.\n", DeviceID);
	    exit(-1);
	  }

	  /* Get compute capabilities and the device name. These are informational
	   * only, so their results are not treated as fatal. */
	  CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, Device);
	  CuDeviceGetNameFcnPtr(DeviceName, (int)sizeof(DeviceName), Device);
	  debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName);

	  /* Create context on the device. */
	  Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
	  if (Context == 0) {
	    fprintf(stdout, "Allocate memory for Polly GPU context failed.\n");
	    exit(-1);
	  }
	  if (CuCtxCreateFcnPtr(&(Context->Cuda), 0, Device) != CUDA_SUCCESS) {
	    fprintf(stdout, "Creating a CUDA context on the device failed.\n");
	    exit(-1);
	  }

	  CacheMode = getenv("POLLY_NOCACHE") == 0;

	  if (CacheMode)
	    CurrentContext = Context;

	  return Context;
	}

	/* Unload the kernel's CUDA module (if any) and free the kernel object.
	 * A null Kernel is accepted and ignored. */
	static void freeKernel(PollyGPUFunction *Kernel) {
	  /* Check for null before touching any field. */
	  if (!Kernel)
	    return;

	  if (Kernel->CudaModule)
	    CuModuleUnloadFcnPtr(Kernel->CudaModule);

	  free(Kernel);
	}

	/* Number of kernels remembered per thread by polly_getKernel(). */
	#define KERNEL_CACHE_SIZE 10

	/* Build (or fetch from the per-thread cache) the kernel KernelName from the
	 * PTX text in PTXBuffer.
	 *
	 * The PTX is linked with the cuLink* API, the result is loaded as a module
	 * and the function KernelName is looked up in it. Any failure prints a
	 * message to stdout and terminates the process with exit(-1).
	 *
	 * The cache lookup compares only the PTXBuffer pointer; KernelName is not
	 * part of the key. When CacheMode is non-zero the new kernel is stored in
	 * the next cache slot (round-robin) and a kernel previously held in that
	 * slot is freed.
	 *
	 * Returns a heap-allocated PollyGPUFunction; release it with
	 * polly_freeKernel().
	 */
	PollyGPUFunction *polly_getKernel(const char *PTXBuffer,
	                                  const char *KernelName) {
	  dump_function();

	  static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
	  static __thread int NextCacheItem = 0;

	  for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
	    // We exploit here the property that all Polly-ACC kernels are allocated
	    // as global constants, hence a pointer comparison is sufficient to
	    // determine equality.
	    if (KernelCache[i] && KernelCache[i]->PTXString == PTXBuffer) {
	      debug_print("  -> using cached kernel\n");
	      return KernelCache[i];
	    }
	  }

	  PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));

	  if (Function == 0) {
	    fprintf(stdout, "Allocate memory for Polly GPU function failed.\n");
	    exit(-1);
	  }

	  CUresult Res;
	  CUlinkState LState;
	  CUjit_option Options[6];
	  void *OptionVals[6];
	  float Walltime = 0;
	  // Both logs start zero-filled: they are printed on the error paths below
	  // even if the linker never wrote to them.
	  char ErrorLog[8192] = "", InfoLog[8192] = "";
	  unsigned long LogSize = sizeof(ErrorLog);
	  void *CuOut;
	  size_t OutSize;

	  // Setup linker options
	  // Return walltime from JIT compilation
	  Options[0] = CU_JIT_WALL_TIME;
	  OptionVals[0] = (void *)&Walltime;
	  // Pass a buffer for info messages
	  Options[1] = CU_JIT_INFO_LOG_BUFFER;
	  OptionVals[1] = (void *)InfoLog;
	  // Pass the size of the info buffer
	  Options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
	  OptionVals[2] = (void *)LogSize;
	  // Pass a buffer for error message
	  Options[3] = CU_JIT_ERROR_LOG_BUFFER;
	  OptionVals[3] = (void *)ErrorLog;
	  // Pass the size of the error buffer
	  Options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
	  OptionVals[4] = (void *)LogSize;
	  // Make the linker verbose
	  Options[5] = CU_JIT_LOG_VERBOSE;
	  OptionVals[5] = (void *)1;

	  Res = CuLinkCreateFcnPtr(6, Options, OptionVals, &LState);
	  if (Res != CUDA_SUCCESS) {
	    fprintf(stdout, "Creating the ptx linker failed.\n%s\n", ErrorLog);
	    exit(-1);
	  }

	  Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)PTXBuffer,
	                            strlen(PTXBuffer) + 1, 0, 0, 0, 0);
	  if (Res != CUDA_SUCCESS) {
	    fprintf(stdout, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
	    exit(-1);
	  }

	  Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize);
	  if (Res != CUDA_SUCCESS) {
	    fprintf(stdout, "Complete ptx linker step failed.\n");
	    fprintf(stdout, "\n%s\n", ErrorLog);
	    exit(-1);
	  }

	  debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime,
	              InfoLog);

	  Res = CuModuleLoadDataFcnPtr(&(Function->CudaModule), CuOut);
	  if (Res != CUDA_SUCCESS) {
	    fprintf(stdout, "Loading ptx assembly text failed.\n");
	    exit(-1);
	  }

	  Res = CuModuleGetFunctionFcnPtr(&(Function->Cuda), Function->CudaModule,
	                                  KernelName);
	  if (Res != CUDA_SUCCESS) {
	    fprintf(stdout, "Loading kernel function failed.\n");
	    exit(-1);
	  }

	  // The link state is only destroyed after the module has been loaded from
	  // the linker output.
	  CuLinkDestroyFcnPtr(LState);

	  Function->PTXString = PTXBuffer;

	  if (CacheMode) {
	    if (KernelCache[NextCacheItem])
	      freeKernel(KernelCache[NextCacheItem]);

	    KernelCache[NextCacheItem] = Function;

	    NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
	  }

	  return Function;
	}

	/* Release a kernel obtained from polly_getKernel().
	 *
	 * While CacheMode is non-zero this does nothing; otherwise the kernel is
	 * handed to freeKernel().
	 */
	void polly_freeKernel(PollyGPUFunction *Kernel) {
	  dump_function();

	  if (!CacheMode)
	    freeKernel(Kernel);
	}

	/* Copy MemSize bytes from HostData to the device allocation DevData.
	 *
	 * On failure a message is printed to stdout and the process exits with -1,
	 * matching polly_copyFromDeviceToHost().
	 */
	void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
	                                long MemSize) {
	  dump_function();

	  CUdeviceptr CuDevData = DevData->Cuda;
	  if (CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize) != CUDA_SUCCESS) {
	    fprintf(stdout, "Copying data from host memory to device failed.\n");
	    exit(-1);
	  }
	}

	/* Copy MemSize bytes from the device allocation DevData into HostData.
	 *
	 * On failure a message is printed to stdout and the process exits with -1.
	 */
	void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
	                                long MemSize) {
	  dump_function();

	  if (CuMemcpyDtoHFcnPtr(HostData, DevData->Cuda, MemSize) != CUDA_SUCCESS) {
	    fprintf(stdout, "Copying results from device to host memory failed.\n");
	    exit(-1);
	  }
	}

	/* Launch Kernel on a GridDimX x GridDimY x 1 grid of
	 * BlockDimX x BlockDimY x BlockDimZ thread blocks.
	 *
	 * Parameters is forwarded as the kernelParams argument of cuLaunchKernel.
	 * The launch uses stream 0, no "extra" options and no dynamic shared
	 * memory. On failure a message is printed to stdout and the process exits
	 * with -1.
	 */
	void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
	                        unsigned int GridDimY, unsigned int BlockDimX,
	                        unsigned int BlockDimY, unsigned int BlockDimZ,
	                        void **Parameters) {
	  dump_function();

	  unsigned GridDimZ = 1;
	  /* sharedMemBytes is a byte count, not a CUsharedconfig value: request no
	   * dynamic shared memory explicitly. */
	  unsigned int SharedMemBytes = 0;
	  CUstream Stream = 0;
	  void **Extra = 0;

	  CUresult Res;
	  Res = CuLaunchKernelFcnPtr(Kernel->Cuda, GridDimX, GridDimY, GridDimZ,
	                             BlockDimX, BlockDimY, BlockDimZ, SharedMemBytes,
	                             Stream, Parameters, Extra);
	  if (Res != CUDA_SUCCESS) {
	    fprintf(stdout, "Launching CUDA kernel failed.\n");
	    exit(-1);
	  }
	}

	/* Free the device memory referenced by Allocation and then the Allocation
	 * wrapper itself. The result of the device free is not inspected. */
	void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
	  dump_function();

	  CUdeviceptr DevicePtr = Allocation->Cuda;
	  CuMemFreeFcnPtr(DevicePtr);

	  free(Allocation);
	}

	/* Allocate MemSize bytes of device memory.
	 *
	 * Returns a heap-allocated wrapper holding the device address; release it
	 * with polly_freeDeviceMemory(). If either the host-side wrapper or the
	 * device memory cannot be allocated, a message is printed to stdout and
	 * the process exits with -1.
	 */
	PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
	  dump_function();

	  PollyGPUDevicePtr *Result = malloc(sizeof(PollyGPUDevicePtr));
	  if (!Result) {
	    fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n");
	    exit(-1);
	  }

	  if (CuMemAllocFcnPtr(&(Result->Cuda), MemSize) != CUDA_SUCCESS) {
	    fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n");
	    exit(-1);
	  }

	  return Result;
	}

	/* Return the device address stored in Allocation, converted to void *. */
	void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) {
	  dump_function();

	  return (void *)Allocation->Cuda;
	}

	/* Destroy Context and close the CUDA libraries.
	 *
	 * While CacheMode is non-zero this does nothing. Otherwise the CUDA context
	 * (if one was created) is destroyed, the Context object is freed and both
	 * library handles are closed. A null Context is accepted.
	 */
	void polly_freeContext(PollyGPUContext *Context) {
	  dump_function();

	  if (CacheMode)
	    return;

	  if (Context) {
	    if (Context->Cuda)
	      CuCtxDestroyFcnPtr(Context->Cuda);
	    /* Free the wrapper even when no CUDA context is attached to it. */
	    free(Context);
	  }

	  dlclose(HandleCuda);
	  dlclose(HandleCudaRT);
	}