polly/tools/GPURuntime/GPUJIT.c - llvm-project - Git at Google

 /******************** GPUJIT.c - GPUJIT Execution Engine **********************/
 /*                                                                            */
 /* Part of the LLVM Project, under the Apache License v2.0 with LLVM          */
 /* Exceptions.                                                                */
 /* See https://llvm.org/LICENSE.txt for license information.                  */
 /* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception                    */
 /*                                                                            */
 /******************************************************************************/
 /*                                                                            */
 /*  This file implements GPUJIT, a ptx string execution engine for GPU.       */
 /*                                                                            */
 /******************************************************************************/

 #include "GPUJIT.h"

 #ifdef HAS_LIBCUDART
 #include <cuda.h>
 #include <cuda_runtime.h>
 #endif /* HAS_LIBCUDART */

 #ifdef HAS_LIBOPENCL
 #ifdef __APPLE__
 #include <OpenCL/opencl.h>
 #else
 #include <CL/cl.h>
 #endif /* __APPLE__ */
 #endif /* HAS_LIBOPENCL */

 #include <assert.h>
 #include <dlfcn.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>

 /* When zero, debug_print() produces no output. */
 static int DebugMode;
 /* When non-zero, the OpenCL code below keeps the context and built kernels
  * around for reuse instead of releasing them (see the CacheMode checks in
  * initContextCL, getKernelCL and freeKernelCL). */
 static int CacheMode;
 /* Larger of two values. Function-like macro: the selected argument is
  * evaluated twice, so do not pass expressions with side effects. */
 #define max(x, y) ((x) > (y) ? (x) : (y))

 /* Selected GPU runtime; starts out as RUNTIME_NONE.
  * NOTE(review): presumably assigned during initialization elsewhere in this
  * file - not visible here. */
 static PollyGPURuntime Runtime = RUNTIME_NONE;

 /* printf-style diagnostics on stderr; silent unless DebugMode is non-zero. */
 static void debug_print(const char *format, ...) {
   if (DebugMode) {
     va_list ap;
     va_start(ap, format);
     vfprintf(stderr, format, ap);
     va_end(ap);
   }
 }
 /* Trace helper: passes the name of the enclosing function to debug_print(). */
 #define dump_function() debug_print("-> %s\n", __func__)

 /* Number of slots in the per-thread kernel cache used by getKernelCL(). */
 #define KERNEL_CACHE_SIZE 10

 /* Print a fixed diagnostic to stderr and terminate the process with exit
  * status -1. Never returns. */
 static void err_runtime() __attribute__((noreturn));
 static void err_runtime() {
   fputs("Runtime not correctly initialized.\n", stderr);
   exit(-1);
 }

 /* Runtime-independent wrapper around a backend-specific context. */
 struct PollyGPUContextT {
   /* Backend state; the OpenCL code in this file stores an OpenCLContext *
    * here and casts it back on use. */
   void *Context;
 };

 /* Runtime-independent wrapper around a backend-specific kernel. */
 struct PollyGPUFunctionT {
   /* Backend state; the OpenCL code in this file stores an OpenCLKernel *
    * here and casts it back on use. */
   void *Kernel;
 };

 /* Runtime-independent wrapper around a backend-specific device allocation. */
 struct PollyGPUDevicePtrT {
   /* Backend state; the OpenCL code in this file stores an OpenCLDevicePtr *
    * here and casts it back on use. */
   void *DevicePtr;
 };

 /******************************************************************************/
 /*                                  OpenCL                                    */
 /******************************************************************************/
 #ifdef HAS_LIBOPENCL

 /* OpenCL state stored behind PollyGPUContext::Context. Both members are
  * created in initContextCL() and released in freeContextCL(). */
 struct OpenCLContextT {
   cl_context Context;            /* Context created for the selected device. */
   cl_command_queue CommandQueue; /* Queue used for copies and launches. */
 };

 /* OpenCL state stored behind PollyGPUFunction::Kernel; filled in by
  * getKernelCL(). */
 struct OpenCLKernelT {
   cl_kernel Kernel;   /* Kernel object created from Program. */
   cl_program Program; /* Program the kernel was built from. */
   /* Address of the buffer the program was created from. Not owned; the
    * kernel cache compares this pointer to recognise a repeated request. */
   const char *BinaryString;
 };

 /* OpenCL state stored behind PollyGPUDevicePtr::DevicePtr. */
 struct OpenCLDevicePtrT {
   cl_mem MemObj; /* Buffer object created by allocateMemoryForDeviceCL(). */
 };

 /* Dynamic library handles for the OpenCL runtime library.
  *
  * Both are assigned by initialDeviceAPILibrariesCL(): HandleOpenCL from
  * "libOpenCL.so" and HandleOpenCLBeignet from the Beignet library path. A
  * NULL handle means the corresponding dlopen() did not succeed. */
 static void *HandleOpenCL;
 static void *HandleOpenCLBeignet;

 /* Type-defines of function pointer to OpenCL Runtime API.
  *
  * Every OpenCL entry point used in this file gets a function type
  * (<name>FcnTy) and a file-local pointer (<name>FcnPtr). The pointers are
  * assigned in initialDeviceAPIsCL() from symbols looked up at run time.
  */
 typedef cl_int clGetPlatformIDsFcnTy(cl_uint NumEntries,
                                      cl_platform_id *Platforms,
                                      cl_uint *NumPlatforms);
 static clGetPlatformIDsFcnTy *clGetPlatformIDsFcnPtr;

 typedef cl_int clGetDeviceIDsFcnTy(cl_platform_id Platform,
                                    cl_device_type DeviceType,
                                    cl_uint NumEntries, cl_device_id *Devices,
                                    cl_uint *NumDevices);
 static clGetDeviceIDsFcnTy *clGetDeviceIDsFcnPtr;

 typedef cl_int clGetDeviceInfoFcnTy(cl_device_id Device,
                                     cl_device_info ParamName,
                                     size_t ParamValueSize, void *ParamValue,
                                     size_t *ParamValueSizeRet);
 static clGetDeviceInfoFcnTy *clGetDeviceInfoFcnPtr;

 /* Used by launchKernelCL() to query the number of kernel arguments. */
 typedef cl_int clGetKernelInfoFcnTy(cl_kernel Kernel, cl_kernel_info ParamName,
                                     size_t ParamValueSize, void *ParamValue,
                                     size_t *ParamValueSizeRet);
 static clGetKernelInfoFcnTy *clGetKernelInfoFcnPtr;

 typedef cl_context clCreateContextFcnTy(
     const cl_context_properties *Properties, cl_uint NumDevices,
     const cl_device_id *Devices,
     void CL_CALLBACK *pfn_notify(const char *Errinfo, const void *PrivateInfo,
                                  size_t CB, void *UserData),
     void *UserData, cl_int *ErrcodeRet);
 static clCreateContextFcnTy *clCreateContextFcnPtr;

 /* The declarations below follow one pattern: a function type named
  * <name>FcnTy describing an OpenCL entry point, and a file-local pointer
  * <name>FcnPtr that initialDeviceAPIsCL() assigns at run time. */
 typedef cl_command_queue
 clCreateCommandQueueFcnTy(cl_context Context, cl_device_id Device,
                           cl_command_queue_properties Properties,
                           cl_int *ErrcodeRet);
 static clCreateCommandQueueFcnTy *clCreateCommandQueueFcnPtr;

 typedef cl_mem clCreateBufferFcnTy(cl_context Context, cl_mem_flags Flags,
                                    size_t Size, void *HostPtr,
                                    cl_int *ErrcodeRet);
 static clCreateBufferFcnTy *clCreateBufferFcnPtr;

 typedef cl_int
 clEnqueueWriteBufferFcnTy(cl_command_queue CommandQueue, cl_mem Buffer,
                           cl_bool BlockingWrite, size_t Offset, size_t Size,
                           const void *Ptr, cl_uint NumEventsInWaitList,
                           const cl_event *EventWaitList, cl_event *Event);
 static clEnqueueWriteBufferFcnTy *clEnqueueWriteBufferFcnPtr;

 /* Only looked up when the Beignet library was loaded; takes the name of a
  * file rather than an in-memory buffer (see getKernelCL()). */
 typedef cl_program
 clCreateProgramWithLLVMIntelFcnTy(cl_context Context, cl_uint NumDevices,
                                   const cl_device_id *DeviceList,
                                   const char *Filename, cl_int *ErrcodeRet);
 static clCreateProgramWithLLVMIntelFcnTy *clCreateProgramWithLLVMIntelFcnPtr;

 typedef cl_program clCreateProgramWithBinaryFcnTy(
     cl_context Context, cl_uint NumDevices, const cl_device_id *DeviceList,
     const size_t *Lengths, const unsigned char **Binaries, cl_int *BinaryStatus,
     cl_int *ErrcodeRet);
 static clCreateProgramWithBinaryFcnTy *clCreateProgramWithBinaryFcnPtr;

 typedef cl_int clBuildProgramFcnTy(
     cl_program Program, cl_uint NumDevices, const cl_device_id *DeviceList,
     const char *Options,
     void(CL_CALLBACK *pfn_notify)(cl_program Program, void *UserData),
     void *UserData);
 static clBuildProgramFcnTy *clBuildProgramFcnPtr;

 typedef cl_kernel clCreateKernelFcnTy(cl_program Program,
                                       const char *KernelName,
                                       cl_int *ErrcodeRet);
 static clCreateKernelFcnTy *clCreateKernelFcnPtr;

 typedef cl_int clSetKernelArgFcnTy(cl_kernel Kernel, cl_uint ArgIndex,
                                    size_t ArgSize, const void *ArgValue);
 static clSetKernelArgFcnTy *clSetKernelArgFcnPtr;

 typedef cl_int clEnqueueNDRangeKernelFcnTy(
     cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint WorkDim,
     const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
     const size_t *LocalWorkSize, cl_uint NumEventsInWaitList,
     const cl_event *EventWaitList, cl_event *Event);
 static clEnqueueNDRangeKernelFcnTy *clEnqueueNDRangeKernelFcnPtr;

 typedef cl_int clEnqueueReadBufferFcnTy(cl_command_queue CommandQueue,
                                         cl_mem Buffer, cl_bool BlockingRead,
                                         size_t Offset, size_t Size, void *Ptr,
                                         cl_uint NumEventsInWaitList,
                                         const cl_event *EventWaitList,
                                         cl_event *Event);
 static clEnqueueReadBufferFcnTy *clEnqueueReadBufferFcnPtr;

 typedef cl_int clFlushFcnTy(cl_command_queue CommandQueue);
 static clFlushFcnTy *clFlushFcnPtr;

 typedef cl_int clFinishFcnTy(cl_command_queue CommandQueue);
 static clFinishFcnTy *clFinishFcnPtr;

 /* Release functions, used by the free*CL() helpers further down. */
 typedef cl_int clReleaseKernelFcnTy(cl_kernel Kernel);
 static clReleaseKernelFcnTy *clReleaseKernelFcnPtr;

 typedef cl_int clReleaseProgramFcnTy(cl_program Program);
 static clReleaseProgramFcnTy *clReleaseProgramFcnPtr;

 typedef cl_int clReleaseMemObjectFcnTy(cl_mem Memobject);
 static clReleaseMemObjectFcnTy *clReleaseMemObjectFcnPtr;

 typedef cl_int clReleaseCommandQueueFcnTy(cl_command_queue CommandQueue);
 static clReleaseCommandQueueFcnTy *clReleaseCommandQueueFcnPtr;

 typedef cl_int clReleaseContextFcnTy(cl_context Context);
 static clReleaseContextFcnTy *clReleaseContextFcnPtr;

 /* Look up FuncName in the library referred to by Handle.
  *
  * Returns the address reported by dlsym(). If dlerror() reports a failure
  * for the lookup, a diagnostic is printed to stderr and 0 is returned.
  */
 static void *getAPIHandleCL(void *Handle, const char *FuncName) {
   /* Clear any stale error so the check below only sees this lookup. */
   dlerror();

   void *Symbol = dlsym(Handle, FuncName);
   char *Message = dlerror();
   if (Message == 0)
     return Symbol;

   fprintf(stderr, "Load OpenCL Runtime API failed: %s. \n", Message);
   return 0;
 }

 /* Open the OpenCL runtime libraries.
  *
  * Tries the Beignet library at its fixed install path and the generic
  * "libOpenCL.so", storing the results in HandleOpenCLBeignet and
  * HandleOpenCL.
  *
  * Returns 1 if at least one of the two libraries could be opened, since
  * initialDeviceAPIsCL() resolves symbols from the Beignet handle whenever it
  * is available and only falls back to HandleOpenCL otherwise. Returns 0,
  * after printing the dlopen error to stderr, if neither could be opened.
  */
 static int initialDeviceAPILibrariesCL() {
   HandleOpenCLBeignet = dlopen("/usr/local/lib/beignet/libcl.so", RTLD_LAZY);
   HandleOpenCL = dlopen("libOpenCL.so", RTLD_LAZY);
   if (HandleOpenCL || HandleOpenCLBeignet)
     return 1;

   /* dlerror() may return NULL; never hand NULL to a %s conversion. */
   const char *Err = dlerror();
   fprintf(stderr, "Cannot open library: %s. \n", Err ? Err : "unknown error");
   return 0;
 }

 /* Get function pointer to OpenCL Runtime API.
  *
  * Note that compilers conforming to the ISO C standard are required to
  * generate a warning if a conversion from a void * pointer to a function
  * pointer is attempted as in the following statements. The warning
  * of this kind of cast may not be emitted by clang and new versions of gcc
  * as it is valid on POSIX 2008. For compilers required to generate a warning,
  * we temporarily disable -Wpedantic, to avoid bloating the output with
  * unnecessary warnings.
  *
  * Reference:
  * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html
  */
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wpedantic"
 /* Resolve one OpenCL entry point: stores the address of the symbol Name in
  * the matching <Name>FcnPtr variable and evaluates to 1 on success, 0 if the
  * lookup failed. Expects a local variable `Handle` to be in scope. */
 #define POLLY_LOAD_CL_API(Name)                                                \
   ((Name##FcnPtr = (Name##FcnTy *)getAPIHandleCL(Handle, #Name)) != NULL)

 /* Open the OpenCL libraries and resolve every entry point this file calls.
  *
  * Returns 1 when the libraries were opened and all required symbols were
  * found; returns 0 otherwise. A missing symbol would leave a NULL function
  * pointer behind that is called later, so it is reported as a failure here
  * instead of being silently ignored. getAPIHandleCL() prints a diagnostic for
  * each symbol that cannot be resolved.
  */
 static int initialDeviceAPIsCL() {
   if (initialDeviceAPILibrariesCL() == 0)
     return 0;

   // FIXME: We are now always selecting the Intel Beignet driver if it is
   // available on the system, instead of a possible NVIDIA or AMD OpenCL
   // API. This selection should occur based on the target architecture
   // chosen when compiling.
   void *Handle =
       (HandleOpenCLBeignet != NULL ? HandleOpenCLBeignet : HandleOpenCL);

   /* Use '&=' rather than '&&' so that every lookup is attempted and every
    * missing symbol gets reported, not just the first one. */
   int AllResolved = 1;

   AllResolved &= POLLY_LOAD_CL_API(clGetPlatformIDs);
   AllResolved &= POLLY_LOAD_CL_API(clGetDeviceIDs);
   AllResolved &= POLLY_LOAD_CL_API(clGetDeviceInfo);
   AllResolved &= POLLY_LOAD_CL_API(clGetKernelInfo);
   AllResolved &= POLLY_LOAD_CL_API(clCreateContext);
   AllResolved &= POLLY_LOAD_CL_API(clCreateCommandQueue);
   AllResolved &= POLLY_LOAD_CL_API(clCreateBuffer);
   AllResolved &= POLLY_LOAD_CL_API(clEnqueueWriteBuffer);

   /* Intel extension; only needed (and only looked up) with Beignet. */
   if (HandleOpenCLBeignet)
     AllResolved &= POLLY_LOAD_CL_API(clCreateProgramWithLLVMIntel);

   AllResolved &= POLLY_LOAD_CL_API(clCreateProgramWithBinary);
   AllResolved &= POLLY_LOAD_CL_API(clBuildProgram);
   AllResolved &= POLLY_LOAD_CL_API(clCreateKernel);
   AllResolved &= POLLY_LOAD_CL_API(clSetKernelArg);
   AllResolved &= POLLY_LOAD_CL_API(clEnqueueNDRangeKernel);
   AllResolved &= POLLY_LOAD_CL_API(clEnqueueReadBuffer);
   AllResolved &= POLLY_LOAD_CL_API(clFlush);
   AllResolved &= POLLY_LOAD_CL_API(clFinish);
   AllResolved &= POLLY_LOAD_CL_API(clReleaseKernel);
   AllResolved &= POLLY_LOAD_CL_API(clReleaseProgram);
   AllResolved &= POLLY_LOAD_CL_API(clReleaseMemObject);
   AllResolved &= POLLY_LOAD_CL_API(clReleaseCommandQueue);
   AllResolved &= POLLY_LOAD_CL_API(clReleaseContext);

   return AllResolved;
 }
 #undef POLLY_LOAD_CL_API
 #pragma GCC diagnostic pop

 /* Context and Device.
  *
  * GlobalContext is set at the end of initContextCL() and reset to NULL by
  * freeContextCL(); the other OpenCL helpers refuse to run while it is NULL.
  * GlobalDeviceID is the device selected in initContextCL(). */
 static PollyGPUContext *GlobalContext = NULL;
 static cl_device_id GlobalDeviceID = NULL;

 /* Forward declaration: prints a human-readable message for an OpenCL error
  * code to stderr. Defined at the end of the OpenCL section. */
 static void printOpenCLError(int Error);

 /* Abort on an OpenCL failure.
  *
  * Does nothing when Ret is CL_SUCCESS. Otherwise prints the message for the
  * error code followed by the printf-style message described by format, both
  * on stderr, and terminates the process with exit status -1.
  */
 static void checkOpenCLError(int Ret, const char *format, ...) {
   if (Ret != CL_SUCCESS) {
     va_list ap;

     printOpenCLError(Ret);
     va_start(ap, format);
     vfprintf(stderr, format, ap);
     va_end(ap);
     exit(-1);
   }
 }

 static PollyGPUContext *initContextCL() {
   dump_function();

   PollyGPUContext *Context;

   cl_platform_id PlatformID = NULL;
   cl_device_id DeviceID = NULL;
   cl_uint NumDevicesRet;
   cl_int Ret;

   char DeviceRevision[256];
   char DeviceName[256];
   size_t DeviceRevisionRetSize, DeviceNameRetSize;

   static __thread PollyGPUContext *CurrentContext = NULL;

   if (CurrentContext)
     return CurrentContext;

   /* Get API handles. */
   if (initialDeviceAPIsCL() == 0) {
     fprintf(stderr, "Getting the \"handle\" for the OpenCL Runtime failed.\n");
     exit(-1);
   }

   /* Get number of devices that support OpenCL. */
   static const int NumberOfPlatforms = 1;
   Ret = clGetPlatformIDsFcnPtr(NumberOfPlatforms, &PlatformID, NULL);
   checkOpenCLError(Ret, "Failed to get platform IDs.\n");
   // TODO: Extend to CL_DEVICE_TYPE_ALL?
   static const int NumberOfDevices = 1;
   Ret = clGetDeviceIDsFcnPtr(PlatformID, CL_DEVICE_TYPE_GPU, NumberOfDevices,
                              &DeviceID, &NumDevicesRet);
   checkOpenCLError(Ret, "Failed to get device IDs.\n");

   GlobalDeviceID = DeviceID;
   if (NumDevicesRet == 0) {
     fprintf(stderr, "There is no device supporting OpenCL.\n");
     exit(-1);
   }

   /* Get device revision. */
   Ret =
       clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_VERSION, sizeof(DeviceRevision),
                             DeviceRevision, &DeviceRevisionRetSize);
   checkOpenCLError(Ret, "Failed to fetch device revision.\n");

   /* Get device name. */
   Ret = clGetDeviceInfoFcnPtr(DeviceID, CL_DEVICE_NAME, sizeof(DeviceName),
                               DeviceName, &DeviceNameRetSize);
   checkOpenCLError(Ret, "Failed to fetch device name.\n");

   debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName);

   /* Create context on the device. */
   Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
   if (Context == 0) {
     fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
     exit(-1);
   }
   Context->Context = (OpenCLContext *)malloc(sizeof(OpenCLContext));
   if (Context->Context == 0) {
     fprintf(stderr, "Allocate memory for Polly OpenCL context failed.\n");
     exit(-1);
   }
   ((OpenCLContext *)Context->Context)->Context =
       clCreateContextFcnPtr(NULL, NumDevicesRet, &DeviceID, NULL, NULL, &Ret);
   checkOpenCLError(Ret, "Failed to create context.\n");

   static const int ExtraProperties = 0;
   ((OpenCLContext *)Context->Context)->CommandQueue =
       clCreateCommandQueueFcnPtr(((OpenCLContext *)Context->Context)->Context,
                                  DeviceID, ExtraProperties, &Ret);
   checkOpenCLError(Ret, "Failed to create command queue.\n");

   if (CacheMode)
     CurrentContext = Context;

   GlobalContext = Context;
   return Context;
 }

 /* Release a kernel obtained from getKernelCL().
  *
  * Does nothing while CacheMode is set. Otherwise flushes and finishes the
  * command queue of GlobalContext, releases the OpenCL kernel and program
  * objects that are present, and frees both the OpenCLKernel and the
  * PollyGPUFunction wrapper. A NULL Kernel, or a wrapper whose Kernel member
  * is NULL, is tolerated.
  *
  * Terminates the process with exit status -1 if GlobalContext is not set or
  * if an OpenCL call fails.
  */
 static void freeKernelCL(PollyGPUFunction *Kernel) {
   dump_function();

   if (CacheMode)
     return;

   if (!GlobalContext) {
     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
     exit(-1);
   }

   /* Make sure no queued work still refers to the kernel. */
   OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context;
   cl_int Ret = clFlushFcnPtr(CLContext->CommandQueue);
   checkOpenCLError(Ret, "Failed to flush command queue.\n");
   Ret = clFinishFcnPtr(CLContext->CommandQueue);
   checkOpenCLError(Ret, "Failed to finish command queue.\n");

   /* Check the pointers before dereferencing them, not afterwards. */
   if (!Kernel)
     return;

   OpenCLKernel *CLKernel = (OpenCLKernel *)Kernel->Kernel;
   if (CLKernel) {
     if (CLKernel->Kernel) {
       Ret = clReleaseKernelFcnPtr(CLKernel->Kernel);
       checkOpenCLError(Ret, "Failed to release kernel.\n");
     }

     if (CLKernel->Program) {
       Ret = clReleaseProgramFcnPtr(CLKernel->Program);
       checkOpenCLError(Ret, "Failed to release program.\n");
     }

     free(CLKernel);
   }

   free(Kernel);
 }

 static PollyGPUFunction *getKernelCL(const char *BinaryBuffer,
                                      const char *KernelName) {
   dump_function();

   if (!GlobalContext) {
     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
     exit(-1);
   }

   static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
   static __thread int NextCacheItem = 0;

   for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
     // We exploit here the property that all Polly-ACC kernels are allocated
     // as global constants, hence a pointer comparision is sufficient to
     // determin equality.
     if (KernelCache[i] &&
         ((OpenCLKernel *)KernelCache[i]->Kernel)->BinaryString ==
             BinaryBuffer) {
       debug_print("  -> using cached kernel\n");
       return KernelCache[i];
     }
   }

   PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
   if (Function == 0) {
     fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
     exit(-1);
   }
   Function->Kernel = (OpenCLKernel *)malloc(sizeof(OpenCLKernel));
   if (Function->Kernel == 0) {
     fprintf(stderr, "Allocate memory for Polly OpenCL kernel failed.\n");
     exit(-1);
   }

   if (!GlobalDeviceID) {
     fprintf(stderr, "GPGPU-code generation not initialized correctly.\n");
     exit(-1);
   }

   cl_int Ret;

   if (HandleOpenCLBeignet) {
     // This is a workaround, since clCreateProgramWithLLVMIntel only
     // accepts a filename to a valid llvm-ir file as an argument, instead
     // of accepting the BinaryBuffer directly.
     char FileName[] = "/tmp/polly_kernelXXXXXX";
     int File = mkstemp(FileName);
     write(File, BinaryBuffer, strlen(BinaryBuffer));

     ((OpenCLKernel *)Function->Kernel)->Program =
         clCreateProgramWithLLVMIntelFcnPtr(
             ((OpenCLContext *)GlobalContext->Context)->Context, 1,
             &GlobalDeviceID, FileName, &Ret);
     checkOpenCLError(Ret, "Failed to create program from llvm.\n");
     close(File);
     unlink(FileName);
   } else {
     size_t BinarySize = strlen(BinaryBuffer);
     ((OpenCLKernel *)Function->Kernel)->Program =
         clCreateProgramWithBinaryFcnPtr(
             ((OpenCLContext *)GlobalContext->Context)->Context, 1,
             &GlobalDeviceID, (const size_t *)&BinarySize,
             (const unsigned char **)&BinaryBuffer, NULL, &Ret);
     checkOpenCLError(Ret, "Failed to create program from binary.\n");
   }

   Ret = clBuildProgramFcnPtr(((OpenCLKernel *)Function->Kernel)->Program, 1,
                              &GlobalDeviceID, NULL, NULL, NULL);
   checkOpenCLError(Ret, "Failed to build program.\n");

   ((OpenCLKernel *)Function->Kernel)->Kernel = clCreateKernelFcnPtr(
       ((OpenCLKernel *)Function->Kernel)->Program, KernelName, &Ret);
   checkOpenCLError(Ret, "Failed to create kernel.\n");

   ((OpenCLKernel *)Function->Kernel)->BinaryString = BinaryBuffer;

   if (CacheMode) {
     if (KernelCache[NextCacheItem])
       freeKernelCL(KernelCache[NextCacheItem]);

     KernelCache[NextCacheItem] = Function;

     NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
   }

   return Function;
 }

 /* Copy MemSize bytes from HostData into the device buffer behind DevData.
  *
  * The write is enqueued as a blocking operation (CL_TRUE) at offset 0 on the
  * command queue of GlobalContext. Exits with status -1 if GlobalContext is
  * not set or the OpenCL call fails.
  */
 static void copyFromHostToDeviceCL(void *HostData, PollyGPUDevicePtr *DevData,
                                    long MemSize) {
   dump_function();

   if (GlobalContext == NULL) {
     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
     exit(-1);
   }

   OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context;
   OpenCLDevicePtr *Target = (OpenCLDevicePtr *)DevData->DevicePtr;

   cl_int Status =
       clEnqueueWriteBufferFcnPtr(CLContext->CommandQueue, Target->MemObj,
                                  CL_TRUE, 0, MemSize, HostData, 0, NULL, NULL);
   checkOpenCLError(Status, "Copying data from host memory to device failed.\n");
 }

 /* Copy MemSize bytes from the device buffer behind DevData into HostData.
  *
  * The read is enqueued as a blocking operation (CL_TRUE) at offset 0 on the
  * command queue of GlobalContext. Exits with status -1 if GlobalContext is
  * not set or the OpenCL call fails.
  */
 static void copyFromDeviceToHostCL(PollyGPUDevicePtr *DevData, void *HostData,
                                    long MemSize) {
   dump_function();

   if (GlobalContext == NULL) {
     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
     exit(-1);
   }

   OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context;
   OpenCLDevicePtr *Origin = (OpenCLDevicePtr *)DevData->DevicePtr;

   cl_int Status =
       clEnqueueReadBufferFcnPtr(CLContext->CommandQueue, Origin->MemObj,
                                 CL_TRUE, 0, MemSize, HostData, 0, NULL, NULL);
   checkOpenCLError(Status,
                    "Copying results from device to host memory failed.\n");
 }

 /* Set the kernel arguments and enqueue Kernel on the global command queue.
  *
  * Parameters holds, for a kernel with N arguments, N pointers to the argument
  * values followed by N pointers to int values giving each argument's size in
  * bytes. N is queried from the kernel itself.
  *
  * The launch is three-dimensional: the local work size is
  * (BlockDimX, BlockDimY, BlockDimZ) and the global work size is the local
  * size multiplied by (GridDimX, GridDimY, 1).
  *
  * Exits with status -1 if GlobalContext is not set or an OpenCL call fails.
  */
 static void launchKernelCL(PollyGPUFunction *Kernel, unsigned int GridDimX,
                            unsigned int GridDimY, unsigned int BlockDimX,
                            unsigned int BlockDimY, unsigned int BlockDimZ,
                            void **Parameters) {
   dump_function();

   cl_int Ret;
   cl_uint NumArgs;

   if (!GlobalContext) {
     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
     exit(-1);
   }

   OpenCLKernel *CLKernel = (OpenCLKernel *)Kernel->Kernel;
   Ret = clGetKernelInfoFcnPtr(CLKernel->Kernel, CL_KERNEL_NUM_ARGS,
                               sizeof(cl_uint), &NumArgs, NULL);
   checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n");

   /* Argument sizes are stored at the end of the Parameters array. */
   for (cl_uint i = 0; i < NumArgs; i++) {
     Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i,
                                *((int *)Parameters[NumArgs + i]),
                                (void *)Parameters[i]);
     /* i is unsigned, so it is printed with %u. */
     checkOpenCLError(Ret, "Failed to set Kernel argument %u.\n", i);
   }

   /* Widen to size_t before multiplying so that large grids do not wrap
    * around in unsigned int arithmetic. */
   unsigned int GridDimZ = 1;
   size_t GlobalWorkSize[3] = {(size_t)BlockDimX * GridDimX,
                               (size_t)BlockDimY * GridDimY,
                               (size_t)BlockDimZ * GridDimZ};
   size_t LocalWorkSize[3] = {BlockDimX, BlockDimY, BlockDimZ};

   static const int WorkDim = 3;
   OpenCLContext *CLContext = (OpenCLContext *)GlobalContext->Context;
   Ret = clEnqueueNDRangeKernelFcnPtr(CLContext->CommandQueue, CLKernel->Kernel,
                                      WorkDim, NULL, GlobalWorkSize,
                                      LocalWorkSize, 0, NULL, NULL);
   checkOpenCLError(Ret, "Launching OpenCL kernel failed.\n");
 }

 /* Release the OpenCL buffer behind Allocation, then free the OpenCLDevicePtr
  * and the PollyGPUDevicePtr wrapper. Exits with status -1 if the buffer
  * cannot be released. */
 static void freeDeviceMemoryCL(PollyGPUDevicePtr *Allocation) {
   dump_function();

   OpenCLDevicePtr *CLDevPtr = (OpenCLDevicePtr *)Allocation->DevicePtr;

   cl_int Status = clReleaseMemObjectFcnPtr((cl_mem)CLDevPtr->MemObj);
   checkOpenCLError(Status, "Failed to free device memory.\n");

   free(CLDevPtr);
   free(Allocation);
 }

 static PollyGPUDevicePtr *allocateMemoryForDeviceCL(long MemSize) {
   dump_function();

   if (!GlobalContext) {
     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
     exit(-1);
   }

   PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
   if (DevData == 0) {
     fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
     exit(-1);
   }
   DevData->DevicePtr = (OpenCLDevicePtr *)malloc(sizeof(OpenCLDevicePtr));
   if (DevData->DevicePtr == 0) {
     fprintf(stderr, "Allocate memory for GPU device memory pointer failed.\n");
     exit(-1);
   }

   cl_int Ret;
   ((OpenCLDevicePtr *)DevData->DevicePtr)->MemObj =
       clCreateBufferFcnPtr(((OpenCLContext *)GlobalContext->Context)->Context,
                            CL_MEM_READ_WRITE, MemSize, NULL, &Ret);
   checkOpenCLError(Ret,
                    "Allocate memory for GPU device memory pointer failed.\n");

   return DevData;
 }

 /* Return the cl_mem handle stored behind Allocation, as a void pointer. */
 static void *getDevicePtrCL(PollyGPUDevicePtr *Allocation) {
   dump_function();

   return (void *)((OpenCLDevicePtr *)Allocation->DevicePtr)->MemObj;
 }

 static void synchronizeDeviceCL() {
   dump_function();

   if (!GlobalContext) {
     fprintf(stderr, "GPGPU-code generation not correctly initialized.\n");
     exit(-1);
   }

   if (clFinishFcnPtr(((OpenCLContext *)GlobalContext->Context)->CommandQueue) !=
       CL_SUCCESS) {
     fprintf(stderr, "Synchronizing device and host memory failed.\n");
     exit(-1);
   }
 }

 /* Tear down a context created by initContextCL().
  *
  * Clears GlobalContext, releases the command queue and the OpenCL context
  * when they are set, and frees the OpenCLContext and the PollyGPUContext
  * wrapper. Exits with status -1 if a release call fails.
  */
 static void freeContextCL(PollyGPUContext *Context) {
   dump_function();

   GlobalContext = NULL;

   OpenCLContext *CLContext = (OpenCLContext *)Context->Context;

   if (CLContext->CommandQueue != NULL) {
     cl_int QueueStatus = clReleaseCommandQueueFcnPtr(CLContext->CommandQueue);
     checkOpenCLError(QueueStatus, "Could not release command queue.\n");
   }

   if (CLContext->Context != NULL) {
     cl_int ContextStatus = clReleaseContextFcnPtr(CLContext->Context);
     checkOpenCLError(ContextStatus, "Could not release context.\n");
   }

   free(CLContext);
   free(Context);
 }

 static void printOpenCLError(int Error) {

   switch (Error) {
   case CL_SUCCESS:
     // Success, don't print an error.
     break;

   // JIT/Runtime errors.
   case CL_DEVICE_NOT_FOUND:
     fprintf(stderr, "Device not found.\n");
     break;
   case CL_DEVICE_NOT_AVAILABLE:
     fprintf(stderr, "Device not available.\n");
     break;
   case CL_COMPILER_NOT_AVAILABLE:
     fprintf(stderr, "Compiler not available.\n");
     break;
   case CL_MEM_OBJECT_ALLOCATION_FAILURE:
     fprintf(stderr, "Mem object allocation failure.\n");
     break;
   case CL_OUT_OF_RESOURCES:
     fprintf(stderr, "Out of resources.\n");
     break;
   case CL_OUT_OF_HOST_MEMORY:
     fprintf(stderr, "Out of host memory.\n");
     break;
   case CL_PROFILING_INFO_NOT_AVAILABLE:
     fprintf(stderr, "Profiling info not available.\n");
     break;
   case CL_MEM_COPY_OVERLAP:
     fprintf(stderr, "Mem copy overlap.\n");
     break;
   case CL_IMAGE_FORMAT_MISMATCH:
     fprintf(stderr, "Image format mismatch.\n");
     break;
   case CL_IMAGE_FORMAT_NOT_SUPPORTED:
     fprintf(stderr, "Image format not supported.\n");
     break;
   case CL_BUILD_PROGRAM_FAILURE:
     fprintf(stderr, "Build program failure.\n");
     break;
   case CL_MAP_FAILURE:
     fprintf(stderr, "Map failure.\n");
     break;
   case CL_MISALIGNED_SUB_BUFFER_OFFSET:
     fprintf(stderr, "Misaligned sub buffer offset.\n");
     break;
   case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
     fprintf(stderr, "Exec status error for events in wait list.\n");
     break;
   case CL_COMPILE_PROGRAM_FAILURE:
     fprintf(stderr, "Compile program failure.\n");
     break;
   case CL_LINKER_NOT_AVAILABLE:
     fprintf(stderr, "Linker not available.\n");
     break;
   case CL_LINK_PROGRAM_FAILURE:
     fprintf(stderr, "Link program failure.\n");
     break;
   case CL_DEVICE_PARTITION_FAILED:
     fprintf(stderr, "Device partition failed.\n");
     break;
   case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
     fprintf(stderr, "Kernel arg info not available.\n");
     break;

   // Compiler errors.
   case CL_INVALID_VALUE:
     fprintf(stderr, "Invalid value.\n");
     break;
   case CL_INVALID_DEVICE_TYPE:
     fprintf(stderr, "Invalid device type.\n");
     break;
   case CL_INVALID_PLATFORM:
     fprintf(stderr, "Invalid platform.\n");
     break;
   case CL_INVALID_DEVICE:
     fprintf(stderr, "Invalid device.\n");
     break;
   case CL_INVALID_CONTEXT:
     fprintf(stderr, "Invalid context.\n");
     break;
   case CL_INVALID_QUEUE_PROPERTIES:
     fprintf(stderr, "Invalid queue properties.\n");
     break;
   case CL_INVALID_COMMAND_QUEUE:
     fprintf(stderr, "Invalid command queue.\n");
     break;
   case CL_INVALID_HOST_PTR:
     fprintf(stderr, "Invalid host pointer.\n");
     break;
   case CL_INVALID_MEM_OBJECT:
     fprintf(stderr, "Invalid memory object.\n");
     break;
   case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
     fprintf(stderr, "Invalid image format descriptor.\n");
     break;
   case CL_INVALID_IMAGE_SIZE:
     fprintf(stderr, "Invalid image size.\n");
     break;
   case CL_INVALID_SAMPLER:
     fprintf(stderr, "Invalid sampler.\n");
     break;
   case CL_INVALID_BINARY:
     fprintf(stderr, "Invalid binary.\n");
     break;
   case CL_INVALID_BUILD_OPTIONS:
     fprintf(stderr, "Invalid build options.\n");
     break;
   case CL_INVALID_PROGRAM:
     fprintf(stderr, "Invalid program.\n");
     break;
   case CL_INVALID_PROGRAM_EXECUTABLE:
     fprintf(stderr, "Invalid program executable.\n");
     break;
   case CL_INVALID_KERNEL_NAME:
     fprintf(stderr, "Invalid kernel name.\n");
     break;
   case CL_INVALID_KERNEL_DEFINITION:
     fprintf(stderr, "Invalid kernel definition.\n");
     break;
   case CL_INVALID_KERNEL:
     fprintf(stderr, "Invalid kernel.\n");
     break;
   case CL_INVALID_ARG_INDEX:
     fprintf(stderr, "Invalid arg index.\n");
     break;
   case CL_INVALID_ARG_VALUE:
     fprintf(stderr, "Invalid arg value.\n");
     break;
   case CL_INVALID_ARG_SIZE:
     fprintf(stderr, "Invalid arg size.\n");
     break;
   case CL_INVALID_KERNEL_ARGS:
     fprintf(stderr, "Invalid kernel args.\n");
     break;
   case CL_INVALID_WORK_DIMENSION:
     fprintf(stderr, "Invalid work dimension.\n");
     break;
   case CL_INVALID_WORK_GROUP_SIZE:
     fprintf(stderr, "Invalid work group size.\n");
     break;
   case CL_INVALID_WORK_ITEM_SIZE:
     fprintf(stderr, "Invalid work item size.\n");
     break;
   case CL_INVALID_GLOBAL_OFFSET:
     fprintf(stderr, "Invalid global offset.\n");
     break;
   case CL_INVALID_EVENT_WAIT_LIST:
     fprintf(stderr, "Invalid event wait list.\n");
     break;
   case CL_INVALID_EVENT:
     fprintf(stderr, "Invalid event.\n");
     break;
   case CL_INVALID_OPERATION:
     fprintf(stderr, "Invalid operation.\n");
     break;
   case CL_INVALID_GL_OBJECT:
     fprintf(stderr, "Invalid GL object.\n");
     break;
   case CL_INVALID_BUFFER_SIZE:
     fprintf(stderr, "Invalid buffer size.\n");
     break;
   case CL_INVALID_MIP_LEVEL:
     fprintf(stderr, "Invalid mip level.\n");
     break;
   case CL_INVALID_GLOBAL_WORK_SIZE:
     fprintf(stderr, "Invalid global work size.\n");
     break;
   case CL_INVALID_PROPERTY:
     fprintf(stderr, "Invalid property.\n");
     break;
   case CL_INVALID_IMAGE_DESCRIPTOR:
     fprintf(stderr, "Invalid image descriptor.\n");
     break;
   case CL_INVALID_COMPILER_OPTIONS:
     fprintf(stderr, "Invalid compiler options.\n");
     break;
   case CL_INVALID_LINKER_OPTIONS:
     fprintf(stderr, "Invalid linker options.\n");
     break;
   case CL_INVALID_DEVICE_PARTITION_COUNT:
     fprintf(stderr, "Invalid device partition count.\n");
     break;
   case -69: // OpenCL 2.0 Code for CL_INVALID_PIPE_SIZE
     fprintf(stderr, "Invalid pipe size.\n");
     break;
   case -70: // OpenCL 2.0 Code for CL_INVALID_DEVICE_QUEUE
     fprintf(stderr, "Invalid device queue.\n");
     break;

   // NVIDIA specific error.
   case -9999:
     fprintf(stderr, "NVIDIA invalid read or write buffer.\n");
     break;

   default:
     fprintf(stderr, "Unknown error code!\n");
     break;
   }
 }

 #endif /* HAS_LIBOPENCL */
 /******************************************************************************/
 /*                                   CUDA                                     */
 /******************************************************************************/
 #ifdef HAS_LIBCUDART

 /* CUDA counterpart of OpenCLContextT: holds a CUDA driver context handle. */
 struct CUDAContextT {
   CUcontext Cuda;
 };

 /* CUDA counterpart of OpenCLKernelT. */
 struct CUDAKernelT {
   CUfunction Cuda;     /* Kernel function handle. */
   CUmodule CudaModule; /* Module handle kept alongside the function. */
   /* NOTE(review): presumably the address of the source buffer, used as a
    * cache key like OpenCLKernelT::BinaryString - confirm in the CUDA code. */
   const char *BinaryString;
 };

 /* CUDA counterpart of OpenCLDevicePtrT: holds a CUDA device pointer. */
 struct CUDADevicePtrT {
   CUdeviceptr Cuda;
 };

 /* Dynamic library handles for the CUDA and CUDA runtime library.
  * NOTE(review): the code that opens these libraries is outside this view. */
 static void *HandleCuda;
 static void *HandleCudaRT;

 /* Type-defines of function pointer to CUDA driver APIs. */
 typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t);
 static CuMemAllocFcnTy *CuMemAllocFcnPtr;

 typedef CUresult CUDAAPI CuMemAllocManagedFcnTy(CUdeviceptr *, size_t,
                                                 unsigned int);
 static CuMemAllocManagedFcnTy *CuMemAllocManagedFcnPtr;

 typedef CUresult CUDAAPI CuLaunchKernelFcnTy(
     CUfunction F, unsigned int GridDimX, unsigned int GridDimY,
     unsigned int gridDimZ, unsigned int blockDimX, unsigned int BlockDimY,
     unsigned int BlockDimZ, unsigned int SharedMemBytes, CUstream HStream,
     void **KernelParams, void **Extra);
 static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr;

 typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t);
 static CuMemcpyDtoHFcnTy *CuMemcpyDtoHFcnPtr;

 typedef CUresult CUDAAPI CuMemcpyHtoDFcnTy(CUdeviceptr, const void *, size_t);
 static CuMemcpyHtoDFcnTy *CuMemcpyHtoDFcnPtr;

 typedef CUresult CUDAAPI CuMemFreeFcnTy(CUdeviceptr);
 static CuMemFreeFcnTy *CuMemFreeFcnPtr;

 typedef CUresult CUDAAPI CuModuleUnloadFcnTy(CUmodule);
 static CuModuleUnloadFcnTy *CuModuleUnloadFcnPtr;

 typedef CUresult CUDAAPI CuProfilerStopFcnTy();
 static CuProfilerStopFcnTy *CuProfilerStopFcnPtr;

 typedef CUresult CUDAAPI CuCtxDestroyFcnTy(CUcontext);
 static CuCtxDestroyFcnTy *CuCtxDestroyFcnPtr;

 typedef CUresult CUDAAPI CuInitFcnTy(unsigned int);
 static CuInitFcnTy *CuInitFcnPtr;

 typedef CUresult CUDAAPI CuDeviceGetCountFcnTy(int *);
 static CuDeviceGetCountFcnTy *CuDeviceGetCountFcnPtr;

 typedef CUresult CUDAAPI CuCtxCreateFcnTy(CUcontext *, unsigned int, CUdevice);
 static CuCtxCreateFcnTy *CuCtxCreateFcnPtr;

 typedef CUresult CUDAAPI CuCtxGetCurrentFcnTy(CUcontext *);
 static CuCtxGetCurrentFcnTy *CuCtxGetCurrentFcnPtr;

 typedef CUresult CUDAAPI CuDeviceGetFcnTy(CUdevice *, int);
 static CuDeviceGetFcnTy *CuDeviceGetFcnPtr;

 typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *,
                                                  unsigned int, CUjit_option *,
                                                  void **);
 static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;

 typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *Module,
                                                const void *Image);
 static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr;

 typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule,
                                                   const char *);
 static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr;

 typedef CUresult CUDAAPI CuDeviceComputeCapabilityFcnTy(int *, int *, CUdevice);
 static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr;

 typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
 static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;

 typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState State,
                                             CUjitInputType Type, void *Data,
                                             size_t Size, const char *Name,
                                             unsigned int NumOptions,
                                             CUjit_option *Options,
                                             void **OptionValues);
 static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr;

 typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int NumOptions,
                                            CUjit_option *Options,
                                            void **OptionValues,
                                            CUlinkState *StateOut);
 static CuLinkCreateFcnTy *CuLinkCreateFcnPtr;

 typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState State, void **CubinOut,
                                              size_t *SizeOut);
 static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr;

 typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState State);
 static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;

 typedef CUresult CUDAAPI CuCtxSynchronizeFcnTy();
 static CuCtxSynchronizeFcnTy *CuCtxSynchronizeFcnPtr;

 /* Type-defines of function pointer ot CUDA runtime APIs. */
 typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
 static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;

 /* Look up FuncName in the dynamic library Handle.
  *
  * Returns the address reported by dlsym(), or 0 after printing a diagnostic
  * to stderr when dlerror() reports a failure for the lookup. */
 static void *getAPIHandleCUDA(void *Handle, const char *FuncName) {
   /* Discard any stale error so the dlerror() call below describes only this
    * lookup. */
   dlerror();

   void *Symbol = dlsym(Handle, FuncName);
   char *Failure = dlerror();
   if (Failure == 0)
     return Symbol;

   fprintf(stderr, "Load CUDA driver API failed: %s. \n", Failure);
   return 0;
 }

 /* Open the CUDA driver library and then the CUDA runtime library, storing
  * the handles in HandleCuda and HandleCudaRT.
  *
  * Returns 1 when both libraries were opened. As soon as one dlopen() fails
  * the dlerror() text is printed to stderr and 0 is returned; the runtime
  * library is not attempted if the driver library could not be opened. */
 static int initialDeviceAPILibrariesCUDA() {
   HandleCuda = dlopen("libcuda.so", RTLD_LAZY);
   if (HandleCuda) {
     HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY);
     if (HandleCudaRT)
       return 1;
   }

   /* Whichever dlopen() failed is the most recent dl* call, so dlerror()
    * describes it. */
   fprintf(stderr, "Cannot open library: %s. \n", dlerror());
   return 0;
 }

 /* Get function pointer to CUDA Driver APIs.
  *
  * Note that compilers conforming to the ISO C standard are required to
  * generate a warning if a conversion from a void * pointer to a function
  * pointer is attempted as in the following statements. The warning
  * of this kind of cast may not be emitted by clang and new versions of gcc
  * as it is valid on POSIX 2008. For compilers required to generate a warning,
  * we temporarily disable -Wpedantic, to avoid bloating the output with
  * unnecessary warnings.
  *
  * Reference:
  * http://pubs.opengroup.org/onlinepubs/9699919799/functions/dlsym.html
  */
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wpedantic"
 static int initialDeviceAPIsCUDA() {
   if (initialDeviceAPILibrariesCUDA() == 0)
     return 0;

   CuLaunchKernelFcnPtr =
       (CuLaunchKernelFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLaunchKernel");

   CuMemAllocFcnPtr =
       (CuMemAllocFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemAlloc_v2");

   CuMemAllocManagedFcnPtr = (CuMemAllocManagedFcnTy *)getAPIHandleCUDA(
       HandleCuda, "cuMemAllocManaged");

   CuMemFreeFcnPtr =
       (CuMemFreeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemFree_v2");

   CuMemcpyDtoHFcnPtr =
       (CuMemcpyDtoHFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyDtoH_v2");

   CuMemcpyHtoDFcnPtr =
       (CuMemcpyHtoDFcnTy *)getAPIHandleCUDA(HandleCuda, "cuMemcpyHtoD_v2");

   CuModuleUnloadFcnPtr =
       (CuModuleUnloadFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleUnload");

   CuProfilerStopFcnPtr =
       (CuProfilerStopFcnTy *)getAPIHandleCUDA(HandleCuda, "cuProfilerStop");

   CuCtxDestroyFcnPtr =
       (CuCtxDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxDestroy");

   CuInitFcnPtr = (CuInitFcnTy *)getAPIHandleCUDA(HandleCuda, "cuInit");

   CuDeviceGetCountFcnPtr =
       (CuDeviceGetCountFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetCount");

   CuDeviceGetFcnPtr =
       (CuDeviceGetFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGet");

   CuCtxCreateFcnPtr =
       (CuCtxCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxCreate_v2");

   CuCtxGetCurrentFcnPtr =
       (CuCtxGetCurrentFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxGetCurrent");

   CuModuleLoadDataExFcnPtr = (CuModuleLoadDataExFcnTy *)getAPIHandleCUDA(
       HandleCuda, "cuModuleLoadDataEx");

   CuModuleLoadDataFcnPtr =
       (CuModuleLoadDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuModuleLoadData");

   CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandleCUDA(
       HandleCuda, "cuModuleGetFunction");

   CuDeviceComputeCapabilityFcnPtr =
       (CuDeviceComputeCapabilityFcnTy *)getAPIHandleCUDA(
           HandleCuda, "cuDeviceComputeCapability");

   CuDeviceGetNameFcnPtr =
       (CuDeviceGetNameFcnTy *)getAPIHandleCUDA(HandleCuda, "cuDeviceGetName");

   CuLinkAddDataFcnPtr =
       (CuLinkAddDataFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkAddData");

   CuLinkCreateFcnPtr =
       (CuLinkCreateFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkCreate");

   CuLinkCompleteFcnPtr =
       (CuLinkCompleteFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkComplete");

   CuLinkDestroyFcnPtr =
       (CuLinkDestroyFcnTy *)getAPIHandleCUDA(HandleCuda, "cuLinkDestroy");

   CuCtxSynchronizeFcnPtr =
       (CuCtxSynchronizeFcnTy *)getAPIHandleCUDA(HandleCuda, "cuCtxSynchronize");

   /* Get function pointer to CUDA Runtime APIs. */
   CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandleCUDA(
       HandleCudaRT, "cudaThreadSynchronize");

   return 1;
 }
 #pragma GCC diagnostic pop

 /* Create the Polly GPU context for the CUDA backend.
  *
  * Loads the CUDA libraries, initializes the driver API, selects device 0 and
  * either adopts the context that is already current on this thread or
  * creates a new one. In cache mode the resulting context is remembered in a
  * thread-local variable and returned again by later calls.
  *
  * Returns the context; any failure prints a message to stderr and terminates
  * the process with exit(-1).
  */
 static PollyGPUContext *initContextCUDA() {
   dump_function();
   PollyGPUContext *Context;
   CUdevice Device;

   int Major = 0, Minor = 0, DeviceID = 0;
   /* Start with an empty name so the debug output below stays well defined
    * even if the (unchecked, informational) name query fails. */
   char DeviceName[256] = "";
   int DeviceCount = 0;

   static __thread PollyGPUContext *CurrentContext = NULL;

   if (CurrentContext)
     return CurrentContext;

   /* Get API handles. */
   if (initialDeviceAPIsCUDA() == 0) {
     fprintf(stderr, "Getting the \"handle\" for the CUDA driver API failed.\n");
     exit(-1);
   }

   if (CuInitFcnPtr(0) != CUDA_SUCCESS) {
     fprintf(stderr, "Initializing the CUDA driver API failed.\n");
     exit(-1);
   }

   /* Get number of devices that support CUDA. A failing query leaves
    * DeviceCount at 0 and is therefore reported as "no device" as well. */
   CuDeviceGetCountFcnPtr(&DeviceCount);
   if (DeviceCount == 0) {
     fprintf(stderr, "There is no device supporting CUDA.\n");
     exit(-1);
   }

   /* Device is passed to every call below, so it must be valid. */
   if (CuDeviceGetFcnPtr(&Device, DeviceID) != CUDA_SUCCESS) {
     fprintf(stderr, "Getting a handle for CUDA device %d failed.\n", DeviceID);
     exit(-1);
   }

   /* Get compute capabilities and the device name. Both are only used for the
    * debug output, so their results are deliberately not checked. */
   CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, Device);
   CuDeviceGetNameFcnPtr(DeviceName, 256, Device);
   debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName);

   /* Create context on the device. */
   Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext));
   if (Context == 0) {
     fprintf(stderr, "Allocate memory for Polly GPU context failed.\n");
     exit(-1);
   }
   Context->Context = malloc(sizeof(CUDAContext));
   if (Context->Context == 0) {
     fprintf(stderr, "Allocate memory for Polly CUDA context failed.\n");
     exit(-1);
   }

   // In cases where managed memory is used, it is quite likely that
   // `cudaMallocManaged` / `polly_mallocManaged` was called before
   // `polly_initContext` was called.
   //
   // If `polly_initContext` calls `CuCtxCreate` when there already was a
   // pre-existing context created by the runtime API, this causes code running
   // on P100 to hang. So, we query for a pre-existing context to try and use.
   // If there is no pre-existing context, we create a new context.

   // The possible pre-existing context from previous runtime API calls.
   CUcontext MaybeRuntimeAPIContext;
   if (CuCtxGetCurrentFcnPtr(&MaybeRuntimeAPIContext) != CUDA_SUCCESS) {
     fprintf(stderr, "cuCtxGetCurrent failed.\n");
     exit(-1);
   }

   // There was no previous context, initialise it.
   if (MaybeRuntimeAPIContext == NULL) {
     if (CuCtxCreateFcnPtr(&(((CUDAContext *)Context->Context)->Cuda), 0,
                           Device) != CUDA_SUCCESS) {
       fprintf(stderr, "cuCtxCreateFcnPtr failed.\n");
       exit(-1);
     }
   } else {
     ((CUDAContext *)Context->Context)->Cuda = MaybeRuntimeAPIContext;
   }

   /* Only cache mode reuses the context across calls. */
   if (CacheMode)
     CurrentContext = Context;

   return Context;
 }

 /* Release a kernel obtained from getKernelCUDA.
  *
  * In cache mode kernels stay alive for reuse, so this is a no-op. Otherwise
  * the CUDA module is unloaded (if one was loaded) and both the CUDA payload
  * and the PollyGPUFunction wrapper are freed. A null Kernel, or a Kernel
  * without a payload, is tolerated.
  */
 static void freeKernelCUDA(PollyGPUFunction *Kernel) {
   dump_function();

   if (CacheMode)
     return;

   /* Validate the pointers before looking through them. */
   if (!Kernel)
     return;

   CUDAKernel *Payload = (CUDAKernel *)Kernel->Kernel;
   if (Payload) {
     if (Payload->CudaModule)
       CuModuleUnloadFcnPtr(Payload->CudaModule);
     free(Payload);
   }

   free(Kernel);
 }

 /* Build, or fetch from the per-thread cache, the kernel KernelName contained
  * in the PTX text BinaryBuffer.
  *
  * The PTX is linked with the CUDA JIT linker, the result is loaded as a
  * module and the kernel function is looked up in it. In cache mode the new
  * kernel is stored in a KERNEL_CACHE_SIZE-entry table that is filled in
  * round-robin order.
  *
  * Returns the kernel; any failure prints a message to stderr and terminates
  * the process with exit(-1).
  */
 static PollyGPUFunction *getKernelCUDA(const char *BinaryBuffer,
                                        const char *KernelName) {
   dump_function();

   static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE];
   static __thread int NextCacheItem = 0;

   for (long i = 0; i < KERNEL_CACHE_SIZE; i++) {
     // We exploit here the property that all Polly-ACC kernels are allocated
     // as global constants, hence a pointer comparison is sufficient to
     // determine equality.
     if (KernelCache[i] &&
         ((CUDAKernel *)KernelCache[i]->Kernel)->BinaryString == BinaryBuffer) {
       debug_print("  -> using cached kernel\n");
       return KernelCache[i];
     }
   }

   PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction));
   if (Function == 0) {
     fprintf(stderr, "Allocate memory for Polly GPU function failed.\n");
     exit(-1);
   }
   Function->Kernel = (CUDAKernel *)malloc(sizeof(CUDAKernel));
   if (Function->Kernel == 0) {
     fprintf(stderr, "Allocate memory for Polly CUDA function failed.\n");
     exit(-1);
   }

   CUresult Res;
   CUlinkState LState;
   CUjit_option Options[6];
   void *OptionVals[6];
   float Walltime = 0;
   unsigned long LogSize = 8192;
   char ErrorLog[8192], InfoLog[8192];
   void *CuOut;
   size_t OutSize;

   // Setup linker options
   // Return walltime from JIT compilation
   Options[0] = CU_JIT_WALL_TIME;
   OptionVals[0] = (void *)&Walltime;
   // Pass a buffer for info messages
   Options[1] = CU_JIT_INFO_LOG_BUFFER;
   OptionVals[1] = (void *)InfoLog;
   // Pass the size of the info buffer
   Options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
   OptionVals[2] = (void *)LogSize;
   // Pass a buffer for error message
   Options[3] = CU_JIT_ERROR_LOG_BUFFER;
   OptionVals[3] = (void *)ErrorLog;
   // Pass the size of the error buffer
   Options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
   OptionVals[4] = (void *)LogSize;
   // Make the linker verbose
   Options[5] = CU_JIT_LOG_VERBOSE;
   OptionVals[5] = (void *)1;

   // Both logs are printed with "%s" below, so both must hold a valid
   // (possibly empty) string even if the linker never writes to them.
   memset(ErrorLog, 0, sizeof(ErrorLog));
   memset(InfoLog, 0, sizeof(InfoLog));

   // LState is used by every following linker call, so creating it must have
   // succeeded.
   Res = CuLinkCreateFcnPtr(6, Options, OptionVals, &LState);
   if (Res != CUDA_SUCCESS) {
     fprintf(stderr, "Creating the ptx linker failed.\n%s\n", ErrorLog);
     exit(-1);
   }

   Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)BinaryBuffer,
                             strlen(BinaryBuffer) + 1, 0, 0, 0, 0);
   if (Res != CUDA_SUCCESS) {
     fprintf(stderr, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog);
     exit(-1);
   }

   Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize);
   if (Res != CUDA_SUCCESS) {
     fprintf(stderr, "Complete ptx linker step failed.\n");
     fprintf(stderr, "\n%s\n", ErrorLog);
     exit(-1);
   }

   debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime,
               InfoLog);

   // The module is loaded before the linker state is destroyed further down.
   Res = CuModuleLoadDataFcnPtr(&(((CUDAKernel *)Function->Kernel)->CudaModule),
                                CuOut);
   if (Res != CUDA_SUCCESS) {
     fprintf(stderr, "Loading ptx assembly text failed.\n");
     exit(-1);
   }

   Res = CuModuleGetFunctionFcnPtr(&(((CUDAKernel *)Function->Kernel)->Cuda),
                                   ((CUDAKernel *)Function->Kernel)->CudaModule,
                                   KernelName);
   if (Res != CUDA_SUCCESS) {
     fprintf(stderr, "Loading kernel function failed.\n");
     exit(-1);
   }

   CuLinkDestroyFcnPtr(LState);

   // Remember the buffer address; it is the key for the cache lookup above.
   ((CUDAKernel *)Function->Kernel)->BinaryString = BinaryBuffer;

   if (CacheMode) {
     // NOTE(review): freeKernelCUDA returns immediately while CacheMode is
     // set, so the entry evicted here does not appear to be released --
     // confirm whether that is intended.
     if (KernelCache[NextCacheItem])
       freeKernelCUDA(KernelCache[NextCacheItem]);

     KernelCache[NextCacheItem] = Function;

     NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE;
   }

   return Function;
 }

 static void synchronizeDeviceCUDA() {
   dump_function();
   if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
     fprintf(stderr, "Synchronizing device and host memory failed.\n");
     exit(-1);
   }
 }

 /* Copy MemSize bytes from the host buffer HostData into the device
  * allocation DevData.
  *
  * A failing copy is reported on stderr and terminates the process with
  * exit(-1), mirroring copyFromDeviceToHostCUDA; continuing would let kernels
  * run on stale or uninitialized device data.
  */
 static void copyFromHostToDeviceCUDA(void *HostData, PollyGPUDevicePtr *DevData,
                                      long MemSize) {
   dump_function();

   CUdeviceptr CuDevData = ((CUDADevicePtr *)DevData->DevicePtr)->Cuda;
   if (CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize) != CUDA_SUCCESS) {
     fprintf(stderr, "Copying data from host memory to device failed.\n");
     exit(-1);
   }
 }

 /* Copy MemSize bytes from the device allocation DevData into the host buffer
  * HostData; a failing copy is reported on stderr and ends the process with
  * exit(-1). */
 static void copyFromDeviceToHostCUDA(PollyGPUDevicePtr *DevData, void *HostData,
                                      long MemSize) {
   dump_function();

   CUdeviceptr Source = ((CUDADevicePtr *)DevData->DevicePtr)->Cuda;
   CUresult Status = CuMemcpyDtoHFcnPtr(HostData, Source, MemSize);
   if (Status == CUDA_SUCCESS)
     return;

   fprintf(stderr, "Copying results from device to host memory failed.\n");
   exit(-1);
 }

 /* Launch Kernel with a GridDimX x GridDimY x 1 grid of
  * BlockDimX x BlockDimY x BlockDimZ blocks, passing Parameters as the kernel
  * arguments. The launch uses stream 0 and no extra options; a failing launch
  * is reported on stderr and ends the process with exit(-1). */
 static void launchKernelCUDA(PollyGPUFunction *Kernel, unsigned int GridDimX,
                              unsigned int GridDimY, unsigned int BlockDimX,
                              unsigned int BlockDimY, unsigned int BlockDimZ,
                              void **Parameters) {
   dump_function();

   /* NOTE(review): this enumerator is passed as the shared-memory byte count;
    * presumably it is relied on for its numeric value -- confirm. */
   unsigned int SharedMemBytes = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE;
   unsigned GridDimZ = 1;
   CUstream Stream = 0;
   void **Extra = 0;
   CUfunction Entry = ((CUDAKernel *)Kernel->Kernel)->Cuda;

   CUresult Status = CuLaunchKernelFcnPtr(
       Entry, GridDimX, GridDimY, GridDimZ, BlockDimX, BlockDimY, BlockDimZ,
       SharedMemBytes, Stream, Parameters, Extra);
   if (Status == CUDA_SUCCESS)
     return;

   fprintf(stderr, "Launching CUDA kernel failed.\n");
   exit(-1);
 }

 // Default maximum number of managed memory pointers that can be tracked;
 // initManagedPtrsBuffer replaces it with POLLY_MAX_MANAGED_POINTERS if that
 // environment variable is set.
 #define DEFAULT_MAX_POINTERS 4000
 // For the rationale behind a list of free pointers, see `freeManagedCUDA`
 // (reached through `polly_freeManaged`).
 // Table of the pointers recorded by addManagedPtr.
 void **g_managedptrs;
 // Number of entries of g_managedptrs currently in use.
 unsigned long long g_nmanagedptrs = 0;
 // Capacity of g_managedptrs in entries; assigned by initManagedPtrsBuffer.
 unsigned long long g_maxmanagedptrs = 0;

 /* Constructor that sizes and allocates the managed-pointer table.
  *
  * The capacity is DEFAULT_MAX_POINTERS unless the POLLY_MAX_MANAGED_POINTERS
  * environment variable is set, in which case its value (parsed with atoll)
  * is used. A failed allocation is reported on stderr and ends the process
  * with exit(-1) instead of leaving a null table behind for addManagedPtr to
  * write through.
  */
 __attribute__((constructor)) static void initManagedPtrsBuffer() {
   g_maxmanagedptrs = DEFAULT_MAX_POINTERS;
   const char *maxManagedPointersString = getenv("POLLY_MAX_MANAGED_POINTERS");
   if (maxManagedPointersString)
     g_maxmanagedptrs = atoll(maxManagedPointersString);

   g_managedptrs = (void **)malloc(sizeof(void *) * g_maxmanagedptrs);
   /* malloc(0) may legitimately return NULL, so only a non-empty request
    * counts as a failure. */
   if (g_managedptrs == NULL && g_maxmanagedptrs > 0) {
     fprintf(stderr, "Allocate memory for the managed pointer table failed.\n");
     exit(-1);
   }
 }

 // Add a pointer as being allocated by cuMallocManaged.
 //
 // The capacity checks are asserted for debug builds and repeated as a
 // run-time check, so builds with NDEBUG fail with a message instead of
 // writing past the end of g_managedptrs.
 void addManagedPtr(void *mem) {
   assert(g_maxmanagedptrs > 0 && "g_maxmanagedptrs was set to 0!");
   assert(g_nmanagedptrs < g_maxmanagedptrs &&
          "We have hit the maximum number of "
          "managed pointers allowed. Set the "
          "POLLY_MAX_MANAGED_POINTERS environment variable. ");
   if (g_nmanagedptrs >= g_maxmanagedptrs) {
     fprintf(stderr, "We have hit the maximum number of managed pointers "
                     "allowed. Set the POLLY_MAX_MANAGED_POINTERS environment "
                     "variable.\n");
     exit(-1);
   }
   g_managedptrs[g_nmanagedptrs++] = mem;
 }

 // Return 1 if `mem` is one of the first g_nmanagedptrs entries of
 // g_managedptrs, and 0 otherwise.
 int isManagedPtr(void *mem) {
   unsigned long long idx = 0;
   while (idx < g_nmanagedptrs) {
     if (mem == g_managedptrs[idx])
       return 1;
     ++idx;
   }
   return 0;
 }

 void freeManagedCUDA(void *mem) {
   dump_function();

   // In a real-world program this was used (COSMO), there were more `free`
   // calls in the original source than `malloc` calls. Hence, replacing all
   // `free`s with `cudaFree` does not work, since we would try to free
   // 'illegal' memory.
   // As a quick fix, we keep a free list and check if `mem` is a managed memory
   // pointer. If it is, we call `cudaFree`.
   // If not, we pass it along to the underlying allocator.
   // This is a hack, and can be removed if the underlying issue is fixed.
   if (isManagedPtr(mem)) {
     if (CuMemFreeFcnPtr((size_t)mem) != CUDA_SUCCESS) {
       fprintf(stderr, "cudaFree failed.\n");
       exit(-1);
     }
     return;
   } else {
     free(mem);
   }
 }

 // Allocate `size` bytes with cuMemAllocManaged (CU_MEM_ATTACH_GLOBAL), record
 // the pointer via addManagedPtr and return it. The CUDA context is
 // initialized first through polly_initContextCUDA. A failing allocation is
 // reported on stderr and ends the process with exit(-1).
 void *mallocManagedCUDA(size_t size) {
   // Note: [Size 0 allocations]
   // Sometimes, some runtime computation of size could create a size of 0
   // for an allocation. In these cases, we do not wish to fail.
   // The CUDA API fails on size 0 allocations.
   // So, we allocate size a minimum of size 1.
   if (size == 0) {
     if (DebugMode)
       fprintf(stderr, "cudaMallocManaged called with size 0. "
                       "Promoting to size 1");
     size = 1;
   }

   PollyGPUContext *Ctx = polly_initContextCUDA();
   assert(Ctx && "polly_initContextCUDA failed");

   void *ManagedMem;
   if (CuMemAllocManagedFcnPtr((CUdeviceptr *)&ManagedMem, size,
                               CU_MEM_ATTACH_GLOBAL) != CUDA_SUCCESS) {
     fprintf(stderr, "cudaMallocManaged failed for size: %zu\n", size);
     exit(-1);
   }

   addManagedPtr(ManagedMem);
   return ManagedMem;
 }

 /* Release the device memory behind Allocation with cuMemFree, then free the
  * CUDA payload and the PollyGPUDevicePtr wrapper themselves. The result of
  * cuMemFree is not inspected. */
 static void freeDeviceMemoryCUDA(PollyGPUDevicePtr *Allocation) {
   dump_function();

   CUDADevicePtr *Payload = (CUDADevicePtr *)Allocation->DevicePtr;
   CUdeviceptr DeviceAddress = (CUdeviceptr)Payload->Cuda;

   CuMemFreeFcnPtr(DeviceAddress);

   free(Payload);
   free(Allocation);
 }

 static PollyGPUDevicePtr *allocateMemoryForDeviceCUDA(long MemSize) {
   if (!MemSize && DebugMode)
     fprintf(stderr, "allocateMemoryForDeviceCUDA called with size 0. "
                     "Promoting to size 1");
   // see: [Size 0 allocations]
   MemSize = max(MemSize, 1);
   dump_function();

   PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr));
   if (DevData == 0) {
     fprintf(stderr,
             "Allocate memory for GPU device memory pointer failed."
             " Line: %d | Size: %ld\n",
             __LINE__, MemSize);
     exit(-1);
   }
   DevData->DevicePtr = (CUDADevicePtr *)malloc(sizeof(CUDADevicePtr));
   if (DevData->DevicePtr == 0) {
     fprintf(stderr,
             "Allocate memory for GPU device memory pointer failed."
             " Line: %d | Size: %ld\n",
             __LINE__, MemSize);
     exit(-1);
   }

   CUresult Res =
       CuMemAllocFcnPtr(&(((CUDADevicePtr *)DevData->DevicePtr)->Cuda), MemSize);

   if (Res != CUDA_SUCCESS) {
     fprintf(stderr,
             "Allocate memory for GPU device memory pointer failed."
             " Line: %d | Size: %ld\n",
             __LINE__, MemSize);
     exit(-1);
   }

   return DevData;
 }

 /* Return the device address stored in Allocation's CUDA payload, converted
  * to `void *`. */
 static void *getDevicePtrCUDA(PollyGPUDevicePtr *Allocation) {
   dump_function();

   return (void *)((CUDADevicePtr *)Allocation->DevicePtr)->Cuda;
 }

 /* Tear down the CUDA context and close the CUDA libraries.
  *
  * When the context holds a CUDA handle, the profiler is stopped, the handle
  * is destroyed and the payload and wrapper are freed. Both library handles
  * are closed in either case. */
 static void freeContextCUDA(PollyGPUContext *Context) {
   dump_function();

   CUDAContext *Payload = (CUDAContext *)Context->Context;
   CUcontext Handle = Payload->Cuda;
   if (Handle) {
     CuProfilerStopFcnPtr();
     CuCtxDestroyFcnPtr(Handle);
     free(Payload);
     free(Context);
   }

   dlclose(HandleCuda);
   dlclose(HandleCudaRT);
 }

 #endif /* HAS_LIBCUDART */
 /******************************************************************************/
 /*                                    API                                     */
 /******************************************************************************/

 /* Create the GPU context for the runtime selected in `Runtime`.
  *
  * Debug output is enabled when POLLY_DEBUG is set in the environment, and
  * cache mode is enabled unless POLLY_NOCACHE is set. If no supported runtime
  * is selected, err_runtime() is called. */
 PollyGPUContext *polly_initContext() {
   DebugMode = getenv("POLLY_DEBUG") != 0;
   CacheMode = getenv("POLLY_NOCACHE") == 0;

   dump_function();

 #ifdef HAS_LIBCUDART
   if (Runtime == RUNTIME_CUDA)
     return initContextCUDA();
 #endif /* HAS_LIBCUDART */
 #ifdef HAS_LIBOPENCL
   if (Runtime == RUNTIME_CL)
     return initContextCL();
 #endif /* HAS_LIBOPENCL */

   err_runtime();
 }

 /* Release Kernel through the selected runtime; calls err_runtime() if no
  * supported runtime is selected. */
 void polly_freeKernel(PollyGPUFunction *Kernel) {
   dump_function();

 #ifdef HAS_LIBCUDART
   if (Runtime == RUNTIME_CUDA) {
     freeKernelCUDA(Kernel);
     return;
   }
 #endif /* HAS_LIBCUDART */
 #ifdef HAS_LIBOPENCL
   if (Runtime == RUNTIME_CL) {
     freeKernelCL(Kernel);
     return;
   }
 #endif /* HAS_LIBOPENCL */

   err_runtime();
 }

 /* Obtain the kernel KernelName from BinaryBuffer through the selected
  * runtime; calls err_runtime() if no supported runtime is selected. */
 PollyGPUFunction *polly_getKernel(const char *BinaryBuffer,
                                   const char *KernelName) {
   dump_function();

 #ifdef HAS_LIBCUDART
   if (Runtime == RUNTIME_CUDA)
     return getKernelCUDA(BinaryBuffer, KernelName);
 #endif /* HAS_LIBCUDART */
 #ifdef HAS_LIBOPENCL
   if (Runtime == RUNTIME_CL)
     return getKernelCL(BinaryBuffer, KernelName);
 #endif /* HAS_LIBOPENCL */

   err_runtime();
 }

 /* Copy MemSize bytes from HostData to DevData through the selected runtime;
  * calls err_runtime() if no supported runtime is selected. */
 void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData,
                                 long MemSize) {
   dump_function();

 #ifdef HAS_LIBCUDART
   if (Runtime == RUNTIME_CUDA) {
     copyFromHostToDeviceCUDA(HostData, DevData, MemSize);
     return;
   }
 #endif /* HAS_LIBCUDART */
 #ifdef HAS_LIBOPENCL
   if (Runtime == RUNTIME_CL) {
     copyFromHostToDeviceCL(HostData, DevData, MemSize);
     return;
   }
 #endif /* HAS_LIBOPENCL */

   err_runtime();
 }

 /* Copy MemSize bytes from DevData to HostData through the selected runtime;
  * calls err_runtime() if no supported runtime is selected. */
 void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
                                 long MemSize) {
   dump_function();

 #ifdef HAS_LIBCUDART
   if (Runtime == RUNTIME_CUDA) {
     copyFromDeviceToHostCUDA(DevData, HostData, MemSize);
     return;
   }
 #endif /* HAS_LIBCUDART */
 #ifdef HAS_LIBOPENCL
   if (Runtime == RUNTIME_CL) {
     copyFromDeviceToHostCL(DevData, HostData, MemSize);
     return;
   }
 #endif /* HAS_LIBOPENCL */

   err_runtime();
 }

 /* Launch Kernel with the given grid and block dimensions and Parameters
  * through the selected runtime; calls err_runtime() if no supported runtime
  * is selected. */
 void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
                         unsigned int GridDimY, unsigned int BlockDimX,
                         unsigned int BlockDimY, unsigned int BlockDimZ,
                         void **Parameters) {
   dump_function();

 #ifdef HAS_LIBCUDART
   if (Runtime == RUNTIME_CUDA) {
     launchKernelCUDA(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
                      BlockDimZ, Parameters);
     return;
   }
 #endif /* HAS_LIBCUDART */
 #ifdef HAS_LIBOPENCL
   if (Runtime == RUNTIME_CL) {
     launchKernelCL(Kernel, GridDimX, GridDimY, BlockDimX, BlockDimY, BlockDimZ,
                    Parameters);
     return;
   }
 #endif /* HAS_LIBOPENCL */

   err_runtime();
 }

 /* Release Allocation through the selected runtime; calls err_runtime() if no
  * supported runtime is selected. */
 void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) {
   dump_function();

 #ifdef HAS_LIBCUDART
   if (Runtime == RUNTIME_CUDA) {
     freeDeviceMemoryCUDA(Allocation);
     return;
   }
 #endif /* HAS_LIBCUDART */
 #ifdef HAS_LIBOPENCL
   if (Runtime == RUNTIME_CL) {
     freeDeviceMemoryCL(Allocation);
     return;
   }
 #endif /* HAS_LIBOPENCL */

   err_runtime();
 }

 /* Allocate MemSize bytes of device memory through the selected runtime and
  * return the resulting allocation; calls err_runtime() if no supported
  * runtime is selected. */
 PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) {
   dump_function();

 #ifdef HAS_LIBCUDART
   if (Runtime == RUNTIME_CUDA)
     return allocateMemoryForDeviceCUDA(MemSize);
 #endif /* HAS_LIBCUDART */
 #ifdef HAS_LIBOPENCL
   if (Runtime == RUNTIME_CL)
     return allocateMemoryForDeviceCL(MemSize);
 #endif /* HAS_LIBOPENCL */

   err_runtime();
 }

 /* Return the raw device pointer of Allocation as provided by the selected
  * runtime; calls err_runtime() if no supported runtime is selected. */
 void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) {
   dump_function();

 #ifdef HAS_LIBCUDART
   if (Runtime == RUNTIME_CUDA)
     return getDevicePtrCUDA(Allocation);
 #endif /* HAS_LIBCUDART */
 #ifdef HAS_LIBOPENCL
   if (Runtime == RUNTIME_CL)
     return getDevicePtrCL(Allocation);
 #endif /* HAS_LIBOPENCL */

   err_runtime();
 }

 /* Synchronize with the device through the selected runtime; calls
  * err_runtime() if no supported runtime is selected. */
 void polly_synchronizeDevice() {
   dump_function();

 #ifdef HAS_LIBCUDART
   if (Runtime == RUNTIME_CUDA) {
     synchronizeDeviceCUDA();
     return;
   }
 #endif /* HAS_LIBCUDART */
 #ifdef HAS_LIBOPENCL
   if (Runtime == RUNTIME_CL) {
     synchronizeDeviceCL();
     return;
   }
 #endif /* HAS_LIBOPENCL */

   err_runtime();
 }

 /* Release Context through the selected runtime. In cache mode the context is
  * kept and nothing happens; otherwise err_runtime() is called if no supported
  * runtime is selected. */
 void polly_freeContext(PollyGPUContext *Context) {
   dump_function();

   if (CacheMode)
     return;

 #ifdef HAS_LIBCUDART
   if (Runtime == RUNTIME_CUDA) {
     freeContextCUDA(Context);
     return;
   }
 #endif /* HAS_LIBCUDART */
 #ifdef HAS_LIBOPENCL
   if (Runtime == RUNTIME_CL) {
     freeContextCL(Context);
     return;
   }
 #endif /* HAS_LIBOPENCL */

   err_runtime();
 }

 /* Free `mem` via freeManagedCUDA. Builds without CUDA support print an error
  * and end the process with exit(-1). */
 void polly_freeManaged(void *mem) {
   dump_function();

 #ifndef HAS_LIBCUDART
   fprintf(stderr, "No CUDA Runtime. Managed memory only supported by CUDA\n");
   exit(-1);
 #else
   freeManagedCUDA(mem);
 #endif
 }

 /* Allocate `size` bytes via mallocManagedCUDA and return the pointer. Builds
  * without CUDA support print an error and end the process with exit(-1). */
 void *polly_mallocManaged(size_t size) {
   dump_function();

 #ifndef HAS_LIBCUDART
   fprintf(stderr, "No CUDA Runtime. Managed memory only supported by CUDA\n");
   exit(-1);
 #else
   return mallocManagedCUDA(size);
 #endif
 }

 /* Select CUDA as the runtime library and initialize GPUJIT with it. Builds
  * without CUDA support print an error and end the process with exit(-1). */
 PollyGPUContext *polly_initContextCUDA() {
 #ifndef HAS_LIBCUDART
   fprintf(stderr, "GPU Runtime was built without CUDA support.\n");
   exit(-1);
 #else
   Runtime = RUNTIME_CUDA;
   return polly_initContext();
 #endif /* HAS_LIBCUDART */
 }

 /* Select OpenCL as the runtime library and initialize GPUJIT with it. Builds
  * without OpenCL support print an error and end the process with exit(-1). */
 PollyGPUContext *polly_initContextCL() {
 #ifndef HAS_LIBOPENCL
   fprintf(stderr, "GPU Runtime was built without OpenCL support.\n");
   exit(-1);
 #else
   Runtime = RUNTIME_CL;
   return polly_initContext();
 #endif /* HAS_LIBOPENCL */
 }