/******************** GPUJIT.cpp - GPUJIT Execution Engine ********************/
/*                                                                            */
/*                     The LLVM Compiler Infrastructure                       */
/*                                                                            */
/* This file is dual licensed under the MIT and the University of Illinois    */
/* Open Source License. See LICENSE.TXT for details.                         */
/*                                                                            */
/******************************************************************************/
/*                                                                            */
/*  This file implements GPUJIT, a ptx string execution engine for GPU.       */
/*                                                                            */
/******************************************************************************/

#include "GPUJIT.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include <dlfcn.h>
#include <stdio.h>

/* Define Polly's GPGPU data types.
 *
 * Each structure is a thin wrapper holding exactly one CUDA handle in its
 * `Cuda` member. The rest of this file refers to these types through the
 * names PollyGPUContext, PollyGPUModule, ... (presumably typedefs declared in
 * GPUJIT.h -- confirm there).
 */

/* Wraps a CUDA driver API context handle. */
struct PollyGPUContextT {
  CUcontext Cuda;
};

/* Wraps a CUDA driver API module handle. */
struct PollyGPUModuleT {
  CUmodule Cuda;
};

/* Wraps a CUDA driver API function (kernel) handle. */
struct PollyGPUFunctionT {
  CUfunction Cuda;
};

/* Wraps a CUDA driver API device handle. */
struct PollyGPUDeviceT {
  CUdevice Cuda;
};

/* Wraps a CUDA driver API device pointer. */
struct PollyGPUDevicePtrT {
  CUdeviceptr Cuda;
};

/* Wraps a CUDA runtime API event handle. */
struct PollyGPUEventT {
  cudaEvent_t Cuda;
};

/* Dynamic library handles for the CUDA and CUDA runtime library. */
static void *HandleCuda;
static void *HandleCudaRT;

/* Type-defines of function pointer to CUDA driver APIs.
 *
 * For every driver entry point used in this file there is a function type
 * (`...FcnTy`) and a file-local pointer (`...FcnPtr`). The pointers start out
 * null and are assigned in initialDeviceAPIs() from symbols looked up in
 * HandleCuda; a pointer stays null if its symbol could not be resolved.
 */

/* Resolved from "cuMemAlloc_v2". */
typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t);
static CuMemAllocFcnTy *CuMemAllocFcnPtr;

/* Resolved from "cuFuncSetBlockShape". */
typedef CUresult CUDAAPI CuFuncSetBlockShapeFcnTy(CUfunction, int, int, int);
static CuFuncSetBlockShapeFcnTy *CuFuncSetBlockShapeFcnPtr;

/* Resolved from "cuParamSetv". */
typedef CUresult CUDAAPI CuParamSetvFcnTy(CUfunction, int, void *,
                                          unsigned int);
static CuParamSetvFcnTy *CuParamSetvFcnPtr;

/* Resolved from "cuParamSetSize". */
typedef CUresult CUDAAPI CuParamSetSizeFcnTy(CUfunction, unsigned int);
static CuParamSetSizeFcnTy *CuParamSetSizeFcnPtr;

/* Resolved from "cuLaunchGrid". */
typedef CUresult CUDAAPI CuLaunchGridFcnTy(CUfunction, int, int);
static CuLaunchGridFcnTy *CuLaunchGridFcnPtr;

/* Resolved from "cuMemcpyDtoH_v2". */
typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t);
static CuMemcpyDtoHFcnTy *CuMemcpyDtoHFcnPtr;

/* Resolved from "cuMemcpyHtoD_v2". */
typedef CUresult CUDAAPI CuMemcpyHtoDFcnTy(CUdeviceptr, const void *, size_t);
static CuMemcpyHtoDFcnTy *CuMemcpyHtoDFcnPtr;

/* Resolved from "cuMemFree_v2". */
typedef CUresult CUDAAPI CuMemFreeFcnTy(CUdeviceptr);
static CuMemFreeFcnTy *CuMemFreeFcnPtr;

/* Resolved from "cuModuleUnload". */
typedef CUresult CUDAAPI CuModuleUnloadFcnTy(CUmodule);
static CuModuleUnloadFcnTy *CuModuleUnloadFcnPtr;

/* Resolved from "cuCtxDestroy". */
typedef CUresult CUDAAPI CuCtxDestroyFcnTy(CUcontext);
static CuCtxDestroyFcnTy *CuCtxDestroyFcnPtr;

/* Resolved from "cuInit". */
typedef CUresult CUDAAPI CuInitFcnTy(unsigned int);
static CuInitFcnTy *CuInitFcnPtr;

/* Resolved from "cuDeviceGetCount". */
typedef CUresult CUDAAPI CuDeviceGetCountFcnTy(int *);
static CuDeviceGetCountFcnTy *CuDeviceGetCountFcnPtr;

/* Resolved from "cuCtxCreate_v2". */
typedef CUresult CUDAAPI CuCtxCreateFcnTy(CUcontext *, unsigned int, CUdevice);
static CuCtxCreateFcnTy *CuCtxCreateFcnPtr;

/* Resolved from "cuDeviceGet". */
typedef CUresult CUDAAPI CuDeviceGetFcnTy(CUdevice *, int);
static CuDeviceGetFcnTy *CuDeviceGetFcnPtr;

/* Resolved from "cuModuleLoadDataEx". */
typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *,
                                                 unsigned int, CUjit_option *,
                                                 void **);
static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr;

/* Resolved from "cuModuleGetFunction". */
typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule,
                                                  const char *);
static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr;

/* Resolved from "cuDeviceComputeCapability". */
typedef CUresult CUDAAPI CuDeviceComputeCapabilityFcnTy(int *, int *, CUdevice);
static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr;

/* Resolved from "cuDeviceGetName". */
typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice);
static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr;

/* Type-defines of function pointer to CUDA runtime APIs.
 *
 * Same scheme as for the driver API: one function type and one file-local
 * pointer per entry point. The pointers are assigned in initialDeviceAPIs()
 * from symbols looked up in HandleCudaRT and are null until then.
 */

/* Resolved from "cudaEventCreate". */
typedef cudaError_t CUDARTAPI CudaEventCreateFcnTy(cudaEvent_t *);
static CudaEventCreateFcnTy *CudaEventCreateFcnPtr;

/* Resolved from "cudaEventRecord". */
typedef cudaError_t CUDARTAPI CudaEventRecordFcnTy(cudaEvent_t,
                                                   cudaStream_t);
static CudaEventRecordFcnTy *CudaEventRecordFcnPtr;

/* Resolved from "cudaEventSynchronize". */
typedef cudaError_t CUDARTAPI CudaEventSynchronizeFcnTy(cudaEvent_t);
static CudaEventSynchronizeFcnTy *CudaEventSynchronizeFcnPtr;

/* Resolved from "cudaEventElapsedTime". */
typedef cudaError_t CUDARTAPI CudaEventElapsedTimeFcnTy(float *, cudaEvent_t,
                                                        cudaEvent_t);
static CudaEventElapsedTimeFcnTy *CudaEventElapsedTimeFcnPtr;

/* Resolved from "cudaEventDestroy". */
typedef cudaError_t CUDARTAPI CudaEventDestroyFcnTy(cudaEvent_t);
static CudaEventDestroyFcnTy *CudaEventDestroyFcnPtr;

/* Resolved from "cudaThreadSynchronize". */
typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;

/* Look up the symbol FuncName in the shared library identified by Handle.
 *
 * Returns the address reported by dlsym(), or 0 after printing a diagnostic
 * when dlerror() reports a failure for the lookup.
 */
static void *getAPIHandle(void *Handle, const char *FuncName) {
  void *Symbol;
  char *Message;

  /* Discard any stale error so the dlerror() call below only reflects the
   * outcome of this dlsym() lookup. */
  dlerror();

  Symbol = dlsym(Handle, FuncName);
  Message = dlerror();
  if (Message == 0)
    return Symbol;

  fprintf(stdout, "Load CUDA driver API failed: %s. \n", Message);
  return 0;
}

/* Open the CUDA driver library and the CUDA runtime library.
 *
 * On success both HandleCuda and HandleCudaRT hold valid dlopen() handles
 * and 1 is returned. On failure a diagnostic is printed, no library is left
 * open by this function, and 0 is returned.
 */
static int initialDeviceAPILibraries() {
  HandleCuda = dlopen("libcuda.so", RTLD_LAZY);
  if (!HandleCuda) {
    printf("Cannot open library: %s. \n", dlerror());
    return 0;
  }

  HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY);
  if (!HandleCudaRT) {
    /* Report the error first: dlclose() below may overwrite the dlerror()
     * state. */
    printf("Cannot open library: %s. \n", dlerror());

    /* Do not leak the already opened driver library, and do not leave a
     * stale handle behind for later code to use. */
    dlclose(HandleCuda);
    HandleCuda = 0;
    return 0;
  }

  return 1;
}

/* Load the CUDA libraries and resolve every API entry point this file uses.
 *
 * Returns 1 only if both libraries could be opened and all driver and
 * runtime symbols were resolved. Returns 0 otherwise; in that case at least
 * one of the `...FcnPtr` pointers is null and none of them must be called.
 */
static int initialDeviceAPIs() {
  if (initialDeviceAPILibraries() == 0)
    return 0;

  /* Get function pointer to CUDA Driver APIs.
   *
   * Note that compilers conforming to the ISO C standard are required to
   * generate a warning if a conversion from a void * pointer to a function
   * pointer is attempted as in the following statements. The warning
   * of this kind of cast may not be emitted by clang and new versions of gcc
   * as it is valid on POSIX 2008.
   */
  CuFuncSetBlockShapeFcnPtr =
    (CuFuncSetBlockShapeFcnTy *) getAPIHandle(HandleCuda,
                                              "cuFuncSetBlockShape");

  CuParamSetvFcnPtr = (CuParamSetvFcnTy *) getAPIHandle(HandleCuda,
                                                        "cuParamSetv");

  CuParamSetSizeFcnPtr = (CuParamSetSizeFcnTy *) getAPIHandle(HandleCuda,
                                                              "cuParamSetSize");

  CuLaunchGridFcnPtr = (CuLaunchGridFcnTy *) getAPIHandle(HandleCuda,
                                                          "cuLaunchGrid");

  CuMemAllocFcnPtr = (CuMemAllocFcnTy *) getAPIHandle(HandleCuda,
                                                      "cuMemAlloc_v2");

  CuMemFreeFcnPtr = (CuMemFreeFcnTy *) getAPIHandle(HandleCuda, "cuMemFree_v2");

  CuMemcpyDtoHFcnPtr = (CuMemcpyDtoHFcnTy *) getAPIHandle(HandleCuda,
                                                          "cuMemcpyDtoH_v2");

  CuMemcpyHtoDFcnPtr = (CuMemcpyHtoDFcnTy *) getAPIHandle(HandleCuda,
                                                          "cuMemcpyHtoD_v2");

  CuModuleUnloadFcnPtr = (CuModuleUnloadFcnTy *) getAPIHandle(HandleCuda,
                                                              "cuModuleUnload");

  CuCtxDestroyFcnPtr = (CuCtxDestroyFcnTy *) getAPIHandle(HandleCuda,
                                                          "cuCtxDestroy");

  CuInitFcnPtr = (CuInitFcnTy *) getAPIHandle(HandleCuda, "cuInit");

  CuDeviceGetCountFcnPtr = (CuDeviceGetCountFcnTy *) getAPIHandle(HandleCuda,
                                                            "cuDeviceGetCount");

  CuDeviceGetFcnPtr = (CuDeviceGetFcnTy *) getAPIHandle(HandleCuda,
                                                        "cuDeviceGet");

  CuCtxCreateFcnPtr = (CuCtxCreateFcnTy *) getAPIHandle(HandleCuda,
                                                        "cuCtxCreate_v2");

  CuModuleLoadDataExFcnPtr =
    (CuModuleLoadDataExFcnTy *) getAPIHandle(HandleCuda, "cuModuleLoadDataEx");

  CuModuleGetFunctionFcnPtr =
    (CuModuleGetFunctionFcnTy *)getAPIHandle(HandleCuda, "cuModuleGetFunction");

  CuDeviceComputeCapabilityFcnPtr =
    (CuDeviceComputeCapabilityFcnTy *)getAPIHandle(HandleCuda,
                                                   "cuDeviceComputeCapability");

  CuDeviceGetNameFcnPtr =
    (CuDeviceGetNameFcnTy *) getAPIHandle(HandleCuda, "cuDeviceGetName");

  /* Get function pointer to CUDA Runtime APIs. */
  CudaEventCreateFcnPtr =
    (CudaEventCreateFcnTy *) getAPIHandle(HandleCudaRT, "cudaEventCreate");

  CudaEventRecordFcnPtr =
    (CudaEventRecordFcnTy *) getAPIHandle(HandleCudaRT, "cudaEventRecord");

  CudaEventSynchronizeFcnPtr =
    (CudaEventSynchronizeFcnTy *) getAPIHandle(HandleCudaRT,
                                               "cudaEventSynchronize");

  CudaEventElapsedTimeFcnPtr =
    (CudaEventElapsedTimeFcnTy *) getAPIHandle(HandleCudaRT,
                                               "cudaEventElapsedTime");

  CudaEventDestroyFcnPtr =
    (CudaEventDestroyFcnTy *) getAPIHandle(HandleCudaRT, "cudaEventDestroy");

  CudaThreadSynchronizeFcnPtr =
    (CudaThreadSynchronizeFcnTy *) getAPIHandle(HandleCudaRT,
                                                "cudaThreadSynchronize");

  /* getAPIHandle() returns 0 (after printing a diagnostic) for every symbol
   * it cannot resolve. Report that as a failure here instead of letting a
   * later call jump through a null function pointer. */
  if (!CuFuncSetBlockShapeFcnPtr || !CuParamSetvFcnPtr ||
      !CuParamSetSizeFcnPtr || !CuLaunchGridFcnPtr || !CuMemAllocFcnPtr ||
      !CuMemFreeFcnPtr || !CuMemcpyDtoHFcnPtr || !CuMemcpyHtoDFcnPtr ||
      !CuModuleUnloadFcnPtr || !CuCtxDestroyFcnPtr || !CuInitFcnPtr ||
      !CuDeviceGetCountFcnPtr || !CuDeviceGetFcnPtr || !CuCtxCreateFcnPtr ||
      !CuModuleLoadDataExFcnPtr || !CuModuleGetFunctionFcnPtr ||
      !CuDeviceComputeCapabilityFcnPtr || !CuDeviceGetNameFcnPtr)
    return 0;

  if (!CudaEventCreateFcnPtr || !CudaEventRecordFcnPtr ||
      !CudaEventSynchronizeFcnPtr || !CudaEventElapsedTimeFcnPtr ||
      !CudaEventDestroyFcnPtr || !CudaThreadSynchronizeFcnPtr)
    return 0;

  return 1;
}

/* Initialize the CUDA driver API, select a device and create a context.
 *
 * Context  Receives a newly allocated PollyGPUContext holding the context
 *          created on the selected device.
 * Device   Receives a newly allocated PollyGPUDevice holding the selected
 *          device (device ordinal 0).
 *
 * Any failure prints a diagnostic and terminates the process with exit(-1).
 */
void polly_initDevice(PollyGPUContext **Context, PollyGPUDevice **Device) {
  int Major = 0, Minor = 0, DeviceID = 0;
  char DeviceName[256];
  const char *ReportedName = DeviceName;
  int DeviceCount = 0;

  /* Get API handles. */
  if (initialDeviceAPIs() == 0) {
    fprintf(stdout, "Getting the \"handle\" for the CUDA driver API failed.\n");
    exit(-1);
  }

  if (CuInitFcnPtr(0) != CUDA_SUCCESS) {
    fprintf(stdout, "Initializing the CUDA driver API failed.\n");
    exit(-1);
  }

  /* Get number of devices that supports CUDA. */
  if (CuDeviceGetCountFcnPtr(&DeviceCount) != CUDA_SUCCESS) {
    fprintf(stdout, "Querying the number of CUDA devices failed.\n");
    exit(-1);
  }
  if (DeviceCount == 0) {
    fprintf(stdout, "There is no device supporting CUDA.\n");
    exit(-1);
  }

  /* We select the 1st device as default. */
  *Device =  malloc(sizeof(PollyGPUDevice));
  if (*Device == 0) {
    fprintf(stdout, "Allocate memory for Polly GPU device failed.\n");
    exit(-1);
  }
  if (CuDeviceGetFcnPtr(&((*Device)->Cuda), DeviceID) != CUDA_SUCCESS) {
    fprintf(stdout, "Getting the CUDA device failed.\n");
    exit(-1);
  }

  /* Get compute capabilities and the device name. Both are informational
   * only (the compute capability is not used any further), so a failing
   * query is tolerated; the name falls back to a placeholder so that no
   * uninitialized buffer is printed. */
  CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, (*Device)->Cuda);
  if (CuDeviceGetNameFcnPtr(DeviceName, sizeof(DeviceName), (*Device)->Cuda)
      != CUDA_SUCCESS)
    ReportedName = "<unknown>";
  fprintf(stderr, "> Running on GPU device %d : %s.\n", DeviceID,
          ReportedName);

  /* Create context on the device. */
  *Context = malloc(sizeof(PollyGPUContext));
  if (*Context == 0) {
    fprintf(stdout, "Allocate memory for Polly GPU context failed.\n");
    exit(-1);
  }
  if (CuCtxCreateFcnPtr(&((*Context)->Cuda), 0, (*Device)->Cuda)
      != CUDA_SUCCESS) {
    fprintf(stdout, "Creating the CUDA context failed.\n");
    exit(-1);
  }
}

/* Load the PTX assembly text in PTXBuffer as a CUDA module.
 *
 * Module receives a newly allocated PollyGPUModule wrapping the loaded
 * module. A failed allocation or a failed load prints a diagnostic and
 * terminates the process with exit(-1).
 */
void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module) {
  PollyGPUModule *Wrapper;
  CUresult Status;

  Wrapper = malloc(sizeof(PollyGPUModule));
  *Module = Wrapper;
  if (!Wrapper) {
    fprintf(stdout, "Allocate memory for Polly GPU module failed.\n");
    exit(-1);
  }

  /* No JIT options are passed to the loader. */
  Status = CuModuleLoadDataExFcnPtr(&(Wrapper->Cuda), PTXBuffer, 0, 0, 0);
  if (Status == CUDA_SUCCESS)
    return;

  fprintf(stdout, "Loading ptx assembly text failed.\n");
  exit(-1);
}

/* Look up the kernel named KernelName in an already loaded Module.
 *
 * Kernel receives a newly allocated PollyGPUFunction wrapping the kernel's
 * function handle. A failed allocation or a failed lookup prints a
 * diagnostic and terminates the process with exit(-1).
 */
void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module,
                             PollyGPUFunction **Kernel) {
  PollyGPUFunction *Entry;
  CUresult Status;

  Entry = malloc(sizeof(PollyGPUFunction));
  *Kernel = Entry;
  if (!Entry) {
    fprintf(stdout, "Allocate memory for Polly GPU kernel failed.\n");
    exit(-1);
  }

  /* Resolve the entry point inside the module. */
  Status = CuModuleGetFunctionFcnPtr(&(Entry->Cuda), Module->Cuda, KernelName);
  if (Status == CUDA_SUCCESS)
    return;

  fprintf(stdout, "Loading kernel function failed.\n");
  exit(-1);
}

/* Create a pair of CUDA events and record the first one.
 *
 * Start and Stop each receive a newly allocated PollyGPUEvent. The start
 * event is recorded with a stream argument of 0 before returning; the stop
 * event is only created. A failed allocation prints a diagnostic and
 * terminates the process with exit(-1).
 */
void polly_startTimerByCudaEvent(PollyGPUEvent **Start, PollyGPUEvent **Stop) {
  PollyGPUEvent *Begin;
  PollyGPUEvent *End;

  Begin = malloc(sizeof(PollyGPUEvent));
  *Start = Begin;
  if (!Begin) {
    fprintf(stdout, "Allocate memory for Polly GPU start timer failed.\n");
    exit(-1);
  }
  CudaEventCreateFcnPtr(&(Begin->Cuda));

  End = malloc(sizeof(PollyGPUEvent));
  *Stop = End;
  if (!End) {
    fprintf(stdout, "Allocate memory for Polly GPU stop timer failed.\n");
    exit(-1);
  }
  CudaEventCreateFcnPtr(&(End->Cuda));

  /* Mark the beginning of the timed region. */
  CudaEventRecordFcnPtr((*Start)->Cuda, 0);
}

/* Record the stop event, report the elapsed time and release both events.
 *
 * Start, Stop    Event wrappers; both CUDA events are destroyed and both
 *                wrappers are freed before returning.
 * ElapsedTimes   Receives the value written by cudaEventElapsedTime() for
 *                the two events; it is also printed to stderr as
 *                milliseconds.
 */
void polly_stopTimerByCudaEvent(PollyGPUEvent *Start, PollyGPUEvent *Stop,
                                float *ElapsedTimes) {
  cudaEvent_t Begin;
  cudaEvent_t End;

  /* Mark the end of the timed region. */
  End = Stop->Cuda;
  CudaEventRecordFcnPtr(End, 0);

  /* Wait for both events before reading the elapsed time. */
  Begin = Start->Cuda;
  CudaEventSynchronizeFcnPtr(Begin);
  CudaEventSynchronizeFcnPtr(End);
  CudaEventElapsedTimeFcnPtr(ElapsedTimes, Begin, End);

  CudaEventDestroyFcnPtr(Begin);
  CudaEventDestroyFcnPtr(End);
  fprintf(stderr, "Processing time: %f (ms).\n", *ElapsedTimes);

  free(Start);
  free(Stop);
}

/* Allocate MemSize bytes of host memory and MemSize bytes of device memory.
 *
 * HostData  Receives the host buffer obtained from malloc().
 * DevData   Receives a newly allocated PollyGPUDevicePtr wrapping the device
 *           allocation.
 * MemSize   Size in bytes of each of the two buffers.
 *
 * Any failed allocation, including the device allocation, prints a
 * diagnostic and terminates the process with exit(-1).
 */
void polly_allocateMemoryForHostAndDevice(void **HostData,
                                          PollyGPUDevicePtr **DevData,
                                          int MemSize) {
  *HostData = malloc(MemSize);
  if (*HostData == 0) {
    fprintf(stdout, "Could not allocate host memory.\n");
    exit(-1);
  }

  *DevData = malloc(sizeof(PollyGPUDevicePtr));
  if (*DevData == 0) {
    fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n");
    exit(-1);
  }

  /* A failed device allocation would leave (*DevData)->Cuda uninitialized
   * and make every later copy, launch and free operate on a garbage device
   * pointer, so it is treated like the other allocation failures. */
  if (CuMemAllocFcnPtr(&((*DevData)->Cuda), MemSize) != CUDA_SUCCESS) {
    fprintf(stdout, "Could not allocate device memory.\n");
    exit(-1);
  }
}

/* Copy MemSize bytes from the host buffer HostData to the device memory
 * referenced by DevData.
 *
 * A failed copy prints a diagnostic and terminates the process with
 * exit(-1), matching polly_copyFromDeviceToHost().
 */
void polly_copyFromHostToDevice(PollyGPUDevicePtr *DevData, void *HostData,
                                int MemSize) {
  CUdeviceptr CuDevData = DevData->Cuda;

  if (CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize) != CUDA_SUCCESS) {
    fprintf(stdout, "Copying data from host memory to device failed.\n");
    exit(-1);
  }
}

/* Copy MemSize bytes from the device memory referenced by DevData into the
 * host buffer HostData.
 *
 * A failed copy prints a diagnostic and terminates the process with
 * exit(-1).
 */
void polly_copyFromDeviceToHost(void *HostData, PollyGPUDevicePtr *DevData,
                                int MemSize) {
  CUresult Status;

  Status = CuMemcpyDtoHFcnPtr(HostData, DevData->Cuda, MemSize);
  if (Status == CUDA_SUCCESS)
    return;

  fprintf(stdout, "Copying results from device to host memory failed.\n");
  exit(-1);
}

/* Configure the block shape and the single kernel argument of Kernel.
 *
 * Kernel       Kernel to configure.
 * BlockWidth   Block dimension in x.
 * BlockHeight  Block dimension in y (the z dimension is fixed to 1).
 * DevData      Device pointer passed to the kernel as its only parameter.
 *
 * A failing driver call prints a diagnostic and terminates the process with
 * exit(-1); launching a kernel with a partially applied configuration would
 * otherwise go unnoticed.
 */
void polly_setKernelParameters(PollyGPUFunction *Kernel, int BlockWidth,
                               int BlockHeight, PollyGPUDevicePtr *DevData) {
  int ParamOffset = 0;

  if (CuFuncSetBlockShapeFcnPtr(Kernel->Cuda, BlockWidth, BlockHeight, 1)
      != CUDA_SUCCESS) {
    fprintf(stdout, "Setting the CUDA kernel block shape failed.\n");
    exit(-1);
  }

  /* Place the device pointer at the start of the parameter space. */
  if (CuParamSetvFcnPtr(Kernel->Cuda, ParamOffset, &(DevData->Cuda),
                        sizeof(DevData->Cuda)) != CUDA_SUCCESS) {
    fprintf(stdout, "Setting the CUDA kernel parameter failed.\n");
    exit(-1);
  }
  ParamOffset += sizeof(DevData->Cuda);

  /* The total parameter size is the offset just past the last parameter. */
  if (CuParamSetSizeFcnPtr(Kernel->Cuda, ParamOffset) != CUDA_SUCCESS) {
    fprintf(stdout, "Setting the CUDA kernel parameter size failed.\n");
    exit(-1);
  }
}

/* Launch Kernel on a GridWidth x GridHeight grid and wait for it to finish.
 *
 * Both a failed launch and a failure reported by the subsequent
 * synchronization print a diagnostic and terminate the process with
 * exit(-1). The success message is printed only after the synchronization
 * has returned without error.
 */
void polly_launchKernel(PollyGPUFunction *Kernel, int GridWidth,
                        int GridHeight) {
  if (CuLaunchGridFcnPtr(Kernel->Cuda, GridWidth, GridHeight) != CUDA_SUCCESS) {
    fprintf(stdout, "Launching CUDA kernel failed.\n");
    exit(-1);
  }

  /* The launch is asynchronous: errors raised while the kernel executes are
   * only reported by a later synchronizing call, so its result must be
   * checked before claiming success. */
  if (CudaThreadSynchronizeFcnPtr() != cudaSuccess) {
    fprintf(stdout, "Executing CUDA kernel failed.\n");
    exit(-1);
  }
  fprintf(stdout, "CUDA kernel launched.\n");
}

/* Release all resources acquired through this file.
 *
 * HostData  Host buffer to free (may be null).
 * DevData   Device pointer wrapper; the device memory is freed if the
 *           wrapped pointer is non-zero, and the wrapper itself is always
 *           freed.
 * Module    Module wrapper; the module is unloaded if the wrapped handle is
 *           non-null, and the wrapper itself is always freed.
 * Context   Context wrapper; the context is destroyed if the wrapped handle
 *           is non-null, and the wrapper itself is always freed.
 * Kernel    Kernel wrapper to free (may be null).
 *
 * Null wrapper pointers are accepted and skipped. Finally both CUDA
 * libraries are closed and their handles reset.
 */
void polly_cleanupGPGPUResources(void *HostData, PollyGPUDevicePtr *DevData,
                                 PollyGPUModule *Module,
                                 PollyGPUContext *Context,
                                 PollyGPUFunction *Kernel) {
  /* free() accepts a null pointer, so no guard is needed. */
  free(HostData);

  /* Release the CUDA objects in the order device memory, module, context;
   * each wrapper is freed even when the handle inside it is empty. */
  if (DevData) {
    if (DevData->Cuda)
      CuMemFreeFcnPtr(DevData->Cuda);
    free(DevData);
  }

  if (Module) {
    if (Module->Cuda)
      CuModuleUnloadFcnPtr(Module->Cuda);
    free(Module);
  }

  if (Context) {
    if (Context->Cuda)
      CuCtxDestroyFcnPtr(Context->Cuda);
    free(Context);
  }

  free(Kernel);

  /* Close the libraries last, since the calls above go through function
   * pointers into them. Resetting the handles keeps a repeated cleanup from
   * closing a library twice. */
  if (HandleCuda) {
    dlclose(HandleCuda);
    HandleCuda = 0;
  }
  if (HandleCudaRT) {
    dlclose(HandleCudaRT);
    HandleCudaRT = 0;
  }
}
