/******************** GPUJIT.cpp - GPUJIT Execution Engine ********************/
| /* */ |
| /* The LLVM Compiler Infrastructure */ |
| /* */ |
| /* This file is dual licensed under the MIT and the University of Illinois */ |
| /* Open Source License. See LICENSE.TXT for details. */ |
| /* */ |
| /******************************************************************************/ |
| /* */ |
| /* This file implements GPUJIT, a ptx string execution engine for GPU. */ |
| /* */ |
| /******************************************************************************/ |
| |
| #include "GPUJIT.h" |
| #include <cuda.h> |
| #include <cuda_runtime.h> |
| #include <dlfcn.h> |
| #include <stdio.h> |
| |
| /* Define Polly's GPGPU data types. */ |
| struct PollyGPUContextT { |
| CUcontext Cuda; |
| }; |
| |
| struct PollyGPUModuleT { |
| CUmodule Cuda; |
| }; |
| |
| struct PollyGPUFunctionT { |
| CUfunction Cuda; |
| }; |
| |
| struct PollyGPUDeviceT { |
| CUdevice Cuda; |
| }; |
| |
| struct PollyGPUDevicePtrT { |
| CUdeviceptr Cuda; |
| }; |
| |
| struct PollyGPUEventT { |
| cudaEvent_t Cuda; |
| }; |
| |
| /* Dynamic library handles for the CUDA and CUDA runtime library. */ |
/* dlopen() handle for the CUDA driver library ("libcuda.so"). */
static void *HandleCuda;

/* dlopen() handle for the CUDA runtime library ("libcudart.so"). */
static void *HandleCudaRT;
| |
| /* Type-defines of function pointer to CUDA driver APIs. */ |
| typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t); |
| static CuMemAllocFcnTy *CuMemAllocFcnPtr; |
| |
| typedef CUresult CUDAAPI CuFuncSetBlockShapeFcnTy(CUfunction, int, int, int); |
| static CuFuncSetBlockShapeFcnTy *CuFuncSetBlockShapeFcnPtr; |
| |
| typedef CUresult CUDAAPI |
| CuParamSetvFcnTy(CUfunction, int, void *, unsigned int); |
| static CuParamSetvFcnTy *CuParamSetvFcnPtr; |
| |
| typedef CUresult CUDAAPI CuParamSetSizeFcnTy(CUfunction, unsigned int); |
| static CuParamSetSizeFcnTy *CuParamSetSizeFcnPtr; |
| |
| typedef CUresult CUDAAPI CuLaunchGridFcnTy(CUfunction, int, int); |
| static CuLaunchGridFcnTy *CuLaunchGridFcnPtr; |
| |
| typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t); |
| static CuMemcpyDtoHFcnTy *CuMemcpyDtoHFcnPtr; |
| |
| typedef CUresult CUDAAPI CuMemcpyHtoDFcnTy(CUdeviceptr, const void *, size_t); |
| static CuMemcpyHtoDFcnTy *CuMemcpyHtoDFcnPtr; |
| |
| typedef CUresult CUDAAPI CuMemFreeFcnTy(CUdeviceptr); |
| static CuMemFreeFcnTy *CuMemFreeFcnPtr; |
| |
| typedef CUresult CUDAAPI CuModuleUnloadFcnTy(CUmodule); |
| static CuModuleUnloadFcnTy *CuModuleUnloadFcnPtr; |
| |
| typedef CUresult CUDAAPI CuCtxDestroyFcnTy(CUcontext); |
| static CuCtxDestroyFcnTy *CuCtxDestroyFcnPtr; |
| |
| typedef CUresult CUDAAPI CuInitFcnTy(unsigned int); |
| static CuInitFcnTy *CuInitFcnPtr; |
| |
| typedef CUresult CUDAAPI CuDeviceGetCountFcnTy(int *); |
| static CuDeviceGetCountFcnTy *CuDeviceGetCountFcnPtr; |
| |
| typedef CUresult CUDAAPI CuCtxCreateFcnTy(CUcontext *, unsigned int, CUdevice); |
| static CuCtxCreateFcnTy *CuCtxCreateFcnPtr; |
| |
| typedef CUresult CUDAAPI CuDeviceGetFcnTy(CUdevice *, int); |
| static CuDeviceGetFcnTy *CuDeviceGetFcnPtr; |
| |
| typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy( |
| CUmodule *, const void *, unsigned int, CUjit_option *, void **); |
| static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr; |
| |
| typedef CUresult CUDAAPI |
| CuModuleGetFunctionFcnTy(CUfunction *, CUmodule, const char *); |
| static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr; |
| |
| typedef CUresult CUDAAPI CuDeviceComputeCapabilityFcnTy(int *, int *, CUdevice); |
| static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr; |
| |
| typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice); |
| static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr; |
| |
/* Type-defines of function pointers to CUDA runtime APIs. */
| typedef cudaError_t CUDARTAPI CudaEventCreateFcnTy(cudaEvent_t *); |
| static CudaEventCreateFcnTy *CudaEventCreateFcnPtr; |
| |
| typedef cudaError_t CUDARTAPI CudaEventRecordFcnTy(cudaEvent_t, cudaStream_t); |
| static CudaEventRecordFcnTy *CudaEventRecordFcnPtr; |
| |
| typedef cudaError_t CUDARTAPI CudaEventSynchronizeFcnTy(cudaEvent_t); |
| static CudaEventSynchronizeFcnTy *CudaEventSynchronizeFcnPtr; |
| |
| typedef cudaError_t CUDARTAPI |
| CudaEventElapsedTimeFcnTy(float *, cudaEvent_t, cudaEvent_t); |
| static CudaEventElapsedTimeFcnTy *CudaEventElapsedTimeFcnPtr; |
| |
| typedef cudaError_t CUDARTAPI CudaEventDestroyFcnTy(cudaEvent_t); |
| static CudaEventDestroyFcnTy *CudaEventDestroyFcnPtr; |
| |
| typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void); |
| static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr; |
| |
/*
 * Look up the symbol FuncName in the dynamic library referred to by Handle.
 *
 * Handle   - library handle obtained from dlopen().
 * FuncName - name of the symbol to resolve.
 *
 * Returns the symbol address, or NULL when the lookup fails. On failure a
 * diagnostic naming the symbol is printed to stdout. This helper serves both
 * the driver and the runtime library, so the message does not name either.
 */
static void *getAPIHandle(void *Handle, const char *FuncName) {
  const char *Err;
  void *FuncPtr;

  /* Clear any stale error so the dlerror() below refers to this lookup. */
  dlerror();
  FuncPtr = dlsym(Handle, FuncName);
  Err = dlerror();

  /* A symbol that resolves to NULL cannot be called, so treat it as failure. */
  if (Err != NULL || FuncPtr == NULL) {
    fprintf(stdout, "Load CUDA API symbol '%s' failed: %s. \n", FuncName,
            Err != NULL ? Err : "symbol resolved to a null address");
    return NULL;
  }
  return FuncPtr;
}
| |
| static int initialDeviceAPILibraries() { |
| HandleCuda = dlopen("libcuda.so", RTLD_LAZY); |
| if (!HandleCuda) { |
| printf("Cannot open library: %s. \n", dlerror()); |
| return 0; |
| } |
| |
| HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY); |
| if (!HandleCudaRT) { |
| printf("Cannot open library: %s. \n", dlerror()); |
| return 0; |
| } |
| |
| return 1; |
| } |
| |
| static int initialDeviceAPIs() { |
| if (initialDeviceAPILibraries() == 0) |
| return 0; |
| |
| /* Get function pointer to CUDA Driver APIs. |
| * |
| * Note that compilers conforming to the ISO C standard are required to |
| * generate a warning if a conversion from a void * pointer to a function |
| * pointer is attempted as in the following statements. The warning |
| * of this kind of cast may not be emitted by clang and new versions of gcc |
| * as it is valid on POSIX 2008. |
| */ |
| CuFuncSetBlockShapeFcnPtr = (CuFuncSetBlockShapeFcnTy *)getAPIHandle( |
| HandleCuda, "cuFuncSetBlockShape"); |
| |
| CuParamSetvFcnPtr = |
| (CuParamSetvFcnTy *)getAPIHandle(HandleCuda, "cuParamSetv"); |
| |
| CuParamSetSizeFcnPtr = |
| (CuParamSetSizeFcnTy *)getAPIHandle(HandleCuda, "cuParamSetSize"); |
| |
| CuLaunchGridFcnPtr = |
| (CuLaunchGridFcnTy *)getAPIHandle(HandleCuda, "cuLaunchGrid"); |
| |
| CuMemAllocFcnPtr = |
| (CuMemAllocFcnTy *)getAPIHandle(HandleCuda, "cuMemAlloc_v2"); |
| |
| CuMemFreeFcnPtr = (CuMemFreeFcnTy *)getAPIHandle(HandleCuda, "cuMemFree_v2"); |
| |
| CuMemcpyDtoHFcnPtr = |
| (CuMemcpyDtoHFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyDtoH_v2"); |
| |
| CuMemcpyHtoDFcnPtr = |
| (CuMemcpyHtoDFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyHtoD_v2"); |
| |
| CuModuleUnloadFcnPtr = |
| (CuModuleUnloadFcnTy *)getAPIHandle(HandleCuda, "cuModuleUnload"); |
| |
| CuCtxDestroyFcnPtr = |
| (CuCtxDestroyFcnTy *)getAPIHandle(HandleCuda, "cuCtxDestroy"); |
| |
| CuInitFcnPtr = (CuInitFcnTy *)getAPIHandle(HandleCuda, "cuInit"); |
| |
| CuDeviceGetCountFcnPtr = |
| (CuDeviceGetCountFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetCount"); |
| |
| CuDeviceGetFcnPtr = |
| (CuDeviceGetFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGet"); |
| |
| CuCtxCreateFcnPtr = |
| (CuCtxCreateFcnTy *)getAPIHandle(HandleCuda, "cuCtxCreate_v2"); |
| |
| CuModuleLoadDataExFcnPtr = |
| (CuModuleLoadDataExFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadDataEx"); |
| |
| CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandle( |
| HandleCuda, "cuModuleGetFunction"); |
| |
| CuDeviceComputeCapabilityFcnPtr = |
| (CuDeviceComputeCapabilityFcnTy *)getAPIHandle( |
| HandleCuda, "cuDeviceComputeCapability"); |
| |
| CuDeviceGetNameFcnPtr = |
| (CuDeviceGetNameFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetName"); |
| |
| /* Get function pointer to CUDA Runtime APIs. */ |
| CudaEventCreateFcnPtr = |
| (CudaEventCreateFcnTy *)getAPIHandle(HandleCudaRT, "cudaEventCreate"); |
| |
| CudaEventRecordFcnPtr = |
| (CudaEventRecordFcnTy *)getAPIHandle(HandleCudaRT, "cudaEventRecord"); |
| |
| CudaEventSynchronizeFcnPtr = (CudaEventSynchronizeFcnTy *)getAPIHandle( |
| HandleCudaRT, "cudaEventSynchronize"); |
| |
| CudaEventElapsedTimeFcnPtr = (CudaEventElapsedTimeFcnTy *)getAPIHandle( |
| HandleCudaRT, "cudaEventElapsedTime"); |
| |
| CudaEventDestroyFcnPtr = |
| (CudaEventDestroyFcnTy *)getAPIHandle(HandleCudaRT, "cudaEventDestroy"); |
| |
| CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle( |
| HandleCudaRT, "cudaThreadSynchronize"); |
| |
| return 1; |
| } |
| |
| void polly_initDevice(PollyGPUContext **Context, PollyGPUDevice **Device) { |
| int Major = 0, Minor = 0, DeviceID = 0; |
| char DeviceName[256]; |
| int DeviceCount = 0; |
| |
| /* Get API handles. */ |
| if (initialDeviceAPIs() == 0) { |
| fprintf(stdout, "Getting the \"handle\" for the CUDA driver API failed.\n"); |
| exit(-1); |
| } |
| |
| if (CuInitFcnPtr(0) != CUDA_SUCCESS) { |
| fprintf(stdout, "Initializing the CUDA driver API failed.\n"); |
| exit(-1); |
| } |
| |
| /* Get number of devices that supports CUDA. */ |
| CuDeviceGetCountFcnPtr(&DeviceCount); |
| if (DeviceCount == 0) { |
| fprintf(stdout, "There is no device supporting CUDA.\n"); |
| exit(-1); |
| } |
| |
| /* We select the 1st device as default. */ |
| *Device = malloc(sizeof(PollyGPUDevice)); |
| if (*Device == 0) { |
| fprintf(stdout, "Allocate memory for Polly GPU device failed.\n"); |
| exit(-1); |
| } |
| CuDeviceGetFcnPtr(&((*Device)->Cuda), 0); |
| |
| /* Get compute capabilities and the device name. */ |
| CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, (*Device)->Cuda); |
| CuDeviceGetNameFcnPtr(DeviceName, 256, (*Device)->Cuda); |
| fprintf(stderr, "> Running on GPU device %d : %s.\n", DeviceID, DeviceName); |
| |
| /* Create context on the device. */ |
| *Context = malloc(sizeof(PollyGPUContext)); |
| if (*Context == 0) { |
| fprintf(stdout, "Allocate memory for Polly GPU context failed.\n"); |
| exit(-1); |
| } |
| CuCtxCreateFcnPtr(&((*Context)->Cuda), 0, (*Device)->Cuda); |
| } |
| |
| void polly_getPTXModule(void *PTXBuffer, PollyGPUModule **Module) { |
| *Module = malloc(sizeof(PollyGPUModule)); |
| if (*Module == 0) { |
| fprintf(stdout, "Allocate memory for Polly GPU module failed.\n"); |
| exit(-1); |
| } |
| |
| if (CuModuleLoadDataExFcnPtr(&((*Module)->Cuda), PTXBuffer, 0, 0, 0) != |
| CUDA_SUCCESS) { |
| fprintf(stdout, "Loading ptx assembly text failed.\n"); |
| exit(-1); |
| } |
| } |
| |
| void polly_getPTXKernelEntry(const char *KernelName, PollyGPUModule *Module, |
| PollyGPUFunction **Kernel) { |
| *Kernel = malloc(sizeof(PollyGPUFunction)); |
| if (*Kernel == 0) { |
| fprintf(stdout, "Allocate memory for Polly GPU kernel failed.\n"); |
| exit(-1); |
| } |
| |
| /* Locate the kernel entry point. */ |
| if (CuModuleGetFunctionFcnPtr(&((*Kernel)->Cuda), Module->Cuda, KernelName) != |
| CUDA_SUCCESS) { |
| fprintf(stdout, "Loading kernel function failed.\n"); |
| exit(-1); |
| } |
| } |
| |
| void polly_startTimerByCudaEvent(PollyGPUEvent **Start, PollyGPUEvent **Stop) { |
| *Start = malloc(sizeof(PollyGPUEvent)); |
| if (*Start == 0) { |
| fprintf(stdout, "Allocate memory for Polly GPU start timer failed.\n"); |
| exit(-1); |
| } |
| CudaEventCreateFcnPtr(&((*Start)->Cuda)); |
| |
| *Stop = malloc(sizeof(PollyGPUEvent)); |
| if (*Stop == 0) { |
| fprintf(stdout, "Allocate memory for Polly GPU stop timer failed.\n"); |
| exit(-1); |
| } |
| CudaEventCreateFcnPtr(&((*Stop)->Cuda)); |
| |
| /* Record the start time. */ |
| CudaEventRecordFcnPtr((*Start)->Cuda, 0); |
| } |
| |
| void polly_stopTimerByCudaEvent(PollyGPUEvent *Start, PollyGPUEvent *Stop, |
| float *ElapsedTimes) { |
| /* Record the end time. */ |
| CudaEventRecordFcnPtr(Stop->Cuda, 0); |
| CudaEventSynchronizeFcnPtr(Start->Cuda); |
| CudaEventSynchronizeFcnPtr(Stop->Cuda); |
| CudaEventElapsedTimeFcnPtr(ElapsedTimes, Start->Cuda, Stop->Cuda); |
| CudaEventDestroyFcnPtr(Start->Cuda); |
| CudaEventDestroyFcnPtr(Stop->Cuda); |
| fprintf(stderr, "Processing time: %f (ms).\n", *ElapsedTimes); |
| |
| free(Start); |
| free(Stop); |
| } |
| |
| void polly_allocateMemoryForHostAndDevice( |
| void **HostData, PollyGPUDevicePtr **DevData, int MemSize) { |
| if ((*HostData = (int *)malloc(MemSize)) == 0) { |
| fprintf(stdout, "Could not allocate host memory.\n"); |
| exit(-1); |
| } |
| |
| *DevData = malloc(sizeof(PollyGPUDevicePtr)); |
| if (*DevData == 0) { |
| fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n"); |
| exit(-1); |
| } |
| CuMemAllocFcnPtr(&((*DevData)->Cuda), MemSize); |
| } |
| |
| void polly_copyFromHostToDevice(PollyGPUDevicePtr *DevData, void *HostData, |
| int MemSize) { |
| CUdeviceptr CuDevData = DevData->Cuda; |
| CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize); |
| } |
| |
| void polly_copyFromDeviceToHost(void *HostData, PollyGPUDevicePtr *DevData, |
| int MemSize) { |
| if (CuMemcpyDtoHFcnPtr(HostData, DevData->Cuda, MemSize) != CUDA_SUCCESS) { |
| fprintf(stdout, "Copying results from device to host memory failed.\n"); |
| exit(-1); |
| } |
| } |
| |
| void polly_setKernelParameters(PollyGPUFunction *Kernel, int BlockWidth, |
| int BlockHeight, PollyGPUDevicePtr *DevData) { |
| int ParamOffset = 0; |
| |
| CuFuncSetBlockShapeFcnPtr(Kernel->Cuda, BlockWidth, BlockHeight, 1); |
| CuParamSetvFcnPtr(Kernel->Cuda, ParamOffset, &(DevData->Cuda), |
| sizeof(DevData->Cuda)); |
| ParamOffset += sizeof(DevData->Cuda); |
| CuParamSetSizeFcnPtr(Kernel->Cuda, ParamOffset); |
| } |
| |
| void polly_launchKernel(PollyGPUFunction *Kernel, int GridWidth, |
| int GridHeight) { |
| if (CuLaunchGridFcnPtr(Kernel->Cuda, GridWidth, GridHeight) != CUDA_SUCCESS) { |
| fprintf(stdout, "Launching CUDA kernel failed.\n"); |
| exit(-1); |
| } |
| CudaThreadSynchronizeFcnPtr(); |
| fprintf(stdout, "CUDA kernel launched.\n"); |
| } |
| |
| void polly_cleanupGPGPUResources( |
| void *HostData, PollyGPUDevicePtr *DevData, PollyGPUModule *Module, |
| PollyGPUContext *Context, PollyGPUFunction *Kernel) { |
| if (HostData) { |
| free(HostData); |
| HostData = 0; |
| } |
| |
| if (DevData->Cuda) { |
| CuMemFreeFcnPtr(DevData->Cuda); |
| free(DevData); |
| } |
| |
| if (Module->Cuda) { |
| CuModuleUnloadFcnPtr(Module->Cuda); |
| free(Module); |
| } |
| |
| if (Context->Cuda) { |
| CuCtxDestroyFcnPtr(Context->Cuda); |
| free(Context); |
| } |
| |
| if (Kernel) { |
| free(Kernel); |
| } |
| |
| dlclose(HandleCuda); |
| dlclose(HandleCudaRT); |
| } |