| /******************** GPUJIT.c - GPUJIT Execution Engine **********************/ |
| /* */ |
| /* The LLVM Compiler Infrastructure */ |
| /* */ |
| /* This file is dual licensed under the MIT and the University of Illinois */ |
| /* Open Source License. See LICENSE.TXT for details. */ |
| /* */ |
| /******************************************************************************/ |
| /* */ |
| /* This file implements GPUJIT, a ptx string execution engine for GPU. */ |
| /* */ |
| /******************************************************************************/ |
| |
#include "GPUJIT.h"
#include <cuda.h>
#include <cuda_runtime.h>
#include <dlfcn.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
| |
/* Runtime switches, both derived from the environment by polly_initContext():
 * DebugMode is set when POLLY_DEBUG is defined, CacheMode is cleared when
 * POLLY_NOCACHE is defined. */
static int DebugMode;
static int CacheMode;

/* Print a printf-style message to stderr, but only while DebugMode is set. */
static void debug_print(const char *format, ...) {
  if (DebugMode) {
    va_list Arguments;
    va_start(Arguments, format);
    vfprintf(stderr, format, Arguments);
    va_end(Arguments);
  }
}

/* Trace entry into the enclosing function (silent unless DebugMode is set). */
#define dump_function() debug_print("-> %s\n", __func__)
| |
| /* Define Polly's GPGPU data types. */ |
| struct PollyGPUContextT { |
| CUcontext Cuda; |
| }; |
| |
| struct PollyGPUFunctionT { |
| CUfunction Cuda; |
| CUmodule CudaModule; |
| const char *PTXString; |
| }; |
| |
| struct PollyGPUDevicePtrT { |
| CUdeviceptr Cuda; |
| }; |
| |
| /* Dynamic library handles for the CUDA and CUDA runtime library. */ |
| static void *HandleCuda; |
| static void *HandleCudaRT; |
| |
| /* Type-defines of function pointer to CUDA driver APIs. */ |
| typedef CUresult CUDAAPI CuMemAllocFcnTy(CUdeviceptr *, size_t); |
| static CuMemAllocFcnTy *CuMemAllocFcnPtr; |
| |
| typedef CUresult CUDAAPI CuLaunchKernelFcnTy( |
| CUfunction f, unsigned int gridDimX, unsigned int gridDimY, |
| unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, |
| unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, |
| void **kernelParams, void **extra); |
| static CuLaunchKernelFcnTy *CuLaunchKernelFcnPtr; |
| |
| typedef CUresult CUDAAPI CuMemcpyDtoHFcnTy(void *, CUdeviceptr, size_t); |
| static CuMemcpyDtoHFcnTy *CuMemcpyDtoHFcnPtr; |
| |
| typedef CUresult CUDAAPI CuMemcpyHtoDFcnTy(CUdeviceptr, const void *, size_t); |
| static CuMemcpyHtoDFcnTy *CuMemcpyHtoDFcnPtr; |
| |
| typedef CUresult CUDAAPI CuMemFreeFcnTy(CUdeviceptr); |
| static CuMemFreeFcnTy *CuMemFreeFcnPtr; |
| |
| typedef CUresult CUDAAPI CuModuleUnloadFcnTy(CUmodule); |
| static CuModuleUnloadFcnTy *CuModuleUnloadFcnPtr; |
| |
| typedef CUresult CUDAAPI CuCtxDestroyFcnTy(CUcontext); |
| static CuCtxDestroyFcnTy *CuCtxDestroyFcnPtr; |
| |
| typedef CUresult CUDAAPI CuInitFcnTy(unsigned int); |
| static CuInitFcnTy *CuInitFcnPtr; |
| |
| typedef CUresult CUDAAPI CuDeviceGetCountFcnTy(int *); |
| static CuDeviceGetCountFcnTy *CuDeviceGetCountFcnPtr; |
| |
| typedef CUresult CUDAAPI CuCtxCreateFcnTy(CUcontext *, unsigned int, CUdevice); |
| static CuCtxCreateFcnTy *CuCtxCreateFcnPtr; |
| |
| typedef CUresult CUDAAPI CuDeviceGetFcnTy(CUdevice *, int); |
| static CuDeviceGetFcnTy *CuDeviceGetFcnPtr; |
| |
| typedef CUresult CUDAAPI CuModuleLoadDataExFcnTy(CUmodule *, const void *, |
| unsigned int, CUjit_option *, |
| void **); |
| static CuModuleLoadDataExFcnTy *CuModuleLoadDataExFcnPtr; |
| |
| typedef CUresult CUDAAPI CuModuleLoadDataFcnTy(CUmodule *module, |
| const void *image); |
| static CuModuleLoadDataFcnTy *CuModuleLoadDataFcnPtr; |
| |
| typedef CUresult CUDAAPI CuModuleGetFunctionFcnTy(CUfunction *, CUmodule, |
| const char *); |
| static CuModuleGetFunctionFcnTy *CuModuleGetFunctionFcnPtr; |
| |
| typedef CUresult CUDAAPI CuDeviceComputeCapabilityFcnTy(int *, int *, CUdevice); |
| static CuDeviceComputeCapabilityFcnTy *CuDeviceComputeCapabilityFcnPtr; |
| |
| typedef CUresult CUDAAPI CuDeviceGetNameFcnTy(char *, int, CUdevice); |
| static CuDeviceGetNameFcnTy *CuDeviceGetNameFcnPtr; |
| |
| typedef CUresult CUDAAPI CuLinkAddDataFcnTy(CUlinkState state, |
| CUjitInputType type, void *data, |
| size_t size, const char *name, |
| unsigned int numOptions, |
| CUjit_option *options, |
| void **optionValues); |
| static CuLinkAddDataFcnTy *CuLinkAddDataFcnPtr; |
| |
| typedef CUresult CUDAAPI CuLinkCreateFcnTy(unsigned int numOptions, |
| CUjit_option *options, |
| void **optionValues, |
| CUlinkState *stateOut); |
| static CuLinkCreateFcnTy *CuLinkCreateFcnPtr; |
| |
| typedef CUresult CUDAAPI CuLinkCompleteFcnTy(CUlinkState state, void **cubinOut, |
| size_t *sizeOut); |
| static CuLinkCompleteFcnTy *CuLinkCompleteFcnPtr; |
| |
| typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState state); |
| static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr; |
| |
| /* Type-defines of function pointer ot CUDA runtime APIs. */ |
| typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void); |
| static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr; |
| |
/* Look up the symbol FuncName in the library referred to by Handle.
 *
 * Returns the address reported by dlsym(). When dlerror() reports a problem
 * for the lookup, its text is printed to stdout and 0 is returned instead. */
static void *getAPIHandle(void *Handle, const char *FuncName) {
  /* Discard any stale error so the dlerror() call below only reports on the
   * dlsym() that follows. */
  dlerror();

  void *Symbol = dlsym(Handle, FuncName);
  char *Message = dlerror();
  if (Message == 0)
    return Symbol;

  fprintf(stdout, "Load CUDA driver API failed: %s. \n", Message);
  return 0;
}
| |
| static int initialDeviceAPILibraries() { |
| HandleCuda = dlopen("libcuda.so", RTLD_LAZY); |
| if (!HandleCuda) { |
| printf("Cannot open library: %s. \n", dlerror()); |
| return 0; |
| } |
| |
| HandleCudaRT = dlopen("libcudart.so", RTLD_LAZY); |
| if (!HandleCudaRT) { |
| printf("Cannot open library: %s. \n", dlerror()); |
| return 0; |
| } |
| |
| return 1; |
| } |
| |
| static int initialDeviceAPIs() { |
| if (initialDeviceAPILibraries() == 0) |
| return 0; |
| |
| /* Get function pointer to CUDA Driver APIs. |
| * |
| * Note that compilers conforming to the ISO C standard are required to |
| * generate a warning if a conversion from a void * pointer to a function |
| * pointer is attempted as in the following statements. The warning |
| * of this kind of cast may not be emitted by clang and new versions of gcc |
| * as it is valid on POSIX 2008. |
| */ |
| CuLaunchKernelFcnPtr = |
| (CuLaunchKernelFcnTy *)getAPIHandle(HandleCuda, "cuLaunchKernel"); |
| |
| CuMemAllocFcnPtr = |
| (CuMemAllocFcnTy *)getAPIHandle(HandleCuda, "cuMemAlloc_v2"); |
| |
| CuMemFreeFcnPtr = (CuMemFreeFcnTy *)getAPIHandle(HandleCuda, "cuMemFree_v2"); |
| |
| CuMemcpyDtoHFcnPtr = |
| (CuMemcpyDtoHFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyDtoH_v2"); |
| |
| CuMemcpyHtoDFcnPtr = |
| (CuMemcpyHtoDFcnTy *)getAPIHandle(HandleCuda, "cuMemcpyHtoD_v2"); |
| |
| CuModuleUnloadFcnPtr = |
| (CuModuleUnloadFcnTy *)getAPIHandle(HandleCuda, "cuModuleUnload"); |
| |
| CuCtxDestroyFcnPtr = |
| (CuCtxDestroyFcnTy *)getAPIHandle(HandleCuda, "cuCtxDestroy"); |
| |
| CuInitFcnPtr = (CuInitFcnTy *)getAPIHandle(HandleCuda, "cuInit"); |
| |
| CuDeviceGetCountFcnPtr = |
| (CuDeviceGetCountFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetCount"); |
| |
| CuDeviceGetFcnPtr = |
| (CuDeviceGetFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGet"); |
| |
| CuCtxCreateFcnPtr = |
| (CuCtxCreateFcnTy *)getAPIHandle(HandleCuda, "cuCtxCreate_v2"); |
| |
| CuModuleLoadDataExFcnPtr = |
| (CuModuleLoadDataExFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadDataEx"); |
| |
| CuModuleLoadDataFcnPtr = |
| (CuModuleLoadDataFcnTy *)getAPIHandle(HandleCuda, "cuModuleLoadData"); |
| |
| CuModuleGetFunctionFcnPtr = (CuModuleGetFunctionFcnTy *)getAPIHandle( |
| HandleCuda, "cuModuleGetFunction"); |
| |
| CuDeviceComputeCapabilityFcnPtr = |
| (CuDeviceComputeCapabilityFcnTy *)getAPIHandle( |
| HandleCuda, "cuDeviceComputeCapability"); |
| |
| CuDeviceGetNameFcnPtr = |
| (CuDeviceGetNameFcnTy *)getAPIHandle(HandleCuda, "cuDeviceGetName"); |
| |
| CuLinkAddDataFcnPtr = |
| (CuLinkAddDataFcnTy *)getAPIHandle(HandleCuda, "cuLinkAddData"); |
| |
| CuLinkCreateFcnPtr = |
| (CuLinkCreateFcnTy *)getAPIHandle(HandleCuda, "cuLinkCreate"); |
| |
| CuLinkCompleteFcnPtr = |
| (CuLinkCompleteFcnTy *)getAPIHandle(HandleCuda, "cuLinkComplete"); |
| |
| CuLinkDestroyFcnPtr = |
| (CuLinkDestroyFcnTy *)getAPIHandle(HandleCuda, "cuLinkDestroy"); |
| |
| /* Get function pointer to CUDA Runtime APIs. */ |
| CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle( |
| HandleCudaRT, "cudaThreadSynchronize"); |
| |
| return 1; |
| } |
| |
| PollyGPUContext *polly_initContext() { |
| DebugMode = getenv("POLLY_DEBUG") != 0; |
| |
| dump_function(); |
| PollyGPUContext *Context; |
| CUdevice Device; |
| |
| int Major = 0, Minor = 0, DeviceID = 0; |
| char DeviceName[256]; |
| int DeviceCount = 0; |
| |
| static __thread PollyGPUContext *CurrentContext = NULL; |
| |
| if (CurrentContext) |
| return CurrentContext; |
| |
| /* Get API handles. */ |
| if (initialDeviceAPIs() == 0) { |
| fprintf(stdout, "Getting the \"handle\" for the CUDA driver API failed.\n"); |
| exit(-1); |
| } |
| |
| if (CuInitFcnPtr(0) != CUDA_SUCCESS) { |
| fprintf(stdout, "Initializing the CUDA driver API failed.\n"); |
| exit(-1); |
| } |
| |
| /* Get number of devices that supports CUDA. */ |
| CuDeviceGetCountFcnPtr(&DeviceCount); |
| if (DeviceCount == 0) { |
| fprintf(stdout, "There is no device supporting CUDA.\n"); |
| exit(-1); |
| } |
| |
| CuDeviceGetFcnPtr(&Device, 0); |
| |
| /* Get compute capabilities and the device name. */ |
| CuDeviceComputeCapabilityFcnPtr(&Major, &Minor, Device); |
| CuDeviceGetNameFcnPtr(DeviceName, 256, Device); |
| debug_print("> Running on GPU device %d : %s.\n", DeviceID, DeviceName); |
| |
| /* Create context on the device. */ |
| Context = (PollyGPUContext *)malloc(sizeof(PollyGPUContext)); |
| if (Context == 0) { |
| fprintf(stdout, "Allocate memory for Polly GPU context failed.\n"); |
| exit(-1); |
| } |
| CuCtxCreateFcnPtr(&(Context->Cuda), 0, Device); |
| |
| CacheMode = getenv("POLLY_NOCACHE") == 0; |
| |
| if (CacheMode) |
| CurrentContext = Context; |
| |
| return Context; |
| } |
| |
| static void freeKernel(PollyGPUFunction *Kernel) { |
| if (Kernel->CudaModule) |
| CuModuleUnloadFcnPtr(Kernel->CudaModule); |
| |
| if (Kernel) |
| free(Kernel); |
| } |
| |
| #define KERNEL_CACHE_SIZE 10 |
| |
| PollyGPUFunction *polly_getKernel(const char *PTXBuffer, |
| const char *KernelName) { |
| dump_function(); |
| |
| static __thread PollyGPUFunction *KernelCache[KERNEL_CACHE_SIZE]; |
| static __thread int NextCacheItem = 0; |
| |
| for (long i = 0; i < KERNEL_CACHE_SIZE; i++) { |
| // We exploit here the property that all Polly-ACC kernels are allocated |
| // as global constants, hence a pointer comparision is sufficient to |
| // determin equality. |
| if (KernelCache[i] && KernelCache[i]->PTXString == PTXBuffer) { |
| debug_print(" -> using cached kernel\n"); |
| return KernelCache[i]; |
| } |
| } |
| |
| PollyGPUFunction *Function = malloc(sizeof(PollyGPUFunction)); |
| |
| if (Function == 0) { |
| fprintf(stdout, "Allocate memory for Polly GPU function failed.\n"); |
| exit(-1); |
| } |
| |
| CUresult Res; |
| CUlinkState LState; |
| CUjit_option Options[6]; |
| void *OptionVals[6]; |
| float Walltime = 0; |
| unsigned long LogSize = 8192; |
| char ErrorLog[8192], InfoLog[8192]; |
| void *CuOut; |
| size_t OutSize; |
| |
| // Setup linker options |
| // Return walltime from JIT compilation |
| Options[0] = CU_JIT_WALL_TIME; |
| OptionVals[0] = (void *)&Walltime; |
| // Pass a buffer for info messages |
| Options[1] = CU_JIT_INFO_LOG_BUFFER; |
| OptionVals[1] = (void *)InfoLog; |
| // Pass the size of the info buffer |
| Options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; |
| OptionVals[2] = (void *)LogSize; |
| // Pass a buffer for error message |
| Options[3] = CU_JIT_ERROR_LOG_BUFFER; |
| OptionVals[3] = (void *)ErrorLog; |
| // Pass the size of the error buffer |
| Options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; |
| OptionVals[4] = (void *)LogSize; |
| // Make the linker verbose |
| Options[5] = CU_JIT_LOG_VERBOSE; |
| OptionVals[5] = (void *)1; |
| |
| memset(ErrorLog, 0, sizeof(ErrorLog)); |
| |
| CuLinkCreateFcnPtr(6, Options, OptionVals, &LState); |
| Res = CuLinkAddDataFcnPtr(LState, CU_JIT_INPUT_PTX, (void *)PTXBuffer, |
| strlen(PTXBuffer) + 1, 0, 0, 0, 0); |
| if (Res != CUDA_SUCCESS) { |
| fprintf(stdout, "PTX Linker Error:\n%s\n%s", ErrorLog, InfoLog); |
| exit(-1); |
| } |
| |
| Res = CuLinkCompleteFcnPtr(LState, &CuOut, &OutSize); |
| if (Res != CUDA_SUCCESS) { |
| fprintf(stdout, "Complete ptx linker step failed.\n"); |
| fprintf(stdout, "\n%s\n", ErrorLog); |
| exit(-1); |
| } |
| |
| debug_print("CUDA Link Completed in %fms. Linker Output:\n%s\n", Walltime, |
| InfoLog); |
| |
| Res = CuModuleLoadDataFcnPtr(&(Function->CudaModule), CuOut); |
| if (Res != CUDA_SUCCESS) { |
| fprintf(stdout, "Loading ptx assembly text failed.\n"); |
| exit(-1); |
| } |
| |
| Res = CuModuleGetFunctionFcnPtr(&(Function->Cuda), Function->CudaModule, |
| KernelName); |
| if (Res != CUDA_SUCCESS) { |
| fprintf(stdout, "Loading kernel function failed.\n"); |
| exit(-1); |
| } |
| |
| CuLinkDestroyFcnPtr(LState); |
| |
| Function->PTXString = PTXBuffer; |
| |
| if (CacheMode) { |
| if (KernelCache[NextCacheItem]) |
| freeKernel(KernelCache[NextCacheItem]); |
| |
| KernelCache[NextCacheItem] = Function; |
| |
| NextCacheItem = (NextCacheItem + 1) % KERNEL_CACHE_SIZE; |
| } |
| |
| return Function; |
| } |
| |
| void polly_freeKernel(PollyGPUFunction *Kernel) { |
| dump_function(); |
| |
| if (CacheMode) |
| return; |
| |
| freeKernel(Kernel); |
| } |
| |
| void polly_copyFromHostToDevice(void *HostData, PollyGPUDevicePtr *DevData, |
| long MemSize) { |
| dump_function(); |
| |
| CUdeviceptr CuDevData = DevData->Cuda; |
| CuMemcpyHtoDFcnPtr(CuDevData, HostData, MemSize); |
| } |
| |
| void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData, |
| long MemSize) { |
| dump_function(); |
| |
| if (CuMemcpyDtoHFcnPtr(HostData, DevData->Cuda, MemSize) != CUDA_SUCCESS) { |
| fprintf(stdout, "Copying results from device to host memory failed.\n"); |
| exit(-1); |
| } |
| } |
| |
| void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX, |
| unsigned int GridDimY, unsigned int BlockDimX, |
| unsigned int BlockDimY, unsigned int BlockDimZ, |
| void **Parameters) { |
| dump_function(); |
| |
| unsigned GridDimZ = 1; |
| unsigned int SharedMemBytes = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE; |
| CUstream Stream = 0; |
| void **Extra = 0; |
| |
| CUresult Res; |
| Res = CuLaunchKernelFcnPtr(Kernel->Cuda, GridDimX, GridDimY, GridDimZ, |
| BlockDimX, BlockDimY, BlockDimZ, SharedMemBytes, |
| Stream, Parameters, Extra); |
| if (Res != CUDA_SUCCESS) { |
| fprintf(stdout, "Launching CUDA kernel failed.\n"); |
| exit(-1); |
| } |
| } |
| |
| void polly_freeDeviceMemory(PollyGPUDevicePtr *Allocation) { |
| dump_function(); |
| CuMemFreeFcnPtr((CUdeviceptr)Allocation->Cuda); |
| free(Allocation); |
| } |
| |
| PollyGPUDevicePtr *polly_allocateMemoryForDevice(long MemSize) { |
| dump_function(); |
| |
| PollyGPUDevicePtr *DevData = malloc(sizeof(PollyGPUDevicePtr)); |
| |
| if (DevData == 0) { |
| fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n"); |
| exit(-1); |
| } |
| |
| CUresult Res = CuMemAllocFcnPtr(&(DevData->Cuda), MemSize); |
| |
| if (Res != CUDA_SUCCESS) { |
| fprintf(stdout, "Allocate memory for GPU device memory pointer failed.\n"); |
| exit(-1); |
| } |
| |
| return DevData; |
| } |
| |
| void *polly_getDevicePtr(PollyGPUDevicePtr *Allocation) { |
| dump_function(); |
| |
| return (void *)Allocation->Cuda; |
| } |
| |
| void polly_freeContext(PollyGPUContext *Context) { |
| dump_function(); |
| |
| if (CacheMode) |
| return; |
| |
| if (Context->Cuda) { |
| CuCtxDestroyFcnPtr(Context->Cuda); |
| free(Context); |
| } |
| |
| dlclose(HandleCuda); |
| dlclose(HandleCudaRT); |
| } |