|  | //===-- Loader Implementation for NVPTX devices --------------------------===// | 
|  | // | 
|  | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
|  | // See https://llvm.org/LICENSE.txt for license information. | 
|  | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
|  | // | 
|  | //===----------------------------------------------------------------------===// | 
|  | // | 
|  | // This file impelements a simple loader to run images supporting the NVPTX | 
|  | // architecture. The file launches the '_start' kernel which should be provided | 
|  | // by the device application start code and call ultimately call the 'main' | 
|  | // function. | 
|  | // | 
|  | //===----------------------------------------------------------------------===// | 
|  |  | 
|  | #include "llvm-gpu-loader.h" | 
|  | #include "server.h" | 
|  |  | 
|  | #include "cuda.h" | 
|  |  | 
|  | #include "llvm/Object/ELF.h" | 
|  | #include "llvm/Object/ELFObjectFile.h" | 
|  |  | 
|  | #include <atomic> | 
|  | #include <cstddef> | 
|  | #include <cstdio> | 
|  | #include <cstdlib> | 
|  | #include <cstring> | 
|  | #include <thread> | 
|  | #include <vector> | 
|  |  | 
|  | using namespace llvm; | 
|  | using namespace object; | 
|  |  | 
|  | static void handle_error_impl(const char *file, int32_t line, CUresult err) { | 
|  | if (err == CUDA_SUCCESS) | 
|  | return; | 
|  |  | 
|  | const char *err_str = nullptr; | 
|  | CUresult result = cuGetErrorString(err, &err_str); | 
|  | if (result != CUDA_SUCCESS) | 
|  | fprintf(stderr, "%s:%d:0: Unknown Error\n", file, line); | 
|  | else | 
|  | fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, err_str); | 
|  | exit(1); | 
|  | } | 
|  |  | 
|  | // Gets the names of all the globals that contain functions to initialize or | 
|  | // deinitialize. We need to do this manually because the NVPTX toolchain does | 
|  | // not contain the necessary binary manipulation tools. | 
|  | template <typename Alloc> | 
|  | Expected<void *> get_ctor_dtor_array(const void *image, const size_t size, | 
|  | Alloc allocator, CUmodule binary) { | 
|  | auto mem_buffer = MemoryBuffer::getMemBuffer( | 
|  | StringRef(reinterpret_cast<const char *>(image), size), "image", | 
|  | /*RequiresNullTerminator=*/false); | 
|  | Expected<ELF64LEObjectFile> elf_or_err = | 
|  | ELF64LEObjectFile::create(*mem_buffer); | 
|  | if (!elf_or_err) | 
|  | handle_error(toString(elf_or_err.takeError()).c_str()); | 
|  |  | 
|  | std::vector<std::pair<const char *, uint16_t>> ctors; | 
|  | std::vector<std::pair<const char *, uint16_t>> dtors; | 
|  | // CUDA has no way to iterate over all the symbols so we need to inspect the | 
|  | // ELF directly using the LLVM libraries. | 
|  | for (const auto &symbol : elf_or_err->symbols()) { | 
|  | auto name_or_err = symbol.getName(); | 
|  | if (!name_or_err) | 
|  | handle_error(toString(name_or_err.takeError()).c_str()); | 
|  |  | 
|  | // Search for all symbols that contain a constructor or destructor. | 
|  | if (!name_or_err->starts_with("__init_array_object_") && | 
|  | !name_or_err->starts_with("__fini_array_object_")) | 
|  | continue; | 
|  |  | 
|  | uint16_t priority; | 
|  | if (name_or_err->rsplit('_').second.getAsInteger(10, priority)) | 
|  | handle_error("Invalid priority for constructor or destructor"); | 
|  |  | 
|  | if (name_or_err->starts_with("__init")) | 
|  | ctors.emplace_back(std::make_pair(name_or_err->data(), priority)); | 
|  | else | 
|  | dtors.emplace_back(std::make_pair(name_or_err->data(), priority)); | 
|  | } | 
|  | // Lower priority constructors are run before higher ones. The reverse is true | 
|  | // for destructors. | 
|  | llvm::sort(ctors, llvm::less_second()); | 
|  | llvm::sort(dtors, llvm::less_second()); | 
|  |  | 
|  | // Allocate host pinned memory to make these arrays visible to the GPU. | 
|  | CUdeviceptr *dev_memory = reinterpret_cast<CUdeviceptr *>(allocator( | 
|  | ctors.size() * sizeof(CUdeviceptr) + dtors.size() * sizeof(CUdeviceptr))); | 
|  | uint64_t global_size = 0; | 
|  |  | 
|  | // Get the address of the global and then store the address of the constructor | 
|  | // function to call in the constructor array. | 
|  | CUdeviceptr *dev_ctors_start = dev_memory; | 
|  | CUdeviceptr *dev_ctors_end = dev_ctors_start + ctors.size(); | 
|  | for (uint64_t i = 0; i < ctors.size(); ++i) { | 
|  | CUdeviceptr dev_ptr; | 
|  | if (CUresult err = | 
|  | cuModuleGetGlobal(&dev_ptr, &global_size, binary, ctors[i].first)) | 
|  | handle_error(err); | 
|  | if (CUresult err = | 
|  | cuMemcpyDtoH(&dev_ctors_start[i], dev_ptr, sizeof(uintptr_t))) | 
|  | handle_error(err); | 
|  | } | 
|  |  | 
|  | // Get the address of the global and then store the address of the destructor | 
|  | // function to call in the destructor array. | 
|  | CUdeviceptr *dev_dtors_start = dev_ctors_end; | 
|  | CUdeviceptr *dev_dtors_end = dev_dtors_start + dtors.size(); | 
|  | for (uint64_t i = 0; i < dtors.size(); ++i) { | 
|  | CUdeviceptr dev_ptr; | 
|  | if (CUresult err = | 
|  | cuModuleGetGlobal(&dev_ptr, &global_size, binary, dtors[i].first)) | 
|  | handle_error(err); | 
|  | if (CUresult err = | 
|  | cuMemcpyDtoH(&dev_dtors_start[i], dev_ptr, sizeof(uintptr_t))) | 
|  | handle_error(err); | 
|  | } | 
|  |  | 
|  | // Obtain the address of the pointers the startup implementation uses to | 
|  | // iterate the constructors and destructors. | 
|  | CUdeviceptr init_start; | 
|  | if (CUresult err = cuModuleGetGlobal(&init_start, &global_size, binary, | 
|  | "__init_array_start")) | 
|  | handle_error(err); | 
|  | CUdeviceptr init_end; | 
|  | if (CUresult err = cuModuleGetGlobal(&init_end, &global_size, binary, | 
|  | "__init_array_end")) | 
|  | handle_error(err); | 
|  | CUdeviceptr fini_start; | 
|  | if (CUresult err = cuModuleGetGlobal(&fini_start, &global_size, binary, | 
|  | "__fini_array_start")) | 
|  | handle_error(err); | 
|  | CUdeviceptr fini_end; | 
|  | if (CUresult err = cuModuleGetGlobal(&fini_end, &global_size, binary, | 
|  | "__fini_array_end")) | 
|  | handle_error(err); | 
|  |  | 
|  | // Copy the pointers to the newly written array to the symbols so the startup | 
|  | // implementation can iterate them. | 
|  | if (CUresult err = | 
|  | cuMemcpyHtoD(init_start, &dev_ctors_start, sizeof(uintptr_t))) | 
|  | handle_error(err); | 
|  | if (CUresult err = cuMemcpyHtoD(init_end, &dev_ctors_end, sizeof(uintptr_t))) | 
|  | handle_error(err); | 
|  | if (CUresult err = | 
|  | cuMemcpyHtoD(fini_start, &dev_dtors_start, sizeof(uintptr_t))) | 
|  | handle_error(err); | 
|  | if (CUresult err = cuMemcpyHtoD(fini_end, &dev_dtors_end, sizeof(uintptr_t))) | 
|  | handle_error(err); | 
|  |  | 
|  | return dev_memory; | 
|  | } | 
|  |  | 
|  | void print_kernel_resources(CUmodule binary, const char *kernel_name) { | 
|  | CUfunction function; | 
|  | if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name)) | 
|  | handle_error(err); | 
|  | int num_regs; | 
|  | if (CUresult err = | 
|  | cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, function)) | 
|  | handle_error(err); | 
|  | printf("Executing kernel %s:\n", kernel_name); | 
|  | printf("%6s registers: %d\n", kernel_name, num_regs); | 
|  | } | 
|  |  | 
|  | template <typename args_t> | 
|  | CUresult launch_kernel(CUmodule binary, CUstream stream, rpc::Server &server, | 
|  | const LaunchParameters ¶ms, const char *kernel_name, | 
|  | args_t kernel_args, bool print_resource_usage) { | 
|  | // look up the '_start' kernel in the loaded module. | 
|  | CUfunction function; | 
|  | if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name)) | 
|  | handle_error(err); | 
|  |  | 
|  | // Set up the arguments to the '_start' kernel on the GPU. | 
|  | uint64_t args_size = sizeof(args_t); | 
|  | void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args, | 
|  | CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size, | 
|  | CU_LAUNCH_PARAM_END}; | 
|  | if (print_resource_usage) | 
|  | print_kernel_resources(binary, kernel_name); | 
|  |  | 
|  | // Initialize a non-blocking CUDA stream to allocate memory if needed. | 
|  | // This needs to be done on a separate stream or else it will deadlock | 
|  | // with the executing kernel. | 
|  | CUstream memory_stream; | 
|  | if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING)) | 
|  | handle_error(err); | 
|  |  | 
|  | std::atomic<bool> finished = false; | 
|  | std::thread server_thread( | 
|  | [](std::atomic<bool> *finished, rpc::Server *server, | 
|  | CUstream memory_stream) { | 
|  | auto malloc_handler = [&](size_t size) -> void * { | 
|  | CUdeviceptr dev_ptr; | 
|  | if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream)) | 
|  | dev_ptr = 0UL; | 
|  |  | 
|  | // Wait until the memory allocation is complete. | 
|  | while (cuStreamQuery(memory_stream) == CUDA_ERROR_NOT_READY) | 
|  | ; | 
|  | return reinterpret_cast<void *>(dev_ptr); | 
|  | }; | 
|  |  | 
|  | auto free_handler = [&](void *ptr) -> void { | 
|  | if (CUresult err = cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(ptr), | 
|  | memory_stream)) | 
|  | handle_error(err); | 
|  | }; | 
|  |  | 
|  | uint32_t index = 0; | 
|  | while (!*finished) { | 
|  | index = | 
|  | handle_server<32>(*server, index, malloc_handler, free_handler); | 
|  | } | 
|  | }, | 
|  | &finished, &server, memory_stream); | 
|  |  | 
|  | // Call the kernel with the given arguments. | 
|  | if (CUresult err = cuLaunchKernel( | 
|  | function, params.num_blocks_x, params.num_blocks_y, | 
|  | params.num_blocks_z, params.num_threads_x, params.num_threads_y, | 
|  | params.num_threads_z, 0, stream, nullptr, args_config)) | 
|  | handle_error(err); | 
|  |  | 
|  | if (CUresult err = cuStreamSynchronize(stream)) | 
|  | handle_error(err); | 
|  |  | 
|  | finished = true; | 
|  | if (server_thread.joinable()) | 
|  | server_thread.join(); | 
|  |  | 
|  | return CUDA_SUCCESS; | 
|  | } | 
|  |  | 
|  | int load_nvptx(int argc, const char **argv, const char **envp, void *image, | 
|  | size_t size, const LaunchParameters ¶ms, | 
|  | bool print_resource_usage) { | 
|  | if (CUresult err = cuInit(0)) | 
|  | handle_error(err); | 
|  | // Obtain the first device found on the system. | 
|  | uint32_t device_id = 0; | 
|  | CUdevice device; | 
|  | if (CUresult err = cuDeviceGet(&device, device_id)) | 
|  | handle_error(err); | 
|  |  | 
|  | // Initialize the CUDA context and claim it for this execution. | 
|  | CUcontext context; | 
|  | if (CUresult err = cuDevicePrimaryCtxRetain(&context, device)) | 
|  | handle_error(err); | 
|  | if (CUresult err = cuCtxSetCurrent(context)) | 
|  | handle_error(err); | 
|  |  | 
|  | // Increase the stack size per thread. | 
|  | // TODO: We should allow this to be passed in so only the tests that require a | 
|  | // larger stack can specify it to save on memory usage. | 
|  | if (CUresult err = cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 3 * 1024)) | 
|  | handle_error(err); | 
|  |  | 
|  | // Initialize a non-blocking CUDA stream to execute the kernel. | 
|  | CUstream stream; | 
|  | if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)) | 
|  | handle_error(err); | 
|  |  | 
|  | // Load the image into a CUDA module. | 
|  | CUmodule binary; | 
|  | if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr)) | 
|  | handle_error(err); | 
|  |  | 
|  | // Allocate pinned memory on the host to hold the pointer array for the | 
|  | // copied argv and allow the GPU device to access it. | 
|  | auto allocator = [&](uint64_t size) -> void * { | 
|  | void *dev_ptr; | 
|  | if (CUresult err = cuMemAllocHost(&dev_ptr, size)) | 
|  | handle_error(err); | 
|  | return dev_ptr; | 
|  | }; | 
|  |  | 
|  | auto memory_or_err = get_ctor_dtor_array(image, size, allocator, binary); | 
|  | if (!memory_or_err) | 
|  | handle_error(toString(memory_or_err.takeError()).c_str()); | 
|  |  | 
|  | void *dev_argv = copy_argument_vector(argc, argv, allocator); | 
|  | if (!dev_argv) | 
|  | handle_error("Failed to allocate device argv"); | 
|  |  | 
|  | // Allocate pinned memory on the host to hold the pointer array for the | 
|  | // copied environment array and allow the GPU device to access it. | 
|  | void *dev_envp = copy_environment(envp, allocator); | 
|  | if (!dev_envp) | 
|  | handle_error("Failed to allocate device environment"); | 
|  |  | 
|  | // Allocate space for the return pointer and initialize it to zero. | 
|  | CUdeviceptr dev_ret; | 
|  | if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int))) | 
|  | handle_error(err); | 
|  | if (CUresult err = cuMemsetD32(dev_ret, 0, 1)) | 
|  | handle_error(err); | 
|  |  | 
|  | uint32_t warp_size = 32; | 
|  | void *rpc_buffer = nullptr; | 
|  | if (CUresult err = cuMemAllocHost( | 
|  | &rpc_buffer, | 
|  | rpc::Server::allocation_size(warp_size, rpc::MAX_PORT_COUNT))) | 
|  | handle_error(err); | 
|  | rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer); | 
|  | rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer); | 
|  |  | 
|  | // Initialize the RPC client on the device by copying the local data to the | 
|  | // device's internal pointer. | 
|  | CUdeviceptr rpc_client_dev = 0; | 
|  | uint64_t client_ptr_size = sizeof(void *); | 
|  | if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size, | 
|  | binary, "__llvm_rpc_client")) | 
|  | handle_error(err); | 
|  |  | 
|  | if (CUresult err = cuMemcpyHtoD(rpc_client_dev, &client, sizeof(rpc::Client))) | 
|  | handle_error(err); | 
|  |  | 
|  | LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1}; | 
|  | begin_args_t init_args = {argc, dev_argv, dev_envp}; | 
|  | if (CUresult err = | 
|  | launch_kernel(binary, stream, server, single_threaded_params, | 
|  | "_begin", init_args, print_resource_usage)) | 
|  | handle_error(err); | 
|  |  | 
|  | start_args_t args = {argc, dev_argv, dev_envp, | 
|  | reinterpret_cast<void *>(dev_ret)}; | 
|  | if (CUresult err = launch_kernel(binary, stream, server, params, "_start", | 
|  | args, print_resource_usage)) | 
|  | handle_error(err); | 
|  |  | 
|  | // Copy the return value back from the kernel and wait. | 
|  | int host_ret = 0; | 
|  | if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int))) | 
|  | handle_error(err); | 
|  |  | 
|  | if (CUresult err = cuStreamSynchronize(stream)) | 
|  | handle_error(err); | 
|  |  | 
|  | end_args_t fini_args = {host_ret}; | 
|  | if (CUresult err = | 
|  | launch_kernel(binary, stream, server, single_threaded_params, "_end", | 
|  | fini_args, print_resource_usage)) | 
|  | handle_error(err); | 
|  |  | 
|  | // Free the memory allocated for the device. | 
|  | if (CUresult err = cuMemFreeHost(*memory_or_err)) | 
|  | handle_error(err); | 
|  | if (CUresult err = cuMemFree(dev_ret)) | 
|  | handle_error(err); | 
|  | if (CUresult err = cuMemFreeHost(dev_argv)) | 
|  | handle_error(err); | 
|  | if (CUresult err = cuMemFreeHost(rpc_buffer)) | 
|  | handle_error(err); | 
|  |  | 
|  | // Destroy the context and the loaded binary. | 
|  | if (CUresult err = cuModuleUnload(binary)) | 
|  | handle_error(err); | 
|  | if (CUresult err = cuDevicePrimaryCtxRelease(device)) | 
|  | handle_error(err); | 
|  | return host_ret; | 
|  | } |