| //===-- Loader Implementation for NVPTX devices --------------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
// This file implements a simple loader to run images supporting the NVPTX
// architecture. The file launches the '_start' kernel, which should be provided
// by the device application startup code and which ultimately calls the 'main'
// function.
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "llvm-gpu-loader.h" |
| #include "server.h" |
| |
| #include "cuda.h" |
| |
| #include "llvm/Object/ELF.h" |
| #include "llvm/Object/ELFObjectFile.h" |
| |
| #include <atomic> |
| #include <cstddef> |
| #include <cstdio> |
| #include <cstdlib> |
| #include <cstring> |
| #include <thread> |
| #include <vector> |
| |
| using namespace llvm; |
| using namespace object; |
| |
| static void handle_error_impl(const char *file, int32_t line, CUresult err) { |
| if (err == CUDA_SUCCESS) |
| return; |
| |
| const char *err_str = nullptr; |
| CUresult result = cuGetErrorString(err, &err_str); |
| if (result != CUDA_SUCCESS) |
| fprintf(stderr, "%s:%d:0: Unknown Error\n", file, line); |
| else |
| fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, err_str); |
| exit(1); |
| } |
| |
// Gets the names of all the globals that contain functions to initialize or
// deinitialize. We need to do this manually because the NVPTX toolchain does
// not contain the necessary binary manipulation tools.
//
// The loaded image's startup code iterates '__init_array_start/end' and
// '__fini_array_start/end'. This routine builds those arrays in host-pinned
// memory (via 'allocator') and patches the four device symbols to point at
// them. Returns the pinned allocation so the caller can free it with
// cuMemFreeHost; any failure terminates the process via handle_error.
template <typename Alloc>
Expected<void *> get_ctor_dtor_array(const void *image, const size_t size,
                                     Alloc allocator, CUmodule binary) {
  // Wrap the raw image bytes in a MemoryBuffer so LLVM's ELF reader can
  // inspect it without copying.
  auto mem_buffer = MemoryBuffer::getMemBuffer(
      StringRef(reinterpret_cast<const char *>(image), size), "image",
      /*RequiresNullTerminator=*/false);
  Expected<ELF64LEObjectFile> elf_or_err =
      ELF64LEObjectFile::create(*mem_buffer);
  if (!elf_or_err)
    handle_error(toString(elf_or_err.takeError()).c_str());

  // (symbol name, priority) pairs for each constructor / destructor global.
  std::vector<std::pair<const char *, uint16_t>> ctors;
  std::vector<std::pair<const char *, uint16_t>> dtors;
  // CUDA has no way to iterate over all the symbols so we need to inspect the
  // ELF directly using the LLVM libraries.
  for (const auto &symbol : elf_or_err->symbols()) {
    auto name_or_err = symbol.getName();
    if (!name_or_err)
      handle_error(toString(name_or_err.takeError()).c_str());

    // Search for all symbols that contain a constructor or destructor.
    if (!name_or_err->starts_with("__init_array_object_") &&
        !name_or_err->starts_with("__fini_array_object_"))
      continue;

    // The priority is encoded as a decimal suffix after the final underscore.
    uint16_t priority;
    if (name_or_err->rsplit('_').second.getAsInteger(10, priority))
      handle_error("Invalid priority for constructor or destructor");

    if (name_or_err->starts_with("__init"))
      ctors.emplace_back(std::make_pair(name_or_err->data(), priority));
    else
      dtors.emplace_back(std::make_pair(name_or_err->data(), priority));
  }
  // Lower priority constructors are run before higher ones. The reverse is true
  // for destructors.
  llvm::sort(ctors, llvm::less_second());
  llvm::sort(dtors, llvm::less_second());

  // Allocate host pinned memory to make these arrays visible to the GPU.
  // The single allocation holds the constructor array followed immediately by
  // the destructor array.
  CUdeviceptr *dev_memory = reinterpret_cast<CUdeviceptr *>(allocator(
      ctors.size() * sizeof(CUdeviceptr) + dtors.size() * sizeof(CUdeviceptr)));
  uint64_t global_size = 0;

  // Get the address of the global and then store the address of the constructor
  // function to call in the constructor array. Each '__init_array_object_*'
  // global holds a function pointer, so we read it back from the device.
  CUdeviceptr *dev_ctors_start = dev_memory;
  CUdeviceptr *dev_ctors_end = dev_ctors_start + ctors.size();
  for (uint64_t i = 0; i < ctors.size(); ++i) {
    CUdeviceptr dev_ptr;
    if (CUresult err =
            cuModuleGetGlobal(&dev_ptr, &global_size, binary, ctors[i].first))
      handle_error(err);
    if (CUresult err =
            cuMemcpyDtoH(&dev_ctors_start[i], dev_ptr, sizeof(uintptr_t)))
      handle_error(err);
  }

  // Get the address of the global and then store the address of the destructor
  // function to call in the destructor array.
  CUdeviceptr *dev_dtors_start = dev_ctors_end;
  CUdeviceptr *dev_dtors_end = dev_dtors_start + dtors.size();
  for (uint64_t i = 0; i < dtors.size(); ++i) {
    CUdeviceptr dev_ptr;
    if (CUresult err =
            cuModuleGetGlobal(&dev_ptr, &global_size, binary, dtors[i].first))
      handle_error(err);
    if (CUresult err =
            cuMemcpyDtoH(&dev_dtors_start[i], dev_ptr, sizeof(uintptr_t)))
      handle_error(err);
  }

  // Obtain the address of the pointers the startup implementation uses to
  // iterate the constructors and destructors.
  CUdeviceptr init_start;
  if (CUresult err = cuModuleGetGlobal(&init_start, &global_size, binary,
                                       "__init_array_start"))
    handle_error(err);
  CUdeviceptr init_end;
  if (CUresult err = cuModuleGetGlobal(&init_end, &global_size, binary,
                                       "__init_array_end"))
    handle_error(err);
  CUdeviceptr fini_start;
  if (CUresult err = cuModuleGetGlobal(&fini_start, &global_size, binary,
                                       "__fini_array_start"))
    handle_error(err);
  CUdeviceptr fini_end;
  if (CUresult err = cuModuleGetGlobal(&fini_end, &global_size, binary,
                                       "__fini_array_end"))
    handle_error(err);

  // Copy the pointers to the newly written array to the symbols so the startup
  // implementation can iterate them. Note that we write the host-pinned
  // addresses of the array bounds; pinned memory is accessible to the GPU.
  if (CUresult err =
          cuMemcpyHtoD(init_start, &dev_ctors_start, sizeof(uintptr_t)))
    handle_error(err);
  if (CUresult err = cuMemcpyHtoD(init_end, &dev_ctors_end, sizeof(uintptr_t)))
    handle_error(err);
  if (CUresult err =
          cuMemcpyHtoD(fini_start, &dev_dtors_start, sizeof(uintptr_t)))
    handle_error(err);
  if (CUresult err = cuMemcpyHtoD(fini_end, &dev_dtors_end, sizeof(uintptr_t)))
    handle_error(err);

  return dev_memory;
}
| |
| void print_kernel_resources(CUmodule binary, const char *kernel_name) { |
| CUfunction function; |
| if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name)) |
| handle_error(err); |
| int num_regs; |
| if (CUresult err = |
| cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, function)) |
| handle_error(err); |
| printf("Executing kernel %s:\n", kernel_name); |
| printf("%6s registers: %d\n", kernel_name, num_regs); |
| } |
| |
| template <typename args_t> |
| CUresult launch_kernel(CUmodule binary, CUstream stream, rpc::Server &server, |
| const LaunchParameters ¶ms, const char *kernel_name, |
| args_t kernel_args, bool print_resource_usage) { |
| // look up the '_start' kernel in the loaded module. |
| CUfunction function; |
| if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name)) |
| handle_error(err); |
| |
| // Set up the arguments to the '_start' kernel on the GPU. |
| uint64_t args_size = sizeof(args_t); |
| void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args, |
| CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size, |
| CU_LAUNCH_PARAM_END}; |
| if (print_resource_usage) |
| print_kernel_resources(binary, kernel_name); |
| |
| // Initialize a non-blocking CUDA stream to allocate memory if needed. |
| // This needs to be done on a separate stream or else it will deadlock |
| // with the executing kernel. |
| CUstream memory_stream; |
| if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING)) |
| handle_error(err); |
| |
| std::atomic<bool> finished = false; |
| std::thread server_thread( |
| [](std::atomic<bool> *finished, rpc::Server *server, |
| CUstream memory_stream) { |
| auto malloc_handler = [&](size_t size) -> void * { |
| CUdeviceptr dev_ptr; |
| if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream)) |
| dev_ptr = 0UL; |
| |
| // Wait until the memory allocation is complete. |
| while (cuStreamQuery(memory_stream) == CUDA_ERROR_NOT_READY) |
| ; |
| return reinterpret_cast<void *>(dev_ptr); |
| }; |
| |
| auto free_handler = [&](void *ptr) -> void { |
| if (CUresult err = cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(ptr), |
| memory_stream)) |
| handle_error(err); |
| }; |
| |
| uint32_t index = 0; |
| while (!*finished) { |
| index = |
| handle_server<32>(*server, index, malloc_handler, free_handler); |
| } |
| }, |
| &finished, &server, memory_stream); |
| |
| // Call the kernel with the given arguments. |
| if (CUresult err = cuLaunchKernel( |
| function, params.num_blocks_x, params.num_blocks_y, |
| params.num_blocks_z, params.num_threads_x, params.num_threads_y, |
| params.num_threads_z, 0, stream, nullptr, args_config)) |
| handle_error(err); |
| |
| if (CUresult err = cuStreamSynchronize(stream)) |
| handle_error(err); |
| |
| finished = true; |
| if (server_thread.joinable()) |
| server_thread.join(); |
| |
| return CUDA_SUCCESS; |
| } |
| |
| int load_nvptx(int argc, const char **argv, const char **envp, void *image, |
| size_t size, const LaunchParameters ¶ms, |
| bool print_resource_usage) { |
| if (CUresult err = cuInit(0)) |
| handle_error(err); |
| // Obtain the first device found on the system. |
| uint32_t device_id = 0; |
| CUdevice device; |
| if (CUresult err = cuDeviceGet(&device, device_id)) |
| handle_error(err); |
| |
| // Initialize the CUDA context and claim it for this execution. |
| CUcontext context; |
| if (CUresult err = cuDevicePrimaryCtxRetain(&context, device)) |
| handle_error(err); |
| if (CUresult err = cuCtxSetCurrent(context)) |
| handle_error(err); |
| |
| // Increase the stack size per thread. |
| // TODO: We should allow this to be passed in so only the tests that require a |
| // larger stack can specify it to save on memory usage. |
| if (CUresult err = cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 3 * 1024)) |
| handle_error(err); |
| |
| // Initialize a non-blocking CUDA stream to execute the kernel. |
| CUstream stream; |
| if (CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)) |
| handle_error(err); |
| |
| // Load the image into a CUDA module. |
| CUmodule binary; |
| if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr)) |
| handle_error(err); |
| |
| // Allocate pinned memory on the host to hold the pointer array for the |
| // copied argv and allow the GPU device to access it. |
| auto allocator = [&](uint64_t size) -> void * { |
| void *dev_ptr; |
| if (CUresult err = cuMemAllocHost(&dev_ptr, size)) |
| handle_error(err); |
| return dev_ptr; |
| }; |
| |
| auto memory_or_err = get_ctor_dtor_array(image, size, allocator, binary); |
| if (!memory_or_err) |
| handle_error(toString(memory_or_err.takeError()).c_str()); |
| |
| void *dev_argv = copy_argument_vector(argc, argv, allocator); |
| if (!dev_argv) |
| handle_error("Failed to allocate device argv"); |
| |
| // Allocate pinned memory on the host to hold the pointer array for the |
| // copied environment array and allow the GPU device to access it. |
| void *dev_envp = copy_environment(envp, allocator); |
| if (!dev_envp) |
| handle_error("Failed to allocate device environment"); |
| |
| // Allocate space for the return pointer and initialize it to zero. |
| CUdeviceptr dev_ret; |
| if (CUresult err = cuMemAlloc(&dev_ret, sizeof(int))) |
| handle_error(err); |
| if (CUresult err = cuMemsetD32(dev_ret, 0, 1)) |
| handle_error(err); |
| |
| uint32_t warp_size = 32; |
| void *rpc_buffer = nullptr; |
| if (CUresult err = cuMemAllocHost( |
| &rpc_buffer, |
| rpc::Server::allocation_size(warp_size, rpc::MAX_PORT_COUNT))) |
| handle_error(err); |
| rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer); |
| rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer); |
| |
| // Initialize the RPC client on the device by copying the local data to the |
| // device's internal pointer. |
| CUdeviceptr rpc_client_dev = 0; |
| uint64_t client_ptr_size = sizeof(void *); |
| if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size, |
| binary, "__llvm_rpc_client")) |
| handle_error(err); |
| |
| if (CUresult err = cuMemcpyHtoD(rpc_client_dev, &client, sizeof(rpc::Client))) |
| handle_error(err); |
| |
| LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1}; |
| begin_args_t init_args = {argc, dev_argv, dev_envp}; |
| if (CUresult err = |
| launch_kernel(binary, stream, server, single_threaded_params, |
| "_begin", init_args, print_resource_usage)) |
| handle_error(err); |
| |
| start_args_t args = {argc, dev_argv, dev_envp, |
| reinterpret_cast<void *>(dev_ret)}; |
| if (CUresult err = launch_kernel(binary, stream, server, params, "_start", |
| args, print_resource_usage)) |
| handle_error(err); |
| |
| // Copy the return value back from the kernel and wait. |
| int host_ret = 0; |
| if (CUresult err = cuMemcpyDtoH(&host_ret, dev_ret, sizeof(int))) |
| handle_error(err); |
| |
| if (CUresult err = cuStreamSynchronize(stream)) |
| handle_error(err); |
| |
| end_args_t fini_args = {host_ret}; |
| if (CUresult err = |
| launch_kernel(binary, stream, server, single_threaded_params, "_end", |
| fini_args, print_resource_usage)) |
| handle_error(err); |
| |
| // Free the memory allocated for the device. |
| if (CUresult err = cuMemFreeHost(*memory_or_err)) |
| handle_error(err); |
| if (CUresult err = cuMemFree(dev_ret)) |
| handle_error(err); |
| if (CUresult err = cuMemFreeHost(dev_argv)) |
| handle_error(err); |
| if (CUresult err = cuMemFreeHost(rpc_buffer)) |
| handle_error(err); |
| |
| // Destroy the context and the loaded binary. |
| if (CUresult err = cuModuleUnload(binary)) |
| handle_error(err); |
| if (CUresult err = cuDevicePrimaryCtxRelease(device)) |
| handle_error(err); |
| return host_ret; |
| } |