flang-rt/lib/cuda/allocator.cpp - llvm-project.git - Git at Google

 //===-- lib/cuda/allocator.cpp ----------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

 #include "flang/Runtime/CUDA/allocator.h"
 #include "flang-rt/runtime/allocator-registry.h"
 #include "flang-rt/runtime/derived.h"
 #include "flang-rt/runtime/descriptor.h"
 #include "flang-rt/runtime/environment.h"
 #include "flang-rt/runtime/lock.h"
 #include "flang-rt/runtime/stat.h"
 #include "flang-rt/runtime/terminator.h"
 #include "flang-rt/runtime/type-info.h"
 #include "flang/Common/ISO_Fortran_binding_wrapper.h"
 #include "flang/Runtime/CUDA/common.h"
 #include "flang/Support/Fortran.h"

 #include "cuda_runtime.h"

 namespace Fortran::runtime::cuda {

 struct DeviceAllocation {
   void *ptr;
   std::size_t size;
   cudaStream_t stream;
 };

 // Compare address values. nullptr will be sorted at the end of the array.
 int compareDeviceAlloc(const void *a, const void *b) {
   const DeviceAllocation *deva = (const DeviceAllocation *)a;
   const DeviceAllocation *devb = (const DeviceAllocation *)b;
   if (deva->ptr == nullptr && devb->ptr == nullptr)
     return 0;
   if (deva->ptr == nullptr)
     return 1;
   if (devb->ptr == nullptr)
     return -1;
   return deva->ptr < devb->ptr ? -1 : (deva->ptr > devb->ptr ? 1 : 0);
 }

 // Dynamic array for tracking asynchronous allocations.
 static DeviceAllocation *deviceAllocations = nullptr;
 Lock lock;
 static int maxDeviceAllocations{512}; // Initial size
 static int numDeviceAllocations{0};
 static constexpr int allocNotFound{-1};

 static void initAllocations() {
   if (!deviceAllocations) {
     deviceAllocations = static_cast<DeviceAllocation *>(
         malloc(maxDeviceAllocations * sizeof(DeviceAllocation)));
     if (!deviceAllocations) {
       Terminator terminator{__FILE__, __LINE__};
       terminator.Crash("Failed to allocate tracking array");
     }
   }
 }

 static void doubleAllocationArray() {
   unsigned newSize = maxDeviceAllocations * 2;
   DeviceAllocation *newArray = static_cast<DeviceAllocation *>(
       realloc(deviceAllocations, newSize * sizeof(DeviceAllocation)));
   if (!newArray) {
     Terminator terminator{__FILE__, __LINE__};
     terminator.Crash("Failed to reallocate tracking array");
   }
   deviceAllocations = newArray;
   maxDeviceAllocations = newSize;
 }

 static unsigned findAllocation(void *ptr) {
   if (numDeviceAllocations == 0) {
     return allocNotFound;
   }

   int left{0};
   int right{numDeviceAllocations - 1};

   if (left == right) {
     return left;
   }

   while (left <= right) {
     int mid = left + (right - left) / 2;
     if (deviceAllocations[mid].ptr == ptr) {
       return mid;
     }
     if (deviceAllocations[mid].ptr < ptr) {
       left = mid + 1;
     } else {
       right = mid - 1;
     }
   }
   return allocNotFound;
 }

 static void insertAllocation(void *ptr, std::size_t size, std::int64_t stream) {
   CriticalSection critical{lock};
   initAllocations();
   if (numDeviceAllocations >= maxDeviceAllocations) {
     doubleAllocationArray();
   }
   deviceAllocations[numDeviceAllocations].ptr = ptr;
   deviceAllocations[numDeviceAllocations].size = size;
   deviceAllocations[numDeviceAllocations].stream = (cudaStream_t)stream;
   ++numDeviceAllocations;
   qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
       compareDeviceAlloc);
 }

 static void eraseAllocation(int pos) {
   deviceAllocations[pos].ptr = nullptr;
   deviceAllocations[pos].size = 0;
   deviceAllocations[pos].stream = (cudaStream_t)0;
   qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
       compareDeviceAlloc);
   --numDeviceAllocations;
 }

 extern "C" {

 void RTDEF(CUFRegisterAllocator)() {
   allocatorRegistry.Register(
       kPinnedAllocatorPos, {&CUFAllocPinned, CUFFreePinned});
   allocatorRegistry.Register(
       kDeviceAllocatorPos, {&CUFAllocDevice, CUFFreeDevice});
   allocatorRegistry.Register(
       kManagedAllocatorPos, {&CUFAllocManaged, CUFFreeManaged});
   allocatorRegistry.Register(
       kUnifiedAllocatorPos, {&CUFAllocUnified, CUFFreeUnified});
 }
 }

 void *CUFAllocPinned(
     std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
   void *p;
   CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
   return p;
 }

 void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }

 void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t asyncId) {
   void *p;
   if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) {
     CUDA_REPORT_IF_ERROR(
         cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
   } else {
     if (asyncId == kNoAsyncId) {
       CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
     } else {
       CUDA_REPORT_IF_ERROR(
           cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)asyncId));
       insertAllocation(p, sizeInBytes, asyncId);
     }
   }
   return p;
 }

 void CUFFreeDevice(void *p) {
   CriticalSection critical{lock};
   int pos = findAllocation(p);
   if (pos >= 0) {
     cudaStream_t stream = deviceAllocations[pos].stream;
     eraseAllocation(pos);
     CUDA_REPORT_IF_ERROR(cudaFreeAsync(p, stream));
   } else {
     CUDA_REPORT_IF_ERROR(cudaFree(p));
   }
 }

 void *CUFAllocManaged(
     std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
   void *p;
   CUDA_REPORT_IF_ERROR(
       cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
   return reinterpret_cast<void *>(p);
 }

 void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }

 void *CUFAllocUnified(
     std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
   // Call alloc managed for the time being.
   return CUFAllocManaged(sizeInBytes, asyncId);
 }

 void CUFFreeUnified(void *p) {
   // Call free managed for the time being.
   CUFFreeManaged(p);
 }

 } // namespace Fortran::runtime::cuda
	//===-- lib/cuda/allocator.cpp ----------------------------------- C++ --===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	#include "flang/Runtime/CUDA/allocator.h"
	#include "flang-rt/runtime/allocator-registry.h"
	#include "flang-rt/runtime/derived.h"
	#include "flang-rt/runtime/descriptor.h"
	#include "flang-rt/runtime/environment.h"
	#include "flang-rt/runtime/lock.h"
	#include "flang-rt/runtime/stat.h"
	#include "flang-rt/runtime/terminator.h"
	#include "flang-rt/runtime/type-info.h"
	#include "flang/Common/ISO_Fortran_binding_wrapper.h"
	#include "flang/Runtime/CUDA/common.h"
	#include "flang/Support/Fortran.h"

	#include "cuda_runtime.h"

	namespace Fortran::runtime::cuda {

	struct DeviceAllocation {
	void *ptr;
	std::size_t size;
	cudaStream_t stream;
	};

	// Compare address values. nullptr will be sorted at the end of the array.
	int compareDeviceAlloc(const void a, const void b) {
	const DeviceAllocation deva = (const DeviceAllocation )a;
	const DeviceAllocation devb = (const DeviceAllocation )b;
	if (deva->ptr == nullptr && devb->ptr == nullptr)
	return 0;
	if (deva->ptr == nullptr)
	return 1;
	if (devb->ptr == nullptr)
	return -1;
	return deva->ptr < devb->ptr ? -1 : (deva->ptr > devb->ptr ? 1 : 0);
	}

	// Dynamic array for tracking asynchronous allocations.
	static DeviceAllocation *deviceAllocations = nullptr;
	Lock lock;
	static int maxDeviceAllocations{512}; // Initial size
	static int numDeviceAllocations{0};
	static constexpr int allocNotFound{-1};

	static void initAllocations() {
	if (!deviceAllocations) {
	deviceAllocations = static_cast<DeviceAllocation *>(
	malloc(maxDeviceAllocations * sizeof(DeviceAllocation)));
	if (!deviceAllocations) {
	Terminator terminator{__FILE__, __LINE__};
	terminator.Crash("Failed to allocate tracking array");
	}
	}
	}

	static void doubleAllocationArray() {
	unsigned newSize = maxDeviceAllocations * 2;
	DeviceAllocation newArray = static_cast<DeviceAllocation >(
	realloc(deviceAllocations, newSize * sizeof(DeviceAllocation)));
	if (!newArray) {
	Terminator terminator{__FILE__, __LINE__};
	terminator.Crash("Failed to reallocate tracking array");
	}
	deviceAllocations = newArray;
	maxDeviceAllocations = newSize;
	}

	static unsigned findAllocation(void *ptr) {
	if (numDeviceAllocations == 0) {
	return allocNotFound;
	}

	int left{0};
	int right{numDeviceAllocations - 1};

	if (left == right) {
	return left;
	}

	while (left <= right) {
	int mid = left + (right - left) / 2;
	if (deviceAllocations[mid].ptr == ptr) {
	return mid;
	}
	if (deviceAllocations[mid].ptr < ptr) {
	left = mid + 1;
	} else {
	right = mid - 1;
	}
	}
	return allocNotFound;
	}

	static void insertAllocation(void *ptr, std::size_t size, std::int64_t stream) {
	CriticalSection critical{lock};
	initAllocations();
	if (numDeviceAllocations >= maxDeviceAllocations) {
	doubleAllocationArray();
	}
	deviceAllocations[numDeviceAllocations].ptr = ptr;
	deviceAllocations[numDeviceAllocations].size = size;
	deviceAllocations[numDeviceAllocations].stream = (cudaStream_t)stream;
	++numDeviceAllocations;
	qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
	compareDeviceAlloc);
	}

	static void eraseAllocation(int pos) {
	deviceAllocations[pos].ptr = nullptr;
	deviceAllocations[pos].size = 0;
	deviceAllocations[pos].stream = (cudaStream_t)0;
	qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
	compareDeviceAlloc);
	--numDeviceAllocations;
	}

	extern "C" {

	void RTDEF(CUFRegisterAllocator)() {
	allocatorRegistry.Register(
	kPinnedAllocatorPos, {&CUFAllocPinned, CUFFreePinned});
	allocatorRegistry.Register(
	kDeviceAllocatorPos, {&CUFAllocDevice, CUFFreeDevice});
	allocatorRegistry.Register(
	kManagedAllocatorPos, {&CUFAllocManaged, CUFFreeManaged});
	allocatorRegistry.Register(
	kUnifiedAllocatorPos, {&CUFAllocUnified, CUFFreeUnified});
	}
	}

	void *CUFAllocPinned(
	std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
	void *p;
	CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
	return p;
	}

	void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }

	void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t asyncId) {
	void *p;
	if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) {
	CUDA_REPORT_IF_ERROR(
	cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
	} else {
	if (asyncId == kNoAsyncId) {
	CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
	} else {
	CUDA_REPORT_IF_ERROR(
	cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)asyncId));
	insertAllocation(p, sizeInBytes, asyncId);
	}
	}
	return p;
	}

	void CUFFreeDevice(void *p) {
	CriticalSection critical{lock};
	int pos = findAllocation(p);
	if (pos >= 0) {
	cudaStream_t stream = deviceAllocations[pos].stream;
	eraseAllocation(pos);
	CUDA_REPORT_IF_ERROR(cudaFreeAsync(p, stream));
	} else {
	CUDA_REPORT_IF_ERROR(cudaFree(p));
	}
	}

	void *CUFAllocManaged(
	std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
	void *p;
	CUDA_REPORT_IF_ERROR(
	cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
	return reinterpret_cast<void *>(p);
	}

	void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }

	void *CUFAllocUnified(
	std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
	// Call alloc managed for the time being.
	return CUFAllocManaged(sizeInBytes, asyncId);
	}

	void CUFFreeUnified(void *p) {
	// Call free managed for the time being.
	CUFFreeManaged(p);
	}

	} // namespace Fortran::runtime::cuda