acxxel/examples/simple_example.cu - llvm-project/parallel-libs - Git at Google

 //===--- simple_example.cu - Simple example of using Acxxel ---------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 ///
 /// This file is a simple example of using Acxxel.
 ///
 //===----------------------------------------------------------------------===//

 /// [Example simple saxpy]
 #include "acxxel.h"

 #include <array>
 #include <cstdio>
 #include <cstdlib>

 // Device kernel computing X[I] = A * X[I] + Y[I] for every index I below N.
 //
 // Each thread handles at most one element, selected by its flat 1-D global
 // thread index (block offset plus thread offset). Threads whose index is N or
 // greater do nothing. X is updated in place; Y is only read.
 __global__ void saxpyKernel(float A, float *X, float *Y, int N) {
   const int Index = threadIdx.x + (blockIdx.x * blockDim.x);
   // Threads past the end of the data have no element to process.
   if (Index >= N)
     return;
   X[Index] = A * X[Index] + Y[Index];
 }

 // Host wrapper around saxpyKernel written with Acxxel; every Acxxel call in
 // this example lives in this function.
 //
 // Copies X and Y into newly allocated device buffers, launches saxpyKernel as
 // a single block of N threads on an Acxxel stream, and copies the device copy
 // of X back into the host array X. If the status taken after the final copy
 // is an error, its message is printed to stderr and the process exits with
 // EXIT_FAILURE.
 template <size_t N>
 void saxpy(float A, std::array<float, N> &X, const std::array<float, N> &Y) {
   // Obtain the CUDA platform and create the stream used for all the work.
   acxxel::Platform *CudaPlatform = acxxel::getCUDAPlatform().getValue();
   acxxel::Stream WorkStream = CudaPlatform->createStream().takeValue();

   // Device buffers holding N floats each.
   auto DevX = CudaPlatform->mallocD<float>(N).takeValue();
   auto DevY = CudaPlatform->mallocD<float>(N).takeValue();

   // Upload both host arrays to their device buffers.
   WorkStream.syncCopyHToD(X, DevX).syncCopyHToD(Y, DevY);

   // Triple-chevron launch: 1 block, N threads, no dynamic shared memory, on
   // WorkStream.
   saxpyKernel<<<1, N, 0, WorkStream>>>(A, DevX, DevY, N);

   // Download the result into X and take the resulting status.
   acxxel::Status Result = WorkStream.syncCopyDToH(DevX, X).takeStatus();

   // Nothing more to do unless something went wrong.
   if (!Result.isError())
     return;

   std::fprintf(stderr, "Error performing acxxel saxpy: %s\n",
                Result.getMessage().c_str());
   std::exit(EXIT_FAILURE);
 }
 /// [Example simple saxpy]

 /// [Example CUDA simple saxpy]
 // Host wrapper around saxpyKernel written directly against the CUDA runtime
 // API: copies X and Y to the device, launches the kernel as a single block of
 // N threads, and copies the device copy of X back into the host array X.
 //
 // Every runtime call is checked as it is made. On the first failure the CUDA
 // error string is printed to stderr and the process exits with EXIT_FAILURE.
 template <size_t N>
 void cudaSaxpy(float A, std::array<float, N> &X, std::array<float, N> &Y) {
   // This size is needed all over the place, so give it a name.
   constexpr size_t Size = N * sizeof(float);

   // Reports a failed CUDA runtime call and terminates; does nothing on
   // success.
   auto Check = [](cudaError_t Error) {
     if (Error == cudaSuccess)
       return;
     std::fprintf(stderr, "Error performing cudart saxpy: %s\n",
                  cudaGetErrorString(Error));
     std::exit(EXIT_FAILURE);
   };

   // Allocate space for device arrays. The pointers start out null so a failed
   // allocation never leaves an indeterminate pointer behind.
   float *DeviceX = nullptr;
   float *DeviceY = nullptr;
   Check(cudaMalloc(&DeviceX, Size));
   Check(cudaMalloc(&DeviceY, Size));

   // Copy X and Y out to the device.
   Check(cudaMemcpy(DeviceX, X.data(), Size, cudaMemcpyHostToDevice));
   Check(cudaMemcpy(DeviceY, Y.data(), Size, cudaMemcpyHostToDevice));

   // Launch the kernel using triple-chevron notation. The launch itself does
   // not return a status, so query the runtime for launch errors right away.
   saxpyKernel<<<1, N>>>(A, DeviceX, DeviceY, N);
   Check(cudaGetLastError());

   // Copy the results back to the host. This blocking copy also reports errors
   // raised while the kernel was executing.
   Check(cudaMemcpy(X.data(), DeviceX, Size, cudaMemcpyDeviceToHost));

   // Free resources.
   Check(cudaFree(DeviceX));
   Check(cudaFree(DeviceY));
 }
 /// [Example CUDA simple saxpy]

 // Runs SaxpyFunction(A, X, Y) on a fixed three-element problem and verifies
 // that X afterwards equals the expected A * X + Y.
 //
 // Inputs are A = 2, X = {0, 1, 2}, Y = {3, 4, 5}; the expected result is
 // {3, 6, 9}. On the first element that differs, the index and both values are
 // printed to stderr and the process exits with EXIT_FAILURE.
 template <typename F> void testSaxpy(F &&SaxpyFunction) {
   float A = 2.f;
   std::array<float, 3> X = {{0.f, 1.f, 2.f}};
   std::array<float, 3> Y = {{3.f, 4.f, 5.f}};
   const std::array<float, 3> Expected = {{3.f, 6.f, 9.f}};

   SaxpyFunction(A, X, Y);

   for (int Index = 0; Index != 3; ++Index) {
     // Matching elements need no further attention.
     if (X[Index] == Expected[Index])
       continue;
     std::fprintf(stderr, "Result mismatch at index %d, %f != %f\n", Index,
                  X[Index], Expected[Index]);
     std::exit(EXIT_FAILURE);
   }
 }

 // Entry point: runs the shared saxpy check against both host wrappers, first
 // the Acxxel-based one and then the plain CUDA runtime one, each with N = 3.
 // Either check terminates the process itself on failure.
 int main() {
   testSaxpy(saxpy<3>);
   testSaxpy(cudaSaxpy<3>);
   return 0;
 }
	//===--- simple_example.cu - Simple example of using Acxxel ---------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	///
	/// This file is a simple example of using Acxxel.
	///
	//===----------------------------------------------------------------------===//

	/// [Example simple saxpy]
	#include "acxxel.h"

	#include <array>
	#include <cstdio>
	#include <cstdlib>

	// Device kernel computing X[I] = A * X[I] + Y[I] for every index I below N.
	//
	// Each thread handles at most one element, selected by its flat 1-D global
	// thread index (block offset plus thread offset). Threads whose index is N or
	// greater do nothing. X is updated in place; Y is only read.
	//
	// X and Y must be pointers: the body subscripts both of them, which a plain
	// float parameter cannot support.
	__global__ void saxpyKernel(float A, float *X, float *Y, int N) {
	  int I = (blockDim.x * blockIdx.x) + threadIdx.x;
	  if (I < N)
	    X[I] = A * X[I] + Y[I];
	}

	// Host wrapper around saxpyKernel written with Acxxel; every Acxxel call in
	// this example lives in this function.
	//
	// Copies X and Y into newly allocated device buffers, launches saxpyKernel as
	// a single block of N threads on an Acxxel stream, and copies the device copy
	// of X back into the host array X. If the status taken after the final copy
	// is an error, its message is printed to stderr and the process exits with
	// EXIT_FAILURE.
	template <size_t N>
	void saxpy(float A, std::array<float, N> &X, const std::array<float, N> &Y) {
	  // Obtain the CUDA platform and create the stream used for all the work.
	  acxxel::Platform *CudaPlatform = acxxel::getCUDAPlatform().getValue();
	  acxxel::Stream WorkStream = CudaPlatform->createStream().takeValue();

	  // Device buffers holding N floats each.
	  auto DevX = CudaPlatform->mallocD<float>(N).takeValue();
	  auto DevY = CudaPlatform->mallocD<float>(N).takeValue();

	  // Upload both host arrays to their device buffers.
	  WorkStream.syncCopyHToD(X, DevX).syncCopyHToD(Y, DevY);

	  // Triple-chevron launch: 1 block, N threads, no dynamic shared memory, on
	  // WorkStream.
	  saxpyKernel<<<1, N, 0, WorkStream>>>(A, DevX, DevY, N);

	  // Download the result into X and take the resulting status.
	  acxxel::Status Result = WorkStream.syncCopyDToH(DevX, X).takeStatus();

	  // Nothing more to do unless something went wrong.
	  if (!Result.isError())
	    return;

	  std::fprintf(stderr, "Error performing acxxel saxpy: %s\n",
	               Result.getMessage().c_str());
	  std::exit(EXIT_FAILURE);
	}
	/// [Example simple saxpy]

	/// [Example CUDA simple saxpy]
	// Host wrapper around saxpyKernel written directly against the CUDA runtime
	// API: copies X and Y to the device, launches the kernel as a single block of
	// N threads, and copies the device copy of X back into the host array X.
	//
	// Every runtime call is checked as it is made. On the first failure the CUDA
	// error string is printed to stderr and the process exits with EXIT_FAILURE.
	template <size_t N>
	void cudaSaxpy(float A, std::array<float, N> &X, std::array<float, N> &Y) {
	  // This size is needed all over the place, so give it a name.
	  constexpr size_t Size = N * sizeof(float);

	  // Reports a failed CUDA runtime call and terminates; does nothing on
	  // success.
	  auto Check = [](cudaError_t Error) {
	    if (Error == cudaSuccess)
	      return;
	    std::fprintf(stderr, "Error performing cudart saxpy: %s\n",
	                 cudaGetErrorString(Error));
	    std::exit(EXIT_FAILURE);
	  };

	  // Allocate space for device arrays. The pointers start out null so a failed
	  // allocation never leaves an indeterminate pointer behind.
	  float *DeviceX = nullptr;
	  float *DeviceY = nullptr;
	  Check(cudaMalloc(&DeviceX, Size));
	  Check(cudaMalloc(&DeviceY, Size));

	  // Copy X and Y out to the device.
	  Check(cudaMemcpy(DeviceX, X.data(), Size, cudaMemcpyHostToDevice));
	  Check(cudaMemcpy(DeviceY, Y.data(), Size, cudaMemcpyHostToDevice));

	  // Launch the kernel using triple-chevron notation. The launch itself does
	  // not return a status, so query the runtime for launch errors right away.
	  saxpyKernel<<<1, N>>>(A, DeviceX, DeviceY, N);
	  Check(cudaGetLastError());

	  // Copy the results back to the host. This blocking copy also reports errors
	  // raised while the kernel was executing.
	  Check(cudaMemcpy(X.data(), DeviceX, Size, cudaMemcpyDeviceToHost));

	  // Free resources.
	  Check(cudaFree(DeviceX));
	  Check(cudaFree(DeviceY));
	}
	/// [Example CUDA simple saxpy]

	// Runs SaxpyFunction(A, X, Y) on a fixed three-element problem and verifies
	// that X afterwards equals the expected A * X + Y.
	//
	// Inputs are A = 2, X = {0, 1, 2}, Y = {3, 4, 5}; the expected result is
	// {3, 6, 9}. On the first element that differs, the index and both values are
	// printed to stderr and the process exits with EXIT_FAILURE.
	template <typename F> void testSaxpy(F &&SaxpyFunction) {
	  float A = 2.f;
	  std::array<float, 3> X = {{0.f, 1.f, 2.f}};
	  std::array<float, 3> Y = {{3.f, 4.f, 5.f}};
	  const std::array<float, 3> Expected = {{3.f, 6.f, 9.f}};

	  SaxpyFunction(A, X, Y);

	  for (int Index = 0; Index != 3; ++Index) {
	    // Matching elements need no further attention.
	    if (X[Index] == Expected[Index])
	      continue;
	    std::fprintf(stderr, "Result mismatch at index %d, %f != %f\n", Index,
	                 X[Index], Expected[Index]);
	    std::exit(EXIT_FAILURE);
	  }
	}

	// Entry point: runs the shared saxpy check against both host wrappers, first
	// the Acxxel-based one and then the plain CUDA runtime one, each with N = 3.
	// Either check terminates the process itself on failure.
	int main() {
	  testSaxpy(saxpy<3>);
	  testSaxpy(cudaSaxpy<3>);
	  return 0;
	}