streamexecutor/examples/CUDASaxpy.cpp - llvm-project/parallel-libs - Git at Google

 //===-- CUDASaxpy.cpp - Example of CUDA saxpy with StreamExecutor API -----===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 ///
 /// \file
 /// This file contains example code demonstrating the usage of the
 /// StreamExecutor API. Snippets of this file will be included as code examples
 /// in documentation. Taking these examples from a real source file guarantees
 /// that the examples will always compile.
 ///
 //===----------------------------------------------------------------------===//

 #include <algorithm>
 #include <cassert>
 #include <cstdlib>
 #include <vector>

 #include "streamexecutor/StreamExecutor.h"

 /// [Example saxpy compiler-generated]
 // Code in this namespace is generated by the compiler (e.g. clang).
 //
 // The name of this namespace may depend on the compiler that generated it, so
 // this is just an example name.
 namespace __compilergen {

 // Kernel handle type for saxpy(float A, float *X, float *Y): one template
 // argument per kernel parameter, with each of the two pointer parameters
 // represented as a GlobalDeviceMemory<float>.
 using SaxpyKernel = streamexecutor::Kernel<
     float, streamexecutor::GlobalDeviceMemory<float>,
     streamexecutor::GlobalDeviceMemory<float>>;

 // PTX source text for the saxpy kernel, as produced by the device compiler.
 // Only declared here; the definition lives outside this namespace block.
 extern const char *SaxpyPTX;

 // Loader spec for the "saxpy" kernel, built once by an immediately-invoked
 // lambda when this translation unit's globals are initialized. It registers
 // the in-memory PTX text in SaxpyPTX under the key {2, 0} (presumably compute
 // capability 2.0, matching the PTX's `.target sm_20` -- confirm against the
 // addCUDAPTXInMemory documentation).
 static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = [] {
   streamexecutor::MultiKernelLoaderSpec LoaderSpec;
   LoaderSpec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}});
   return LoaderSpec;
 }();

 } // namespace __compilergen
 /// [Example saxpy compiler-generated]

 /// [Example saxpy host PTX]
 // The PTX text for a saxpy kernel, stored as a raw string literal so that it
 // can be handed to the loader spec as in-memory text.
 //
 // Each thread handles the element at index %tid.x:
 //   1. load the scalar A and the 64-bit X and Y base addresses from the
 //      kernel parameters;
 //   2. convert both base addresses from the generic to the global address
 //      space;
 //   3. compute the byte offset %tid.x * 4 (4 bytes per f32 element) and add
 //      it to each base address;
 //   4. load X[tid] and Y[tid], compute A * X[tid] + Y[tid] with a single fma,
 //      and store the result back into X[tid].
 //
 // Note that the result overwrites X, not Y. Only %tid.x is used for indexing
 // and there is no bounds check, so a launch must not supply more threads than
 // there are elements, and all threads must be in one block for the indices to
 // be unique.
 const char *__compilergen::SaxpyPTX = R"(
   .version 4.3
   .target sm_20
   .address_size 64

   .visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) {
     .reg .f32 %AValue;
     .reg .f32 %XValue;
     .reg .f32 %YValue;
     .reg .f32 %Result;

     .reg .b64 %XBaseAddrGeneric;
     .reg .b64 %YBaseAddrGeneric;
     .reg .b64 %XBaseAddrGlobal;
     .reg .b64 %YBaseAddrGlobal;
     .reg .b64 %XAddr;
     .reg .b64 %YAddr;
     .reg .b64 %ThreadByteOffset;

     .reg .b32 %TID;

     ld.param.f32 %AValue, [A];
     ld.param.u64 %XBaseAddrGeneric, [X];
     ld.param.u64 %YBaseAddrGeneric, [Y];
     cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric;
     cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric;
     mov.u32 %TID, %tid.x;
     mul.wide.u32 %ThreadByteOffset, %TID, 4;
     add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal;
     add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal;
     ld.global.f32 %XValue, [%XAddr];
     ld.global.f32 %YValue, [%YAddr];
     fma.rn.f32 %Result, %AValue, %XValue, %YValue;
     st.global.f32 [%XAddr], %Result;
     ret;
   }
 )";
 /// [Example saxpy host PTX]

 int main() {
   /// [Example saxpy host main]
   namespace se = ::streamexecutor;
   namespace cg = ::__compilergen;

   // Create some host data.
   float A = 42.0f;
   std::vector<float> HostX = {0, 1, 2, 3};
   std::vector<float> HostY = {4, 5, 6, 7};
   size_t ArraySize = HostX.size();

   // Get a device object.
   se::Platform *Platform =
       getOrDie(se::PlatformManager::getPlatformByName("CUDA"));
   if (Platform->getDeviceCount() == 0) {
     return EXIT_FAILURE;
   }
   se::Device Device = getOrDie(Platform->getDevice(0));

   // Load the kernel onto the device.
   cg::SaxpyKernel Kernel =
       getOrDie(Device.createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec));

   se::RegisteredHostMemory<float> RegisteredX =
       getOrDie(Device.registerHostMemory<float>(HostX));
   se::RegisteredHostMemory<float> RegisteredY =
       getOrDie(Device.registerHostMemory<float>(HostY));

   // Allocate memory on the device.
   se::GlobalDeviceMemory<float> X =
       getOrDie(Device.allocateDeviceMemory<float>(ArraySize));
   se::GlobalDeviceMemory<float> Y =
       getOrDie(Device.allocateDeviceMemory<float>(ArraySize));

   // Run operations on a stream.
   se::Stream Stream = getOrDie(Device.createStream());
   Stream.thenCopyH2D(RegisteredX, X)
       .thenCopyH2D(RegisteredY, Y)
       .thenLaunch(ArraySize, 1, Kernel, A, X, Y)
       .thenCopyD2H(X, RegisteredX);
   // Wait for the stream to complete.
   se::dieIfError(Stream.blockHostUntilDone());

   // Process output data in HostX.
   std::vector<float> ExpectedX = {4, 47, 90, 133};
   assert(std::equal(ExpectedX.begin(), ExpectedX.end(), HostX.begin()));
   /// [Example saxpy host main]
 }
	//===-- CUDASaxpy.cpp - Example of CUDA saxpy with StreamExecutor API -----===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	///
	/// \file
	/// This file contains example code demonstrating the usage of the
	/// StreamExecutor API. Snippets of this file will be included as code examples
	/// in documentation. Taking these examples from a real source file guarantees
	/// that the examples will always compile.
	///
	//===----------------------------------------------------------------------===//

	#include <algorithm>
	#include <cassert>
	#include <cstdlib>
	#include <vector>

	#include "streamexecutor/StreamExecutor.h"

	/// [Example saxpy compiler-generated]
	// Code in this namespace is generated by the compiler (e.g. clang).
	//
	// The name of this namespace may depend on the compiler that generated it, so
	// this is just an example name.
	namespace __compilergen {

	// Kernel handle type for saxpy(float A, float *X, float *Y): one template
	// argument per kernel parameter, with each of the two pointer parameters
	// represented as a GlobalDeviceMemory<float>.
	using SaxpyKernel = streamexecutor::Kernel<
	    float, streamexecutor::GlobalDeviceMemory<float>,
	    streamexecutor::GlobalDeviceMemory<float>>;

	// PTX source text for the saxpy kernel, as produced by the device compiler.
	// Only declared here; the definition lives outside this namespace block.
	extern const char *SaxpyPTX;

	// Loader spec for the "saxpy" kernel, built once by an immediately-invoked
	// lambda when this translation unit's globals are initialized. It registers
	// the in-memory PTX text in SaxpyPTX under the key {2, 0} (presumably compute
	// capability 2.0, matching the PTX's `.target sm_20` -- confirm against the
	// addCUDAPTXInMemory documentation).
	static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = [] {
	  streamexecutor::MultiKernelLoaderSpec LoaderSpec;
	  LoaderSpec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}});
	  return LoaderSpec;
	}();

	} // namespace __compilergen
	/// [Example saxpy compiler-generated]

	/// [Example saxpy host PTX]
	// The PTX text for a saxpy kernel, stored as a raw string literal so that it
	// can be handed to the loader spec as in-memory text.
	//
	// Each thread handles the element at index %tid.x:
	//   1. load the scalar A and the 64-bit X and Y base addresses from the
	//      kernel parameters;
	//   2. convert both base addresses from the generic to the global address
	//      space;
	//   3. compute the byte offset %tid.x * 4 (4 bytes per f32 element) and add
	//      it to each base address;
	//   4. load X[tid] and Y[tid], compute A * X[tid] + Y[tid] with a single fma,
	//      and store the result back into X[tid].
	//
	// Note that the result overwrites X, not Y. Only %tid.x is used for indexing
	// and there is no bounds check, so a launch must not supply more threads than
	// there are elements, and all threads must be in one block for the indices to
	// be unique.
	const char *__compilergen::SaxpyPTX = R"(
	.version 4.3
	.target sm_20
	.address_size 64

	.visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) {
	.reg .f32 %AValue;
	.reg .f32 %XValue;
	.reg .f32 %YValue;
	.reg .f32 %Result;

	.reg .b64 %XBaseAddrGeneric;
	.reg .b64 %YBaseAddrGeneric;
	.reg .b64 %XBaseAddrGlobal;
	.reg .b64 %YBaseAddrGlobal;
	.reg .b64 %XAddr;
	.reg .b64 %YAddr;
	.reg .b64 %ThreadByteOffset;

	.reg .b32 %TID;

	ld.param.f32 %AValue, [A];
	ld.param.u64 %XBaseAddrGeneric, [X];
	ld.param.u64 %YBaseAddrGeneric, [Y];
	cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric;
	cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric;
	mov.u32 %TID, %tid.x;
	mul.wide.u32 %ThreadByteOffset, %TID, 4;
	add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal;
	add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal;
	ld.global.f32 %XValue, [%XAddr];
	ld.global.f32 %YValue, [%YAddr];
	fma.rn.f32 %Result, %AValue, %XValue, %YValue;
	st.global.f32 [%XAddr], %Result;
	ret;
	}
	)";
	/// [Example saxpy host PTX]

	int main() {
	/// [Example saxpy host main]
	namespace se = ::streamexecutor;
	namespace cg = ::__compilergen;

	// Create some host data.
	float A = 42.0f;
	std::vector<float> HostX = {0, 1, 2, 3};
	std::vector<float> HostY = {4, 5, 6, 7};
	size_t ArraySize = HostX.size();

	// Get a device object.
	se::Platform *Platform =
	getOrDie(se::PlatformManager::getPlatformByName("CUDA"));
	if (Platform->getDeviceCount() == 0) {
	return EXIT_FAILURE;
	}
	se::Device Device = getOrDie(Platform->getDevice(0));

	// Load the kernel onto the device.
	cg::SaxpyKernel Kernel =
	getOrDie(Device.createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec));

	se::RegisteredHostMemory<float> RegisteredX =
	getOrDie(Device.registerHostMemory<float>(HostX));
	se::RegisteredHostMemory<float> RegisteredY =
	getOrDie(Device.registerHostMemory<float>(HostY));

	// Allocate memory on the device.
	se::GlobalDeviceMemory<float> X =
	getOrDie(Device.allocateDeviceMemory<float>(ArraySize));
	se::GlobalDeviceMemory<float> Y =
	getOrDie(Device.allocateDeviceMemory<float>(ArraySize));

	// Run operations on a stream.
	se::Stream Stream = getOrDie(Device.createStream());
	Stream.thenCopyH2D(RegisteredX, X)
	.thenCopyH2D(RegisteredY, Y)
	.thenLaunch(ArraySize, 1, Kernel, A, X, Y)
	.thenCopyD2H(X, RegisteredX);
	// Wait for the stream to complete.
	se::dieIfError(Stream.blockHostUntilDone());

	// Process output data in HostX.
	std::vector<float> ExpectedX = {4, 47, 90, 133};
	assert(std::equal(ExpectedX.begin(), ExpectedX.end(), HostX.begin()));
	/// [Example saxpy host main]
	}