[SE] Host platform implementation

Summary:
This implementation does not currently support multiple concurrent streams, and
it won't allow kernels to be launched with grids larger than one block or
blocks larger than one thread. These limitations could be removed in the future
by launching new threads on the host, but that is not done in this
implementation.

Reviewers: jlebar

Subscribers: beanz, mgorny, jprice, parallel_libs-commits

Differential Revision: https://reviews.llvm.org/D24473

llvm-svn: 281377
GitOrigin-RevId: 3088696499c2d883d8573eb384cd5d2455a3c6c1
diff --git a/streamexecutor/examples/CMakeLists.txt b/streamexecutor/examples/CMakeLists.txt
index 1d09a54..cb061d5 100644
--- a/streamexecutor/examples/CMakeLists.txt
+++ b/streamexecutor/examples/CMakeLists.txt
@@ -1,2 +1,5 @@
 add_executable(cuda_saxpy_example CUDASaxpy.cpp)
 target_link_libraries(cuda_saxpy_example streamexecutor)
+
+add_executable(host_saxpy_example HostSaxpy.cpp)
+target_link_libraries(host_saxpy_example streamexecutor)
diff --git a/streamexecutor/examples/CUDASaxpy.cpp b/streamexecutor/examples/CUDASaxpy.cpp
index 5fb3dba..0fce5ed 100644
--- a/streamexecutor/examples/CUDASaxpy.cpp
+++ b/streamexecutor/examples/CUDASaxpy.cpp
@@ -17,7 +17,6 @@
 
 #include <algorithm>
 #include <cassert>
-#include <cstdio>
 #include <cstdlib>
 #include <vector>
 
diff --git a/streamexecutor/examples/HostSaxpy.cpp b/streamexecutor/examples/HostSaxpy.cpp
new file mode 100644
index 0000000..525c445
--- /dev/null
+++ b/streamexecutor/examples/HostSaxpy.cpp
@@ -0,0 +1,94 @@
+//===-- HostSaxpy.cpp - Example of host saxpy with StreamExecutor API -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains example code demonstrating the usage of the
+/// StreamExecutor API for a host platform.
+///
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <vector>
+
+#include "streamexecutor/StreamExecutor.h"
+
+void Saxpy(float A, float *X, float *Y, size_t N) {
+  for (size_t I = 0; I < N; ++I)
+    X[I] = A * X[I] + Y[I];
+}
+
+namespace __compilergen {
+using SaxpyKernel =
+    streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>,
+                           streamexecutor::GlobalDeviceMemory<float>, size_t>;
+
+// Wrapper function converts argument addresses to arguments.
+void SaxpyWrapper(const void *const *ArgumentAddresses) {
+  Saxpy(*static_cast<const float *>(ArgumentAddresses[0]),
+        static_cast<float *>(const_cast<void *>(ArgumentAddresses[1])),
+        static_cast<float *>(const_cast<void *>(ArgumentAddresses[2])),
+        *static_cast<const size_t *>(ArgumentAddresses[3]));
+}
+
+// The wrapper function is what gets registered.
+static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() {
+  streamexecutor::MultiKernelLoaderSpec Spec;
+  Spec.addHostFunction("Saxpy", SaxpyWrapper);
+  return Spec;
+}();
+} // namespace __compilergen
+
+int main() {
+  namespace se = ::streamexecutor;
+  namespace cg = ::__compilergen;
+
+  // Create some host data.
+  float A = 42.0f;
+  std::vector<float> HostX = {0, 1, 2, 3};
+  std::vector<float> HostY = {4, 5, 6, 7};
+  size_t ArraySize = HostX.size();
+
+  // Get a device object.
+  se::Platform *Platform =
+      getOrDie(se::PlatformManager::getPlatformByName("host"));
+  if (Platform->getDeviceCount() == 0) {
+    return EXIT_FAILURE;
+  }
+  se::Device *Device = getOrDie(Platform->getDevice(0));
+
+  // Load the kernel onto the device.
+  cg::SaxpyKernel Kernel =
+      getOrDie(Device->createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec));
+
+  se::RegisteredHostMemory<float> RegisteredX =
+      getOrDie(Device->registerHostMemory<float>(HostX));
+  se::RegisteredHostMemory<float> RegisteredY =
+      getOrDie(Device->registerHostMemory<float>(HostY));
+
+  // Allocate memory on the device.
+  se::GlobalDeviceMemory<float> X =
+      getOrDie(Device->allocateDeviceMemory<float>(ArraySize));
+  se::GlobalDeviceMemory<float> Y =
+      getOrDie(Device->allocateDeviceMemory<float>(ArraySize));
+
+  // Run operations on a stream.
+  se::Stream Stream = getOrDie(Device->createStream());
+  Stream.thenCopyH2D(RegisteredX, X)
+      .thenCopyH2D(RegisteredY, Y)
+      .thenLaunch(1, 1, Kernel, A, X, Y, ArraySize)
+      .thenCopyD2H(X, RegisteredX);
+  // Wait for the stream to complete.
+  se::dieIfError(Stream.blockHostUntilDone());
+
+  // Process output data in HostX.
+  std::vector<float> ExpectedX = {4, 47, 90, 133};
+  assert(std::equal(ExpectedX.begin(), ExpectedX.end(), HostX.begin()));
+}
diff --git a/streamexecutor/include/streamexecutor/KernelSpec.h b/streamexecutor/include/streamexecutor/KernelSpec.h
index c4b6722..caf6f1b 100644
--- a/streamexecutor/include/streamexecutor/KernelSpec.h
+++ b/streamexecutor/include/streamexecutor/KernelSpec.h
@@ -65,11 +65,13 @@
 #define STREAMEXECUTOR_KERNELSPEC_H
 
 #include <cassert>
+#include <functional>
 #include <map>
 #include <memory>
 #include <string>
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
 
 namespace streamexecutor {
@@ -199,6 +201,9 @@
 /// than doing it by hand.
 class MultiKernelLoaderSpec {
 public:
+  /// Host platform kernel type; receives the array of argument addresses.
+  using HostFunctionTy = std::function<void(const void *const *)>;
+
   std::string getKernelName() const {
     if (TheKernelName)
       return *TheKernelName;
@@ -215,6 +220,7 @@
   bool hasOpenCLTextInMemory() const {
     return TheOpenCLTextInMemorySpec != nullptr;
   }
+  bool hasHostFunction() const { return HostFunction != nullptr; }
 
   // Accessors for platform variant kernel load specifications.
   //
@@ -233,6 +239,11 @@
     return *TheOpenCLTextInMemorySpec;
   }
 
+  const HostFunctionTy &getHostFunction() const {
+    assert(hasHostFunction() && "getting spec that is not present");
+    return *HostFunction;
+  }
+
   // Builder-pattern-like methods for use in initializing a
   // MultiKernelLoaderSpec.
   //
@@ -256,6 +267,12 @@
   MultiKernelLoaderSpec &addOpenCLTextInMemory(llvm::StringRef KernelName,
                                                const char *OpenCLText);
 
+  MultiKernelLoaderSpec &addHostFunction(llvm::StringRef KernelName,
+                                         HostFunctionTy Function) {
+    HostFunction = llvm::make_unique<HostFunctionTy>(std::move(Function));
+    return *this;
+  }
+
 private:
   void setKernelName(llvm::StringRef KernelName);
 
@@ -263,6 +280,7 @@
   std::unique_ptr<CUDAPTXInMemorySpec> TheCUDAPTXInMemorySpec;
   std::unique_ptr<CUDAFatbinInMemorySpec> TheCUDAFatbinInMemorySpec;
   std::unique_ptr<OpenCLTextInMemorySpec> TheOpenCLTextInMemorySpec;
+  std::unique_ptr<HostFunctionTy> HostFunction;
 };
 
 } // namespace streamexecutor
diff --git a/streamexecutor/include/streamexecutor/PlatformDevice.h b/streamexecutor/include/streamexecutor/PlatformDevice.h
index cc1ae40..d55680d 100644
--- a/streamexecutor/include/streamexecutor/PlatformDevice.h
+++ b/streamexecutor/include/streamexecutor/PlatformDevice.h
@@ -149,10 +149,10 @@
   /// Similar to synchronousCopyD2H(const void *, size_t, void
   /// *, size_t, size_t), but copies memory from one location in device memory
   /// to another rather than from device to host.
-  virtual Error synchronousCopyD2D(const void *DeviceDstHandle,
-                                   size_t DstByteOffset,
-                                   const void *DeviceSrcHandle,
-                                   size_t SrcByteOffset, size_t ByteCount) {
+  virtual Error synchronousCopyD2D(const void *DeviceSrcHandle,
+                                   size_t SrcByteOffset,
+                                   const void *DeviceDstHandle,
+                                   size_t DstByteOffset, size_t ByteCount) {
     return make_error("synchronousCopyD2D not implemented for platform " +
                       getName());
   }
diff --git a/streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h b/streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h
new file mode 100644
index 0000000..52ad1ea
--- /dev/null
+++ b/streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h
@@ -0,0 +1,56 @@
+//===-- HostPlatform.h - Host platform subclass -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Declaration of the HostPlatform class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H
+#define STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H
+
+#include "HostPlatformDevice.h"
+#include "streamexecutor/Device.h"
+#include "streamexecutor/Platform.h"
+
+#include "llvm/Support/Mutex.h"
+
+namespace streamexecutor {
+namespace host {
+
+/// Platform that performs work on the host rather than offloading to an
+/// accelerator.
+class HostPlatform : public Platform {
+public:
+  size_t getDeviceCount() const override { return 1; }
+
+  Expected<Device *> getDevice(size_t DeviceIndex) override {
+    if (DeviceIndex != 0) {
+      return make_error(
+          "Requested device index " + llvm::Twine(DeviceIndex) +
+          " from host platform which only supports device index 0");
+    }
+    llvm::sys::ScopedLock Lock(Mutex);
+    if (!TheDevice) {
+      ThePlatformDevice = llvm::make_unique<HostPlatformDevice>();
+      TheDevice = llvm::make_unique<Device>(ThePlatformDevice.get());
+    }
+    return TheDevice.get();
+  }
+
+private:
+  llvm::sys::Mutex Mutex;
+  std::unique_ptr<HostPlatformDevice> ThePlatformDevice;
+  std::unique_ptr<Device> TheDevice;
+};
+
+} // namespace host
+} // namespace streamexecutor
+
+#endif // STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H
diff --git a/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h b/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h
new file mode 100644
index 0000000..e51552d
--- /dev/null
+++ b/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h
@@ -0,0 +1,151 @@
+//===-- HostPlatformDevice.h - HostPlatformDevice class ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Declaration of the HostPlatformDevice class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H
+#define STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H
+
+#include <cstdlib>
+#include <cstring>
+
+#include "streamexecutor/PlatformDevice.h"
+
+namespace streamexecutor {
+namespace host {
+
+/// A concrete PlatformDevice subclass that performs its work on the host rather
+/// than offloading to an accelerator.
+class HostPlatformDevice : public PlatformDevice {
+public:
+  std::string getName() const override { return "host"; }
+
+  Expected<const void *>
+  createKernel(const MultiKernelLoaderSpec &Spec) override {
+    if (!Spec.hasHostFunction()) {
+      return make_error("no host implementation available for kernel " +
+                        Spec.getKernelName());
+    }
+    return static_cast<const void *>(&Spec.getHostFunction());
+  }
+
+  Error destroyKernel(const void *Handle) override { return Error::success(); }
+
+  Expected<const void *> createStream() override {
+    // TODO(jhen): Do something with threads to allow multiple streams.
+    return this;
+  }
+
+  Error destroyStream(const void *Handle) override { return Error::success(); }
+
+  Error launch(const void *PlatformStreamHandle, BlockDimensions BlockSize,
+               GridDimensions GridSize, const void *PKernelHandle,
+               const PackedKernelArgumentArrayBase &ArgumentArray) override {
+    // TODO(jhen): Can we do something with BlockSize and GridSize?
+    if (!(BlockSize.X == 1 && BlockSize.Y == 1 && BlockSize.Z == 1)) {
+      return make_error(
+          "Block dimensions were (" + llvm::Twine(BlockSize.X) + "," +
+          llvm::Twine(BlockSize.Y) + "," + llvm::Twine(BlockSize.Z) +
+          "), but only size (1,1,1) is permitted for this platform");
+    }
+    if (!(GridSize.X == 1 && GridSize.Y == 1 && GridSize.Z == 1)) {
+      return make_error(
+          "Grid dimensions were (" + llvm::Twine(GridSize.X) + "," +
+          llvm::Twine(GridSize.Y) + "," + llvm::Twine(GridSize.Z) +
+          "), but only size (1,1,1) is permitted for this platform");
+    }
+
+    (*static_cast<const std::function<void(const void *const *)> *>(
+        PKernelHandle))(ArgumentArray.getAddresses());
+    return Error::success();
+  }
+
+  Error copyD2H(const void *PlatformStreamHandle, const void *DeviceSrcHandle,
+                size_t SrcByteOffset, void *HostDst, size_t DstByteOffset,
+                size_t ByteCount) override {
+    std::memcpy(offset(HostDst, DstByteOffset),
+                offset(DeviceSrcHandle, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  Error copyH2D(const void *PlatformStreamHandle, const void *HostSrc,
+                size_t SrcByteOffset, const void *DeviceDstHandle,
+                size_t DstByteOffset, size_t ByteCount) override {
+    std::memcpy(offset(DeviceDstHandle, DstByteOffset),
+                offset(HostSrc, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  Error copyD2D(const void *PlatformStreamHandle, const void *DeviceSrcHandle,
+                size_t SrcByteOffset, const void *DeviceDstHandle,
+                size_t DstByteOffset, size_t ByteCount) override {
+    std::memcpy(offset(DeviceDstHandle, DstByteOffset),
+                offset(DeviceSrcHandle, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  Error blockHostUntilDone(const void *PlatformStreamHandle) override {
+    // All host operations are synchronous anyway.
+    return Error::success();
+  }
+
+  Expected<void *> allocateDeviceMemory(size_t ByteCount) override {
+    return std::malloc(ByteCount);
+  }
+
+  Error freeDeviceMemory(const void *Handle) override {
+    std::free(const_cast<void *>(Handle));
+    return Error::success();
+  }
+
+  Error registerHostMemory(void *Memory, size_t ByteCount) override {
+    return Error::success();
+  }
+
+  Error unregisterHostMemory(const void *Memory) override {
+    return Error::success();
+  }
+
+  Error synchronousCopyD2H(const void *DeviceSrcHandle, size_t SrcByteOffset,
+                           void *HostDst, size_t DstByteOffset,
+                           size_t ByteCount) override {
+    std::memcpy(offset(HostDst, DstByteOffset),
+                offset(DeviceSrcHandle, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  Error synchronousCopyH2D(const void *HostSrc, size_t SrcByteOffset,
+                           const void *DeviceDstHandle, size_t DstByteOffset,
+                           size_t ByteCount) override {
+    std::memcpy(offset(DeviceDstHandle, DstByteOffset),
+                offset(HostSrc, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  Error synchronousCopyD2D(const void *DeviceSrcHandle, size_t SrcByteOffset,
+                           const void *DeviceDstHandle, size_t DstByteOffset,
+                           size_t ByteCount) override {
+    std::memcpy(offset(DeviceDstHandle, DstByteOffset),
+                offset(DeviceSrcHandle, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+private:
+  static void *offset(const void *Base, size_t Offset) {
+    return const_cast<char *>(static_cast<const char *>(Base) + Offset);
+  }
+};
+
+} // namespace host
+} // namespace streamexecutor
+
+#endif // STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H
diff --git a/streamexecutor/lib/PlatformManager.cpp b/streamexecutor/lib/PlatformManager.cpp
index 9cae5b1..7304cca 100644
--- a/streamexecutor/lib/PlatformManager.cpp
+++ b/streamexecutor/lib/PlatformManager.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "streamexecutor/PlatformManager.h"
+#include "streamexecutor/platforms/host/HostPlatform.h"
 
 namespace streamexecutor {
 
@@ -23,6 +24,8 @@
   //    appropriate code to include here.
   //  * Use static initialization tricks to have platform libraries register
   //    themselves when they are loaded.
+
+  PlatformsByName.emplace("host", llvm::make_unique<host::HostPlatform>());
 }
 
 Expected<Platform *> PlatformManager::getPlatformByName(llvm::StringRef Name) {