[StreamExecutor] Executor add synchronous methods

Summary:
Add Executor methods that block the host until completion. Since these
methods are host-synchronous, they don't require Stream arguments.

Reviewers: jlebar

Subscribers: jprice, parallel_libs-commits

Differential Revision: https://reviews.llvm.org/D23577

llvm-svn: 279640
GitOrigin-RevId: bb1322d49510ffde956bd7d34117fced7117b636
diff --git a/streamexecutor/include/streamexecutor/DeviceMemory.h b/streamexecutor/include/streamexecutor/DeviceMemory.h
index b3b0fd2..45faf7b 100644
--- a/streamexecutor/include/streamexecutor/DeviceMemory.h
+++ b/streamexecutor/include/streamexecutor/DeviceMemory.h
@@ -18,9 +18,9 @@
 /// and a byte count to tell how much memory is pointed to by that void*.
 ///
 /// GlobalDeviceMemory<T> is a subclass of GlobalDeviceMemoryBase which keeps
-/// track of the type of element to be stored in the device array. It is similar
-/// to a pair of a T* pointer and an element count to tell how many elements of
-/// type T fit in the memory pointed to by that T*.
+/// track of the type of element to be stored in the device memory. It is
+/// similar to a pair of a T* pointer and an element count to tell how many
+/// elements of type T fit in the memory pointed to by that T*.
 ///
 /// SharedDeviceMemoryBase is just the size in bytes of a shared memory buffer.
 ///
@@ -38,6 +38,7 @@
 #ifndef STREAMEXECUTOR_DEVICEMEMORY_H
 #define STREAMEXECUTOR_DEVICEMEMORY_H
 
+#include <cassert>
 #include <cstddef>
 
 namespace streamexecutor {
@@ -91,6 +92,71 @@
   size_t ByteCount;   // Size in bytes of this allocation.
 };
 
+template <typename ElemT> class GlobalDeviceMemory;
+
+/// Reference to a slice of device memory.
+///
+/// Contains a base memory handle, an element count offset into that base
+/// memory, and an element count for the size of the slice.
+template <typename ElemT> class GlobalDeviceMemorySlice {
+public:
+  /// Intentionally implicit so GlobalDeviceMemory<T> can be passed to functions
+  /// expecting GlobalDeviceMemorySlice<T> arguments.
+  GlobalDeviceMemorySlice(const GlobalDeviceMemory<ElemT> &Memory)
+      : BaseMemory(Memory), ElementOffset(0),
+        ElementCount(Memory.getElementCount()) {}
+
+  GlobalDeviceMemorySlice(const GlobalDeviceMemory<ElemT> &BaseMemory,
+                          size_t ElementOffset, size_t ElementCount)
+      : BaseMemory(BaseMemory), ElementOffset(ElementOffset),
+        ElementCount(ElementCount) {
+    assert(ElementOffset + ElementCount <= BaseMemory.getElementCount() &&
+           "slicing past the end of a GlobalDeviceMemory buffer");
+  }
+
+  /// Gets the GlobalDeviceMemory backing this slice.
+  GlobalDeviceMemory<ElemT> getBaseMemory() const { return BaseMemory; }
+
+  /// Gets the offset of this slice from the base memory.
+  ///
+  /// The offset is measured in elements, not bytes.
+  size_t getElementOffset() const { return ElementOffset; }
+
+  /// Gets the number of elements in this slice.
+  size_t getElementCount() const { return ElementCount; }
+
+  /// Creates a slice of the memory with the first DropCount elements removed.
+  GlobalDeviceMemorySlice<ElemT> drop_front(size_t DropCount) const {
+    assert(DropCount <= ElementCount &&
+           "dropping more than the size of a slice");
+    return GlobalDeviceMemorySlice<ElemT>(BaseMemory, ElementOffset + DropCount,
+                                          ElementCount - DropCount);
+  }
+
+  /// Creates a slice of the memory with the last DropCount elements removed.
+  GlobalDeviceMemorySlice<ElemT> drop_back(size_t DropCount) const {
+    assert(DropCount <= ElementCount &&
+           "dropping more than the size of a slice");
+    return GlobalDeviceMemorySlice<ElemT>(BaseMemory, ElementOffset,
+                                          ElementCount - DropCount);
+  }
+
+  /// Creates a slice of the memory that chops off the first DropCount elements
+  /// and keeps the next TakeCount elements.
+  GlobalDeviceMemorySlice<ElemT> slice(size_t DropCount,
+                                       size_t TakeCount) const {
+    assert(DropCount + TakeCount <= ElementCount &&
+           "sub-slice operation overruns slice bounds");
+    return GlobalDeviceMemorySlice<ElemT>(BaseMemory, ElementOffset + DropCount,
+                                          TakeCount);
+  }
+
+private:
+  GlobalDeviceMemory<ElemT> BaseMemory;
+  size_t ElementOffset;
+  size_t ElementCount;
+};
+
 /// Typed wrapper around the "void *"-like GlobalDeviceMemoryBase class.
 ///
 /// For example, GlobalDeviceMemory<int> is a simple wrapper around
@@ -125,6 +191,11 @@
   /// allocation.
   size_t getElementCount() const { return getByteCount() / sizeof(ElemT); }
 
+  /// Converts this memory object into a slice.
+  GlobalDeviceMemorySlice<ElemT> asSlice() {
+    return GlobalDeviceMemorySlice<ElemT>(*this);
+  }
+
 private:
   /// Constructs a GlobalDeviceMemory instance from an opaque handle and an
   /// element count.
diff --git a/streamexecutor/include/streamexecutor/Executor.h b/streamexecutor/include/streamexecutor/Executor.h
index 0f06962..ea4224e 100644
--- a/streamexecutor/include/streamexecutor/Executor.h
+++ b/streamexecutor/include/streamexecutor/Executor.h
@@ -16,12 +16,12 @@
 #define STREAMEXECUTOR_EXECUTOR_H
 
 #include "streamexecutor/KernelSpec.h"
+#include "streamexecutor/PlatformInterfaces.h"
 #include "streamexecutor/Utils/Error.h"
 
 namespace streamexecutor {
 
 class KernelInterface;
-class PlatformExecutor;
 class Stream;
 
 class Executor {
@@ -38,6 +38,311 @@
 
   Expected<std::unique_ptr<Stream>> createStream();
 
+  /// Allocates an array of ElementCount entries of type T in device memory.
+  template <typename T>
+  Expected<GlobalDeviceMemory<T>> allocateDeviceMemory(size_t ElementCount) {
+    return PExecutor->allocateDeviceMemory(ElementCount * sizeof(T));
+  }
+
+  /// Frees memory previously allocated with allocateDeviceMemory.
+  template <typename T> Error freeDeviceMemory(GlobalDeviceMemory<T> Memory) {
+    return PExecutor->freeDeviceMemory(Memory);
+  }
+
+  /// Allocates an array of ElementCount entries of type T in host memory.
+  ///
+  /// Host memory allocated by this function can be used for asynchronous memory
+  /// copies on streams. See Stream::thenCopyD2H and Stream::thenCopyH2D.
+  template <typename T> Expected<T *> allocateHostMemory(size_t ElementCount) {
+    return PExecutor->allocateHostMemory(ElementCount * sizeof(T));
+  }
+
+  /// Frees memory previously allocated with allocateHostMemory.
+  template <typename T> Error freeHostMemory(T *Memory) {
+    return PExecutor->freeHostMemory(Memory);
+  }
+
+  /// Registers a previously allocated host array of type T for asynchronous
+  /// memory operations.
+  ///
+  /// Host memory registered by this function can be used for asynchronous
+  /// memory copies on streams. See Stream::thenCopyD2H and Stream::thenCopyH2D.
+  template <typename T>
+  Error registerHostMemory(T *Memory, size_t ElementCount) {
+    return PExecutor->registerHostMemory(Memory, ElementCount * sizeof(T));
+  }
+
+  /// Unregisters host memory previously registered by registerHostMemory.
+  template <typename T> Error unregisterHostMemory(T *Memory) {
+    return PExecutor->unregisterHostMemory(Memory);
+  }
+
+  /// Host-synchronously copies a slice of an array of elements of type T from
+  /// host to device memory.
+  ///
+  /// Returns an error if ElementCount is too large for the source slice or the
+  /// destination.
+  ///
+  /// The calling host thread is blocked until the copy completes. Can be used
+  /// with any host memory, the host memory does not have to be allocated with
+  /// allocateHostMemory or registered with registerHostMemory. Does not block
+  /// any ongoing device calls.
+  template <typename T>
+  Error synchronousCopyD2H(GlobalDeviceMemorySlice<T> Src,
+                           llvm::MutableArrayRef<T> Dst, size_t ElementCount) {
+    if (ElementCount > Src.getElementCount())
+      return make_error("copying too many elements, " +
+                        llvm::Twine(ElementCount) +
+                        ", from a device array of element count " +
+                        llvm::Twine(Src.getElementCount()));
+    if (ElementCount > Dst.size())
+      return make_error(
+          "copying too many elements, " + llvm::Twine(ElementCount) +
+          ", to a host array of element count " + llvm::Twine(Dst.size()));
+    return PExecutor->synchronousCopyD2H(
+        Src.getBaseMemory(), Src.getElementOffset() * sizeof(T), Dst.data(), 0,
+        ElementCount * sizeof(T));
+  }
+
+  /// Similar to synchronousCopyD2H(GlobalDeviceMemorySlice<T>,
+  /// llvm::MutableArrayRef<T>, size_t) but does not take an element count
+  /// argument because it copies the entire source array.
+  ///
+  /// Returns an error if the Src and Dst sizes do not match.
+  template <typename T>
+  Error synchronousCopyD2H(GlobalDeviceMemorySlice<T> Src,
+                           llvm::MutableArrayRef<T> Dst) {
+    if (Src.getElementCount() != Dst.size())
+      return make_error(
+          "array size mismatch for D2H, device source has element count " +
+          llvm::Twine(Src.getElementCount()) +
+          " but host destination has element count " + llvm::Twine(Dst.size()));
+    return synchronousCopyD2H(Src, Dst, Src.getElementCount());
+  }
+
+  /// Similar to synchronousCopyD2H(GlobalDeviceMemorySlice<T>,
+  /// llvm::MutableArrayRef<T>, size_t) but copies to a pointer rather than an
+  /// llvm::MutableArrayRef.
+  ///
+  /// Returns an error if ElementCount is too large for the source slice.
+  template <typename T>
+  Error synchronousCopyD2H(GlobalDeviceMemorySlice<T> Src, T *Dst,
+                           size_t ElementCount) {
+    return synchronousCopyD2H(Src, llvm::MutableArrayRef<T>(Dst, ElementCount),
+                              ElementCount);
+  }
+
+  /// Similar to synchronousCopyD2H(GlobalDeviceMemorySlice<T>,
+  /// llvm::MutableArrayRef<T>, size_t) but the source is a GlobalDeviceMemory
+  /// rather than a GlobalDeviceMemorySlice.
+  template <typename T>
+  Error synchronousCopyD2H(GlobalDeviceMemory<T> Src,
+                           llvm::MutableArrayRef<T> Dst, size_t ElementCount) {
+    return synchronousCopyD2H(Src.asSlice(), Dst, ElementCount);
+  }
+
+  /// Similar to  synchronousCopyD2H(GlobalDeviceMemorySlice<T>,
+  /// llvm::MutableArrayRef<T>) but the source is a GlobalDeviceMemory rather
+  /// than a GlobalDeviceMemorySlice.
+  template <typename T>
+  Error synchronousCopyD2H(GlobalDeviceMemory<T> Src,
+                           llvm::MutableArrayRef<T> Dst) {
+    return synchronousCopyD2H(Src.asSlice(), Dst);
+  }
+
+  /// Similar to synchronousCopyD2H(GlobalDeviceMemorySlice<T>, T*, size_t) but
+  /// the source is a GlobalDeviceMemory rather than a GlobalDeviceMemorySlice.
+  template <typename T>
+  Error synchronousCopyD2H(GlobalDeviceMemory<T> Src, T *Dst,
+                           size_t ElementCount) {
+    return synchronousCopyD2H(Src.asSlice(), Dst, ElementCount);
+  }
+
+  /// Host-synchronously copies a slice of an array of elements of type T from
+  /// device to host memory.
+  ///
+  /// Returns an error if ElementCount is too large for the source or the
+  /// destination.
+  ///
+  /// The calling host thread is blocked until the copy completes. Can be used
+  /// with any host memory, the host memory does not have to be allocated with
+  /// allocateHostMemory or registered with registerHostMemory. Does not block
+  /// any ongoing device calls.
+  template <typename T>
+  Error synchronousCopyH2D(llvm::ArrayRef<T> Src,
+                           GlobalDeviceMemorySlice<T> Dst,
+                           size_t ElementCount) {
+    if (ElementCount > Src.size())
+      return make_error(
+          "copying too many elements, " + llvm::Twine(ElementCount) +
+          ", from a host array of element count " + llvm::Twine(Src.size()));
+    if (ElementCount > Dst.getElementCount())
+      return make_error("copying too many elements, " +
+                        llvm::Twine(ElementCount) +
+                        ", to a device array of element count " +
+                        llvm::Twine(Dst.getElementCount()));
+    return PExecutor->synchronousCopyH2D(Src.data(), 0, Dst.getBaseMemory(),
+                                         Dst.getElementOffset() * sizeof(T),
+                                         ElementCount * sizeof(T));
+  }
+
+  /// Similar to synchronousCopyH2D(llvm::ArrayRef<T>,
+  /// GlobalDeviceMemorySlice<T>, size_t) but does not take an element count
+  /// argument because it copies the entire source array.
+  ///
+  /// Returns an error if the Src and Dst sizes do not match.
+  template <typename T>
+  Error synchronousCopyH2D(llvm::ArrayRef<T> Src,
+                           GlobalDeviceMemorySlice<T> Dst) {
+    if (Src.size() != Dst.getElementCount())
+      return make_error(
+          "array size mismatch for H2D, host source has element count " +
+          llvm::Twine(Src.size()) +
+          " but device destination has element count " +
+          llvm::Twine(Dst.getElementCount()));
+    return synchronousCopyH2D(Src, Dst, Dst.getElementCount());
+  }
+
+  /// Similar to synchronousCopyH2D(llvm::ArrayRef<T>,
+  /// GlobalDeviceMemorySlice<T>, size_t) but copies from a pointer rather than
+  /// an llvm::ArrayRef.
+  ///
+  /// Returns an error if ElementCount is too large for the destination.
+  template <typename T>
+  Error synchronousCopyH2D(T *Src, GlobalDeviceMemorySlice<T> Dst,
+                           size_t ElementCount) {
+    return synchronousCopyH2D(llvm::ArrayRef<T>(Src, ElementCount), Dst,
+                              ElementCount);
+  }
+
+  /// Similar to synchronousCopyH2D(llvm::ArrayRef<T>,
+  /// GlobalDeviceMemorySlice<T>, size_t) but the destination is a
+  /// GlobalDeviceMemory rather than a GlobalDeviceMemorySlice.
+  template <typename T>
+  Error synchronousCopyH2D(llvm::ArrayRef<T> Src, GlobalDeviceMemory<T> Dst,
+                           size_t ElementCount) {
+    return synchronousCopyH2D(Src, Dst.asSlice(), ElementCount);
+  }
+
+  /// Similar to synchronousCopyH2D(llvm::ArrayRef<T>,
+  /// GlobalDeviceMemorySlice<T>) but the destination is a GlobalDeviceMemory
+  /// rather than a GlobalDeviceMemorySlice.
+  template <typename T>
+  Error synchronousCopyH2D(llvm::ArrayRef<T> Src, GlobalDeviceMemory<T> Dst) {
+    return synchronousCopyH2D(Src, Dst.asSlice());
+  }
+
+  /// Similar to synchronousCopyH2D(T*, GlobalDeviceMemorySlice<T>, size_t) but
+  /// the destination is a GlobalDeviceMemory rather than a
+  /// GlobalDeviceMemorySlice.
+  template <typename T>
+  Error synchronousCopyH2D(T *Src, GlobalDeviceMemory<T> Dst,
+                           size_t ElementCount) {
+    return synchronousCopyH2D(Src, Dst.asSlice(), ElementCount);
+  }
+
+  /// Host-synchronously copies a slice of an array of elements of type T from
+  /// one location in device memory to another.
+  ///
+  /// Returns an error if ElementCount is too large for the source slice or the
+  /// destination.
+  ///
+  /// The calling host thread is blocked until the copy completes. Can be used
+  /// with any host memory, the host memory does not have to be allocated with
+  /// allocateHostMemory or registered with registerHostMemory. Does not block
+  /// any ongoing device calls.
+  template <typename T>
+  Error synchronousCopyD2D(GlobalDeviceMemorySlice<T> Src,
+                           GlobalDeviceMemorySlice<T> Dst,
+                           size_t ElementCount) {
+    if (ElementCount > Src.getElementCount())
+      return make_error("copying too many elements, " +
+                        llvm::Twine(ElementCount) +
+                        ", from a device array of element count " +
+                        llvm::Twine(Src.getElementCount()));
+    if (ElementCount > Dst.getElementCount())
+      return make_error("copying too many elements, " +
+                        llvm::Twine(ElementCount) +
+                        ", to a device array of element count " +
+                        llvm::Twine(Dst.getElementCount()));
+    return PExecutor->synchronousCopyD2D(
+        Src.getBaseMemory(), Src.getElementOffset() * sizeof(T),
+        Dst.getBaseMemory(), Dst.getElementOffset() * sizeof(T),
+        ElementCount * sizeof(T));
+  }
+
+  /// Similar to synchronousCopyD2D(GlobalDeviceMemorySlice<T>,
+  /// GlobalDeviceMemorySlice<T>, size_t) but does not take an element count
+  /// argument because it copies the entire source array.
+  ///
+  /// Returns an error if the Src and Dst sizes do not match.
+  template <typename T>
+  Error synchronousCopyD2D(GlobalDeviceMemorySlice<T> Src,
+                           GlobalDeviceMemorySlice<T> Dst) {
+    if (Src.getElementCount() != Dst.getElementCount())
+      return make_error(
+          "array size mismatch for D2D, device source has element count " +
+          llvm::Twine(Src.getElementCount()) +
+          " but device destination has element count " +
+          llvm::Twine(Dst.getElementCount()));
+    return synchronousCopyD2D(Src, Dst, Src.getElementCount());
+  }
+
+  /// Similar to synchronousCopyD2D(GlobalDeviceMemorySlice<T>,
+  /// GlobalDeviceMemorySlice<T>, size_t) but the source is a
+  /// GlobalDeviceMemory<T> rather than a GlobalDeviceMemorySlice<T>.
+  template <typename T>
+  Error synchronousCopyD2D(GlobalDeviceMemory<T> Src,
+                           GlobalDeviceMemorySlice<T> Dst,
+                           size_t ElementCount) {
+    return synchronousCopyD2D(Src.asSlice(), Dst, ElementCount);
+  }
+
+  /// Similar to synchronousCopyD2D(GlobalDeviceMemorySlice<T>,
+  /// GlobalDeviceMemorySlice<T>) but the source is a GlobalDeviceMemory<T>
+  /// rather than a GlobalDeviceMemorySlice<T>.
+  template <typename T>
+  Error synchronousCopyD2D(GlobalDeviceMemory<T> Src,
+                           GlobalDeviceMemorySlice<T> Dst) {
+    return synchronousCopyD2D(Src.asSlice(), Dst);
+  }
+
+  /// Similar to synchronousCopyD2D(GlobalDeviceMemorySlice<T>,
+  /// GlobalDeviceMemorySlice<T>, size_t) but the destination is a
+  /// GlobalDeviceMemory<T> rather than a GlobalDeviceMemorySlice<T>.
+  template <typename T>
+  Error synchronousCopyD2D(GlobalDeviceMemorySlice<T> Src,
+                           GlobalDeviceMemory<T> Dst, size_t ElementCount) {
+    return synchronousCopyD2D(Src, Dst.asSlice(), ElementCount);
+  }
+
+  /// Similar to synchronousCopyD2D(GlobalDeviceMemorySlice<T>,
+  /// GlobalDeviceMemorySlice<T>) but the destination is a GlobalDeviceMemory<T>
+  /// rather than a GlobalDeviceMemorySlice<T>.
+  template <typename T>
+  Error synchronousCopyD2D(GlobalDeviceMemorySlice<T> Src,
+                           GlobalDeviceMemory<T> Dst) {
+    return synchronousCopyD2D(Src, Dst.asSlice());
+  }
+
+  /// Similar to synchronousCopyD2D(GlobalDeviceMemorySlice<T>,
+  /// GlobalDeviceMemorySlice<T>, size_t) but the source and destination are
+  /// GlobalDeviceMemory<T> rather than a GlobalDeviceMemorySlice<T>.
+  template <typename T>
+  Error synchronousCopyD2D(GlobalDeviceMemory<T> Src, GlobalDeviceMemory<T> Dst,
+                           size_t ElementCount) {
+    return synchronousCopyD2D(Src.asSlice(), Dst.asSlice(), ElementCount);
+  }
+
+  /// Similar to synchronousCopyD2D(GlobalDeviceMemorySlice<T>,
+  /// GlobalDeviceMemorySlice<T>) but the source and destination are
+  /// GlobalDeviceMemory<T> rather than a GlobalDeviceMemorySlice<T>.
+  template <typename T>
+  Error synchronousCopyD2D(GlobalDeviceMemory<T> Src,
+                           GlobalDeviceMemory<T> Dst) {
+    return synchronousCopyD2D(Src.asSlice(), Dst.asSlice());
+  }
+
 private:
   PlatformExecutor *PExecutor;
 };
diff --git a/streamexecutor/include/streamexecutor/PlatformInterfaces.h b/streamexecutor/include/streamexecutor/PlatformInterfaces.h
index 23bae9e..2c8fce3 100644
--- a/streamexecutor/include/streamexecutor/PlatformInterfaces.h
+++ b/streamexecutor/include/streamexecutor/PlatformInterfaces.h
@@ -76,23 +76,32 @@
   }
 
   /// Copies data from the device to the host.
-  virtual Error memcpyD2H(PlatformStreamHandle *S,
-                          const GlobalDeviceMemoryBase &DeviceSrc,
-                          void *HostDst, size_t ByteCount) {
-    return make_error("memcpyD2H not implemented for platform " + getName());
+  ///
+  /// HostDst should have been allocated by allocateHostMemory or registered
+  /// with registerHostMemory.
+  virtual Error copyD2H(PlatformStreamHandle *S,
+                        const GlobalDeviceMemoryBase &DeviceSrc,
+                        size_t SrcByteOffset, void *HostDst,
+                        size_t DstByteOffset, size_t ByteCount) {
+    return make_error("copyD2H not implemented for platform " + getName());
   }
 
   /// Copies data from the host to the device.
-  virtual Error memcpyH2D(PlatformStreamHandle *S, const void *HostSrc,
-                          GlobalDeviceMemoryBase *DeviceDst, size_t ByteCount) {
-    return make_error("memcpyH2D not implemented for platform " + getName());
+  ///
+  /// HostSrc should have been allocated by allocateHostMemory or registered
+  /// with registerHostMemory.
+  virtual Error copyH2D(PlatformStreamHandle *S, const void *HostSrc,
+                        size_t SrcByteOffset, GlobalDeviceMemoryBase DeviceDst,
+                        size_t DstByteOffset, size_t ByteCount) {
+    return make_error("copyH2D not implemented for platform " + getName());
   }
 
   /// Copies data from one device location to another.
-  virtual Error memcpyD2D(PlatformStreamHandle *S,
-                          const GlobalDeviceMemoryBase &DeviceSrc,
-                          GlobalDeviceMemoryBase *DeviceDst, size_t ByteCount) {
-    return make_error("memcpyD2D not implemented for platform " + getName());
+  virtual Error copyD2D(PlatformStreamHandle *S,
+                        const GlobalDeviceMemoryBase &DeviceSrc,
+                        size_t SrcByteOffset, GlobalDeviceMemoryBase DeviceDst,
+                        size_t DstByteOffset, size_t ByteCount) {
+    return make_error("copyD2D not implemented for platform " + getName());
   }
 
   /// Blocks the host until the given stream completes all the work enqueued up
@@ -101,6 +110,80 @@
     return make_error("blockHostUntilDone not implemented for platform " +
                       getName());
   }
+
+  /// Allocates untyped device memory of a given size in bytes.
+  virtual Expected<GlobalDeviceMemoryBase>
+  allocateDeviceMemory(size_t ByteCount) {
+    return make_error("allocateDeviceMemory not implemented for platform " +
+                      getName());
+  }
+
+  /// Frees device memory previously allocated by allocateDeviceMemory.
+  virtual Error freeDeviceMemory(GlobalDeviceMemoryBase Memory) {
+    return make_error("freeDeviceMemory not implemented for platform " +
+                      getName());
+  }
+
+  /// Allocates untyped host memory of a given size in bytes.
+  ///
+  /// Host memory allocated via this method is suitable for use with copyH2D and
+  /// copyD2H.
+  virtual Expected<void *> allocateHostMemory(size_t ByteCount) {
+    return make_error("allocateHostMemory not implemented for platform " +
+                      getName());
+  }
+
+  /// Frees host memory allocated by allocateHostMemory.
+  virtual Error freeHostMemory(void *Memory) {
+    return make_error("freeHostMemory not implemented for platform " +
+                      getName());
+  }
+
+  /// Registers previously allocated host memory so it can be used with copyH2D
+  /// and copyD2H.
+  virtual Error registerHostMemory(void *Memory, size_t ByteCount) {
+    return make_error("registerHostMemory not implemented for platform " +
+                      getName());
+  }
+
+  /// Unregisters host memory previously registered with registerHostMemory.
+  virtual Error unregisterHostMemory(void *Memory) {
+    return make_error("unregisterHostMemory not implemented for platform " +
+                      getName());
+  }
+
+  /// Copies the given number of bytes from device memory to host memory.
+  ///
+  /// Blocks the calling host thread until the copy is completed. Can operate on
+  /// any host memory, not just registered host memory or host memory allocated
+  /// by allocateHostMemory. Does not block any ongoing device calls.
+  virtual Error synchronousCopyD2H(const GlobalDeviceMemoryBase &DeviceSrc,
+                                   size_t SrcByteOffset, void *HostDst,
+                                   size_t DstByteOffset, size_t ByteCount) {
+    return make_error("synchronousCopyD2H not implemented for platform " +
+                      getName());
+  }
+
+  /// Similar to synchronousCopyD2H(const GlobalDeviceMemoryBase &, size_t, void
+  /// *, size_t, size_t), but copies memory from host to device rather than
+  /// device to host.
+  virtual Error synchronousCopyH2D(const void *HostSrc, size_t SrcByteOffset,
+                                   GlobalDeviceMemoryBase DeviceDst,
+                                   size_t DstByteOffset, size_t ByteCount) {
+    return make_error("synchronousCopyH2D not implemented for platform " +
+                      getName());
+  }
+
+  /// Similar to synchronousCopyD2H(const GlobalDeviceMemoryBase &, size_t, void
+  /// *, size_t, size_t), but copies memory from one location in device memory
+  /// to another rather than from device to host.
+  virtual Error synchronousCopyD2D(GlobalDeviceMemoryBase DeviceDst,
+                                   size_t DstByteOffset,
+                                   const GlobalDeviceMemoryBase &DeviceSrc,
+                                   size_t SrcByteOffset, size_t ByteCount) {
+    return make_error("synchronousCopyD2D not implemented for platform " +
+                      getName());
+  }
 };
 
 } // namespace streamexecutor
diff --git a/streamexecutor/include/streamexecutor/Stream.h b/streamexecutor/include/streamexecutor/Stream.h
index ba126fa..87a2c7c 100644
--- a/streamexecutor/include/streamexecutor/Stream.h
+++ b/streamexecutor/include/streamexecutor/Stream.h
@@ -17,7 +17,7 @@
 /// The Stream instance will perform its work on the device managed by the
 /// Executor that created it.
 ///
-/// The various "then" methods of the Stream object, such as thenMemcpyH2D and
+/// The various "then" methods of the Stream object, such as thenCopyH2D and
 /// thenLaunch, may be used to enqueue work on the Stream, and the
 /// blockHostUntilDone() method may be used to block the host code until the
 /// Stream has completed all its work.
@@ -99,102 +99,262 @@
     return *this;
   }
 
-  /// Entrain onto the stream a memcpy of a given number of elements from a
-  /// device source to a host destination.
+  /// Enqueues on this stream a command to copy a slice of an array of elements
+  /// of type T from device to host memory.
   ///
-  /// HostDst must be a pointer to host memory allocated by
-  /// Executor::allocateHostMemory or otherwise allocated and then
-  /// registered with Executor::registerHostMemory.
+  /// Sets an error if ElementCount is too large for the source or the
+  /// destination.
+  ///
+  /// If the Src memory was not created by allocateHostMemory or registered with
+  /// registerHostMemory, then the copy operation may cause the host and device
+  /// to block until the copy operation is completed.
   template <typename T>
-  Stream &thenMemcpyD2H(const GlobalDeviceMemory<T> &DeviceSrc,
-                        llvm::MutableArrayRef<T> HostDst, size_t ElementCount) {
-    if (ElementCount > DeviceSrc.getElementCount())
+  Stream &thenCopyD2H(GlobalDeviceMemorySlice<T> Src,
+                      llvm::MutableArrayRef<T> Dst, size_t ElementCount) {
+    if (ElementCount > Src.getElementCount())
       setError("copying too many elements, " + llvm::Twine(ElementCount) +
-               ", from device memory array of size " +
-               llvm::Twine(DeviceSrc.getElementCount()));
-    else if (ElementCount > HostDst.size())
+               ", from a device array of element count " +
+               llvm::Twine(Src.getElementCount()));
+    else if (ElementCount > Dst.size())
       setError("copying too many elements, " + llvm::Twine(ElementCount) +
-               ", to host array of size " + llvm::Twine(HostDst.size()));
+               ", to a host array of element count " + llvm::Twine(Dst.size()));
     else
-      setError(PExecutor->memcpyD2H(ThePlatformStream.get(), DeviceSrc,
-                                    HostDst.data(), ElementCount * sizeof(T)));
+      setError(PExecutor->copyD2H(ThePlatformStream.get(), Src.getBaseMemory(),
+                                  Src.getElementOffset() * sizeof(T),
+                                  Dst.data(), 0, ElementCount * sizeof(T)));
     return *this;
   }
 
-  /// Same as thenMemcpyD2H above, but copies the entire source to the
-  /// destination.
-  template <typename T>
-  Stream &thenMemcpyD2H(const GlobalDeviceMemory<T> &DeviceSrc,
-                        llvm::MutableArrayRef<T> HostDst) {
-    return thenMemcpyD2H(DeviceSrc, HostDst, DeviceSrc.getElementCount());
-  }
-
-  /// Entrain onto the stream a memcpy of a given number of elements from a host
-  /// source to a device destination.
+  /// Similar to thenCopyD2H(GlobalDeviceMemorySlice<T>,
+  /// llvm::MutableArrayRef<T>, size_t) but does not take an element count
+  /// argument because it copies the entire source array.
   ///
-  /// HostSrc must be a pointer to host memory allocated by
-  /// Executor::allocateHostMemory or otherwise allocated and then
-  /// registered with Executor::registerHostMemory.
+  /// Sets an error if the Src and Dst sizes do not match.
   template <typename T>
-  Stream &thenMemcpyH2D(llvm::ArrayRef<T> HostSrc,
-                        GlobalDeviceMemory<T> *DeviceDst, size_t ElementCount) {
-    if (ElementCount > HostSrc.size())
-      setError("copying too many elements, " + llvm::Twine(ElementCount) +
-               ", from host array of size " + llvm::Twine(HostSrc.size()));
-    else if (ElementCount > DeviceDst->getElementCount())
-      setError("copying too many elements, " + llvm::Twine(ElementCount) +
-               ", to device memory array of size " +
-               llvm::Twine(DeviceDst->getElementCount()));
+  Stream &thenCopyD2H(GlobalDeviceMemorySlice<T> Src,
+                      llvm::MutableArrayRef<T> Dst) {
+    if (Src.getElementCount() != Dst.size())
+      setError("array size mismatch for D2H, device source has element count " +
+               llvm::Twine(Src.getElementCount()) +
+               " but host destination has element count " +
+               llvm::Twine(Dst.size()));
     else
-      setError(PExecutor->memcpyH2D(ThePlatformStream.get(), HostSrc.data(),
-                                    DeviceDst, ElementCount * sizeof(T)));
+      thenCopyD2H(Src, Dst, Src.getElementCount());
     return *this;
   }
 
-  /// Same as thenMemcpyH2D above, but copies the entire source to the
-  /// destination.
+  /// Similar to thenCopyD2H(GlobalDeviceMemorySlice<T>,
+  /// llvm::MutableArrayRef<T>, size_t) but copies to a pointer rather than an
+  /// llvm::MutableArrayRef.
+  ///
+  /// Sets an error if ElementCount is too large for the source slice.
   template <typename T>
-  Stream &thenMemcpyH2D(llvm::ArrayRef<T> HostSrc,
-                        GlobalDeviceMemory<T> *DeviceDst) {
-    return thenMemcpyH2D(HostSrc, DeviceDst, HostSrc.size());
-  }
-
-  /// Entrain onto the stream a memcpy of a given number of elements from a
-  /// device source to a device destination.
-  template <typename T>
-  Stream &thenMemcpyD2D(const GlobalDeviceMemory<T> &DeviceSrc,
-                        GlobalDeviceMemory<T> *DeviceDst, size_t ElementCount) {
-    if (ElementCount > DeviceSrc.getElementCount())
-      setError("copying too many elements, " + llvm::Twine(ElementCount) +
-               ", from device memory array of size " +
-               llvm::Twine(DeviceSrc.getElementCount()));
-    else if (ElementCount > DeviceDst->getElementCount())
-      setError("copying too many elements, " + llvm::Twine(ElementCount) +
-               ", to device memory array of size " +
-               llvm::Twine(DeviceDst->getElementCount()));
-    else
-      setError(PExecutor->memcpyD2D(ThePlatformStream.get(), DeviceSrc,
-                                    DeviceDst, ElementCount * sizeof(T)));
+  Stream &thenCopyD2H(GlobalDeviceMemorySlice<T> Src, T *Dst,
+                      size_t ElementCount) {
+    thenCopyD2H(Src, llvm::MutableArrayRef<T>(Dst, ElementCount), ElementCount);
     return *this;
   }
 
-  /// Same as thenMemcpyD2D above, but copies the entire source to the
-  /// destination.
+  /// Similar to thenCopyD2H(GlobalDeviceMemorySlice<T>,
+  /// llvm::MutableArrayRef<T>, size_t) but the source is a GlobalDeviceMemory
+  /// rather than a GlobalDeviceMemorySlice.
   template <typename T>
-  Stream &thenMemcpyD2D(const GlobalDeviceMemory<T> &DeviceSrc,
-                        GlobalDeviceMemory<T> *DeviceDst) {
-    return thenMemcpyD2D(DeviceSrc, DeviceDst, DeviceSrc.getElementCount());
+  Stream &thenCopyD2H(GlobalDeviceMemory<T> Src, llvm::MutableArrayRef<T> Dst,
+                      size_t ElementCount) {
+    thenCopyD2H(Src.asSlice(), Dst, ElementCount);
+    return *this;
   }
 
-  /// Blocks the host code, waiting for the operations entrained on the stream
-  /// (enqueued up to this point in program execution) to complete.
+  /// Similar to thenCopyD2H(GlobalDeviceMemorySlice<T>,
+  /// llvm::MutableArrayRef<T>) but the source is a GlobalDeviceMemory rather
+  /// than a GlobalDeviceMemorySlice.
+  template <typename T>
+  Stream &thenCopyD2H(GlobalDeviceMemory<T> Src, llvm::MutableArrayRef<T> Dst) {
+    thenCopyD2H(Src.asSlice(), Dst);
+    return *this;
+  }
+
+  /// Similar to thenCopyD2H(GlobalDeviceMemorySlice<T>, T*, size_t) but the
+  /// source is a GlobalDeviceMemory rather than a GlobalDeviceMemorySlice.
+  template <typename T>
+  Stream &thenCopyD2H(GlobalDeviceMemory<T> Src, T *Dst, size_t ElementCount) {
+    thenCopyD2H(Src.asSlice(), Dst, ElementCount);
+    return *this;
+  }
+
+  /// Similar to thenCopyD2H(GlobalDeviceMemorySlice<T>,
+  /// llvm::MutableArrayRef<T>, size_t) but copies from host to device memory
+  /// rather than device to host memory.
+  template <typename T>
+  Stream &thenCopyH2D(llvm::ArrayRef<T> Src, GlobalDeviceMemorySlice<T> Dst,
+                      size_t ElementCount) {
+    if (ElementCount > Src.size())
+      setError("copying too many elements, " + llvm::Twine(ElementCount) +
+               ", from a host array of element count " +
+               llvm::Twine(Src.size()));
+    else if (ElementCount > Dst.getElementCount())
+      setError("copying too many elements, " + llvm::Twine(ElementCount) +
+               ", to a device array of element count " +
+               llvm::Twine(Dst.getElementCount()));
+    else
+      setError(PExecutor->copyH2D(
+          ThePlatformStream.get(), Src.data(), 0, Dst.getBaseMemory(),
+          Dst.getElementOffset() * sizeof(T), ElementCount * sizeof(T)));
+    return *this;
+  }
+
+  /// Similar to thenCopyH2D(llvm::ArrayRef<T>, GlobalDeviceMemorySlice<T>,
+  /// size_t) but does not take an element count argument because it copies the
+  /// entire source array.
   ///
-  /// Returns true if there are no errors on the stream.
-  bool blockHostUntilDone() {
-    Error E = PExecutor->blockHostUntilDone(ThePlatformStream.get());
-    bool returnValue = static_cast<bool>(E);
-    setError(std::move(E));
-    return returnValue;
+  /// Sets an error if the Src and Dst sizes do not match.
+  template <typename T>
+  Stream &thenCopyH2D(llvm::ArrayRef<T> Src, GlobalDeviceMemorySlice<T> Dst) {
+    if (Src.size() != Dst.getElementCount())
+      setError("array size mismatch for H2D, host source has element count " +
+               llvm::Twine(Src.size()) +
+               " but device destination has element count " +
+               llvm::Twine(Dst.getElementCount()));
+    else
+      thenCopyH2D(Src, Dst, Dst.getElementCount());
+    return *this;
+  }
+
+  /// Similar to thenCopyH2D(llvm::ArrayRef<T>, GlobalDeviceMemorySlice<T>,
+  /// size_t) but copies from a pointer rather than an llvm::ArrayRef.
+  ///
+  /// Sets an error if ElementCount is too large for the destination.
+  template <typename T>
+  Stream &thenCopyH2D(T *Src, GlobalDeviceMemorySlice<T> Dst,
+                      size_t ElementCount) {
+    thenCopyH2D(llvm::ArrayRef<T>(Src, ElementCount), Dst, ElementCount);
+    return *this;
+  }
+
+  /// Similar to thenCopyH2D(llvm::ArrayRef<T>, GlobalDeviceMemorySlice<T>,
+  /// size_t) but the destination is a GlobalDeviceMemory rather than a
+  /// GlobalDeviceMemorySlice.
+  template <typename T>
+  Stream &thenCopyH2D(llvm::ArrayRef<T> Src, GlobalDeviceMemory<T> Dst,
+                      size_t ElementCount) {
+    thenCopyH2D(Src, Dst.asSlice(), ElementCount);
+    return *this;
+  }
+
+  /// Similar to thenCopyH2D(llvm::ArrayRef<T>, GlobalDeviceMemorySlice<T>) but
+  /// the destination is a GlobalDeviceMemory rather than a
+  /// GlobalDeviceMemorySlice.
+  template <typename T>
+  Stream &thenCopyH2D(llvm::ArrayRef<T> Src, GlobalDeviceMemory<T> Dst) {
+    thenCopyH2D(Src, Dst.asSlice());
+    return *this;
+  }
+
+  /// Similar to thenCopyH2D(T*, GlobalDeviceMemorySlice<T>, size_t) but the
+  /// destination is a GlobalDeviceMemory rather than a GlobalDeviceMemorySlice.
+  template <typename T>
+  Stream &thenCopyH2D(T *Src, GlobalDeviceMemory<T> Dst, size_t ElementCount) {
+    thenCopyH2D(Src, Dst.asSlice(), ElementCount);
+    return *this;
+  }
+
+  /// Similar to thenCopyD2H(GlobalDeviceMemorySlice<T>,
+  /// llvm::MutableArrayRef<T>, size_t) but copies from one location in device
+  /// memory to another rather than from device to host memory.
+  template <typename T>
+  Stream &thenCopyD2D(GlobalDeviceMemorySlice<T> Src,
+                      GlobalDeviceMemorySlice<T> Dst, size_t ElementCount) {
+    if (ElementCount > Src.getElementCount())
+      setError("copying too many elements, " + llvm::Twine(ElementCount) +
+               ", from a device array of element count " +
+               llvm::Twine(Src.getElementCount()));
+    else if (ElementCount > Dst.getElementCount())
+      setError("copying too many elements, " + llvm::Twine(ElementCount) +
+               ", to a device array of element count " +
+               llvm::Twine(Dst.getElementCount()));
+    else
+      setError(PExecutor->copyD2D(
+          ThePlatformStream.get(), Src.getBaseMemory(),
+          Src.getElementOffset() * sizeof(T), Dst.getBaseMemory(),
+          Dst.getElementOffset() * sizeof(T), ElementCount * sizeof(T)));
+    return *this;
+  }
+
+  /// Similar to thenCopyD2D(GlobalDeviceMemorySlice<T>,
+  /// GlobalDeviceMemorySlice<T>, size_t) but does not take an element count
+  /// argument because it copies the entire source array.
+  ///
+  /// Sets an error if the Src and Dst sizes do not match.
+  template <typename T>
+  Stream &thenCopyD2D(GlobalDeviceMemorySlice<T> Src,
+                      GlobalDeviceMemorySlice<T> Dst) {
+    if (Src.getElementCount() != Dst.getElementCount())
+      setError("array size mismatch for D2D, device source has element count " +
+               llvm::Twine(Src.getElementCount()) +
+               " but device destination has element count " +
+               llvm::Twine(Dst.getElementCount()));
+    else
+      thenCopyD2D(Src, Dst, Src.getElementCount());
+    return *this;
+  }
+
+  /// Similar to thenCopyD2D(GlobalDeviceMemorySlice<T>,
+  /// GlobalDeviceMemorySlice<T>, size_t) but the source is a
+  /// GlobalDeviceMemory<T> rather than a GlobalDeviceMemorySlice<T>.
+  template <typename T>
+  Stream &thenCopyD2D(GlobalDeviceMemory<T> Src, GlobalDeviceMemorySlice<T> Dst,
+                      size_t ElementCount) {
+    thenCopyD2D(Src.asSlice(), Dst, ElementCount);
+    return *this;
+  }
+
+  /// Similar to thenCopyD2D(GlobalDeviceMemorySlice<T>,
+  /// GlobalDeviceMemorySlice<T>) but the source is a GlobalDeviceMemory<T>
+  /// rather than a GlobalDeviceMemorySlice<T>.
+  template <typename T>
+  Stream &thenCopyD2D(GlobalDeviceMemory<T> Src,
+                      GlobalDeviceMemorySlice<T> Dst) {
+    thenCopyD2D(Src.asSlice(), Dst);
+    return *this;
+  }
+
+  /// Similar to thenCopyD2D(GlobalDeviceMemorySlice<T>,
+  /// GlobalDeviceMemorySlice<T>, size_t) but the destination is a
+  /// GlobalDeviceMemory<T> rather than a GlobalDeviceMemorySlice<T>.
+  template <typename T>
+  Stream &thenCopyD2D(GlobalDeviceMemorySlice<T> Src, GlobalDeviceMemory<T> Dst,
+                      size_t ElementCount) {
+    thenCopyD2D(Src, Dst.asSlice(), ElementCount);
+    return *this;
+  }
+
+  /// Similar to thenCopyD2D(GlobalDeviceMemorySlice<T>,
+  /// GlobalDeviceMemorySlice<T>) but the destination is a GlobalDeviceMemory<T>
+  /// rather than a GlobalDeviceMemorySlice<T>.
+  template <typename T>
+  Stream &thenCopyD2D(GlobalDeviceMemorySlice<T> Src,
+                      GlobalDeviceMemory<T> Dst) {
+    thenCopyD2D(Src, Dst.asSlice());
+    return *this;
+  }
+
+  /// Similar to thenCopyD2D(GlobalDeviceMemorySlice<T>,
+  /// GlobalDeviceMemorySlice<T>, size_t) but the source and destination are
+  /// GlobalDeviceMemory<T> rather than a GlobalDeviceMemorySlice<T>.
+  template <typename T>
+  Stream &thenCopyD2D(GlobalDeviceMemory<T> Src, GlobalDeviceMemory<T> Dst,
+                      size_t ElementCount) {
+    thenCopyD2D(Src.asSlice(), Dst.asSlice(), ElementCount);
+    return *this;
+  }
+
+  /// Similar to thenCopyD2D(GlobalDeviceMemorySlice<T>,
+  /// GlobalDeviceMemorySlice<T>) but the source and destination are
+  /// GlobalDeviceMemory<T> rather than a GlobalDeviceMemorySlice<T>.
+  template <typename T>
+  Stream &thenCopyD2D(GlobalDeviceMemory<T> Src, GlobalDeviceMemory<T> Dst) {
+    thenCopyD2D(Src.asSlice(), Dst.asSlice());
+    return *this;
   }
 
 private:
diff --git a/streamexecutor/include/streamexecutor/Utils/Error.h b/streamexecutor/include/streamexecutor/Utils/Error.h
index 3811847..e7b313e 100644
--- a/streamexecutor/include/streamexecutor/Utils/Error.h
+++ b/streamexecutor/include/streamexecutor/Utils/Error.h
@@ -30,7 +30,7 @@
 /// }
 /// \endcode
 ///
-/// Error instances are implicitly convertable to bool. Error values convert to
+/// Error instances are implicitly convertible to bool. Error values convert to
 /// true and successes convert to false. Error instances must have their boolean
 /// values checked or they must be moved before they go out of scope, otherwise
 /// their destruction will cause the program to abort with a warning about an
@@ -169,10 +169,10 @@
 using llvm::consumeError;
 using llvm::Error;
 using llvm::Expected;
-using llvm::StringRef;
+using llvm::Twine;
 
 // Makes an Error object from an error message.
-Error make_error(StringRef Message);
+Error make_error(Twine Message);
 
 // Consumes the input error and returns its error message.
 //
diff --git a/streamexecutor/lib/Utils/Error.cpp b/streamexecutor/lib/Utils/Error.cpp
index 78912c5..f3d0967 100644
--- a/streamexecutor/lib/Utils/Error.cpp
+++ b/streamexecutor/lib/Utils/Error.cpp
@@ -27,7 +27,7 @@
 
   std::error_code convertToErrorCode() const override {
     llvm_unreachable(
-        "StreamExecutorError does not support convertion to std::error_code");
+        "StreamExecutorError does not support conversion to std::error_code");
   }
 
   std::string getErrorMessage() const { return Message; }
@@ -44,8 +44,8 @@
 
 namespace streamexecutor {
 
-Error make_error(StringRef Message) {
-  return llvm::make_error<StreamExecutorError>(Message);
+Error make_error(Twine Message) {
+  return llvm::make_error<StreamExecutorError>(Message.str());
 }
 
 std::string consumeAndGetMessage(Error &&E) {
diff --git a/streamexecutor/lib/unittests/CMakeLists.txt b/streamexecutor/lib/unittests/CMakeLists.txt
index f6e6edb..244312f 100644
--- a/streamexecutor/lib/unittests/CMakeLists.txt
+++ b/streamexecutor/lib/unittests/CMakeLists.txt
@@ -1,4 +1,14 @@
 add_executable(
+    executor_test
+    ExecutorTest.cpp)
+target_link_libraries(
+    executor_test
+    streamexecutor
+    ${GTEST_BOTH_LIBRARIES}
+    ${CMAKE_THREAD_LIBS_INIT})
+add_test(ExecutorTest executor_test)
+
+add_executable(
     kernel_test
     KernelTest.cpp)
 target_link_libraries(
diff --git a/streamexecutor/lib/unittests/ExecutorTest.cpp b/streamexecutor/lib/unittests/ExecutorTest.cpp
new file mode 100644
index 0000000..d2d03fb
--- /dev/null
+++ b/streamexecutor/lib/unittests/ExecutorTest.cpp
@@ -0,0 +1,451 @@
+//===-- ExecutorTest.cpp - Tests for Executor -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the unit tests for Executor code.
+///
+//===----------------------------------------------------------------------===//
+
+#include <cstdlib>
+#include <cstring>
+
+#include "streamexecutor/Executor.h"
+#include "streamexecutor/PlatformInterfaces.h"
+
+#include "gtest/gtest.h"
+
+namespace {
+
+namespace se = ::streamexecutor;
+
+class MockPlatformExecutor : public se::PlatformExecutor {
+public:
+  ~MockPlatformExecutor() override {}
+
+  std::string getName() const override { return "MockPlatformExecutor"; }
+
+  se::Expected<std::unique_ptr<se::PlatformStreamHandle>>
+  createStream() override {
+    return se::make_error("not implemented");
+  }
+
+  se::Expected<se::GlobalDeviceMemoryBase>
+  allocateDeviceMemory(size_t ByteCount) override {
+    return se::GlobalDeviceMemoryBase(std::malloc(ByteCount));
+  }
+
+  se::Error freeDeviceMemory(se::GlobalDeviceMemoryBase Memory) override {
+    std::free(const_cast<void *>(Memory.getHandle()));
+    return se::Error::success();
+  }
+
+  se::Expected<void *> allocateHostMemory(size_t ByteCount) override {
+    return std::malloc(ByteCount);
+  }
+
+  se::Error freeHostMemory(void *Memory) override {
+    std::free(Memory);
+    return se::Error::success();
+  }
+
+  se::Error synchronousCopyD2H(const se::GlobalDeviceMemoryBase &DeviceSrc,
+                               size_t SrcByteOffset, void *HostDst,
+                               size_t DstByteOffset,
+                               size_t ByteCount) override {
+    std::memcpy(static_cast<char *>(HostDst) + DstByteOffset,
+                static_cast<const char *>(DeviceSrc.getHandle()) +
+                    SrcByteOffset,
+                ByteCount);
+    return se::Error::success();
+  }
+
+  se::Error synchronousCopyH2D(const void *HostSrc, size_t SrcByteOffset,
+                               se::GlobalDeviceMemoryBase DeviceDst,
+                               size_t DstByteOffset,
+                               size_t ByteCount) override {
+    std::memcpy(static_cast<char *>(const_cast<void *>(DeviceDst.getHandle())) +
+                    DstByteOffset,
+                static_cast<const char *>(HostSrc) + SrcByteOffset, ByteCount);
+    return se::Error::success();
+  }
+
+  se::Error synchronousCopyD2D(se::GlobalDeviceMemoryBase DeviceDst,
+                               size_t DstByteOffset,
+                               const se::GlobalDeviceMemoryBase &DeviceSrc,
+                               size_t SrcByteOffset,
+                               size_t ByteCount) override {
+    std::memcpy(static_cast<char *>(const_cast<void *>(DeviceDst.getHandle())) +
+                    DstByteOffset,
+                static_cast<const char *>(DeviceSrc.getHandle()) +
+                    SrcByteOffset,
+                ByteCount);
+    return se::Error::success();
+  }
+};
+
+/// Test fixture to hold objects used by tests.
+class ExecutorTest : public ::testing::Test {
+public:
+  ExecutorTest()
+      : HostA5{0, 1, 2, 3, 4}, HostB5{5, 6, 7, 8, 9},
+        HostA7{10, 11, 12, 13, 14, 15, 16}, HostB7{17, 18, 19, 20, 21, 22, 23},
+        DeviceA5(se::GlobalDeviceMemory<int>::makeFromElementCount(HostA5, 5)),
+        DeviceB5(se::GlobalDeviceMemory<int>::makeFromElementCount(HostB5, 5)),
+        DeviceA7(se::GlobalDeviceMemory<int>::makeFromElementCount(HostA7, 7)),
+        DeviceB7(se::GlobalDeviceMemory<int>::makeFromElementCount(HostB7, 7)),
+        Host5{24, 25, 26, 27, 28}, Host7{29, 30, 31, 32, 33, 34, 35},
+        Executor(&PExecutor) {}
+
+  // Device memory is backed by host arrays.
+  int HostA5[5];
+  int HostB5[5];
+  int HostA7[7];
+  int HostB7[7];
+  se::GlobalDeviceMemory<int> DeviceA5;
+  se::GlobalDeviceMemory<int> DeviceB5;
+  se::GlobalDeviceMemory<int> DeviceA7;
+  se::GlobalDeviceMemory<int> DeviceB7;
+
+  // Host memory to be used as actual host memory.
+  int Host5[5];
+  int Host7[7];
+
+  MockPlatformExecutor PExecutor;
+  se::Executor Executor;
+};
+
+#define EXPECT_NO_ERROR(E) EXPECT_FALSE(static_cast<bool>(E))
+#define EXPECT_ERROR(E)                                                        \
+  do {                                                                         \
+    se::Error E__ = E;                                                         \
+    EXPECT_TRUE(static_cast<bool>(E__));                                       \
+    consumeError(std::move(E__));                                              \
+  } while (false)
+
+using llvm::ArrayRef;
+using llvm::MutableArrayRef;
+
+// D2H tests
+
+TEST_F(ExecutorTest, SyncCopyD2HToMutableArrayRefByCount) {
+  EXPECT_NO_ERROR(
+      Executor.synchronousCopyD2H(DeviceA5, MutableArrayRef<int>(Host5), 5));
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], Host5[I]);
+  }
+
+  EXPECT_NO_ERROR(
+      Executor.synchronousCopyD2H(DeviceB5, MutableArrayRef<int>(Host5), 2));
+  for (int I = 0; I < 2; ++I) {
+    EXPECT_EQ(HostB5[I], Host5[I]);
+  }
+
+  EXPECT_ERROR(
+      Executor.synchronousCopyD2H(DeviceA7, MutableArrayRef<int>(Host5), 7));
+
+  EXPECT_ERROR(
+      Executor.synchronousCopyD2H(DeviceA5, MutableArrayRef<int>(Host7), 7));
+
+  EXPECT_ERROR(
+      Executor.synchronousCopyD2H(DeviceA5, MutableArrayRef<int>(Host5), 7));
+}
+
+TEST_F(ExecutorTest, SyncCopyD2HToMutableArrayRef) {
+  EXPECT_NO_ERROR(
+      Executor.synchronousCopyD2H(DeviceA5, MutableArrayRef<int>(Host5)));
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], Host5[I]);
+  }
+
+  EXPECT_ERROR(
+      Executor.synchronousCopyD2H(DeviceA7, MutableArrayRef<int>(Host5)));
+
+  EXPECT_ERROR(
+      Executor.synchronousCopyD2H(DeviceA5, MutableArrayRef<int>(Host7)));
+}
+
+TEST_F(ExecutorTest, SyncCopyD2HToPointer) {
+  EXPECT_NO_ERROR(Executor.synchronousCopyD2H(DeviceA5, Host5, 5));
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], Host5[I]);
+  }
+
+  EXPECT_ERROR(Executor.synchronousCopyD2H(DeviceA5, Host7, 7));
+}
+
+TEST_F(ExecutorTest, SyncCopyD2HSliceToMutableArrayRefByCount) {
+  EXPECT_NO_ERROR(Executor.synchronousCopyD2H(
+      DeviceA5.asSlice().drop_front(1), MutableArrayRef<int>(Host5 + 1, 4), 4));
+  for (int I = 1; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], Host5[I]);
+  }
+
+  EXPECT_NO_ERROR(Executor.synchronousCopyD2H(DeviceB5.asSlice().drop_back(1),
+                                              MutableArrayRef<int>(Host5), 2));
+  for (int I = 0; I < 2; ++I) {
+    EXPECT_EQ(HostB5[I], Host5[I]);
+  }
+
+  EXPECT_ERROR(Executor.synchronousCopyD2H(DeviceA7.asSlice(),
+                                           MutableArrayRef<int>(Host5), 7));
+
+  EXPECT_ERROR(Executor.synchronousCopyD2H(DeviceA5.asSlice(),
+                                           MutableArrayRef<int>(Host7), 7));
+
+  EXPECT_ERROR(Executor.synchronousCopyD2H(DeviceA5.asSlice(),
+                                           MutableArrayRef<int>(Host5), 7));
+}
+
+TEST_F(ExecutorTest, SyncCopyD2HSliceToMutableArrayRef) {
+  EXPECT_NO_ERROR(Executor.synchronousCopyD2H(DeviceA7.asSlice().slice(1, 5),
+                                              MutableArrayRef<int>(Host5)));
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA7[I + 1], Host5[I]);
+  }
+
+  EXPECT_ERROR(Executor.synchronousCopyD2H(DeviceA7.asSlice().drop_back(1),
+                                           MutableArrayRef<int>(Host5)));
+
+  EXPECT_ERROR(Executor.synchronousCopyD2H(DeviceA5.asSlice(),
+                                           MutableArrayRef<int>(Host7)));
+}
+
+TEST_F(ExecutorTest, SyncCopyD2HSliceToPointer) {
+  EXPECT_NO_ERROR(Executor.synchronousCopyD2H(DeviceA5.asSlice().drop_front(1),
+                                              Host5 + 1, 4));
+  for (int I = 1; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], Host5[I]);
+  }
+
+  EXPECT_ERROR(Executor.synchronousCopyD2H(DeviceA5.asSlice(), Host7, 7));
+}
+
+// H2D tests
+
+TEST_F(ExecutorTest, SyncCopyH2DToArrayRefByCount) {
+  EXPECT_NO_ERROR(
+      Executor.synchronousCopyH2D(ArrayRef<int>(Host5), DeviceA5, 5));
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], Host5[I]);
+  }
+
+  EXPECT_NO_ERROR(
+      Executor.synchronousCopyH2D(ArrayRef<int>(Host5), DeviceB5, 2));
+  for (int I = 0; I < 2; ++I) {
+    EXPECT_EQ(HostB5[I], Host5[I]);
+  }
+
+  EXPECT_ERROR(Executor.synchronousCopyH2D(ArrayRef<int>(Host7), DeviceA5, 7));
+
+  EXPECT_ERROR(Executor.synchronousCopyH2D(ArrayRef<int>(Host5), DeviceA7, 7));
+
+  EXPECT_ERROR(Executor.synchronousCopyH2D(ArrayRef<int>(Host5), DeviceA5, 7));
+}
+
+TEST_F(ExecutorTest, SyncCopyH2DToArrayRef) {
+  EXPECT_NO_ERROR(Executor.synchronousCopyH2D(ArrayRef<int>(Host5), DeviceA5));
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], Host5[I]);
+  }
+
+  EXPECT_ERROR(Executor.synchronousCopyH2D(ArrayRef<int>(Host5), DeviceA7));
+
+  EXPECT_ERROR(Executor.synchronousCopyH2D(ArrayRef<int>(Host7), DeviceA5));
+}
+
+TEST_F(ExecutorTest, SyncCopyH2DToPointer) {
+  EXPECT_NO_ERROR(Executor.synchronousCopyH2D(Host5, DeviceA5, 5));
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], Host5[I]);
+  }
+
+  EXPECT_ERROR(Executor.synchronousCopyH2D(Host7, DeviceA5, 7));
+}
+
+TEST_F(ExecutorTest, SyncCopyH2DSliceToArrayRefByCount) {
+  EXPECT_NO_ERROR(Executor.synchronousCopyH2D(
+      ArrayRef<int>(Host5 + 1, 4), DeviceA5.asSlice().drop_front(1), 4));
+  for (int I = 1; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], Host5[I]);
+  }
+
+  EXPECT_NO_ERROR(Executor.synchronousCopyH2D(
+      ArrayRef<int>(Host5), DeviceB5.asSlice().drop_back(1), 2));
+  for (int I = 0; I < 2; ++I) {
+    EXPECT_EQ(HostB5[I], Host5[I]);
+  }
+
+  EXPECT_ERROR(
+      Executor.synchronousCopyH2D(ArrayRef<int>(Host7), DeviceA5.asSlice(), 7));
+
+  EXPECT_ERROR(
+      Executor.synchronousCopyH2D(ArrayRef<int>(Host5), DeviceA7.asSlice(), 7));
+
+  EXPECT_ERROR(
+      Executor.synchronousCopyH2D(ArrayRef<int>(Host5), DeviceA5.asSlice(), 7));
+}
+
+TEST_F(ExecutorTest, SyncCopyH2DSliceToArrayRef) {
+  EXPECT_NO_ERROR(
+      Executor.synchronousCopyH2D(ArrayRef<int>(Host5), DeviceA5.asSlice()));
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], Host5[I]);
+  }
+
+  EXPECT_ERROR(
+      Executor.synchronousCopyH2D(ArrayRef<int>(Host5), DeviceA7.asSlice()));
+
+  EXPECT_ERROR(
+      Executor.synchronousCopyH2D(ArrayRef<int>(Host7), DeviceA5.asSlice()));
+}
+
+TEST_F(ExecutorTest, SyncCopyH2DSliceToPointer) {
+  EXPECT_NO_ERROR(Executor.synchronousCopyH2D(Host5, DeviceA5.asSlice(), 5));
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], Host5[I]);
+  }
+
+  EXPECT_ERROR(Executor.synchronousCopyH2D(Host7, DeviceA5.asSlice(), 7));
+}
+
+// D2D tests
+
+TEST_F(ExecutorTest, SyncCopyD2DByCount) {
+  EXPECT_NO_ERROR(Executor.synchronousCopyD2D(DeviceA5, DeviceB5, 5));
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], HostB5[I]);
+  }
+
+  EXPECT_NO_ERROR(Executor.synchronousCopyD2D(DeviceA7, DeviceB7, 2));
+  for (int I = 0; I < 2; ++I) {
+    EXPECT_EQ(HostA7[I], HostB7[I]);
+  }
+
+  EXPECT_ERROR(Executor.synchronousCopyD2D(DeviceA5, DeviceB5, 7));
+
+  EXPECT_ERROR(Executor.synchronousCopyD2D(DeviceA7, DeviceB5, 7));
+
+  EXPECT_ERROR(Executor.synchronousCopyD2D(DeviceA5, DeviceB7, 7));
+}
+
+TEST_F(ExecutorTest, SyncCopyD2D) {
+  EXPECT_NO_ERROR(Executor.synchronousCopyD2D(DeviceA5, DeviceB5));
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], HostB5[I]);
+  }
+
+  EXPECT_ERROR(Executor.synchronousCopyD2D(DeviceA7, DeviceB5));
+
+  EXPECT_ERROR(Executor.synchronousCopyD2D(DeviceA5, DeviceB7));
+}
+
+TEST_F(ExecutorTest, SyncCopySliceD2DByCount) {
+  EXPECT_NO_ERROR(Executor.synchronousCopyD2D(DeviceA5.asSlice().drop_front(1),
+                                              DeviceB5, 4));
+  for (int I = 0; I < 4; ++I) {
+    EXPECT_EQ(HostA5[I + 1], HostB5[I]);
+  }
+
+  EXPECT_NO_ERROR(Executor.synchronousCopyD2D(DeviceA7.asSlice().drop_back(1),
+                                              DeviceB7, 2));
+  for (int I = 0; I < 2; ++I) {
+    EXPECT_EQ(HostA7[I], HostB7[I]);
+  }
+
+  EXPECT_ERROR(Executor.synchronousCopyD2D(DeviceA5.asSlice(), DeviceB5, 7));
+
+  EXPECT_ERROR(Executor.synchronousCopyD2D(DeviceA7.asSlice(), DeviceB5, 7));
+
+  EXPECT_ERROR(Executor.synchronousCopyD2D(DeviceA5.asSlice(), DeviceB7, 7));
+}
+
+TEST_F(ExecutorTest, SyncCopySliceD2D) {
+  EXPECT_NO_ERROR(
+      Executor.synchronousCopyD2D(DeviceA7.asSlice().drop_back(2), DeviceB5));
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA7[I], HostB5[I]);
+  }
+
+  EXPECT_ERROR(
+      Executor.synchronousCopyD2D(DeviceA7.asSlice().drop_front(1), DeviceB5));
+
+  EXPECT_ERROR(
+      Executor.synchronousCopyD2D(DeviceA5.asSlice().drop_back(1), DeviceB7));
+}
+
+TEST_F(ExecutorTest, SyncCopyD2DSliceByCount) {
+  EXPECT_NO_ERROR(Executor.synchronousCopyD2D(
+      DeviceA5, DeviceB7.asSlice().drop_front(2), 5));
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], HostB7[I + 2]);
+  }
+
+  EXPECT_NO_ERROR(Executor.synchronousCopyD2D(
+      DeviceA7, DeviceB7.asSlice().drop_back(3), 2));
+  for (int I = 0; I < 2; ++I) {
+    EXPECT_EQ(HostA7[I], HostB7[I]);
+  }
+
+  EXPECT_ERROR(Executor.synchronousCopyD2D(DeviceA5, DeviceB5.asSlice(), 7));
+
+  EXPECT_ERROR(Executor.synchronousCopyD2D(DeviceA7, DeviceB5.asSlice(), 7));
+
+  EXPECT_ERROR(Executor.synchronousCopyD2D(DeviceA5, DeviceB7.asSlice(), 7));
+}
+
+TEST_F(ExecutorTest, SyncCopyD2DSlice) {
+  EXPECT_NO_ERROR(
+      Executor.synchronousCopyD2D(DeviceA5, DeviceB7.asSlice().drop_back(2)));
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], HostB7[I]);
+  }
+
+  EXPECT_ERROR(Executor.synchronousCopyD2D(DeviceA7, DeviceB5.asSlice()));
+
+  EXPECT_ERROR(Executor.synchronousCopyD2D(DeviceA5, DeviceB7.asSlice()));
+}
+
+TEST_F(ExecutorTest, SyncCopySliceD2DSliceByCount) {
+  EXPECT_NO_ERROR(
+      Executor.synchronousCopyD2D(DeviceA5.asSlice(), DeviceB5.asSlice(), 5));
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], HostB5[I]);
+  }
+
+  EXPECT_NO_ERROR(
+      Executor.synchronousCopyD2D(DeviceA7.asSlice(), DeviceB7.asSlice(), 2));
+  for (int I = 0; I < 2; ++I) {
+    EXPECT_EQ(HostA7[I], HostB7[I]);
+  }
+
+  EXPECT_ERROR(
+      Executor.synchronousCopyD2D(DeviceA5.asSlice(), DeviceB5.asSlice(), 7));
+
+  EXPECT_ERROR(
+      Executor.synchronousCopyD2D(DeviceA7.asSlice(), DeviceB5.asSlice(), 7));
+
+  EXPECT_ERROR(
+      Executor.synchronousCopyD2D(DeviceA5.asSlice(), DeviceB7.asSlice(), 7));
+}
+
+TEST_F(ExecutorTest, SyncCopySliceD2DSlice) {
+  EXPECT_NO_ERROR(
+      Executor.synchronousCopyD2D(DeviceA5.asSlice(), DeviceB5.asSlice()));
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], HostB5[I]);
+  }
+
+  EXPECT_ERROR(
+      Executor.synchronousCopyD2D(DeviceA7.asSlice(), DeviceB5.asSlice()));
+
+  EXPECT_ERROR(
+      Executor.synchronousCopyD2D(DeviceA5.asSlice(), DeviceB7.asSlice()));
+}
+
+} // namespace
diff --git a/streamexecutor/lib/unittests/StreamTest.cpp b/streamexecutor/lib/unittests/StreamTest.cpp
index 6ef2183..7564670 100644
--- a/streamexecutor/lib/unittests/StreamTest.cpp
+++ b/streamexecutor/lib/unittests/StreamTest.cpp
@@ -40,26 +40,34 @@
     return nullptr;
   }
 
-  se::Error memcpyD2H(se::PlatformStreamHandle *,
-                      const se::GlobalDeviceMemoryBase &DeviceSrc,
-                      void *HostDst, size_t ByteCount) override {
-    std::memcpy(HostDst, DeviceSrc.getHandle(), ByteCount);
+  se::Error copyD2H(se::PlatformStreamHandle *S,
+                    const se::GlobalDeviceMemoryBase &DeviceSrc,
+                    size_t SrcByteOffset, void *HostDst, size_t DstByteOffset,
+                    size_t ByteCount) override {
+    std::memcpy(HostDst, static_cast<const char *>(DeviceSrc.getHandle()) +
+                             SrcByteOffset,
+                ByteCount);
     return se::Error::success();
   }
 
-  se::Error memcpyH2D(se::PlatformStreamHandle *, const void *HostSrc,
-                      se::GlobalDeviceMemoryBase *DeviceDst,
-                      size_t ByteCount) override {
-    std::memcpy(const_cast<void *>(DeviceDst->getHandle()), HostSrc, ByteCount);
+  se::Error copyH2D(se::PlatformStreamHandle *S, const void *HostSrc,
+                    size_t SrcByteOffset, se::GlobalDeviceMemoryBase DeviceDst,
+                    size_t DstByteOffset, size_t ByteCount) override {
+    std::memcpy(static_cast<char *>(const_cast<void *>(DeviceDst.getHandle())) +
+                    DstByteOffset,
+                HostSrc, ByteCount);
     return se::Error::success();
   }
 
-  se::Error memcpyD2D(se::PlatformStreamHandle *,
-                      const se::GlobalDeviceMemoryBase &DeviceSrc,
-                      se::GlobalDeviceMemoryBase *DeviceDst,
-                      size_t ByteCount) override {
-    std::memcpy(const_cast<void *>(DeviceDst->getHandle()),
-                DeviceSrc.getHandle(), ByteCount);
+  se::Error copyD2D(se::PlatformStreamHandle *S,
+                    const se::GlobalDeviceMemoryBase &DeviceSrc,
+                    size_t SrcByteOffset, se::GlobalDeviceMemoryBase DeviceDst,
+                    size_t DstByteOffset, size_t ByteCount) override {
+    std::memcpy(static_cast<char *>(const_cast<void *>(DeviceDst.getHandle())) +
+                    DstByteOffset,
+                static_cast<const char *>(DeviceSrc.getHandle()) +
+                    SrcByteOffset,
+                ByteCount);
     return se::Error::success();
   }
 };
@@ -68,47 +76,317 @@
 class StreamTest : public ::testing::Test {
 public:
   StreamTest()
-      : DeviceA(se::GlobalDeviceMemory<int>::makeFromElementCount(HostA, 10)),
-        DeviceB(se::GlobalDeviceMemory<int>::makeFromElementCount(HostB, 10)),
+      : HostA5{0, 1, 2, 3, 4}, HostB5{5, 6, 7, 8, 9},
+        HostA7{10, 11, 12, 13, 14, 15, 16}, HostB7{17, 18, 19, 20, 21, 22, 23},
+        DeviceA5(se::GlobalDeviceMemory<int>::makeFromElementCount(HostA5, 5)),
+        DeviceB5(se::GlobalDeviceMemory<int>::makeFromElementCount(HostB5, 5)),
+        DeviceA7(se::GlobalDeviceMemory<int>::makeFromElementCount(HostA7, 7)),
+        DeviceB7(se::GlobalDeviceMemory<int>::makeFromElementCount(HostB7, 7)),
+        Host5{24, 25, 26, 27, 28}, Host7{29, 30, 31, 32, 33, 34, 35},
         Stream(llvm::make_unique<se::PlatformStreamHandle>(&PExecutor)) {}
 
 protected:
   // Device memory is backed by host arrays.
-  int HostA[10];
-  se::GlobalDeviceMemory<int> DeviceA;
-  int HostB[10];
-  se::GlobalDeviceMemory<int> DeviceB;
+  int HostA5[5];
+  int HostB5[5];
+  int HostA7[7];
+  int HostB7[7];
+  se::GlobalDeviceMemory<int> DeviceA5;
+  se::GlobalDeviceMemory<int> DeviceB5;
+  se::GlobalDeviceMemory<int> DeviceA7;
+  se::GlobalDeviceMemory<int> DeviceB7;
 
   // Host memory to be used as actual host memory.
-  int Host[10];
+  int Host5[5];
+  int Host7[7];
 
   MockPlatformExecutor PExecutor;
   se::Stream Stream;
 };
 
-TEST_F(StreamTest, MemcpyCorrectSize) {
-  Stream.thenMemcpyH2D(llvm::ArrayRef<int>(Host), &DeviceA);
-  EXPECT_TRUE(Stream.isOK());
+using llvm::ArrayRef;
+using llvm::MutableArrayRef;
 
-  Stream.thenMemcpyD2H(DeviceA, llvm::MutableArrayRef<int>(Host));
-  EXPECT_TRUE(Stream.isOK());
+// D2H tests
 
-  Stream.thenMemcpyD2D(DeviceA, &DeviceB);
+TEST_F(StreamTest, CopyD2HToMutableArrayRefByCount) {
+  Stream.thenCopyD2H(DeviceA5, MutableArrayRef<int>(Host5), 5);
   EXPECT_TRUE(Stream.isOK());
-}
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], Host5[I]);
+  }
 
-TEST_F(StreamTest, MemcpyH2DTooManyElements) {
-  Stream.thenMemcpyH2D(llvm::ArrayRef<int>(Host), &DeviceA, 20);
+  Stream.thenCopyD2H(DeviceB5, MutableArrayRef<int>(Host5), 2);
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 0; I < 2; ++I) {
+    EXPECT_EQ(HostB5[I], Host5[I]);
+  }
+
+  Stream.thenCopyD2H(DeviceA7, MutableArrayRef<int>(Host5), 7);
   EXPECT_FALSE(Stream.isOK());
 }
 
-TEST_F(StreamTest, MemcpyD2HTooManyElements) {
-  Stream.thenMemcpyD2H(DeviceA, llvm::MutableArrayRef<int>(Host), 20);
+TEST_F(StreamTest, CopyD2HToMutableArrayRef) {
+  Stream.thenCopyD2H(DeviceA5, MutableArrayRef<int>(Host5));
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], Host5[I]);
+  }
+
+  Stream.thenCopyD2H(DeviceA5, MutableArrayRef<int>(Host7));
   EXPECT_FALSE(Stream.isOK());
 }
 
-TEST_F(StreamTest, MemcpyD2DTooManyElements) {
-  Stream.thenMemcpyD2D(DeviceA, &DeviceB, 20);
+TEST_F(StreamTest, CopyD2HToPointer) {
+  Stream.thenCopyD2H(DeviceA5, Host5, 5);
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], Host5[I]);
+  }
+
+  Stream.thenCopyD2H(DeviceA5, Host7, 7);
+  EXPECT_FALSE(Stream.isOK());
+}
+
+TEST_F(StreamTest, CopyD2HSliceToMutableArrayRefByCount) {
+  Stream.thenCopyD2H(DeviceA5.asSlice().drop_front(1),
+                     MutableArrayRef<int>(Host5 + 1, 4), 4);
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 1; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], Host5[I]);
+  }
+
+  Stream.thenCopyD2H(DeviceB5.asSlice().drop_back(1),
+                     MutableArrayRef<int>(Host5), 2);
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 0; I < 2; ++I) {
+    EXPECT_EQ(HostB5[I], Host5[I]);
+  }
+
+  Stream.thenCopyD2H(DeviceA5.asSlice(), MutableArrayRef<int>(Host7), 7);
+  EXPECT_FALSE(Stream.isOK());
+}
+
+TEST_F(StreamTest, CopyD2HSliceToMutableArrayRef) {
+  Stream.thenCopyD2H(DeviceA7.asSlice().slice(1, 5),
+                     MutableArrayRef<int>(Host5));
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA7[I + 1], Host5[I]);
+  }
+
+  Stream.thenCopyD2H(DeviceA5.asSlice(), MutableArrayRef<int>(Host7));
+  EXPECT_FALSE(Stream.isOK());
+}
+
+TEST_F(StreamTest, CopyD2HSliceToPointer) {
+  Stream.thenCopyD2H(DeviceA5.asSlice().drop_front(1), Host5 + 1, 4);
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 1; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], Host5[I]);
+  }
+
+  Stream.thenCopyD2H(DeviceA5.asSlice(), Host7, 7);
+  EXPECT_FALSE(Stream.isOK());
+}
+
+// H2D tests
+
+TEST_F(StreamTest, CopyH2DToArrayRefByCount) {
+  Stream.thenCopyH2D(ArrayRef<int>(Host5), DeviceA5, 5);
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], Host5[I]);
+  }
+
+  Stream.thenCopyH2D(ArrayRef<int>(Host5), DeviceB5, 2);
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 0; I < 2; ++I) {
+    EXPECT_EQ(HostB5[I], Host5[I]);
+  }
+
+  Stream.thenCopyH2D(ArrayRef<int>(Host7), DeviceA5, 7);
+  EXPECT_FALSE(Stream.isOK());
+}
+
+TEST_F(StreamTest, CopyH2DToArrayRef) {
+  Stream.thenCopyH2D(ArrayRef<int>(Host5), DeviceA5);
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], Host5[I]);
+  }
+
+  Stream.thenCopyH2D(ArrayRef<int>(Host7), DeviceA5);
+  EXPECT_FALSE(Stream.isOK());
+}
+
+TEST_F(StreamTest, CopyH2DToPointer) {
+  Stream.thenCopyH2D(Host5, DeviceA5, 5);
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], Host5[I]);
+  }
+
+  Stream.thenCopyH2D(Host7, DeviceA5, 7);
+  EXPECT_FALSE(Stream.isOK());
+}
+
+TEST_F(StreamTest, CopyH2DSliceToArrayRefByCount) {
+  Stream.thenCopyH2D(ArrayRef<int>(Host5 + 1, 4),
+                     DeviceA5.asSlice().drop_front(1), 4);
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 1; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], Host5[I]);
+  }
+
+  Stream.thenCopyH2D(ArrayRef<int>(Host5), DeviceB5.asSlice().drop_back(1), 2);
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 0; I < 2; ++I) {
+    EXPECT_EQ(HostB5[I], Host5[I]);
+  }
+
+  Stream.thenCopyH2D(ArrayRef<int>(Host5), DeviceA5.asSlice(), 7);
+  EXPECT_FALSE(Stream.isOK());
+}
+
+TEST_F(StreamTest, CopyH2DSliceToArrayRef) {
+
+  Stream.thenCopyH2D(ArrayRef<int>(Host5), DeviceA5.asSlice());
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], Host5[I]);
+  }
+
+  Stream.thenCopyH2D(ArrayRef<int>(Host7), DeviceA5.asSlice());
+  EXPECT_FALSE(Stream.isOK());
+}
+
+TEST_F(StreamTest, CopyH2DSliceToPointer) {
+  Stream.thenCopyH2D(Host5, DeviceA5.asSlice(), 5);
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], Host5[I]);
+  }
+
+  Stream.thenCopyH2D(Host7, DeviceA5.asSlice(), 7);
+  EXPECT_FALSE(Stream.isOK());
+}
+
+// D2D tests
+
+TEST_F(StreamTest, CopyD2DByCount) {
+  Stream.thenCopyD2D(DeviceA5, DeviceB5, 5);
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], HostB5[I]);
+  }
+
+  Stream.thenCopyD2D(DeviceA7, DeviceB7, 2);
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 0; I < 2; ++I) {
+    EXPECT_EQ(HostA7[I], HostB7[I]);
+  }
+
+  Stream.thenCopyD2D(DeviceA7, DeviceB5, 7);
+  EXPECT_FALSE(Stream.isOK());
+}
+
+TEST_F(StreamTest, CopyD2D) {
+  Stream.thenCopyD2D(DeviceA5, DeviceB5);
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], HostB5[I]);
+  }
+
+  Stream.thenCopyD2D(DeviceA7, DeviceB5);
+  EXPECT_FALSE(Stream.isOK());
+}
+
+TEST_F(StreamTest, CopySliceD2DByCount) {
+  Stream.thenCopyD2D(DeviceA5.asSlice().drop_front(1), DeviceB5, 4);
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 0; I < 4; ++I) {
+    EXPECT_EQ(HostA5[I + 1], HostB5[I]);
+  }
+
+  Stream.thenCopyD2D(DeviceA7.asSlice().drop_back(1), DeviceB7, 2);
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 0; I < 2; ++I) {
+    EXPECT_EQ(HostA7[I], HostB7[I]);
+  }
+
+  Stream.thenCopyD2D(DeviceA5.asSlice(), DeviceB5, 7);
+  EXPECT_FALSE(Stream.isOK());
+}
+
+TEST_F(StreamTest, CopySliceD2D) {
+
+  Stream.thenCopyD2D(DeviceA7.asSlice().drop_back(2), DeviceB5);
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA7[I], HostB5[I]);
+  }
+
+  Stream.thenCopyD2D(DeviceA5.asSlice().drop_back(1), DeviceB7);
+  EXPECT_FALSE(Stream.isOK());
+}
+
+TEST_F(StreamTest, CopyD2DSliceByCount) {
+  Stream.thenCopyD2D(DeviceA5, DeviceB7.asSlice().drop_front(2), 5);
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], HostB7[I + 2]);
+  }
+
+  Stream.thenCopyD2D(DeviceA7, DeviceB7.asSlice().drop_back(3), 2);
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 0; I < 2; ++I) {
+    EXPECT_EQ(HostA7[I], HostB7[I]);
+  }
+
+  Stream.thenCopyD2D(DeviceA5, DeviceB7.asSlice(), 7);
+  EXPECT_FALSE(Stream.isOK());
+}
+
+TEST_F(StreamTest, CopyD2DSlice) {
+
+  Stream.thenCopyD2D(DeviceA5, DeviceB7.asSlice().drop_back(2));
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], HostB7[I]);
+  }
+
+  Stream.thenCopyD2D(DeviceA5, DeviceB7.asSlice());
+  EXPECT_FALSE(Stream.isOK());
+}
+
+TEST_F(StreamTest, CopySliceD2DSliceByCount) {
+
+  Stream.thenCopyD2D(DeviceA5.asSlice(), DeviceB5.asSlice(), 5);
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], HostB5[I]);
+  }
+
+  Stream.thenCopyD2D(DeviceA7.asSlice(), DeviceB7.asSlice(), 2);
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 0; I < 2; ++I) {
+    EXPECT_EQ(HostA7[I], HostB7[I]);
+  }
+
+  Stream.thenCopyD2D(DeviceA7.asSlice(), DeviceB5.asSlice(), 7);
+  EXPECT_FALSE(Stream.isOK());
+}
+
+TEST_F(StreamTest, CopySliceD2DSlice) {
+
+  Stream.thenCopyD2D(DeviceA5.asSlice(), DeviceB5.asSlice());
+  EXPECT_TRUE(Stream.isOK());
+  for (int I = 0; I < 5; ++I) {
+    EXPECT_EQ(HostA5[I], HostB5[I]);
+  }
+
+  Stream.thenCopyD2D(DeviceA5.asSlice(), DeviceB7.asSlice());
   EXPECT_FALSE(Stream.isOK());
 }