// include/mlir/ExecutionEngine/CRunnerUtils.h - llvm-project/mlir - Git at Google

 //===- CRunnerUtils.h - Utils for debugging MLIR execution ----------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file declares basic classes and functions to manipulate structured MLIR
 // types at runtime. Entities in this file must be compliant with C++11 and be
 // retargetable, including on targets without a C++ runtime.
 //
 //===----------------------------------------------------------------------===//

 #ifndef MLIR_EXECUTIONENGINE_CRUNNERUTILS_H
 #define MLIR_EXECUTIONENGINE_CRUNNERUTILS_H

 // Symbol export/import annotations for the extern "C" entry points declared
 // below. On Windows, MLIR_CRUNNERUTILS_EXPORT is only set here when it was not
 // predefined, and MLIR_CRUNNERUTILS_DEFINE_FUNCTIONS is only set when
 // mlir_c_runner_utils_EXPORTS is defined. Elsewhere both are always defined.
 #ifdef _WIN32
 #ifndef MLIR_CRUNNERUTILS_EXPORT
 #ifdef mlir_c_runner_utils_EXPORTS
 // We are building this library: export the symbols.
 #define MLIR_CRUNNERUTILS_EXPORT __declspec(dllexport)
 #define MLIR_CRUNNERUTILS_DEFINE_FUNCTIONS
 #else
 // We are using this library: import the symbols.
 #define MLIR_CRUNNERUTILS_EXPORT __declspec(dllimport)
 #endif // mlir_c_runner_utils_EXPORTS
 #endif // MLIR_CRUNNERUTILS_EXPORT
 #else  // _WIN32
 // Non-windows: use visibility attributes.
 #define MLIR_CRUNNERUTILS_EXPORT __attribute__((visibility("default")))
 #define MLIR_CRUNNERUTILS_DEFINE_FUNCTIONS
 #endif // _WIN32

 #include <array>
 #include <cassert>
 #include <cstdint>
 #include <initializer_list>
 #include <vector>

 //===----------------------------------------------------------------------===//
 // Codegen-compatible structures for Vector type.
 //===----------------------------------------------------------------------===//
 namespace mlir {
 namespace detail {

 /// Returns true iff `n` is a positive power of two. Non-positive values are
 /// rejected up front: `0 & -1` is zero, so the bit trick alone would report 0
 /// as a power of two, and `n - 1` must not be evaluated for the minimum int.
 constexpr bool isPowerOf2(int n) { return n > 0 && !(n & (n - 1)); }

 /// Returns the smallest power of two that is >= `n`, or 1 when `n <= 1`.
 /// Kept as a single return expression so it remains a valid C++11 constexpr.
 constexpr unsigned nextPowerOf2(int n) {
   return (n <= 1) ? 1 : (isPowerOf2(n) ? n : (2 * nextPowerOf2((n + 1) / 2)));
 }

 /// 1-D vector storage. The specialization is selected by whether
 /// sizeof(T[Dim]) is already a power of two.
 template <typename T, int Dim, bool IsPowerOf2>
 struct Vector1D;

 /// 1-D vector whose byte size is already a power of two: no padding member.
 template <typename T, int Dim>
 struct Vector1D<T, Dim, /*IsPowerOf2=*/true> {
   Vector1D() {
     // Checks that this specialization was picked for a power-of-two size.
     static_assert(detail::nextPowerOf2(sizeof(T[Dim])) == sizeof(T[Dim]),
                   "size error");
   }
   inline T &operator[](unsigned i) { return vector[i]; }
   inline const T &operator[](unsigned i) const { return vector[i]; }

 private:
   T vector[Dim];
 };

 // 1-D vector, padded to the next power of 2 allocation.
 // Specialization occurs to avoid zero size arrays (which fail in -Werror).
 template <typename T, int Dim>
 struct Vector1D<T, Dim, /*IsPowerOf2=*/false> {
   Vector1D() {
     // The padded size must be strictly larger than the payload, but less than
     // twice as large.
     static_assert(nextPowerOf2(sizeof(T[Dim])) > sizeof(T[Dim]), "size error");
     static_assert(nextPowerOf2(sizeof(T[Dim])) < 2 * sizeof(T[Dim]),
                   "size error");
   }
   inline T &operator[](unsigned i) { return vector[i]; }
   inline const T &operator[](unsigned i) const { return vector[i]; }

 private:
   T vector[Dim];
   // Explicit tail padding up to the next power-of-two byte size.
   char padding[nextPowerOf2(sizeof(T[Dim])) - sizeof(T[Dim])];
 };
 } // namespace detail
 } // namespace mlir

 // N-D vector: an array of `Dim` vectors of the remaining dimensions. The
 // recursion ends at the 1-D specialization.
 template <typename T, int Dim, int... Dims>
 struct Vector {
   const Vector<T, Dims...> &operator[](unsigned i) const { return slices[i]; }
   Vector<T, Dims...> &operator[](unsigned i) { return slices[i]; }

 private:
   Vector<T, Dims...> slices[Dim];
 };

 // 1-D vectors in LLVM are automatically padded to the next power of 2.
 // We insert explicit padding to account for this: the base class is the
 // mlir::detail::Vector1D specialization selected by whether sizeof(T[Dim]) is
 // already a power of two.
 template <typename T, int Dim>
 struct Vector<T, Dim>
     : public mlir::detail::Vector1D<T, Dim,
                                     mlir::detail::isPowerOf2(sizeof(T[Dim]))> {
 };

 // Convenience aliases that take the dimensions first and the element type
 // last, e.g. Vector2D<2, 3, float> is Vector<float, 2, 3>.
 // Note: this global Vector1D alias is distinct from mlir::detail::Vector1D.
 template <int D1, typename T>
 using Vector1D = Vector<T, D1>;
 template <int D1, int D2, typename T>
 using Vector2D = Vector<T, D1, D2>;
 template <int D1, int D2, int D3, typename T>
 using Vector3D = Vector<T, D1, D2, D3>;
 template <int D1, int D2, int D3, int D4, typename T>
 using Vector4D = Vector<T, D1, D2, D3, D4>;

 /// Copies `arr[1] .. arr[N-1]` into `res[0] .. res[N-2]`, i.e. drops the
 /// leading entry. `res` must have room for N-1 values.
 template <int N>
 void dropFront(int64_t arr[N], int64_t *res) {
   for (unsigned src = 1; src < N; ++src) {
     res[src - 1] = arr[src];
   }
 }

 //===----------------------------------------------------------------------===//
 // Codegen-compatible structures for StridedMemRef type.
 //===----------------------------------------------------------------------===//
 template <typename T, int Rank>
 class StridedMemrefIterator;

 /// StridedMemRef descriptor type with static rank.
 ///
 /// Holds a base pointer, a data pointer, an element offset and N sizes and
 /// strides. Elements are addressed as `data[offset + sum(index * stride)]`.
 template <typename T, int N>
 struct StridedMemRefType {
   T *basePtr;
   T *data;
   int64_t offset;
   int64_t sizes[N];
   int64_t strides[N];

   /// Returns a reference to the element addressed by `indices`.
   ///
   /// `indices` is any range with a random-access `begin()` and a `size()`; it
   /// must hold exactly N in-bounds indices. Violations are only caught by
   /// `assert`, so they go unchecked when NDEBUG is defined.
   template <typename Range,
             typename sfinae = decltype(std::declval<Range>().begin())>
   T &operator[](Range &&indices) {
     // Compare as signed values to avoid a signed/unsigned mismatch with N.
     assert(static_cast<int64_t>(indices.size()) == N &&
            "indices should match rank in memref subscript");
     int64_t curOffset = offset;
     for (int dim = N - 1; dim >= 0; --dim) {
       int64_t currentIndex = *(indices.begin() + dim);
       assert(currentIndex >= 0 && "Index underflow");
       assert(currentIndex < sizes[dim] && "Index overflow");
       curOffset += currentIndex * strides[dim];
     }
     return data[curOffset];
   }

   /// Returns an iterator to the first element, or `end()` when any dimension
   /// has size zero. Without that check an empty memref would yield an
   /// iterator that never compares equal to `end()`.
   StridedMemrefIterator<T, N> begin() {
     for (int dim = 0; dim < N; ++dim)
       if (sizes[dim] == 0)
         return end();
     return {*this, offset};
   }
   /// Returns the past-the-end iterator, encoded with offset -1.
   StridedMemrefIterator<T, N> end() { return {*this, -1}; }

   // This operator[] is extremely slow and only for sugaring purposes.
   /// Returns the rank N-1 descriptor obtained by fixing the leading index to
   /// `idx`. The result shares `basePtr` and `data` with this descriptor.
   StridedMemRefType<T, N - 1> operator[](int64_t idx) {
     StridedMemRefType<T, N - 1> res;
     res.basePtr = basePtr;
     res.data = data;
     res.offset = offset + idx * strides[0];
     // Drop the leading dimension: the slice keeps dimensions 1..N-1.
     for (int dim = 1; dim < N; ++dim) {
       res.sizes[dim - 1] = sizes[dim];
       res.strides[dim - 1] = strides[dim];
     }
     return res;
   }
 };

 /// StridedMemRef descriptor type specialized for rank 1.
 template <typename T>
 struct StridedMemRefType<T, 1> {
   T *basePtr;
   T *data;
   int64_t offset;
   int64_t sizes[1];
   int64_t strides[1];

   /// Returns the element addressed by the single index held in `indices`.
   /// The range is taken by forwarding reference, as in the rank-N descriptor,
   /// so containers are not copied.
   template <typename Range,
             typename sfinae = decltype(std::declval<Range>().begin())>
   T &operator[](Range &&indices) {
     assert(indices.size() == 1 &&
            "indices should match rank in memref subscript");
     return (*this)[*indices.begin()];
   }

   /// Returns an iterator to the first element, or `end()` when the memref is
   /// empty. Otherwise the iterator would never compare equal to `end()`.
   StridedMemrefIterator<T, 1> begin() {
     if (sizes[0] == 0)
       return end();
     return {*this, offset};
   }
   /// Returns the past-the-end iterator, encoded with offset -1.
   StridedMemrefIterator<T, 1> end() { return {*this, -1}; }

   /// Returns the element at position `idx`; no bounds check is performed.
   T &operator[](int64_t idx) { return *(data + offset + idx * strides[0]); }
 };

 /// StridedMemRef descriptor type specialized for rank 0.
 /// It has no sizes or strides; the single element lives at `data[offset]`.
 template <typename T>
 struct StridedMemRefType<T, 0> {
   T *basePtr;
   T *data;
   int64_t offset;

   /// Returns the single element. `indices` must be an empty range; it is taken
   /// by forwarding reference, as in the rank-N descriptor, to avoid a copy.
   template <typename Range,
             typename sfinae = decltype(std::declval<Range>().begin())>
   T &operator[](Range &&indices) {
     assert((indices.size() == 0) &&
            "Expect empty indices for 0-rank memref subscript");
     (void)indices; // Only inspected by the assert; unused under NDEBUG.
     return data[offset];
   }

   /// Iterator to the single element.
   StridedMemrefIterator<T, 0> begin() { return {*this, offset}; }
   /// Iterator one element past the single element.
   StridedMemrefIterator<T, 0> end() { return {*this, offset + 1}; }
 };

 /// Iterate over all elements in a strided memref.
 ///
 /// The iterator tracks a multi-dimensional index together with the matching
 /// linear offset into `descriptor->data`. An offset of -1 marks the end.
 template <typename T, int Rank>
 class StridedMemrefIterator {
 public:
   using iterator_category = std::forward_iterator_tag;
   using value_type = T;
   using difference_type = std::ptrdiff_t;
   using pointer = T *;
   using reference = T &;

   /// Creates an iterator over `descriptor` at linear `offset`, with every
   /// index set to zero. Only a pointer to `descriptor` is stored.
   StridedMemrefIterator(StridedMemRefType<T, Rank> &descriptor,
                         int64_t offset = 0)
       : offset(offset), descriptor(&descriptor) {}

   /// Pre-increment: steps to the next index, last dimension varying fastest.
   /// Once every dimension has wrapped, the offset becomes -1 (end).
   StridedMemrefIterator<T, Rank> &operator++() {
     int dim = Rank - 1;
     // Reset every trailing dimension that already sits on its last index.
     while (dim >= 0 && indices[dim] == (descriptor->sizes[dim] - 1)) {
       offset -= indices[dim] * descriptor->strides[dim];
       indices[dim] = 0;
       --dim;
     }
     if (dim < 0) {
       offset = -1;
       return *this;
     }
     ++indices[dim];
     offset += descriptor->strides[dim];
     return *this;
   }

   /// Post-increment, required of forward iterators: advances this iterator
   /// and returns a copy of its previous position.
   StridedMemrefIterator<T, Rank> operator++(int) {
     StridedMemrefIterator<T, Rank> previous(*this);
     ++(*this);
     return previous;
   }

   reference operator*() const { return descriptor->data[offset]; }
   pointer operator->() const { return &descriptor->data[offset]; }

   /// Returns the current multi-dimensional index.
   const std::array<int64_t, Rank> &getIndices() const { return indices; }

   /// Two iterators are equal when they share descriptor and offset.
   bool operator==(const StridedMemrefIterator &other) const {
     return other.offset == offset && other.descriptor == descriptor;
   }

   bool operator!=(const StridedMemrefIterator &other) const {
     return !(*this == other);
   }

 private:
   /// Offset in the buffer. This can be derived from the indices and the
   /// descriptor.
   int64_t offset = 0;

   /// Array of indices in the multi-dimensional memref.
   std::array<int64_t, Rank> indices = {};

   /// Descriptor for the strided memref.
   StridedMemRefType<T, Rank> *descriptor;
 };

 /// Iterate over all elements in a 0-ranked strided memref.
 /// The position is a plain pointer to `descriptor.data + offset`.
 template <typename T>
 class StridedMemrefIterator<T, 0> {
 public:
   using iterator_category = std::forward_iterator_tag;
   using value_type = T;
   using difference_type = std::ptrdiff_t;
   using pointer = T *;
   using reference = T &;

   /// Creates an iterator pointing at `descriptor.data + offset`.
   StridedMemrefIterator(StridedMemRefType<T, 0> &descriptor, int64_t offset = 0)
       : elt(descriptor.data + offset) {}

   /// Pre-increment: moves the element pointer forward by one.
   StridedMemrefIterator<T, 0> &operator++() {
     ++elt;
     return *this;
   }

   /// Post-increment, required of forward iterators: advances this iterator
   /// and returns a copy of its previous position.
   StridedMemrefIterator<T, 0> operator++(int) {
     StridedMemrefIterator<T, 0> previous(*this);
     ++(*this);
     return previous;
   }

   reference operator*() const { return *elt; }
   pointer operator->() const { return elt; }

   // There are no indices for a 0-ranked memref, but this API is provided for
   // consistency with the general case.
   const std::array<int64_t, 0> &getIndices() const {
     // Since this is a 0-array of indices we can keep a single global const
     // copy.
     static const std::array<int64_t, 0> indices = {};
     return indices;
   }

   bool operator==(const StridedMemrefIterator &other) const {
     return other.elt == elt;
   }

   bool operator!=(const StridedMemrefIterator &other) const {
     return !(*this == other);
   }

 private:
   /// Pointer to the single element in the zero-ranked memref.
   T *elt;
 };

 //===----------------------------------------------------------------------===//
 // Codegen-compatible structure for UnrankedMemRef type.
 //===----------------------------------------------------------------------===//
 // Unranked MemRef: a runtime rank plus a type-erased pointer to a ranked
 // descriptor. No member uses `T`.
 // NOTE(review): presumably `T` only tags the element type for callers --
 // confirm.
 template <typename T>
 struct UnrankedMemRefType {
   // Number of dimensions of the pointed-to descriptor.
   int64_t rank;
   // Opaque descriptor pointer; DynamicMemRefType reads it through a
   // StridedMemRefType<T, 1> pointer.
   void *descriptor;
 };

 //===----------------------------------------------------------------------===//
 // DynamicMemRefType type.
 //===----------------------------------------------------------------------===//
 template <typename T>
 class DynamicMemRefIterator;

 // A reference to one of the StridedMemRef types.
 //
 // `sizes` and `strides` point into the descriptor this object was built from,
 // so that descriptor must stay alive while this object is in use.
 template <typename T>
 class DynamicMemRefType {
 public:
   int64_t rank;
   T *basePtr;
   T *data;
   int64_t offset;
   const int64_t *sizes;
   const int64_t *strides;

   /// Wraps a rank-0 descriptor; there are no sizes or strides to reference.
   explicit DynamicMemRefType(const StridedMemRefType<T, 0> &memRef)
       : rank(0), basePtr(memRef.basePtr), data(memRef.data),
         offset(memRef.offset), sizes(nullptr), strides(nullptr) {}
   /// Wraps a rank-N descriptor, referencing its size and stride arrays.
   template <int N>
   explicit DynamicMemRefType(const StridedMemRefType<T, N> &memRef)
       : rank(N), basePtr(memRef.basePtr), data(memRef.data),
         offset(memRef.offset), sizes(memRef.sizes), strides(memRef.strides) {}
   /// Wraps an unranked descriptor, taking the rank from `memRef.rank`.
   explicit DynamicMemRefType(const ::UnrankedMemRefType<T> &memRef)
       : rank(memRef.rank) {
     // The opaque descriptor is read through the rank-1 layout.
     auto *desc = static_cast<StridedMemRefType<T, 1> *>(memRef.descriptor);
     basePtr = desc->basePtr;
     data = desc->data;
     offset = desc->offset;
     sizes = rank == 0 ? nullptr : desc->sizes;
     // The strides are taken to start right after the `rank` sizes.
     strides = sizes + rank;
   }

   /// Returns a reference to the element addressed by `indices`.
   ///
   /// `indices` is any range with a random-access `begin()` and a `size()`; it
   /// must hold exactly `rank` in-bounds indices. Violations are only caught
   /// by `assert`, so they go unchecked when NDEBUG is defined.
   template <typename Range,
             typename sfinae = decltype(std::declval<Range>().begin())>
   T &operator[](Range &&indices) {
     // Compare as signed values to avoid a signed/unsigned mismatch with rank.
     assert(static_cast<int64_t>(indices.size()) == rank &&
            "indices should match rank in memref subscript");
     if (rank == 0)
       return data[offset];

     int64_t curOffset = offset;
     for (int64_t dim = rank - 1; dim >= 0; --dim) {
       int64_t currentIndex = *(indices.begin() + dim);
       assert(currentIndex >= 0 && "Index underflow");
       assert(currentIndex < sizes[dim] && "Index overflow");
       curOffset += currentIndex * strides[dim];
     }
     return data[curOffset];
   }

   /// Returns an iterator to the first element, or `end()` when any dimension
   /// has size zero. Without that check an empty memref would yield an
   /// iterator that never compares equal to `end()`.
   DynamicMemRefIterator<T> begin() {
     for (int64_t dim = 0; dim < rank; ++dim)
       if (sizes[dim] == 0)
         return end();
     return {*this, offset};
   }
   /// Returns the past-the-end iterator, encoded with offset -1.
   DynamicMemRefIterator<T> end() { return {*this, -1}; }

   // This operator[] is extremely slow and only for sugaring purposes.
   /// Returns the view of rank `rank - 1` obtained by fixing the leading index
   /// to `idx`. The result references the same size and stride storage.
   DynamicMemRefType<T> operator[](int64_t idx) {
     assert(rank > 0 && "can't make a subscript of a zero ranked array");

     DynamicMemRefType<T> res(*this);
     --res.rank;
     res.offset += idx * res.strides[0];
     // Skip the leading dimension in both arrays.
     ++res.sizes;
     ++res.strides;
     return res;
   }

   // This operator* can be used in conjunction with the previous operator[] in
   // order to access the underlying value in case of zero-ranked memref.
   T &operator*() {
     assert(rank == 0 && "not a zero-ranked memRef");
     return data[offset];
   }
 };

 /// Iterate over all elements in a dynamic memref.
 ///
 /// The iterator tracks a multi-dimensional index together with the matching
 /// linear offset into `descriptor->data`. An offset of -1 marks the end.
 template <typename T>
 class DynamicMemRefIterator {
 public:
   using iterator_category = std::forward_iterator_tag;
   using value_type = T;
   using difference_type = std::ptrdiff_t;
   using pointer = T *;
   using reference = T &;

   /// Creates an iterator over `descriptor` at linear `offset`, with
   /// `descriptor.rank` indices all set to zero. Only a pointer to
   /// `descriptor` is stored.
   DynamicMemRefIterator(DynamicMemRefType<T> &descriptor, int64_t offset = 0)
       : offset(offset), descriptor(&descriptor) {
     indices.resize(descriptor.rank, 0);
   }

   /// Pre-increment: steps to the next index, last dimension varying fastest.
   /// Once every dimension has wrapped, the offset becomes -1 (end). A rank-0
   /// memref has a single element, so one increment reaches the end.
   DynamicMemRefIterator<T> &operator++() {
     if (descriptor->rank == 0) {
       offset = -1;
       return *this;
     }

     // Keep the full 64-bit rank instead of narrowing it to int.
     int64_t dim = descriptor->rank - 1;

     // Reset every trailing dimension that already sits on its last index.
     while (dim >= 0 && indices[dim] == (descriptor->sizes[dim] - 1)) {
       offset -= indices[dim] * descriptor->strides[dim];
       indices[dim] = 0;
       --dim;
     }

     if (dim < 0) {
       offset = -1;
       return *this;
     }

     ++indices[dim];
     offset += descriptor->strides[dim];
     return *this;
   }

   /// Post-increment, required of forward iterators: advances this iterator
   /// and returns a copy of its previous position.
   DynamicMemRefIterator<T> operator++(int) {
     DynamicMemRefIterator<T> previous(*this);
     ++(*this);
     return previous;
   }

   reference operator*() const { return descriptor->data[offset]; }
   pointer operator->() const { return &descriptor->data[offset]; }

   /// Returns the current multi-dimensional index.
   const std::vector<int64_t> &getIndices() const { return indices; }

   /// Two iterators are equal when they share descriptor and offset.
   bool operator==(const DynamicMemRefIterator &other) const {
     return other.offset == offset && other.descriptor == descriptor;
   }

   bool operator!=(const DynamicMemRefIterator &other) const {
     return !(*this == other);
   }

 private:
   /// Offset in the buffer. This can be derived from the indices and the
   /// descriptor.
   int64_t offset = 0;

   /// Array of indices in the multi-dimensional memref.
   std::vector<int64_t> indices = {};

   /// Descriptor for the dynamic memref.
   DynamicMemRefType<T> *descriptor;
 };

 //===----------------------------------------------------------------------===//
 // Small runtime support library for memref.copy lowering during codegen.
 //===----------------------------------------------------------------------===//
 // Takes an element size and two unranked memref descriptors.
 // NOTE(review): presumably copies the contents of `src` into `dst`, with
 // `elemSize` in bytes -- confirm against the implementation.
 extern "C" MLIR_CRUNNERUTILS_EXPORT void
 memrefCopy(int64_t elemSize, ::UnrankedMemRefType<char> *src,
            ::UnrankedMemRefType<char> *dst);

 //===----------------------------------------------------------------------===//
 // Small runtime support library for vector.print lowering during codegen.
 //===----------------------------------------------------------------------===//
 // One entry point per printable item. The implementations are not visible in
 // this header.
 // NOTE(review): presumably each writes its argument, or a fixed delimiter,
 // to standard output -- confirm.
 extern "C" MLIR_CRUNNERUTILS_EXPORT void printI64(int64_t i);
 extern "C" MLIR_CRUNNERUTILS_EXPORT void printU64(uint64_t u);
 extern "C" MLIR_CRUNNERUTILS_EXPORT void printF32(float f);
 extern "C" MLIR_CRUNNERUTILS_EXPORT void printF64(double d);
 extern "C" MLIR_CRUNNERUTILS_EXPORT void printString(char const *s);
 extern "C" MLIR_CRUNNERUTILS_EXPORT void printOpen();
 extern "C" MLIR_CRUNNERUTILS_EXPORT void printClose();
 extern "C" MLIR_CRUNNERUTILS_EXPORT void printComma();
 extern "C" MLIR_CRUNNERUTILS_EXPORT void printNewline();

 //===----------------------------------------------------------------------===//
 // Small runtime support library for timing execution and printing GFLOPS
 //===----------------------------------------------------------------------===//
 // NOTE(review): the unit of the value returned by rtclock is not stated here
 // (presumably seconds) -- confirm.
 extern "C" MLIR_CRUNNERUTILS_EXPORT void printFlops(double flops);
 extern "C" MLIR_CRUNNERUTILS_EXPORT double rtclock();

 //===----------------------------------------------------------------------===//
 // Runtime support library for random number generation.
 //===----------------------------------------------------------------------===//
 // Uses a seed to initialize a random generator and returns the generator.
 // The generator is handed back as an opaque pointer.
 extern "C" MLIR_CRUNNERUTILS_EXPORT void *rtsrand(uint64_t s);
 // Uses a random number generator g and returns a random number
 // in the range of [0, m).
 extern "C" MLIR_CRUNNERUTILS_EXPORT uint64_t rtrand(void *g, uint64_t m);
 // Deletes the random number generator.
 extern "C" MLIR_CRUNNERUTILS_EXPORT void rtdrand(void *g);
 // Uses a random number generator g and std::shuffle to modify mref
 // in place. Memref mref will be a permutation of all numbers
 // in the range of [0, size of mref).
 extern "C" MLIR_CRUNNERUTILS_EXPORT void
 _mlir_ciface_shuffle(StridedMemRefType<uint64_t, 1> *mref, void *g);

 //===----------------------------------------------------------------------===//
 // Runtime support library to allow the use of std::sort in MLIR program.
 //===----------------------------------------------------------------------===//
 // Each entry point takes an element count n and a rank-1 memref descriptor.
 // NOTE(review): presumably the first n elements of vref are sorted in place
 // in ascending order -- confirm against the implementation.
 extern "C" MLIR_CRUNNERUTILS_EXPORT void
 _mlir_ciface_stdSortI64(uint64_t n, StridedMemRefType<int64_t, 1> *vref);
 extern "C" MLIR_CRUNNERUTILS_EXPORT void
 _mlir_ciface_stdSortF64(uint64_t n, StridedMemRefType<double, 1> *vref);
 extern "C" MLIR_CRUNNERUTILS_EXPORT void
 _mlir_ciface_stdSortF32(uint64_t n, StridedMemRefType<float, 1> *vref);
 #endif // MLIR_EXECUTIONENGINE_CRUNNERUTILS_H
	//===- CRunnerUtils.h - Utils for debugging MLIR execution ----------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file declares basic classes and functions to manipulate structured MLIR
	// types at runtime. Entities in this file must be compliant with C++11 and be
	// retargetable, including on targets without a C++ runtime.
	//
	//===----------------------------------------------------------------------===//

	#ifndef MLIR_EXECUTIONENGINE_CRUNNERUTILS_H
	#define MLIR_EXECUTIONENGINE_CRUNNERUTILS_H

	// Symbol export/import annotations for the extern "C" entry points declared
	// below. On Windows, MLIR_CRUNNERUTILS_EXPORT is only set here when it was not
	// predefined, and MLIR_CRUNNERUTILS_DEFINE_FUNCTIONS is only set when
	// mlir_c_runner_utils_EXPORTS is defined. Elsewhere both are always defined.
	#ifdef _WIN32
	#ifndef MLIR_CRUNNERUTILS_EXPORT
	#ifdef mlir_c_runner_utils_EXPORTS
	// We are building this library: export the symbols.
	#define MLIR_CRUNNERUTILS_EXPORT __declspec(dllexport)
	#define MLIR_CRUNNERUTILS_DEFINE_FUNCTIONS
	#else
	// We are using this library: import the symbols.
	#define MLIR_CRUNNERUTILS_EXPORT __declspec(dllimport)
	#endif // mlir_c_runner_utils_EXPORTS
	#endif // MLIR_CRUNNERUTILS_EXPORT
	#else // _WIN32
	// Non-windows: use visibility attributes.
	#define MLIR_CRUNNERUTILS_EXPORT __attribute__((visibility("default")))
	#define MLIR_CRUNNERUTILS_DEFINE_FUNCTIONS
	#endif // _WIN32

	#include <array>
	#include <cassert>
	#include <cstdint>
	#include <initializer_list>
	#include <vector>

	//===----------------------------------------------------------------------===//
	// Codegen-compatible structures for Vector type.
	//===----------------------------------------------------------------------===//
	namespace mlir {
	namespace detail {

	/// Returns true iff `n` is a positive power of two. Non-positive values are
	/// rejected up front: `0 & -1` is zero, so the bit trick alone would report 0
	/// as a power of two, and `n - 1` must not be evaluated for the minimum int.
	constexpr bool isPowerOf2(int n) { return n > 0 && !(n & (n - 1)); }

	/// Returns the smallest power of two that is >= `n`, or 1 when `n <= 1`.
	/// Kept as a single return expression so it remains a valid C++11 constexpr.
	constexpr unsigned nextPowerOf2(int n) {
	  return (n <= 1) ? 1 : (isPowerOf2(n) ? n : (2 * nextPowerOf2((n + 1) / 2)));
	}

	/// 1-D vector storage. The specialization is selected by whether
	/// sizeof(T[Dim]) is already a power of two.
	template <typename T, int Dim, bool IsPowerOf2>
	struct Vector1D;

	/// 1-D vector whose byte size is already a power of two: no padding member.
	template <typename T, int Dim>
	struct Vector1D<T, Dim, /*IsPowerOf2=*/true> {
	  Vector1D() {
	    // Checks that this specialization was picked for a power-of-two size.
	    static_assert(detail::nextPowerOf2(sizeof(T[Dim])) == sizeof(T[Dim]),
	                  "size error");
	  }
	  inline T &operator[](unsigned i) { return vector[i]; }
	  inline const T &operator[](unsigned i) const { return vector[i]; }

	private:
	  T vector[Dim];
	};

	// 1-D vector, padded to the next power of 2 allocation.
	// Specialization occurs to avoid zero size arrays (which fail in -Werror).
	template <typename T, int Dim>
	struct Vector1D<T, Dim, /*IsPowerOf2=*/false> {
	  Vector1D() {
	    // The padded size must be strictly larger than the payload, but less than
	    // twice as large.
	    static_assert(nextPowerOf2(sizeof(T[Dim])) > sizeof(T[Dim]), "size error");
	    static_assert(nextPowerOf2(sizeof(T[Dim])) < 2 * sizeof(T[Dim]),
	                  "size error");
	  }
	  inline T &operator[](unsigned i) { return vector[i]; }
	  inline const T &operator[](unsigned i) const { return vector[i]; }

	private:
	  T vector[Dim];
	  // Explicit tail padding up to the next power-of-two byte size.
	  char padding[nextPowerOf2(sizeof(T[Dim])) - sizeof(T[Dim])];
	};
	} // namespace detail
	} // namespace mlir

	// N-D vector: an array of `Dim` vectors of the remaining dimensions. The
	// recursion ends at the 1-D specialization.
	template <typename T, int Dim, int... Dims>
	struct Vector {
	  const Vector<T, Dims...> &operator[](unsigned i) const { return slices[i]; }
	  Vector<T, Dims...> &operator[](unsigned i) { return slices[i]; }

	private:
	  Vector<T, Dims...> slices[Dim];
	};

	// 1-D vectors in LLVM are automatically padded to the next power of 2.
	// We insert explicit padding to account for this: the base class is the
	// mlir::detail::Vector1D specialization selected by whether sizeof(T[Dim]) is
	// already a power of two.
	template <typename T, int Dim>
	struct Vector<T, Dim>
	: public mlir::detail::Vector1D<T, Dim,
	mlir::detail::isPowerOf2(sizeof(T[Dim]))> {
	};

	// Convenience aliases that take the dimensions first and the element type
	// last, e.g. Vector2D<2, 3, float> is Vector<float, 2, 3>.
	// Note: this global Vector1D alias is distinct from mlir::detail::Vector1D.
	template <int D1, typename T>
	using Vector1D = Vector<T, D1>;
	template <int D1, int D2, typename T>
	using Vector2D = Vector<T, D1, D2>;
	template <int D1, int D2, int D3, typename T>
	using Vector3D = Vector<T, D1, D2, D3>;
	template <int D1, int D2, int D3, int D4, typename T>
	using Vector4D = Vector<T, D1, D2, D3, D4>;

	/// Copies `arr[1] .. arr[N-1]` into `res[0] .. res[N-2]`, i.e. drops the
	/// leading entry. `res` must have room for N-1 values.
	template <int N>
	void dropFront(int64_t arr[N], int64_t *res) {
	  for (unsigned src = 1; src < N; ++src) {
	    res[src - 1] = arr[src];
	  }
	}

	//===----------------------------------------------------------------------===//
	// Codegen-compatible structures for StridedMemRef type.
	//===----------------------------------------------------------------------===//
	template <typename T, int Rank>
	class StridedMemrefIterator;

	/// StridedMemRef descriptor type with static rank.
	///
	/// Holds a base pointer, a data pointer, an element offset and N sizes and
	/// strides. Elements are addressed as `data[offset + sum(index * stride)]`.
	template <typename T, int N>
	struct StridedMemRefType {
	  T *basePtr;
	  T *data;
	  int64_t offset;
	  int64_t sizes[N];
	  int64_t strides[N];

	  /// Returns a reference to the element addressed by `indices`.
	  ///
	  /// `indices` is any range with a random-access `begin()` and a `size()`; it
	  /// must hold exactly N in-bounds indices. Violations are only caught by
	  /// `assert`, so they go unchecked when NDEBUG is defined.
	  template <typename Range,
	            typename sfinae = decltype(std::declval<Range>().begin())>
	  T &operator[](Range &&indices) {
	    // Compare as signed values to avoid a signed/unsigned mismatch with N.
	    assert(static_cast<int64_t>(indices.size()) == N &&
	           "indices should match rank in memref subscript");
	    int64_t curOffset = offset;
	    for (int dim = N - 1; dim >= 0; --dim) {
	      int64_t currentIndex = *(indices.begin() + dim);
	      assert(currentIndex >= 0 && "Index underflow");
	      assert(currentIndex < sizes[dim] && "Index overflow");
	      curOffset += currentIndex * strides[dim];
	    }
	    return data[curOffset];
	  }

	  /// Returns an iterator to the first element, or `end()` when any dimension
	  /// has size zero. Without that check an empty memref would yield an
	  /// iterator that never compares equal to `end()`.
	  StridedMemrefIterator<T, N> begin() {
	    for (int dim = 0; dim < N; ++dim)
	      if (sizes[dim] == 0)
	        return end();
	    return {*this, offset};
	  }
	  /// Returns the past-the-end iterator, encoded with offset -1.
	  StridedMemrefIterator<T, N> end() { return {*this, -1}; }

	  // This operator[] is extremely slow and only for sugaring purposes.
	  /// Returns the rank N-1 descriptor obtained by fixing the leading index to
	  /// `idx`. The result shares `basePtr` and `data` with this descriptor.
	  StridedMemRefType<T, N - 1> operator[](int64_t idx) {
	    StridedMemRefType<T, N - 1> res;
	    res.basePtr = basePtr;
	    res.data = data;
	    res.offset = offset + idx * strides[0];
	    // Drop the leading dimension: the slice keeps dimensions 1..N-1.
	    for (int dim = 1; dim < N; ++dim) {
	      res.sizes[dim - 1] = sizes[dim];
	      res.strides[dim - 1] = strides[dim];
	    }
	    return res;
	  }
	};

	/// StridedMemRef descriptor type specialized for rank 1.
	template <typename T>
	struct StridedMemRefType<T, 1> {
	  T *basePtr;
	  T *data;
	  int64_t offset;
	  int64_t sizes[1];
	  int64_t strides[1];

	  /// Returns the element addressed by the single index held in `indices`.
	  /// The range is taken by forwarding reference, as in the rank-N descriptor,
	  /// so containers are not copied.
	  template <typename Range,
	            typename sfinae = decltype(std::declval<Range>().begin())>
	  T &operator[](Range &&indices) {
	    assert(indices.size() == 1 &&
	           "indices should match rank in memref subscript");
	    // Dereference both `this` and the first index; the `*` tokens had been
	    // lost from this line.
	    return (*this)[*indices.begin()];
	  }

	  /// Returns an iterator to the first element, or `end()` when the memref is
	  /// empty. Otherwise the iterator would never compare equal to `end()`.
	  StridedMemrefIterator<T, 1> begin() {
	    if (sizes[0] == 0)
	      return end();
	    return {*this, offset};
	  }
	  /// Returns the past-the-end iterator, encoded with offset -1.
	  StridedMemrefIterator<T, 1> end() { return {*this, -1}; }

	  /// Returns the element at position `idx`; no bounds check is performed.
	  T &operator[](int64_t idx) { return *(data + offset + idx * strides[0]); }
	};

	/// StridedMemRef descriptor type specialized for rank 0.
	/// It has no sizes or strides; the single element lives at `data[offset]`.
	template <typename T>
	struct StridedMemRefType<T, 0> {
	  T *basePtr;
	  T *data;
	  int64_t offset;

	  /// Returns the single element. `indices` must be an empty range; it is taken
	  /// by forwarding reference, as in the rank-N descriptor, to avoid a copy.
	  template <typename Range,
	            typename sfinae = decltype(std::declval<Range>().begin())>
	  T &operator[](Range &&indices) {
	    assert((indices.size() == 0) &&
	           "Expect empty indices for 0-rank memref subscript");
	    (void)indices; // Only inspected by the assert; unused under NDEBUG.
	    return data[offset];
	  }

	  /// Iterator to the single element.
	  StridedMemrefIterator<T, 0> begin() { return {*this, offset}; }
	  /// Iterator one element past the single element.
	  StridedMemrefIterator<T, 0> end() { return {*this, offset + 1}; }
	};

	/// Iterate over all elements in a strided memref.
	///
	/// The iterator tracks a multi-dimensional index together with the matching
	/// linear offset into `descriptor->data`. An offset of -1 marks the end.
	template <typename T, int Rank>
	class StridedMemrefIterator {
	public:
	  using iterator_category = std::forward_iterator_tag;
	  using value_type = T;
	  using difference_type = std::ptrdiff_t;
	  using pointer = T *;
	  using reference = T &;

	  /// Creates an iterator over `descriptor` at linear `offset`, with every
	  /// index set to zero. Only a pointer to `descriptor` is stored.
	  StridedMemrefIterator(StridedMemRefType<T, Rank> &descriptor,
	                        int64_t offset = 0)
	      : offset(offset), descriptor(&descriptor) {}

	  /// Pre-increment: steps to the next index, last dimension varying fastest.
	  /// Once every dimension has wrapped, the offset becomes -1 (end).
	  StridedMemrefIterator<T, Rank> &operator++() {
	    int dim = Rank - 1;
	    // Reset every trailing dimension that already sits on its last index.
	    while (dim >= 0 && indices[dim] == (descriptor->sizes[dim] - 1)) {
	      offset -= indices[dim] * descriptor->strides[dim];
	      indices[dim] = 0;
	      --dim;
	    }
	    if (dim < 0) {
	      offset = -1;
	      return *this;
	    }
	    ++indices[dim];
	    offset += descriptor->strides[dim];
	    return *this;
	  }

	  /// Post-increment, required of forward iterators: advances this iterator
	  /// and returns a copy of its previous position.
	  StridedMemrefIterator<T, Rank> operator++(int) {
	    StridedMemrefIterator<T, Rank> previous(*this);
	    ++(*this);
	    return previous;
	  }

	  reference operator*() const { return descriptor->data[offset]; }
	  pointer operator->() const { return &descriptor->data[offset]; }

	  /// Returns the current multi-dimensional index.
	  const std::array<int64_t, Rank> &getIndices() const { return indices; }

	  /// Two iterators are equal when they share descriptor and offset.
	  bool operator==(const StridedMemrefIterator &other) const {
	    return other.offset == offset && other.descriptor == descriptor;
	  }

	  bool operator!=(const StridedMemrefIterator &other) const {
	    return !(*this == other);
	  }

	private:
	  /// Offset in the buffer. This can be derived from the indices and the
	  /// descriptor.
	  int64_t offset = 0;

	  /// Array of indices in the multi-dimensional memref.
	  std::array<int64_t, Rank> indices = {};

	  /// Descriptor for the strided memref.
	  StridedMemRefType<T, Rank> *descriptor;
	};

	/// Iterate over all elements in a 0-ranked strided memref.
	/// The position is a plain pointer to `descriptor.data + offset`.
	template <typename T>
	class StridedMemrefIterator<T, 0> {
	public:
	  using iterator_category = std::forward_iterator_tag;
	  using value_type = T;
	  using difference_type = std::ptrdiff_t;
	  using pointer = T *;
	  using reference = T &;

	  /// Creates an iterator pointing at `descriptor.data + offset`.
	  StridedMemrefIterator(StridedMemRefType<T, 0> &descriptor, int64_t offset = 0)
	      : elt(descriptor.data + offset) {}

	  /// Pre-increment: moves the element pointer forward by one.
	  StridedMemrefIterator<T, 0> &operator++() {
	    ++elt;
	    return *this;
	  }

	  /// Post-increment, required of forward iterators: advances this iterator
	  /// and returns a copy of its previous position.
	  StridedMemrefIterator<T, 0> operator++(int) {
	    StridedMemrefIterator<T, 0> previous(*this);
	    ++(*this);
	    return previous;
	  }

	  // Dereference operator; its `*` tokens had been lost from this line.
	  reference operator*() const { return *elt; }
	  pointer operator->() const { return elt; }

	  // There are no indices for a 0-ranked memref, but this API is provided for
	  // consistency with the general case.
	  const std::array<int64_t, 0> &getIndices() const {
	    // Since this is a 0-array of indices we can keep a single global const
	    // copy.
	    static const std::array<int64_t, 0> indices = {};
	    return indices;
	  }

	  bool operator==(const StridedMemrefIterator &other) const {
	    return other.elt == elt;
	  }

	  bool operator!=(const StridedMemrefIterator &other) const {
	    return !(*this == other);
	  }

	private:
	  /// Pointer to the single element in the zero-ranked memref.
	  T *elt;
	};

	//===----------------------------------------------------------------------===//
	// Codegen-compatible structure for UnrankedMemRef type.
	//===----------------------------------------------------------------------===//
	// Unranked MemRef: a runtime rank plus a type-erased pointer to a ranked
	// descriptor. No member uses `T`.
	// NOTE(review): presumably `T` only tags the element type for callers --
	// confirm.
	template <typename T>
	struct UnrankedMemRefType {
	// Number of dimensions of the pointed-to descriptor.
	int64_t rank;
	// Opaque descriptor pointer, read elsewhere in this file as a strided
	// descriptor.
	void *descriptor;
	};

	//===----------------------------------------------------------------------===//
	// DynamicMemRefType type.
	//===----------------------------------------------------------------------===//
	template <typename T>
	class DynamicMemRefIterator;

	// A reference to one of the StridedMemRef types.
	//
	// `sizes` and `strides` point into the descriptor this object was built from,
	// so that descriptor must stay alive while this object is in use.
	template <typename T>
	class DynamicMemRefType {
	public:
	  int64_t rank;
	  T *basePtr;
	  T *data;
	  int64_t offset;
	  const int64_t *sizes;
	  const int64_t *strides;

	  /// Wraps a rank-0 descriptor; there are no sizes or strides to reference.
	  explicit DynamicMemRefType(const StridedMemRefType<T, 0> &memRef)
	      : rank(0), basePtr(memRef.basePtr), data(memRef.data),
	        offset(memRef.offset), sizes(nullptr), strides(nullptr) {}
	  /// Wraps a rank-N descriptor, referencing its size and stride arrays.
	  template <int N>
	  explicit DynamicMemRefType(const StridedMemRefType<T, N> &memRef)
	      : rank(N), basePtr(memRef.basePtr), data(memRef.data),
	        offset(memRef.offset), sizes(memRef.sizes), strides(memRef.strides) {}
	  /// Wraps an unranked descriptor, taking the rank from `memRef.rank`.
	  explicit DynamicMemRefType(const ::UnrankedMemRefType<T> &memRef)
	      : rank(memRef.rank) {
	    // The opaque descriptor is read through the rank-1 layout. The pointer
	    // `*` tokens had been lost from this cast.
	    auto *desc = static_cast<StridedMemRefType<T, 1> *>(memRef.descriptor);
	    basePtr = desc->basePtr;
	    data = desc->data;
	    offset = desc->offset;
	    sizes = rank == 0 ? nullptr : desc->sizes;
	    // The strides are taken to start right after the `rank` sizes.
	    strides = sizes + rank;
	  }

	  /// Returns a reference to the element addressed by `indices`.
	  ///
	  /// `indices` is any range with a random-access `begin()` and a `size()`; it
	  /// must hold exactly `rank` in-bounds indices. Violations are only caught
	  /// by `assert`, so they go unchecked when NDEBUG is defined.
	  template <typename Range,
	            typename sfinae = decltype(std::declval<Range>().begin())>
	  T &operator[](Range &&indices) {
	    // Compare as signed values to avoid a signed/unsigned mismatch with rank.
	    assert(static_cast<int64_t>(indices.size()) == rank &&
	           "indices should match rank in memref subscript");
	    if (rank == 0)
	      return data[offset];

	    int64_t curOffset = offset;
	    for (int64_t dim = rank - 1; dim >= 0; --dim) {
	      int64_t currentIndex = *(indices.begin() + dim);
	      assert(currentIndex >= 0 && "Index underflow");
	      assert(currentIndex < sizes[dim] && "Index overflow");
	      curOffset += currentIndex * strides[dim];
	    }
	    return data[curOffset];
	  }

	  /// Returns an iterator to the first element, or `end()` when any dimension
	  /// has size zero. Without that check an empty memref would yield an
	  /// iterator that never compares equal to `end()`.
	  DynamicMemRefIterator<T> begin() {
	    for (int64_t dim = 0; dim < rank; ++dim)
	      if (sizes[dim] == 0)
	        return end();
	    return {*this, offset};
	  }
	  /// Returns the past-the-end iterator, encoded with offset -1.
	  DynamicMemRefIterator<T> end() { return {*this, -1}; }

	  // This operator[] is extremely slow and only for sugaring purposes.
	  /// Returns the view of rank `rank - 1` obtained by fixing the leading index
	  /// to `idx`. The result references the same size and stride storage.
	  DynamicMemRefType<T> operator[](int64_t idx) {
	    assert(rank > 0 && "can't make a subscript of a zero ranked array");

	    DynamicMemRefType<T> res(*this);
	    --res.rank;
	    res.offset += idx * res.strides[0];
	    // Skip the leading dimension in both arrays.
	    ++res.sizes;
	    ++res.strides;
	    return res;
	  }

	  // This operator* can be used in conjunction with the previous operator[] in
	  // order to access the underlying value in case of zero-ranked memref.
	  T &operator*() {
	    assert(rank == 0 && "not a zero-ranked memRef");
	    return data[offset];
	  }
	};

	/// Iterate over all elements in a dynamic memref.
	///
	/// The iterator tracks a multi-dimensional index together with the matching
	/// linear offset into `descriptor->data`. An offset of -1 marks the end.
	template <typename T>
	class DynamicMemRefIterator {
	public:
	  using iterator_category = std::forward_iterator_tag;
	  using value_type = T;
	  using difference_type = std::ptrdiff_t;
	  using pointer = T *;
	  using reference = T &;

	  /// Creates an iterator over `descriptor` at linear `offset`, with
	  /// `descriptor.rank` indices all set to zero. Only a pointer to
	  /// `descriptor` is stored.
	  DynamicMemRefIterator(DynamicMemRefType<T> &descriptor, int64_t offset = 0)
	      : offset(offset), descriptor(&descriptor) {
	    indices.resize(descriptor.rank, 0);
	  }

	  /// Pre-increment: steps to the next index, last dimension varying fastest.
	  /// Once every dimension has wrapped, the offset becomes -1 (end). A rank-0
	  /// memref has a single element, so one increment reaches the end.
	  DynamicMemRefIterator<T> &operator++() {
	    if (descriptor->rank == 0) {
	      offset = -1;
	      return *this;
	    }

	    // Keep the full 64-bit rank instead of narrowing it to int.
	    int64_t dim = descriptor->rank - 1;

	    // Reset every trailing dimension that already sits on its last index.
	    while (dim >= 0 && indices[dim] == (descriptor->sizes[dim] - 1)) {
	      offset -= indices[dim] * descriptor->strides[dim];
	      indices[dim] = 0;
	      --dim;
	    }

	    if (dim < 0) {
	      offset = -1;
	      return *this;
	    }

	    ++indices[dim];
	    offset += descriptor->strides[dim];
	    return *this;
	  }

	  /// Post-increment, required of forward iterators: advances this iterator
	  /// and returns a copy of its previous position.
	  DynamicMemRefIterator<T> operator++(int) {
	    DynamicMemRefIterator<T> previous(*this);
	    ++(*this);
	    return previous;
	  }

	  reference operator*() const { return descriptor->data[offset]; }
	  pointer operator->() const { return &descriptor->data[offset]; }

	  /// Returns the current multi-dimensional index.
	  const std::vector<int64_t> &getIndices() const { return indices; }

	  /// Two iterators are equal when they share descriptor and offset.
	  bool operator==(const DynamicMemRefIterator &other) const {
	    return other.offset == offset && other.descriptor == descriptor;
	  }

	  bool operator!=(const DynamicMemRefIterator &other) const {
	    return !(*this == other);
	  }

	private:
	  /// Offset in the buffer. This can be derived from the indices and the
	  /// descriptor.
	  int64_t offset = 0;

	  /// Array of indices in the multi-dimensional memref.
	  std::vector<int64_t> indices = {};

	  /// Descriptor for the dynamic memref.
	  DynamicMemRefType<T> *descriptor;
	};

	//===----------------------------------------------------------------------===//
	// Small runtime support library for memref.copy lowering during codegen.
	//===----------------------------------------------------------------------===//
	// Takes an element size and two unranked memref descriptors.
	// NOTE(review): presumably copies the contents of `src` into `dst`, with
	// `elemSize` in bytes -- confirm against the implementation.
	extern "C" MLIR_CRUNNERUTILS_EXPORT void
	memrefCopy(int64_t elemSize, ::UnrankedMemRefType<char> *src,
	::UnrankedMemRefType<char> *dst);

	//===----------------------------------------------------------------------===//
	// Small runtime support library for vector.print lowering during codegen.
	//===----------------------------------------------------------------------===//
	// One entry point per printable item. The implementations are not visible in
	// this header.
	// NOTE(review): presumably each writes its argument, or a fixed delimiter,
	// to standard output -- confirm.
	extern "C" MLIR_CRUNNERUTILS_EXPORT void printI64(int64_t i);
	extern "C" MLIR_CRUNNERUTILS_EXPORT void printU64(uint64_t u);
	extern "C" MLIR_CRUNNERUTILS_EXPORT void printF32(float f);
	extern "C" MLIR_CRUNNERUTILS_EXPORT void printF64(double d);
	extern "C" MLIR_CRUNNERUTILS_EXPORT void printString(char const *s);
	extern "C" MLIR_CRUNNERUTILS_EXPORT void printOpen();
	extern "C" MLIR_CRUNNERUTILS_EXPORT void printClose();
	extern "C" MLIR_CRUNNERUTILS_EXPORT void printComma();
	extern "C" MLIR_CRUNNERUTILS_EXPORT void printNewline();

	//===----------------------------------------------------------------------===//
	// Small runtime support library for timing execution and printing GFLOPS
	//===----------------------------------------------------------------------===//
	// NOTE(review): the unit of the value returned by rtclock is not stated here
	// (presumably seconds) -- confirm.
	extern "C" MLIR_CRUNNERUTILS_EXPORT void printFlops(double flops);
	extern "C" MLIR_CRUNNERUTILS_EXPORT double rtclock();

	//===----------------------------------------------------------------------===//
	// Runtime support library for random number generation.
	//===----------------------------------------------------------------------===//
	// Uses a seed to initialize a random generator and returns the generator.
	// The generator is handed back as an opaque pointer.
	extern "C" MLIR_CRUNNERUTILS_EXPORT void *rtsrand(uint64_t s);
	// Uses a random number generator g and returns a random number
	// in the range of [0, m).
	extern "C" MLIR_CRUNNERUTILS_EXPORT uint64_t rtrand(void *g, uint64_t m);
	// Deletes the random number generator.
	extern "C" MLIR_CRUNNERUTILS_EXPORT void rtdrand(void *g);
	// Uses a random number generator g and std::shuffle to modify mref
	// in place. Memref mref will be a permutation of all numbers
	// in the range of [0, size of mref).
	// Both parameters are pointers, matching the other entry points here; their
	// `*` tokens had been lost, which left an invalid `void g` parameter.
	extern "C" MLIR_CRUNNERUTILS_EXPORT void
	_mlir_ciface_shuffle(StridedMemRefType<uint64_t, 1> *mref, void *g);

	//===----------------------------------------------------------------------===//
	// Runtime support library to allow the use of std::sort in MLIR program.
	//===----------------------------------------------------------------------===//
	// Each entry point takes an element count n and a rank-1 memref descriptor.
	// NOTE(review): presumably the first n elements of vref are sorted in place
	// in ascending order -- confirm against the implementation.
	extern "C" MLIR_CRUNNERUTILS_EXPORT void
	_mlir_ciface_stdSortI64(uint64_t n, StridedMemRefType<int64_t, 1> *vref);
	extern "C" MLIR_CRUNNERUTILS_EXPORT void
	_mlir_ciface_stdSortF64(uint64_t n, StridedMemRefType<double, 1> *vref);
	extern "C" MLIR_CRUNNERUTILS_EXPORT void
	_mlir_ciface_stdSortF32(uint64_t n, StridedMemRefType<float, 1> *vref);
	#endif // MLIR_EXECUTIONENGINE_CRUNNERUTILS_H