[SingleSource/Vectorizer] Add runtime checks tests for nested loops
This patch adds tests for nested loops like this:
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
a[(i * (N + 1)) + j] += b[(i * N) + j];
}
}
where we generate runtime checks for the inner loop that do not
currently get hoisted above the outer loop.
Differential Revision: https://reviews.llvm.org/D154719
diff --git a/SingleSource/UnitTests/Vectorizer/common.h b/SingleSource/UnitTests/Vectorizer/common.h
index cd02176..d8cd421 100644
--- a/SingleSource/UnitTests/Vectorizer/common.h
+++ b/SingleSource/UnitTests/Vectorizer/common.h
@@ -17,6 +17,42 @@
_Pragma("clang loop vectorize(enable)") Loop \
};
+#define DEFINE_NESTED_SCALAR_AND_VECTOR_FN4(InnerLoopCode) \
+ auto ScalarFn = [](auto *A, auto *B, unsigned OuterTC, unsigned InnerTC) { \
+ for (unsigned long i = 0; i < OuterTC; i++) { \
+ _Pragma("clang loop vectorize(disable) interleave_count(1)") \
+ for (unsigned long j = 0; j < InnerTC; j++) { \
+ InnerLoopCode \
+ } \
+ } \
+ }; \
+ auto VectorFn = [](auto *A, auto *B, unsigned OuterTC, unsigned InnerTC) { \
+ for (unsigned long i = 0; i < OuterTC; i++) { \
+ _Pragma("clang loop vectorize(enable)") \
+ for (unsigned long j = 0; j < InnerTC; j++) { \
+ InnerLoopCode \
+ } \
+ } \
+ };
+
+#define DEFINE_NESTED_SCALAR_AND_VECTOR_FN5(InnerLoopCode) \
+ auto ScalarFn = [](auto *A, auto *B, unsigned OuterTC, unsigned InnerTC) { \
+ for (long i = OuterTC - 1; i >= 0; i--) { \
+ _Pragma("clang loop vectorize(disable) interleave_count(1)") \
+ for (unsigned long j = 0; j < InnerTC; j++) { \
+ InnerLoopCode \
+ } \
+ } \
+ }; \
+ auto VectorFn = [](auto *A, auto *B, unsigned OuterTC, unsigned InnerTC) { \
+ for (long i = OuterTC - 1; i >= 0; i--) { \
+ _Pragma("clang loop vectorize(enable)") \
+ for (unsigned long j = 0; j < InnerTC; j++) { \
+ InnerLoopCode \
+ } \
+ } \
+ };
+
static std::mt19937 rng;
// Initialize arrays A with random numbers.
diff --git a/SingleSource/UnitTests/Vectorizer/runtime-checks.cpp b/SingleSource/UnitTests/Vectorizer/runtime-checks.cpp
index 6ca3b39..0cbb089 100644
--- a/SingleSource/UnitTests/Vectorizer/runtime-checks.cpp
+++ b/SingleSource/UnitTests/Vectorizer/runtime-checks.cpp
@@ -7,6 +7,7 @@
#include "common.h"
+
// Tests for memory runtime checks generated by the vectorizer. Runs scalar and
// vectorized versions of a loop requiring runtime checks on the same inputs
// with pointers to the same buffer using various offsets between reads and
@@ -108,6 +109,51 @@
CheckWithOffsetSecond(i);
}
+
+
+template <typename Ty>
+using Fn4Ty = std::function<void(Ty *, Ty *, unsigned, unsigned)>;
+template <typename Ty>
+static void checkOverlappingMemoryTwoRuntimeChecksNested(Fn4Ty<Ty> ScalarFn,
+ Fn4Ty<Ty> VectorFn,
+ const int OuterTC,
+ const int InnerTC,
+ const char *Name) {
+ std::cout << "Checking " << Name << "\n";
+
+ // Make sure we have enough extra elements so we can be liberal with offsets.
+ const unsigned NumArrayElements = (InnerTC * (OuterTC + 1)) * 8;
+ std::unique_ptr<Ty[]> Input1(new Ty[NumArrayElements]);
+ std::unique_ptr<Ty[]> Reference(new Ty[NumArrayElements]);
+ std::unique_ptr<Ty[]> ToCheck(new Ty[NumArrayElements]);
+
+ auto CheckWithOffsetSecond = [&](int Offset) {
+ init_data(Input1, NumArrayElements);
+ for (unsigned i = 0; i < NumArrayElements; i++) {
+ Reference[i] = Input1[i];
+ ToCheck[i] = Input1[i];
+ }
+
+ // Run scalar function to generate reference output.
+ Ty *ReferenceStart = &Reference[NumArrayElements / 2];
+ ScalarFn(ReferenceStart + Offset, ReferenceStart, OuterTC, InnerTC);
+
+ // Run vector function to generate output to check.
+ Ty *StartPtr = &ToCheck[NumArrayElements / 2];
+ callThroughOptnone(VectorFn, StartPtr + Offset, StartPtr, OuterTC, InnerTC);
+
+ // Compare scalar and vector output.
+ check(Reference, ToCheck, NumArrayElements, Offset);
+ };
+
+ // With a nested loop, sometimes the runtime checks will fail and sometimes
+ // succeed. For example, with large offsets you'd expect for the first and
+ // last one or two executions of the inner loop there is no overlap.
+ for (int i = -(2 * (InnerTC + 1)); i <= (2 * (InnerTC + 1)); i++)
+ CheckWithOffsetSecond(i);
+}
+
+
int main(void) {
rng = std::mt19937(15);
@@ -261,5 +307,73 @@
ScalarFn, VectorFn, "1 read, 2 writes, simple indices, uint64_t");
}
+ {
+ DEFINE_NESTED_SCALAR_AND_VECTOR_FN4(
+ auto X = B[(i * OuterTC) + j];
+ A[(i * (OuterTC + 1)) + j] = X;
+ );
+ checkOverlappingMemoryTwoRuntimeChecksNested<uint8_t>(
+ ScalarFn, VectorFn, 100, 100, "1 read, 1 write, nested loop (matching trip counts), uint8_t");
+ checkOverlappingMemoryTwoRuntimeChecksNested<uint32_t>(
+ ScalarFn, VectorFn, 100, 100, "1 read, 1 write, nested loop (matching trip counts), uint32_t");
+ checkOverlappingMemoryTwoRuntimeChecksNested<uint64_t>(
+ ScalarFn, VectorFn, 100, 100, "1 read, 1 write, nested loop (matching trip counts), uint64_t");
+
+ checkOverlappingMemoryTwoRuntimeChecksNested<uint8_t>(
+ ScalarFn, VectorFn, 100, 50, "1 read, 1 write, nested loop (different trip counts), uint8_t");
+ checkOverlappingMemoryTwoRuntimeChecksNested<uint32_t>(
+ ScalarFn, VectorFn, 100, 50, "1 read, 1 write, nested loop (different trip counts), uint32_t");
+ checkOverlappingMemoryTwoRuntimeChecksNested<uint64_t>(
+ ScalarFn, VectorFn, 100, 50, "1 read, 1 write, nested loop (different trip counts), uint64_t");
+ }
+
+ {
+ DEFINE_NESTED_SCALAR_AND_VECTOR_FN4(
+ auto X = B[(i * OuterTC) + j];
+ A[(i * (OuterTC + 1)) + j] += X;
+ );
+
+ checkOverlappingMemoryTwoRuntimeChecksNested<uint8_t>(
+ ScalarFn, VectorFn, 100, 100, "2 reads, 1 write, nested loop (matching trip counts), uint8_t");
+ checkOverlappingMemoryTwoRuntimeChecksNested<uint32_t>(
+ ScalarFn, VectorFn, 100, 100, "2 reads, 1 write, nested loop (matching trip counts), uint32_t");
+ checkOverlappingMemoryTwoRuntimeChecksNested<uint64_t>(
+ ScalarFn, VectorFn, 100, 100, "2 reads, 1 write, nested loop (matching trip counts), uint64_t");
+
+ checkOverlappingMemoryTwoRuntimeChecksNested<uint8_t>(
+ ScalarFn, VectorFn, 100, 50, "2 reads, 1 write, nested loop (different trip counts), uint8_t");
+ checkOverlappingMemoryTwoRuntimeChecksNested<uint32_t>(
+ ScalarFn, VectorFn, 100, 50, "2 reads, 1 write, nested loop (different trip counts), uint32_t");
+ checkOverlappingMemoryTwoRuntimeChecksNested<uint64_t>(
+ ScalarFn, VectorFn, 100, 50, "2 reads, 1 write, nested loop (different trip counts), uint64_t");
+ }
+
+ {
+ DEFINE_NESTED_SCALAR_AND_VECTOR_FN5(
+ auto X = B[(i * OuterTC) + j];
+ A[(i * (OuterTC + 1)) + j] = X;
+ );
+ checkOverlappingMemoryTwoRuntimeChecksNested<uint8_t>(
+ ScalarFn, VectorFn, 100, 100, "1 read, 1 write, nested loop (decreasing outer iv, matching trip counts), uint8_t");
+ checkOverlappingMemoryTwoRuntimeChecksNested<uint32_t>(
+ ScalarFn, VectorFn, 100, 100, "1 read, 1 write, nested loop (decreasing outer iv, matching trip counts), uint32_t");
+ checkOverlappingMemoryTwoRuntimeChecksNested<uint64_t>(
+ ScalarFn, VectorFn, 100, 100, "1 read, 1 write, nested loop (decreasing outer iv, matching trip counts), uint64_t");
+ }
+
+ {
+ DEFINE_NESTED_SCALAR_AND_VECTOR_FN5(
+ auto X = B[(i * OuterTC) + j];
+ A[(i * (OuterTC + 1)) + j] += X;
+ );
+
+ checkOverlappingMemoryTwoRuntimeChecksNested<uint8_t>(
+ ScalarFn, VectorFn, 100, 100, "2 reads, 1 write, nested loop (decreasing outer iv, matching trip counts), uint8_t");
+ checkOverlappingMemoryTwoRuntimeChecksNested<uint32_t>(
+ ScalarFn, VectorFn, 100, 100, "2 reads, 1 write, nested loop (decreasing outer iv, matching trip counts), uint32_t");
+ checkOverlappingMemoryTwoRuntimeChecksNested<uint64_t>(
+ ScalarFn, VectorFn, 100, 100, "2 reads, 1 write, nested loop (decreasing outer iv, matching trip counts), uint64_t");
+ }
+
return 0;
}
diff --git a/SingleSource/UnitTests/Vectorizer/runtime-checks.reference_output b/SingleSource/UnitTests/Vectorizer/runtime-checks.reference_output
index 609e4f3..35793ca 100644
--- a/SingleSource/UnitTests/Vectorizer/runtime-checks.reference_output
+++ b/SingleSource/UnitTests/Vectorizer/runtime-checks.reference_output
@@ -28,4 +28,22 @@
Checking 1 read, 2 writes, simple indices, uint8_t
Checking 1 read, 2 writes, simple indices, uint32_t
Checking 1 read, 2 writes, simple indices, uint64_t
+Checking 1 read, 1 write, nested loop (matching trip counts), uint8_t
+Checking 1 read, 1 write, nested loop (matching trip counts), uint32_t
+Checking 1 read, 1 write, nested loop (matching trip counts), uint64_t
+Checking 1 read, 1 write, nested loop (different trip counts), uint8_t
+Checking 1 read, 1 write, nested loop (different trip counts), uint32_t
+Checking 1 read, 1 write, nested loop (different trip counts), uint64_t
+Checking 2 reads, 1 write, nested loop (matching trip counts), uint8_t
+Checking 2 reads, 1 write, nested loop (matching trip counts), uint32_t
+Checking 2 reads, 1 write, nested loop (matching trip counts), uint64_t
+Checking 2 reads, 1 write, nested loop (different trip counts), uint8_t
+Checking 2 reads, 1 write, nested loop (different trip counts), uint32_t
+Checking 2 reads, 1 write, nested loop (different trip counts), uint64_t
+Checking 1 read, 1 write, nested loop (decreasing outer iv, matching trip counts), uint8_t
+Checking 1 read, 1 write, nested loop (decreasing outer iv, matching trip counts), uint32_t
+Checking 1 read, 1 write, nested loop (decreasing outer iv, matching trip counts), uint64_t
+Checking 2 reads, 1 write, nested loop (decreasing outer iv, matching trip counts), uint8_t
+Checking 2 reads, 1 write, nested loop (decreasing outer iv, matching trip counts), uint32_t
+Checking 2 reads, 1 write, nested loop (decreasing outer iv, matching trip counts), uint64_t
exit 0