[SingleSource/Vectorizer] Add runtime checks tests for nested loops

This patch adds tests for nested loops like this:

  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      a[(i * (N + 1)) + j] += b[(i * N) + j];
    }
  }

where we generate runtime checks for the inner loop that do not
currently get hoisted above the outer loop.

Differential Revision: https://reviews.llvm.org/D154719
diff --git a/SingleSource/UnitTests/Vectorizer/common.h b/SingleSource/UnitTests/Vectorizer/common.h
index cd02176..d8cd421 100644
--- a/SingleSource/UnitTests/Vectorizer/common.h
+++ b/SingleSource/UnitTests/Vectorizer/common.h
@@ -17,6 +17,42 @@
     _Pragma("clang loop vectorize(enable)") Loop                               \
   };
 
+#define DEFINE_NESTED_SCALAR_AND_VECTOR_FN4(InnerLoopCode)                     \
+  auto ScalarFn = [](auto *A, auto *B, unsigned OuterTC, unsigned InnerTC) {   \
+    for (unsigned long i = 0; i < OuterTC; i++) {                              \
+      _Pragma("clang loop vectorize(disable) interleave_count(1)")             \
+      for (unsigned long j = 0; j < InnerTC; j++) {                            \
+        InnerLoopCode                                                          \
+      }                                                                        \
+    }                                                                          \
+  };                                                                           \
+  auto VectorFn = [](auto *A, auto *B, unsigned OuterTC, unsigned InnerTC) {   \
+    for (unsigned long i = 0; i < OuterTC; i++) {                              \
+      _Pragma("clang loop vectorize(enable)")                                  \
+      for (unsigned long j = 0; j < InnerTC; j++) {                            \
+        InnerLoopCode                                                          \
+      }                                                                        \
+    }                                                                          \
+  };
+
+#define DEFINE_NESTED_SCALAR_AND_VECTOR_FN5(InnerLoopCode)                            \
+  auto ScalarFn = [](auto *A, auto *B, unsigned OuterTC, unsigned InnerTC) {   \
+    for (long i = OuterTC - 1; i >= 0; i--) {                         \
+      _Pragma("clang loop vectorize(disable) interleave_count(1)")             \
+      for (unsigned long j = 0; j < InnerTC; j++) {                            \
+        InnerLoopCode                                                          \
+      }                                                                        \
+    }                                                                          \
+  };                                                                           \
+  auto VectorFn = [](auto *A, auto *B, unsigned OuterTC, unsigned InnerTC) {   \
+    for (long i = OuterTC - 1; i >= 0; i--) {                         \
+      _Pragma("clang loop vectorize(enable)")                                  \
+      for (unsigned long j = 0; j < InnerTC; j++) {                            \
+        InnerLoopCode                                                          \
+      }                                                                        \
+    }                                                                          \
+  };
+
 static std::mt19937 rng;
 
 // Initialize arrays A with random numbers.
diff --git a/SingleSource/UnitTests/Vectorizer/runtime-checks.cpp b/SingleSource/UnitTests/Vectorizer/runtime-checks.cpp
index 6ca3b39..0cbb089 100644
--- a/SingleSource/UnitTests/Vectorizer/runtime-checks.cpp
+++ b/SingleSource/UnitTests/Vectorizer/runtime-checks.cpp
@@ -7,6 +7,7 @@
 
 #include "common.h"
 
+
 // Tests for memory runtime checks generated by the vectorizer. Runs scalar and
 // vectorized versions of a loop requiring runtime checks on the same inputs
 // with pointers to the same buffer using various offsets between reads and
@@ -108,6 +109,51 @@
     CheckWithOffsetSecond(i);
 }
 
+
+
+template <typename Ty>
+using Fn4Ty = std::function<void(Ty *, Ty *, unsigned, unsigned)>;
+template <typename Ty>
+static void checkOverlappingMemoryTwoRuntimeChecksNested(Fn4Ty<Ty> ScalarFn,
+                                                         Fn4Ty<Ty> VectorFn,
+                                                         const int OuterTC,
+                                                         const int InnerTC,
+                                                         const char *Name) {
+  std::cout << "Checking " << Name << "\n";
+
+  // Make sure we have enough extra elements so we can be liberal with offsets.
+  const unsigned NumArrayElements = (InnerTC * (OuterTC + 1)) * 8;
+  std::unique_ptr<Ty[]> Input1(new Ty[NumArrayElements]);
+  std::unique_ptr<Ty[]> Reference(new Ty[NumArrayElements]);
+  std::unique_ptr<Ty[]> ToCheck(new Ty[NumArrayElements]);
+
+  auto CheckWithOffsetSecond = [&](int Offset) {
+    init_data(Input1, NumArrayElements);
+    for (unsigned i = 0; i < NumArrayElements; i++) {
+      Reference[i] = Input1[i];
+      ToCheck[i] = Input1[i];
+    }
+
+    // Run scalar function to generate reference output.
+    Ty *ReferenceStart = &Reference[NumArrayElements / 2];
+    ScalarFn(ReferenceStart + Offset, ReferenceStart, OuterTC, InnerTC);
+
+    // Run vector function to generate output to check.
+    Ty *StartPtr = &ToCheck[NumArrayElements / 2];
+    callThroughOptnone(VectorFn, StartPtr + Offset, StartPtr, OuterTC, InnerTC);
+
+    // Compare scalar and vector output.
+    check(Reference, ToCheck, NumArrayElements, Offset);
+  };
+
+  // With a nested loop, sometimes the runtime checks will fail and sometimes
+  // succeed. For example, with large offsets you'd expect for the first and
+  // last one or two executions of the inner loop there is no overlap.
+  for (int i = -(2 * (InnerTC + 1)); i <= (2 * (InnerTC + 1)); i++)
+    CheckWithOffsetSecond(i);
+}
+
+
 int main(void) {
   rng = std::mt19937(15);
 
@@ -261,5 +307,73 @@
         ScalarFn, VectorFn, "1 read, 2 writes, simple indices, uint64_t");
   }
 
+  {
+    DEFINE_NESTED_SCALAR_AND_VECTOR_FN4(
+        auto X = B[(i * OuterTC) + j];
+        A[(i * (OuterTC + 1)) + j] = X;
+    );
+    checkOverlappingMemoryTwoRuntimeChecksNested<uint8_t>(
+        ScalarFn, VectorFn, 100, 100, "1 read, 1 write, nested loop (matching trip counts), uint8_t");
+    checkOverlappingMemoryTwoRuntimeChecksNested<uint32_t>(
+        ScalarFn, VectorFn, 100, 100, "1 read, 1 write, nested loop (matching trip counts), uint32_t");
+    checkOverlappingMemoryTwoRuntimeChecksNested<uint64_t>(
+        ScalarFn, VectorFn, 100, 100, "1 read, 1 write, nested loop (matching trip counts), uint64_t");
+
+    checkOverlappingMemoryTwoRuntimeChecksNested<uint8_t>(
+        ScalarFn, VectorFn, 100, 50, "1 read, 1 write, nested loop (different trip counts), uint8_t");
+    checkOverlappingMemoryTwoRuntimeChecksNested<uint32_t>(
+        ScalarFn, VectorFn, 100, 50, "1 read, 1 write, nested loop (different trip counts), uint32_t");
+    checkOverlappingMemoryTwoRuntimeChecksNested<uint64_t>(
+        ScalarFn, VectorFn, 100, 50, "1 read, 1 write, nested loop (different trip counts), uint64_t");
+  }
+
+  {
+    DEFINE_NESTED_SCALAR_AND_VECTOR_FN4(
+        auto X = B[(i * OuterTC) + j];
+        A[(i * (OuterTC + 1)) + j] += X;
+    );
+
+    checkOverlappingMemoryTwoRuntimeChecksNested<uint8_t>(
+        ScalarFn, VectorFn, 100, 100, "2 reads, 1 write, nested loop (matching trip counts), uint8_t");
+    checkOverlappingMemoryTwoRuntimeChecksNested<uint32_t>(
+        ScalarFn, VectorFn, 100, 100, "2 reads, 1 write, nested loop (matching trip counts), uint32_t");
+    checkOverlappingMemoryTwoRuntimeChecksNested<uint64_t>(
+        ScalarFn, VectorFn, 100, 100, "2 reads, 1 write, nested loop (matching trip counts), uint64_t");
+
+    checkOverlappingMemoryTwoRuntimeChecksNested<uint8_t>(
+        ScalarFn, VectorFn, 100, 50, "2 reads, 1 write, nested loop (different trip counts), uint8_t");
+    checkOverlappingMemoryTwoRuntimeChecksNested<uint32_t>(
+        ScalarFn, VectorFn, 100, 50, "2 reads, 1 write, nested loop (different trip counts), uint32_t");
+    checkOverlappingMemoryTwoRuntimeChecksNested<uint64_t>(
+        ScalarFn, VectorFn, 100, 50, "2 reads, 1 write, nested loop (different trip counts), uint64_t");
+  }
+
+  {
+    DEFINE_NESTED_SCALAR_AND_VECTOR_FN5(
+        auto X = B[(i * OuterTC) + j];
+        A[(i * (OuterTC + 1)) + j] = X;
+    );
+    checkOverlappingMemoryTwoRuntimeChecksNested<uint8_t>(
+        ScalarFn, VectorFn, 100, 100, "1 read, 1 write, nested loop (decreasing outer iv, matching trip counts), uint8_t");
+    checkOverlappingMemoryTwoRuntimeChecksNested<uint32_t>(
+        ScalarFn, VectorFn, 100, 100, "1 read, 1 write, nested loop (decreasing outer iv, matching trip counts), uint32_t");
+    checkOverlappingMemoryTwoRuntimeChecksNested<uint64_t>(
+        ScalarFn, VectorFn, 100, 100, "1 read, 1 write, nested loop (decreasing outer iv, matching trip counts), uint64_t");
+  }
+
+  {
+    DEFINE_NESTED_SCALAR_AND_VECTOR_FN5(
+        auto X = B[(i * OuterTC) + j];
+        A[(i * (OuterTC + 1)) + j] += X;
+    );
+
+    checkOverlappingMemoryTwoRuntimeChecksNested<uint8_t>(
+        ScalarFn, VectorFn, 100, 100, "2 reads, 1 write, nested loop (decreasing outer iv, matching trip counts), uint8_t");
+    checkOverlappingMemoryTwoRuntimeChecksNested<uint32_t>(
+        ScalarFn, VectorFn, 100, 100, "2 reads, 1 write, nested loop (decreasing outer iv, matching trip counts), uint32_t");
+    checkOverlappingMemoryTwoRuntimeChecksNested<uint64_t>(
+        ScalarFn, VectorFn, 100, 100, "2 reads, 1 write, nested loop (decreasing outer iv, matching trip counts), uint64_t");
+  }
+
   return 0;
 }
diff --git a/SingleSource/UnitTests/Vectorizer/runtime-checks.reference_output b/SingleSource/UnitTests/Vectorizer/runtime-checks.reference_output
index 609e4f3..35793ca 100644
--- a/SingleSource/UnitTests/Vectorizer/runtime-checks.reference_output
+++ b/SingleSource/UnitTests/Vectorizer/runtime-checks.reference_output
@@ -28,4 +28,22 @@
 Checking 1 read, 2 writes, simple indices, uint8_t
 Checking 1 read, 2 writes, simple indices, uint32_t
 Checking 1 read, 2 writes, simple indices, uint64_t
+Checking 1 read, 1 write, nested loop (matching trip counts), uint8_t
+Checking 1 read, 1 write, nested loop (matching trip counts), uint32_t
+Checking 1 read, 1 write, nested loop (matching trip counts), uint64_t
+Checking 1 read, 1 write, nested loop (different trip counts), uint8_t
+Checking 1 read, 1 write, nested loop (different trip counts), uint32_t
+Checking 1 read, 1 write, nested loop (different trip counts), uint64_t
+Checking 2 reads, 1 write, nested loop (matching trip counts), uint8_t
+Checking 2 reads, 1 write, nested loop (matching trip counts), uint32_t
+Checking 2 reads, 1 write, nested loop (matching trip counts), uint64_t
+Checking 2 reads, 1 write, nested loop (different trip counts), uint8_t
+Checking 2 reads, 1 write, nested loop (different trip counts), uint32_t
+Checking 2 reads, 1 write, nested loop (different trip counts), uint64_t
+Checking 1 read, 1 write, nested loop (decreasing outer iv, matching trip counts), uint8_t
+Checking 1 read, 1 write, nested loop (decreasing outer iv, matching trip counts), uint32_t
+Checking 1 read, 1 write, nested loop (decreasing outer iv, matching trip counts), uint64_t
+Checking 2 reads, 1 write, nested loop (decreasing outer iv, matching trip counts), uint8_t
+Checking 2 reads, 1 write, nested loop (decreasing outer iv, matching trip counts), uint32_t
+Checking 2 reads, 1 write, nested loop (decreasing outer iv, matching trip counts), uint64_t
 exit 0