[LAA] Use MaxStride instead of CommonStride to calculate MaxVF (#98142)
We currently bail out of the MaxVF calculation when the source and sink strides differ, deferring to runtime checks that are not yet implemented for this case. Instead, use MaxStride (the maximum of the two strides): it over-estimates the distance needed per vectorized iteration, yielding a conservative but safe upper bound on MaxVF.
This handles cases like the following:
```c
#define LEN 256 * 256
float a[LEN];
void gather() {
for (int i = 0; i < LEN - 1024 - 255; i++) {
#pragma clang loop interleave(disable)
#pragma clang loop unroll(disable)
for (int j = 0; j < 256; j++)
a[i + j + 1024] += a[j * 4 + i];
}
}
```
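For the loop above, the numbers work out as follows. This is a minimal standalone sketch of the arithmetic, not the LAA code itself; the byte values are derived from the example, and MinNumIter = 2 assumes no vectorization factor or unroll count is forced.
```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Per-iteration strides of the two accesses, in bytes:
     the load a[j*4 + i] advances 16 bytes per j, the store
     a[i + j + 1024] advances 4. */
  uint64_t StrideLoad = 16, StrideStore = 4, TypeByteSize = 4;
  uint64_t MaxStride = StrideLoad > StrideStore ? StrideLoad : StrideStore;

  /* The dependence distance is 4*(i+j+1024) - 4*(4j+i) = 4096 - 12*j
     bytes, which is not constant; LAA uses the signed minimum over
     j in [0, 255], i.e. 4096 - 12*255 = 1036 bytes. */
  uint64_t MinDistance = 4096 - 12 * 255;

  /* Smallest number of iterations a vectorized body would execute. */
  uint64_t MinNumIter = 2;

  /* The conservative check: the worst-case (largest) stride must fit
     MinNumIter - 1 iterations plus one element inside the distance. */
  uint64_t MinDistanceNeeded = MaxStride * (MinNumIter - 1) + TypeByteSize;
  if (MinDistanceNeeded > MinDistance)
    return 1; /* the dependence would be treated as unsafe */

  /* Conservative bound: 1036 / 16 = 64 elements, i.e. 64 * 32 = 2048
     bits, matching the width reported in the test below. */
  uint64_t MaxVF = MinDistance / MaxStride;
  printf("MaxVF = %" PRIu64 " elements (%" PRIu64 " bits)\n", MaxVF,
         MaxVF * TypeByteSize * 8);
  return 0;
}
```
With the common-stride computation this case bailed out entirely; dividing by MaxStride instead may under-estimate the achievable VF, but never over-estimates it.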
---------
Co-authored-by: Florian Hahn <flo@fhahn.com>
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 7ec9bdb..f222a99 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2148,10 +2148,6 @@
"different type sizes\n");
return Dependence::Unknown;
}
-
- if (!CommonStride)
- return Dependence::Unknown;
-
// Bail out early if passed-in parameters make vectorization not feasible.
unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
VectorizerParams::VectorizationFactor : 1);
@@ -2162,7 +2158,7 @@
// It's not vectorizable if the distance is smaller than the minimum distance
// needed for a vectorized/unrolled version. Vectorizing one iteration in
- // front needs CommonStride. Vectorizing the last iteration needs TypeByteSize
+ // front needs MaxStride. Vectorizing the last iteration needs TypeByteSize.
// (No need to add the last gap distance).
//
// E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
@@ -2186,11 +2182,14 @@
// If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4),
// the minimum distance needed is 28, which is greater than distance. It is
// not safe to do vectorization.
+ //
+ // We use MaxStride (maximum of src and sink strides) to get a conservative
+ // lower bound on the MinDistanceNeeded in case of different strides.
// We know that Dist is positive, but it may not be constant. Use the signed
// minimum for computations below, as this ensures we compute the closest
// possible dependence distance.
- uint64_t MinDistanceNeeded = *CommonStride * (MinNumIter - 1) + TypeByteSize;
+ uint64_t MinDistanceNeeded = MaxStride * (MinNumIter - 1) + TypeByteSize;
if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
if (!ConstDist) {
// For non-constant distances, we checked the lower bound of the
@@ -2236,7 +2235,7 @@
couldPreventStoreLoadForward(MinDistance, TypeByteSize, *CommonStride))
return Dependence::BackwardVectorizableButPreventsForwarding;
- uint64_t MaxVF = MinDepDistBytes / *CommonStride;
+ uint64_t MaxVF = MinDepDistBytes / MaxStride;
LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
<< " with max VF = " << MaxVF << '\n');
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
new file mode 100644
index 0000000..c5f31de
--- /dev/null
+++ b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll
@@ -0,0 +1,156 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<access-info>" %s 2>&1 | FileCheck %s
+
+@a = dso_local local_unnamed_addr global [65536 x float] zeroinitializer, align 16
+
+; Generated from the following C code:
+; #define LEN 256 * 256
+; float a[LEN];
+;
+; void different_strides() {
+; for (int i = 0; i < LEN - 1024 - 255; i++) {
+; #pragma clang loop interleave(disable)
+; #pragma clang loop unroll(disable)
+; for (int j = 0; j < 256; j++)
+; a[i + j + 1024] += a[j * 4 + i];
+; }
+; }
+; The load and store have different strides (16 and 4 bytes, respectively), but the store
+; is always a safe positive distance away from the load, so the dependence is BackwardVectorizable.
+define void @different_strides_backward_vectorizable() {
+; CHECK-LABEL: 'different_strides_backward_vectorizable'
+; CHECK-NEXT: inner.body:
+; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 2048 bits
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: BackwardVectorizable:
+; CHECK-NEXT: %3 = load float, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Forward:
+; CHECK-NEXT: %5 = load float, ptr %arrayidx8, align 4 ->
+; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+; CHECK-NEXT: outer.header:
+; CHECK-NEXT: Report: loop is not the innermost loop
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ br label %outer.header
+
+outer.header:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %outer.latch ]
+ %0 = add nuw nsw i64 %i, 1024
+ br label %inner.body
+
+inner.body:
+ %j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.body ]
+ %1 = shl nuw nsw i64 %j, 2
+ %2 = add nuw nsw i64 %1, %i
+ %arrayidx = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %2
+ %3 = load float, ptr %arrayidx, align 4
+ %4 = add nuw nsw i64 %0, %j
+ %arrayidx8 = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %4
+ %5 = load float, ptr %arrayidx8, align 4
+ %add9 = fadd fast float %5, %3
+ store float %add9, ptr %arrayidx8, align 4
+ %j.next = add nuw nsw i64 %j, 1
+ %exitcond.not = icmp eq i64 %j.next, 256
+ br i1 %exitcond.not, label %outer.latch, label %inner.body
+
+outer.latch:
+ %i.next = add nuw nsw i64 %i, 1
+ %outerexitcond.not = icmp eq i64 %i.next, 64257
+ br i1 %outerexitcond.not, label %exit, label %outer.header
+
+exit:
+ ret void
+}
+
+
+; Generated from the following C code:
+; void different_stride_and_not_vectorizable(){
+; for(int i = 0; i < LEN2; i++){
+; for(int j = 0 ; j < LEN; j++){
+; a[i + j + LEN] += a[i + 4*j];
+; }
+; }
+; }
+; The load and store have different strides, but they are not a safe distance
+; apart, so the loop is not safe to vectorize.
+define void @different_stride_and_not_vectorizable() {
+; CHECK-LABEL: 'different_stride_and_not_vectorizable'
+; CHECK-NEXT: inner.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Unknown data dependence.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Unknown:
+; CHECK-NEXT: %3 = load float, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Forward:
+; CHECK-NEXT: %5 = load float, ptr %arrayidx8, align 4 ->
+; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+; CHECK-NEXT: outer.header:
+; CHECK-NEXT: Report: loop is not the innermost loop
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ br label %outer.header
+
+outer.header:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %outer.latch ]
+ %0 = add nuw nsw i64 %i, 256
+ br label %inner.body
+
+inner.body:
+ %j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.body ]
+ %1 = shl nuw nsw i64 %j, 2
+ %2 = add nuw nsw i64 %1, %i
+ %arrayidx = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %2
+ %3 = load float, ptr %arrayidx, align 4
+ %4 = add nuw nsw i64 %0, %j
+ %arrayidx8 = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %4
+ %5 = load float, ptr %arrayidx8, align 4
+ %add9 = fadd fast float %5, %3
+ store float %add9, ptr %arrayidx8, align 4
+ %j.next = add nuw nsw i64 %j, 1
+ %exitcond.not = icmp eq i64 %j.next, 256
+ br i1 %exitcond.not, label %outer.latch, label %inner.body
+
+outer.latch:
+ %i.next = add nuw nsw i64 %i, 1
+ %exitcond29.not = icmp eq i64 %i.next, 65536
+ br i1 %exitcond29.not, label %exit, label %outer.header
+
+exit:
+ ret void
+}