blob: 61b586bdfcb6e87e2a83689657346dca86c1132f [file] [log] [blame] [edit]
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "preds = %middle.block" --version 6
; RUN: opt -p loop-vectorize %s -S | FileCheck %s
target triple = "arm64-apple-macosx"
; In this test %sum1 and %sum2 share values. %sum2 is a valid partial reduction
; chain. %sum1 is not a valid partial reduction chain as its exit value
; `%sum1.next = add i64 %accum, 1` does not have an extended operand.
; The sum2 chain will be matched, but rejected as some of its inputs are used
; by sum1, which is not a partial reduction.
;
; TODO: Should we allow %sum2 to lower to a partial reduction in this case?
; This would require relaxing the "ExtendUsersValid" restriction in
; createPartialReductions.
define void @partial_reduce_extends_shared_with_reduction(ptr %ptr, i64 %n) #0 {
; CHECK-LABEL: define void @partial_reduce_extends_shared_with_reduction(
; CHECK-SAME: ptr [[PTR:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], [[VEC_EPILOG_PH:label %.*]], label %[[VECTOR_PH1:.*]]
; CHECK: [[VECTOR_PH1]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <8 x i64> [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i64> [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <8 x i64> [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [2 x i8], ptr [[PTR]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[TMP1]], i64 8
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i16>, ptr [[TMP9]], align 2
; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
; CHECK-NEXT: [[TMP11:%.*]] = sext <8 x i16> [[WIDE_LOAD5]] to <8 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = mul <8 x i32> [[TMP2]], [[TMP2]]
; CHECK-NEXT: [[TMP13:%.*]] = mul <8 x i32> [[TMP11]], [[TMP11]]
; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i32> [[TMP3]] to <8 x i64>
; CHECK-NEXT: [[TMP15:%.*]] = sext <8 x i32> [[TMP13]] to <8 x i64>
; CHECK-NEXT: [[TMP5]] = add <8 x i64> [[VEC_PHI1]], [[TMP4]]
; CHECK-NEXT: [[TMP10]] = add <8 x i64> [[VEC_PHI4]], [[TMP15]]
; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i64> [[VEC_PHI]], [[TMP4]]
; CHECK-NEXT: [[TMP12:%.*]] = add <8 x i64> [[VEC_PHI2]], [[TMP15]]
; CHECK-NEXT: [[TMP7]] = add <8 x i64> [[TMP6]], splat (i64 1)
; CHECK-NEXT: [[TMP14]] = add <8 x i64> [[TMP12]], splat (i64 1)
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP14]], [[TMP7]]
; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]])
; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <8 x i64> [[TMP10]], [[TMP5]]
; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX6]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
;
entry:
br label %loop
loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
; Two interleaved accumulators over the same loaded values; only %sum2's
; chain has the shape of a partial reduction (see comment above define).
%sum1 = phi i64 [ 0, %entry ], [ %sum1.next, %loop ]
%sum2 = phi i64 [ 0, %entry ], [ %sum2.next, %loop ]
%gep = getelementptr [2 x i8], ptr %ptr, i64 %iv
%load = load i16, ptr %gep, align 2
; Three sext users of one load; %ext.c is shared by the %sum1 chain
; (%mul.2 = %ext.a * %ext.c) and the %sum2 chain (%mul.1 = %ext.b * %ext.c).
%ext.a = sext i16 %load to i32
%ext.b = sext i16 %load to i32
%ext.c = sext i16 %load to i32
%mul.1 = mul i32 %ext.b, %ext.c
%mul.1.ext = sext i32 %mul.1 to i64
%sum2.next = add i64 %sum2, %mul.1.ext
%mul.2 = mul i32 %ext.a, %ext.c
%mul.2.ext = sext i32 %mul.2 to i64
%accum = add i64 %sum1, %mul.2.ext
; The extra "+ 1" makes %sum1.next's operand a plain add, not an extend, so
; the %sum1 chain cannot be a partial reduction.
%sum1.next = add i64 %accum, 1
%iv.next = add i64 %iv, 1
%exitcond = icmp eq i64 %iv, %n
br i1 %exitcond, label %exit, label %loop
exit:
call void @use(i64 %sum1.next, i64 %sum2.next)
ret void
}
; Opaque sink keeping both reduction exit values (%sum1.next, %sum2.next) live.
declare void @use(i64, i64)
attributes #0 = { "target-cpu"="neoverse-v2" }