| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "preds = %middle.block" --version 6 |
| ; RUN: opt -p loop-vectorize %s -S | FileCheck %s |
| |
| target triple = "arm64-apple-macosx" |
| |
| ; In this test %sum1 and %sum2 share values. %sum2 is a valid partial reduction |
| ; chain. %sum1 is not a valid partial reduction chain as its exit value |
| ; `%sum1.next = add i64 %accum, 1` does not have an extended operand. |
| ; The sum2 chain will be matched, but rejected as some of its inputs are used |
| ; by sum1, which is not a partial reduction. |
| ; |
| ; TODO: Should we allow %sum2 to lower to a partial reduction in this case? |
| ; This would require relaxing the "ExtendUsersValid" restriction in |
| ; createPartialReductions. |
| define void @partial_reduce_extends_shared_with_reduction(ptr %ptr, i64 %n) #0 { |
| ; CHECK-LABEL: define void @partial_reduce_extends_shared_with_reduction( |
| ; CHECK-SAME: ptr [[PTR:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { |
| ; CHECK-NEXT: [[ENTRY:.*:]] |
| ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 |
| ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 |
| ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], [[SCALAR_PH:label %.*]], label %[[VECTOR_PH:.*]] |
| ; CHECK: [[VECTOR_PH]]: |
| ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16 |
| ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], [[VEC_EPILOG_PH:label %.*]], label %[[VECTOR_PH1:.*]] |
| ; CHECK: [[VECTOR_PH1]]: |
| ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 |
| ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] |
| ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] |
| ; CHECK: [[VECTOR_BODY]]: |
| ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <8 x i64> [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i64> [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <8 x i64> [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [2 x i8], ptr [[PTR]], i64 [[INDEX]] |
| ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[TMP1]], i64 8 |
| ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2 |
| ; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i16>, ptr [[TMP9]], align 2 |
| ; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32> |
| ; CHECK-NEXT: [[TMP11:%.*]] = sext <8 x i16> [[WIDE_LOAD5]] to <8 x i32> |
| ; CHECK-NEXT: [[TMP3:%.*]] = mul <8 x i32> [[TMP2]], [[TMP2]] |
| ; CHECK-NEXT: [[TMP13:%.*]] = mul <8 x i32> [[TMP11]], [[TMP11]] |
| ; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i32> [[TMP3]] to <8 x i64> |
| ; CHECK-NEXT: [[TMP15:%.*]] = sext <8 x i32> [[TMP13]] to <8 x i64> |
| ; CHECK-NEXT: [[TMP5]] = add <8 x i64> [[VEC_PHI1]], [[TMP4]] |
| ; CHECK-NEXT: [[TMP10]] = add <8 x i64> [[VEC_PHI4]], [[TMP15]] |
| ; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i64> [[VEC_PHI]], [[TMP4]] |
| ; CHECK-NEXT: [[TMP12:%.*]] = add <8 x i64> [[VEC_PHI2]], [[TMP15]] |
| ; CHECK-NEXT: [[TMP7]] = add <8 x i64> [[TMP6]], splat (i64 1) |
| ; CHECK-NEXT: [[TMP14]] = add <8 x i64> [[TMP12]], splat (i64 1) |
| ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 |
| ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] |
| ; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] |
| ; CHECK: [[MIDDLE_BLOCK]]: |
| ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP14]], [[TMP7]] |
| ; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) |
| ; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <8 x i64> [[TMP10]], [[TMP5]] |
| ; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX6]]) |
| ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] |
| ; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] |
| ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: |
| ; |
| entry: |
| br label %loop |
| |
| loop: |
| %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] |
| %sum1 = phi i64 [ 0, %entry ], [ %sum1.next, %loop ] |
| %sum2 = phi i64 [ 0, %entry ], [ %sum2.next, %loop ] |
| ; Both reduction chains read the same i16 load; each use gets its own sext |
| ; to i32, so the extends are shared between the two chains. |
| %gep = getelementptr [2 x i8], ptr %ptr, i64 %iv |
| %load = load i16, ptr %gep, align 2 |
| %ext.a = sext i16 %load to i32 |
| %ext.b = sext i16 %load to i32 |
| %ext.c = sext i16 %load to i32 |
| ; %sum2 chain: add(%sum2, sext(mul(sext, sext))) -- the shape of a valid |
| ; partial reduction chain. |
| %mul.1 = mul i32 %ext.b, %ext.c |
| %mul.1.ext = sext i32 %mul.1 to i64 |
| %sum2.next = add i64 %sum2, %mul.1.ext |
| ; %sum1 chain: shares %ext.c with the %sum2 chain above. |
| %mul.2 = mul i32 %ext.a, %ext.c |
| %mul.2.ext = sext i32 %mul.2 to i64 |
| %accum = add i64 %sum1, %mul.2.ext |
| ; The exit value adds a plain constant (no extended operand), so %sum1 is |
| ; not a valid partial reduction -- which, via the shared extends, also |
| ; blocks %sum2 from being lowered as one (see file header comment). |
| %sum1.next = add i64 %accum, 1 |
| %iv.next = add i64 %iv, 1 |
| %exitcond = icmp eq i64 %iv, %n |
| br i1 %exitcond, label %exit, label %loop |
| |
| exit: |
| ; Both reduction results are live out of the loop. |
| call void @use(i64 %sum1.next, i64 %sum2.next) |
| ret void |
| } |
| |
| declare void @use(i64, i64) |
| attributes #0 = { "target-cpu"="neoverse-v2" } |