blob: b214b9422b79ef81d4c805a5c594ca5fc4b64e9b [file]
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=riscv64 -mattr=+v -passes=slp-vectorizer -S < %s | FileCheck %s
; The four fshl.i16 calls use a constant shift amount (1), so the scalar cost
; of each is 3 (Or + Shl + LShr only; Sub, modulo, ICmp and Select are not
; needed for constant shift amounts). Total scalar fshl cost = 4 x 3 = 12.
;
; SLP considers vectorizing the fshl+store bundle to <4 x i16>:
; - fshl bundle: VectorCost=7 ScalarCost=12 net=-5
; - store bundle: VectorCost=1 ScalarCost=4 net=-3
; - right-input gather (non-contiguous phi values): +3
; Tree total cost = (-5) + (-3) + 3 = -5
;
; However, the four fshl results are also consumed by scalar add/sub in
; use.results, requiring element extractions from the vector:
; ExtractElement cost = 1+2+2+2 = 7
;
; Total cost = -5 + 7 = 2 > 0, so SLP correctly decides not to vectorize.
;
; Before the fix, Sub/ICmp/Select were always included in the scalar fshl cost
; even for constant shifts, giving ScalarCost=24 for the bundle (net=7-24=-17).
; The tree total was then -17 - 3 + 3 = -17, which overwhelmed the extract
; cost (total = -17 + 7 = -10) and caused incorrect vectorization.
; 16-bit funnel-shift-left intrinsic; @foo calls it four times with a constant
; shift amount of 1.
declare i16 @llvm.fshl.i16(i16, i16, i16)
define void @foo(i16 %lx3, ptr %extra_bits, i16 %init_count) {
; CHECK-LABEL: define void @foo(
; CHECK-SAME: i16 [[LX3:%.*]], ptr [[EXTRA_BITS:%.*]], i16 [[INIT_COUNT:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[EB1_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[EXTRA_BITS]], i64 2
; CHECK-NEXT: [[EB2_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[EXTRA_BITS]], i64 4
; CHECK-NEXT: [[EB3_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[EXTRA_BITS]], i64 6
; CHECK-NEXT: br label %[[WHILE_BODY:.*]]
; CHECK: [[WHILE_BODY]]:
; CHECK-NEXT: [[EB0:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[TMP4:%.*]], %[[USE_RESULTS:.*]] ]
; CHECK-NEXT: [[EB1:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[TMP5:%.*]], %[[USE_RESULTS]] ]
; CHECK-NEXT: [[EB2:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[TMP6:%.*]], %[[USE_RESULTS]] ]
; CHECK-NEXT: [[EB3:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[TMP7:%.*]], %[[USE_RESULTS]] ]
; CHECK-NEXT: [[CTR:%.*]] = phi i16 [ [[INIT_COUNT]], %[[ENTRY]] ], [ [[CTR_DEC:%.*]], %[[USE_RESULTS]] ]
; CHECK-NEXT: [[TMP7]] = tail call i16 @llvm.fshl.i16(i16 [[EB3]], i16 [[LX3]], i16 1)
; CHECK-NEXT: store i16 [[TMP7]], ptr [[EB3_PTR]], align 2
; CHECK-NEXT: [[TMP6]] = tail call i16 @llvm.fshl.i16(i16 [[EB2]], i16 [[EB3]], i16 1)
; CHECK-NEXT: store i16 [[TMP6]], ptr [[EB2_PTR]], align 2
; CHECK-NEXT: [[TMP5]] = tail call i16 @llvm.fshl.i16(i16 [[EB1]], i16 [[EB2]], i16 1)
; CHECK-NEXT: store i16 [[TMP5]], ptr [[EB1_PTR]], align 2
; CHECK-NEXT: [[TMP4]] = tail call i16 @llvm.fshl.i16(i16 [[EB0]], i16 [[EB1]], i16 1)
; CHECK-NEXT: store i16 [[TMP4]], ptr [[EXTRA_BITS]], align 2
; CHECK-NEXT: br label %[[USE_RESULTS]]
; CHECK: [[USE_RESULTS]]:
; CHECK-NEXT: [[SUM01:%.*]] = add i16 [[TMP4]], [[TMP5]]
; CHECK-NEXT: [[SUM23:%.*]] = sub i16 [[TMP6]], [[TMP7]]
; CHECK-NEXT: [[SUM:%.*]] = add i16 [[SUM01]], [[SUM23]]
; CHECK-NEXT: store i16 [[SUM]], ptr [[EXTRA_BITS]], align 2
; CHECK-NEXT: [[CTR_DEC]] = add i16 [[CTR]], -1
; CHECK-NEXT: [[DONE:%.*]] = icmp sgt i16 [[CTR_DEC]], -1
; CHECK-NEXT: br i1 [[DONE]], label %[[WHILE_BODY]], label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
; The expected output above is the input IR unchanged: the four scalar fshl
; calls and their stores must survive the SLP vectorizer as scalars.
entry:
; Addresses of i16 slots 1..3 at byte offsets 2, 4 and 6 from %extra_bits;
; slot 0 is %extra_bits itself, so the four slots are contiguous in memory.
%eb1_ptr = getelementptr inbounds nuw i8, ptr %extra_bits, i64 2
%eb2_ptr = getelementptr inbounds nuw i8, ptr %extra_bits, i64 4
%eb3_ptr = getelementptr inbounds nuw i8, ptr %extra_bits, i64 6
br label %while.body
while.body:
; Loop-carried state: each %ebN starts at 0 and is replaced by the matching
; %new_ebN on the back edge from use.results. %ctr starts at %init_count.
%eb0 = phi i16 [ 0, %entry ], [ %new_eb0, %use.results ]
%eb1 = phi i16 [ 0, %entry ], [ %new_eb1, %use.results ]
%eb2 = phi i16 [ 0, %entry ], [ %new_eb2, %use.results ]
%eb3 = phi i16 [ 0, %entry ], [ %new_eb3, %use.results ]
%ctr = phi i16 [ %init_count, %entry ], [ %ctr.dec, %use.results ]
; Four fshl calls, each with the constant shift amount 1, each immediately
; stored to its own slot (offsets 6, 4, 2, 0). The first operands are
; %eb3..%eb0; the second operands are %lx3, %eb3, %eb2, %eb1, i.e. the
; function argument followed by the neighbouring slot's previous value.
%new_eb3 = tail call i16 @llvm.fshl.i16(i16 %eb3, i16 %lx3, i16 1)
store i16 %new_eb3, ptr %eb3_ptr, align 2
%new_eb2 = tail call i16 @llvm.fshl.i16(i16 %eb2, i16 %eb3, i16 1)
store i16 %new_eb2, ptr %eb2_ptr, align 2
%new_eb1 = tail call i16 @llvm.fshl.i16(i16 %eb1, i16 %eb2, i16 1)
store i16 %new_eb1, ptr %eb1_ptr, align 2
%new_eb0 = tail call i16 @llvm.fshl.i16(i16 %eb0, i16 %eb1, i16 1)
store i16 %new_eb0, ptr %extra_bits, align 2
br label %use.results
use.results:
; Scalar add/sub users of all four fshl results. If the fshl bundle were
; vectorized, each of these operands would have to be extracted from the
; vector; that extract cost is what makes vectorization unprofitable here.
%sum01 = add i16 %new_eb0, %new_eb1
%sum23 = sub i16 %new_eb2, %new_eb3
%sum = add i16 %sum01, %sum23
; Writes the combined value to slot 0, the same address %new_eb0 was stored to.
store i16 %sum, ptr %extra_bits, align 2
; Count down and keep looping while the decremented counter is still
; greater than -1 (signed compare).
%ctr.dec = add i16 %ctr, -1
%done = icmp sgt i16 %ctr.dec, -1
br i1 %done, label %while.body, label %exit
exit:
ret void
}