; llvm/test/Transforms/SLPVectorizer/RISCV/funnel-shift-cost.ll - llvm-project - Git at Google

 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -mtriple=riscv64 -mattr=+v -passes=slp-vectorizer -S < %s | FileCheck %s

 ; The four fshl.i16 calls use a constant shift amount (1), so the scalar cost
 ; of each is 3 (Or + Shl + LShr only; Sub, modulo, ICmp and Select are not
 ; needed for constant shift amounts).  Total scalar fshl cost = 4 x 3 = 12.
 ;
 ; SLP considers vectorizing the fshl+store bundle to <4 x i16>:
 ;   - fshl bundle:  VectorCost=7  ScalarCost=12  net=-5
 ;   - store bundle: VectorCost=1  ScalarCost=4   net=-3
 ;   - right-input gather (non-contiguous phi values): +3
 ;   Tree total cost = -5
 ;
 ; However, the four fshl results are also consumed by scalar add/sub in
 ; use.results, requiring element extractions from the vector:
 ;   ExtractElement cost = 1+2+2+2 = 7
 ;
 ; Total cost = -5 + 7 = 2 > 0, so SLP correctly decides not to vectorize.
 ;
 ; Before the fix, Sub/ICmp/Select were always included in the scalar fshl cost
 ; even for constant shifts, giving ScalarCost=24 for the bundle (net=-17),
 ; which overwhelmed the extract cost (total=-10) and caused incorrect
 ; vectorization.

 ; Funnel-shift-left intrinsic on i16. Every call in @foo passes the constant 1
 ; as its third (shift-amount) operand.
 declare i16 @llvm.fshl.i16(i16, i16, i16)

 ; @foo: loop carrying four i16 values (%eb0..%eb3). Each iteration replaces
 ; every value with an fshl-by-1 result, stores the four results to the i16
 ; slots at byte offsets 0, 2, 4 and 6 from %extra_bits, and then also consumes
 ; the four results with scalar add/sub in use.results.
 ; The expected output below keeps all four fshl calls and stores scalar.
 define void @foo(i16 %lx3, ptr %extra_bits, i16 %init_count) {
 ; CHECK-LABEL: define void @foo(
 ; CHECK-SAME: i16 [[LX3:%.*]], ptr [[EXTRA_BITS:%.*]], i16 [[INIT_COUNT:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[EB1_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[EXTRA_BITS]], i64 2
 ; CHECK-NEXT:    [[EB2_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[EXTRA_BITS]], i64 4
 ; CHECK-NEXT:    [[EB3_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[EXTRA_BITS]], i64 6
 ; CHECK-NEXT:    br label %[[WHILE_BODY:.*]]
 ; CHECK:       [[WHILE_BODY]]:
 ; CHECK-NEXT:    [[EB0:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[TMP4:%.*]], %[[USE_RESULTS:.*]] ]
 ; CHECK-NEXT:    [[EB1:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[TMP5:%.*]], %[[USE_RESULTS]] ]
 ; CHECK-NEXT:    [[EB2:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[TMP6:%.*]], %[[USE_RESULTS]] ]
 ; CHECK-NEXT:    [[EB3:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[TMP7:%.*]], %[[USE_RESULTS]] ]
 ; CHECK-NEXT:    [[CTR:%.*]] = phi i16 [ [[INIT_COUNT]], %[[ENTRY]] ], [ [[CTR_DEC:%.*]], %[[USE_RESULTS]] ]
 ; CHECK-NEXT:    [[TMP7]] = tail call i16 @llvm.fshl.i16(i16 [[EB3]], i16 [[LX3]], i16 1)
 ; CHECK-NEXT:    store i16 [[TMP7]], ptr [[EB3_PTR]], align 2
 ; CHECK-NEXT:    [[TMP6]] = tail call i16 @llvm.fshl.i16(i16 [[EB2]], i16 [[EB3]], i16 1)
 ; CHECK-NEXT:    store i16 [[TMP6]], ptr [[EB2_PTR]], align 2
 ; CHECK-NEXT:    [[TMP5]] = tail call i16 @llvm.fshl.i16(i16 [[EB1]], i16 [[EB2]], i16 1)
 ; CHECK-NEXT:    store i16 [[TMP5]], ptr [[EB1_PTR]], align 2
 ; CHECK-NEXT:    [[TMP4]] = tail call i16 @llvm.fshl.i16(i16 [[EB0]], i16 [[EB1]], i16 1)
 ; CHECK-NEXT:    store i16 [[TMP4]], ptr [[EXTRA_BITS]], align 2
 ; CHECK-NEXT:    br label %[[USE_RESULTS]]
 ; CHECK:       [[USE_RESULTS]]:
 ; CHECK-NEXT:    [[SUM01:%.*]] = add i16 [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    [[SUM23:%.*]] = sub i16 [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    [[SUM:%.*]] = add i16 [[SUM01]], [[SUM23]]
 ; CHECK-NEXT:    store i16 [[SUM]], ptr [[EXTRA_BITS]], align 2
 ; CHECK-NEXT:    [[CTR_DEC]] = add i16 [[CTR]], -1
 ; CHECK-NEXT:    [[DONE:%.*]] = icmp sgt i16 [[CTR_DEC]], -1
 ; CHECK-NEXT:    br i1 [[DONE]], label %[[WHILE_BODY]], label %[[EXIT:.*]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
 entry:
   ; Addresses of the i16 slots at byte offsets 2, 4 and 6 from %extra_bits
   ; (offset 0 is %extra_bits itself), i.e. four adjacent 2-byte slots.
   %eb1_ptr = getelementptr inbounds nuw i8, ptr %extra_bits, i64 2
   %eb2_ptr = getelementptr inbounds nuw i8, ptr %extra_bits, i64 4
   %eb3_ptr = getelementptr inbounds nuw i8, ptr %extra_bits, i64 6
   br label %while.body

 while.body:
   ; Loop-carried state: each %ebN is 0 on entry and the previous iteration's
   ; %new_ebN afterwards. %ctr starts at %init_count.
   %eb0 = phi i16 [ 0, %entry ], [ %new_eb0, %use.results ]
   %eb1 = phi i16 [ 0, %entry ], [ %new_eb1, %use.results ]
   %eb2 = phi i16 [ 0, %entry ], [ %new_eb2, %use.results ]
   %eb3 = phi i16 [ 0, %entry ], [ %new_eb3, %use.results ]
   %ctr = phi i16 [ %init_count, %entry ], [ %ctr.dec, %use.results ]

   ; Four fshl-by-1 calls, emitted from slot 3 down to slot 0. The first operand
   ; of call N is %ebN; the second operand is %eb(N+1), except for the last
   ; slot, which takes the %lx3 argument. Each result is stored to its slot.
   %new_eb3 = tail call i16 @llvm.fshl.i16(i16 %eb3, i16 %lx3, i16 1)
   store i16 %new_eb3, ptr %eb3_ptr, align 2
   %new_eb2 = tail call i16 @llvm.fshl.i16(i16 %eb2, i16 %eb3, i16 1)
   store i16 %new_eb2, ptr %eb2_ptr, align 2
   %new_eb1 = tail call i16 @llvm.fshl.i16(i16 %eb1, i16 %eb2, i16 1)
   store i16 %new_eb1, ptr %eb1_ptr, align 2
   %new_eb0 = tail call i16 @llvm.fshl.i16(i16 %eb0, i16 %eb1, i16 1)
   store i16 %new_eb0, ptr %extra_bits, align 2
   br label %use.results

 use.results:
   ; Scalar consumers of all four fshl results; these are the uses that the
   ; header comment counts as element extractions if the calls were vectorized.
   %sum01 = add i16 %new_eb0, %new_eb1
   %sum23 = sub i16 %new_eb2, %new_eb3
   %sum   = add i16 %sum01, %sum23
   ; Overwrites the slot at offset 0 that %new_eb0 was just stored to.
   store i16 %sum, ptr %extra_bits, align 2
   ; Decrement the counter and keep looping while the decremented value is
   ; still greater than -1 (signed compare).
   %ctr.dec = add i16 %ctr, -1
   %done    = icmp sgt i16 %ctr.dec, -1
   br i1 %done, label %while.body, label %exit

 exit:
   ret void
 }
	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
	; RUN: opt -mtriple=riscv64 -mattr=+v -passes=slp-vectorizer -S < %s | FileCheck %s

	; The four fshl.i16 calls use a constant shift amount (1), so the scalar cost
	; of each is 3 (Or + Shl + LShr only; Sub, modulo, ICmp and Select are not
	; needed for constant shift amounts). Total scalar fshl cost = 4 x 3 = 12.
	;
	; SLP considers vectorizing the fshl+store bundle to <4 x i16>:
	; - fshl bundle: VectorCost=7 ScalarCost=12 net=-5
	; - store bundle: VectorCost=1 ScalarCost=4 net=-3
	; - right-input gather (non-contiguous phi values): +3
	; Tree total cost = -5
	;
	; However, the four fshl results are also consumed by scalar add/sub in
	; use.results, requiring element extractions from the vector:
	; ExtractElement cost = 1+2+2+2 = 7
	;
	; Total cost = -5 + 7 = 2 > 0, so SLP correctly decides not to vectorize.
	;
	; Before the fix, Sub/ICmp/Select were always included in the scalar fshl cost
	; even for constant shifts, giving ScalarCost=24 for the bundle (net=-17),
	; which overwhelmed the extract cost (total=-10) and caused incorrect
	; vectorization.

	; Funnel-shift-left intrinsic on i16. Every call in @foo passes the constant 1
	; as its third (shift-amount) operand.
	declare i16 @llvm.fshl.i16(i16, i16, i16)

	; NOTE(review): this is a second definition of @foo in the same file (an
	; identical one appears earlier). It looks like an extraction duplicate;
	; confirm and drop one copy, since a module cannot define @foo twice.
	;
	; @foo: loop carrying four i16 values (%eb0..%eb3). Each iteration replaces
	; every value with an fshl-by-1 result, stores the four results to the i16
	; slots at byte offsets 0, 2, 4 and 6 from %extra_bits, and then also consumes
	; the four results with scalar add/sub in use.results.
	; The expected output below keeps all four fshl calls and stores scalar.
	define void @foo(i16 %lx3, ptr %extra_bits, i16 %init_count) {
	; CHECK-LABEL: define void @foo(
	; CHECK-SAME: i16 [[LX3:%.*]], ptr [[EXTRA_BITS:%.*]], i16 [[INIT_COUNT:%.*]]) #[[ATTR1:[0-9]+]] {
	; CHECK-NEXT:  [[ENTRY:.*]]:
	; CHECK-NEXT:    [[EB1_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[EXTRA_BITS]], i64 2
	; CHECK-NEXT:    [[EB2_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[EXTRA_BITS]], i64 4
	; CHECK-NEXT:    [[EB3_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[EXTRA_BITS]], i64 6
	; CHECK-NEXT:    br label %[[WHILE_BODY:.*]]
	; CHECK:       [[WHILE_BODY]]:
	; CHECK-NEXT:    [[EB0:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[TMP4:%.*]], %[[USE_RESULTS:.*]] ]
	; CHECK-NEXT:    [[EB1:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[TMP5:%.*]], %[[USE_RESULTS]] ]
	; CHECK-NEXT:    [[EB2:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[TMP6:%.*]], %[[USE_RESULTS]] ]
	; CHECK-NEXT:    [[EB3:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[TMP7:%.*]], %[[USE_RESULTS]] ]
	; CHECK-NEXT:    [[CTR:%.*]] = phi i16 [ [[INIT_COUNT]], %[[ENTRY]] ], [ [[CTR_DEC:%.*]], %[[USE_RESULTS]] ]
	; CHECK-NEXT:    [[TMP7]] = tail call i16 @llvm.fshl.i16(i16 [[EB3]], i16 [[LX3]], i16 1)
	; CHECK-NEXT:    store i16 [[TMP7]], ptr [[EB3_PTR]], align 2
	; CHECK-NEXT:    [[TMP6]] = tail call i16 @llvm.fshl.i16(i16 [[EB2]], i16 [[EB3]], i16 1)
	; CHECK-NEXT:    store i16 [[TMP6]], ptr [[EB2_PTR]], align 2
	; CHECK-NEXT:    [[TMP5]] = tail call i16 @llvm.fshl.i16(i16 [[EB1]], i16 [[EB2]], i16 1)
	; CHECK-NEXT:    store i16 [[TMP5]], ptr [[EB1_PTR]], align 2
	; CHECK-NEXT:    [[TMP4]] = tail call i16 @llvm.fshl.i16(i16 [[EB0]], i16 [[EB1]], i16 1)
	; CHECK-NEXT:    store i16 [[TMP4]], ptr [[EXTRA_BITS]], align 2
	; CHECK-NEXT:    br label %[[USE_RESULTS]]
	; CHECK:       [[USE_RESULTS]]:
	; CHECK-NEXT:    [[SUM01:%.*]] = add i16 [[TMP4]], [[TMP5]]
	; CHECK-NEXT:    [[SUM23:%.*]] = sub i16 [[TMP6]], [[TMP7]]
	; CHECK-NEXT:    [[SUM:%.*]] = add i16 [[SUM01]], [[SUM23]]
	; CHECK-NEXT:    store i16 [[SUM]], ptr [[EXTRA_BITS]], align 2
	; CHECK-NEXT:    [[CTR_DEC]] = add i16 [[CTR]], -1
	; CHECK-NEXT:    [[DONE:%.*]] = icmp sgt i16 [[CTR_DEC]], -1
	; CHECK-NEXT:    br i1 [[DONE]], label %[[WHILE_BODY]], label %[[EXIT:.*]]
	; CHECK:       [[EXIT]]:
	; CHECK-NEXT:    ret void
	;
	entry:
	  ; Addresses of the i16 slots at byte offsets 2, 4 and 6 from %extra_bits
	  ; (offset 0 is %extra_bits itself), i.e. four adjacent 2-byte slots.
	  %eb1_ptr = getelementptr inbounds nuw i8, ptr %extra_bits, i64 2
	  %eb2_ptr = getelementptr inbounds nuw i8, ptr %extra_bits, i64 4
	  %eb3_ptr = getelementptr inbounds nuw i8, ptr %extra_bits, i64 6
	  br label %while.body

	while.body:
	  ; Loop-carried state: each %ebN is 0 on entry and the previous iteration's
	  ; %new_ebN afterwards. %ctr starts at %init_count.
	  %eb0 = phi i16 [ 0, %entry ], [ %new_eb0, %use.results ]
	  %eb1 = phi i16 [ 0, %entry ], [ %new_eb1, %use.results ]
	  %eb2 = phi i16 [ 0, %entry ], [ %new_eb2, %use.results ]
	  %eb3 = phi i16 [ 0, %entry ], [ %new_eb3, %use.results ]
	  %ctr = phi i16 [ %init_count, %entry ], [ %ctr.dec, %use.results ]

	  ; Four fshl-by-1 calls, emitted from slot 3 down to slot 0. The first operand
	  ; of call N is %ebN; the second operand is %eb(N+1), except for the last
	  ; slot, which takes the %lx3 argument. Each result is stored to its slot.
	  %new_eb3 = tail call i16 @llvm.fshl.i16(i16 %eb3, i16 %lx3, i16 1)
	  store i16 %new_eb3, ptr %eb3_ptr, align 2
	  %new_eb2 = tail call i16 @llvm.fshl.i16(i16 %eb2, i16 %eb3, i16 1)
	  store i16 %new_eb2, ptr %eb2_ptr, align 2
	  %new_eb1 = tail call i16 @llvm.fshl.i16(i16 %eb1, i16 %eb2, i16 1)
	  store i16 %new_eb1, ptr %eb1_ptr, align 2
	  %new_eb0 = tail call i16 @llvm.fshl.i16(i16 %eb0, i16 %eb1, i16 1)
	  store i16 %new_eb0, ptr %extra_bits, align 2
	  br label %use.results

	use.results:
	  ; Scalar consumers of all four fshl results; these are the uses that the
	  ; header comment counts as element extractions if the calls were vectorized.
	  %sum01 = add i16 %new_eb0, %new_eb1
	  %sum23 = sub i16 %new_eb2, %new_eb3
	  %sum = add i16 %sum01, %sum23
	  ; Overwrites the slot at offset 0 that %new_eb0 was just stored to.
	  store i16 %sum, ptr %extra_bits, align 2
	  ; Decrement the counter and keep looping while the decremented value is
	  ; still greater than -1 (signed compare).
	  %ctr.dec = add i16 %ctr, -1
	  %done = icmp sgt i16 %ctr.dec, -1
	  br i1 %done, label %while.body, label %exit

	exit:
	  ret void
	}