| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: opt -mtriple=riscv64 -mattr=+v -passes=slp-vectorizer -S < %s | FileCheck %s |
| |
| ; The four fshl.i16 calls use a constant shift amount (1), so the scalar cost |
| ; of each is 3 (Or + Shl + LShr only; Sub, modulo, ICmp and Select are not |
| ; needed for constant shift amounts). Total scalar fshl cost = 4 x 3 = 12. |
| ; |
| ; SLP considers vectorizing the fshl+store bundle to <4 x i16>: |
| ; - fshl bundle: VectorCost=7 ScalarCost=12 net=-5 |
| ; - store bundle: VectorCost=1 ScalarCost=4 net=-3 |
| ; - right-input gather (%eb1, %eb2, %eb3, %lx3: phis offset by one lane plus an argument): +3 |
| ; Tree total cost = -5 + (-3) + 3 = -5 |
| ; |
| ; However, the four fshl results are also consumed by scalar add/sub in |
| ; use.results, requiring element extractions from the vector: |
| ; ExtractElement cost = 1+2+2+2 = 7 |
| ; |
| ; Total cost = -5 + 7 = 2 > 0, so SLP correctly decides not to vectorize. |
| ; |
| ; Before the cost-model fix that special-cased constant shift amounts, |
| ; Sub/ICmp/Select were always included in the scalar fshl cost, giving |
| ; ScalarCost=24 for the bundle (net=-17). That overwhelmed the extract cost |
| ; (total = -17 - 3 + 3 + 7 = -10) and caused incorrect vectorization. |
| |
| ; Funnel-shift-left intrinsic on i16; @foo calls it four times with shift amount 1. |
| declare i16 @llvm.fshl.i16(i16, i16, i16) |
| |
| ; @foo: each loop iteration funnel-shifts four i16 phi values left by a |
| ; constant 1, stores the four results at byte offsets 0, 2, 4 and 6 from |
| ; %extra_bits, and then also uses all four results in scalar add/sub in |
| ; use.results. The CHECK lines expect the fshl calls and stores to stay scalar. |
| define void @foo(i16 %lx3, ptr %extra_bits, i16 %init_count) { |
| ; CHECK-LABEL: define void @foo( |
| ; CHECK-SAME: i16 [[LX3:%.*]], ptr [[EXTRA_BITS:%.*]], i16 [[INIT_COUNT:%.*]]) #[[ATTR1:[0-9]+]] { |
| ; CHECK-NEXT: [[ENTRY:.*]]: |
| ; CHECK-NEXT: [[EB1_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[EXTRA_BITS]], i64 2 |
| ; CHECK-NEXT: [[EB2_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[EXTRA_BITS]], i64 4 |
| ; CHECK-NEXT: [[EB3_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[EXTRA_BITS]], i64 6 |
| ; CHECK-NEXT: br label %[[WHILE_BODY:.*]] |
| ; CHECK: [[WHILE_BODY]]: |
| ; CHECK-NEXT: [[EB0:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[TMP4:%.*]], %[[USE_RESULTS:.*]] ] |
| ; CHECK-NEXT: [[EB1:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[TMP5:%.*]], %[[USE_RESULTS]] ] |
| ; CHECK-NEXT: [[EB2:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[TMP6:%.*]], %[[USE_RESULTS]] ] |
| ; CHECK-NEXT: [[EB3:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[TMP7:%.*]], %[[USE_RESULTS]] ] |
| ; CHECK-NEXT: [[CTR:%.*]] = phi i16 [ [[INIT_COUNT]], %[[ENTRY]] ], [ [[CTR_DEC:%.*]], %[[USE_RESULTS]] ] |
| ; CHECK-NEXT: [[TMP7]] = tail call i16 @llvm.fshl.i16(i16 [[EB3]], i16 [[LX3]], i16 1) |
| ; CHECK-NEXT: store i16 [[TMP7]], ptr [[EB3_PTR]], align 2 |
| ; CHECK-NEXT: [[TMP6]] = tail call i16 @llvm.fshl.i16(i16 [[EB2]], i16 [[EB3]], i16 1) |
| ; CHECK-NEXT: store i16 [[TMP6]], ptr [[EB2_PTR]], align 2 |
| ; CHECK-NEXT: [[TMP5]] = tail call i16 @llvm.fshl.i16(i16 [[EB1]], i16 [[EB2]], i16 1) |
| ; CHECK-NEXT: store i16 [[TMP5]], ptr [[EB1_PTR]], align 2 |
| ; CHECK-NEXT: [[TMP4]] = tail call i16 @llvm.fshl.i16(i16 [[EB0]], i16 [[EB1]], i16 1) |
| ; CHECK-NEXT: store i16 [[TMP4]], ptr [[EXTRA_BITS]], align 2 |
| ; CHECK-NEXT: br label %[[USE_RESULTS]] |
| ; CHECK: [[USE_RESULTS]]: |
| ; CHECK-NEXT: [[SUM01:%.*]] = add i16 [[TMP4]], [[TMP5]] |
| ; CHECK-NEXT: [[SUM23:%.*]] = sub i16 [[TMP6]], [[TMP7]] |
| ; CHECK-NEXT: [[SUM:%.*]] = add i16 [[SUM01]], [[SUM23]] |
| ; CHECK-NEXT: store i16 [[SUM]], ptr [[EXTRA_BITS]], align 2 |
| ; CHECK-NEXT: [[CTR_DEC]] = add i16 [[CTR]], -1 |
| ; CHECK-NEXT: [[DONE:%.*]] = icmp sgt i16 [[CTR_DEC]], -1 |
| ; CHECK-NEXT: br i1 [[DONE]], label %[[WHILE_BODY]], label %[[EXIT:.*]] |
| ; CHECK: [[EXIT]]: |
| ; CHECK-NEXT: ret void |
| ; |
| entry: |
| ; Addresses at byte offsets 2, 4 and 6 from %extra_bits; the store at offset 0 |
| ; uses %extra_bits directly. |
| %eb1_ptr = getelementptr inbounds nuw i8, ptr %extra_bits, i64 2 |
| %eb2_ptr = getelementptr inbounds nuw i8, ptr %extra_bits, i64 4 |
| %eb3_ptr = getelementptr inbounds nuw i8, ptr %extra_bits, i64 6 |
| br label %while.body |
| |
| while.body: |
| ; Loop-carried values: %eb0..%eb3 start at 0 and are replaced by the fshl |
| ; results each iteration; %ctr starts at %init_count and counts down. |
| %eb0 = phi i16 [ 0, %entry ], [ %new_eb0, %use.results ] |
| %eb1 = phi i16 [ 0, %entry ], [ %new_eb1, %use.results ] |
| %eb2 = phi i16 [ 0, %entry ], [ %new_eb2, %use.results ] |
| %eb3 = phi i16 [ 0, %entry ], [ %new_eb3, %use.results ] |
| %ctr = phi i16 [ %init_count, %entry ], [ %ctr.dec, %use.results ] |
| |
| ; Each fshl takes its first operand from one phi and its second from the next |
| ; phi (or %lx3 for the last one), always with shift amount 1. Results are |
| ; stored at descending byte offsets 6, 4, 2, 0. |
| %new_eb3 = tail call i16 @llvm.fshl.i16(i16 %eb3, i16 %lx3, i16 1) |
| store i16 %new_eb3, ptr %eb3_ptr, align 2 |
| %new_eb2 = tail call i16 @llvm.fshl.i16(i16 %eb2, i16 %eb3, i16 1) |
| store i16 %new_eb2, ptr %eb2_ptr, align 2 |
| %new_eb1 = tail call i16 @llvm.fshl.i16(i16 %eb1, i16 %eb2, i16 1) |
| store i16 %new_eb1, ptr %eb1_ptr, align 2 |
| %new_eb0 = tail call i16 @llvm.fshl.i16(i16 %eb0, i16 %eb1, i16 1) |
| store i16 %new_eb0, ptr %extra_bits, align 2 |
| br label %use.results |
| |
| use.results: |
| ; Scalar uses of all four fshl results; the combined value overwrites the |
| ; i16 at offset 0 of %extra_bits. |
| %sum01 = add i16 %new_eb0, %new_eb1 |
| %sum23 = sub i16 %new_eb2, %new_eb3 |
| %sum = add i16 %sum01, %sum23 |
| store i16 %sum, ptr %extra_bits, align 2 |
| ; Decrement the counter and keep looping while the new value is > -1 (signed). |
| %ctr.dec = add i16 %ctr, -1 |
| %done = icmp sgt i16 %ctr.dec, -1 |
| br i1 %done, label %while.body, label %exit |
| |
| exit: |
| ret void |
| } |