[VectorCombine] Use TCK_CodeSize for size-optimized functions (#202207)
VectorCombine currently uses `TCK_RecipThroughput` for all functions,
including functions optimized for size.
Select `TCK_CodeSize` when `Function::hasOptSize()` is true, covering
both `-Os` (`optsize`) and `-Oz` (`minsize`), while retaining
`TCK_RecipThroughput` for the default optimization mode.
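The X86 regression test demonstrates a sign-bit reduction where the
throughput cost model folds an `or` reduction into a `umax` reduction.
For `<4 x i32>`, the code-size model preserves the smaller `or` form for
`optsize` and `minsize` functions, while the default function retains the
existing throughput-oriented transformation. For `<8 x i16>`, the `umax`
form is still selected under `minsize` on x86-64-v2 and later, so only
baseline x86-64 keeps the `or` reduction there.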
Fixes #153375.
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 99e45bd..a8084d5 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -6392,8 +6392,9 @@
DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
AAResults &AA = FAM.getResult<AAManager>(F);
const DataLayout *DL = &F.getDataLayout();
- VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
- TryEarlyFoldsOnly);
+ TTI::TargetCostKind CostKind =
+ F.hasOptSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput;
+ VectorCombine Combiner(F, TTI, DT, AA, AC, DL, CostKind, TryEarlyFoldsOnly);
if (!Combiner.run())
return PreservedAnalyses::all();
PreservedAnalyses PA;
diff --git a/llvm/test/Transforms/VectorCombine/X86/fold-signbit-reduction-cmp-codesize.ll b/llvm/test/Transforms/VectorCombine/X86/fold-signbit-reduction-cmp-codesize.ll
new file mode 100644
index 0000000..1d3bb36
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/fold-signbit-reduction-cmp-codesize.ll
@@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE42
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX512
+
+; The throughput cost model folds this sign-bit reduction to a umax reduction.
+; The code-size cost model used for optsize and minsize functions keeps the v4i32
+; or reduction (the lshr is still folded into the compare). For v8i16, SSE4.1
+; makes umax cheaper, so x86-64-v2 and later fold; baseline x86-64 keeps the or.
+
+define i1 @throughput(<4 x i32> %x) {
+; CHECK-LABEL: define i1 @throughput(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %shr)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @optsize(<4 x i32> %x) #0 {
+; CHECK-LABEL: define i1 @optsize(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %shr)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @minsize(<4 x i32> %x) #1 {
+; CHECK-LABEL: define i1 @minsize(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], -1
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <4 x i32> %x, splat (i32 31)
+ %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %shr)
+ %cmp = icmp eq i32 %red, 0
+ ret i1 %cmp
+}
+
+define i1 @minsize_v8i16(<8 x i16> %x) #1 {
+; SSE2-LABEL: define i1 @minsize_v8i16(
+; SSE2-SAME: <8 x i16> [[X:%.*]]) #[[ATTR2]] {
+; SSE2-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[X]])
+; SSE2-NEXT: [[CMP:%.*]] = icmp sgt i16 [[TMP1]], -1
+; SSE2-NEXT: ret i1 [[CMP]]
+;
+; SSE42-LABEL: define i1 @minsize_v8i16(
+; SSE42-SAME: <8 x i16> [[X:%.*]]) #[[ATTR2]] {
+; SSE42-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> [[X]])
+; SSE42-NEXT: [[CMP:%.*]] = icmp sgt i16 [[TMP1]], -1
+; SSE42-NEXT: ret i1 [[CMP]]
+;
+; AVX2-LABEL: define i1 @minsize_v8i16(
+; AVX2-SAME: <8 x i16> [[X:%.*]]) #[[ATTR2]] {
+; AVX2-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> [[X]])
+; AVX2-NEXT: [[CMP:%.*]] = icmp sgt i16 [[TMP1]], -1
+; AVX2-NEXT: ret i1 [[CMP]]
+;
+; AVX512-LABEL: define i1 @minsize_v8i16(
+; AVX512-SAME: <8 x i16> [[X:%.*]]) #[[ATTR2]] {
+; AVX512-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> [[X]])
+; AVX512-NEXT: [[CMP:%.*]] = icmp sgt i16 [[TMP1]], -1
+; AVX512-NEXT: ret i1 [[CMP]]
+;
+ %shr = lshr <8 x i16> %x, splat (i16 15)
+ %red = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %shr)
+ %cmp = icmp eq i16 %red, 0
+ ret i1 %cmp
+}
+
+attributes #0 = { optsize }
+attributes #1 = { minsize optsize }