; llvm/test/Analysis/CostModel/SystemZ/vector-reductions.ll - llvm-project.git - Git at Google

 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
 ; RUN: opt -passes='print<cost-model>' -disable-output -mtriple=s390x-unknown-linux \
 ; RUN:   -mcpu=z15 < %s 2>&1 | FileCheck %s --check-prefix=Z15

 ; Cost-model expectations for llvm.vector.reduce.fadd calls that carry no
 ; fast-math flags, over float, double and fp128 element vectors. The operands
 ; are undef because only the printed cost of each call is checked.
 define void @fadd_reductions() {
 ; Z15-LABEL: 'fadd_reductions'
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
   %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
   %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
   %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
   %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
   ret void
 }

 ; Same fadd reductions as the non-fast variant, but every call carries the
 ; `fast` flag; the expected costs are lower than the flag-free ones.
 ; The %src and %dst pointer parameters are not referenced in the body.
 define void @fast_fadd_reductions(ptr %src, ptr %dst) {
 ; Z15-LABEL: 'fast_fadd_reductions'
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
   %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
   %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
   %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
   %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
   ret void
 }

 ; Cost-model expectations for llvm.vector.reduce.fmul calls that carry no
 ; fast-math flags, over float, double and fp128 element vectors. The expected
 ; costs are the same numbers as in the flag-free fadd test.
 define void @fmul_reductions() {
 ; Z15-LABEL: 'fmul_reductions'
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef)
   %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef)
   %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef)
   %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef)
   %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
   ret void
 }

 ; Cost-model expectations for llvm.vector.reduce.fmul calls carrying the
 ; `fast` flag, over float, double and fp128 element vectors. Every result is
 ; named %fmul_* so the value names agree with the intrinsic being costed.
 define void @fast_fmul_reductions() {
 ; Z15-LABEL: 'fast_fmul_reductions'
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fmul_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef)
   %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef)
   %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef)
   %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef)
   %fmul_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef)
   ret void
 }

 ; Cost-model expectations for llvm.vector.reduce.fmin over float, double and
 ; fp128 element vectors. These intrinsics take only the vector operand.
 define void @fmin_reductions() {
 ; Z15-LABEL: 'fmin_reductions'
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef)
   %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef)
   %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef)
   %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef)
   %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef)
   ret void
 }

 ; Cost-model expectations for llvm.vector.reduce.fmax over float, double and
 ; fp128 element vectors. The expected costs are the same numbers as in the
 ; fmin test.
 define void @fmax_reductions() {
 ; Z15-LABEL: 'fmax_reductions'
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef)
   %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef)
   %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef)
   %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef)
   %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef)
   ret void
 }

 ; Cost-model expectations for llvm.vector.reduce.umin over i64, i32, i8 and
 ; i128 element vectors. Every expectation line must end its directive with a
 ; colon; FileCheck silently skips a directive-looking comment without one.
 define void @reduceumin() {
 ; Z15-LABEL: 'reduceumin'
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4_32 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8_32 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V128_8 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_128 = call i128 @llvm.vector.reduce.umin.v4i128(<4 x i128> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V2_64 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
   %V4_64 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
   %V4_32 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
   %V8_32 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)

   %V128_8 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
   %V4_128 = call i128 @llvm.vector.reduce.umin.v4i128(<4 x i128> undef)

   ret void
 }

 ; Cost-model expectations for llvm.vector.reduce.umax over i64, i32, i8 and
 ; i128 element vectors. Every expectation line must end its directive with a
 ; colon; FileCheck silently skips a directive-looking comment without one.
 define void @reduceumax() {
 ; Z15-LABEL: 'reduceumax'
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4_32 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8_32 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V128_8 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_128 = call i128 @llvm.vector.reduce.umax.v4i128(<4 x i128> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V2_64 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
   %V4_64 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
   %V4_32 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
   %V8_32 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)

   %V128_8 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
   %V4_128 = call i128 @llvm.vector.reduce.umax.v4i128(<4 x i128> undef)

   ret void
 }

 ; Cost-model expectations for llvm.vector.reduce.smin over i64, i32, i8 and
 ; i128 element vectors. Every expectation line must end its directive with a
 ; colon; FileCheck silently skips a directive-looking comment without one.
 define void @reducesmin() {
 ; Z15-LABEL: 'reducesmin'
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4_32 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8_32 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V128_8 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_128 = call i128 @llvm.vector.reduce.smin.v4i128(<4 x i128> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V2_64 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
   %V4_64 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
   %V4_32 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
   %V8_32 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)

   %V128_8 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
   %V4_128 = call i128 @llvm.vector.reduce.smin.v4i128(<4 x i128> undef)

   ret void
 }

 ; Cost-model expectations for llvm.vector.reduce.smax over i64, i32, i8 and
 ; i128 element vectors. Every expectation line must end its directive with a
 ; colon; FileCheck silently skips a directive-looking comment without one.
 define void @reducesmax() {
 ; Z15-LABEL: 'reducesmax'
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4_32 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V8_32 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %V128_8 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_128 = call i128 @llvm.vector.reduce.smax.v4i128(<4 x i128> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V2_64 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
   %V4_64 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
   %V4_32 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
   %V8_32 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)

   %V128_8 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
   %V4_128 = call i128 @llvm.vector.reduce.smax.v4i128(<4 x i128> undef)

   ret void
 }

 ; Cost-model expectations for llvm.vector.reduce.add over i64, i32, i16 and
 ; i8 element vectors of 2 to 16 elements, followed by two outlier shapes
 ; (<128 x i8> and <4 x i256>). The trailing `ret void` has no expectation line.
 define void @reduceadd() {
 ; Z15-LABEL: 'reduceadd'
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8_64 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16_64 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
 ;
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4_256 = call i256 @llvm.vector.reduce.add.v4i256(<4 x i256> undef)

   ; REDUCEADD64
   %V2_64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
   %V4_64 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
   %V8_64 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
   %V16_64 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
   ; REDUCEADD32
   %V2_32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
   %V4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
   %V8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
   %V16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
   ; REDUCEADD16
   %V2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
   %V4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
   %V8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
   %V16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
   ; REDUCEADD8
   %V2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
   %V4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
   %V8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
   %V16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
   ; EXTREME VALUES
   %V128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
   %V4_256 = call i256 @llvm.vector.reduce.add.v4i256(<4 x i256> undef)

   ret void
 }

 ; Cost-model expectations for llvm.vector.reduce.mul over i64, i32, i16 and
 ; i8 element vectors of 2 to 16 elements, followed by two outlier shapes
 ; (<128 x i8> and <4 x i256>). The expectations use the same check prefix as
 ; the rest of the file so that FileCheck actually evaluates them.
 ; NOTE(review): these costs were never verified while they sat under a
 ; disabled prefix; regenerate with utils/update_analyze_test_checks.py if
 ; any of them no longer match.
 define void @reducemul() {
 ; Z15-LABEL: 'reducemul'
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8_64 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V16_64 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_32 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4_32 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8_32 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16_32 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_16 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4_16 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8_16 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V16_16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2_8 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4_8 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8_8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16_8 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
 ;
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V128_8 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
 ; Z15-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4_256 = call i256 @llvm.vector.reduce.mul.v4i256(<4 x i256> undef)

   ; REDUCEMUL64
   %V2_64 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef)
   %V4_64 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef)
   %V8_64 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef)
   %V16_64 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef)
   ; REDUCEMUL32
   %V2_32 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef)
   %V4_32 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef)
   %V8_32 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef)
   %V16_32 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef)
   ; REDUCEMUL16
   %V2_16 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef)
   %V4_16 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef)
   %V8_16 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef)
   %V16_16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef)
   ; REDUCEMUL8
   %V2_8 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
   %V4_8 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
   %V8_8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
   %V16_8 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
   ; EXTREME VALUES
   %V128_8 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
   %V4_256 = call i256 @llvm.vector.reduce.mul.v4i256(<4 x i256> undef)

   ret void
 }

 ; Declarations of every reduction intrinsic called by the test functions in
 ; this file, grouped by operation.

 ; Floating-point add: (scalar start value, vector operand).
 declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
 declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
 declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
 declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
 declare fp128 @llvm.vector.reduce.fadd.v4f128(fp128, <4 x fp128>)

 ; Floating-point multiply: (scalar start value, vector operand).
 declare float @llvm.vector.reduce.fmul.v4f32(float, <4 x float>)
 declare float @llvm.vector.reduce.fmul.v8f32(float, <8 x float>)
 declare double @llvm.vector.reduce.fmul.v2f64(double, <2 x double>)
 declare double @llvm.vector.reduce.fmul.v4f64(double, <4 x double>)
 declare fp128 @llvm.vector.reduce.fmul.v4f128(fp128, <4 x fp128>)

 ; Floating-point minimum: vector operand only.
 declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
 declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
 declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
 declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>)
 declare fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128>)

 ; Floating-point maximum: vector operand only.
 declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
 declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
 declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
 declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
 declare fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128>)

 ; Unsigned integer minimum.
 declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>)
 declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>)
 declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
 declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>)
 declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>)
 declare i128 @llvm.vector.reduce.umin.v4i128(<4 x i128>)

 ; Unsigned integer maximum.
 declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>)
 declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>)
 declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
 declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>)
 declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>)
 declare i128 @llvm.vector.reduce.umax.v4i128(<4 x i128>)

 ; Signed integer minimum.
 declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>)
 declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>)
 declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
 declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>)
 declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>)
 declare i128 @llvm.vector.reduce.smin.v4i128(<4 x i128>)

 ; Signed integer maximum.
 declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>)
 declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>)
 declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
 declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>)
 declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>)
 declare i128 @llvm.vector.reduce.smax.v4i128(<4 x i128>)

 ; Integer add, 2 to 16 elements per element width.
 declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
 declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
 declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
 declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
 declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
 declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
 declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
 declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)
 declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
 declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
 declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
 declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
 declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
 declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
 declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)

 ; Integer add, outlier shapes.
 declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)
 declare i256 @llvm.vector.reduce.add.v4i256(<4 x i256>)

 ; Integer multiply, 2 to 16 elements per element width.
 declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64>)
 declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>)
 declare i64 @llvm.vector.reduce.mul.v8i64(<8 x i64>)
 declare i64 @llvm.vector.reduce.mul.v16i64(<16 x i64>)
 declare i32 @llvm.vector.reduce.mul.v2i32(<2 x i32>)
 declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)
 declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32>)
 declare i32 @llvm.vector.reduce.mul.v16i32(<16 x i32>)
 declare i16 @llvm.vector.reduce.mul.v2i16(<2 x i16>)
 declare i16 @llvm.vector.reduce.mul.v4i16(<4 x i16>)
 declare i16 @llvm.vector.reduce.mul.v8i16(<8 x i16>)
 declare i16 @llvm.vector.reduce.mul.v16i16(<16 x i16>)
 declare i8 @llvm.vector.reduce.mul.v2i8(<2 x i8>)
 declare i8 @llvm.vector.reduce.mul.v4i8(<4 x i8>)
 declare i8 @llvm.vector.reduce.mul.v8i8(<8 x i8>)
 declare i8 @llvm.vector.reduce.mul.v16i8(<16 x i8>)

 ; Integer multiply, outlier shapes.
 declare i8 @llvm.vector.reduce.mul.v128i8(<128 x i8>)
 declare i256 @llvm.vector.reduce.mul.v4i256(<4 x i256>)