llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll - llvm-project - Git at Google

 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck --check-prefixes=CHECK,POWEROF2 %s
 ; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 -slp-vectorize-non-power-of-2 %s | FileCheck --check-prefixes=CHECK,NONPOWEROF2 %s

 ; Two <2 x i32> loads from the constant addresses 64036 and 64064 feed two
 ; <2 x i32> phis in %sw.bb509.i. The switch reaches %sw.bb509.i from
 ; %if.end.i87 through both its default edge and its `i32 1` case, so every phi
 ; lists %if.end.i87 twice. The expected output replaces the two loads with one
 ; <4 x i32> masked gather and the two phis with one <4 x i32> phi that keeps
 ; the repeated incoming block.
 define i32 @test() {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[IF_END_I87:%.*]]
 ; CHECK:       if.end.i87:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> <ptr inttoptr (i64 64036 to ptr), ptr inttoptr (i64 64036 to ptr), ptr inttoptr (i64 64064 to ptr), ptr inttoptr (i64 64064 to ptr)>, <4 x i64> <i64 0, i64 1, i64 0, i64 1>), i32 4, <4 x i1> splat (i1 true), <4 x i32> poison)
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> zeroinitializer, i64 2)
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; CHECK-NEXT:    switch i32 0, label [[SW_BB509_I:%.*]] [
 ; CHECK-NEXT:      i32 1, label [[SW_BB509_I]]
 ; CHECK-NEXT:      i32 0, label [[IF_THEN458_I:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       if.then458.i:
 ; CHECK-NEXT:    br label [[SW_BB509_I]]
 ; CHECK:       sw.bb509.i:
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi <4 x i32> [ [[TMP0]], [[IF_THEN458_I]] ], [ [[TMP3]], [[IF_END_I87]] ], [ [[TMP3]], [[IF_END_I87]] ]
 ; CHECK-NEXT:    ret i32 0
 ;
 entry:
   %getelementptr0 = getelementptr i8, ptr null, i64 64036
   %getelementptr1 = getelementptr i8, ptr null, i64 64064
   br label %if.end.i87

 if.end.i87:                                       ; preds = %entry
   %0 = load <2 x i32>, ptr %getelementptr0, align 4
   %1 = load <2 x i32>, ptr %getelementptr1, align 8
   switch i32 0, label %sw.bb509.i [
   i32 1, label %sw.bb509.i
   i32 0, label %if.then458.i
   ]

 if.then458.i:                                     ; preds = %if.end.i87
   br label %sw.bb509.i

 sw.bb509.i:                                       ; preds = %if.then458.i, %if.end.i87, %if.end.i87
   %4 = phi <2 x i32> [ %0, %if.then458.i ], [ %0, %if.end.i87 ], [ %0, %if.end.i87 ]
   %5 = phi <2 x i32> [ %1, %if.then458.i ], [ zeroinitializer, %if.end.i87 ], [ zeroinitializer, %if.end.i87 ]
   ret i32 0
 }

 ; Four parallel <8 x float> chains (load -> fpext -> fadd -> fptrunc -> fcmp)
 ; whose loads read from byte offsets 132, 164, 200 and 300 off a null base.
 ; Offsets 132 and 164 are 32 bytes apart, i.e. exactly one <8 x float>, so the
 ; expected output fuses those two loads into a single <16 x float> load, keeps
 ; the loads at 200 and 300 as separate <8 x float> loads, and widens each
 ; arithmetic stage to one 32-element operation.
 define void @test2() {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr null, i64 132
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr null, i64 200
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr null, i64 300
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x float>, ptr [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> poison, <8 x float> [[TMP4]], i64 0)
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> [[TMP6]], <8 x float> [[TMP3]], i64 8)
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v16f32(<32 x float> [[TMP7]], <16 x float> [[TMP5]], i64 16)
 ; CHECK-NEXT:    [[TMP9:%.*]] = fpext <32 x float> [[TMP8]] to <32 x double>
 ; CHECK-NEXT:    [[TMP10:%.*]] = call <32 x double> @llvm.vector.insert.v32f64.v8f64(<32 x double> poison, <8 x double> zeroinitializer, i64 0)
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <32 x double> @llvm.vector.insert.v32f64.v8f64(<32 x double> [[TMP10]], <8 x double> zeroinitializer, i64 8)
 ; CHECK-NEXT:    [[TMP12:%.*]] = call <32 x double> @llvm.vector.insert.v32f64.v8f64(<32 x double> [[TMP11]], <8 x double> zeroinitializer, i64 16)
 ; CHECK-NEXT:    [[TMP13:%.*]] = call <32 x double> @llvm.vector.insert.v32f64.v8f64(<32 x double> [[TMP12]], <8 x double> zeroinitializer, i64 24)
 ; CHECK-NEXT:    [[TMP14:%.*]] = fadd <32 x double> [[TMP13]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = fptrunc <32 x double> [[TMP14]] to <32 x float>
 ; CHECK-NEXT:    [[TMP16:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> poison, <8 x float> zeroinitializer, i64 0)
 ; CHECK-NEXT:    [[TMP17:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> [[TMP16]], <8 x float> zeroinitializer, i64 8)
 ; CHECK-NEXT:    [[TMP18:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> [[TMP17]], <8 x float> zeroinitializer, i64 16)
 ; CHECK-NEXT:    [[TMP19:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> [[TMP18]], <8 x float> zeroinitializer, i64 24)
 ; CHECK-NEXT:    [[TMP20:%.*]] = fcmp ogt <32 x float> [[TMP19]], [[TMP15]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
   %0 = getelementptr i8, ptr null, i64 132
   %1 = getelementptr i8, ptr null, i64 164
   %2 = getelementptr i8, ptr null, i64 200
   %3 = getelementptr i8, ptr null, i64 300
   %4 = load <8 x float>, ptr %0, align 4
   %5 = load <8 x float>, ptr %1, align 4
   %6 = load <8 x float>, ptr %2, align 4
   %7 = load <8 x float>, ptr %3, align 4
   %8 = fpext <8 x float> %4 to <8 x double>
   %9 = fpext <8 x float> %5 to <8 x double>
   %10 = fpext <8 x float> %6 to <8 x double>
   %11 = fpext <8 x float> %7 to <8 x double>
   %12 = fadd <8 x double> zeroinitializer, %8
   %13 = fadd <8 x double> zeroinitializer, %9
   %14 = fadd <8 x double> zeroinitializer, %10
   %15 = fadd <8 x double> zeroinitializer, %11
   %16 = fptrunc <8 x double> %12 to <8 x float>
   %17 = fptrunc <8 x double> %13 to <8 x float>
   %18 = fptrunc <8 x double> %14 to <8 x float>
   %19 = fptrunc <8 x double> %15 to <8 x float>
   %20 = fcmp ogt <8 x float> zeroinitializer, %16
   %21 = fcmp ogt <8 x float> zeroinitializer, %17
   %22 = fcmp ogt <8 x float> zeroinitializer, %18
   %23 = fcmp ogt <8 x float> zeroinitializer, %19
   ret void
 }

 ; Two <2 x float> phis in %for.cond.cleanup are fed by two selects in
 ; %for.body over the same loaded value: one select uses a constant all-true
 ; mask, the other uses an fcmp result. The expected output merges the phis
 ; into one <4 x float> phi and the selects into one <4 x float> select whose
 ; <4 x i1> mask is assembled from the splat-true mask and the fcmp result.
 ; The float argument %0 is not referenced by the body.
 define void @test3(float %0) {
 ; CHECK-LABEL: @test3(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR_BODY_LR_PH:%.*]]
 ; CHECK:       for.body.lr.ph:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> zeroinitializer, i64 0)
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP1]], <2 x float> zeroinitializer, i64 2)
 ; CHECK-NEXT:    br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[TMP3:%.*]] = phi <4 x float> [ [[TMP2]], [[FOR_BODY_LR_PH]] ], [ [[TMP10:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr null, align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = fcmp olt <2 x float> zeroinitializer, [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x i1> @llvm.vector.insert.v4i1.v2i1(<4 x i1> poison, <2 x i1> splat (i1 true), i64 0)
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x i1> @llvm.vector.insert.v4i1.v2i1(<4 x i1> [[TMP6]], <2 x i1> [[TMP5]], i64 2)
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP4]], i64 0)
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP10]] = select <4 x i1> [[TMP7]], <4 x float> [[TMP9]], <4 x float> [[TMP2]]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ;
 entry:
   br label %for.body.lr.ph

 for.body.lr.ph:
   br i1 false, label %for.cond.cleanup, label %for.body

 for.cond.cleanup:                                 ; preds = %for.body, %for.body.lr.ph
   %1 = phi <2 x float> [ zeroinitializer, %for.body.lr.ph ], [ %5, %for.body ]
   %2 = phi <2 x float> [ zeroinitializer, %for.body.lr.ph ], [ %6, %for.body ]
   ret void

 for.body:
   %3 = load <2 x float>, ptr null, align 4
   %4 = fcmp olt <2 x float> zeroinitializer, %3
   %5 = select <2 x i1> <i1 true, i1 true>, <2 x float> %3, <2 x float> zeroinitializer
   %6 = select <2 x i1> %4, <2 x float> %3, <2 x float> zeroinitializer
   br label %for.cond.cleanup
 }

 ; Scalar input: lanes 0, 1, 2, 4, 5 and 6 are extracted from one <8 x float>
 ; value, routed through six scalar float phis, multiplied by 0.0, and summed
 ; with `reassoc nsz` fadds into two three-term totals (lanes 0-2 and lanes
 ; 4-6), each passed to llvm.sqrt.f32. Block %8 is never branched to inside
 ; this function; it only supplies the 0.0 incoming values of the phis.
 ;
 ; The two run configurations expect different output for this function:
 ;  * without -slp-vectorize-non-power-of-2: a <2 x float> phi (lanes 0 and 4)
 ;    plus a <4 x float> phi (lanes 1, 2, 5, 6), <2 x float> fmuls, and the
 ;    additions left as scalar fadds;
 ;  * with -slp-vectorize-non-power-of-2: a single <6 x float> phi, two
 ;    <3 x float> fmuls, and two llvm.vector.reduce.fadd.v3f32 calls.
 define ptr @test4() {
 ; POWEROF2-LABEL: @test4(
 ; POWEROF2-NEXT:    [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer
 ; POWEROF2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
 ; POWEROF2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
 ; POWEROF2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 0, i32 4>
 ; POWEROF2-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0)
 ; POWEROF2-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2)
 ; POWEROF2-NEXT:    br label [[TMP8:%.*]]
 ; POWEROF2:       7:
 ; POWEROF2-NEXT:    br label [[TMP8]]
 ; POWEROF2:       8:
 ; POWEROF2-NEXT:    [[TMP9:%.*]] = phi <2 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP4]], [[TMP0:%.*]] ]
 ; POWEROF2-NEXT:    [[TMP10:%.*]] = phi <4 x float> [ poison, [[TMP7]] ], [ [[TMP6]], [[TMP0]] ]
 ; POWEROF2-NEXT:    br label [[TMP11:%.*]]
 ; POWEROF2:       11:
 ; POWEROF2-NEXT:    [[TMP12:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 0)
 ; POWEROF2-NEXT:    [[TMP13:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
 ; POWEROF2-NEXT:    [[TMP14:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 2)
 ; POWEROF2-NEXT:    [[TMP15:%.*]] = fmul <2 x float> zeroinitializer, [[TMP14]]
 ; POWEROF2-NEXT:    [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
 ; POWEROF2-NEXT:    [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP18]]
 ; POWEROF2-NEXT:    [[TMP30:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
 ; POWEROF2-NEXT:    [[TMP19:%.*]] = fmul float [[TMP30]], 0.000000e+00
 ; POWEROF2-NEXT:    [[TMP20:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
 ; POWEROF2-NEXT:    [[TMP21:%.*]] = fadd reassoc nsz float [[TMP20]], [[TMP17]]
 ; POWEROF2-NEXT:    [[TMP22:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
 ; POWEROF2-NEXT:    [[TMP23:%.*]] = fadd reassoc nsz float [[TMP22]], [[TMP19]]
 ; POWEROF2-NEXT:    [[TMP24:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
 ; POWEROF2-NEXT:    [[TMP25:%.*]] = fadd reassoc nsz float [[TMP21]], [[TMP24]]
 ; POWEROF2-NEXT:    [[TMP26:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
 ; POWEROF2-NEXT:    [[TMP27:%.*]] = fadd reassoc nsz float [[TMP23]], [[TMP26]]
 ; POWEROF2-NEXT:    [[TMP28:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP25]])
 ; POWEROF2-NEXT:    [[TMP29:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP27]])
 ; POWEROF2-NEXT:    ret ptr null
 ;
 ; NONPOWEROF2-LABEL: @test4(
 ; NONPOWEROF2-NEXT:    [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer
 ; NONPOWEROF2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
 ; NONPOWEROF2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <3 x i32> <i32 4, i32 5, i32 6>
 ; NONPOWEROF2-NEXT:    [[TMP4:%.*]] = call <6 x float> @llvm.vector.insert.v6f32.v3f32(<6 x float> poison, <3 x float> [[TMP2]], i64 0)
 ; NONPOWEROF2-NEXT:    [[TMP5:%.*]] = call <6 x float> @llvm.vector.insert.v6f32.v3f32(<6 x float> [[TMP4]], <3 x float> [[TMP3]], i64 3)
 ; NONPOWEROF2-NEXT:    br label [[TMP7:%.*]]
 ; NONPOWEROF2:       6:
 ; NONPOWEROF2-NEXT:    br label [[TMP7]]
 ; NONPOWEROF2:       7:
 ; NONPOWEROF2-NEXT:    [[TMP8:%.*]] = phi <6 x float> [ poison, [[TMP6:%.*]] ], [ [[TMP5]], [[TMP0:%.*]] ]
 ; NONPOWEROF2-NEXT:    br label [[TMP9:%.*]]
 ; NONPOWEROF2:       9:
 ; NONPOWEROF2-NEXT:    [[TMP10:%.*]] = call <3 x float> @llvm.vector.extract.v3f32.v6f32(<6 x float> [[TMP8]], i64 0)
 ; NONPOWEROF2-NEXT:    [[TMP11:%.*]] = fmul <3 x float> zeroinitializer, [[TMP10]]
 ; NONPOWEROF2-NEXT:    [[TMP12:%.*]] = call <3 x float> @llvm.vector.extract.v3f32.v6f32(<6 x float> [[TMP8]], i64 3)
 ; NONPOWEROF2-NEXT:    [[TMP13:%.*]] = fmul <3 x float> zeroinitializer, [[TMP12]]
 ; NONPOWEROF2-NEXT:    [[TMP14:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP11]])
 ; NONPOWEROF2-NEXT:    [[TMP15:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP13]])
 ; NONPOWEROF2-NEXT:    [[TMP16:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP14]])
 ; NONPOWEROF2-NEXT:    [[TMP17:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP15]])
 ; NONPOWEROF2-NEXT:    ret ptr null
 ;
   %1 = fadd <8 x float> zeroinitializer, zeroinitializer
   %2 = extractelement <8 x float> %1, i64 0
   %3 = extractelement <8 x float> %1, i64 1
   %4 = extractelement <8 x float> %1, i64 2
   %5 = extractelement <8 x float> %1, i64 4
   %6 = extractelement <8 x float> %1, i64 5
   %7 = extractelement <8 x float> %1, i64 6
   br label %9

 8:
   br label %9

 9:
   %10 = phi float [ 0.000000e+00, %8 ], [ %7, %0 ]
   %11 = phi float [ 0.000000e+00, %8 ], [ %6, %0 ]
   %12 = phi float [ 0.000000e+00, %8 ], [ %5, %0 ]
   %13 = phi float [ 0.000000e+00, %8 ], [ %4, %0 ]
   %14 = phi float [ 0.000000e+00, %8 ], [ %3, %0 ]
   %15 = phi float [ 0.000000e+00, %8 ], [ %2, %0 ]
   br label %16

 16:
   %17 = fmul float %14, 0.000000e+00
   %18 = fmul float 0.000000e+00, %11
   %19 = fmul float 0.000000e+00, %15
   %20 = fmul float %12, 0.000000e+00
   %21 = fadd reassoc nsz float %17, %19
   %22 = fadd reassoc nsz float %18, %20
   %23 = fmul float %13, 0.000000e+00
   %24 = fmul float %10, 0.000000e+00
   %25 = fadd reassoc nsz float %21, %23
   %26 = fadd reassoc nsz float %22, %24
   %27 = tail call float @llvm.sqrt.f32(float %25)
   %28 = tail call float @llvm.sqrt.f32(float %26)
   ret ptr null
 }

 ; Two <2 x double> fdivs and four <2 x double> fadds; only %add0 and %add2
 ; consume an fdiv result, while %add1 and %add3 add two zero vectors. Each
 ; fadd feeds a single-incoming phi in %for.end47. The expected output merges
 ; the fdivs into one <4 x double> fdiv, the fadds into one <8 x double> fadd
 ; (its second operand built by shuffling the fdiv lanes together with zero
 ; lanes), and the four phis into one <8 x double> phi.
 define i32 @test5() {
 ; CHECK-LABEL: @test5(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x double> @llvm.vector.insert.v4f64.v2f64(<4 x double> poison, <2 x double> zeroinitializer, i64 0)
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.vector.insert.v4f64.v2f64(<4 x double> [[TMP0]], <2 x double> zeroinitializer, i64 2)
 ; CHECK-NEXT:    [[TMP2:%.*]] = fdiv <4 x double> [[TMP1]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> poison, <2 x double> zeroinitializer, i64 0)
 ; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP3]], <2 x double> zeroinitializer, i64 2)
 ; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP4]], <2 x double> zeroinitializer, i64 4)
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP5]], <2 x double> zeroinitializer, i64 6)
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> poison, <2 x double> zeroinitializer, i64 2)
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP7]], <2 x double> zeroinitializer, i64 6)
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> poison, <4 x double> [[TMP2]], i64 0)
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 2, i32 3, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP11:%.*]] = fadd <8 x double> [[TMP6]], [[TMP10]]
 ; CHECK-NEXT:    br label [[FOR_END47:%.*]]
 ; CHECK:       for.end47:
 ; CHECK-NEXT:    [[TMP12:%.*]] = phi <8 x double> [ [[TMP11]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    ret i32 0
 ;
 entry:
   %div0 = fdiv <2 x double> zeroinitializer, zeroinitializer
   %div1 = fdiv <2 x double> zeroinitializer, zeroinitializer
   %add0 = fadd <2 x double> zeroinitializer, %div0
   %add1 = fadd <2 x double> zeroinitializer, zeroinitializer
   %add2 = fadd <2 x double> %div1, zeroinitializer
   %add3 = fadd <2 x double> zeroinitializer, zeroinitializer
   br label %for.end47

 for.end47:                                        ; preds = %entry
   %add0.lcssa = phi <2 x double> [ %add0, %entry ]
   %add1.lcssa = phi <2 x double> [ %add1, %entry ]
   %add2.lcssa = phi <2 x double> [ %add2, %entry ]
   %add3.lcssa = phi <2 x double> [ %add3, %entry ]
   ret i32 0
 }
	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck --check-prefixes=CHECK,POWEROF2 %s
	; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 -slp-vectorize-non-power-of-2 %s | FileCheck --check-prefixes=CHECK,NONPOWEROF2 %s

	; Two <2 x i32> loads from the constant addresses 64036 and 64064 feed two
	; <2 x i32> phis in %sw.bb509.i. The switch reaches %sw.bb509.i from
	; %if.end.i87 through both its default edge and its `i32 1` case, so every phi
	; lists %if.end.i87 twice. The expected output replaces the two loads with one
	; <4 x i32> masked gather and the two phis with one <4 x i32> phi that keeps
	; the repeated incoming block.
	define i32 @test() {
	; CHECK-LABEL: @test(
	; CHECK-NEXT: entry:
	; CHECK-NEXT: br label [[IF_END_I87:%.*]]
	; CHECK: if.end.i87:
	; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> <ptr inttoptr (i64 64036 to ptr), ptr inttoptr (i64 64036 to ptr), ptr inttoptr (i64 64064 to ptr), ptr inttoptr (i64 64064 to ptr)>, <4 x i64> <i64 0, i64 1, i64 0, i64 1>), i32 4, <4 x i1> splat (i1 true), <4 x i32> poison)
	; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> zeroinitializer, i64 2)
	; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
	; CHECK-NEXT: switch i32 0, label [[SW_BB509_I:%.*]] [
	; CHECK-NEXT: i32 1, label [[SW_BB509_I]]
	; CHECK-NEXT: i32 0, label [[IF_THEN458_I:%.*]]
	; CHECK-NEXT: ]
	; CHECK: if.then458.i:
	; CHECK-NEXT: br label [[SW_BB509_I]]
	; CHECK: sw.bb509.i:
	; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i32> [ [[TMP0]], [[IF_THEN458_I]] ], [ [[TMP3]], [[IF_END_I87]] ], [ [[TMP3]], [[IF_END_I87]] ]
	; CHECK-NEXT: ret i32 0
	;
	entry:
	%getelementptr0 = getelementptr i8, ptr null, i64 64036
	%getelementptr1 = getelementptr i8, ptr null, i64 64064
	br label %if.end.i87

	if.end.i87: ; preds = %entry
	%0 = load <2 x i32>, ptr %getelementptr0, align 4
	%1 = load <2 x i32>, ptr %getelementptr1, align 8
	switch i32 0, label %sw.bb509.i [
	i32 1, label %sw.bb509.i
	i32 0, label %if.then458.i
	]

	if.then458.i: ; preds = %if.end.i87
	br label %sw.bb509.i

	sw.bb509.i: ; preds = %if.then458.i, %if.end.i87, %if.end.i87
	%4 = phi <2 x i32> [ %0, %if.then458.i ], [ %0, %if.end.i87 ], [ %0, %if.end.i87 ]
	%5 = phi <2 x i32> [ %1, %if.then458.i ], [ zeroinitializer, %if.end.i87 ], [ zeroinitializer, %if.end.i87 ]
	ret i32 0
	}

	; Four parallel <8 x float> chains (load -> fpext -> fadd -> fptrunc -> fcmp)
	; whose loads read from byte offsets 132, 164, 200 and 300 off a null base.
	; Offsets 132 and 164 are 32 bytes apart, i.e. exactly one <8 x float>, so the
	; expected output fuses those two loads into a single <16 x float> load, keeps
	; the loads at 200 and 300 as separate <8 x float> loads, and widens each
	; arithmetic stage to one 32-element operation.
	define void @test2() {
	; CHECK-LABEL: @test2(
	; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr null, i64 132
	; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 200
	; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr null, i64 300
	; CHECK-NEXT: [[TMP3:%.*]] = load <8 x float>, ptr [[TMP1]], align 4
	; CHECK-NEXT: [[TMP4:%.*]] = load <8 x float>, ptr [[TMP2]], align 4
	; CHECK-NEXT: [[TMP5:%.*]] = load <16 x float>, ptr [[TMP0]], align 4
	; CHECK-NEXT: [[TMP6:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> poison, <8 x float> [[TMP4]], i64 0)
	; CHECK-NEXT: [[TMP7:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> [[TMP6]], <8 x float> [[TMP3]], i64 8)
	; CHECK-NEXT: [[TMP8:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v16f32(<32 x float> [[TMP7]], <16 x float> [[TMP5]], i64 16)
	; CHECK-NEXT: [[TMP9:%.*]] = fpext <32 x float> [[TMP8]] to <32 x double>
	; CHECK-NEXT: [[TMP10:%.*]] = call <32 x double> @llvm.vector.insert.v32f64.v8f64(<32 x double> poison, <8 x double> zeroinitializer, i64 0)
	; CHECK-NEXT: [[TMP11:%.*]] = call <32 x double> @llvm.vector.insert.v32f64.v8f64(<32 x double> [[TMP10]], <8 x double> zeroinitializer, i64 8)
	; CHECK-NEXT: [[TMP12:%.*]] = call <32 x double> @llvm.vector.insert.v32f64.v8f64(<32 x double> [[TMP11]], <8 x double> zeroinitializer, i64 16)
	; CHECK-NEXT: [[TMP13:%.*]] = call <32 x double> @llvm.vector.insert.v32f64.v8f64(<32 x double> [[TMP12]], <8 x double> zeroinitializer, i64 24)
	; CHECK-NEXT: [[TMP14:%.*]] = fadd <32 x double> [[TMP13]], [[TMP9]]
	; CHECK-NEXT: [[TMP15:%.*]] = fptrunc <32 x double> [[TMP14]] to <32 x float>
	; CHECK-NEXT: [[TMP16:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> poison, <8 x float> zeroinitializer, i64 0)
	; CHECK-NEXT: [[TMP17:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> [[TMP16]], <8 x float> zeroinitializer, i64 8)
	; CHECK-NEXT: [[TMP18:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> [[TMP17]], <8 x float> zeroinitializer, i64 16)
	; CHECK-NEXT: [[TMP19:%.*]] = call <32 x float> @llvm.vector.insert.v32f32.v8f32(<32 x float> [[TMP18]], <8 x float> zeroinitializer, i64 24)
	; CHECK-NEXT: [[TMP20:%.*]] = fcmp ogt <32 x float> [[TMP19]], [[TMP15]]
	; CHECK-NEXT: ret void
	;
	entry:
	%0 = getelementptr i8, ptr null, i64 132
	%1 = getelementptr i8, ptr null, i64 164
	%2 = getelementptr i8, ptr null, i64 200
	%3 = getelementptr i8, ptr null, i64 300
	%4 = load <8 x float>, ptr %0, align 4
	%5 = load <8 x float>, ptr %1, align 4
	%6 = load <8 x float>, ptr %2, align 4
	%7 = load <8 x float>, ptr %3, align 4
	%8 = fpext <8 x float> %4 to <8 x double>
	%9 = fpext <8 x float> %5 to <8 x double>
	%10 = fpext <8 x float> %6 to <8 x double>
	%11 = fpext <8 x float> %7 to <8 x double>
	%12 = fadd <8 x double> zeroinitializer, %8
	%13 = fadd <8 x double> zeroinitializer, %9
	%14 = fadd <8 x double> zeroinitializer, %10
	%15 = fadd <8 x double> zeroinitializer, %11
	%16 = fptrunc <8 x double> %12 to <8 x float>
	%17 = fptrunc <8 x double> %13 to <8 x float>
	%18 = fptrunc <8 x double> %14 to <8 x float>
	%19 = fptrunc <8 x double> %15 to <8 x float>
	%20 = fcmp ogt <8 x float> zeroinitializer, %16
	%21 = fcmp ogt <8 x float> zeroinitializer, %17
	%22 = fcmp ogt <8 x float> zeroinitializer, %18
	%23 = fcmp ogt <8 x float> zeroinitializer, %19
	ret void
	}

	; Two <2 x float> phis in %for.cond.cleanup are fed by two selects in
	; %for.body over the same loaded value: one select uses a constant all-true
	; mask, the other uses an fcmp result. The expected output merges the phis
	; into one <4 x float> phi and the selects into one <4 x float> select whose
	; <4 x i1> mask is assembled from the splat-true mask and the fcmp result.
	; The float argument %0 is not referenced by the body.
	define void @test3(float %0) {
	; CHECK-LABEL: @test3(
	; CHECK-NEXT: entry:
	; CHECK-NEXT: br label [[FOR_BODY_LR_PH:%.*]]
	; CHECK: for.body.lr.ph:
	; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> zeroinitializer, i64 0)
	; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP1]], <2 x float> zeroinitializer, i64 2)
	; CHECK-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]]
	; CHECK: for.cond.cleanup:
	; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x float> [ [[TMP2]], [[FOR_BODY_LR_PH]] ], [ [[TMP10:%.*]], [[FOR_BODY]] ]
	; CHECK-NEXT: ret void
	; CHECK: for.body:
	; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr null, align 4
	; CHECK-NEXT: [[TMP5:%.*]] = fcmp olt <2 x float> zeroinitializer, [[TMP4]]
	; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i1> @llvm.vector.insert.v4i1.v2i1(<4 x i1> poison, <2 x i1> splat (i1 true), i64 0)
	; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i1> @llvm.vector.insert.v4i1.v2i1(<4 x i1> [[TMP6]], <2 x i1> [[TMP5]], i64 2)
	; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP4]], i64 0)
	; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
	; CHECK-NEXT: [[TMP10]] = select <4 x i1> [[TMP7]], <4 x float> [[TMP9]], <4 x float> [[TMP2]]
	; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
	;
	entry:
	br label %for.body.lr.ph

	for.body.lr.ph:
	br i1 false, label %for.cond.cleanup, label %for.body

	for.cond.cleanup: ; preds = %for.body, %for.body.lr.ph
	%1 = phi <2 x float> [ zeroinitializer, %for.body.lr.ph ], [ %5, %for.body ]
	%2 = phi <2 x float> [ zeroinitializer, %for.body.lr.ph ], [ %6, %for.body ]
	ret void

	for.body:
	%3 = load <2 x float>, ptr null, align 4
	%4 = fcmp olt <2 x float> zeroinitializer, %3
	%5 = select <2 x i1> <i1 true, i1 true>, <2 x float> %3, <2 x float> zeroinitializer
	%6 = select <2 x i1> %4, <2 x float> %3, <2 x float> zeroinitializer
	br label %for.cond.cleanup
	}

	; Scalar input: lanes 0, 1, 2, 4, 5 and 6 are extracted from one <8 x float>
	; value, routed through six scalar float phis, multiplied by 0.0, and summed
	; with `reassoc nsz` fadds into two three-term totals (lanes 0-2 and lanes
	; 4-6), each passed to llvm.sqrt.f32. Block %8 is never branched to inside
	; this function; it only supplies the 0.0 incoming values of the phis.
	;
	; The two run configurations expect different output for this function:
	;  * without -slp-vectorize-non-power-of-2: a <2 x float> phi (lanes 0 and 4)
	;    plus a <4 x float> phi (lanes 1, 2, 5, 6), <2 x float> fmuls, and the
	;    additions left as scalar fadds;
	;  * with -slp-vectorize-non-power-of-2: a single <6 x float> phi, two
	;    <3 x float> fmuls, and two llvm.vector.reduce.fadd.v3f32 calls.
	define ptr @test4() {
	; POWEROF2-LABEL: @test4(
	; POWEROF2-NEXT: [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer
	; POWEROF2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
	; POWEROF2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
	; POWEROF2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 0, i32 4>
	; POWEROF2-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0)
	; POWEROF2-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2)
	; POWEROF2-NEXT: br label [[TMP8:%.*]]
	; POWEROF2: 7:
	; POWEROF2-NEXT: br label [[TMP8]]
	; POWEROF2: 8:
	; POWEROF2-NEXT: [[TMP9:%.*]] = phi <2 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP4]], [[TMP0:%.*]] ]
	; POWEROF2-NEXT: [[TMP10:%.*]] = phi <4 x float> [ poison, [[TMP7]] ], [ [[TMP6]], [[TMP0]] ]
	; POWEROF2-NEXT: br label [[TMP11:%.*]]
	; POWEROF2: 11:
	; POWEROF2-NEXT: [[TMP12:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 0)
	; POWEROF2-NEXT: [[TMP13:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
	; POWEROF2-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 2)
	; POWEROF2-NEXT: [[TMP15:%.*]] = fmul <2 x float> zeroinitializer, [[TMP14]]
	; POWEROF2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
	; POWEROF2-NEXT: [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP18]]
	; POWEROF2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
	; POWEROF2-NEXT: [[TMP19:%.*]] = fmul float [[TMP30]], 0.000000e+00
	; POWEROF2-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
	; POWEROF2-NEXT: [[TMP21:%.*]] = fadd reassoc nsz float [[TMP20]], [[TMP17]]
	; POWEROF2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
	; POWEROF2-NEXT: [[TMP23:%.*]] = fadd reassoc nsz float [[TMP22]], [[TMP19]]
	; POWEROF2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
	; POWEROF2-NEXT: [[TMP25:%.*]] = fadd reassoc nsz float [[TMP21]], [[TMP24]]
	; POWEROF2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
	; POWEROF2-NEXT: [[TMP27:%.*]] = fadd reassoc nsz float [[TMP23]], [[TMP26]]
	; POWEROF2-NEXT: [[TMP28:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP25]])
	; POWEROF2-NEXT: [[TMP29:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP27]])
	; POWEROF2-NEXT: ret ptr null
	;
	; NONPOWEROF2-LABEL: @test4(
	; NONPOWEROF2-NEXT: [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer
	; NONPOWEROF2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
	; NONPOWEROF2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <3 x i32> <i32 4, i32 5, i32 6>
	; NONPOWEROF2-NEXT: [[TMP4:%.*]] = call <6 x float> @llvm.vector.insert.v6f32.v3f32(<6 x float> poison, <3 x float> [[TMP2]], i64 0)
	; NONPOWEROF2-NEXT: [[TMP5:%.*]] = call <6 x float> @llvm.vector.insert.v6f32.v3f32(<6 x float> [[TMP4]], <3 x float> [[TMP3]], i64 3)
	; NONPOWEROF2-NEXT: br label [[TMP7:%.*]]
	; NONPOWEROF2: 6:
	; NONPOWEROF2-NEXT: br label [[TMP7]]
	; NONPOWEROF2: 7:
	; NONPOWEROF2-NEXT: [[TMP8:%.*]] = phi <6 x float> [ poison, [[TMP6:%.*]] ], [ [[TMP5]], [[TMP0:%.*]] ]
	; NONPOWEROF2-NEXT: br label [[TMP9:%.*]]
	; NONPOWEROF2: 9:
	; NONPOWEROF2-NEXT: [[TMP10:%.*]] = call <3 x float> @llvm.vector.extract.v3f32.v6f32(<6 x float> [[TMP8]], i64 0)
	; NONPOWEROF2-NEXT: [[TMP11:%.*]] = fmul <3 x float> zeroinitializer, [[TMP10]]
	; NONPOWEROF2-NEXT: [[TMP12:%.*]] = call <3 x float> @llvm.vector.extract.v3f32.v6f32(<6 x float> [[TMP8]], i64 3)
	; NONPOWEROF2-NEXT: [[TMP13:%.*]] = fmul <3 x float> zeroinitializer, [[TMP12]]
	; NONPOWEROF2-NEXT: [[TMP14:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP11]])
	; NONPOWEROF2-NEXT: [[TMP15:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP13]])
	; NONPOWEROF2-NEXT: [[TMP16:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP14]])
	; NONPOWEROF2-NEXT: [[TMP17:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP15]])
	; NONPOWEROF2-NEXT: ret ptr null
	;
	%1 = fadd <8 x float> zeroinitializer, zeroinitializer
	%2 = extractelement <8 x float> %1, i64 0
	%3 = extractelement <8 x float> %1, i64 1
	%4 = extractelement <8 x float> %1, i64 2
	%5 = extractelement <8 x float> %1, i64 4
	%6 = extractelement <8 x float> %1, i64 5
	%7 = extractelement <8 x float> %1, i64 6
	br label %9

	8:
	br label %9

	9:
	%10 = phi float [ 0.000000e+00, %8 ], [ %7, %0 ]
	%11 = phi float [ 0.000000e+00, %8 ], [ %6, %0 ]
	%12 = phi float [ 0.000000e+00, %8 ], [ %5, %0 ]
	%13 = phi float [ 0.000000e+00, %8 ], [ %4, %0 ]
	%14 = phi float [ 0.000000e+00, %8 ], [ %3, %0 ]
	%15 = phi float [ 0.000000e+00, %8 ], [ %2, %0 ]
	br label %16

	16:
	%17 = fmul float %14, 0.000000e+00
	%18 = fmul float 0.000000e+00, %11
	%19 = fmul float 0.000000e+00, %15
	%20 = fmul float %12, 0.000000e+00
	%21 = fadd reassoc nsz float %17, %19
	%22 = fadd reassoc nsz float %18, %20
	%23 = fmul float %13, 0.000000e+00
	%24 = fmul float %10, 0.000000e+00
	%25 = fadd reassoc nsz float %21, %23
	%26 = fadd reassoc nsz float %22, %24
	%27 = tail call float @llvm.sqrt.f32(float %25)
	%28 = tail call float @llvm.sqrt.f32(float %26)
	ret ptr null
	}

	; Two <2 x double> fdivs and four <2 x double> fadds; only %add0 and %add2
	; consume an fdiv result, while %add1 and %add3 add two zero vectors. Each
	; fadd feeds a single-incoming phi in %for.end47. The expected output merges
	; the fdivs into one <4 x double> fdiv, the fadds into one <8 x double> fadd
	; (its second operand built by shuffling the fdiv lanes together with zero
	; lanes), and the four phis into one <8 x double> phi.
	define i32 @test5() {
	; CHECK-LABEL: @test5(
	; CHECK-NEXT: entry:
	; CHECK-NEXT: [[TMP0:%.*]] = call <4 x double> @llvm.vector.insert.v4f64.v2f64(<4 x double> poison, <2 x double> zeroinitializer, i64 0)
	; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.vector.insert.v4f64.v2f64(<4 x double> [[TMP0]], <2 x double> zeroinitializer, i64 2)
	; CHECK-NEXT: [[TMP2:%.*]] = fdiv <4 x double> [[TMP1]], [[TMP1]]
	; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> poison, <2 x double> zeroinitializer, i64 0)
	; CHECK-NEXT: [[TMP4:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP3]], <2 x double> zeroinitializer, i64 2)
	; CHECK-NEXT: [[TMP5:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP4]], <2 x double> zeroinitializer, i64 4)
	; CHECK-NEXT: [[TMP6:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP5]], <2 x double> zeroinitializer, i64 6)
	; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> poison, <2 x double> zeroinitializer, i64 2)
	; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP7]], <2 x double> zeroinitializer, i64 6)
	; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> poison, <4 x double> [[TMP2]], i64 0)
	; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 2, i32 3, i32 14, i32 15>
	; CHECK-NEXT: [[TMP11:%.*]] = fadd <8 x double> [[TMP6]], [[TMP10]]
	; CHECK-NEXT: br label [[FOR_END47:%.*]]
	; CHECK: for.end47:
	; CHECK-NEXT: [[TMP12:%.*]] = phi <8 x double> [ [[TMP11]], [[ENTRY:%.*]] ]
	; CHECK-NEXT: ret i32 0
	;
	entry:
	%div0 = fdiv <2 x double> zeroinitializer, zeroinitializer
	%div1 = fdiv <2 x double> zeroinitializer, zeroinitializer
	%add0 = fadd <2 x double> zeroinitializer, %div0
	%add1 = fadd <2 x double> zeroinitializer, zeroinitializer
	%add2 = fadd <2 x double> %div1, zeroinitializer
	%add3 = fadd <2 x double> zeroinitializer, zeroinitializer
	br label %for.end47

	for.end47: ; preds = %entry
	%add0.lcssa = phi <2 x double> [ %add0, %entry ]
	%add1.lcssa = phi <2 x double> [ %add1, %entry ]
	%add2.lcssa = phi <2 x double> [ %add2, %entry ]
	%add3.lcssa = phi <2 x double> [ %add3, %entry ]
	ret i32 0
	}