; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx --verify-machineinstrs < %s \
; RUN: | FileCheck --check-prefix=LA32 %s
; RUN: llc --mtriple=loongarch64 --mattr=+lasx --verify-machineinstrs < %s \
; RUN: | FileCheck --check-prefix=LA64 %s
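
; These functions each compute a field address once (a GEP with a constant
; offset from a loop-invariant base) and use it both inside a call-carrying
; loop and in the exit block. The checks show the constant offset folded into
; the immediate operand of the memory accesses (e.g. "ld.d $a0, $s1, 8")
; rather than materialized with a separate add; the "sink_fold" names suggest
; this sinking/folding behavior is what is under test.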
%struct.S = type { i64, i64, i8 }
%struct.F = type { float, double, float }
%struct.V = type { <4 x i32>, <4 x i32>, <16 x i16> }
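
; Accumulate the second i64 field of %struct.S (offset 8) across a loop that
; calls @f each iteration, then store the sum back through the same address.
; Roughly equivalent C (a sketch; the field name "y" is invented):
;
;   void sink_fold_i64(int64_t k, int64_t n, struct S *a) {
;     int64_t s = 0;
;     for (int64_t i = 0; i < n; ++i) { f(a); s += a[k].y; }
;     a[k].y = s;
;   }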
define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-LABEL: sink_fold_i64:
; LA32: # %bb.0: # %entry
; LA32-NEXT: addi.w $sp, $sp, -48
; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill
; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill
; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill
; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill
; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill
; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill
; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill
; LA32-NEXT: st.w $s5, $sp, 16 # 4-byte Folded Spill
; LA32-NEXT: st.w $s6, $sp, 12 # 4-byte Folded Spill
; LA32-NEXT: move $s0, $a3
; LA32-NEXT: move $s1, $a2
; LA32-NEXT: slli.w $a1, $a0, 4
; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
; LA32-NEXT: sltui $a1, $a3, 1
; LA32-NEXT: slti $a2, $a3, 0
; LA32-NEXT: masknez $a2, $a2, $a1
; LA32-NEXT: sltui $a3, $s1, 1
; LA32-NEXT: maskeqz $a1, $a3, $a1
; LA32-NEXT: or $a1, $a1, $a2
; LA32-NEXT: add.w $s2, $a4, $a0
; LA32-NEXT: bnez $a1, .LBB0_3
; LA32-NEXT: # %bb.1: # %for.body.preheader
; LA32-NEXT: move $fp, $a4
; LA32-NEXT: move $s4, $zero
; LA32-NEXT: move $s5, $zero
; LA32-NEXT: move $s3, $zero
; LA32-NEXT: move $s6, $zero
; LA32-NEXT: .p2align 4, , 16
; LA32-NEXT: .LBB0_2: # %for.body
; LA32-NEXT: # =>This Inner Loop Header: Depth=1
; LA32-NEXT: move $a0, $fp
; LA32-NEXT: bl f
; LA32-NEXT: ld.w $a0, $s2, 12
; LA32-NEXT: ld.w $a1, $s2, 8
; LA32-NEXT: add.w $a0, $a0, $s6
; LA32-NEXT: add.w $s3, $a1, $s3
; LA32-NEXT: sltu $a1, $s3, $a1
; LA32-NEXT: addi.w $s4, $s4, 1
; LA32-NEXT: sltui $a2, $s4, 1
; LA32-NEXT: add.w $s5, $s5, $a2
; LA32-NEXT: xor $a2, $s4, $s1
; LA32-NEXT: xor $a3, $s5, $s0
; LA32-NEXT: or $a2, $a2, $a3
; LA32-NEXT: add.w $s6, $a0, $a1
; LA32-NEXT: bnez $a2, .LBB0_2
; LA32-NEXT: b .LBB0_4
; LA32-NEXT: .LBB0_3:
; LA32-NEXT: move $s3, $zero
; LA32-NEXT: move $s6, $zero
; LA32-NEXT: .LBB0_4: # %for.cond.cleanup
; LA32-NEXT: st.w $s3, $s2, 8
; LA32-NEXT: st.w $s6, $s2, 12
; LA32-NEXT: ld.w $s6, $sp, 12 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s5, $sp, 16 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload
; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload
; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload
; LA32-NEXT: addi.w $sp, $sp, 48
; LA32-NEXT: ret
;
; LA64-LABEL: sink_fold_i64:
; LA64: # %bb.0: # %entry
; LA64-NEXT: addi.d $sp, $sp, -48
; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
; LA64-NEXT: st.d $s2, $sp, 8 # 8-byte Folded Spill
; LA64-NEXT: move $s0, $a1
; LA64-NEXT: slli.d $a1, $a0, 4
; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
; LA64-NEXT: add.d $s1, $a2, $a0
; LA64-NEXT: blez $s0, .LBB0_3
; LA64-NEXT: # %bb.1: # %for.body.preheader
; LA64-NEXT: move $fp, $a2
; LA64-NEXT: move $s2, $zero
; LA64-NEXT: .p2align 4, , 16
; LA64-NEXT: .LBB0_2: # %for.body
; LA64-NEXT: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: move $a0, $fp
; LA64-NEXT: pcaddu18i $ra, %call36(f)
; LA64-NEXT: jirl $ra, $ra, 0
; LA64-NEXT: ld.d $a0, $s1, 8
; LA64-NEXT: addi.d $s0, $s0, -1
; LA64-NEXT: add.d $s2, $a0, $s2
; LA64-NEXT: bnez $s0, .LBB0_2
; LA64-NEXT: b .LBB0_4
; LA64-NEXT: .LBB0_3:
; LA64-NEXT: move $s2, $zero
; LA64-NEXT: .LBB0_4: # %for.cond.cleanup
; LA64-NEXT: st.d $s2, $s1, 8
; LA64-NEXT: ld.d $s2, $sp, 8 # 8-byte Folded Reload
; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
; LA64-NEXT: addi.d $sp, $sp, 48
; LA64-NEXT: ret
entry:
  %y = getelementptr inbounds %struct.S, ptr %a, i64 %k, i32 1
  %cmp4 = icmp sgt i64 %n, 0
  br i1 %cmp4, label %for.body, label %for.cond.cleanup

for.body:                                         ; preds = %entry, %for.body
  %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
  %s.05 = phi i64 [ 0, %entry ], [ %add, %for.body ]
  call void @f(ptr %a)
  %0 = load i64, ptr %y
  %add = add nsw i64 %0, %s.05
  %inc = add nuw nsw i64 %i.06, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  %s.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
  store i64 %s.0.lcssa, ptr %y
  ret void
}
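
; Same pattern with a float accumulator: the third field of %struct.F
; (offset 16) feeds an fadd chain in the loop, and the result is stored back
; through the same folded address in the exit block.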
define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-LABEL: sink_fold_f32:
; LA32: # %bb.0: # %entry
; LA32-NEXT: addi.w $sp, $sp, -48
; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill
; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill
; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill
; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill
; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill
; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill
; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill
; LA32-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill
; LA32-NEXT: move $s0, $a3
; LA32-NEXT: move $s1, $a2
; LA32-NEXT: slli.w $a1, $a0, 4
; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
; LA32-NEXT: sltui $a1, $a3, 1
; LA32-NEXT: slti $a2, $a3, 0
; LA32-NEXT: masknez $a2, $a2, $a1
; LA32-NEXT: sltui $a3, $s1, 1
; LA32-NEXT: maskeqz $a1, $a3, $a1
; LA32-NEXT: or $a1, $a1, $a2
; LA32-NEXT: add.w $s2, $a4, $a0
; LA32-NEXT: bnez $a1, .LBB1_3
; LA32-NEXT: # %bb.1: # %for.body.preheader
; LA32-NEXT: move $fp, $a4
; LA32-NEXT: move $s3, $zero
; LA32-NEXT: move $s4, $zero
; LA32-NEXT: movgr2fr.w $fs0, $zero
; LA32-NEXT: .p2align 4, , 16
; LA32-NEXT: .LBB1_2: # %for.body
; LA32-NEXT: # =>This Inner Loop Header: Depth=1
; LA32-NEXT: move $a0, $fp
; LA32-NEXT: bl f
; LA32-NEXT: fld.s $fa0, $s2, 16
; LA32-NEXT: addi.w $s3, $s3, 1
; LA32-NEXT: sltui $a0, $s3, 1
; LA32-NEXT: add.w $s4, $s4, $a0
; LA32-NEXT: xor $a0, $s3, $s1
; LA32-NEXT: xor $a1, $s4, $s0
; LA32-NEXT: or $a0, $a0, $a1
; LA32-NEXT: fadd.s $fs0, $fa0, $fs0
; LA32-NEXT: bnez $a0, .LBB1_2
; LA32-NEXT: b .LBB1_4
; LA32-NEXT: .LBB1_3:
; LA32-NEXT: movgr2fr.w $fs0, $zero
; LA32-NEXT: .LBB1_4: # %for.cond.cleanup
; LA32-NEXT: fst.s $fs0, $s2, 16
; LA32-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload
; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload
; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload
; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload
; LA32-NEXT: addi.w $sp, $sp, 48
; LA32-NEXT: ret
;
; LA64-LABEL: sink_fold_f32:
; LA64: # %bb.0: # %entry
; LA64-NEXT: addi.d $sp, $sp, -48
; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
; LA64-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill
; LA64-NEXT: move $s0, $a1
; LA64-NEXT: slli.d $a1, $a0, 4
; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
; LA64-NEXT: add.d $s1, $a2, $a0
; LA64-NEXT: blez $s0, .LBB1_3
; LA64-NEXT: # %bb.1: # %for.body.preheader
; LA64-NEXT: move $fp, $a2
; LA64-NEXT: movgr2fr.w $fs0, $zero
; LA64-NEXT: .p2align 4, , 16
; LA64-NEXT: .LBB1_2: # %for.body
; LA64-NEXT: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: move $a0, $fp
; LA64-NEXT: pcaddu18i $ra, %call36(f)
; LA64-NEXT: jirl $ra, $ra, 0
; LA64-NEXT: fld.s $fa0, $s1, 16
; LA64-NEXT: addi.d $s0, $s0, -1
; LA64-NEXT: fadd.s $fs0, $fa0, $fs0
; LA64-NEXT: bnez $s0, .LBB1_2
; LA64-NEXT: b .LBB1_4
; LA64-NEXT: .LBB1_3:
; LA64-NEXT: movgr2fr.w $fs0, $zero
; LA64-NEXT: .LBB1_4: # %for.cond.cleanup
; LA64-NEXT: fst.s $fs0, $s1, 16
; LA64-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload
; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
; LA64-NEXT: addi.d $sp, $sp, 48
; LA64-NEXT: ret
entry:
  %y = getelementptr inbounds %struct.F, ptr %a, i64 %k, i32 2
  %cmp4 = icmp sgt i64 %n, 0
  br i1 %cmp4, label %for.body, label %for.cond.cleanup

for.body:                                         ; preds = %entry, %for.body
  %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
  %s.05 = phi float [ 0.0, %entry ], [ %add, %for.body ]
  call void @f(ptr %a)
  %0 = load float, ptr %y
  %add = fadd float %0, %s.05
  %inc = add nuw nsw i64 %i.06, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  %s.0.lcssa = phi float [ 0.0, %entry ], [ %add, %for.body ]
  store float %s.0.lcssa, ptr %y
  ret void
}
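
; LSX variant: a <4 x i32> field of %struct.V (offset 16) is loaded with vld
; and accumulated with vadd.w; the sum is stored back with vst using the same
; base-plus-immediate address.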
define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-LABEL: sink_fold_v4i32:
; LA32: # %bb.0: # %entry
; LA32-NEXT: addi.w $sp, $sp, -48
; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill
; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill
; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill
; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill
; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill
; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill
; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill
; LA32-NEXT: move $s0, $a3
; LA32-NEXT: move $s1, $a2
; LA32-NEXT: slli.w $a0, $a0, 6
; LA32-NEXT: sltui $a1, $a3, 1
; LA32-NEXT: slti $a2, $a3, 0
; LA32-NEXT: masknez $a2, $a2, $a1
; LA32-NEXT: sltui $a3, $s1, 1
; LA32-NEXT: maskeqz $a1, $a3, $a1
; LA32-NEXT: or $a1, $a1, $a2
; LA32-NEXT: add.w $s2, $a4, $a0
; LA32-NEXT: bnez $a1, .LBB2_3
; LA32-NEXT: # %bb.1: # %for.body.preheader
; LA32-NEXT: move $fp, $a4
; LA32-NEXT: move $s3, $zero
; LA32-NEXT: move $s4, $zero
; LA32-NEXT: vrepli.b $vr0, 0
; LA32-NEXT: .p2align 4, , 16
; LA32-NEXT: .LBB2_2: # %for.body
; LA32-NEXT: # =>This Inner Loop Header: Depth=1
; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
; LA32-NEXT: move $a0, $fp
; LA32-NEXT: bl f
; LA32-NEXT: vld $vr0, $s2, 16
; LA32-NEXT: addi.w $s3, $s3, 1
; LA32-NEXT: sltui $a0, $s3, 1
; LA32-NEXT: add.w $s4, $s4, $a0
; LA32-NEXT: xor $a0, $s3, $s1
; LA32-NEXT: xor $a1, $s4, $s0
; LA32-NEXT: or $a0, $a0, $a1
; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
; LA32-NEXT: vadd.w $vr1, $vr0, $vr1
; LA32-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill
; LA32-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
; LA32-NEXT: bnez $a0, .LBB2_2
; LA32-NEXT: b .LBB2_4
; LA32-NEXT: .LBB2_3:
; LA32-NEXT: vrepli.b $vr0, 0
; LA32-NEXT: .LBB2_4: # %for.cond.cleanup
; LA32-NEXT: vst $vr0, $s2, 16
; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload
; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload
; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload
; LA32-NEXT: addi.w $sp, $sp, 48
; LA32-NEXT: ret
;
; LA64-LABEL: sink_fold_v4i32:
; LA64: # %bb.0: # %entry
; LA64-NEXT: addi.d $sp, $sp, -48
; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
; LA64-NEXT: slli.d $a0, $a0, 6
; LA64-NEXT: add.d $s1, $a2, $a0
; LA64-NEXT: blez $a1, .LBB2_3
; LA64-NEXT: # %bb.1: # %for.body.preheader
; LA64-NEXT: move $fp, $a2
; LA64-NEXT: move $s0, $a1
; LA64-NEXT: vrepli.b $vr0, 0
; LA64-NEXT: .p2align 4, , 16
; LA64-NEXT: .LBB2_2: # %for.body
; LA64-NEXT: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
; LA64-NEXT: move $a0, $fp
; LA64-NEXT: pcaddu18i $ra, %call36(f)
; LA64-NEXT: jirl $ra, $ra, 0
; LA64-NEXT: vld $vr0, $s1, 16
; LA64-NEXT: addi.d $s0, $s0, -1
; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
; LA64-NEXT: vadd.w $vr1, $vr0, $vr1
; LA64-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill
; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
; LA64-NEXT: bnez $s0, .LBB2_2
; LA64-NEXT: b .LBB2_4
; LA64-NEXT: .LBB2_3:
; LA64-NEXT: vrepli.b $vr0, 0
; LA64-NEXT: .LBB2_4: # %for.cond.cleanup
; LA64-NEXT: vst $vr0, $s1, 16
; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
; LA64-NEXT: addi.d $sp, $sp, 48
; LA64-NEXT: ret
entry:
  %y = getelementptr inbounds %struct.V, ptr %a, i64 %k, i32 1
  %cmp = icmp sgt i64 %n, 0
  br i1 %cmp, label %for.body, label %for.cond.cleanup

for.body:                                         ; preds = %entry, %for.body
  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
  %sum.0 = phi <4 x i32> [ zeroinitializer, %entry ], [ %addv, %for.body ]
  call void @f(ptr %a)
  %v = load <4 x i32>, ptr %y
  %addv = add <4 x i32> %v, %sum.0
  %inc = add nuw nsw i64 %i.0, 1
  %exitcond = icmp eq i64 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  %sum.lcssa = phi <4 x i32> [ zeroinitializer, %entry ], [ %addv, %for.body ]
  store <4 x i32> %sum.lcssa, ptr %y
  ret void
}
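
; LASX variant: a <16 x i16> field of %struct.V (offset 32) is accumulated
; with xvadd.h; the load, the store, and the 32-byte spill slot kept live
; across the call all use immediate offsets.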
define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-LABEL: sink_fold_v16i16:
; LA32: # %bb.0: # %entry
; LA32-NEXT: addi.w $sp, $sp, -80
; LA32-NEXT: st.w $ra, $sp, 76 # 4-byte Folded Spill
; LA32-NEXT: st.w $fp, $sp, 72 # 4-byte Folded Spill
; LA32-NEXT: st.w $s0, $sp, 68 # 4-byte Folded Spill
; LA32-NEXT: st.w $s1, $sp, 64 # 4-byte Folded Spill
; LA32-NEXT: st.w $s2, $sp, 60 # 4-byte Folded Spill
; LA32-NEXT: st.w $s3, $sp, 56 # 4-byte Folded Spill
; LA32-NEXT: st.w $s4, $sp, 52 # 4-byte Folded Spill
; LA32-NEXT: move $s0, $a3
; LA32-NEXT: move $s1, $a2
; LA32-NEXT: slli.w $a0, $a0, 6
; LA32-NEXT: sltui $a1, $a3, 1
; LA32-NEXT: slti $a2, $a3, 0
; LA32-NEXT: masknez $a2, $a2, $a1
; LA32-NEXT: sltui $a3, $s1, 1
; LA32-NEXT: maskeqz $a1, $a3, $a1
; LA32-NEXT: or $a1, $a1, $a2
; LA32-NEXT: add.w $s2, $a4, $a0
; LA32-NEXT: bnez $a1, .LBB3_3
; LA32-NEXT: # %bb.1: # %for.body.preheader
; LA32-NEXT: move $fp, $a4
; LA32-NEXT: move $s3, $zero
; LA32-NEXT: move $s4, $zero
; LA32-NEXT: xvrepli.b $xr0, 0
; LA32-NEXT: .p2align 4, , 16
; LA32-NEXT: .LBB3_2: # %for.body
; LA32-NEXT: # =>This Inner Loop Header: Depth=1
; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
; LA32-NEXT: move $a0, $fp
; LA32-NEXT: bl f
; LA32-NEXT: xvld $xr0, $s2, 32
; LA32-NEXT: addi.w $s3, $s3, 1
; LA32-NEXT: sltui $a0, $s3, 1
; LA32-NEXT: add.w $s4, $s4, $a0
; LA32-NEXT: xor $a0, $s3, $s1
; LA32-NEXT: xor $a1, $s4, $s0
; LA32-NEXT: or $a0, $a0, $a1
; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
; LA32-NEXT: xvadd.h $xr1, $xr0, $xr1
; LA32-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill
; LA32-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
; LA32-NEXT: bnez $a0, .LBB3_2
; LA32-NEXT: b .LBB3_4
; LA32-NEXT: .LBB3_3:
; LA32-NEXT: xvrepli.b $xr0, 0
; LA32-NEXT: .LBB3_4: # %for.cond.cleanup
; LA32-NEXT: xvst $xr0, $s2, 32
; LA32-NEXT: ld.w $s4, $sp, 52 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s3, $sp, 56 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s2, $sp, 60 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s1, $sp, 64 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s0, $sp, 68 # 4-byte Folded Reload
; LA32-NEXT: ld.w $fp, $sp, 72 # 4-byte Folded Reload
; LA32-NEXT: ld.w $ra, $sp, 76 # 4-byte Folded Reload
; LA32-NEXT: addi.w $sp, $sp, 80
; LA32-NEXT: ret
;
; LA64-LABEL: sink_fold_v16i16:
; LA64: # %bb.0: # %entry
; LA64-NEXT: addi.d $sp, $sp, -80
; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill
; LA64-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill
; LA64-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill
; LA64-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill
; LA64-NEXT: slli.d $a0, $a0, 6
; LA64-NEXT: add.d $s1, $a2, $a0
; LA64-NEXT: blez $a1, .LBB3_3
; LA64-NEXT: # %bb.1: # %for.body.preheader
; LA64-NEXT: move $fp, $a2
; LA64-NEXT: move $s0, $a1
; LA64-NEXT: xvrepli.b $xr0, 0
; LA64-NEXT: .p2align 4, , 16
; LA64-NEXT: .LBB3_2: # %for.body
; LA64-NEXT: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
; LA64-NEXT: move $a0, $fp
; LA64-NEXT: pcaddu18i $ra, %call36(f)
; LA64-NEXT: jirl $ra, $ra, 0
; LA64-NEXT: xvld $xr0, $s1, 32
; LA64-NEXT: addi.d $s0, $s0, -1
; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
; LA64-NEXT: xvadd.h $xr1, $xr0, $xr1
; LA64-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill
; LA64-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
; LA64-NEXT: bnez $s0, .LBB3_2
; LA64-NEXT: b .LBB3_4
; LA64-NEXT: .LBB3_3:
; LA64-NEXT: xvrepli.b $xr0, 0
; LA64-NEXT: .LBB3_4: # %for.cond.cleanup
; LA64-NEXT: xvst $xr0, $s1, 32
; LA64-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload
; LA64-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload
; LA64-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload
; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload
; LA64-NEXT: addi.d $sp, $sp, 80
; LA64-NEXT: ret
entry:
  %y = getelementptr inbounds %struct.V, ptr %a, i64 %k, i32 2
  %cmp = icmp sgt i64 %n, 0
  br i1 %cmp, label %for.body, label %for.cond.cleanup

for.body:                                         ; preds = %entry, %for.body
  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
  %sum.0 = phi <16 x i16> [ zeroinitializer, %entry ], [ %addv, %for.body ]
  call void @f(ptr %a)
  %v = load <16 x i16>, ptr %y
  %addv = add <16 x i16> %v, %sum.0
  %inc = add nuw nsw i64 %i.0, 1
  %exitcond = icmp eq i64 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  %sum.lcssa = phi <16 x i16> [ zeroinitializer, %entry ], [ %addv, %for.body ]
  store <16 x i16> %sum.lcssa, ptr %y
  ret void
}
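
; Splat-load case: an i8 field of %struct.S (offset 16) is broadcast into a
; <16 x i8> with vldrepl.b and accumulated, and element 1 of the final sum is
; stored back with vstelm.b; both instructions fold the field offset.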
define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-LABEL: sink_fold_extracti8:
; LA32: # %bb.0: # %entry
; LA32-NEXT: addi.w $sp, $sp, -48
; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill
; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill
; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill
; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill
; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill
; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill
; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill
; LA32-NEXT: move $s0, $a3
; LA32-NEXT: move $s1, $a2
; LA32-NEXT: slli.w $a1, $a0, 4
; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
; LA32-NEXT: sltui $a1, $a3, 1
; LA32-NEXT: slti $a2, $a3, 0
; LA32-NEXT: masknez $a2, $a2, $a1
; LA32-NEXT: sltui $a3, $s1, 1
; LA32-NEXT: maskeqz $a1, $a3, $a1
; LA32-NEXT: or $a1, $a1, $a2
; LA32-NEXT: add.w $s2, $a4, $a0
; LA32-NEXT: bnez $a1, .LBB4_3
; LA32-NEXT: # %bb.1: # %for.body.preheader
; LA32-NEXT: move $fp, $a4
; LA32-NEXT: move $s3, $zero
; LA32-NEXT: move $s4, $zero
; LA32-NEXT: vrepli.b $vr0, 0
; LA32-NEXT: .p2align 4, , 16
; LA32-NEXT: .LBB4_2: # %for.body
; LA32-NEXT: # =>This Inner Loop Header: Depth=1
; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
; LA32-NEXT: move $a0, $fp
; LA32-NEXT: bl f
; LA32-NEXT: vldrepl.b $vr0, $s2, 16
; LA32-NEXT: addi.w $s3, $s3, 1
; LA32-NEXT: sltui $a0, $s3, 1
; LA32-NEXT: add.w $s4, $s4, $a0
; LA32-NEXT: xor $a0, $s3, $s1
; LA32-NEXT: xor $a1, $s4, $s0
; LA32-NEXT: or $a0, $a0, $a1
; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
; LA32-NEXT: vadd.b $vr1, $vr0, $vr1
; LA32-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill
; LA32-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
; LA32-NEXT: bnez $a0, .LBB4_2
; LA32-NEXT: b .LBB4_4
; LA32-NEXT: .LBB4_3:
; LA32-NEXT: vrepli.b $vr0, 0
; LA32-NEXT: .LBB4_4: # %for.cond.cleanup
; LA32-NEXT: vstelm.b $vr0, $s2, 16, 1
; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload
; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload
; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload
; LA32-NEXT: addi.w $sp, $sp, 48
; LA32-NEXT: ret
;
; LA64-LABEL: sink_fold_extracti8:
; LA64: # %bb.0: # %entry
; LA64-NEXT: addi.d $sp, $sp, -48
; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
; LA64-NEXT: move $s0, $a1
; LA64-NEXT: slli.d $a1, $a0, 4
; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
; LA64-NEXT: add.d $s1, $a2, $a0
; LA64-NEXT: blez $s0, .LBB4_3
; LA64-NEXT: # %bb.1: # %for.body.preheader
; LA64-NEXT: move $fp, $a2
; LA64-NEXT: vrepli.b $vr0, 0
; LA64-NEXT: .p2align 4, , 16
; LA64-NEXT: .LBB4_2: # %for.body
; LA64-NEXT: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
; LA64-NEXT: move $a0, $fp
; LA64-NEXT: pcaddu18i $ra, %call36(f)
; LA64-NEXT: jirl $ra, $ra, 0
; LA64-NEXT: vldrepl.b $vr0, $s1, 16
; LA64-NEXT: addi.d $s0, $s0, -1
; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
; LA64-NEXT: vadd.b $vr1, $vr0, $vr1
; LA64-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill
; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
; LA64-NEXT: bnez $s0, .LBB4_2
; LA64-NEXT: b .LBB4_4
; LA64-NEXT: .LBB4_3:
; LA64-NEXT: vrepli.b $vr0, 0
; LA64-NEXT: .LBB4_4: # %for.cond.cleanup
; LA64-NEXT: vstelm.b $vr0, $s1, 16, 1
; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
; LA64-NEXT: addi.d $sp, $sp, 48
; LA64-NEXT: ret
entry:
  %y = getelementptr inbounds %struct.S, ptr %a, i64 %k, i32 2
  %cmp = icmp sgt i64 %n, 0
  br i1 %cmp, label %for.body, label %for.cond.cleanup

for.body:                                         ; preds = %entry, %for.body
  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
  %sum.0 = phi <16 x i8> [ zeroinitializer, %entry ], [ %addv, %for.body ]
  call void @f(ptr %a)
  %e = load i8, ptr %y
  %ins0 = insertelement <16 x i8> poison, i8 %e, i32 0
  %v = shufflevector <16 x i8> %ins0, <16 x i8> poison, <16 x i32> zeroinitializer
  %addv = add <16 x i8> %v, %sum.0
  %inc = add nuw nsw i64 %i.0, 1
  %exitcond = icmp eq i64 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  %sum.lcssa = phi <16 x i8> [ zeroinitializer, %entry ], [ %addv, %for.body ]
  %res = extractelement <16 x i8> %sum.lcssa, i32 1
  store i8 %res, ptr %y
  ret void
}
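
; Splat-load case for double: the double field of %struct.F (offset 8) is
; broadcast with xvldrepl.d into a <4 x double> accumulator, and element 1 of
; the sum is stored back with xvstelm.d at the same offset.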
define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-LABEL: sink_fold_extractf64:
; LA32: # %bb.0: # %entry
; LA32-NEXT: addi.w $sp, $sp, -80
; LA32-NEXT: st.w $ra, $sp, 76 # 4-byte Folded Spill
; LA32-NEXT: st.w $fp, $sp, 72 # 4-byte Folded Spill
; LA32-NEXT: st.w $s0, $sp, 68 # 4-byte Folded Spill
; LA32-NEXT: st.w $s1, $sp, 64 # 4-byte Folded Spill
; LA32-NEXT: st.w $s2, $sp, 60 # 4-byte Folded Spill
; LA32-NEXT: st.w $s3, $sp, 56 # 4-byte Folded Spill
; LA32-NEXT: st.w $s4, $sp, 52 # 4-byte Folded Spill
; LA32-NEXT: move $s0, $a3
; LA32-NEXT: move $s1, $a2
; LA32-NEXT: slli.w $a1, $a0, 4
; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
; LA32-NEXT: sltui $a1, $a3, 1
; LA32-NEXT: slti $a2, $a3, 0
; LA32-NEXT: masknez $a2, $a2, $a1
; LA32-NEXT: sltui $a3, $s1, 1
; LA32-NEXT: maskeqz $a1, $a3, $a1
; LA32-NEXT: or $a1, $a1, $a2
; LA32-NEXT: add.w $s2, $a4, $a0
; LA32-NEXT: bnez $a1, .LBB5_3
; LA32-NEXT: # %bb.1: # %for.body.preheader
; LA32-NEXT: move $fp, $a4
; LA32-NEXT: move $s3, $zero
; LA32-NEXT: move $s4, $zero
; LA32-NEXT: xvrepli.b $xr0, 0
; LA32-NEXT: .p2align 4, , 16
; LA32-NEXT: .LBB5_2: # %for.body
; LA32-NEXT: # =>This Inner Loop Header: Depth=1
; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
; LA32-NEXT: move $a0, $fp
; LA32-NEXT: bl f
; LA32-NEXT: xvldrepl.d $xr0, $s2, 8
; LA32-NEXT: addi.w $s3, $s3, 1
; LA32-NEXT: sltui $a0, $s3, 1
; LA32-NEXT: add.w $s4, $s4, $a0
; LA32-NEXT: xor $a0, $s3, $s1
; LA32-NEXT: xor $a1, $s4, $s0
; LA32-NEXT: or $a0, $a0, $a1
; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
; LA32-NEXT: xvfadd.d $xr1, $xr0, $xr1
; LA32-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill
; LA32-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
; LA32-NEXT: bnez $a0, .LBB5_2
; LA32-NEXT: b .LBB5_4
; LA32-NEXT: .LBB5_3:
; LA32-NEXT: xvrepli.b $xr0, 0
; LA32-NEXT: .LBB5_4: # %for.cond.cleanup
; LA32-NEXT: xvstelm.d $xr0, $s2, 8, 1
; LA32-NEXT: ld.w $s4, $sp, 52 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s3, $sp, 56 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s2, $sp, 60 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s1, $sp, 64 # 4-byte Folded Reload
; LA32-NEXT: ld.w $s0, $sp, 68 # 4-byte Folded Reload
; LA32-NEXT: ld.w $fp, $sp, 72 # 4-byte Folded Reload
; LA32-NEXT: ld.w $ra, $sp, 76 # 4-byte Folded Reload
; LA32-NEXT: addi.w $sp, $sp, 80
; LA32-NEXT: ret
;
; LA64-LABEL: sink_fold_extractf64:
; LA64: # %bb.0: # %entry
; LA64-NEXT: addi.d $sp, $sp, -80
; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill
; LA64-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill
; LA64-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill
; LA64-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill
; LA64-NEXT: move $s0, $a1
; LA64-NEXT: slli.d $a1, $a0, 4
; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
; LA64-NEXT: add.d $s1, $a2, $a0
; LA64-NEXT: blez $s0, .LBB5_3
; LA64-NEXT: # %bb.1: # %for.body.preheader
; LA64-NEXT: move $fp, $a2
; LA64-NEXT: xvrepli.b $xr0, 0
; LA64-NEXT: .p2align 4, , 16
; LA64-NEXT: .LBB5_2: # %for.body
; LA64-NEXT: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
; LA64-NEXT: move $a0, $fp
; LA64-NEXT: pcaddu18i $ra, %call36(f)
; LA64-NEXT: jirl $ra, $ra, 0
; LA64-NEXT: xvldrepl.d $xr0, $s1, 8
; LA64-NEXT: addi.d $s0, $s0, -1
; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
; LA64-NEXT: xvfadd.d $xr1, $xr0, $xr1
; LA64-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill
; LA64-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
; LA64-NEXT: bnez $s0, .LBB5_2
; LA64-NEXT: b .LBB5_4
; LA64-NEXT: .LBB5_3:
; LA64-NEXT: xvrepli.b $xr0, 0
; LA64-NEXT: .LBB5_4: # %for.cond.cleanup
; LA64-NEXT: xvstelm.d $xr0, $s1, 8, 1
; LA64-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload
; LA64-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload
; LA64-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload
; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload
; LA64-NEXT: addi.d $sp, $sp, 80
; LA64-NEXT: ret
entry:
  %y = getelementptr inbounds %struct.F, ptr %a, i64 %k, i32 1
  %cmp = icmp sgt i64 %n, 0
  br i1 %cmp, label %for.body, label %for.cond.cleanup

for.body:                                         ; preds = %entry, %for.body
  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
  %sum.0 = phi <4 x double> [ zeroinitializer, %entry ], [ %addv, %for.body ]
  call void @f(ptr %a)
  %e = load double, ptr %y
  %ins0 = insertelement <4 x double> poison, double %e, i32 0
  %v = shufflevector <4 x double> %ins0, <4 x double> poison, <4 x i32> zeroinitializer
  %addv = fadd <4 x double> %v, %sum.0
  %inc = add nuw nsw i64 %i.0, 1
  %exitcond = icmp eq i64 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  %sum.lcssa = phi <4 x double> [ zeroinitializer, %entry ], [ %addv, %for.body ]
  %res = extractelement <4 x double> %sum.lcssa, i32 1
  store double %res, ptr %y
  ret void
}

declare void @f(ptr)