| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 |
| ; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx --verify-machineinstrs < %s \ |
| ; RUN: | FileCheck --check-prefix=LA32 %s |
| ; RUN: llc --mtriple=loongarch64 --mattr=+lasx --verify-machineinstrs < %s \ |
| ; RUN: | FileCheck --check-prefix=LA64 %s |
| |
| %struct.S = type { i64, i64, i8 } |
| %struct.F = type { float, double, float } |
| %struct.V = type { <4 x i32>, <4 x i32>, <16 x i16> } |
| |
; Sums the i64 field at offset 8 of %struct.S element %k (element stride 24:
; slli 4 + alsl 3 = 24*k in the checks) over %n iterations, calling @f(%a)
; before each load, then stores the sum back through the same pointer %y.
; NOTE(review): named "sink_fold" -- presumably verifies the field address
; computed in entry stays in one register across the loop while the constant
; offset 8 is folded into the ld/st; confirm intent with the originating patch.
define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-LABEL: sink_fold_i64:
; LA32:       # %bb.0: # %entry
; LA32-NEXT:    addi.w $sp, $sp, -48
; LA32-NEXT:    st.w $ra, $sp, 44 # 4-byte Folded Spill
; LA32-NEXT:    st.w $fp, $sp, 40 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s0, $sp, 36 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s1, $sp, 32 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s2, $sp, 28 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s3, $sp, 24 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s4, $sp, 20 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s5, $sp, 16 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s6, $sp, 12 # 4-byte Folded Spill
; LA32-NEXT:    move $s0, $a3
; LA32-NEXT:    move $s1, $a2
; LA32-NEXT:    slli.w $a1, $a0, 4
; LA32-NEXT:    alsl.w $a0, $a0, $a1, 3
; LA32-NEXT:    sltui $a1, $a3, 1
; LA32-NEXT:    slti $a2, $a3, 0
; LA32-NEXT:    masknez $a2, $a2, $a1
; LA32-NEXT:    sltui $a3, $s1, 1
; LA32-NEXT:    maskeqz $a1, $a3, $a1
; LA32-NEXT:    or $a1, $a1, $a2
; LA32-NEXT:    add.w $s2, $a4, $a0
; LA32-NEXT:    bnez $a1, .LBB0_3
; LA32-NEXT:  # %bb.1: # %for.body.preheader
; LA32-NEXT:    move $fp, $a4
; LA32-NEXT:    move $s4, $zero
; LA32-NEXT:    move $s5, $zero
; LA32-NEXT:    move $s3, $zero
; LA32-NEXT:    move $s6, $zero
; LA32-NEXT:    .p2align 4, , 16
; LA32-NEXT:  .LBB0_2: # %for.body
; LA32-NEXT:    # =>This Inner Loop Header: Depth=1
; LA32-NEXT:    move $a0, $fp
; LA32-NEXT:    bl f
; LA32-NEXT:    ld.w $a0, $s2, 12
; LA32-NEXT:    ld.w $a1, $s2, 8
; LA32-NEXT:    add.w $a0, $a0, $s6
; LA32-NEXT:    add.w $s3, $a1, $s3
; LA32-NEXT:    sltu $a1, $s3, $a1
; LA32-NEXT:    addi.w $s4, $s4, 1
; LA32-NEXT:    sltui $a2, $s4, 1
; LA32-NEXT:    add.w $s5, $s5, $a2
; LA32-NEXT:    xor $a2, $s4, $s1
; LA32-NEXT:    xor $a3, $s5, $s0
; LA32-NEXT:    or $a2, $a2, $a3
; LA32-NEXT:    add.w $s6, $a0, $a1
; LA32-NEXT:    bnez $a2, .LBB0_2
; LA32-NEXT:    b .LBB0_4
; LA32-NEXT:  .LBB0_3:
; LA32-NEXT:    move $s3, $zero
; LA32-NEXT:    move $s6, $zero
; LA32-NEXT:  .LBB0_4: # %for.cond.cleanup
; LA32-NEXT:    st.w $s3, $s2, 8
; LA32-NEXT:    st.w $s6, $s2, 12
; LA32-NEXT:    ld.w $s6, $sp, 12 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s5, $sp, 16 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s4, $sp, 20 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s3, $sp, 24 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s2, $sp, 28 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s1, $sp, 32 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s0, $sp, 36 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $fp, $sp, 40 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $ra, $sp, 44 # 4-byte Folded Reload
; LA32-NEXT:    addi.w $sp, $sp, 48
; LA32-NEXT:    ret
;
; LA64-LABEL: sink_fold_i64:
; LA64:       # %bb.0: # %entry
; LA64-NEXT:    addi.d $sp, $sp, -48
; LA64-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
; LA64-NEXT:    st.d $fp, $sp, 32 # 8-byte Folded Spill
; LA64-NEXT:    st.d $s0, $sp, 24 # 8-byte Folded Spill
; LA64-NEXT:    st.d $s1, $sp, 16 # 8-byte Folded Spill
; LA64-NEXT:    st.d $s2, $sp, 8 # 8-byte Folded Spill
; LA64-NEXT:    move $s0, $a1
; LA64-NEXT:    slli.d $a1, $a0, 4
; LA64-NEXT:    alsl.d $a0, $a0, $a1, 3
; LA64-NEXT:    add.d $s1, $a2, $a0
; LA64-NEXT:    blez $s0, .LBB0_3
; LA64-NEXT:  # %bb.1: # %for.body.preheader
; LA64-NEXT:    move $fp, $a2
; LA64-NEXT:    move $s2, $zero
; LA64-NEXT:    .p2align 4, , 16
; LA64-NEXT:  .LBB0_2: # %for.body
; LA64-NEXT:    # =>This Inner Loop Header: Depth=1
; LA64-NEXT:    move $a0, $fp
; LA64-NEXT:    pcaddu18i $ra, %call36(f)
; LA64-NEXT:    jirl $ra, $ra, 0
; LA64-NEXT:    ld.d $a0, $s1, 8
; LA64-NEXT:    addi.d $s0, $s0, -1
; LA64-NEXT:    add.d $s2, $a0, $s2
; LA64-NEXT:    bnez $s0, .LBB0_2
; LA64-NEXT:    b .LBB0_4
; LA64-NEXT:  .LBB0_3:
; LA64-NEXT:    move $s2, $zero
; LA64-NEXT:  .LBB0_4: # %for.cond.cleanup
; LA64-NEXT:    st.d $s2, $s1, 8
; LA64-NEXT:    ld.d $s2, $sp, 8 # 8-byte Folded Reload
; LA64-NEXT:    ld.d $s1, $sp, 16 # 8-byte Folded Reload
; LA64-NEXT:    ld.d $s0, $sp, 24 # 8-byte Folded Reload
; LA64-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
; LA64-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
; LA64-NEXT:    addi.d $sp, $sp, 48
; LA64-NEXT:    ret
entry:
  ; %y = &a[k].field1 (i64 at byte offset 8; sizeof(%struct.S) == 24)
  %y = getelementptr inbounds %struct.S, ptr %a, i64 %k, i32 1
  %cmp4 = icmp sgt i64 %n, 0
  br i1 %cmp4, label %for.body, label %for.cond.cleanup

for.body:                                         ; preds = %entry, %for.body
  %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
  %s.05 = phi i64 [ 0, %entry ], [ %add, %for.body ]
  ; The call forces the load to be re-done every iteration.
  call void @f(ptr %a)
  %0 = load i64, ptr %y
  %add = add nsw i64 %0, %s.05
  %inc = add nuw nsw i64 %i.06, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  %s.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
  store i64 %s.0.lcssa, ptr %y
  ret void
}
| |
; Same pattern as sink_fold_i64 but accumulating a float: the float field at
; offset 16 of %struct.F element %k (stride 24) is loaded each iteration after
; the @f call, summed with fadd, and the sum is stored back to the same field.
; The accumulator lives in callee-saved $fs0 across the calls.
define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-LABEL: sink_fold_f32:
; LA32:       # %bb.0: # %entry
; LA32-NEXT:    addi.w $sp, $sp, -48
; LA32-NEXT:    st.w $ra, $sp, 44 # 4-byte Folded Spill
; LA32-NEXT:    st.w $fp, $sp, 40 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s0, $sp, 36 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s1, $sp, 32 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s2, $sp, 28 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s3, $sp, 24 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s4, $sp, 20 # 4-byte Folded Spill
; LA32-NEXT:    fst.d $fs0, $sp, 8 # 8-byte Folded Spill
; LA32-NEXT:    move $s0, $a3
; LA32-NEXT:    move $s1, $a2
; LA32-NEXT:    slli.w $a1, $a0, 4
; LA32-NEXT:    alsl.w $a0, $a0, $a1, 3
; LA32-NEXT:    sltui $a1, $a3, 1
; LA32-NEXT:    slti $a2, $a3, 0
; LA32-NEXT:    masknez $a2, $a2, $a1
; LA32-NEXT:    sltui $a3, $s1, 1
; LA32-NEXT:    maskeqz $a1, $a3, $a1
; LA32-NEXT:    or $a1, $a1, $a2
; LA32-NEXT:    add.w $s2, $a4, $a0
; LA32-NEXT:    bnez $a1, .LBB1_3
; LA32-NEXT:  # %bb.1: # %for.body.preheader
; LA32-NEXT:    move $fp, $a4
; LA32-NEXT:    move $s3, $zero
; LA32-NEXT:    move $s4, $zero
; LA32-NEXT:    movgr2fr.w $fs0, $zero
; LA32-NEXT:    .p2align 4, , 16
; LA32-NEXT:  .LBB1_2: # %for.body
; LA32-NEXT:    # =>This Inner Loop Header: Depth=1
; LA32-NEXT:    move $a0, $fp
; LA32-NEXT:    bl f
; LA32-NEXT:    fld.s $fa0, $s2, 16
; LA32-NEXT:    addi.w $s3, $s3, 1
; LA32-NEXT:    sltui $a0, $s3, 1
; LA32-NEXT:    add.w $s4, $s4, $a0
; LA32-NEXT:    xor $a0, $s3, $s1
; LA32-NEXT:    xor $a1, $s4, $s0
; LA32-NEXT:    or $a0, $a0, $a1
; LA32-NEXT:    fadd.s $fs0, $fa0, $fs0
; LA32-NEXT:    bnez $a0, .LBB1_2
; LA32-NEXT:    b .LBB1_4
; LA32-NEXT:  .LBB1_3:
; LA32-NEXT:    movgr2fr.w $fs0, $zero
; LA32-NEXT:  .LBB1_4: # %for.cond.cleanup
; LA32-NEXT:    fst.s $fs0, $s2, 16
; LA32-NEXT:    fld.d $fs0, $sp, 8 # 8-byte Folded Reload
; LA32-NEXT:    ld.w $s4, $sp, 20 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s3, $sp, 24 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s2, $sp, 28 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s1, $sp, 32 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s0, $sp, 36 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $fp, $sp, 40 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $ra, $sp, 44 # 4-byte Folded Reload
; LA32-NEXT:    addi.w $sp, $sp, 48
; LA32-NEXT:    ret
;
; LA64-LABEL: sink_fold_f32:
; LA64:       # %bb.0: # %entry
; LA64-NEXT:    addi.d $sp, $sp, -48
; LA64-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
; LA64-NEXT:    st.d $fp, $sp, 32 # 8-byte Folded Spill
; LA64-NEXT:    st.d $s0, $sp, 24 # 8-byte Folded Spill
; LA64-NEXT:    st.d $s1, $sp, 16 # 8-byte Folded Spill
; LA64-NEXT:    fst.d $fs0, $sp, 8 # 8-byte Folded Spill
; LA64-NEXT:    move $s0, $a1
; LA64-NEXT:    slli.d $a1, $a0, 4
; LA64-NEXT:    alsl.d $a0, $a0, $a1, 3
; LA64-NEXT:    add.d $s1, $a2, $a0
; LA64-NEXT:    blez $s0, .LBB1_3
; LA64-NEXT:  # %bb.1: # %for.body.preheader
; LA64-NEXT:    move $fp, $a2
; LA64-NEXT:    movgr2fr.w $fs0, $zero
; LA64-NEXT:    .p2align 4, , 16
; LA64-NEXT:  .LBB1_2: # %for.body
; LA64-NEXT:    # =>This Inner Loop Header: Depth=1
; LA64-NEXT:    move $a0, $fp
; LA64-NEXT:    pcaddu18i $ra, %call36(f)
; LA64-NEXT:    jirl $ra, $ra, 0
; LA64-NEXT:    fld.s $fa0, $s1, 16
; LA64-NEXT:    addi.d $s0, $s0, -1
; LA64-NEXT:    fadd.s $fs0, $fa0, $fs0
; LA64-NEXT:    bnez $s0, .LBB1_2
; LA64-NEXT:    b .LBB1_4
; LA64-NEXT:  .LBB1_3:
; LA64-NEXT:    movgr2fr.w $fs0, $zero
; LA64-NEXT:  .LBB1_4: # %for.cond.cleanup
; LA64-NEXT:    fst.s $fs0, $s1, 16
; LA64-NEXT:    fld.d $fs0, $sp, 8 # 8-byte Folded Reload
; LA64-NEXT:    ld.d $s1, $sp, 16 # 8-byte Folded Reload
; LA64-NEXT:    ld.d $s0, $sp, 24 # 8-byte Folded Reload
; LA64-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
; LA64-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
; LA64-NEXT:    addi.d $sp, $sp, 48
; LA64-NEXT:    ret
entry:
  ; %y = &a[k].field2 (float at byte offset 16; sizeof(%struct.F) == 24)
  %y = getelementptr inbounds %struct.F, ptr %a, i64 %k, i32 2
  %cmp4 = icmp sgt i64 %n, 0
  br i1 %cmp4, label %for.body, label %for.cond.cleanup

for.body:                                         ; preds = %entry, %for.body
  %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
  %s.05 = phi float [ 0.0, %entry ], [ %add, %for.body ]
  ; The call forces the load to be re-done every iteration.
  call void @f(ptr %a)
  %0 = load float, ptr %y
  %add = fadd float %0, %s.05
  %inc = add nuw nsw i64 %i.06, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  %s.0.lcssa = phi float [ 0.0, %entry ], [ %add, %for.body ]
  store float %s.0.lcssa, ptr %y
  ret void
}
| |
; LSX (128-bit) variant: accumulates the <4 x i32> field at offset 16 of
; %struct.V element %k (stride 64: slli 6 in the checks). Vector registers are
; caller-saved, so the accumulator in $vr0 is spilled/reloaded around the @f
; call each iteration; the sum is stored back to the same field at the end.
define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-LABEL: sink_fold_v4i32:
; LA32:       # %bb.0: # %entry
; LA32-NEXT:    addi.w $sp, $sp, -48
; LA32-NEXT:    st.w $ra, $sp, 44 # 4-byte Folded Spill
; LA32-NEXT:    st.w $fp, $sp, 40 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s0, $sp, 36 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s1, $sp, 32 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s2, $sp, 28 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s3, $sp, 24 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s4, $sp, 20 # 4-byte Folded Spill
; LA32-NEXT:    move $s0, $a3
; LA32-NEXT:    move $s1, $a2
; LA32-NEXT:    slli.w $a0, $a0, 6
; LA32-NEXT:    sltui $a1, $a3, 1
; LA32-NEXT:    slti $a2, $a3, 0
; LA32-NEXT:    masknez $a2, $a2, $a1
; LA32-NEXT:    sltui $a3, $s1, 1
; LA32-NEXT:    maskeqz $a1, $a3, $a1
; LA32-NEXT:    or $a1, $a1, $a2
; LA32-NEXT:    add.w $s2, $a4, $a0
; LA32-NEXT:    bnez $a1, .LBB2_3
; LA32-NEXT:  # %bb.1: # %for.body.preheader
; LA32-NEXT:    move $fp, $a4
; LA32-NEXT:    move $s3, $zero
; LA32-NEXT:    move $s4, $zero
; LA32-NEXT:    vrepli.b $vr0, 0
; LA32-NEXT:    .p2align 4, , 16
; LA32-NEXT:  .LBB2_2: # %for.body
; LA32-NEXT:    # =>This Inner Loop Header: Depth=1
; LA32-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
; LA32-NEXT:    move $a0, $fp
; LA32-NEXT:    bl f
; LA32-NEXT:    vld $vr0, $s2, 16
; LA32-NEXT:    addi.w $s3, $s3, 1
; LA32-NEXT:    sltui $a0, $s3, 1
; LA32-NEXT:    add.w $s4, $s4, $a0
; LA32-NEXT:    xor $a0, $s3, $s1
; LA32-NEXT:    xor $a1, $s4, $s0
; LA32-NEXT:    or $a0, $a0, $a1
; LA32-NEXT:    vld $vr1, $sp, 0 # 16-byte Folded Reload
; LA32-NEXT:    vadd.w $vr1, $vr0, $vr1
; LA32-NEXT:    vst $vr1, $sp, 0 # 16-byte Folded Spill
; LA32-NEXT:    vld $vr0, $sp, 0 # 16-byte Folded Reload
; LA32-NEXT:    bnez $a0, .LBB2_2
; LA32-NEXT:    b .LBB2_4
; LA32-NEXT:  .LBB2_3:
; LA32-NEXT:    vrepli.b $vr0, 0
; LA32-NEXT:  .LBB2_4: # %for.cond.cleanup
; LA32-NEXT:    vst $vr0, $s2, 16
; LA32-NEXT:    ld.w $s4, $sp, 20 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s3, $sp, 24 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s2, $sp, 28 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s1, $sp, 32 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s0, $sp, 36 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $fp, $sp, 40 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $ra, $sp, 44 # 4-byte Folded Reload
; LA32-NEXT:    addi.w $sp, $sp, 48
; LA32-NEXT:    ret
;
; LA64-LABEL: sink_fold_v4i32:
; LA64:       # %bb.0: # %entry
; LA64-NEXT:    addi.d $sp, $sp, -48
; LA64-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
; LA64-NEXT:    st.d $fp, $sp, 32 # 8-byte Folded Spill
; LA64-NEXT:    st.d $s0, $sp, 24 # 8-byte Folded Spill
; LA64-NEXT:    st.d $s1, $sp, 16 # 8-byte Folded Spill
; LA64-NEXT:    slli.d $a0, $a0, 6
; LA64-NEXT:    add.d $s1, $a2, $a0
; LA64-NEXT:    blez $a1, .LBB2_3
; LA64-NEXT:  # %bb.1: # %for.body.preheader
; LA64-NEXT:    move $fp, $a2
; LA64-NEXT:    move $s0, $a1
; LA64-NEXT:    vrepli.b $vr0, 0
; LA64-NEXT:    .p2align 4, , 16
; LA64-NEXT:  .LBB2_2: # %for.body
; LA64-NEXT:    # =>This Inner Loop Header: Depth=1
; LA64-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
; LA64-NEXT:    move $a0, $fp
; LA64-NEXT:    pcaddu18i $ra, %call36(f)
; LA64-NEXT:    jirl $ra, $ra, 0
; LA64-NEXT:    vld $vr0, $s1, 16
; LA64-NEXT:    addi.d $s0, $s0, -1
; LA64-NEXT:    vld $vr1, $sp, 0 # 16-byte Folded Reload
; LA64-NEXT:    vadd.w $vr1, $vr0, $vr1
; LA64-NEXT:    vst $vr1, $sp, 0 # 16-byte Folded Spill
; LA64-NEXT:    vld $vr0, $sp, 0 # 16-byte Folded Reload
; LA64-NEXT:    bnez $s0, .LBB2_2
; LA64-NEXT:    b .LBB2_4
; LA64-NEXT:  .LBB2_3:
; LA64-NEXT:    vrepli.b $vr0, 0
; LA64-NEXT:  .LBB2_4: # %for.cond.cleanup
; LA64-NEXT:    vst $vr0, $s1, 16
; LA64-NEXT:    ld.d $s1, $sp, 16 # 8-byte Folded Reload
; LA64-NEXT:    ld.d $s0, $sp, 24 # 8-byte Folded Reload
; LA64-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
; LA64-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
; LA64-NEXT:    addi.d $sp, $sp, 48
; LA64-NEXT:    ret
entry:
  ; %y = &a[k].field1 (<4 x i32> at byte offset 16; sizeof(%struct.V) == 64)
  %y = getelementptr inbounds %struct.V, ptr %a, i64 %k, i32 1
  %cmp = icmp sgt i64 %n, 0
  br i1 %cmp, label %for.body, label %for.cond.cleanup

for.body:                                         ; preds = %entry, %for.body
  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
  %sum.0 = phi <4 x i32> [ zeroinitializer, %entry ], [ %addv, %for.body ]
  ; The call forces the load to be re-done every iteration.
  call void @f(ptr %a)
  %v = load <4 x i32>, ptr %y
  %addv = add <4 x i32> %v, %sum.0
  %inc = add nuw nsw i64 %i.0, 1
  %exitcond = icmp eq i64 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  %sum.lcssa = phi <4 x i32> [ zeroinitializer, %entry ], [ %addv, %for.body ]
  store <4 x i32> %sum.lcssa, ptr %y
  ret void
}
| |
; LASX (256-bit) variant: accumulates the <16 x i16> field at offset 32 of
; %struct.V element %k (stride 64). As in sink_fold_v4i32, the vector
; accumulator in $xr0 is spilled/reloaded around the @f call each iteration,
; and the final sum is stored back to the same field.
define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-LABEL: sink_fold_v16i16:
; LA32:       # %bb.0: # %entry
; LA32-NEXT:    addi.w $sp, $sp, -80
; LA32-NEXT:    st.w $ra, $sp, 76 # 4-byte Folded Spill
; LA32-NEXT:    st.w $fp, $sp, 72 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s0, $sp, 68 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s1, $sp, 64 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s2, $sp, 60 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s3, $sp, 56 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s4, $sp, 52 # 4-byte Folded Spill
; LA32-NEXT:    move $s0, $a3
; LA32-NEXT:    move $s1, $a2
; LA32-NEXT:    slli.w $a0, $a0, 6
; LA32-NEXT:    sltui $a1, $a3, 1
; LA32-NEXT:    slti $a2, $a3, 0
; LA32-NEXT:    masknez $a2, $a2, $a1
; LA32-NEXT:    sltui $a3, $s1, 1
; LA32-NEXT:    maskeqz $a1, $a3, $a1
; LA32-NEXT:    or $a1, $a1, $a2
; LA32-NEXT:    add.w $s2, $a4, $a0
; LA32-NEXT:    bnez $a1, .LBB3_3
; LA32-NEXT:  # %bb.1: # %for.body.preheader
; LA32-NEXT:    move $fp, $a4
; LA32-NEXT:    move $s3, $zero
; LA32-NEXT:    move $s4, $zero
; LA32-NEXT:    xvrepli.b $xr0, 0
; LA32-NEXT:    .p2align 4, , 16
; LA32-NEXT:  .LBB3_2: # %for.body
; LA32-NEXT:    # =>This Inner Loop Header: Depth=1
; LA32-NEXT:    xvst $xr0, $sp, 16 # 32-byte Folded Spill
; LA32-NEXT:    move $a0, $fp
; LA32-NEXT:    bl f
; LA32-NEXT:    xvld $xr0, $s2, 32
; LA32-NEXT:    addi.w $s3, $s3, 1
; LA32-NEXT:    sltui $a0, $s3, 1
; LA32-NEXT:    add.w $s4, $s4, $a0
; LA32-NEXT:    xor $a0, $s3, $s1
; LA32-NEXT:    xor $a1, $s4, $s0
; LA32-NEXT:    or $a0, $a0, $a1
; LA32-NEXT:    xvld $xr1, $sp, 16 # 32-byte Folded Reload
; LA32-NEXT:    xvadd.h $xr1, $xr0, $xr1
; LA32-NEXT:    xvst $xr1, $sp, 16 # 32-byte Folded Spill
; LA32-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
; LA32-NEXT:    bnez $a0, .LBB3_2
; LA32-NEXT:    b .LBB3_4
; LA32-NEXT:  .LBB3_3:
; LA32-NEXT:    xvrepli.b $xr0, 0
; LA32-NEXT:  .LBB3_4: # %for.cond.cleanup
; LA32-NEXT:    xvst $xr0, $s2, 32
; LA32-NEXT:    ld.w $s4, $sp, 52 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s3, $sp, 56 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s2, $sp, 60 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s1, $sp, 64 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s0, $sp, 68 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $fp, $sp, 72 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $ra, $sp, 76 # 4-byte Folded Reload
; LA32-NEXT:    addi.w $sp, $sp, 80
; LA32-NEXT:    ret
;
; LA64-LABEL: sink_fold_v16i16:
; LA64:       # %bb.0: # %entry
; LA64-NEXT:    addi.d $sp, $sp, -80
; LA64-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
; LA64-NEXT:    st.d $fp, $sp, 64 # 8-byte Folded Spill
; LA64-NEXT:    st.d $s0, $sp, 56 # 8-byte Folded Spill
; LA64-NEXT:    st.d $s1, $sp, 48 # 8-byte Folded Spill
; LA64-NEXT:    slli.d $a0, $a0, 6
; LA64-NEXT:    add.d $s1, $a2, $a0
; LA64-NEXT:    blez $a1, .LBB3_3
; LA64-NEXT:  # %bb.1: # %for.body.preheader
; LA64-NEXT:    move $fp, $a2
; LA64-NEXT:    move $s0, $a1
; LA64-NEXT:    xvrepli.b $xr0, 0
; LA64-NEXT:    .p2align 4, , 16
; LA64-NEXT:  .LBB3_2: # %for.body
; LA64-NEXT:    # =>This Inner Loop Header: Depth=1
; LA64-NEXT:    xvst $xr0, $sp, 16 # 32-byte Folded Spill
; LA64-NEXT:    move $a0, $fp
; LA64-NEXT:    pcaddu18i $ra, %call36(f)
; LA64-NEXT:    jirl $ra, $ra, 0
; LA64-NEXT:    xvld $xr0, $s1, 32
; LA64-NEXT:    addi.d $s0, $s0, -1
; LA64-NEXT:    xvld $xr1, $sp, 16 # 32-byte Folded Reload
; LA64-NEXT:    xvadd.h $xr1, $xr0, $xr1
; LA64-NEXT:    xvst $xr1, $sp, 16 # 32-byte Folded Spill
; LA64-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
; LA64-NEXT:    bnez $s0, .LBB3_2
; LA64-NEXT:    b .LBB3_4
; LA64-NEXT:  .LBB3_3:
; LA64-NEXT:    xvrepli.b $xr0, 0
; LA64-NEXT:  .LBB3_4: # %for.cond.cleanup
; LA64-NEXT:    xvst $xr0, $s1, 32
; LA64-NEXT:    ld.d $s1, $sp, 48 # 8-byte Folded Reload
; LA64-NEXT:    ld.d $s0, $sp, 56 # 8-byte Folded Reload
; LA64-NEXT:    ld.d $fp, $sp, 64 # 8-byte Folded Reload
; LA64-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
; LA64-NEXT:    addi.d $sp, $sp, 80
; LA64-NEXT:    ret
entry:
  ; %y = &a[k].field2 (<16 x i16> at byte offset 32; sizeof(%struct.V) == 64)
  %y = getelementptr inbounds %struct.V, ptr %a, i64 %k, i32 2
  %cmp = icmp sgt i64 %n, 0
  br i1 %cmp, label %for.body, label %for.cond.cleanup

for.body:                                         ; preds = %entry, %for.body
  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
  %sum.0 = phi <16 x i16> [ zeroinitializer, %entry ], [ %addv, %for.body ]
  ; The call forces the load to be re-done every iteration.
  call void @f(ptr %a)
  %v = load <16 x i16>, ptr %y
  %addv = add <16 x i16> %v, %sum.0
  %inc = add nuw nsw i64 %i.0, 1
  %exitcond = icmp eq i64 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  %sum.lcssa = phi <16 x i16> [ zeroinitializer, %entry ], [ %addv, %for.body ]
  store <16 x i16> %sum.lcssa, ptr %y
  ret void
}
| |
; Load-splat / store-element variant: each iteration broadcasts the i8 field
; at offset 16 of %struct.S element %k into a <16 x i8> (selected as
; vldrepl.b) and accumulates; afterwards lane 1 of the vector sum is stored
; back to the same byte (selected as vstelm.b with lane index 1).
define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-LABEL: sink_fold_extracti8:
; LA32:       # %bb.0: # %entry
; LA32-NEXT:    addi.w $sp, $sp, -48
; LA32-NEXT:    st.w $ra, $sp, 44 # 4-byte Folded Spill
; LA32-NEXT:    st.w $fp, $sp, 40 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s0, $sp, 36 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s1, $sp, 32 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s2, $sp, 28 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s3, $sp, 24 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s4, $sp, 20 # 4-byte Folded Spill
; LA32-NEXT:    move $s0, $a3
; LA32-NEXT:    move $s1, $a2
; LA32-NEXT:    slli.w $a1, $a0, 4
; LA32-NEXT:    alsl.w $a0, $a0, $a1, 3
; LA32-NEXT:    sltui $a1, $a3, 1
; LA32-NEXT:    slti $a2, $a3, 0
; LA32-NEXT:    masknez $a2, $a2, $a1
; LA32-NEXT:    sltui $a3, $s1, 1
; LA32-NEXT:    maskeqz $a1, $a3, $a1
; LA32-NEXT:    or $a1, $a1, $a2
; LA32-NEXT:    add.w $s2, $a4, $a0
; LA32-NEXT:    bnez $a1, .LBB4_3
; LA32-NEXT:  # %bb.1: # %for.body.preheader
; LA32-NEXT:    move $fp, $a4
; LA32-NEXT:    move $s3, $zero
; LA32-NEXT:    move $s4, $zero
; LA32-NEXT:    vrepli.b $vr0, 0
; LA32-NEXT:    .p2align 4, , 16
; LA32-NEXT:  .LBB4_2: # %for.body
; LA32-NEXT:    # =>This Inner Loop Header: Depth=1
; LA32-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
; LA32-NEXT:    move $a0, $fp
; LA32-NEXT:    bl f
; LA32-NEXT:    vldrepl.b $vr0, $s2, 16
; LA32-NEXT:    addi.w $s3, $s3, 1
; LA32-NEXT:    sltui $a0, $s3, 1
; LA32-NEXT:    add.w $s4, $s4, $a0
; LA32-NEXT:    xor $a0, $s3, $s1
; LA32-NEXT:    xor $a1, $s4, $s0
; LA32-NEXT:    or $a0, $a0, $a1
; LA32-NEXT:    vld $vr1, $sp, 0 # 16-byte Folded Reload
; LA32-NEXT:    vadd.b $vr1, $vr0, $vr1
; LA32-NEXT:    vst $vr1, $sp, 0 # 16-byte Folded Spill
; LA32-NEXT:    vld $vr0, $sp, 0 # 16-byte Folded Reload
; LA32-NEXT:    bnez $a0, .LBB4_2
; LA32-NEXT:    b .LBB4_4
; LA32-NEXT:  .LBB4_3:
; LA32-NEXT:    vrepli.b $vr0, 0
; LA32-NEXT:  .LBB4_4: # %for.cond.cleanup
; LA32-NEXT:    vstelm.b $vr0, $s2, 16, 1
; LA32-NEXT:    ld.w $s4, $sp, 20 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s3, $sp, 24 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s2, $sp, 28 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s1, $sp, 32 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s0, $sp, 36 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $fp, $sp, 40 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $ra, $sp, 44 # 4-byte Folded Reload
; LA32-NEXT:    addi.w $sp, $sp, 48
; LA32-NEXT:    ret
;
; LA64-LABEL: sink_fold_extracti8:
; LA64:       # %bb.0: # %entry
; LA64-NEXT:    addi.d $sp, $sp, -48
; LA64-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
; LA64-NEXT:    st.d $fp, $sp, 32 # 8-byte Folded Spill
; LA64-NEXT:    st.d $s0, $sp, 24 # 8-byte Folded Spill
; LA64-NEXT:    st.d $s1, $sp, 16 # 8-byte Folded Spill
; LA64-NEXT:    move $s0, $a1
; LA64-NEXT:    slli.d $a1, $a0, 4
; LA64-NEXT:    alsl.d $a0, $a0, $a1, 3
; LA64-NEXT:    add.d $s1, $a2, $a0
; LA64-NEXT:    blez $s0, .LBB4_3
; LA64-NEXT:  # %bb.1: # %for.body.preheader
; LA64-NEXT:    move $fp, $a2
; LA64-NEXT:    vrepli.b $vr0, 0
; LA64-NEXT:    .p2align 4, , 16
; LA64-NEXT:  .LBB4_2: # %for.body
; LA64-NEXT:    # =>This Inner Loop Header: Depth=1
; LA64-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
; LA64-NEXT:    move $a0, $fp
; LA64-NEXT:    pcaddu18i $ra, %call36(f)
; LA64-NEXT:    jirl $ra, $ra, 0
; LA64-NEXT:    vldrepl.b $vr0, $s1, 16
; LA64-NEXT:    addi.d $s0, $s0, -1
; LA64-NEXT:    vld $vr1, $sp, 0 # 16-byte Folded Reload
; LA64-NEXT:    vadd.b $vr1, $vr0, $vr1
; LA64-NEXT:    vst $vr1, $sp, 0 # 16-byte Folded Spill
; LA64-NEXT:    vld $vr0, $sp, 0 # 16-byte Folded Reload
; LA64-NEXT:    bnez $s0, .LBB4_2
; LA64-NEXT:    b .LBB4_4
; LA64-NEXT:  .LBB4_3:
; LA64-NEXT:    vrepli.b $vr0, 0
; LA64-NEXT:  .LBB4_4: # %for.cond.cleanup
; LA64-NEXT:    vstelm.b $vr0, $s1, 16, 1
; LA64-NEXT:    ld.d $s1, $sp, 16 # 8-byte Folded Reload
; LA64-NEXT:    ld.d $s0, $sp, 24 # 8-byte Folded Reload
; LA64-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
; LA64-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
; LA64-NEXT:    addi.d $sp, $sp, 48
; LA64-NEXT:    ret
entry:
  ; %y = &a[k].field2 (i8 at byte offset 16; sizeof(%struct.S) == 24)
  %y = getelementptr inbounds %struct.S, ptr %a, i64 %k, i32 2
  %cmp = icmp sgt i64 %n, 0
  br i1 %cmp, label %for.body, label %for.cond.cleanup

for.body:                                         ; preds = %entry, %for.body
  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
  %sum.0 = phi <16 x i8> [ zeroinitializer, %entry ], [ %addv, %for.body ]
  ; The call forces the load to be re-done every iteration.
  call void @f(ptr %a)
  %e = load i8, ptr %y
  ; insert + splat-shuffle: broadcast the loaded byte to all 16 lanes.
  %ins0 = insertelement <16 x i8> poison, i8 %e, i32 0
  %v = shufflevector <16 x i8> %ins0, <16 x i8> poison, <16 x i32> zeroinitializer
  %addv = add <16 x i8> %v, %sum.0
  %inc = add nuw nsw i64 %i.0, 1
  %exitcond = icmp eq i64 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  %sum.lcssa = phi <16 x i8> [ zeroinitializer, %entry ], [ %addv, %for.body ]
  ; Store a single extracted lane (index 1) rather than the whole vector.
  %res = extractelement <16 x i8> %sum.lcssa, i32 1
  store i8 %res, ptr %y
  ret void
}
| |
; 256-bit FP load-splat / store-element variant: each iteration broadcasts the
; double field at offset 8 of %struct.F element %k into a <4 x double>
; (selected as xvldrepl.d) and accumulates with xvfadd.d; afterwards lane 1 of
; the vector sum is stored back to the same field (xvstelm.d, lane index 1).
define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind {
; LA32-LABEL: sink_fold_extractf64:
; LA32:       # %bb.0: # %entry
; LA32-NEXT:    addi.w $sp, $sp, -80
; LA32-NEXT:    st.w $ra, $sp, 76 # 4-byte Folded Spill
; LA32-NEXT:    st.w $fp, $sp, 72 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s0, $sp, 68 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s1, $sp, 64 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s2, $sp, 60 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s3, $sp, 56 # 4-byte Folded Spill
; LA32-NEXT:    st.w $s4, $sp, 52 # 4-byte Folded Spill
; LA32-NEXT:    move $s0, $a3
; LA32-NEXT:    move $s1, $a2
; LA32-NEXT:    slli.w $a1, $a0, 4
; LA32-NEXT:    alsl.w $a0, $a0, $a1, 3
; LA32-NEXT:    sltui $a1, $a3, 1
; LA32-NEXT:    slti $a2, $a3, 0
; LA32-NEXT:    masknez $a2, $a2, $a1
; LA32-NEXT:    sltui $a3, $s1, 1
; LA32-NEXT:    maskeqz $a1, $a3, $a1
; LA32-NEXT:    or $a1, $a1, $a2
; LA32-NEXT:    add.w $s2, $a4, $a0
; LA32-NEXT:    bnez $a1, .LBB5_3
; LA32-NEXT:  # %bb.1: # %for.body.preheader
; LA32-NEXT:    move $fp, $a4
; LA32-NEXT:    move $s3, $zero
; LA32-NEXT:    move $s4, $zero
; LA32-NEXT:    xvrepli.b $xr0, 0
; LA32-NEXT:    .p2align 4, , 16
; LA32-NEXT:  .LBB5_2: # %for.body
; LA32-NEXT:    # =>This Inner Loop Header: Depth=1
; LA32-NEXT:    xvst $xr0, $sp, 16 # 32-byte Folded Spill
; LA32-NEXT:    move $a0, $fp
; LA32-NEXT:    bl f
; LA32-NEXT:    xvldrepl.d $xr0, $s2, 8
; LA32-NEXT:    addi.w $s3, $s3, 1
; LA32-NEXT:    sltui $a0, $s3, 1
; LA32-NEXT:    add.w $s4, $s4, $a0
; LA32-NEXT:    xor $a0, $s3, $s1
; LA32-NEXT:    xor $a1, $s4, $s0
; LA32-NEXT:    or $a0, $a0, $a1
; LA32-NEXT:    xvld $xr1, $sp, 16 # 32-byte Folded Reload
; LA32-NEXT:    xvfadd.d $xr1, $xr0, $xr1
; LA32-NEXT:    xvst $xr1, $sp, 16 # 32-byte Folded Spill
; LA32-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
; LA32-NEXT:    bnez $a0, .LBB5_2
; LA32-NEXT:    b .LBB5_4
; LA32-NEXT:  .LBB5_3:
; LA32-NEXT:    xvrepli.b $xr0, 0
; LA32-NEXT:  .LBB5_4: # %for.cond.cleanup
; LA32-NEXT:    xvstelm.d $xr0, $s2, 8, 1
; LA32-NEXT:    ld.w $s4, $sp, 52 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s3, $sp, 56 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s2, $sp, 60 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s1, $sp, 64 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $s0, $sp, 68 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $fp, $sp, 72 # 4-byte Folded Reload
; LA32-NEXT:    ld.w $ra, $sp, 76 # 4-byte Folded Reload
; LA32-NEXT:    addi.w $sp, $sp, 80
; LA32-NEXT:    ret
;
; LA64-LABEL: sink_fold_extractf64:
; LA64:       # %bb.0: # %entry
; LA64-NEXT:    addi.d $sp, $sp, -80
; LA64-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
; LA64-NEXT:    st.d $fp, $sp, 64 # 8-byte Folded Spill
; LA64-NEXT:    st.d $s0, $sp, 56 # 8-byte Folded Spill
; LA64-NEXT:    st.d $s1, $sp, 48 # 8-byte Folded Spill
; LA64-NEXT:    move $s0, $a1
; LA64-NEXT:    slli.d $a1, $a0, 4
; LA64-NEXT:    alsl.d $a0, $a0, $a1, 3
; LA64-NEXT:    add.d $s1, $a2, $a0
; LA64-NEXT:    blez $s0, .LBB5_3
; LA64-NEXT:  # %bb.1: # %for.body.preheader
; LA64-NEXT:    move $fp, $a2
; LA64-NEXT:    xvrepli.b $xr0, 0
; LA64-NEXT:    .p2align 4, , 16
; LA64-NEXT:  .LBB5_2: # %for.body
; LA64-NEXT:    # =>This Inner Loop Header: Depth=1
; LA64-NEXT:    xvst $xr0, $sp, 16 # 32-byte Folded Spill
; LA64-NEXT:    move $a0, $fp
; LA64-NEXT:    pcaddu18i $ra, %call36(f)
; LA64-NEXT:    jirl $ra, $ra, 0
; LA64-NEXT:    xvldrepl.d $xr0, $s1, 8
; LA64-NEXT:    addi.d $s0, $s0, -1
; LA64-NEXT:    xvld $xr1, $sp, 16 # 32-byte Folded Reload
; LA64-NEXT:    xvfadd.d $xr1, $xr0, $xr1
; LA64-NEXT:    xvst $xr1, $sp, 16 # 32-byte Folded Spill
; LA64-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
; LA64-NEXT:    bnez $s0, .LBB5_2
; LA64-NEXT:    b .LBB5_4
; LA64-NEXT:  .LBB5_3:
; LA64-NEXT:    xvrepli.b $xr0, 0
; LA64-NEXT:  .LBB5_4: # %for.cond.cleanup
; LA64-NEXT:    xvstelm.d $xr0, $s1, 8, 1
; LA64-NEXT:    ld.d $s1, $sp, 48 # 8-byte Folded Reload
; LA64-NEXT:    ld.d $s0, $sp, 56 # 8-byte Folded Reload
; LA64-NEXT:    ld.d $fp, $sp, 64 # 8-byte Folded Reload
; LA64-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
; LA64-NEXT:    addi.d $sp, $sp, 80
; LA64-NEXT:    ret
entry:
  ; %y = &a[k].field1 (double at byte offset 8; sizeof(%struct.F) == 24)
  %y = getelementptr inbounds %struct.F, ptr %a, i64 %k, i32 1
  %cmp = icmp sgt i64 %n, 0
  br i1 %cmp, label %for.body, label %for.cond.cleanup

for.body:                                         ; preds = %entry, %for.body
  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
  %sum.0 = phi <4 x double> [ zeroinitializer, %entry ], [ %addv, %for.body ]
  ; The call forces the load to be re-done every iteration.
  call void @f(ptr %a)
  %e = load double, ptr %y
  ; insert + splat-shuffle: broadcast the loaded double to all 4 lanes.
  %ins0 = insertelement <4 x double> poison, double %e, i32 0
  %v = shufflevector <4 x double> %ins0, <4 x double> poison, <4 x i32> zeroinitializer
  %addv = fadd <4 x double> %v, %sum.0
  %inc = add nuw nsw i64 %i.0, 1
  %exitcond = icmp eq i64 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  %sum.lcssa = phi <4 x double> [ zeroinitializer, %entry ], [ %addv, %for.body ]
  ; Store a single extracted lane (index 1) rather than the whole vector.
  %res = extractelement <4 x double> %sum.lcssa, i32 1
  store double %res, ptr %y
  ret void
}
| |
| declare void @f(ptr) |