| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs -ppc-formprep-chain-commoning \ |
| ; RUN: -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 < %s | FileCheck %s |
| |
| ; addresses: |
| ; 1: base1 + offset |
| ; 2: + offset |
| ; 3: + offset |
| ; 4: + offset |
| ; |
| ; chains: |
| ; 1: base: base1 + offset, offsets: (0, offset) |
| ; 2: base: base1 + 3*offset, offsets: (0, offset) |
| ; |
| ; long long two_chain_same_offset_succ(char *p, long long offset, long long base1, long long n) { |
| ; long long o1 = base1 + offset; |
| ; long long o2 = base1 + 2 * offset; |
| ; long long o3 = base1 + 3 * offset; |
| ; long long o4 = base1 + 4 * offset; |
| ; char *p1 = p + o1; |
| ; char *p2 = p + o2; |
| ; char *p3 = p + o3; |
| ; char *p4 = p + o4; |
| ; long long sum = 0; |
| ; for (long long i = 0; i < n; ++i) { |
| ; unsigned long x1 = *(unsigned long *)(p1 + i); |
| ; unsigned long x2 = *(unsigned long *)(p2 + i); |
| ; unsigned long x3 = *(unsigned long *)(p3 + i); |
| ; unsigned long x4 = *(unsigned long *)(p4 + i); |
| ; sum += x1 * x2 * x3 * x4; |
| ; } |
| ; return sum; |
| ; } |
| ; |
| ; Positive test: each loop iteration performs four i64 loads at |
| ; p + base1 + i + k*offset for k = 1, 2, 3, 4 and accumulates their product. |
| ; The CHECK lines below expect two pointer registers (r7 and r5) that are each |
| ; bumped by 1 per iteration, with each pointer serving one `ld 0(rX)` load and |
| ; one `ldx rX, r4` load that share the same index register. |
| define i64 @two_chain_same_offset_succ(i8* %p, i64 %offset, i64 %base1, i64 %n) { |
| ; CHECK-LABEL: two_chain_same_offset_succ: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: cmpdi r6, 1 |
| ; CHECK-NEXT: blt cr0, .LBB0_4 |
| ; CHECK-NEXT: # %bb.1: # %for.body.preheader |
| ; CHECK-NEXT: sldi r7, r4, 1 |
| ; CHECK-NEXT: mtctr r6 |
| ; CHECK-NEXT: add r8, r4, r7 |
| ; CHECK-NEXT: add r7, r5, r4 |
| ; CHECK-NEXT: add r5, r5, r8 |
| ; CHECK-NEXT: add r7, r3, r7 |
| ; CHECK-NEXT: add r5, r3, r5 |
| ; CHECK-NEXT: li r3, 0 |
| ; CHECK-NEXT: .p2align 4 |
| ; CHECK-NEXT: .LBB0_2: # %for.body |
| ; CHECK-NEXT: # |
| ; CHECK-NEXT: ld r6, 0(r7) |
| ; CHECK-NEXT: ldx r8, r7, r4 |
| ; CHECK-NEXT: ld r9, 0(r5) |
| ; CHECK-NEXT: ldx r10, r5, r4 |
| ; CHECK-NEXT: addi r7, r7, 1 |
| ; CHECK-NEXT: addi r5, r5, 1 |
| ; CHECK-NEXT: mulld r6, r8, r6 |
| ; CHECK-NEXT: mulld r6, r6, r9 |
| ; CHECK-NEXT: maddld r3, r6, r10, r3 |
| ; CHECK-NEXT: bdnz .LBB0_2 |
| ; CHECK-NEXT: # %bb.3: # %for.cond.cleanup |
| ; CHECK-NEXT: blr |
| ; CHECK-NEXT: .LBB0_4: |
| ; CHECK-NEXT: li r3, 0 |
| ; CHECK-NEXT: blr |
| entry: |
| ; Loop-invariant multiples of %offset: 2x (%mul), 3x (%mul2) and 4x (%mul4). |
| %mul = shl nsw i64 %offset, 1 |
| %mul2 = mul nsw i64 %offset, 3 |
| %mul4 = shl nsw i64 %offset, 2 |
| ; Enter the loop only when %n > 0; otherwise the function returns 0. |
| %cmp46 = icmp sgt i64 %n, 0 |
| br i1 %cmp46, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add16, %for.body ] |
| ret i64 %sum.0.lcssa |
| |
| for.body: ; preds = %entry, %for.body |
| %sum.048 = phi i64 [ %add16, %for.body ], [ 0, %entry ] |
| %i.047 = phi i64 [ %inc, %for.body ], [ 0, %entry ] |
| ; %add = i + base1 is the common part of all four byte indices below. |
| %add = add i64 %i.047, %base1 |
| ; Load 1: p + i + base1 + offset. |
| %add.ptr9.idx = add i64 %add, %offset |
| %add.ptr9 = getelementptr inbounds i8, i8* %p, i64 %add.ptr9.idx |
| %0 = bitcast i8* %add.ptr9 to i64* |
| %1 = load i64, i64* %0, align 8 |
| ; Load 2: p + i + base1 + 2*offset. |
| %add.ptr10.idx = add i64 %add, %mul |
| %add.ptr10 = getelementptr inbounds i8, i8* %p, i64 %add.ptr10.idx |
| %2 = bitcast i8* %add.ptr10 to i64* |
| %3 = load i64, i64* %2, align 8 |
| ; Load 3: p + i + base1 + 3*offset. |
| %add.ptr11.idx = add i64 %add, %mul2 |
| %add.ptr11 = getelementptr inbounds i8, i8* %p, i64 %add.ptr11.idx |
| %4 = bitcast i8* %add.ptr11 to i64* |
| %5 = load i64, i64* %4, align 8 |
| ; Load 4: p + i + base1 + 4*offset. |
| %add.ptr12.idx = add i64 %add, %mul4 |
| %add.ptr12 = getelementptr inbounds i8, i8* %p, i64 %add.ptr12.idx |
| %6 = bitcast i8* %add.ptr12 to i64* |
| %7 = load i64, i64* %6, align 8 |
| ; Multiply the four loaded values together and add the product to the sum. |
| %mul13 = mul i64 %3, %1 |
| %mul14 = mul i64 %mul13, %5 |
| %mul15 = mul i64 %mul14, %7 |
| %add16 = add i64 %mul15, %sum.048 |
| ; Advance i and leave the loop once it reaches %n. |
| %inc = add nuw nsw i64 %i.047, 1 |
| %exitcond.not = icmp eq i64 %inc, %n |
| br i1 %exitcond.not, label %for.cond.cleanup, label %for.body |
| } |
| |
| ; addresses: |
| ; 1: base1 + offset |
| ; 2: + offset |
| ; 3: + offset |
| ; 4: + offset |
| ; 5: + offset |
| ; |
| ; These addresses can not be commoned into chains because one address would need a chain of its own. |
| ; It is not profitable to common chains unless all addresses are in chains. |
| ; |
| ; long long not_perfect_chain_all_same_offset_fail(char *p, long long offset, long long base1, long long n) { |
| ; long long o1 = base1 + offset; |
| ; long long o2 = base1 + 2 * offset; |
| ; long long o3 = base1 + 3 * offset; |
| ; long long o4 = base1 + 4 * offset; |
| ; long long o5 = base1 + 5 * offset; |
| ; char *p1 = p + o1; |
| ; char *p2 = p + o2; |
| ; char *p3 = p + o3; |
| ; char *p4 = p + o4; |
| ; char *p5 = p + o5; |
| ; long long sum = 0; |
| ; for (long long i = 0; i < n; ++i) { |
| ; unsigned long x1 = *(unsigned long *)(p1 + i); |
| ; unsigned long x2 = *(unsigned long *)(p2 + i); |
| ; unsigned long x3 = *(unsigned long *)(p3 + i); |
| ; unsigned long x4 = *(unsigned long *)(p4 + i); |
| ; unsigned long x5 = *(unsigned long *)(p5 + i); |
| ; sum += x1 * x2 * x3 * x4 * x5; |
| ; } |
| ; return sum; |
| ; } |
| ; |
| ; Negative test: each loop iteration performs five i64 loads at |
| ; p + base1 + i + k*offset for k = 1, 2, 3, 4, 5 and accumulates their product. |
| ; The CHECK lines below expect a single pointer register (r5) bumped by 1 per |
| ; iteration and five `ldx` loads that each use a different index register |
| ; (r4, r7, r8, r9, r10), with r30 spilled and reloaded around the function body. |
| define i64 @not_perfect_chain_all_same_offset_fail(i8* %p, i64 %offset, i64 %base1, i64 %n) { |
| ; CHECK-LABEL: not_perfect_chain_all_same_offset_fail: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: cmpdi r6, 1 |
| ; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: blt cr0, .LBB1_4 |
| ; CHECK-NEXT: # %bb.1: # %for.body.preheader |
| ; CHECK-NEXT: sldi r7, r4, 1 |
| ; CHECK-NEXT: sldi r9, r4, 2 |
| ; CHECK-NEXT: add r5, r3, r5 |
| ; CHECK-NEXT: li r3, 0 |
| ; CHECK-NEXT: add r8, r4, r7 |
| ; CHECK-NEXT: mtctr r6 |
| ; CHECK-NEXT: add r10, r4, r9 |
| ; CHECK-NEXT: .p2align 4 |
| ; CHECK-NEXT: .LBB1_2: # %for.body |
| ; CHECK-NEXT: # |
| ; CHECK-NEXT: ldx r6, r5, r4 |
| ; CHECK-NEXT: ldx r11, r5, r7 |
| ; CHECK-NEXT: ldx r12, r5, r8 |
| ; CHECK-NEXT: ldx r0, r5, r9 |
| ; CHECK-NEXT: mulld r6, r11, r6 |
| ; CHECK-NEXT: ldx r30, r5, r10 |
| ; CHECK-NEXT: addi r5, r5, 1 |
| ; CHECK-NEXT: mulld r6, r6, r12 |
| ; CHECK-NEXT: mulld r6, r6, r0 |
| ; CHECK-NEXT: maddld r3, r6, r30, r3 |
| ; CHECK-NEXT: bdnz .LBB1_2 |
| ; CHECK-NEXT: # %bb.3: # %for.cond.cleanup |
| ; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: blr |
| ; CHECK-NEXT: .LBB1_4: |
| ; CHECK-NEXT: li r3, 0 |
| ; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: blr |
| entry: |
| ; Loop-invariant multiples of %offset: 2x, 3x, 4x and 5x. |
| %mul = shl nsw i64 %offset, 1 |
| %mul2 = mul nsw i64 %offset, 3 |
| %mul4 = shl nsw i64 %offset, 2 |
| %mul6 = mul nsw i64 %offset, 5 |
| ; Enter the loop only when %n > 0; otherwise the function returns 0. |
| %cmp58 = icmp sgt i64 %n, 0 |
| br i1 %cmp58, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add21, %for.body ] |
| ret i64 %sum.0.lcssa |
| |
| for.body: ; preds = %entry, %for.body |
| %sum.060 = phi i64 [ %add21, %for.body ], [ 0, %entry ] |
| %i.059 = phi i64 [ %inc, %for.body ], [ 0, %entry ] |
| ; %add = i + base1 is the common part of all five byte indices below. |
| %add = add i64 %i.059, %base1 |
| ; Load 1: p + i + base1 + offset. |
| %add.ptr12.idx = add i64 %add, %offset |
| %add.ptr12 = getelementptr inbounds i8, i8* %p, i64 %add.ptr12.idx |
| %0 = bitcast i8* %add.ptr12 to i64* |
| %1 = load i64, i64* %0, align 8 |
| ; Load 2: p + i + base1 + 2*offset. |
| %add.ptr13.idx = add i64 %add, %mul |
| %add.ptr13 = getelementptr inbounds i8, i8* %p, i64 %add.ptr13.idx |
| %2 = bitcast i8* %add.ptr13 to i64* |
| %3 = load i64, i64* %2, align 8 |
| ; Load 3: p + i + base1 + 3*offset. |
| %add.ptr14.idx = add i64 %add, %mul2 |
| %add.ptr14 = getelementptr inbounds i8, i8* %p, i64 %add.ptr14.idx |
| %4 = bitcast i8* %add.ptr14 to i64* |
| %5 = load i64, i64* %4, align 8 |
| ; Load 4: p + i + base1 + 4*offset. |
| %add.ptr15.idx = add i64 %add, %mul4 |
| %add.ptr15 = getelementptr inbounds i8, i8* %p, i64 %add.ptr15.idx |
| %6 = bitcast i8* %add.ptr15 to i64* |
| %7 = load i64, i64* %6, align 8 |
| ; Load 5: p + i + base1 + 5*offset. |
| %add.ptr16.idx = add i64 %add, %mul6 |
| %add.ptr16 = getelementptr inbounds i8, i8* %p, i64 %add.ptr16.idx |
| %8 = bitcast i8* %add.ptr16 to i64* |
| %9 = load i64, i64* %8, align 8 |
| ; Multiply the five loaded values together and add the product to the sum. |
| %mul17 = mul i64 %3, %1 |
| %mul18 = mul i64 %mul17, %5 |
| %mul19 = mul i64 %mul18, %7 |
| %mul20 = mul i64 %mul19, %9 |
| %add21 = add i64 %mul20, %sum.060 |
| ; Advance i and leave the loop once it reaches %n. |
| %inc = add nuw nsw i64 %i.059, 1 |
| %exitcond.not = icmp eq i64 %inc, %n |
| br i1 %exitcond.not, label %for.cond.cleanup, label %for.body |
| } |
| |
| ; addresses: |
| ; 1: base1 |
| ; 2: + 2*offset |
| ; 3: + offset |
| ; |
| ; We need at least 4 addresses to common 2 chains to reuse at least 1 offset. |
| ; |
| ; long long no_enough_elements_fail(char *p, long long offset, long long base1, long long n) { |
| ; long long o1 = base1; |
| ; long long o2 = base1 + 2 * offset; |
| ; long long o3 = base1 + 3 * offset; |
| ; char *p1 = p + o1; |
| ; char *p2 = p + o2; |
| ; char *p3 = p + o3; |
| ; long long sum = 0; |
| ; for (long long i = 0; i < n; ++i) { |
| ; unsigned long x1 = *(unsigned long *)(p1 + i); |
| ; unsigned long x2 = *(unsigned long *)(p2 + i); |
| ; unsigned long x3 = *(unsigned long *)(p3 + i); |
| ; sum += x1 * x2 * x3; |
| ; } |
| ; return sum; |
| ; } |
| ; |
| ; Negative test: each loop iteration performs three i64 loads at |
| ; p + base1 + i + k*offset for k = 0, 2, 3 and accumulates their product. |
| ; The CHECK lines below expect a single pointer register (r5) bumped by 1 per |
| ; iteration, used by one `ld 0(r5)` load and two `ldx` loads with different |
| ; index registers (r7 and r4). |
| define i64 @no_enough_elements_fail(i8* %p, i64 %offset, i64 %base1, i64 %n) { |
| ; CHECK-LABEL: no_enough_elements_fail: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: cmpdi r6, 1 |
| ; CHECK-NEXT: blt cr0, .LBB2_4 |
| ; CHECK-NEXT: # %bb.1: # %for.body.preheader |
| ; CHECK-NEXT: sldi r7, r4, 1 |
| ; CHECK-NEXT: mtctr r6 |
| ; CHECK-NEXT: add r5, r3, r5 |
| ; CHECK-NEXT: li r3, 0 |
| ; CHECK-NEXT: add r4, r4, r7 |
| ; CHECK-NEXT: .p2align 5 |
| ; CHECK-NEXT: .LBB2_2: # %for.body |
| ; CHECK-NEXT: # |
| ; CHECK-NEXT: ld r6, 0(r5) |
| ; CHECK-NEXT: ldx r8, r5, r7 |
| ; CHECK-NEXT: ldx r9, r5, r4 |
| ; CHECK-NEXT: addi r5, r5, 1 |
| ; CHECK-NEXT: mulld r6, r8, r6 |
| ; CHECK-NEXT: maddld r3, r6, r9, r3 |
| ; CHECK-NEXT: bdnz .LBB2_2 |
| ; CHECK-NEXT: # %bb.3: # %for.cond.cleanup |
| ; CHECK-NEXT: blr |
| ; CHECK-NEXT: .LBB2_4: |
| ; CHECK-NEXT: li r3, 0 |
| ; CHECK-NEXT: blr |
| entry: |
| ; Loop-invariant multiples of %offset: 2x (%mul) and 3x (%mul1). |
| %mul = shl nsw i64 %offset, 1 |
| %mul1 = mul nsw i64 %offset, 3 |
| ; Enter the loop only when %n > 0; otherwise the function returns 0. |
| %cmp32 = icmp sgt i64 %n, 0 |
| br i1 %cmp32, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add10, %for.body ] |
| ret i64 %sum.0.lcssa |
| |
| for.body: ; preds = %entry, %for.body |
| %sum.034 = phi i64 [ %add10, %for.body ], [ 0, %entry ] |
| %i.033 = phi i64 [ %inc, %for.body ], [ 0, %entry ] |
| ; Load 1: p + i + base1; this index is also the base of the two indices below. |
| %add.ptr5.idx = add i64 %i.033, %base1 |
| %add.ptr5 = getelementptr inbounds i8, i8* %p, i64 %add.ptr5.idx |
| %0 = bitcast i8* %add.ptr5 to i64* |
| %1 = load i64, i64* %0, align 8 |
| ; Load 2: p + i + base1 + 2*offset. |
| %add.ptr6.idx = add i64 %add.ptr5.idx, %mul |
| %add.ptr6 = getelementptr inbounds i8, i8* %p, i64 %add.ptr6.idx |
| %2 = bitcast i8* %add.ptr6 to i64* |
| %3 = load i64, i64* %2, align 8 |
| ; Load 3: p + i + base1 + 3*offset. |
| %add.ptr7.idx = add i64 %add.ptr5.idx, %mul1 |
| %add.ptr7 = getelementptr inbounds i8, i8* %p, i64 %add.ptr7.idx |
| %4 = bitcast i8* %add.ptr7 to i64* |
| %5 = load i64, i64* %4, align 8 |
| ; Multiply the three loaded values together and add the product to the sum. |
| %mul8 = mul i64 %3, %1 |
| %mul9 = mul i64 %mul8, %5 |
| %add10 = add i64 %mul9, %sum.034 |
| ; Advance i and leave the loop once it reaches %n. |
| %inc = add nuw nsw i64 %i.033, 1 |
| %exitcond.not = icmp eq i64 %inc, %n |
| br i1 %exitcond.not, label %for.cond.cleanup, label %for.body |
| } |
| |
| ; addresses: |
| ; 1: base1 |
| ; 2: + 2*offset |
| ; 3: + 2*offset |
| ; 4: + 3*offset |
| ; |
| ; The diff between address 2 and address 1 is 2*offset, and this offset is not reused among other chains, |
| ; so we can not common any chains. |
| ; |
| ; long long no_reuseable_offset_fail(char *p, long long offset, long long base1, long long n) { |
| ; long long o1 = base1; |
| ; long long o2 = base1 + 2 * offset; |
| ; long long o3 = base1 + 4 * offset; |
| ; long long o4 = base1 + 7 * offset; |
| ; char *p1 = p + o1; |
| ; char *p2 = p + o2; |
| ; char *p3 = p + o3; |
| ; char *p4 = p + o4; |
| ; long long sum = 0; |
| ; for (long long i = 0; i < n; ++i) { |
| ; unsigned long x1 = *(unsigned long *)(p1 + i); |
| ; unsigned long x2 = *(unsigned long *)(p2 + i); |
| ; unsigned long x3 = *(unsigned long *)(p3 + i); |
| ; unsigned long x4 = *(unsigned long *)(p4 + i); |
| ; sum += x1 * x2 * x3 * x4; |
| ; } |
| ; return sum; |
| ; } |
| ; |
| ; Negative test: each loop iteration performs four i64 loads at |
| ; p + base1 + i + k*offset for k = 0, 2, 4, 7 and accumulates their product. |
| ; The CHECK lines below expect a single pointer register (r5) bumped by 1 per |
| ; iteration, used by one `ld 0(r5)` load and three `ldx` loads with different |
| ; index registers (r7, r8 and r4). |
| define i64 @no_reuseable_offset_fail(i8* %p, i64 %offset, i64 %base1, i64 %n) { |
| ; CHECK-LABEL: no_reuseable_offset_fail: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: cmpdi r6, 1 |
| ; CHECK-NEXT: blt cr0, .LBB3_4 |
| ; CHECK-NEXT: # %bb.1: # %for.body.preheader |
| ; CHECK-NEXT: sldi r9, r4, 3 |
| ; CHECK-NEXT: mtctr r6 |
| ; CHECK-NEXT: add r5, r3, r5 |
| ; CHECK-NEXT: li r3, 0 |
| ; CHECK-NEXT: sldi r7, r4, 1 |
| ; CHECK-NEXT: sldi r8, r4, 2 |
| ; CHECK-NEXT: sub r4, r9, r4 |
| ; CHECK-NEXT: .p2align 4 |
| ; CHECK-NEXT: .LBB3_2: # %for.body |
| ; CHECK-NEXT: # |
| ; CHECK-NEXT: ld r6, 0(r5) |
| ; CHECK-NEXT: ldx r9, r5, r7 |
| ; CHECK-NEXT: ldx r10, r5, r8 |
| ; CHECK-NEXT: ldx r11, r5, r4 |
| ; CHECK-NEXT: addi r5, r5, 1 |
| ; CHECK-NEXT: mulld r6, r9, r6 |
| ; CHECK-NEXT: mulld r6, r6, r10 |
| ; CHECK-NEXT: maddld r3, r6, r11, r3 |
| ; CHECK-NEXT: bdnz .LBB3_2 |
| ; CHECK-NEXT: # %bb.3: # %for.cond.cleanup |
| ; CHECK-NEXT: blr |
| ; CHECK-NEXT: .LBB3_4: |
| ; CHECK-NEXT: li r3, 0 |
| ; CHECK-NEXT: blr |
| entry: |
| ; Loop-invariant multiples of %offset: 2x (%mul), 4x (%mul1) and 7x (%mul3). |
| %mul = shl nsw i64 %offset, 1 |
| %mul1 = shl nsw i64 %offset, 2 |
| %mul3 = mul nsw i64 %offset, 7 |
| ; Enter the loop only when %n > 0; otherwise the function returns 0. |
| %cmp44 = icmp sgt i64 %n, 0 |
| br i1 %cmp44, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add15, %for.body ] |
| ret i64 %sum.0.lcssa |
| |
| for.body: ; preds = %entry, %for.body |
| %sum.046 = phi i64 [ %add15, %for.body ], [ 0, %entry ] |
| %i.045 = phi i64 [ %inc, %for.body ], [ 0, %entry ] |
| ; Load 1: p + i + base1; this index is also the base of the three indices below. |
| %add.ptr8.idx = add i64 %i.045, %base1 |
| %add.ptr8 = getelementptr inbounds i8, i8* %p, i64 %add.ptr8.idx |
| %0 = bitcast i8* %add.ptr8 to i64* |
| %1 = load i64, i64* %0, align 8 |
| ; Load 2: p + i + base1 + 2*offset. |
| %add.ptr9.idx = add i64 %add.ptr8.idx, %mul |
| %add.ptr9 = getelementptr inbounds i8, i8* %p, i64 %add.ptr9.idx |
| %2 = bitcast i8* %add.ptr9 to i64* |
| %3 = load i64, i64* %2, align 8 |
| ; Load 3: p + i + base1 + 4*offset. |
| %add.ptr10.idx = add i64 %add.ptr8.idx, %mul1 |
| %add.ptr10 = getelementptr inbounds i8, i8* %p, i64 %add.ptr10.idx |
| %4 = bitcast i8* %add.ptr10 to i64* |
| %5 = load i64, i64* %4, align 8 |
| ; Load 4: p + i + base1 + 7*offset. |
| %add.ptr11.idx = add i64 %add.ptr8.idx, %mul3 |
| %add.ptr11 = getelementptr inbounds i8, i8* %p, i64 %add.ptr11.idx |
| %6 = bitcast i8* %add.ptr11 to i64* |
| %7 = load i64, i64* %6, align 8 |
| ; Multiply the four loaded values together and add the product to the sum. |
| %mul12 = mul i64 %3, %1 |
| %mul13 = mul i64 %mul12, %5 |
| %mul14 = mul i64 %mul13, %7 |
| %add15 = add i64 %mul14, %sum.046 |
| ; Advance i and leave the loop once it reaches %n. |
| %inc = add nuw nsw i64 %i.045, 1 |
| %exitcond.not = icmp eq i64 %inc, %n |
| br i1 %exitcond.not, label %for.cond.cleanup, label %for.body |
| } |
| |
| ; addresses: |
| ; 1: base1 + offset |
| ; 2: + offset |
| ; 3: + 3*offset |
| ; 4: + 2*offset |
| ; 5: + 1*offset |
| ; 6: + 2*offset |
| ; |
| ; The diff between address 2 and address 1 is 1*offset, and this offset is reused between address 4 and address 5, |
| ; but the diff between address 3 and address 2 (3*offset) is not the same as the diff between address 6 |
| ; and address 5 (2*offset), so we can not common chains for these addresses. |
| ; |
| ; long long not_same_offset_fail(char *p, long long offset, long long base1, long long n) { |
| ; long long o1 = base1 + offset; |
| ; long long o2 = base1 + 2 * offset; |
| ; long long o3 = base1 + 5 * offset; |
| ; long long o4 = base1 + 7 * offset; |
| ; long long o5 = base1 + 8 * offset; |
| ; long long o6 = base1 + 10 * offset; |
| ; char *p1 = p + o1; |
| ; char *p2 = p + o2; |
| ; char *p3 = p + o3; |
| ; char *p4 = p + o4; |
| ; char *p5 = p + o5; |
| ; char *p6 = p + o6; |
| ; long long sum = 0; |
| ; for (long long i = 0; i < n; ++i) { |
| ; unsigned long x1 = *(unsigned long *)(p1 + i); |
| ; unsigned long x2 = *(unsigned long *)(p2 + i); |
| ; unsigned long x3 = *(unsigned long *)(p3 + i); |
| ; unsigned long x4 = *(unsigned long *)(p4 + i); |
| ; unsigned long x5 = *(unsigned long *)(p5 + i); |
| ; unsigned long x6 = *(unsigned long *)(p6 + i); |
| ; sum += x1 * x2 * x3 * x4 * x5 * x6; |
| ; } |
| ; return sum; |
| ; } |
| ; |
| ; Negative test: each loop iteration performs six i64 loads at |
| ; p + base1 + i + k*offset for k = 1, 2, 5, 7, 8, 10 and accumulates their product. |
| ; The CHECK lines below expect a single pointer register (r5) bumped by 1 per |
| ; iteration and six `ldx` loads that each use a different index register |
| ; (r4, r7, r8, r10, r9, r11), with r28-r30 spilled and reloaded around the body. |
| define i64 @not_same_offset_fail(i8* %p, i64 %offset, i64 %base1, i64 %n) { |
| ; CHECK-LABEL: not_same_offset_fail: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: cmpdi r6, 1 |
| ; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: blt cr0, .LBB4_3 |
| ; CHECK-NEXT: # %bb.1: # %for.body.preheader |
| ; CHECK-NEXT: mulli r11, r4, 10 |
| ; CHECK-NEXT: sldi r8, r4, 2 |
| ; CHECK-NEXT: add r5, r3, r5 |
| ; CHECK-NEXT: li r3, 0 |
| ; CHECK-NEXT: add r8, r4, r8 |
| ; CHECK-NEXT: sldi r9, r4, 3 |
| ; CHECK-NEXT: mtctr r6 |
| ; CHECK-NEXT: sldi r7, r4, 1 |
| ; CHECK-NEXT: sub r10, r9, r4 |
| ; CHECK-NEXT: .p2align 4 |
| ; CHECK-NEXT: .LBB4_2: # %for.body |
| ; CHECK-NEXT: # |
| ; CHECK-NEXT: ldx r6, r5, r4 |
| ; CHECK-NEXT: ldx r12, r5, r7 |
| ; CHECK-NEXT: ldx r0, r5, r8 |
| ; CHECK-NEXT: ldx r30, r5, r10 |
| ; CHECK-NEXT: mulld r6, r12, r6 |
| ; CHECK-NEXT: ldx r29, r5, r9 |
| ; CHECK-NEXT: ldx r28, r5, r11 |
| ; CHECK-NEXT: addi r5, r5, 1 |
| ; CHECK-NEXT: mulld r6, r6, r0 |
| ; CHECK-NEXT: mulld r6, r6, r30 |
| ; CHECK-NEXT: mulld r6, r6, r29 |
| ; CHECK-NEXT: maddld r3, r6, r28, r3 |
| ; CHECK-NEXT: bdnz .LBB4_2 |
| ; CHECK-NEXT: b .LBB4_4 |
| ; CHECK-NEXT: .LBB4_3: |
| ; CHECK-NEXT: li r3, 0 |
| ; CHECK-NEXT: .LBB4_4: # %for.cond.cleanup |
| ; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: blr |
| entry: |
| ; Loop-invariant multiples of %offset: 2x, 5x, 7x, 8x and 10x. |
| %mul = shl nsw i64 %offset, 1 |
| %mul2 = mul nsw i64 %offset, 5 |
| %mul4 = mul nsw i64 %offset, 7 |
| %mul6 = shl nsw i64 %offset, 3 |
| %mul8 = mul nsw i64 %offset, 10 |
| ; Enter the loop only when %n > 0; otherwise the function returns 0. |
| %cmp70 = icmp sgt i64 %n, 0 |
| br i1 %cmp70, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add26, %for.body ] |
| ret i64 %sum.0.lcssa |
| |
| for.body: ; preds = %entry, %for.body |
| %sum.072 = phi i64 [ %add26, %for.body ], [ 0, %entry ] |
| %i.071 = phi i64 [ %inc, %for.body ], [ 0, %entry ] |
| ; %add = i + base1 is the common part of all six byte indices below. |
| %add = add i64 %i.071, %base1 |
| ; Load 1: p + i + base1 + offset. |
| %add.ptr15.idx = add i64 %add, %offset |
| %add.ptr15 = getelementptr inbounds i8, i8* %p, i64 %add.ptr15.idx |
| %0 = bitcast i8* %add.ptr15 to i64* |
| %1 = load i64, i64* %0, align 8 |
| ; Load 2: p + i + base1 + 2*offset. |
| %add.ptr16.idx = add i64 %add, %mul |
| %add.ptr16 = getelementptr inbounds i8, i8* %p, i64 %add.ptr16.idx |
| %2 = bitcast i8* %add.ptr16 to i64* |
| %3 = load i64, i64* %2, align 8 |
| ; Load 3: p + i + base1 + 5*offset. |
| %add.ptr17.idx = add i64 %add, %mul2 |
| %add.ptr17 = getelementptr inbounds i8, i8* %p, i64 %add.ptr17.idx |
| %4 = bitcast i8* %add.ptr17 to i64* |
| %5 = load i64, i64* %4, align 8 |
| ; Load 4: p + i + base1 + 7*offset. |
| %add.ptr18.idx = add i64 %add, %mul4 |
| %add.ptr18 = getelementptr inbounds i8, i8* %p, i64 %add.ptr18.idx |
| %6 = bitcast i8* %add.ptr18 to i64* |
| %7 = load i64, i64* %6, align 8 |
| ; Load 5: p + i + base1 + 8*offset. |
| %add.ptr19.idx = add i64 %add, %mul6 |
| %add.ptr19 = getelementptr inbounds i8, i8* %p, i64 %add.ptr19.idx |
| %8 = bitcast i8* %add.ptr19 to i64* |
| %9 = load i64, i64* %8, align 8 |
| ; Load 6: p + i + base1 + 10*offset. |
| %add.ptr20.idx = add i64 %add, %mul8 |
| %add.ptr20 = getelementptr inbounds i8, i8* %p, i64 %add.ptr20.idx |
| %10 = bitcast i8* %add.ptr20 to i64* |
| %11 = load i64, i64* %10, align 8 |
| ; Multiply the six loaded values together and add the product to the sum. |
| %mul21 = mul i64 %3, %1 |
| %mul22 = mul i64 %mul21, %5 |
| %mul23 = mul i64 %mul22, %7 |
| %mul24 = mul i64 %mul23, %9 |
| %mul25 = mul i64 %mul24, %11 |
| %add26 = add i64 %mul25, %sum.072 |
| ; Advance i and leave the loop once it reaches %n. |
| %inc = add nuw nsw i64 %i.071, 1 |
| %exitcond.not = icmp eq i64 %inc, %n |
| br i1 %exitcond.not, label %for.cond.cleanup, label %for.body |
| } |
| |
| ; addresses: |
| ; 1: base1 + offset |
| ; 2: + 2*offset |
| ; 3: + offset |
| ; 4: + 2*offset |
| ; |
| ; chains: |
| ; 1: base1 + offset, offsets: (0, 2*offset) |
| ; 2: base1 + 4*offset, offsets: (0, 2*offset) |
| ; |
| ; long long two_chain_different_offsets_succ(char *p, long long offset, long long base1, long long n) { |
| ; long long o1 = base1 + offset; |
| ; long long o2 = base1 + 3 * offset; |
| ; long long o3 = base1 + 4 * offset; |
| ; long long o4 = base1 + 6 * offset; |
| ; char *p1 = p + o1; |
| ; char *p2 = p + o2; |
| ; char *p3 = p + o3; |
| ; char *p4 = p + o4; |
| ; long long sum = 0; |
| ; for (long long i = 0; i < n; ++i) { |
| ; unsigned long x1 = *(unsigned long *)(p1 + i); |
| ; unsigned long x2 = *(unsigned long *)(p2 + i); |
| ; unsigned long x3 = *(unsigned long *)(p3 + i); |
| ; unsigned long x4 = *(unsigned long *)(p4 + i); |
| ; sum += x1 * x2 * x3 * x4; |
| ; } |
| ; return sum; |
| ; } |
| ; |
| ; Positive test: each loop iteration performs four i64 loads at |
| ; p + base1 + i + k*offset for k = 1, 3, 4, 6 and accumulates their product. |
| ; The CHECK lines below expect two pointer registers (r7 and r5) that are each |
| ; bumped by 1 per iteration, with each pointer serving one `ld 0(rX)` load and |
| ; one `ldx rX, r4` load that share the same index register. |
| define i64 @two_chain_different_offsets_succ(i8* %p, i64 %offset, i64 %base1, i64 %n) { |
| ; CHECK-LABEL: two_chain_different_offsets_succ: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: cmpdi r6, 1 |
| ; CHECK-NEXT: blt cr0, .LBB5_4 |
| ; CHECK-NEXT: # %bb.1: # %for.body.preheader |
| ; CHECK-NEXT: sldi r8, r4, 2 |
| ; CHECK-NEXT: add r7, r5, r4 |
| ; CHECK-NEXT: mtctr r6 |
| ; CHECK-NEXT: add r5, r5, r8 |
| ; CHECK-NEXT: add r7, r3, r7 |
| ; CHECK-NEXT: sldi r4, r4, 1 |
| ; CHECK-NEXT: add r5, r3, r5 |
| ; CHECK-NEXT: li r3, 0 |
| ; CHECK-NEXT: .p2align 4 |
| ; CHECK-NEXT: .LBB5_2: # %for.body |
| ; CHECK-NEXT: # |
| ; CHECK-NEXT: ld r6, 0(r7) |
| ; CHECK-NEXT: ldx r8, r7, r4 |
| ; CHECK-NEXT: ld r9, 0(r5) |
| ; CHECK-NEXT: ldx r10, r5, r4 |
| ; CHECK-NEXT: addi r7, r7, 1 |
| ; CHECK-NEXT: addi r5, r5, 1 |
| ; CHECK-NEXT: mulld r6, r8, r6 |
| ; CHECK-NEXT: mulld r6, r6, r9 |
| ; CHECK-NEXT: maddld r3, r6, r10, r3 |
| ; CHECK-NEXT: bdnz .LBB5_2 |
| ; CHECK-NEXT: # %bb.3: # %for.cond.cleanup |
| ; CHECK-NEXT: blr |
| ; CHECK-NEXT: .LBB5_4: |
| ; CHECK-NEXT: li r3, 0 |
| ; CHECK-NEXT: blr |
| entry: |
| ; Loop-invariant multiples of %offset: 3x (%mul), 4x (%mul2) and 6x (%mul4). |
| %mul = mul nsw i64 %offset, 3 |
| %mul2 = shl nsw i64 %offset, 2 |
| %mul4 = mul nsw i64 %offset, 6 |
| ; Enter the loop only when %n > 0; otherwise the function returns 0. |
| %cmp46 = icmp sgt i64 %n, 0 |
| br i1 %cmp46, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add16, %for.body ] |
| ret i64 %sum.0.lcssa |
| |
| for.body: ; preds = %entry, %for.body |
| %sum.048 = phi i64 [ %add16, %for.body ], [ 0, %entry ] |
| %i.047 = phi i64 [ %inc, %for.body ], [ 0, %entry ] |
| ; %add = i + base1 is the common part of all four byte indices below. |
| %add = add i64 %i.047, %base1 |
| ; Load 1: p + i + base1 + offset. |
| %add.ptr9.idx = add i64 %add, %offset |
| %add.ptr9 = getelementptr inbounds i8, i8* %p, i64 %add.ptr9.idx |
| %0 = bitcast i8* %add.ptr9 to i64* |
| %1 = load i64, i64* %0, align 8 |
| ; Load 2: p + i + base1 + 3*offset. |
| %add.ptr10.idx = add i64 %add, %mul |
| %add.ptr10 = getelementptr inbounds i8, i8* %p, i64 %add.ptr10.idx |
| %2 = bitcast i8* %add.ptr10 to i64* |
| %3 = load i64, i64* %2, align 8 |
| ; Load 3: p + i + base1 + 4*offset. |
| %add.ptr11.idx = add i64 %add, %mul2 |
| %add.ptr11 = getelementptr inbounds i8, i8* %p, i64 %add.ptr11.idx |
| %4 = bitcast i8* %add.ptr11 to i64* |
| %5 = load i64, i64* %4, align 8 |
| ; Load 4: p + i + base1 + 6*offset. |
| %add.ptr12.idx = add i64 %add, %mul4 |
| %add.ptr12 = getelementptr inbounds i8, i8* %p, i64 %add.ptr12.idx |
| %6 = bitcast i8* %add.ptr12 to i64* |
| %7 = load i64, i64* %6, align 8 |
| ; Multiply the four loaded values together and add the product to the sum. |
| %mul13 = mul i64 %3, %1 |
| %mul14 = mul i64 %mul13, %5 |
| %mul15 = mul i64 %mul14, %7 |
| %add16 = add i64 %mul15, %sum.048 |
| ; Advance i and leave the loop once it reaches %n. |
| %inc = add nuw nsw i64 %i.047, 1 |
| %exitcond.not = icmp eq i64 %inc, %n |
| br i1 %exitcond.not, label %for.cond.cleanup, label %for.body |
| } |
| |
| ; addresses: |
| ; 1: base1 + offset |
| ; 2: + 2*offset |
| ; 3: + base2 - base1 - 2*offset |
| ; 4: + 2*offset |
| ; |
| ; chains: |
| ; 1: base1 + offset, offsets: (0, 2*offset) |
| ; 2: base2 + offset, offsets: (0, 2*offset) |
| ; |
| ; long long two_chain_two_bases_succ(char *p, long long offset, long long base1, long long base2, long long n) { |
| ; long long o1 = base1 + offset; |
| ; long long o2 = base1 + 3 * offset; |
| ; long long o3 = base2 + offset; |
| ; long long o4 = base2 + 3 * offset; |
| ; char *p1 = p + o1; |
| ; char *p2 = p + o2; |
| ; char *p3 = p + o3; |
| ; char *p4 = p + o4; |
| ; long long sum = 0; |
| ; for (long long i = 0; i < n; ++i) { |
| ; unsigned long x1 = *(unsigned long *)(p1 + i); |
| ; unsigned long x2 = *(unsigned long *)(p2 + i); |
| ; unsigned long x3 = *(unsigned long *)(p3 + i); |
| ; unsigned long x4 = *(unsigned long *)(p4 + i); |
| ; sum += x1 * x2 * x3 * x4; |
| ; } |
| ; return sum; |
| ; } |
| ; |
| ; Positive test: each loop iteration performs four i64 loads at |
| ; p + i + base1 + offset, p + i + base1 + 3*offset, p + i + base2 + offset and |
| ; p + i + base2 + 3*offset, and accumulates their product. |
| ; The CHECK lines below expect two pointer registers (r5 and r6) that are each |
| ; bumped by 1 per iteration, with each pointer serving one `ld 0(rX)` load and |
| ; one `ldx rX, r4` load that share the same index register. |
| define i64 @two_chain_two_bases_succ(i8* %p, i64 %offset, i64 %base1, i64 %base2, i64 %n) { |
| ; CHECK-LABEL: two_chain_two_bases_succ: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: cmpdi r7, 1 |
| ; CHECK-NEXT: blt cr0, .LBB6_4 |
| ; CHECK-NEXT: # %bb.1: # %for.body.preheader |
| ; CHECK-NEXT: add r6, r6, r4 |
| ; CHECK-NEXT: add r5, r5, r4 |
| ; CHECK-NEXT: mtctr r7 |
| ; CHECK-NEXT: sldi r4, r4, 1 |
| ; CHECK-NEXT: add r5, r3, r5 |
| ; CHECK-NEXT: add r6, r3, r6 |
| ; CHECK-NEXT: li r3, 0 |
| ; CHECK-NEXT: .p2align 4 |
| ; CHECK-NEXT: .LBB6_2: # %for.body |
| ; CHECK-NEXT: # |
| ; CHECK-NEXT: ld r7, 0(r5) |
| ; CHECK-NEXT: ldx r8, r5, r4 |
| ; CHECK-NEXT: ld r9, 0(r6) |
| ; CHECK-NEXT: ldx r10, r6, r4 |
| ; CHECK-NEXT: addi r5, r5, 1 |
| ; CHECK-NEXT: addi r6, r6, 1 |
| ; CHECK-NEXT: mulld r7, r8, r7 |
| ; CHECK-NEXT: mulld r7, r7, r9 |
| ; CHECK-NEXT: maddld r3, r7, r10, r3 |
| ; CHECK-NEXT: bdnz .LBB6_2 |
| ; CHECK-NEXT: # %bb.3: # %for.cond.cleanup |
| ; CHECK-NEXT: blr |
| ; CHECK-NEXT: .LBB6_4: |
| ; CHECK-NEXT: li r3, 0 |
| ; CHECK-NEXT: blr |
| entry: |
| ; Loop-invariant 3*offset, used by the second load of each base. |
| %mul = mul nsw i64 %offset, 3 |
| ; Enter the loop only when %n > 0; otherwise the function returns 0. |
| %cmp44 = icmp sgt i64 %n, 0 |
| br i1 %cmp44, label %for.body, label %for.cond.cleanup |
| |
| for.cond.cleanup: ; preds = %for.body, %entry |
| %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add15, %for.body ] |
| ret i64 %sum.0.lcssa |
| |
| for.body: ; preds = %entry, %for.body |
| %sum.046 = phi i64 [ %add15, %for.body ], [ 0, %entry ] |
| %i.045 = phi i64 [ %inc, %for.body ], [ 0, %entry ] |
| ; Load 1: p + i + base1 + offset. |
| %add = add i64 %i.045, %base1 |
| %add.ptr8.idx = add i64 %add, %offset |
| %add.ptr8 = getelementptr inbounds i8, i8* %p, i64 %add.ptr8.idx |
| %0 = bitcast i8* %add.ptr8 to i64* |
| %1 = load i64, i64* %0, align 8 |
| ; Load 2: p + i + 3*offset + base1 (3*offset is added to i before base1 here). |
| %add1 = add i64 %i.045, %mul |
| %add.ptr9.idx = add i64 %add1, %base1 |
| %add.ptr9 = getelementptr inbounds i8, i8* %p, i64 %add.ptr9.idx |
| %2 = bitcast i8* %add.ptr9 to i64* |
| %3 = load i64, i64* %2, align 8 |
| ; Load 3: p + i + base2 + offset. |
| %add2 = add i64 %i.045, %base2 |
| %add.ptr10.idx = add i64 %add2, %offset |
| %add.ptr10 = getelementptr inbounds i8, i8* %p, i64 %add.ptr10.idx |
| %4 = bitcast i8* %add.ptr10 to i64* |
| %5 = load i64, i64* %4, align 8 |
| ; Load 4: p + i + base2 + 3*offset. |
| %add.ptr11.idx = add i64 %add2, %mul |
| %add.ptr11 = getelementptr inbounds i8, i8* %p, i64 %add.ptr11.idx |
| %6 = bitcast i8* %add.ptr11 to i64* |
| %7 = load i64, i64* %6, align 8 |
| ; Multiply the four loaded values together and add the product to the sum. |
| %mul12 = mul i64 %3, %1 |
| %mul13 = mul i64 %mul12, %5 |
| %mul14 = mul i64 %mul13, %7 |
| %add15 = add i64 %mul14, %sum.046 |
| ; Advance i and leave the loop once it reaches %n. |
| %inc = add nuw nsw i64 %i.045, 1 |
| %exitcond.not = icmp eq i64 %inc, %n |
| br i1 %exitcond.not, label %for.cond.cleanup, label %for.body |
| } |
| ; |
| ; Check chain commoning can reduce register pressure to save register spill/reload. |
| ; |
| ; int spill_reduce_succ(double *input1, double *input2, double *output, long long m, long long inc1, long long inc2, long long inc3, long long inc4, long long inc) { |
| ; inc = inc4; |
| ; #pragma unroll 4 |
| ; for (long long i = 0; i < 4 * m; i++) { |
| ; output[inc + inc1] += input1[inc + inc1] * input2[inc + inc1]; |
| ; output[inc + inc2] += input1[inc + inc2] * input2[inc + inc2]; |
| ; output[inc + inc3] += input1[inc + inc3] * input2[inc + inc3]; |
| ; inc = inc + inc4; |
| ; } |
| ; return 0; |
| ; } |
| ; |
| define signext i32 @spill_reduce_succ(double* %input1, double* %input2, double* %output, i64 %m, i64 %inc1, i64 %inc2, i64 %inc3, i64 %inc4, i64 %inc) { |
| ; CHECK-LABEL: spill_reduce_succ: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: cmpdi r6, 1 |
| ; CHECK-NEXT: std r14, -144(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: std r15, -136(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: std r16, -128(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: std r17, -120(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: std r18, -112(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: std r19, -104(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: std r20, -96(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: std r21, -88(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: std r22, -80(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: std r23, -72(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: std r24, -64(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: std r25, -56(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: std r31, -8(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: std r2, -152(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: std r9, -160(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: std r8, -176(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: std r7, -168(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: blt cr0, .LBB7_7 |
| ; CHECK-NEXT: # %bb.1: # %for.body.preheader |
| ; CHECK-NEXT: sldi r6, r6, 2 |
| ; CHECK-NEXT: li r7, 1 |
| ; CHECK-NEXT: mr r12, r10 |
| ; CHECK-NEXT: cmpdi r6, 1 |
| ; CHECK-NEXT: iselgt r7, r6, r7 |
| ; CHECK-NEXT: addi r8, r7, -1 |
| ; CHECK-NEXT: clrldi r6, r7, 63 |
| ; CHECK-NEXT: cmpldi r8, 3 |
| ; CHECK-NEXT: blt cr0, .LBB7_4 |
| ; CHECK-NEXT: # %bb.2: # %for.body.preheader.new |
| ; CHECK-NEXT: rldicl r7, r7, 62, 2 |
| ; CHECK-NEXT: sldi r10, r12, 2 |
| ; CHECK-NEXT: ld r2, -168(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: rldicl r7, r7, 2, 1 |
| ; CHECK-NEXT: std r7, -184(r1) # 8-byte Folded Spill |
| ; CHECK-NEXT: ld r7, -160(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: add r8, r7, r10 |
| ; CHECK-NEXT: mr r22, r7 |
| ; CHECK-NEXT: mr r7, r4 |
| ; CHECK-NEXT: mr r4, r3 |
| ; CHECK-NEXT: ld r3, -176(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: sldi r8, r8, 3 |
| ; CHECK-NEXT: add r9, r5, r8 |
| ; CHECK-NEXT: add r8, r3, r10 |
| ; CHECK-NEXT: add r10, r2, r10 |
| ; CHECK-NEXT: sldi r10, r10, 3 |
| ; CHECK-NEXT: sldi r8, r8, 3 |
| ; CHECK-NEXT: add r30, r5, r10 |
| ; CHECK-NEXT: add r29, r7, r10 |
| ; CHECK-NEXT: add r28, r4, r10 |
| ; CHECK-NEXT: sldi r10, r12, 1 |
| ; CHECK-NEXT: add r8, r5, r8 |
| ; CHECK-NEXT: add r11, r12, r10 |
| ; CHECK-NEXT: add r0, r22, r11 |
| ; CHECK-NEXT: sldi r0, r0, 3 |
| ; CHECK-NEXT: add r27, r5, r0 |
| ; CHECK-NEXT: add r0, r3, r11 |
| ; CHECK-NEXT: add r11, r2, r11 |
| ; CHECK-NEXT: sldi r11, r11, 3 |
| ; CHECK-NEXT: sldi r0, r0, 3 |
| ; CHECK-NEXT: add r25, r5, r11 |
| ; CHECK-NEXT: add r24, r7, r11 |
| ; CHECK-NEXT: add r23, r4, r11 |
| ; CHECK-NEXT: add r11, r22, r10 |
| ; CHECK-NEXT: add r26, r5, r0 |
| ; CHECK-NEXT: mr r0, r22 |
| ; CHECK-NEXT: sldi r11, r11, 3 |
| ; CHECK-NEXT: add r22, r5, r11 |
| ; CHECK-NEXT: add r11, r3, r10 |
| ; CHECK-NEXT: add r10, r2, r10 |
| ; CHECK-NEXT: sldi r10, r10, 3 |
| ; CHECK-NEXT: sldi r11, r11, 3 |
| ; CHECK-NEXT: add r20, r5, r10 |
| ; CHECK-NEXT: add r19, r7, r10 |
| ; CHECK-NEXT: add r18, r4, r10 |
| ; CHECK-NEXT: add r10, r12, r0 |
| ; CHECK-NEXT: add r21, r5, r11 |
| ; CHECK-NEXT: sldi r11, r2, 3 |
| ; CHECK-NEXT: sldi r10, r10, 3 |
| ; CHECK-NEXT: add r17, r5, r10 |
| ; CHECK-NEXT: add r10, r12, r3 |
| ; CHECK-NEXT: sldi r10, r10, 3 |
| ; CHECK-NEXT: add r16, r5, r10 |
| ; CHECK-NEXT: add r10, r12, r2 |
| ; CHECK-NEXT: sldi r10, r10, 3 |
| ; CHECK-NEXT: add r15, r5, r10 |
| ; CHECK-NEXT: add r14, r7, r10 |
| ; CHECK-NEXT: add r31, r4, r10 |
| ; CHECK-NEXT: sldi r10, r3, 3 |
| ; CHECK-NEXT: mr r3, r4 |
| ; CHECK-NEXT: mr r4, r7 |
| ; CHECK-NEXT: ld r7, -160(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: sub r0, r10, r11 |
| ; CHECK-NEXT: sldi r10, r7, 3 |
| ; CHECK-NEXT: ld r7, -184(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: sub r2, r10, r11 |
| ; CHECK-NEXT: li r11, 0 |
| ; CHECK-NEXT: mr r10, r12 |
| ; CHECK-NEXT: addi r7, r7, -4 |
| ; CHECK-NEXT: rldicl r7, r7, 62, 2 |
| ; CHECK-NEXT: addi r7, r7, 1 |
| ; CHECK-NEXT: mtctr r7 |
| ; CHECK-NEXT: sldi r7, r12, 5 |
| ; CHECK-NEXT: .p2align 4 |
| ; CHECK-NEXT: .LBB7_3: # %for.body |
| ; CHECK-NEXT: # |
| ; CHECK-NEXT: lfd f0, 0(r31) |
| ; CHECK-NEXT: lfd f1, 0(r14) |
| ; CHECK-NEXT: add r10, r10, r12 |
| ; CHECK-NEXT: add r10, r10, r12 |
| ; CHECK-NEXT: xsmuldp f0, f0, f1 |
| ; CHECK-NEXT: lfd f1, 0(r15) |
| ; CHECK-NEXT: add r10, r10, r12 |
| ; CHECK-NEXT: add r10, r10, r12 |
| ; CHECK-NEXT: xsadddp f0, f1, f0 |
| ; CHECK-NEXT: stfd f0, 0(r15) |
| ; CHECK-NEXT: add r15, r15, r7 |
| ; CHECK-NEXT: lfdx f0, r31, r0 |
| ; CHECK-NEXT: lfdx f1, r14, r0 |
| ; CHECK-NEXT: xsmuldp f0, f0, f1 |
| ; CHECK-NEXT: lfdx f1, r16, r11 |
| ; CHECK-NEXT: xsadddp f0, f1, f0 |
| ; CHECK-NEXT: stfdx f0, r16, r11 |
| ; CHECK-NEXT: lfdx f0, r31, r2 |
| ; CHECK-NEXT: lfdx f1, r14, r2 |
| ; CHECK-NEXT: add r31, r31, r7 |
| ; CHECK-NEXT: add r14, r14, r7 |
| ; CHECK-NEXT: xsmuldp f0, f0, f1 |
| ; CHECK-NEXT: lfdx f1, r17, r11 |
| ; CHECK-NEXT: xsadddp f0, f1, f0 |
| ; CHECK-NEXT: stfdx f0, r17, r11 |
| ; CHECK-NEXT: lfd f0, 0(r18) |
| ; CHECK-NEXT: lfd f1, 0(r19) |
| ; CHECK-NEXT: xsmuldp f0, f0, f1 |
| ; CHECK-NEXT: lfdx f1, r20, r11 |
| ; CHECK-NEXT: xsadddp f0, f1, f0 |
| ; CHECK-NEXT: stfdx f0, r20, r11 |
| ; CHECK-NEXT: lfdx f0, r18, r0 |
| ; CHECK-NEXT: lfdx f1, r19, r0 |
| ; CHECK-NEXT: xsmuldp f0, f0, f1 |
| ; CHECK-NEXT: lfdx f1, r21, r11 |
| ; CHECK-NEXT: xsadddp f0, f1, f0 |
| ; CHECK-NEXT: stfdx f0, r21, r11 |
| ; CHECK-NEXT: lfdx f0, r18, r2 |
| ; CHECK-NEXT: lfdx f1, r19, r2 |
| ; CHECK-NEXT: add r18, r18, r7 |
| ; CHECK-NEXT: add r19, r19, r7 |
| ; CHECK-NEXT: xsmuldp f0, f0, f1 |
| ; CHECK-NEXT: lfdx f1, r22, r11 |
| ; CHECK-NEXT: xsadddp f0, f1, f0 |
| ; CHECK-NEXT: stfdx f0, r22, r11 |
| ; CHECK-NEXT: lfd f0, 0(r23) |
| ; CHECK-NEXT: lfd f1, 0(r24) |
| ; CHECK-NEXT: xsmuldp f0, f0, f1 |
| ; CHECK-NEXT: lfdx f1, r25, r11 |
| ; CHECK-NEXT: xsadddp f0, f1, f0 |
| ; CHECK-NEXT: stfdx f0, r25, r11 |
| ; CHECK-NEXT: lfdx f0, r23, r0 |
| ; CHECK-NEXT: lfdx f1, r24, r0 |
| ; CHECK-NEXT: xsmuldp f0, f0, f1 |
| ; CHECK-NEXT: lfdx f1, r26, r11 |
| ; CHECK-NEXT: xsadddp f0, f1, f0 |
| ; CHECK-NEXT: stfdx f0, r26, r11 |
| ; CHECK-NEXT: lfdx f0, r23, r2 |
| ; CHECK-NEXT: lfdx f1, r24, r2 |
| ; CHECK-NEXT: add r23, r23, r7 |
| ; CHECK-NEXT: add r24, r24, r7 |
| ; CHECK-NEXT: xsmuldp f0, f0, f1 |
| ; CHECK-NEXT: lfdx f1, r27, r11 |
| ; CHECK-NEXT: xsadddp f0, f1, f0 |
| ; CHECK-NEXT: stfdx f0, r27, r11 |
| ; CHECK-NEXT: lfd f0, 0(r28) |
| ; CHECK-NEXT: lfd f1, 0(r29) |
| ; CHECK-NEXT: xsmuldp f0, f0, f1 |
| ; CHECK-NEXT: lfdx f1, r30, r11 |
| ; CHECK-NEXT: xsadddp f0, f1, f0 |
| ; CHECK-NEXT: stfdx f0, r30, r11 |
| ; CHECK-NEXT: lfdx f0, r28, r0 |
| ; CHECK-NEXT: lfdx f1, r29, r0 |
| ; CHECK-NEXT: xsmuldp f0, f0, f1 |
| ; CHECK-NEXT: lfdx f1, r8, r11 |
| ; CHECK-NEXT: xsadddp f0, f1, f0 |
| ; CHECK-NEXT: stfdx f0, r8, r11 |
| ; CHECK-NEXT: lfdx f0, r28, r2 |
| ; CHECK-NEXT: lfdx f1, r29, r2 |
| ; CHECK-NEXT: add r28, r28, r7 |
| ; CHECK-NEXT: add r29, r29, r7 |
| ; CHECK-NEXT: xsmuldp f0, f0, f1 |
| ; CHECK-NEXT: lfdx f1, r9, r11 |
| ; CHECK-NEXT: xsadddp f0, f1, f0 |
| ; CHECK-NEXT: stfdx f0, r9, r11 |
| ; CHECK-NEXT: add r11, r11, r7 |
| ; CHECK-NEXT: bdnz .LBB7_3 |
| ; CHECK-NEXT: .LBB7_4: # %for.cond.cleanup.loopexit.unr-lcssa |
| ; CHECK-NEXT: cmpldi r6, 0 |
| ; CHECK-NEXT: beq cr0, .LBB7_7 |
| ; CHECK-NEXT: # %bb.5: # %for.body.epil.preheader |
| ; CHECK-NEXT: sldi r8, r12, 3 |
| ; CHECK-NEXT: ld r12, -176(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: ld r7, -160(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: add r12, r10, r12 |
| ; CHECK-NEXT: add r7, r10, r7 |
| ; CHECK-NEXT: sldi r0, r12, 3 |
| ; CHECK-NEXT: sldi r11, r7, 3 |
| ; CHECK-NEXT: add r12, r5, r0 |
| ; CHECK-NEXT: add r30, r4, r0 |
| ; CHECK-NEXT: add r29, r3, r0 |
| ; CHECK-NEXT: ld r0, -168(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: add r7, r5, r11 |
| ; CHECK-NEXT: add r9, r4, r11 |
| ; CHECK-NEXT: add r11, r3, r11 |
| ; CHECK-NEXT: add r10, r10, r0 |
| ; CHECK-NEXT: sldi r10, r10, 3 |
| ; CHECK-NEXT: add r5, r5, r10 |
| ; CHECK-NEXT: add r4, r4, r10 |
| ; CHECK-NEXT: add r3, r3, r10 |
| ; CHECK-NEXT: li r10, 0 |
| ; CHECK-NEXT: .p2align 4 |
| ; CHECK-NEXT: .LBB7_6: # %for.body.epil |
| ; CHECK-NEXT: # |
| ; CHECK-NEXT: lfdx f0, r3, r10 |
| ; CHECK-NEXT: lfdx f1, r4, r10 |
| ; CHECK-NEXT: addi r6, r6, -1 |
| ; CHECK-NEXT: cmpldi r6, 0 |
| ; CHECK-NEXT: xsmuldp f0, f0, f1 |
| ; CHECK-NEXT: lfd f1, 0(r5) |
| ; CHECK-NEXT: xsadddp f0, f1, f0 |
| ; CHECK-NEXT: stfd f0, 0(r5) |
| ; CHECK-NEXT: add r5, r5, r8 |
| ; CHECK-NEXT: lfdx f0, r29, r10 |
| ; CHECK-NEXT: lfdx f1, r30, r10 |
| ; CHECK-NEXT: xsmuldp f0, f0, f1 |
| ; CHECK-NEXT: lfdx f1, r12, r10 |
| ; CHECK-NEXT: xsadddp f0, f1, f0 |
| ; CHECK-NEXT: stfdx f0, r12, r10 |
| ; CHECK-NEXT: lfdx f0, r11, r10 |
| ; CHECK-NEXT: lfdx f1, r9, r10 |
| ; CHECK-NEXT: xsmuldp f0, f0, f1 |
| ; CHECK-NEXT: lfdx f1, r7, r10 |
| ; CHECK-NEXT: xsadddp f0, f1, f0 |
| ; CHECK-NEXT: stfdx f0, r7, r10 |
| ; CHECK-NEXT: add r10, r10, r8 |
| ; CHECK-NEXT: bne cr0, .LBB7_6 |
| ; CHECK-NEXT: .LBB7_7: # %for.cond.cleanup |
| ; CHECK-NEXT: ld r2, -152(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: ld r31, -8(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: li r3, 0 |
| ; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: ld r27, -40(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: ld r26, -48(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: ld r25, -56(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: ld r24, -64(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: ld r23, -72(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: ld r22, -80(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: ld r21, -88(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: ld r20, -96(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: ld r19, -104(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: ld r18, -112(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: ld r17, -120(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: ld r16, -128(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: ld r15, -136(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: ld r14, -144(r1) # 8-byte Folded Reload |
| ; CHECK-NEXT: blr |
| entry: |
| %cmp49 = icmp sgt i64 %m, 0 |
| br i1 %cmp49, label %for.body.preheader, label %for.cond.cleanup |
| |
| for.body.preheader: ; preds = %entry |
| %0 = shl i64 %m, 2 |
| %smax52 = call i64 @llvm.smax.i64(i64 %0, i64 1) |
| %1 = add nsw i64 %smax52, -1 |
| %xtraiter = and i64 %smax52, 1 |
| %2 = icmp ult i64 %1, 3 |
| br i1 %2, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new |
| |
| for.body.preheader.new: ; preds = %for.body.preheader |
| %unroll_iter = and i64 %smax52, 9223372036854775804 |
| br label %for.body |
| |
| for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader |
| %inc.addr.050.unr = phi i64 [ %inc4, %for.body.preheader ], [ %add23.3, %for.body ] |
| %lcmp.mod.not = icmp eq i64 %xtraiter, 0 |
| br i1 %lcmp.mod.not, label %for.cond.cleanup, label %for.body.epil |
| |
| for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil |
| %inc.addr.050.epil = phi i64 [ %add23.epil, %for.body.epil ], [ %inc.addr.050.unr, %for.cond.cleanup.loopexit.unr-lcssa ] |
| %epil.iter = phi i64 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ] |
| %add.epil = add nsw i64 %inc.addr.050.epil, %inc1 |
| %arrayidx.epil = getelementptr inbounds double, double* %input1, i64 %add.epil |
| %3 = load double, double* %arrayidx.epil, align 8 |
| %arrayidx2.epil = getelementptr inbounds double, double* %input2, i64 %add.epil |
| %4 = load double, double* %arrayidx2.epil, align 8 |
| %mul3.epil = fmul double %3, %4 |
| %arrayidx5.epil = getelementptr inbounds double, double* %output, i64 %add.epil |
| %5 = load double, double* %arrayidx5.epil, align 8 |
| %add6.epil = fadd double %5, %mul3.epil |
| store double %add6.epil, double* %arrayidx5.epil, align 8 |
| %add7.epil = add nsw i64 %inc.addr.050.epil, %inc2 |
| %arrayidx8.epil = getelementptr inbounds double, double* %input1, i64 %add7.epil |
| %6 = load double, double* %arrayidx8.epil, align 8 |
| %arrayidx10.epil = getelementptr inbounds double, double* %input2, i64 %add7.epil |
| %7 = load double, double* %arrayidx10.epil, align 8 |
| %mul11.epil = fmul double %6, %7 |
| %arrayidx13.epil = getelementptr inbounds double, double* %output, i64 %add7.epil |
| %8 = load double, double* %arrayidx13.epil, align 8 |
| %add14.epil = fadd double %8, %mul11.epil |
| store double %add14.epil, double* %arrayidx13.epil, align 8 |
| %add15.epil = add nsw i64 %inc.addr.050.epil, %inc3 |
| %arrayidx16.epil = getelementptr inbounds double, double* %input1, i64 %add15.epil |
| %9 = load double, double* %arrayidx16.epil, align 8 |
| %arrayidx18.epil = getelementptr inbounds double, double* %input2, i64 %add15.epil |
| %10 = load double, double* %arrayidx18.epil, align 8 |
| %mul19.epil = fmul double %9, %10 |
| %arrayidx21.epil = getelementptr inbounds double, double* %output, i64 %add15.epil |
| %11 = load double, double* %arrayidx21.epil, align 8 |
| %add22.epil = fadd double %11, %mul19.epil |
| store double %add22.epil, double* %arrayidx21.epil, align 8 |
| %add23.epil = add nsw i64 %inc.addr.050.epil, %inc4 |
| %epil.iter.sub = add nsw i64 %epil.iter, -1 |
| %epil.iter.cmp.not = icmp eq i64 %epil.iter.sub, 0 |
| br i1 %epil.iter.cmp.not, label %for.cond.cleanup, label %for.body.epil |
| |
| for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry |
| ret i32 0 |
| |
| for.body: ; preds = %for.body, %for.body.preheader.new |
| %inc.addr.050 = phi i64 [ %inc4, %for.body.preheader.new ], [ %add23.3, %for.body ] |
| %niter = phi i64 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ] |
| %add = add nsw i64 %inc.addr.050, %inc1 |
| %arrayidx = getelementptr inbounds double, double* %input1, i64 %add |
| %12 = load double, double* %arrayidx, align 8 |
| %arrayidx2 = getelementptr inbounds double, double* %input2, i64 %add |
| %13 = load double, double* %arrayidx2, align 8 |
| %mul3 = fmul double %12, %13 |
| %arrayidx5 = getelementptr inbounds double, double* %output, i64 %add |
| %14 = load double, double* %arrayidx5, align 8 |
| %add6 = fadd double %14, %mul3 |
| store double %add6, double* %arrayidx5, align 8 |
| %add7 = add nsw i64 %inc.addr.050, %inc2 |
| %arrayidx8 = getelementptr inbounds double, double* %input1, i64 %add7 |
| %15 = load double, double* %arrayidx8, align 8 |
| %arrayidx10 = getelementptr inbounds double, double* %input2, i64 %add7 |
| %16 = load double, double* %arrayidx10, align 8 |
| %mul11 = fmul double %15, %16 |
| %arrayidx13 = getelementptr inbounds double, double* %output, i64 %add7 |
| %17 = load double, double* %arrayidx13, align 8 |
| %add14 = fadd double %17, %mul11 |
| store double %add14, double* %arrayidx13, align 8 |
| %add15 = add nsw i64 %inc.addr.050, %inc3 |
| %arrayidx16 = getelementptr inbounds double, double* %input1, i64 %add15 |
| %18 = load double, double* %arrayidx16, align 8 |
| %arrayidx18 = getelementptr inbounds double, double* %input2, i64 %add15 |
| %19 = load double, double* %arrayidx18, align 8 |
| %mul19 = fmul double %18, %19 |
| %arrayidx21 = getelementptr inbounds double, double* %output, i64 %add15 |
| %20 = load double, double* %arrayidx21, align 8 |
| %add22 = fadd double %20, %mul19 |
| store double %add22, double* %arrayidx21, align 8 |
| %add23 = add nsw i64 %inc.addr.050, %inc4 |
| %add.1 = add nsw i64 %add23, %inc1 |
| %arrayidx.1 = getelementptr inbounds double, double* %input1, i64 %add.1 |
| %21 = load double, double* %arrayidx.1, align 8 |
| %arrayidx2.1 = getelementptr inbounds double, double* %input2, i64 %add.1 |
| %22 = load double, double* %arrayidx2.1, align 8 |
| %mul3.1 = fmul double %21, %22 |
| %arrayidx5.1 = getelementptr inbounds double, double* %output, i64 %add.1 |
| %23 = load double, double* %arrayidx5.1, align 8 |
| %add6.1 = fadd double %23, %mul3.1 |
| store double %add6.1, double* %arrayidx5.1, align 8 |
| %add7.1 = add nsw i64 %add23, %inc2 |
| %arrayidx8.1 = getelementptr inbounds double, double* %input1, i64 %add7.1 |
| %24 = load double, double* %arrayidx8.1, align 8 |
| %arrayidx10.1 = getelementptr inbounds double, double* %input2, i64 %add7.1 |
| %25 = load double, double* %arrayidx10.1, align 8 |
| %mul11.1 = fmul double %24, %25 |
| %arrayidx13.1 = getelementptr inbounds double, double* %output, i64 %add7.1 |
| %26 = load double, double* %arrayidx13.1, align 8 |
| %add14.1 = fadd double %26, %mul11.1 |
| store double %add14.1, double* %arrayidx13.1, align 8 |
| %add15.1 = add nsw i64 %add23, %inc3 |
| %arrayidx16.1 = getelementptr inbounds double, double* %input1, i64 %add15.1 |
| %27 = load double, double* %arrayidx16.1, align 8 |
| %arrayidx18.1 = getelementptr inbounds double, double* %input2, i64 %add15.1 |
| %28 = load double, double* %arrayidx18.1, align 8 |
| %mul19.1 = fmul double %27, %28 |
| %arrayidx21.1 = getelementptr inbounds double, double* %output, i64 %add15.1 |
| %29 = load double, double* %arrayidx21.1, align 8 |
| %add22.1 = fadd double %29, %mul19.1 |
| store double %add22.1, double* %arrayidx21.1, align 8 |
| %add23.1 = add nsw i64 %add23, %inc4 |
| %add.2 = add nsw i64 %add23.1, %inc1 |
| %arrayidx.2 = getelementptr inbounds double, double* %input1, i64 %add.2 |
| %30 = load double, double* %arrayidx.2, align 8 |
| %arrayidx2.2 = getelementptr inbounds double, double* %input2, i64 %add.2 |
| %31 = load double, double* %arrayidx2.2, align 8 |
| %mul3.2 = fmul double %30, %31 |
| %arrayidx5.2 = getelementptr inbounds double, double* %output, i64 %add.2 |
| %32 = load double, double* %arrayidx5.2, align 8 |
| %add6.2 = fadd double %32, %mul3.2 |
| store double %add6.2, double* %arrayidx5.2, align 8 |
| %add7.2 = add nsw i64 %add23.1, %inc2 |
| %arrayidx8.2 = getelementptr inbounds double, double* %input1, i64 %add7.2 |
| %33 = load double, double* %arrayidx8.2, align 8 |
| %arrayidx10.2 = getelementptr inbounds double, double* %input2, i64 %add7.2 |
| %34 = load double, double* %arrayidx10.2, align 8 |
| %mul11.2 = fmul double %33, %34 |
| %arrayidx13.2 = getelementptr inbounds double, double* %output, i64 %add7.2 |
| %35 = load double, double* %arrayidx13.2, align 8 |
| %add14.2 = fadd double %35, %mul11.2 |
| store double %add14.2, double* %arrayidx13.2, align 8 |
| %add15.2 = add nsw i64 %add23.1, %inc3 |
| %arrayidx16.2 = getelementptr inbounds double, double* %input1, i64 %add15.2 |
| %36 = load double, double* %arrayidx16.2, align 8 |
| %arrayidx18.2 = getelementptr inbounds double, double* %input2, i64 %add15.2 |
| %37 = load double, double* %arrayidx18.2, align 8 |
| %mul19.2 = fmul double %36, %37 |
| %arrayidx21.2 = getelementptr inbounds double, double* %output, i64 %add15.2 |
| %38 = load double, double* %arrayidx21.2, align 8 |
| %add22.2 = fadd double %38, %mul19.2 |
| store double %add22.2, double* %arrayidx21.2, align 8 |
| %add23.2 = add nsw i64 %add23.1, %inc4 |
| %add.3 = add nsw i64 %add23.2, %inc1 |
| %arrayidx.3 = getelementptr inbounds double, double* %input1, i64 %add.3 |
| %39 = load double, double* %arrayidx.3, align 8 |
| %arrayidx2.3 = getelementptr inbounds double, double* %input2, i64 %add.3 |
| %40 = load double, double* %arrayidx2.3, align 8 |
| %mul3.3 = fmul double %39, %40 |
| %arrayidx5.3 = getelementptr inbounds double, double* %output, i64 %add.3 |
| %41 = load double, double* %arrayidx5.3, align 8 |
| %add6.3 = fadd double %41, %mul3.3 |
| store double %add6.3, double* %arrayidx5.3, align 8 |
| %add7.3 = add nsw i64 %add23.2, %inc2 |
| %arrayidx8.3 = getelementptr inbounds double, double* %input1, i64 %add7.3 |
| %42 = load double, double* %arrayidx8.3, align 8 |
| %arrayidx10.3 = getelementptr inbounds double, double* %input2, i64 %add7.3 |
| %43 = load double, double* %arrayidx10.3, align 8 |
| %mul11.3 = fmul double %42, %43 |
| %arrayidx13.3 = getelementptr inbounds double, double* %output, i64 %add7.3 |
| %44 = load double, double* %arrayidx13.3, align 8 |
| %add14.3 = fadd double %44, %mul11.3 |
| store double %add14.3, double* %arrayidx13.3, align 8 |
| %add15.3 = add nsw i64 %add23.2, %inc3 |
| %arrayidx16.3 = getelementptr inbounds double, double* %input1, i64 %add15.3 |
| %45 = load double, double* %arrayidx16.3, align 8 |
| %arrayidx18.3 = getelementptr inbounds double, double* %input2, i64 %add15.3 |
| %46 = load double, double* %arrayidx18.3, align 8 |
| %mul19.3 = fmul double %45, %46 |
| %arrayidx21.3 = getelementptr inbounds double, double* %output, i64 %add15.3 |
| %47 = load double, double* %arrayidx21.3, align 8 |
| %add22.3 = fadd double %47, %mul19.3 |
| store double %add22.3, double* %arrayidx21.3, align 8 |
| %add23.3 = add nsw i64 %add23.2, %inc4 |
| %niter.nsub.3 = add i64 %niter, -4 |
| %niter.ncmp.3.not = icmp eq i64 %niter.nsub.3, 0 |
| br i1 %niter.ncmp.3.not, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body |
| } |
| |
| ; Signed-maximum intrinsic; for.body.preheader calls it as smax(%m << 2, 1) to form %smax52. |
| declare i64 @llvm.smax.i64(i64, i64) |
| |