; blob: e55c8060b73ef7830e2987bf2cba5c20be73e4b5 [file] [edit]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=aarch64-linux-gnu -o - -O3 %s | FileCheck %s
; Under high register pressure, the greedy register allocator may emit long
; chains of mov/spill copies on AArch64. The Spill Copy Elimination pass can
; simplify these chains and improve runtime performance. This test covers a
; situation where such chains are likely and ensures that the pass is
; simplifying the register allocator's output.
; @_test: reduced loop-nest kernel used as llc input.
;   * Takes sixteen read-only pointer arguments (%0-%15).
;   * Each inner iteration loads one float through every argument, loads six
;     floats from local 5x5x5 stack arrays, and stores five sums of products
;     into five other local stack arrays.
;   * The outermost loop branches back unconditionally, so the function has no
;     return instruction.
; The assertion lines that follow the signature are autogenerated (see the NOTE
; at the top of the file); regenerate them with the update script instead of
; editing them by hand.
define void @_test(ptr readonly nocapture %0, ptr readonly nocapture %1, ptr readonly nocapture %2, ptr readonly nocapture %3, ptr readonly nocapture %4, ptr readonly nocapture %5, ptr readonly nocapture %6, ptr readonly nocapture %7, ptr readonly nocapture %8, ptr readonly nocapture %9, ptr readonly nocapture %10, ptr readonly nocapture %11, ptr readonly nocapture %12, ptr readonly nocapture %13, ptr readonly nocapture %14, ptr readonly nocapture %15) #0 {
; CHECK-LABEL: _test:
; CHECK: // %bb.0:
; CHECK-NEXT: str x30, [sp, #-96]! // 8-byte Folded Spill
; CHECK-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
; CHECK-NEXT: sub sp, sp, #3648
; CHECK-NEXT: ldr x9, [sp, #7896]
; CHECK-NEXT: ldr x10, [sp, #7888]
; CHECK-NEXT: add x17, sp, #1, lsl #12 // =4096
; CHECK-NEXT: ldr x11, [sp, #7880]
; CHECK-NEXT: ldr x12, [sp, #7872]
; CHECK-NEXT: add x18, sp, #1, lsl #12 // =4096
; CHECK-NEXT: ldr x13, [sp, #7864]
; CHECK-NEXT: ldr x14, [sp, #7856]
; CHECK-NEXT: add x19, sp, #1, lsl #12 // =4096
; CHECK-NEXT: ldr x15, [sp, #7848]
; CHECK-NEXT: ldr x16, [sp, #7840]
; CHECK-NEXT: add x20, sp, #1, lsl #12 // =4096
; CHECK-NEXT: add x21, sp, #1, lsl #12 // =4096
; CHECK-NEXT: add x22, sp, #1, lsl #12 // =4096
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add x17, x17, #3148
; CHECK-NEXT: add x18, x18, #2148
; CHECK-NEXT: add x19, x19, #1648
; CHECK-NEXT: add x20, x20, #2648
; CHECK-NEXT: add x21, x21, #1148
; CHECK-NEXT: add x22, x22, #148
; CHECK-NEXT: add x23, sp, #3744
; CHECK-NEXT: add x24, sp, #3244
; CHECK-NEXT: add x25, sp, #2244
; CHECK-NEXT: add x26, sp, #1744
; CHECK-NEXT: add x27, sp, #244
; CHECK-NEXT: mov w28, #1 // =0x1
; CHECK-NEXT: .LBB0_1: // %.preheader167
; CHECK-NEXT: // =>This Loop Header: Depth=1
; CHECK-NEXT: // Child Loop BB0_2 Depth 2
; CHECK-NEXT: // Child Loop BB0_3 Depth 3
; CHECK-NEXT: stp x28, x27, [sp, #8] // 16-byte Folded Spill
; CHECK-NEXT: mov x30, x0
; CHECK-NEXT: mov x28, x18
; CHECK-NEXT: stp x0, x1, [sp, #168] // 16-byte Folded Spill
; CHECK-NEXT: mov x0, x9
; CHECK-NEXT: stp x18, x17, [sp, #88] // 16-byte Folded Spill
; CHECK-NEXT: mov x18, x17
; CHECK-NEXT: mov w17, #1 // =0x1
; CHECK-NEXT: stp x2, x3, [sp, #184] // 16-byte Folded Spill
; CHECK-NEXT: stp x4, x5, [sp, #200] // 16-byte Folded Spill
; CHECK-NEXT: stp x6, x7, [sp, #216] // 16-byte Folded Spill
; CHECK-NEXT: stp x16, x15, [sp, #104] // 16-byte Folded Spill
; CHECK-NEXT: stp x14, x13, [sp, #120] // 16-byte Folded Spill
; CHECK-NEXT: stp x12, x11, [sp, #136] // 16-byte Folded Spill
; CHECK-NEXT: stp x10, x9, [sp, #152] // 16-byte Folded Spill
; CHECK-NEXT: stp x26, x25, [sp, #24] // 16-byte Folded Spill
; CHECK-NEXT: stp x24, x23, [sp, #40] // 16-byte Folded Spill
; CHECK-NEXT: stp x22, x21, [sp, #56] // 16-byte Folded Spill
; CHECK-NEXT: stp x20, x19, [sp, #72] // 16-byte Folded Spill
; CHECK-NEXT: str x17, [sp, #232] // 8-byte Spill
; CHECK-NEXT: .LBB0_2: // %.preheader166
; CHECK-NEXT: // Parent Loop BB0_1 Depth=1
; CHECK-NEXT: // => This Loop Header: Depth=2
; CHECK-NEXT: // Child Loop BB0_3 Depth 3
; CHECK-NEXT: mov x9, #-500 // =0xfffffffffffffe0c
; CHECK-NEXT: mov x17, x28
; CHECK-NEXT: str x20, [sp, #7752] // 8-byte Spill
; CHECK-NEXT: .LBB0_3: // Parent Loop BB0_1 Depth=1
; CHECK-NEXT: // Parent Loop BB0_2 Depth=2
; CHECK-NEXT: // => This Inner Loop Header: Depth=3
; CHECK-NEXT: add x28, x26, x9
; CHECK-NEXT: ldr s1, [x5, x9]
; CHECK-NEXT: ldr s3, [x14, x9]
; CHECK-NEXT: ldr s2, [x28, #500]
; CHECK-NEXT: ldr s0, [x8]
; CHECK-NEXT: add x28, x27, x9
; CHECK-NEXT: ldr s6, [x30, x9]
; CHECK-NEXT: ldr s7, [x28, #500]
; CHECK-NEXT: add x28, x25, x9
; CHECK-NEXT: fmul s4, s1, s2
; CHECK-NEXT: fmul s5, s3, s2
; CHECK-NEXT: ldr s16, [x1, x9]
; CHECK-NEXT: fmul s2, s0, s2
; CHECK-NEXT: ldr s17, [x3, x9]
; CHECK-NEXT: ldr s18, [x28, #500]
; CHECK-NEXT: add x28, x24, x9
; CHECK-NEXT: ldr s19, [x6, x9]
; CHECK-NEXT: mov x20, x21
; CHECK-NEXT: ldr s21, [x28, #500]
; CHECK-NEXT: add x28, x23, x9
; CHECK-NEXT: fmadd s4, s6, s7, s4
; CHECK-NEXT: ldr s6, [x4, x9]
; CHECK-NEXT: fmadd s5, s16, s7, s5
; CHECK-NEXT: fmadd s1, s1, s7, s2
; CHECK-NEXT: fmadd s20, s6, s7, s2
; CHECK-NEXT: fmadd s2, s17, s7, s2
; CHECK-NEXT: ldr s7, [x15, x9]
; CHECK-NEXT: fmadd s4, s16, s18, s4
; CHECK-NEXT: fmadd s5, s19, s18, s5
; CHECK-NEXT: ldr s16, [x16, x9]
; CHECK-NEXT: fmul s19, s0, s21
; CHECK-NEXT: fmadd s1, s3, s18, s1
; CHECK-NEXT: fmadd s3, s7, s18, s20
; CHECK-NEXT: fmadd s2, s16, s18, s2
; CHECK-NEXT: fmadd s4, s6, s21, s4
; CHECK-NEXT: fmadd s5, s7, s21, s5
; CHECK-NEXT: ldr s6, [x0, x9]
; CHECK-NEXT: fadd s1, s1, s19
; CHECK-NEXT: ldr s7, [x28, #500]
; CHECK-NEXT: add x28, x22, x9
; CHECK-NEXT: fadd s3, s3, s19
; CHECK-NEXT: fmadd s2, s6, s21, s2
; CHECK-NEXT: ldr s18, [x28, #500]
; CHECK-NEXT: add x28, x21, x9
; CHECK-NEXT: mov x21, x8
; CHECK-NEXT: ldr x8, [sp, #7752] // 8-byte Reload
; CHECK-NEXT: fmadd s4, s17, s7, s4
; CHECK-NEXT: fmadd s5, s16, s7, s5
; CHECK-NEXT: ldr s16, [x10, x9]
; CHECK-NEXT: fmadd s0, s0, s7, s1
; CHECK-NEXT: ldr s17, [x2, x9]
; CHECK-NEXT: ldr s1, [x7, x9]
; CHECK-NEXT: fmadd s3, s6, s7, s3
; CHECK-NEXT: ldr s6, [x11, x9]
; CHECK-NEXT: fmadd s2, s16, s7, s2
; CHECK-NEXT: ldr s7, [x12, x9]
; CHECK-NEXT: add x8, x8, x9
; CHECK-NEXT: fmadd s4, s17, s18, s4
; CHECK-NEXT: fmadd s1, s1, s18, s5
; CHECK-NEXT: ldr s5, [x13, x9]
; CHECK-NEXT: fmadd s0, s6, s18, s0
; CHECK-NEXT: fmadd s3, s7, s18, s3
; CHECK-NEXT: fmadd s2, s5, s18, s2
; CHECK-NEXT: str s4, [x28, #500]
; CHECK-NEXT: add x28, x19, x9
; CHECK-NEXT: str s1, [x8, #500]
; CHECK-NEXT: add x8, x17, x9
; CHECK-NEXT: str s0, [x28, #500]
; CHECK-NEXT: add x28, x18, x9
; CHECK-NEXT: add x9, x9, #4
; CHECK-NEXT: str s3, [x8, #500]
; CHECK-NEXT: mov x8, x21
; CHECK-NEXT: mov x21, x20
; CHECK-NEXT: cmn x9, #480
; CHECK-NEXT: str s2, [x28, #500]
; CHECK-NEXT: b.ne .LBB0_3
; CHECK-NEXT: // %bb.4: // in Loop: Header=BB0_2 Depth=2
; CHECK-NEXT: ldr x9, [sp, #232] // 8-byte Reload
; CHECK-NEXT: ldr x20, [sp, #7752] // 8-byte Reload
; CHECK-NEXT: add x18, x18, #20
; CHECK-NEXT: add x28, x17, #20
; CHECK-NEXT: add x19, x19, #20
; CHECK-NEXT: add x21, x21, #20
; CHECK-NEXT: add x9, x9, #1
; CHECK-NEXT: add x20, x20, #20
; CHECK-NEXT: add x22, x22, #20
; CHECK-NEXT: add x23, x23, #20
; CHECK-NEXT: add x24, x24, #20
; CHECK-NEXT: add x25, x25, #20
; CHECK-NEXT: add x26, x26, #20
; CHECK-NEXT: add x27, x27, #20
; CHECK-NEXT: add x0, x0, #20
; CHECK-NEXT: add x10, x10, #20
; CHECK-NEXT: add x11, x11, #20
; CHECK-NEXT: add x12, x12, #20
; CHECK-NEXT: add x13, x13, #20
; CHECK-NEXT: add x14, x14, #20
; CHECK-NEXT: add x15, x15, #20
; CHECK-NEXT: add x16, x16, #20
; CHECK-NEXT: add x7, x7, #20
; CHECK-NEXT: add x6, x6, #20
; CHECK-NEXT: add x5, x5, #20
; CHECK-NEXT: add x4, x4, #20
; CHECK-NEXT: add x3, x3, #20
; CHECK-NEXT: cmp x9, #6
; CHECK-NEXT: add x2, x2, #20
; CHECK-NEXT: add x1, x1, #20
; CHECK-NEXT: add x30, x30, #20
; CHECK-NEXT: str x9, [sp, #232] // 8-byte Spill
; CHECK-NEXT: b.ne .LBB0_2
; CHECK-NEXT: // %bb.5: // in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: ldp x18, x17, [sp, #88] // 16-byte Folded Reload
; CHECK-NEXT: ldp x28, x27, [sp, #8] // 16-byte Folded Reload
; CHECK-NEXT: ldp x20, x19, [sp, #72] // 16-byte Folded Reload
; CHECK-NEXT: add x28, x28, #1
; CHECK-NEXT: ldp x22, x21, [sp, #56] // 16-byte Folded Reload
; CHECK-NEXT: add x17, x17, #100
; CHECK-NEXT: add x18, x18, #100
; CHECK-NEXT: add x27, x27, #100
; CHECK-NEXT: ldp x24, x23, [sp, #40] // 16-byte Folded Reload
; CHECK-NEXT: add x19, x19, #100
; CHECK-NEXT: ldp x26, x25, [sp, #24] // 16-byte Folded Reload
; CHECK-NEXT: add x20, x20, #100
; CHECK-NEXT: ldp x10, x9, [sp, #152] // 16-byte Folded Reload
; CHECK-NEXT: add x21, x21, #100
; CHECK-NEXT: ldp x12, x11, [sp, #136] // 16-byte Folded Reload
; CHECK-NEXT: add x22, x22, #100
; CHECK-NEXT: ldp x14, x13, [sp, #120] // 16-byte Folded Reload
; CHECK-NEXT: add x23, x23, #100
; CHECK-NEXT: ldp x16, x15, [sp, #104] // 16-byte Folded Reload
; CHECK-NEXT: add x24, x24, #100
; CHECK-NEXT: ldp x6, x7, [sp, #216] // 16-byte Folded Reload
; CHECK-NEXT: add x25, x25, #100
; CHECK-NEXT: ldp x4, x5, [sp, #200] // 16-byte Folded Reload
; CHECK-NEXT: add x26, x26, #100
; CHECK-NEXT: ldp x2, x3, [sp, #184] // 16-byte Folded Reload
; CHECK-NEXT: add x9, x9, #100
; CHECK-NEXT: ldp x0, x1, [sp, #168] // 16-byte Folded Reload
; CHECK-NEXT: add x10, x10, #100
; CHECK-NEXT: add x11, x11, #100
; CHECK-NEXT: add x12, x12, #100
; CHECK-NEXT: add x13, x13, #100
; CHECK-NEXT: add x14, x14, #100
; CHECK-NEXT: add x15, x15, #100
; CHECK-NEXT: add x16, x16, #100
; CHECK-NEXT: add x7, x7, #100
; CHECK-NEXT: add x6, x6, #100
; CHECK-NEXT: add x5, x5, #100
; CHECK-NEXT: add x4, x4, #100
; CHECK-NEXT: add x3, x3, #100
; CHECK-NEXT: add x2, x2, #100
; CHECK-NEXT: add x1, x1, #100
; CHECK-NEXT: add x0, x0, #100
; CHECK-NEXT: b .LBB0_1
; Entry block. Fifteen local 5x5x5 float arrays (%17-%31). The loop nest reads
; %23, %24, %25, %27, %28 and %31 and writes %17-%21; %22, %26, %29 and %30 are
; allocated but never accessed.
%17 = alloca [5 x [5 x [5 x float]]], align 4
%18 = alloca [5 x [5 x [5 x float]]], align 4
%19 = alloca [5 x [5 x [5 x float]]], align 4
%20 = alloca [5 x [5 x [5 x float]]], align 4
%21 = alloca [5 x [5 x [5 x float]]], align 4
%22 = alloca [5 x [5 x [5 x float]]], align 4
%23 = alloca [5 x [5 x [5 x float]]], align 4
%24 = alloca [5 x [5 x [5 x float]]], align 4
%25 = alloca [5 x [5 x [5 x float]]], align 4
%26 = alloca [5 x [5 x [5 x float]]], align 4
%27 = alloca [5 x [5 x [5 x float]]], align 4
%28 = alloca [5 x [5 x [5 x float]]], align 4
%29 = alloca [5 x [5 x [5 x float]]], align 4
%30 = alloca [5 x [5 x [5 x float]]], align 4
%31 = alloca [5 x [5 x [5 x float]]], align 4
; %32 is the constant 0 sign-extended to i64, so %34 is 0 and %35 is -125.
; %.idx and %33 have no users in this function.
%32 = sext i32 0 to i64
%.idx = mul nsw i64 %32, 4500
%33 = getelementptr i8, ptr null, i64 -4500
%34 = mul nsw i64 %32, 125
%35 = add nsw i64 %34, -125
br label %.preheader167
; Outer loop header: %indvars.iv175 starts at 1; %37 = 25 * %indvars.iv175 - 31.
.preheader167: ; preds = %154, %16
%indvars.iv175 = phi i64 [ 1, %16 ], [ %indvars.iv.next176, %154 ]
%36 = mul nuw nsw i64 %indvars.iv175, 25
%37 = add nsw i64 %36, -31
br label %.preheader166
; Middle loop header: %indvars.iv172 runs from 1 and the loop exits once its
; successor equals 6 (five iterations); %39 = %37 + 5 * %indvars.iv172.
.preheader166: ; preds = %153, %.preheader167
%indvars.iv172 = phi i64 [ 1, %.preheader167 ], [ %indvars.iv.next173, %153 ]
%38 = mul nuw nsw i64 %indvars.iv172, 5
%39 = add nsw i64 %37, %38
br label %40
; Inner loop body: %indvars.iv counts up from 1 while %41 counts down from 5 to
; 0 (five iterations). %42 is the element index into the local arrays (0 when
; all three induction variables are 1); %43 = %42 - 125 is the element index
; applied to the pointer arguments.
40: ; preds = %40, %.preheader166
%indvars.iv = phi i64 [ 1, %.preheader166 ], [ %indvars.iv.next, %40 ]
%41 = phi i64 [ 5, %.preheader166 ], [ %152, %40 ]
%42 = add nsw i64 %39, %indvars.iv
%43 = add nsw i64 %35, %42
; One float load through each of the sixteen pointer arguments at index %43.
%44 = getelementptr float, ptr %0, i64 %43
%45 = load float, ptr %44, align 4
%46 = getelementptr float, ptr %1, i64 %43
%47 = load float, ptr %46, align 4
%48 = getelementptr float, ptr %2, i64 %43
%49 = load float, ptr %48, align 4
%50 = getelementptr float, ptr %3, i64 %43
%51 = load float, ptr %50, align 4
%52 = getelementptr float, ptr %4, i64 %43
%53 = load float, ptr %52, align 4
%54 = getelementptr float, ptr %5, i64 %43
%55 = load float, ptr %54, align 4
%56 = getelementptr float, ptr %6, i64 %43
%57 = load float, ptr %56, align 4
%58 = getelementptr float, ptr %7, i64 %43
%59 = load float, ptr %58, align 4
%60 = getelementptr float, ptr %8, i64 %43
%61 = load float, ptr %60, align 4
%62 = getelementptr float, ptr %9, i64 %43
%63 = load float, ptr %62, align 4
%64 = getelementptr float, ptr %10, i64 %43
%65 = load float, ptr %64, align 4
%66 = getelementptr float, ptr %11, i64 %43
%67 = load float, ptr %66, align 4
%68 = getelementptr float, ptr %12, i64 %43
%69 = load float, ptr %68, align 4
%70 = getelementptr float, ptr %13, i64 %43
%71 = load float, ptr %70, align 4
%72 = getelementptr float, ptr %14, i64 %43
%73 = load float, ptr %72, align 4
%74 = getelementptr float, ptr %15, i64 %43
%75 = load float, ptr %74, align 4
; Four loads from the null pointer; presumably left over from test-case
; reduction. They appear in the expected output as the single `ldr s0, [x8]`
; with x8 set to xzr.
%76 = load float, ptr null, align 4
%77 = load float, ptr null, align 4
%78 = load float, ptr null, align 4
%79 = load float, ptr null, align 4
; First result: sum of six products, stored to %21[%42]. The six local-array
; operands loaded here (%81, %84, %88, %92, %96, %100) are reused by every
; result below.
%80 = getelementptr float, ptr %31, i64 %42
%81 = load float, ptr %80, align 4
%82 = fmul contract float %45, %81
%83 = getelementptr float, ptr %28, i64 %42
%84 = load float, ptr %83, align 4
%85 = fmul contract float %55, %84
%86 = fadd contract float %82, %85
%87 = getelementptr float, ptr %27, i64 %42
%88 = load float, ptr %87, align 4
%89 = fmul contract float %47, %88
%90 = fadd contract float %86, %89
%91 = getelementptr float, ptr %25, i64 %42
%92 = load float, ptr %91, align 4
%93 = fmul contract float %53, %92
%94 = fadd contract float %90, %93
%95 = getelementptr float, ptr %24, i64 %42
%96 = load float, ptr %95, align 4
%97 = fmul contract float %51, %96
%98 = fadd contract float %94, %97
%99 = getelementptr float, ptr %23, i64 %42
%100 = load float, ptr %99, align 4
%101 = fmul contract float %49, %100
%102 = fadd contract float %98, %101
%103 = getelementptr float, ptr %21, i64 %42
store float %102, ptr %103, align 4
; Second result: stored to %18[%42].
%104 = fmul contract float %47, %81
%105 = fmul contract float %65, %84
%106 = fadd contract float %104, %105
%107 = fmul contract float %57, %88
%108 = fadd contract float %106, %107
%109 = fmul contract float %63, %92
%110 = fadd contract float %108, %109
%111 = fmul contract float %61, %96
%112 = fadd contract float %110, %111
%113 = fmul contract float %59, %100
%114 = fadd contract float %112, %113
%115 = getelementptr float, ptr %18, i64 %42
store float %114, ptr %115, align 4
; Third result: stored to %20[%42].
%116 = fmul contract float %55, %81
%117 = fmul contract float %79, %84
%118 = fadd contract float %116, %117
%119 = fmul contract float %65, %88
%120 = fadd contract float %118, %119
%121 = fmul contract float %78, %92
%122 = fadd contract float %120, %121
%123 = fmul contract float %76, %96
%124 = fadd contract float %122, %123
%125 = fmul contract float %71, %100
%126 = fadd contract float %124, %125
%127 = getelementptr float, ptr %20, i64 %42
store float %126, ptr %127, align 4
; Fourth result: stored to %19[%42].
%128 = fmul contract float %53, %81
%129 = fmul contract float %78, %84
%130 = fadd contract float %128, %129
%131 = fmul contract float %63, %88
%132 = fadd contract float %130, %131
%133 = fmul contract float %77, %92
%134 = fadd contract float %132, %133
%135 = fmul contract float %75, %96
%136 = fadd contract float %134, %135
%137 = fmul contract float %69, %100
%138 = fadd contract float %136, %137
%139 = getelementptr float, ptr %19, i64 %42
store float %138, ptr %139, align 4
; Fifth result: stored to %17[%42].
%140 = fmul contract float %51, %81
%141 = fmul contract float %76, %84
%142 = fadd contract float %140, %141
%143 = fmul contract float %61, %88
%144 = fadd contract float %142, %143
%145 = fmul contract float %75, %92
%146 = fadd contract float %144, %145
%147 = fmul contract float %73, %96
%148 = fadd contract float %146, %147
%149 = fmul contract float %67, %100
%150 = fadd contract float %148, %149
%151 = getelementptr float, ptr %17, i64 %42
store float %150, ptr %151, align 4
; Inner loop latch: leave the loop when the down-counter %152 reaches 0.
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%152 = add nsw i64 %41, -1
%.not145 = icmp eq i64 %152, 0
br i1 %.not145, label %153, label %40
; Middle loop latch: leave the loop when the incremented counter equals 6.
153: ; preds = %40
%indvars.iv.next173 = add nuw nsw i64 %indvars.iv172, 1
%exitcond.not = icmp eq i64 %indvars.iv.next173, 6
br i1 %exitcond.not, label %154, label %.preheader166
; Outer loop latch: %exitcond178.not is computed but has no users; the branch
; back to the header is unconditional, so the outer loop never exits.
154: ; preds = %153
%indvars.iv.next176 = add nuw nsw i64 %indvars.iv175, 1
%exitcond178.not = icmp eq i64 %indvars.iv.next176, 6
br label %.preheader167
}
; Attribute group #0, referenced by the definition of @_test.
attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) "frame-pointer"="non-leaf" "target-cpu"="generic" "target-features"="+outline-atomics,+v8a,+fp-armv8,+neon" }