; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu | FileCheck %s
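
; A <16 x i8> load whose only users are extractelements feeding an i8 add tree
; should be split into individual ldrb loads rather than kept as a vector load.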
define i8 @scalarize_v16i8(ptr %p) {
; CHECK-LABEL: scalarize_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldrb w8, [x0, #3]
; CHECK-NEXT: ldrb w9, [x0, #2]
; CHECK-NEXT: ldrb w10, [x0, #1]
; CHECK-NEXT: ldrb w11, [x0]
; CHECK-NEXT: ldrb w13, [x0, #5]
; CHECK-NEXT: ldrb w14, [x0, #4]
; CHECK-NEXT: add w8, w9, w8
; CHECK-NEXT: ldrb w12, [x0, #15]
; CHECK-NEXT: ldrb w15, [x0, #11]
; CHECK-NEXT: add w10, w11, w10
; CHECK-NEXT: add w9, w14, w13
; CHECK-NEXT: ldrb w11, [x0, #10]
; CHECK-NEXT: ldrb w13, [x0, #9]
; CHECK-NEXT: add w8, w10, w8
; CHECK-NEXT: ldrb w14, [x0, #8]
; CHECK-NEXT: ldrb w16, [x0, #7]
; CHECK-NEXT: add w11, w11, w15
; CHECK-NEXT: ldrb w17, [x0, #6]
; CHECK-NEXT: ldrb w18, [x0, #14]
; CHECK-NEXT: add w13, w14, w13
; CHECK-NEXT: ldrb w1, [x0, #13]
; CHECK-NEXT: ldrb w0, [x0, #12]
; CHECK-NEXT: add w16, w17, w16
; CHECK-NEXT: add w10, w13, w11
; CHECK-NEXT: add w12, w18, w12
; CHECK-NEXT: add w9, w9, w16
; CHECK-NEXT: add w14, w0, w1
; CHECK-NEXT: add w8, w8, w9
; CHECK-NEXT: add w11, w14, w12
; CHECK-NEXT: add w9, w10, w11
; CHECK-NEXT: add w0, w8, w9
; CHECK-NEXT: ret
%wide.load = load <16 x i8>, ptr %p, align 4
%l0 = extractelement <16 x i8> %wide.load, i32 0
%l1 = extractelement <16 x i8> %wide.load, i32 1
%l2 = extractelement <16 x i8> %wide.load, i32 2
%l3 = extractelement <16 x i8> %wide.load, i32 3
%l4 = extractelement <16 x i8> %wide.load, i32 4
%l5 = extractelement <16 x i8> %wide.load, i32 5
%l6 = extractelement <16 x i8> %wide.load, i32 6
%l7 = extractelement <16 x i8> %wide.load, i32 7
%l8 = extractelement <16 x i8> %wide.load, i32 8
%l9 = extractelement <16 x i8> %wide.load, i32 9
%l10 = extractelement <16 x i8> %wide.load, i32 10
%l11 = extractelement <16 x i8> %wide.load, i32 11
%l12 = extractelement <16 x i8> %wide.load, i32 12
%l13 = extractelement <16 x i8> %wide.load, i32 13
%l14 = extractelement <16 x i8> %wide.load, i32 14
%l15 = extractelement <16 x i8> %wide.load, i32 15
%a0 = add i8 %l0, %l1
%a1 = add i8 %l2, %l3
%a2 = add i8 %l4, %l5
%a3 = add i8 %l6, %l7
%a4 = add i8 %l8, %l9
%a5 = add i8 %l10, %l11
%a6 = add i8 %l12, %l13
%a7 = add i8 %l14, %l15
%b0 = add i8 %a0, %a1
%b1 = add i8 %a2, %a3
%b2 = add i8 %a4, %a5
%b3 = add i8 %a6, %a7
%c0 = add i8 %b0, %b1
%c1 = add i8 %b2, %b3
%r = add i8 %c0, %c1
ret i8 %r
}
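
; Same pattern with an <8 x i8> load: expect scalar ldrb loads and w-register adds.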
define i8 @scalarize_v8i8(ptr %p) {
; CHECK-LABEL: scalarize_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldrb w8, [x0, #7]
; CHECK-NEXT: ldrb w9, [x0, #6]
; CHECK-NEXT: ldrb w10, [x0, #5]
; CHECK-NEXT: ldrb w11, [x0, #1]
; CHECK-NEXT: ldrb w12, [x0]
; CHECK-NEXT: ldrb w13, [x0, #4]
; CHECK-NEXT: add w8, w9, w8
; CHECK-NEXT: ldrb w14, [x0, #3]
; CHECK-NEXT: ldrb w15, [x0, #2]
; CHECK-NEXT: add w11, w12, w11
; CHECK-NEXT: add w10, w13, w10
; CHECK-NEXT: add w12, w15, w14
; CHECK-NEXT: add w8, w10, w8
; CHECK-NEXT: add w9, w11, w12
; CHECK-NEXT: add w0, w9, w8
; CHECK-NEXT: ret
%wide.load = load <8 x i8>, ptr %p, align 4
%l0 = extractelement <8 x i8> %wide.load, i32 0
%l1 = extractelement <8 x i8> %wide.load, i32 1
%l2 = extractelement <8 x i8> %wide.load, i32 2
%l3 = extractelement <8 x i8> %wide.load, i32 3
%l4 = extractelement <8 x i8> %wide.load, i32 4
%l5 = extractelement <8 x i8> %wide.load, i32 5
%l6 = extractelement <8 x i8> %wide.load, i32 6
%l7 = extractelement <8 x i8> %wide.load, i32 7
%a0 = add i8 %l0, %l1
%a1 = add i8 %l2, %l3
%a2 = add i8 %l4, %l5
%a3 = add i8 %l6, %l7
%b0 = add i8 %a0, %a1
%b1 = add i8 %a2, %a3
%r = add i8 %b0, %b1
ret i8 %r
}
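
; <8 x i16> variant: the vector load should become individual ldrh loads.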
define i16 @scalarize_v8i16(ptr %p) {
; CHECK-LABEL: scalarize_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w8, [x0, #14]
; CHECK-NEXT: ldrh w9, [x0, #12]
; CHECK-NEXT: ldrh w10, [x0, #10]
; CHECK-NEXT: ldrh w11, [x0, #2]
; CHECK-NEXT: ldrh w12, [x0]
; CHECK-NEXT: ldrh w13, [x0, #8]
; CHECK-NEXT: add w8, w9, w8
; CHECK-NEXT: ldrh w14, [x0, #6]
; CHECK-NEXT: ldrh w15, [x0, #4]
; CHECK-NEXT: add w11, w12, w11
; CHECK-NEXT: add w10, w13, w10
; CHECK-NEXT: add w12, w15, w14
; CHECK-NEXT: add w8, w10, w8
; CHECK-NEXT: add w9, w11, w12
; CHECK-NEXT: add w0, w9, w8
; CHECK-NEXT: ret
%wide.load = load <8 x i16>, ptr %p, align 4
%l0 = extractelement <8 x i16> %wide.load, i32 0
%l1 = extractelement <8 x i16> %wide.load, i32 1
%l2 = extractelement <8 x i16> %wide.load, i32 2
%l3 = extractelement <8 x i16> %wide.load, i32 3
%l4 = extractelement <8 x i16> %wide.load, i32 4
%l5 = extractelement <8 x i16> %wide.load, i32 5
%l6 = extractelement <8 x i16> %wide.load, i32 6
%l7 = extractelement <8 x i16> %wide.load, i32 7
%a0 = add i16 %l0, %l1
%a1 = add i16 %l2, %l3
%a2 = add i16 %l4, %l5
%a3 = add i16 %l6, %l7
%b0 = add i16 %a0, %a1
%b1 = add i16 %a2, %a3
%r = add i16 %b0, %b1
ret i16 %r
}
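
; <4 x i16> variant: four ldrh loads feeding a small add tree.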
define i16 @scalarize_v4i16(ptr %p) {
; CHECK-LABEL: scalarize_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w8, [x0, #6]
; CHECK-NEXT: ldrh w9, [x0, #4]
; CHECK-NEXT: ldrh w10, [x0, #2]
; CHECK-NEXT: ldrh w11, [x0]
; CHECK-NEXT: add w8, w9, w8
; CHECK-NEXT: add w10, w11, w10
; CHECK-NEXT: add w0, w10, w8
; CHECK-NEXT: ret
%wide.load = load <4 x i16>, ptr %p, align 4
%l0 = extractelement <4 x i16> %wide.load, i32 0
%l1 = extractelement <4 x i16> %wide.load, i32 1
%l2 = extractelement <4 x i16> %wide.load, i32 2
%l3 = extractelement <4 x i16> %wide.load, i32 3
%a0 = add i16 %l0, %l1
%a1 = add i16 %l2, %l3
%r = add i16 %a0, %a1
ret i16 %r
}
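
; <4 x i32>: the scalarized loads pair up into ldp of w registers.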
define i32 @scalarize_v4i32(ptr %p) {
; CHECK-LABEL: scalarize_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp w9, w8, [x0]
; CHECK-NEXT: ldp w10, w11, [x0, #8]
; CHECK-NEXT: add w8, w9, w8
; CHECK-NEXT: add w9, w10, w11
; CHECK-NEXT: add w0, w8, w9
; CHECK-NEXT: ret
%wide.load = load <4 x i32>, ptr %p, align 4
%l0 = extractelement <4 x i32> %wide.load, i32 0
%l1 = extractelement <4 x i32> %wide.load, i32 1
%l2 = extractelement <4 x i32> %wide.load, i32 2
%l3 = extractelement <4 x i32> %wide.load, i32 3
%a0 = add i32 %l0, %l1
%a1 = add i32 %l2, %l3
%r = add i32 %a0, %a1
ret i32 %r
}
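
; <4 x i64>: the scalarized loads pair up into ldp of x registers.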
define i64 @scalarize_v4i64(ptr %p) {
; CHECK-LABEL: scalarize_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp x8, x9, [x0]
; CHECK-NEXT: ldp x10, x11, [x0, #16]
; CHECK-NEXT: add x8, x8, x9
; CHECK-NEXT: add x9, x10, x11
; CHECK-NEXT: add x0, x8, x9
; CHECK-NEXT: ret
%wide.load = load <4 x i64>, ptr %p, align 4
%l0 = extractelement <4 x i64> %wide.load, i32 0
%l1 = extractelement <4 x i64> %wide.load, i32 1
%l2 = extractelement <4 x i64> %wide.load, i32 2
%l3 = extractelement <4 x i64> %wide.load, i32 3
%a0 = add i64 %l0, %l1
%a1 = add i64 %l2, %l3
%r = add i64 %a0, %a1
ret i64 %r
}
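
; sext of the loaded <4 x i32>: the extension folds into ldpsw loads.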
define i64 @scalarize_v4i32_sext(ptr %p) {
; CHECK-LABEL: scalarize_v4i32_sext:
; CHECK: // %bb.0:
; CHECK-NEXT: ldpsw x9, x8, [x0, #8]
; CHECK-NEXT: ldpsw x11, x10, [x0]
; CHECK-NEXT: add x8, x9, x8
; CHECK-NEXT: add x10, x11, x10
; CHECK-NEXT: add x0, x10, x8
; CHECK-NEXT: ret
%wide.load = load <4 x i32>, ptr %p, align 4
%ext = sext <4 x i32> %wide.load to <4 x i64>
%l0 = extractelement <4 x i64> %ext, i32 0
%l1 = extractelement <4 x i64> %ext, i32 1
%l2 = extractelement <4 x i64> %ext, i32 2
%l3 = extractelement <4 x i64> %ext, i32 3
%a0 = add i64 %l0, %l1
%a1 = add i64 %l2, %l3
%r = add i64 %a0, %a1
ret i64 %r
}
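
; zext of the loaded <4 x i32>: 32-bit ldp loads already zero-extend into the x registers.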
define i64 @scalarize_v4i32_zext(ptr %p) {
; CHECK-LABEL: scalarize_v4i32_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp w9, w8, [x0, #8]
; CHECK-NEXT: ldp w11, w10, [x0]
; CHECK-NEXT: add x8, x9, x8
; CHECK-NEXT: add x10, x11, x10
; CHECK-NEXT: add x0, x10, x8
; CHECK-NEXT: ret
%wide.load = load <4 x i32>, ptr %p, align 4
%ext = zext <4 x i32> %wide.load to <4 x i64>
%l0 = extractelement <4 x i64> %ext, i32 0
%l1 = extractelement <4 x i64> %ext, i32 1
%l2 = extractelement <4 x i64> %ext, i32 2
%l3 = extractelement <4 x i64> %ext, i32 3
%a0 = add i64 %l0, %l1
%a1 = add i64 %l2, %l3
%r = add i64 %a0, %a1
ret i64 %r
}
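
; <4 x half>: the vector load is kept; each lane is extracted and converted to
; single precision for the fadds.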
define half @scalarize_v4f16(ptr %p) {
; CHECK-LABEL: scalarize_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: mov h1, v0.h[1]
; CHECK-NEXT: mov h2, v0.h[2]
; CHECK-NEXT: mov h3, v0.h[3]
; CHECK-NEXT: fcvt s0, h0
; CHECK-NEXT: fcvt s1, h1
; CHECK-NEXT: fcvt s3, h3
; CHECK-NEXT: fcvt s2, h2
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: fadd s1, s2, s3
; CHECK-NEXT: fcvt h0, s0
; CHECK-NEXT: fcvt h1, s1
; CHECK-NEXT: fcvt s1, h1
; CHECK-NEXT: fcvt s0, h0
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: fcvt h0, s0
; CHECK-NEXT: ret
%wide.load = load <4 x half>, ptr %p, align 4
%l0 = extractelement <4 x half> %wide.load, i32 0
%l1 = extractelement <4 x half> %wide.load, i32 1
%l2 = extractelement <4 x half> %wide.load, i32 2
%l3 = extractelement <4 x half> %wide.load, i32 3
%a0 = fadd half %l0, %l1
%a1 = fadd half %l2, %l3
%r = fadd half %a0, %a1
ret half %r
}
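
; <4 x float>: the vector load is kept; the low two lanes are reduced with faddp
; and the high lanes are extracted and added separately.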
define float @scalarize_v4f32(ptr %p) {
; CHECK-LABEL: scalarize_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: mov s1, v0.s[2]
; CHECK-NEXT: mov s2, v0.s[3]
; CHECK-NEXT: faddp s0, v0.2s
; CHECK-NEXT: fadd s1, s1, s2
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ret
%wide.load = load <4 x float>, ptr %p, align 4
%l0 = extractelement <4 x float> %wide.load, i32 0
%l1 = extractelement <4 x float> %wide.load, i32 1
%l2 = extractelement <4 x float> %wide.load, i32 2
%l3 = extractelement <4 x float> %wide.load, i32 3
%a0 = fadd float %l0, %l1
%a1 = fadd float %l2, %l3
%r = fadd float %a0, %a1
ret float %r
}
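
; <4 x double>: loaded as two q registers and reduced with faddp.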
define double @scalarize_v4f64(ptr %p) {
; CHECK-LABEL: scalarize_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp q1, q0, [x0]
; CHECK-NEXT: faddp d1, v1.2d
; CHECK-NEXT: faddp d0, v0.2d
; CHECK-NEXT: fadd d0, d1, d0
; CHECK-NEXT: ret
%wide.load = load <4 x double>, ptr %p, align 4
%l0 = extractelement <4 x double> %wide.load, i32 0
%l1 = extractelement <4 x double> %wide.load, i32 1
%l2 = extractelement <4 x double> %wide.load, i32 2
%l3 = extractelement <4 x double> %wide.load, i32 3
%a0 = fadd double %l0, %l1
%a1 = fadd double %l2, %l3
%r = fadd double %a0, %a1
ret double %r
}
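
; The <16 x i64> load only feeds GEP indices for scalar float loads, so it should
; be scalarized into ldp loads whose results are used directly as ldr offsets.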
define float @scalarize_into_load(i64 %22, ptr %23, ptr %rawA, ptr %rawB) {
; CHECK-LABEL: scalarize_into_load:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldp x8, x9, [x1]
; CHECK-NEXT: ldp x10, x11, [x1, #16]
; CHECK-NEXT: ldp x12, x13, [x1, #64]
; CHECK-NEXT: ldr s0, [x2, x8, lsl #2]
; CHECK-NEXT: ldr s1, [x2, x9, lsl #2]
; CHECK-NEXT: ldp x8, x9, [x1, #32]
; CHECK-NEXT: ldr s2, [x2, x10, lsl #2]
; CHECK-NEXT: ldr s3, [x2, x11, lsl #2]
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ldr s6, [x2, x12, lsl #2]
; CHECK-NEXT: ldp x10, x11, [x1, #48]
; CHECK-NEXT: ldr s7, [x2, x13, lsl #2]
; CHECK-NEXT: fadd s1, s2, s3
; CHECK-NEXT: ldr s2, [x2, x8, lsl #2]
; CHECK-NEXT: ldr s3, [x2, x9, lsl #2]
; CHECK-NEXT: ldp x14, x15, [x1, #80]
; CHECK-NEXT: fadd s2, s2, s3
; CHECK-NEXT: ldr s4, [x2, x10, lsl #2]
; CHECK-NEXT: ldr s5, [x2, x11, lsl #2]
; CHECK-NEXT: ldp x16, x17, [x1, #96]
; CHECK-NEXT: fadd s3, s4, s5
; CHECK-NEXT: fadd s4, s6, s7
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ldp x18, x0, [x1, #112]
; CHECK-NEXT: ldr s16, [x2, x14, lsl #2]
; CHECK-NEXT: ldr s17, [x2, x15, lsl #2]
; CHECK-NEXT: ldr s18, [x2, x16, lsl #2]
; CHECK-NEXT: ldr s19, [x2, x17, lsl #2]
; CHECK-NEXT: ldr s20, [x2, x18, lsl #2]
; CHECK-NEXT: ldr s21, [x2, x0, lsl #2]
; CHECK-NEXT: fadd s5, s16, s17
; CHECK-NEXT: fadd s6, s18, s19
; CHECK-NEXT: fadd s1, s2, s3
; CHECK-NEXT: fadd s7, s20, s21
; CHECK-NEXT: fadd s2, s4, s5
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: fadd s3, s6, s7
; CHECK-NEXT: fadd s1, s2, s3
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ret
entry:
%wide.load = load <16 x i64>, ptr %23, align 4
%25 = extractelement <16 x i64> %wide.load, i32 0
%26 = getelementptr inbounds float, ptr %rawA, i64 %25
%27 = extractelement <16 x i64> %wide.load, i32 1
%28 = getelementptr inbounds float, ptr %rawA, i64 %27
%29 = extractelement <16 x i64> %wide.load, i32 2
%30 = getelementptr inbounds float, ptr %rawA, i64 %29
%31 = extractelement <16 x i64> %wide.load, i32 3
%32 = getelementptr inbounds float, ptr %rawA, i64 %31
%33 = extractelement <16 x i64> %wide.load, i32 4
%34 = getelementptr inbounds float, ptr %rawA, i64 %33
%35 = extractelement <16 x i64> %wide.load, i32 5
%36 = getelementptr inbounds float, ptr %rawA, i64 %35
%37 = extractelement <16 x i64> %wide.load, i32 6
%38 = getelementptr inbounds float, ptr %rawA, i64 %37
%39 = extractelement <16 x i64> %wide.load, i32 7
%40 = getelementptr inbounds float, ptr %rawA, i64 %39
%41 = extractelement <16 x i64> %wide.load, i32 8
%42 = getelementptr inbounds float, ptr %rawA, i64 %41
%43 = extractelement <16 x i64> %wide.load, i32 9
%44 = getelementptr inbounds float, ptr %rawA, i64 %43
%45 = extractelement <16 x i64> %wide.load, i32 10
%46 = getelementptr inbounds float, ptr %rawA, i64 %45
%47 = extractelement <16 x i64> %wide.load, i32 11
%48 = getelementptr inbounds float, ptr %rawA, i64 %47
%49 = extractelement <16 x i64> %wide.load, i32 12
%50 = getelementptr inbounds float, ptr %rawA, i64 %49
%51 = extractelement <16 x i64> %wide.load, i32 13
%52 = getelementptr inbounds float, ptr %rawA, i64 %51
%53 = extractelement <16 x i64> %wide.load, i32 14
%54 = getelementptr inbounds float, ptr %rawA, i64 %53
%55 = extractelement <16 x i64> %wide.load, i32 15
%56 = getelementptr inbounds float, ptr %rawA, i64 %55
%59 = load float, ptr %26, align 4
%60 = load float, ptr %28, align 4
%61 = load float, ptr %30, align 4
%62 = load float, ptr %32, align 4
%63 = load float, ptr %34, align 4
%64 = load float, ptr %36, align 4
%65 = load float, ptr %38, align 4
%66 = load float, ptr %40, align 4
%67 = load float, ptr %42, align 4
%68 = load float, ptr %44, align 4
%69 = load float, ptr %46, align 4
%70 = load float, ptr %48, align 4
%71 = load float, ptr %50, align 4
%72 = load float, ptr %52, align 4
%73 = load float, ptr %54, align 4
%74 = load float, ptr %56, align 4
%a1 = fadd float %59, %60
%a2 = fadd float %61, %62
%a3 = fadd float %63, %64
%a4 = fadd float %65, %66
%a5 = fadd float %67, %68
%a6 = fadd float %69, %70
%a7 = fadd float %71, %72
%a8 = fadd float %73, %74
%a9 = fadd float %a1, %a2
%a10 = fadd float %a3, %a4
%a11 = fadd float %a5, %a6
%a12 = fadd float %a7, %a8
%a13 = fadd float %a9, %a10
%a14 = fadd float %a11, %a12
%a15 = fadd float %a13, %a14
ret float %a15
}
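
; As above, but the indices are loaded as <16 x i32> and sign-extended: expect ldpsw loads.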
define float @scalarize_into_load_sext(i64 %22, ptr %23, ptr %rawA, ptr %rawB) {
; CHECK-LABEL: scalarize_into_load_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldpsw x9, x8, [x1]
; CHECK-NEXT: ldpsw x11, x10, [x1, #8]
; CHECK-NEXT: ldpsw x13, x12, [x1, #24]
; CHECK-NEXT: ldr s0, [x2, x9, lsl #2]
; CHECK-NEXT: ldr s1, [x2, x8, lsl #2]
; CHECK-NEXT: ldpsw x9, x8, [x1, #56]
; CHECK-NEXT: ldr s2, [x2, x11, lsl #2]
; CHECK-NEXT: ldr s3, [x2, x10, lsl #2]
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ldpsw x11, x10, [x1, #48]
; CHECK-NEXT: ldpsw x15, x14, [x1, #16]
; CHECK-NEXT: ldpsw x17, x16, [x1, #40]
; CHECK-NEXT: ldpsw x0, x18, [x1, #32]
; CHECK-NEXT: fadd s1, s2, s3
; CHECK-NEXT: ldr s2, [x2, x15, lsl #2]
; CHECK-NEXT: ldr s3, [x2, x14, lsl #2]
; CHECK-NEXT: ldr s4, [x2, x13, lsl #2]
; CHECK-NEXT: ldr s5, [x2, x12, lsl #2]
; CHECK-NEXT: ldr s16, [x2, x17, lsl #2]
; CHECK-NEXT: ldr s6, [x2, x0, lsl #2]
; CHECK-NEXT: fadd s2, s2, s3
; CHECK-NEXT: ldr s7, [x2, x18, lsl #2]
; CHECK-NEXT: ldr s17, [x2, x16, lsl #2]
; CHECK-NEXT: fadd s3, s4, s5
; CHECK-NEXT: ldr s18, [x2, x11, lsl #2]
; CHECK-NEXT: ldr s19, [x2, x10, lsl #2]
; CHECK-NEXT: fadd s4, s6, s7
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ldr s20, [x2, x9, lsl #2]
; CHECK-NEXT: ldr s21, [x2, x8, lsl #2]
; CHECK-NEXT: fadd s5, s16, s17
; CHECK-NEXT: fadd s6, s18, s19
; CHECK-NEXT: fadd s7, s20, s21
; CHECK-NEXT: fadd s1, s2, s3
; CHECK-NEXT: fadd s2, s4, s5
; CHECK-NEXT: fadd s3, s6, s7
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: fadd s1, s2, s3
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ret
entry:
%wide.load = load <16 x i32>, ptr %23, align 4
%24 = sext <16 x i32> %wide.load to <16 x i64>
%25 = extractelement <16 x i64> %24, i32 0
%26 = getelementptr inbounds float, ptr %rawA, i64 %25
%27 = extractelement <16 x i64> %24, i32 1
%28 = getelementptr inbounds float, ptr %rawA, i64 %27
%29 = extractelement <16 x i64> %24, i32 2
%30 = getelementptr inbounds float, ptr %rawA, i64 %29
%31 = extractelement <16 x i64> %24, i32 3
%32 = getelementptr inbounds float, ptr %rawA, i64 %31
%33 = extractelement <16 x i64> %24, i32 4
%34 = getelementptr inbounds float, ptr %rawA, i64 %33
%35 = extractelement <16 x i64> %24, i32 5
%36 = getelementptr inbounds float, ptr %rawA, i64 %35
%37 = extractelement <16 x i64> %24, i32 6
%38 = getelementptr inbounds float, ptr %rawA, i64 %37
%39 = extractelement <16 x i64> %24, i32 7
%40 = getelementptr inbounds float, ptr %rawA, i64 %39
%41 = extractelement <16 x i64> %24, i32 8
%42 = getelementptr inbounds float, ptr %rawA, i64 %41
%43 = extractelement <16 x i64> %24, i32 9
%44 = getelementptr inbounds float, ptr %rawA, i64 %43
%45 = extractelement <16 x i64> %24, i32 10
%46 = getelementptr inbounds float, ptr %rawA, i64 %45
%47 = extractelement <16 x i64> %24, i32 11
%48 = getelementptr inbounds float, ptr %rawA, i64 %47
%49 = extractelement <16 x i64> %24, i32 12
%50 = getelementptr inbounds float, ptr %rawA, i64 %49
%51 = extractelement <16 x i64> %24, i32 13
%52 = getelementptr inbounds float, ptr %rawA, i64 %51
%53 = extractelement <16 x i64> %24, i32 14
%54 = getelementptr inbounds float, ptr %rawA, i64 %53
%55 = extractelement <16 x i64> %24, i32 15
%56 = getelementptr inbounds float, ptr %rawA, i64 %55
%59 = load float, ptr %26, align 4
%60 = load float, ptr %28, align 4
%61 = load float, ptr %30, align 4
%62 = load float, ptr %32, align 4
%63 = load float, ptr %34, align 4
%64 = load float, ptr %36, align 4
%65 = load float, ptr %38, align 4
%66 = load float, ptr %40, align 4
%67 = load float, ptr %42, align 4
%68 = load float, ptr %44, align 4
%69 = load float, ptr %46, align 4
%70 = load float, ptr %48, align 4
%71 = load float, ptr %50, align 4
%72 = load float, ptr %52, align 4
%73 = load float, ptr %54, align 4
%74 = load float, ptr %56, align 4
%a1 = fadd float %59, %60
%a2 = fadd float %61, %62
%a3 = fadd float %63, %64
%a4 = fadd float %65, %66
%a5 = fadd float %67, %68
%a6 = fadd float %69, %70
%a7 = fadd float %71, %72
%a8 = fadd float %73, %74
%a9 = fadd float %a1, %a2
%a10 = fadd float %a3, %a4
%a11 = fadd float %a5, %a6
%a12 = fadd float %a7, %a8
%a13 = fadd float %a9, %a10
%a14 = fadd float %a11, %a12
%a15 = fadd float %a13, %a14
ret float %a15
}
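
; As above with zero-extended indices: 32-bit ldp loads provide the zero extension.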
define float @scalarize_into_load_zext(i64 %22, ptr %23, ptr %rawA, ptr %rawB) {
; CHECK-LABEL: scalarize_into_load_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldp w9, w8, [x1]
; CHECK-NEXT: ldp w11, w10, [x1, #8]
; CHECK-NEXT: ldp w13, w12, [x1, #24]
; CHECK-NEXT: ldr s0, [x2, x9, lsl #2]
; CHECK-NEXT: ldr s1, [x2, x8, lsl #2]
; CHECK-NEXT: ldp w9, w8, [x1, #56]
; CHECK-NEXT: ldr s2, [x2, x11, lsl #2]
; CHECK-NEXT: ldr s3, [x2, x10, lsl #2]
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ldp w11, w10, [x1, #48]
; CHECK-NEXT: ldp w15, w14, [x1, #16]
; CHECK-NEXT: ldp w17, w16, [x1, #40]
; CHECK-NEXT: ldp w0, w18, [x1, #32]
; CHECK-NEXT: fadd s1, s2, s3
; CHECK-NEXT: ldr s2, [x2, x15, lsl #2]
; CHECK-NEXT: ldr s3, [x2, x14, lsl #2]
; CHECK-NEXT: ldr s4, [x2, x13, lsl #2]
; CHECK-NEXT: ldr s5, [x2, x12, lsl #2]
; CHECK-NEXT: ldr s16, [x2, x17, lsl #2]
; CHECK-NEXT: ldr s6, [x2, x0, lsl #2]
; CHECK-NEXT: fadd s2, s2, s3
; CHECK-NEXT: ldr s7, [x2, x18, lsl #2]
; CHECK-NEXT: ldr s17, [x2, x16, lsl #2]
; CHECK-NEXT: fadd s3, s4, s5
; CHECK-NEXT: ldr s18, [x2, x11, lsl #2]
; CHECK-NEXT: ldr s19, [x2, x10, lsl #2]
; CHECK-NEXT: fadd s4, s6, s7
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ldr s20, [x2, x9, lsl #2]
; CHECK-NEXT: ldr s21, [x2, x8, lsl #2]
; CHECK-NEXT: fadd s5, s16, s17
; CHECK-NEXT: fadd s6, s18, s19
; CHECK-NEXT: fadd s7, s20, s21
; CHECK-NEXT: fadd s1, s2, s3
; CHECK-NEXT: fadd s2, s4, s5
; CHECK-NEXT: fadd s3, s6, s7
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: fadd s1, s2, s3
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ret
entry:
%wide.load = load <16 x i32>, ptr %23, align 4
%24 = zext <16 x i32> %wide.load to <16 x i64>
%25 = extractelement <16 x i64> %24, i32 0
%26 = getelementptr inbounds float, ptr %rawA, i64 %25
%27 = extractelement <16 x i64> %24, i32 1
%28 = getelementptr inbounds float, ptr %rawA, i64 %27
%29 = extractelement <16 x i64> %24, i32 2
%30 = getelementptr inbounds float, ptr %rawA, i64 %29
%31 = extractelement <16 x i64> %24, i32 3
%32 = getelementptr inbounds float, ptr %rawA, i64 %31
%33 = extractelement <16 x i64> %24, i32 4
%34 = getelementptr inbounds float, ptr %rawA, i64 %33
%35 = extractelement <16 x i64> %24, i32 5
%36 = getelementptr inbounds float, ptr %rawA, i64 %35
%37 = extractelement <16 x i64> %24, i32 6
%38 = getelementptr inbounds float, ptr %rawA, i64 %37
%39 = extractelement <16 x i64> %24, i32 7
%40 = getelementptr inbounds float, ptr %rawA, i64 %39
%41 = extractelement <16 x i64> %24, i32 8
%42 = getelementptr inbounds float, ptr %rawA, i64 %41
%43 = extractelement <16 x i64> %24, i32 9
%44 = getelementptr inbounds float, ptr %rawA, i64 %43
%45 = extractelement <16 x i64> %24, i32 10
%46 = getelementptr inbounds float, ptr %rawA, i64 %45
%47 = extractelement <16 x i64> %24, i32 11
%48 = getelementptr inbounds float, ptr %rawA, i64 %47
%49 = extractelement <16 x i64> %24, i32 12
%50 = getelementptr inbounds float, ptr %rawA, i64 %49
%51 = extractelement <16 x i64> %24, i32 13
%52 = getelementptr inbounds float, ptr %rawA, i64 %51
%53 = extractelement <16 x i64> %24, i32 14
%54 = getelementptr inbounds float, ptr %rawA, i64 %53
%55 = extractelement <16 x i64> %24, i32 15
%56 = getelementptr inbounds float, ptr %rawA, i64 %55
%59 = load float, ptr %26, align 4
%60 = load float, ptr %28, align 4
%61 = load float, ptr %30, align 4
%62 = load float, ptr %32, align 4
%63 = load float, ptr %34, align 4
%64 = load float, ptr %36, align 4
%65 = load float, ptr %38, align 4
%66 = load float, ptr %40, align 4
%67 = load float, ptr %42, align 4
%68 = load float, ptr %44, align 4
%69 = load float, ptr %46, align 4
%70 = load float, ptr %48, align 4
%71 = load float, ptr %50, align 4
%72 = load float, ptr %52, align 4
%73 = load float, ptr %54, align 4
%74 = load float, ptr %56, align 4
%a1 = fadd float %59, %60
%a2 = fadd float %61, %62
%a3 = fadd float %63, %64
%a4 = fadd float %65, %66
%a5 = fadd float %67, %68
%a6 = fadd float %69, %70
%a7 = fadd float %71, %72
%a8 = fadd float %73, %74
%a9 = fadd float %a1, %a2
%a10 = fadd float %a3, %a4
%a11 = fadd float %a5, %a6
%a12 = fadd float %a7, %a8
%a13 = fadd float %a9, %a10
%a14 = fadd float %a11, %a12
%a15 = fadd float %a13, %a14
ret float %a15
}