| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu | FileCheck %s |
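;
; Tests for codegen when a wide vector load is used only by extractelements
; feeding an add/fadd reduction tree. For the integer cases the vector load is
; broken up into scalar per-lane loads; the *_into_load tests additionally use
; the extracted values as indices for scalar float loads (a gather-like
; pattern).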
| |
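; The v16i8 load is scalarized into 16 ldrb loads feeding the i8 add tree.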
| define i8 @scalarize_v16i8(ptr %p) { |
| ; CHECK-LABEL: scalarize_v16i8: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldrb w8, [x0, #3] |
| ; CHECK-NEXT: ldrb w9, [x0, #2] |
| ; CHECK-NEXT: ldrb w10, [x0, #1] |
| ; CHECK-NEXT: ldrb w11, [x0] |
| ; CHECK-NEXT: ldrb w13, [x0, #5] |
| ; CHECK-NEXT: ldrb w14, [x0, #4] |
| ; CHECK-NEXT: add w8, w9, w8 |
| ; CHECK-NEXT: ldrb w12, [x0, #15] |
| ; CHECK-NEXT: ldrb w15, [x0, #11] |
| ; CHECK-NEXT: add w10, w11, w10 |
| ; CHECK-NEXT: add w9, w14, w13 |
| ; CHECK-NEXT: ldrb w11, [x0, #10] |
| ; CHECK-NEXT: ldrb w13, [x0, #9] |
| ; CHECK-NEXT: add w8, w10, w8 |
| ; CHECK-NEXT: ldrb w14, [x0, #8] |
| ; CHECK-NEXT: ldrb w16, [x0, #7] |
| ; CHECK-NEXT: add w11, w11, w15 |
| ; CHECK-NEXT: ldrb w17, [x0, #6] |
| ; CHECK-NEXT: ldrb w18, [x0, #14] |
| ; CHECK-NEXT: add w13, w14, w13 |
| ; CHECK-NEXT: ldrb w1, [x0, #13] |
| ; CHECK-NEXT: ldrb w0, [x0, #12] |
| ; CHECK-NEXT: add w16, w17, w16 |
| ; CHECK-NEXT: add w10, w13, w11 |
| ; CHECK-NEXT: add w12, w18, w12 |
| ; CHECK-NEXT: add w9, w9, w16 |
| ; CHECK-NEXT: add w14, w0, w1 |
| ; CHECK-NEXT: add w8, w8, w9 |
| ; CHECK-NEXT: add w11, w14, w12 |
| ; CHECK-NEXT: add w9, w10, w11 |
| ; CHECK-NEXT: add w0, w8, w9 |
| ; CHECK-NEXT: ret |
| %wide.load = load <16 x i8>, ptr %p, align 4 |
| %l0 = extractelement <16 x i8> %wide.load, i32 0 |
| %l1 = extractelement <16 x i8> %wide.load, i32 1 |
| %l2 = extractelement <16 x i8> %wide.load, i32 2 |
| %l3 = extractelement <16 x i8> %wide.load, i32 3 |
| %l4 = extractelement <16 x i8> %wide.load, i32 4 |
| %l5 = extractelement <16 x i8> %wide.load, i32 5 |
| %l6 = extractelement <16 x i8> %wide.load, i32 6 |
| %l7 = extractelement <16 x i8> %wide.load, i32 7 |
| %l8 = extractelement <16 x i8> %wide.load, i32 8 |
| %l9 = extractelement <16 x i8> %wide.load, i32 9 |
| %l10 = extractelement <16 x i8> %wide.load, i32 10 |
| %l11 = extractelement <16 x i8> %wide.load, i32 11 |
| %l12 = extractelement <16 x i8> %wide.load, i32 12 |
| %l13 = extractelement <16 x i8> %wide.load, i32 13 |
| %l14 = extractelement <16 x i8> %wide.load, i32 14 |
| %l15 = extractelement <16 x i8> %wide.load, i32 15 |
| %a0 = add i8 %l0, %l1 |
| %a1 = add i8 %l2, %l3 |
| %a2 = add i8 %l4, %l5 |
| %a3 = add i8 %l6, %l7 |
| %a4 = add i8 %l8, %l9 |
| %a5 = add i8 %l10, %l11 |
| %a6 = add i8 %l12, %l13 |
| %a7 = add i8 %l14, %l15 |
| %b0 = add i8 %a0, %a1 |
| %b1 = add i8 %a2, %a3 |
| %b2 = add i8 %a4, %a5 |
| %b3 = add i8 %a6, %a7 |
| %c0 = add i8 %b0, %b1 |
| %c1 = add i8 %b2, %b3 |
| %r = add i8 %c0, %c1 |
| ret i8 %r |
| } |
| |
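; The v8i8 load is scalarized into 8 ldrb loads.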
| define i8 @scalarize_v8i8(ptr %p) { |
| ; CHECK-LABEL: scalarize_v8i8: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldrb w8, [x0, #7] |
| ; CHECK-NEXT: ldrb w9, [x0, #6] |
| ; CHECK-NEXT: ldrb w10, [x0, #5] |
| ; CHECK-NEXT: ldrb w11, [x0, #1] |
| ; CHECK-NEXT: ldrb w12, [x0] |
| ; CHECK-NEXT: ldrb w13, [x0, #4] |
| ; CHECK-NEXT: add w8, w9, w8 |
| ; CHECK-NEXT: ldrb w14, [x0, #3] |
| ; CHECK-NEXT: ldrb w15, [x0, #2] |
| ; CHECK-NEXT: add w11, w12, w11 |
| ; CHECK-NEXT: add w10, w13, w10 |
| ; CHECK-NEXT: add w12, w15, w14 |
| ; CHECK-NEXT: add w8, w10, w8 |
| ; CHECK-NEXT: add w9, w11, w12 |
| ; CHECK-NEXT: add w0, w9, w8 |
| ; CHECK-NEXT: ret |
| %wide.load = load <8 x i8>, ptr %p, align 4 |
| %l0 = extractelement <8 x i8> %wide.load, i32 0 |
| %l1 = extractelement <8 x i8> %wide.load, i32 1 |
| %l2 = extractelement <8 x i8> %wide.load, i32 2 |
| %l3 = extractelement <8 x i8> %wide.load, i32 3 |
| %l4 = extractelement <8 x i8> %wide.load, i32 4 |
| %l5 = extractelement <8 x i8> %wide.load, i32 5 |
| %l6 = extractelement <8 x i8> %wide.load, i32 6 |
| %l7 = extractelement <8 x i8> %wide.load, i32 7 |
| %a0 = add i8 %l0, %l1 |
| %a1 = add i8 %l2, %l3 |
| %a2 = add i8 %l4, %l5 |
| %a3 = add i8 %l6, %l7 |
| %b0 = add i8 %a0, %a1 |
| %b1 = add i8 %a2, %a3 |
| %r = add i8 %b0, %b1 |
| ret i8 %r |
| } |
| |
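; The v8i16 load is scalarized into 8 ldrh loads.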
| define i16 @scalarize_v8i16(ptr %p) { |
| ; CHECK-LABEL: scalarize_v8i16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldrh w8, [x0, #14] |
| ; CHECK-NEXT: ldrh w9, [x0, #12] |
| ; CHECK-NEXT: ldrh w10, [x0, #10] |
| ; CHECK-NEXT: ldrh w11, [x0, #2] |
| ; CHECK-NEXT: ldrh w12, [x0] |
| ; CHECK-NEXT: ldrh w13, [x0, #8] |
| ; CHECK-NEXT: add w8, w9, w8 |
| ; CHECK-NEXT: ldrh w14, [x0, #6] |
| ; CHECK-NEXT: ldrh w15, [x0, #4] |
| ; CHECK-NEXT: add w11, w12, w11 |
| ; CHECK-NEXT: add w10, w13, w10 |
| ; CHECK-NEXT: add w12, w15, w14 |
| ; CHECK-NEXT: add w8, w10, w8 |
| ; CHECK-NEXT: add w9, w11, w12 |
| ; CHECK-NEXT: add w0, w9, w8 |
| ; CHECK-NEXT: ret |
| %wide.load = load <8 x i16>, ptr %p, align 4 |
| %l0 = extractelement <8 x i16> %wide.load, i32 0 |
| %l1 = extractelement <8 x i16> %wide.load, i32 1 |
| %l2 = extractelement <8 x i16> %wide.load, i32 2 |
| %l3 = extractelement <8 x i16> %wide.load, i32 3 |
| %l4 = extractelement <8 x i16> %wide.load, i32 4 |
| %l5 = extractelement <8 x i16> %wide.load, i32 5 |
| %l6 = extractelement <8 x i16> %wide.load, i32 6 |
| %l7 = extractelement <8 x i16> %wide.load, i32 7 |
| %a0 = add i16 %l0, %l1 |
| %a1 = add i16 %l2, %l3 |
| %a2 = add i16 %l4, %l5 |
| %a3 = add i16 %l6, %l7 |
| %b0 = add i16 %a0, %a1 |
| %b1 = add i16 %a2, %a3 |
| %r = add i16 %b0, %b1 |
| ret i16 %r |
| } |
| |
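; The v4i16 load is scalarized into 4 ldrh loads.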
| define i16 @scalarize_v4i16(ptr %p) { |
| ; CHECK-LABEL: scalarize_v4i16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldrh w8, [x0, #6] |
| ; CHECK-NEXT: ldrh w9, [x0, #4] |
| ; CHECK-NEXT: ldrh w10, [x0, #2] |
| ; CHECK-NEXT: ldrh w11, [x0] |
| ; CHECK-NEXT: add w8, w9, w8 |
| ; CHECK-NEXT: add w10, w11, w10 |
| ; CHECK-NEXT: add w0, w10, w8 |
| ; CHECK-NEXT: ret |
| %wide.load = load <4 x i16>, ptr %p, align 4 |
| %l0 = extractelement <4 x i16> %wide.load, i32 0 |
| %l1 = extractelement <4 x i16> %wide.load, i32 1 |
| %l2 = extractelement <4 x i16> %wide.load, i32 2 |
| %l3 = extractelement <4 x i16> %wide.load, i32 3 |
| %a0 = add i16 %l0, %l1 |
| %a1 = add i16 %l2, %l3 |
| %r = add i16 %a0, %a1 |
| ret i16 %r |
| } |
| |
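; The v4i32 load is scalarized into two ldp pairs of w registers.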
| define i32 @scalarize_v4i32(ptr %p) { |
| ; CHECK-LABEL: scalarize_v4i32: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldp w9, w8, [x0] |
| ; CHECK-NEXT: ldp w10, w11, [x0, #8] |
| ; CHECK-NEXT: add w8, w9, w8 |
| ; CHECK-NEXT: add w9, w10, w11 |
| ; CHECK-NEXT: add w0, w8, w9 |
| ; CHECK-NEXT: ret |
| %wide.load = load <4 x i32>, ptr %p, align 4 |
| %l0 = extractelement <4 x i32> %wide.load, i32 0 |
| %l1 = extractelement <4 x i32> %wide.load, i32 1 |
| %l2 = extractelement <4 x i32> %wide.load, i32 2 |
| %l3 = extractelement <4 x i32> %wide.load, i32 3 |
| %a0 = add i32 %l0, %l1 |
| %a1 = add i32 %l2, %l3 |
| %r = add i32 %a0, %a1 |
| ret i32 %r |
| } |
| |
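; The v4i64 load is scalarized into two ldp pairs of x registers.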
| define i64 @scalarize_v4i64(ptr %p) { |
| ; CHECK-LABEL: scalarize_v4i64: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldp x8, x9, [x0] |
| ; CHECK-NEXT: ldp x10, x11, [x0, #16] |
| ; CHECK-NEXT: add x8, x8, x9 |
| ; CHECK-NEXT: add x9, x10, x11 |
| ; CHECK-NEXT: add x0, x8, x9 |
| ; CHECK-NEXT: ret |
| %wide.load = load <4 x i64>, ptr %p, align 4 |
| %l0 = extractelement <4 x i64> %wide.load, i32 0 |
| %l1 = extractelement <4 x i64> %wide.load, i32 1 |
| %l2 = extractelement <4 x i64> %wide.load, i32 2 |
| %l3 = extractelement <4 x i64> %wide.load, i32 3 |
| %a0 = add i64 %l0, %l1 |
| %a1 = add i64 %l2, %l3 |
| %r = add i64 %a0, %a1 |
| ret i64 %r |
| } |
| |
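; The sext of each extracted i32 is folded into the scalarized loads as ldpsw.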
| define i64 @scalarize_v4i32_sext(ptr %p) { |
| ; CHECK-LABEL: scalarize_v4i32_sext: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldpsw x9, x8, [x0, #8] |
| ; CHECK-NEXT: ldpsw x11, x10, [x0] |
| ; CHECK-NEXT: add x8, x9, x8 |
| ; CHECK-NEXT: add x10, x11, x10 |
| ; CHECK-NEXT: add x0, x10, x8 |
| ; CHECK-NEXT: ret |
| %wide.load = load <4 x i32>, ptr %p, align 4 |
| %ext = sext <4 x i32> %wide.load to <4 x i64> |
| %l0 = extractelement <4 x i64> %ext, i32 0 |
| %l1 = extractelement <4 x i64> %ext, i32 1 |
| %l2 = extractelement <4 x i64> %ext, i32 2 |
| %l3 = extractelement <4 x i64> %ext, i32 3 |
| %a0 = add i64 %l0, %l1 |
| %a1 = add i64 %l2, %l3 |
| %r = add i64 %a0, %a1 |
| ret i64 %r |
| } |
| |
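; The zext is free: the 32-bit ldp loads already zero the upper halves of the
; x registers used by the 64-bit adds.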
| define i64 @scalarize_v4i32_zext(ptr %p) { |
| ; CHECK-LABEL: scalarize_v4i32_zext: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldp w9, w8, [x0, #8] |
| ; CHECK-NEXT: ldp w11, w10, [x0] |
| ; CHECK-NEXT: add x8, x9, x8 |
| ; CHECK-NEXT: add x10, x11, x10 |
| ; CHECK-NEXT: add x0, x10, x8 |
| ; CHECK-NEXT: ret |
| %wide.load = load <4 x i32>, ptr %p, align 4 |
| %ext = zext <4 x i32> %wide.load to <4 x i64> |
| %l0 = extractelement <4 x i64> %ext, i32 0 |
| %l1 = extractelement <4 x i64> %ext, i32 1 |
| %l2 = extractelement <4 x i64> %ext, i32 2 |
| %l3 = extractelement <4 x i64> %ext, i32 3 |
| %a0 = add i64 %l0, %l1 |
| %a1 = add i64 %l2, %l3 |
| %r = add i64 %a0, %a1 |
| ret i64 %r |
| } |
| |
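; Without fullfp16 the v4f16 load stays a vector load; lanes are extracted
; with mov and each fadd goes through fcvt to single precision and back.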
| define half @scalarize_v4f16(ptr %p) { |
| ; CHECK-LABEL: scalarize_v4f16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldr d0, [x0] |
| ; CHECK-NEXT: mov h1, v0.h[1] |
| ; CHECK-NEXT: mov h2, v0.h[2] |
| ; CHECK-NEXT: mov h3, v0.h[3] |
| ; CHECK-NEXT: fcvt s0, h0 |
| ; CHECK-NEXT: fcvt s1, h1 |
| ; CHECK-NEXT: fcvt s3, h3 |
| ; CHECK-NEXT: fcvt s2, h2 |
| ; CHECK-NEXT: fadd s0, s0, s1 |
| ; CHECK-NEXT: fadd s1, s2, s3 |
| ; CHECK-NEXT: fcvt h0, s0 |
| ; CHECK-NEXT: fcvt h1, s1 |
| ; CHECK-NEXT: fcvt s1, h1 |
| ; CHECK-NEXT: fcvt s0, h0 |
| ; CHECK-NEXT: fadd s0, s0, s1 |
| ; CHECK-NEXT: fcvt h0, s0 |
| ; CHECK-NEXT: ret |
| %wide.load = load <4 x half>, ptr %p, align 4 |
| %l0 = extractelement <4 x half> %wide.load, i32 0 |
| %l1 = extractelement <4 x half> %wide.load, i32 1 |
| %l2 = extractelement <4 x half> %wide.load, i32 2 |
| %l3 = extractelement <4 x half> %wide.load, i32 3 |
| %a0 = fadd half %l0, %l1 |
| %a1 = fadd half %l2, %l3 |
| %r = fadd half %a0, %a1 |
| ret half %r |
| } |
| |
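; The v4f32 load stays a q-register load; lanes 0 and 1 are reduced with faddp.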
| define float @scalarize_v4f32(ptr %p) { |
| ; CHECK-LABEL: scalarize_v4f32: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldr q0, [x0] |
| ; CHECK-NEXT: mov s1, v0.s[2] |
| ; CHECK-NEXT: mov s2, v0.s[3] |
| ; CHECK-NEXT: faddp s0, v0.2s |
| ; CHECK-NEXT: fadd s1, s1, s2 |
| ; CHECK-NEXT: fadd s0, s0, s1 |
| ; CHECK-NEXT: ret |
| %wide.load = load <4 x float>, ptr %p, align 4 |
| %l0 = extractelement <4 x float> %wide.load, i32 0 |
| %l1 = extractelement <4 x float> %wide.load, i32 1 |
| %l2 = extractelement <4 x float> %wide.load, i32 2 |
| %l3 = extractelement <4 x float> %wide.load, i32 3 |
| %a0 = fadd float %l0, %l1 |
| %a1 = fadd float %l2, %l3 |
| %r = fadd float %a0, %a1 |
| ret float %r |
| } |
| |
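; The v4f64 load stays as two q-register loads, each pair reduced with faddp.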
| define double @scalarize_v4f64(ptr %p) { |
| ; CHECK-LABEL: scalarize_v4f64: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ldp q1, q0, [x0] |
| ; CHECK-NEXT: faddp d1, v1.2d |
| ; CHECK-NEXT: faddp d0, v0.2d |
| ; CHECK-NEXT: fadd d0, d1, d0 |
| ; CHECK-NEXT: ret |
| %wide.load = load <4 x double>, ptr %p, align 4 |
| %l0 = extractelement <4 x double> %wide.load, i32 0 |
| %l1 = extractelement <4 x double> %wide.load, i32 1 |
| %l2 = extractelement <4 x double> %wide.load, i32 2 |
| %l3 = extractelement <4 x double> %wide.load, i32 3 |
| %a0 = fadd double %l0, %l1 |
| %a1 = fadd double %l2, %l3 |
| %r = fadd double %a0, %a1 |
| ret double %r |
| } |
|
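; The <16 x i64> index load is scalarized into ldp pairs, and each index feeds
; a register-offset ldr of a float from %rawA (a gather-like pattern).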
| define float @scalarize_into_load(i64 %22, ptr %23, ptr %rawA, ptr %rawB) { |
| ; CHECK-LABEL: scalarize_into_load: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: ldp x8, x9, [x1] |
| ; CHECK-NEXT: ldp x10, x11, [x1, #16] |
| ; CHECK-NEXT: ldp x12, x13, [x1, #64] |
| ; CHECK-NEXT: ldr s0, [x2, x8, lsl #2] |
| ; CHECK-NEXT: ldr s1, [x2, x9, lsl #2] |
| ; CHECK-NEXT: ldp x8, x9, [x1, #32] |
| ; CHECK-NEXT: ldr s2, [x2, x10, lsl #2] |
| ; CHECK-NEXT: ldr s3, [x2, x11, lsl #2] |
| ; CHECK-NEXT: fadd s0, s0, s1 |
| ; CHECK-NEXT: ldr s6, [x2, x12, lsl #2] |
| ; CHECK-NEXT: ldp x10, x11, [x1, #48] |
| ; CHECK-NEXT: ldr s7, [x2, x13, lsl #2] |
| ; CHECK-NEXT: fadd s1, s2, s3 |
| ; CHECK-NEXT: ldr s2, [x2, x8, lsl #2] |
| ; CHECK-NEXT: ldr s3, [x2, x9, lsl #2] |
| ; CHECK-NEXT: ldp x14, x15, [x1, #80] |
| ; CHECK-NEXT: fadd s2, s2, s3 |
| ; CHECK-NEXT: ldr s4, [x2, x10, lsl #2] |
| ; CHECK-NEXT: ldr s5, [x2, x11, lsl #2] |
| ; CHECK-NEXT: ldp x16, x17, [x1, #96] |
| ; CHECK-NEXT: fadd s3, s4, s5 |
| ; CHECK-NEXT: fadd s4, s6, s7 |
| ; CHECK-NEXT: fadd s0, s0, s1 |
| ; CHECK-NEXT: ldp x18, x0, [x1, #112] |
| ; CHECK-NEXT: ldr s16, [x2, x14, lsl #2] |
| ; CHECK-NEXT: ldr s17, [x2, x15, lsl #2] |
| ; CHECK-NEXT: ldr s18, [x2, x16, lsl #2] |
| ; CHECK-NEXT: ldr s19, [x2, x17, lsl #2] |
| ; CHECK-NEXT: ldr s20, [x2, x18, lsl #2] |
| ; CHECK-NEXT: ldr s21, [x2, x0, lsl #2] |
| ; CHECK-NEXT: fadd s5, s16, s17 |
| ; CHECK-NEXT: fadd s6, s18, s19 |
| ; CHECK-NEXT: fadd s1, s2, s3 |
| ; CHECK-NEXT: fadd s7, s20, s21 |
| ; CHECK-NEXT: fadd s2, s4, s5 |
| ; CHECK-NEXT: fadd s0, s0, s1 |
| ; CHECK-NEXT: fadd s3, s6, s7 |
| ; CHECK-NEXT: fadd s1, s2, s3 |
| ; CHECK-NEXT: fadd s0, s0, s1 |
| ; CHECK-NEXT: ret |
| entry: |
| %wide.load = load <16 x i64>, ptr %23, align 4 |
| %25 = extractelement <16 x i64> %wide.load, i32 0 |
| %26 = getelementptr inbounds float, ptr %rawA, i64 %25 |
| %27 = extractelement <16 x i64> %wide.load, i32 1 |
| %28 = getelementptr inbounds float, ptr %rawA, i64 %27 |
| %29 = extractelement <16 x i64> %wide.load, i32 2 |
| %30 = getelementptr inbounds float, ptr %rawA, i64 %29 |
| %31 = extractelement <16 x i64> %wide.load, i32 3 |
| %32 = getelementptr inbounds float, ptr %rawA, i64 %31 |
| %33 = extractelement <16 x i64> %wide.load, i32 4 |
| %34 = getelementptr inbounds float, ptr %rawA, i64 %33 |
| %35 = extractelement <16 x i64> %wide.load, i32 5 |
| %36 = getelementptr inbounds float, ptr %rawA, i64 %35 |
| %37 = extractelement <16 x i64> %wide.load, i32 6 |
| %38 = getelementptr inbounds float, ptr %rawA, i64 %37 |
| %39 = extractelement <16 x i64> %wide.load, i32 7 |
| %40 = getelementptr inbounds float, ptr %rawA, i64 %39 |
| %41 = extractelement <16 x i64> %wide.load, i32 8 |
| %42 = getelementptr inbounds float, ptr %rawA, i64 %41 |
| %43 = extractelement <16 x i64> %wide.load, i32 9 |
| %44 = getelementptr inbounds float, ptr %rawA, i64 %43 |
| %45 = extractelement <16 x i64> %wide.load, i32 10 |
| %46 = getelementptr inbounds float, ptr %rawA, i64 %45 |
| %47 = extractelement <16 x i64> %wide.load, i32 11 |
| %48 = getelementptr inbounds float, ptr %rawA, i64 %47 |
| %49 = extractelement <16 x i64> %wide.load, i32 12 |
| %50 = getelementptr inbounds float, ptr %rawA, i64 %49 |
| %51 = extractelement <16 x i64> %wide.load, i32 13 |
| %52 = getelementptr inbounds float, ptr %rawA, i64 %51 |
| %53 = extractelement <16 x i64> %wide.load, i32 14 |
| %54 = getelementptr inbounds float, ptr %rawA, i64 %53 |
| %55 = extractelement <16 x i64> %wide.load, i32 15 |
| %56 = getelementptr inbounds float, ptr %rawA, i64 %55 |
| %59 = load float, ptr %26, align 4 |
| %60 = load float, ptr %28, align 4 |
| %61 = load float, ptr %30, align 4 |
| %62 = load float, ptr %32, align 4 |
| %63 = load float, ptr %34, align 4 |
| %64 = load float, ptr %36, align 4 |
| %65 = load float, ptr %38, align 4 |
| %66 = load float, ptr %40, align 4 |
| %67 = load float, ptr %42, align 4 |
| %68 = load float, ptr %44, align 4 |
| %69 = load float, ptr %46, align 4 |
| %70 = load float, ptr %48, align 4 |
| %71 = load float, ptr %50, align 4 |
| %72 = load float, ptr %52, align 4 |
| %73 = load float, ptr %54, align 4 |
| %74 = load float, ptr %56, align 4 |
| %a1 = fadd float %59, %60 |
| %a2 = fadd float %61, %62 |
| %a3 = fadd float %63, %64 |
| %a4 = fadd float %65, %66 |
| %a5 = fadd float %67, %68 |
| %a6 = fadd float %69, %70 |
| %a7 = fadd float %71, %72 |
| %a8 = fadd float %73, %74 |
| %a9 = fadd float %a1, %a2 |
| %a10 = fadd float %a3, %a4 |
| %a11 = fadd float %a5, %a6 |
| %a12 = fadd float %a7, %a8 |
| %a13 = fadd float %a9, %a10 |
| %a14 = fadd float %a11, %a12 |
| %a15 = fadd float %a13, %a14 |
| ret float %a15 |
| } |
| |
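; Same as above with i32 indices sign-extended to i64; the sext is folded into
; ldpsw index loads.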
| define float @scalarize_into_load_sext(i64 %22, ptr %23, ptr %rawA, ptr %rawB) { |
| ; CHECK-LABEL: scalarize_into_load_sext: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: ldpsw x9, x8, [x1] |
| ; CHECK-NEXT: ldpsw x11, x10, [x1, #8] |
| ; CHECK-NEXT: ldpsw x13, x12, [x1, #24] |
| ; CHECK-NEXT: ldr s0, [x2, x9, lsl #2] |
| ; CHECK-NEXT: ldr s1, [x2, x8, lsl #2] |
| ; CHECK-NEXT: ldpsw x9, x8, [x1, #56] |
| ; CHECK-NEXT: ldr s2, [x2, x11, lsl #2] |
| ; CHECK-NEXT: ldr s3, [x2, x10, lsl #2] |
| ; CHECK-NEXT: fadd s0, s0, s1 |
| ; CHECK-NEXT: ldpsw x11, x10, [x1, #48] |
| ; CHECK-NEXT: ldpsw x15, x14, [x1, #16] |
| ; CHECK-NEXT: ldpsw x17, x16, [x1, #40] |
| ; CHECK-NEXT: ldpsw x0, x18, [x1, #32] |
| ; CHECK-NEXT: fadd s1, s2, s3 |
| ; CHECK-NEXT: ldr s2, [x2, x15, lsl #2] |
| ; CHECK-NEXT: ldr s3, [x2, x14, lsl #2] |
| ; CHECK-NEXT: ldr s4, [x2, x13, lsl #2] |
| ; CHECK-NEXT: ldr s5, [x2, x12, lsl #2] |
| ; CHECK-NEXT: ldr s16, [x2, x17, lsl #2] |
| ; CHECK-NEXT: ldr s6, [x2, x0, lsl #2] |
| ; CHECK-NEXT: fadd s2, s2, s3 |
| ; CHECK-NEXT: ldr s7, [x2, x18, lsl #2] |
| ; CHECK-NEXT: ldr s17, [x2, x16, lsl #2] |
| ; CHECK-NEXT: fadd s3, s4, s5 |
| ; CHECK-NEXT: ldr s18, [x2, x11, lsl #2] |
| ; CHECK-NEXT: ldr s19, [x2, x10, lsl #2] |
| ; CHECK-NEXT: fadd s4, s6, s7 |
| ; CHECK-NEXT: fadd s0, s0, s1 |
| ; CHECK-NEXT: ldr s20, [x2, x9, lsl #2] |
| ; CHECK-NEXT: ldr s21, [x2, x8, lsl #2] |
| ; CHECK-NEXT: fadd s5, s16, s17 |
| ; CHECK-NEXT: fadd s6, s18, s19 |
| ; CHECK-NEXT: fadd s7, s20, s21 |
| ; CHECK-NEXT: fadd s1, s2, s3 |
| ; CHECK-NEXT: fadd s2, s4, s5 |
| ; CHECK-NEXT: fadd s3, s6, s7 |
| ; CHECK-NEXT: fadd s0, s0, s1 |
| ; CHECK-NEXT: fadd s1, s2, s3 |
| ; CHECK-NEXT: fadd s0, s0, s1 |
| ; CHECK-NEXT: ret |
| entry: |
| %wide.load = load <16 x i32>, ptr %23, align 4 |
| %24 = sext <16 x i32> %wide.load to <16 x i64> |
| %25 = extractelement <16 x i64> %24, i32 0 |
| %26 = getelementptr inbounds float, ptr %rawA, i64 %25 |
| %27 = extractelement <16 x i64> %24, i32 1 |
| %28 = getelementptr inbounds float, ptr %rawA, i64 %27 |
| %29 = extractelement <16 x i64> %24, i32 2 |
| %30 = getelementptr inbounds float, ptr %rawA, i64 %29 |
| %31 = extractelement <16 x i64> %24, i32 3 |
| %32 = getelementptr inbounds float, ptr %rawA, i64 %31 |
| %33 = extractelement <16 x i64> %24, i32 4 |
| %34 = getelementptr inbounds float, ptr %rawA, i64 %33 |
| %35 = extractelement <16 x i64> %24, i32 5 |
| %36 = getelementptr inbounds float, ptr %rawA, i64 %35 |
| %37 = extractelement <16 x i64> %24, i32 6 |
| %38 = getelementptr inbounds float, ptr %rawA, i64 %37 |
| %39 = extractelement <16 x i64> %24, i32 7 |
| %40 = getelementptr inbounds float, ptr %rawA, i64 %39 |
| %41 = extractelement <16 x i64> %24, i32 8 |
| %42 = getelementptr inbounds float, ptr %rawA, i64 %41 |
| %43 = extractelement <16 x i64> %24, i32 9 |
| %44 = getelementptr inbounds float, ptr %rawA, i64 %43 |
| %45 = extractelement <16 x i64> %24, i32 10 |
| %46 = getelementptr inbounds float, ptr %rawA, i64 %45 |
| %47 = extractelement <16 x i64> %24, i32 11 |
| %48 = getelementptr inbounds float, ptr %rawA, i64 %47 |
| %49 = extractelement <16 x i64> %24, i32 12 |
| %50 = getelementptr inbounds float, ptr %rawA, i64 %49 |
| %51 = extractelement <16 x i64> %24, i32 13 |
| %52 = getelementptr inbounds float, ptr %rawA, i64 %51 |
| %53 = extractelement <16 x i64> %24, i32 14 |
| %54 = getelementptr inbounds float, ptr %rawA, i64 %53 |
| %55 = extractelement <16 x i64> %24, i32 15 |
| %56 = getelementptr inbounds float, ptr %rawA, i64 %55 |
| %59 = load float, ptr %26, align 4 |
| %60 = load float, ptr %28, align 4 |
| %61 = load float, ptr %30, align 4 |
| %62 = load float, ptr %32, align 4 |
| %63 = load float, ptr %34, align 4 |
| %64 = load float, ptr %36, align 4 |
| %65 = load float, ptr %38, align 4 |
| %66 = load float, ptr %40, align 4 |
| %67 = load float, ptr %42, align 4 |
| %68 = load float, ptr %44, align 4 |
| %69 = load float, ptr %46, align 4 |
| %70 = load float, ptr %48, align 4 |
| %71 = load float, ptr %50, align 4 |
| %72 = load float, ptr %52, align 4 |
| %73 = load float, ptr %54, align 4 |
| %74 = load float, ptr %56, align 4 |
| %a1 = fadd float %59, %60 |
| %a2 = fadd float %61, %62 |
| %a3 = fadd float %63, %64 |
| %a4 = fadd float %65, %66 |
| %a5 = fadd float %67, %68 |
| %a6 = fadd float %69, %70 |
| %a7 = fadd float %71, %72 |
| %a8 = fadd float %73, %74 |
| %a9 = fadd float %a1, %a2 |
| %a10 = fadd float %a3, %a4 |
| %a11 = fadd float %a5, %a6 |
| %a12 = fadd float %a7, %a8 |
| %a13 = fadd float %a9, %a10 |
| %a14 = fadd float %a11, %a12 |
| %a15 = fadd float %a13, %a14 |
| ret float %a15 |
| } |
| |
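; Same with zero-extended i32 indices; the 32-bit ldp loads provide the zero
; extension for free.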
| define float @scalarize_into_load_zext(i64 %22, ptr %23, ptr %rawA, ptr %rawB) { |
| ; CHECK-LABEL: scalarize_into_load_zext: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: ldp w9, w8, [x1] |
| ; CHECK-NEXT: ldp w11, w10, [x1, #8] |
| ; CHECK-NEXT: ldp w13, w12, [x1, #24] |
| ; CHECK-NEXT: ldr s0, [x2, x9, lsl #2] |
| ; CHECK-NEXT: ldr s1, [x2, x8, lsl #2] |
| ; CHECK-NEXT: ldp w9, w8, [x1, #56] |
| ; CHECK-NEXT: ldr s2, [x2, x11, lsl #2] |
| ; CHECK-NEXT: ldr s3, [x2, x10, lsl #2] |
| ; CHECK-NEXT: fadd s0, s0, s1 |
| ; CHECK-NEXT: ldp w11, w10, [x1, #48] |
| ; CHECK-NEXT: ldp w15, w14, [x1, #16] |
| ; CHECK-NEXT: ldp w17, w16, [x1, #40] |
| ; CHECK-NEXT: ldp w0, w18, [x1, #32] |
| ; CHECK-NEXT: fadd s1, s2, s3 |
| ; CHECK-NEXT: ldr s2, [x2, x15, lsl #2] |
| ; CHECK-NEXT: ldr s3, [x2, x14, lsl #2] |
| ; CHECK-NEXT: ldr s4, [x2, x13, lsl #2] |
| ; CHECK-NEXT: ldr s5, [x2, x12, lsl #2] |
| ; CHECK-NEXT: ldr s16, [x2, x17, lsl #2] |
| ; CHECK-NEXT: ldr s6, [x2, x0, lsl #2] |
| ; CHECK-NEXT: fadd s2, s2, s3 |
| ; CHECK-NEXT: ldr s7, [x2, x18, lsl #2] |
| ; CHECK-NEXT: ldr s17, [x2, x16, lsl #2] |
| ; CHECK-NEXT: fadd s3, s4, s5 |
| ; CHECK-NEXT: ldr s18, [x2, x11, lsl #2] |
| ; CHECK-NEXT: ldr s19, [x2, x10, lsl #2] |
| ; CHECK-NEXT: fadd s4, s6, s7 |
| ; CHECK-NEXT: fadd s0, s0, s1 |
| ; CHECK-NEXT: ldr s20, [x2, x9, lsl #2] |
| ; CHECK-NEXT: ldr s21, [x2, x8, lsl #2] |
| ; CHECK-NEXT: fadd s5, s16, s17 |
| ; CHECK-NEXT: fadd s6, s18, s19 |
| ; CHECK-NEXT: fadd s7, s20, s21 |
| ; CHECK-NEXT: fadd s1, s2, s3 |
| ; CHECK-NEXT: fadd s2, s4, s5 |
| ; CHECK-NEXT: fadd s3, s6, s7 |
| ; CHECK-NEXT: fadd s0, s0, s1 |
| ; CHECK-NEXT: fadd s1, s2, s3 |
| ; CHECK-NEXT: fadd s0, s0, s1 |
| ; CHECK-NEXT: ret |
| entry: |
| %wide.load = load <16 x i32>, ptr %23, align 4 |
| %24 = zext <16 x i32> %wide.load to <16 x i64> |
| %25 = extractelement <16 x i64> %24, i32 0 |
| %26 = getelementptr inbounds float, ptr %rawA, i64 %25 |
| %27 = extractelement <16 x i64> %24, i32 1 |
| %28 = getelementptr inbounds float, ptr %rawA, i64 %27 |
| %29 = extractelement <16 x i64> %24, i32 2 |
| %30 = getelementptr inbounds float, ptr %rawA, i64 %29 |
| %31 = extractelement <16 x i64> %24, i32 3 |
| %32 = getelementptr inbounds float, ptr %rawA, i64 %31 |
| %33 = extractelement <16 x i64> %24, i32 4 |
| %34 = getelementptr inbounds float, ptr %rawA, i64 %33 |
| %35 = extractelement <16 x i64> %24, i32 5 |
| %36 = getelementptr inbounds float, ptr %rawA, i64 %35 |
| %37 = extractelement <16 x i64> %24, i32 6 |
| %38 = getelementptr inbounds float, ptr %rawA, i64 %37 |
| %39 = extractelement <16 x i64> %24, i32 7 |
| %40 = getelementptr inbounds float, ptr %rawA, i64 %39 |
| %41 = extractelement <16 x i64> %24, i32 8 |
| %42 = getelementptr inbounds float, ptr %rawA, i64 %41 |
| %43 = extractelement <16 x i64> %24, i32 9 |
| %44 = getelementptr inbounds float, ptr %rawA, i64 %43 |
| %45 = extractelement <16 x i64> %24, i32 10 |
| %46 = getelementptr inbounds float, ptr %rawA, i64 %45 |
| %47 = extractelement <16 x i64> %24, i32 11 |
| %48 = getelementptr inbounds float, ptr %rawA, i64 %47 |
| %49 = extractelement <16 x i64> %24, i32 12 |
| %50 = getelementptr inbounds float, ptr %rawA, i64 %49 |
| %51 = extractelement <16 x i64> %24, i32 13 |
| %52 = getelementptr inbounds float, ptr %rawA, i64 %51 |
| %53 = extractelement <16 x i64> %24, i32 14 |
| %54 = getelementptr inbounds float, ptr %rawA, i64 %53 |
| %55 = extractelement <16 x i64> %24, i32 15 |
| %56 = getelementptr inbounds float, ptr %rawA, i64 %55 |
| %59 = load float, ptr %26, align 4 |
| %60 = load float, ptr %28, align 4 |
| %61 = load float, ptr %30, align 4 |
| %62 = load float, ptr %32, align 4 |
| %63 = load float, ptr %34, align 4 |
| %64 = load float, ptr %36, align 4 |
| %65 = load float, ptr %38, align 4 |
| %66 = load float, ptr %40, align 4 |
| %67 = load float, ptr %42, align 4 |
| %68 = load float, ptr %44, align 4 |
| %69 = load float, ptr %46, align 4 |
| %70 = load float, ptr %48, align 4 |
| %71 = load float, ptr %50, align 4 |
| %72 = load float, ptr %52, align 4 |
| %73 = load float, ptr %54, align 4 |
| %74 = load float, ptr %56, align 4 |
| %a1 = fadd float %59, %60 |
| %a2 = fadd float %61, %62 |
| %a3 = fadd float %63, %64 |
| %a4 = fadd float %65, %66 |
| %a5 = fadd float %67, %68 |
| %a6 = fadd float %69, %70 |
| %a7 = fadd float %71, %72 |
| %a8 = fadd float %73, %74 |
| %a9 = fadd float %a1, %a2 |
| %a10 = fadd float %a3, %a4 |
| %a11 = fadd float %a5, %a6 |
| %a12 = fadd float %a7, %a8 |
| %a13 = fadd float %a9, %a10 |
| %a14 = fadd float %a11, %a12 |
| %a15 = fadd float %a13, %a14 |
| ret float %a15 |
| } |