; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
; Various reductions generated from SLP vectorizing unrolled loops. Generated
; from https://godbolt.org/z/ebxdPh1Kz with some less interesting cases removed.
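; For reference, each addvNi32iM function below sums N elements of an M-bit
; array into an i32 accumulator, and the mlav variants reduce a pairwise
; product. A sketch of the kind of unrolled-loop C source involved (an
; assumption based on the function names, not the exact godbolt listing):
;
;   int addv4i32i32(int *x) {
;     int s = 0;
;     for (int i = 0; i < 4; i++) // fully unrolled, then SLP-vectorized
;       s += x[i];
;     return s;
;   }
;
;   int mlav4i32i32(int *x, int *y) {
;     int s = 0;
;     for (int i = 0; i < 4; i++) // becomes a vmlav dot-product reduction
;       s += x[i] * y[i];
;     return s;
;   }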
define i32 @addv2i32i32(ptr %x) {
; CHECK-LABEL: addv2i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrd r0, r1, [r0]
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: bx lr
entry:
%0 = load i32, ptr %x, align 4
%arrayidx.1 = getelementptr inbounds i32, ptr %x, i32 1
%1 = load i32, ptr %arrayidx.1, align 4
%add.1 = add nsw i32 %1, %0
ret i32 %add.1
}
define i32 @addv4i32i32(ptr %x) {
; CHECK-LABEL: addv4i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i32>, ptr %x, align 4
%1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %0)
ret i32 %1
}
define i32 @addv8i32i32(ptr %x) {
; CHECK-LABEL: addv8i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vaddv.u32 r0, q1
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i32>, ptr %x, align 4
%1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %0)
ret i32 %1
}
define i32 @addv16i32i32(ptr %x) {
; CHECK-LABEL: addv16i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vaddv.u32 r2, q1
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <16 x i32>, ptr %x, align 4
%1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %0)
ret i32 %1
}
define i32 @addv24i32i32(ptr %x) {
; CHECK-LABEL: addv24i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vaddv.u32 r2, q1
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i32>, ptr %x, align 4
%arrayidx.8 = getelementptr inbounds i32, ptr %x, i32 8
%1 = load <16 x i32>, ptr %arrayidx.8, align 4
%2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
%3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %0)
%op.rdx = add nsw i32 %2, %3
ret i32 %op.rdx
}
define i32 @addv32i32i32(ptr %x) {
; CHECK-LABEL: addv32i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: mov r1, r0
; CHECK-NEXT: vaddv.u32 r0, q1
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: vldrw.u32 q0, [r1, #32]
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: vldrw.u32 q0, [r1, #48]
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: vldrw.u32 q0, [r1, #64]
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: vldrw.u32 q0, [r1, #80]
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: vldrw.u32 q0, [r1, #96]
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: vldrw.u32 q0, [r1, #112]
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <32 x i32>, ptr %x, align 4
%1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %0)
ret i32 %1
}
define i32 @addv64i32i32(ptr %x) {
; CHECK-LABEL: addv64i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vaddv.u32 r2, q1
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #96]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #112]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #128]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #144]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #160]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #176]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #192]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #208]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #224]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #240]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <64 x i32>, ptr %x, align 4
%1 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %0)
ret i32 %1
}
define i32 @addv128i32i32(ptr %x) {
; CHECK-LABEL: addv128i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vaddv.u32 r2, q1
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #96]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #112]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #128]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #144]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #160]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #176]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #192]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #208]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #224]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #240]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #256]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #272]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #288]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #304]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #320]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #336]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #352]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #368]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #384]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #400]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #416]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #432]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #448]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #464]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #480]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #496]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%wide.load = load <4 x i32>, ptr %x, align 4
%0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load)
%1 = getelementptr inbounds i32, ptr %x, i32 4
%wide.load.1 = load <4 x i32>, ptr %1, align 4
%2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.1)
%3 = add i32 %2, %0
%4 = getelementptr inbounds i32, ptr %x, i32 8
%wide.load.2 = load <4 x i32>, ptr %4, align 4
%5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.2)
%6 = add i32 %5, %3
%7 = getelementptr inbounds i32, ptr %x, i32 12
%wide.load.3 = load <4 x i32>, ptr %7, align 4
%8 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.3)
%9 = add i32 %8, %6
%10 = getelementptr inbounds i32, ptr %x, i32 16
%wide.load.4 = load <4 x i32>, ptr %10, align 4
%11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.4)
%12 = add i32 %11, %9
%13 = getelementptr inbounds i32, ptr %x, i32 20
%wide.load.5 = load <4 x i32>, ptr %13, align 4
%14 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.5)
%15 = add i32 %14, %12
%16 = getelementptr inbounds i32, ptr %x, i32 24
%wide.load.6 = load <4 x i32>, ptr %16, align 4
%17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.6)
%18 = add i32 %17, %15
%19 = getelementptr inbounds i32, ptr %x, i32 28
%wide.load.7 = load <4 x i32>, ptr %19, align 4
%20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.7)
%21 = add i32 %20, %18
%22 = getelementptr inbounds i32, ptr %x, i32 32
%wide.load.8 = load <4 x i32>, ptr %22, align 4
%23 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.8)
%24 = add i32 %23, %21
%25 = getelementptr inbounds i32, ptr %x, i32 36
%wide.load.9 = load <4 x i32>, ptr %25, align 4
%26 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.9)
%27 = add i32 %26, %24
%28 = getelementptr inbounds i32, ptr %x, i32 40
%wide.load.10 = load <4 x i32>, ptr %28, align 4
%29 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.10)
%30 = add i32 %29, %27
%31 = getelementptr inbounds i32, ptr %x, i32 44
%wide.load.11 = load <4 x i32>, ptr %31, align 4
%32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.11)
%33 = add i32 %32, %30
%34 = getelementptr inbounds i32, ptr %x, i32 48
%wide.load.12 = load <4 x i32>, ptr %34, align 4
%35 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.12)
%36 = add i32 %35, %33
%37 = getelementptr inbounds i32, ptr %x, i32 52
%wide.load.13 = load <4 x i32>, ptr %37, align 4
%38 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.13)
%39 = add i32 %38, %36
%40 = getelementptr inbounds i32, ptr %x, i32 56
%wide.load.14 = load <4 x i32>, ptr %40, align 4
%41 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.14)
%42 = add i32 %41, %39
%43 = getelementptr inbounds i32, ptr %x, i32 60
%wide.load.15 = load <4 x i32>, ptr %43, align 4
%44 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.15)
%45 = add i32 %44, %42
%46 = getelementptr inbounds i32, ptr %x, i32 64
%wide.load.16 = load <4 x i32>, ptr %46, align 4
%47 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.16)
%48 = add i32 %47, %45
%49 = getelementptr inbounds i32, ptr %x, i32 68
%wide.load.17 = load <4 x i32>, ptr %49, align 4
%50 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.17)
%51 = add i32 %50, %48
%52 = getelementptr inbounds i32, ptr %x, i32 72
%wide.load.18 = load <4 x i32>, ptr %52, align 4
%53 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.18)
%54 = add i32 %53, %51
%55 = getelementptr inbounds i32, ptr %x, i32 76
%wide.load.19 = load <4 x i32>, ptr %55, align 4
%56 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.19)
%57 = add i32 %56, %54
%58 = getelementptr inbounds i32, ptr %x, i32 80
%wide.load.20 = load <4 x i32>, ptr %58, align 4
%59 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.20)
%60 = add i32 %59, %57
%61 = getelementptr inbounds i32, ptr %x, i32 84
%wide.load.21 = load <4 x i32>, ptr %61, align 4
%62 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.21)
%63 = add i32 %62, %60
%64 = getelementptr inbounds i32, ptr %x, i32 88
%wide.load.22 = load <4 x i32>, ptr %64, align 4
%65 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.22)
%66 = add i32 %65, %63
%67 = getelementptr inbounds i32, ptr %x, i32 92
%wide.load.23 = load <4 x i32>, ptr %67, align 4
%68 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.23)
%69 = add i32 %68, %66
%70 = getelementptr inbounds i32, ptr %x, i32 96
%wide.load.24 = load <4 x i32>, ptr %70, align 4
%71 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.24)
%72 = add i32 %71, %69
%73 = getelementptr inbounds i32, ptr %x, i32 100
%wide.load.25 = load <4 x i32>, ptr %73, align 4
%74 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.25)
%75 = add i32 %74, %72
%76 = getelementptr inbounds i32, ptr %x, i32 104
%wide.load.26 = load <4 x i32>, ptr %76, align 4
%77 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.26)
%78 = add i32 %77, %75
%79 = getelementptr inbounds i32, ptr %x, i32 108
%wide.load.27 = load <4 x i32>, ptr %79, align 4
%80 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.27)
%81 = add i32 %80, %78
%82 = getelementptr inbounds i32, ptr %x, i32 112
%wide.load.28 = load <4 x i32>, ptr %82, align 4
%83 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.28)
%84 = add i32 %83, %81
%85 = getelementptr inbounds i32, ptr %x, i32 116
%wide.load.29 = load <4 x i32>, ptr %85, align 4
%86 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.29)
%87 = add i32 %86, %84
%88 = getelementptr inbounds i32, ptr %x, i32 120
%wide.load.30 = load <4 x i32>, ptr %88, align 4
%89 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.30)
%90 = add i32 %89, %87
%91 = getelementptr inbounds i32, ptr %x, i32 124
%wide.load.31 = load <4 x i32>, ptr %91, align 4
%92 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.31)
%93 = add i32 %92, %90
ret i32 %93
}
define i32 @addv2i32i16(ptr %x) {
; CHECK-LABEL: addv2i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrsh.w r1, [r0]
; CHECK-NEXT: ldrsh.w r0, [r0, #2]
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: bx lr
entry:
%0 = load i16, ptr %x, align 2
%conv = sext i16 %0 to i32
%arrayidx.1 = getelementptr inbounds i16, ptr %x, i32 1
%1 = load i16, ptr %arrayidx.1, align 2
%conv.1 = sext i16 %1 to i32
%add.1 = add nsw i32 %conv, %conv.1
ret i32 %add.1
}
define i32 @addv4i32i16(ptr %x) {
; CHECK-LABEL: addv4i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q0, [r0]
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i16>, ptr %x, align 2
%1 = sext <4 x i16> %0 to <4 x i32>
%2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
ret i32 %2
}
define i32 @addv8i32i16(ptr %x) {
; CHECK-LABEL: addv8i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vaddv.s16 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i16>, ptr %x, align 2
%1 = sext <8 x i16> %0 to <8 x i32>
%2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
ret i32 %2
}
define i32 @addv16i32i16(ptr %x) {
; CHECK-LABEL: addv16i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q1, [r0]
; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
; CHECK-NEXT: vaddv.u32 r2, q1
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #16]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <16 x i16>, ptr %x, align 2
%1 = sext <16 x i16> %0 to <16 x i32>
%2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
ret i32 %2
}
define i32 @addv24i32i16(ptr %x) {
; CHECK-LABEL: addv24i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q1, [r0]
; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
; CHECK-NEXT: vaddv.u32 r2, q1
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #16]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <16 x i16>, ptr %x, align 2
%1 = sext <16 x i16> %0 to <16 x i32>
%arrayidx.16 = getelementptr inbounds i16, ptr %x, i32 16
%2 = load <8 x i16>, ptr %arrayidx.16, align 2
%3 = sext <8 x i16> %2 to <8 x i32>
%4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
%5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
%op.rdx = add nsw i32 %4, %5
ret i32 %op.rdx
}
define i32 @addv32i32i16(ptr %x) {
; CHECK-LABEL: addv32i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q1, [r0]
; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
; CHECK-NEXT: vaddv.u32 r2, q1
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #16]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #32]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #40]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #48]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #56]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <32 x i16>, ptr %x, align 2
%1 = sext <32 x i16> %0 to <32 x i32>
%2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
ret i32 %2
}
define i32 @addv64i32i16(ptr %x) {
; CHECK-LABEL: addv64i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q1, [r0]
; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
; CHECK-NEXT: ldrsh.w r1, [r0, #120]
; CHECK-NEXT: vaddv.u32 r2, q1
; CHECK-NEXT: ldrsh.w r3, [r0, #122]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #16]
; CHECK-NEXT: ldrsh.w r12, [r0, #124]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #32]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #40]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #48]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #56]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #64]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #72]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #80]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #88]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #96]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #112]
; CHECK-NEXT: ldrsh.w r0, [r0, #126]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: add r1, r2
; CHECK-NEXT: add r1, r3
; CHECK-NEXT: add r1, r12
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: bx lr
entry:
%0 = load <32 x i16>, ptr %x, align 2
%1 = sext <32 x i16> %0 to <32 x i32>
%arrayidx.32 = getelementptr inbounds i16, ptr %x, i32 32
%2 = load <16 x i16>, ptr %arrayidx.32, align 2
%3 = sext <16 x i16> %2 to <16 x i32>
%arrayidx.48 = getelementptr inbounds i16, ptr %x, i32 48
%4 = load <8 x i16>, ptr %arrayidx.48, align 2
%5 = sext <8 x i16> %4 to <8 x i32>
%arrayidx.56 = getelementptr inbounds i16, ptr %x, i32 56
%6 = load <4 x i16>, ptr %arrayidx.56, align 2
%7 = sext <4 x i16> %6 to <4 x i32>
%arrayidx.60 = getelementptr inbounds i16, ptr %x, i32 60
%8 = load i16, ptr %arrayidx.60, align 2
%conv.60 = sext i16 %8 to i32
%arrayidx.61 = getelementptr inbounds i16, ptr %x, i32 61
%9 = load i16, ptr %arrayidx.61, align 2
%conv.61 = sext i16 %9 to i32
%arrayidx.62 = getelementptr inbounds i16, ptr %x, i32 62
%10 = load i16, ptr %arrayidx.62, align 2
%conv.62 = sext i16 %10 to i32
%11 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
%12 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
%op.rdx = add nsw i32 %11, %12
%13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5)
%op.rdx8 = add nsw i32 %op.rdx, %13
%14 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7)
%op.rdx9 = add nsw i32 %op.rdx8, %14
%15 = add nsw i32 %op.rdx9, %conv.60
%16 = add nsw i32 %15, %conv.61
%17 = add nsw i32 %16, %conv.62
%arrayidx.63 = getelementptr inbounds i16, ptr %x, i32 63
%18 = load i16, ptr %arrayidx.63, align 2
%conv.63 = sext i16 %18 to i32
%add.63 = add nsw i32 %17, %conv.63
ret i32 %add.63
}
define i32 @addv128i32i16(ptr %x) {
; CHECK-LABEL: addv128i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r0]
; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
; CHECK-NEXT: vaddv.s16 r2, q1
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #64]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #80]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #96]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #112]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #128]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #144]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #160]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #176]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #192]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #208]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #224]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #240]
; CHECK-NEXT: vaddva.s16 r2, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%wide.load = load <8 x i16>, ptr %x, align 2
%0 = sext <8 x i16> %wide.load to <8 x i32>
%1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %0)
%2 = getelementptr inbounds i16, ptr %x, i32 8
%wide.load.1 = load <8 x i16>, ptr %2, align 2
%3 = sext <8 x i16> %wide.load.1 to <8 x i32>
%4 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
%5 = add i32 %4, %1
%6 = getelementptr inbounds i16, ptr %x, i32 16
%wide.load.2 = load <8 x i16>, ptr %6, align 2
%7 = sext <8 x i16> %wide.load.2 to <8 x i32>
%8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %7)
%9 = add i32 %8, %5
%10 = getelementptr inbounds i16, ptr %x, i32 24
%wide.load.3 = load <8 x i16>, ptr %10, align 2
%11 = sext <8 x i16> %wide.load.3 to <8 x i32>
%12 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %11)
%13 = add i32 %12, %9
%14 = getelementptr inbounds i16, ptr %x, i32 32
%wide.load.4 = load <8 x i16>, ptr %14, align 2
%15 = sext <8 x i16> %wide.load.4 to <8 x i32>
%16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %15)
%17 = add i32 %16, %13
%18 = getelementptr inbounds i16, ptr %x, i32 40
%wide.load.5 = load <8 x i16>, ptr %18, align 2
%19 = sext <8 x i16> %wide.load.5 to <8 x i32>
%20 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %19)
%21 = add i32 %20, %17
%22 = getelementptr inbounds i16, ptr %x, i32 48
%wide.load.6 = load <8 x i16>, ptr %22, align 2
%23 = sext <8 x i16> %wide.load.6 to <8 x i32>
%24 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %23)
%25 = add i32 %24, %21
%26 = getelementptr inbounds i16, ptr %x, i32 56
%wide.load.7 = load <8 x i16>, ptr %26, align 2
%27 = sext <8 x i16> %wide.load.7 to <8 x i32>
%28 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %27)
%29 = add i32 %28, %25
%30 = getelementptr inbounds i16, ptr %x, i32 64
%wide.load.8 = load <8 x i16>, ptr %30, align 2
%31 = sext <8 x i16> %wide.load.8 to <8 x i32>
%32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %31)
%33 = add i32 %32, %29
%34 = getelementptr inbounds i16, ptr %x, i32 72
%wide.load.9 = load <8 x i16>, ptr %34, align 2
%35 = sext <8 x i16> %wide.load.9 to <8 x i32>
%36 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %35)
%37 = add i32 %36, %33
%38 = getelementptr inbounds i16, ptr %x, i32 80
%wide.load.10 = load <8 x i16>, ptr %38, align 2
%39 = sext <8 x i16> %wide.load.10 to <8 x i32>
%40 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %39)
%41 = add i32 %40, %37
%42 = getelementptr inbounds i16, ptr %x, i32 88
%wide.load.11 = load <8 x i16>, ptr %42, align 2
%43 = sext <8 x i16> %wide.load.11 to <8 x i32>
%44 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %43)
%45 = add i32 %44, %41
%46 = getelementptr inbounds i16, ptr %x, i32 96
%wide.load.12 = load <8 x i16>, ptr %46, align 2
%47 = sext <8 x i16> %wide.load.12 to <8 x i32>
%48 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %47)
%49 = add i32 %48, %45
%50 = getelementptr inbounds i16, ptr %x, i32 104
%wide.load.13 = load <8 x i16>, ptr %50, align 2
%51 = sext <8 x i16> %wide.load.13 to <8 x i32>
%52 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %51)
%53 = add i32 %52, %49
%54 = getelementptr inbounds i16, ptr %x, i32 112
%wide.load.14 = load <8 x i16>, ptr %54, align 2
%55 = sext <8 x i16> %wide.load.14 to <8 x i32>
%56 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %55)
%57 = add i32 %56, %53
%58 = getelementptr inbounds i16, ptr %x, i32 120
%wide.load.15 = load <8 x i16>, ptr %58, align 2
%59 = sext <8 x i16> %wide.load.15 to <8 x i32>
%60 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %59)
%61 = add i32 %60, %57
ret i32 %61
}
define i32 @addv2i32i8(ptr %x) {
; CHECK-LABEL: addv2i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrb r1, [r0]
; CHECK-NEXT: ldrb r0, [r0, #1]
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: bx lr
entry:
%0 = load i8, ptr %x, align 1
%conv = zext i8 %0 to i32
%arrayidx.1 = getelementptr inbounds i8, ptr %x, i32 1
%1 = load i8, ptr %arrayidx.1, align 1
%conv.1 = zext i8 %1 to i32
%add.1 = add nuw nsw i32 %conv, %conv.1
ret i32 %add.1
}
define i32 @addv4i32i8(ptr %x) {
; CHECK-LABEL: addv4i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q0, [r0]
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i8>, ptr %x, align 1
%1 = zext <4 x i8> %0 to <4 x i32>
%2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
ret i32 %2
}
define i32 @addv8i32i8(ptr %x) {
; CHECK-LABEL: addv8i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q0, [r0]
; CHECK-NEXT: vaddv.u16 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i8>, ptr %x, align 1
%1 = zext <8 x i8> %0 to <8 x i32>
%2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
ret i32 %2
}
define i32 @addv16i32i8(ptr %x) {
; CHECK-LABEL: addv16i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r0]
; CHECK-NEXT: vaddv.u8 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <16 x i8>, ptr %x, align 1
%1 = zext <16 x i8> %0 to <16 x i32>
%2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
ret i32 %2
}
define i32 @addv24i32i8(ptr %x) {
; CHECK-LABEL: addv24i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q1, [r0]
; CHECK-NEXT: vldrb.u16 q0, [r0, #16]
; CHECK-NEXT: vaddv.u8 r0, q1
; CHECK-NEXT: vaddva.u16 r0, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <16 x i8>, ptr %x, align 1
%1 = zext <16 x i8> %0 to <16 x i32>
%arrayidx.16 = getelementptr inbounds i8, ptr %x, i32 16
%2 = load <8 x i8>, ptr %arrayidx.16, align 1
%3 = zext <8 x i8> %2 to <8 x i32>
%4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
%5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
%op.rdx = add nuw nsw i32 %4, %5
ret i32 %op.rdx
}
define i32 @addv32i32i8(ptr %x) {
; CHECK-LABEL: addv32i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q1, [r0]
; CHECK-NEXT: vldrb.u32 q0, [r0, #4]
; CHECK-NEXT: vaddv.u32 r2, q1
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #8]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #12]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #16]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #20]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #24]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #28]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <32 x i8>, ptr %x, align 1
%1 = zext <32 x i8> %0 to <32 x i32>
%2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
ret i32 %2
}
define i32 @addv64i32i8(ptr %x) {
; CHECK-LABEL: addv64i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q1, [r0]
; CHECK-NEXT: vldrb.u32 q0, [r0, #4]
; CHECK-NEXT: ldrb.w r1, [r0, #60]
; CHECK-NEXT: vaddv.u32 r2, q1
; CHECK-NEXT: ldrb.w r3, [r0, #61]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #8]
; CHECK-NEXT: ldrb.w r12, [r0, #62]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #12]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #16]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #20]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #24]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #28]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #32]
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: vldrb.u16 q0, [r0, #48]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrb.u32 q0, [r0, #56]
; CHECK-NEXT: ldrb.w r0, [r0, #63]
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: add r1, r2
; CHECK-NEXT: add r1, r3
; CHECK-NEXT: add r1, r12
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: bx lr
entry:
%0 = load <32 x i8>, ptr %x, align 1
%1 = zext <32 x i8> %0 to <32 x i32>
%arrayidx.32 = getelementptr inbounds i8, ptr %x, i32 32
%2 = load <16 x i8>, ptr %arrayidx.32, align 1
%3 = zext <16 x i8> %2 to <16 x i32>
%arrayidx.48 = getelementptr inbounds i8, ptr %x, i32 48
%4 = load <8 x i8>, ptr %arrayidx.48, align 1
%5 = zext <8 x i8> %4 to <8 x i32>
%arrayidx.56 = getelementptr inbounds i8, ptr %x, i32 56
%6 = load <4 x i8>, ptr %arrayidx.56, align 1
%7 = zext <4 x i8> %6 to <4 x i32>
%arrayidx.60 = getelementptr inbounds i8, ptr %x, i32 60
%8 = load i8, ptr %arrayidx.60, align 1
%conv.60 = zext i8 %8 to i32
%arrayidx.61 = getelementptr inbounds i8, ptr %x, i32 61
%9 = load i8, ptr %arrayidx.61, align 1
%conv.61 = zext i8 %9 to i32
%arrayidx.62 = getelementptr inbounds i8, ptr %x, i32 62
%10 = load i8, ptr %arrayidx.62, align 1
%conv.62 = zext i8 %10 to i32
%11 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
%12 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
%op.rdx = add nuw nsw i32 %11, %12
%13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5)
%op.rdx8 = add nuw nsw i32 %op.rdx, %13
%14 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7)
%op.rdx9 = add nuw nsw i32 %op.rdx8, %14
%15 = add nuw nsw i32 %op.rdx9, %conv.60
%16 = add nuw nsw i32 %15, %conv.61
%17 = add nuw nsw i32 %16, %conv.62
%arrayidx.63 = getelementptr inbounds i8, ptr %x, i32 63
%18 = load i8, ptr %arrayidx.63, align 1
%conv.63 = zext i8 %18 to i32
%add.63 = add nuw nsw i32 %17, %conv.63
ret i32 %add.63
}
define i32 @addv128i32i8(ptr %x) {
; CHECK-LABEL: addv128i32i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q1, [r0]
; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
; CHECK-NEXT: mov r1, r0
; CHECK-NEXT: vaddv.u8 r0, q1
; CHECK-NEXT: vaddva.u8 r0, q0
; CHECK-NEXT: vldrb.u8 q0, [r1, #32]
; CHECK-NEXT: vaddva.u8 r0, q0
; CHECK-NEXT: vldrb.u8 q0, [r1, #48]
; CHECK-NEXT: vaddva.u8 r0, q0
; CHECK-NEXT: vldrb.u8 q0, [r1, #64]
; CHECK-NEXT: vaddva.u8 r0, q0
; CHECK-NEXT: vldrb.u8 q0, [r1, #80]
; CHECK-NEXT: vaddva.u8 r0, q0
; CHECK-NEXT: vldrb.u8 q0, [r1, #96]
; CHECK-NEXT: vaddva.u8 r0, q0
; CHECK-NEXT: vldrb.u8 q0, [r1, #112]
; CHECK-NEXT: vaddva.u8 r0, q0
; CHECK-NEXT: bx lr
entry:
%wide.load = load <16 x i8>, ptr %x, align 1
%0 = zext <16 x i8> %wide.load to <16 x i32>
%1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %0)
%2 = getelementptr inbounds i8, ptr %x, i32 16
%wide.load.1 = load <16 x i8>, ptr %2, align 1
%3 = zext <16 x i8> %wide.load.1 to <16 x i32>
%4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
%5 = add i32 %4, %1
%6 = getelementptr inbounds i8, ptr %x, i32 32
%wide.load.2 = load <16 x i8>, ptr %6, align 1
%7 = zext <16 x i8> %wide.load.2 to <16 x i32>
%8 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %7)
%9 = add i32 %8, %5
%10 = getelementptr inbounds i8, ptr %x, i32 48
%wide.load.3 = load <16 x i8>, ptr %10, align 1
%11 = zext <16 x i8> %wide.load.3 to <16 x i32>
%12 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %11)
%13 = add i32 %12, %9
%14 = getelementptr inbounds i8, ptr %x, i32 64
%wide.load.4 = load <16 x i8>, ptr %14, align 1
%15 = zext <16 x i8> %wide.load.4 to <16 x i32>
%16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %15)
%17 = add i32 %16, %13
%18 = getelementptr inbounds i8, ptr %x, i32 80
%wide.load.5 = load <16 x i8>, ptr %18, align 1
%19 = zext <16 x i8> %wide.load.5 to <16 x i32>
%20 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %19)
%21 = add i32 %20, %17
%22 = getelementptr inbounds i8, ptr %x, i32 96
%wide.load.6 = load <16 x i8>, ptr %22, align 1
%23 = zext <16 x i8> %wide.load.6 to <16 x i32>
%24 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %23)
%25 = add i32 %24, %21
%26 = getelementptr inbounds i8, ptr %x, i32 112
%wide.load.7 = load <16 x i8>, ptr %26, align 1
%27 = zext <16 x i8> %wide.load.7 to <16 x i32>
%28 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %27)
%29 = add i32 %28, %25
ret i32 %29
}
define signext i16 @addv2i16i16(ptr %x) {
; CHECK-LABEL: addv2i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrh r1, [r0]
; CHECK-NEXT: ldrh r0, [r0, #2]
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = load i16, ptr %x, align 2
%arrayidx.1 = getelementptr inbounds i16, ptr %x, i32 1
%1 = load i16, ptr %arrayidx.1, align 2
%add.1 = add i16 %1, %0
ret i16 %add.1
}
define signext i16 @addv4i16i16(ptr %x) {
; CHECK-LABEL: addv4i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u32 q0, [r0]
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i16>, ptr %x, align 2
%1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %0)
ret i16 %1
}
define signext i16 @addv8i16i16(ptr %x) {
; CHECK-LABEL: addv8i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vaddv.u16 r0, q0
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i16>, ptr %x, align 2
%1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %0)
ret i16 %1
}
define signext i16 @addv16i16i16(ptr %x) {
; CHECK-LABEL: addv16i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r0]
; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
; CHECK-NEXT: vaddv.u16 r0, q1
; CHECK-NEXT: vaddva.u16 r0, q0
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = load <16 x i16>, ptr %x, align 2
%1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %0)
ret i16 %1
}
define signext i16 @addv24i16i16(ptr %x) {
; CHECK-LABEL: addv24i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r0]
; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
; CHECK-NEXT: vaddv.u16 r2, q1
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: sxth r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i16>, ptr %x, align 2
%arrayidx.8 = getelementptr inbounds i16, ptr %x, i32 8
%1 = load <16 x i16>, ptr %arrayidx.8, align 2
%2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %1)
%3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %0)
%op.rdx = add i16 %2, %3
ret i16 %op.rdx
}
define signext i16 @addv32i16i16(ptr %x) {
; CHECK-LABEL: addv32i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r0]
; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
; CHECK-NEXT: vaddv.u16 r2, q1
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: sxth r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <32 x i16>, ptr %x, align 2
%1 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %0)
ret i16 %1
}
define signext i16 @addv64i16i16(ptr %x) {
; CHECK-LABEL: addv64i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r0]
; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
; CHECK-NEXT: vaddv.u16 r2, q1
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #64]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #80]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #96]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #112]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: sxth r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <64 x i16>, ptr %x, align 2
%1 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %0)
ret i16 %1
}
define signext i16 @addv128i16i16(ptr %x) {
; CHECK-LABEL: addv128i16i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q1, [r0]
; CHECK-NEXT: vldrh.u16 q0, [r0, #16]
; CHECK-NEXT: vaddv.u16 r2, q1
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #32]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #48]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #64]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #80]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #96]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #112]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #128]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #144]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #160]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #176]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #192]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #208]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #224]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: vldrh.u16 q0, [r0, #240]
; CHECK-NEXT: vaddva.u16 r2, q0
; CHECK-NEXT: sxth r0, r2
; CHECK-NEXT: bx lr
entry:
%wide.load = load <8 x i16>, ptr %x, align 2
%0 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load)
%1 = getelementptr inbounds i16, ptr %x, i32 8
%wide.load.1 = load <8 x i16>, ptr %1, align 2
%2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.1)
%3 = add i16 %2, %0
%4 = getelementptr inbounds i16, ptr %x, i32 16
%wide.load.2 = load <8 x i16>, ptr %4, align 2
%5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.2)
%6 = add i16 %5, %3
%7 = getelementptr inbounds i16, ptr %x, i32 24
%wide.load.3 = load <8 x i16>, ptr %7, align 2
%8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.3)
%9 = add i16 %8, %6
%10 = getelementptr inbounds i16, ptr %x, i32 32
%wide.load.4 = load <8 x i16>, ptr %10, align 2
%11 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.4)
%12 = add i16 %11, %9
%13 = getelementptr inbounds i16, ptr %x, i32 40
%wide.load.5 = load <8 x i16>, ptr %13, align 2
%14 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.5)
%15 = add i16 %14, %12
%16 = getelementptr inbounds i16, ptr %x, i32 48
%wide.load.6 = load <8 x i16>, ptr %16, align 2
%17 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.6)
%18 = add i16 %17, %15
%19 = getelementptr inbounds i16, ptr %x, i32 56
%wide.load.7 = load <8 x i16>, ptr %19, align 2
%20 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.7)
%21 = add i16 %20, %18
%22 = getelementptr inbounds i16, ptr %x, i32 64
%wide.load.8 = load <8 x i16>, ptr %22, align 2
%23 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.8)
%24 = add i16 %23, %21
%25 = getelementptr inbounds i16, ptr %x, i32 72
%wide.load.9 = load <8 x i16>, ptr %25, align 2
%26 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.9)
%27 = add i16 %26, %24
%28 = getelementptr inbounds i16, ptr %x, i32 80
%wide.load.10 = load <8 x i16>, ptr %28, align 2
%29 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.10)
%30 = add i16 %29, %27
%31 = getelementptr inbounds i16, ptr %x, i32 88
%wide.load.11 = load <8 x i16>, ptr %31, align 2
%32 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.11)
%33 = add i16 %32, %30
%34 = getelementptr inbounds i16, ptr %x, i32 96
%wide.load.12 = load <8 x i16>, ptr %34, align 2
%35 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.12)
%36 = add i16 %35, %33
%37 = getelementptr inbounds i16, ptr %x, i32 104
%wide.load.13 = load <8 x i16>, ptr %37, align 2
%38 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.13)
%39 = add i16 %38, %36
%40 = getelementptr inbounds i16, ptr %x, i32 112
%wide.load.14 = load <8 x i16>, ptr %40, align 2
%41 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.14)
%42 = add i16 %41, %39
%43 = getelementptr inbounds i16, ptr %x, i32 120
%wide.load.15 = load <8 x i16>, ptr %43, align 2
%44 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.15)
%45 = add i16 %44, %42
ret i16 %45
}
define zeroext i8 @addv2i8i8(ptr %x) {
; CHECK-LABEL: addv2i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrb r1, [r0]
; CHECK-NEXT: ldrb r0, [r0, #1]
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = load i8, ptr %x, align 1
%arrayidx.1 = getelementptr inbounds i8, ptr %x, i32 1
%1 = load i8, ptr %arrayidx.1, align 1
%add.1 = add i8 %1, %0
ret i8 %add.1
}
define zeroext i8 @addv4i8i8(ptr %x) {
; CHECK-LABEL: addv4i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u32 q0, [r0]
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i8>, ptr %x, align 1
%1 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %0)
ret i8 %1
}
define zeroext i8 @addv8i8i8(ptr %x) {
; CHECK-LABEL: addv8i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q0, [r0]
; CHECK-NEXT: vaddv.u16 r0, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i8>, ptr %x, align 1
%1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %0)
ret i8 %1
}
define zeroext i8 @addv16i8i8(ptr %x) {
; CHECK-LABEL: addv16i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r0]
; CHECK-NEXT: vaddv.u8 r0, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = load <16 x i8>, ptr %x, align 1
%1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %0)
ret i8 %1
}
define zeroext i8 @addv24i8i8(ptr %x) {
; CHECK-LABEL: addv24i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q1, [r0]
; CHECK-NEXT: vldrb.u8 q0, [r0, #8]
; CHECK-NEXT: vaddv.u16 r0, q1
; CHECK-NEXT: vaddva.u8 r0, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i8>, ptr %x, align 1
%arrayidx.8 = getelementptr inbounds i8, ptr %x, i32 8
%1 = load <16 x i8>, ptr %arrayidx.8, align 1
%2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %1)
%3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %0)
%op.rdx = add i8 %2, %3
ret i8 %op.rdx
}
define zeroext i8 @addv32i8i8(ptr %x) {
; CHECK-LABEL: addv32i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q1, [r0]
; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
; CHECK-NEXT: vaddv.u8 r0, q1
; CHECK-NEXT: vaddva.u8 r0, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%0 = load <32 x i8>, ptr %x, align 1
%1 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %0)
ret i8 %1
}
define zeroext i8 @addv64i8i8(ptr %x) {
; CHECK-LABEL: addv64i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q1, [r0]
; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
; CHECK-NEXT: vaddv.u8 r2, q1
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #32]
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #48]
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: uxtb r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <64 x i8>, ptr %x, align 1
%1 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %0)
ret i8 %1
}
define zeroext i8 @addv128i8i8(ptr %x) {
; CHECK-LABEL: addv128i8i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q1, [r0]
; CHECK-NEXT: vldrb.u8 q0, [r0, #16]
; CHECK-NEXT: vaddv.u8 r2, q1
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #32]
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #48]
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #64]
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #80]
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #96]
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: vldrb.u8 q0, [r0, #112]
; CHECK-NEXT: vaddva.u8 r2, q0
; CHECK-NEXT: uxtb r0, r2
; CHECK-NEXT: bx lr
entry:
%wide.load = load <16 x i8>, ptr %x, align 1
%0 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load)
%1 = getelementptr inbounds i8, ptr %x, i32 16
%wide.load.1 = load <16 x i8>, ptr %1, align 1
%2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.1)
%3 = add i8 %2, %0
%4 = getelementptr inbounds i8, ptr %x, i32 32
%wide.load.2 = load <16 x i8>, ptr %4, align 1
%5 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.2)
%6 = add i8 %5, %3
%7 = getelementptr inbounds i8, ptr %x, i32 48
%wide.load.3 = load <16 x i8>, ptr %7, align 1
%8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.3)
%9 = add i8 %8, %6
%10 = getelementptr inbounds i8, ptr %x, i32 64
%wide.load.4 = load <16 x i8>, ptr %10, align 1
%11 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.4)
%12 = add i8 %11, %9
%13 = getelementptr inbounds i8, ptr %x, i32 80
%wide.load.5 = load <16 x i8>, ptr %13, align 1
%14 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.5)
%15 = add i8 %14, %12
%16 = getelementptr inbounds i8, ptr %x, i32 96
%wide.load.6 = load <16 x i8>, ptr %16, align 1
%17 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.6)
%18 = add i8 %17, %15
%19 = getelementptr inbounds i8, ptr %x, i32 112
%wide.load.7 = load <16 x i8>, ptr %19, align 1
%20 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.7)
%21 = add i8 %20, %18
ret i8 %21
}
define i32 @mlav2i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav2i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrd r0, r2, [r0]
; CHECK-NEXT: ldrd r1, r3, [r1]
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: mla r0, r3, r2, r0
; CHECK-NEXT: bx lr
entry:
%0 = load i32, ptr %x, align 4
%1 = load i32, ptr %y, align 4
%mul = mul nsw i32 %1, %0
%arrayidx.1 = getelementptr inbounds i32, ptr %x, i32 1
%2 = load i32, ptr %arrayidx.1, align 4
%arrayidx1.1 = getelementptr inbounds i32, ptr %y, i32 1
%3 = load i32, ptr %arrayidx1.1, align 4
%mul.1 = mul nsw i32 %3, %2
%add.1 = add nsw i32 %mul.1, %mul
ret i32 %add.1
}
define i32 @mlav4i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav4i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vmlav.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i32>, ptr %x, align 4
%1 = load <4 x i32>, ptr %y, align 4
%2 = mul nsw <4 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
ret i32 %3
}
define i32 @mlav8i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav8i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vmlav.u32 r2, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
; CHECK-NEXT: vmlava.u32 r2, q1, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i32>, ptr %x, align 4
%1 = load <8 x i32>, ptr %y, align 4
%2 = mul nsw <8 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
ret i32 %3
}
define i32 @mlav16i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav16i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vmlav.u32 r2, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
; CHECK-NEXT: vmlava.u32 r2, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
; CHECK-NEXT: vldrw.u32 q1, [r1, #32]
; CHECK-NEXT: vmlava.u32 r2, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
; CHECK-NEXT: vmlava.u32 r2, q1, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <16 x i32>, ptr %x, align 4
%1 = load <16 x i32>, ptr %y, align 4
%2 = mul nsw <16 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
ret i32 %3
}
define i32 @mlav24i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav24i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: vmlav.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #16]
; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #32]
; CHECK-NEXT: vldrw.u32 q1, [r1, #32]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #48]
; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #64]
; CHECK-NEXT: vldrw.u32 q1, [r1, #64]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #80]
; CHECK-NEXT: vldrw.u32 q1, [r1, #80]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i32>, ptr %x, align 4
%1 = load <8 x i32>, ptr %y, align 4
%2 = mul nsw <8 x i32> %1, %0
%arrayidx.8 = getelementptr inbounds i32, ptr %x, i32 8
%arrayidx1.8 = getelementptr inbounds i32, ptr %y, i32 8
%3 = load <16 x i32>, ptr %arrayidx.8, align 4
%4 = load <16 x i32>, ptr %arrayidx1.8, align 4
%5 = mul nsw <16 x i32> %4, %3
%6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
%7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
%op.rdx = add nsw i32 %6, %7
ret i32 %op.rdx
}
define i32 @mlav32i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav32i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: vmlav.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #16]
; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #32]
; CHECK-NEXT: vldrw.u32 q1, [r1, #32]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #48]
; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #64]
; CHECK-NEXT: vldrw.u32 q1, [r1, #64]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #80]
; CHECK-NEXT: vldrw.u32 q1, [r1, #80]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #96]
; CHECK-NEXT: vldrw.u32 q1, [r1, #96]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #112]
; CHECK-NEXT: vldrw.u32 q1, [r1, #112]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <32 x i32>, ptr %x, align 4
%1 = load <32 x i32>, ptr %y, align 4
%2 = mul nsw <32 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %2)
ret i32 %3
}
define i32 @mlav64i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav64i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: vmlav.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #16]
; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #32]
; CHECK-NEXT: vldrw.u32 q1, [r1, #32]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #48]
; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #64]
; CHECK-NEXT: vldrw.u32 q1, [r1, #64]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #80]
; CHECK-NEXT: vldrw.u32 q1, [r1, #80]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #96]
; CHECK-NEXT: vldrw.u32 q1, [r1, #96]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #112]
; CHECK-NEXT: vldrw.u32 q1, [r1, #112]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #128]
; CHECK-NEXT: vldrw.u32 q1, [r1, #128]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #144]
; CHECK-NEXT: vldrw.u32 q1, [r1, #144]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #160]
; CHECK-NEXT: vldrw.u32 q1, [r1, #160]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #176]
; CHECK-NEXT: vldrw.u32 q1, [r1, #176]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #192]
; CHECK-NEXT: vldrw.u32 q1, [r1, #192]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #208]
; CHECK-NEXT: vldrw.u32 q1, [r1, #208]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #224]
; CHECK-NEXT: vldrw.u32 q1, [r1, #224]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #240]
; CHECK-NEXT: vldrw.u32 q1, [r1, #240]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
%wide.load = load <4 x i32>, ptr %x, align 4
%wide.load10 = load <4 x i32>, ptr %y, align 4
%0 = mul nsw <4 x i32> %wide.load10, %wide.load
%1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %0)
%2 = getelementptr inbounds i32, ptr %x, i32 4
%wide.load.1 = load <4 x i32>, ptr %2, align 4
%3 = getelementptr inbounds i32, ptr %y, i32 4
%wide.load10.1 = load <4 x i32>, ptr %3, align 4
%4 = mul nsw <4 x i32> %wide.load10.1, %wide.load.1
%5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
%6 = add i32 %5, %1
%7 = getelementptr inbounds i32, ptr %x, i32 8
%wide.load.2 = load <4 x i32>, ptr %7, align 4
%8 = getelementptr inbounds i32, ptr %y, i32 8
%wide.load10.2 = load <4 x i32>, ptr %8, align 4
%9 = mul nsw <4 x i32> %wide.load10.2, %wide.load.2
%10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %9)
%11 = add i32 %10, %6
%12 = getelementptr inbounds i32, ptr %x, i32 12
%wide.load.3 = load <4 x i32>, ptr %12, align 4
%13 = getelementptr inbounds i32, ptr %y, i32 12
%wide.load10.3 = load <4 x i32>, ptr %13, align 4
%14 = mul nsw <4 x i32> %wide.load10.3, %wide.load.3
%15 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %14)
%16 = add i32 %15, %11
%17 = getelementptr inbounds i32, ptr %x, i32 16
%wide.load.4 = load <4 x i32>, ptr %17, align 4
%18 = getelementptr inbounds i32, ptr %y, i32 16
%wide.load10.4 = load <4 x i32>, ptr %18, align 4
%19 = mul nsw <4 x i32> %wide.load10.4, %wide.load.4
%20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %19)
%21 = add i32 %20, %16
%22 = getelementptr inbounds i32, ptr %x, i32 20
%wide.load.5 = load <4 x i32>, ptr %22, align 4
%23 = getelementptr inbounds i32, ptr %y, i32 20
%wide.load10.5 = load <4 x i32>, ptr %23, align 4
%24 = mul nsw <4 x i32> %wide.load10.5, %wide.load.5
%25 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %24)
%26 = add i32 %25, %21
%27 = getelementptr inbounds i32, ptr %x, i32 24
%wide.load.6 = load <4 x i32>, ptr %27, align 4
%28 = getelementptr inbounds i32, ptr %y, i32 24
%wide.load10.6 = load <4 x i32>, ptr %28, align 4
%29 = mul nsw <4 x i32> %wide.load10.6, %wide.load.6
%30 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %29)
%31 = add i32 %30, %26
%32 = getelementptr inbounds i32, ptr %x, i32 28
%wide.load.7 = load <4 x i32>, ptr %32, align 4
%33 = getelementptr inbounds i32, ptr %y, i32 28
%wide.load10.7 = load <4 x i32>, ptr %33, align 4
%34 = mul nsw <4 x i32> %wide.load10.7, %wide.load.7
%35 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %34)
%36 = add i32 %35, %31
%37 = getelementptr inbounds i32, ptr %x, i32 32
%wide.load.8 = load <4 x i32>, ptr %37, align 4
%38 = getelementptr inbounds i32, ptr %y, i32 32
%wide.load10.8 = load <4 x i32>, ptr %38, align 4
%39 = mul nsw <4 x i32> %wide.load10.8, %wide.load.8
%40 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %39)
%41 = add i32 %40, %36
%42 = getelementptr inbounds i32, ptr %x, i32 36
%wide.load.9 = load <4 x i32>, ptr %42, align 4
%43 = getelementptr inbounds i32, ptr %y, i32 36
%wide.load10.9 = load <4 x i32>, ptr %43, align 4
%44 = mul nsw <4 x i32> %wide.load10.9, %wide.load.9
%45 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %44)
%46 = add i32 %45, %41
%47 = getelementptr inbounds i32, ptr %x, i32 40
%wide.load.10 = load <4 x i32>, ptr %47, align 4
%48 = getelementptr inbounds i32, ptr %y, i32 40
%wide.load10.10 = load <4 x i32>, ptr %48, align 4
%49 = mul nsw <4 x i32> %wide.load10.10, %wide.load.10
%50 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %49)
%51 = add i32 %50, %46
%52 = getelementptr inbounds i32, ptr %x, i32 44
%wide.load.11 = load <4 x i32>, ptr %52, align 4
%53 = getelementptr inbounds i32, ptr %y, i32 44
%wide.load10.11 = load <4 x i32>, ptr %53, align 4
%54 = mul nsw <4 x i32> %wide.load10.11, %wide.load.11
%55 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %54)
%56 = add i32 %55, %51
%57 = getelementptr inbounds i32, ptr %x, i32 48
%wide.load.12 = load <4 x i32>, ptr %57, align 4
%58 = getelementptr inbounds i32, ptr %y, i32 48
%wide.load10.12 = load <4 x i32>, ptr %58, align 4
%59 = mul nsw <4 x i32> %wide.load10.12, %wide.load.12
%60 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %59)
%61 = add i32 %60, %56
%62 = getelementptr inbounds i32, ptr %x, i32 52
%wide.load.13 = load <4 x i32>, ptr %62, align 4
%63 = getelementptr inbounds i32, ptr %y, i32 52
%wide.load10.13 = load <4 x i32>, ptr %63, align 4
%64 = mul nsw <4 x i32> %wide.load10.13, %wide.load.13
%65 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %64)
%66 = add i32 %65, %61
%67 = getelementptr inbounds i32, ptr %x, i32 56
%wide.load.14 = load <4 x i32>, ptr %67, align 4
%68 = getelementptr inbounds i32, ptr %y, i32 56
%wide.load10.14 = load <4 x i32>, ptr %68, align 4
%69 = mul nsw <4 x i32> %wide.load10.14, %wide.load.14
%70 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %69)
%71 = add i32 %70, %66
%72 = getelementptr inbounds i32, ptr %x, i32 60
%wide.load.15 = load <4 x i32>, ptr %72, align 4
%73 = getelementptr inbounds i32, ptr %y, i32 60
%wide.load10.15 = load <4 x i32>, ptr %73, align 4
%74 = mul nsw <4 x i32> %wide.load10.15, %wide.load.15
%75 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %74)
%76 = add i32 %75, %71
ret i32 %76
}
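; 128 i32 pairs: the unrolled dot product should lower to one vmlav.u32
; followed by 31 vmlava.u32 accumulations, each fed by a pair of vldrw.u32
; loads stepping through both arrays in 16-byte chunks.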
define i32 @mlav128i32i32(ptr %x, ptr %y) {
; CHECK-LABEL: mlav128i32i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: vmlav.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #16]
; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #32]
; CHECK-NEXT: vldrw.u32 q1, [r1, #32]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #48]
; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #64]
; CHECK-NEXT: vldrw.u32 q1, [r1, #64]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #80]
; CHECK-NEXT: vldrw.u32 q1, [r1, #80]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #96]
; CHECK-NEXT: vldrw.u32 q1, [r1, #96]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #112]
; CHECK-NEXT: vldrw.u32 q1, [r1, #112]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #128]
; CHECK-NEXT: vldrw.u32 q1, [r1, #128]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #144]
; CHECK-NEXT: vldrw.u32 q1, [r1, #144]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #160]
; CHECK-NEXT: vldrw.u32 q1, [r1, #160]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #176]
; CHECK-NEXT: vldrw.u32 q1, [r1, #176]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #192]
; CHECK-NEXT: vldrw.u32 q1, [r1, #192]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #208]
; CHECK-NEXT: vldrw.u32 q1, [r1, #208]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #224]
; CHECK-NEXT: vldrw.u32 q1, [r1, #224]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #240]
; CHECK-NEXT: vldrw.u32 q1, [r1, #240]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #256]
; CHECK-NEXT: vldrw.u32 q1, [r1, #256]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #272]
; CHECK-NEXT: vldrw.u32 q1, [r1, #272]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #288]
; CHECK-NEXT: vldrw.u32 q1, [r1, #288]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #304]
; CHECK-NEXT: vldrw.u32 q1, [r1, #304]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #320]
; CHECK-NEXT: vldrw.u32 q1, [r1, #320]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #336]
; CHECK-NEXT: vldrw.u32 q1, [r1, #336]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #352]
; CHECK-NEXT: vldrw.u32 q1, [r1, #352]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #368]
; CHECK-NEXT: vldrw.u32 q1, [r1, #368]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #384]
; CHECK-NEXT: vldrw.u32 q1, [r1, #384]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #400]
; CHECK-NEXT: vldrw.u32 q1, [r1, #400]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #416]
; CHECK-NEXT: vldrw.u32 q1, [r1, #416]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #432]
; CHECK-NEXT: vldrw.u32 q1, [r1, #432]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #448]
; CHECK-NEXT: vldrw.u32 q1, [r1, #448]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #464]
; CHECK-NEXT: vldrw.u32 q1, [r1, #464]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #480]
; CHECK-NEXT: vldrw.u32 q1, [r1, #480]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrw.u32 q0, [r2, #496]
; CHECK-NEXT: vldrw.u32 q1, [r1, #496]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
%wide.load = load <4 x i32>, ptr %x, align 4
%wide.load10 = load <4 x i32>, ptr %y, align 4
%0 = mul nsw <4 x i32> %wide.load10, %wide.load
%1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %0)
%2 = getelementptr inbounds i32, ptr %x, i32 4
%wide.load.1 = load <4 x i32>, ptr %2, align 4
%3 = getelementptr inbounds i32, ptr %y, i32 4
%wide.load10.1 = load <4 x i32>, ptr %3, align 4
%4 = mul nsw <4 x i32> %wide.load10.1, %wide.load.1
%5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
%6 = add i32 %5, %1
%7 = getelementptr inbounds i32, ptr %x, i32 8
%wide.load.2 = load <4 x i32>, ptr %7, align 4
%8 = getelementptr inbounds i32, ptr %y, i32 8
%wide.load10.2 = load <4 x i32>, ptr %8, align 4
%9 = mul nsw <4 x i32> %wide.load10.2, %wide.load.2
%10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %9)
%11 = add i32 %10, %6
%12 = getelementptr inbounds i32, ptr %x, i32 12
%wide.load.3 = load <4 x i32>, ptr %12, align 4
%13 = getelementptr inbounds i32, ptr %y, i32 12
%wide.load10.3 = load <4 x i32>, ptr %13, align 4
%14 = mul nsw <4 x i32> %wide.load10.3, %wide.load.3
%15 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %14)
%16 = add i32 %15, %11
%17 = getelementptr inbounds i32, ptr %x, i32 16
%wide.load.4 = load <4 x i32>, ptr %17, align 4
%18 = getelementptr inbounds i32, ptr %y, i32 16
%wide.load10.4 = load <4 x i32>, ptr %18, align 4
%19 = mul nsw <4 x i32> %wide.load10.4, %wide.load.4
%20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %19)
%21 = add i32 %20, %16
%22 = getelementptr inbounds i32, ptr %x, i32 20
%wide.load.5 = load <4 x i32>, ptr %22, align 4
%23 = getelementptr inbounds i32, ptr %y, i32 20
%wide.load10.5 = load <4 x i32>, ptr %23, align 4
%24 = mul nsw <4 x i32> %wide.load10.5, %wide.load.5
%25 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %24)
%26 = add i32 %25, %21
%27 = getelementptr inbounds i32, ptr %x, i32 24
%wide.load.6 = load <4 x i32>, ptr %27, align 4
%28 = getelementptr inbounds i32, ptr %y, i32 24
%wide.load10.6 = load <4 x i32>, ptr %28, align 4
%29 = mul nsw <4 x i32> %wide.load10.6, %wide.load.6
%30 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %29)
%31 = add i32 %30, %26
%32 = getelementptr inbounds i32, ptr %x, i32 28
%wide.load.7 = load <4 x i32>, ptr %32, align 4
%33 = getelementptr inbounds i32, ptr %y, i32 28
%wide.load10.7 = load <4 x i32>, ptr %33, align 4
%34 = mul nsw <4 x i32> %wide.load10.7, %wide.load.7
%35 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %34)
%36 = add i32 %35, %31
%37 = getelementptr inbounds i32, ptr %x, i32 32
%wide.load.8 = load <4 x i32>, ptr %37, align 4
%38 = getelementptr inbounds i32, ptr %y, i32 32
%wide.load10.8 = load <4 x i32>, ptr %38, align 4
%39 = mul nsw <4 x i32> %wide.load10.8, %wide.load.8
%40 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %39)
%41 = add i32 %40, %36
%42 = getelementptr inbounds i32, ptr %x, i32 36
%wide.load.9 = load <4 x i32>, ptr %42, align 4
%43 = getelementptr inbounds i32, ptr %y, i32 36
%wide.load10.9 = load <4 x i32>, ptr %43, align 4
%44 = mul nsw <4 x i32> %wide.load10.9, %wide.load.9
%45 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %44)
%46 = add i32 %45, %41
%47 = getelementptr inbounds i32, ptr %x, i32 40
%wide.load.10 = load <4 x i32>, ptr %47, align 4
%48 = getelementptr inbounds i32, ptr %y, i32 40
%wide.load10.10 = load <4 x i32>, ptr %48, align 4
%49 = mul nsw <4 x i32> %wide.load10.10, %wide.load.10
%50 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %49)
%51 = add i32 %50, %46
%52 = getelementptr inbounds i32, ptr %x, i32 44
%wide.load.11 = load <4 x i32>, ptr %52, align 4
%53 = getelementptr inbounds i32, ptr %y, i32 44
%wide.load10.11 = load <4 x i32>, ptr %53, align 4
%54 = mul nsw <4 x i32> %wide.load10.11, %wide.load.11
%55 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %54)
%56 = add i32 %55, %51
%57 = getelementptr inbounds i32, ptr %x, i32 48
%wide.load.12 = load <4 x i32>, ptr %57, align 4
%58 = getelementptr inbounds i32, ptr %y, i32 48
%wide.load10.12 = load <4 x i32>, ptr %58, align 4
%59 = mul nsw <4 x i32> %wide.load10.12, %wide.load.12
%60 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %59)
%61 = add i32 %60, %56
%62 = getelementptr inbounds i32, ptr %x, i32 52
%wide.load.13 = load <4 x i32>, ptr %62, align 4
%63 = getelementptr inbounds i32, ptr %y, i32 52
%wide.load10.13 = load <4 x i32>, ptr %63, align 4
%64 = mul nsw <4 x i32> %wide.load10.13, %wide.load.13
%65 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %64)
%66 = add i32 %65, %61
%67 = getelementptr inbounds i32, ptr %x, i32 56
%wide.load.14 = load <4 x i32>, ptr %67, align 4
%68 = getelementptr inbounds i32, ptr %y, i32 56
%wide.load10.14 = load <4 x i32>, ptr %68, align 4
%69 = mul nsw <4 x i32> %wide.load10.14, %wide.load.14
%70 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %69)
%71 = add i32 %70, %66
%72 = getelementptr inbounds i32, ptr %x, i32 60
%wide.load.15 = load <4 x i32>, ptr %72, align 4
%73 = getelementptr inbounds i32, ptr %y, i32 60
%wide.load10.15 = load <4 x i32>, ptr %73, align 4
%74 = mul nsw <4 x i32> %wide.load10.15, %wide.load.15
%75 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %74)
%76 = add i32 %75, %71
%77 = getelementptr inbounds i32, ptr %x, i32 64
%wide.load.16 = load <4 x i32>, ptr %77, align 4
%78 = getelementptr inbounds i32, ptr %y, i32 64
%wide.load10.16 = load <4 x i32>, ptr %78, align 4
%79 = mul nsw <4 x i32> %wide.load10.16, %wide.load.16
%80 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %79)
%81 = add i32 %80, %76
%82 = getelementptr inbounds i32, ptr %x, i32 68
%wide.load.17 = load <4 x i32>, ptr %82, align 4
%83 = getelementptr inbounds i32, ptr %y, i32 68
%wide.load10.17 = load <4 x i32>, ptr %83, align 4
%84 = mul nsw <4 x i32> %wide.load10.17, %wide.load.17
%85 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %84)
%86 = add i32 %85, %81
%87 = getelementptr inbounds i32, ptr %x, i32 72
%wide.load.18 = load <4 x i32>, ptr %87, align 4
%88 = getelementptr inbounds i32, ptr %y, i32 72
%wide.load10.18 = load <4 x i32>, ptr %88, align 4
%89 = mul nsw <4 x i32> %wide.load10.18, %wide.load.18
%90 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %89)
%91 = add i32 %90, %86
%92 = getelementptr inbounds i32, ptr %x, i32 76
%wide.load.19 = load <4 x i32>, ptr %92, align 4
%93 = getelementptr inbounds i32, ptr %y, i32 76
%wide.load10.19 = load <4 x i32>, ptr %93, align 4
%94 = mul nsw <4 x i32> %wide.load10.19, %wide.load.19
%95 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %94)
%96 = add i32 %95, %91
%97 = getelementptr inbounds i32, ptr %x, i32 80
%wide.load.20 = load <4 x i32>, ptr %97, align 4
%98 = getelementptr inbounds i32, ptr %y, i32 80
%wide.load10.20 = load <4 x i32>, ptr %98, align 4
%99 = mul nsw <4 x i32> %wide.load10.20, %wide.load.20
%100 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %99)
%101 = add i32 %100, %96
%102 = getelementptr inbounds i32, ptr %x, i32 84
%wide.load.21 = load <4 x i32>, ptr %102, align 4
%103 = getelementptr inbounds i32, ptr %y, i32 84
%wide.load10.21 = load <4 x i32>, ptr %103, align 4
%104 = mul nsw <4 x i32> %wide.load10.21, %wide.load.21
%105 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %104)
%106 = add i32 %105, %101
%107 = getelementptr inbounds i32, ptr %x, i32 88
%wide.load.22 = load <4 x i32>, ptr %107, align 4
%108 = getelementptr inbounds i32, ptr %y, i32 88
%wide.load10.22 = load <4 x i32>, ptr %108, align 4
%109 = mul nsw <4 x i32> %wide.load10.22, %wide.load.22
%110 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %109)
%111 = add i32 %110, %106
%112 = getelementptr inbounds i32, ptr %x, i32 92
%wide.load.23 = load <4 x i32>, ptr %112, align 4
%113 = getelementptr inbounds i32, ptr %y, i32 92
%wide.load10.23 = load <4 x i32>, ptr %113, align 4
%114 = mul nsw <4 x i32> %wide.load10.23, %wide.load.23
%115 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %114)
%116 = add i32 %115, %111
%117 = getelementptr inbounds i32, ptr %x, i32 96
%wide.load.24 = load <4 x i32>, ptr %117, align 4
%118 = getelementptr inbounds i32, ptr %y, i32 96
%wide.load10.24 = load <4 x i32>, ptr %118, align 4
%119 = mul nsw <4 x i32> %wide.load10.24, %wide.load.24
%120 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %119)
%121 = add i32 %120, %116
%122 = getelementptr inbounds i32, ptr %x, i32 100
%wide.load.25 = load <4 x i32>, ptr %122, align 4
%123 = getelementptr inbounds i32, ptr %y, i32 100
%wide.load10.25 = load <4 x i32>, ptr %123, align 4
%124 = mul nsw <4 x i32> %wide.load10.25, %wide.load.25
%125 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %124)
%126 = add i32 %125, %121
%127 = getelementptr inbounds i32, ptr %x, i32 104
%wide.load.26 = load <4 x i32>, ptr %127, align 4
%128 = getelementptr inbounds i32, ptr %y, i32 104
%wide.load10.26 = load <4 x i32>, ptr %128, align 4
%129 = mul nsw <4 x i32> %wide.load10.26, %wide.load.26
%130 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %129)
%131 = add i32 %130, %126
%132 = getelementptr inbounds i32, ptr %x, i32 108
%wide.load.27 = load <4 x i32>, ptr %132, align 4
%133 = getelementptr inbounds i32, ptr %y, i32 108
%wide.load10.27 = load <4 x i32>, ptr %133, align 4
%134 = mul nsw <4 x i32> %wide.load10.27, %wide.load.27
%135 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %134)
%136 = add i32 %135, %131
%137 = getelementptr inbounds i32, ptr %x, i32 112
%wide.load.28 = load <4 x i32>, ptr %137, align 4
%138 = getelementptr inbounds i32, ptr %y, i32 112
%wide.load10.28 = load <4 x i32>, ptr %138, align 4
%139 = mul nsw <4 x i32> %wide.load10.28, %wide.load.28
%140 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %139)
%141 = add i32 %140, %136
%142 = getelementptr inbounds i32, ptr %x, i32 116
%wide.load.29 = load <4 x i32>, ptr %142, align 4
%143 = getelementptr inbounds i32, ptr %y, i32 116
%wide.load10.29 = load <4 x i32>, ptr %143, align 4
%144 = mul nsw <4 x i32> %wide.load10.29, %wide.load.29
%145 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %144)
%146 = add i32 %145, %141
%147 = getelementptr inbounds i32, ptr %x, i32 120
%wide.load.30 = load <4 x i32>, ptr %147, align 4
%148 = getelementptr inbounds i32, ptr %y, i32 120
%wide.load10.30 = load <4 x i32>, ptr %148, align 4
%149 = mul nsw <4 x i32> %wide.load10.30, %wide.load.30
%150 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %149)
%151 = add i32 %150, %146
%152 = getelementptr inbounds i32, ptr %x, i32 124
%wide.load.31 = load <4 x i32>, ptr %152, align 4
%153 = getelementptr inbounds i32, ptr %y, i32 124
%wide.load10.31 = load <4 x i32>, ptr %153, align 4
%154 = mul nsw <4 x i32> %wide.load10.31, %wide.load.31
%155 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %154)
%156 = add i32 %155, %151
ret i32 %156
}
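; Two i16 products are too narrow for a vector reduction, so this is expected
; to stay scalar: a pair of ldrsh loads per pointer, a mul, and a final
; smlabb multiply-accumulate.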
define i32 @mlav2i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav2i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrsh.w r2, [r0]
; CHECK-NEXT: ldrsh.w r3, [r1]
; CHECK-NEXT: ldrsh.w r0, [r0, #2]
; CHECK-NEXT: ldrsh.w r1, [r1, #2]
; CHECK-NEXT: muls r0, r1, r0
; CHECK-NEXT: smlabb r0, r3, r2, r0
; CHECK-NEXT: bx lr
entry:
%0 = load i16, ptr %x, align 2
%conv = sext i16 %0 to i32
%1 = load i16, ptr %y, align 2
%conv2 = sext i16 %1 to i32
%mul = mul nsw i32 %conv2, %conv
%arrayidx.1 = getelementptr inbounds i16, ptr %x, i32 1
%2 = load i16, ptr %arrayidx.1, align 2
%conv.1 = sext i16 %2 to i32
%arrayidx1.1 = getelementptr inbounds i16, ptr %y, i32 1
%3 = load i16, ptr %arrayidx1.1, align 2
%conv2.1 = sext i16 %3 to i32
%mul.1 = mul nsw i32 %conv2.1, %conv.1
%add.1 = add nsw i32 %mul.1, %mul
ret i32 %add.1
}
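; Four i16 elements fit a single widening vldrh.s32 load per operand,
; reducing with one vmlav.u32.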
define i32 @mlav4i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav4i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q0, [r0]
; CHECK-NEXT: vldrh.s32 q1, [r1]
; CHECK-NEXT: vmlav.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <4 x i16>, ptr %x, align 2
%1 = sext <4 x i16> %0 to <4 x i32>
%2 = load <4 x i16>, ptr %y, align 2
%3 = sext <4 x i16> %2 to <4 x i32>
%4 = mul nsw <4 x i32> %3, %1
%5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
ret i32 %5
}
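; Eight i16 elements fill a full 128-bit vector, so the sext+mul+reduce chain
; should fold into a single vmlav.s16 that widens internally.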
define i32 @mlav8i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav8i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vldrh.u16 q1, [r1]
; CHECK-NEXT: vmlav.s16 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i16>, ptr %x, align 2
%1 = sext <8 x i16> %0 to <8 x i32>
%2 = load <8 x i16>, ptr %y, align 2
%3 = sext <8 x i16> %2 to <8 x i32>
%4 = mul nsw <8 x i32> %3, %1
%5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
ret i32 %5
}
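; 16 elements split into four widening vldrh.s32 chunks, accumulated with
; vmlava.u32.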
define i32 @mlav16i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav16i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q0, [r0]
; CHECK-NEXT: vldrh.s32 q1, [r1]
; CHECK-NEXT: vmlav.u32 r2, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #8]
; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
; CHECK-NEXT: vmlava.u32 r2, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #16]
; CHECK-NEXT: vldrh.s32 q1, [r1, #16]
; CHECK-NEXT: vmlava.u32 r2, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r0, #24]
; CHECK-NEXT: vldrh.s32 q1, [r1, #24]
; CHECK-NEXT: vmlava.u32 r2, q1, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bx lr
entry:
%0 = load <16 x i16>, ptr %x, align 2
%1 = sext <16 x i16> %0 to <16 x i32>
%2 = load <16 x i16>, ptr %y, align 2
%3 = sext <16 x i16> %2 to <16 x i32>
%4 = mul nsw <16 x i32> %3, %1
%5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
ret i32 %5
}
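; 24 elements arrive as separate 8-wide and 16-wide reductions; the 8-wide
; half should use vmlav.s16 and the remainder vldrh.s32 + vmlava.u32 chunks.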
define i32 @mlav24i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav24i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vldrh.u16 q1, [r1]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: vmlav.s16 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #16]
; CHECK-NEXT: vldrh.s32 q1, [r1, #16]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #24]
; CHECK-NEXT: vldrh.s32 q1, [r1, #24]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #32]
; CHECK-NEXT: vldrh.s32 q1, [r1, #32]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #40]
; CHECK-NEXT: vldrh.s32 q1, [r1, #40]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <8 x i16>, ptr %x, align 2
%1 = sext <8 x i16> %0 to <8 x i32>
%2 = load <8 x i16>, ptr %y, align 2
%3 = sext <8 x i16> %2 to <8 x i32>
%4 = mul nsw <8 x i32> %3, %1
%arrayidx.8 = getelementptr inbounds i16, ptr %x, i32 8
%arrayidx1.8 = getelementptr inbounds i16, ptr %y, i32 8
%5 = load <16 x i16>, ptr %arrayidx.8, align 2
%6 = sext <16 x i16> %5 to <16 x i32>
%7 = load <16 x i16>, ptr %arrayidx1.8, align 2
%8 = sext <16 x i16> %7 to <16 x i32>
%9 = mul nsw <16 x i32> %8, %6
%10 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %9)
%11 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
%op.rdx = add nsw i32 %10, %11
ret i32 %op.rdx
}
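; 32 elements, reduced as eight 4-element vldrh.s32 chunks with vmlava.u32.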
define i32 @mlav32i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav32i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.s32 q0, [r0]
; CHECK-NEXT: vldrh.s32 q1, [r1]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: vmlav.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #8]
; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #16]
; CHECK-NEXT: vldrh.s32 q1, [r1, #16]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #24]
; CHECK-NEXT: vldrh.s32 q1, [r1, #24]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #32]
; CHECK-NEXT: vldrh.s32 q1, [r1, #32]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #40]
; CHECK-NEXT: vldrh.s32 q1, [r1, #40]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #48]
; CHECK-NEXT: vldrh.s32 q1, [r1, #48]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: vldrh.s32 q0, [r2, #56]
; CHECK-NEXT: vldrh.s32 q1, [r1, #56]
; CHECK-NEXT: vmlava.u32 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
%0 = load <32 x i16>, ptr %x, align 2
%1 = sext <32 x i16> %0 to <32 x i32>
%2 = load <32 x i16>, ptr %y, align 2
%3 = sext <32 x i16> %2 to <32 x i32>
%4 = mul nsw <32 x i32> %3, %1
%5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
ret i32 %5
}
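; 64 elements as eight 8-wide chunks; each pair of vldrh.u16 loads should feed
; a vmlav.s16/vmlava.s16.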
define i32 @mlav64i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav64i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vldrh.u16 q1, [r1]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: vmlav.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #16]
; CHECK-NEXT: vldrh.u16 q1, [r1, #16]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #32]
; CHECK-NEXT: vldrh.u16 q1, [r1, #32]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #48]
; CHECK-NEXT: vldrh.u16 q1, [r1, #48]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #64]
; CHECK-NEXT: vldrh.u16 q1, [r1, #64]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #80]
; CHECK-NEXT: vldrh.u16 q1, [r1, #80]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #96]
; CHECK-NEXT: vldrh.u16 q1, [r1, #96]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #112]
; CHECK-NEXT: vldrh.u16 q1, [r1, #112]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
%wide.load = load <8 x i16>, ptr %x, align 2
%0 = sext <8 x i16> %wide.load to <8 x i32>
%wide.load11 = load <8 x i16>, ptr %y, align 2
%1 = sext <8 x i16> %wide.load11 to <8 x i32>
%2 = mul nsw <8 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
%4 = getelementptr inbounds i16, ptr %x, i32 8
%wide.load.1 = load <8 x i16>, ptr %4, align 2
%5 = sext <8 x i16> %wide.load.1 to <8 x i32>
%6 = getelementptr inbounds i16, ptr %y, i32 8
%wide.load11.1 = load <8 x i16>, ptr %6, align 2
%7 = sext <8 x i16> %wide.load11.1 to <8 x i32>
%8 = mul nsw <8 x i32> %7, %5
%9 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %8)
%10 = add i32 %9, %3
%11 = getelementptr inbounds i16, ptr %x, i32 16
%wide.load.2 = load <8 x i16>, ptr %11, align 2
%12 = sext <8 x i16> %wide.load.2 to <8 x i32>
%13 = getelementptr inbounds i16, ptr %y, i32 16
%wide.load11.2 = load <8 x i16>, ptr %13, align 2
%14 = sext <8 x i16> %wide.load11.2 to <8 x i32>
%15 = mul nsw <8 x i32> %14, %12
%16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %15)
%17 = add i32 %16, %10
%18 = getelementptr inbounds i16, ptr %x, i32 24
%wide.load.3 = load <8 x i16>, ptr %18, align 2
%19 = sext <8 x i16> %wide.load.3 to <8 x i32>
%20 = getelementptr inbounds i16, ptr %y, i32 24
%wide.load11.3 = load <8 x i16>, ptr %20, align 2
%21 = sext <8 x i16> %wide.load11.3 to <8 x i32>
%22 = mul nsw <8 x i32> %21, %19
%23 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %22)
%24 = add i32 %23, %17
%25 = getelementptr inbounds i16, ptr %x, i32 32
%wide.load.4 = load <8 x i16>, ptr %25, align 2
%26 = sext <8 x i16> %wide.load.4 to <8 x i32>
%27 = getelementptr inbounds i16, ptr %y, i32 32
%wide.load11.4 = load <8 x i16>, ptr %27, align 2
%28 = sext <8 x i16> %wide.load11.4 to <8 x i32>
%29 = mul nsw <8 x i32> %28, %26
%30 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %29)
%31 = add i32 %30, %24
%32 = getelementptr inbounds i16, ptr %x, i32 40
%wide.load.5 = load <8 x i16>, ptr %32, align 2
%33 = sext <8 x i16> %wide.load.5 to <8 x i32>
%34 = getelementptr inbounds i16, ptr %y, i32 40
%wide.load11.5 = load <8 x i16>, ptr %34, align 2
%35 = sext <8 x i16> %wide.load11.5 to <8 x i32>
%36 = mul nsw <8 x i32> %35, %33
%37 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %36)
%38 = add i32 %37, %31
%39 = getelementptr inbounds i16, ptr %x, i32 48
%wide.load.6 = load <8 x i16>, ptr %39, align 2
%40 = sext <8 x i16> %wide.load.6 to <8 x i32>
%41 = getelementptr inbounds i16, ptr %y, i32 48
%wide.load11.6 = load <8 x i16>, ptr %41, align 2
%42 = sext <8 x i16> %wide.load11.6 to <8 x i32>
%43 = mul nsw <8 x i32> %42, %40
%44 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %43)
%45 = add i32 %44, %38
%46 = getelementptr inbounds i16, ptr %x, i32 56
%wide.load.7 = load <8 x i16>, ptr %46, align 2
%47 = sext <8 x i16> %wide.load.7 to <8 x i32>
%48 = getelementptr inbounds i16, ptr %y, i32 56
%wide.load11.7 = load <8 x i16>, ptr %48, align 2
%49 = sext <8 x i16> %wide.load11.7 to <8 x i32>
%50 = mul nsw <8 x i32> %49, %47
%51 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %50)
%52 = add i32 %51, %45
ret i32 %52
}
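; 128 elements, same pattern as above with sixteen 8-wide vmlava.s16 steps.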
define i32 @mlav128i32i16(ptr %x, ptr %y) {
; CHECK-LABEL: mlav128i32i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r0]
; CHECK-NEXT: vldrh.u16 q1, [r1]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: vmlav.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #16]
; CHECK-NEXT: vldrh.u16 q1, [r1, #16]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #32]
; CHECK-NEXT: vldrh.u16 q1, [r1, #32]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #48]
; CHECK-NEXT: vldrh.u16 q1, [r1, #48]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #64]
; CHECK-NEXT: vldrh.u16 q1, [r1, #64]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #80]
; CHECK-NEXT: vldrh.u16 q1, [r1, #80]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #96]
; CHECK-NEXT: vldrh.u16 q1, [r1, #96]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #112]
; CHECK-NEXT: vldrh.u16 q1, [r1, #112]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #128]
; CHECK-NEXT: vldrh.u16 q1, [r1, #128]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #144]
; CHECK-NEXT: vldrh.u16 q1, [r1, #144]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #160]
; CHECK-NEXT: vldrh.u16 q1, [r1, #160]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #176]
; CHECK-NEXT: vldrh.u16 q1, [r1, #176]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #192]
; CHECK-NEXT: vldrh.u16 q1, [r1, #192]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #208]
; CHECK-NEXT: vldrh.u16 q1, [r1, #208]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #224]
; CHECK-NEXT: vldrh.u16 q1, [r1, #224]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r2, #240]
; CHECK-NEXT: vldrh.u16 q1, [r1, #240]
; CHECK-NEXT: vmlava.s16 r0, q1, q0
; CHECK-NEXT: bx lr
entry:
%wide.load = load <8 x i16>, ptr %x, align 2
%0 = sext <8 x i16> %wide.load to <8 x i32>
%wide.load11 = load <8 x i16>, ptr %y, align 2
%1 = sext <8 x i16> %wide.load11 to <8 x i32>
%2 = mul nsw <8 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
%4 = getelementptr inbounds i16, ptr %x, i32 8
%wide.load.1 = load <8 x i16>, ptr %4, align 2
%5 = sext <8 x i16> %wide.load.1 to <8 x i32>
%6 = getelementptr inbounds i16, ptr %y, i32 8
%wide.load11.1 = load <8 x i16>, ptr %6, align 2
%7 = sext <8 x i16> %wide.load11.1 to <8 x i32>
%8 = mul nsw <8 x i32> %7, %5
%9 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %8)
%10 = add i32 %9, %3
%11 = getelementptr inbounds i16, ptr %x, i32 16
%wide.load.2 = load <8 x i16>, ptr %11, align 2
%12 = sext <8 x i16> %wide.load.2 to <8 x i32>
%13 = getelementptr inbounds i16, ptr %y, i32 16
%wide.load11.2 = load <8 x i16>, ptr %13, align 2
%14 = sext <8 x i16> %wide.load11.2 to <8 x i32>
%15 = mul nsw <8 x i32> %14, %12
%16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %15)
%17 = add i32 %16, %10
%18 = getelementptr inbounds i16, ptr %x, i32 24
%wide.load.3 = load <8 x i16>, ptr %18, align 2
%19 = sext <8 x i16> %wide.load.3 to <8 x i32>
%20 = getelementptr inbounds i16, ptr %y, i32 24
%wide.load11.3 = load <8 x i16>, ptr %20, align 2
%21 = sext <8 x i16> %wide.load11.3 to <8 x i32>
%22 = mul nsw <8 x i32> %21, %19
%23 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %22)
%24 = add i32 %23, %17
%25 = getelementptr inbounds i16, ptr %x, i32 32
%wide.load.4 = load <8 x i16>, ptr %25, align 2
%26 = sext <8 x i16> %wide.load.4 to <8 x i32>
%27 = getelementptr inbounds i16, ptr %y, i32 32
%wide.load11.4 = load <8 x i16>, ptr %27, align 2
%28 = sext <8 x i16> %wide.load11.4 to <8 x i32>
%29 = mul nsw <8 x i32> %28, %26
%30 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %29)
%31 = add i32 %30, %24
%32 = getelementptr inbounds i16, ptr %x, i32 40
%wide.load.5 = load <8 x i16>, ptr %32, align 2
%33 = sext <8 x i16> %wide.load.5 to <8 x i32>
%34 = getelementptr inbounds i16, ptr %y, i32 40
%wide.load11.5 = load <8 x i16>, ptr %34, align 2
%35 = sext <8 x i16> %wide.load11.5 to <8 x i32>
%36 = mul nsw <8 x i32> %35, %33
%37 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %36)
%38 = add i32 %37, %31
%39 = getelementptr inbounds i16, ptr %x, i32 48
%wide.load.6 = load <8 x i16>, ptr %39, align 2
%40 = sext <8 x i16> %wide.load.6 to <8 x i32>
%41 = getelementptr inbounds i16, ptr %y, i32 48
%wide.load11.6 = load <8 x i16>, ptr %41, align 2
%42 = sext <8 x i16> %wide.load11.6 to <8 x i32>
%43 = mul nsw <8 x i32> %42, %40
%44 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %43)
%45 = add i32 %44, %38
%46 = getelementptr inbounds i16, ptr %x, i32 56
%wide.load.7 = load <8 x i16>, ptr %46, align 2
%47 = sext <8 x i16> %wide.load.7 to <8 x i32>
%48 = getelementptr inbounds i16, ptr %y, i32 56
%wide.load11.7 = load <8 x i16>, ptr %48, align 2
%49 = sext <8 x i16> %wide.load11.7 to <8 x i32>
%50 = mul nsw <8 x i32> %49, %47
%51 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %50)
%52 = add i32 %51, %45
%53 = getelementptr inbounds i16, ptr %x, i32 64
%wide.load.8 = load <8 x i16>, ptr %53, align 2
%54 = sext <8 x i16> %wide.load.8 to <8 x i32>
%55 = getelementptr inbounds i16, ptr %y, i32 64
%wide.load11.8 = load <8 x i16>, ptr %55, align 2
%56 = sext <8 x i16> %wide.load11.8 to <8 x i32>
%57 = mul nsw <8 x i32> %56, %54
%58 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %57)
%59 = add i32 %58, %52
%60 = getelementptr inbounds i16, ptr %x, i32 72
%wide.load.9 = load <8 x i16>, ptr %60, align 2
%61 = sext <8 x i16> %wide.load.9 to <8 x i32>
%62 = getelementptr inbounds i16, ptr %y, i32 72
%wide.load11.9 = load <8 x i16>, ptr %62, align 2
%63 = sext <8 x i16> %wide.load11.9 to <8 x i32>
%64 = mul nsw <8 x i32> %63, %61
%65 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %64)
%66 = add i32 %65, %59
%67 = getelementptr inbounds i16, ptr %x, i32 80
%wide.load.10 = load <8 x i16>, ptr %67, align 2
%68 = sext