|  | ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | 
|  | ; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-BASE | 
|  | ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-DOT | 
|  | ; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-BASE | 
|  | ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-DOT | 
|  |  | 
|  | ; Plain add reduction of a <2 x i32> vector to a scalar i32 (no extension). | 
|  | define i32 @addv_v2i32(<2 x i32> %a) { | 
|  | ; CHECK-LABEL: addv_v2i32: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s | 
|  | ; CHECK-NEXT:    fmov w0, s0 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %arg1 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a) | 
|  | ret i32 %arg1 | 
|  | } | 
|  |  | 
|  | ; Plain add reduction of a <4 x i16> vector to a scalar i16 (no extension). | 
|  | define i16 @addv_v4i16(<4 x i16> %a) { | 
|  | ; CHECK-LABEL: addv_v4i16: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    addv h0, v0.4h | 
|  | ; CHECK-NEXT:    fmov w0, s0 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %arg1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a) | 
|  | ret i16 %arg1 | 
|  | } | 
|  |  | 
|  | ; Plain add reduction of a <4 x i32> vector to a scalar i32 (no extension). | 
|  | define i32 @add_v4i32_v4i32(<4 x i32> %x) { | 
|  | ; CHECK-LABEL: add_v4i32_v4i32: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-NEXT:    fmov w0, s0 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x) | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Plain add reduction of an <8 x i8> vector to a scalar i8 (no extension). | 
|  | define i8 @addv_v8i8(<8 x i8> %a) { | 
|  | ; CHECK-LABEL: addv_v8i8: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    addv b0, v0.8b | 
|  | ; CHECK-NEXT:    fmov w0, s0 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %arg1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a) | 
|  | ret i8 %arg1 | 
|  | } | 
|  |  | 
|  | ; Zero-extends <4 x i32> to <4 x i64>, then add-reduces the result to a scalar i64. | 
|  | define i64 @add_v4i32_v4i64_zext(<4 x i32> %x) { | 
|  | ; CHECK-LABEL: add_v4i32_v4i64_zext: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    uaddlv d0, v0.4s | 
|  | ; CHECK-NEXT:    fmov x0, d0 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <4 x i32> %x to <4 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends <4 x i32> to <4 x i64>, then add-reduces the result to a scalar i64. | 
|  | define i64 @add_v4i32_v4i64_sext(<4 x i32> %x) { | 
|  | ; CHECK-LABEL: add_v4i32_v4i64_sext: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    saddlv d0, v0.4s | 
|  | ; CHECK-NEXT:    fmov x0, d0 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <4 x i32> %x to <4 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Mixed-extension case: lanes 0-1 of %xi are zero-extended and lanes 2-3 are | 
|  | ; sign-extended to <2 x i64>; the two halves are added lane-wise and the | 
|  | ; <2 x i64> sum is add-reduced to a scalar i64. | 
|  | define i64 @add_v4i32_v4i64_zsext(<4 x i32> %xi) { | 
|  | ; CHECK-LABEL: add_v4i32_v4i64_zsext: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    ushll v1.2d, v0.2s, #0 | 
|  | ; CHECK-NEXT:    saddw2 v0.2d, v1.2d, v0.4s | 
|  | ; CHECK-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-NEXT:    fmov x0, d0 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %x = shufflevector <4 x i32> %xi, <4 x i32> %xi, <2 x i32> <i32 0, i32 1> | 
|  | %y = shufflevector <4 x i32> %xi, <4 x i32> %xi, <2 x i32> <i32 2, i32 3> | 
|  | %xx = zext <2 x i32> %x to <2 x i64> | 
|  | %yy = sext <2 x i32> %y to <2 x i64> | 
|  | %zz = add <2 x i64> %xx, %yy | 
|  | %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %zz) | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Zero-extends <2 x i32> to <2 x i64>, then add-reduces the result to a scalar i64. | 
|  | define i64 @add_v2i32_v2i64_zext(<2 x i32> %x) { | 
|  | ; CHECK-LABEL: add_v2i32_v2i64_zext: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    ushll v0.2d, v0.2s, #0 | 
|  | ; CHECK-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-NEXT:    fmov x0, d0 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <2 x i32> %x to <2 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends <2 x i32> to <2 x i64>, then add-reduces the result to a scalar i64. | 
|  | define i64 @add_v2i32_v2i64_sext(<2 x i32> %x) { | 
|  | ; CHECK-LABEL: add_v2i32_v2i64_sext: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    sshll v0.2d, v0.2s, #0 | 
|  | ; CHECK-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-NEXT:    fmov x0, d0 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <2 x i32> %x to <2 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Zero-extends <8 x i16> to <8 x i32>, then add-reduces the result to a scalar i32. | 
|  | define i32 @add_v8i16_v8i32_zext(<8 x i16> %x) { | 
|  | ; CHECK-LABEL: add_v8i16_v8i32_zext: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    uaddlv s0, v0.8h | 
|  | ; CHECK-NEXT:    fmov w0, s0 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <8 x i16> %x to <8 x i32> | 
|  | %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends <8 x i16> to <8 x i32>, then add-reduces the result to a scalar i32. | 
|  | define i32 @add_v8i16_v8i32_sext(<8 x i16> %x) { | 
|  | ; CHECK-LABEL: add_v8i16_v8i32_sext: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    saddlv s0, v0.8h | 
|  | ; CHECK-NEXT:    fmov w0, s0 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <8 x i16> %x to <8 x i32> | 
|  | %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Zero-extends <4 x i16> to <4 x i32>, then add-reduces the result to a scalar i32. | 
|  | ; The SelectionDAG and GlobalISel runs have separate expectations for this case. | 
|  | define i32 @add_v4i16_v4i32_zext(<4 x i16> %x) { | 
|  | ; CHECK-SD-LABEL: add_v4i16_v4i32_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v4i16_v4i32_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddlv s0, v0.4h | 
|  | ; CHECK-GI-NEXT:    fmov w0, s0 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <4 x i16> %x to <4 x i32> | 
|  | %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends <4 x i16> to <4 x i32>, then add-reduces the result to a scalar i32. | 
|  | ; The SelectionDAG and GlobalISel runs have separate expectations for this case. | 
|  | define i32 @add_v4i16_v4i32_sext(<4 x i16> %x) { | 
|  | ; CHECK-SD-LABEL: add_v4i16_v4i32_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v4i16_v4i32_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    saddlv s0, v0.4h | 
|  | ; CHECK-GI-NEXT:    fmov w0, s0 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <4 x i16> %x to <4 x i32> | 
|  | %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Plain add reduction of <8 x i16>, returned as a zeroext i16. | 
|  | ; The GlobalISel expectations include an explicit uxth of the result; the | 
|  | ; SelectionDAG expectations do not. | 
|  | define zeroext i16 @add_v8i16_v8i16(<8 x i16> %x) { | 
|  | ; CHECK-SD-LABEL: add_v8i16_v8i16: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    addv h0, v0.8h | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v8i16_v8i16: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    addv h0, v0.8h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    uxth w0, w8 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x) | 
|  | ret i16 %z | 
|  | } | 
|  |  | 
|  | ; Zero-extends <8 x i16> to <8 x i64> (a 4x widening), then add-reduces to a scalar i64. | 
|  | ; The SelectionDAG and GlobalISel runs have separate expectations for this case. | 
|  | define i64 @add_v8i16_v8i64_zext(<8 x i16> %x) { | 
|  | ; CHECK-SD-LABEL: add_v8i16_v8i64_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    ushll2 v1.4s, v0.8h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s | 
|  | ; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v8i16_v8i64_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddlv s0, v0.8h | 
|  | ; CHECK-GI-NEXT:    mov w0, v0.s[0] | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <8 x i16> %x to <8 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends <8 x i16> to <8 x i64> (a 4x widening), then add-reduces to a scalar i64. | 
|  | ; The SelectionDAG and GlobalISel runs have separate expectations for this case. | 
|  | define i64 @add_v8i16_v8i64_sext(<8 x i16> %x) { | 
|  | ; CHECK-SD-LABEL: add_v8i16_v8i64_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    sshll2 v1.4s, v0.8h, #0 | 
|  | ; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    saddl2 v2.2d, v0.4s, v1.4s | 
|  | ; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v8i16_v8i64_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    saddlv s0, v0.8h | 
|  | ; CHECK-GI-NEXT:    smov x0, v0.s[0] | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <8 x i16> %x to <8 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Zero-extends <4 x i16> to <4 x i64> (a 4x widening), then add-reduces to a scalar i64. | 
|  | ; The SelectionDAG and GlobalISel runs have separate expectations for this case. | 
|  | define i64 @add_v4i16_v4i64_zext(<4 x i16> %x) { | 
|  | ; CHECK-SD-LABEL: add_v4i16_v4i64_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    uaddlv d0, v0.4s | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v4i16_v4i64_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddlv s0, v0.4h | 
|  | ; CHECK-GI-NEXT:    mov w0, v0.s[0] | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <4 x i16> %x to <4 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends <4 x i16> to <4 x i64> (a 4x widening), then add-reduces to a scalar i64. | 
|  | ; The SelectionDAG and GlobalISel runs have separate expectations for this case. | 
|  | define i64 @add_v4i16_v4i64_sext(<4 x i16> %x) { | 
|  | ; CHECK-SD-LABEL: add_v4i16_v4i64_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    saddlv d0, v0.4s | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v4i16_v4i64_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    saddlv s0, v0.4h | 
|  | ; CHECK-GI-NEXT:    smov x0, v0.s[0] | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <4 x i16> %x to <4 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Zero-extends <2 x i16> to <2 x i64>, then add-reduces the result to a scalar i64. | 
|  | ; Both selectors are expected to mask the lanes to 16 bits, but in a different | 
|  | ; order relative to the widening shift, so they are checked separately. | 
|  | define i64 @add_v2i16_v2i64_zext(<2 x i16> %x) { | 
|  | ; CHECK-SD-LABEL: add_v2i16_v2i64_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    movi d1, #0x00ffff0000ffff | 
|  | ; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b | 
|  | ; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0 | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v2i16_v2i64_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    movi v1.2d, #0x0000000000ffff | 
|  | ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0 | 
|  | ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b | 
|  | ; CHECK-GI-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-GI-NEXT:    fmov x0, d0 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <2 x i16> %x to <2 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends <2 x i16> to <2 x i64>, then add-reduces the result to a scalar i64. | 
|  | define i64 @add_v2i16_v2i64_sext(<2 x i16> %x) { | 
|  | ; CHECK-LABEL: add_v2i16_v2i64_sext: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    ushll v0.2d, v0.2s, #0 | 
|  | ; CHECK-NEXT:    shl v0.2d, v0.2d, #48 | 
|  | ; CHECK-NEXT:    sshr v0.2d, v0.2d, #48 | 
|  | ; CHECK-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-NEXT:    fmov x0, d0 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <2 x i16> %x to <2 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Zero-extends <16 x i8> to <16 x i32> (a 4x widening), then add-reduces to a scalar i32. | 
|  | ; Each of the four RUN configurations has its own expectations: the +dotprod | 
|  | ; runs expect a udot against a splat of 1, the base runs do not. | 
|  | define i32 @add_v16i8_v16i32_zext(<16 x i8> %x) { | 
|  | ; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_zext: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    ushll2 v1.8h, v0.16b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    uaddl2 v2.4s, v0.8h, v1.8h | 
|  | ; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h | 
|  | ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s | 
|  | ; CHECK-SD-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_zext: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    movi v1.16b, #1 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    udot v2.4s, v0.16b, v1.16b | 
|  | ; CHECK-SD-DOT-NEXT:    addv s0, v2.4s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_zext: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.16b | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_zext: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    movi v1.16b, #1 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    udot v2.4s, v0.16b, v1.16b | 
|  | ; CHECK-GI-DOT-NEXT:    addv s0, v2.4s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <16 x i8> %x to <16 x i32> | 
|  | %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx) | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends <16 x i8> to <16 x i32> (a 4x widening), then add-reduces to a scalar i32. | 
|  | ; Each of the four RUN configurations has its own expectations: the +dotprod | 
|  | ; runs expect an sdot against a splat of 1, the base runs do not. | 
|  | define i32 @add_v16i8_v16i32_sext(<16 x i8> %x) { | 
|  | ; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_sext: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    sshll2 v1.8h, v0.16b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    saddl2 v2.4s, v0.8h, v1.8h | 
|  | ; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v1.4h | 
|  | ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s | 
|  | ; CHECK-SD-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_sext: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    movi v1.16b, #1 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    sdot v2.4s, v0.16b, v1.16b | 
|  | ; CHECK-SD-DOT-NEXT:    addv s0, v2.4s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_sext: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    saddlv h0, v0.16b | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    sxth w0, w8 | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_sext: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    movi v1.16b, #1 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    sdot v2.4s, v0.16b, v1.16b | 
|  | ; CHECK-GI-DOT-NEXT:    addv s0, v2.4s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <16 x i8> %x to <16 x i32> | 
|  | %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx) | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Zero-extends <8 x i8> to <8 x i32> (a 4x widening), then add-reduces to a scalar i32. | 
|  | ; Each of the four RUN configurations has its own expectations: the +dotprod | 
|  | ; runs expect a 64-bit udot against a splat of 1, the base runs do not. | 
|  | define i32 @add_v8i8_v8i32_zext(<8 x i8> %x) { | 
|  | ; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_zext: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    uaddlv s0, v0.8h | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_zext: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    movi v1.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v2.8b, #1 | 
|  | ; CHECK-SD-DOT-NEXT:    udot v1.2s, v0.8b, v2.8b | 
|  | ; CHECK-SD-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_zext: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.8b | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_zext: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    movi v1.8b, #1 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    udot v2.2s, v0.8b, v1.8b | 
|  | ; CHECK-GI-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <8 x i8> %x to <8 x i32> | 
|  | %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends <8 x i8> to <8 x i32> (a 4x widening), then add-reduces to a scalar i32. | 
|  | ; Each of the four RUN configurations has its own expectations: the +dotprod | 
|  | ; runs expect a 64-bit sdot against a splat of 1, the base runs do not. | 
|  | define i32 @add_v8i8_v8i32_sext(<8 x i8> %x) { | 
|  | ; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_sext: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    saddlv s0, v0.8h | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_sext: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    movi v1.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v2.8b, #1 | 
|  | ; CHECK-SD-DOT-NEXT:    sdot v1.2s, v0.8b, v2.8b | 
|  | ; CHECK-SD-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_sext: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    saddlv h0, v0.8b | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    sxth w0, w8 | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_sext: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    movi v1.8b, #1 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    sdot v2.2s, v0.8b, v1.8b | 
|  | ; CHECK-GI-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <8 x i8> %x to <8 x i32> | 
|  | %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Zero-extends <4 x i8> to <4 x i32>, then add-reduces the result to a scalar i32. | 
|  | ; The SelectionDAG and GlobalISel runs have separate expectations for this case. | 
|  | define i32 @add_v4i8_v4i32_zext(<4 x i8> %x) { | 
|  | ; CHECK-SD-LABEL: add_v4i8_v4i32_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8 | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v4i8_v4i32_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    movi d1, #0xff00ff00ff00ff | 
|  | ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b | 
|  | ; CHECK-GI-NEXT:    uaddlv s0, v0.4h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    and w0, w8, #0xffff | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <4 x i8> %x to <4 x i32> | 
|  | %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends <4 x i8> to <4 x i32>, then add-reduces the result to a scalar i32. | 
|  | ; The SelectionDAG and GlobalISel runs have separate expectations for this case. | 
|  | define i32 @add_v4i8_v4i32_sext(<4 x i8> %x) { | 
|  | ; CHECK-SD-LABEL: add_v4i8_v4i32_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    shl v0.4s, v0.4s, #24 | 
|  | ; CHECK-SD-NEXT:    sshr v0.4s, v0.4s, #24 | 
|  | ; CHECK-SD-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v4i8_v4i32_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8 | 
|  | ; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8 | 
|  | ; CHECK-GI-NEXT:    saddlv s0, v0.4h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    sxth w0, w8 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <4 x i8> %x to <4 x i32> | 
|  | %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Zero-extends <16 x i8> to <16 x i16>, then add-reduces to an i16 returned as zeroext. | 
|  | ; The SelectionDAG and GlobalISel runs have separate expectations for this case. | 
|  | define zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x) { | 
|  | ; CHECK-SD-LABEL: add_v16i8_v16i16_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    uaddlp v0.8h, v0.16b | 
|  | ; CHECK-SD-NEXT:    addv h0, v0.8h | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v16i8_v16i16_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddlv h0, v0.16b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    and w0, w8, #0xffff | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <16 x i8> %x to <16 x i16> | 
|  | %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx) | 
|  | ret i16 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends <16 x i8> to <16 x i16>, then add-reduces to an i16 returned as signext. | 
|  | ; The SelectionDAG and GlobalISel runs have separate expectations for this case. | 
|  | define signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x) { | 
|  | ; CHECK-SD-LABEL: add_v16i8_v16i16_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    saddlp v0.8h, v0.16b | 
|  | ; CHECK-SD-NEXT:    addv h0, v0.8h | 
|  | ; CHECK-SD-NEXT:    smov w0, v0.h[0] | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v16i8_v16i16_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    saddlv h0, v0.16b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    sxth w0, w8 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <16 x i8> %x to <16 x i16> | 
|  | %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx) | 
|  | ret i16 %z | 
|  | } | 
|  |  | 
|  | ; Zero-extends <8 x i8> to <8 x i16>, then add-reduces to an i16 returned as zeroext. | 
|  | ; The SelectionDAG and GlobalISel runs have separate expectations for this case. | 
|  | define zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x) { | 
|  | ; CHECK-SD-LABEL: add_v8i8_v8i16_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-NEXT:    addv h0, v0.8h | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v8i8_v8i16_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddlv h0, v0.8b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    and w0, w8, #0xffff | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <8 x i8> %x to <8 x i16> | 
|  | %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx) | 
|  | ret i16 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends <8 x i8> to <8 x i16>, then add-reduces to an i16 returned as signext. | 
|  | ; The SelectionDAG and GlobalISel runs have separate expectations for this case. | 
|  | define signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x) { | 
|  | ; CHECK-SD-LABEL: add_v8i8_v8i16_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-NEXT:    addv h0, v0.8h | 
|  | ; CHECK-SD-NEXT:    smov w0, v0.h[0] | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v8i8_v8i16_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    saddlv h0, v0.8b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    sxth w0, w8 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <8 x i8> %x to <8 x i16> | 
|  | %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx) | 
|  | ret i16 %z | 
|  | } | 
|  |  | 
|  | ; Plain add reduction of <16 x i8>, returned as a zeroext i8. | 
|  | ; The GlobalISel expectations include an explicit uxtb of the result; the | 
|  | ; SelectionDAG expectations do not. | 
|  | define zeroext i8 @add_v16i8_v16i8(<16 x i8> %x) { | 
|  | ; CHECK-SD-LABEL: add_v16i8_v16i8: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    addv b0, v0.16b | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v16i8_v16i8: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    addv b0, v0.16b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    uxtb w0, w8 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x) | 
|  | ret i8 %z | 
|  | } | 
|  |  | 
|  | ; Zero-extends <16 x i8> to <16 x i64> (an 8x widening), then add-reduces to a scalar i64. | 
|  | ; The SelectionDAG and GlobalISel runs have separate expectations for this case. | 
|  | define i64 @add_v16i8_v16i64_zext(<16 x i8> %x) { | 
|  | ; CHECK-SD-LABEL: add_v16i8_v16i64_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    ushll2 v1.8h, v0.16b, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-NEXT:    ushll2 v2.4s, v1.8h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll2 v3.4s, v0.8h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s | 
|  | ; CHECK-SD-NEXT:    uaddl v2.2d, v3.2s, v2.2s | 
|  | ; CHECK-SD-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s | 
|  | ; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s | 
|  | ; CHECK-SD-NEXT:    add v1.2d, v5.2d, v4.2d | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v16i8_v16i64_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddlv h0, v0.16b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    and x0, x8, #0xffff | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <16 x i8> %x to <16 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends <16 x i8> to <16 x i64> (an 8x widening), then add-reduces to a scalar i64. | 
|  | ; The SelectionDAG and GlobalISel runs have separate expectations for this case. | 
|  | define i64 @add_v16i8_v16i64_sext(<16 x i8> %x) { | 
|  | ; CHECK-SD-LABEL: add_v16i8_v16i64_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    sshll2 v1.8h, v0.16b, #0 | 
|  | ; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-NEXT:    sshll2 v2.4s, v1.8h, #0 | 
|  | ; CHECK-SD-NEXT:    sshll2 v3.4s, v0.8h, #0 | 
|  | ; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0 | 
|  | ; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    saddl2 v4.2d, v3.4s, v2.4s | 
|  | ; CHECK-SD-NEXT:    saddl v2.2d, v3.2s, v2.2s | 
|  | ; CHECK-SD-NEXT:    saddl2 v5.2d, v0.4s, v1.4s | 
|  | ; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s | 
|  | ; CHECK-SD-NEXT:    add v1.2d, v5.2d, v4.2d | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v16i8_v16i64_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    saddlv h0, v0.16b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    sxth x0, w8 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <16 x i8> %x to <16 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Zero-extends <8 x i8> to <8 x i64> (an 8x widening), then add-reduces to a scalar i64. | 
|  | ; The SelectionDAG and GlobalISel runs have separate expectations for this case. | 
|  | define i64 @add_v8i8_v8i64_zext(<8 x i8> %x) { | 
|  | ; CHECK-SD-LABEL: add_v8i8_v8i64_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-NEXT:    ushll2 v1.4s, v0.8h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s | 
|  | ; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v8i8_v8i64_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddlv h0, v0.8b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    and x0, x8, #0xffff | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <8 x i8> %x to <8 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends <8 x i8> to <8 x i64> (an 8x widening), then add-reduces to a scalar i64. | 
|  | ; The SelectionDAG and GlobalISel runs have separate expectations for this case. | 
|  | define i64 @add_v8i8_v8i64_sext(<8 x i8> %x) { | 
|  | ; CHECK-SD-LABEL: add_v8i8_v8i64_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-NEXT:    sshll2 v1.4s, v0.8h, #0 | 
|  | ; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    saddl2 v2.2d, v0.4s, v1.4s | 
|  | ; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v8i8_v8i64_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    saddlv h0, v0.8b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    sxth x0, w8 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <8 x i8> %x to <8 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Zero-extends <4 x i8> to <4 x i64>, then add-reduces the result to a scalar i64. | 
|  | ; The SelectionDAG and GlobalISel runs have separate expectations for this case. | 
|  | define i64 @add_v4i8_v4i64_zext(<4 x i8> %x) { | 
|  | ; CHECK-SD-LABEL: add_v4i8_v4i64_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8 | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    uaddlv d0, v0.4s | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v4i8_v4i64_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    movi d1, #0xff00ff00ff00ff | 
|  | ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b | 
|  | ; CHECK-GI-NEXT:    uaddlv s0, v0.4h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    and x0, x8, #0xffff | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <4 x i8> %x to <4 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends <4 x i8> to <4 x i64>, then add-reduces the result to a scalar i64. | 
|  | ; The SelectionDAG and GlobalISel runs have separate expectations for this case. | 
|  | define i64 @add_v4i8_v4i64_sext(<4 x i8> %x) { | 
|  | ; CHECK-SD-LABEL: add_v4i8_v4i64_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v1.2d, v0.2s, #0 | 
|  | ; CHECK-SD-NEXT:    ushll2 v0.2d, v0.4s, #0 | 
|  | ; CHECK-SD-NEXT:    shl v1.2d, v1.2d, #56 | 
|  | ; CHECK-SD-NEXT:    shl v0.2d, v0.2d, #56 | 
|  | ; CHECK-SD-NEXT:    sshr v1.2d, v1.2d, #56 | 
|  | ; CHECK-SD-NEXT:    ssra v1.2d, v0.2d, #56 | 
|  | ; CHECK-SD-NEXT:    addp d0, v1.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v4i8_v4i64_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8 | 
|  | ; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8 | 
|  | ; CHECK-GI-NEXT:    saddlv s0, v0.4h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    sxth x0, w8 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <4 x i8> %x to <4 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Zero-extends <2 x i8> to <2 x i64>, then add-reduces the result to a scalar i64. | 
|  | ; Both selectors are expected to mask the lanes to 8 bits, but in a different | 
|  | ; order relative to the widening shift, so they are checked separately. | 
|  | define i64 @add_v2i8_v2i64_zext(<2 x i8> %x) { | 
|  | ; CHECK-SD-LABEL: add_v2i8_v2i64_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    movi d1, #0x0000ff000000ff | 
|  | ; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b | 
|  | ; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0 | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v2i8_v2i64_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    movi v1.2d, #0x000000000000ff | 
|  | ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0 | 
|  | ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b | 
|  | ; CHECK-GI-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-GI-NEXT:    fmov x0, d0 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <2 x i8> %x to <2 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends <2 x i8> to <2 x i64> and sums the two lanes with | 
|  | ; llvm.vector.reduce.add.v2i64. All RUN configurations share one expected | 
|  | ; sequence: ushll, then shl/sshr #56 to sign-extend, then addp. | 
|  | define i64 @add_v2i8_v2i64_sext(<2 x i8> %x) { | 
|  | ; CHECK-LABEL: add_v2i8_v2i64_sext: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    ushll v0.2d, v0.2s, #0 | 
|  | ; CHECK-NEXT:    shl v0.2d, v0.2d, #56 | 
|  | ; CHECK-NEXT:    sshr v0.2d, v0.2d, #56 | 
|  | ; CHECK-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-NEXT:    fmov x0, d0 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <2 x i8> %x to <2 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sums the two lanes of a <2 x i64> with llvm.vector.reduce.add.v2i64, with no | 
|  | ; extension involved. All RUN configurations expect a single addp. | 
|  | define i64 @add_v2i64_v2i64(<2 x i64> %x) { | 
|  | ; CHECK-LABEL: add_v2i64_v2i64: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-NEXT:    fmov x0, d0 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x) | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sums the lanes of a <4 x i32> with llvm.vector.reduce.add.v4i32 and adds the | 
|  | ; scalar accumulator %a. All RUN configurations expect addv followed by a | 
|  | ; scalar add. | 
|  | define i32 @add_v4i32_v4i32_acc(<4 x i32> %x, i32 %a) { | 
|  | ; CHECK-LABEL: add_v4i32_v4i32_acc: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-NEXT:    fmov w8, s0 | 
|  | ; CHECK-NEXT:    add w0, w8, w0 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x) | 
|  | %r = add i32 %z, %a | 
|  | ret i32 %r | 
|  | } | 
|  |  | 
|  | ; Zero-extends <4 x i32> to <4 x i64>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v4i64 and adds the scalar accumulator %a. All RUN | 
|  | ; configurations expect a single uaddlv followed by a scalar add. | 
|  | define i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, i64 %a) { | 
|  | ; CHECK-LABEL: add_v4i32_v4i64_acc_zext: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    uaddlv d0, v0.4s | 
|  | ; CHECK-NEXT:    fmov x8, d0 | 
|  | ; CHECK-NEXT:    add x0, x8, x0 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <4 x i32> %x to <4 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) | 
|  | %r = add i64 %z, %a | 
|  | ret i64 %r | 
|  | } | 
|  |  | 
|  | ; Sign-extends <4 x i32> to <4 x i64>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v4i64 and adds the scalar accumulator %a. All RUN | 
|  | ; configurations expect a single saddlv followed by a scalar add. | 
|  | define i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, i64 %a) { | 
|  | ; CHECK-LABEL: add_v4i32_v4i64_acc_sext: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    saddlv d0, v0.4s | 
|  | ; CHECK-NEXT:    fmov x8, d0 | 
|  | ; CHECK-NEXT:    add x0, x8, x0 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <4 x i32> %x to <4 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) | 
|  | %r = add i64 %z, %a | 
|  | ret i64 %r | 
|  | } | 
|  |  | 
|  | ; Zero-extends <2 x i32> to <2 x i64>, sums the two lanes with | 
|  | ; llvm.vector.reduce.add.v2i64 and adds the scalar accumulator %a. All RUN | 
|  | ; configurations expect ushll, addp and a scalar add. | 
|  | define i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, i64 %a) { | 
|  | ; CHECK-LABEL: add_v2i32_v2i64_acc_zext: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    ushll v0.2d, v0.2s, #0 | 
|  | ; CHECK-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-NEXT:    fmov x8, d0 | 
|  | ; CHECK-NEXT:    add x0, x8, x0 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <2 x i32> %x to <2 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) | 
|  | %r = add i64 %z, %a | 
|  | ret i64 %r | 
|  | } | 
|  |  | 
|  | ; Sign-extends <2 x i32> to <2 x i64>, sums the two lanes with | 
|  | ; llvm.vector.reduce.add.v2i64 and adds the scalar accumulator %a. All RUN | 
|  | ; configurations expect sshll, addp and a scalar add. | 
|  | define i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, i64 %a) { | 
|  | ; CHECK-LABEL: add_v2i32_v2i64_acc_sext: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    sshll v0.2d, v0.2s, #0 | 
|  | ; CHECK-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-NEXT:    fmov x8, d0 | 
|  | ; CHECK-NEXT:    add x0, x8, x0 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <2 x i32> %x to <2 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) | 
|  | %r = add i64 %z, %a | 
|  | ret i64 %r | 
|  | } | 
|  |  | 
|  | ; Zero-extends <8 x i16> to <8 x i32>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v8i32 and adds the scalar accumulator %a. All RUN | 
|  | ; configurations expect a single uaddlv followed by a scalar add. | 
|  | define i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, i32 %a) { | 
|  | ; CHECK-LABEL: add_v8i16_v8i32_acc_zext: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    uaddlv s0, v0.8h | 
|  | ; CHECK-NEXT:    fmov w8, s0 | 
|  | ; CHECK-NEXT:    add w0, w8, w0 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <8 x i16> %x to <8 x i32> | 
|  | %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) | 
|  | %r = add i32 %z, %a | 
|  | ret i32 %r | 
|  | } | 
|  |  | 
|  | ; Sign-extends <8 x i16> to <8 x i32>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v8i32 and adds the scalar accumulator %a. All RUN | 
|  | ; configurations expect a single saddlv followed by a scalar add. | 
|  | define i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, i32 %a) { | 
|  | ; CHECK-LABEL: add_v8i16_v8i32_acc_sext: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    saddlv s0, v0.8h | 
|  | ; CHECK-NEXT:    fmov w8, s0 | 
|  | ; CHECK-NEXT:    add w0, w8, w0 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <8 x i16> %x to <8 x i32> | 
|  | %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) | 
|  | %r = add i32 %z, %a | 
|  | ret i32 %r | 
|  | } | 
|  |  | 
|  | ; Zero-extends <4 x i16> to <4 x i32>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v4i32 and adds the scalar accumulator %a. The SD runs | 
|  | ; expect ushll + addv; the GI runs expect a single uaddlv on the 16-bit lanes. | 
|  | define i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, i32 %a) { | 
|  | ; CHECK-SD-LABEL: add_v4i16_v4i32_acc_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-NEXT:    fmov w8, s0 | 
|  | ; CHECK-SD-NEXT:    add w0, w8, w0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v4i16_v4i32_acc_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddlv s0, v0.4h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    add w0, w8, w0 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <4 x i16> %x to <4 x i32> | 
|  | %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) | 
|  | %r = add i32 %z, %a | 
|  | ret i32 %r | 
|  | } | 
|  |  | 
|  | ; Sign-extends <4 x i16> to <4 x i32>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v4i32 and adds the scalar accumulator %a. The SD runs | 
|  | ; expect sshll + addv; the GI runs expect a single saddlv on the 16-bit lanes. | 
|  | define i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, i32 %a) { | 
|  | ; CHECK-SD-LABEL: add_v4i16_v4i32_acc_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-NEXT:    fmov w8, s0 | 
|  | ; CHECK-SD-NEXT:    add w0, w8, w0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v4i16_v4i32_acc_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    saddlv s0, v0.4h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    add w0, w8, w0 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <4 x i16> %x to <4 x i32> | 
|  | %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) | 
|  | %r = add i32 %z, %a | 
|  | ret i32 %r | 
|  | } | 
|  |  | 
|  | ; Sums the lanes of a <8 x i16> with llvm.vector.reduce.add.v8i16, adds the i16 | 
|  | ; accumulator %a and returns the result as zeroext i16. Both selectors expect | 
|  | ; addv and a final "and ... #0xffff"; the GI runs additionally fold a uxth into | 
|  | ; the scalar add. | 
|  | define zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, i16 %a) { | 
|  | ; CHECK-SD-LABEL: add_v8i16_v8i16_acc: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    addv h0, v0.8h | 
|  | ; CHECK-SD-NEXT:    fmov w8, s0 | 
|  | ; CHECK-SD-NEXT:    add w8, w8, w0 | 
|  | ; CHECK-SD-NEXT:    and w0, w8, #0xffff | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v8i16_v8i16_acc: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    addv h0, v0.8h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    add w8, w0, w8, uxth | 
|  | ; CHECK-GI-NEXT:    and w0, w8, #0xffff | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x) | 
|  | %r = add i16 %z, %a | 
|  | ret i16 %r | 
|  | } | 
|  |  | 
|  | ; Zero-extends <8 x i16> to <8 x i64>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v8i64 and adds the scalar accumulator %a. The SD runs | 
|  | ; expect a ushll/uaddl widening chain ending in addp; the GI runs expect a | 
|  | ; single uaddlv whose 32-bit result is folded into the add with uxtw. | 
|  | define i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) { | 
|  | ; CHECK-SD-LABEL: add_v8i16_v8i64_acc_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    ushll2 v1.4s, v0.8h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s | 
|  | ; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x8, d0 | 
|  | ; CHECK-SD-NEXT:    add x0, x8, x0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v8i16_v8i64_acc_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddlv s0, v0.8h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    add x0, x0, w8, uxtw | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <8 x i16> %x to <8 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) | 
|  | %r = add i64 %z, %a | 
|  | ret i64 %r | 
|  | } | 
|  |  | 
|  | ; Sign-extends <8 x i16> to <8 x i64>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v8i64 and adds the scalar accumulator %a. The SD runs | 
|  | ; expect an sshll/saddl widening chain ending in addp; the GI runs expect a | 
|  | ; single saddlv whose 32-bit result is folded into the add with sxtw. | 
|  | define i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) { | 
|  | ; CHECK-SD-LABEL: add_v8i16_v8i64_acc_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    sshll2 v1.4s, v0.8h, #0 | 
|  | ; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    saddl2 v2.2d, v0.4s, v1.4s | 
|  | ; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x8, d0 | 
|  | ; CHECK-SD-NEXT:    add x0, x8, x0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v8i16_v8i64_acc_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    saddlv s0, v0.8h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    add x0, x0, w8, sxtw | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <8 x i16> %x to <8 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) | 
|  | %r = add i64 %z, %a | 
|  | ret i64 %r | 
|  | } | 
|  |  | 
|  | ; Zero-extends <4 x i16> to <4 x i64>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v4i64 and adds the scalar accumulator %a. The SD runs | 
|  | ; expect ushll followed by uaddlv on 32-bit lanes; the GI runs expect uaddlv | 
|  | ; directly on the 16-bit lanes with uxtw folded into the add. | 
|  | define i64 @add_v4i16_v4i64_acc_zext(<4 x i16> %x, i64 %a) { | 
|  | ; CHECK-SD-LABEL: add_v4i16_v4i64_acc_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    uaddlv d0, v0.4s | 
|  | ; CHECK-SD-NEXT:    fmov x8, d0 | 
|  | ; CHECK-SD-NEXT:    add x0, x8, x0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v4i16_v4i64_acc_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddlv s0, v0.4h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    add x0, x0, w8, uxtw | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <4 x i16> %x to <4 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) | 
|  | %r = add i64 %z, %a | 
|  | ret i64 %r | 
|  | } | 
|  |  | 
|  | ; Sign-extends <4 x i16> to <4 x i64>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v4i64 and adds the scalar accumulator %a. The SD runs | 
|  | ; expect sshll followed by saddlv on 32-bit lanes; the GI runs expect saddlv | 
|  | ; directly on the 16-bit lanes with sxtw folded into the add. | 
|  | define i64 @add_v4i16_v4i64_acc_sext(<4 x i16> %x, i64 %a) { | 
|  | ; CHECK-SD-LABEL: add_v4i16_v4i64_acc_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    saddlv d0, v0.4s | 
|  | ; CHECK-SD-NEXT:    fmov x8, d0 | 
|  | ; CHECK-SD-NEXT:    add x0, x8, x0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v4i16_v4i64_acc_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    saddlv s0, v0.4h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    add x0, x0, w8, sxtw | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <4 x i16> %x to <4 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) | 
|  | %r = add i64 %z, %a | 
|  | ret i64 %r | 
|  | } | 
|  |  | 
|  | ; Zero-extends <2 x i16> to <2 x i64>, sums the two lanes with | 
|  | ; llvm.vector.reduce.add.v2i64 and adds the scalar accumulator %a. Both | 
|  | ; selectors mask, widen with ushll and reduce with addp; the SD runs mask | 
|  | ; before the ushll, the GI runs mask after it. | 
|  | define i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) { | 
|  | ; CHECK-SD-LABEL: add_v2i16_v2i64_acc_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    movi d1, #0x00ffff0000ffff | 
|  | ; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b | 
|  | ; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0 | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x8, d0 | 
|  | ; CHECK-SD-NEXT:    add x0, x8, x0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v2i16_v2i64_acc_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    movi v1.2d, #0x0000000000ffff | 
|  | ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0 | 
|  | ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b | 
|  | ; CHECK-GI-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-GI-NEXT:    fmov x8, d0 | 
|  | ; CHECK-GI-NEXT:    add x0, x8, x0 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <2 x i16> %x to <2 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) | 
|  | %r = add i64 %z, %a | 
|  | ret i64 %r | 
|  | } | 
|  |  | 
|  | ; Sign-extends <2 x i16> to <2 x i64>, sums the two lanes with | 
|  | ; llvm.vector.reduce.add.v2i64 and adds the scalar accumulator %a. All RUN | 
|  | ; configurations share one expected sequence: ushll, shl/sshr #48 to | 
|  | ; sign-extend, addp, then a scalar add. | 
|  | define i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, i64 %a) { | 
|  | ; CHECK-LABEL: add_v2i16_v2i64_acc_sext: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    ushll v0.2d, v0.2s, #0 | 
|  | ; CHECK-NEXT:    shl v0.2d, v0.2d, #48 | 
|  | ; CHECK-NEXT:    sshr v0.2d, v0.2d, #48 | 
|  | ; CHECK-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-NEXT:    fmov x8, d0 | 
|  | ; CHECK-NEXT:    add x0, x8, x0 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <2 x i16> %x to <2 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) | 
|  | %r = add i64 %z, %a | 
|  | ret i64 %r | 
|  | } | 
|  |  | 
|  | ; Zero-extends <16 x i8> to <16 x i32>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v16i32 and adds the scalar accumulator %a. Each RUN | 
|  | ; configuration has its own expectation: SD without dotprod uses a | 
|  | ; ushll/uaddl widening chain, GI without dotprod uses uaddlv with uxth folded | 
|  | ; into the add, and both +dotprod runs use udot against a vector of ones | 
|  | ; followed by addv. | 
|  | define i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, i32 %a) { | 
|  | ; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_acc_zext: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    ushll2 v1.8h, v0.16b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    uaddl2 v2.4s, v0.8h, v1.8h | 
|  | ; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h | 
|  | ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s | 
|  | ; CHECK-SD-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w8, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    add w0, w8, w0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_acc_zext: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    movi v1.16b, #1 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    udot v2.4s, v0.16b, v1.16b | 
|  | ; CHECK-SD-DOT-NEXT:    addv s0, v2.4s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w8, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    add w0, w8, w0 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_acc_zext: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.16b | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    add w0, w0, w8, uxth | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_acc_zext: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    movi v1.16b, #1 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    udot v2.4s, v0.16b, v1.16b | 
|  | ; CHECK-GI-DOT-NEXT:    addv s0, v2.4s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    add w0, w8, w0 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <16 x i8> %x to <16 x i32> | 
|  | %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx) | 
|  | %r = add i32 %z, %a | 
|  | ret i32 %r | 
|  | } | 
|  |  | 
|  | ; Sign-extends <16 x i8> to <16 x i32>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v16i32 and adds the scalar accumulator %a. Each RUN | 
|  | ; configuration has its own expectation: SD without dotprod uses an | 
|  | ; sshll/saddl widening chain, GI without dotprod uses saddlv with sxth folded | 
|  | ; into the add, and both +dotprod runs use sdot against a vector of ones | 
|  | ; followed by addv. | 
|  | define i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, i32 %a) { | 
|  | ; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_acc_sext: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    sshll2 v1.8h, v0.16b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    saddl2 v2.4s, v0.8h, v1.8h | 
|  | ; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v1.4h | 
|  | ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s | 
|  | ; CHECK-SD-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w8, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    add w0, w8, w0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_acc_sext: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    movi v1.16b, #1 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    sdot v2.4s, v0.16b, v1.16b | 
|  | ; CHECK-SD-DOT-NEXT:    addv s0, v2.4s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w8, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    add w0, w8, w0 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_acc_sext: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    saddlv h0, v0.16b | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    add w0, w0, w8, sxth | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_acc_sext: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    movi v1.16b, #1 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    sdot v2.4s, v0.16b, v1.16b | 
|  | ; CHECK-GI-DOT-NEXT:    addv s0, v2.4s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    add w0, w8, w0 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <16 x i8> %x to <16 x i32> | 
|  | %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx) | 
|  | %r = add i32 %z, %a | 
|  | ret i32 %r | 
|  | } | 
|  |  | 
|  | ; Zero-extends <8 x i8> to <8 x i32>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v8i32 and adds the scalar accumulator %a. Without | 
|  | ; dotprod, SD expects ushll + uaddlv and GI expects uaddlv on the bytes with | 
|  | ; uxth folded into the add. With +dotprod both selectors expect a 64-bit udot | 
|  | ; against ones followed by addp; the two expectations differ only in the | 
|  | ; order and registers of the movi setup. | 
|  | define i32 @add_v8i8_v8i32_acc_zext(<8 x i8> %x, i32 %a) { | 
|  | ; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_acc_zext: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    uaddlv s0, v0.8h | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w8, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    add w0, w8, w0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_acc_zext: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    movi v1.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v2.8b, #1 | 
|  | ; CHECK-SD-DOT-NEXT:    udot v1.2s, v0.8b, v2.8b | 
|  | ; CHECK-SD-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w8, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    add w0, w8, w0 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_acc_zext: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.8b | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    add w0, w0, w8, uxth | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_acc_zext: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    movi v1.8b, #1 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    udot v2.2s, v0.8b, v1.8b | 
|  | ; CHECK-GI-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    add w0, w8, w0 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <8 x i8> %x to <8 x i32> | 
|  | %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) | 
|  | %r = add i32 %z, %a | 
|  | ret i32 %r | 
|  | } | 
|  |  | 
|  | ; Sign-extends <8 x i8> to <8 x i32>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v8i32 and adds the scalar accumulator %a. Without | 
|  | ; dotprod, SD expects sshll + saddlv and GI expects saddlv on the bytes with | 
|  | ; sxth folded into the add. With +dotprod both selectors expect a 64-bit sdot | 
|  | ; against ones followed by addp; the two expectations differ only in the | 
|  | ; order and registers of the movi setup. | 
|  | define i32 @add_v8i8_v8i32_acc_sext(<8 x i8> %x, i32 %a) { | 
|  | ; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_acc_sext: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    saddlv s0, v0.8h | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w8, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    add w0, w8, w0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_acc_sext: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    movi v1.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v2.8b, #1 | 
|  | ; CHECK-SD-DOT-NEXT:    sdot v1.2s, v0.8b, v2.8b | 
|  | ; CHECK-SD-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w8, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    add w0, w8, w0 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_acc_sext: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    saddlv h0, v0.8b | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    add w0, w0, w8, sxth | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_acc_sext: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    movi v1.8b, #1 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    sdot v2.2s, v0.8b, v1.8b | 
|  | ; CHECK-GI-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    add w0, w8, w0 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <8 x i8> %x to <8 x i32> | 
|  | %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) | 
|  | %r = add i32 %z, %a | 
|  | ret i32 %r | 
|  | } | 
|  |  | 
|  | ; Zero-extends <4 x i8> to <4 x i32>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v4i32 and adds the scalar accumulator %a. The SD runs | 
|  | ; clear the high byte of each 16-bit lane with bic, then ushll + addv; the GI | 
|  | ; runs mask with movi/and, use uaddlv and fold uxth into the add. | 
|  | define i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, i32 %a) { | 
|  | ; CHECK-SD-LABEL: add_v4i8_v4i32_acc_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8 | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-NEXT:    fmov w8, s0 | 
|  | ; CHECK-SD-NEXT:    add w0, w8, w0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v4i8_v4i32_acc_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    movi d1, #0xff00ff00ff00ff | 
|  | ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b | 
|  | ; CHECK-GI-NEXT:    uaddlv s0, v0.4h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    add w0, w0, w8, uxth | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <4 x i8> %x to <4 x i32> | 
|  | %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) | 
|  | %r = add i32 %z, %a | 
|  | ret i32 %r | 
|  | } | 
|  |  | 
|  | ; Sign-extends <4 x i8> to <4 x i32>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v4i32 and adds the scalar accumulator %a. The SD runs | 
|  | ; widen with ushll and sign-extend in 32-bit lanes (shl/sshr #24) before addv; | 
|  | ; the GI runs sign-extend in 16-bit lanes (shl/sshr #8), use saddlv and fold | 
|  | ; sxth into the add. | 
|  | define i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, i32 %a) { | 
|  | ; CHECK-SD-LABEL: add_v4i8_v4i32_acc_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    shl v0.4s, v0.4s, #24 | 
|  | ; CHECK-SD-NEXT:    sshr v0.4s, v0.4s, #24 | 
|  | ; CHECK-SD-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-NEXT:    fmov w8, s0 | 
|  | ; CHECK-SD-NEXT:    add w0, w8, w0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v4i8_v4i32_acc_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8 | 
|  | ; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8 | 
|  | ; CHECK-GI-NEXT:    saddlv s0, v0.4h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    add w0, w0, w8, sxth | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <4 x i8> %x to <4 x i32> | 
|  | %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) | 
|  | %r = add i32 %z, %a | 
|  | ret i32 %r | 
|  | } | 
|  |  | 
|  | ; Zero-extends <16 x i8> to <16 x i16>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v16i16, adds the i16 accumulator %a and returns the | 
|  | ; result as zeroext i16. All RUN configurations expect uaddlv, a scalar add | 
|  | ; and a final "and ... #0xffff". | 
|  | define zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, i16 %a) { | 
|  | ; CHECK-LABEL: add_v16i8_v16i16_acc_zext: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    uaddlv h0, v0.16b | 
|  | ; CHECK-NEXT:    fmov w8, s0 | 
|  | ; CHECK-NEXT:    add w8, w8, w0 | 
|  | ; CHECK-NEXT:    and w0, w8, #0xffff | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <16 x i8> %x to <16 x i16> | 
|  | %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx) | 
|  | %r = add i16 %z, %a | 
|  | ret i16 %r | 
|  | } | 
|  |  | 
|  | ; Sign-extends <16 x i8> to <16 x i16>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v16i16, adds the i16 accumulator %a and returns the | 
|  | ; result as signext i16. All RUN configurations expect saddlv, a scalar add | 
|  | ; and a final sxth. | 
|  | define signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, i16 %a) { | 
|  | ; CHECK-LABEL: add_v16i8_v16i16_acc_sext: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    saddlv h0, v0.16b | 
|  | ; CHECK-NEXT:    fmov w8, s0 | 
|  | ; CHECK-NEXT:    add w8, w8, w0 | 
|  | ; CHECK-NEXT:    sxth w0, w8 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <16 x i8> %x to <16 x i16> | 
|  | %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx) | 
|  | %r = add i16 %z, %a | 
|  | ret i16 %r | 
|  | } | 
|  |  | 
|  | ; Zero-extends <8 x i8> to <8 x i16>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v8i16, adds the i16 accumulator %a and returns the | 
|  | ; result as zeroext i16. The SD runs expect ushll + addv; the GI runs expect a | 
|  | ; single uaddlv on the bytes. Both end with "and ... #0xffff". | 
|  | define zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, i16 %a) { | 
|  | ; CHECK-SD-LABEL: add_v8i8_v8i16_acc_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-NEXT:    addv h0, v0.8h | 
|  | ; CHECK-SD-NEXT:    fmov w8, s0 | 
|  | ; CHECK-SD-NEXT:    add w8, w8, w0 | 
|  | ; CHECK-SD-NEXT:    and w0, w8, #0xffff | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v8i8_v8i16_acc_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddlv h0, v0.8b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    add w8, w8, w0 | 
|  | ; CHECK-GI-NEXT:    and w0, w8, #0xffff | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <8 x i8> %x to <8 x i16> | 
|  | %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx) | 
|  | %r = add i16 %z, %a | 
|  | ret i16 %r | 
|  | } | 
|  |  | 
|  | ; Sign-extends <8 x i8> to <8 x i16>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v8i16, adds the i16 accumulator %a and returns the | 
|  | ; result as signext i16. The SD runs expect sshll + addv; the GI runs expect a | 
|  | ; single saddlv on the bytes. Both end with sxth. | 
|  | define signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, i16 %a) { | 
|  | ; CHECK-SD-LABEL: add_v8i8_v8i16_acc_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-NEXT:    addv h0, v0.8h | 
|  | ; CHECK-SD-NEXT:    fmov w8, s0 | 
|  | ; CHECK-SD-NEXT:    add w8, w8, w0 | 
|  | ; CHECK-SD-NEXT:    sxth w0, w8 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v8i8_v8i16_acc_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    saddlv h0, v0.8b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    add w8, w8, w0 | 
|  | ; CHECK-GI-NEXT:    sxth w0, w8 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <8 x i8> %x to <8 x i16> | 
|  | %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx) | 
|  | %r = add i16 %z, %a | 
|  | ret i16 %r | 
|  | } | 
|  |  | 
|  | ; Sums the lanes of a <16 x i8> with llvm.vector.reduce.add.v16i8, adds the i8 | 
|  | ; accumulator %a and returns the result as zeroext i8. Both selectors expect | 
|  | ; addv and a final "and ... #0xff"; the GI runs additionally fold a uxtb into | 
|  | ; the scalar add. | 
|  | define zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, i8 %a) { | 
|  | ; CHECK-SD-LABEL: add_v16i8_v16i8_acc: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    addv b0, v0.16b | 
|  | ; CHECK-SD-NEXT:    fmov w8, s0 | 
|  | ; CHECK-SD-NEXT:    add w8, w8, w0 | 
|  | ; CHECK-SD-NEXT:    and w0, w8, #0xff | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v16i8_v16i8_acc: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    addv b0, v0.16b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    add w8, w0, w8, uxtb | 
|  | ; CHECK-GI-NEXT:    and w0, w8, #0xff | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x) | 
|  | %r = add i8 %z, %a | 
|  | ret i8 %r | 
|  | } | 
|  |  | 
|  | ; Zero-extends <16 x i8> to <16 x i64>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v16i64 and adds the scalar accumulator %a. The SD runs | 
|  | ; expect a multi-stage ushll/uaddl widening tree ending in addp; the GI runs | 
|  | ; expect a single uaddlv on the bytes with uxth folded into the 64-bit add. | 
|  | define i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) { | 
|  | ; CHECK-SD-LABEL: add_v16i8_v16i64_acc_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    ushll2 v1.8h, v0.16b, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-NEXT:    ushll2 v2.4s, v1.8h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll2 v3.4s, v0.8h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s | 
|  | ; CHECK-SD-NEXT:    uaddl v2.2d, v3.2s, v2.2s | 
|  | ; CHECK-SD-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s | 
|  | ; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s | 
|  | ; CHECK-SD-NEXT:    add v1.2d, v5.2d, v4.2d | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x8, d0 | 
|  | ; CHECK-SD-NEXT:    add x0, x8, x0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v16i8_v16i64_acc_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddlv h0, v0.16b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    add x0, x0, w8, uxth | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <16 x i8> %x to <16 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) | 
|  | %r = add i64 %z, %a | 
|  | ret i64 %r | 
|  | } | 
|  |  | 
|  | ; Sign-extends <16 x i8> to <16 x i64>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v16i64 and adds the scalar accumulator %a. The SD runs | 
|  | ; expect a multi-stage sshll/saddl widening tree ending in addp; the GI runs | 
|  | ; expect a single saddlv on the bytes with sxth folded into the 64-bit add. | 
|  | define i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) { | 
|  | ; CHECK-SD-LABEL: add_v16i8_v16i64_acc_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    sshll2 v1.8h, v0.16b, #0 | 
|  | ; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-NEXT:    sshll2 v2.4s, v1.8h, #0 | 
|  | ; CHECK-SD-NEXT:    sshll2 v3.4s, v0.8h, #0 | 
|  | ; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0 | 
|  | ; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    saddl2 v4.2d, v3.4s, v2.4s | 
|  | ; CHECK-SD-NEXT:    saddl v2.2d, v3.2s, v2.2s | 
|  | ; CHECK-SD-NEXT:    saddl2 v5.2d, v0.4s, v1.4s | 
|  | ; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s | 
|  | ; CHECK-SD-NEXT:    add v1.2d, v5.2d, v4.2d | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x8, d0 | 
|  | ; CHECK-SD-NEXT:    add x0, x8, x0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v16i8_v16i64_acc_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    saddlv h0, v0.16b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    add x0, x0, w8, sxth | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <16 x i8> %x to <16 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) | 
|  | %r = add i64 %z, %a | 
|  | ret i64 %r | 
|  | } | 
|  |  | 
|  | ; Zero-extends <8 x i8> to <8 x i64>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v8i64 and adds the scalar accumulator %a. The SD runs | 
|  | ; expect a ushll/uaddl widening chain ending in addp; the GI runs expect a | 
|  | ; single uaddlv on the bytes with uxth folded into the 64-bit add. | 
|  | define i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) { | 
|  | ; CHECK-SD-LABEL: add_v8i8_v8i64_acc_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-NEXT:    ushll2 v1.4s, v0.8h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s | 
|  | ; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x8, d0 | 
|  | ; CHECK-SD-NEXT:    add x0, x8, x0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v8i8_v8i64_acc_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddlv h0, v0.8b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    add x0, x0, w8, uxth | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <8 x i8> %x to <8 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) | 
|  | %r = add i64 %z, %a | 
|  | ret i64 %r | 
|  | } | 
|  |  | 
|  | ; Sign-extends <8 x i8> to <8 x i64>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v8i64 and adds the scalar accumulator %a. The SD runs | 
|  | ; expect an sshll/saddl widening chain ending in addp; the GI runs expect a | 
|  | ; single saddlv on the bytes with sxth folded into the 64-bit add. | 
|  | define i64 @add_v8i8_v8i64_acc_sext(<8 x i8> %x, i64 %a) { | 
|  | ; CHECK-SD-LABEL: add_v8i8_v8i64_acc_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-NEXT:    sshll2 v1.4s, v0.8h, #0 | 
|  | ; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    saddl2 v2.2d, v0.4s, v1.4s | 
|  | ; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x8, d0 | 
|  | ; CHECK-SD-NEXT:    add x0, x8, x0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v8i8_v8i64_acc_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    saddlv h0, v0.8b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    add x0, x0, w8, sxth | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <8 x i8> %x to <8 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) | 
|  | %r = add i64 %z, %a | 
|  | ret i64 %r | 
|  | } | 
|  |  | 
|  | ; Zero-extends <4 x i8> to <4 x i64>, sums the lanes with | 
|  | ; llvm.vector.reduce.add.v4i64 and adds the scalar accumulator %a. The SD runs | 
|  | ; clear the high byte of each 16-bit lane with bic, then ushll + uaddlv on | 
|  | ; 32-bit lanes; the GI runs mask with movi/and, use uaddlv on the 16-bit lanes | 
|  | ; and fold uxth into the 64-bit add. | 
|  | define i64 @add_v4i8_v4i64_acc_zext(<4 x i8> %x, i64 %a) { | 
|  | ; CHECK-SD-LABEL: add_v4i8_v4i64_acc_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8 | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    uaddlv d0, v0.4s | 
|  | ; CHECK-SD-NEXT:    fmov x8, d0 | 
|  | ; CHECK-SD-NEXT:    add x0, x8, x0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v4i8_v4i64_acc_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    movi d1, #0xff00ff00ff00ff | 
|  | ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b | 
|  | ; CHECK-GI-NEXT:    uaddlv s0, v0.4h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    add x0, x0, w8, uxth | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <4 x i8> %x to <4 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) | 
|  | %r = add i64 %z, %a | 
|  | ret i64 %r | 
|  | } | 
|  |  | 
|  | ; Add-reduction of <4 x i8> sign-extended to <4 x i64>, plus the scalar accumulator %a. | 
|  | ; The expected code sign-extends in-register with a left shift followed by an arithmetic | 
|  | ; right shift: by #56 on 64-bit lanes for SelectionDAG, by #8 on 16-bit lanes for GlobalISel. | 
|  | define i64 @add_v4i8_v4i64_acc_sext(<4 x i8> %x, i64 %a) { | 
|  | ; CHECK-SD-LABEL: add_v4i8_v4i64_acc_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v1.2d, v0.2s, #0 | 
|  | ; CHECK-SD-NEXT:    ushll2 v0.2d, v0.4s, #0 | 
|  | ; CHECK-SD-NEXT:    shl v1.2d, v1.2d, #56 | 
|  | ; CHECK-SD-NEXT:    shl v0.2d, v0.2d, #56 | 
|  | ; CHECK-SD-NEXT:    sshr v1.2d, v1.2d, #56 | 
|  | ; CHECK-SD-NEXT:    ssra v1.2d, v0.2d, #56 | 
|  | ; CHECK-SD-NEXT:    addp d0, v1.2d | 
|  | ; CHECK-SD-NEXT:    fmov x8, d0 | 
|  | ; CHECK-SD-NEXT:    add x0, x8, x0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v4i8_v4i64_acc_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8 | 
|  | ; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8 | 
|  | ; CHECK-GI-NEXT:    saddlv s0, v0.4h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    add x0, x0, w8, sxth | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <4 x i8> %x to <4 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) | 
|  | %r = add i64 %z, %a | 
|  | ret i64 %r | 
|  | } | 
|  |  | 
|  | ; Add-reduction of <2 x i8> zero-extended to <2 x i64>, plus the scalar accumulator %a. | 
|  | ; SelectionDAG is expected to mask the low byte before widening (and, then ushll); | 
|  | ; GlobalISel is expected to widen first and mask the 64-bit lanes afterwards. | 
|  | define i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) { | 
|  | ; CHECK-SD-LABEL: add_v2i8_v2i64_acc_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    movi d1, #0x0000ff000000ff | 
|  | ; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b | 
|  | ; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0 | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x8, d0 | 
|  | ; CHECK-SD-NEXT:    add x0, x8, x0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v2i8_v2i64_acc_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    movi v1.2d, #0x000000000000ff | 
|  | ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0 | 
|  | ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b | 
|  | ; CHECK-GI-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-GI-NEXT:    fmov x8, d0 | 
|  | ; CHECK-GI-NEXT:    add x0, x8, x0 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <2 x i8> %x to <2 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) | 
|  | %r = add i64 %z, %a | 
|  | ret i64 %r | 
|  | } | 
|  |  | 
|  | ; Add-reduction of <2 x i8> sign-extended to <2 x i64>, plus the scalar accumulator %a. | 
|  | ; All llc configurations in this file share one expected sequence here: | 
|  | ; ushll to 64-bit lanes, shl/sshr by #56 to sign-extend in place, then addp and a scalar add. | 
|  | define i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, i64 %a) { | 
|  | ; CHECK-LABEL: add_v2i8_v2i64_acc_sext: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    ushll v0.2d, v0.2s, #0 | 
|  | ; CHECK-NEXT:    shl v0.2d, v0.2d, #56 | 
|  | ; CHECK-NEXT:    sshr v0.2d, v0.2d, #56 | 
|  | ; CHECK-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-NEXT:    fmov x8, d0 | 
|  | ; CHECK-NEXT:    add x0, x8, x0 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <2 x i8> %x to <2 x i64> | 
|  | %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) | 
|  | %r = add i64 %z, %a | 
|  | ret i64 %r | 
|  | } | 
|  |  | 
|  | ; Add-reduction of <2 x i64> with no extension, plus the scalar accumulator %a. | 
|  | ; Every llc configuration is expected to emit a single addp followed by a scalar add. | 
|  | define i64 @add_v2i64_v2i64_acc(<2 x i64> %x, i64 %a) { | 
|  | ; CHECK-LABEL: add_v2i64_v2i64_acc: | 
|  | ; CHECK:       // %bb.0: // %entry | 
|  | ; CHECK-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-NEXT:    fmov x8, d0 | 
|  | ; CHECK-NEXT:    add x0, x8, x0 | 
|  | ; CHECK-NEXT:    ret | 
|  | entry: | 
|  | %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x) | 
|  | %r = add i64 %z, %a | 
|  | ret i64 %r | 
|  | } | 
|  |  | 
|  | ; Sum of two independent <4 x i32> add-reductions (%x and %y), with no extension. | 
|  | ; SelectionDAG is expected to add the vectors first and reduce once with addv; | 
|  | ; GlobalISel is expected to reduce each input separately and add the scalars. | 
|  | define i32 @add_pair_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v4i32_v4i32: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s | 
|  | ; CHECK-SD-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v4i32_v4i32: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-GI-NEXT:    addv s1, v1.4s | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-NEXT:    add w0, w8, w9 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x) | 
|  | %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y) | 
|  | %z = add i32 %z1, %z2 | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Sum of two add-reductions, each over a <4 x i32> input zero-extended to <4 x i64>. | 
|  | ; SelectionDAG is expected to combine both inputs with uaddlp + uadalp and finish with addp; | 
|  | ; GlobalISel is expected to emit one uaddlv per input and add the scalars. | 
|  | define i64 @add_pair_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v4i32_v4i64_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    uaddlp v1.2d, v1.4s | 
|  | ; CHECK-SD-NEXT:    uadalp v1.2d, v0.4s | 
|  | ; CHECK-SD-NEXT:    addp d0, v1.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v4i32_v4i64_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddlv d0, v0.4s | 
|  | ; CHECK-GI-NEXT:    uaddlv d1, v1.4s | 
|  | ; CHECK-GI-NEXT:    fmov x8, d0 | 
|  | ; CHECK-GI-NEXT:    fmov x9, d1 | 
|  | ; CHECK-GI-NEXT:    add x0, x8, x9 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <4 x i32> %x to <4 x i64> | 
|  | %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) | 
|  | %yy = zext <4 x i32> %y to <4 x i64> | 
|  | %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy) | 
|  | %z = add i64 %z1, %z2 | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sum of two add-reductions, each over a <4 x i32> input sign-extended to <4 x i64>. | 
|  | ; SelectionDAG is expected to combine both inputs with saddlp + sadalp and finish with addp; | 
|  | ; GlobalISel is expected to emit one saddlv per input and add the scalars. | 
|  | define i64 @add_pair_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v4i32_v4i64_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    saddlp v1.2d, v1.4s | 
|  | ; CHECK-SD-NEXT:    sadalp v1.2d, v0.4s | 
|  | ; CHECK-SD-NEXT:    addp d0, v1.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v4i32_v4i64_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    saddlv d0, v0.4s | 
|  | ; CHECK-GI-NEXT:    saddlv d1, v1.4s | 
|  | ; CHECK-GI-NEXT:    fmov x8, d0 | 
|  | ; CHECK-GI-NEXT:    fmov x9, d1 | 
|  | ; CHECK-GI-NEXT:    add x0, x8, x9 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <4 x i32> %x to <4 x i64> | 
|  | %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) | 
|  | %yy = sext <4 x i32> %y to <4 x i64> | 
|  | %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy) | 
|  | %z = add i64 %z1, %z2 | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sum of two add-reductions, each over a <2 x i32> input zero-extended to <2 x i64>. | 
|  | ; SelectionDAG is expected to pack both 64-bit inputs into one 128-bit register | 
|  | ; (mov v0.d[1], v1.d[0]) and reduce once with uaddlv; GlobalISel is expected to | 
|  | ; widen and reduce each input separately (ushll + addp) and add the scalars. | 
|  | define i64 @add_pair_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v2i32_v2i64_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0 | 
|  | ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1 | 
|  | ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0] | 
|  | ; CHECK-SD-NEXT:    uaddlv d0, v0.4s | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v2i32_v2i64_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0 | 
|  | ; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0 | 
|  | ; CHECK-GI-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-GI-NEXT:    addp d1, v1.2d | 
|  | ; CHECK-GI-NEXT:    fmov x8, d0 | 
|  | ; CHECK-GI-NEXT:    fmov x9, d1 | 
|  | ; CHECK-GI-NEXT:    add x0, x8, x9 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <2 x i32> %x to <2 x i64> | 
|  | %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) | 
|  | %yy = zext <2 x i32> %y to <2 x i64> | 
|  | %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy) | 
|  | %z = add i64 %z1, %z2 | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sum of two add-reductions, each over a <2 x i32> input sign-extended to <2 x i64>. | 
|  | ; SelectionDAG is expected to fold both extensions and the add into one saddl, then addp; | 
|  | ; GlobalISel is expected to widen and reduce each input separately (sshll + addp) | 
|  | ; and add the scalars. | 
|  | define i64 @add_pair_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v2i32_v2i64_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v2i32_v2i64_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0 | 
|  | ; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0 | 
|  | ; CHECK-GI-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-GI-NEXT:    addp d1, v1.2d | 
|  | ; CHECK-GI-NEXT:    fmov x8, d0 | 
|  | ; CHECK-GI-NEXT:    fmov x9, d1 | 
|  | ; CHECK-GI-NEXT:    add x0, x8, x9 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <2 x i32> %x to <2 x i64> | 
|  | %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) | 
|  | %yy = sext <2 x i32> %y to <2 x i64> | 
|  | %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy) | 
|  | %z = add i64 %z1, %z2 | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sum of two add-reductions, each over an <8 x i16> input zero-extended to <8 x i32>. | 
|  | ; SelectionDAG is expected to combine both inputs with uaddlp + uadalp and finish with addv; | 
|  | ; GlobalISel is expected to emit one uaddlv per input and add the scalars. | 
|  | define i32 @add_pair_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v8i16_v8i32_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    uaddlp v1.4s, v1.8h | 
|  | ; CHECK-SD-NEXT:    uadalp v1.4s, v0.8h | 
|  | ; CHECK-SD-NEXT:    addv s0, v1.4s | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v8i16_v8i32_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddlv s0, v0.8h | 
|  | ; CHECK-GI-NEXT:    uaddlv s1, v1.8h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-NEXT:    add w0, w8, w9 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <8 x i16> %x to <8 x i32> | 
|  | %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) | 
|  | %yy = zext <8 x i16> %y to <8 x i32> | 
|  | %z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy) | 
|  | %z = add i32 %z1, %z2 | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Sum of two add-reductions, each over an <8 x i16> input sign-extended to <8 x i32>. | 
|  | ; SelectionDAG is expected to combine both inputs with saddlp + sadalp and finish with addv; | 
|  | ; GlobalISel is expected to emit one saddlv per input and add the scalars. | 
|  | define i32 @add_pair_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v8i16_v8i32_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    saddlp v1.4s, v1.8h | 
|  | ; CHECK-SD-NEXT:    sadalp v1.4s, v0.8h | 
|  | ; CHECK-SD-NEXT:    addv s0, v1.4s | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v8i16_v8i32_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    saddlv s0, v0.8h | 
|  | ; CHECK-GI-NEXT:    saddlv s1, v1.8h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-NEXT:    add w0, w8, w9 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <8 x i16> %x to <8 x i32> | 
|  | %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) | 
|  | %yy = sext <8 x i16> %y to <8 x i32> | 
|  | %z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy) | 
|  | %z = add i32 %z1, %z2 | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Sum of two add-reductions, each over a <4 x i16> input zero-extended to <4 x i32>. | 
|  | ; SelectionDAG is expected to pack both 64-bit inputs into one 128-bit register | 
|  | ; (mov v0.d[1], v1.d[0]) and reduce once with uaddlv over 8 halfwords; | 
|  | ; GlobalISel is expected to emit one uaddlv per input and add the scalars. | 
|  | define i32 @add_pair_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v4i16_v4i32_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0 | 
|  | ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1 | 
|  | ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0] | 
|  | ; CHECK-SD-NEXT:    uaddlv s0, v0.8h | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v4i16_v4i32_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddlv s0, v0.4h | 
|  | ; CHECK-GI-NEXT:    uaddlv s1, v1.4h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-NEXT:    add w0, w8, w9 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <4 x i16> %x to <4 x i32> | 
|  | %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) | 
|  | %yy = zext <4 x i16> %y to <4 x i32> | 
|  | %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy) | 
|  | %z = add i32 %z1, %z2 | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Sum of two add-reductions, each over a <4 x i16> input sign-extended to <4 x i32>. | 
|  | ; SelectionDAG is expected to fold both extensions and the add into one saddl, then addv; | 
|  | ; GlobalISel is expected to emit one saddlv per input and add the scalars. | 
|  | define i32 @add_pair_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v4i16_v4i32_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    saddl v0.4s, v0.4h, v1.4h | 
|  | ; CHECK-SD-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v4i16_v4i32_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    saddlv s0, v0.4h | 
|  | ; CHECK-GI-NEXT:    saddlv s1, v1.4h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-NEXT:    add w0, w8, w9 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <4 x i16> %x to <4 x i32> | 
|  | %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) | 
|  | %yy = sext <4 x i16> %y to <4 x i32> | 
|  | %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy) | 
|  | %z = add i32 %z1, %z2 | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Unsigned dot-product pattern on <8 x i8>: zero-extend both inputs to i32, multiply, add-reduce. | 
|  | ; With +dotprod both selectors are expected to emit udot into a zeroed accumulator, then addp. | 
|  | ; Without it, SelectionDAG is expected to use umull + uaddlv, while GlobalISel widens with | 
|  | ; ushll and uses umull/umlal2 followed by addv. | 
|  | define i32 @test_udot_v8i8(<8 x i8> %a, <8 x i8> %b) { | 
|  | ; CHECK-SD-BASE-LABEL: test_udot_v8i8: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    umull v0.8h, v1.8b, v0.8b | 
|  | ; CHECK-SD-BASE-NEXT:    uaddlv s0, v0.8h | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: test_udot_v8i8: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    udot v2.2s, v1.8b, v0.8b | 
|  | ; CHECK-SD-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: test_udot_v8i8: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    ushll v1.8h, v1.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    umull v2.4s, v1.4h, v0.4h | 
|  | ; CHECK-GI-BASE-NEXT:    umlal2 v2.4s, v1.8h, v0.8h | 
|  | ; CHECK-GI-BASE-NEXT:    addv s0, v2.4s | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: test_udot_v8i8: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    udot v2.2s, v1.8b, v0.8b | 
|  | ; CHECK-GI-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %0 = zext <8 x i8> %a to <8 x i32> | 
|  | %1 = zext <8 x i8> %b to <8 x i32> | 
|  | %2 = mul nuw nsw <8 x i32> %1, %0 | 
|  | %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2) | 
|  | ret i32 %3 | 
|  | } | 
|  |  | 
|  | ; Unsigned dot-product pattern on <16 x i8>: zero-extend both inputs to i32, multiply, add-reduce. | 
|  | ; With +dotprod both selectors are expected to emit one 128-bit udot into a zeroed | 
|  | ; accumulator, then addv. Without it, SelectionDAG is expected to use umull/umull2 with | 
|  | ; uaddl/uaddl2, while GlobalISel widens with ushll/ushll2 and uses umull/umlal2. | 
|  | define i32 @test_udot_v16i8(<16 x i8> %a, <16 x i8> %b) { | 
|  | ; CHECK-SD-BASE-LABEL: test_udot_v16i8: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    umull2 v2.8h, v1.16b, v0.16b | 
|  | ; CHECK-SD-BASE-NEXT:    umull v0.8h, v1.8b, v0.8b | 
|  | ; CHECK-SD-BASE-NEXT:    uaddl2 v1.4s, v0.8h, v2.8h | 
|  | ; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v2.4h | 
|  | ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s | 
|  | ; CHECK-SD-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: test_udot_v16i8: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    udot v2.4s, v1.16b, v0.16b | 
|  | ; CHECK-SD-DOT-NEXT:    addv s0, v2.4s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: test_udot_v16i8: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    ushll v2.8h, v0.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    ushll v3.8h, v1.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    umull v4.4s, v3.4h, v2.4h | 
|  | ; CHECK-GI-BASE-NEXT:    umull v5.4s, v1.4h, v0.4h | 
|  | ; CHECK-GI-BASE-NEXT:    umlal2 v4.4s, v3.8h, v2.8h | 
|  | ; CHECK-GI-BASE-NEXT:    umlal2 v5.4s, v1.8h, v0.8h | 
|  | ; CHECK-GI-BASE-NEXT:    add v0.4s, v4.4s, v5.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: test_udot_v16i8: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    udot v2.4s, v1.16b, v0.16b | 
|  | ; CHECK-GI-DOT-NEXT:    addv s0, v2.4s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %0 = zext <16 x i8> %a to <16 x i32> | 
|  | %1 = zext <16 x i8> %b to <16 x i32> | 
|  | %2 = mul nuw nsw <16 x i32> %1, %0 | 
|  | %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2) | 
|  | ret i32 %3 | 
|  | } | 
|  |  | 
|  | ; Unsigned dot-product pattern on <24 x i8> vectors loaded from %p1 and %p2. | 
|  | ; In every configuration each input is expected to be loaded as a 16-byte part (ldr q) | 
|  | ; plus an 8-byte tail at offset #16 (ldr d). | 
|  | ; With +dotprod, SelectionDAG is expected to use a 64-bit udot for the tail and a 128-bit | 
|  | ; udot for the rest, reduce each separately and add the scalars; GlobalISel is expected to | 
|  | ; use two 128-bit udot instructions, add the vector accumulators, and reduce once with addv. | 
|  | define i32 @test_udot_v24i8(ptr %p1, ptr %p2) { | 
|  | ; CHECK-SD-BASE-LABEL: test_udot_v24i8: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    ldr q0, [x0] | 
|  | ; CHECK-SD-BASE-NEXT:    ldr q1, [x1] | 
|  | ; CHECK-SD-BASE-NEXT:    ldr d2, [x0, #16] | 
|  | ; CHECK-SD-BASE-NEXT:    ldr d3, [x1, #16] | 
|  | ; CHECK-SD-BASE-NEXT:    umull v2.8h, v3.8b, v2.8b | 
|  | ; CHECK-SD-BASE-NEXT:    umull v3.8h, v1.8b, v0.8b | 
|  | ; CHECK-SD-BASE-NEXT:    umull2 v0.8h, v1.16b, v0.16b | 
|  | ; CHECK-SD-BASE-NEXT:    uaddl2 v1.4s, v3.8h, v2.8h | 
|  | ; CHECK-SD-BASE-NEXT:    uaddl v2.4s, v3.4h, v2.4h | 
|  | ; CHECK-SD-BASE-NEXT:    uaddw2 v1.4s, v1.4s, v0.8h | 
|  | ; CHECK-SD-BASE-NEXT:    uaddw v0.4s, v2.4s, v0.4h | 
|  | ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s | 
|  | ; CHECK-SD-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: test_udot_v24i8: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    movi v0.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v1.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    ldr q2, [x0] | 
|  | ; CHECK-SD-DOT-NEXT:    ldr q3, [x1] | 
|  | ; CHECK-SD-DOT-NEXT:    ldr d4, [x0, #16] | 
|  | ; CHECK-SD-DOT-NEXT:    ldr d5, [x1, #16] | 
|  | ; CHECK-SD-DOT-NEXT:    udot v1.2s, v5.8b, v4.8b | 
|  | ; CHECK-SD-DOT-NEXT:    udot v0.4s, v3.16b, v2.16b | 
|  | ; CHECK-SD-DOT-NEXT:    addp v1.2s, v1.2s, v1.2s | 
|  | ; CHECK-SD-DOT-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w8, s1 | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w9, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    add w0, w9, w8 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: test_udot_v24i8: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    ldr q0, [x0] | 
|  | ; CHECK-GI-BASE-NEXT:    ldr q1, [x1] | 
|  | ; CHECK-GI-BASE-NEXT:    ldr d2, [x0, #16] | 
|  | ; CHECK-GI-BASE-NEXT:    ldr d3, [x1, #16] | 
|  | ; CHECK-GI-BASE-NEXT:    ushll v4.8h, v0.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    ushll v5.8h, v1.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    ushll v2.8h, v2.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    ushll v3.8h, v3.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    umull v6.4s, v5.4h, v4.4h | 
|  | ; CHECK-GI-BASE-NEXT:    umull2 v4.4s, v5.8h, v4.8h | 
|  | ; CHECK-GI-BASE-NEXT:    umull2 v5.4s, v1.8h, v0.8h | 
|  | ; CHECK-GI-BASE-NEXT:    umull v7.4s, v3.4h, v2.4h | 
|  | ; CHECK-GI-BASE-NEXT:    umull v0.4s, v1.4h, v0.4h | 
|  | ; CHECK-GI-BASE-NEXT:    umull2 v1.4s, v3.8h, v2.8h | 
|  | ; CHECK-GI-BASE-NEXT:    addv s2, v6.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s3, v4.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s4, v5.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s5, v7.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s1, v1.4s | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w8, s2 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w9, s3 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w10, s4 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w11, s5 | 
|  | ; CHECK-GI-BASE-NEXT:    add w8, w8, w9 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w9, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    add w10, w10, w11 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w11, s1 | 
|  | ; CHECK-GI-BASE-NEXT:    add w8, w8, w9 | 
|  | ; CHECK-GI-BASE-NEXT:    add w9, w10, w11 | 
|  | ; CHECK-GI-BASE-NEXT:    add w0, w8, w9 | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: test_udot_v24i8: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr q2, [x0] | 
|  | ; CHECK-GI-DOT-NEXT:    ldr d3, [x0, #16] | 
|  | ; CHECK-GI-DOT-NEXT:    ldr q4, [x1] | 
|  | ; CHECK-GI-DOT-NEXT:    ldr d5, [x1, #16] | 
|  | ; CHECK-GI-DOT-NEXT:    udot v1.4s, v4.16b, v2.16b | 
|  | ; CHECK-GI-DOT-NEXT:    udot v0.4s, v5.16b, v3.16b | 
|  | ; CHECK-GI-DOT-NEXT:    add v0.4s, v1.4s, v0.4s | 
|  | ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %a = load <24 x i8>, ptr %p1 | 
|  | %b = load <24 x i8>, ptr %p2 | 
|  | %0 = zext <24 x i8> %a to <24 x i32> | 
|  | %1 = zext <24 x i8> %b to <24 x i32> | 
|  | %2 = mul nuw nsw <24 x i32> %1, %0 | 
|  | %3 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %2) | 
|  | ret i32 %3 | 
|  | } | 
|  |  | 
|  | ; Unsigned dot-product pattern on <48 x i8> vectors loaded from %p1 and %p2. | 
|  | ; Each input is expected to be loaded as three 16-byte registers (an ldp pair plus one ldr). | 
|  | ; With +dotprod, SelectionDAG is expected to chain three udot instructions into a single | 
|  | ; accumulator and reduce once with addv; GlobalISel is expected to keep three separate | 
|  | ; accumulators, reduce each with addv, and add the scalars. | 
|  | ; Without +dotprod, GlobalISel is expected to reduce every widened product with its own | 
|  | ; addv and sum the results in general-purpose registers. | 
|  | define i32 @test_udot_v48i8(ptr %p1, ptr %p2) { | 
|  | ; CHECK-SD-BASE-LABEL: test_udot_v48i8: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    ldp q4, q0, [x0, #16] | 
|  | ; CHECK-SD-BASE-NEXT:    ldr q2, [x1, #32] | 
|  | ; CHECK-SD-BASE-NEXT:    ldp q1, q5, [x1] | 
|  | ; CHECK-SD-BASE-NEXT:    ldr q3, [x0] | 
|  | ; CHECK-SD-BASE-NEXT:    umull2 v6.8h, v2.16b, v0.16b | 
|  | ; CHECK-SD-BASE-NEXT:    umull v0.8h, v2.8b, v0.8b | 
|  | ; CHECK-SD-BASE-NEXT:    umull2 v7.8h, v1.16b, v3.16b | 
|  | ; CHECK-SD-BASE-NEXT:    umull v1.8h, v1.8b, v3.8b | 
|  | ; CHECK-SD-BASE-NEXT:    umull2 v2.8h, v5.16b, v4.16b | 
|  | ; CHECK-SD-BASE-NEXT:    umull v3.8h, v5.8b, v4.8b | 
|  | ; CHECK-SD-BASE-NEXT:    uaddl2 v4.4s, v7.8h, v6.8h | 
|  | ; CHECK-SD-BASE-NEXT:    uaddl2 v5.4s, v1.8h, v0.8h | 
|  | ; CHECK-SD-BASE-NEXT:    uaddl v6.4s, v7.4h, v6.4h | 
|  | ; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v1.4h, v0.4h | 
|  | ; CHECK-SD-BASE-NEXT:    uaddw2 v1.4s, v4.4s, v2.8h | 
|  | ; CHECK-SD-BASE-NEXT:    uaddw2 v4.4s, v5.4s, v3.8h | 
|  | ; CHECK-SD-BASE-NEXT:    uaddw v2.4s, v6.4s, v2.4h | 
|  | ; CHECK-SD-BASE-NEXT:    uaddw v0.4s, v0.4s, v3.4h | 
|  | ; CHECK-SD-BASE-NEXT:    add v1.4s, v4.4s, v1.4s | 
|  | ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s | 
|  | ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s | 
|  | ; CHECK-SD-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: test_udot_v48i8: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    movi v0.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    ldr q1, [x0, #32] | 
|  | ; CHECK-SD-DOT-NEXT:    ldr q2, [x1, #32] | 
|  | ; CHECK-SD-DOT-NEXT:    udot v0.4s, v2.16b, v1.16b | 
|  | ; CHECK-SD-DOT-NEXT:    ldp q3, q1, [x0] | 
|  | ; CHECK-SD-DOT-NEXT:    ldp q4, q2, [x1] | 
|  | ; CHECK-SD-DOT-NEXT:    udot v0.4s, v4.16b, v3.16b | 
|  | ; CHECK-SD-DOT-NEXT:    udot v0.4s, v2.16b, v1.16b | 
|  | ; CHECK-SD-DOT-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: test_udot_v48i8: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    ldp q0, q3, [x1] | 
|  | ; CHECK-GI-BASE-NEXT:    ldr q6, [x0, #32] | 
|  | ; CHECK-GI-BASE-NEXT:    ldp q1, q2, [x0] | 
|  | ; CHECK-GI-BASE-NEXT:    ldr q7, [x1, #32] | 
|  | ; CHECK-GI-BASE-NEXT:    ushll v20.8h, v6.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    ushll2 v6.8h, v6.16b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    ushll v4.8h, v0.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    ushll v16.8h, v3.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    ushll v5.8h, v1.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    ushll v17.8h, v2.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    ushll2 v3.8h, v3.16b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    ushll2 v2.8h, v2.16b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    umull v18.4s, v4.4h, v5.4h | 
|  | ; CHECK-GI-BASE-NEXT:    umull2 v4.4s, v4.8h, v5.8h | 
|  | ; CHECK-GI-BASE-NEXT:    umull v5.4s, v0.4h, v1.4h | 
|  | ; CHECK-GI-BASE-NEXT:    umull2 v0.4s, v0.8h, v1.8h | 
|  | ; CHECK-GI-BASE-NEXT:    umull v19.4s, v16.4h, v17.4h | 
|  | ; CHECK-GI-BASE-NEXT:    ushll v1.8h, v7.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    umull2 v16.4s, v16.8h, v17.8h | 
|  | ; CHECK-GI-BASE-NEXT:    umull v17.4s, v3.4h, v2.4h | 
|  | ; CHECK-GI-BASE-NEXT:    umull2 v2.4s, v3.8h, v2.8h | 
|  | ; CHECK-GI-BASE-NEXT:    ushll2 v7.8h, v7.16b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    addv s18, v18.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s4, v4.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s5, v5.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s19, v19.4s | 
|  | ; CHECK-GI-BASE-NEXT:    umull v3.4s, v1.4h, v20.4h | 
|  | ; CHECK-GI-BASE-NEXT:    addv s2, v2.4s | 
|  | ; CHECK-GI-BASE-NEXT:    umull2 v1.4s, v1.8h, v20.8h | 
|  | ; CHECK-GI-BASE-NEXT:    umull v20.4s, v7.4h, v6.4h | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w8, s18 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w9, s4 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w10, s5 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w11, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w12, s19 | 
|  | ; CHECK-GI-BASE-NEXT:    addv s4, v16.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s5, v17.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s3, v3.4s | 
|  | ; CHECK-GI-BASE-NEXT:    umull2 v0.4s, v7.8h, v6.8h | 
|  | ; CHECK-GI-BASE-NEXT:    add w8, w8, w9 | 
|  | ; CHECK-GI-BASE-NEXT:    addv s1, v1.4s | 
|  | ; CHECK-GI-BASE-NEXT:    add w9, w11, w12 | 
|  | ; CHECK-GI-BASE-NEXT:    add w8, w8, w10 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w10, s4 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w11, s5 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w12, s2 | 
|  | ; CHECK-GI-BASE-NEXT:    addv s4, v20.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-GI-BASE-NEXT:    add w9, w9, w10 | 
|  | ; CHECK-GI-BASE-NEXT:    add w10, w11, w12 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w11, s3 | 
|  | ; CHECK-GI-BASE-NEXT:    add w8, w8, w9 | 
|  | ; CHECK-GI-BASE-NEXT:    add w9, w10, w11 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w10, s1 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w11, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    add w9, w9, w10 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w10, s4 | 
|  | ; CHECK-GI-BASE-NEXT:    add w8, w8, w9 | 
|  | ; CHECK-GI-BASE-NEXT:    add w9, w10, w11 | 
|  | ; CHECK-GI-BASE-NEXT:    add w0, w8, w9 | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: test_udot_v48i8: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr q7, [x0, #32] | 
|  | ; CHECK-GI-DOT-NEXT:    ldp q3, q4, [x0] | 
|  | ; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    ldp q5, q6, [x1] | 
|  | ; CHECK-GI-DOT-NEXT:    ldr q16, [x1, #32] | 
|  | ; CHECK-GI-DOT-NEXT:    udot v0.4s, v5.16b, v3.16b | 
|  | ; CHECK-GI-DOT-NEXT:    udot v1.4s, v6.16b, v4.16b | 
|  | ; CHECK-GI-DOT-NEXT:    udot v2.4s, v16.16b, v7.16b | 
|  | ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-GI-DOT-NEXT:    addv s1, v1.4s | 
|  | ; CHECK-GI-DOT-NEXT:    addv s2, v2.4s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-DOT-NEXT:    add w8, w8, w9 | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w9, s2 | 
|  | ; CHECK-GI-DOT-NEXT:    add w0, w8, w9 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %a = load <48 x i8>, ptr %p1 | 
|  | %b = load <48 x i8>, ptr %p2 | 
|  | %0 = zext <48 x i8> %a to <48 x i32> | 
|  | %1 = zext <48 x i8> %b to <48 x i32> | 
|  | %2 = mul nuw nsw <48 x i32> %1, %0 | 
|  | %3 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %2) | 
|  | ret i32 %3 | 
|  | } | 
|  |  | 
|  | ; Signed dot-product pattern on <8 x i8>: sign-extend both inputs to i32, multiply, add-reduce. | 
|  | ; With +dotprod both selectors are expected to emit sdot into a zeroed accumulator, then addp. | 
|  | ; Without it, SelectionDAG is expected to use smull + saddlv, while GlobalISel widens with | 
|  | ; sshll and uses smull/smlal2 followed by addv. | 
|  | ; NOTE(review): the multiply carries `nuw` although its operands are sign-extended; this | 
|  | ; matches the flags used by the udot tests in this file - confirm the flag is intentional. | 
|  | define i32 @test_sdot_v8i8(<8 x i8> %a, <8 x i8> %b) { | 
|  | ; CHECK-SD-BASE-LABEL: test_sdot_v8i8: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    smull v0.8h, v1.8b, v0.8b | 
|  | ; CHECK-SD-BASE-NEXT:    saddlv s0, v0.8h | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: test_sdot_v8i8: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    sdot v2.2s, v1.8b, v0.8b | 
|  | ; CHECK-SD-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: test_sdot_v8i8: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    sshll v1.8h, v1.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    smull v2.4s, v1.4h, v0.4h | 
|  | ; CHECK-GI-BASE-NEXT:    smlal2 v2.4s, v1.8h, v0.8h | 
|  | ; CHECK-GI-BASE-NEXT:    addv s0, v2.4s | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: test_sdot_v8i8: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    sdot v2.2s, v1.8b, v0.8b | 
|  | ; CHECK-GI-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %0 = sext <8 x i8> %a to <8 x i32> | 
|  | %1 = sext <8 x i8> %b to <8 x i32> | 
|  | %2 = mul nuw nsw <8 x i32> %1, %0 | 
|  | %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2) | 
|  | ret i32 %3 | 
|  | } | 
|  |  | 
|  | ; Signed dot-product pattern on <16 x i8>: sign-extend both inputs to i32, multiply, add-reduce. | 
|  | ; With +dotprod both selectors are expected to emit one 128-bit sdot into a zeroed | 
|  | ; accumulator, then addv. Without it, SelectionDAG is expected to use smull/smull2 with | 
|  | ; saddl/saddl2, while GlobalISel widens with sshll/sshll2 and uses smull/smlal2. | 
|  | ; NOTE(review): the multiply carries `nuw` although its operands are sign-extended; this | 
|  | ; matches the flags used by the udot tests in this file - confirm the flag is intentional. | 
|  | define i32 @test_sdot_v16i8(<16 x i8> %a, <16 x i8> %b) { | 
|  | ; CHECK-SD-BASE-LABEL: test_sdot_v16i8: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    smull2 v2.8h, v1.16b, v0.16b | 
|  | ; CHECK-SD-BASE-NEXT:    smull v0.8h, v1.8b, v0.8b | 
|  | ; CHECK-SD-BASE-NEXT:    saddl2 v1.4s, v0.8h, v2.8h | 
|  | ; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v2.4h | 
|  | ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s | 
|  | ; CHECK-SD-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: test_sdot_v16i8: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    sdot v2.4s, v1.16b, v0.16b | 
|  | ; CHECK-SD-DOT-NEXT:    addv s0, v2.4s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: test_sdot_v16i8: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    sshll v2.8h, v0.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    sshll v3.8h, v1.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    smull v4.4s, v3.4h, v2.4h | 
|  | ; CHECK-GI-BASE-NEXT:    smull v5.4s, v1.4h, v0.4h | 
|  | ; CHECK-GI-BASE-NEXT:    smlal2 v4.4s, v3.8h, v2.8h | 
|  | ; CHECK-GI-BASE-NEXT:    smlal2 v5.4s, v1.8h, v0.8h | 
|  | ; CHECK-GI-BASE-NEXT:    add v0.4s, v4.4s, v5.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: test_sdot_v16i8: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    sdot v2.4s, v1.16b, v0.16b | 
|  | ; CHECK-GI-DOT-NEXT:    addv s0, v2.4s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %0 = sext <16 x i8> %a to <16 x i32> | 
|  | %1 = sext <16 x i8> %b to <16 x i32> | 
|  | %2 = mul nuw nsw <16 x i32> %1, %0 | 
|  | %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2) | 
|  | ret i32 %3 | 
|  | } | 
|  |  | 
|  | ; Signed dot-product reduction over a non-power-of-two width: <24 x i8> is | 
|  | ; loaded from each pointer, sign-extended to i32, multiplied lane-wise and | 
|  | ; add-reduced. The assertions show the input being handled as a 16-byte part | 
|  | ; (ldr q) plus an 8-byte tail (ldr d at offset 16). With +dotprod, SelectionDAG | 
|  | ; uses a .4s sdot and a .2s sdot that are reduced separately and summed in | 
|  | ; general-purpose registers, while GlobalISel uses two .4s sdots whose | 
|  | ; accumulators are added as vectors before a single addv. | 
|  | define i32 @test_sdot_v24i8(ptr %p1, ptr %p2) { | 
|  | ; CHECK-SD-BASE-LABEL: test_sdot_v24i8: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    ldr q0, [x0] | 
|  | ; CHECK-SD-BASE-NEXT:    ldr q1, [x1] | 
|  | ; CHECK-SD-BASE-NEXT:    ldr d2, [x0, #16] | 
|  | ; CHECK-SD-BASE-NEXT:    ldr d3, [x1, #16] | 
|  | ; CHECK-SD-BASE-NEXT:    smull v2.8h, v3.8b, v2.8b | 
|  | ; CHECK-SD-BASE-NEXT:    smull v3.8h, v1.8b, v0.8b | 
|  | ; CHECK-SD-BASE-NEXT:    smull2 v0.8h, v1.16b, v0.16b | 
|  | ; CHECK-SD-BASE-NEXT:    saddl2 v1.4s, v3.8h, v2.8h | 
|  | ; CHECK-SD-BASE-NEXT:    saddl v2.4s, v3.4h, v2.4h | 
|  | ; CHECK-SD-BASE-NEXT:    saddw2 v1.4s, v1.4s, v0.8h | 
|  | ; CHECK-SD-BASE-NEXT:    saddw v0.4s, v2.4s, v0.4h | 
|  | ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s | 
|  | ; CHECK-SD-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: test_sdot_v24i8: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    movi v0.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v1.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    ldr q2, [x0] | 
|  | ; CHECK-SD-DOT-NEXT:    ldr q3, [x1] | 
|  | ; CHECK-SD-DOT-NEXT:    ldr d4, [x0, #16] | 
|  | ; CHECK-SD-DOT-NEXT:    ldr d5, [x1, #16] | 
|  | ; CHECK-SD-DOT-NEXT:    sdot v1.2s, v5.8b, v4.8b | 
|  | ; CHECK-SD-DOT-NEXT:    sdot v0.4s, v3.16b, v2.16b | 
|  | ; CHECK-SD-DOT-NEXT:    addp v1.2s, v1.2s, v1.2s | 
|  | ; CHECK-SD-DOT-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w8, s1 | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w9, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    add w0, w9, w8 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: test_sdot_v24i8: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    ldr q0, [x0] | 
|  | ; CHECK-GI-BASE-NEXT:    ldr q1, [x1] | 
|  | ; CHECK-GI-BASE-NEXT:    ldr d2, [x0, #16] | 
|  | ; CHECK-GI-BASE-NEXT:    ldr d3, [x1, #16] | 
|  | ; CHECK-GI-BASE-NEXT:    sshll v4.8h, v0.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    sshll v5.8h, v1.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    sshll v2.8h, v2.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    sshll v3.8h, v3.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    smull v6.4s, v5.4h, v4.4h | 
|  | ; CHECK-GI-BASE-NEXT:    smull2 v4.4s, v5.8h, v4.8h | 
|  | ; CHECK-GI-BASE-NEXT:    smull2 v5.4s, v1.8h, v0.8h | 
|  | ; CHECK-GI-BASE-NEXT:    smull v7.4s, v3.4h, v2.4h | 
|  | ; CHECK-GI-BASE-NEXT:    smull v0.4s, v1.4h, v0.4h | 
|  | ; CHECK-GI-BASE-NEXT:    smull2 v1.4s, v3.8h, v2.8h | 
|  | ; CHECK-GI-BASE-NEXT:    addv s2, v6.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s3, v4.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s4, v5.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s5, v7.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s1, v1.4s | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w8, s2 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w9, s3 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w10, s4 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w11, s5 | 
|  | ; CHECK-GI-BASE-NEXT:    add w8, w8, w9 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w9, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    add w10, w10, w11 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w11, s1 | 
|  | ; CHECK-GI-BASE-NEXT:    add w8, w8, w9 | 
|  | ; CHECK-GI-BASE-NEXT:    add w9, w10, w11 | 
|  | ; CHECK-GI-BASE-NEXT:    add w0, w8, w9 | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: test_sdot_v24i8: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr q2, [x0] | 
|  | ; CHECK-GI-DOT-NEXT:    ldr d3, [x0, #16] | 
|  | ; CHECK-GI-DOT-NEXT:    ldr q4, [x1] | 
|  | ; CHECK-GI-DOT-NEXT:    ldr d5, [x1, #16] | 
|  | ; CHECK-GI-DOT-NEXT:    sdot v1.4s, v4.16b, v2.16b | 
|  | ; CHECK-GI-DOT-NEXT:    sdot v0.4s, v5.16b, v3.16b | 
|  | ; CHECK-GI-DOT-NEXT:    add v0.4s, v1.4s, v0.4s | 
|  | ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %a = load <24 x i8>, ptr %p1 | 
|  | %b = load <24 x i8>, ptr %p2 | 
|  | %0 = sext <24 x i8> %a to <24 x i32> | 
|  | %1 = sext <24 x i8> %b to <24 x i32> | 
|  | ; NOTE(review): the mul carries nuw although its operands are sign-extended; | 
|  | ; confirm the flag is intended rather than carried over from a zext variant. | 
|  | %2 = mul nuw nsw <24 x i32> %1, %0 | 
|  | %3 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %2) | 
|  | ret i32 %3 | 
|  | } | 
|  |  | 
|  | ; Signed dot-product reduction over three 128-bit chunks: <48 x i8> is loaded | 
|  | ; from each pointer, sign-extended to i32, multiplied lane-wise and | 
|  | ; add-reduced. With +dotprod, the SelectionDAG assertions chain three sdots | 
|  | ; into one accumulator and finish with a single addv; the GlobalISel | 
|  | ; assertions use three separate accumulators, three addv reductions and | 
|  | ; scalar adds. | 
|  | define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) { | 
|  | ; CHECK-SD-BASE-LABEL: test_sdot_v48i8: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    ldp q4, q0, [x0, #16] | 
|  | ; CHECK-SD-BASE-NEXT:    ldr q2, [x1, #32] | 
|  | ; CHECK-SD-BASE-NEXT:    ldp q1, q5, [x1] | 
|  | ; CHECK-SD-BASE-NEXT:    ldr q3, [x0] | 
|  | ; CHECK-SD-BASE-NEXT:    smull2 v6.8h, v2.16b, v0.16b | 
|  | ; CHECK-SD-BASE-NEXT:    smull v0.8h, v2.8b, v0.8b | 
|  | ; CHECK-SD-BASE-NEXT:    smull2 v7.8h, v1.16b, v3.16b | 
|  | ; CHECK-SD-BASE-NEXT:    smull v1.8h, v1.8b, v3.8b | 
|  | ; CHECK-SD-BASE-NEXT:    smull2 v2.8h, v5.16b, v4.16b | 
|  | ; CHECK-SD-BASE-NEXT:    smull v3.8h, v5.8b, v4.8b | 
|  | ; CHECK-SD-BASE-NEXT:    saddl2 v4.4s, v7.8h, v6.8h | 
|  | ; CHECK-SD-BASE-NEXT:    saddl2 v5.4s, v1.8h, v0.8h | 
|  | ; CHECK-SD-BASE-NEXT:    saddl v6.4s, v7.4h, v6.4h | 
|  | ; CHECK-SD-BASE-NEXT:    saddl v0.4s, v1.4h, v0.4h | 
|  | ; CHECK-SD-BASE-NEXT:    saddw2 v1.4s, v4.4s, v2.8h | 
|  | ; CHECK-SD-BASE-NEXT:    saddw2 v4.4s, v5.4s, v3.8h | 
|  | ; CHECK-SD-BASE-NEXT:    saddw v2.4s, v6.4s, v2.4h | 
|  | ; CHECK-SD-BASE-NEXT:    saddw v0.4s, v0.4s, v3.4h | 
|  | ; CHECK-SD-BASE-NEXT:    add v1.4s, v4.4s, v1.4s | 
|  | ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s | 
|  | ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s | 
|  | ; CHECK-SD-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: test_sdot_v48i8: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    movi v0.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    ldr q1, [x0, #32] | 
|  | ; CHECK-SD-DOT-NEXT:    ldr q2, [x1, #32] | 
|  | ; CHECK-SD-DOT-NEXT:    sdot v0.4s, v2.16b, v1.16b | 
|  | ; CHECK-SD-DOT-NEXT:    ldp q3, q1, [x0] | 
|  | ; CHECK-SD-DOT-NEXT:    ldp q4, q2, [x1] | 
|  | ; CHECK-SD-DOT-NEXT:    sdot v0.4s, v4.16b, v3.16b | 
|  | ; CHECK-SD-DOT-NEXT:    sdot v0.4s, v2.16b, v1.16b | 
|  | ; CHECK-SD-DOT-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: test_sdot_v48i8: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    ldp q0, q3, [x1] | 
|  | ; CHECK-GI-BASE-NEXT:    ldr q6, [x0, #32] | 
|  | ; CHECK-GI-BASE-NEXT:    ldp q1, q2, [x0] | 
|  | ; CHECK-GI-BASE-NEXT:    ldr q7, [x1, #32] | 
|  | ; CHECK-GI-BASE-NEXT:    sshll v20.8h, v6.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    sshll2 v6.8h, v6.16b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    sshll v4.8h, v0.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    sshll v16.8h, v3.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    sshll v5.8h, v1.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    sshll v17.8h, v2.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    sshll2 v3.8h, v3.16b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    sshll2 v2.8h, v2.16b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    smull v18.4s, v4.4h, v5.4h | 
|  | ; CHECK-GI-BASE-NEXT:    smull2 v4.4s, v4.8h, v5.8h | 
|  | ; CHECK-GI-BASE-NEXT:    smull v5.4s, v0.4h, v1.4h | 
|  | ; CHECK-GI-BASE-NEXT:    smull2 v0.4s, v0.8h, v1.8h | 
|  | ; CHECK-GI-BASE-NEXT:    smull v19.4s, v16.4h, v17.4h | 
|  | ; CHECK-GI-BASE-NEXT:    sshll v1.8h, v7.8b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    smull2 v16.4s, v16.8h, v17.8h | 
|  | ; CHECK-GI-BASE-NEXT:    smull v17.4s, v3.4h, v2.4h | 
|  | ; CHECK-GI-BASE-NEXT:    smull2 v2.4s, v3.8h, v2.8h | 
|  | ; CHECK-GI-BASE-NEXT:    sshll2 v7.8h, v7.16b, #0 | 
|  | ; CHECK-GI-BASE-NEXT:    addv s18, v18.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s4, v4.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s5, v5.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s19, v19.4s | 
|  | ; CHECK-GI-BASE-NEXT:    smull v3.4s, v1.4h, v20.4h | 
|  | ; CHECK-GI-BASE-NEXT:    addv s2, v2.4s | 
|  | ; CHECK-GI-BASE-NEXT:    smull2 v1.4s, v1.8h, v20.8h | 
|  | ; CHECK-GI-BASE-NEXT:    smull v20.4s, v7.4h, v6.4h | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w8, s18 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w9, s4 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w10, s5 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w11, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w12, s19 | 
|  | ; CHECK-GI-BASE-NEXT:    addv s4, v16.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s5, v17.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s3, v3.4s | 
|  | ; CHECK-GI-BASE-NEXT:    smull2 v0.4s, v7.8h, v6.8h | 
|  | ; CHECK-GI-BASE-NEXT:    add w8, w8, w9 | 
|  | ; CHECK-GI-BASE-NEXT:    addv s1, v1.4s | 
|  | ; CHECK-GI-BASE-NEXT:    add w9, w11, w12 | 
|  | ; CHECK-GI-BASE-NEXT:    add w8, w8, w10 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w10, s4 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w11, s5 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w12, s2 | 
|  | ; CHECK-GI-BASE-NEXT:    addv s4, v20.4s | 
|  | ; CHECK-GI-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-GI-BASE-NEXT:    add w9, w9, w10 | 
|  | ; CHECK-GI-BASE-NEXT:    add w10, w11, w12 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w11, s3 | 
|  | ; CHECK-GI-BASE-NEXT:    add w8, w8, w9 | 
|  | ; CHECK-GI-BASE-NEXT:    add w9, w10, w11 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w10, s1 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w11, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    add w9, w9, w10 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w10, s4 | 
|  | ; CHECK-GI-BASE-NEXT:    add w8, w8, w9 | 
|  | ; CHECK-GI-BASE-NEXT:    add w9, w10, w11 | 
|  | ; CHECK-GI-BASE-NEXT:    add w0, w8, w9 | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: test_sdot_v48i8: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr q7, [x0, #32] | 
|  | ; CHECK-GI-DOT-NEXT:    ldp q3, q4, [x0] | 
|  | ; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    ldp q5, q6, [x1] | 
|  | ; CHECK-GI-DOT-NEXT:    ldr q16, [x1, #32] | 
|  | ; CHECK-GI-DOT-NEXT:    sdot v0.4s, v5.16b, v3.16b | 
|  | ; CHECK-GI-DOT-NEXT:    sdot v1.4s, v6.16b, v4.16b | 
|  | ; CHECK-GI-DOT-NEXT:    sdot v2.4s, v16.16b, v7.16b | 
|  | ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-GI-DOT-NEXT:    addv s1, v1.4s | 
|  | ; CHECK-GI-DOT-NEXT:    addv s2, v2.4s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-DOT-NEXT:    add w8, w8, w9 | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w9, s2 | 
|  | ; CHECK-GI-DOT-NEXT:    add w0, w8, w9 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %a = load <48 x i8>, ptr %p1 | 
|  | %b = load <48 x i8>, ptr %p2 | 
|  | %0 = sext <48 x i8> %a to <48 x i32> | 
|  | %1 = sext <48 x i8> %b to <48 x i32> | 
|  | ; NOTE(review): the mul carries nuw although its operands are sign-extended; | 
|  | ; confirm the flag is intended rather than carried over from a zext variant. | 
|  | %2 = mul nuw nsw <48 x i32> %1, %0 | 
|  | %3 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %2) | 
|  | ret i32 %3 | 
|  | } | 
|  |  | 
|  | ; Test to ensure that if G_MUL has more than 1 use, it should not be combined to UDOT | 
|  | ; The widened product %2 has two users here: the add reduction and an | 
|  | ; extractelement of lane 0, whose results are summed. The GlobalISel | 
|  | ; assertions are shared by the base and +dotprod runs and contain no udot. | 
|  | ; The SelectionDAG +dotprod assertions do contain a udot, alongside a separate | 
|  | ; umull that supplies the extracted lane. | 
|  | define i32 @test_udot_v8i8_multi_use(<8 x i8> %a, <8 x i8> %b) { | 
|  | ; CHECK-SD-BASE-LABEL: test_udot_v8i8_multi_use: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    umull v0.8h, v1.8b, v0.8b | 
|  | ; CHECK-SD-BASE-NEXT:    uaddlv s1, v0.8h | 
|  | ; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w9, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w8, s1 | 
|  | ; CHECK-SD-BASE-NEXT:    add w0, w8, w9 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: test_udot_v8i8_multi_use: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    umull v3.8h, v1.8b, v0.8b | 
|  | ; CHECK-SD-DOT-NEXT:    udot v2.2s, v1.8b, v0.8b | 
|  | ; CHECK-SD-DOT-NEXT:    ushll v0.4s, v3.4h, #0 | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w9, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    addp v1.2s, v2.2s, v2.2s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w8, s1 | 
|  | ; CHECK-SD-DOT-NEXT:    add w0, w8, w9 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: test_udot_v8i8_multi_use: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0 | 
|  | ; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0 | 
|  | ; CHECK-GI-NEXT:    umull v2.4s, v1.4h, v0.4h | 
|  | ; CHECK-GI-NEXT:    mov v3.16b, v2.16b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s2 | 
|  | ; CHECK-GI-NEXT:    umlal2 v3.4s, v1.8h, v0.8h | 
|  | ; CHECK-GI-NEXT:    addv s0, v3.4s | 
|  | ; CHECK-GI-NEXT:    fmov w9, s0 | 
|  | ; CHECK-GI-NEXT:    add w0, w9, w8 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %0 = zext <8 x i8> %a to <8 x i32> | 
|  | %1 = zext <8 x i8> %b to <8 x i32> | 
|  | %2 = mul nuw nsw <8 x i32> %1, %0 | 
|  | %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2) | 
|  | ; Second use of %2: this is what makes the multiply multi-use. | 
|  | %4 = extractelement <8 x i32> %2, i32 0 | 
|  | %5 = add nuw nsw i32 %3, %4 | 
|  | ret i32 %5 | 
|  | } | 
|  |  | 
|  | ; Sum of two independent add reductions over <8 x i16>, with no widening, | 
|  | ; returned as a zeroext i16. The SelectionDAG assertions add the two vectors | 
|  | ; first and reduce once; the GlobalISel assertions reduce each vector | 
|  | ; separately, add the scalars and mask the result to 16 bits. | 
|  | define zeroext i16 @add_pair_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v8i16_v8i16: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    add v0.8h, v0.8h, v1.8h | 
|  | ; CHECK-SD-NEXT:    addv h0, v0.8h | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v8i16_v8i16: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    addv h0, v0.8h | 
|  | ; CHECK-GI-NEXT:    addv h1, v1.8h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-NEXT:    add w8, w9, w8, uxth | 
|  | ; CHECK-GI-NEXT:    and w0, w8, #0xffff | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x) | 
|  | %z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %y) | 
|  | %z = add i16 %z1, %z2 | 
|  | ret i16 %z | 
|  | } | 
|  |  | 
|  | ; Sum of two add reductions, each over an <8 x i16> input zero-extended to | 
|  | ; <8 x i64>. The SelectionDAG assertions widen in vector registers | 
|  | ; (ushll/uaddl) and finish with addp; the GlobalISel assertions use one | 
|  | ; uaddlv per input and combine the results with a scalar add. | 
|  | define i64 @add_pair_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v8i16_v8i64_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    ushll2 v2.4s, v0.8h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll2 v3.4s, v1.8h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0 | 
|  | ; CHECK-SD-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s | 
|  | ; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v2.2s | 
|  | ; CHECK-SD-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s | 
|  | ; CHECK-SD-NEXT:    uaddl v1.2d, v1.2s, v3.2s | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v4.2d | 
|  | ; CHECK-SD-NEXT:    add v1.2d, v1.2d, v2.2d | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v8i16_v8i64_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddlv s1, v1.8h | 
|  | ; CHECK-GI-NEXT:    uaddlv s0, v0.8h | 
|  | ; CHECK-GI-NEXT:    mov w8, v1.s[0] | 
|  | ; CHECK-GI-NEXT:    fmov w9, s0 | 
|  | ; CHECK-GI-NEXT:    add x0, x8, w9, uxtw | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <8 x i16> %x to <8 x i64> | 
|  | %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) | 
|  | %yy = zext <8 x i16> %y to <8 x i64> | 
|  | %z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy) | 
|  | %z = add i64 %z1, %z2 | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sum of two add reductions, each over an <8 x i16> input sign-extended to | 
|  | ; <8 x i64>. The SelectionDAG assertions widen in vector registers | 
|  | ; (sshll/saddl) and finish with addp; the GlobalISel assertions use one | 
|  | ; saddlv per input and combine the results with a sign-extending scalar add. | 
|  | define i64 @add_pair_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v8i16_v8i64_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    sshll2 v2.4s, v0.8h, #0 | 
|  | ; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    sshll2 v3.4s, v1.8h, #0 | 
|  | ; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0 | 
|  | ; CHECK-SD-NEXT:    saddl2 v4.2d, v0.4s, v2.4s | 
|  | ; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v2.2s | 
|  | ; CHECK-SD-NEXT:    saddl2 v2.2d, v1.4s, v3.4s | 
|  | ; CHECK-SD-NEXT:    saddl v1.2d, v1.2s, v3.2s | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v4.2d | 
|  | ; CHECK-SD-NEXT:    add v1.2d, v1.2d, v2.2d | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v8i16_v8i64_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    saddlv s1, v1.8h | 
|  | ; CHECK-GI-NEXT:    saddlv s0, v0.8h | 
|  | ; CHECK-GI-NEXT:    smov x8, v1.s[0] | 
|  | ; CHECK-GI-NEXT:    fmov w9, s0 | 
|  | ; CHECK-GI-NEXT:    add x0, x8, w9, sxtw | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <8 x i16> %x to <8 x i64> | 
|  | %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) | 
|  | %yy = sext <8 x i16> %y to <8 x i64> | 
|  | %z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy) | 
|  | %z = add i64 %z1, %z2 | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sum of two add reductions, each over a <4 x i16> input zero-extended to | 
|  | ; <4 x i64>. The SelectionDAG assertions merge both inputs into one | 
|  | ; accumulator with uaddlp + uadalp before a single addp; the GlobalISel | 
|  | ; assertions use one uaddlv per input and a scalar add. | 
|  | define i64 @add_pair_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v4i16_v4i64_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    uaddlp v1.2d, v1.4s | 
|  | ; CHECK-SD-NEXT:    uadalp v1.2d, v0.4s | 
|  | ; CHECK-SD-NEXT:    addp d0, v1.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v4i16_v4i64_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddlv s1, v1.4h | 
|  | ; CHECK-GI-NEXT:    uaddlv s0, v0.4h | 
|  | ; CHECK-GI-NEXT:    mov w8, v1.s[0] | 
|  | ; CHECK-GI-NEXT:    fmov w9, s0 | 
|  | ; CHECK-GI-NEXT:    add x0, x8, w9, uxtw | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <4 x i16> %x to <4 x i64> | 
|  | %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) | 
|  | %yy = zext <4 x i16> %y to <4 x i64> | 
|  | %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy) | 
|  | %z = add i64 %z1, %z2 | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sum of two add reductions, each over a <4 x i16> input sign-extended to | 
|  | ; <4 x i64>. The SelectionDAG assertions merge both inputs into one | 
|  | ; accumulator with saddlp + sadalp before a single addp; the GlobalISel | 
|  | ; assertions use one saddlv per input and a sign-extending scalar add. | 
|  | define i64 @add_pair_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v4i16_v4i64_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0 | 
|  | ; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    saddlp v1.2d, v1.4s | 
|  | ; CHECK-SD-NEXT:    sadalp v1.2d, v0.4s | 
|  | ; CHECK-SD-NEXT:    addp d0, v1.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v4i16_v4i64_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    saddlv s1, v1.4h | 
|  | ; CHECK-GI-NEXT:    saddlv s0, v0.4h | 
|  | ; CHECK-GI-NEXT:    smov x8, v1.s[0] | 
|  | ; CHECK-GI-NEXT:    fmov w9, s0 | 
|  | ; CHECK-GI-NEXT:    add x0, x8, w9, sxtw | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <4 x i16> %x to <4 x i64> | 
|  | %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) | 
|  | %yy = sext <4 x i16> %y to <4 x i64> | 
|  | %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy) | 
|  | %z = add i64 %z1, %z2 | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sum of two add reductions, each over a <2 x i16> input zero-extended to | 
|  | ; <2 x i64>. The SelectionDAG assertions pack both inputs into one q register | 
|  | ; (mov v0.d[1], v1.d[0]), mask each 32-bit lane to its low 16 bits and reduce | 
|  | ; once with uaddlv; the GlobalISel assertions widen and mask each input | 
|  | ; separately, reduce each with addp and add the scalars. | 
|  | define i64 @add_pair_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v2i16_v2i64_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0 | 
|  | ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1 | 
|  | ; CHECK-SD-NEXT:    movi v2.2d, #0x00ffff0000ffff | 
|  | ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0] | 
|  | ; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b | 
|  | ; CHECK-SD-NEXT:    uaddlv d0, v0.4s | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v2i16_v2i64_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    movi v2.2d, #0x0000000000ffff | 
|  | ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0 | 
|  | ; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0 | 
|  | ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b | 
|  | ; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b | 
|  | ; CHECK-GI-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-GI-NEXT:    addp d1, v1.2d | 
|  | ; CHECK-GI-NEXT:    fmov x8, d0 | 
|  | ; CHECK-GI-NEXT:    fmov x9, d1 | 
|  | ; CHECK-GI-NEXT:    add x0, x8, x9 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <2 x i16> %x to <2 x i64> | 
|  | %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) | 
|  | %yy = zext <2 x i16> %y to <2 x i64> | 
|  | %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy) | 
|  | %z = add i64 %z1, %z2 | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sum of two add reductions, each over a <2 x i16> input sign-extended to | 
|  | ; <2 x i64>. Both selectors are expected to widen with ushll and then | 
|  | ; sign-extend in register via shl #48 followed by an arithmetic right shift | 
|  | ; by 48. The SelectionDAG assertions fold the second shift into an | 
|  | ; accumulating ssra and reduce once; the GlobalISel assertions shift and | 
|  | ; reduce each input separately and add the scalars. | 
|  | define i64 @add_pair_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v2i16_v2i64_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v1.2d, v1.2s, #0 | 
|  | ; CHECK-SD-NEXT:    shl v0.2d, v0.2d, #48 | 
|  | ; CHECK-SD-NEXT:    shl v1.2d, v1.2d, #48 | 
|  | ; CHECK-SD-NEXT:    sshr v0.2d, v0.2d, #48 | 
|  | ; CHECK-SD-NEXT:    ssra v0.2d, v1.2d, #48 | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v2i16_v2i64_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0 | 
|  | ; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0 | 
|  | ; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #48 | 
|  | ; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #48 | 
|  | ; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #48 | 
|  | ; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #48 | 
|  | ; CHECK-GI-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-GI-NEXT:    addp d1, v1.2d | 
|  | ; CHECK-GI-NEXT:    fmov x8, d0 | 
|  | ; CHECK-GI-NEXT:    fmov x9, d1 | 
|  | ; CHECK-GI-NEXT:    add x0, x8, x9 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <2 x i16> %x to <2 x i64> | 
|  | %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) | 
|  | %yy = sext <2 x i16> %y to <2 x i64> | 
|  | %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy) | 
|  | %z = add i64 %z1, %z2 | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sum of two add reductions, each over a <16 x i8> input zero-extended to | 
|  | ; <16 x i32>; there is no multiply in the IR. With +dotprod the assertions | 
|  | ; show udot against a splat of 1 (movi #1): SelectionDAG accumulates both | 
|  | ; inputs into one register and reduces once, GlobalISel keeps two | 
|  | ; accumulators and adds the scalar results. Without +dotprod, GlobalISel uses | 
|  | ; uaddlv per input and SelectionDAG uses a ushll/uaddl widening chain. | 
|  | define i32 @add_pair_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y) { | 
|  | ; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i32_zext: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    ushll2 v2.8h, v0.16b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    ushll2 v3.8h, v1.16b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    uaddl2 v4.4s, v0.8h, v2.8h | 
|  | ; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v2.4h | 
|  | ; CHECK-SD-BASE-NEXT:    uaddl2 v2.4s, v1.8h, v3.8h | 
|  | ; CHECK-SD-BASE-NEXT:    uaddl v1.4s, v1.4h, v3.4h | 
|  | ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v4.4s | 
|  | ; CHECK-SD-BASE-NEXT:    add v1.4s, v1.4s, v2.4s | 
|  | ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s | 
|  | ; CHECK-SD-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i32_zext: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    movi v2.16b, #1 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v3.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    udot v3.4s, v1.16b, v2.16b | 
|  | ; CHECK-SD-DOT-NEXT:    udot v3.4s, v0.16b, v2.16b | 
|  | ; CHECK-SD-DOT-NEXT:    addv s0, v3.4s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i32_zext: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    uaddlv h1, v1.16b | 
|  | ; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.16b | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w8, s1 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w9, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    and w8, w8, #0xffff | 
|  | ; CHECK-GI-BASE-NEXT:    add w0, w8, w9, uxth | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i32_zext: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    movi v2.16b, #1 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    udot v4.4s, v0.16b, v2.16b | 
|  | ; CHECK-GI-DOT-NEXT:    udot v3.4s, v1.16b, v2.16b | 
|  | ; CHECK-GI-DOT-NEXT:    addv s0, v4.4s | 
|  | ; CHECK-GI-DOT-NEXT:    addv s1, v3.4s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-DOT-NEXT:    add w0, w8, w9 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <16 x i8> %x to <16 x i32> | 
|  | %z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx) | 
|  | %yy = zext <16 x i8> %y to <16 x i32> | 
|  | %z2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %yy) | 
|  | %z = add i32 %z1, %z2 | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Sum of two add reductions, each over a <16 x i8> input sign-extended to | 
|  | ; <16 x i32>; there is no multiply in the IR. With +dotprod the assertions | 
|  | ; show sdot against a splat of 1 (movi #1): SelectionDAG accumulates both | 
|  | ; inputs into one register and reduces once, GlobalISel keeps two | 
|  | ; accumulators and adds the scalar results. Without +dotprod, GlobalISel uses | 
|  | ; saddlv per input and SelectionDAG uses an sshll/saddl widening chain. | 
|  | define i32 @add_pair_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y) { | 
|  | ; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i32_sext: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    sshll2 v2.8h, v0.16b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    sshll2 v3.8h, v1.16b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    sshll v1.8h, v1.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    saddl2 v4.4s, v0.8h, v2.8h | 
|  | ; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v2.4h | 
|  | ; CHECK-SD-BASE-NEXT:    saddl2 v2.4s, v1.8h, v3.8h | 
|  | ; CHECK-SD-BASE-NEXT:    saddl v1.4s, v1.4h, v3.4h | 
|  | ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v4.4s | 
|  | ; CHECK-SD-BASE-NEXT:    add v1.4s, v1.4s, v2.4s | 
|  | ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s | 
|  | ; CHECK-SD-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i32_sext: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    movi v2.16b, #1 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v3.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    sdot v3.4s, v1.16b, v2.16b | 
|  | ; CHECK-SD-DOT-NEXT:    sdot v3.4s, v0.16b, v2.16b | 
|  | ; CHECK-SD-DOT-NEXT:    addv s0, v3.4s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i32_sext: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    saddlv h1, v1.16b | 
|  | ; CHECK-GI-BASE-NEXT:    saddlv h0, v0.16b | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w8, s1 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w9, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    sxth w8, w8 | 
|  | ; CHECK-GI-BASE-NEXT:    add w0, w8, w9, sxth | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i32_sext: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    movi v2.16b, #1 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    sdot v4.4s, v0.16b, v2.16b | 
|  | ; CHECK-GI-DOT-NEXT:    sdot v3.4s, v1.16b, v2.16b | 
|  | ; CHECK-GI-DOT-NEXT:    addv s0, v4.4s | 
|  | ; CHECK-GI-DOT-NEXT:    addv s1, v3.4s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-DOT-NEXT:    add w0, w8, w9 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <16 x i8> %x to <16 x i32> | 
|  | %z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx) | 
|  | %yy = sext <16 x i8> %y to <16 x i32> | 
|  | %z2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %yy) | 
|  | %z = add i32 %z1, %z2 | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Sum of two add reductions, each over an <8 x i8> input zero-extended to | 
|  | ; <8 x i32>; there is no multiply in the IR. With +dotprod the assertions show | 
|  | ; the 64-bit udot form (.2s/.8b) against a splat of 1, finished with addp: | 
|  | ; SelectionDAG accumulates both inputs into one register, GlobalISel keeps | 
|  | ; two accumulators and adds the scalar results. Without +dotprod, | 
|  | ; SelectionDAG uses uaddlp + uadalp and GlobalISel uses uaddlv per input. | 
|  | define i32 @add_pair_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) { | 
|  | ; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i32_zext: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    uaddlp v1.4s, v1.8h | 
|  | ; CHECK-SD-BASE-NEXT:    uadalp v1.4s, v0.8h | 
|  | ; CHECK-SD-BASE-NEXT:    addv s0, v1.4s | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i32_zext: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v3.8b, #1 | 
|  | ; CHECK-SD-DOT-NEXT:    udot v2.2s, v1.8b, v3.8b | 
|  | ; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b | 
|  | ; CHECK-SD-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_zext: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    uaddlv h1, v1.8b | 
|  | ; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.8b | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w8, s1 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w9, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    and w8, w8, #0xffff | 
|  | ; CHECK-GI-BASE-NEXT:    add w0, w8, w9, uxth | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_zext: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    movi v2.8b, #1 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    udot v4.2s, v0.8b, v2.8b | 
|  | ; CHECK-GI-DOT-NEXT:    udot v3.2s, v1.8b, v2.8b | 
|  | ; CHECK-GI-DOT-NEXT:    addp v0.2s, v4.2s, v4.2s | 
|  | ; CHECK-GI-DOT-NEXT:    addp v1.2s, v3.2s, v3.2s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-DOT-NEXT:    add w0, w8, w9 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <8 x i8> %x to <8 x i32> | 
|  | %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) | 
|  | %yy = zext <8 x i8> %y to <8 x i32> | 
|  | %z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy) | 
|  | %z = add i32 %z1, %z2 | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends two <8 x i8> inputs to <8 x i32>, reduces each with | 
|  | ; llvm.vector.reduce.add.v8i32, and returns the i32 sum of both reductions. | 
|  | ; Signed counterpart of the zext variant: the expectations use sshll/saddlp/ | 
|  | ; sadalp, sdot, or saddlv depending on the run configuration. | 
|  | define i32 @add_pair_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y) { | 
|  | ; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i32_sext: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    sshll v1.8h, v1.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    saddlp v1.4s, v1.8h | 
|  | ; CHECK-SD-BASE-NEXT:    sadalp v1.4s, v0.8h | 
|  | ; CHECK-SD-BASE-NEXT:    addv s0, v1.4s | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i32_sext: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v3.8b, #1 | 
|  | ; CHECK-SD-DOT-NEXT:    sdot v2.2s, v1.8b, v3.8b | 
|  | ; CHECK-SD-DOT-NEXT:    sdot v2.2s, v0.8b, v3.8b | 
|  | ; CHECK-SD-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_sext: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    saddlv h1, v1.8b | 
|  | ; CHECK-GI-BASE-NEXT:    saddlv h0, v0.8b | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w8, s1 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w9, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    sxth w8, w8 | 
|  | ; CHECK-GI-BASE-NEXT:    add w0, w8, w9, sxth | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_sext: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    movi v2.8b, #1 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    sdot v4.2s, v0.8b, v2.8b | 
|  | ; CHECK-GI-DOT-NEXT:    sdot v3.2s, v1.8b, v2.8b | 
|  | ; CHECK-GI-DOT-NEXT:    addp v0.2s, v4.2s, v4.2s | 
|  | ; CHECK-GI-DOT-NEXT:    addp v1.2s, v3.2s, v3.2s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-DOT-NEXT:    add w0, w8, w9 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <8 x i8> %x to <8 x i32> | 
|  | %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx) | 
|  | %yy = sext <8 x i8> %y to <8 x i32> | 
|  | %z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy) | 
|  | %z = add i32 %z1, %z2 | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Zero-extends two <4 x i8> inputs to <4 x i32>, reduces each with | 
|  | ; llvm.vector.reduce.add.v4i32, and returns the i32 sum of both reductions. | 
|  | ; Both expectations first mask each 16-bit lane down to its low byte | 
|  | ; (bic / and with 0x00ff per lane) before the widening reduction. | 
|  | define i32 @add_pair_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v4i8_v4i32_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1 | 
|  | ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0 | 
|  | ; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8 | 
|  | ; CHECK-SD-NEXT:    bic v1.4h, #255, lsl #8 | 
|  | ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0] | 
|  | ; CHECK-SD-NEXT:    uaddlv s0, v0.8h | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v4i8_v4i32_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    movi d2, #0xff00ff00ff00ff | 
|  | ; CHECK-GI-NEXT:    and v1.8b, v1.8b, v2.8b | 
|  | ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b | 
|  | ; CHECK-GI-NEXT:    uaddlv s1, v1.4h | 
|  | ; CHECK-GI-NEXT:    uaddlv s0, v0.4h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s1 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s0 | 
|  | ; CHECK-GI-NEXT:    and w8, w8, #0xffff | 
|  | ; CHECK-GI-NEXT:    add w0, w8, w9, uxth | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <4 x i8> %x to <4 x i32> | 
|  | %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) | 
|  | %yy = zext <4 x i8> %y to <4 x i32> | 
|  | %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy) | 
|  | %z = add i32 %z1, %z2 | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends two <4 x i8> inputs to <4 x i32>, reduces each with | 
|  | ; llvm.vector.reduce.add.v4i32, and returns the i32 sum of both reductions. | 
|  | ; Both expectations materialise the sign extension with a shl / sshr (or ssra) | 
|  | ; pair before reducing. | 
|  | define i32 @add_pair_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v4i8_v4i32_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0 | 
|  | ; CHECK-SD-NEXT:    shl v0.4s, v0.4s, #24 | 
|  | ; CHECK-SD-NEXT:    shl v1.4s, v1.4s, #24 | 
|  | ; CHECK-SD-NEXT:    sshr v0.4s, v0.4s, #24 | 
|  | ; CHECK-SD-NEXT:    ssra v0.4s, v1.4s, #24 | 
|  | ; CHECK-SD-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v4i8_v4i32_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    shl v1.4h, v1.4h, #8 | 
|  | ; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8 | 
|  | ; CHECK-GI-NEXT:    sshr v1.4h, v1.4h, #8 | 
|  | ; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8 | 
|  | ; CHECK-GI-NEXT:    saddlv s1, v1.4h | 
|  | ; CHECK-GI-NEXT:    saddlv s0, v0.4h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s1 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s0 | 
|  | ; CHECK-GI-NEXT:    sxth w8, w8 | 
|  | ; CHECK-GI-NEXT:    add w0, w8, w9, sxth | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <4 x i8> %x to <4 x i32> | 
|  | %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx) | 
|  | %yy = sext <4 x i8> %y to <4 x i32> | 
|  | %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy) | 
|  | %z = add i32 %z1, %z2 | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Zero-extends two <16 x i8> inputs to <16 x i16>, reduces each with | 
|  | ; llvm.vector.reduce.add.v16i16, and returns the i16 sum of both reductions. | 
|  | ; The return value carries the zeroext attribute. | 
|  | define zeroext i16 @add_pair_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v16i8_v16i16_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    uaddlp v1.8h, v1.16b | 
|  | ; CHECK-SD-NEXT:    uadalp v1.8h, v0.16b | 
|  | ; CHECK-SD-NEXT:    addv h0, v1.8h | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v16i8_v16i16_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddlv h0, v0.16b | 
|  | ; CHECK-GI-NEXT:    uaddlv h1, v1.16b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-NEXT:    add w8, w8, w9 | 
|  | ; CHECK-GI-NEXT:    and w0, w8, #0xffff | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <16 x i8> %x to <16 x i16> | 
|  | %z1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx) | 
|  | %yy = zext <16 x i8> %y to <16 x i16> | 
|  | %z2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %yy) | 
|  | %z = add i16 %z1, %z2 | 
|  | ret i16 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends two <16 x i8> inputs to <16 x i16>, reduces each with | 
|  | ; llvm.vector.reduce.add.v16i16, and returns the i16 sum of both reductions. | 
|  | ; The return value carries the signext attribute, so both expectations end | 
|  | ; with a sign-extending move (smov / sxth) into w0. | 
|  | define signext i16 @add_pair_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v16i8_v16i16_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    saddlp v1.8h, v1.16b | 
|  | ; CHECK-SD-NEXT:    sadalp v1.8h, v0.16b | 
|  | ; CHECK-SD-NEXT:    addv h0, v1.8h | 
|  | ; CHECK-SD-NEXT:    smov w0, v0.h[0] | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v16i8_v16i16_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    saddlv h0, v0.16b | 
|  | ; CHECK-GI-NEXT:    saddlv h1, v1.16b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-NEXT:    add w8, w8, w9 | 
|  | ; CHECK-GI-NEXT:    sxth w0, w8 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <16 x i8> %x to <16 x i16> | 
|  | %z1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx) | 
|  | %yy = sext <16 x i8> %y to <16 x i16> | 
|  | %z2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %yy) | 
|  | %z = add i16 %z1, %z2 | 
|  | ret i16 %z | 
|  | } | 
|  |  | 
|  | ; Zero-extends two <8 x i8> inputs to <8 x i16>, reduces each with | 
|  | ; llvm.vector.reduce.add.v8i16, and returns the i16 sum of both reductions | 
|  | ; (zeroext return). The non-GlobalISel expectation concatenates the two | 
|  | ; 64-bit inputs into one 128-bit register and issues a single uaddlv. | 
|  | define zeroext i16 @add_pair_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v8i8_v8i16_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0 | 
|  | ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1 | 
|  | ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0] | 
|  | ; CHECK-SD-NEXT:    uaddlv h0, v0.16b | 
|  | ; CHECK-SD-NEXT:    umov w0, v0.h[0] | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v8i8_v8i16_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddlv h0, v0.8b | 
|  | ; CHECK-GI-NEXT:    uaddlv h1, v1.8b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-NEXT:    add w8, w8, w9 | 
|  | ; CHECK-GI-NEXT:    and w0, w8, #0xffff | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <8 x i8> %x to <8 x i16> | 
|  | %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx) | 
|  | %yy = zext <8 x i8> %y to <8 x i16> | 
|  | %z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %yy) | 
|  | %z = add i16 %z1, %z2 | 
|  | ret i16 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends two <8 x i8> inputs to <8 x i16>, reduces each with | 
|  | ; llvm.vector.reduce.add.v8i16, and returns the i16 sum of both reductions | 
|  | ; (signext return). The non-GlobalISel expectation combines the inputs with | 
|  | ; one widening saddl before a single addv. | 
|  | define signext i16 @add_pair_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v8i8_v8i16_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    saddl v0.8h, v0.8b, v1.8b | 
|  | ; CHECK-SD-NEXT:    addv h0, v0.8h | 
|  | ; CHECK-SD-NEXT:    smov w0, v0.h[0] | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v8i8_v8i16_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    saddlv h0, v0.8b | 
|  | ; CHECK-GI-NEXT:    saddlv h1, v1.8b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-NEXT:    add w8, w8, w9 | 
|  | ; CHECK-GI-NEXT:    sxth w0, w8 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <8 x i8> %x to <8 x i16> | 
|  | %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx) | 
|  | %yy = sext <8 x i8> %y to <8 x i16> | 
|  | %z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %yy) | 
|  | %z = add i16 %z1, %z2 | 
|  | ret i16 %z | 
|  | } | 
|  |  | 
|  | ; Reduces two <16 x i8> inputs with llvm.vector.reduce.add.v16i8 (no widening) | 
|  | ; and returns the i8 sum of both reductions (zeroext return). | 
|  | ; The non-GlobalISel expectation adds the vectors first and reduces once; | 
|  | ; the GlobalISel expectation reduces each vector and adds the scalars. | 
|  | define zeroext i8 @add_pair_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v16i8_v16i8: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    add v0.16b, v0.16b, v1.16b | 
|  | ; CHECK-SD-NEXT:    addv b0, v0.16b | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v16i8_v16i8: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    addv b0, v0.16b | 
|  | ; CHECK-GI-NEXT:    addv b1, v1.16b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-NEXT:    add w8, w9, w8, uxtb | 
|  | ; CHECK-GI-NEXT:    and w0, w8, #0xff | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %z1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x) | 
|  | %z2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %y) | 
|  | %z = add i8 %z1, %z2 | 
|  | ret i8 %z | 
|  | } | 
|  |  | 
|  | ; Zero-extends two <16 x i8> inputs to <16 x i64>, reduces each with | 
|  | ; llvm.vector.reduce.add.v16i64, and returns the i64 sum of both reductions. | 
|  | ; The non-GlobalISel expectation widens step by step (ushll / uaddl) up to | 
|  | ; 64-bit lanes; the GlobalISel expectation uses one uaddlv per input and | 
|  | ; zero-extends the 16-bit results in scalar registers. | 
|  | define i64 @add_pair_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v16i8_v16i64_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    ushll2 v2.8h, v0.16b, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-NEXT:    ushll2 v3.8h, v1.16b, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v1.8h, v1.8b, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v4.4s, v2.4h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll2 v2.4s, v2.8h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll2 v5.4s, v0.8h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll2 v6.4s, v3.8h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll2 v7.4s, v1.8h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v3.4s, v3.4h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0 | 
|  | ; CHECK-SD-NEXT:    uaddl2 v16.2d, v5.4s, v2.4s | 
|  | ; CHECK-SD-NEXT:    uaddl v2.2d, v5.2s, v2.2s | 
|  | ; CHECK-SD-NEXT:    uaddl2 v5.2d, v0.4s, v4.4s | 
|  | ; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v4.2s | 
|  | ; CHECK-SD-NEXT:    uaddl2 v4.2d, v7.4s, v6.4s | 
|  | ; CHECK-SD-NEXT:    uaddl v6.2d, v7.2s, v6.2s | 
|  | ; CHECK-SD-NEXT:    uaddl2 v7.2d, v1.4s, v3.4s | 
|  | ; CHECK-SD-NEXT:    uaddl v1.2d, v1.2s, v3.2s | 
|  | ; CHECK-SD-NEXT:    add v3.2d, v5.2d, v16.2d | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d | 
|  | ; CHECK-SD-NEXT:    add v2.2d, v7.2d, v4.2d | 
|  | ; CHECK-SD-NEXT:    add v1.2d, v1.2d, v6.2d | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v3.2d | 
|  | ; CHECK-SD-NEXT:    add v1.2d, v1.2d, v2.2d | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v16i8_v16i64_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddlv h1, v1.16b | 
|  | ; CHECK-GI-NEXT:    uaddlv h0, v0.16b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s1 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s0 | 
|  | ; CHECK-GI-NEXT:    and x8, x8, #0xffff | 
|  | ; CHECK-GI-NEXT:    add x0, x8, w9, uxth | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <16 x i8> %x to <16 x i64> | 
|  | %z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) | 
|  | %yy = zext <16 x i8> %y to <16 x i64> | 
|  | %z2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %yy) | 
|  | %z = add i64 %z1, %z2 | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends two <16 x i8> inputs to <16 x i64>, reduces each with | 
|  | ; llvm.vector.reduce.add.v16i64, and returns the i64 sum of both reductions. | 
|  | ; Signed counterpart of the zext variant: the expectations use sshll / saddl | 
|  | ; widening or one saddlv per input followed by sxth. | 
|  | define i64 @add_pair_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v16i8_v16i64_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    sshll2 v2.8h, v0.16b, #0 | 
|  | ; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-NEXT:    sshll2 v3.8h, v1.16b, #0 | 
|  | ; CHECK-SD-NEXT:    sshll v1.8h, v1.8b, #0 | 
|  | ; CHECK-SD-NEXT:    sshll v4.4s, v2.4h, #0 | 
|  | ; CHECK-SD-NEXT:    sshll2 v2.4s, v2.8h, #0 | 
|  | ; CHECK-SD-NEXT:    sshll2 v5.4s, v0.8h, #0 | 
|  | ; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    sshll2 v6.4s, v3.8h, #0 | 
|  | ; CHECK-SD-NEXT:    sshll2 v7.4s, v1.8h, #0 | 
|  | ; CHECK-SD-NEXT:    sshll v3.4s, v3.4h, #0 | 
|  | ; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0 | 
|  | ; CHECK-SD-NEXT:    saddl2 v16.2d, v5.4s, v2.4s | 
|  | ; CHECK-SD-NEXT:    saddl v2.2d, v5.2s, v2.2s | 
|  | ; CHECK-SD-NEXT:    saddl2 v5.2d, v0.4s, v4.4s | 
|  | ; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v4.2s | 
|  | ; CHECK-SD-NEXT:    saddl2 v4.2d, v7.4s, v6.4s | 
|  | ; CHECK-SD-NEXT:    saddl v6.2d, v7.2s, v6.2s | 
|  | ; CHECK-SD-NEXT:    saddl2 v7.2d, v1.4s, v3.4s | 
|  | ; CHECK-SD-NEXT:    saddl v1.2d, v1.2s, v3.2s | 
|  | ; CHECK-SD-NEXT:    add v3.2d, v5.2d, v16.2d | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d | 
|  | ; CHECK-SD-NEXT:    add v2.2d, v7.2d, v4.2d | 
|  | ; CHECK-SD-NEXT:    add v1.2d, v1.2d, v6.2d | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v3.2d | 
|  | ; CHECK-SD-NEXT:    add v1.2d, v1.2d, v2.2d | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v16i8_v16i64_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    saddlv h1, v1.16b | 
|  | ; CHECK-GI-NEXT:    saddlv h0, v0.16b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s1 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s0 | 
|  | ; CHECK-GI-NEXT:    sxth x8, w8 | 
|  | ; CHECK-GI-NEXT:    add x0, x8, w9, sxth | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <16 x i8> %x to <16 x i64> | 
|  | %z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) | 
|  | %yy = sext <16 x i8> %y to <16 x i64> | 
|  | %z2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %yy) | 
|  | %z = add i64 %z1, %z2 | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Zero-extends two <8 x i8> inputs to <8 x i64>, reduces each with | 
|  | ; llvm.vector.reduce.add.v8i64, and returns the i64 sum of both reductions. | 
|  | define i64 @add_pair_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v8i8_v8i64_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v1.8h, v1.8b, #0 | 
|  | ; CHECK-SD-NEXT:    ushll2 v2.4s, v0.8h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll2 v3.4s, v1.8h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0 | 
|  | ; CHECK-SD-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s | 
|  | ; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v2.2s | 
|  | ; CHECK-SD-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s | 
|  | ; CHECK-SD-NEXT:    uaddl v1.2d, v1.2s, v3.2s | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v4.2d | 
|  | ; CHECK-SD-NEXT:    add v1.2d, v1.2d, v2.2d | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v8i8_v8i64_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddlv h1, v1.8b | 
|  | ; CHECK-GI-NEXT:    uaddlv h0, v0.8b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s1 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s0 | 
|  | ; CHECK-GI-NEXT:    and x8, x8, #0xffff | 
|  | ; CHECK-GI-NEXT:    add x0, x8, w9, uxth | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <8 x i8> %x to <8 x i64> | 
|  | %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) | 
|  | %yy = zext <8 x i8> %y to <8 x i64> | 
|  | %z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy) | 
|  | %z = add i64 %z1, %z2 | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends two <8 x i8> inputs to <8 x i64>, reduces each with | 
|  | ; llvm.vector.reduce.add.v8i64, and returns the i64 sum of both reductions. | 
|  | define i64 @add_pair_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v8i8_v8i64_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-NEXT:    sshll v1.8h, v1.8b, #0 | 
|  | ; CHECK-SD-NEXT:    sshll2 v2.4s, v0.8h, #0 | 
|  | ; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    sshll2 v3.4s, v1.8h, #0 | 
|  | ; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0 | 
|  | ; CHECK-SD-NEXT:    saddl2 v4.2d, v0.4s, v2.4s | 
|  | ; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v2.2s | 
|  | ; CHECK-SD-NEXT:    saddl2 v2.2d, v1.4s, v3.4s | 
|  | ; CHECK-SD-NEXT:    saddl v1.2d, v1.2s, v3.2s | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v4.2d | 
|  | ; CHECK-SD-NEXT:    add v1.2d, v1.2d, v2.2d | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v8i8_v8i64_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    saddlv h1, v1.8b | 
|  | ; CHECK-GI-NEXT:    saddlv h0, v0.8b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s1 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s0 | 
|  | ; CHECK-GI-NEXT:    sxth x8, w8 | 
|  | ; CHECK-GI-NEXT:    add x0, x8, w9, sxth | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <8 x i8> %x to <8 x i64> | 
|  | %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) | 
|  | %yy = sext <8 x i8> %y to <8 x i64> | 
|  | %z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy) | 
|  | %z = add i64 %z1, %z2 | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Zero-extends two <4 x i8> inputs to <4 x i64>, reduces each with | 
|  | ; llvm.vector.reduce.add.v4i64, and returns the i64 sum of both reductions. | 
|  | ; Both expectations first clear the high byte of each 16-bit lane. | 
|  | define i64 @add_pair_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v4i8_v4i64_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    bic v1.4h, #255, lsl #8 | 
|  | ; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8 | 
|  | ; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    uaddlp v1.2d, v1.4s | 
|  | ; CHECK-SD-NEXT:    uadalp v1.2d, v0.4s | 
|  | ; CHECK-SD-NEXT:    addp d0, v1.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v4i8_v4i64_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    movi d2, #0xff00ff00ff00ff | 
|  | ; CHECK-GI-NEXT:    and v1.8b, v1.8b, v2.8b | 
|  | ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b | 
|  | ; CHECK-GI-NEXT:    uaddlv s1, v1.4h | 
|  | ; CHECK-GI-NEXT:    uaddlv s0, v0.4h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s1 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s0 | 
|  | ; CHECK-GI-NEXT:    and x8, x8, #0xffff | 
|  | ; CHECK-GI-NEXT:    add x0, x8, w9, uxth | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <4 x i8> %x to <4 x i64> | 
|  | %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) | 
|  | %yy = zext <4 x i8> %y to <4 x i64> | 
|  | %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy) | 
|  | %z = add i64 %z1, %z2 | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends two <4 x i8> inputs to <4 x i64>, reduces each with | 
|  | ; llvm.vector.reduce.add.v4i64, and returns the i64 sum of both reductions. | 
|  | ; Both expectations realise the sign extension with shl followed by an | 
|  | ; arithmetic right shift (sshr / ssra). | 
|  | define i64 @add_pair_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v4i8_v4i64_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v2.2d, v1.2s, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v3.2d, v0.2s, #0 | 
|  | ; CHECK-SD-NEXT:    ushll2 v0.2d, v0.4s, #0 | 
|  | ; CHECK-SD-NEXT:    ushll2 v1.2d, v1.4s, #0 | 
|  | ; CHECK-SD-NEXT:    shl v3.2d, v3.2d, #56 | 
|  | ; CHECK-SD-NEXT:    shl v2.2d, v2.2d, #56 | 
|  | ; CHECK-SD-NEXT:    shl v0.2d, v0.2d, #56 | 
|  | ; CHECK-SD-NEXT:    shl v1.2d, v1.2d, #56 | 
|  | ; CHECK-SD-NEXT:    sshr v3.2d, v3.2d, #56 | 
|  | ; CHECK-SD-NEXT:    sshr v2.2d, v2.2d, #56 | 
|  | ; CHECK-SD-NEXT:    ssra v3.2d, v0.2d, #56 | 
|  | ; CHECK-SD-NEXT:    ssra v2.2d, v1.2d, #56 | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v3.2d, v2.2d | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v4i8_v4i64_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    shl v1.4h, v1.4h, #8 | 
|  | ; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8 | 
|  | ; CHECK-GI-NEXT:    sshr v1.4h, v1.4h, #8 | 
|  | ; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8 | 
|  | ; CHECK-GI-NEXT:    saddlv s1, v1.4h | 
|  | ; CHECK-GI-NEXT:    saddlv s0, v0.4h | 
|  | ; CHECK-GI-NEXT:    fmov w8, s1 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s0 | 
|  | ; CHECK-GI-NEXT:    sxth x8, w8 | 
|  | ; CHECK-GI-NEXT:    add x0, x8, w9, sxth | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <4 x i8> %x to <4 x i64> | 
|  | %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx) | 
|  | %yy = sext <4 x i8> %y to <4 x i64> | 
|  | %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy) | 
|  | %z = add i64 %z1, %z2 | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Zero-extends two <2 x i8> inputs to <2 x i64>, reduces each with | 
|  | ; llvm.vector.reduce.add.v2i64, and returns the i64 sum of both reductions. | 
|  | ; The non-GlobalISel expectation packs both inputs into one register, masks | 
|  | ; each 32-bit lane to its low byte, and reduces with a single uaddlv. | 
|  | define i64 @add_pair_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v2i8_v2i64_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0 | 
|  | ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1 | 
|  | ; CHECK-SD-NEXT:    movi v2.2d, #0x0000ff000000ff | 
|  | ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0] | 
|  | ; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b | 
|  | ; CHECK-SD-NEXT:    uaddlv d0, v0.4s | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v2i8_v2i64_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    movi v2.2d, #0x000000000000ff | 
|  | ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0 | 
|  | ; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0 | 
|  | ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b | 
|  | ; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b | 
|  | ; CHECK-GI-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-GI-NEXT:    addp d1, v1.2d | 
|  | ; CHECK-GI-NEXT:    fmov x8, d0 | 
|  | ; CHECK-GI-NEXT:    fmov x9, d1 | 
|  | ; CHECK-GI-NEXT:    add x0, x8, x9 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <2 x i8> %x to <2 x i64> | 
|  | %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) | 
|  | %yy = zext <2 x i8> %y to <2 x i64> | 
|  | %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy) | 
|  | %z = add i64 %z1, %z2 | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends two <2 x i8> inputs to <2 x i64>, reduces each with | 
|  | ; llvm.vector.reduce.add.v2i64, and returns the i64 sum of both reductions. | 
|  | ; The non-GlobalISel expectation folds the second shift and the vector add | 
|  | ; into ssra; the GlobalISel expectation reduces each input separately. | 
|  | define i64 @add_pair_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v2i8_v2i64_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0 | 
|  | ; CHECK-SD-NEXT:    ushll v1.2d, v1.2s, #0 | 
|  | ; CHECK-SD-NEXT:    shl v0.2d, v0.2d, #56 | 
|  | ; CHECK-SD-NEXT:    shl v1.2d, v1.2d, #56 | 
|  | ; CHECK-SD-NEXT:    sshr v0.2d, v0.2d, #56 | 
|  | ; CHECK-SD-NEXT:    ssra v0.2d, v1.2d, #56 | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v2i8_v2i64_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0 | 
|  | ; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0 | 
|  | ; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56 | 
|  | ; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #56 | 
|  | ; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #56 | 
|  | ; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #56 | 
|  | ; CHECK-GI-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-GI-NEXT:    addp d1, v1.2d | 
|  | ; CHECK-GI-NEXT:    fmov x8, d0 | 
|  | ; CHECK-GI-NEXT:    fmov x9, d1 | 
|  | ; CHECK-GI-NEXT:    add x0, x8, x9 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <2 x i8> %x to <2 x i64> | 
|  | %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx) | 
|  | %yy = sext <2 x i8> %y to <2 x i64> | 
|  | %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy) | 
|  | %z = add i64 %z1, %z2 | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Combines an unsigned and a signed reduction pair over four <8 x i8> inputs: | 
|  | ;   %az = reduce.add(zext %ax) + reduce.add(zext %ay) | 
|  | ;   %bz = reduce.add(sext %bx) + reduce.add(sext %by) | 
|  | ; and returns %az + %bz as i32. Every extension widens to <8 x i32> and every | 
|  | ; reduction is llvm.vector.reduce.add.v8i32. | 
|  | define i32 @add_pair_v8i8_v8i32_double_sext_zext(<8 x i8> %ax, <8 x i8> %ay, <8 x i8> %bx, <8 x i8> %by) { | 
|  | ; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i32_double_sext_zext: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    sshll v3.8h, v3.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    sshll v2.8h, v2.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    uaddlp v1.4s, v1.8h | 
|  | ; CHECK-SD-BASE-NEXT:    saddlp v3.4s, v3.8h | 
|  | ; CHECK-SD-BASE-NEXT:    uadalp v1.4s, v0.8h | 
|  | ; CHECK-SD-BASE-NEXT:    sadalp v3.4s, v2.8h | 
|  | ; CHECK-SD-BASE-NEXT:    add v0.4s, v3.4s, v1.4s | 
|  | ; CHECK-SD-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i32_double_sext_zext: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    movi v4.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v5.8b, #1 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v6.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    udot v6.2s, v1.8b, v5.8b | 
|  | ; CHECK-SD-DOT-NEXT:    sdot v4.2s, v3.8b, v5.8b | 
|  | ; CHECK-SD-DOT-NEXT:    udot v6.2s, v0.8b, v5.8b | 
|  | ; CHECK-SD-DOT-NEXT:    sdot v4.2s, v2.8b, v5.8b | 
|  | ; CHECK-SD-DOT-NEXT:    add v0.2s, v6.2s, v4.2s | 
|  | ; CHECK-SD-DOT-NEXT:    addp v0.2s, v0.2s, v0.2s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_double_sext_zext: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    saddlv h3, v3.8b | 
|  | ; CHECK-GI-BASE-NEXT:    uaddlv h1, v1.8b | 
|  | ; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.8b | 
|  | ; CHECK-GI-BASE-NEXT:    saddlv h2, v2.8b | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w8, s3 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w10, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w11, s2 | 
|  | ; CHECK-GI-BASE-NEXT:    sxth w8, w8 | 
|  | ; CHECK-GI-BASE-NEXT:    and w9, w9, #0xffff | 
|  | ; CHECK-GI-BASE-NEXT:    add w9, w9, w10, uxth | 
|  | ; CHECK-GI-BASE-NEXT:    add w8, w8, w11, sxth | 
|  | ; CHECK-GI-BASE-NEXT:    add w0, w9, w8 | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_double_sext_zext: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    movi v4.8b, #1 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v6.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v7.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v16.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    udot v5.2s, v0.8b, v4.8b | 
|  | ; CHECK-GI-DOT-NEXT:    sdot v6.2s, v3.8b, v4.8b | 
|  | ; CHECK-GI-DOT-NEXT:    udot v7.2s, v1.8b, v4.8b | 
|  | ; CHECK-GI-DOT-NEXT:    sdot v16.2s, v2.8b, v4.8b | 
|  | ; CHECK-GI-DOT-NEXT:    addp v0.2s, v5.2s, v5.2s | 
|  | ; CHECK-GI-DOT-NEXT:    addp v3.2s, v6.2s, v6.2s | 
|  | ; CHECK-GI-DOT-NEXT:    addp v1.2s, v7.2s, v7.2s | 
|  | ; CHECK-GI-DOT-NEXT:    addp v2.2s, v16.2s, v16.2s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w11, s3 | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w10, s2 | 
|  | ; CHECK-GI-DOT-NEXT:    add w8, w8, w9 | 
|  | ; CHECK-GI-DOT-NEXT:    add w9, w10, w11 | 
|  | ; CHECK-GI-DOT-NEXT:    add w0, w8, w9 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %axx = zext <8 x i8> %ax to <8 x i32> | 
|  | %az1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %axx) | 
|  | %ayy = zext <8 x i8> %ay to <8 x i32> | 
|  | %az2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %ayy) | 
|  | %az = add i32 %az1, %az2 | 
|  | %bxx = sext <8 x i8> %bx to <8 x i32> | 
|  | %bz1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %bxx) | 
|  | %byy = sext <8 x i8> %by to <8 x i32> | 
|  | %bz2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %byy) | 
|  | %bz = add i32 %bz1, %bz2 | 
|  | %z = add i32 %az, %bz | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Hand-expanded form of the paired reduction: each of the four <8 x i16> | 
|  | ; inputs is zero-extended to <8 x i32>, split by shufflevector into lanes 0-3 | 
|  | ; and lanes 4-7, and the two halves are added. The four <4 x i32> partial sums | 
|  | ; are then added together and reduced once with llvm.vector.reduce.add.v4i32. | 
|  | ; NOTE(review): the name says "sext_zext", but all four extensions in the body | 
|  | ; are zext -- confirm whether %bx/%by were meant to be sign-extended. | 
|  | ; NOTE(review): the shuffles use undef as the unused second operand; consider | 
|  | ; poison if the file is modernised, and regenerate the assertions afterwards. | 
|  | define i32 @add_pair_v8i16_v4i32_double_sext_zext_shuffle(<8 x i16> %ax, <8 x i16> %ay, <8 x i16> %bx, <8 x i16> %by) { | 
|  | ; CHECK-SD-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    uaddlp v1.4s, v1.8h | 
|  | ; CHECK-SD-NEXT:    uaddlp v3.4s, v3.8h | 
|  | ; CHECK-SD-NEXT:    uadalp v1.4s, v0.8h | 
|  | ; CHECK-SD-NEXT:    uadalp v3.4s, v2.8h | 
|  | ; CHECK-SD-NEXT:    add v0.4s, v3.4s, v1.4s | 
|  | ; CHECK-SD-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    ushll v4.4s, v0.4h, #0 | 
|  | ; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0 | 
|  | ; CHECK-GI-NEXT:    ushll v5.4s, v1.4h, #0 | 
|  | ; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0 | 
|  | ; CHECK-GI-NEXT:    ushll v6.4s, v2.4h, #0 | 
|  | ; CHECK-GI-NEXT:    ushll2 v2.4s, v2.8h, #0 | 
|  | ; CHECK-GI-NEXT:    ushll v7.4s, v3.4h, #0 | 
|  | ; CHECK-GI-NEXT:    ushll2 v3.4s, v3.8h, #0 | 
|  | ; CHECK-GI-NEXT:    add v0.4s, v4.4s, v0.4s | 
|  | ; CHECK-GI-NEXT:    add v1.4s, v5.4s, v1.4s | 
|  | ; CHECK-GI-NEXT:    add v2.4s, v6.4s, v2.4s | 
|  | ; CHECK-GI-NEXT:    add v3.4s, v7.4s, v3.4s | 
|  | ; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s | 
|  | ; CHECK-GI-NEXT:    add v1.4s, v2.4s, v3.4s | 
|  | ; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s | 
|  | ; CHECK-GI-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-GI-NEXT:    fmov w0, s0 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %axx = zext <8 x i16> %ax to <8 x i32> | 
|  | %s1h = shufflevector <8 x i32> %axx, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> | 
|  | %s1l = shufflevector <8 x i32> %axx, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
|  | %axs = add <4 x i32> %s1h, %s1l | 
|  | %ayy = zext <8 x i16> %ay to <8 x i32> | 
|  | %s2h = shufflevector <8 x i32> %ayy, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> | 
|  | %s2l = shufflevector <8 x i32> %ayy, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
|  | %ays = add <4 x i32> %s2h, %s2l | 
|  | %az = add <4 x i32> %axs, %ays | 
|  | %bxx = zext <8 x i16> %bx to <8 x i32> | 
|  | %s3h = shufflevector <8 x i32> %bxx, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> | 
|  | %s3l = shufflevector <8 x i32> %bxx, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
|  | %bxs = add <4 x i32> %s3h, %s3l | 
|  | %byy = zext <8 x i16> %by to <8 x i32> | 
|  | %s4h = shufflevector <8 x i32> %byy, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> | 
|  | %s4l = shufflevector <8 x i32> %byy, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
|  | %bys = add <4 x i32> %s4h, %s4l | 
|  | %bz = add <4 x i32> %bxs, %bys | 
|  | %z = add <4 x i32> %az, %bz | 
|  | %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z) | 
|  | ret i32 %z2 | 
|  | } | 
|  |  | 
|  | ; Two independent llvm.vector.reduce.add.v2i64 reductions (no extends) whose | 
|  | ; scalar i64 results are added. The SD expectation adds the two vectors first | 
|  | ; and reduces once; the GI expectation reduces each vector and adds the | 
|  | ; scalars. | 
|  | define i64 @add_pair_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) { | 
|  | ; CHECK-SD-LABEL: add_pair_v2i64_v2i64: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d | 
|  | ; CHECK-SD-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-SD-NEXT:    fmov x0, d0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_pair_v2i64_v2i64: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    addp d0, v0.2d | 
|  | ; CHECK-GI-NEXT:    addp d1, v1.2d | 
|  | ; CHECK-GI-NEXT:    fmov x8, d0 | 
|  | ; CHECK-GI-NEXT:    fmov x9, d1 | 
|  | ; CHECK-GI-NEXT:    add x0, x8, x9 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x) | 
|  | %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %y) | 
|  | %z = add i64 %z1, %z2 | 
|  | ret i64 %z | 
|  | } | 
|  |  | 
|  | ; Irregularly sized vectors | 
|  | ; Zero-extends a <24 x i8> argument to <24 x i16> and add-reduces it to i16. | 
|  | ; In the expected code the 24 lanes are assembled from w0-w7 plus byte loads | 
|  | ; from the stack at [sp] .. [sp, #120] before the reduction. | 
|  | define i16 @add_v24i8_v24i16_zext(<24 x i8> %x) { | 
|  | ; CHECK-SD-LABEL: add_v24i8_v24i16_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    fmov s0, w0 | 
|  | ; CHECK-SD-NEXT:    ldr b1, [sp, #64] | 
|  | ; CHECK-SD-NEXT:    add x8, sp, #72 | 
|  | ; CHECK-SD-NEXT:    ldr b2, [sp] | 
|  | ; CHECK-SD-NEXT:    add x9, sp, #80 | 
|  | ; CHECK-SD-NEXT:    ld1 { v1.b }[1], [x8] | 
|  | ; CHECK-SD-NEXT:    add x8, sp, #8 | 
|  | ; CHECK-SD-NEXT:    mov v0.b[1], w1 | 
|  | ; CHECK-SD-NEXT:    ld1 { v2.b }[1], [x8] | 
|  | ; CHECK-SD-NEXT:    add x8, sp, #16 | 
|  | ; CHECK-SD-NEXT:    ld1 { v1.b }[2], [x9] | 
|  | ; CHECK-SD-NEXT:    add x9, sp, #88 | 
|  | ; CHECK-SD-NEXT:    ld1 { v2.b }[2], [x8] | 
|  | ; CHECK-SD-NEXT:    add x8, sp, #24 | 
|  | ; CHECK-SD-NEXT:    mov v0.b[2], w2 | 
|  | ; CHECK-SD-NEXT:    ld1 { v1.b }[3], [x9] | 
|  | ; CHECK-SD-NEXT:    add x9, sp, #96 | 
|  | ; CHECK-SD-NEXT:    ld1 { v2.b }[3], [x8] | 
|  | ; CHECK-SD-NEXT:    add x8, sp, #32 | 
|  | ; CHECK-SD-NEXT:    mov v0.b[3], w3 | 
|  | ; CHECK-SD-NEXT:    ld1 { v1.b }[4], [x9] | 
|  | ; CHECK-SD-NEXT:    add x9, sp, #104 | 
|  | ; CHECK-SD-NEXT:    ld1 { v2.b }[4], [x8] | 
|  | ; CHECK-SD-NEXT:    add x8, sp, #40 | 
|  | ; CHECK-SD-NEXT:    ld1 { v1.b }[5], [x9] | 
|  | ; CHECK-SD-NEXT:    add x9, sp, #112 | 
|  | ; CHECK-SD-NEXT:    mov v0.b[4], w4 | 
|  | ; CHECK-SD-NEXT:    ld1 { v2.b }[5], [x8] | 
|  | ; CHECK-SD-NEXT:    add x8, sp, #48 | 
|  | ; CHECK-SD-NEXT:    ld1 { v1.b }[6], [x9] | 
|  | ; CHECK-SD-NEXT:    add x9, sp, #120 | 
|  | ; CHECK-SD-NEXT:    ld1 { v2.b }[6], [x8] | 
|  | ; CHECK-SD-NEXT:    add x8, sp, #56 | 
|  | ; CHECK-SD-NEXT:    mov v0.b[5], w5 | 
|  | ; CHECK-SD-NEXT:    ld1 { v1.b }[7], [x9] | 
|  | ; CHECK-SD-NEXT:    ld1 { v2.b }[7], [x8] | 
|  | ; CHECK-SD-NEXT:    mov v0.b[6], w6 | 
|  | ; CHECK-SD-NEXT:    mov v0.b[7], w7 | 
|  | ; CHECK-SD-NEXT:    uaddl v0.8h, v0.8b, v1.8b | 
|  | ; CHECK-SD-NEXT:    uaddw v0.8h, v0.8h, v2.8b | 
|  | ; CHECK-SD-NEXT:    addv h0, v0.8h | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v24i8_v24i16_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    fmov s0, w0 | 
|  | ; CHECK-GI-NEXT:    ldr w8, [sp] | 
|  | ; CHECK-GI-NEXT:    ldr w9, [sp, #8] | 
|  | ; CHECK-GI-NEXT:    ldr w10, [sp, #72] | 
|  | ; CHECK-GI-NEXT:    mov v0.b[1], w1 | 
|  | ; CHECK-GI-NEXT:    mov v0.b[2], w2 | 
|  | ; CHECK-GI-NEXT:    mov v0.b[3], w3 | 
|  | ; CHECK-GI-NEXT:    mov v0.b[4], w4 | 
|  | ; CHECK-GI-NEXT:    mov v0.b[5], w5 | 
|  | ; CHECK-GI-NEXT:    mov v0.b[6], w6 | 
|  | ; CHECK-GI-NEXT:    mov v0.b[7], w7 | 
|  | ; CHECK-GI-NEXT:    mov v0.b[8], w8 | 
|  | ; CHECK-GI-NEXT:    ldr w8, [sp, #64] | 
|  | ; CHECK-GI-NEXT:    fmov s1, w8 | 
|  | ; CHECK-GI-NEXT:    ldr w8, [sp, #16] | 
|  | ; CHECK-GI-NEXT:    mov v0.b[9], w9 | 
|  | ; CHECK-GI-NEXT:    ldr w9, [sp, #80] | 
|  | ; CHECK-GI-NEXT:    mov v1.b[1], w10 | 
|  | ; CHECK-GI-NEXT:    mov v0.b[10], w8 | 
|  | ; CHECK-GI-NEXT:    ldr w8, [sp, #24] | 
|  | ; CHECK-GI-NEXT:    mov v1.b[2], w9 | 
|  | ; CHECK-GI-NEXT:    ldr w9, [sp, #88] | 
|  | ; CHECK-GI-NEXT:    mov v0.b[11], w8 | 
|  | ; CHECK-GI-NEXT:    ldr w8, [sp, #32] | 
|  | ; CHECK-GI-NEXT:    mov v1.b[3], w9 | 
|  | ; CHECK-GI-NEXT:    ldr w9, [sp, #96] | 
|  | ; CHECK-GI-NEXT:    mov v0.b[12], w8 | 
|  | ; CHECK-GI-NEXT:    ldr w8, [sp, #40] | 
|  | ; CHECK-GI-NEXT:    mov v1.b[4], w9 | 
|  | ; CHECK-GI-NEXT:    ldr w9, [sp, #104] | 
|  | ; CHECK-GI-NEXT:    mov v0.b[13], w8 | 
|  | ; CHECK-GI-NEXT:    ldr w8, [sp, #48] | 
|  | ; CHECK-GI-NEXT:    mov v1.b[5], w9 | 
|  | ; CHECK-GI-NEXT:    ldr w9, [sp, #112] | 
|  | ; CHECK-GI-NEXT:    mov v0.b[14], w8 | 
|  | ; CHECK-GI-NEXT:    ldr w8, [sp, #56] | 
|  | ; CHECK-GI-NEXT:    mov v1.b[6], w9 | 
|  | ; CHECK-GI-NEXT:    ldr w9, [sp, #120] | 
|  | ; CHECK-GI-NEXT:    mov v0.b[15], w8 | 
|  | ; CHECK-GI-NEXT:    mov v1.b[7], w9 | 
|  | ; CHECK-GI-NEXT:    uaddlv h0, v0.16b | 
|  | ; CHECK-GI-NEXT:    uaddlv h1, v1.8b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-NEXT:    add w0, w8, w9 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <24 x i8> %x to <24 x i16> | 
|  | %z = call i16 @llvm.vector.reduce.add.v24i16(<24 x i16> %xx) | 
|  | ret i16 %z | 
|  | } | 
|  |  | 
|  | ; Zero-extends a <32 x i8> argument to <32 x i16> and add-reduces it to i16. | 
|  | ; In the expected code the input arrives in the two vector registers v0 and | 
|  | ; v1, so no lane-by-lane assembly is needed. | 
|  | define i16 @add_v32i8_v32i16_zext(<32 x i8> %x) { | 
|  | ; CHECK-SD-LABEL: add_v32i8_v32i16_zext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    uaddl2 v2.8h, v0.16b, v1.16b | 
|  | ; CHECK-SD-NEXT:    uaddl v0.8h, v0.8b, v1.8b | 
|  | ; CHECK-SD-NEXT:    add v0.8h, v0.8h, v2.8h | 
|  | ; CHECK-SD-NEXT:    addv h0, v0.8h | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v32i8_v32i16_zext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddlv h0, v0.16b | 
|  | ; CHECK-GI-NEXT:    uaddlv h1, v1.16b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-NEXT:    add w0, w8, w9 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <32 x i8> %x to <32 x i16> | 
|  | %z = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %xx) | 
|  | ret i16 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends a <24 x i8> argument to <24 x i16> and add-reduces it to i16. | 
|  | ; Signed counterpart of add_v24i8_v24i16_zext: the expected code assembles | 
|  | ; the lanes the same way and uses saddl/saddw (SD) or saddlv (GI). | 
|  | define i16 @add_v24i8_v24i16_sext(<24 x i8> %x) { | 
|  | ; CHECK-SD-LABEL: add_v24i8_v24i16_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    fmov s0, w0 | 
|  | ; CHECK-SD-NEXT:    ldr b1, [sp, #64] | 
|  | ; CHECK-SD-NEXT:    add x8, sp, #72 | 
|  | ; CHECK-SD-NEXT:    ldr b2, [sp] | 
|  | ; CHECK-SD-NEXT:    add x9, sp, #80 | 
|  | ; CHECK-SD-NEXT:    ld1 { v1.b }[1], [x8] | 
|  | ; CHECK-SD-NEXT:    add x8, sp, #8 | 
|  | ; CHECK-SD-NEXT:    mov v0.b[1], w1 | 
|  | ; CHECK-SD-NEXT:    ld1 { v2.b }[1], [x8] | 
|  | ; CHECK-SD-NEXT:    add x8, sp, #16 | 
|  | ; CHECK-SD-NEXT:    ld1 { v1.b }[2], [x9] | 
|  | ; CHECK-SD-NEXT:    add x9, sp, #88 | 
|  | ; CHECK-SD-NEXT:    ld1 { v2.b }[2], [x8] | 
|  | ; CHECK-SD-NEXT:    add x8, sp, #24 | 
|  | ; CHECK-SD-NEXT:    mov v0.b[2], w2 | 
|  | ; CHECK-SD-NEXT:    ld1 { v1.b }[3], [x9] | 
|  | ; CHECK-SD-NEXT:    add x9, sp, #96 | 
|  | ; CHECK-SD-NEXT:    ld1 { v2.b }[3], [x8] | 
|  | ; CHECK-SD-NEXT:    add x8, sp, #32 | 
|  | ; CHECK-SD-NEXT:    mov v0.b[3], w3 | 
|  | ; CHECK-SD-NEXT:    ld1 { v1.b }[4], [x9] | 
|  | ; CHECK-SD-NEXT:    add x9, sp, #104 | 
|  | ; CHECK-SD-NEXT:    ld1 { v2.b }[4], [x8] | 
|  | ; CHECK-SD-NEXT:    add x8, sp, #40 | 
|  | ; CHECK-SD-NEXT:    ld1 { v1.b }[5], [x9] | 
|  | ; CHECK-SD-NEXT:    add x9, sp, #112 | 
|  | ; CHECK-SD-NEXT:    mov v0.b[4], w4 | 
|  | ; CHECK-SD-NEXT:    ld1 { v2.b }[5], [x8] | 
|  | ; CHECK-SD-NEXT:    add x8, sp, #48 | 
|  | ; CHECK-SD-NEXT:    ld1 { v1.b }[6], [x9] | 
|  | ; CHECK-SD-NEXT:    add x9, sp, #120 | 
|  | ; CHECK-SD-NEXT:    ld1 { v2.b }[6], [x8] | 
|  | ; CHECK-SD-NEXT:    add x8, sp, #56 | 
|  | ; CHECK-SD-NEXT:    mov v0.b[5], w5 | 
|  | ; CHECK-SD-NEXT:    ld1 { v1.b }[7], [x9] | 
|  | ; CHECK-SD-NEXT:    ld1 { v2.b }[7], [x8] | 
|  | ; CHECK-SD-NEXT:    mov v0.b[6], w6 | 
|  | ; CHECK-SD-NEXT:    mov v0.b[7], w7 | 
|  | ; CHECK-SD-NEXT:    saddl v0.8h, v0.8b, v1.8b | 
|  | ; CHECK-SD-NEXT:    saddw v0.8h, v0.8h, v2.8b | 
|  | ; CHECK-SD-NEXT:    addv h0, v0.8h | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v24i8_v24i16_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    fmov s0, w0 | 
|  | ; CHECK-GI-NEXT:    ldr w8, [sp] | 
|  | ; CHECK-GI-NEXT:    ldr w9, [sp, #8] | 
|  | ; CHECK-GI-NEXT:    ldr w10, [sp, #72] | 
|  | ; CHECK-GI-NEXT:    mov v0.b[1], w1 | 
|  | ; CHECK-GI-NEXT:    mov v0.b[2], w2 | 
|  | ; CHECK-GI-NEXT:    mov v0.b[3], w3 | 
|  | ; CHECK-GI-NEXT:    mov v0.b[4], w4 | 
|  | ; CHECK-GI-NEXT:    mov v0.b[5], w5 | 
|  | ; CHECK-GI-NEXT:    mov v0.b[6], w6 | 
|  | ; CHECK-GI-NEXT:    mov v0.b[7], w7 | 
|  | ; CHECK-GI-NEXT:    mov v0.b[8], w8 | 
|  | ; CHECK-GI-NEXT:    ldr w8, [sp, #64] | 
|  | ; CHECK-GI-NEXT:    fmov s1, w8 | 
|  | ; CHECK-GI-NEXT:    ldr w8, [sp, #16] | 
|  | ; CHECK-GI-NEXT:    mov v0.b[9], w9 | 
|  | ; CHECK-GI-NEXT:    ldr w9, [sp, #80] | 
|  | ; CHECK-GI-NEXT:    mov v1.b[1], w10 | 
|  | ; CHECK-GI-NEXT:    mov v0.b[10], w8 | 
|  | ; CHECK-GI-NEXT:    ldr w8, [sp, #24] | 
|  | ; CHECK-GI-NEXT:    mov v1.b[2], w9 | 
|  | ; CHECK-GI-NEXT:    ldr w9, [sp, #88] | 
|  | ; CHECK-GI-NEXT:    mov v0.b[11], w8 | 
|  | ; CHECK-GI-NEXT:    ldr w8, [sp, #32] | 
|  | ; CHECK-GI-NEXT:    mov v1.b[3], w9 | 
|  | ; CHECK-GI-NEXT:    ldr w9, [sp, #96] | 
|  | ; CHECK-GI-NEXT:    mov v0.b[12], w8 | 
|  | ; CHECK-GI-NEXT:    ldr w8, [sp, #40] | 
|  | ; CHECK-GI-NEXT:    mov v1.b[4], w9 | 
|  | ; CHECK-GI-NEXT:    ldr w9, [sp, #104] | 
|  | ; CHECK-GI-NEXT:    mov v0.b[13], w8 | 
|  | ; CHECK-GI-NEXT:    ldr w8, [sp, #48] | 
|  | ; CHECK-GI-NEXT:    mov v1.b[5], w9 | 
|  | ; CHECK-GI-NEXT:    ldr w9, [sp, #112] | 
|  | ; CHECK-GI-NEXT:    mov v0.b[14], w8 | 
|  | ; CHECK-GI-NEXT:    ldr w8, [sp, #56] | 
|  | ; CHECK-GI-NEXT:    mov v1.b[6], w9 | 
|  | ; CHECK-GI-NEXT:    ldr w9, [sp, #120] | 
|  | ; CHECK-GI-NEXT:    mov v0.b[15], w8 | 
|  | ; CHECK-GI-NEXT:    mov v1.b[7], w9 | 
|  | ; CHECK-GI-NEXT:    saddlv h0, v0.16b | 
|  | ; CHECK-GI-NEXT:    saddlv h1, v1.8b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-NEXT:    add w0, w8, w9 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <24 x i8> %x to <24 x i16> | 
|  | %z = call i16 @llvm.vector.reduce.add.v24i16(<24 x i16> %xx) | 
|  | ret i16 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends a <32 x i8> argument to <32 x i16> and add-reduces it to i16. | 
|  | ; Signed counterpart of add_v32i8_v32i16_zext: the expected code uses | 
|  | ; saddl2/saddl (SD) or saddlv (GI) on v0 and v1. | 
|  | define i16 @add_v32i8_v32i16_sext(<32 x i8> %x) { | 
|  | ; CHECK-SD-LABEL: add_v32i8_v32i16_sext: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    saddl2 v2.8h, v0.16b, v1.16b | 
|  | ; CHECK-SD-NEXT:    saddl v0.8h, v0.8b, v1.8b | 
|  | ; CHECK-SD-NEXT:    add v0.8h, v0.8h, v2.8h | 
|  | ; CHECK-SD-NEXT:    addv h0, v0.8h | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: add_v32i8_v32i16_sext: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    saddlv h0, v0.16b | 
|  | ; CHECK-GI-NEXT:    saddlv h1, v1.16b | 
|  | ; CHECK-GI-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-NEXT:    add w0, w8, w9 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <32 x i8> %x to <32 x i16> | 
|  | %z = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %xx) | 
|  | ret i16 %z | 
|  | } | 
|  |  | 
|  | ; Irregularly sized vectors and larger extends | 
|  | ; Zero-extends a <24 x i8> argument to <24 x i32> and add-reduces it to i32. | 
|  | ; Output differs between the runs with and without -mattr=+dotprod, so each | 
|  | ; selector has separate BASE and DOT expectations. The DOT expectations use | 
|  | ; udot against a vector of byte ones (movi #1) with a zeroed accumulator. | 
|  | define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) { | 
|  | ; CHECK-SD-BASE-LABEL: add_v24i8_v24i32_zext: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    fmov s0, w0 | 
|  | ; CHECK-SD-BASE-NEXT:    ldr b1, [sp, #64] | 
|  | ; CHECK-SD-BASE-NEXT:    add x8, sp, #72 | 
|  | ; CHECK-SD-BASE-NEXT:    ldr b2, [sp] | 
|  | ; CHECK-SD-BASE-NEXT:    add x9, sp, #80 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[1], [x8] | 
|  | ; CHECK-SD-BASE-NEXT:    add x8, sp, #8 | 
|  | ; CHECK-SD-BASE-NEXT:    mov v0.b[1], w1 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[1], [x8] | 
|  | ; CHECK-SD-BASE-NEXT:    add x8, sp, #16 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[2], [x9] | 
|  | ; CHECK-SD-BASE-NEXT:    add x9, sp, #88 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[2], [x8] | 
|  | ; CHECK-SD-BASE-NEXT:    add x8, sp, #24 | 
|  | ; CHECK-SD-BASE-NEXT:    mov v0.b[2], w2 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[3], [x9] | 
|  | ; CHECK-SD-BASE-NEXT:    add x9, sp, #96 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[3], [x8] | 
|  | ; CHECK-SD-BASE-NEXT:    add x8, sp, #32 | 
|  | ; CHECK-SD-BASE-NEXT:    mov v0.b[3], w3 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[4], [x9] | 
|  | ; CHECK-SD-BASE-NEXT:    add x9, sp, #104 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[4], [x8] | 
|  | ; CHECK-SD-BASE-NEXT:    add x8, sp, #40 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[5], [x9] | 
|  | ; CHECK-SD-BASE-NEXT:    add x9, sp, #112 | 
|  | ; CHECK-SD-BASE-NEXT:    mov v0.b[4], w4 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[5], [x8] | 
|  | ; CHECK-SD-BASE-NEXT:    add x8, sp, #48 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[6], [x9] | 
|  | ; CHECK-SD-BASE-NEXT:    add x9, sp, #120 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[6], [x8] | 
|  | ; CHECK-SD-BASE-NEXT:    add x8, sp, #56 | 
|  | ; CHECK-SD-BASE-NEXT:    mov v0.b[5], w5 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[7], [x9] | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[7], [x8] | 
|  | ; CHECK-SD-BASE-NEXT:    mov v0.b[6], w6 | 
|  | ; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    ushll v2.8h, v2.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    mov v0.b[7], w7 | 
|  | ; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    uaddl2 v3.4s, v0.8h, v1.8h | 
|  | ; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h | 
|  | ; CHECK-SD-BASE-NEXT:    uaddw2 v1.4s, v3.4s, v2.8h | 
|  | ; CHECK-SD-BASE-NEXT:    uaddw v0.4s, v0.4s, v2.4h | 
|  | ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s | 
|  | ; CHECK-SD-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: add_v24i8_v24i32_zext: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    fmov s0, w0 | 
|  | ; CHECK-SD-DOT-NEXT:    mov x8, sp | 
|  | ; CHECK-SD-DOT-NEXT:    ldr b1, [sp, #64] | 
|  | ; CHECK-SD-DOT-NEXT:    add x9, sp, #72 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v2.16b, #1 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v3.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[1], [x9] | 
|  | ; CHECK-SD-DOT-NEXT:    add x9, sp, #80 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v4.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    mov v0.b[1], w1 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v5.8b, #1 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[2], [x9] | 
|  | ; CHECK-SD-DOT-NEXT:    add x9, sp, #88 | 
|  | ; CHECK-SD-DOT-NEXT:    mov v0.b[2], w2 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[3], [x9] | 
|  | ; CHECK-SD-DOT-NEXT:    add x9, sp, #96 | 
|  | ; CHECK-SD-DOT-NEXT:    mov v0.b[3], w3 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[4], [x9] | 
|  | ; CHECK-SD-DOT-NEXT:    add x9, sp, #104 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[5], [x9] | 
|  | ; CHECK-SD-DOT-NEXT:    add x9, sp, #112 | 
|  | ; CHECK-SD-DOT-NEXT:    mov v0.b[4], w4 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[6], [x9] | 
|  | ; CHECK-SD-DOT-NEXT:    add x9, sp, #120 | 
|  | ; CHECK-SD-DOT-NEXT:    mov v0.b[5], w5 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[7], [x9] | 
|  | ; CHECK-SD-DOT-NEXT:    mov v0.b[6], w6 | 
|  | ; CHECK-SD-DOT-NEXT:    udot v4.2s, v1.8b, v5.8b | 
|  | ; CHECK-SD-DOT-NEXT:    mov v0.b[7], w7 | 
|  | ; CHECK-SD-DOT-NEXT:    addp v1.2s, v4.2s, v4.2s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w9, s1 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[8], [x8] | 
|  | ; CHECK-SD-DOT-NEXT:    add x8, sp, #8 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[9], [x8] | 
|  | ; CHECK-SD-DOT-NEXT:    add x8, sp, #16 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[10], [x8] | 
|  | ; CHECK-SD-DOT-NEXT:    add x8, sp, #24 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[11], [x8] | 
|  | ; CHECK-SD-DOT-NEXT:    add x8, sp, #32 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[12], [x8] | 
|  | ; CHECK-SD-DOT-NEXT:    add x8, sp, #40 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[13], [x8] | 
|  | ; CHECK-SD-DOT-NEXT:    add x8, sp, #48 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[14], [x8] | 
|  | ; CHECK-SD-DOT-NEXT:    add x8, sp, #56 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[15], [x8] | 
|  | ; CHECK-SD-DOT-NEXT:    udot v3.4s, v0.16b, v2.16b | 
|  | ; CHECK-SD-DOT-NEXT:    addv s0, v3.4s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w8, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    add w0, w8, w9 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_zext: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    fmov s0, w0 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w8, [sp] | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #8] | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w10, [sp, #72] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[1], w1 | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[2], w2 | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[3], w3 | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[4], w4 | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[5], w5 | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[6], w6 | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[7], w7 | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[8], w8 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #64] | 
|  | ; CHECK-GI-BASE-NEXT:    fmov s1, w8 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #16] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[9], w9 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #80] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v1.b[1], w10 | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[10], w8 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #24] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v1.b[2], w9 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #88] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[11], w8 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #32] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v1.b[3], w9 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #96] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[12], w8 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #40] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v1.b[4], w9 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #104] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[13], w8 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #48] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v1.b[5], w9 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #112] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[14], w8 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #56] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v1.b[6], w9 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #120] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[15], w8 | 
|  | ; CHECK-GI-BASE-NEXT:    mov v1.b[7], w9 | 
|  | ; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.16b | 
|  | ; CHECK-GI-BASE-NEXT:    uaddlv h1, v1.8b | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-BASE-NEXT:    add w8, w8, w9 | 
|  | ; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_zext: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    fmov s0, w0 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #64] | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w8, [sp] | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w10, [sp, #72] | 
|  | ; CHECK-GI-DOT-NEXT:    movi v2.8b, #1 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v3.8b, #1 | 
|  | ; CHECK-GI-DOT-NEXT:    fmov s1, w9 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #80] | 
|  | ; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[1], w1 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    mov v1.b[1], w10 | 
|  | ; CHECK-GI-DOT-NEXT:    mov v3.d[1], v2.d[0] | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[2], w2 | 
|  | ; CHECK-GI-DOT-NEXT:    mov v1.b[2], w9 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #88] | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[3], w3 | 
|  | ; CHECK-GI-DOT-NEXT:    mov v1.b[3], w9 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #96] | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[4], w4 | 
|  | ; CHECK-GI-DOT-NEXT:    mov v1.b[4], w9 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #104] | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[5], w5 | 
|  | ; CHECK-GI-DOT-NEXT:    mov v1.b[5], w9 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #112] | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[6], w6 | 
|  | ; CHECK-GI-DOT-NEXT:    mov v1.b[6], w9 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #120] | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[7], w7 | 
|  | ; CHECK-GI-DOT-NEXT:    mov v1.b[7], w9 | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[8], w8 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #8] | 
|  | ; CHECK-GI-DOT-NEXT:    fmov d1, d1 | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[9], w8 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #16] | 
|  | ; CHECK-GI-DOT-NEXT:    udot v4.4s, v1.16b, v2.16b | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[10], w8 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #24] | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[11], w8 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #32] | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[12], w8 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #40] | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[13], w8 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #48] | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[14], w8 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #56] | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[15], w8 | 
|  | ; CHECK-GI-DOT-NEXT:    udot v5.4s, v0.16b, v3.16b | 
|  | ; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v4.4s | 
|  | ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <24 x i8> %x to <24 x i32> | 
|  | %z = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %xx) | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Zero-extends a <32 x i8> argument to <32 x i32> and add-reduces it to i32. | 
|  | ; Each selector has separate BASE and DOT (-mattr=+dotprod) expectations. In | 
|  | ; the DOT expectations both input registers v0 and v1 go through udot against | 
|  | ; a vector of byte ones before the final addv. | 
|  | define i32 @add_v32i8_v32i32_zext(<32 x i8> %x) { | 
|  | ; CHECK-SD-BASE-LABEL: add_v32i8_v32i32_zext: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    ushll2 v2.8h, v1.16b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    ushll2 v3.8h, v0.16b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    uaddl2 v4.4s, v3.8h, v2.8h | 
|  | ; CHECK-SD-BASE-NEXT:    uaddl v2.4s, v3.4h, v2.4h | 
|  | ; CHECK-SD-BASE-NEXT:    uaddl2 v5.4s, v0.8h, v1.8h | 
|  | ; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h | 
|  | ; CHECK-SD-BASE-NEXT:    add v1.4s, v5.4s, v4.4s | 
|  | ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s | 
|  | ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s | 
|  | ; CHECK-SD-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: add_v32i8_v32i32_zext: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    movi v2.16b, #1 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v3.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    udot v3.4s, v1.16b, v2.16b | 
|  | ; CHECK-SD-DOT-NEXT:    udot v3.4s, v0.16b, v2.16b | 
|  | ; CHECK-SD-DOT-NEXT:    addv s0, v3.4s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: add_v32i8_v32i32_zext: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.16b | 
|  | ; CHECK-GI-BASE-NEXT:    uaddlv h1, v1.16b | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-BASE-NEXT:    add w8, w8, w9 | 
|  | ; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: add_v32i8_v32i32_zext: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    movi v2.16b, #1 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    udot v4.4s, v0.16b, v2.16b | 
|  | ; CHECK-GI-DOT-NEXT:    udot v3.4s, v1.16b, v2.16b | 
|  | ; CHECK-GI-DOT-NEXT:    add v0.4s, v4.4s, v3.4s | 
|  | ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %xx = zext <32 x i8> %x to <32 x i32> | 
|  | %z = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %xx) | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) { | 
|  | ; CHECK-SD-BASE-LABEL: add_v24i8_v24i32_sext: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    fmov s0, w0 | 
|  | ; CHECK-SD-BASE-NEXT:    ldr b1, [sp, #64] | 
|  | ; CHECK-SD-BASE-NEXT:    add x8, sp, #72 | 
|  | ; CHECK-SD-BASE-NEXT:    ldr b2, [sp] | 
|  | ; CHECK-SD-BASE-NEXT:    add x9, sp, #80 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[1], [x8] | 
|  | ; CHECK-SD-BASE-NEXT:    add x8, sp, #8 | 
|  | ; CHECK-SD-BASE-NEXT:    mov v0.b[1], w1 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[1], [x8] | 
|  | ; CHECK-SD-BASE-NEXT:    add x8, sp, #16 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[2], [x9] | 
|  | ; CHECK-SD-BASE-NEXT:    add x9, sp, #88 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[2], [x8] | 
|  | ; CHECK-SD-BASE-NEXT:    add x8, sp, #24 | 
|  | ; CHECK-SD-BASE-NEXT:    mov v0.b[2], w2 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[3], [x9] | 
|  | ; CHECK-SD-BASE-NEXT:    add x9, sp, #96 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[3], [x8] | 
|  | ; CHECK-SD-BASE-NEXT:    add x8, sp, #32 | 
|  | ; CHECK-SD-BASE-NEXT:    mov v0.b[3], w3 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[4], [x9] | 
|  | ; CHECK-SD-BASE-NEXT:    add x9, sp, #104 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[4], [x8] | 
|  | ; CHECK-SD-BASE-NEXT:    add x8, sp, #40 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[5], [x9] | 
|  | ; CHECK-SD-BASE-NEXT:    add x9, sp, #112 | 
|  | ; CHECK-SD-BASE-NEXT:    mov v0.b[4], w4 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[5], [x8] | 
|  | ; CHECK-SD-BASE-NEXT:    add x8, sp, #48 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[6], [x9] | 
|  | ; CHECK-SD-BASE-NEXT:    add x9, sp, #120 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[6], [x8] | 
|  | ; CHECK-SD-BASE-NEXT:    add x8, sp, #56 | 
|  | ; CHECK-SD-BASE-NEXT:    mov v0.b[5], w5 | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[7], [x9] | 
|  | ; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[7], [x8] | 
|  | ; CHECK-SD-BASE-NEXT:    mov v0.b[6], w6 | 
|  | ; CHECK-SD-BASE-NEXT:    sshll v1.8h, v1.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    sshll v2.8h, v2.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    mov v0.b[7], w7 | 
|  | ; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    saddl2 v3.4s, v0.8h, v1.8h | 
|  | ; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v1.4h | 
|  | ; CHECK-SD-BASE-NEXT:    saddw2 v1.4s, v3.4s, v2.8h | 
|  | ; CHECK-SD-BASE-NEXT:    saddw v0.4s, v0.4s, v2.4h | 
|  | ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s | 
|  | ; CHECK-SD-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: add_v24i8_v24i32_sext: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    fmov s0, w0 | 
|  | ; CHECK-SD-DOT-NEXT:    mov x8, sp | 
|  | ; CHECK-SD-DOT-NEXT:    ldr b1, [sp, #64] | 
|  | ; CHECK-SD-DOT-NEXT:    add x9, sp, #72 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v2.16b, #1 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v3.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[1], [x9] | 
|  | ; CHECK-SD-DOT-NEXT:    add x9, sp, #80 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v4.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    mov v0.b[1], w1 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v5.8b, #1 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[2], [x9] | 
|  | ; CHECK-SD-DOT-NEXT:    add x9, sp, #88 | 
|  | ; CHECK-SD-DOT-NEXT:    mov v0.b[2], w2 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[3], [x9] | 
|  | ; CHECK-SD-DOT-NEXT:    add x9, sp, #96 | 
|  | ; CHECK-SD-DOT-NEXT:    mov v0.b[3], w3 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[4], [x9] | 
|  | ; CHECK-SD-DOT-NEXT:    add x9, sp, #104 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[5], [x9] | 
|  | ; CHECK-SD-DOT-NEXT:    add x9, sp, #112 | 
|  | ; CHECK-SD-DOT-NEXT:    mov v0.b[4], w4 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[6], [x9] | 
|  | ; CHECK-SD-DOT-NEXT:    add x9, sp, #120 | 
|  | ; CHECK-SD-DOT-NEXT:    mov v0.b[5], w5 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[7], [x9] | 
|  | ; CHECK-SD-DOT-NEXT:    mov v0.b[6], w6 | 
|  | ; CHECK-SD-DOT-NEXT:    sdot v4.2s, v1.8b, v5.8b | 
|  | ; CHECK-SD-DOT-NEXT:    mov v0.b[7], w7 | 
|  | ; CHECK-SD-DOT-NEXT:    addp v1.2s, v4.2s, v4.2s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w9, s1 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[8], [x8] | 
|  | ; CHECK-SD-DOT-NEXT:    add x8, sp, #8 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[9], [x8] | 
|  | ; CHECK-SD-DOT-NEXT:    add x8, sp, #16 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[10], [x8] | 
|  | ; CHECK-SD-DOT-NEXT:    add x8, sp, #24 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[11], [x8] | 
|  | ; CHECK-SD-DOT-NEXT:    add x8, sp, #32 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[12], [x8] | 
|  | ; CHECK-SD-DOT-NEXT:    add x8, sp, #40 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[13], [x8] | 
|  | ; CHECK-SD-DOT-NEXT:    add x8, sp, #48 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[14], [x8] | 
|  | ; CHECK-SD-DOT-NEXT:    add x8, sp, #56 | 
|  | ; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[15], [x8] | 
|  | ; CHECK-SD-DOT-NEXT:    sdot v3.4s, v0.16b, v2.16b | 
|  | ; CHECK-SD-DOT-NEXT:    addv s0, v3.4s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w8, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    add w0, w8, w9 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_sext: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    fmov s0, w0 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w8, [sp] | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #8] | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w10, [sp, #72] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[1], w1 | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[2], w2 | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[3], w3 | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[4], w4 | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[5], w5 | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[6], w6 | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[7], w7 | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[8], w8 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #64] | 
|  | ; CHECK-GI-BASE-NEXT:    fmov s1, w8 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #16] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[9], w9 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #80] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v1.b[1], w10 | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[10], w8 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #24] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v1.b[2], w9 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #88] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[11], w8 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #32] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v1.b[3], w9 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #96] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[12], w8 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #40] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v1.b[4], w9 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #104] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[13], w8 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #48] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v1.b[5], w9 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #112] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[14], w8 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #56] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v1.b[6], w9 | 
|  | ; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #120] | 
|  | ; CHECK-GI-BASE-NEXT:    mov v0.b[15], w8 | 
|  | ; CHECK-GI-BASE-NEXT:    mov v1.b[7], w9 | 
|  | ; CHECK-GI-BASE-NEXT:    saddlv h0, v0.16b | 
|  | ; CHECK-GI-BASE-NEXT:    saddlv h1, v1.8b | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-BASE-NEXT:    add w8, w8, w9 | 
|  | ; CHECK-GI-BASE-NEXT:    sxth w0, w8 | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_sext: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    fmov s0, w0 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #64] | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w8, [sp] | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w10, [sp, #72] | 
|  | ; CHECK-GI-DOT-NEXT:    movi v2.8b, #1 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v3.8b, #1 | 
|  | ; CHECK-GI-DOT-NEXT:    fmov s1, w9 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #80] | 
|  | ; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[1], w1 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    mov v1.b[1], w10 | 
|  | ; CHECK-GI-DOT-NEXT:    mov v3.d[1], v2.d[0] | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[2], w2 | 
|  | ; CHECK-GI-DOT-NEXT:    mov v1.b[2], w9 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #88] | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[3], w3 | 
|  | ; CHECK-GI-DOT-NEXT:    mov v1.b[3], w9 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #96] | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[4], w4 | 
|  | ; CHECK-GI-DOT-NEXT:    mov v1.b[4], w9 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #104] | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[5], w5 | 
|  | ; CHECK-GI-DOT-NEXT:    mov v1.b[5], w9 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #112] | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[6], w6 | 
|  | ; CHECK-GI-DOT-NEXT:    mov v1.b[6], w9 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #120] | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[7], w7 | 
|  | ; CHECK-GI-DOT-NEXT:    mov v1.b[7], w9 | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[8], w8 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #8] | 
|  | ; CHECK-GI-DOT-NEXT:    fmov d1, d1 | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[9], w8 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #16] | 
|  | ; CHECK-GI-DOT-NEXT:    sdot v4.4s, v1.16b, v2.16b | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[10], w8 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #24] | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[11], w8 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #32] | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[12], w8 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #40] | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[13], w8 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #48] | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[14], w8 | 
|  | ; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #56] | 
|  | ; CHECK-GI-DOT-NEXT:    mov v0.b[15], w8 | 
|  | ; CHECK-GI-DOT-NEXT:    sdot v5.4s, v0.16b, v3.16b | 
|  | ; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v4.4s | 
|  | ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <24 x i8> %x to <24 x i32> | 
|  | %z = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %xx) | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Sign-extends a <32 x i8> argument to <32 x i32> and add-reduces it to a scalar i32. | 
|  | ; Expected lowering per configuration, as pinned by the checks below - | 
|  | ;   SelectionDAG, no dotprod - sshll/sshll2 widening followed by saddl/saddl2, add and addv. | 
|  | ;   SelectionDAG, +dotprod   - two sdot against a splat of 1 into one shared accumulator. | 
|  | ;   GlobalISel, no dotprod   - one saddlv per 16-byte half, scalar add, then sxth. | 
|  | ;   GlobalISel, +dotprod     - two sdot into separate zeroed accumulators that are added before addv. | 
|  | define i32 @add_v32i8_v32i32_sext(<32 x i8> %x) { | 
|  | ; CHECK-SD-BASE-LABEL: add_v32i8_v32i32_sext: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    sshll2 v2.8h, v1.16b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    sshll2 v3.8h, v0.16b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    sshll v1.8h, v1.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0 | 
|  | ; CHECK-SD-BASE-NEXT:    saddl2 v4.4s, v3.8h, v2.8h | 
|  | ; CHECK-SD-BASE-NEXT:    saddl v2.4s, v3.4h, v2.4h | 
|  | ; CHECK-SD-BASE-NEXT:    saddl2 v5.4s, v0.8h, v1.8h | 
|  | ; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v1.4h | 
|  | ; CHECK-SD-BASE-NEXT:    add v1.4s, v5.4s, v4.4s | 
|  | ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s | 
|  | ; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s | 
|  | ; CHECK-SD-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: add_v32i8_v32i32_sext: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    movi v2.16b, #1 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v3.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    sdot v3.4s, v1.16b, v2.16b | 
|  | ; CHECK-SD-DOT-NEXT:    sdot v3.4s, v0.16b, v2.16b | 
|  | ; CHECK-SD-DOT-NEXT:    addv s0, v3.4s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-BASE-LABEL: add_v32i8_v32i32_sext: | 
|  | ; CHECK-GI-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-GI-BASE-NEXT:    saddlv h0, v0.16b | 
|  | ; CHECK-GI-BASE-NEXT:    saddlv h1, v1.16b | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w8, s0 | 
|  | ; CHECK-GI-BASE-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-BASE-NEXT:    add w8, w8, w9 | 
|  | ; CHECK-GI-BASE-NEXT:    sxth w0, w8 | 
|  | ; CHECK-GI-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-DOT-LABEL: add_v32i8_v32i32_sext: | 
|  | ; CHECK-GI-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-GI-DOT-NEXT:    movi v2.16b, #1 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000 | 
|  | ; CHECK-GI-DOT-NEXT:    sdot v4.4s, v0.16b, v2.16b | 
|  | ; CHECK-GI-DOT-NEXT:    sdot v3.4s, v1.16b, v2.16b | 
|  | ; CHECK-GI-DOT-NEXT:    add v0.4s, v4.4s, v3.4s | 
|  | ; CHECK-GI-DOT-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-GI-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-GI-DOT-NEXT:    ret | 
|  | entry: | 
|  | %xx = sext <32 x i8> %x to <32 x i32> | 
|  | %z = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %xx) | 
|  | ret i32 %z | 
|  | } | 
|  |  | 
|  | ; Sum of absolute differences over eight rows of eight bytes each. | 
|  | ; Every row is loaded from %p1 and %p2 as <8 x i8>, zero-extended to <8 x i32>, | 
|  | ; subtracted, passed through llvm.abs and add-reduced to a scalar; the eight row | 
|  | ; sums are then chained together with scalar adds to form the return value. | 
|  | ; %s1 and %s2 are the byte offsets applied to %p1 and %p2 between consecutive rows. | 
|  | ; Expected lowering, as pinned by the checks below - | 
|  | ;   SelectionDAG, no dotprod - uabdl per row, accumulated with uaddlp/uadalp, one final addv. | 
|  | ;   SelectionDAG, +dotprod   - uabd per row, accumulated with udot against a splat of 1, final addp. | 
|  | ;   GlobalISel (both runs share one prefix) - usubl/sshll/abs per row and a separate addv per row. | 
|  | define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) { | 
|  | ; CHECK-SD-BASE-LABEL: full: | 
|  | ; CHECK-SD-BASE:       // %bb.0: // %entry | 
|  | ; CHECK-SD-BASE-NEXT:    // kill: def $w3 killed $w3 def $x3 | 
|  | ; CHECK-SD-BASE-NEXT:    // kill: def $w1 killed $w1 def $x1 | 
|  | ; CHECK-SD-BASE-NEXT:    sxtw x8, w3 | 
|  | ; CHECK-SD-BASE-NEXT:    sxtw x9, w1 | 
|  | ; CHECK-SD-BASE-NEXT:    ldr d0, [x0] | 
|  | ; CHECK-SD-BASE-NEXT:    ldr d1, [x2] | 
|  | ; CHECK-SD-BASE-NEXT:    add x10, x0, x9 | 
|  | ; CHECK-SD-BASE-NEXT:    add x11, x2, x8 | 
|  | ; CHECK-SD-BASE-NEXT:    uabdl v0.8h, v0.8b, v1.8b | 
|  | ; CHECK-SD-BASE-NEXT:    ldr d1, [x10] | 
|  | ; CHECK-SD-BASE-NEXT:    ldr d2, [x11] | 
|  | ; CHECK-SD-BASE-NEXT:    add x10, x10, x9 | 
|  | ; CHECK-SD-BASE-NEXT:    add x11, x11, x8 | 
|  | ; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b | 
|  | ; CHECK-SD-BASE-NEXT:    ldr d2, [x11] | 
|  | ; CHECK-SD-BASE-NEXT:    add x11, x11, x8 | 
|  | ; CHECK-SD-BASE-NEXT:    uaddlp v0.4s, v0.8h | 
|  | ; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h | 
|  | ; CHECK-SD-BASE-NEXT:    ldr d1, [x10] | 
|  | ; CHECK-SD-BASE-NEXT:    add x10, x10, x9 | 
|  | ; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b | 
|  | ; CHECK-SD-BASE-NEXT:    ldr d2, [x11] | 
|  | ; CHECK-SD-BASE-NEXT:    add x11, x11, x8 | 
|  | ; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h | 
|  | ; CHECK-SD-BASE-NEXT:    ldr d1, [x10] | 
|  | ; CHECK-SD-BASE-NEXT:    add x10, x10, x9 | 
|  | ; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b | 
|  | ; CHECK-SD-BASE-NEXT:    ldr d2, [x11] | 
|  | ; CHECK-SD-BASE-NEXT:    add x11, x11, x8 | 
|  | ; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h | 
|  | ; CHECK-SD-BASE-NEXT:    ldr d1, [x10] | 
|  | ; CHECK-SD-BASE-NEXT:    add x10, x10, x9 | 
|  | ; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b | 
|  | ; CHECK-SD-BASE-NEXT:    ldr d2, [x11] | 
|  | ; CHECK-SD-BASE-NEXT:    add x11, x11, x8 | 
|  | ; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h | 
|  | ; CHECK-SD-BASE-NEXT:    ldr d1, [x10] | 
|  | ; CHECK-SD-BASE-NEXT:    add x10, x10, x9 | 
|  | ; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b | 
|  | ; CHECK-SD-BASE-NEXT:    ldr d2, [x11] | 
|  | ; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h | 
|  | ; CHECK-SD-BASE-NEXT:    ldr d1, [x10] | 
|  | ; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b | 
|  | ; CHECK-SD-BASE-NEXT:    ldr d2, [x11, x8] | 
|  | ; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h | 
|  | ; CHECK-SD-BASE-NEXT:    ldr d1, [x10, x9] | 
|  | ; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b | 
|  | ; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h | 
|  | ; CHECK-SD-BASE-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-SD-BASE-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-BASE-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-SD-DOT-LABEL: full: | 
|  | ; CHECK-SD-DOT:       // %bb.0: // %entry | 
|  | ; CHECK-SD-DOT-NEXT:    ldr d0, [x0] | 
|  | ; CHECK-SD-DOT-NEXT:    ldr d1, [x2] | 
|  | ; CHECK-SD-DOT-NEXT:    // kill: def $w3 killed $w3 def $x3 | 
|  | ; CHECK-SD-DOT-NEXT:    // kill: def $w1 killed $w1 def $x1 | 
|  | ; CHECK-SD-DOT-NEXT:    sxtw x8, w3 | 
|  | ; CHECK-SD-DOT-NEXT:    sxtw x9, w1 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000 | 
|  | ; CHECK-SD-DOT-NEXT:    movi v3.8b, #1 | 
|  | ; CHECK-SD-DOT-NEXT:    uabd v0.8b, v0.8b, v1.8b | 
|  | ; CHECK-SD-DOT-NEXT:    add x11, x2, x8 | 
|  | ; CHECK-SD-DOT-NEXT:    add x10, x0, x9 | 
|  | ; CHECK-SD-DOT-NEXT:    ldr d4, [x11] | 
|  | ; CHECK-SD-DOT-NEXT:    add x11, x11, x8 | 
|  | ; CHECK-SD-DOT-NEXT:    ldr d1, [x10] | 
|  | ; CHECK-SD-DOT-NEXT:    add x10, x10, x9 | 
|  | ; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b | 
|  | ; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b | 
|  | ; CHECK-SD-DOT-NEXT:    ldr d1, [x10] | 
|  | ; CHECK-SD-DOT-NEXT:    ldr d4, [x11] | 
|  | ; CHECK-SD-DOT-NEXT:    add x10, x10, x9 | 
|  | ; CHECK-SD-DOT-NEXT:    add x11, x11, x8 | 
|  | ; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b | 
|  | ; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b | 
|  | ; CHECK-SD-DOT-NEXT:    ldr d1, [x10] | 
|  | ; CHECK-SD-DOT-NEXT:    ldr d4, [x11] | 
|  | ; CHECK-SD-DOT-NEXT:    add x10, x10, x9 | 
|  | ; CHECK-SD-DOT-NEXT:    add x11, x11, x8 | 
|  | ; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b | 
|  | ; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b | 
|  | ; CHECK-SD-DOT-NEXT:    ldr d1, [x10] | 
|  | ; CHECK-SD-DOT-NEXT:    ldr d4, [x11] | 
|  | ; CHECK-SD-DOT-NEXT:    add x10, x10, x9 | 
|  | ; CHECK-SD-DOT-NEXT:    add x11, x11, x8 | 
|  | ; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b | 
|  | ; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b | 
|  | ; CHECK-SD-DOT-NEXT:    ldr d1, [x10] | 
|  | ; CHECK-SD-DOT-NEXT:    ldr d4, [x11] | 
|  | ; CHECK-SD-DOT-NEXT:    add x10, x10, x9 | 
|  | ; CHECK-SD-DOT-NEXT:    add x11, x11, x8 | 
|  | ; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b | 
|  | ; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b | 
|  | ; CHECK-SD-DOT-NEXT:    ldr d1, [x10] | 
|  | ; CHECK-SD-DOT-NEXT:    ldr d4, [x11] | 
|  | ; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b | 
|  | ; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b | 
|  | ; CHECK-SD-DOT-NEXT:    ldr d1, [x10, x9] | 
|  | ; CHECK-SD-DOT-NEXT:    ldr d4, [x11, x8] | 
|  | ; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b | 
|  | ; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b | 
|  | ; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b | 
|  | ; CHECK-SD-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s | 
|  | ; CHECK-SD-DOT-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-DOT-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: full: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1 | 
|  | ; CHECK-GI-NEXT:    // kill: def $w3 killed $w3 def $x3 | 
|  | ; CHECK-GI-NEXT:    sxtw x9, w1 | 
|  | ; CHECK-GI-NEXT:    sxtw x8, w3 | 
|  | ; CHECK-GI-NEXT:    ldr d0, [x0] | 
|  | ; CHECK-GI-NEXT:    ldr d1, [x2] | 
|  | ; CHECK-GI-NEXT:    add x10, x0, x9 | 
|  | ; CHECK-GI-NEXT:    add x11, x2, x8 | 
|  | ; CHECK-GI-NEXT:    usubl v0.8h, v0.8b, v1.8b | 
|  | ; CHECK-GI-NEXT:    ldr d1, [x10] | 
|  | ; CHECK-GI-NEXT:    ldr d2, [x11] | 
|  | ; CHECK-GI-NEXT:    add x10, x10, x9 | 
|  | ; CHECK-GI-NEXT:    add x11, x11, x8 | 
|  | ; CHECK-GI-NEXT:    usubl v1.8h, v1.8b, v2.8b | 
|  | ; CHECK-GI-NEXT:    ldr d3, [x10] | 
|  | ; CHECK-GI-NEXT:    ldr d4, [x11] | 
|  | ; CHECK-GI-NEXT:    sshll v5.4s, v0.4h, #0 | 
|  | ; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0 | 
|  | ; CHECK-GI-NEXT:    add x10, x10, x9 | 
|  | ; CHECK-GI-NEXT:    add x11, x11, x8 | 
|  | ; CHECK-GI-NEXT:    ldr d2, [x10] | 
|  | ; CHECK-GI-NEXT:    add x10, x10, x9 | 
|  | ; CHECK-GI-NEXT:    sshll v7.4s, v1.4h, #0 | 
|  | ; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0 | 
|  | ; CHECK-GI-NEXT:    ldr d6, [x11] | 
|  | ; CHECK-GI-NEXT:    add x11, x11, x8 | 
|  | ; CHECK-GI-NEXT:    usubl v3.8h, v3.8b, v4.8b | 
|  | ; CHECK-GI-NEXT:    abs v5.4s, v5.4s | 
|  | ; CHECK-GI-NEXT:    abs v0.4s, v0.4s | 
|  | ; CHECK-GI-NEXT:    ldr d4, [x10] | 
|  | ; CHECK-GI-NEXT:    ldr d16, [x11] | 
|  | ; CHECK-GI-NEXT:    abs v7.4s, v7.4s | 
|  | ; CHECK-GI-NEXT:    abs v1.4s, v1.4s | 
|  | ; CHECK-GI-NEXT:    add x10, x10, x9 | 
|  | ; CHECK-GI-NEXT:    add x11, x11, x8 | 
|  | ; CHECK-GI-NEXT:    usubl v2.8h, v2.8b, v6.8b | 
|  | ; CHECK-GI-NEXT:    ldr d6, [x10] | 
|  | ; CHECK-GI-NEXT:    ldr d17, [x11] | 
|  | ; CHECK-GI-NEXT:    add x10, x10, x9 | 
|  | ; CHECK-GI-NEXT:    add x11, x11, x8 | 
|  | ; CHECK-GI-NEXT:    usubl v4.8h, v4.8b, v16.8b | 
|  | ; CHECK-GI-NEXT:    sshll v16.4s, v3.4h, #0 | 
|  | ; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0 | 
|  | ; CHECK-GI-NEXT:    add v0.4s, v5.4s, v0.4s | 
|  | ; CHECK-GI-NEXT:    add v1.4s, v7.4s, v1.4s | 
|  | ; CHECK-GI-NEXT:    ldr d5, [x10] | 
|  | ; CHECK-GI-NEXT:    ldr d7, [x11] | 
|  | ; CHECK-GI-NEXT:    sshll v18.4s, v2.4h, #0 | 
|  | ; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0 | 
|  | ; CHECK-GI-NEXT:    usubl v6.8h, v6.8b, v17.8b | 
|  | ; CHECK-GI-NEXT:    ldr d17, [x11, x8] | 
|  | ; CHECK-GI-NEXT:    sshll v19.4s, v4.4h, #0 | 
|  | ; CHECK-GI-NEXT:    usubl v5.8h, v5.8b, v7.8b | 
|  | ; CHECK-GI-NEXT:    ldr d7, [x10, x9] | 
|  | ; CHECK-GI-NEXT:    sshll2 v4.4s, v4.8h, #0 | 
|  | ; CHECK-GI-NEXT:    abs v16.4s, v16.4s | 
|  | ; CHECK-GI-NEXT:    abs v3.4s, v3.4s | 
|  | ; CHECK-GI-NEXT:    abs v18.4s, v18.4s | 
|  | ; CHECK-GI-NEXT:    abs v2.4s, v2.4s | 
|  | ; CHECK-GI-NEXT:    usubl v7.8h, v7.8b, v17.8b | 
|  | ; CHECK-GI-NEXT:    sshll v17.4s, v6.4h, #0 | 
|  | ; CHECK-GI-NEXT:    sshll2 v6.4s, v6.8h, #0 | 
|  | ; CHECK-GI-NEXT:    abs v19.4s, v19.4s | 
|  | ; CHECK-GI-NEXT:    abs v4.4s, v4.4s | 
|  | ; CHECK-GI-NEXT:    add v3.4s, v16.4s, v3.4s | 
|  | ; CHECK-GI-NEXT:    sshll v16.4s, v5.4h, #0 | 
|  | ; CHECK-GI-NEXT:    sshll2 v5.4s, v5.8h, #0 | 
|  | ; CHECK-GI-NEXT:    add v2.4s, v18.4s, v2.4s | 
|  | ; CHECK-GI-NEXT:    abs v17.4s, v17.4s | 
|  | ; CHECK-GI-NEXT:    addv s1, v1.4s | 
|  | ; CHECK-GI-NEXT:    abs v6.4s, v6.4s | 
|  | ; CHECK-GI-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-GI-NEXT:    add v4.4s, v19.4s, v4.4s | 
|  | ; CHECK-GI-NEXT:    addv s3, v3.4s | 
|  | ; CHECK-GI-NEXT:    sshll v18.4s, v7.4h, #0 | 
|  | ; CHECK-GI-NEXT:    sshll2 v7.4s, v7.8h, #0 | 
|  | ; CHECK-GI-NEXT:    abs v16.4s, v16.4s | 
|  | ; CHECK-GI-NEXT:    abs v5.4s, v5.4s | 
|  | ; CHECK-GI-NEXT:    fmov w8, s1 | 
|  | ; CHECK-GI-NEXT:    add v6.4s, v17.4s, v6.4s | 
|  | ; CHECK-GI-NEXT:    addv s2, v2.4s | 
|  | ; CHECK-GI-NEXT:    fmov w9, s0 | 
|  | ; CHECK-GI-NEXT:    addv s4, v4.4s | 
|  | ; CHECK-GI-NEXT:    fmov w10, s3 | 
|  | ; CHECK-GI-NEXT:    abs v18.4s, v18.4s | 
|  | ; CHECK-GI-NEXT:    abs v7.4s, v7.4s | 
|  | ; CHECK-GI-NEXT:    add v1.4s, v16.4s, v5.4s | 
|  | ; CHECK-GI-NEXT:    add w8, w8, w9 | 
|  | ; CHECK-GI-NEXT:    addv s3, v6.4s | 
|  | ; CHECK-GI-NEXT:    fmov w9, s2 | 
|  | ; CHECK-GI-NEXT:    add w8, w10, w8 | 
|  | ; CHECK-GI-NEXT:    fmov w10, s4 | 
|  | ; CHECK-GI-NEXT:    add v0.4s, v18.4s, v7.4s | 
|  | ; CHECK-GI-NEXT:    addv s1, v1.4s | 
|  | ; CHECK-GI-NEXT:    add w8, w9, w8 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s3 | 
|  | ; CHECK-GI-NEXT:    add w8, w10, w8 | 
|  | ; CHECK-GI-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-GI-NEXT:    add w8, w9, w8 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s1 | 
|  | ; CHECK-GI-NEXT:    add w8, w9, w8 | 
|  | ; CHECK-GI-NEXT:    fmov w9, s0 | 
|  | ; CHECK-GI-NEXT:    add w0, w9, w8 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | ; Widen both i32 strides to i64 so they can be used as getelementptr offsets. | 
|  | %idx.ext8 = sext i32 %s2 to i64 | 
|  | %idx.ext = sext i32 %s1 to i64 | 
|  | ; Row 0 - load, widen, subtract, abs, reduce. | 
|  | %0 = load <8 x i8>, ptr %p1, align 1 | 
|  | %1 = zext <8 x i8> %0 to <8 x i32> | 
|  | %2 = load <8 x i8>, ptr %p2, align 1 | 
|  | %3 = zext <8 x i8> %2 to <8 x i32> | 
|  | %4 = sub nsw <8 x i32> %1, %3 | 
|  | %5 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %4, i1 true) | 
|  | %6 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5) | 
|  | ; Row 1 - step both pointers by their stride, repeat, and add to the running sum. | 
|  | %add.ptr = getelementptr inbounds i8, ptr %p1, i64 %idx.ext | 
|  | %add.ptr9 = getelementptr inbounds i8, ptr %p2, i64 %idx.ext8 | 
|  | %7 = load <8 x i8>, ptr %add.ptr, align 1 | 
|  | %8 = zext <8 x i8> %7 to <8 x i32> | 
|  | %9 = load <8 x i8>, ptr %add.ptr9, align 1 | 
|  | %10 = zext <8 x i8> %9 to <8 x i32> | 
|  | %11 = sub nsw <8 x i32> %8, %10 | 
|  | %12 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %11, i1 true) | 
|  | %13 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %12) | 
|  | %op.rdx.1 = add i32 %13, %6 | 
|  | ; Rows 2 through 7 repeat the row 1 pattern, each chained off the previous pointers. | 
|  | %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext | 
|  | %add.ptr9.1 = getelementptr inbounds i8, ptr %add.ptr9, i64 %idx.ext8 | 
|  | %14 = load <8 x i8>, ptr %add.ptr.1, align 1 | 
|  | %15 = zext <8 x i8> %14 to <8 x i32> | 
|  | %16 = load <8 x i8>, ptr %add.ptr9.1, align 1 | 
|  | %17 = zext <8 x i8> %16 to <8 x i32> | 
|  | %18 = sub nsw <8 x i32> %15, %17 | 
|  | %19 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %18, i1 true) | 
|  | %20 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %19) | 
|  | %op.rdx.2 = add i32 %20, %op.rdx.1 | 
|  | %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext | 
|  | %add.ptr9.2 = getelementptr inbounds i8, ptr %add.ptr9.1, i64 %idx.ext8 | 
|  | %21 = load <8 x i8>, ptr %add.ptr.2, align 1 | 
|  | %22 = zext <8 x i8> %21 to <8 x i32> | 
|  | %23 = load <8 x i8>, ptr %add.ptr9.2, align 1 | 
|  | %24 = zext <8 x i8> %23 to <8 x i32> | 
|  | %25 = sub nsw <8 x i32> %22, %24 | 
|  | %26 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %25, i1 true) | 
|  | %27 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %26) | 
|  | %op.rdx.3 = add i32 %27, %op.rdx.2 | 
|  | %add.ptr.3 = getelementptr inbounds i8, ptr %add.ptr.2, i64 %idx.ext | 
|  | %add.ptr9.3 = getelementptr inbounds i8, ptr %add.ptr9.2, i64 %idx.ext8 | 
|  | %28 = load <8 x i8>, ptr %add.ptr.3, align 1 | 
|  | %29 = zext <8 x i8> %28 to <8 x i32> | 
|  | %30 = load <8 x i8>, ptr %add.ptr9.3, align 1 | 
|  | %31 = zext <8 x i8> %30 to <8 x i32> | 
|  | %32 = sub nsw <8 x i32> %29, %31 | 
|  | %33 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %32, i1 true) | 
|  | %34 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %33) | 
|  | %op.rdx.4 = add i32 %34, %op.rdx.3 | 
|  | %add.ptr.4 = getelementptr inbounds i8, ptr %add.ptr.3, i64 %idx.ext | 
|  | %add.ptr9.4 = getelementptr inbounds i8, ptr %add.ptr9.3, i64 %idx.ext8 | 
|  | %35 = load <8 x i8>, ptr %add.ptr.4, align 1 | 
|  | %36 = zext <8 x i8> %35 to <8 x i32> | 
|  | %37 = load <8 x i8>, ptr %add.ptr9.4, align 1 | 
|  | %38 = zext <8 x i8> %37 to <8 x i32> | 
|  | %39 = sub nsw <8 x i32> %36, %38 | 
|  | %40 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %39, i1 true) | 
|  | %41 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %40) | 
|  | %op.rdx.5 = add i32 %41, %op.rdx.4 | 
|  | %add.ptr.5 = getelementptr inbounds i8, ptr %add.ptr.4, i64 %idx.ext | 
|  | %add.ptr9.5 = getelementptr inbounds i8, ptr %add.ptr9.4, i64 %idx.ext8 | 
|  | %42 = load <8 x i8>, ptr %add.ptr.5, align 1 | 
|  | %43 = zext <8 x i8> %42 to <8 x i32> | 
|  | %44 = load <8 x i8>, ptr %add.ptr9.5, align 1 | 
|  | %45 = zext <8 x i8> %44 to <8 x i32> | 
|  | %46 = sub nsw <8 x i32> %43, %45 | 
|  | %47 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %46, i1 true) | 
|  | %48 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %47) | 
|  | %op.rdx.6 = add i32 %48, %op.rdx.5 | 
|  | %add.ptr.6 = getelementptr inbounds i8, ptr %add.ptr.5, i64 %idx.ext | 
|  | %add.ptr9.6 = getelementptr inbounds i8, ptr %add.ptr9.5, i64 %idx.ext8 | 
|  | %49 = load <8 x i8>, ptr %add.ptr.6, align 1 | 
|  | %50 = zext <8 x i8> %49 to <8 x i32> | 
|  | %51 = load <8 x i8>, ptr %add.ptr9.6, align 1 | 
|  | %52 = zext <8 x i8> %51 to <8 x i32> | 
|  | %53 = sub nsw <8 x i32> %50, %52 | 
|  | %54 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %53, i1 true) | 
|  | %55 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %54) | 
|  | %op.rdx.7 = add i32 %55, %op.rdx.6 | 
|  | ret i32 %op.rdx.7 | 
|  | } | 
|  |  | 
|  | ; Splits %a into its low and high <4 x i16> halves, zero-extends both to | 
|  | ; <4 x i32>, adds them lane-wise and add-reduces the result to a scalar i32. | 
|  | ; SelectionDAG is expected to fold the whole pattern into one uaddlv over the | 
|  | ; full v0.8h; GlobalISel is expected to emit ushll + uaddw2 followed by addv. | 
|  | ; The second shufflevector operand is poison rather than the deprecated undef; | 
|  | ; both masks only select lanes 0-7 of %a, so that operand is never read. | 
|  | define i32 @extract_hi_lo(<8 x i16> %a) { | 
|  | ; CHECK-SD-LABEL: extract_hi_lo: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    uaddlv s0, v0.8h | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: extract_hi_lo: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0 | 
|  | ; CHECK-GI-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h | 
|  | ; CHECK-GI-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-GI-NEXT:    fmov w0, s0 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %e1 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> | 
|  | %e2 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
|  | %z1 = zext <4 x i16> %e1 to <4 x i32> | 
|  | %z2 = zext <4 x i16> %e2 to <4 x i32> | 
|  | %z4 = add <4 x i32> %z1, %z2 | 
|  | %z5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z4) | 
|  | ret i32 %z5 | 
|  | } | 
|  |  | 
|  | ; Extracts the high <4 x i16> half of %a, zero-extends it to <4 x i32>, adds | 
|  | ; it to itself and add-reduces the result to a scalar i32. | 
|  | ; SelectionDAG is expected to copy the high 64 bits over the low 64 bits and | 
|  | ; then use a single uaddlv; GlobalISel is expected to emit uaddl2 with %a as | 
|  | ; both operands followed by addv. | 
|  | ; The second shufflevector operand is poison rather than the deprecated undef; | 
|  | ; the mask only selects lanes 4-7 of %a, so that operand is never read. | 
|  | define i32 @extract_hi_hi(<8 x i16> %a) { | 
|  | ; CHECK-SD-LABEL: extract_hi_hi: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    mov v0.d[0], v0.d[1] | 
|  | ; CHECK-SD-NEXT:    uaddlv s0, v0.8h | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: extract_hi_hi: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddl2 v0.4s, v0.8h, v0.8h | 
|  | ; CHECK-GI-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-GI-NEXT:    fmov w0, s0 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %e2 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> | 
|  | %z2 = zext <4 x i16> %e2 to <4 x i32> | 
|  | %z4 = add <4 x i32> %z2, %z2 | 
|  | %z5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z4) | 
|  | ret i32 %z5 | 
|  | } | 
|  |  | 
|  | ; Extracts the low <4 x i16> half of %a, zero-extends it to <4 x i32>, adds | 
|  | ; it to itself and add-reduces the result to a scalar i32. | 
|  | ; SelectionDAG is expected to copy the low 64 bits over the high 64 bits and | 
|  | ; then use a single uaddlv; GlobalISel is expected to emit uaddl with %a as | 
|  | ; both operands followed by addv. | 
|  | ; The second shufflevector operand is poison rather than the deprecated undef; | 
|  | ; the mask only selects lanes 0-3 of %a, so that operand is never read. | 
|  | define i32 @extract_lo_lo(<8 x i16> %a) { | 
|  | ; CHECK-SD-LABEL: extract_lo_lo: | 
|  | ; CHECK-SD:       // %bb.0: // %entry | 
|  | ; CHECK-SD-NEXT:    mov v0.d[1], v0.d[0] | 
|  | ; CHECK-SD-NEXT:    uaddlv s0, v0.8h | 
|  | ; CHECK-SD-NEXT:    fmov w0, s0 | 
|  | ; CHECK-SD-NEXT:    ret | 
|  | ; | 
|  | ; CHECK-GI-LABEL: extract_lo_lo: | 
|  | ; CHECK-GI:       // %bb.0: // %entry | 
|  | ; CHECK-GI-NEXT:    uaddl v0.4s, v0.4h, v0.4h | 
|  | ; CHECK-GI-NEXT:    addv s0, v0.4s | 
|  | ; CHECK-GI-NEXT:    fmov w0, s0 | 
|  | ; CHECK-GI-NEXT:    ret | 
|  | entry: | 
|  | %e1 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> | 
|  | %z1 = zext <4 x i16> %e1 to <4 x i32> | 
|  | %z4 = add <4 x i32> %z1, %z1 | 
|  | %z5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z4) | 
|  | ret i32 %z5 | 
|  | } | 
|  |  | 
|  | ; Intrinsic declarations - llvm.abs on <8 x i32> plus the llvm.vector.reduce.add | 
|  | ; overloads for i8, i16, i32 and i64 element types at the vector widths listed below. | 
|  | ; NOTE(review) - llvm.abs references attribute group #1, whose definition is not | 
|  | ; visible in this part of the file; confirm whether it is defined elsewhere. | 
|  | declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1 immarg) #1 | 
|  | declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>) | 
|  | declare i16 @llvm.vector.reduce.add.v24i16(<24 x i16>) | 
|  | declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) | 
|  | declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) | 
|  | declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>) | 
|  | declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) | 
|  | declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) | 
|  | declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) | 
|  | declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) | 
|  | declare i32 @llvm.vector.reduce.add.v24i32(<24 x i32>) | 
|  | declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>) | 
|  | declare i32 @llvm.vector.reduce.add.v48i32(<48 x i32>) | 
|  | declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) | 
|  | declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) | 
|  | declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) | 
|  | declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) | 
|  | declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) | 
|  | declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>) |