| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s |
| |
| ; Various reductions generated from SLP vectorizing unrolled loops. Generated |
| ; from https://godbolt.org/z/ebxdPh1Kz with some less interesting cases removed. |
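| ; As a rough sketch (hypothetical, not the exact godbolt input), the C source |
| ; behind these tests looks like a manually unrolled accumulation that the SLP |
| ; vectorizer rewrites into a vector load plus an llvm.vector.reduce.add call: |
| ; |
| ;   int addv4i32i32(int *x) { |
| ;     // SLP combines the four scalar adds into <4 x i32> load + reduce.add |
| ;     return x[0] + x[1] + x[2] + x[3]; |
| ;   } |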
| |
| define i32 @addv2i32i32(ptr %x) { |
| ; CHECK-LABEL: addv2i32i32: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: ldrd r0, r1, [r0] |
| ; CHECK-NEXT: add r0, r1 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load i32, ptr %x, align 4 |
| %arrayidx.1 = getelementptr inbounds i32, ptr %x, i32 1 |
| %1 = load i32, ptr %arrayidx.1, align 4 |
| %add.1 = add nsw i32 %1, %0 |
| ret i32 %add.1 |
| } |
| |
| define i32 @addv4i32i32(ptr %x) { |
| ; CHECK-LABEL: addv4i32i32: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrw.u32 q0, [r0] |
| ; CHECK-NEXT: vaddv.u32 r0, q0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <4 x i32>, ptr %x, align 4 |
| %1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %0) |
| ret i32 %1 |
| } |
| |
| define i32 @addv8i32i32(ptr %x) { |
| ; CHECK-LABEL: addv8i32i32: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrw.u32 q1, [r0] |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] |
| ; CHECK-NEXT: vaddv.u32 r0, q1 |
| ; CHECK-NEXT: vaddva.u32 r0, q0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <8 x i32>, ptr %x, align 4 |
| %1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %0) |
| ret i32 %1 |
| } |
| |
| define i32 @addv16i32i32(ptr %x) { |
| ; CHECK-LABEL: addv16i32i32: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrw.u32 q1, [r0] |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] |
| ; CHECK-NEXT: vaddv.u32 r2, q1 |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: mov r0, r2 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <16 x i32>, ptr %x, align 4 |
| %1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %0) |
| ret i32 %1 |
| } |
| |
| define i32 @addv24i32i32(ptr %x) { |
| ; CHECK-LABEL: addv24i32i32: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrw.u32 q1, [r0] |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] |
| ; CHECK-NEXT: vaddv.u32 r2, q1 |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #64] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #80] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: mov r0, r2 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <8 x i32>, ptr %x, align 4 |
| %arrayidx.8 = getelementptr inbounds i32, ptr %x, i32 8 |
| %1 = load <16 x i32>, ptr %arrayidx.8, align 4 |
| %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1) |
| %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %0) |
| %op.rdx = add nsw i32 %2, %3 |
| ret i32 %op.rdx |
| } |
| |
| define i32 @addv32i32i32(ptr %x) { |
| ; CHECK-LABEL: addv32i32i32: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrw.u32 q1, [r0] |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] |
| ; CHECK-NEXT: mov r1, r0 |
| ; CHECK-NEXT: vaddv.u32 r0, q1 |
| ; CHECK-NEXT: vaddva.u32 r0, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r1, #32] |
| ; CHECK-NEXT: vaddva.u32 r0, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r1, #48] |
| ; CHECK-NEXT: vaddva.u32 r0, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r1, #64] |
| ; CHECK-NEXT: vaddva.u32 r0, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r1, #80] |
| ; CHECK-NEXT: vaddva.u32 r0, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r1, #96] |
| ; CHECK-NEXT: vaddva.u32 r0, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r1, #112] |
| ; CHECK-NEXT: vaddva.u32 r0, q0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <32 x i32>, ptr %x, align 4 |
| %1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %0) |
| ret i32 %1 |
| } |
| |
| define i32 @addv64i32i32(ptr %x) { |
| ; CHECK-LABEL: addv64i32i32: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrw.u32 q1, [r0] |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] |
| ; CHECK-NEXT: vaddv.u32 r2, q1 |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #64] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #80] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #96] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #112] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #128] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #144] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #160] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #176] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #192] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #208] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #224] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #240] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: mov r0, r2 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <64 x i32>, ptr %x, align 4 |
| %1 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %0) |
| ret i32 %1 |
| } |
| |
| define i32 @addv128i32i32(ptr %x) { |
| ; CHECK-LABEL: addv128i32i32: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrw.u32 q1, [r0] |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] |
| ; CHECK-NEXT: vaddv.u32 r2, q1 |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #64] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #80] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #96] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #112] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #128] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #144] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #160] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #176] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #192] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #208] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #224] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #240] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #256] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #272] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #288] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #304] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #320] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #336] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #352] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #368] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #384] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #400] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #416] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #432] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #448] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #464] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #480] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #496] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: mov r0, r2 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %wide.load = load <4 x i32>, ptr %x, align 4 |
| %0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load) |
| %1 = getelementptr inbounds i32, ptr %x, i32 4 |
| %wide.load.1 = load <4 x i32>, ptr %1, align 4 |
| %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.1) |
| %3 = add i32 %2, %0 |
| %4 = getelementptr inbounds i32, ptr %x, i32 8 |
| %wide.load.2 = load <4 x i32>, ptr %4, align 4 |
| %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.2) |
| %6 = add i32 %5, %3 |
| %7 = getelementptr inbounds i32, ptr %x, i32 12 |
| %wide.load.3 = load <4 x i32>, ptr %7, align 4 |
| %8 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.3) |
| %9 = add i32 %8, %6 |
| %10 = getelementptr inbounds i32, ptr %x, i32 16 |
| %wide.load.4 = load <4 x i32>, ptr %10, align 4 |
| %11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.4) |
| %12 = add i32 %11, %9 |
| %13 = getelementptr inbounds i32, ptr %x, i32 20 |
| %wide.load.5 = load <4 x i32>, ptr %13, align 4 |
| %14 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.5) |
| %15 = add i32 %14, %12 |
| %16 = getelementptr inbounds i32, ptr %x, i32 24 |
| %wide.load.6 = load <4 x i32>, ptr %16, align 4 |
| %17 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.6) |
| %18 = add i32 %17, %15 |
| %19 = getelementptr inbounds i32, ptr %x, i32 28 |
| %wide.load.7 = load <4 x i32>, ptr %19, align 4 |
| %20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.7) |
| %21 = add i32 %20, %18 |
| %22 = getelementptr inbounds i32, ptr %x, i32 32 |
| %wide.load.8 = load <4 x i32>, ptr %22, align 4 |
| %23 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.8) |
| %24 = add i32 %23, %21 |
| %25 = getelementptr inbounds i32, ptr %x, i32 36 |
| %wide.load.9 = load <4 x i32>, ptr %25, align 4 |
| %26 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.9) |
| %27 = add i32 %26, %24 |
| %28 = getelementptr inbounds i32, ptr %x, i32 40 |
| %wide.load.10 = load <4 x i32>, ptr %28, align 4 |
| %29 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.10) |
| %30 = add i32 %29, %27 |
| %31 = getelementptr inbounds i32, ptr %x, i32 44 |
| %wide.load.11 = load <4 x i32>, ptr %31, align 4 |
| %32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.11) |
| %33 = add i32 %32, %30 |
| %34 = getelementptr inbounds i32, ptr %x, i32 48 |
| %wide.load.12 = load <4 x i32>, ptr %34, align 4 |
| %35 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.12) |
| %36 = add i32 %35, %33 |
| %37 = getelementptr inbounds i32, ptr %x, i32 52 |
| %wide.load.13 = load <4 x i32>, ptr %37, align 4 |
| %38 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.13) |
| %39 = add i32 %38, %36 |
| %40 = getelementptr inbounds i32, ptr %x, i32 56 |
| %wide.load.14 = load <4 x i32>, ptr %40, align 4 |
| %41 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.14) |
| %42 = add i32 %41, %39 |
| %43 = getelementptr inbounds i32, ptr %x, i32 60 |
| %wide.load.15 = load <4 x i32>, ptr %43, align 4 |
| %44 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.15) |
| %45 = add i32 %44, %42 |
| %46 = getelementptr inbounds i32, ptr %x, i32 64 |
| %wide.load.16 = load <4 x i32>, ptr %46, align 4 |
| %47 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.16) |
| %48 = add i32 %47, %45 |
| %49 = getelementptr inbounds i32, ptr %x, i32 68 |
| %wide.load.17 = load <4 x i32>, ptr %49, align 4 |
| %50 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.17) |
| %51 = add i32 %50, %48 |
| %52 = getelementptr inbounds i32, ptr %x, i32 72 |
| %wide.load.18 = load <4 x i32>, ptr %52, align 4 |
| %53 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.18) |
| %54 = add i32 %53, %51 |
| %55 = getelementptr inbounds i32, ptr %x, i32 76 |
| %wide.load.19 = load <4 x i32>, ptr %55, align 4 |
| %56 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.19) |
| %57 = add i32 %56, %54 |
| %58 = getelementptr inbounds i32, ptr %x, i32 80 |
| %wide.load.20 = load <4 x i32>, ptr %58, align 4 |
| %59 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.20) |
| %60 = add i32 %59, %57 |
| %61 = getelementptr inbounds i32, ptr %x, i32 84 |
| %wide.load.21 = load <4 x i32>, ptr %61, align 4 |
| %62 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.21) |
| %63 = add i32 %62, %60 |
| %64 = getelementptr inbounds i32, ptr %x, i32 88 |
| %wide.load.22 = load <4 x i32>, ptr %64, align 4 |
| %65 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.22) |
| %66 = add i32 %65, %63 |
| %67 = getelementptr inbounds i32, ptr %x, i32 92 |
| %wide.load.23 = load <4 x i32>, ptr %67, align 4 |
| %68 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.23) |
| %69 = add i32 %68, %66 |
| %70 = getelementptr inbounds i32, ptr %x, i32 96 |
| %wide.load.24 = load <4 x i32>, ptr %70, align 4 |
| %71 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.24) |
| %72 = add i32 %71, %69 |
| %73 = getelementptr inbounds i32, ptr %x, i32 100 |
| %wide.load.25 = load <4 x i32>, ptr %73, align 4 |
| %74 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.25) |
| %75 = add i32 %74, %72 |
| %76 = getelementptr inbounds i32, ptr %x, i32 104 |
| %wide.load.26 = load <4 x i32>, ptr %76, align 4 |
| %77 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.26) |
| %78 = add i32 %77, %75 |
| %79 = getelementptr inbounds i32, ptr %x, i32 108 |
| %wide.load.27 = load <4 x i32>, ptr %79, align 4 |
| %80 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.27) |
| %81 = add i32 %80, %78 |
| %82 = getelementptr inbounds i32, ptr %x, i32 112 |
| %wide.load.28 = load <4 x i32>, ptr %82, align 4 |
| %83 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.28) |
| %84 = add i32 %83, %81 |
| %85 = getelementptr inbounds i32, ptr %x, i32 116 |
| %wide.load.29 = load <4 x i32>, ptr %85, align 4 |
| %86 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.29) |
| %87 = add i32 %86, %84 |
| %88 = getelementptr inbounds i32, ptr %x, i32 120 |
| %wide.load.30 = load <4 x i32>, ptr %88, align 4 |
| %89 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.30) |
| %90 = add i32 %89, %87 |
| %91 = getelementptr inbounds i32, ptr %x, i32 124 |
| %wide.load.31 = load <4 x i32>, ptr %91, align 4 |
| %92 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.31) |
| %93 = add i32 %92, %90 |
| ret i32 %93 |
| } |
| |
| define i32 @addv2i32i16(ptr %x) { |
| ; CHECK-LABEL: addv2i32i16: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: ldrsh.w r1, [r0] |
| ; CHECK-NEXT: ldrsh.w r0, [r0, #2] |
| ; CHECK-NEXT: add r0, r1 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load i16, ptr %x, align 2 |
| %conv = sext i16 %0 to i32 |
| %arrayidx.1 = getelementptr inbounds i16, ptr %x, i32 1 |
| %1 = load i16, ptr %arrayidx.1, align 2 |
| %conv.1 = sext i16 %1 to i32 |
| %add.1 = add nsw i32 %conv, %conv.1 |
| ret i32 %add.1 |
| } |
| |
| define i32 @addv4i32i16(ptr %x) { |
| ; CHECK-LABEL: addv4i32i16: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrh.s32 q0, [r0] |
| ; CHECK-NEXT: vaddv.u32 r0, q0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <4 x i16>, ptr %x, align 2 |
| %1 = sext <4 x i16> %0 to <4 x i32> |
| %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1) |
| ret i32 %2 |
| } |
| |
| define i32 @addv8i32i16(ptr %x) { |
| ; CHECK-LABEL: addv8i32i16: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrh.u16 q0, [r0] |
| ; CHECK-NEXT: vaddv.s16 r0, q0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <8 x i16>, ptr %x, align 2 |
| %1 = sext <8 x i16> %0 to <8 x i32> |
| %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1) |
| ret i32 %2 |
| } |
| |
| define i32 @addv16i32i16(ptr %x) { |
| ; CHECK-LABEL: addv16i32i16: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrh.s32 q1, [r0] |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #8] |
| ; CHECK-NEXT: vaddv.u32 r2, q1 |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #16] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #24] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: mov r0, r2 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <16 x i16>, ptr %x, align 2 |
| %1 = sext <16 x i16> %0 to <16 x i32> |
| %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1) |
| ret i32 %2 |
| } |
| |
| define i32 @addv24i32i16(ptr %x) { |
| ; CHECK-LABEL: addv24i32i16: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrh.s32 q1, [r0] |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #8] |
| ; CHECK-NEXT: vaddv.u32 r2, q1 |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #16] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #24] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #32] |
| ; CHECK-NEXT: vaddva.s16 r2, q0 |
| ; CHECK-NEXT: mov r0, r2 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <16 x i16>, ptr %x, align 2 |
| %1 = sext <16 x i16> %0 to <16 x i32> |
| %arrayidx.16 = getelementptr inbounds i16, ptr %x, i32 16 |
| %2 = load <8 x i16>, ptr %arrayidx.16, align 2 |
| %3 = sext <8 x i16> %2 to <8 x i32> |
| %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1) |
| %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3) |
| %op.rdx = add nsw i32 %4, %5 |
| ret i32 %op.rdx |
| } |
| |
| define i32 @addv32i32i16(ptr %x) { |
| ; CHECK-LABEL: addv32i32i16: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrh.s32 q1, [r0] |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #8] |
| ; CHECK-NEXT: vaddv.u32 r2, q1 |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #16] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #24] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #32] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #40] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #48] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #56] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: mov r0, r2 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <32 x i16>, ptr %x, align 2 |
| %1 = sext <32 x i16> %0 to <32 x i32> |
| %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1) |
| ret i32 %2 |
| } |
| |
| define i32 @addv64i32i16(ptr %x) { |
| ; CHECK-LABEL: addv64i32i16: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrh.s32 q1, [r0] |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #8] |
| ; CHECK-NEXT: ldrsh.w r1, [r0, #120] |
| ; CHECK-NEXT: vaddv.u32 r2, q1 |
| ; CHECK-NEXT: ldrsh.w r3, [r0, #122] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #16] |
| ; CHECK-NEXT: ldrsh.w r12, [r0, #124] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #24] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #32] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #40] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #48] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #56] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #64] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #72] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #80] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #88] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #96] |
| ; CHECK-NEXT: vaddva.s16 r2, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #112] |
| ; CHECK-NEXT: ldrsh.w r0, [r0, #126] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: add r1, r2 |
| ; CHECK-NEXT: add r1, r3 |
| ; CHECK-NEXT: add r1, r12 |
| ; CHECK-NEXT: add r0, r1 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <32 x i16>, ptr %x, align 2 |
| %1 = sext <32 x i16> %0 to <32 x i32> |
| %arrayidx.32 = getelementptr inbounds i16, ptr %x, i32 32 |
| %2 = load <16 x i16>, ptr %arrayidx.32, align 2 |
| %3 = sext <16 x i16> %2 to <16 x i32> |
| %arrayidx.48 = getelementptr inbounds i16, ptr %x, i32 48 |
| %4 = load <8 x i16>, ptr %arrayidx.48, align 2 |
| %5 = sext <8 x i16> %4 to <8 x i32> |
| %arrayidx.56 = getelementptr inbounds i16, ptr %x, i32 56 |
| %6 = load <4 x i16>, ptr %arrayidx.56, align 2 |
| %7 = sext <4 x i16> %6 to <4 x i32> |
| %arrayidx.60 = getelementptr inbounds i16, ptr %x, i32 60 |
| %8 = load i16, ptr %arrayidx.60, align 2 |
| %conv.60 = sext i16 %8 to i32 |
| %arrayidx.61 = getelementptr inbounds i16, ptr %x, i32 61 |
| %9 = load i16, ptr %arrayidx.61, align 2 |
| %conv.61 = sext i16 %9 to i32 |
| %arrayidx.62 = getelementptr inbounds i16, ptr %x, i32 62 |
| %10 = load i16, ptr %arrayidx.62, align 2 |
| %conv.62 = sext i16 %10 to i32 |
| %11 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1) |
| %12 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3) |
| %op.rdx = add nsw i32 %11, %12 |
| %13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5) |
| %op.rdx8 = add nsw i32 %op.rdx, %13 |
| %14 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7) |
| %op.rdx9 = add nsw i32 %op.rdx8, %14 |
| %15 = add nsw i32 %op.rdx9, %conv.60 |
| %16 = add nsw i32 %15, %conv.61 |
| %17 = add nsw i32 %16, %conv.62 |
| %arrayidx.63 = getelementptr inbounds i16, ptr %x, i32 63 |
| %18 = load i16, ptr %arrayidx.63, align 2 |
| %conv.63 = sext i16 %18 to i32 |
| %add.63 = add nsw i32 %17, %conv.63 |
| ret i32 %add.63 |
| } |
| |
| define i32 @addv128i32i16(ptr %x) { |
| ; CHECK-LABEL: addv128i32i16: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrh.u16 q1, [r0] |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #16] |
| ; CHECK-NEXT: vaddv.s16 r2, q1 |
| ; CHECK-NEXT: vaddva.s16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #32] |
| ; CHECK-NEXT: vaddva.s16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #48] |
| ; CHECK-NEXT: vaddva.s16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #64] |
| ; CHECK-NEXT: vaddva.s16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #80] |
| ; CHECK-NEXT: vaddva.s16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #96] |
| ; CHECK-NEXT: vaddva.s16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #112] |
| ; CHECK-NEXT: vaddva.s16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #128] |
| ; CHECK-NEXT: vaddva.s16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #144] |
| ; CHECK-NEXT: vaddva.s16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #160] |
| ; CHECK-NEXT: vaddva.s16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #176] |
| ; CHECK-NEXT: vaddva.s16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #192] |
| ; CHECK-NEXT: vaddva.s16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #208] |
| ; CHECK-NEXT: vaddva.s16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #224] |
| ; CHECK-NEXT: vaddva.s16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #240] |
| ; CHECK-NEXT: vaddva.s16 r2, q0 |
| ; CHECK-NEXT: mov r0, r2 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %wide.load = load <8 x i16>, ptr %x, align 2 |
| %0 = sext <8 x i16> %wide.load to <8 x i32> |
| %1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %0) |
| %2 = getelementptr inbounds i16, ptr %x, i32 8 |
| %wide.load.1 = load <8 x i16>, ptr %2, align 2 |
| %3 = sext <8 x i16> %wide.load.1 to <8 x i32> |
| %4 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3) |
| %5 = add i32 %4, %1 |
| %6 = getelementptr inbounds i16, ptr %x, i32 16 |
| %wide.load.2 = load <8 x i16>, ptr %6, align 2 |
| %7 = sext <8 x i16> %wide.load.2 to <8 x i32> |
| %8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %7) |
| %9 = add i32 %8, %5 |
| %10 = getelementptr inbounds i16, ptr %x, i32 24 |
| %wide.load.3 = load <8 x i16>, ptr %10, align 2 |
| %11 = sext <8 x i16> %wide.load.3 to <8 x i32> |
| %12 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %11) |
| %13 = add i32 %12, %9 |
| %14 = getelementptr inbounds i16, ptr %x, i32 32 |
| %wide.load.4 = load <8 x i16>, ptr %14, align 2 |
| %15 = sext <8 x i16> %wide.load.4 to <8 x i32> |
| %16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %15) |
| %17 = add i32 %16, %13 |
| %18 = getelementptr inbounds i16, ptr %x, i32 40 |
| %wide.load.5 = load <8 x i16>, ptr %18, align 2 |
| %19 = sext <8 x i16> %wide.load.5 to <8 x i32> |
| %20 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %19) |
| %21 = add i32 %20, %17 |
| %22 = getelementptr inbounds i16, ptr %x, i32 48 |
| %wide.load.6 = load <8 x i16>, ptr %22, align 2 |
| %23 = sext <8 x i16> %wide.load.6 to <8 x i32> |
| %24 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %23) |
| %25 = add i32 %24, %21 |
| %26 = getelementptr inbounds i16, ptr %x, i32 56 |
| %wide.load.7 = load <8 x i16>, ptr %26, align 2 |
| %27 = sext <8 x i16> %wide.load.7 to <8 x i32> |
| %28 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %27) |
| %29 = add i32 %28, %25 |
| %30 = getelementptr inbounds i16, ptr %x, i32 64 |
| %wide.load.8 = load <8 x i16>, ptr %30, align 2 |
| %31 = sext <8 x i16> %wide.load.8 to <8 x i32> |
| %32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %31) |
| %33 = add i32 %32, %29 |
| %34 = getelementptr inbounds i16, ptr %x, i32 72 |
| %wide.load.9 = load <8 x i16>, ptr %34, align 2 |
| %35 = sext <8 x i16> %wide.load.9 to <8 x i32> |
| %36 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %35) |
| %37 = add i32 %36, %33 |
| %38 = getelementptr inbounds i16, ptr %x, i32 80 |
| %wide.load.10 = load <8 x i16>, ptr %38, align 2 |
| %39 = sext <8 x i16> %wide.load.10 to <8 x i32> |
| %40 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %39) |
| %41 = add i32 %40, %37 |
| %42 = getelementptr inbounds i16, ptr %x, i32 88 |
| %wide.load.11 = load <8 x i16>, ptr %42, align 2 |
| %43 = sext <8 x i16> %wide.load.11 to <8 x i32> |
| %44 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %43) |
| %45 = add i32 %44, %41 |
| %46 = getelementptr inbounds i16, ptr %x, i32 96 |
| %wide.load.12 = load <8 x i16>, ptr %46, align 2 |
| %47 = sext <8 x i16> %wide.load.12 to <8 x i32> |
| %48 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %47) |
| %49 = add i32 %48, %45 |
| %50 = getelementptr inbounds i16, ptr %x, i32 104 |
| %wide.load.13 = load <8 x i16>, ptr %50, align 2 |
| %51 = sext <8 x i16> %wide.load.13 to <8 x i32> |
| %52 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %51) |
| %53 = add i32 %52, %49 |
| %54 = getelementptr inbounds i16, ptr %x, i32 112 |
| %wide.load.14 = load <8 x i16>, ptr %54, align 2 |
| %55 = sext <8 x i16> %wide.load.14 to <8 x i32> |
| %56 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %55) |
| %57 = add i32 %56, %53 |
| %58 = getelementptr inbounds i16, ptr %x, i32 120 |
| %wide.load.15 = load <8 x i16>, ptr %58, align 2 |
| %59 = sext <8 x i16> %wide.load.15 to <8 x i32> |
| %60 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %59) |
| %61 = add i32 %60, %57 |
| ret i32 %61 |
| } |
| |
| define i32 @addv2i32i8(ptr %x) { |
| ; CHECK-LABEL: addv2i32i8: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: ldrb r1, [r0] |
| ; CHECK-NEXT: ldrb r0, [r0, #1] |
| ; CHECK-NEXT: add r0, r1 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load i8, ptr %x, align 1 |
| %conv = zext i8 %0 to i32 |
| %arrayidx.1 = getelementptr inbounds i8, ptr %x, i32 1 |
| %1 = load i8, ptr %arrayidx.1, align 1 |
| %conv.1 = zext i8 %1 to i32 |
| %add.1 = add nuw nsw i32 %conv, %conv.1 |
| ret i32 %add.1 |
| } |
| |
| define i32 @addv4i32i8(ptr %x) { |
| ; CHECK-LABEL: addv4i32i8: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrb.u32 q0, [r0] |
| ; CHECK-NEXT: vaddv.u32 r0, q0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <4 x i8>, ptr %x, align 1 |
| %1 = zext <4 x i8> %0 to <4 x i32> |
| %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1) |
| ret i32 %2 |
| } |
| |
| define i32 @addv8i32i8(ptr %x) { |
| ; CHECK-LABEL: addv8i32i8: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrb.u16 q0, [r0] |
| ; CHECK-NEXT: vaddv.u16 r0, q0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <8 x i8>, ptr %x, align 1 |
| %1 = zext <8 x i8> %0 to <8 x i32> |
| %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1) |
| ret i32 %2 |
| } |
| |
| define i32 @addv16i32i8(ptr %x) { |
| ; CHECK-LABEL: addv16i32i8: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrb.u8 q0, [r0] |
| ; CHECK-NEXT: vaddv.u8 r0, q0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <16 x i8>, ptr %x, align 1 |
| %1 = zext <16 x i8> %0 to <16 x i32> |
| %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1) |
| ret i32 %2 |
| } |
| |
| define i32 @addv24i32i8(ptr %x) { |
| ; CHECK-LABEL: addv24i32i8: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrb.u8 q1, [r0] |
| ; CHECK-NEXT: vldrb.u16 q0, [r0, #16] |
| ; CHECK-NEXT: vaddv.u8 r0, q1 |
| ; CHECK-NEXT: vaddva.u16 r0, q0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <16 x i8>, ptr %x, align 1 |
| %1 = zext <16 x i8> %0 to <16 x i32> |
| %arrayidx.16 = getelementptr inbounds i8, ptr %x, i32 16 |
| %2 = load <8 x i8>, ptr %arrayidx.16, align 1 |
| %3 = zext <8 x i8> %2 to <8 x i32> |
| %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1) |
| %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3) |
| %op.rdx = add nuw nsw i32 %4, %5 |
| ret i32 %op.rdx |
| } |
| |
| define i32 @addv32i32i8(ptr %x) { |
| ; CHECK-LABEL: addv32i32i8: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrb.u32 q1, [r0] |
| ; CHECK-NEXT: vldrb.u32 q0, [r0, #4] |
| ; CHECK-NEXT: vaddv.u32 r2, q1 |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrb.u32 q0, [r0, #8] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrb.u32 q0, [r0, #12] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrb.u32 q0, [r0, #16] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrb.u32 q0, [r0, #20] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrb.u32 q0, [r0, #24] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrb.u32 q0, [r0, #28] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: mov r0, r2 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <32 x i8>, ptr %x, align 1 |
| %1 = zext <32 x i8> %0 to <32 x i32> |
| %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1) |
| ret i32 %2 |
| } |
| |
| define i32 @addv64i32i8(ptr %x) { |
| ; CHECK-LABEL: addv64i32i8: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrb.u32 q1, [r0] |
| ; CHECK-NEXT: vldrb.u32 q0, [r0, #4] |
| ; CHECK-NEXT: ldrb.w r1, [r0, #60] |
| ; CHECK-NEXT: vaddv.u32 r2, q1 |
| ; CHECK-NEXT: ldrb.w r3, [r0, #61] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrb.u32 q0, [r0, #8] |
| ; CHECK-NEXT: ldrb.w r12, [r0, #62] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrb.u32 q0, [r0, #12] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrb.u32 q0, [r0, #16] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrb.u32 q0, [r0, #20] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrb.u32 q0, [r0, #24] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrb.u32 q0, [r0, #28] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: vldrb.u8 q0, [r0, #32] |
| ; CHECK-NEXT: vaddva.u8 r2, q0 |
| ; CHECK-NEXT: vldrb.u16 q0, [r0, #48] |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: vldrb.u32 q0, [r0, #56] |
| ; CHECK-NEXT: ldrb.w r0, [r0, #63] |
| ; CHECK-NEXT: vaddva.u32 r2, q0 |
| ; CHECK-NEXT: add r1, r2 |
| ; CHECK-NEXT: add r1, r3 |
| ; CHECK-NEXT: add r1, r12 |
| ; CHECK-NEXT: add r0, r1 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <32 x i8>, ptr %x, align 1 |
| %1 = zext <32 x i8> %0 to <32 x i32> |
| %arrayidx.32 = getelementptr inbounds i8, ptr %x, i32 32 |
| %2 = load <16 x i8>, ptr %arrayidx.32, align 1 |
| %3 = zext <16 x i8> %2 to <16 x i32> |
| %arrayidx.48 = getelementptr inbounds i8, ptr %x, i32 48 |
| %4 = load <8 x i8>, ptr %arrayidx.48, align 1 |
| %5 = zext <8 x i8> %4 to <8 x i32> |
| %arrayidx.56 = getelementptr inbounds i8, ptr %x, i32 56 |
| %6 = load <4 x i8>, ptr %arrayidx.56, align 1 |
| %7 = zext <4 x i8> %6 to <4 x i32> |
| %arrayidx.60 = getelementptr inbounds i8, ptr %x, i32 60 |
| %8 = load i8, ptr %arrayidx.60, align 1 |
| %conv.60 = zext i8 %8 to i32 |
| %arrayidx.61 = getelementptr inbounds i8, ptr %x, i32 61 |
| %9 = load i8, ptr %arrayidx.61, align 1 |
| %conv.61 = zext i8 %9 to i32 |
| %arrayidx.62 = getelementptr inbounds i8, ptr %x, i32 62 |
| %10 = load i8, ptr %arrayidx.62, align 1 |
| %conv.62 = zext i8 %10 to i32 |
| %11 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1) |
| %12 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3) |
| %op.rdx = add nuw nsw i32 %11, %12 |
| %13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5) |
| %op.rdx8 = add nuw nsw i32 %op.rdx, %13 |
| %14 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %7) |
| %op.rdx9 = add nuw nsw i32 %op.rdx8, %14 |
| %15 = add nuw nsw i32 %op.rdx9, %conv.60 |
| %16 = add nuw nsw i32 %15, %conv.61 |
| %17 = add nuw nsw i32 %16, %conv.62 |
| %arrayidx.63 = getelementptr inbounds i8, ptr %x, i32 63 |
| %18 = load i8, ptr %arrayidx.63, align 1 |
| %conv.63 = zext i8 %18 to i32 |
| %add.63 = add nuw nsw i32 %17, %conv.63 |
| ret i32 %add.63 |
| } |
| |
| define i32 @addv128i32i8(ptr %x) { |
| ; CHECK-LABEL: addv128i32i8: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrb.u8 q1, [r0] |
| ; CHECK-NEXT: vldrb.u8 q0, [r0, #16] |
| ; CHECK-NEXT: mov r1, r0 |
| ; CHECK-NEXT: vaddv.u8 r0, q1 |
| ; CHECK-NEXT: vaddva.u8 r0, q0 |
| ; CHECK-NEXT: vldrb.u8 q0, [r1, #32] |
| ; CHECK-NEXT: vaddva.u8 r0, q0 |
| ; CHECK-NEXT: vldrb.u8 q0, [r1, #48] |
| ; CHECK-NEXT: vaddva.u8 r0, q0 |
| ; CHECK-NEXT: vldrb.u8 q0, [r1, #64] |
| ; CHECK-NEXT: vaddva.u8 r0, q0 |
| ; CHECK-NEXT: vldrb.u8 q0, [r1, #80] |
| ; CHECK-NEXT: vaddva.u8 r0, q0 |
| ; CHECK-NEXT: vldrb.u8 q0, [r1, #96] |
| ; CHECK-NEXT: vaddva.u8 r0, q0 |
| ; CHECK-NEXT: vldrb.u8 q0, [r1, #112] |
| ; CHECK-NEXT: vaddva.u8 r0, q0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %wide.load = load <16 x i8>, ptr %x, align 1 |
| %0 = zext <16 x i8> %wide.load to <16 x i32> |
| %1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %0) |
| %2 = getelementptr inbounds i8, ptr %x, i32 16 |
| %wide.load.1 = load <16 x i8>, ptr %2, align 1 |
| %3 = zext <16 x i8> %wide.load.1 to <16 x i32> |
| %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3) |
| %5 = add i32 %4, %1 |
| %6 = getelementptr inbounds i8, ptr %x, i32 32 |
| %wide.load.2 = load <16 x i8>, ptr %6, align 1 |
| %7 = zext <16 x i8> %wide.load.2 to <16 x i32> |
| %8 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %7) |
| %9 = add i32 %8, %5 |
| %10 = getelementptr inbounds i8, ptr %x, i32 48 |
| %wide.load.3 = load <16 x i8>, ptr %10, align 1 |
| %11 = zext <16 x i8> %wide.load.3 to <16 x i32> |
| %12 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %11) |
| %13 = add i32 %12, %9 |
| %14 = getelementptr inbounds i8, ptr %x, i32 64 |
| %wide.load.4 = load <16 x i8>, ptr %14, align 1 |
| %15 = zext <16 x i8> %wide.load.4 to <16 x i32> |
| %16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %15) |
| %17 = add i32 %16, %13 |
| %18 = getelementptr inbounds i8, ptr %x, i32 80 |
| %wide.load.5 = load <16 x i8>, ptr %18, align 1 |
| %19 = zext <16 x i8> %wide.load.5 to <16 x i32> |
| %20 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %19) |
| %21 = add i32 %20, %17 |
| %22 = getelementptr inbounds i8, ptr %x, i32 96 |
| %wide.load.6 = load <16 x i8>, ptr %22, align 1 |
| %23 = zext <16 x i8> %wide.load.6 to <16 x i32> |
| %24 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %23) |
| %25 = add i32 %24, %21 |
| %26 = getelementptr inbounds i8, ptr %x, i32 112 |
| %wide.load.7 = load <16 x i8>, ptr %26, align 1 |
| %27 = zext <16 x i8> %wide.load.7 to <16 x i32> |
| %28 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %27) |
| %29 = add i32 %28, %25 |
| ret i32 %29 |
| } |
| |
| define signext i16 @addv2i16i16(ptr %x) { |
| ; CHECK-LABEL: addv2i16i16: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: ldrh r1, [r0] |
| ; CHECK-NEXT: ldrh r0, [r0, #2] |
| ; CHECK-NEXT: add r0, r1 |
| ; CHECK-NEXT: sxth r0, r0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load i16, ptr %x, align 2 |
| %arrayidx.1 = getelementptr inbounds i16, ptr %x, i32 1 |
| %1 = load i16, ptr %arrayidx.1, align 2 |
| %add.1 = add i16 %1, %0 |
| ret i16 %add.1 |
| } |
| |
| define signext i16 @addv4i16i16(ptr %x) { |
| ; CHECK-LABEL: addv4i16i16: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrh.u32 q0, [r0] |
| ; CHECK-NEXT: vaddv.u32 r0, q0 |
| ; CHECK-NEXT: sxth r0, r0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <4 x i16>, ptr %x, align 2 |
| %1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %0) |
| ret i16 %1 |
| } |
| |
| define signext i16 @addv8i16i16(ptr %x) { |
| ; CHECK-LABEL: addv8i16i16: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrh.u16 q0, [r0] |
| ; CHECK-NEXT: vaddv.u16 r0, q0 |
| ; CHECK-NEXT: sxth r0, r0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <8 x i16>, ptr %x, align 2 |
| %1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %0) |
| ret i16 %1 |
| } |
| |
| define signext i16 @addv16i16i16(ptr %x) { |
| ; CHECK-LABEL: addv16i16i16: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrh.u16 q1, [r0] |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #16] |
| ; CHECK-NEXT: vaddv.u16 r0, q1 |
| ; CHECK-NEXT: vaddva.u16 r0, q0 |
| ; CHECK-NEXT: sxth r0, r0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <16 x i16>, ptr %x, align 2 |
| %1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %0) |
| ret i16 %1 |
| } |
| |
| define signext i16 @addv24i16i16(ptr %x) { |
| ; CHECK-LABEL: addv24i16i16: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrh.u16 q1, [r0] |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #16] |
| ; CHECK-NEXT: vaddv.u16 r2, q1 |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #32] |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: sxth r0, r2 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <8 x i16>, ptr %x, align 2 |
| %arrayidx.8 = getelementptr inbounds i16, ptr %x, i32 8 |
| %1 = load <16 x i16>, ptr %arrayidx.8, align 2 |
| %2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %1) |
| %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %0) |
| %op.rdx = add i16 %2, %3 |
| ret i16 %op.rdx |
| } |
| |
| define signext i16 @addv32i16i16(ptr %x) { |
| ; CHECK-LABEL: addv32i16i16: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrh.u16 q1, [r0] |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #16] |
| ; CHECK-NEXT: vaddv.u16 r2, q1 |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #32] |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #48] |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: sxth r0, r2 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <32 x i16>, ptr %x, align 2 |
| %1 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %0) |
| ret i16 %1 |
| } |
| |
| define signext i16 @addv64i16i16(ptr %x) { |
| ; CHECK-LABEL: addv64i16i16: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrh.u16 q1, [r0] |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #16] |
| ; CHECK-NEXT: vaddv.u16 r2, q1 |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #32] |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #48] |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #64] |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #80] |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #96] |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #112] |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: sxth r0, r2 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <64 x i16>, ptr %x, align 2 |
| %1 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %0) |
| ret i16 %1 |
| } |
| |
| define signext i16 @addv128i16i16(ptr %x) { |
| ; CHECK-LABEL: addv128i16i16: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrh.u16 q1, [r0] |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #16] |
| ; CHECK-NEXT: vaddv.u16 r2, q1 |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #32] |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #48] |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #64] |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #80] |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #96] |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #112] |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #128] |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #144] |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #160] |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #176] |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #192] |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #208] |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #224] |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r0, #240] |
| ; CHECK-NEXT: vaddva.u16 r2, q0 |
| ; CHECK-NEXT: sxth r0, r2 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %wide.load = load <8 x i16>, ptr %x, align 2 |
| %0 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load) |
| %1 = getelementptr inbounds i16, ptr %x, i32 8 |
| %wide.load.1 = load <8 x i16>, ptr %1, align 2 |
| %2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.1) |
| %3 = add i16 %2, %0 |
| %4 = getelementptr inbounds i16, ptr %x, i32 16 |
| %wide.load.2 = load <8 x i16>, ptr %4, align 2 |
| %5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.2) |
| %6 = add i16 %5, %3 |
| %7 = getelementptr inbounds i16, ptr %x, i32 24 |
| %wide.load.3 = load <8 x i16>, ptr %7, align 2 |
| %8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.3) |
| %9 = add i16 %8, %6 |
| %10 = getelementptr inbounds i16, ptr %x, i32 32 |
| %wide.load.4 = load <8 x i16>, ptr %10, align 2 |
| %11 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.4) |
| %12 = add i16 %11, %9 |
| %13 = getelementptr inbounds i16, ptr %x, i32 40 |
| %wide.load.5 = load <8 x i16>, ptr %13, align 2 |
| %14 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.5) |
| %15 = add i16 %14, %12 |
| %16 = getelementptr inbounds i16, ptr %x, i32 48 |
| %wide.load.6 = load <8 x i16>, ptr %16, align 2 |
| %17 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.6) |
| %18 = add i16 %17, %15 |
| %19 = getelementptr inbounds i16, ptr %x, i32 56 |
| %wide.load.7 = load <8 x i16>, ptr %19, align 2 |
| %20 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.7) |
| %21 = add i16 %20, %18 |
| %22 = getelementptr inbounds i16, ptr %x, i32 64 |
| %wide.load.8 = load <8 x i16>, ptr %22, align 2 |
| %23 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.8) |
| %24 = add i16 %23, %21 |
| %25 = getelementptr inbounds i16, ptr %x, i32 72 |
| %wide.load.9 = load <8 x i16>, ptr %25, align 2 |
| %26 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.9) |
| %27 = add i16 %26, %24 |
| %28 = getelementptr inbounds i16, ptr %x, i32 80 |
| %wide.load.10 = load <8 x i16>, ptr %28, align 2 |
| %29 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.10) |
| %30 = add i16 %29, %27 |
| %31 = getelementptr inbounds i16, ptr %x, i32 88 |
| %wide.load.11 = load <8 x i16>, ptr %31, align 2 |
| %32 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.11) |
| %33 = add i16 %32, %30 |
| %34 = getelementptr inbounds i16, ptr %x, i32 96 |
| %wide.load.12 = load <8 x i16>, ptr %34, align 2 |
| %35 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.12) |
| %36 = add i16 %35, %33 |
| %37 = getelementptr inbounds i16, ptr %x, i32 104 |
| %wide.load.13 = load <8 x i16>, ptr %37, align 2 |
| %38 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.13) |
| %39 = add i16 %38, %36 |
| %40 = getelementptr inbounds i16, ptr %x, i32 112 |
| %wide.load.14 = load <8 x i16>, ptr %40, align 2 |
| %41 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.14) |
| %42 = add i16 %41, %39 |
| %43 = getelementptr inbounds i16, ptr %x, i32 120 |
| %wide.load.15 = load <8 x i16>, ptr %43, align 2 |
| %44 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.15) |
| %45 = add i16 %44, %42 |
| ret i16 %45 |
| } |
| |
| define zeroext i8 @addv2i8i8(ptr %x) { |
| ; CHECK-LABEL: addv2i8i8: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: ldrb r1, [r0] |
| ; CHECK-NEXT: ldrb r0, [r0, #1] |
| ; CHECK-NEXT: add r0, r1 |
| ; CHECK-NEXT: uxtb r0, r0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load i8, ptr %x, align 1 |
| %arrayidx.1 = getelementptr inbounds i8, ptr %x, i32 1 |
| %1 = load i8, ptr %arrayidx.1, align 1 |
| %add.1 = add i8 %1, %0 |
| ret i8 %add.1 |
| } |
| |
| define zeroext i8 @addv4i8i8(ptr %x) { |
| ; CHECK-LABEL: addv4i8i8: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrb.u32 q0, [r0] |
| ; CHECK-NEXT: vaddv.u32 r0, q0 |
| ; CHECK-NEXT: uxtb r0, r0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <4 x i8>, ptr %x, align 1 |
| %1 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %0) |
| ret i8 %1 |
| } |
| |
| define zeroext i8 @addv8i8i8(ptr %x) { |
| ; CHECK-LABEL: addv8i8i8: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrb.u16 q0, [r0] |
| ; CHECK-NEXT: vaddv.u16 r0, q0 |
| ; CHECK-NEXT: uxtb r0, r0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <8 x i8>, ptr %x, align 1 |
| %1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %0) |
| ret i8 %1 |
| } |
| |
| define zeroext i8 @addv16i8i8(ptr %x) { |
| ; CHECK-LABEL: addv16i8i8: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrb.u8 q0, [r0] |
| ; CHECK-NEXT: vaddv.u8 r0, q0 |
| ; CHECK-NEXT: uxtb r0, r0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <16 x i8>, ptr %x, align 1 |
| %1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %0) |
| ret i8 %1 |
| } |
| |
| define zeroext i8 @addv24i8i8(ptr %x) { |
| ; CHECK-LABEL: addv24i8i8: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrb.u16 q1, [r0] |
| ; CHECK-NEXT: vldrb.u8 q0, [r0, #8] |
| ; CHECK-NEXT: vaddv.u16 r0, q1 |
| ; CHECK-NEXT: vaddva.u8 r0, q0 |
| ; CHECK-NEXT: uxtb r0, r0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <8 x i8>, ptr %x, align 1 |
| %arrayidx.8 = getelementptr inbounds i8, ptr %x, i32 8 |
| %1 = load <16 x i8>, ptr %arrayidx.8, align 1 |
| %2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %1) |
| %3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %0) |
| %op.rdx = add i8 %2, %3 |
| ret i8 %op.rdx |
| } |
| |
| define zeroext i8 @addv32i8i8(ptr %x) { |
| ; CHECK-LABEL: addv32i8i8: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrb.u8 q1, [r0] |
| ; CHECK-NEXT: vldrb.u8 q0, [r0, #16] |
| ; CHECK-NEXT: vaddv.u8 r0, q1 |
| ; CHECK-NEXT: vaddva.u8 r0, q0 |
| ; CHECK-NEXT: uxtb r0, r0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <32 x i8>, ptr %x, align 1 |
| %1 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %0) |
| ret i8 %1 |
| } |
| |
| define zeroext i8 @addv64i8i8(ptr %x) { |
| ; CHECK-LABEL: addv64i8i8: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrb.u8 q1, [r0] |
| ; CHECK-NEXT: vldrb.u8 q0, [r0, #16] |
| ; CHECK-NEXT: vaddv.u8 r2, q1 |
| ; CHECK-NEXT: vaddva.u8 r2, q0 |
| ; CHECK-NEXT: vldrb.u8 q0, [r0, #32] |
| ; CHECK-NEXT: vaddva.u8 r2, q0 |
| ; CHECK-NEXT: vldrb.u8 q0, [r0, #48] |
| ; CHECK-NEXT: vaddva.u8 r2, q0 |
| ; CHECK-NEXT: uxtb r0, r2 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <64 x i8>, ptr %x, align 1 |
| %1 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %0) |
| ret i8 %1 |
| } |
| |
| define zeroext i8 @addv128i8i8(ptr %x) { |
| ; CHECK-LABEL: addv128i8i8: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrb.u8 q1, [r0] |
| ; CHECK-NEXT: vldrb.u8 q0, [r0, #16] |
| ; CHECK-NEXT: vaddv.u8 r2, q1 |
| ; CHECK-NEXT: vaddva.u8 r2, q0 |
| ; CHECK-NEXT: vldrb.u8 q0, [r0, #32] |
| ; CHECK-NEXT: vaddva.u8 r2, q0 |
| ; CHECK-NEXT: vldrb.u8 q0, [r0, #48] |
| ; CHECK-NEXT: vaddva.u8 r2, q0 |
| ; CHECK-NEXT: vldrb.u8 q0, [r0, #64] |
| ; CHECK-NEXT: vaddva.u8 r2, q0 |
| ; CHECK-NEXT: vldrb.u8 q0, [r0, #80] |
| ; CHECK-NEXT: vaddva.u8 r2, q0 |
| ; CHECK-NEXT: vldrb.u8 q0, [r0, #96] |
| ; CHECK-NEXT: vaddva.u8 r2, q0 |
| ; CHECK-NEXT: vldrb.u8 q0, [r0, #112] |
| ; CHECK-NEXT: vaddva.u8 r2, q0 |
| ; CHECK-NEXT: uxtb r0, r2 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %wide.load = load <16 x i8>, ptr %x, align 1 |
| %0 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load) |
| %1 = getelementptr inbounds i8, ptr %x, i32 16 |
| %wide.load.1 = load <16 x i8>, ptr %1, align 1 |
| %2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.1) |
| %3 = add i8 %2, %0 |
| %4 = getelementptr inbounds i8, ptr %x, i32 32 |
| %wide.load.2 = load <16 x i8>, ptr %4, align 1 |
| %5 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.2) |
| %6 = add i8 %5, %3 |
| %7 = getelementptr inbounds i8, ptr %x, i32 48 |
| %wide.load.3 = load <16 x i8>, ptr %7, align 1 |
| %8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.3) |
| %9 = add i8 %8, %6 |
| %10 = getelementptr inbounds i8, ptr %x, i32 64 |
| %wide.load.4 = load <16 x i8>, ptr %10, align 1 |
| %11 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.4) |
| %12 = add i8 %11, %9 |
| %13 = getelementptr inbounds i8, ptr %x, i32 80 |
| %wide.load.5 = load <16 x i8>, ptr %13, align 1 |
| %14 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.5) |
| %15 = add i8 %14, %12 |
| %16 = getelementptr inbounds i8, ptr %x, i32 96 |
| %wide.load.6 = load <16 x i8>, ptr %16, align 1 |
| %17 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.6) |
| %18 = add i8 %17, %15 |
| %19 = getelementptr inbounds i8, ptr %x, i32 112 |
| %wide.load.7 = load <16 x i8>, ptr %19, align 1 |
| %20 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.7) |
| %21 = add i8 %20, %18 |
| ret i8 %21 |
| } |
| |
| |
| |
| define i32 @mlav2i32i32(ptr %x, ptr %y) { |
| ; CHECK-LABEL: mlav2i32i32: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: ldrd r0, r2, [r0] |
| ; CHECK-NEXT: ldrd r1, r3, [r1] |
| ; CHECK-NEXT: muls r0, r1, r0 |
| ; CHECK-NEXT: mla r0, r3, r2, r0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load i32, ptr %x, align 4 |
| %1 = load i32, ptr %y, align 4 |
| %mul = mul nsw i32 %1, %0 |
| %arrayidx.1 = getelementptr inbounds i32, ptr %x, i32 1 |
| %2 = load i32, ptr %arrayidx.1, align 4 |
| %arrayidx1.1 = getelementptr inbounds i32, ptr %y, i32 1 |
| %3 = load i32, ptr %arrayidx1.1, align 4 |
| %mul.1 = mul nsw i32 %3, %2 |
| %add.1 = add nsw i32 %mul.1, %mul |
| ret i32 %add.1 |
| } |
| |
| define i32 @mlav4i32i32(ptr %x, ptr %y) { |
| ; CHECK-LABEL: mlav4i32i32: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrw.u32 q0, [r0] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1] |
| ; CHECK-NEXT: vmlav.u32 r0, q1, q0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <4 x i32>, ptr %x, align 4 |
| %1 = load <4 x i32>, ptr %y, align 4 |
| %2 = mul nsw <4 x i32> %1, %0 |
| %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2) |
| ret i32 %3 |
| } |
| |
| define i32 @mlav8i32i32(ptr %x, ptr %y) { |
| ; CHECK-LABEL: mlav8i32i32: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrw.u32 q0, [r0] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1] |
| ; CHECK-NEXT: vmlav.u32 r2, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #16] |
| ; CHECK-NEXT: vmlava.u32 r2, q1, q0 |
| ; CHECK-NEXT: mov r0, r2 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <8 x i32>, ptr %x, align 4 |
| %1 = load <8 x i32>, ptr %y, align 4 |
| %2 = mul nsw <8 x i32> %1, %0 |
| %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2) |
| ret i32 %3 |
| } |
| |
| define i32 @mlav16i32i32(ptr %x, ptr %y) { |
| ; CHECK-LABEL: mlav16i32i32: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrw.u32 q0, [r0] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1] |
| ; CHECK-NEXT: vmlav.u32 r2, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #16] |
| ; CHECK-NEXT: vmlava.u32 r2, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #32] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #32] |
| ; CHECK-NEXT: vmlava.u32 r2, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r0, #48] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #48] |
| ; CHECK-NEXT: vmlava.u32 r2, q1, q0 |
| ; CHECK-NEXT: mov r0, r2 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <16 x i32>, ptr %x, align 4 |
| %1 = load <16 x i32>, ptr %y, align 4 |
| %2 = mul nsw <16 x i32> %1, %0 |
| %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2) |
| ret i32 %3 |
| } |
| |
| define i32 @mlav24i32i32(ptr %x, ptr %y) { |
| ; CHECK-LABEL: mlav24i32i32: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrw.u32 q0, [r0] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1] |
| ; CHECK-NEXT: mov r2, r0 |
| ; CHECK-NEXT: vmlav.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #16] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #16] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #32] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #32] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #48] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #48] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #64] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #64] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #80] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #80] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <8 x i32>, ptr %x, align 4 |
| %1 = load <8 x i32>, ptr %y, align 4 |
| %2 = mul nsw <8 x i32> %1, %0 |
| %arrayidx.8 = getelementptr inbounds i32, ptr %x, i32 8 |
| %arrayidx1.8 = getelementptr inbounds i32, ptr %y, i32 8 |
| %3 = load <16 x i32>, ptr %arrayidx.8, align 4 |
| %4 = load <16 x i32>, ptr %arrayidx1.8, align 4 |
| %5 = mul nsw <16 x i32> %4, %3 |
| %6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5) |
| %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2) |
| %op.rdx = add nsw i32 %6, %7 |
| ret i32 %op.rdx |
| } |
| |
| define i32 @mlav32i32i32(ptr %x, ptr %y) { |
| ; CHECK-LABEL: mlav32i32i32: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrw.u32 q0, [r0] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1] |
| ; CHECK-NEXT: mov r2, r0 |
| ; CHECK-NEXT: vmlav.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #16] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #16] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #32] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #32] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #48] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #48] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #64] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #64] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #80] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #80] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #96] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #96] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #112] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #112] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <32 x i32>, ptr %x, align 4 |
| %1 = load <32 x i32>, ptr %y, align 4 |
| %2 = mul nsw <32 x i32> %1, %0 |
| %3 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %2) |
| ret i32 %3 |
| } |
| |
| define i32 @mlav64i32i32(ptr %x, ptr %y) { |
| ; CHECK-LABEL: mlav64i32i32: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrw.u32 q0, [r0] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1] |
| ; CHECK-NEXT: mov r2, r0 |
| ; CHECK-NEXT: vmlav.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #16] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #16] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #32] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #32] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #48] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #48] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #64] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #64] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #80] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #80] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #96] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #96] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #112] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #112] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #128] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #128] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #144] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #144] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #160] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #160] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #176] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #176] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #192] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #192] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #208] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #208] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #224] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #224] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #240] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #240] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %wide.load = load <4 x i32>, ptr %x, align 4 |
| %wide.load10 = load <4 x i32>, ptr %y, align 4 |
| %0 = mul nsw <4 x i32> %wide.load10, %wide.load |
| %1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %0) |
| %2 = getelementptr inbounds i32, ptr %x, i32 4 |
| %wide.load.1 = load <4 x i32>, ptr %2, align 4 |
| %3 = getelementptr inbounds i32, ptr %y, i32 4 |
| %wide.load10.1 = load <4 x i32>, ptr %3, align 4 |
| %4 = mul nsw <4 x i32> %wide.load10.1, %wide.load.1 |
| %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4) |
| %6 = add i32 %5, %1 |
| %7 = getelementptr inbounds i32, ptr %x, i32 8 |
| %wide.load.2 = load <4 x i32>, ptr %7, align 4 |
| %8 = getelementptr inbounds i32, ptr %y, i32 8 |
| %wide.load10.2 = load <4 x i32>, ptr %8, align 4 |
| %9 = mul nsw <4 x i32> %wide.load10.2, %wide.load.2 |
| %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %9) |
| %11 = add i32 %10, %6 |
| %12 = getelementptr inbounds i32, ptr %x, i32 12 |
| %wide.load.3 = load <4 x i32>, ptr %12, align 4 |
| %13 = getelementptr inbounds i32, ptr %y, i32 12 |
| %wide.load10.3 = load <4 x i32>, ptr %13, align 4 |
| %14 = mul nsw <4 x i32> %wide.load10.3, %wide.load.3 |
| %15 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %14) |
| %16 = add i32 %15, %11 |
| %17 = getelementptr inbounds i32, ptr %x, i32 16 |
| %wide.load.4 = load <4 x i32>, ptr %17, align 4 |
| %18 = getelementptr inbounds i32, ptr %y, i32 16 |
| %wide.load10.4 = load <4 x i32>, ptr %18, align 4 |
| %19 = mul nsw <4 x i32> %wide.load10.4, %wide.load.4 |
| %20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %19) |
| %21 = add i32 %20, %16 |
| %22 = getelementptr inbounds i32, ptr %x, i32 20 |
| %wide.load.5 = load <4 x i32>, ptr %22, align 4 |
| %23 = getelementptr inbounds i32, ptr %y, i32 20 |
| %wide.load10.5 = load <4 x i32>, ptr %23, align 4 |
| %24 = mul nsw <4 x i32> %wide.load10.5, %wide.load.5 |
| %25 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %24) |
| %26 = add i32 %25, %21 |
| %27 = getelementptr inbounds i32, ptr %x, i32 24 |
| %wide.load.6 = load <4 x i32>, ptr %27, align 4 |
| %28 = getelementptr inbounds i32, ptr %y, i32 24 |
| %wide.load10.6 = load <4 x i32>, ptr %28, align 4 |
| %29 = mul nsw <4 x i32> %wide.load10.6, %wide.load.6 |
| %30 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %29) |
| %31 = add i32 %30, %26 |
| %32 = getelementptr inbounds i32, ptr %x, i32 28 |
| %wide.load.7 = load <4 x i32>, ptr %32, align 4 |
| %33 = getelementptr inbounds i32, ptr %y, i32 28 |
| %wide.load10.7 = load <4 x i32>, ptr %33, align 4 |
| %34 = mul nsw <4 x i32> %wide.load10.7, %wide.load.7 |
| %35 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %34) |
| %36 = add i32 %35, %31 |
| %37 = getelementptr inbounds i32, ptr %x, i32 32 |
| %wide.load.8 = load <4 x i32>, ptr %37, align 4 |
| %38 = getelementptr inbounds i32, ptr %y, i32 32 |
| %wide.load10.8 = load <4 x i32>, ptr %38, align 4 |
| %39 = mul nsw <4 x i32> %wide.load10.8, %wide.load.8 |
| %40 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %39) |
| %41 = add i32 %40, %36 |
| %42 = getelementptr inbounds i32, ptr %x, i32 36 |
| %wide.load.9 = load <4 x i32>, ptr %42, align 4 |
| %43 = getelementptr inbounds i32, ptr %y, i32 36 |
| %wide.load10.9 = load <4 x i32>, ptr %43, align 4 |
| %44 = mul nsw <4 x i32> %wide.load10.9, %wide.load.9 |
| %45 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %44) |
| %46 = add i32 %45, %41 |
| %47 = getelementptr inbounds i32, ptr %x, i32 40 |
| %wide.load.10 = load <4 x i32>, ptr %47, align 4 |
| %48 = getelementptr inbounds i32, ptr %y, i32 40 |
| %wide.load10.10 = load <4 x i32>, ptr %48, align 4 |
| %49 = mul nsw <4 x i32> %wide.load10.10, %wide.load.10 |
| %50 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %49) |
| %51 = add i32 %50, %46 |
| %52 = getelementptr inbounds i32, ptr %x, i32 44 |
| %wide.load.11 = load <4 x i32>, ptr %52, align 4 |
| %53 = getelementptr inbounds i32, ptr %y, i32 44 |
| %wide.load10.11 = load <4 x i32>, ptr %53, align 4 |
| %54 = mul nsw <4 x i32> %wide.load10.11, %wide.load.11 |
| %55 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %54) |
| %56 = add i32 %55, %51 |
| %57 = getelementptr inbounds i32, ptr %x, i32 48 |
| %wide.load.12 = load <4 x i32>, ptr %57, align 4 |
| %58 = getelementptr inbounds i32, ptr %y, i32 48 |
| %wide.load10.12 = load <4 x i32>, ptr %58, align 4 |
| %59 = mul nsw <4 x i32> %wide.load10.12, %wide.load.12 |
| %60 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %59) |
| %61 = add i32 %60, %56 |
| %62 = getelementptr inbounds i32, ptr %x, i32 52 |
| %wide.load.13 = load <4 x i32>, ptr %62, align 4 |
| %63 = getelementptr inbounds i32, ptr %y, i32 52 |
| %wide.load10.13 = load <4 x i32>, ptr %63, align 4 |
| %64 = mul nsw <4 x i32> %wide.load10.13, %wide.load.13 |
| %65 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %64) |
| %66 = add i32 %65, %61 |
| %67 = getelementptr inbounds i32, ptr %x, i32 56 |
| %wide.load.14 = load <4 x i32>, ptr %67, align 4 |
| %68 = getelementptr inbounds i32, ptr %y, i32 56 |
| %wide.load10.14 = load <4 x i32>, ptr %68, align 4 |
| %69 = mul nsw <4 x i32> %wide.load10.14, %wide.load.14 |
| %70 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %69) |
| %71 = add i32 %70, %66 |
| %72 = getelementptr inbounds i32, ptr %x, i32 60 |
| %wide.load.15 = load <4 x i32>, ptr %72, align 4 |
| %73 = getelementptr inbounds i32, ptr %y, i32 60 |
| %wide.load10.15 = load <4 x i32>, ptr %73, align 4 |
| %74 = mul nsw <4 x i32> %wide.load10.15, %wide.load.15 |
| %75 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %74) |
| %76 = add i32 %75, %71 |
| ret i32 %76 |
| } |
| |
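| ; 128 x i32 case: thirty-two unrolled <4 x i32> mul + reduce.add steps, expected to become vmlav.u32/vmlava.u32 pairs covering offsets up to #496. |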
| define i32 @mlav128i32i32(ptr %x, ptr %y) { |
| ; CHECK-LABEL: mlav128i32i32: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrw.u32 q0, [r0] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1] |
| ; CHECK-NEXT: mov r2, r0 |
| ; CHECK-NEXT: vmlav.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #16] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #16] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #32] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #32] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #48] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #48] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #64] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #64] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #80] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #80] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #96] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #96] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #112] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #112] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #128] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #128] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #144] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #144] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #160] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #160] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #176] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #176] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #192] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #192] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #208] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #208] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #224] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #224] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #240] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #240] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #256] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #256] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #272] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #272] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #288] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #288] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #304] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #304] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #320] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #320] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #336] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #336] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #352] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #352] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #368] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #368] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #384] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #384] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #400] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #400] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #416] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #416] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #432] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #432] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #448] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #448] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #464] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #464] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #480] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #480] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrw.u32 q0, [r2, #496] |
| ; CHECK-NEXT: vldrw.u32 q1, [r1, #496] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %wide.load = load <4 x i32>, ptr %x, align 4 |
| %wide.load10 = load <4 x i32>, ptr %y, align 4 |
| %0 = mul nsw <4 x i32> %wide.load10, %wide.load |
| %1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %0) |
| %2 = getelementptr inbounds i32, ptr %x, i32 4 |
| %wide.load.1 = load <4 x i32>, ptr %2, align 4 |
| %3 = getelementptr inbounds i32, ptr %y, i32 4 |
| %wide.load10.1 = load <4 x i32>, ptr %3, align 4 |
| %4 = mul nsw <4 x i32> %wide.load10.1, %wide.load.1 |
| %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4) |
| %6 = add i32 %5, %1 |
| %7 = getelementptr inbounds i32, ptr %x, i32 8 |
| %wide.load.2 = load <4 x i32>, ptr %7, align 4 |
| %8 = getelementptr inbounds i32, ptr %y, i32 8 |
| %wide.load10.2 = load <4 x i32>, ptr %8, align 4 |
| %9 = mul nsw <4 x i32> %wide.load10.2, %wide.load.2 |
| %10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %9) |
| %11 = add i32 %10, %6 |
| %12 = getelementptr inbounds i32, ptr %x, i32 12 |
| %wide.load.3 = load <4 x i32>, ptr %12, align 4 |
| %13 = getelementptr inbounds i32, ptr %y, i32 12 |
| %wide.load10.3 = load <4 x i32>, ptr %13, align 4 |
| %14 = mul nsw <4 x i32> %wide.load10.3, %wide.load.3 |
| %15 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %14) |
| %16 = add i32 %15, %11 |
| %17 = getelementptr inbounds i32, ptr %x, i32 16 |
| %wide.load.4 = load <4 x i32>, ptr %17, align 4 |
| %18 = getelementptr inbounds i32, ptr %y, i32 16 |
| %wide.load10.4 = load <4 x i32>, ptr %18, align 4 |
| %19 = mul nsw <4 x i32> %wide.load10.4, %wide.load.4 |
| %20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %19) |
| %21 = add i32 %20, %16 |
| %22 = getelementptr inbounds i32, ptr %x, i32 20 |
| %wide.load.5 = load <4 x i32>, ptr %22, align 4 |
| %23 = getelementptr inbounds i32, ptr %y, i32 20 |
| %wide.load10.5 = load <4 x i32>, ptr %23, align 4 |
| %24 = mul nsw <4 x i32> %wide.load10.5, %wide.load.5 |
| %25 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %24) |
| %26 = add i32 %25, %21 |
| %27 = getelementptr inbounds i32, ptr %x, i32 24 |
| %wide.load.6 = load <4 x i32>, ptr %27, align 4 |
| %28 = getelementptr inbounds i32, ptr %y, i32 24 |
| %wide.load10.6 = load <4 x i32>, ptr %28, align 4 |
| %29 = mul nsw <4 x i32> %wide.load10.6, %wide.load.6 |
| %30 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %29) |
| %31 = add i32 %30, %26 |
| %32 = getelementptr inbounds i32, ptr %x, i32 28 |
| %wide.load.7 = load <4 x i32>, ptr %32, align 4 |
| %33 = getelementptr inbounds i32, ptr %y, i32 28 |
| %wide.load10.7 = load <4 x i32>, ptr %33, align 4 |
| %34 = mul nsw <4 x i32> %wide.load10.7, %wide.load.7 |
| %35 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %34) |
| %36 = add i32 %35, %31 |
| %37 = getelementptr inbounds i32, ptr %x, i32 32 |
| %wide.load.8 = load <4 x i32>, ptr %37, align 4 |
| %38 = getelementptr inbounds i32, ptr %y, i32 32 |
| %wide.load10.8 = load <4 x i32>, ptr %38, align 4 |
| %39 = mul nsw <4 x i32> %wide.load10.8, %wide.load.8 |
| %40 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %39) |
| %41 = add i32 %40, %36 |
| %42 = getelementptr inbounds i32, ptr %x, i32 36 |
| %wide.load.9 = load <4 x i32>, ptr %42, align 4 |
| %43 = getelementptr inbounds i32, ptr %y, i32 36 |
| %wide.load10.9 = load <4 x i32>, ptr %43, align 4 |
| %44 = mul nsw <4 x i32> %wide.load10.9, %wide.load.9 |
| %45 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %44) |
| %46 = add i32 %45, %41 |
| %47 = getelementptr inbounds i32, ptr %x, i32 40 |
| %wide.load.10 = load <4 x i32>, ptr %47, align 4 |
| %48 = getelementptr inbounds i32, ptr %y, i32 40 |
| %wide.load10.10 = load <4 x i32>, ptr %48, align 4 |
| %49 = mul nsw <4 x i32> %wide.load10.10, %wide.load.10 |
| %50 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %49) |
| %51 = add i32 %50, %46 |
| %52 = getelementptr inbounds i32, ptr %x, i32 44 |
| %wide.load.11 = load <4 x i32>, ptr %52, align 4 |
| %53 = getelementptr inbounds i32, ptr %y, i32 44 |
| %wide.load10.11 = load <4 x i32>, ptr %53, align 4 |
| %54 = mul nsw <4 x i32> %wide.load10.11, %wide.load.11 |
| %55 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %54) |
| %56 = add i32 %55, %51 |
| %57 = getelementptr inbounds i32, ptr %x, i32 48 |
| %wide.load.12 = load <4 x i32>, ptr %57, align 4 |
| %58 = getelementptr inbounds i32, ptr %y, i32 48 |
| %wide.load10.12 = load <4 x i32>, ptr %58, align 4 |
| %59 = mul nsw <4 x i32> %wide.load10.12, %wide.load.12 |
| %60 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %59) |
| %61 = add i32 %60, %56 |
| %62 = getelementptr inbounds i32, ptr %x, i32 52 |
| %wide.load.13 = load <4 x i32>, ptr %62, align 4 |
| %63 = getelementptr inbounds i32, ptr %y, i32 52 |
| %wide.load10.13 = load <4 x i32>, ptr %63, align 4 |
| %64 = mul nsw <4 x i32> %wide.load10.13, %wide.load.13 |
| %65 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %64) |
| %66 = add i32 %65, %61 |
| %67 = getelementptr inbounds i32, ptr %x, i32 56 |
| %wide.load.14 = load <4 x i32>, ptr %67, align 4 |
| %68 = getelementptr inbounds i32, ptr %y, i32 56 |
| %wide.load10.14 = load <4 x i32>, ptr %68, align 4 |
| %69 = mul nsw <4 x i32> %wide.load10.14, %wide.load.14 |
| %70 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %69) |
| %71 = add i32 %70, %66 |
| %72 = getelementptr inbounds i32, ptr %x, i32 60 |
| %wide.load.15 = load <4 x i32>, ptr %72, align 4 |
| %73 = getelementptr inbounds i32, ptr %y, i32 60 |
| %wide.load10.15 = load <4 x i32>, ptr %73, align 4 |
| %74 = mul nsw <4 x i32> %wide.load10.15, %wide.load.15 |
| %75 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %74) |
| %76 = add i32 %75, %71 |
| %77 = getelementptr inbounds i32, ptr %x, i32 64 |
| %wide.load.16 = load <4 x i32>, ptr %77, align 4 |
| %78 = getelementptr inbounds i32, ptr %y, i32 64 |
| %wide.load10.16 = load <4 x i32>, ptr %78, align 4 |
| %79 = mul nsw <4 x i32> %wide.load10.16, %wide.load.16 |
| %80 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %79) |
| %81 = add i32 %80, %76 |
| %82 = getelementptr inbounds i32, ptr %x, i32 68 |
| %wide.load.17 = load <4 x i32>, ptr %82, align 4 |
| %83 = getelementptr inbounds i32, ptr %y, i32 68 |
| %wide.load10.17 = load <4 x i32>, ptr %83, align 4 |
| %84 = mul nsw <4 x i32> %wide.load10.17, %wide.load.17 |
| %85 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %84) |
| %86 = add i32 %85, %81 |
| %87 = getelementptr inbounds i32, ptr %x, i32 72 |
| %wide.load.18 = load <4 x i32>, ptr %87, align 4 |
| %88 = getelementptr inbounds i32, ptr %y, i32 72 |
| %wide.load10.18 = load <4 x i32>, ptr %88, align 4 |
| %89 = mul nsw <4 x i32> %wide.load10.18, %wide.load.18 |
| %90 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %89) |
| %91 = add i32 %90, %86 |
| %92 = getelementptr inbounds i32, ptr %x, i32 76 |
| %wide.load.19 = load <4 x i32>, ptr %92, align 4 |
| %93 = getelementptr inbounds i32, ptr %y, i32 76 |
| %wide.load10.19 = load <4 x i32>, ptr %93, align 4 |
| %94 = mul nsw <4 x i32> %wide.load10.19, %wide.load.19 |
| %95 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %94) |
| %96 = add i32 %95, %91 |
| %97 = getelementptr inbounds i32, ptr %x, i32 80 |
| %wide.load.20 = load <4 x i32>, ptr %97, align 4 |
| %98 = getelementptr inbounds i32, ptr %y, i32 80 |
| %wide.load10.20 = load <4 x i32>, ptr %98, align 4 |
| %99 = mul nsw <4 x i32> %wide.load10.20, %wide.load.20 |
| %100 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %99) |
| %101 = add i32 %100, %96 |
| %102 = getelementptr inbounds i32, ptr %x, i32 84 |
| %wide.load.21 = load <4 x i32>, ptr %102, align 4 |
| %103 = getelementptr inbounds i32, ptr %y, i32 84 |
| %wide.load10.21 = load <4 x i32>, ptr %103, align 4 |
| %104 = mul nsw <4 x i32> %wide.load10.21, %wide.load.21 |
| %105 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %104) |
| %106 = add i32 %105, %101 |
| %107 = getelementptr inbounds i32, ptr %x, i32 88 |
| %wide.load.22 = load <4 x i32>, ptr %107, align 4 |
| %108 = getelementptr inbounds i32, ptr %y, i32 88 |
| %wide.load10.22 = load <4 x i32>, ptr %108, align 4 |
| %109 = mul nsw <4 x i32> %wide.load10.22, %wide.load.22 |
| %110 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %109) |
| %111 = add i32 %110, %106 |
| %112 = getelementptr inbounds i32, ptr %x, i32 92 |
| %wide.load.23 = load <4 x i32>, ptr %112, align 4 |
| %113 = getelementptr inbounds i32, ptr %y, i32 92 |
| %wide.load10.23 = load <4 x i32>, ptr %113, align 4 |
| %114 = mul nsw <4 x i32> %wide.load10.23, %wide.load.23 |
| %115 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %114) |
| %116 = add i32 %115, %111 |
| %117 = getelementptr inbounds i32, ptr %x, i32 96 |
| %wide.load.24 = load <4 x i32>, ptr %117, align 4 |
| %118 = getelementptr inbounds i32, ptr %y, i32 96 |
| %wide.load10.24 = load <4 x i32>, ptr %118, align 4 |
| %119 = mul nsw <4 x i32> %wide.load10.24, %wide.load.24 |
| %120 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %119) |
| %121 = add i32 %120, %116 |
| %122 = getelementptr inbounds i32, ptr %x, i32 100 |
| %wide.load.25 = load <4 x i32>, ptr %122, align 4 |
| %123 = getelementptr inbounds i32, ptr %y, i32 100 |
| %wide.load10.25 = load <4 x i32>, ptr %123, align 4 |
| %124 = mul nsw <4 x i32> %wide.load10.25, %wide.load.25 |
| %125 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %124) |
| %126 = add i32 %125, %121 |
| %127 = getelementptr inbounds i32, ptr %x, i32 104 |
| %wide.load.26 = load <4 x i32>, ptr %127, align 4 |
| %128 = getelementptr inbounds i32, ptr %y, i32 104 |
| %wide.load10.26 = load <4 x i32>, ptr %128, align 4 |
| %129 = mul nsw <4 x i32> %wide.load10.26, %wide.load.26 |
| %130 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %129) |
| %131 = add i32 %130, %126 |
| %132 = getelementptr inbounds i32, ptr %x, i32 108 |
| %wide.load.27 = load <4 x i32>, ptr %132, align 4 |
| %133 = getelementptr inbounds i32, ptr %y, i32 108 |
| %wide.load10.27 = load <4 x i32>, ptr %133, align 4 |
| %134 = mul nsw <4 x i32> %wide.load10.27, %wide.load.27 |
| %135 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %134) |
| %136 = add i32 %135, %131 |
| %137 = getelementptr inbounds i32, ptr %x, i32 112 |
| %wide.load.28 = load <4 x i32>, ptr %137, align 4 |
| %138 = getelementptr inbounds i32, ptr %y, i32 112 |
| %wide.load10.28 = load <4 x i32>, ptr %138, align 4 |
| %139 = mul nsw <4 x i32> %wide.load10.28, %wide.load.28 |
| %140 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %139) |
| %141 = add i32 %140, %136 |
| %142 = getelementptr inbounds i32, ptr %x, i32 116 |
| %wide.load.29 = load <4 x i32>, ptr %142, align 4 |
| %143 = getelementptr inbounds i32, ptr %y, i32 116 |
| %wide.load10.29 = load <4 x i32>, ptr %143, align 4 |
| %144 = mul nsw <4 x i32> %wide.load10.29, %wide.load.29 |
| %145 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %144) |
| %146 = add i32 %145, %141 |
| %147 = getelementptr inbounds i32, ptr %x, i32 120 |
| %wide.load.30 = load <4 x i32>, ptr %147, align 4 |
| %148 = getelementptr inbounds i32, ptr %y, i32 120 |
| %wide.load10.30 = load <4 x i32>, ptr %148, align 4 |
| %149 = mul nsw <4 x i32> %wide.load10.30, %wide.load.30 |
| %150 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %149) |
| %151 = add i32 %150, %146 |
| %152 = getelementptr inbounds i32, ptr %x, i32 124 |
| %wide.load.31 = load <4 x i32>, ptr %152, align 4 |
| %153 = getelementptr inbounds i32, ptr %y, i32 124 |
| %wide.load10.31 = load <4 x i32>, ptr %153, align 4 |
| %154 = mul nsw <4 x i32> %wide.load10.31, %wide.load.31 |
| %155 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %154) |
| %156 = add i32 %155, %151 |
| ret i32 %156 |
| } |
| |
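| ; 2 x i16 -> i32 multiply-add is too small for MVE; expect scalar ldrsh loads with muls and smlabb. |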
| define i32 @mlav2i32i16(ptr %x, ptr %y) { |
| ; CHECK-LABEL: mlav2i32i16: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: ldrsh.w r2, [r0] |
| ; CHECK-NEXT: ldrsh.w r3, [r1] |
| ; CHECK-NEXT: ldrsh.w r0, [r0, #2] |
| ; CHECK-NEXT: ldrsh.w r1, [r1, #2] |
| ; CHECK-NEXT: muls r0, r1, r0 |
| ; CHECK-NEXT: smlabb r0, r3, r2, r0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load i16, ptr %x, align 2 |
| %conv = sext i16 %0 to i32 |
| %1 = load i16, ptr %y, align 2 |
| %conv2 = sext i16 %1 to i32 |
| %mul = mul nsw i32 %conv2, %conv |
| %arrayidx.1 = getelementptr inbounds i16, ptr %x, i32 1 |
| %2 = load i16, ptr %arrayidx.1, align 2 |
| %conv.1 = sext i16 %2 to i32 |
| %arrayidx1.1 = getelementptr inbounds i16, ptr %y, i32 1 |
| %3 = load i16, ptr %arrayidx1.1, align 2 |
| %conv2.1 = sext i16 %3 to i32 |
| %mul.1 = mul nsw i32 %conv2.1, %conv.1 |
| %add.1 = add nsw i32 %mul.1, %mul |
| ret i32 %add.1 |
| } |
| |
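| ; 4 x i16 -> i32: sign-extending vldrh.s32 loads feeding a single vmlav.u32. |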
| define i32 @mlav4i32i16(ptr %x, ptr %y) { |
| ; CHECK-LABEL: mlav4i32i16: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrh.s32 q0, [r0] |
| ; CHECK-NEXT: vldrh.s32 q1, [r1] |
| ; CHECK-NEXT: vmlav.u32 r0, q1, q0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <4 x i16>, ptr %x, align 2 |
| %1 = sext <4 x i16> %0 to <4 x i32> |
| %2 = load <4 x i16>, ptr %y, align 2 |
| %3 = sext <4 x i16> %2 to <4 x i32> |
| %4 = mul nsw <4 x i32> %3, %1 |
| %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4) |
| ret i32 %5 |
| } |
| |
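| ; 8 x i16 -> i32: whole-register vldrh.u16 loads with the widening handled by vmlav.s16. |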
| define i32 @mlav8i32i16(ptr %x, ptr %y) { |
| ; CHECK-LABEL: mlav8i32i16: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrh.u16 q0, [r0] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1] |
| ; CHECK-NEXT: vmlav.s16 r0, q1, q0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <8 x i16>, ptr %x, align 2 |
| %1 = sext <8 x i16> %0 to <8 x i32> |
| %2 = load <8 x i16>, ptr %y, align 2 |
| %3 = sext <8 x i16> %2 to <8 x i32> |
| %4 = mul nsw <8 x i32> %3, %1 |
| %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4) |
| ret i32 %5 |
| } |
| |
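| ; 16 x i16 -> i32: split into four vldrh.s32 chunks accumulated with vmlav.u32/vmlava.u32. |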
| define i32 @mlav16i32i16(ptr %x, ptr %y) { |
| ; CHECK-LABEL: mlav16i32i16: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrh.s32 q0, [r0] |
| ; CHECK-NEXT: vldrh.s32 q1, [r1] |
| ; CHECK-NEXT: vmlav.u32 r2, q1, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #8] |
| ; CHECK-NEXT: vldrh.s32 q1, [r1, #8] |
| ; CHECK-NEXT: vmlava.u32 r2, q1, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #16] |
| ; CHECK-NEXT: vldrh.s32 q1, [r1, #16] |
| ; CHECK-NEXT: vmlava.u32 r2, q1, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r0, #24] |
| ; CHECK-NEXT: vldrh.s32 q1, [r1, #24] |
| ; CHECK-NEXT: vmlava.u32 r2, q1, q0 |
| ; CHECK-NEXT: mov r0, r2 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <16 x i16>, ptr %x, align 2 |
| %1 = sext <16 x i16> %0 to <16 x i32> |
| %2 = load <16 x i16>, ptr %y, align 2 |
| %3 = sext <16 x i16> %2 to <16 x i32> |
| %4 = mul nsw <16 x i32> %3, %1 |
| %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4) |
| ret i32 %5 |
| } |
| |
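| ; 24 x i16 -> i32 built from an 8-wide plus a 16-wide reduction; the first part uses vmlav.s16, the remainder vldrh.s32/vmlava.u32 chunks. |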
| define i32 @mlav24i32i16(ptr %x, ptr %y) { |
| ; CHECK-LABEL: mlav24i32i16: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrh.u16 q0, [r0] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1] |
| ; CHECK-NEXT: mov r2, r0 |
| ; CHECK-NEXT: vmlav.s16 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r2, #16] |
| ; CHECK-NEXT: vldrh.s32 q1, [r1, #16] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r2, #24] |
| ; CHECK-NEXT: vldrh.s32 q1, [r1, #24] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r2, #32] |
| ; CHECK-NEXT: vldrh.s32 q1, [r1, #32] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r2, #40] |
| ; CHECK-NEXT: vldrh.s32 q1, [r1, #40] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <8 x i16>, ptr %x, align 2 |
| %1 = sext <8 x i16> %0 to <8 x i32> |
| %2 = load <8 x i16>, ptr %y, align 2 |
| %3 = sext <8 x i16> %2 to <8 x i32> |
| %4 = mul nsw <8 x i32> %3, %1 |
| %arrayidx.8 = getelementptr inbounds i16, ptr %x, i32 8 |
| %arrayidx1.8 = getelementptr inbounds i16, ptr %y, i32 8 |
| %5 = load <16 x i16>, ptr %arrayidx.8, align 2 |
| %6 = sext <16 x i16> %5 to <16 x i32> |
| %7 = load <16 x i16>, ptr %arrayidx1.8, align 2 |
| %8 = sext <16 x i16> %7 to <16 x i32> |
| %9 = mul nsw <16 x i32> %8, %6 |
| %10 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %9) |
| %11 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4) |
| %op.rdx = add nsw i32 %10, %11 |
| ret i32 %op.rdx |
| } |
| |
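| ; 32 x i16 -> i32: eight vldrh.s32 chunks accumulated with vmlav.u32/vmlava.u32. |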
| define i32 @mlav32i32i16(ptr %x, ptr %y) { |
| ; CHECK-LABEL: mlav32i32i16: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrh.s32 q0, [r0] |
| ; CHECK-NEXT: vldrh.s32 q1, [r1] |
| ; CHECK-NEXT: mov r2, r0 |
| ; CHECK-NEXT: vmlav.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r2, #8] |
| ; CHECK-NEXT: vldrh.s32 q1, [r1, #8] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r2, #16] |
| ; CHECK-NEXT: vldrh.s32 q1, [r1, #16] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r2, #24] |
| ; CHECK-NEXT: vldrh.s32 q1, [r1, #24] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r2, #32] |
| ; CHECK-NEXT: vldrh.s32 q1, [r1, #32] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r2, #40] |
| ; CHECK-NEXT: vldrh.s32 q1, [r1, #40] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r2, #48] |
| ; CHECK-NEXT: vldrh.s32 q1, [r1, #48] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.s32 q0, [r2, #56] |
| ; CHECK-NEXT: vldrh.s32 q1, [r1, #56] |
| ; CHECK-NEXT: vmlava.u32 r0, q1, q0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %0 = load <32 x i16>, ptr %x, align 2 |
| %1 = sext <32 x i16> %0 to <32 x i32> |
| %2 = load <32 x i16>, ptr %y, align 2 |
| %3 = sext <32 x i16> %2 to <32 x i32> |
| %4 = mul nsw <32 x i32> %3, %1 |
| %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4) |
| ret i32 %5 |
| } |
| |
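| ; 64 x i16 -> i32 from eight unrolled <8 x i16> steps; expect vldrh.u16 loads with vmlav.s16/vmlava.s16. |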
| define i32 @mlav64i32i16(ptr %x, ptr %y) { |
| ; CHECK-LABEL: mlav64i32i16: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrh.u16 q0, [r0] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1] |
| ; CHECK-NEXT: mov r2, r0 |
| ; CHECK-NEXT: vmlav.s16 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r2, #16] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1, #16] |
| ; CHECK-NEXT: vmlava.s16 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r2, #32] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1, #32] |
| ; CHECK-NEXT: vmlava.s16 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r2, #48] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1, #48] |
| ; CHECK-NEXT: vmlava.s16 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r2, #64] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1, #64] |
| ; CHECK-NEXT: vmlava.s16 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r2, #80] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1, #80] |
| ; CHECK-NEXT: vmlava.s16 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r2, #96] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1, #96] |
| ; CHECK-NEXT: vmlava.s16 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r2, #112] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1, #112] |
| ; CHECK-NEXT: vmlava.s16 r0, q1, q0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %wide.load = load <8 x i16>, ptr %x, align 2 |
| %0 = sext <8 x i16> %wide.load to <8 x i32> |
| %wide.load11 = load <8 x i16>, ptr %y, align 2 |
| %1 = sext <8 x i16> %wide.load11 to <8 x i32> |
| %2 = mul nsw <8 x i32> %1, %0 |
| %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2) |
| %4 = getelementptr inbounds i16, ptr %x, i32 8 |
| %wide.load.1 = load <8 x i16>, ptr %4, align 2 |
| %5 = sext <8 x i16> %wide.load.1 to <8 x i32> |
| %6 = getelementptr inbounds i16, ptr %y, i32 8 |
| %wide.load11.1 = load <8 x i16>, ptr %6, align 2 |
| %7 = sext <8 x i16> %wide.load11.1 to <8 x i32> |
| %8 = mul nsw <8 x i32> %7, %5 |
| %9 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %8) |
| %10 = add i32 %9, %3 |
| %11 = getelementptr inbounds i16, ptr %x, i32 16 |
| %wide.load.2 = load <8 x i16>, ptr %11, align 2 |
| %12 = sext <8 x i16> %wide.load.2 to <8 x i32> |
| %13 = getelementptr inbounds i16, ptr %y, i32 16 |
| %wide.load11.2 = load <8 x i16>, ptr %13, align 2 |
| %14 = sext <8 x i16> %wide.load11.2 to <8 x i32> |
| %15 = mul nsw <8 x i32> %14, %12 |
| %16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %15) |
| %17 = add i32 %16, %10 |
| %18 = getelementptr inbounds i16, ptr %x, i32 24 |
| %wide.load.3 = load <8 x i16>, ptr %18, align 2 |
| %19 = sext <8 x i16> %wide.load.3 to <8 x i32> |
| %20 = getelementptr inbounds i16, ptr %y, i32 24 |
| %wide.load11.3 = load <8 x i16>, ptr %20, align 2 |
| %21 = sext <8 x i16> %wide.load11.3 to <8 x i32> |
| %22 = mul nsw <8 x i32> %21, %19 |
| %23 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %22) |
| %24 = add i32 %23, %17 |
| %25 = getelementptr inbounds i16, ptr %x, i32 32 |
| %wide.load.4 = load <8 x i16>, ptr %25, align 2 |
| %26 = sext <8 x i16> %wide.load.4 to <8 x i32> |
| %27 = getelementptr inbounds i16, ptr %y, i32 32 |
| %wide.load11.4 = load <8 x i16>, ptr %27, align 2 |
| %28 = sext <8 x i16> %wide.load11.4 to <8 x i32> |
| %29 = mul nsw <8 x i32> %28, %26 |
| %30 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %29) |
| %31 = add i32 %30, %24 |
| %32 = getelementptr inbounds i16, ptr %x, i32 40 |
| %wide.load.5 = load <8 x i16>, ptr %32, align 2 |
| %33 = sext <8 x i16> %wide.load.5 to <8 x i32> |
| %34 = getelementptr inbounds i16, ptr %y, i32 40 |
| %wide.load11.5 = load <8 x i16>, ptr %34, align 2 |
| %35 = sext <8 x i16> %wide.load11.5 to <8 x i32> |
| %36 = mul nsw <8 x i32> %35, %33 |
| %37 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %36) |
| %38 = add i32 %37, %31 |
| %39 = getelementptr inbounds i16, ptr %x, i32 48 |
| %wide.load.6 = load <8 x i16>, ptr %39, align 2 |
| %40 = sext <8 x i16> %wide.load.6 to <8 x i32> |
| %41 = getelementptr inbounds i16, ptr %y, i32 48 |
| %wide.load11.6 = load <8 x i16>, ptr %41, align 2 |
| %42 = sext <8 x i16> %wide.load11.6 to <8 x i32> |
| %43 = mul nsw <8 x i32> %42, %40 |
| %44 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %43) |
| %45 = add i32 %44, %38 |
| %46 = getelementptr inbounds i16, ptr %x, i32 56 |
| %wide.load.7 = load <8 x i16>, ptr %46, align 2 |
| %47 = sext <8 x i16> %wide.load.7 to <8 x i32> |
| %48 = getelementptr inbounds i16, ptr %y, i32 56 |
| %wide.load11.7 = load <8 x i16>, ptr %48, align 2 |
| %49 = sext <8 x i16> %wide.load11.7 to <8 x i32> |
| %50 = mul nsw <8 x i32> %49, %47 |
| %51 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %50) |
| %52 = add i32 %51, %45 |
| ret i32 %52 |
| } |
| |
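| ; 128 x i16 -> i32: sixteen unrolled <8 x i16> steps lowering to vmlava.s16 accumulations at offsets up to #240. |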
| define i32 @mlav128i32i16(ptr %x, ptr %y) { |
| ; CHECK-LABEL: mlav128i32i16: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: vldrh.u16 q0, [r0] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1] |
| ; CHECK-NEXT: mov r2, r0 |
| ; CHECK-NEXT: vmlav.s16 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r2, #16] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1, #16] |
| ; CHECK-NEXT: vmlava.s16 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r2, #32] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1, #32] |
| ; CHECK-NEXT: vmlava.s16 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r2, #48] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1, #48] |
| ; CHECK-NEXT: vmlava.s16 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r2, #64] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1, #64] |
| ; CHECK-NEXT: vmlava.s16 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r2, #80] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1, #80] |
| ; CHECK-NEXT: vmlava.s16 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r2, #96] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1, #96] |
| ; CHECK-NEXT: vmlava.s16 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r2, #112] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1, #112] |
| ; CHECK-NEXT: vmlava.s16 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r2, #128] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1, #128] |
| ; CHECK-NEXT: vmlava.s16 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r2, #144] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1, #144] |
| ; CHECK-NEXT: vmlava.s16 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r2, #160] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1, #160] |
| ; CHECK-NEXT: vmlava.s16 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r2, #176] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1, #176] |
| ; CHECK-NEXT: vmlava.s16 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r2, #192] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1, #192] |
| ; CHECK-NEXT: vmlava.s16 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r2, #208] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1, #208] |
| ; CHECK-NEXT: vmlava.s16 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r2, #224] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1, #224] |
| ; CHECK-NEXT: vmlava.s16 r0, q1, q0 |
| ; CHECK-NEXT: vldrh.u16 q0, [r2, #240] |
| ; CHECK-NEXT: vldrh.u16 q1, [r1, #240] |
| ; CHECK-NEXT: vmlava.s16 r0, q1, q0 |
| ; CHECK-NEXT: bx lr |
| entry: |
| %wide.load = load <8 x i16>, ptr %x, align 2 |
| %0 = sext <8 x i16> %wide.load to <8 x i32> |
| %wide.load11 = load <8 x i16>, ptr %y, align 2 |
| %1 = sext <8 x i16> %wide.load11 to <8 x i32> |
| %2 = mul nsw <8 x i32> %1, %0 |
| %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2) |
| %4 = getelementptr inbounds i16, ptr %x, i32 8 |
| %wide.load.1 = load <8 x i16>, ptr %4, align 2 |
| %5 = sext <8 x i16> %wide.load.1 to <8 x i32> |
| %6 = getelementptr inbounds i16, ptr %y, i32 8 |
| %wide.load11.1 = load <8 x i16>, ptr %6, align 2 |
| %7 = sext <8 x i16> %wide.load11.1 to <8 x i32> |
| %8 = mul nsw <8 x i32> %7, %5 |
| %9 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %8) |
| %10 = add i32 %9, %3 |
| %11 = getelementptr inbounds i16, ptr %x, i32 16 |
| %wide.load.2 = load <8 x i16>, ptr %11, align 2 |
| %12 = sext <8 x i16> %wide.load.2 to <8 x i32> |
| %13 = getelementptr inbounds i16, ptr %y, i32 16 |
| %wide.load11.2 = load <8 x i16>, ptr %13, align 2 |
| %14 = sext <8 x i16> %wide.load11.2 to <8 x i32> |
| %15 = mul nsw <8 x i32> %14, %12 |
| %16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %15) |
| %17 = add i32 %16, %10 |
| %18 = getelementptr inbounds i16, ptr %x, i32 24 |
| %wide.load.3 = load <8 x i16>, ptr %18, align 2 |
| %19 = sext <8 x i16> %wide.load.3 to <8 x i32> |
| %20 = getelementptr inbounds i16, ptr %y, i32 24 |
| %wide.load11.3 = load <8 x i16>, ptr %20, align 2 |
| %21 = sext <8 x i16> %wide.load11.3 to <8 x i32> |
| %22 = mul nsw <8 x i32> %21, %19 |
| %23 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %22) |
| %24 = add i32 %23, %17 |
| %25 = getelementptr inbounds i16, ptr %x, i32 32 |
| %wide.load.4 = load <8 x i16>, ptr %25, align 2 |
| %26 = sext <8 x i16> %wide.load.4 to <8 x i32> |
| %27 = getelementptr inbounds i16, ptr %y, i32 32 |
| %wide.load11.4 = load <8 x i16>, ptr %27, align 2 |
| %28 = sext <8 x i16> %wide.load11.4 to <8 x i32> |
| %29 = mul nsw <8 x i32> %28, %26 |
| %30 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %29) |
| %31 = add i32 %30, %24 |
| %32 = getelementptr inbounds i16, ptr %x, i32 40 |
| %wide.load.5 = load <8 x i16>, ptr %32, align 2 |
| %33 = sext <8 x i16> %wide.load.5 to <8 x i32> |
| %34 = getelementptr inbounds i16, ptr %y, i32 40 |
| %wide.load11.5 = load <8 x i16>, ptr %34, align 2 |
| %35 = sext <8 x i16> %wide.load11.5 to <8 x i32> |
| %36 = mul nsw <8 x i32> %35, %33 |
| %37 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %36) |
| %38 = add i32 %37, %31 |
| %39 = getelementptr inbounds i16, ptr %x, i32 48 |
| %wide.load.6 = load <8 x i16>, ptr %39, align 2 |
| %40 = sext <8 x i16> %wide.load.6 to <8 x i32> |
| %41 = getelementptr inbounds i16, ptr %y, i32 48 |
| %wide.load11.6 = load <8 x i16>, ptr %41, align 2 |
| %42 = sext <8 x i16> %wide.load11.6 to <8 x i32> |
| %43 = mul nsw <8 x i32> %42, %40 |
| %44 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %43) |
| %45 = add i32 %44, %38 |
| %46 = getelementptr inbounds i16, ptr %x, i32 56 |
| %wide.load.7 = load <8 x i16>, ptr %46, align 2 |
| %47 = sext <8 x i16> %wide.load.7 to <8 x i32> |
| %48 = getelementptr inbounds i16, ptr %y, i32 56 |
| %wide.load11.7 = load <8 x i16>, ptr %48, align 2 |
| %49 = sext <8 x i16> %wide.load11.7 to <8 x i32> |
| %50 = mul nsw <8 x i32> %49, %47 |
| %51 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %50) |
| %52 = add i32 %51, %45 |
| %53 = getelementptr inbounds i16, ptr %x, i32 64 |
| %wide.load.8 = load <8 x i16>, ptr %53, align 2 |
| %54 = sext <8 x i16> %wide.load.8 to <8 x i32> |
| %55 = getelementptr inbounds i16, ptr %y, i32 64 |
| %wide.load11.8 = load <8 x i16>, ptr %55, align 2 |
| %56 = sext <8 x i16> %wide.load11.8 to <8 x i32> |
| %57 = mul nsw <8 x i32> %56, %54 |
| %58 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %57) |
| %59 = add i32 %58, %52 |
| %60 = getelementptr inbounds i16, ptr %x, i32 72 |
| %wide.load.9 = load <8 x i16>, ptr %60, align 2 |
| %61 = sext <8 x i16> %wide.load.9 to <8 x i32> |
| %62 = getelementptr inbounds i16, ptr %y, i32 72 |
| %wide.load11.9 = load <8 x i16>, ptr %62, align 2 |
| %63 = sext <8 x i16> %wide.load11.9 to <8 x i32> |
| %64 = mul nsw <8 x i32> %63, %61 |
| %65 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %64) |
| %66 = add i32 %65, %59 |
| %67 = getelementptr inbounds i16, ptr %x, i32 80 |
| %wide.load.10 = load <8 x i16>, ptr %67, align 2 |
| %68 = sext <8 x i16> %wide.load.10 to <8 x i32> |