| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NODOT |
| ; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-DOT |
| ; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-DOT-I8MM |
| |
| ; udot: unsigned 8-bit dot product into a <4 x i32> accumulator. |
| ; Both <16 x i8> inputs are zero-extended to i32, multiplied lane-wise, and the |
| ; 16 products are partially reduced into %acc. |
| ; Expected lowering: a single udot when +dotprod is available; otherwise a |
| ; umull/umull2 widening multiply followed by a uaddw/uaddw2 accumulation chain. |
| define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { |
| ; CHECK-NODOT-LABEL: udot: |
| ; CHECK-NODOT: // %bb.0: |
| ; CHECK-NODOT-NEXT: umull v3.8h, v2.8b, v1.8b |
| ; CHECK-NODOT-NEXT: umull2 v1.8h, v2.16b, v1.16b |
| ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v3.4h |
| ; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v3.8h |
| ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h |
| ; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v1.8h |
| ; CHECK-NODOT-NEXT: ret |
| ; |
| ; CHECK-DOT-LABEL: udot: |
| ; CHECK-DOT: // %bb.0: |
| ; CHECK-DOT-NEXT: udot v0.4s, v2.16b, v1.16b |
| ; CHECK-DOT-NEXT: ret |
| ; |
| ; CHECK-DOT-I8MM-LABEL: udot: |
| ; CHECK-DOT-I8MM: // %bb.0: |
| ; CHECK-DOT-I8MM-NEXT: udot v0.4s, v2.16b, v1.16b |
| ; CHECK-DOT-I8MM-NEXT: ret |
| %u.wide = zext <16 x i8> %u to <16 x i32> |
| %s.wide = zext <16 x i8> %s to <16 x i32> |
| %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide |
| %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult) |
| ret <4 x i32> %partial.reduce |
| } |
| |
| ; udot_in_loop: the unsigned dot-product pattern with the accumulator carried |
| ; through a loop phi. Each iteration loads 16 bytes from %p1 and %p2 at byte |
| ; offset %index, zero-extends both to i32, multiplies, and partially reduces the |
| ; products into %acc. |
| ; The induction variable starts at 0, steps by 16, and the exit test compares |
| ; the stepped value against 16, so the branch to %end is taken after the first |
| ; iteration. |
| ; Note that %end returns the phi %acc, not %partial.reduce; the expected code |
| ; therefore copies the accumulator into v0 before updating it. |
| define <4 x i32> @udot_in_loop(ptr %p1, ptr %p2){ |
| ; CHECK-NODOT-LABEL: udot_in_loop: |
| ; CHECK-NODOT: // %bb.0: // %entry |
| ; CHECK-NODOT-NEXT: movi v1.2d, #0000000000000000 |
| ; CHECK-NODOT-NEXT: mov x8, xzr |
| ; CHECK-NODOT-NEXT: .LBB1_1: // %vector.body |
| ; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-NODOT-NEXT: ldr q2, [x0, x8] |
| ; CHECK-NODOT-NEXT: ldr q3, [x1, x8] |
| ; CHECK-NODOT-NEXT: mov v0.16b, v1.16b |
| ; CHECK-NODOT-NEXT: add x8, x8, #16 |
| ; CHECK-NODOT-NEXT: umull v4.8h, v2.8b, v3.8b |
| ; CHECK-NODOT-NEXT: umull2 v2.8h, v2.16b, v3.16b |
| ; CHECK-NODOT-NEXT: cmp x8, #16 |
| ; CHECK-NODOT-NEXT: uaddw v1.4s, v1.4s, v4.4h |
| ; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v4.8h |
| ; CHECK-NODOT-NEXT: uaddw v1.4s, v1.4s, v2.4h |
| ; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v2.8h |
| ; CHECK-NODOT-NEXT: b.ne .LBB1_1 |
| ; CHECK-NODOT-NEXT: // %bb.2: // %end |
| ; CHECK-NODOT-NEXT: ret |
| ; |
| ; CHECK-DOT-LABEL: udot_in_loop: |
| ; CHECK-DOT: // %bb.0: // %entry |
| ; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000 |
| ; CHECK-DOT-NEXT: mov x8, xzr |
| ; CHECK-DOT-NEXT: .LBB1_1: // %vector.body |
| ; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-DOT-NEXT: ldr q2, [x0, x8] |
| ; CHECK-DOT-NEXT: ldr q3, [x1, x8] |
| ; CHECK-DOT-NEXT: mov v0.16b, v1.16b |
| ; CHECK-DOT-NEXT: add x8, x8, #16 |
| ; CHECK-DOT-NEXT: udot v1.4s, v2.16b, v3.16b |
| ; CHECK-DOT-NEXT: cmp x8, #16 |
| ; CHECK-DOT-NEXT: b.ne .LBB1_1 |
| ; CHECK-DOT-NEXT: // %bb.2: // %end |
| ; CHECK-DOT-NEXT: ret |
| ; |
| ; CHECK-DOT-I8MM-LABEL: udot_in_loop: |
| ; CHECK-DOT-I8MM: // %bb.0: // %entry |
| ; CHECK-DOT-I8MM-NEXT: movi v1.2d, #0000000000000000 |
| ; CHECK-DOT-I8MM-NEXT: mov x8, xzr |
| ; CHECK-DOT-I8MM-NEXT: .LBB1_1: // %vector.body |
| ; CHECK-DOT-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-DOT-I8MM-NEXT: ldr q2, [x0, x8] |
| ; CHECK-DOT-I8MM-NEXT: ldr q3, [x1, x8] |
| ; CHECK-DOT-I8MM-NEXT: mov v0.16b, v1.16b |
| ; CHECK-DOT-I8MM-NEXT: add x8, x8, #16 |
| ; CHECK-DOT-I8MM-NEXT: udot v1.4s, v2.16b, v3.16b |
| ; CHECK-DOT-I8MM-NEXT: cmp x8, #16 |
| ; CHECK-DOT-I8MM-NEXT: b.ne .LBB1_1 |
| ; CHECK-DOT-I8MM-NEXT: // %bb.2: // %end |
| ; CHECK-DOT-I8MM-NEXT: ret |
| entry: |
| br label %vector.body |
| |
| vector.body: |
| %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] |
| %acc = phi <4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce, %vector.body ] |
| %gep1 = getelementptr i8, ptr %p1, i64 %index |
| %load1 = load <16 x i8>, ptr %gep1, align 16 |
| %load1.wide = zext <16 x i8> %load1 to <16 x i32> |
| %gep2 = getelementptr i8, ptr %p2, i64 %index |
| %load2 = load <16 x i8>, ptr %gep2, align 16 |
| %load2.wide = zext <16 x i8> %load2 to <16 x i32> |
| %mul = mul nuw nsw <16 x i32> %load1.wide, %load2.wide |
| %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mul) |
| %index.next = add nuw i64 %index, 16 |
| %cmp = icmp eq i64 %index.next, 16 |
| br i1 %cmp, label %end, label %vector.body |
| |
| end: |
| ret <4 x i32> %acc |
| } |
| |
| ; udot_narrow: 64-bit-vector variant of the unsigned dot product. |
| ; Both <8 x i8> inputs are zero-extended to i32, multiplied lane-wise, and the |
| ; 8 products are partially reduced into the <2 x i32> accumulator %acc. |
| ; Expected lowering: a single 2s/8b udot when +dotprod is available; otherwise |
| ; a umull followed by widening adds and ext/add shuffles. |
| ; The intrinsic name carries the .v2i32.v8i32 suffix so that its mangling |
| ; matches the <2 x i32> result/accumulator and <8 x i32> input types used here. |
| define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) { |
| ; CHECK-NODOT-LABEL: udot_narrow: |
| ; CHECK-NODOT: // %bb.0: |
| ; CHECK-NODOT-NEXT: umull v1.8h, v2.8b, v1.8b |
| ; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0 |
| ; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0 |
| ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h |
| ; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0 |
| ; CHECK-NODOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 |
| ; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8 |
| ; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s |
| ; CHECK-NODOT-NEXT: ext v2.16b, v3.16b, v3.16b, #8 |
| ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h |
| ; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s |
| ; CHECK-NODOT-NEXT: ret |
| ; |
| ; CHECK-DOT-LABEL: udot_narrow: |
| ; CHECK-DOT: // %bb.0: |
| ; CHECK-DOT-NEXT: udot v0.2s, v2.8b, v1.8b |
| ; CHECK-DOT-NEXT: ret |
| ; |
| ; CHECK-DOT-I8MM-LABEL: udot_narrow: |
| ; CHECK-DOT-I8MM: // %bb.0: |
| ; CHECK-DOT-I8MM-NEXT: udot v0.2s, v2.8b, v1.8b |
| ; CHECK-DOT-I8MM-NEXT: ret |
| %u.wide = zext <8 x i8> %u to <8 x i32> |
| %s.wide = zext <8 x i8> %s to <8 x i32> |
| %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide |
| %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %mult) |
| ret <2 x i32> %partial.reduce |
| } |
| |
| ; sdot: signed 8-bit dot product into a <4 x i32> accumulator. |
| ; Both <16 x i8> inputs are sign-extended to i32, multiplied lane-wise, and the |
| ; 16 products are partially reduced into %acc. |
| ; Expected lowering: a single sdot when +dotprod is available; otherwise a |
| ; smull/smull2 widening multiply followed by a saddw/saddw2 accumulation chain. |
| ; NOTE(review): the multiply carries nuw although both operands are |
| ; sign-extended - confirm the flag is intended. |
| define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { |
| ; CHECK-NODOT-LABEL: sdot: |
| ; CHECK-NODOT: // %bb.0: |
| ; CHECK-NODOT-NEXT: smull v3.8h, v2.8b, v1.8b |
| ; CHECK-NODOT-NEXT: smull2 v1.8h, v2.16b, v1.16b |
| ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v3.4h |
| ; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v3.8h |
| ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h |
| ; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v1.8h |
| ; CHECK-NODOT-NEXT: ret |
| ; |
| ; CHECK-DOT-LABEL: sdot: |
| ; CHECK-DOT: // %bb.0: |
| ; CHECK-DOT-NEXT: sdot v0.4s, v2.16b, v1.16b |
| ; CHECK-DOT-NEXT: ret |
| ; |
| ; CHECK-DOT-I8MM-LABEL: sdot: |
| ; CHECK-DOT-I8MM: // %bb.0: |
| ; CHECK-DOT-I8MM-NEXT: sdot v0.4s, v2.16b, v1.16b |
| ; CHECK-DOT-I8MM-NEXT: ret |
| %u.wide = sext <16 x i8> %u to <16 x i32> |
| %s.wide = sext <16 x i8> %s to <16 x i32> |
| %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide |
| %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult) |
| ret <4 x i32> %partial.reduce |
| } |
| |
| ; sdot_narrow: 64-bit-vector variant of the signed dot product. |
| ; Both <8 x i8> inputs are sign-extended to i32, multiplied lane-wise, and the |
| ; 8 products are partially reduced into the <2 x i32> accumulator %acc. |
| ; Expected lowering: a single 2s/8b sdot when +dotprod is available; otherwise |
| ; a smull followed by widening adds and ext/add shuffles. |
| ; The intrinsic name carries the .v2i32.v8i32 suffix so that its mangling |
| ; matches the <2 x i32> result/accumulator and <8 x i32> input types used here. |
| define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) { |
| ; CHECK-NODOT-LABEL: sdot_narrow: |
| ; CHECK-NODOT: // %bb.0: |
| ; CHECK-NODOT-NEXT: smull v1.8h, v2.8b, v1.8b |
| ; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0 |
| ; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0 |
| ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h |
| ; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0 |
| ; CHECK-NODOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 |
| ; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8 |
| ; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s |
| ; CHECK-NODOT-NEXT: ext v2.16b, v3.16b, v3.16b, #8 |
| ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h |
| ; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s |
| ; CHECK-NODOT-NEXT: ret |
| ; |
| ; CHECK-DOT-LABEL: sdot_narrow: |
| ; CHECK-DOT: // %bb.0: |
| ; CHECK-DOT-NEXT: sdot v0.2s, v2.8b, v1.8b |
| ; CHECK-DOT-NEXT: ret |
| ; |
| ; CHECK-DOT-I8MM-LABEL: sdot_narrow: |
| ; CHECK-DOT-I8MM: // %bb.0: |
| ; CHECK-DOT-I8MM-NEXT: sdot v0.2s, v2.8b, v1.8b |
| ; CHECK-DOT-I8MM-NEXT: ret |
| %u.wide = sext <8 x i8> %u to <8 x i32> |
| %s.wide = sext <8 x i8> %s to <8 x i32> |
| %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide |
| %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %mult) |
| ret <2 x i32> %partial.reduce |
| } |
| |
| ; usdot: mixed-sign 8-bit dot product into a <4 x i32> accumulator. |
| ; %u is zero-extended and %s is sign-extended to i32; the lane-wise products |
| ; are partially reduced into %acc. |
| ; Expected lowering: a single usdot only when +i8mm is available. Without it |
| ; (with or without +dotprod) the inputs are widened with ushll/sshll and |
| ; accumulated with smlal/smlal2. |
| ; NOTE(review): the multiply carries nuw although one operand is sign-extended |
| ; - confirm the flag is intended. |
| define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) { |
| ; CHECK-NODOT-LABEL: usdot: |
| ; CHECK-NODOT: // %bb.0: |
| ; CHECK-NODOT-NEXT: ushll v3.8h, v1.8b, #0 |
| ; CHECK-NODOT-NEXT: sshll v4.8h, v2.8b, #0 |
| ; CHECK-NODOT-NEXT: ushll2 v1.8h, v1.16b, #0 |
| ; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0 |
| ; CHECK-NODOT-NEXT: smlal v0.4s, v4.4h, v3.4h |
| ; CHECK-NODOT-NEXT: smlal2 v0.4s, v4.8h, v3.8h |
| ; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v1.4h |
| ; CHECK-NODOT-NEXT: smlal2 v0.4s, v2.8h, v1.8h |
| ; CHECK-NODOT-NEXT: ret |
| ; |
| ; CHECK-DOT-LABEL: usdot: |
| ; CHECK-DOT: // %bb.0: |
| ; CHECK-DOT-NEXT: ushll v3.8h, v1.8b, #0 |
| ; CHECK-DOT-NEXT: sshll v4.8h, v2.8b, #0 |
| ; CHECK-DOT-NEXT: ushll2 v1.8h, v1.16b, #0 |
| ; CHECK-DOT-NEXT: sshll2 v2.8h, v2.16b, #0 |
| ; CHECK-DOT-NEXT: smlal v0.4s, v4.4h, v3.4h |
| ; CHECK-DOT-NEXT: smlal2 v0.4s, v4.8h, v3.8h |
| ; CHECK-DOT-NEXT: smlal v0.4s, v2.4h, v1.4h |
| ; CHECK-DOT-NEXT: smlal2 v0.4s, v2.8h, v1.8h |
| ; CHECK-DOT-NEXT: ret |
| ; |
| ; CHECK-DOT-I8MM-LABEL: usdot: |
| ; CHECK-DOT-I8MM: // %bb.0: |
| ; CHECK-DOT-I8MM-NEXT: usdot v0.4s, v1.16b, v2.16b |
| ; CHECK-DOT-I8MM-NEXT: ret |
| %u.wide = zext <16 x i8> %u to <16 x i32> |
| %s.wide = sext <16 x i8> %s to <16 x i32> |
| %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide |
| %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult) |
| ret <4 x i32> %partial.reduce |
| } |
| |
| ; usdot_in_loop: mixed-sign dot-product pattern with the accumulator carried |
| ; through a loop phi. Each iteration loads 16 bytes from %p1 (sign-extended) and |
| ; from %p2 (zero-extended) at byte offset %index, multiplies them as i32, and |
| ; partially reduces the products into %acc. |
| ; The induction variable starts at 0, steps by 16, and the exit test compares |
| ; the stepped value against 16, so the branch to %end is taken after the first |
| ; iteration. %end returns the phi %acc, not %partial.reduce. |
| ; Expected lowering: usdot inside the loop only when +i8mm is available; |
| ; otherwise sshll/ushll widening plus smlal/smlal2. |
| define <4 x i32> @usdot_in_loop(ptr %p1, ptr %p2){ |
| ; CHECK-NODOT-LABEL: usdot_in_loop: |
| ; CHECK-NODOT: // %bb.0: // %entry |
| ; CHECK-NODOT-NEXT: movi v1.2d, #0000000000000000 |
| ; CHECK-NODOT-NEXT: mov x8, xzr |
| ; CHECK-NODOT-NEXT: .LBB6_1: // %vector.body |
| ; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-NODOT-NEXT: ldr q2, [x0, x8] |
| ; CHECK-NODOT-NEXT: ldr q3, [x1, x8] |
| ; CHECK-NODOT-NEXT: mov v0.16b, v1.16b |
| ; CHECK-NODOT-NEXT: add x8, x8, #16 |
| ; CHECK-NODOT-NEXT: sshll v4.8h, v2.8b, #0 |
| ; CHECK-NODOT-NEXT: ushll v5.8h, v3.8b, #0 |
| ; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0 |
| ; CHECK-NODOT-NEXT: ushll2 v3.8h, v3.16b, #0 |
| ; CHECK-NODOT-NEXT: cmp x8, #16 |
| ; CHECK-NODOT-NEXT: smlal v1.4s, v4.4h, v5.4h |
| ; CHECK-NODOT-NEXT: smlal2 v1.4s, v4.8h, v5.8h |
| ; CHECK-NODOT-NEXT: smlal v1.4s, v2.4h, v3.4h |
| ; CHECK-NODOT-NEXT: smlal2 v1.4s, v2.8h, v3.8h |
| ; CHECK-NODOT-NEXT: b.ne .LBB6_1 |
| ; CHECK-NODOT-NEXT: // %bb.2: // %end |
| ; CHECK-NODOT-NEXT: ret |
| ; |
| ; CHECK-DOT-LABEL: usdot_in_loop: |
| ; CHECK-DOT: // %bb.0: // %entry |
| ; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000 |
| ; CHECK-DOT-NEXT: mov x8, xzr |
| ; CHECK-DOT-NEXT: .LBB6_1: // %vector.body |
| ; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-DOT-NEXT: ldr q2, [x0, x8] |
| ; CHECK-DOT-NEXT: ldr q3, [x1, x8] |
| ; CHECK-DOT-NEXT: mov v0.16b, v1.16b |
| ; CHECK-DOT-NEXT: add x8, x8, #16 |
| ; CHECK-DOT-NEXT: sshll v4.8h, v2.8b, #0 |
| ; CHECK-DOT-NEXT: ushll v5.8h, v3.8b, #0 |
| ; CHECK-DOT-NEXT: sshll2 v2.8h, v2.16b, #0 |
| ; CHECK-DOT-NEXT: ushll2 v3.8h, v3.16b, #0 |
| ; CHECK-DOT-NEXT: cmp x8, #16 |
| ; CHECK-DOT-NEXT: smlal v1.4s, v4.4h, v5.4h |
| ; CHECK-DOT-NEXT: smlal2 v1.4s, v4.8h, v5.8h |
| ; CHECK-DOT-NEXT: smlal v1.4s, v2.4h, v3.4h |
| ; CHECK-DOT-NEXT: smlal2 v1.4s, v2.8h, v3.8h |
| ; CHECK-DOT-NEXT: b.ne .LBB6_1 |
| ; CHECK-DOT-NEXT: // %bb.2: // %end |
| ; CHECK-DOT-NEXT: ret |
| ; |
| ; CHECK-DOT-I8MM-LABEL: usdot_in_loop: |
| ; CHECK-DOT-I8MM: // %bb.0: // %entry |
| ; CHECK-DOT-I8MM-NEXT: movi v1.2d, #0000000000000000 |
| ; CHECK-DOT-I8MM-NEXT: mov x8, xzr |
| ; CHECK-DOT-I8MM-NEXT: .LBB6_1: // %vector.body |
| ; CHECK-DOT-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-DOT-I8MM-NEXT: ldr q2, [x0, x8] |
| ; CHECK-DOT-I8MM-NEXT: ldr q3, [x1, x8] |
| ; CHECK-DOT-I8MM-NEXT: mov v0.16b, v1.16b |
| ; CHECK-DOT-I8MM-NEXT: add x8, x8, #16 |
| ; CHECK-DOT-I8MM-NEXT: usdot v1.4s, v3.16b, v2.16b |
| ; CHECK-DOT-I8MM-NEXT: cmp x8, #16 |
| ; CHECK-DOT-I8MM-NEXT: b.ne .LBB6_1 |
| ; CHECK-DOT-I8MM-NEXT: // %bb.2: // %end |
| ; CHECK-DOT-I8MM-NEXT: ret |
| entry: |
| br label %vector.body |
| |
| vector.body: |
| %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] |
| %acc = phi <4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce, %vector.body ] |
| %gep1 = getelementptr i8, ptr %p1, i64 %index |
| %load1 = load <16 x i8>, ptr %gep1, align 16 |
| %load1.wide = sext <16 x i8> %load1 to <16 x i32> |
| %gep2 = getelementptr i8, ptr %p2, i64 %index |
| %load2 = load <16 x i8>, ptr %gep2, align 16 |
| %load2.wide = zext <16 x i8> %load2 to <16 x i32> |
| %mul = mul nuw nsw <16 x i32> %load1.wide, %load2.wide |
| %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mul) |
| %index.next = add nuw i64 %index, 16 |
| %cmp = icmp eq i64 %index.next, 16 |
| br i1 %cmp, label %end, label %vector.body |
| |
| end: |
| ret <4 x i32> %acc |
| } |
| |
| ; usdot_narrow: 64-bit-vector variant of the mixed-sign dot product. |
| ; %u is zero-extended and %s is sign-extended to i32; the 8 lane-wise products |
| ; are partially reduced into the <2 x i32> accumulator %acc. |
| ; Expected lowering: a single 2s/8b usdot only when +i8mm is available; |
| ; otherwise ushll/sshll widening, smull/smlal and ext/add shuffles. |
| ; The intrinsic name carries the .v2i32.v8i32 suffix so that its mangling |
| ; matches the <2 x i32> result/accumulator and <8 x i32> input types used here. |
| ; NOTE(review): attribute group #0 is referenced here but its definition is not |
| ; visible in this part of the file - confirm it exists. |
| define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ |
| ; CHECK-NODOT-LABEL: usdot_narrow: |
| ; CHECK-NODOT: // %bb.0: |
| ; CHECK-NODOT-NEXT: ushll v1.8h, v1.8b, #0 |
| ; CHECK-NODOT-NEXT: sshll v2.8h, v2.8b, #0 |
| ; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0 |
| ; CHECK-NODOT-NEXT: smull v3.4s, v2.4h, v1.4h |
| ; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v1.4h |
| ; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8 |
| ; CHECK-NODOT-NEXT: ext v5.16b, v2.16b, v2.16b, #8 |
| ; CHECK-NODOT-NEXT: smull2 v1.4s, v2.8h, v1.8h |
| ; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8 |
| ; CHECK-NODOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 |
| ; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s |
| ; CHECK-NODOT-NEXT: smlal v0.4s, v5.4h, v4.4h |
| ; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s |
| ; CHECK-NODOT-NEXT: ret |
| ; |
| ; CHECK-DOT-LABEL: usdot_narrow: |
| ; CHECK-DOT: // %bb.0: |
| ; CHECK-DOT-NEXT: ushll v1.8h, v1.8b, #0 |
| ; CHECK-DOT-NEXT: sshll v2.8h, v2.8b, #0 |
| ; CHECK-DOT-NEXT: // kill: def $d0 killed $d0 def $q0 |
| ; CHECK-DOT-NEXT: smull v3.4s, v2.4h, v1.4h |
| ; CHECK-DOT-NEXT: smlal v0.4s, v2.4h, v1.4h |
| ; CHECK-DOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8 |
| ; CHECK-DOT-NEXT: ext v5.16b, v2.16b, v2.16b, #8 |
| ; CHECK-DOT-NEXT: smull2 v1.4s, v2.8h, v1.8h |
| ; CHECK-DOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8 |
| ; CHECK-DOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 |
| ; CHECK-DOT-NEXT: add v0.2s, v3.2s, v0.2s |
| ; CHECK-DOT-NEXT: smlal v0.4s, v5.4h, v4.4h |
| ; CHECK-DOT-NEXT: add v0.2s, v1.2s, v0.2s |
| ; CHECK-DOT-NEXT: ret |
| ; |
| ; CHECK-DOT-I8MM-LABEL: usdot_narrow: |
| ; CHECK-DOT-I8MM: // %bb.0: |
| ; CHECK-DOT-I8MM-NEXT: usdot v0.2s, v1.8b, v2.8b |
| ; CHECK-DOT-I8MM-NEXT: ret |
| %u.wide = zext <8 x i8> %u to <8 x i32> |
| %s.wide = sext <8 x i8> %s to <8 x i32> |
| %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide |
| %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %mult) |
| ret <2 x i32> %partial.reduce |
| } |
| |
| ; sudot: mixed-sign dot product with the signedness of the inputs swapped |
| ; relative to usdot: parameter %u is sign-extended and parameter %s is |
| ; zero-extended. Note that the local names follow the extension kind, not the |
| ; parameter name (%s.wide is the sext of %u, %u.wide is the zext of %s). |
| ; Expected lowering: with +i8mm a single usdot with the two source operands |
| ; swapped; otherwise sshll/ushll widening plus smlal/smlal2. |
| ; NOTE(review): attribute group #0 is referenced here but its definition is not |
| ; visible in this part of the file - confirm it exists. |
| define <4 x i32> @sudot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) #0{ |
| ; CHECK-NODOT-LABEL: sudot: |
| ; CHECK-NODOT: // %bb.0: |
| ; CHECK-NODOT-NEXT: sshll v3.8h, v1.8b, #0 |
| ; CHECK-NODOT-NEXT: ushll v4.8h, v2.8b, #0 |
| ; CHECK-NODOT-NEXT: sshll2 v1.8h, v1.16b, #0 |
| ; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0 |
| ; CHECK-NODOT-NEXT: smlal v0.4s, v4.4h, v3.4h |
| ; CHECK-NODOT-NEXT: smlal2 v0.4s, v4.8h, v3.8h |
| ; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v1.4h |
| ; CHECK-NODOT-NEXT: smlal2 v0.4s, v2.8h, v1.8h |
| ; CHECK-NODOT-NEXT: ret |
| ; |
| ; CHECK-DOT-LABEL: sudot: |
| ; CHECK-DOT: // %bb.0: |
| ; CHECK-DOT-NEXT: sshll v3.8h, v1.8b, #0 |
| ; CHECK-DOT-NEXT: ushll v4.8h, v2.8b, #0 |
| ; CHECK-DOT-NEXT: sshll2 v1.8h, v1.16b, #0 |
| ; CHECK-DOT-NEXT: ushll2 v2.8h, v2.16b, #0 |
| ; CHECK-DOT-NEXT: smlal v0.4s, v4.4h, v3.4h |
| ; CHECK-DOT-NEXT: smlal2 v0.4s, v4.8h, v3.8h |
| ; CHECK-DOT-NEXT: smlal v0.4s, v2.4h, v1.4h |
| ; CHECK-DOT-NEXT: smlal2 v0.4s, v2.8h, v1.8h |
| ; CHECK-DOT-NEXT: ret |
| ; |
| ; CHECK-DOT-I8MM-LABEL: sudot: |
| ; CHECK-DOT-I8MM: // %bb.0: |
| ; CHECK-DOT-I8MM-NEXT: usdot v0.4s, v2.16b, v1.16b |
| ; CHECK-DOT-I8MM-NEXT: ret |
| %s.wide = sext <16 x i8> %u to <16 x i32> |
| %u.wide = zext <16 x i8> %s to <16 x i32> |
| %mult = mul nuw nsw <16 x i32> %u.wide, %s.wide |
| %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult) |
| ret <4 x i32> %partial.reduce |
| } |
| |
| ; sudot_in_loop: mixed-sign dot-product pattern in a loop, with the extension |
| ; kinds swapped relative to usdot_in_loop: the %p1 load is zero-extended and |
| ; the %p2 load is sign-extended. The products are partially reduced into the |
| ; loop-carried phi %acc. |
| ; The induction variable starts at 0, steps by 16, and the exit test compares |
| ; the stepped value against 16, so the branch to %end is taken after the first |
| ; iteration. %end returns the phi %acc, not %partial.reduce. |
| ; Expected lowering: usdot inside the loop only when +i8mm is available; |
| ; otherwise ushll/sshll widening plus smlal/smlal2. |
| define <4 x i32> @sudot_in_loop(ptr %p1, ptr %p2){ |
| ; CHECK-NODOT-LABEL: sudot_in_loop: |
| ; CHECK-NODOT: // %bb.0: // %entry |
| ; CHECK-NODOT-NEXT: movi v1.2d, #0000000000000000 |
| ; CHECK-NODOT-NEXT: mov x8, xzr |
| ; CHECK-NODOT-NEXT: .LBB9_1: // %vector.body |
| ; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-NODOT-NEXT: ldr q2, [x0, x8] |
| ; CHECK-NODOT-NEXT: ldr q3, [x1, x8] |
| ; CHECK-NODOT-NEXT: mov v0.16b, v1.16b |
| ; CHECK-NODOT-NEXT: add x8, x8, #16 |
| ; CHECK-NODOT-NEXT: ushll v4.8h, v2.8b, #0 |
| ; CHECK-NODOT-NEXT: sshll v5.8h, v3.8b, #0 |
| ; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0 |
| ; CHECK-NODOT-NEXT: sshll2 v3.8h, v3.16b, #0 |
| ; CHECK-NODOT-NEXT: cmp x8, #16 |
| ; CHECK-NODOT-NEXT: smlal v1.4s, v4.4h, v5.4h |
| ; CHECK-NODOT-NEXT: smlal2 v1.4s, v4.8h, v5.8h |
| ; CHECK-NODOT-NEXT: smlal v1.4s, v2.4h, v3.4h |
| ; CHECK-NODOT-NEXT: smlal2 v1.4s, v2.8h, v3.8h |
| ; CHECK-NODOT-NEXT: b.ne .LBB9_1 |
| ; CHECK-NODOT-NEXT: // %bb.2: // %end |
| ; CHECK-NODOT-NEXT: ret |
| ; |
| ; CHECK-DOT-LABEL: sudot_in_loop: |
| ; CHECK-DOT: // %bb.0: // %entry |
| ; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000 |
| ; CHECK-DOT-NEXT: mov x8, xzr |
| ; CHECK-DOT-NEXT: .LBB9_1: // %vector.body |
| ; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-DOT-NEXT: ldr q2, [x0, x8] |
| ; CHECK-DOT-NEXT: ldr q3, [x1, x8] |
| ; CHECK-DOT-NEXT: mov v0.16b, v1.16b |
| ; CHECK-DOT-NEXT: add x8, x8, #16 |
| ; CHECK-DOT-NEXT: ushll v4.8h, v2.8b, #0 |
| ; CHECK-DOT-NEXT: sshll v5.8h, v3.8b, #0 |
| ; CHECK-DOT-NEXT: ushll2 v2.8h, v2.16b, #0 |
| ; CHECK-DOT-NEXT: sshll2 v3.8h, v3.16b, #0 |
| ; CHECK-DOT-NEXT: cmp x8, #16 |
| ; CHECK-DOT-NEXT: smlal v1.4s, v4.4h, v5.4h |
| ; CHECK-DOT-NEXT: smlal2 v1.4s, v4.8h, v5.8h |
| ; CHECK-DOT-NEXT: smlal v1.4s, v2.4h, v3.4h |
| ; CHECK-DOT-NEXT: smlal2 v1.4s, v2.8h, v3.8h |
| ; CHECK-DOT-NEXT: b.ne .LBB9_1 |
| ; CHECK-DOT-NEXT: // %bb.2: // %end |
| ; CHECK-DOT-NEXT: ret |
| ; |
| ; CHECK-DOT-I8MM-LABEL: sudot_in_loop: |
| ; CHECK-DOT-I8MM: // %bb.0: // %entry |
| ; CHECK-DOT-I8MM-NEXT: movi v1.2d, #0000000000000000 |
| ; CHECK-DOT-I8MM-NEXT: mov x8, xzr |
| ; CHECK-DOT-I8MM-NEXT: .LBB9_1: // %vector.body |
| ; CHECK-DOT-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-DOT-I8MM-NEXT: ldr q2, [x0, x8] |
| ; CHECK-DOT-I8MM-NEXT: ldr q3, [x1, x8] |
| ; CHECK-DOT-I8MM-NEXT: mov v0.16b, v1.16b |
| ; CHECK-DOT-I8MM-NEXT: add x8, x8, #16 |
| ; CHECK-DOT-I8MM-NEXT: usdot v1.4s, v2.16b, v3.16b |
| ; CHECK-DOT-I8MM-NEXT: cmp x8, #16 |
| ; CHECK-DOT-I8MM-NEXT: b.ne .LBB9_1 |
| ; CHECK-DOT-I8MM-NEXT: // %bb.2: // %end |
| ; CHECK-DOT-I8MM-NEXT: ret |
| entry: |
| br label %vector.body |
| |
| vector.body: |
| %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] |
| %acc = phi <4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce, %vector.body ] |
| %gep1 = getelementptr i8, ptr %p1, i64 %index |
| %load1 = load <16 x i8>, ptr %gep1, align 16 |
| %load1.wide = zext <16 x i8> %load1 to <16 x i32> |
| %gep2 = getelementptr i8, ptr %p2, i64 %index |
| %load2 = load <16 x i8>, ptr %gep2, align 16 |
| %load2.wide = sext <16 x i8> %load2 to <16 x i32> |
| %mul = mul nuw nsw <16 x i32> %load1.wide, %load2.wide |
| %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mul) |
| %index.next = add nuw i64 %index, 16 |
| %cmp = icmp eq i64 %index.next, 16 |
| br i1 %cmp, label %end, label %vector.body |
| |
| end: |
| ret <4 x i32> %acc |
| } |
| |
| ; sudot_narrow: 64-bit-vector variant of the swapped mixed-sign dot product. |
| ; Here %u is sign-extended and %s is zero-extended (so %u.wide holds the signed |
| ; operand despite its name); the 8 lane-wise products are partially reduced |
| ; into the <2 x i32> accumulator %acc. |
| ; Expected lowering: with +i8mm a single 2s/8b usdot with the source operands |
| ; swapped; otherwise sshll/ushll widening, smull/smlal and ext/add shuffles. |
| ; The intrinsic name carries the .v2i32.v8i32 suffix so that its mangling |
| ; matches the <2 x i32> result/accumulator and <8 x i32> input types used here. |
| ; NOTE(review): attribute group #0 is referenced here but its definition is not |
| ; visible in this part of the file - confirm it exists. |
| define <2 x i32> @sudot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ |
| ; CHECK-NODOT-LABEL: sudot_narrow: |
| ; CHECK-NODOT: // %bb.0: |
| ; CHECK-NODOT-NEXT: sshll v1.8h, v1.8b, #0 |
| ; CHECK-NODOT-NEXT: ushll v2.8h, v2.8b, #0 |
| ; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0 |
| ; CHECK-NODOT-NEXT: smull v3.4s, v2.4h, v1.4h |
| ; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v1.4h |
| ; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8 |
| ; CHECK-NODOT-NEXT: ext v5.16b, v2.16b, v2.16b, #8 |
| ; CHECK-NODOT-NEXT: smull2 v1.4s, v2.8h, v1.8h |
| ; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8 |
| ; CHECK-NODOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 |
| ; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s |
| ; CHECK-NODOT-NEXT: smlal v0.4s, v5.4h, v4.4h |
| ; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s |
| ; CHECK-NODOT-NEXT: ret |
| ; |
| ; CHECK-DOT-LABEL: sudot_narrow: |
| ; CHECK-DOT: // %bb.0: |
| ; CHECK-DOT-NEXT: sshll v1.8h, v1.8b, #0 |
| ; CHECK-DOT-NEXT: ushll v2.8h, v2.8b, #0 |
| ; CHECK-DOT-NEXT: // kill: def $d0 killed $d0 def $q0 |
| ; CHECK-DOT-NEXT: smull v3.4s, v2.4h, v1.4h |
| ; CHECK-DOT-NEXT: smlal v0.4s, v2.4h, v1.4h |
| ; CHECK-DOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8 |
| ; CHECK-DOT-NEXT: ext v5.16b, v2.16b, v2.16b, #8 |
| ; CHECK-DOT-NEXT: smull2 v1.4s, v2.8h, v1.8h |
| ; CHECK-DOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8 |
| ; CHECK-DOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 |
| ; CHECK-DOT-NEXT: add v0.2s, v3.2s, v0.2s |
| ; CHECK-DOT-NEXT: smlal v0.4s, v5.4h, v4.4h |
| ; CHECK-DOT-NEXT: add v0.2s, v1.2s, v0.2s |
| ; CHECK-DOT-NEXT: ret |
| ; |
| ; CHECK-DOT-I8MM-LABEL: sudot_narrow: |
| ; CHECK-DOT-I8MM: // %bb.0: |
| ; CHECK-DOT-I8MM-NEXT: usdot v0.2s, v2.8b, v1.8b |
| ; CHECK-DOT-I8MM-NEXT: ret |
| %u.wide = sext <8 x i8> %u to <8 x i32> |
| %s.wide = zext <8 x i8> %s to <8 x i32> |
| %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide |
| %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %mult) |
| ret <2 x i32> %partial.reduce |
| } |
| |
| ; udot_8to64: unsigned dot product widened from i8 all the way to i64. |
| ; Both <16 x i8> inputs are zero-extended to i64, multiplied lane-wise, and the |
| ; 16 products are partially reduced into the <4 x i64> accumulator %acc. |
| ; Expected lowering with +dotprod: udot into a zeroed 4s temporary, then |
| ; uaddw/uaddw2 of that temporary into v0.2d. Without +dotprod the products are |
| ; formed with umull/umull2, widened with ushll/ushll2 and added with |
| ; uaddw/uaddw2 into both v0 and v1. |
| define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) { |
| ; CHECK-NODOT-LABEL: udot_8to64: |
| ; CHECK-NODOT: // %bb.0: // %entry |
| ; CHECK-NODOT-NEXT: umull v4.8h, v2.8b, v3.8b |
| ; CHECK-NODOT-NEXT: umull2 v2.8h, v2.16b, v3.16b |
| ; CHECK-NODOT-NEXT: ushll v3.4s, v4.4h, #0 |
| ; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0 |
| ; CHECK-NODOT-NEXT: ushll2 v4.4s, v4.8h, #0 |
| ; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0 |
| ; CHECK-NODOT-NEXT: uaddw v1.2d, v1.2d, v5.2s |
| ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s |
| ; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v5.4s |
| ; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v3.4s |
| ; CHECK-NODOT-NEXT: uaddw v1.2d, v1.2d, v2.2s |
| ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v4.2s |
| ; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s |
| ; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v4.4s |
| ; CHECK-NODOT-NEXT: ret |
| ; |
| ; CHECK-DOT-LABEL: udot_8to64: |
| ; CHECK-DOT: // %bb.0: // %entry |
| ; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000 |
| ; CHECK-DOT-NEXT: udot v4.4s, v2.16b, v3.16b |
| ; CHECK-DOT-NEXT: uaddw v0.2d, v0.2d, v4.2s |
| ; CHECK-DOT-NEXT: uaddw2 v0.2d, v0.2d, v4.4s |
| ; CHECK-DOT-NEXT: ret |
| ; |
| ; CHECK-DOT-I8MM-LABEL: udot_8to64: |
| ; CHECK-DOT-I8MM: // %bb.0: // %entry |
| ; CHECK-DOT-I8MM-NEXT: movi v4.2d, #0000000000000000 |
| ; CHECK-DOT-I8MM-NEXT: udot v4.4s, v2.16b, v3.16b |
| ; CHECK-DOT-I8MM-NEXT: uaddw v0.2d, v0.2d, v4.2s |
| ; CHECK-DOT-I8MM-NEXT: uaddw2 v0.2d, v0.2d, v4.4s |
| ; CHECK-DOT-I8MM-NEXT: ret |
| entry: |
| %a.wide = zext <16 x i8> %a to <16 x i64> |
| %b.wide = zext <16 x i8> %b to <16 x i64> |
| %mult = mul nuw nsw <16 x i64> %a.wide, %b.wide |
| %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64( |
| <4 x i64> %acc, <16 x i64> %mult) |
| ret <4 x i64> %partial.reduce |
| } |
| |
| ; sdot_8to64: signed dot product widened from i8 all the way to i64. |
| ; Both <16 x i8> inputs are sign-extended to i64, multiplied lane-wise, and the |
| ; 16 products are partially reduced into the <4 x i64> accumulator %acc. |
| ; Expected lowering with +dotprod: sdot into a zeroed 4s temporary, then |
| ; saddw/saddw2 of that temporary into v0.2d. Without +dotprod the products are |
| ; formed with smull/smull2, widened with sshll/sshll2 and added with |
| ; saddw/saddw2 into both v0 and v1. |
| ; NOTE(review): the multiply carries nuw although both operands are |
| ; sign-extended - confirm the flag is intended. |
| define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){ |
| ; CHECK-NODOT-LABEL: sdot_8to64: |
| ; CHECK-NODOT: // %bb.0: // %entry |
| ; CHECK-NODOT-NEXT: smull v4.8h, v2.8b, v3.8b |
| ; CHECK-NODOT-NEXT: smull2 v2.8h, v2.16b, v3.16b |
| ; CHECK-NODOT-NEXT: sshll v3.4s, v4.4h, #0 |
| ; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0 |
| ; CHECK-NODOT-NEXT: sshll2 v4.4s, v4.8h, #0 |
| ; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0 |
| ; CHECK-NODOT-NEXT: saddw v1.2d, v1.2d, v5.2s |
| ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s |
| ; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v5.4s |
| ; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v3.4s |
| ; CHECK-NODOT-NEXT: saddw v1.2d, v1.2d, v2.2s |
| ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v4.2s |
| ; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s |
| ; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v4.4s |
| ; CHECK-NODOT-NEXT: ret |
| ; |
| ; CHECK-DOT-LABEL: sdot_8to64: |
| ; CHECK-DOT: // %bb.0: // %entry |
| ; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000 |
| ; CHECK-DOT-NEXT: sdot v4.4s, v2.16b, v3.16b |
| ; CHECK-DOT-NEXT: saddw v0.2d, v0.2d, v4.2s |
| ; CHECK-DOT-NEXT: saddw2 v0.2d, v0.2d, v4.4s |
| ; CHECK-DOT-NEXT: ret |
| ; |
| ; CHECK-DOT-I8MM-LABEL: sdot_8to64: |
| ; CHECK-DOT-I8MM: // %bb.0: // %entry |
| ; CHECK-DOT-I8MM-NEXT: movi v4.2d, #0000000000000000 |
| ; CHECK-DOT-I8MM-NEXT: sdot v4.4s, v2.16b, v3.16b |
| ; CHECK-DOT-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s |
| ; CHECK-DOT-I8MM-NEXT: saddw2 v0.2d, v0.2d, v4.4s |
| ; CHECK-DOT-I8MM-NEXT: ret |
| entry: |
| %a.wide = sext <16 x i8> %a to <16 x i64> |
| %b.wide = sext <16 x i8> %b to <16 x i64> |
| %mult = mul nuw nsw <16 x i64> %a.wide, %b.wide |
| %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64( |
| <4 x i64> %acc, <16 x i64> %mult) |
| ret <4 x i64> %partial.reduce |
| } |
| |
| ; usdot_8to64: mixed-sign dot product widened from i8 all the way to i64. |
| ; %a is zero-extended and %b is sign-extended to i64; the 16 lane-wise products |
| ; are partially reduced into the <4 x i64> accumulator %acc. |
| ; Expected lowering with +i8mm: usdot into a zeroed 4s temporary, then |
| ; saddw/saddw2 of that temporary into v0.2d. Without +i8mm (with or without |
| ; +dotprod) both inputs are widened in two ushll/sshll steps and accumulated |
| ; with smlal/smlal2 into both v0 and v1. |
| ; NOTE(review): the multiply carries nuw although one operand is sign-extended |
| ; - confirm the flag is intended. |
| define <4 x i64> @usdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){ |
| ; CHECK-NODOT-LABEL: usdot_8to64: |
| ; CHECK-NODOT: // %bb.0: // %entry |
| ; CHECK-NODOT-NEXT: ushll v4.8h, v2.8b, #0 |
| ; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0 |
| ; CHECK-NODOT-NEXT: sshll v5.8h, v3.8b, #0 |
| ; CHECK-NODOT-NEXT: sshll2 v3.8h, v3.16b, #0 |
| ; CHECK-NODOT-NEXT: ushll v6.4s, v4.4h, #0 |
| ; CHECK-NODOT-NEXT: ushll v7.4s, v2.4h, #0 |
| ; CHECK-NODOT-NEXT: sshll v16.4s, v5.4h, #0 |
| ; CHECK-NODOT-NEXT: sshll v17.4s, v3.4h, #0 |
| ; CHECK-NODOT-NEXT: ushll2 v4.4s, v4.8h, #0 |
| ; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0 |
| ; CHECK-NODOT-NEXT: sshll2 v5.4s, v5.8h, #0 |
| ; CHECK-NODOT-NEXT: sshll2 v3.4s, v3.8h, #0 |
| ; CHECK-NODOT-NEXT: smlal v0.2d, v6.2s, v16.2s |
| ; CHECK-NODOT-NEXT: smlal v1.2d, v7.2s, v17.2s |
| ; CHECK-NODOT-NEXT: smlal2 v0.2d, v6.4s, v16.4s |
| ; CHECK-NODOT-NEXT: smlal2 v1.2d, v7.4s, v17.4s |
| ; CHECK-NODOT-NEXT: smlal v0.2d, v4.2s, v5.2s |
| ; CHECK-NODOT-NEXT: smlal v1.2d, v2.2s, v3.2s |
| ; CHECK-NODOT-NEXT: smlal2 v0.2d, v4.4s, v5.4s |
| ; CHECK-NODOT-NEXT: smlal2 v1.2d, v2.4s, v3.4s |
| ; CHECK-NODOT-NEXT: ret |
| ; |
| ; CHECK-DOT-LABEL: usdot_8to64: |
| ; CHECK-DOT: // %bb.0: // %entry |
| ; CHECK-DOT-NEXT: ushll v4.8h, v2.8b, #0 |
| ; CHECK-DOT-NEXT: ushll2 v2.8h, v2.16b, #0 |
| ; CHECK-DOT-NEXT: sshll v5.8h, v3.8b, #0 |
| ; CHECK-DOT-NEXT: sshll2 v3.8h, v3.16b, #0 |
| ; CHECK-DOT-NEXT: ushll v6.4s, v4.4h, #0 |
| ; CHECK-DOT-NEXT: ushll v7.4s, v2.4h, #0 |
| ; CHECK-DOT-NEXT: sshll v16.4s, v5.4h, #0 |
| ; CHECK-DOT-NEXT: sshll v17.4s, v3.4h, #0 |
| ; CHECK-DOT-NEXT: ushll2 v4.4s, v4.8h, #0 |
| ; CHECK-DOT-NEXT: ushll2 v2.4s, v2.8h, #0 |
| ; CHECK-DOT-NEXT: sshll2 v5.4s, v5.8h, #0 |
| ; CHECK-DOT-NEXT: sshll2 v3.4s, v3.8h, #0 |
| ; CHECK-DOT-NEXT: smlal v0.2d, v6.2s, v16.2s |
| ; CHECK-DOT-NEXT: smlal v1.2d, v7.2s, v17.2s |
| ; CHECK-DOT-NEXT: smlal2 v0.2d, v6.4s, v16.4s |
| ; CHECK-DOT-NEXT: smlal2 v1.2d, v7.4s, v17.4s |
| ; CHECK-DOT-NEXT: smlal v0.2d, v4.2s, v5.2s |
| ; CHECK-DOT-NEXT: smlal v1.2d, v2.2s, v3.2s |
| ; CHECK-DOT-NEXT: smlal2 v0.2d, v4.4s, v5.4s |
| ; CHECK-DOT-NEXT: smlal2 v1.2d, v2.4s, v3.4s |
| ; CHECK-DOT-NEXT: ret |
| ; |
| ; CHECK-DOT-I8MM-LABEL: usdot_8to64: |
| ; CHECK-DOT-I8MM: // %bb.0: // %entry |
| ; CHECK-DOT-I8MM-NEXT: movi v4.2d, #0000000000000000 |
| ; CHECK-DOT-I8MM-NEXT: usdot v4.4s, v2.16b, v3.16b |
| ; CHECK-DOT-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s |
| ; CHECK-DOT-I8MM-NEXT: saddw2 v0.2d, v0.2d, v4.4s |
| ; CHECK-DOT-I8MM-NEXT: ret |
| entry: |
| %a.wide = zext <16 x i8> %a to <16 x i64> |
| %b.wide = sext <16 x i8> %b to <16 x i64> |
| %mult = mul nuw nsw <16 x i64> %a.wide, %b.wide |
| %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64( |
| <4 x i64> %acc, <16 x i64> %mult) |
| ret <4 x i64> %partial.reduce |
| } |
| |
| ; sudot_8to64: mixed-sign dot product widened from i8 to i64, with the |
| ; extension kinds swapped relative to usdot_8to64: %a is sign-extended and %b |
| ; is zero-extended. The 16 lane-wise products are partially reduced into the |
| ; <4 x i64> accumulator %acc. |
| ; Expected lowering with +i8mm: usdot (source operands swapped) into a zeroed |
| ; 4s temporary, then saddw/saddw2 of that temporary into v0.2d. Without +i8mm |
| ; both inputs are widened in two sshll/ushll steps and accumulated with |
| ; smlal/smlal2 into both v0 and v1. |
| ; NOTE(review): the multiply carries nuw although one operand is sign-extended |
| ; - confirm the flag is intended. |
| define <4 x i64> @sudot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) { |
| ; CHECK-NODOT-LABEL: sudot_8to64: |
| ; CHECK-NODOT: // %bb.0: // %entry |
| ; CHECK-NODOT-NEXT: sshll v4.8h, v2.8b, #0 |
| ; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0 |
| ; CHECK-NODOT-NEXT: ushll v5.8h, v3.8b, #0 |
| ; CHECK-NODOT-NEXT: ushll2 v3.8h, v3.16b, #0 |
| ; CHECK-NODOT-NEXT: sshll v6.4s, v4.4h, #0 |
| ; CHECK-NODOT-NEXT: sshll v7.4s, v2.4h, #0 |
| ; CHECK-NODOT-NEXT: ushll v16.4s, v5.4h, #0 |
| ; CHECK-NODOT-NEXT: ushll v17.4s, v3.4h, #0 |
| ; CHECK-NODOT-NEXT: sshll2 v4.4s, v4.8h, #0 |
| ; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0 |
| ; CHECK-NODOT-NEXT: ushll2 v5.4s, v5.8h, #0 |
| ; CHECK-NODOT-NEXT: ushll2 v3.4s, v3.8h, #0 |
| ; CHECK-NODOT-NEXT: smlal v0.2d, v6.2s, v16.2s |
| ; CHECK-NODOT-NEXT: smlal v1.2d, v7.2s, v17.2s |
| ; CHECK-NODOT-NEXT: smlal2 v0.2d, v6.4s, v16.4s |
| ; CHECK-NODOT-NEXT: smlal2 v1.2d, v7.4s, v17.4s |
| ; CHECK-NODOT-NEXT: smlal v0.2d, v4.2s, v5.2s |
| ; CHECK-NODOT-NEXT: smlal v1.2d, v2.2s, v3.2s |
| ; CHECK-NODOT-NEXT: smlal2 v0.2d, v4.4s, v5.4s |
| ; CHECK-NODOT-NEXT: smlal2 v1.2d, v2.4s, v3.4s |
| ; CHECK-NODOT-NEXT: ret |
| ; |
| ; CHECK-DOT-LABEL: sudot_8to64: |
| ; CHECK-DOT: // %bb.0: // %entry |
| ; CHECK-DOT-NEXT: sshll v4.8h, v2.8b, #0 |
| ; CHECK-DOT-NEXT: sshll2 v2.8h, v2.16b, #0 |
| ; CHECK-DOT-NEXT: ushll v5.8h, v3.8b, #0 |
| ; CHECK-DOT-NEXT: ushll2 v3.8h, v3.16b, #0 |
| ; CHECK-DOT-NEXT: sshll v6.4s, v4.4h, #0 |
| ; CHECK-DOT-NEXT: sshll v7.4s, v2.4h, #0 |
| ; CHECK-DOT-NEXT: ushll v16.4s, v5.4h, #0 |
| ; CHECK-DOT-NEXT: ushll v17.4s, v3.4h, #0 |
| ; CHECK-DOT-NEXT: sshll2 v4.4s, v4.8h, #0 |
| ; CHECK-DOT-NEXT: sshll2 v2.4s, v2.8h, #0 |
| ; CHECK-DOT-NEXT: ushll2 v5.4s, v5.8h, #0 |
| ; CHECK-DOT-NEXT: ushll2 v3.4s, v3.8h, #0 |
| ; CHECK-DOT-NEXT: smlal v0.2d, v6.2s, v16.2s |
| ; CHECK-DOT-NEXT: smlal v1.2d, v7.2s, v17.2s |
| ; CHECK-DOT-NEXT: smlal2 v0.2d, v6.4s, v16.4s |
| ; CHECK-DOT-NEXT: smlal2 v1.2d, v7.4s, v17.4s |
| ; CHECK-DOT-NEXT: smlal v0.2d, v4.2s, v5.2s |
| ; CHECK-DOT-NEXT: smlal v1.2d, v2.2s, v3.2s |
| ; CHECK-DOT-NEXT: smlal2 v0.2d, v4.4s, v5.4s |
| ; CHECK-DOT-NEXT: smlal2 v1.2d, v2.4s, v3.4s |
| ; CHECK-DOT-NEXT: ret |
| ; |
| ; CHECK-DOT-I8MM-LABEL: sudot_8to64: |
| ; CHECK-DOT-I8MM: // %bb.0: // %entry |
| ; CHECK-DOT-I8MM-NEXT: movi v4.2d, #0000000000000000 |
| ; CHECK-DOT-I8MM-NEXT: usdot v4.4s, v3.16b, v2.16b |
| ; CHECK-DOT-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s |
| ; CHECK-DOT-I8MM-NEXT: saddw2 v0.2d, v0.2d, v4.4s |
| ; CHECK-DOT-I8MM-NEXT: ret |
| entry: |
| %a.wide = sext <16 x i8> %a to <16 x i64> |
| %b.wide = zext <16 x i8> %b to <16 x i64> |
| %mult = mul nuw nsw <16 x i64> %a.wide, %b.wide |
| %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64( |
| <4 x i64> %acc, <16 x i64> %mult) |
| ret <4 x i64> %partial.reduce |
| } |
| |
| ; Partial reduction with no multiply: %a is zero-extended from <16 x i8> to |
| ; <16 x i32> and accumulated directly into the <4 x i32> %acc. |
| ; Without +dotprod the expected code widens with ushll/ushll2 and accumulates |
| ; with uaddw/uaddw2. With +dotprod the expected code is a single udot whose |
| ; second source is a splat of 1 (movi v2.16b, #1). |
| define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){ |
| ; CHECK-NODOT-LABEL: udot_no_bin_op: |
| ; CHECK-NODOT: // %bb.0: |
| ; CHECK-NODOT-NEXT: ushll v2.8h, v1.8b, #0 |
| ; CHECK-NODOT-NEXT: ushll2 v1.8h, v1.16b, #0 |
| ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v2.4h |
| ; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v2.8h |
| ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h |
| ; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v1.8h |
| ; CHECK-NODOT-NEXT: ret |
| ; |
| ; CHECK-DOT-LABEL: udot_no_bin_op: |
| ; CHECK-DOT: // %bb.0: |
| ; CHECK-DOT-NEXT: movi v2.16b, #1 |
| ; CHECK-DOT-NEXT: udot v0.4s, v1.16b, v2.16b |
| ; CHECK-DOT-NEXT: ret |
| ; |
| ; CHECK-DOT-I8MM-LABEL: udot_no_bin_op: |
| ; CHECK-DOT-I8MM: // %bb.0: |
| ; CHECK-DOT-I8MM-NEXT: movi v2.16b, #1 |
| ; CHECK-DOT-I8MM-NEXT: udot v0.4s, v1.16b, v2.16b |
| ; CHECK-DOT-I8MM-NEXT: ret |
| %a.wide = zext <16 x i8> %a to <16 x i32> |
| %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide) |
| ret <4 x i32> %partial.reduce |
| } |
| |
| ; Loop form of the zext-only partial reduction: each iteration loads 16 bytes |
| ; from %p, zero-extends them to <16 x i32> and accumulates into a <4 x i32> phi. |
| ; With +dotprod the expected code keeps the splat of 1 (movi v2.16b, #1) in the |
| ; entry block and issues one udot per iteration; without it the loop body uses |
| ; ushll/ushll2 followed by uaddw/uaddw2. |
| define <4 x i32> @udot_no_bin_op_in_loop(ptr %p){ |
| ; CHECK-NODOT-LABEL: udot_no_bin_op_in_loop: |
| ; CHECK-NODOT: // %bb.0: // %entry |
| ; CHECK-NODOT-NEXT: movi v1.2d, #0000000000000000 |
| ; CHECK-NODOT-NEXT: mov x8, xzr |
| ; CHECK-NODOT-NEXT: .LBB16_1: // %vector.body |
| ; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-NODOT-NEXT: ldr q2, [x0, x8] |
| ; CHECK-NODOT-NEXT: mov v0.16b, v1.16b |
| ; CHECK-NODOT-NEXT: add x8, x8, #16 |
| ; CHECK-NODOT-NEXT: cmp x8, #16 |
| ; CHECK-NODOT-NEXT: ushll v3.8h, v2.8b, #0 |
| ; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0 |
| ; CHECK-NODOT-NEXT: uaddw v1.4s, v1.4s, v3.4h |
| ; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v3.8h |
| ; CHECK-NODOT-NEXT: uaddw v1.4s, v1.4s, v2.4h |
| ; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v2.8h |
| ; CHECK-NODOT-NEXT: b.ne .LBB16_1 |
| ; CHECK-NODOT-NEXT: // %bb.2: // %end |
| ; CHECK-NODOT-NEXT: ret |
| ; |
| ; CHECK-DOT-LABEL: udot_no_bin_op_in_loop: |
| ; CHECK-DOT: // %bb.0: // %entry |
| ; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000 |
| ; CHECK-DOT-NEXT: movi v2.16b, #1 |
| ; CHECK-DOT-NEXT: mov x8, xzr |
| ; CHECK-DOT-NEXT: .LBB16_1: // %vector.body |
| ; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-DOT-NEXT: ldr q3, [x0, x8] |
| ; CHECK-DOT-NEXT: mov v0.16b, v1.16b |
| ; CHECK-DOT-NEXT: add x8, x8, #16 |
| ; CHECK-DOT-NEXT: cmp x8, #16 |
| ; CHECK-DOT-NEXT: udot v1.4s, v3.16b, v2.16b |
| ; CHECK-DOT-NEXT: b.ne .LBB16_1 |
| ; CHECK-DOT-NEXT: // %bb.2: // %end |
| ; CHECK-DOT-NEXT: ret |
| ; |
| ; CHECK-DOT-I8MM-LABEL: udot_no_bin_op_in_loop: |
| ; CHECK-DOT-I8MM: // %bb.0: // %entry |
| ; CHECK-DOT-I8MM-NEXT: movi v1.2d, #0000000000000000 |
| ; CHECK-DOT-I8MM-NEXT: movi v2.16b, #1 |
| ; CHECK-DOT-I8MM-NEXT: mov x8, xzr |
| ; CHECK-DOT-I8MM-NEXT: .LBB16_1: // %vector.body |
| ; CHECK-DOT-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-DOT-I8MM-NEXT: ldr q3, [x0, x8] |
| ; CHECK-DOT-I8MM-NEXT: mov v0.16b, v1.16b |
| ; CHECK-DOT-I8MM-NEXT: add x8, x8, #16 |
| ; CHECK-DOT-I8MM-NEXT: cmp x8, #16 |
| ; CHECK-DOT-I8MM-NEXT: udot v1.4s, v3.16b, v2.16b |
| ; CHECK-DOT-I8MM-NEXT: b.ne .LBB16_1 |
| ; CHECK-DOT-I8MM-NEXT: // %bb.2: // %end |
| ; CHECK-DOT-I8MM-NEXT: ret |
| entry: |
| br label %vector.body |
| |
| vector.body: |
| ; %index is the byte offset into %p; %acc carries the running <4 x i32> sum. |
| %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] |
| %acc = phi <4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce, %vector.body ] |
| %gep = getelementptr i8, ptr %p, i64 %index |
| %load = load <16 x i8>, ptr %gep, align 16 |
| %load.wide = zext <16 x i8> %load to <16 x i32> |
| %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %load.wide) |
| ; %index starts at 0 and steps by 16, so the exit compare against 16 is already |
| ; true at the end of the first iteration. |
| %index.next = add nuw i64 %index, 16 |
| %cmp = icmp eq i64 %index.next, 16 |
| br i1 %cmp, label %end, label %vector.body |
| |
| end: |
| ; NOTE(review): this returns the phi %acc rather than %partial.reduce, which is |
| ; why the expected code copies v1 into v0 before the accumulate - confirm that |
| ; returning the pre-update accumulator is intended. |
| ret <4 x i32> %acc |
| } |
| |
| ; Signed counterpart of the zext-only case: %a is sign-extended from <16 x i8> |
| ; to <16 x i32> and accumulated into the <4 x i32> %acc with no multiply. |
| ; Without +dotprod the expected code is sshll/sshll2 plus saddw/saddw2; with |
| ; +dotprod it is an sdot against a splat of 1 (movi v2.16b, #1). |
| define <4 x i32> @sdot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){ |
| ; CHECK-NODOT-LABEL: sdot_no_bin_op: |
| ; CHECK-NODOT: // %bb.0: |
| ; CHECK-NODOT-NEXT: sshll v2.8h, v1.8b, #0 |
| ; CHECK-NODOT-NEXT: sshll2 v1.8h, v1.16b, #0 |
| ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v2.4h |
| ; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v2.8h |
| ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h |
| ; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v1.8h |
| ; CHECK-NODOT-NEXT: ret |
| ; |
| ; CHECK-DOT-LABEL: sdot_no_bin_op: |
| ; CHECK-DOT: // %bb.0: |
| ; CHECK-DOT-NEXT: movi v2.16b, #1 |
| ; CHECK-DOT-NEXT: sdot v0.4s, v1.16b, v2.16b |
| ; CHECK-DOT-NEXT: ret |
| ; |
| ; CHECK-DOT-I8MM-LABEL: sdot_no_bin_op: |
| ; CHECK-DOT-I8MM: // %bb.0: |
| ; CHECK-DOT-I8MM-NEXT: movi v2.16b, #1 |
| ; CHECK-DOT-I8MM-NEXT: sdot v0.4s, v1.16b, v2.16b |
| ; CHECK-DOT-I8MM-NEXT: ret |
| %a.wide = sext <16 x i8> %a to <16 x i32> |
| %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide) |
| ret <4 x i32> %partial.reduce |
| } |
| |
| ; 64-bit vector variant of the zext-only case: %a is zero-extended from |
| ; <8 x i8> to <8 x i32> and reduced into the <2 x i32> %acc. |
| ; With +dotprod the expected code is the 64-bit form of udot (v0.2s, .8b |
| ; sources) against a splat of 1; without it the expected code is a longer |
| ; ushll/ext/uaddw/add sequence. |
| define <2 x i32> @udot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){ |
| ; CHECK-NODOT-LABEL: udot_no_bin_op_narrow: |
| ; CHECK-NODOT: // %bb.0: |
| ; CHECK-NODOT-NEXT: ushll v1.8h, v1.8b, #0 |
| ; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0 |
| ; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0 |
| ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h |
| ; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0 |
| ; CHECK-NODOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 |
| ; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8 |
| ; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s |
| ; CHECK-NODOT-NEXT: ext v2.16b, v3.16b, v3.16b, #8 |
| ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h |
| ; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s |
| ; CHECK-NODOT-NEXT: ret |
| ; |
| ; CHECK-DOT-LABEL: udot_no_bin_op_narrow: |
| ; CHECK-DOT: // %bb.0: |
| ; CHECK-DOT-NEXT: movi v2.8b, #1 |
| ; CHECK-DOT-NEXT: udot v0.2s, v1.8b, v2.8b |
| ; CHECK-DOT-NEXT: ret |
| ; |
| ; CHECK-DOT-I8MM-LABEL: udot_no_bin_op_narrow: |
| ; CHECK-DOT-I8MM: // %bb.0: |
| ; CHECK-DOT-I8MM-NEXT: movi v2.8b, #1 |
| ; CHECK-DOT-I8MM-NEXT: udot v0.2s, v1.8b, v2.8b |
| ; CHECK-DOT-I8MM-NEXT: ret |
| %a.wide = zext <8 x i8> %a to <8 x i32> |
| %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide) |
| ret <2 x i32> %partial.reduce |
| } |
| |
| ; 64-bit vector variant of the sext-only case: %a is sign-extended from |
| ; <8 x i8> to <8 x i32> and reduced into the <2 x i32> %acc. |
| ; With +dotprod the expected code is the 64-bit form of sdot (v0.2s, .8b |
| ; sources) against a splat of 1; without it the expected code is a longer |
| ; sshll/ext/saddw/add sequence. |
| define <2 x i32> @sdot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){ |
| ; CHECK-NODOT-LABEL: sdot_no_bin_op_narrow: |
| ; CHECK-NODOT: // %bb.0: |
| ; CHECK-NODOT-NEXT: sshll v1.8h, v1.8b, #0 |
| ; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0 |
| ; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0 |
| ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h |
| ; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0 |
| ; CHECK-NODOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8 |
| ; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8 |
| ; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s |
| ; CHECK-NODOT-NEXT: ext v2.16b, v3.16b, v3.16b, #8 |
| ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h |
| ; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s |
| ; CHECK-NODOT-NEXT: ret |
| ; |
| ; CHECK-DOT-LABEL: sdot_no_bin_op_narrow: |
| ; CHECK-DOT: // %bb.0: |
| ; CHECK-DOT-NEXT: movi v2.8b, #1 |
| ; CHECK-DOT-NEXT: sdot v0.2s, v1.8b, v2.8b |
| ; CHECK-DOT-NEXT: ret |
| ; |
| ; CHECK-DOT-I8MM-LABEL: sdot_no_bin_op_narrow: |
| ; CHECK-DOT-I8MM: // %bb.0: |
| ; CHECK-DOT-I8MM-NEXT: movi v2.8b, #1 |
| ; CHECK-DOT-I8MM-NEXT: sdot v0.2s, v1.8b, v2.8b |
| ; CHECK-DOT-I8MM-NEXT: ret |
| %a.wide = sext <8 x i8> %a to <8 x i32> |
| %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide) |
| ret <2 x i32> %partial.reduce |
| } |
| |
| ; zext-only partial reduction from i8 straight to i64: %a is zero-extended from |
| ; <16 x i8> to <16 x i64> and reduced into the <4 x i64> %acc. |
| ; With +dotprod the expected code performs a udot against a splat of 1 into a |
| ; zeroed .4s temporary (v4) and then widens that temporary into v0.2d with |
| ; uaddw/uaddw2. Without +dotprod the expected code is two levels of |
| ; ushll/ushll2 followed by uaddw/uaddw2 into both v0 and v1. |
| define <4 x i64> @udot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){ |
| ; CHECK-NODOT-LABEL: udot_no_bin_op_8to64: |
| ; CHECK-NODOT: // %bb.0: |
| ; CHECK-NODOT-NEXT: ushll v3.8h, v2.8b, #0 |
| ; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0 |
| ; CHECK-NODOT-NEXT: ushll v4.4s, v3.4h, #0 |
| ; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0 |
| ; CHECK-NODOT-NEXT: ushll2 v3.4s, v3.8h, #0 |
| ; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0 |
| ; CHECK-NODOT-NEXT: uaddw v1.2d, v1.2d, v5.2s |
| ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v4.2s |
| ; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v5.4s |
| ; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v4.4s |
| ; CHECK-NODOT-NEXT: uaddw v1.2d, v1.2d, v2.2s |
| ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s |
| ; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s |
| ; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v3.4s |
| ; CHECK-NODOT-NEXT: ret |
| ; |
| ; CHECK-DOT-LABEL: udot_no_bin_op_8to64: |
| ; CHECK-DOT: // %bb.0: |
| ; CHECK-DOT-NEXT: movi v3.16b, #1 |
| ; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000 |
| ; CHECK-DOT-NEXT: udot v4.4s, v2.16b, v3.16b |
| ; CHECK-DOT-NEXT: uaddw v0.2d, v0.2d, v4.2s |
| ; CHECK-DOT-NEXT: uaddw2 v0.2d, v0.2d, v4.4s |
| ; CHECK-DOT-NEXT: ret |
| ; |
| ; CHECK-DOT-I8MM-LABEL: udot_no_bin_op_8to64: |
| ; CHECK-DOT-I8MM: // %bb.0: |
| ; CHECK-DOT-I8MM-NEXT: movi v3.16b, #1 |
| ; CHECK-DOT-I8MM-NEXT: movi v4.2d, #0000000000000000 |
| ; CHECK-DOT-I8MM-NEXT: udot v4.4s, v2.16b, v3.16b |
| ; CHECK-DOT-I8MM-NEXT: uaddw v0.2d, v0.2d, v4.2s |
| ; CHECK-DOT-I8MM-NEXT: uaddw2 v0.2d, v0.2d, v4.4s |
| ; CHECK-DOT-I8MM-NEXT: ret |
| %a.wide = zext <16 x i8> %a to <16 x i64> |
| %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide) |
| ret <4 x i64> %partial.reduce |
| } |
| |
| ; sext-only partial reduction from i8 straight to i64: %a is sign-extended from |
| ; <16 x i8> to <16 x i64> and reduced into the <4 x i64> %acc. |
| ; With +dotprod the expected code performs an sdot against a splat of 1 into a |
| ; zeroed .4s temporary (v4) and then widens that temporary into v0.2d with |
| ; saddw/saddw2. Without +dotprod the expected code is two levels of |
| ; sshll/sshll2 followed by saddw/saddw2 into both v0 and v1. |
| define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){ |
| ; CHECK-NODOT-LABEL: sdot_no_bin_op_8to64: |
| ; CHECK-NODOT: // %bb.0: |
| ; CHECK-NODOT-NEXT: sshll v3.8h, v2.8b, #0 |
| ; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0 |
| ; CHECK-NODOT-NEXT: sshll v4.4s, v3.4h, #0 |
| ; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0 |
| ; CHECK-NODOT-NEXT: sshll2 v3.4s, v3.8h, #0 |
| ; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0 |
| ; CHECK-NODOT-NEXT: saddw v1.2d, v1.2d, v5.2s |
| ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v4.2s |
| ; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v5.4s |
| ; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v4.4s |
| ; CHECK-NODOT-NEXT: saddw v1.2d, v1.2d, v2.2s |
| ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s |
| ; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s |
| ; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v3.4s |
| ; CHECK-NODOT-NEXT: ret |
| ; |
| ; CHECK-DOT-LABEL: sdot_no_bin_op_8to64: |
| ; CHECK-DOT: // %bb.0: |
| ; CHECK-DOT-NEXT: movi v3.16b, #1 |
| ; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000 |
| ; CHECK-DOT-NEXT: sdot v4.4s, v2.16b, v3.16b |
| ; CHECK-DOT-NEXT: saddw v0.2d, v0.2d, v4.2s |
| ; CHECK-DOT-NEXT: saddw2 v0.2d, v0.2d, v4.4s |
| ; CHECK-DOT-NEXT: ret |
| ; |
| ; CHECK-DOT-I8MM-LABEL: sdot_no_bin_op_8to64: |
| ; CHECK-DOT-I8MM: // %bb.0: |
| ; CHECK-DOT-I8MM-NEXT: movi v3.16b, #1 |
| ; CHECK-DOT-I8MM-NEXT: movi v4.2d, #0000000000000000 |
| ; CHECK-DOT-I8MM-NEXT: sdot v4.4s, v2.16b, v3.16b |
| ; CHECK-DOT-I8MM-NEXT: saddw v0.2d, v0.2d, v4.2s |
| ; CHECK-DOT-I8MM-NEXT: saddw2 v0.2d, v0.2d, v4.4s |
| ; CHECK-DOT-I8MM-NEXT: ret |
| %a.wide = sext <16 x i8> %a to <16 x i64> |
| %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide) |
| ret <4 x i64> %partial.reduce |
| } |
| |
| ; Negative test: the multiply inputs are only <8 x i8>, so the partial reduction |
| ; is <8 x i32> -> <4 x i32>. All three RUN configurations share one expected |
| ; sequence (umull followed by uaddw/uaddw2) and no dot-product instruction. |
| ; The intrinsic suffix is spelled .v4i32.v8i32 so that it matches the |
| ; <4 x i32> accumulator and <8 x i32> input actually passed to the call. |
| ; NOTE(review): attribute group #0 is not defined in the part of the file |
| ; reviewed here - confirm it is defined elsewhere or drop the reference. |
| define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ |
| ; CHECK-COMMON-LABEL: not_udot: |
| ; CHECK-COMMON: // %bb.0: |
| ; CHECK-COMMON-NEXT: umull v1.8h, v2.8b, v1.8b |
| ; CHECK-COMMON-NEXT: uaddw v0.4s, v0.4s, v1.4h |
| ; CHECK-COMMON-NEXT: uaddw2 v0.4s, v0.4s, v1.8h |
| ; CHECK-COMMON-NEXT: ret |
| %u.wide = zext <8 x i8> %u to <8 x i32> |
| %s.wide = zext <8 x i8> %s to <8 x i32> |
| %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide |
| %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> %acc, <8 x i32> %mult) |
| ret <4 x i32> %partial.reduce |
| } |
| |
| ; Negative test: the multiply inputs are only <4 x i8>, so the partial reduction |
| ; is <4 x i32> -> <2 x i32>. All three RUN configurations share one expected |
| ; sequence (bic masking, umull/umlal, ext, add) and no dot-product instruction. |
| ; The intrinsic suffix is spelled .v2i32.v4i32 so that it matches the |
| ; <2 x i32> accumulator and <4 x i32> input actually passed to the call. |
| define <2 x i32> @not_udot_narrow(<2 x i32> %acc, <4 x i8> %u, <4 x i8> %s) { |
| ; CHECK-COMMON-LABEL: not_udot_narrow: |
| ; CHECK-COMMON: // %bb.0: |
| ; CHECK-COMMON-NEXT: bic v1.4h, #255, lsl #8 |
| ; CHECK-COMMON-NEXT: bic v2.4h, #255, lsl #8 |
| ; CHECK-COMMON-NEXT: // kill: def $d0 killed $d0 def $q0 |
| ; CHECK-COMMON-NEXT: umull v3.4s, v2.4h, v1.4h |
| ; CHECK-COMMON-NEXT: umlal v0.4s, v2.4h, v1.4h |
| ; CHECK-COMMON-NEXT: ext v1.16b, v3.16b, v3.16b, #8 |
| ; CHECK-COMMON-NEXT: add v0.2s, v1.2s, v0.2s |
| ; CHECK-COMMON-NEXT: ret |
| %u.wide = zext <4 x i8> %u to <4 x i32> |
| %s.wide = zext <4 x i8> %s to <4 x i32> |
| %mult = mul nuw nsw <4 x i32> %s.wide, %u.wide |
| %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v4i32(<2 x i32> %acc, <4 x i32> %mult) |
| ret <2 x i32> %partial.reduce |
| } |
| |
| ; Mixed source widths, both unsigned: %a is <8 x i16> and %b is <8 x i8>, each |
| ; zero-extended to <8 x i64> before the multiply and the 8 -> 2 lane reduction. |
| ; All three RUN configurations share one expected sequence (ushll widening |
| ; followed by umlal/umlal2); no dot-product instruction is expected. |
| define <2 x i64> @udot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){ |
| ; CHECK-COMMON-LABEL: udot_different_types: |
| ; CHECK-COMMON: // %bb.0: // %entry |
| ; CHECK-COMMON-NEXT: ushll v2.8h, v2.8b, #0 |
| ; CHECK-COMMON-NEXT: ushll v3.4s, v1.4h, #0 |
| ; CHECK-COMMON-NEXT: ushll2 v1.4s, v1.8h, #0 |
| ; CHECK-COMMON-NEXT: ushll v4.4s, v2.4h, #0 |
| ; CHECK-COMMON-NEXT: ushll2 v2.4s, v2.8h, #0 |
| ; CHECK-COMMON-NEXT: umlal v0.2d, v3.2s, v4.2s |
| ; CHECK-COMMON-NEXT: umlal2 v0.2d, v3.4s, v4.4s |
| ; CHECK-COMMON-NEXT: umlal v0.2d, v1.2s, v2.2s |
| ; CHECK-COMMON-NEXT: umlal2 v0.2d, v1.4s, v2.4s |
| ; CHECK-COMMON-NEXT: ret |
| entry: |
| %a.wide = zext <8 x i16> %a to <8 x i64> |
| %b.wide = zext <8 x i8> %b to <8 x i64> |
| %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide |
| %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult) |
| ret <2 x i64> %partial.reduce |
| } |
| |
| ; Mixed source widths, both signed: %a is <8 x i16> and %b is <8 x i8>, each |
| ; sign-extended to <8 x i64> before the multiply and the 8 -> 2 lane reduction. |
| ; All three RUN configurations share one expected sequence (sshll widening |
| ; followed by smlal/smlal2); no dot-product instruction is expected. |
| define <2 x i64> @sdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){ |
| ; CHECK-COMMON-LABEL: sdot_different_types: |
| ; CHECK-COMMON: // %bb.0: // %entry |
| ; CHECK-COMMON-NEXT: sshll v2.8h, v2.8b, #0 |
| ; CHECK-COMMON-NEXT: sshll v3.4s, v1.4h, #0 |
| ; CHECK-COMMON-NEXT: sshll2 v1.4s, v1.8h, #0 |
| ; CHECK-COMMON-NEXT: sshll v4.4s, v2.4h, #0 |
| ; CHECK-COMMON-NEXT: sshll2 v2.4s, v2.8h, #0 |
| ; CHECK-COMMON-NEXT: smlal v0.2d, v3.2s, v4.2s |
| ; CHECK-COMMON-NEXT: smlal2 v0.2d, v3.4s, v4.4s |
| ; CHECK-COMMON-NEXT: smlal v0.2d, v1.2s, v2.2s |
| ; CHECK-COMMON-NEXT: smlal2 v0.2d, v1.4s, v2.4s |
| ; CHECK-COMMON-NEXT: ret |
| entry: |
| %a.wide = sext <8 x i16> %a to <8 x i64> |
| %b.wide = sext <8 x i8> %b to <8 x i64> |
| %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide |
| %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult) |
| ret <2 x i64> %partial.reduce |
| } |
| |
| ; Mixed source widths and mixed signedness: %a is <8 x i16> zero-extended and |
| ; %b is <8 x i8> sign-extended, both to <8 x i64>, before the multiply and the |
| ; 8 -> 2 lane reduction. |
| ; All three RUN configurations share one expected sequence (ushll for %a, sshll |
| ; for %b, then smlal/smlal2); no dot-product instruction is expected. |
| define <2 x i64> @usdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){ |
| ; CHECK-COMMON-LABEL: usdot_different_types: |
| ; CHECK-COMMON: // %bb.0: // %entry |
| ; CHECK-COMMON-NEXT: sshll v2.8h, v2.8b, #0 |
| ; CHECK-COMMON-NEXT: ushll v3.4s, v1.4h, #0 |
| ; CHECK-COMMON-NEXT: ushll2 v1.4s, v1.8h, #0 |
| ; CHECK-COMMON-NEXT: sshll v4.4s, v2.4h, #0 |
| ; CHECK-COMMON-NEXT: sshll2 v2.4s, v2.8h, #0 |
| ; CHECK-COMMON-NEXT: smlal v0.2d, v3.2s, v4.2s |
| ; CHECK-COMMON-NEXT: smlal2 v0.2d, v3.4s, v4.4s |
| ; CHECK-COMMON-NEXT: smlal v0.2d, v1.2s, v2.2s |
| ; CHECK-COMMON-NEXT: smlal2 v0.2d, v1.4s, v2.4s |
| ; CHECK-COMMON-NEXT: ret |
| entry: |
| %a.wide = zext <8 x i16> %a to <8 x i64> |
| %b.wide = sext <8 x i8> %b to <8 x i64> |
| %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide |
| %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult) |
| ret <2 x i64> %partial.reduce |
| } |
| |
| ; Mixed source widths and mixed signedness: %a is <8 x i16> sign-extended and |
| ; %b is <8 x i8> zero-extended, both to <8 x i64>, before the multiply and the |
| ; 8 -> 2 lane reduction. |
| ; All three RUN configurations share one expected sequence (sshll for %a, ushll |
| ; for %b, then smlal/smlal2); no dot-product instruction is expected. |
| define <2 x i64> @sudot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){ |
| ; CHECK-COMMON-LABEL: sudot_different_types: |
| ; CHECK-COMMON: // %bb.0: // %entry |
| ; CHECK-COMMON-NEXT: ushll v2.8h, v2.8b, #0 |
| ; CHECK-COMMON-NEXT: sshll v3.4s, v1.4h, #0 |
| ; CHECK-COMMON-NEXT: sshll2 v1.4s, v1.8h, #0 |
| ; CHECK-COMMON-NEXT: ushll v4.4s, v2.4h, #0 |
| ; CHECK-COMMON-NEXT: ushll2 v2.4s, v2.8h, #0 |
| ; CHECK-COMMON-NEXT: smlal v0.2d, v3.2s, v4.2s |
| ; CHECK-COMMON-NEXT: smlal2 v0.2d, v3.4s, v4.4s |
| ; CHECK-COMMON-NEXT: smlal v0.2d, v1.2s, v2.2s |
| ; CHECK-COMMON-NEXT: smlal2 v0.2d, v1.4s, v2.4s |
| ; CHECK-COMMON-NEXT: ret |
| entry: |
| %a.wide = sext <8 x i16> %a to <8 x i64> |
| %b.wide = zext <8 x i8> %b to <8 x i64> |
| %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide |
| %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult) |
| ret <2 x i64> %partial.reduce |
| } |
| |
| ; Two mixed-sign partial reductions in one loop that share a single zext: |
| ; %zext (from %p3) is multiplied by the sext of the %p1 load and, separately, |
| ; by the sext of the %p2 load, each product feeding its own <4 x i32> |
| ; accumulator. The two accumulators are added together after the loop. |
| ; With +i8mm the expected loop body is two usdot instructions that both take |
| ; the %p3 load (v4) as their first source. Without +i8mm (with or without |
| ; +dotprod) the expected body widens with sshll/ushll and accumulates with |
| ; smlal/smlal2. |
| define <4 x i32> @usdot_multiple_zext_users(ptr %p1, ptr %p2, ptr %p3) { |
| ; CHECK-NODOT-LABEL: usdot_multiple_zext_users: |
| ; CHECK-NODOT: // %bb.0: // %entry |
| ; CHECK-NODOT-NEXT: movi v0.2d, #0000000000000000 |
| ; CHECK-NODOT-NEXT: movi v1.2d, #0000000000000000 |
| ; CHECK-NODOT-NEXT: mov x8, xzr |
| ; CHECK-NODOT-NEXT: .LBB28_1: // %vector.body |
| ; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-NODOT-NEXT: ldr q2, [x0, x8] |
| ; CHECK-NODOT-NEXT: ldr q3, [x1, x8] |
| ; CHECK-NODOT-NEXT: ldr q4, [x2, x8] |
| ; CHECK-NODOT-NEXT: add x8, x8, #16 |
| ; CHECK-NODOT-NEXT: sshll v5.8h, v2.8b, #0 |
| ; CHECK-NODOT-NEXT: ushll v6.8h, v4.8b, #0 |
| ; CHECK-NODOT-NEXT: sshll v7.8h, v3.8b, #0 |
| ; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0 |
| ; CHECK-NODOT-NEXT: ushll2 v4.8h, v4.16b, #0 |
| ; CHECK-NODOT-NEXT: sshll2 v3.8h, v3.16b, #0 |
| ; CHECK-NODOT-NEXT: cmp x8, #1024 |
| ; CHECK-NODOT-NEXT: smlal v0.4s, v5.4h, v6.4h |
| ; CHECK-NODOT-NEXT: smlal v1.4s, v7.4h, v6.4h |
| ; CHECK-NODOT-NEXT: smlal2 v0.4s, v5.8h, v6.8h |
| ; CHECK-NODOT-NEXT: smlal2 v1.4s, v7.8h, v6.8h |
| ; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v4.4h |
| ; CHECK-NODOT-NEXT: smlal v1.4s, v3.4h, v4.4h |
| ; CHECK-NODOT-NEXT: smlal2 v0.4s, v2.8h, v4.8h |
| ; CHECK-NODOT-NEXT: smlal2 v1.4s, v3.8h, v4.8h |
| ; CHECK-NODOT-NEXT: b.ne .LBB28_1 |
| ; CHECK-NODOT-NEXT: // %bb.2: // %end |
| ; CHECK-NODOT-NEXT: add v0.4s, v1.4s, v0.4s |
| ; CHECK-NODOT-NEXT: ret |
| ; |
| ; CHECK-DOT-LABEL: usdot_multiple_zext_users: |
| ; CHECK-DOT: // %bb.0: // %entry |
| ; CHECK-DOT-NEXT: movi v0.2d, #0000000000000000 |
| ; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000 |
| ; CHECK-DOT-NEXT: mov x8, xzr |
| ; CHECK-DOT-NEXT: .LBB28_1: // %vector.body |
| ; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-DOT-NEXT: ldr q2, [x0, x8] |
| ; CHECK-DOT-NEXT: ldr q3, [x1, x8] |
| ; CHECK-DOT-NEXT: ldr q4, [x2, x8] |
| ; CHECK-DOT-NEXT: add x8, x8, #16 |
| ; CHECK-DOT-NEXT: sshll v5.8h, v2.8b, #0 |
| ; CHECK-DOT-NEXT: ushll v6.8h, v4.8b, #0 |
| ; CHECK-DOT-NEXT: sshll v7.8h, v3.8b, #0 |
| ; CHECK-DOT-NEXT: sshll2 v2.8h, v2.16b, #0 |
| ; CHECK-DOT-NEXT: ushll2 v4.8h, v4.16b, #0 |
| ; CHECK-DOT-NEXT: sshll2 v3.8h, v3.16b, #0 |
| ; CHECK-DOT-NEXT: cmp x8, #1024 |
| ; CHECK-DOT-NEXT: smlal v0.4s, v5.4h, v6.4h |
| ; CHECK-DOT-NEXT: smlal v1.4s, v7.4h, v6.4h |
| ; CHECK-DOT-NEXT: smlal2 v0.4s, v5.8h, v6.8h |
| ; CHECK-DOT-NEXT: smlal2 v1.4s, v7.8h, v6.8h |
| ; CHECK-DOT-NEXT: smlal v0.4s, v2.4h, v4.4h |
| ; CHECK-DOT-NEXT: smlal v1.4s, v3.4h, v4.4h |
| ; CHECK-DOT-NEXT: smlal2 v0.4s, v2.8h, v4.8h |
| ; CHECK-DOT-NEXT: smlal2 v1.4s, v3.8h, v4.8h |
| ; CHECK-DOT-NEXT: b.ne .LBB28_1 |
| ; CHECK-DOT-NEXT: // %bb.2: // %end |
| ; CHECK-DOT-NEXT: add v0.4s, v1.4s, v0.4s |
| ; CHECK-DOT-NEXT: ret |
| ; |
| ; CHECK-DOT-I8MM-LABEL: usdot_multiple_zext_users: |
| ; CHECK-DOT-I8MM: // %bb.0: // %entry |
| ; CHECK-DOT-I8MM-NEXT: movi v0.2d, #0000000000000000 |
| ; CHECK-DOT-I8MM-NEXT: movi v1.2d, #0000000000000000 |
| ; CHECK-DOT-I8MM-NEXT: mov x8, xzr |
| ; CHECK-DOT-I8MM-NEXT: .LBB28_1: // %vector.body |
| ; CHECK-DOT-I8MM-NEXT: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-DOT-I8MM-NEXT: ldr q2, [x0, x8] |
| ; CHECK-DOT-I8MM-NEXT: ldr q3, [x1, x8] |
| ; CHECK-DOT-I8MM-NEXT: ldr q4, [x2, x8] |
| ; CHECK-DOT-I8MM-NEXT: add x8, x8, #16 |
| ; CHECK-DOT-I8MM-NEXT: usdot v0.4s, v4.16b, v2.16b |
| ; CHECK-DOT-I8MM-NEXT: usdot v1.4s, v4.16b, v3.16b |
| ; CHECK-DOT-I8MM-NEXT: cmp x8, #1024 |
| ; CHECK-DOT-I8MM-NEXT: b.ne .LBB28_1 |
| ; CHECK-DOT-I8MM-NEXT: // %bb.2: // %end |
| ; CHECK-DOT-I8MM-NEXT: add v0.4s, v1.4s, v0.4s |
| ; CHECK-DOT-I8MM-NEXT: ret |
| entry: |
| br label %vector.body |
| |
| vector.body: |
| ; %iv is the byte offset applied to all three pointers; %acc1 and %acc2 are the |
| ; two independent running sums. |
| %iv = phi i64 [ 0, %entry ], [ %iv.next, %vector.body ] |
| %acc1 = phi <4 x i32> [ zeroinitializer, %entry], [ %psum1, %vector.body] |
| %acc2 = phi <4 x i32> [ zeroinitializer, %entry], [ %psum2, %vector.body] |
| %ptr1 = getelementptr i8, ptr %p1, i64 %iv |
| %ptr2 = getelementptr i8, ptr %p2, i64 %iv |
| %ptr3 = getelementptr i8, ptr %p3, i64 %iv |
| %load1 = load <16 x i8>, ptr %ptr1 |
| %load2 = load <16 x i8>, ptr %ptr2 |
| %load3 = load <16 x i8>, ptr %ptr3 |
| %sext1 = sext <16 x i8> %load1 to <16 x i32> |
| ; %zext has two users: %mul1 and %mul2 below. |
| %zext = zext <16 x i8> %load3 to <16 x i32> |
| %mul1 = mul <16 x i32> %sext1, %zext |
| %psum1 = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc1, <16 x i32> %mul1) |
| %sext2 = sext <16 x i8> %load2 to <16 x i32> |
| %mul2 = mul <16 x i32> %sext2, %zext |
| %psum2 = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc2, <16 x i32> %mul2) |
| ; Advance by 16 bytes per iteration and exit once the offset reaches 1024. |
| %iv.next = add i64 %iv, 16 |
| %1 = icmp eq i64 %iv.next, 1024 |
| br i1 %1, label %end, label %vector.body |
| |
| end: |
| ; Combine the two partial sums into the returned <4 x i32>. |
| %2 = add <4 x i32> %psum2, %psum1 |
| ret <4 x i32> %2 |
| } |
| |
| ; zext-only partial reduction from i16 to i64: %input is zero-extended from |
| ; <8 x i16> to <8 x i64> and reduced into the <2 x i64> %acc. |
| ; All three RUN configurations share one expected sequence (ushll/ushll2 |
| ; followed by uaddw/uaddw2); no dot-product instruction is expected. |
| ; NOTE(review): unlike the other tests here, the intrinsic is called without a |
| ; type-mangling suffix; presumably the IR parser derives the overloaded name |
| ; from the call signature - confirm. |
| define <2 x i64> @udot_16to64(<2 x i64> %acc, <8 x i16> %input){ |
| ; CHECK-COMMON-LABEL: udot_16to64: |
| ; CHECK-COMMON: // %bb.0: // %entry |
| ; CHECK-COMMON-NEXT: ushll v2.4s, v1.4h, #0 |
| ; CHECK-COMMON-NEXT: ushll2 v1.4s, v1.8h, #0 |
| ; CHECK-COMMON-NEXT: uaddw v0.2d, v0.2d, v2.2s |
| ; CHECK-COMMON-NEXT: uaddw2 v0.2d, v0.2d, v2.4s |
| ; CHECK-COMMON-NEXT: uaddw v0.2d, v0.2d, v1.2s |
| ; CHECK-COMMON-NEXT: uaddw2 v0.2d, v0.2d, v1.4s |
| ; CHECK-COMMON-NEXT: ret |
| entry: |
| %input.wide = zext <8 x i16> %input to <8 x i64> |
| %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add(<2 x i64> %acc, <8 x i64> %input.wide) |
| ret <2 x i64> %partial.reduce |
| } |
| |