; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+sme-i16i64 -enable-subreg-liveness -force-streaming -verify-machineinstrs < %s | FileCheck %s
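; This file tests lowering of the SME2 lane-indexed vertical dot product
; intrinsics (FVDOT/BFVDOT/SVDOT/UVDOT/SUVDOT/USVDOT): the instruction
; selected for each intrinsic, the folding of a constant slice offset
; (slice and slice + 7) into the instruction, and, in the *_form_* tests,
; the allocation of multi-vector operands to strided register tuples.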

; == FVDOT ==
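; FVDOT performs a widening dot product of pairs of fp16 elements,
; accumulating into the 32-bit elements of a ZA slice (see the SME2
; specification for the exact semantics).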

define void @test_fvdot_lane_za32_vg1x2_nxv8f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm) {
; CHECK-LABEL: test_fvdot_lane_za32_vg1x2_nxv8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: fvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: fvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm, i32 3)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8f16(i32 %slice.7, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm, i32 3)
ret void
}

; == BFVDOT ==
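; Note: there is no separate bfvdot intrinsic; the bfloat overload of
; llvm.aarch64.sme.fvdot.lane.za32.vg1x2 selects the BFVDOT instruction.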

define void @test_fvdot_lane_za32_vg1x2_nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: test_fvdot_lane_za32_vg1x2_nxv8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: bfvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: bfvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm, i32 3)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm, i32 3)
ret void
}

; == SVDOT ==
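; SVDOT and UVDOT are the signed and unsigned integer forms. The za64
; variants rely on the +sme-i16i64 feature enabled on the RUN line.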

define void @test_svdot_lane_za32_vg1x2_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: test_svdot_lane_za32_vg1x2_nxv8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: svdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm, i32 3)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 %slice.7, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm, i32 3)
ret void
}

define void @test_svdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_svdot_lane_za32_vg1x4_nxv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: svdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm, i32 3)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 %slice.7, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm, i32 3)
ret void
}

define void @test_svdot_lane_za64_vg1x4_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: test_svdot_lane_za64_vg1x4_nxv8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: svdot za.d[w8, 0, vgx4], { z0.h - z3.h }, z4.h[1]
; CHECK-NEXT: svdot za.d[w8, 7, vgx4], { z0.h - z3.h }, z4.h[1]
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.svdot.lane.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zm, i32 1)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.svdot.lane.za64.vg1x4.nxv8i16(i32 %slice.7, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zm, i32 1)
ret void
}

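; The *_form_* tests check that, with -enable-subreg-liveness, the
; multi-vector loads are allocated to strided register forms (e.g.
; { z16.h, z24.h }) so that each vdot consumes a consecutive register
; tuple without extra copies.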
define void @svdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: svdot_form_2x_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: add x9, x0, x1
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0]
; CHECK-NEXT: ld1h { z17.h, z25.h }, pn8/z, [x9]
; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z16.h, z17.h }, z0.h[0]
; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z24.h, z25.h }, z0.h[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
%3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
%6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> poison, i32 0)
tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> poison, i32 0)
ret void
}

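; The *_svecc variants keep a scalable vector argument live in z0 across
; the dot products, pushing the tuples into callee-saved Z registers and
; exercising the spill/reload sequences checked below.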
define void @svdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 8 x i16> %scalable_arg) #0 {
; CHECK-LABEL: svdot_form_2x_tuple_svecc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: add x9, x0, x1
; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: ld1h { z2.h, z10.h }, pn8/z, [x0]
; CHECK-NEXT: ld1h { z3.h, z11.h }, pn8/z, [x9]
; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z2.h, z3.h }, z0.h[0]
; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z10.h, z11.h }, z0.h[0]
; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
%3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
%6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> poison, i32 0)
tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> poison, i32 0)
store <vscale x 8 x i16> %scalable_arg, ptr %ptr
ret void
}

define void @svdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: svdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
%4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
%10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
%mul3 = shl i64 %stride, 1
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
%11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
%mul5 = mul i64 %stride, 3
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
%16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
%17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
ret void
}

define void @svdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 8 x i16> %scalable_arg) #0 {
; CHECK-LABEL: svdot_form_4x_tuple_svecc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-9
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: addvl sp, sp, #9
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
%4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
%10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
%mul3 = shl i64 %stride, 1
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
%11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
%mul5 = mul i64 %stride, 3
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
%16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
%17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
store <vscale x 8 x i16> %scalable_arg, ptr %ptr
ret void
}

; == UVDOT ==

define void @test_uvdot_lane_za32_vg1x2_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: test_uvdot_lane_za32_vg1x2_nxv8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: uvdot za.s[w8, 7, vgx2], { z0.h, z1.h }, z2.h[3]
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm, i32 3)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 %slice.7, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zm, i32 3)
ret void
}

define void @test_uvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_uvdot_lane_za32_vg1x4_nxv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: uvdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm, i32 3)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 %slice.7, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm, i32 3)
ret void
}

define void @test_uvdot_lane_za64_vg1x4_nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: test_uvdot_lane_za64_vg1x4_nxv8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: uvdot za.d[w8, 0, vgx4], { z0.h - z3.h }, z4.h[1]
; CHECK-NEXT: uvdot za.d[w8, 7, vgx4], { z0.h - z3.h }, z4.h[1]
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.uvdot.lane.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zm, i32 1)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.uvdot.lane.za64.vg1x4.nxv8i16(i32 %slice.7, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zm, i32 1)
ret void
}

define void @uvdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: uvdot_form_2x_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: add x9, x0, x1
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0]
; CHECK-NEXT: ld1h { z17.h, z25.h }, pn8/z, [x9]
; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z16.h, z17.h }, z0.h[0]
; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z24.h, z25.h }, z0.h[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
%3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
%6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> poison, i32 0)
tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> poison, i32 0)
ret void
}

define void @uvdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 8 x i16> %scalable_arg) #0 {
; CHECK-LABEL: uvdot_form_2x_tuple_svecc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: add x9, x0, x1
; CHECK-NEXT: str z11, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: str z10, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: ld1h { z2.h, z10.h }, pn8/z, [x0]
; CHECK-NEXT: ld1h { z3.h, z11.h }, pn8/z, [x9]
; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z2.h, z3.h }, z0.h[0]
; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z10.h, z11.h }, z0.h[0]
; CHECK-NEXT: ldr z11, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
%3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
%6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> poison, i32 0)
tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> poison, i32 0)
store <vscale x 8 x i16> %scalable_arg, ptr %ptr
ret void
}

define void @uvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: uvdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
%4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
%10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
%mul3 = shl i64 %stride, 1
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
%11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
%mul5 = mul i64 %stride, 3
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
%16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
%17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
ret void
}

define void @uvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 8 x i16> %scalable_arg) #0 {
; CHECK-LABEL: uvdot_form_4x_tuple_svecc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-9
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: addvl sp, sp, #9
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
%4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
%10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
%mul3 = shl i64 %stride, 1
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
%11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
%mul5 = mul i64 %stride, 3
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
%16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
%17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
store <vscale x 8 x i16> %scalable_arg, ptr %ptr
ret void
}

; == SUVDOT ==
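; SUVDOT is the signed-by-unsigned mixed integer form; USVDOT (below) is
; the unsigned-by-signed form.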

define void @test_suvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_suvdot_lane_za32_vg1x4_nxv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: suvdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm, i32 3)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 %slice.7, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm, i32 3)
ret void
}

define void @suvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: suvdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
%4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
%10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
%mul3 = shl i64 %stride, 1
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
%11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
%mul5 = mul i64 %stride, 3
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
%16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
%17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
ret void
}

define void @suvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 8 x i16> %scalable_arg) #0 {
; CHECK-LABEL: suvdot_form_4x_tuple_svecc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-9
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: addvl sp, sp, #9
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
%4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
%10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
%mul3 = shl i64 %stride, 1
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
%11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
%mul5 = mul i64 %stride, 3
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
%16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
%17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
store <vscale x 8 x i16> %scalable_arg, ptr %ptr
ret void
}

; == USVDOT ==

define void @test_usvdot_lane_za32_vg1x4_nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: test_usvdot_lane_za32_vg1x4_nxv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: usvdot za.s[w8, 7, vgx4], { z0.b - z3.b }, z4.b[3]
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm, i32 3)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 %slice.7, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zm, i32 3)
ret void
}

define void @usvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
; CHECK-LABEL: usvdot_form_4x_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
%4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
%10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
%mul3 = shl i64 %stride, 1
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
%11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
%mul5 = mul i64 %stride, 3
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
%16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
%17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
ret void
}

define void @usvdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 8 x i16> %scalable_arg) #0 {
; CHECK-LABEL: usvdot_form_4x_tuple_svecc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-9
; CHECK-NEXT: lsl x9, x1, #1
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: add x10, x9, x1
; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
; CHECK-NEXT: ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: addvl sp, sp, #9
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
%4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
%8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
%9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
%10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
%mul3 = shl i64 %stride, 1
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
%11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
%12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
%13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
%14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
%15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
%mul5 = mul i64 %stride, 3
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
%16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
%17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
%18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
%19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
%20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
store <vscale x 8 x i16> %scalable_arg, ptr %ptr
ret void
}

attributes #0 = { nounwind "target-features"="+sme2" }
attributes #1 = { nounwind "target-features"="+sme2,+sme-i16i64" }

; == FVDOT ==
declare void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
declare void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)

; == SVDOT ==
declare void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.svdot.lane.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)

; == UVDOT ==
declare void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
declare void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
declare void @llvm.aarch64.sme.uvdot.lane.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)

; == SUVDOT ==
declare void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)

; == USVDOT ==
declare void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)