| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming -enable-subreg-liveness -verify-machineinstrs < %s | FileCheck %s |
| |
| ; |
| ; Move Multi-Vector To Tile (Write) x 2 |
| ; |
| |
| ; Horizontal |
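; The first two i32 operands of the intrinsic select the ZA tile and the
; slice index. Lowering copies the slice index into one of w12-w15 (the
; only base registers the tile-slice mova form accepts) and folds any
; constant added to the slice into the instruction's immediate offset, so
; the checks below expect offsets such as 0:1 and 14:15 rather than a
; separate add.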
| |
| define void @za_write_vg2_horiz_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) { |
| ; CHECK-LABEL: za_write_vg2_horiz_b: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0h.b[w12, 0:1], { z0.b, z1.b } |
| ; CHECK-NEXT: mov za0h.b[w12, 14:15], { z0.b, z1.b } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) |
| %slice.14 = add i32 %slice, 14 |
| call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 %slice.14, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) |
| ret void |
| } |
| |
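; With -enable-subreg-liveness, the two-vector strided load returns the
; non-consecutive pair { z16, z24 }, while mova requires consecutive
; registers. This test checks that the register allocator still forms the
; { z16, z17 } and { z24, z25 } pairs without extra copies.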
| define void @za_write_vg2_horiz_b_tuple(i64 %stride, ptr %ptr) { |
| ; CHECK-LABEL: za_write_vg2_horiz_b_tuple: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: ptrue pn8.b |
| ; CHECK-NEXT: mov w12, wzr |
| ; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x1] |
| ; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x1, x0] |
| ; CHECK-NEXT: mov za0h.b[w12, 0:1], { z16.b, z17.b } |
| ; CHECK-NEXT: mov za0h.b[w12, 0:1], { z24.b, z25.b } |
| ; CHECK-NEXT: ret |
| entry: |
| %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() |
| %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) |
| %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0 |
| %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1 |
| %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride |
| %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) |
| %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0 |
| %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1 |
| call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5) |
| call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6) |
| ret void |
| } |
| |
| define void @za_write_vg2_horiz_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) { |
| ; CHECK-LABEL: za_write_vg2_horiz_h: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0h.h[w12, 0:1], { z0.h, z1.h } |
| ; CHECK-NEXT: mov za1h.h[w12, 6:7], { z0.h, z1.h } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32 0, i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) |
| %slice.6 = add i32 %slice, 6 |
| call void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32 1, i32 %slice.6, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) |
| ret void |
| } |
| |
| define void @za_write_vg2_horiz_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2) { |
| ; CHECK-LABEL: za_write_vg2_horiz_f16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0h.h[w12, 0:1], { z0.h, z1.h } |
| ; CHECK-NEXT: mov za1h.h[w12, 6:7], { z0.h, z1.h } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.hor.vg2.nxv8f16(i32 0, i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2) |
| %slice.6 = add i32 %slice, 6 |
| call void @llvm.aarch64.sme.write.hor.vg2.nxv8f16(i32 1, i32 %slice.6, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2) |
| ret void |
| } |
| |
| define void @za_write_vg2_horiz_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2) { |
| ; CHECK-LABEL: za_write_vg2_horiz_bf16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0h.h[w12, 0:1], { z0.h, z1.h } |
| ; CHECK-NEXT: mov za1h.h[w12, 6:7], { z0.h, z1.h } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.hor.vg2.nxv8bf16(i32 0, i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2) |
| %slice.6 = add i32 %slice, 6 |
| call void @llvm.aarch64.sme.write.hor.vg2.nxv8bf16(i32 1, i32 %slice.6, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2) |
| ret void |
| } |
| |
| define void @za_write_vg2_horiz_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2) { |
| ; CHECK-LABEL: za_write_vg2_horiz_s: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0h.s[w12, 0:1], { z0.s, z1.s } |
| ; CHECK-NEXT: mov za3h.s[w12, 2:3], { z0.s, z1.s } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32 0, i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2) |
| %slice.2 = add i32 %slice, 2 |
| call void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32 3, i32 %slice.2, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2) |
| ret void |
| } |
| |
| define void @za_write_vg2_horiz_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2) { |
| ; CHECK-LABEL: za_write_vg2_horiz_f32: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0h.s[w12, 0:1], { z0.s, z1.s } |
| ; CHECK-NEXT: mov za3h.s[w12, 2:3], { z0.s, z1.s } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.hor.vg2.nxv4f32(i32 0, i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2) |
| %slice.2 = add i32 %slice, 2 |
| call void @llvm.aarch64.sme.write.hor.vg2.nxv4f32(i32 3, i32 %slice.2, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2) |
| ret void |
| } |
| |
| define void @za_write_vg2_horiz_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2) { |
| ; CHECK-LABEL: za_write_vg2_horiz_d: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0h.d[w12, 0:1], { z0.d, z1.d } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32 0, i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2) |
| ret void |
| } |
| |
| define void @za_write_vg2_horiz_f64(i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2) { |
| ; CHECK-LABEL: za_write_vg2_horiz_f64: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0h.d[w12, 0:1], { z0.d, z1.d } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.hor.vg2.nxv2f64(i32 0, i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2) |
| ret void |
| } |
| |
| ; Vertical |
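; The vertical tests mirror the horizontal ones above; only the .ver
; intrinsic suffix and the za0v/za1v/za3v slice selectors differ.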
| |
| define void @za_write_vg2_vert_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) { |
| ; CHECK-LABEL: za_write_vg2_vert_b: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0v.b[w12, 0:1], { z0.b, z1.b } |
| ; CHECK-NEXT: mov za0v.b[w12, 14:15], { z0.b, z1.b } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32 0, i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) |
| %slice.14 = add i32 %slice, 14 |
| call void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32 0, i32 %slice.14, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) |
| ret void |
| } |
| |
| define void @za_write_vg2_vert_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) { |
| ; CHECK-LABEL: za_write_vg2_vert_h: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0v.h[w12, 0:1], { z0.h, z1.h } |
| ; CHECK-NEXT: mov za1v.h[w12, 6:7], { z0.h, z1.h } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32 0, i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) |
| %slice.6 = add i32 %slice, 6 |
| call void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32 1, i32 %slice.6, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) |
| ret void |
| } |
| |
| define void @za_write_vg2_vert_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2) { |
| ; CHECK-LABEL: za_write_vg2_vert_f16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0v.h[w12, 0:1], { z0.h, z1.h } |
| ; CHECK-NEXT: mov za1v.h[w12, 6:7], { z0.h, z1.h } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.ver.vg2.nxv8f16(i32 0, i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2) |
| %slice.6 = add i32 %slice, 6 |
| call void @llvm.aarch64.sme.write.ver.vg2.nxv8f16(i32 1, i32 %slice.6, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2) |
| ret void |
| } |
| |
| define void @za_write_vg2_vert_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2) { |
| ; CHECK-LABEL: za_write_vg2_vert_bf16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0v.h[w12, 0:1], { z0.h, z1.h } |
| ; CHECK-NEXT: mov za1v.h[w12, 6:7], { z0.h, z1.h } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.ver.vg2.nxv8bf16(i32 0, i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2) |
| %slice.6 = add i32 %slice, 6 |
| call void @llvm.aarch64.sme.write.ver.vg2.nxv8bf16(i32 1, i32 %slice.6, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2) |
| ret void |
| } |
| |
| define void @za_write_vg2_vert_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2) { |
| ; CHECK-LABEL: za_write_vg2_vert_s: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0v.s[w12, 0:1], { z0.s, z1.s } |
| ; CHECK-NEXT: mov za3v.s[w12, 2:3], { z0.s, z1.s } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32 0, i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2) |
| %slice.2 = add i32 %slice, 2 |
| call void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32 3, i32 %slice.2, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2) |
| ret void |
| } |
| |
| define void @za_write_vg2_vert_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2) { |
| ; CHECK-LABEL: za_write_vg2_vert_f32: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0v.s[w12, 0:1], { z0.s, z1.s } |
| ; CHECK-NEXT: mov za3v.s[w12, 2:3], { z0.s, z1.s } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.ver.vg2.nxv4f32(i32 0, i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2) |
| %slice.2 = add i32 %slice, 2 |
| call void @llvm.aarch64.sme.write.ver.vg2.nxv4f32(i32 3, i32 %slice.2, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2) |
| ret void |
| } |
| |
| define void @za_write_vg2_vert_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2) { |
| ; CHECK-LABEL: za_write_vg2_vert_d: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0v.d[w12, 0:1], { z0.d, z1.d } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32 0, i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2) |
| ret void |
| } |
| |
| define void @za_write_vg2_vert_f64(i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2) { |
| ; CHECK-LABEL: za_write_vg2_vert_f64: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0v.d[w12, 0:1], { z0.d, z1.d } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.ver.vg2.nxv2f64(i32 0, i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2) |
| ret void |
| } |
| |
| ; |
| ; Move Multi-Vector To Tile (Write) x 4 |
| ; |
| |
| ; Horizontal |
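; The four-vector form writes four consecutive slices, so the immediate
; offset is a multiple of four (0:3, 4:7, 12:15 below). For the 32-bit
; and 64-bit element types only the 0:3 offset is exercised, which is why
; those functions make a single call.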
| |
| define void @za_write_vg4_horiz_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) { |
| ; CHECK-LABEL: za_write_vg4_horiz_b: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0h.b[w12, 0:3], { z0.b - z3.b } |
| ; CHECK-NEXT: mov za0h.b[w12, 12:15], { z0.b - z3.b } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32 0, i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) |
| %slice.12 = add i32 %slice, 12 |
| call void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32 0, i32 %slice.12, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) |
| ret void |
| } |
| |
| define void @za_write_vg4_horiz_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) { |
| ; CHECK-LABEL: za_write_vg4_horiz_h: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0h.h[w12, 0:3], { z0.h - z3.h } |
| ; CHECK-NEXT: mov za1h.h[w12, 4:7], { z0.h - z3.h } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32 0, i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) |
| %slice.4 = add i32 %slice, 4 |
| call void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32 1, i32 %slice.4, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) |
| ret void |
| } |
| |
| define void @za_write_vg4_horiz_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4) { |
| ; CHECK-LABEL: za_write_vg4_horiz_f16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0h.h[w12, 0:3], { z0.h - z3.h } |
| ; CHECK-NEXT: mov za1h.h[w12, 4:7], { z0.h - z3.h } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.hor.vg4.nxv8f16(i32 0, i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4) |
| %slice.4 = add i32 %slice, 4 |
| call void @llvm.aarch64.sme.write.hor.vg4.nxv8f16(i32 1, i32 %slice.4, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4) |
| ret void |
| } |
| |
| define void @za_write_vg4_horiz_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4) { |
| ; CHECK-LABEL: za_write_vg4_horiz_bf16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0h.h[w12, 0:3], { z0.h - z3.h } |
| ; CHECK-NEXT: mov za1h.h[w12, 4:7], { z0.h - z3.h } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.hor.vg4.nxv8bf16(i32 0, i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4) |
| %slice.4 = add i32 %slice, 4 |
| call void @llvm.aarch64.sme.write.hor.vg4.nxv8bf16(i32 1, i32 %slice.4, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4) |
| ret void |
| } |
| |
| define void @za_write_vg4_horiz_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4) { |
| ; CHECK-LABEL: za_write_vg4_horiz_s: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0h.s[w12, 0:3], { z0.s - z3.s } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32 0, i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4) |
| ret void |
| } |
| |
| define void @za_write_vg4_horiz_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4) { |
| ; CHECK-LABEL: za_write_vg4_horiz_f32: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0h.s[w12, 0:3], { z0.s - z3.s } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.hor.vg4.nxv4f32(i32 0, i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4) |
| ret void |
| } |
| |
| define void @za_write_vg4_horiz_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4) { |
| ; CHECK-LABEL: za_write_vg4_horiz_d: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0h.d[w12, 0:3], { z0.d - z3.d } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32 0, i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4) |
| ret void |
| } |
| |
| define void @za_write_vg4_horiz_f64(i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, <vscale x 2 x double> %zn4) { |
| ; CHECK-LABEL: za_write_vg4_horiz_f64: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0h.d[w12, 0:3], { z0.d - z3.d } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.hor.vg4.nxv2f64(i32 0, i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, <vscale x 2 x double> %zn4) |
| ret void |
| } |
| |
| ; Vertical |
| |
| define void @za_write_vg4_vert_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) { |
| ; CHECK-LABEL: za_write_vg4_vert_b: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0v.b[w12, 0:3], { z0.b - z3.b } |
| ; CHECK-NEXT: mov za0v.b[w12, 12:15], { z0.b - z3.b } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32 0, i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) |
| %slice.12 = add i32 %slice, 12 |
| call void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32 0, i32 %slice.12, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) |
| ret void |
| } |
| |
| define void @za_write_vg4_vert_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) { |
| ; CHECK-LABEL: za_write_vg4_vert_h: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0v.h[w12, 0:3], { z0.h - z3.h } |
| ; CHECK-NEXT: mov za1v.h[w12, 4:7], { z0.h - z3.h } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32 0, i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) |
| %slice.4 = add i32 %slice, 4 |
| call void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32 1, i32 %slice.4, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) |
| ret void |
| } |
| |
| define void @za_write_vg4_vert_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4) { |
| ; CHECK-LABEL: za_write_vg4_vert_f16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0v.h[w12, 0:3], { z0.h - z3.h } |
| ; CHECK-NEXT: mov za1v.h[w12, 4:7], { z0.h - z3.h } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.ver.vg4.nxv8f16(i32 0, i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4) |
| %slice.4 = add i32 %slice, 4 |
| call void @llvm.aarch64.sme.write.ver.vg4.nxv8f16(i32 1, i32 %slice.4, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4) |
| ret void |
| } |
| |
| define void @za_write_vg4_vert_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4) { |
| ; CHECK-LABEL: za_write_vg4_vert_bf16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0v.h[w12, 0:3], { z0.h - z3.h } |
| ; CHECK-NEXT: mov za1v.h[w12, 4:7], { z0.h - z3.h } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.ver.vg4.nxv8bf16(i32 0, i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4) |
| %slice.4 = add i32 %slice, 4 |
| call void @llvm.aarch64.sme.write.ver.vg4.nxv8bf16(i32 1, i32 %slice.4, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4) |
| ret void |
| } |
| |
| define void @za_write_vg4_vert_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4) { |
| ; CHECK-LABEL: za_write_vg4_vert_s: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0v.s[w12, 0:3], { z0.s - z3.s } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32 0, i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4) |
| ret void |
| } |
| |
| define void @za_write_vg4_vert_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4) { |
| ; CHECK-LABEL: za_write_vg4_vert_f32: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0v.s[w12, 0:3], { z0.s - z3.s } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.ver.vg4.nxv4f32(i32 0, i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4) |
| ret void |
| } |
| |
| define void @za_write_vg4_vert_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4) { |
| ; CHECK-LABEL: za_write_vg4_vert_d: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0v.d[w12, 0:3], { z0.d - z3.d } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32 0, i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4) |
| ret void |
| } |
| |
| define void @za_write_vg4_vert_f64(i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, <vscale x 2 x double> %zn4) { |
| ; CHECK-LABEL: za_write_vg4_vert_f64: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: mov za0v.d[w12, 0:3], { z0.d - z3.d } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 0, i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, <vscale x 2 x double> %zn4) |
| ret void |
| } |
| |
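; As in the x2 tuple test above, the strided four-vector loads return
; non-consecutive registers (e.g. { z16, z20, z24, z28 }), and the checks
; verify that the consecutive quads { z16 - z19 } through { z28 - z31 }
; required by mova are formed directly.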
| define void @za_write_vg4_vert_f64_tuple(i64 %stride, ptr %ptr) { |
| ; CHECK-LABEL: za_write_vg4_vert_f64_tuple: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: lsl x8, x0, #1 |
| ; CHECK-NEXT: add x9, x1, x0 |
| ; CHECK-NEXT: ptrue pn8.b |
| ; CHECK-NEXT: ld1d { z16.d, z20.d, z24.d, z28.d }, pn8/z, [x1] |
| ; CHECK-NEXT: ld1d { z17.d, z21.d, z25.d, z29.d }, pn8/z, [x9] |
| ; CHECK-NEXT: mov w12, wzr |
| ; CHECK-NEXT: add x10, x1, x8 |
| ; CHECK-NEXT: add x8, x9, x8 |
| ; CHECK-NEXT: ld1d { z18.d, z22.d, z26.d, z30.d }, pn8/z, [x10] |
| ; CHECK-NEXT: ld1d { z19.d, z23.d, z27.d, z31.d }, pn8/z, [x8] |
| ; CHECK-NEXT: mov za0v.d[w12, 0:3], { z16.d - z19.d } |
| ; CHECK-NEXT: mov za0v.d[w12, 0:3], { z20.d - z23.d } |
| ; CHECK-NEXT: mov za0v.d[w12, 0:3], { z24.d - z27.d } |
| ; CHECK-NEXT: mov za0v.d[w12, 0:3], { z28.d - z31.d } |
| ; CHECK-NEXT: ret |
| entry: |
| %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() |
| %1 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %0, ptr %ptr) |
| %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %1, 0 |
| %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %1, 1 |
| %4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %1, 2 |
| %5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %1, 3 |
| %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride |
| %6 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %0, ptr %arrayidx2) |
| %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %6, 0 |
| %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %6, 1 |
| %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %6, 2 |
| %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %6, 3 |
| %mul3 = shl i64 %stride, 1 |
| %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 |
| %11 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %0, ptr %arrayidx4) |
| %12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %11, 0 |
| %13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %11, 1 |
| %14 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %11, 2 |
| %15 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %11, 3 |
| %mul5 = mul i64 %stride, 3 |
| %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 |
| %16 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %0, ptr %arrayidx6) |
| %17 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %16, 0 |
| %18 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %16, 1 |
| %19 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %16, 2 |
| %20 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %16, 3 |
| call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 0, i32 0, <vscale x 2 x double> %2, <vscale x 2 x double> %7, <vscale x 2 x double> %12, <vscale x 2 x double> %17) |
| call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 0, i32 0, <vscale x 2 x double> %3, <vscale x 2 x double> %8, <vscale x 2 x double> %13, <vscale x 2 x double> %18) |
| call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 0, i32 0, <vscale x 2 x double> %4, <vscale x 2 x double> %9, <vscale x 2 x double> %14, <vscale x 2 x double> %19) |
| call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 0, i32 0, <vscale x 2 x double> %5, <vscale x 2 x double> %10, <vscale x 2 x double> %15, <vscale x 2 x double> %20) |
| ret void |
| } |
| |
| ; |
; Move Multi-Vector To ZA (Write) x 2
| ; |
| |
| define void @za_write_vg1x2_b(i32 %slice, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2) { |
| ; CHECK-LABEL: za_write_vg1x2_b: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w8, w0 |
| ; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d } |
| ; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2) |
| %slice.7 = add i32 %slice, 7 |
| call void @llvm.aarch64.sme.write.vg1x2.nxv16i8(i32 %slice.7, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2) |
| ret void |
| } |
| |
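; As with the tile-write tuple tests, this checks that the strided load
; results are regrouped into the consecutive pairs the mov expects.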
| define void @za_write_vg1x2_b_tuple(i64 %stride, ptr %ptr) { |
| ; CHECK-LABEL: za_write_vg1x2_b_tuple: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: ptrue pn8.b |
| ; CHECK-NEXT: mov w8, wzr |
| ; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x1] |
| ; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x1, x0] |
| ; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z16.d, z17.d } |
| ; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z24.d, z25.d } |
| ; CHECK-NEXT: ret |
| entry: |
| %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() |
| %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr) |
| %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0 |
| %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1 |
| %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride |
| %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2) |
| %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0 |
| %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1 |
| call void @llvm.aarch64.sme.write.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5) |
| call void @llvm.aarch64.sme.write.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6) |
| ret void |
| } |
| |
| define void @za_write_vg1x2_h(i32 %slice, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2) { |
| ; CHECK-LABEL: za_write_vg1x2_h: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w8, w0 |
| ; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d } |
| ; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2) |
| %slice.7 = add i32 %slice, 7 |
| call void @llvm.aarch64.sme.write.vg1x2.nxv8i16(i32 %slice.7, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2) |
| ret void |
| } |
| |
| define void @za_write_vg1x2_f16(i32 %slice, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2) { |
| ; CHECK-LABEL: za_write_vg1x2_f16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w8, w0 |
| ; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d } |
| ; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2) |
| %slice.7 = add i32 %slice, 7 |
| call void @llvm.aarch64.sme.write.vg1x2.nxv8f16(i32 %slice.7, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2) |
| ret void |
| } |
| |
| define void @za_write_vg1x2_bf16(i32 %slice, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2) { |
| ; CHECK-LABEL: za_write_vg1x2_bf16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w8, w0 |
| ; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d } |
| ; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2) |
| %slice.7 = add i32 %slice, 7 |
| call void @llvm.aarch64.sme.write.vg1x2.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2) |
| ret void |
| } |
| |
| define void @za_write_vg1x2_s(i32 %slice, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2) { |
| ; CHECK-LABEL: za_write_vg1x2_s: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w8, w0 |
| ; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d } |
| ; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.vg1x2.nxv4i32(i32 %slice, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2) |
| %slice.7 = add i32 %slice, 7 |
| call void @llvm.aarch64.sme.write.vg1x2.nxv4i32(i32 %slice.7, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2) |
| ret void |
| } |
| |
| define void @za_write_vg1x2_f32(i32 %slice, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2) { |
| ; CHECK-LABEL: za_write_vg1x2_f32: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w8, w0 |
| ; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d } |
| ; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.vg1x2.nxv4f32(i32 %slice, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2) |
| %slice.7 = add i32 %slice, 7 |
| call void @llvm.aarch64.sme.write.vg1x2.nxv4f32(i32 %slice.7, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2) |
| ret void |
| } |
| |
| define void @za_write_vg1x2_d(i32 %slice, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2) { |
| ; CHECK-LABEL: za_write_vg1x2_d: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w8, w0 |
| ; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d } |
| ; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 %slice, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2) |
| %slice.7 = add i32 %slice, 7 |
| call void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 %slice.7, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2) |
| ret void |
| } |
| |
| define void @za_write_vg1x2_f64(i32 %slice, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2) { |
| ; CHECK-LABEL: za_write_vg1x2_f64: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w8, w0 |
| ; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d } |
| ; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.vg1x2.nxv2f64(i32 %slice, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2) |
| %slice.7 = add i32 %slice, 7 |
| call void @llvm.aarch64.sme.write.vg1x2.nxv2f64(i32 %slice.7, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2) |
| ret void |
| } |
| |
| ; |
; Move Multi-Vector To ZA (Write) x 4
| ; |
| |
| define void @za_write_vg1x4_b(i32 %slice, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2, <vscale x 16 x i8> %za3, <vscale x 16 x i8> %za4) { |
| ; CHECK-LABEL: za_write_vg1x4_b: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w8, w0 |
| ; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d } |
| ; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2, <vscale x 16 x i8> %za3, <vscale x 16 x i8> %za4) |
| %slice.7 = add i32 %slice, 7 |
| call void @llvm.aarch64.sme.write.vg1x4.nxv16i8(i32 %slice.7, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2, <vscale x 16 x i8> %za3, <vscale x 16 x i8> %za4) |
| ret void |
| } |
| |
| define void @za_write_vg1x4_h(i32 %slice, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2, <vscale x 8 x i16> %za3, <vscale x 8 x i16> %za4) { |
| ; CHECK-LABEL: za_write_vg1x4_h: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w8, w0 |
| ; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d } |
| ; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2, <vscale x 8 x i16> %za3, <vscale x 8 x i16> %za4) |
| %slice.7 = add i32 %slice, 7 |
| call void @llvm.aarch64.sme.write.vg1x4.nxv8i16(i32 %slice.7, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2, <vscale x 8 x i16> %za3, <vscale x 8 x i16> %za4) |
| ret void |
| } |
| |
| define void @za_write_vg1x4_f16(i32 %slice, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2, <vscale x 8 x half> %za3, <vscale x 8 x half> %za4) { |
| ; CHECK-LABEL: za_write_vg1x4_f16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w8, w0 |
| ; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d } |
| ; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2, <vscale x 8 x half> %za3, <vscale x 8 x half> %za4) |
| %slice.7 = add i32 %slice, 7 |
| call void @llvm.aarch64.sme.write.vg1x4.nxv8f16(i32 %slice.7, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2, <vscale x 8 x half> %za3, <vscale x 8 x half> %za4) |
| ret void |
| } |
| |
| define void @za_write_vg1x4_bf16(i32 %slice, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2, <vscale x 8 x bfloat> %za3, <vscale x 8 x bfloat> %za4) { |
| ; CHECK-LABEL: za_write_vg1x4_bf16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w8, w0 |
| ; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d } |
| ; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2, <vscale x 8 x bfloat> %za3, <vscale x 8 x bfloat> %za4) |
| %slice.7 = add i32 %slice, 7 |
| call void @llvm.aarch64.sme.write.vg1x4.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2, <vscale x 8 x bfloat> %za3, <vscale x 8 x bfloat> %za4) |
| ret void |
| } |
| |
| define void @za_write_vg1x4_s(i32 %slice, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2, <vscale x 4 x i32> %za3, <vscale x 4 x i32> %za4) { |
| ; CHECK-LABEL: za_write_vg1x4_s: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w8, w0 |
| ; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d } |
| ; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.vg1x4.nxv4i32(i32 %slice, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2, <vscale x 4 x i32> %za3, <vscale x 4 x i32> %za4) |
| %slice.7 = add i32 %slice, 7 |
| call void @llvm.aarch64.sme.write.vg1x4.nxv4i32(i32 %slice.7, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2, <vscale x 4 x i32> %za3, <vscale x 4 x i32> %za4) |
| ret void |
| } |
| |
| define void @za_write_vg1x4_f32(i32 %slice, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2, <vscale x 4 x float> %za3, <vscale x 4 x float> %za4) { |
| ; CHECK-LABEL: za_write_vg1x4_f32: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w8, w0 |
| ; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d } |
| ; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.vg1x4.nxv4f32(i32 %slice, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2, <vscale x 4 x float> %za3, <vscale x 4 x float> %za4) |
| %slice.7 = add i32 %slice, 7 |
| call void @llvm.aarch64.sme.write.vg1x4.nxv4f32(i32 %slice.7, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2, <vscale x 4 x float> %za3, <vscale x 4 x float> %za4) |
| ret void |
| } |
| |
| define void @za_write_vg1x4_d(i32 %slice, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2, <vscale x 2 x i64> %za3, <vscale x 2 x i64> %za4) { |
| ; CHECK-LABEL: za_write_vg1x4_d: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w8, w0 |
| ; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d } |
| ; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32 %slice, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2, <vscale x 2 x i64> %za3, <vscale x 2 x i64> %za4) |
| %slice.7 = add i32 %slice, 7 |
| call void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32 %slice.7, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2, <vscale x 2 x i64> %za3, <vscale x 2 x i64> %za4) |
| ret void |
| } |
| |
| define void @za_write_vg1x4_f64(i32 %slice, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2, <vscale x 2 x double> %za3, <vscale x 2 x double> %za4) { |
| ; CHECK-LABEL: za_write_vg1x4_f64: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w8, w0 |
| ; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d } |
| ; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d } |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 %slice, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2, <vscale x 2 x double> %za3, <vscale x 2 x double> %za4) |
| %slice.7 = add i32 %slice, 7 |
| call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 %slice.7, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2, <vscale x 2 x double> %za3, <vscale x 2 x double> %za4) |
| ret void |
| } |
| |
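; Four strided loads feed four grouped writes; the checks verify that the
; consecutive quads are formed from the strided results, as in the
; earlier tuple tests.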
| define void @za_write_vg1x4_f64_tuple(i64 %stride, ptr %ptr) { |
| ; CHECK-LABEL: za_write_vg1x4_f64_tuple: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: lsl x9, x0, #1 |
| ; CHECK-NEXT: add x10, x1, x0 |
| ; CHECK-NEXT: ptrue pn8.b |
| ; CHECK-NEXT: ld1d { z16.d, z20.d, z24.d, z28.d }, pn8/z, [x1] |
| ; CHECK-NEXT: ld1d { z17.d, z21.d, z25.d, z29.d }, pn8/z, [x10] |
| ; CHECK-NEXT: mov w8, wzr |
| ; CHECK-NEXT: add x11, x1, x9 |
| ; CHECK-NEXT: add x9, x10, x9 |
| ; CHECK-NEXT: ld1d { z18.d, z22.d, z26.d, z30.d }, pn8/z, [x11] |
| ; CHECK-NEXT: ld1d { z19.d, z23.d, z27.d, z31.d }, pn8/z, [x9] |
| ; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z16.d - z19.d } |
| ; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z20.d - z23.d } |
| ; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z24.d - z27.d } |
| ; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z28.d - z31.d } |
| ; CHECK-NEXT: ret |
| entry: |
| %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8() |
| %1 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %0, ptr %ptr) |
| %2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %1, 0 |
| %3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %1, 1 |
| %4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %1, 2 |
| %5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %1, 3 |
| %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride |
| %6 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %0, ptr %arrayidx2) |
| %7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %6, 0 |
| %8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %6, 1 |
| %9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %6, 2 |
| %10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %6, 3 |
| %mul3 = shl i64 %stride, 1 |
| %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3 |
| %11 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %0, ptr %arrayidx4) |
| %12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %11, 0 |
| %13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %11, 1 |
| %14 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %11, 2 |
| %15 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %11, 3 |
| %mul5 = mul i64 %stride, 3 |
| %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5 |
| %16 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %0, ptr %arrayidx6) |
| %17 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %16, 0 |
| %18 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %16, 1 |
| %19 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %16, 2 |
| %20 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %16, 3 |
| call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 0, <vscale x 2 x double> %2, <vscale x 2 x double> %7, <vscale x 2 x double> %12, <vscale x 2 x double> %17) |
| call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 0, <vscale x 2 x double> %3, <vscale x 2 x double> %8, <vscale x 2 x double> %13, <vscale x 2 x double> %18) |
| call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 0, <vscale x 2 x double> %4, <vscale x 2 x double> %9, <vscale x 2 x double> %14, <vscale x 2 x double> %19) |
| call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 0, <vscale x 2 x double> %5, <vscale x 2 x double> %10, <vscale x 2 x double> %15, <vscale x 2 x double> %20) |
| ret void |
| } |
| |
| declare void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32, i32, <vscale x 16 x i8>, <vscale x 16 x i8>) |
| declare void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32, i32, <vscale x 8 x i16>, <vscale x 8 x i16>) |
| declare void @llvm.aarch64.sme.write.hor.vg2.nxv8f16(i32, i32, <vscale x 8 x half>, <vscale x 8 x half>) |
| declare void @llvm.aarch64.sme.write.hor.vg2.nxv8bf16(i32, i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>) |
| declare void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32, i32, <vscale x 4 x i32>, <vscale x 4 x i32>) |
| declare void @llvm.aarch64.sme.write.hor.vg2.nxv4f32(i32, i32, <vscale x 4 x float>, <vscale x 4 x float>) |
| declare void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32, i32, <vscale x 2 x i64>, <vscale x 2 x i64>) |
| declare void @llvm.aarch64.sme.write.hor.vg2.nxv2f64(i32, i32, <vscale x 2 x double>, <vscale x 2 x double>) |
| |
| declare void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32, i32, <vscale x 16 x i8>, <vscale x 16 x i8>) |
| declare void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32, i32, <vscale x 8 x i16>, <vscale x 8 x i16>) |
| declare void @llvm.aarch64.sme.write.ver.vg2.nxv8f16(i32, i32, <vscale x 8 x half>, <vscale x 8 x half>) |
| declare void @llvm.aarch64.sme.write.ver.vg2.nxv8bf16(i32, i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>) |
| declare void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32, i32, <vscale x 4 x i32>, <vscale x 4 x i32>) |
| declare void @llvm.aarch64.sme.write.ver.vg2.nxv4f32(i32, i32, <vscale x 4 x float>, <vscale x 4 x float>) |
| declare void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32, i32, <vscale x 2 x i64>, <vscale x 2 x i64>) |
| declare void @llvm.aarch64.sme.write.ver.vg2.nxv2f64(i32, i32, <vscale x 2 x double>, <vscale x 2 x double>) |
| |
| declare void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32, i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>) |
| declare void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32, i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>) |
| declare void @llvm.aarch64.sme.write.hor.vg4.nxv8f16(i32, i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>) |
| declare void @llvm.aarch64.sme.write.hor.vg4.nxv8bf16(i32, i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>) |
| declare void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32, i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>) |
| declare void @llvm.aarch64.sme.write.hor.vg4.nxv4f32(i32, i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>) |
| declare void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32, i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>) |
| declare void @llvm.aarch64.sme.write.hor.vg4.nxv2f64(i32, i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>) |
| |
| declare void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32, i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>) |
| declare void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32, i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>) |
| declare void @llvm.aarch64.sme.write.ver.vg4.nxv8f16(i32, i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>) |
| declare void @llvm.aarch64.sme.write.ver.vg4.nxv8bf16(i32, i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>) |
| declare void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32, i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>) |
| declare void @llvm.aarch64.sme.write.ver.vg4.nxv4f32(i32, i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>) |
| declare void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32, i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>) |
| declare void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32, i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>) |
| |
| declare void @llvm.aarch64.sme.write.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>) |
| declare void @llvm.aarch64.sme.write.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>) |
| declare void @llvm.aarch64.sme.write.vg1x2.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>) |
| declare void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>) |
| declare void @llvm.aarch64.sme.write.vg1x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>) |
| declare void @llvm.aarch64.sme.write.vg1x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>) |
| declare void @llvm.aarch64.sme.write.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>) |
| declare void @llvm.aarch64.sme.write.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>) |
| |
| declare void @llvm.aarch64.sme.write.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>) |
| declare void @llvm.aarch64.sme.write.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>) |
| declare void @llvm.aarch64.sme.write.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>) |
| declare void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>) |
| declare void @llvm.aarch64.sme.write.vg1x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>) |
| declare void @llvm.aarch64.sme.write.vg1x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>) |
| declare void @llvm.aarch64.sme.write.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>) |
| declare void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>) |