blob: 2e4a5f082311ebd213250cc7dd7c4ff02b0a4f23 [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming -enable-subreg-liveness -verify-machineinstrs < %s | FileCheck %s
;
; Move Multi-Vector To Tile (Write) x 2
;
; Horizontal
define void @za_write_vg2_horiz_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_b:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0h.b[w12, 0:1], { z0.b, z1.b }
; CHECK-NEXT: mov za0h.b[w12, 14:15], { z0.b, z1.b }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
%slice.14 = add i32 %slice, 14
call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 %slice.14, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
ret void
}
define void @za_write_vg2_horiz_b_tuple(i64 %stride, ptr %ptr) {
; CHECK-LABEL: za_write_vg2_horiz_b_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w12, wzr
; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x1]
; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x1, x0]
; CHECK-NEXT: mov za0h.b[w12, 0:1], { z16.b, z17.b }
; CHECK-NEXT: mov za0h.b[w12, 0:1], { z24.b, z25.b }
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
%6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5)
call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6)
ret void
}
define void @za_write_vg2_horiz_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_h:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0h.h[w12, 0:1], { z0.h, z1.h }
; CHECK-NEXT: mov za1h.h[w12, 6:7], { z0.h, z1.h }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32 0, i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
%slice.6 = add i32 %slice, 6
call void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32 1, i32 %slice.6, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
ret void
}
define void @za_write_vg2_horiz_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0h.h[w12, 0:1], { z0.h, z1.h }
; CHECK-NEXT: mov za1h.h[w12, 6:7], { z0.h, z1.h }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.hor.vg2.nxv8f16(i32 0, i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2)
%slice.6 = add i32 %slice, 6
call void @llvm.aarch64.sme.write.hor.vg2.nxv8f16(i32 1, i32 %slice.6, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2)
ret void
}
define void @za_write_vg2_horiz_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0h.h[w12, 0:1], { z0.h, z1.h }
; CHECK-NEXT: mov za1h.h[w12, 6:7], { z0.h, z1.h }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.hor.vg2.nxv8bf16(i32 0, i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2)
%slice.6 = add i32 %slice, 6
call void @llvm.aarch64.sme.write.hor.vg2.nxv8bf16(i32 1, i32 %slice.6, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2)
ret void
}
define void @za_write_vg2_horiz_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_s:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0h.s[w12, 0:1], { z0.s, z1.s }
; CHECK-NEXT: mov za3h.s[w12, 2:3], { z0.s, z1.s }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32 0, i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2)
%slice.2 = add i32 %slice, 2
call void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32 3, i32 %slice.2, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2)
ret void
}
define void @za_write_vg2_horiz_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0h.s[w12, 0:1], { z0.s, z1.s }
; CHECK-NEXT: mov za3h.s[w12, 2:3], { z0.s, z1.s }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.hor.vg2.nxv4f32(i32 0, i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2)
%slice.2 = add i32 %slice, 2
call void @llvm.aarch64.sme.write.hor.vg2.nxv4f32(i32 3, i32 %slice.2, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2)
ret void
}
define void @za_write_vg2_horiz_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_d:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0h.d[w12, 0:1], { z0.d, z1.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32 0, i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2)
ret void
}
define void @za_write_vg2_horiz_f64(i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0h.d[w12, 0:1], { z0.d, z1.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.hor.vg2.nxv2f64(i32 0, i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2)
ret void
}
; Vertical
define void @za_write_vg2_vert_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_b:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0v.b[w12, 0:1], { z0.b, z1.b }
; CHECK-NEXT: mov za0v.b[w12, 14:15], { z0.b, z1.b }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32 0, i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
%slice.14 = add i32 %slice, 14
call void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32 0, i32 %slice.14, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
ret void
}
define void @za_write_vg2_vert_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_h:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0v.h[w12, 0:1], { z0.h, z1.h }
; CHECK-NEXT: mov za1v.h[w12, 6:7], { z0.h, z1.h }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32 0, i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
%slice.6 = add i32 %slice, 6
call void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32 1, i32 %slice.6, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
ret void
}
define void @za_write_vg2_vert_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0v.h[w12, 0:1], { z0.h, z1.h }
; CHECK-NEXT: mov za1v.h[w12, 6:7], { z0.h, z1.h }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.ver.vg2.nxv8f16(i32 0, i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2)
%slice.6 = add i32 %slice, 6
call void @llvm.aarch64.sme.write.ver.vg2.nxv8f16(i32 1, i32 %slice.6, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2)
ret void
}
define void @za_write_vg2_vert_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0v.h[w12, 0:1], { z0.h, z1.h }
; CHECK-NEXT: mov za1v.h[w12, 6:7], { z0.h, z1.h }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.ver.vg2.nxv8bf16(i32 0, i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2)
%slice.6 = add i32 %slice, 6
call void @llvm.aarch64.sme.write.ver.vg2.nxv8bf16(i32 1, i32 %slice.6, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2)
ret void
}
define void @za_write_vg2_vert_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_s:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0v.s[w12, 0:1], { z0.s, z1.s }
; CHECK-NEXT: mov za3v.s[w12, 2:3], { z0.s, z1.s }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32 0, i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2)
%slice.2 = add i32 %slice, 2
call void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32 3, i32 %slice.2, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2)
ret void
}
define void @za_write_vg2_vert_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0v.s[w12, 0:1], { z0.s, z1.s }
; CHECK-NEXT: mov za3v.s[w12, 2:3], { z0.s, z1.s }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.ver.vg2.nxv4f32(i32 0, i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2)
%slice.2 = add i32 %slice, 2
call void @llvm.aarch64.sme.write.ver.vg2.nxv4f32(i32 3, i32 %slice.2, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2)
ret void
}
define void @za_write_vg2_vert_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_d:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0v.d[w12, 0:1], { z0.d, z1.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32 0, i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2)
ret void
}
define void @za_write_vg2_vert_f64(i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0v.d[w12, 0:1], { z0.d, z1.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.ver.vg2.nxv2f64(i32 0, i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2)
ret void
}
;
; Move Multi-Vector To Tile (Write) x 4
;
; Horizontal
define void @za_write_vg4_horiz_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_b:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0h.b[w12, 0:3], { z0.b - z3.b }
; CHECK-NEXT: mov za0h.b[w12, 12:15], { z0.b - z3.b }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32 0, i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
%slice.12 = add i32 %slice, 12
call void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32 0, i32 %slice.12, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
ret void
}
define void @za_write_vg4_horiz_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_h:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0h.h[w12, 0:3], { z0.h - z3.h }
; CHECK-NEXT: mov za1h.h[w12, 4:7], { z0.h - z3.h }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32 0, i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
%slice.4 = add i32 %slice, 4
call void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32 1, i32 %slice.4, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
ret void
}
define void @za_write_vg4_horiz_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0h.h[w12, 0:3], { z0.h - z3.h }
; CHECK-NEXT: mov za1h.h[w12, 4:7], { z0.h - z3.h }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.hor.vg4.nxv8f16(i32 0, i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4)
%slice.4 = add i32 %slice, 4
call void @llvm.aarch64.sme.write.hor.vg4.nxv8f16(i32 1, i32 %slice.4, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4)
ret void
}
define void @za_write_vg4_horiz_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0h.h[w12, 0:3], { z0.h - z3.h }
; CHECK-NEXT: mov za1h.h[w12, 4:7], { z0.h - z3.h }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.hor.vg4.nxv8bf16(i32 0, i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4)
%slice.4 = add i32 %slice, 4
call void @llvm.aarch64.sme.write.hor.vg4.nxv8bf16(i32 1, i32 %slice.4, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4)
ret void
}
define void @za_write_vg4_horiz_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_s:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0h.s[w12, 0:3], { z0.s - z3.s }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32 0, i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4)
ret void
}
define void @za_write_vg4_horiz_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0h.s[w12, 0:3], { z0.s - z3.s }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.hor.vg4.nxv4f32(i32 0, i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4)
ret void
}
define void @za_write_vg4_horiz_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_d:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0h.d[w12, 0:3], { z0.d - z3.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32 0, i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4)
ret void
}
define void @za_write_vg4_horiz_f64(i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, <vscale x 2 x double> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0h.d[w12, 0:3], { z0.d - z3.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.hor.vg4.nxv2f64(i32 0, i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, <vscale x 2 x double> %zn4)
ret void
}
; Vertical
define void @za_write_vg4_vert_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_b:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0v.b[w12, 0:3], { z0.b - z3.b }
; CHECK-NEXT: mov za0v.b[w12, 12:15], { z0.b - z3.b }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32 0, i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
%slice.12 = add i32 %slice, 12
call void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32 0, i32 %slice.12, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
ret void
}
define void @za_write_vg4_vert_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_h:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0v.h[w12, 0:3], { z0.h - z3.h }
; CHECK-NEXT: mov za1v.h[w12, 4:7], { z0.h - z3.h }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32 0, i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
%slice.4 = add i32 %slice, 4
call void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32 1, i32 %slice.4, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
ret void
}
define void @za_write_vg4_vert_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0v.h[w12, 0:3], { z0.h - z3.h }
; CHECK-NEXT: mov za1v.h[w12, 4:7], { z0.h - z3.h }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.ver.vg4.nxv8f16(i32 0, i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4)
%slice.4 = add i32 %slice, 4
call void @llvm.aarch64.sme.write.ver.vg4.nxv8f16(i32 1, i32 %slice.4, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4)
ret void
}
define void @za_write_vg4_vert_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0v.h[w12, 0:3], { z0.h - z3.h }
; CHECK-NEXT: mov za1v.h[w12, 4:7], { z0.h - z3.h }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.ver.vg4.nxv8bf16(i32 0, i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4)
%slice.4 = add i32 %slice, 4
call void @llvm.aarch64.sme.write.ver.vg4.nxv8bf16(i32 1, i32 %slice.4, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4)
ret void
}
define void @za_write_vg4_vert_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_s:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0v.s[w12, 0:3], { z0.s - z3.s }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32 0, i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4)
ret void
}
define void @za_write_vg4_vert_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0v.s[w12, 0:3], { z0.s - z3.s }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.ver.vg4.nxv4f32(i32 0, i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4)
ret void
}
define void @za_write_vg4_vert_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_d:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0v.d[w12, 0:3], { z0.d - z3.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32 0, i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4)
ret void
}
define void @za_write_vg4_vert_f64(i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, <vscale x 2 x double> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov za0v.d[w12, 0:3], { z0.d - z3.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 0, i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, <vscale x 2 x double> %zn4)
ret void
}
define void @za_write_vg4_vert_f64_tuple(i64 %stride, ptr %ptr) {
; CHECK-LABEL: za_write_vg4_vert_f64_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: lsl x8, x0, #1
; CHECK-NEXT: add x9, x1, x0
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: ld1d { z16.d, z20.d, z24.d, z28.d }, pn8/z, [x1]
; CHECK-NEXT: ld1d { z17.d, z21.d, z25.d, z29.d }, pn8/z, [x9]
; CHECK-NEXT: mov w12, wzr
; CHECK-NEXT: add x10, x1, x8
; CHECK-NEXT: add x8, x9, x8
; CHECK-NEXT: ld1d { z18.d, z22.d, z26.d, z30.d }, pn8/z, [x10]
; CHECK-NEXT: ld1d { z19.d, z23.d, z27.d, z31.d }, pn8/z, [x8]
; CHECK-NEXT: mov za0v.d[w12, 0:3], { z16.d - z19.d }
; CHECK-NEXT: mov za0v.d[w12, 0:3], { z20.d - z23.d }
; CHECK-NEXT: mov za0v.d[w12, 0:3], { z24.d - z27.d }
; CHECK-NEXT: mov za0v.d[w12, 0:3], { z28.d - z31.d }
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %1, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %1, 1
%4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %1, 2
%5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %1, 3
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%6 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %0, ptr %arrayidx2)
%7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %6, 0
%8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %6, 1
%9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %6, 2
%10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %6, 3
%mul3 = shl i64 %stride, 1
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
%11 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %0, ptr %arrayidx4)
%12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %11, 0
%13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %11, 1
%14 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %11, 2
%15 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %11, 3
%mul5 = mul i64 %stride, 3
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
%16 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %0, ptr %arrayidx6)
%17 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %16, 0
%18 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %16, 1
%19 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %16, 2
%20 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %16, 3
call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 0, i32 0, <vscale x 2 x double> %2, <vscale x 2 x double> %7, <vscale x 2 x double> %12, <vscale x 2 x double> %17)
call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 0, i32 0, <vscale x 2 x double> %3, <vscale x 2 x double> %8, <vscale x 2 x double> %13, <vscale x 2 x double> %18)
call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 0, i32 0, <vscale x 2 x double> %4, <vscale x 2 x double> %9, <vscale x 2 x double> %14, <vscale x 2 x double> %19)
call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 0, i32 0, <vscale x 2 x double> %5, <vscale x 2 x double> %10, <vscale x 2 x double> %15, <vscale x 2 x double> %20)
ret void
}
;
; Move Multi-Vector To ZA (Write) x2
;
define void @za_write_vg1x2_b(i32 %slice, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2) {
; CHECK-LABEL: za_write_vg1x2_b:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.write.vg1x2.nxv16i8(i32 %slice.7, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2)
ret void
}
define void @za_write_vg1x2_b_tuple(i64 %stride, ptr %ptr) {
; CHECK-LABEL: za_write_vg1x2_b_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x1]
; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x1, x0]
; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z16.d, z17.d }
; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z24.d, z25.d }
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
%3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
%5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
%6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
call void @llvm.aarch64.sme.write.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5)
call void @llvm.aarch64.sme.write.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6)
ret void
}
define void @za_write_vg1x2_h(i32 %slice, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2) {
; CHECK-LABEL: za_write_vg1x2_h:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.write.vg1x2.nxv8i16(i32 %slice.7, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2)
ret void
}
define void @za_write_vg1x2_f16(i32 %slice, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2) {
; CHECK-LABEL: za_write_vg1x2_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.write.vg1x2.nxv8f16(i32 %slice.7, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2)
ret void
}
define void @za_write_vg1x2_bf16(i32 %slice, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2) {
; CHECK-LABEL: za_write_vg1x2_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.write.vg1x2.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2)
ret void
}
define void @za_write_vg1x2_s(i32 %slice, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2) {
; CHECK-LABEL: za_write_vg1x2_s:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.vg1x2.nxv4i32(i32 %slice, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.write.vg1x2.nxv4i32(i32 %slice.7, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2)
ret void
}
define void @za_write_vg1x2_f32(i32 %slice, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2) {
; CHECK-LABEL: za_write_vg1x2_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.vg1x2.nxv4f32(i32 %slice, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.write.vg1x2.nxv4f32(i32 %slice.7, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2)
ret void
}
define void @za_write_vg1x2_d(i32 %slice, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2) {
; CHECK-LABEL: za_write_vg1x2_d:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 %slice, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 %slice.7, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2)
ret void
}
define void @za_write_vg1x2_f64(i32 %slice, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2) {
; CHECK-LABEL: za_write_vg1x2_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.vg1x2.nxv2f64(i32 %slice, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.write.vg1x2.nxv2f64(i32 %slice.7, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2)
ret void
}
;
; Move Multi-Vector To ZA (Write) x4
;
define void @za_write_vg1x4_b(i32 %slice, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2, <vscale x 16 x i8> %za3, <vscale x 16 x i8> %za4) {
; CHECK-LABEL: za_write_vg1x4_b:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2, <vscale x 16 x i8> %za3, <vscale x 16 x i8> %za4)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.write.vg1x4.nxv16i8(i32 %slice.7, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2, <vscale x 16 x i8> %za3, <vscale x 16 x i8> %za4)
ret void
}
define void @za_write_vg1x4_h(i32 %slice, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2, <vscale x 8 x i16> %za3, <vscale x 8 x i16> %za4) {
; CHECK-LABEL: za_write_vg1x4_h:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2, <vscale x 8 x i16> %za3, <vscale x 8 x i16> %za4)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.write.vg1x4.nxv8i16(i32 %slice.7, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2, <vscale x 8 x i16> %za3, <vscale x 8 x i16> %za4)
ret void
}
define void @za_write_vg1x4_f16(i32 %slice, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2, <vscale x 8 x half> %za3, <vscale x 8 x half> %za4) {
; CHECK-LABEL: za_write_vg1x4_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2, <vscale x 8 x half> %za3, <vscale x 8 x half> %za4)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.write.vg1x4.nxv8f16(i32 %slice.7, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2, <vscale x 8 x half> %za3, <vscale x 8 x half> %za4)
ret void
}
define void @za_write_vg1x4_bf16(i32 %slice, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2, <vscale x 8 x bfloat> %za3, <vscale x 8 x bfloat> %za4) {
; CHECK-LABEL: za_write_vg1x4_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2, <vscale x 8 x bfloat> %za3, <vscale x 8 x bfloat> %za4)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.write.vg1x4.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2, <vscale x 8 x bfloat> %za3, <vscale x 8 x bfloat> %za4)
ret void
}
define void @za_write_vg1x4_s(i32 %slice, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2, <vscale x 4 x i32> %za3, <vscale x 4 x i32> %za4) {
; CHECK-LABEL: za_write_vg1x4_s:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.vg1x4.nxv4i32(i32 %slice, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2, <vscale x 4 x i32> %za3, <vscale x 4 x i32> %za4)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.write.vg1x4.nxv4i32(i32 %slice.7, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2, <vscale x 4 x i32> %za3, <vscale x 4 x i32> %za4)
ret void
}
define void @za_write_vg1x4_f32(i32 %slice, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2, <vscale x 4 x float> %za3, <vscale x 4 x float> %za4) {
; CHECK-LABEL: za_write_vg1x4_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.vg1x4.nxv4f32(i32 %slice, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2, <vscale x 4 x float> %za3, <vscale x 4 x float> %za4)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.write.vg1x4.nxv4f32(i32 %slice.7, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2, <vscale x 4 x float> %za3, <vscale x 4 x float> %za4)
ret void
}
define void @za_write_vg1x4_d(i32 %slice, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2, <vscale x 2 x i64> %za3, <vscale x 2 x i64> %za4) {
; CHECK-LABEL: za_write_vg1x4_d:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32 %slice, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2, <vscale x 2 x i64> %za3, <vscale x 2 x i64> %za4)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32 %slice.7, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2, <vscale x 2 x i64> %za3, <vscale x 2 x i64> %za4)
ret void
}
define void @za_write_vg1x4_f64(i32 %slice, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2, <vscale x 2 x double> %za3, <vscale x 2 x double> %za4) {
; CHECK-LABEL: za_write_vg1x4_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT: ret
call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 %slice, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2, <vscale x 2 x double> %za3, <vscale x 2 x double> %za4)
%slice.7 = add i32 %slice, 7
call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 %slice.7, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2, <vscale x 2 x double> %za3, <vscale x 2 x double> %za4)
ret void
}
define void @za_write_vg1x4_f64_tuple(i64 %stride, ptr %ptr) {
; CHECK-LABEL: za_write_vg1x4_f64_tuple:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: lsl x9, x0, #1
; CHECK-NEXT: add x10, x1, x0
; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: ld1d { z16.d, z20.d, z24.d, z28.d }, pn8/z, [x1]
; CHECK-NEXT: ld1d { z17.d, z21.d, z25.d, z29.d }, pn8/z, [x10]
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: add x11, x1, x9
; CHECK-NEXT: add x9, x10, x9
; CHECK-NEXT: ld1d { z18.d, z22.d, z26.d, z30.d }, pn8/z, [x11]
; CHECK-NEXT: ld1d { z19.d, z23.d, z27.d, z31.d }, pn8/z, [x9]
; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z16.d - z19.d }
; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z20.d - z23.d }
; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z24.d - z27.d }
; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z28.d - z31.d }
; CHECK-NEXT: ret
entry:
%0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
%1 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %0, ptr %ptr)
%2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %1, 0
%3 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %1, 1
%4 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %1, 2
%5 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %1, 3
%arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
%6 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %0, ptr %arrayidx2)
%7 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %6, 0
%8 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %6, 1
%9 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %6, 2
%10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %6, 3
%mul3 = shl i64 %stride, 1
%arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
%11 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %0, ptr %arrayidx4)
%12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %11, 0
%13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %11, 1
%14 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %11, 2
%15 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %11, 3
%mul5 = mul i64 %stride, 3
%arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
%16 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %0, ptr %arrayidx6)
%17 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %16, 0
%18 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %16, 1
%19 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %16, 2
%20 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %16, 3
call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 0, <vscale x 2 x double> %2, <vscale x 2 x double> %7, <vscale x 2 x double> %12, <vscale x 2 x double> %17)
call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 0, <vscale x 2 x double> %3, <vscale x 2 x double> %8, <vscale x 2 x double> %13, <vscale x 2 x double> %18)
call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 0, <vscale x 2 x double> %4, <vscale x 2 x double> %9, <vscale x 2 x double> %14, <vscale x 2 x double> %19)
call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 0, <vscale x 2 x double> %5, <vscale x 2 x double> %10, <vscale x 2 x double> %15, <vscale x 2 x double> %20)
ret void
}
declare void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32, i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32, i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.write.hor.vg2.nxv8f16(i32, i32, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.write.hor.vg2.nxv8bf16(i32, i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32, i32, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.write.hor.vg2.nxv4f32(i32, i32, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32, i32, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.write.hor.vg2.nxv2f64(i32, i32, <vscale x 2 x double>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32, i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32, i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.write.ver.vg2.nxv8f16(i32, i32, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.write.ver.vg2.nxv8bf16(i32, i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32, i32, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.write.ver.vg2.nxv4f32(i32, i32, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32, i32, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.write.ver.vg2.nxv2f64(i32, i32, <vscale x 2 x double>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32, i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32, i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.write.hor.vg4.nxv8f16(i32, i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.write.hor.vg4.nxv8bf16(i32, i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32, i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.write.hor.vg4.nxv4f32(i32, i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32, i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.write.hor.vg4.nxv2f64(i32, i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32, i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32, i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.write.ver.vg4.nxv8f16(i32, i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.write.ver.vg4.nxv8bf16(i32, i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32, i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.write.ver.vg4.nxv4f32(i32, i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32, i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32, i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.write.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.write.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.write.vg1x2.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.write.vg1x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.write.vg1x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.write.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.write.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.write.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.write.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.write.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.write.vg1x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.write.vg1x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.write.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)