| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -force-streaming -verify-machineinstrs < %s | FileCheck %s |
| |
; st1b: byte-element tile-slice stores through a plain base pointer.
; As reflected in the expected assembly, the intrinsic operands are
; (predicate, pointer, tile number, slice index).
; The horizontal store uses slice index %sliceidx + 15; the expected output
; folds the 15 into the slice immediate (za0h.b[w12, 15]) instead of emitting
; a separate add. The vertical store uses a constant slice index of 0, which
; is materialised from wzr.
| define void @st1b(<vscale x 16 x i1> %pg, ptr %ptr, i32 %sliceidx) { |
| ; CHECK-LABEL: st1b: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w1 |
| ; CHECK-NEXT: mov w13, wzr |
| ; CHECK-NEXT: st1b {za0h.b[w12, 15]}, p0, [x0] |
| ; CHECK-NEXT: st1b {za0v.b[w13, 0]}, p0, [x0] |
| ; CHECK-NEXT: ret |
; Variable slice index plus the constant 15.
| %tileslice = add i32 %sliceidx, 15 |
| call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> %pg, ptr %ptr, i32 0, i32 %tileslice) |
| call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> %pg, ptr %ptr, i32 0, i32 0) |
| ret void; |
| } |
| |
; st1b_with_addr_offset: same byte-element stores as st1b, but the address is
; %ptr advanced by %index bytes (getelementptr i8). The expected output uses
; the register-offset addressing form [x0, x1] for both stores.
; Here the vertical store carries the %sliceidx + 15 slice index and the
; horizontal store uses the constant slice index 0.
| define void @st1b_with_addr_offset(<vscale x 16 x i1> %pg, ptr %ptr, i64 %index, i32 %sliceidx) { |
| ; CHECK-LABEL: st1b_with_addr_offset: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w13, wzr |
| ; CHECK-NEXT: mov w12, w2 |
| ; CHECK-NEXT: st1b {za0h.b[w13, 0]}, p0, [x0, x1] |
| ; CHECK-NEXT: st1b {za0v.b[w12, 15]}, p0, [x0, x1] |
| ; CHECK-NEXT: ret |
| %base = getelementptr i8, ptr %ptr, i64 %index |
| %tileslice = add i32 %sliceidx, 15 |
| call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> %pg, ptr %base, i32 0, i32 0) |
| call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> %pg, ptr %base, i32 0, i32 %tileslice) |
| ret void; |
| } |
| |
; st1h: halfword-element tile-slice stores through a plain base pointer,
; covering tiles 0 and 1 in both horizontal and vertical form.
; Two of the stores use slice index %sliceidx + 7; the expected output folds
; the 7 into the slice immediate (e.g. za0h.h[w12, 7]). The other two use the
; constant slice index 0 held in w13.
| define void @st1h(<vscale x 8 x i1> %pg, ptr %ptr, i32 %sliceidx) { |
| ; CHECK-LABEL: st1h: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w1 |
| ; CHECK-NEXT: mov w13, wzr |
| ; CHECK-NEXT: st1h {za0h.h[w12, 7]}, p0, [x0] |
| ; CHECK-NEXT: st1h {za1h.h[w13, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1h {za0v.h[w13, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1h {za1v.h[w12, 7]}, p0, [x0] |
| ; CHECK-NEXT: ret |
; Variable slice index plus the constant 7.
| %tileslice = add i32 %sliceidx, 7 |
| call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> %pg, ptr %ptr, i32 0, i32 %tileslice) |
| call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> %pg, ptr %ptr, i32 1, i32 0) |
| call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> %pg, ptr %ptr, i32 0, i32 0) |
| call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> %pg, ptr %ptr, i32 1, i32 %tileslice) |
| ret void; |
| } |
| |
; st1h_with_addr_offset: halfword-element stores whose address is %ptr
; advanced by %index i16 elements. The expected output uses the scaled
; register-offset form [x0, x1, lsl #1].
; The horizontal store (tile 0) uses slice index %sliceidx + 7; the vertical
; store (tile 1) uses the constant slice index 0.
| define void @st1h_with_addr_offset(<vscale x 8 x i1> %pg, ptr %ptr, i64 %index, i32 %sliceidx) { |
| ; CHECK-LABEL: st1h_with_addr_offset: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w2 |
| ; CHECK-NEXT: mov w13, wzr |
| ; CHECK-NEXT: st1h {za0h.h[w12, 7]}, p0, [x0, x1, lsl #1] |
| ; CHECK-NEXT: st1h {za1v.h[w13, 0]}, p0, [x0, x1, lsl #1] |
| ; CHECK-NEXT: ret |
| %base = getelementptr i16, ptr %ptr, i64 %index |
| %tileslice = add i32 %sliceidx, 7 |
| call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> %pg, ptr %base, i32 0, i32 %tileslice) |
| call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> %pg, ptr %base, i32 1, i32 0) |
| ret void; |
| } |
| |
; st1w: word-element tile-slice stores through a plain base pointer, covering
; tiles 0-3 in both horizontal and vertical form.
; One horizontal store (tile 3) and one vertical store (tile 2) use slice
; index %sliceidx + 3; the expected output folds the 3 into the slice
; immediate. All remaining stores use the constant slice index 0 in w13.
| define void @st1w(<vscale x 4 x i1> %pg, ptr %ptr, i32 %sliceidx) { |
| ; CHECK-LABEL: st1w: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w13, wzr |
| ; CHECK-NEXT: mov w12, w1 |
| ; CHECK-NEXT: st1w {za0h.s[w13, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1w {za1h.s[w13, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1w {za2h.s[w13, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1w {za3h.s[w12, 3]}, p0, [x0] |
| ; CHECK-NEXT: st1w {za0v.s[w13, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1w {za1v.s[w13, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1w {za2v.s[w12, 3]}, p0, [x0] |
| ; CHECK-NEXT: st1w {za3v.s[w13, 0]}, p0, [x0] |
| ; CHECK-NEXT: ret |
; Variable slice index plus the constant 3.
| %tileslice = add i32 %sliceidx, 3 |
| call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> %pg, ptr %ptr, i32 0, i32 0) |
| call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> %pg, ptr %ptr, i32 1, i32 0) |
| call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> %pg, ptr %ptr, i32 2, i32 0) |
| call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> %pg, ptr %ptr, i32 3, i32 %tileslice) |
| call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> %pg, ptr %ptr, i32 0, i32 0) |
| call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> %pg, ptr %ptr, i32 1, i32 0) |
| call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> %pg, ptr %ptr, i32 2, i32 %tileslice) |
| call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> %pg, ptr %ptr, i32 3, i32 0) |
| ret void; |
| } |
| |
; st1w_with_addr_offset: word-element stores whose address is %ptr advanced
; by %index i32 elements. The expected output uses the scaled register-offset
; form [x0, x1, lsl #2].
; The horizontal store (tile 0) uses the constant slice index 0; the vertical
; store (tile 3) uses slice index %sliceidx + 3.
| define void @st1w_with_addr_offset(<vscale x 4 x i1> %pg, ptr %ptr, i64 %index, i32 %sliceidx) { |
| ; CHECK-LABEL: st1w_with_addr_offset: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w13, wzr |
| ; CHECK-NEXT: mov w12, w2 |
| ; CHECK-NEXT: st1w {za0h.s[w13, 0]}, p0, [x0, x1, lsl #2] |
| ; CHECK-NEXT: st1w {za3v.s[w12, 3]}, p0, [x0, x1, lsl #2] |
| ; CHECK-NEXT: ret |
| %base = getelementptr i32, ptr %ptr, i64 %index |
| %tileslice = add i32 %sliceidx, 3 |
| call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> %pg, ptr %base, i32 0, i32 0) |
| call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> %pg, ptr %base, i32 3, i32 %tileslice) |
| ret void; |
| } |
| |
; st1d: doubleword-element tile-slice stores through a plain base pointer,
; covering tiles 0-7 in both horizontal and vertical form.
; One horizontal store (tile 4) and one vertical store (tile 7) use slice
; index %sliceidx + 1; the expected output folds the 1 into the slice
; immediate. All remaining stores use the constant slice index 0 in w13.
| define void @st1d(<vscale x 2 x i1> %pg, ptr %ptr, i32 %sliceidx) { |
| ; CHECK-LABEL: st1d: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w13, wzr |
| ; CHECK-NEXT: mov w12, w1 |
| ; CHECK-NEXT: st1d {za0h.d[w13, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1d {za1h.d[w13, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1d {za2h.d[w13, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1d {za3h.d[w13, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1d {za4h.d[w12, 1]}, p0, [x0] |
| ; CHECK-NEXT: st1d {za5h.d[w13, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1d {za6h.d[w13, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1d {za7h.d[w13, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1d {za0v.d[w13, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1d {za1v.d[w13, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1d {za2v.d[w13, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1d {za3v.d[w13, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1d {za4v.d[w13, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1d {za5v.d[w13, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1d {za6v.d[w13, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1d {za7v.d[w12, 1]}, p0, [x0] |
| ; CHECK-NEXT: ret |
; Variable slice index plus the constant 1.
| %tileslice = add i32 %sliceidx, 1 |
| call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 0, i32 0) |
| call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 1, i32 0) |
| call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 2, i32 0) |
| call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 3, i32 0) |
| call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 4, i32 %tileslice) |
| call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 5, i32 0) |
| call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 6, i32 0) |
| call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 7, i32 0) |
| call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 0, i32 0) |
| call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 1, i32 0) |
| call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 2, i32 0) |
| call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 3, i32 0) |
| call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 4, i32 0) |
| call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 5, i32 0) |
| call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 6, i32 0) |
| call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 7, i32 %tileslice) |
| ret void; |
| } |
| |
; st1d_with_addr_offset: doubleword-element stores whose address is %ptr
; advanced by %index i64 elements. The expected output uses the scaled
; register-offset form [x0, x1, lsl #3].
; The horizontal store (tile 0) uses slice index %sliceidx + 1; the vertical
; store (tile 7) uses the constant slice index 0.
| define void @st1d_with_addr_offset(<vscale x 2 x i1> %pg, ptr %ptr, i64 %index, i32 %sliceidx) { |
| ; CHECK-LABEL: st1d_with_addr_offset: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, w2 |
| ; CHECK-NEXT: mov w13, wzr |
| ; CHECK-NEXT: st1d {za0h.d[w12, 1]}, p0, [x0, x1, lsl #3] |
| ; CHECK-NEXT: st1d {za7v.d[w13, 0]}, p0, [x0, x1, lsl #3] |
| ; CHECK-NEXT: ret |
| %base = getelementptr i64, ptr %ptr, i64 %index |
| %tileslice = add i32 %sliceidx, 1 |
| call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> %pg, ptr %base, i32 0, i32 %tileslice) |
| call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> %pg, ptr %base, i32 7, i32 0) |
| ret void; |
| } |
| |
; st1q: quadword-element tile-slice stores through a plain base pointer,
; covering tiles 0-15 in horizontal form followed by tiles 0-15 in vertical
; form. Unlike the narrower variants this function takes no slice-index
; argument: every store uses the constant slice index 0, and the expected
; output shares a single zeroed w12 across all 32 stores.
| define void @st1q(<vscale x 1 x i1> %pg, ptr %ptr) { |
| ; CHECK-LABEL: st1q: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, wzr |
| ; CHECK-NEXT: st1q {za0h.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za1h.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za2h.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za3h.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za4h.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za5h.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za6h.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za7h.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za8h.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za9h.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za10h.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za11h.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za12h.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za13h.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za14h.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za15h.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za0v.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za1v.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za2v.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za3v.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za4v.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za5v.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za6v.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za7v.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za8v.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za9v.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za10v.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za11v.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za12v.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za13v.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za14v.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: st1q {za15v.q[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: ret |
; Horizontal stores, tiles 0-15.
| call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 0, i32 0) |
| call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 1, i32 0) |
| call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 2, i32 0) |
| call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 3, i32 0) |
| call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 4, i32 0) |
| call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 5, i32 0) |
| call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 6, i32 0) |
| call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 7, i32 0) |
| call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 8, i32 0) |
| call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 9, i32 0) |
| call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 10, i32 0) |
| call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 11, i32 0) |
| call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 12, i32 0) |
| call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 13, i32 0) |
| call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 14, i32 0) |
| call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 15, i32 0) |
; Vertical stores, tiles 0-15.
| call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 0, i32 0) |
| call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 1, i32 0) |
| call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 2, i32 0) |
| call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 3, i32 0) |
| call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 4, i32 0) |
| call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 5, i32 0) |
| call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 6, i32 0) |
| call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 7, i32 0) |
| call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 8, i32 0) |
| call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 9, i32 0) |
| call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 10, i32 0) |
| call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 11, i32 0) |
| call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 12, i32 0) |
| call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 13, i32 0) |
| call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 14, i32 0) |
| call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 15, i32 0) |
| ret void; |
| } |
| |
; st1q_with_addr_offset: quadword-element stores whose address is %ptr
; advanced by %index i128 elements. The expected output uses the scaled
; register-offset form [x0, x1, lsl #4].
; Both stores (horizontal tile 0, vertical tile 15) use the constant slice
; index 0.
| define void @st1q_with_addr_offset(<vscale x 1 x i1> %pg, ptr %ptr, i64 %index) { |
| ; CHECK-LABEL: st1q_with_addr_offset: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, wzr |
| ; CHECK-NEXT: st1q {za0h.q[w12, 0]}, p0, [x0, x1, lsl #4] |
| ; CHECK-NEXT: st1q {za15v.q[w12, 0]}, p0, [x0, x1, lsl #4] |
| ; CHECK-NEXT: ret |
| %base = getelementptr i128, ptr %ptr, i64 %index |
| call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> %pg, ptr %base, i32 0, i32 0) |
| call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> %pg, ptr %base, i32 15, i32 0) |
| ret void; |
| } |
| |
; str: baseline case for the ZA array store intrinsic. The first and third
; intrinsic operands are both 0, so the expected output is a zeroed w12 and a
; store with immediate 0 to the unmodified pointer.
| define void @str(ptr %ptr) { |
| ; CHECK-LABEL: str: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, wzr |
| ; CHECK-NEXT: str za[w12, 0], [x0] |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.str(i32 0, ptr %ptr, i32 0) |
| ret void; |
| } |
| |
; str_with_off_15: the first intrinsic operand is the constant 15 and the
; pointer is advanced by a fixed 15 bytes. The expected output materialises 15
; into w12 and applies the byte offset with a separate add; the immediate of
; the store itself stays 0.
| define void @str_with_off_15(ptr %ptr) { |
| ; CHECK-LABEL: str_with_off_15: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, #15 // =0xf |
| ; CHECK-NEXT: add x8, x0, #15 |
| ; CHECK-NEXT: str za[w12, 0], [x8] |
| ; CHECK-NEXT: ret |
| %base = getelementptr i8, ptr %ptr, i64 15 |
| call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 0) |
| ret void; |
| } |
| |
; str_with_off_15mulvl: the pointer is advanced by vscale * 240 bytes. The
; expected output computes that address with `addvl x8, x0, #15` and keeps the
; immediate of the store at 0; the first intrinsic operand (15) is
; materialised into w12.
| define void @str_with_off_15mulvl(ptr %ptr) { |
| ; CHECK-LABEL: str_with_off_15mulvl: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, #15 // =0xf |
| ; CHECK-NEXT: addvl x8, x0, #15 |
| ; CHECK-NEXT: str za[w12, 0], [x8] |
| ; CHECK-NEXT: ret |
| %vscale = call i64 @llvm.vscale.i64() |
| %mulvl = mul i64 %vscale, 240 |
| %base = getelementptr i8, ptr %ptr, i64 %mulvl |
| call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 0) |
| ret void; |
| } |
| |
; str_with_off_16mulvl: same shape as str_with_off_15mulvl with the pointer
; advanced by vscale * 256 bytes and a first operand of 16. The expected
; output uses `addvl x8, x0, #16` and a store immediate of 0.
| define void @str_with_off_16mulvl(ptr %ptr) { |
| ; CHECK-LABEL: str_with_off_16mulvl: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, #16 // =0x10 |
| ; CHECK-NEXT: addvl x8, x0, #16 |
| ; CHECK-NEXT: str za[w12, 0], [x8] |
| ; CHECK-NEXT: ret |
| %vscale = call i64 @llvm.vscale.i64() |
| %mulvl = mul i64 %vscale, 256 |
| %base = getelementptr i8, ptr %ptr, i64 %mulvl |
| call void @llvm.aarch64.sme.str(i32 16, ptr %base, i32 0) |
| ret void; |
| } |
| |
; str_with_off_var: the third intrinsic operand is a run-time value (%off).
; The expected output shows how that operand is applied: it is added to the
; first operand to form the index register (w12 = w1 + 16), and the pointer
; is advanced by the sign-extended %off multiplied by the value read with
; `rdsvl x9, #1` (via madd). The store immediate stays 0.
| define void @str_with_off_var(ptr %base, i32 %off) { |
| ; CHECK-LABEL: str_with_off_var: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: sxtw x8, w1 |
| ; CHECK-NEXT: rdsvl x9, #1 |
| ; CHECK-NEXT: add w12, w1, #16 |
| ; CHECK-NEXT: madd x8, x9, x8, x0 |
| ; CHECK-NEXT: str za[w12, 0], [x8] |
| ; CHECK-NEXT: ret |
| call void @llvm.aarch64.sme.str(i32 16, ptr %base, i32 %off) |
| ret void; |
| } |
| |
; str_with_off_15imm: the third intrinsic operand is the constant 15. The
; expected output encodes it directly as the immediate on both sides of the
; store (za[w12, 15] and [x8, #15, mul vl]). The fixed 15-byte pointer
; adjustment from the getelementptr is still applied with a separate add.
| define void @str_with_off_15imm(ptr %ptr) { |
| ; CHECK-LABEL: str_with_off_15imm: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov w12, #15 // =0xf |
| ; CHECK-NEXT: add x8, x0, #15 |
| ; CHECK-NEXT: str za[w12, 15], [x8, #15, mul vl] |
| ; CHECK-NEXT: ret |
| %base = getelementptr i8, ptr %ptr, i64 15 |
| call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 15) |
| ret void; |
| } |
| |
; str_with_off_16imm: the third intrinsic operand is the constant 16. In
; contrast to str_with_off_15imm, the expected output does not encode it as a
; store immediate: the index register is set to 31 (15 + 16), the pointer is
; advanced using the rdsvl result shifted left by 4 plus the fixed 15 bytes,
; and the store immediate is 0.
| define void @str_with_off_16imm(ptr %ptr) { |
| ; CHECK-LABEL: str_with_off_16imm: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: rdsvl x8, #1 |
| ; CHECK-NEXT: mov w12, #31 // =0x1f |
| ; CHECK-NEXT: add x8, x0, x8, lsl #4 |
| ; CHECK-NEXT: add x8, x8, #15 |
| ; CHECK-NEXT: str za[w12, 0], [x8] |
| ; CHECK-NEXT: ret |
| %base = getelementptr i8, ptr %ptr, i64 15 |
| call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 16) |
| ret void; |
| } |
| |
; str_with_off_many_imm: four consecutive stores that share %tile_slice and
; %ptr and differ only in the constant third operand (1, 2, 3, 4). The
; expected output keeps a single index register (w12) and the original
; pointer (x1), encoding each constant as the store immediate.
| define void @str_with_off_many_imm(i32 %tile_slice, ptr %ptr) { |
| ; CHECK-LABEL: str_with_off_many_imm: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: str za[w12, 1], [x1, #1, mul vl] |
| ; CHECK-NEXT: str za[w12, 2], [x1, #2, mul vl] |
| ; CHECK-NEXT: str za[w12, 3], [x1, #3, mul vl] |
| ; CHECK-NEXT: str za[w12, 4], [x1, #4, mul vl] |
| ; CHECK-NEXT: ret |
| entry: |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 1) |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 2) |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 3) |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 4) |
| ret void |
| } |
| |
; str_with_off_many_imm_15_18: constant third operands 15, 16, 17 and 18.
; In the expected output the first store uses immediate 15 on the original
; index and pointer; the remaining three use a rebased index (w0 + 16) and a
; rebased pointer (x1 plus the rdsvl result shifted left by 4) with
; immediates 0, 1 and 2.
| define void @str_with_off_many_imm_15_18(i32 %tile_slice, ptr %ptr) { |
| ; CHECK-LABEL: str_with_off_many_imm_15_18: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: rdsvl x8, #1 |
| ; CHECK-NEXT: mov w12, w0 |
| ; CHECK-NEXT: add x8, x1, x8, lsl #4 |
| ; CHECK-NEXT: str za[w12, 15], [x1, #15, mul vl] |
| ; CHECK-NEXT: add w12, w0, #16 |
| ; CHECK-NEXT: str za[w12, 0], [x8] |
| ; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl] |
| ; CHECK-NEXT: str za[w12, 2], [x8, #2, mul vl] |
| ; CHECK-NEXT: ret |
| entry: |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 15) |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 16) |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 17) |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 18) |
| ret void |
| } |
| |
; str_with_off_many_imm_16_19: constant third operands 16, 17, 18 and 19.
; The expected output rebases once (index w0 + 16, pointer x1 plus the rdsvl
; result shifted left by 4) and all four stores share that base with
; immediates 0 through 3.
| define void @str_with_off_many_imm_16_19(i32 %tile_slice, ptr %ptr) { |
| ; CHECK-LABEL: str_with_off_many_imm_16_19: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: rdsvl x8, #1 |
| ; CHECK-NEXT: add w12, w0, #16 |
| ; CHECK-NEXT: add x8, x1, x8, lsl #4 |
| ; CHECK-NEXT: str za[w12, 0], [x8] |
| ; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl] |
| ; CHECK-NEXT: str za[w12, 2], [x8, #2, mul vl] |
| ; CHECK-NEXT: str za[w12, 3], [x8, #3, mul vl] |
| ; CHECK-NEXT: ret |
| entry: |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 16) |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 17) |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 18) |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 19) |
| ret void |
| } |
| |
; str_with_off_many_imm_31_34: constant third operands 31, 32, 33 and 34.
; The expected output needs two rebased index/pointer pairs: the first store
; uses the +16 base (w12, x9) with immediate 15, and the remaining three use
; the +32 base (w13, x8) with immediates 0, 1 and 2.
| define void @str_with_off_many_imm_31_34(i32 %tile_slice, ptr %ptr) { |
| ; CHECK-LABEL: str_with_off_many_imm_31_34: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: rdsvl x8, #1 |
| ; CHECK-NEXT: add w12, w0, #16 |
| ; CHECK-NEXT: add w13, w0, #32 |
| ; CHECK-NEXT: add x9, x1, x8, lsl #4 |
| ; CHECK-NEXT: add x8, x1, x8, lsl #5 |
| ; CHECK-NEXT: str za[w12, 15], [x9, #15, mul vl] |
| ; CHECK-NEXT: str za[w13, 0], [x8] |
| ; CHECK-NEXT: str za[w13, 1], [x8, #1, mul vl] |
| ; CHECK-NEXT: str za[w13, 2], [x8, #2, mul vl] |
| ; CHECK-NEXT: ret |
| entry: |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 31) |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 32) |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 33) |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 34) |
| ret void |
| } |
| |
; str_with_off_many_imm_32_35: constant third operands 32, 33, 34 and 35.
; The expected output rebases once (index w0 + 32, pointer x1 plus the rdsvl
; result shifted left by 5) and all four stores share that base with
; immediates 0 through 3.
| define void @str_with_off_many_imm_32_35(i32 %tile_slice, ptr %ptr) { |
| ; CHECK-LABEL: str_with_off_many_imm_32_35: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: rdsvl x8, #1 |
| ; CHECK-NEXT: add w12, w0, #32 |
| ; CHECK-NEXT: add x8, x1, x8, lsl #5 |
| ; CHECK-NEXT: str za[w12, 0], [x8] |
| ; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl] |
| ; CHECK-NEXT: str za[w12, 2], [x8, #2, mul vl] |
| ; CHECK-NEXT: str za[w12, 3], [x8, #3, mul vl] |
| ; CHECK-NEXT: ret |
| entry: |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 32) |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 33) |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 34) |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 35) |
| ret void |
| } |
| |
; str_with_off_many_var: four stores whose third operand is a run-time value
; (%vnum truncated to i32) plus the constants 0, 1, 2 and 3. The expected
; output computes one shared base (index w0 + w2, pointer x1 plus the
; sign-extended w2 times the rdsvl result) and encodes the constants as store
; immediates 0 through 3.
| define void @str_with_off_many_var(i32 %tile_slice, ptr %ptr, i64 %vnum) { |
| ; CHECK-LABEL: str_with_off_many_var: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: sxtw x8, w2 |
| ; CHECK-NEXT: rdsvl x9, #1 |
| ; CHECK-NEXT: add w12, w0, w2 |
| ; CHECK-NEXT: madd x8, x9, x8, x1 |
| ; CHECK-NEXT: str za[w12, 0], [x8] |
| ; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl] |
| ; CHECK-NEXT: str za[w12, 2], [x8, #2, mul vl] |
| ; CHECK-NEXT: str za[w12, 3], [x8, #3, mul vl] |
| ; CHECK-NEXT: ret |
| entry: |
| %0 = trunc i64 %vnum to i32 |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %0) |
| %1 = add i32 %0, 1 |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %1) |
| %2 = add i32 %0, 2 |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %2) |
| %3 = add i32 %0, 3 |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %3) |
| ret void |
| } |
| |
; str_with_off_many_var_high: like str_with_off_many_var, but the constants
; added to the run-time value are 33, 34, 35 and 36. The expected output
; splits each constant into a shared +32 applied to the variable part
; (add w8, w2, #32) and a remaining store immediate of 1 through 4.
| define void @str_with_off_many_var_high(i32 %tile_slice, ptr %ptr, i64 %vnum) { |
| ; CHECK-LABEL: str_with_off_many_var_high: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: add w8, w2, #32 |
| ; CHECK-NEXT: rdsvl x10, #1 |
| ; CHECK-NEXT: sxtw x9, w8 |
| ; CHECK-NEXT: add w12, w0, w8 |
| ; CHECK-NEXT: madd x9, x10, x9, x1 |
| ; CHECK-NEXT: str za[w12, 1], [x9, #1, mul vl] |
| ; CHECK-NEXT: str za[w12, 2], [x9, #2, mul vl] |
| ; CHECK-NEXT: str za[w12, 3], [x9, #3, mul vl] |
| ; CHECK-NEXT: str za[w12, 4], [x9, #4, mul vl] |
| ; CHECK-NEXT: ret |
| entry: |
| %0 = trunc i64 %vnum to i32 |
| %1 = add i32 %0, 33 |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %1) |
| %2 = add i32 %0, 34 |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %2) |
| %3 = add i32 %0, 35 |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %3) |
| %4 = add i32 %0, 36 |
| tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %4) |
| ret void |
| } |
| |
| |
| ; Ensure that the tile offset is sunk, given that this is likely to be an 'add' |
| ; that's decomposed into a base + offset in ISel. |
; The slice indices %base + 1 and %base + 2 are computed in the entry block,
; outside the loop that uses them. The expected output keeps only %base in a
; register (w12) and encodes the +1 / +2 as slice immediates inside the loop
; body, with no add instructions for the slice index.
| define void @test_sink_tile0_offset_operand(<vscale x 4 x i1> %pg, ptr %src, i32 %base, i32 %N) { |
| ; CHECK-LABEL: test_sink_tile0_offset_operand: |
| ; CHECK: // %bb.0: // %entry |
| ; CHECK-NEXT: mov w12, w1 |
| ; CHECK-NEXT: .LBB24_1: // %for.body |
| ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: st1w {za0h.s[w12, 0]}, p0, [x0] |
| ; CHECK-NEXT: subs w2, w2, #1 |
| ; CHECK-NEXT: st1w {za0h.s[w12, 1]}, p0, [x0] |
| ; CHECK-NEXT: st1w {za0h.s[w12, 2]}, p0, [x0] |
| ; CHECK-NEXT: b.ne .LBB24_1 |
| ; CHECK-NEXT: // %bb.2: // %exit |
| ; CHECK-NEXT: ret |
| entry: |
; Loop-invariant slice indices, defined before the loop is entered.
| %add0 = add i32 %base, 1 |
| %add1 = add i32 %base, 2 |
| br label %for.body |
| |
| for.body: |
; %i counts iterations from 0; the loop exits once %inc equals %N.
| %i = phi i32 [ 0, %entry ], [ %inc, %for.body ] |
; Three horizontal word stores to tile 0 at slices %base, %base + 1, %base + 2.
| tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> %pg, ptr %src, i32 0, i32 %base) |
| tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> %pg, ptr %src, i32 0, i32 %add0) |
| tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> %pg, ptr %src, i32 0, i32 %add1) |
| %inc = add nuw nsw i32 %i, 1 |
| %exitcond.not = icmp eq i32 %inc, %N |
| br i1 %exitcond.not, label %exit, label %for.body |
| |
| exit: |
| ret void |
| } |
| |
; Declarations of the intrinsics called by the functions in this file.
; Tile-slice stores, horizontal then vertical, one per element size (b/h/w/d/q);
; each takes (predicate, pointer, i32, i32), and the predicate element count
; shrinks from 16 to 1 as the element size grows.
| declare void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1>, ptr, i32, i32) |
| declare void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1>, ptr, i32, i32) |
| declare void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1>, ptr, i32, i32) |
| declare void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1>, ptr, i32, i32) |
| declare void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1>, ptr, i32, i32) |
| declare void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1>, ptr, i32, i32) |
| declare void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1>, ptr, i32, i32) |
| declare void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1>, ptr, i32, i32) |
| declare void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1>, ptr, i32, i32) |
| declare void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1>, ptr, i32, i32) |
| |
; Unpredicated ZA array store (i32, pointer, i32) and the vscale query used to
; build vector-length-scaled pointer offsets.
| declare void @llvm.aarch64.sme.str(i32, ptr, i32) |
| declare i64 @llvm.vscale.i64() |