blob: 4f41410010302e1699b70856785b67290e8b9b30 [file] [log] [blame]
Phoebe Wangc72a7512024-11-01 16:45:03 +08001; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
3; RUN: -mattr=+amx-transpose -verify-machineinstrs | FileCheck %s
4
; Two 2048-byte, zero-initialized, 16-byte-aligned globals.
; In the test function below, @buf is the pointer operand of the tile-pair
; load and @buf2 is the pointer operand of the final tile store.
5@buf = dso_local global [2048 x i8] zeroinitializer, align 16
6@buf2 = dso_local global [2048 x i8] zeroinitializer, align 16
7
; test_tile_2rpntlvwz0: loads a pair of tiles from @buf through
; @llvm.x86.t2rpntlvwz0.internal, feeds both halves plus a tile produced by
; @llvm.x86.tilezero.internal into @llvm.x86.tdpbssd.internal, and stores the
; result to @buf2 with @llvm.x86.tilestored64.internal.
;
; Parameters: %row, %col0 and %col1 are i16 values that are only ever passed
; through as the leading i16 operands of the AMX intrinsics
; (presumably the tile shape -- confirm against the intrinsic definitions).
;
; The FileCheck assertions directly below the signature are the expected llc
; output. The NOTE at the top of this file says they are autogenerated by
; utils/update_llc_test_checks.py, so regenerate them instead of hand-editing.
; Points the expected output pins down: a tile configuration is written to the
; stack and loaded with ldtilecfg, the pair load targets %tmm4 while %tmm5 and
; %tmm4 are each spilled and reloaded, and tilerelease is emitted before retq.
8define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 {
9; CHECK-LABEL: test_tile_2rpntlvwz0:
10; CHECK: # %bb.0: # %entry
11; CHECK-NEXT: pushq %rbp
12; CHECK-NEXT: .cfi_def_cfa_offset 16
13; CHECK-NEXT: .cfi_offset %rbp, -16
14; CHECK-NEXT: movq %rsp, %rbp
15; CHECK-NEXT: .cfi_def_cfa_register %rbp
16; CHECK-NEXT: pushq %rbx
17; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00
18; CHECK-NEXT: subq $8192, %rsp # imm = 0x2000
19; CHECK-NEXT: .cfi_offset %rbx, -24
20; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
21; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
22; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
23; CHECK-NEXT: # kill: def $dx killed $dx killed $edx
24; CHECK-NEXT: movw %si, %cx
25; CHECK-NEXT: movw %di, %ax
26; CHECK-NEXT: # implicit-def: $al
27; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
28; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
29; CHECK-NEXT: # implicit-def: $al
30; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
31; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
32; CHECK-NEXT: # implicit-def: $al
33; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
34; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
35; CHECK-NEXT: # implicit-def: $cl
36; CHECK-NEXT: movb %cl, {{[0-9]+}}(%rsp)
37; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
38; CHECK-NEXT: # implicit-def: $al
39; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
40; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
41; CHECK-NEXT: # implicit-def: $al
42; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
43; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
44; CHECK-NEXT: # implicit-def: $al
45; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
46; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
47; CHECK-NEXT: # implicit-def: $al
48; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
49; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
50; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
51; CHECK-NEXT: movl $buf, %esi
52; CHECK-NEXT: movl $32, %edi
53; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdi), %tmm4
54; CHECK-NEXT: movabsq $64, %rbx
55; CHECK-NEXT: tilestored %tmm5, (%rsp,%rbx) # 1024-byte Folded Spill
56; CHECK-NEXT: tileloadd (%rsp,%rbx), %tmm0 # 1024-byte Folded Reload
57; CHECK-NEXT: movabsq $64, %rbx
58; CHECK-NEXT: tilestored %tmm4, 1024(%rsp,%rbx) # 1024-byte Folded Spill
59; CHECK-NEXT: tileloadd 1024(%rsp,%rbx), %tmm1 # 1024-byte Folded Reload
60; CHECK-NEXT: movl $64, %edi
61; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
62; CHECK-NEXT: tilestored %tmm1, (%rsi,%rdi)
63; CHECK-NEXT: movl $64, %edi
64; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
65; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi)
66; CHECK-NEXT: tilezero %tmm0
67; CHECK-NEXT: movl $64, %edi
68; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
69; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi)
70; CHECK-NEXT: movl $64, %edi
71; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
72; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm1
73; CHECK-NEXT: movl $64, %edi
74; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
75; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm2
76; CHECK-NEXT: movl $64, %edi
77; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
78; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0
79; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
80; CHECK-NEXT: movl $64, %edi
81; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
82; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi)
83; CHECK-NEXT: movl $64, %edi
84; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
85; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0
86; CHECK-NEXT: movl $buf2, %edx
87; CHECK-NEXT: movl $32, %esi
88; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi)
89; CHECK-NEXT: leaq -8(%rbp), %rsp
90; CHECK-NEXT: popq %rbx
91; CHECK-NEXT: popq %rbp
92; CHECK-NEXT: .cfi_def_cfa %rsp, 8
93; CHECK-NEXT: tilerelease
94; CHECK-NEXT: vzeroupper
95; CHECK-NEXT: retq
96entry:
; Tile-pair load from @buf. The intrinsic returns both tiles as one
; { x86_amx, x86_amx } aggregate; the i64 32 operand shows up in the expected
; asm as the index register of the t2rpntlvwz0 memory operand.
97 %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #3
; Split the aggregate and cast each half to <256 x i32>.
98 %1 = extractvalue { x86_amx, x86_amx } %0, 0
99 %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #3
100 %3 = extractvalue { x86_amx, x86_amx } %0, 1
101 %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #3
; Third tile comes from tilezero and is likewise cast to vector form.
102 %5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #3
103 %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #3
; Cast all three vectors back to x86_amx: %7 is the tilezero result,
; %8 / %9 are element 0 / element 1 of the loaded pair.
104 %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #3
105 %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #3
106 %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #3
; tdpbssd on the three tiles. Note the i16 operands here are (row, col1, col0),
; i.e. col1 and col0 appear in the opposite order from the pair load above.
; NOTE(review): %7 is presumably the accumulator operand -- confirm.
107 %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #3
; Round-trip the result through <256 x i32> once more, then store it to @buf2
; with the same i64 32 operand that was used for the load.
108 %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #3
109 %12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #3
110 tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #3
111 ret void
112}
113
; Declarations of the intrinsics called by @test_tile_2rpntlvwz0.

; Tile-pair load: three i16 operands, a pointer and an i64; returns two tiles.
114declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1
115
; Cast from x86_amx to <256 x i32>.
116declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2
117
; tilezero: two i16 operands, returns a tile.
118declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3
119
; tdpbssd: three i16 operands followed by three tile operands, returns a tile.
120declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3
121
; Cast from <256 x i32> back to x86_amx.
122declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2
123
; Tile store: two i16 operands, a pointer, an i64 and the tile to store.
124declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4
125
; Attribute groups.
; #0: the test function itself; its target-features string enables
;     amx-bf16, amx-int8, amx-tile and amx-transpose.
126attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose" }
; #1: the t2rpntlvwz0 (tile-pair load) declaration.
127attributes #1 = { argmemonly nofree nounwind readonly }
; #2: the two tile<->vector cast declarations.
128attributes #2 = { nofree nosync nounwind readnone }
; #3: the tilezero and tdpbssd declarations, and every call site in the
;     test function.
129attributes #3 = { nounwind }
; #4: the tilestored64 declaration.
130attributes #4 = { argmemonly nounwind writeonly }
131
; Module flags: wchar_size = 4, uwtable = 2, frame-pointer = 2.
; The leading i32 of each entry is the module-flag behavior code
; (see the "Module Flags Metadata" section of the LLVM Language Reference).
132!llvm.module.flags = !{!0, !1, !2}
133
134!0 = !{i32 1, !"wchar_size", i32 4}
135!1 = !{i32 7, !"uwtable", i32 2}
136!2 = !{i32 7, !"frame-pointer", i32 2}