Phoebe Wang | c72a751 | 2024-11-01 16:45:03 +0800 | [diff] [blame] | 1 | ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| 2 | ; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ |
| 3 | ; RUN: -mattr=+amx-transpose -verify-machineinstrs | FileCheck %s |
| 4 | |
; Two 2048-byte, 16-byte-aligned, zero-initialized buffers. @buf is the source
; of the tile-pair load and @buf2 is the destination of the final tile store
; in @test_tile_2rpntlvwz0.
| 5 | @buf = dso_local global [2048 x i8] zeroinitializer, align 16 |
| 6 | @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16 |
| 7 | |
; -O0 codegen test: load a pair of tiles from @buf with the t2rpntlvwz0
; intrinsic, pass both halves together with a tilezero result to tdpbssd,
; and store the tdpbssd result to @buf2.
; The i16 arguments %row, %col0 and %col1 are forwarded unchanged as the
; leading i16 operands of the AMX intrinsics in the body.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
| 8 | define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 { |
| 9 | ; CHECK-LABEL: test_tile_2rpntlvwz0: |
| 10 | ; CHECK: # %bb.0: # %entry |
| 11 | ; CHECK-NEXT: pushq %rbp |
| 12 | ; CHECK-NEXT: .cfi_def_cfa_offset 16 |
| 13 | ; CHECK-NEXT: .cfi_offset %rbp, -16 |
| 14 | ; CHECK-NEXT: movq %rsp, %rbp |
| 15 | ; CHECK-NEXT: .cfi_def_cfa_register %rbp |
| 16 | ; CHECK-NEXT: pushq %rbx |
| 17 | ; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00 |
| 18 | ; CHECK-NEXT: subq $8192, %rsp # imm = 0x2000 |
| 19 | ; CHECK-NEXT: .cfi_offset %rbx, -24 |
| 20 | ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| 21 | ; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) |
| 22 | ; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) |
| 23 | ; CHECK-NEXT: # kill: def $dx killed $dx killed $edx |
| 24 | ; CHECK-NEXT: movw %si, %cx |
| 25 | ; CHECK-NEXT: movw %di, %ax |
| 26 | ; CHECK-NEXT: # implicit-def: $al |
| 27 | ; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) |
| 28 | ; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) |
| 29 | ; CHECK-NEXT: # implicit-def: $al |
| 30 | ; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) |
| 31 | ; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) |
| 32 | ; CHECK-NEXT: # implicit-def: $al |
| 33 | ; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) |
| 34 | ; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) |
| 35 | ; CHECK-NEXT: # implicit-def: $cl |
| 36 | ; CHECK-NEXT: movb %cl, {{[0-9]+}}(%rsp) |
| 37 | ; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) |
| 38 | ; CHECK-NEXT: # implicit-def: $al |
| 39 | ; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) |
| 40 | ; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) |
| 41 | ; CHECK-NEXT: # implicit-def: $al |
| 42 | ; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) |
| 43 | ; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) |
| 44 | ; CHECK-NEXT: # implicit-def: $al |
| 45 | ; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) |
| 46 | ; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) |
| 47 | ; CHECK-NEXT: # implicit-def: $al |
| 48 | ; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) |
| 49 | ; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) |
| 50 | ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) |
| 51 | ; CHECK-NEXT: movl $buf, %esi |
| 52 | ; CHECK-NEXT: movl $32, %edi |
| 53 | ; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdi), %tmm4 |
| 54 | ; CHECK-NEXT: movabsq $64, %rbx |
| 55 | ; CHECK-NEXT: tilestored %tmm5, (%rsp,%rbx) # 1024-byte Folded Spill |
| 56 | ; CHECK-NEXT: tileloadd (%rsp,%rbx), %tmm0 # 1024-byte Folded Reload |
| 57 | ; CHECK-NEXT: movabsq $64, %rbx |
| 58 | ; CHECK-NEXT: tilestored %tmm4, 1024(%rsp,%rbx) # 1024-byte Folded Spill |
| 59 | ; CHECK-NEXT: tileloadd 1024(%rsp,%rbx), %tmm1 # 1024-byte Folded Reload |
| 60 | ; CHECK-NEXT: movl $64, %edi |
| 61 | ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi |
| 62 | ; CHECK-NEXT: tilestored %tmm1, (%rsi,%rdi) |
| 63 | ; CHECK-NEXT: movl $64, %edi |
| 64 | ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi |
| 65 | ; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) |
| 66 | ; CHECK-NEXT: tilezero %tmm0 |
| 67 | ; CHECK-NEXT: movl $64, %edi |
| 68 | ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi |
| 69 | ; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) |
| 70 | ; CHECK-NEXT: movl $64, %edi |
| 71 | ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi |
| 72 | ; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm1 |
| 73 | ; CHECK-NEXT: movl $64, %edi |
| 74 | ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi |
| 75 | ; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm2 |
| 76 | ; CHECK-NEXT: movl $64, %edi |
| 77 | ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi |
| 78 | ; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0 |
| 79 | ; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 |
| 80 | ; CHECK-NEXT: movl $64, %edi |
| 81 | ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi |
| 82 | ; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) |
| 83 | ; CHECK-NEXT: movl $64, %edi |
| 84 | ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi |
| 85 | ; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0 |
| 86 | ; CHECK-NEXT: movl $buf2, %edx |
| 87 | ; CHECK-NEXT: movl $32, %esi |
| 88 | ; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi) |
| 89 | ; CHECK-NEXT: leaq -8(%rbp), %rsp |
| 90 | ; CHECK-NEXT: popq %rbx |
| 91 | ; CHECK-NEXT: popq %rbp |
| 92 | ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 |
| 93 | ; CHECK-NEXT: tilerelease |
| 94 | ; CHECK-NEXT: vzeroupper |
| 95 | ; CHECK-NEXT: retq |
| 96 | entry: |
; Load the { x86_amx, x86_amx } pair from @buf; the trailing i64 32 operand is
; presumably the row stride (confirm against the intrinsic definition).
| 97 | %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #3 |
; Split the pair; each element is cast to <256 x i32> here and cast back to
; x86_amx further down (%8, %9) before it is consumed.
| 98 | %1 = extractvalue { x86_amx, x86_amx } %0, 0 |
| 99 | %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #3 |
| 100 | %3 = extractvalue { x86_amx, x86_amx } %0, 1 |
| 101 | %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #3 |
; Zero tile, likewise round-tripped through a vector cast (%6 -> %7).
| 102 | %5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #3 |
| 103 | %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #3 |
| 104 | %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #3 |
| 105 | %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #3 |
| 106 | %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #3 |
; Tile operand order: zero tile (%7), pair element 0 (%8), pair element 1 (%9).
| 107 | %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #3 |
| 108 | %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #3 |
| 109 | %12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #3 |
; Store the tdpbssd result to @buf2, using the same i64 32 operand as the load.
| 110 | tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #3 |
| 111 | ret void |
| 112 | } |
| 113 | |
; Declarations of the six AMX intrinsics called by @test_tile_2rpntlvwz0:
; the tile-pair load, the two tile<->vector casts, tilezero, tdpbssd and the
; tile store.
| 114 | declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1 |
| 115 | |
| 116 | declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2 |
| 117 | |
| 118 | declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3 |
| 119 | |
| 120 | declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3 |
| 121 | |
| 122 | declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2 |
| 123 | |
| 124 | declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4 |
| 125 | |
; Attribute groups. #0 is attached to the test function; #1, #2, #3 and #4 are
; attached to the intrinsic declarations, and #3 is also applied at every call
; site inside the function.
; Note: +amx-int8 appears only in #0's "target-features" string; the -mattr
; lists on the llc command line at the top of the file do not include it.
| 126 | attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose" } |
| 127 | attributes #1 = { argmemonly nofree nounwind readonly } |
| 128 | attributes #2 = { nofree nosync nounwind readnone } |
| 129 | attributes #3 = { nounwind } |
| 130 | attributes #4 = { argmemonly nounwind writeonly } |
| 131 | |
; Module flags: wchar_size = 4, uwtable = 2, frame-pointer = 2.
; NOTE(review): frame-pointer = 2 presumably requests frame pointers in all
; functions, which would match the %rbp prologue asserted for the test
; function -- confirm against the LangRef module-flags section.
| 132 | !llvm.module.flags = !{!0, !1, !2} |
| 133 | |
| 134 | !0 = !{i32 1, !"wchar_size", i32 4} |
| 135 | !1 = !{i32 7, !"uwtable", i32 2} |
| 136 | !2 = !{i32 7, !"frame-pointer", i32 2} |