| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ |
| ; RUN: -mattr=+amx-transpose -verify-machineinstrs | FileCheck %s |
| |
| ; Two 2048-byte, zero-initialized, 16-byte-aligned globals. |
| ; @buf is the address operand of the tile-pair load in the test function; |
| ; @buf2 is the address operand of the final tile store. |
| @buf = dso_local global [2048 x i8] zeroinitializer, align 16 |
| @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16 |
| |
| ; Loads a pair of AMX tiles from @buf with the t2rpntlvwz0 internal intrinsic, |
| ; round-trips both tiles (and a zeroed tile) through <256 x i32> vector casts, |
| ; feeds them to tdpbssd, and stores the result tile to @buf2. |
| ; |
| ; Arguments: %row, %col0 and %col1 are the i16 shape operands handed to the |
| ; tile intrinsics below. |
| ; |
| ; The expected assembly that follows is autogenerated (see the notice at the |
| ; top of the file); regenerate it with the script instead of editing by hand. |
| ; It shows both halves of the loaded pair (%tmm4/%tmm5) being spilled to the |
| ; stack and reloaded into %tmm0/%tmm1 before they are used. |
| define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 { |
| ; CHECK-LABEL: test_tile_2rpntlvwz0: |
| ; CHECK: # %bb.0: # %entry |
| ; CHECK-NEXT: pushq %rbp |
| ; CHECK-NEXT: .cfi_def_cfa_offset 16 |
| ; CHECK-NEXT: .cfi_offset %rbp, -16 |
| ; CHECK-NEXT: movq %rsp, %rbp |
| ; CHECK-NEXT: .cfi_def_cfa_register %rbp |
| ; CHECK-NEXT: pushq %rbx |
| ; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00 |
| ; CHECK-NEXT: subq $8192, %rsp # imm = 0x2000 |
| ; CHECK-NEXT: .cfi_offset %rbx, -24 |
| ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 |
| ; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) |
| ; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) |
| ; CHECK-NEXT: # kill: def $dx killed $dx killed $edx |
| ; CHECK-NEXT: movw %si, %cx |
| ; CHECK-NEXT: movw %di, %ax |
| ; CHECK-NEXT: # implicit-def: $al |
| ; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) |
| ; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) |
| ; CHECK-NEXT: # implicit-def: $al |
| ; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) |
| ; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) |
| ; CHECK-NEXT: # implicit-def: $al |
| ; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) |
| ; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) |
| ; CHECK-NEXT: # implicit-def: $cl |
| ; CHECK-NEXT: movb %cl, {{[0-9]+}}(%rsp) |
| ; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) |
| ; CHECK-NEXT: # implicit-def: $al |
| ; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) |
| ; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) |
| ; CHECK-NEXT: # implicit-def: $al |
| ; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) |
| ; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) |
| ; CHECK-NEXT: # implicit-def: $al |
| ; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) |
| ; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp) |
| ; CHECK-NEXT: # implicit-def: $al |
| ; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) |
| ; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) |
| ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) |
| ; CHECK-NEXT: movl $buf, %esi |
| ; CHECK-NEXT: movl $32, %edi |
| ; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdi), %tmm4 |
| ; CHECK-NEXT: movabsq $64, %rbx |
| ; CHECK-NEXT: tilestored %tmm5, (%rsp,%rbx) # 1024-byte Folded Spill |
| ; CHECK-NEXT: tileloadd (%rsp,%rbx), %tmm0 # 1024-byte Folded Reload |
| ; CHECK-NEXT: movabsq $64, %rbx |
| ; CHECK-NEXT: tilestored %tmm4, 1024(%rsp,%rbx) # 1024-byte Folded Spill |
| ; CHECK-NEXT: tileloadd 1024(%rsp,%rbx), %tmm1 # 1024-byte Folded Reload |
| ; CHECK-NEXT: movl $64, %edi |
| ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi |
| ; CHECK-NEXT: tilestored %tmm1, (%rsi,%rdi) |
| ; CHECK-NEXT: movl $64, %edi |
| ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi |
| ; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) |
| ; CHECK-NEXT: tilezero %tmm0 |
| ; CHECK-NEXT: movl $64, %edi |
| ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi |
| ; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) |
| ; CHECK-NEXT: movl $64, %edi |
| ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi |
| ; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm1 |
| ; CHECK-NEXT: movl $64, %edi |
| ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi |
| ; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm2 |
| ; CHECK-NEXT: movl $64, %edi |
| ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi |
| ; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0 |
| ; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 |
| ; CHECK-NEXT: movl $64, %edi |
| ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi |
| ; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi) |
| ; CHECK-NEXT: movl $64, %edi |
| ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi |
| ; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0 |
| ; CHECK-NEXT: movl $buf2, %edx |
| ; CHECK-NEXT: movl $32, %esi |
| ; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi) |
| ; CHECK-NEXT: leaq -8(%rbp), %rsp |
| ; CHECK-NEXT: popq %rbx |
| ; CHECK-NEXT: popq %rbp |
| ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 |
| ; CHECK-NEXT: tilerelease |
| ; CHECK-NEXT: vzeroupper |
| ; CHECK-NEXT: retq |
| entry: |
| ; Paired tile load from @buf; the i64 32 operand shows up as the index |
| ; register of the t2rpntlvwz0 address in the expected assembly. |
| %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #3 |
| ; Split the pair and cast each half to a <256 x i32> vector. |
| %1 = extractvalue { x86_amx, x86_amx } %0, 0 |
| %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #3 |
| %3 = extractvalue { x86_amx, x86_amx } %0, 1 |
| %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #3 |
| ; Zeroed tile created with shape operands (%row, %col0); it becomes the first |
| ; tile operand (%7) of tdpbssd after its own vector round trip. |
| %5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #3 |
| %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #3 |
| ; Cast the three vectors back to tiles: %7 from the zero tile, %8 and %9 from |
| ; element 0 and element 1 of the loaded pair. |
| %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #3 |
| %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #3 |
| %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #3 |
| ; NOTE(review): the shape operands here are (%row, %col1, %col0), while the |
| ; zero tile above was created with (%row, %col0) - confirm this order is intended. |
| %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #3 |
| ; One more vector round trip on the result, then store it to @buf2 with shape |
| ; operands (%row, %col0) and an i64 32 operand. |
| %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #3 |
| %12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #3 |
| tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #3 |
| ret void |
| } |
| |
| ; Declarations of the AMX intrinsics called by @test_tile_2rpntlvwz0. |
| |
| ; Paired tile load: three i16 operands, a pointer and an i64; returns two tiles. |
| declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1 |
| |
| ; Tile -> <256 x i32> vector cast. |
| declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2 |
| |
| ; Produces a tile from two i16 operands only (no memory operand). |
| declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3 |
| |
| ; Three i16 operands followed by three tile operands; returns one tile. |
| declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3 |
| |
| ; <256 x i32> vector -> tile cast. |
| declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2 |
| |
| ; Tile store: two i16 operands, a pointer, an i64 and the tile to store. |
| declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4 |
| |
| ; Attribute groups referenced above: |
| ; #0 - the test function itself; its target-features string also enables |
| ; +amx-int8, which is not in the -mattr lists on the llc command line. |
| ; #1 - the t2rpntlvwz0 declaration. |
| ; #2 - the two tile/vector cast declarations. |
| ; #3 - the tilezero and tdpbssd declarations and every call site in the function. |
| ; #4 - the tilestored64 declaration. |
| attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose" } |
| attributes #1 = { argmemonly nofree nounwind readonly } |
| attributes #2 = { nofree nosync nounwind readnone } |
| attributes #3 = { nounwind } |
| attributes #4 = { argmemonly nounwind writeonly } |
| |
| ; Module flags: each node below is a { behavior id, flag name, value } triple. |
| !llvm.module.flags = !{!0, !1, !2} |
| |
| ; wchar_size = 4, uwtable = 2, frame-pointer = 2. |
| ; NOTE(review): meaning of the behavior ids (1, 7) and of the value 2 is defined |
| ; by the LLVM language reference, not by this file - consult it before changing. |
| !0 = !{i32 1, !"wchar_size", i32 4} |
| !1 = !{i32 7, !"uwtable", i32 2} |
| !2 = !{i32 7, !"frame-pointer", i32 2} |