; blob 4f41410010302e1699b70856785b67290e8b9b30 (gitiles web-viewer artifact, not part of the test)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
; RUN: -mattr=+amx-transpose -verify-machineinstrs | FileCheck %s
; 2048-byte, 16-byte-aligned byte buffers: @buf is the source for the paired
; tile load below; @buf2 receives the final tilestored result.
@buf = dso_local global [2048 x i8] zeroinitializer, align 16
@buf2 = dso_local global [2048 x i8] zeroinitializer, align 16
; Exercises lowering of the AMX-transpose paired-load intrinsic
; @llvm.x86.t2rpntlvwz0.internal, which returns TWO tiles from a single call
; (lowered to t2rpntlvwz0 writing an even/odd tile pair, %tmm4/%tmm5 in the
; CHECKs).  At -O0 the register allocator spills/reloads both result tiles
; through 1024-byte stack slots before feeding them, together with a
; tilezero-initialized accumulator, into tdpbssd; the product is then stored
; to @buf2.  The CHECK block is autogenerated by update_llc_test_checks.py —
; do not edit it by hand; regenerate instead.
define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 {
; CHECK-LABEL: test_tile_2rpntlvwz0:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00
; CHECK-NEXT: subq $8192, %rsp # imm = 0x2000
; CHECK-NEXT: .cfi_offset %rbx, -24
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT: # kill: def $dx killed $dx killed $edx
; CHECK-NEXT: movw %si, %cx
; CHECK-NEXT: movw %di, %ax
; CHECK-NEXT: # implicit-def: $al
; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: # implicit-def: $al
; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: # implicit-def: $al
; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: # implicit-def: $cl
; CHECK-NEXT: movb %cl, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: # implicit-def: $al
; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: # implicit-def: $al
; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: # implicit-def: $al
; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: # implicit-def: $al
; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: movl $buf, %esi
; CHECK-NEXT: movl $32, %edi
; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdi), %tmm4
; CHECK-NEXT: movabsq $64, %rbx
; CHECK-NEXT: tilestored %tmm5, (%rsp,%rbx) # 1024-byte Folded Spill
; CHECK-NEXT: tileloadd (%rsp,%rbx), %tmm0 # 1024-byte Folded Reload
; CHECK-NEXT: movabsq $64, %rbx
; CHECK-NEXT: tilestored %tmm4, 1024(%rsp,%rbx) # 1024-byte Folded Spill
; CHECK-NEXT: tileloadd 1024(%rsp,%rbx), %tmm1 # 1024-byte Folded Reload
; CHECK-NEXT: movl $64, %edi
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; CHECK-NEXT: tilestored %tmm1, (%rsi,%rdi)
; CHECK-NEXT: movl $64, %edi
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi)
; CHECK-NEXT: tilezero %tmm0
; CHECK-NEXT: movl $64, %edi
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi)
; CHECK-NEXT: movl $64, %edi
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm1
; CHECK-NEXT: movl $64, %edi
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm2
; CHECK-NEXT: movl $64, %edi
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0
; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
; CHECK-NEXT: movl $64, %edi
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi)
; CHECK-NEXT: movl $64, %edi
; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0
; CHECK-NEXT: movl $buf2, %edx
; CHECK-NEXT: movl $32, %esi
; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi)
; CHECK-NEXT: leaq -8(%rbp), %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa %rsp, 8
; CHECK-NEXT: tilerelease
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
entry:
; Paired load: one call yields { x86_amx, x86_amx } — a row x col0 tile and a
; row x col1 tile read from @buf with stride 32.
%0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #3
%1 = extractvalue { x86_amx, x86_amx } %0, 0
%2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #3
%3 = extractvalue { x86_amx, x86_amx } %0, 1
%4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #3
; Zero accumulator tile, then round-trip all three tiles through vector casts
; (this forces materialization across the cast boundary at -O0).
%5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #3
%6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #3
%7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #3
%8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #3
%9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #3
; acc = tdpbssd(acc=%7, a=%8, b=%9); result stored to @buf2 with stride 32.
%10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #3
%11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #3
%12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #3
tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #3
ret void
}
; Declarations of the AMX intrinsics used above.  t2rpntlvwz0.internal is the
; only one returning an aggregate (two tiles per call); the cast intrinsics
; convert between the opaque x86_amx type and a <256 x i32> vector view.
declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1
declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2
declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3
declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4
; Attribute groups referenced by the function and intrinsic declarations above.
; #0 carries the target features the RUN line also enables via -mattr.
attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose" }
attributes #1 = { argmemonly nofree nounwind readonly }
attributes #2 = { nofree nosync nounwind readnone }
attributes #3 = { nounwind }
attributes #4 = { argmemonly nounwind writeonly }
; Module flags: "frame-pointer"=2 (always keep rbp) matches the rbp-based
; prologue/epilogue checked in the CHECK lines above.
!llvm.module.flags = !{!0, !1, !2}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"uwtable", i32 2}
!2 = !{i32 7, !"frame-pointer", i32 2}