;; Source blob f1c7d39f2b9c36e7e8a6af178972a0895216996b (viewer header kept as a comment so the file parses as LLVM IR).
;; Verify that inline assembly is correctly accounted for in the
;; inst_pref_size calculation. The inst_pref_size is computed via MCExpr
;; label subtraction (.Lfunc_end - func_sym), giving exact code size.
;; See inst-prefetch-hint.ll for explanation of the instprefsize expression.
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -filetype=obj < %s -o %t.gfx11.o
; RUN: llvm-objdump -s -j .rodata %t.gfx11.o | FileCheck --check-prefix=OBJ-GFX11 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -filetype=obj < %s -o %t.gfx12.o
; RUN: llvm-objdump -s -j .rodata %t.gfx12.o | FileCheck --check-prefix=OBJ-GFX12 %s
;; Verify text assembly round-trips through the MC assembler.
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1100 -filetype=obj -o %t.rt.gfx11.o
; RUN: llvm-objdump -s -j .rodata %t.rt.gfx11.o | FileCheck --check-prefix=OBJ-GFX11 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1200 -filetype=obj -o %t.rt.gfx12.o
; RUN: llvm-objdump -s -j .rodata %t.rt.gfx12.o | FileCheck --check-prefix=OBJ-GFX12 %s
;; --- .fill directive: .fill 256, 4, 0 => 1024 bytes + 4 (s_endpgm) = 1028 ---
;; pref_size = divideCeil(1028, 128) = 9
; GFX11-LABEL: .amdhsa_kernel test_fill
; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end0-test_fill)<<4)&1008)>>4
; GFX12-LABEL: .amdhsa_kernel test_fill
; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end0-test_fill)<<4)&4080)>>4
;; Object: kernel descriptor at 0x00, COMPUTE_PGM_RSRC3 at 0x2C:
;; pref_size=9 -> 9<<4 = 0x90
; OBJ-GFX11: 0020 {{.*}}90000000
; OBJ-GFX12: 0020 {{.*}}90000000
define amdgpu_kernel void @test_fill() {
entry:
  ; The kernel body is a single .fill directive: 256 units of 4 bytes, each
  ; holding the value 0, i.e. 1024 bytes that only the assembler can size.
  call void asm sideeffect ".fill 256, 4, 0", ""()
  ; The return contributes the final s_endpgm counted in the size comment above.
  ret void
}
;; --- .space directive: .space 1024 => 1024 bytes + 4 = 1028 ---
;; pref_size = 9
; GFX11-LABEL: .amdhsa_kernel test_space
; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end1-test_space)<<4)&1008)>>4
; GFX12-LABEL: .amdhsa_kernel test_space
; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end1-test_space)<<4)&4080)>>4
;; Object: kernel descriptor at 0x40, COMPUTE_PGM_RSRC3 at 0x6C:
;; pref_size=9 -> 9<<4 = 0x90
; OBJ-GFX11: 0060 {{.*}}90000000
; OBJ-GFX12: 0060 {{.*}}90000000
define amdgpu_kernel void @test_space() {
entry:
  ; Same payload size as the .fill case, expressed with .space instead:
  ; 1024 bytes reserved by a single directive.
  call void asm sideeffect ".space 1024", ""()
  ; Kernel return; accounts for the trailing 4 bytes in the expected size.
  ret void
}
;; --- Instructions: 32 x s_nop (4 bytes each) = 128 + 4 = 132 ---
;; pref_size = divideCeil(132, 128) = 2
; GFX11-LABEL: .amdhsa_kernel test_instructions
; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end2-test_instructions)<<4)&1008)>>4
; GFX12-LABEL: .amdhsa_kernel test_instructions
; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end2-test_instructions)<<4)&4080)>>4
;; Object: kernel descriptor at 0x80, COMPUTE_PGM_RSRC3 at 0xAC:
;; pref_size=2 -> 2<<4 = 0x20
; OBJ-GFX11: 00a0 {{.*}}20000000
; OBJ-GFX12: 00a0 {{.*}}20000000
define amdgpu_kernel void @test_instructions() {
call void asm sideeffect "s_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0\0As_nop 0", ""()
ret void
}
;; --- Comments emit no bytes: only s_endpgm = 4 bytes ---
;; pref_size = 1
; GFX11-LABEL: .amdhsa_kernel test_comments
; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end3-test_comments)<<4)&1008)>>4
; GFX12-LABEL: .amdhsa_kernel test_comments
; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end3-test_comments)<<4)&4080)>>4
;; Object: kernel descriptor at 0xC0, COMPUTE_PGM_RSRC3 at 0xEC:
;; pref_size=1 -> 1<<4 = 0x10
; OBJ-GFX11: 00e0 {{.*}}10000000
; OBJ-GFX12: 00e0 {{.*}}10000000
define amdgpu_kernel void @test_comments() {
entry:
  ; The asm text consists solely of three assembler comment lines, so it is
  ; expected to add no bytes to the function.
  call void asm sideeffect "; comment 1\0A; comment 2\0A; comment 3", ""()
  ; Kernel return; the only code the size comment above expects.
  ret void
}
;; --- Empty inline asm: only s_endpgm = 4 bytes ---
;; pref_size = 1
; GFX11-LABEL: .amdhsa_kernel test_empty_asm
; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end4-test_empty_asm)<<4)&1008)>>4
; GFX12-LABEL: .amdhsa_kernel test_empty_asm
; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end4-test_empty_asm)<<4)&4080)>>4
;; Object: kernel descriptor at 0x100, COMPUTE_PGM_RSRC3 at 0x12C:
;; pref_size=1 -> 1<<4 = 0x10
; OBJ-GFX11: 0120 {{.*}}10000000
; OBJ-GFX12: 0120 {{.*}}10000000
define amdgpu_kernel void @test_empty_asm() {
entry:
  ; Degenerate case: the inline asm string is empty, so nothing is emitted
  ; for it.
  call void asm sideeffect "", ""()
  ; Kernel return; the only code the size comment above expects.
  ret void
}
;; --- Multiple inline asm blocks: .fill (512) + .space (512) + s_endpgm (4) = 1028 ---
;; pref_size = divideCeil(1028, 128) = 9
; GFX11-LABEL: .amdhsa_kernel test_multiple_asm
; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end5-test_multiple_asm)<<4)&1008)>>4
; GFX12-LABEL: .amdhsa_kernel test_multiple_asm
; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end5-test_multiple_asm)<<4)&4080)>>4
;; Object: kernel descriptor at 0x140, COMPUTE_PGM_RSRC3 at 0x16C:
;; pref_size=9 -> 9<<4 = 0x90
; OBJ-GFX11: 0160 {{.*}}90000000
; OBJ-GFX12: 0160 {{.*}}90000000
define amdgpu_kernel void @test_multiple_asm() {
entry:
  ; First inline asm statement: 128 units of 4 bytes = 512 bytes.
  call void asm sideeffect ".fill 128, 4, 0", ""()
  ; Second, separate inline asm statement: another 512 bytes. The sizes of
  ; both statements must be summed into the function size.
  call void asm sideeffect ".space 512", ""()
  ; Kernel return; accounts for the trailing 4 bytes in the expected size.
  ret void
}
;; --- Large function that exceeds GFX11 6-bit field max (63) ---
;; .fill 2048, 4, 0 = 8192 bytes + 4 = 8196 bytes
;; divideCeil(8196, 128) = 65, but GFX11 max = (1<<6)-1 = 63
;; pref_size should clamp to 63
; GFX11-LABEL: .amdhsa_kernel test_clamping
; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end6-test_clamping)<<4)&1008)>>4
; GFX12-LABEL: .amdhsa_kernel test_clamping
; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end6-test_clamping)<<4)&4080)>>4
;; Object: kernel descriptor at 0x180, COMPUTE_PGM_RSRC3 at 0x1AC:
;; gfx11: clamped to 63 -> 63<<4 = 0x3F0
;; gfx12: no clamping, 65 -> 65<<4 = 0x410
; OBJ-GFX11: 01a0 {{.*}}f0030000
; OBJ-GFX12: 01a0 {{.*}}10040000
define amdgpu_kernel void @test_clamping() {
entry:
  ; 2048 units of 4 bytes = 8192 bytes. Together with the return this pushes
  ; the prefetch size past what the 6-bit gfx11 field can hold, while still
  ; fitting the wider gfx12 field.
  call void asm sideeffect ".fill 2048, 4, 0", ""()
  ; Kernel return; accounts for the trailing 4 bytes in the expected size.
  ret void
}
;; --- Large function that exceeds both GFX11 and GFX12 field max ---
;; .fill 8192, 4, 0 = 32768 bytes + 4 = 32772 bytes
;; divideCeil(32772, 128) = 257
;; GFX11 max = 63, GFX12 max = 255 -> both clamp
; GFX11-LABEL: .amdhsa_kernel test_clamping_both
; GFX11: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end7-test_clamping_both)<<4)&1008)>>4
; GFX12-LABEL: .amdhsa_kernel test_clamping_both
; GFX12: .amdhsa_inst_pref_size ((instprefsize(.Lfunc_end7-test_clamping_both)<<4)&4080)>>4
;; Object: kernel descriptor at 0x1C0, COMPUTE_PGM_RSRC3 at 0x1EC:
;; gfx11: clamped to 63 -> 63<<4 = 0x3F0
;; gfx12: clamped to 255 -> 255<<4 = 0xFF0
; OBJ-GFX11: 01e0 {{.*}}f0030000
; OBJ-GFX12: 01e0 {{.*}}f00f0000
define amdgpu_kernel void @test_clamping_both() {
entry:
  ; 8192 units of 4 bytes = 32768 bytes. Together with the return this
  ; overflows the prefetch-size field on both gfx11 and gfx12, so each target
  ; is expected to clamp to its own maximum.
  call void asm sideeffect ".fill 8192, 4, 0", ""()
  ; Kernel return; accounts for the trailing 4 bytes in the expected size.
  ret void
}