| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s |
| ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s |
| |
| ; External callees used as call targets by the tests in this file. Every |
| ; declaration is a hidden amdgpu_gfx function with attribute group #0; all |
| ; return void except @external_i32_func_i32. |
| ; |
| ; i1 argument: plain, signext and zeroext forms. |
| declare hidden amdgpu_gfx void @external_void_func_i1(i1) #0 |
| declare hidden amdgpu_gfx void @external_void_func_i1_signext(i1 signext) #0 |
| declare hidden amdgpu_gfx void @external_void_func_i1_zeroext(i1 zeroext) #0 |
| |
| ; i8 argument: plain, signext and zeroext forms. |
| declare hidden amdgpu_gfx void @external_void_func_i8(i8) #0 |
| declare hidden amdgpu_gfx void @external_void_func_i8_signext(i8 signext) #0 |
| declare hidden amdgpu_gfx void @external_void_func_i8_zeroext(i8 zeroext) #0 |
| |
| ; i16 argument: plain, signext and zeroext forms. |
| declare hidden amdgpu_gfx void @external_void_func_i16(i16) #0 |
| declare hidden amdgpu_gfx void @external_void_func_i16_signext(i16 signext) #0 |
| declare hidden amdgpu_gfx void @external_void_func_i16_zeroext(i16 zeroext) #0 |
| |
| ; i32, i64 and vectors of i64. |
| declare hidden amdgpu_gfx void @external_void_func_i32(i32) #0 |
| declare hidden amdgpu_gfx void @external_void_func_i64(i64) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v2i64(<2 x i64>) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v3i64(<3 x i64>) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v4i64(<4 x i64>) #0 |
| |
| ; Scalar half/float/double and vectors of float/double. |
| declare hidden amdgpu_gfx void @external_void_func_f16(half) #0 |
| declare hidden amdgpu_gfx void @external_void_func_f32(float) #0 |
| declare hidden amdgpu_gfx void @external_void_func_f64(double) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v2f32(<2 x float>) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v2f64(<2 x double>) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v3f32(<3 x float>) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v3f64(<3 x double>) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v5f32(<5 x float>) #0 |
| |
| ; Vectors of 16-bit elements (i16 and half), 2 to 4 elements wide. |
| declare hidden amdgpu_gfx void @external_void_func_v2i16(<2 x i16>) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v2f16(<2 x half>) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v3i16(<3 x i16>) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v3f16(<3 x half>) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v4i16(<4 x i16>) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v4f16(<4 x half>) #0 |
| |
| ; Vectors of i32, 2 to 32 elements wide; the *_i32 forms take one extra |
| ; trailing i32 argument. |
| declare hidden amdgpu_gfx void @external_void_func_v2i32(<2 x i32>) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v3i32(<3 x i32>) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v3i32_i32(<3 x i32>, i32) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v4i32(<4 x i32>) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v5i32(<5 x i32>) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v8i32(<8 x i32>) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v16i32(<16 x i32>) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v32i32(<32 x i32>) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v32i32_i32(<32 x i32>, i32) #0 |
| |
| ; The declarations below repeat the argument types above with every |
| ; parameter marked inreg. Integer scalars and vectors of i64: |
| declare hidden amdgpu_gfx void @external_void_func_i1_inreg(i1 inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_i8_inreg(i8 inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_i16_inreg(i16 inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_i32_inreg(i32 inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_i64_inreg(i64 inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v2i64_inreg(<2 x i64> inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v3i64_inreg(<3 x i64> inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v4i64_inreg(<4 x i64> inreg) #0 |
| |
| ; inreg scalar half/float/double and vectors of float/double. |
| declare hidden amdgpu_gfx void @external_void_func_f16_inreg(half inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_f32_inreg(float inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_f64_inreg(double inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v2f32_inreg(<2 x float> inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v2f64_inreg(<2 x double> inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v3f32_inreg(<3 x float> inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v3f64_inreg(<3 x double> inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v5f32_inreg(<5 x float> inreg) #0 |
| |
| ; inreg vectors of 16-bit elements. |
| declare hidden amdgpu_gfx void @external_void_func_v2i16_inreg(<2 x i16> inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v2f16_inreg(<2 x half> inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v3i16_inreg(<3 x i16> inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v3f16_inreg(<3 x half> inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v4i16_inreg(<4 x i16> inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v4f16_inreg(<4 x half> inreg) #0 |
| |
| ; inreg vectors of i32; in the *_i32 forms the trailing i32 is inreg as well. |
| declare hidden amdgpu_gfx void @external_void_func_v2i32_inreg(<2 x i32> inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v3i32_inreg(<3 x i32> inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v3i32_i32_inreg(<3 x i32> inreg, i32 inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v4i32_inreg(<4 x i32> inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v5i32_inreg(<5 x i32> inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v8i32_inreg(<8 x i32> inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v16i32_inreg(<16 x i32> inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v32i32_inreg(<32 x i32> inreg) #0 |
| declare hidden amdgpu_gfx void @external_void_func_v32i32_i32_inreg(<32 x i32> inreg, i32 inreg) #0 |
| |
| ; Callee that takes an i32 argument and also returns an i32. |
| declare hidden amdgpu_gfx i32 @external_i32_func_i32(i32) #0 |
| |
| ; Struct arguments: { i8, i32 } passed directly, through a byval pointer in |
| ; addrspace(5), and through an sret pointer combined with a byval pointer. |
| declare hidden amdgpu_gfx void @external_void_func_struct_i8_i32({ i8, i32 }) #0 |
| declare hidden amdgpu_gfx void @external_void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval({ i8, i32 })) #0 |
| declare hidden amdgpu_gfx void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 } addrspace(5)* sret({ i8, i32 }), { i8, i32 } addrspace(5)* byval({ i8, i32 })) #0 |
| |
| ; Vector of sixteen i8 elements. |
| declare hidden amdgpu_gfx void @external_void_func_v16i8(<16 x i8>) #0 |
| |
| ; Passes the constant i1 true to @external_void_func_i1. Both check sequences |
| ; expect the value (1 in v0) to be written with buffer_store_byte at s32 before |
| ; the s_swappc_b64 call, with s30, s31 and s33 kept in lanes 0, 1 and 2 of v40 |
| ; across the call. |
| define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_i1_imm: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_i1_imm: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_i1(i1 true) |
| ret void |
| } |
| |
| ; Passes a volatile-loaded i1 (from an undef addrspace(1) pointer) to |
| ; @external_void_func_i1_signext. The call site does not repeat the signext |
| ; attribute that the declaration carries, and the unnamed i32 parameter is not |
| ; referenced. Both check sequences expect a global_load_ubyte, a v_and_b32 with |
| ; 1, and a buffer_store_byte at s32 before the call. |
| define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 { |
| ; GFX9-LABEL: test_call_external_void_func_i1_signext: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 |
| ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_i1_signext: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 |
| ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %var = load volatile i1, i1 addrspace(1)* undef |
| call amdgpu_gfx void @external_void_func_i1_signext(i1 %var) |
| ret void |
| } |
| |
| ; Passes a volatile-loaded i1 (from an undef addrspace(1) pointer) to |
| ; @external_void_func_i1_zeroext. The call site does not repeat the zeroext |
| ; attribute that the declaration carries, and the unnamed i32 parameter is not |
| ; referenced. Both check sequences expect a global_load_ubyte, a v_and_b32 with |
| ; 1, and a buffer_store_byte at s32 before the call. |
| define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 { |
| ; GFX9-LABEL: test_call_external_void_func_i1_zeroext: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 |
| ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_i1_zeroext: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 |
| ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %var = load volatile i1, i1 addrspace(1)* undef |
| call amdgpu_gfx void @external_void_func_i1_zeroext(i1 %var) |
| ret void |
| } |
| |
| ; Passes the constant i8 123 to @external_void_func_i8; the unnamed i32 |
| ; parameter is not referenced. Both check sequences expect the constant to be |
| ; moved into v0 as 0x7b before the s_swappc_b64 call, with no store of the |
| ; argument to the stack. |
| define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 { |
| ; GFX9-LABEL: test_call_external_void_func_i8_imm: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_i8_imm: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_i8(i8 123) |
| ret void |
| } |
| |
| ; Passes a volatile-loaded i8 (from an undef addrspace(1) pointer) to |
| ; @external_void_func_i8_signext. The call site does not repeat the signext |
| ; attribute that the declaration carries, and the unnamed i32 parameter is not |
| ; referenced. Both check sequences expect the value to be loaded into v0 with |
| ; global_load_sbyte and no separate extension instruction before the call. |
| define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 { |
| ; GFX9-LABEL: test_call_external_void_func_i8_signext: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: global_load_sbyte v0, v[0:1], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_i8_signext: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: global_load_sbyte v0, v[0:1], off glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %var = load volatile i8, i8 addrspace(1)* undef |
| call amdgpu_gfx void @external_void_func_i8_signext(i8 %var) |
| ret void |
| } |
| |
| ; Passes a volatile-loaded i8 (from an undef addrspace(1) pointer) to |
| ; @external_void_func_i8_zeroext. The call site does not repeat the zeroext |
| ; attribute that the declaration carries, and the unnamed i32 parameter is not |
| ; referenced. Both check sequences expect the value to be loaded into v0 with |
| ; global_load_ubyte and no separate extension instruction before the call. |
| define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 { |
| ; GFX9-LABEL: test_call_external_void_func_i8_zeroext: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_i8_zeroext: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %var = load volatile i8, i8 addrspace(1)* undef |
| call amdgpu_gfx void @external_void_func_i8_zeroext(i8 %var) |
| ret void |
| } |
| |
| ; Passes the constant i16 123 to @external_void_func_i16. Both check sequences |
| ; expect the constant to be moved into v0 as 0x7b before the s_swappc_b64 |
| ; call, with no store of the argument to the stack. |
| define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_i16_imm: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_i16_imm: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_i16(i16 123) |
| ret void |
| } |
| |
| ; Passes a volatile-loaded i16 (from an undef addrspace(1) pointer) to |
| ; @external_void_func_i16_signext. The call site does not repeat the signext |
| ; attribute that the declaration carries, and the unnamed i32 parameter is not |
| ; referenced. Both check sequences expect the value to be loaded into v0 with |
| ; global_load_ushort, the same load the i16 zeroext test expects, and no |
| ; separate extension instruction before the call. |
| define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 { |
| ; GFX9-LABEL: test_call_external_void_func_i16_signext: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_i16_signext: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %var = load volatile i16, i16 addrspace(1)* undef |
| call amdgpu_gfx void @external_void_func_i16_signext(i16 %var) |
| ret void |
| } |
| |
| ; Passes a volatile-loaded i16 (from an undef addrspace(1) pointer) to |
| ; @external_void_func_i16_zeroext. The call site does not repeat the zeroext |
| ; attribute that the declaration carries, and the unnamed i32 parameter is not |
| ; referenced. Both check sequences expect the value to be loaded into v0 with |
| ; global_load_ushort and no separate extension instruction before the call. |
| define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 { |
| ; GFX9-LABEL: test_call_external_void_func_i16_zeroext: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_i16_zeroext: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %var = load volatile i16, i16 addrspace(1)* undef |
| call amdgpu_gfx void @external_void_func_i16_zeroext(i16 %var) |
| ret void |
| } |
| |
| ; Passes the immediate i32 42 to @external_void_func_i32. For both the gfx900 |
| ; and gfx1010 runs the assertions expect the constant to be moved into v0 |
| ; before the s_swappc_b64 call. The function's own unnamed i32 parameter is |
| ; never referenced in the body. |
| define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 { |
| ; GFX9-LABEL: test_call_external_void_func_i32_imm: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 42 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_i32_imm: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 42 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_i32(i32 42) |
| ret void |
| } |
| |
| ; Passes the immediate i64 123 to @external_void_func_i64. The assertions |
| ; expect the value split across two VGPRs before the call: v0 = 0x7b (the low |
| ; 32 bits of 123) and v1 = 0 (the high 32 bits). |
| define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_i64_imm: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_i64_imm: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_i64(i64 123) |
| ret void |
| } |
| |
| ; Loads a <2 x i64> from the null addrspace(1) pointer and passes it to |
| ; @external_void_func_v2i64. The assertions expect the address to be built as |
| ; v[0:1] = 0 and the whole vector to arrive in v[0:3] through a single |
| ; global_load_dwordx4 issued before the call. |
| define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v2i64: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v2i64: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 |
| ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %val = load <2 x i64>, <2 x i64> addrspace(1)* null |
| call amdgpu_gfx void @external_void_func_v2i64(<2 x i64> %val) |
| ret void |
| } |
| |
| ; Passes a constant <2 x i64> to @external_void_func_v2i64. The elements are |
| ; 8589934593 (0x2_00000001) and 17179869187 (0x4_00000003), so the assertions |
| ; expect the four 32-bit halves in v0..v3 as 1, 2, 3, 4 before the call. |
| define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v2i64_imm: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 2 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, 3 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, 4 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v2i64_imm: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 3 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v2i64(<2 x i64> <i64 8589934593, i64 17179869187>) |
| ret void |
| } |
| |
| ; Builds a <3 x i64> from a <2 x i64> loaded from the null addrspace(1) |
| ; pointer plus a constant third element 8589934593 (0x2_00000001), then passes |
| ; it to @external_void_func_v3i64. The assertions expect the loaded part in |
| ; v[0:3] (one global_load_dwordx4) and the constant element as v4 = 1, v5 = 2. |
| define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v3i64: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v4, 1 |
| ; GFX9-NEXT: v_mov_b32_e32 v5, 2 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v3i64: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v4, 1 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %load = load <2 x i64>, <2 x i64> addrspace(1)* null |
| %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 undef>, <3 x i32> <i32 0, i32 1, i32 2> |
| |
| call amdgpu_gfx void @external_void_func_v3i64(<3 x i64> %val) |
| ret void |
| } |
| |
| ; Builds a <4 x i64> by concatenating a <2 x i64> loaded from the null |
| ; addrspace(1) pointer with the constants 8589934593 (0x2_00000001) and |
| ; 17179869187 (0x4_00000003), then passes it to @external_void_func_v4i64. |
| ; The assertions expect the loaded half in v[0:3] (one global_load_dwordx4) |
| ; and the constant half as v4..v7 = 1, 2, 3, 4. |
| define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v4i64: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v4, 1 |
| ; GFX9-NEXT: v_mov_b32_e32 v5, 2 |
| ; GFX9-NEXT: v_mov_b32_e32 v6, 3 |
| ; GFX9-NEXT: v_mov_b32_e32 v7, 4 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v4i64: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v4, 1 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v6, 3 |
| ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v7, 4 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %load = load <2 x i64>, <2 x i64> addrspace(1)* null |
| %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 17179869187>, <4 x i32> <i32 0, i32 1, i32 2, i32 3> |
| call amdgpu_gfx void @external_void_func_v4i64(<4 x i64> %val) |
| ret void |
| } |
| |
| ; Passes the immediate half 4.0 to @external_void_func_f16. The assertions |
| ; expect v0 to be set to 0x4400, the 16-bit encoding of 4.0, before the call. |
| define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_f16_imm: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_f16_imm: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0x4400 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_f16(half 4.0) |
| ret void |
| } |
| |
| ; Passes the immediate float 4.0 to @external_void_func_f32. The assertions |
| ; expect a v_mov_b32 of the literal 4.0 into v0 before the call on both |
| ; targets. |
| define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_f32_imm: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 4.0 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_f32_imm: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 4.0 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_f32(float 4.0) |
| ret void |
| } |
| |
| ; Passes the constant <2 x float> <1.0, 2.0> to @external_void_func_v2f32. |
| ; The assertions expect one element per VGPR before the call: v0 = 1.0 and |
| ; v1 = 2.0. |
| define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v2f32_imm: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v2f32_imm: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v2f32(<2 x float> <float 1.0, float 2.0>) |
| ret void |
| } |
| |
| ; Passes the constant <3 x float> <1.0, 2.0, 4.0> to |
| ; @external_void_func_v3f32. The assertions expect one element per VGPR |
| ; before the call: v0 = 1.0, v1 = 2.0, v2 = 4.0. |
| define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v3f32_imm: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, 4.0 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v3f32_imm: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v3f32(<3 x float> <float 1.0, float 2.0, float 4.0>) |
| ret void |
| } |
| |
| ; Passes the constant <5 x float> <1.0, 2.0, 4.0, -1.0, 0.5> to |
| ; @external_void_func_v5f32. The assertions expect one element per VGPR |
| ; before the call: v0 = 1.0, v1 = 2.0, v2 = 4.0, v3 = -1.0, v4 = 0.5. |
| define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v5f32_imm: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, 4.0 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, -1.0 |
| ; GFX9-NEXT: v_mov_b32_e32 v4, 0.5 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v5f32_imm: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, -1.0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v4, 0.5 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v5f32(<5 x float> <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>) |
| ret void |
| } |
| |
| ; Passes the constant double 4.0 to external_void_func_f64 through an amdgpu_gfx call. |
| ; Both targets are expected to materialize the argument as a register pair: |
| ; v0 = 0 (low dword) and v1 = 0x40100000 (high dword). |
| define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_f64_imm: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40100000 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_f64_imm: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0x40100000 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_f64(double 4.0) |
| ret void |
| } |
| |
| ; Passes the constant vector <2 x double> <2.0, 4.0> to external_void_func_v2f64. |
| ; The checks expect four consecutive 32-bit moves into v0-v3: each element is a |
| ; low dword of 0 followed by its high dword (2.0 inline constant, then 0x40100000). |
| define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v2f64_imm: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v2f64_imm: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v2f64(<2 x double> <double 2.0, double 4.0>) |
| ret void |
| } |
| |
| ; Passes the constant vector <3 x double> <2.0, 4.0, 8.0> to external_void_func_v3f64. |
| ; The checks expect six 32-bit moves into v0-v5, one low/high dword pair per element; |
| ; the high dwords are 2.0 (inline constant), 0x40100000 and 0x40200000. |
| define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v3f64_imm: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000 |
| ; GFX9-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v5, 0x40200000 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v3f64_imm: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, 0x40200000 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v3f64(<3 x double> <double 2.0, double 4.0, double 8.0>) |
| ret void |
| } |
| |
| ; Loads a <2 x i16> through an undef addrspace(1) pointer and forwards it to |
| ; external_void_func_v2i16. The checks expect the whole vector to arrive in v0 |
| ; via a single global_load_dword issued before the call sequence is set up. |
| define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v2i16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: global_load_dword v0, v[0:1], off |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v2i16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: global_load_dword v0, v[0:1], off |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %val = load <2 x i16>, <2 x i16> addrspace(1)* undef |
| call amdgpu_gfx void @external_void_func_v2i16(<2 x i16> %val) |
| ret void |
| } |
| |
| ; Loads a <3 x i16> through an undef addrspace(1) pointer and forwards it to |
| ; external_void_func_v3i16. The checks expect the three elements to be fetched |
| ; with one global_load_dwordx2 into v[0:1], with no further repacking before the call. |
| define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v3i16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v3i16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %val = load <3 x i16>, <3 x i16> addrspace(1)* undef |
| call amdgpu_gfx void @external_void_func_v3i16(<3 x i16> %val) |
| ret void |
| } |
| |
| ; Loads a <3 x half> through an undef addrspace(1) pointer and forwards it to |
| ; external_void_func_v3f16. The expected code mirrors the <3 x i16> load case: |
| ; one global_load_dwordx2 into v[0:1] followed by the call sequence. |
| define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v3f16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v3f16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %val = load <3 x half>, <3 x half> addrspace(1)* undef |
| call amdgpu_gfx void @external_void_func_v3f16(<3 x half> %val) |
| ret void |
| } |
| |
| ; Passes the constant vector <3 x i16> <1, 2, 3> to external_void_func_v3i16. |
| ; The checks expect the elements packed two per 32-bit register: |
| ; v0 = 0x20001 (element 1 in the high half, element 0 in the low half) and v1 = 3. |
| define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v3i16_imm: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 3 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v3i16_imm: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 3 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v3i16(<3 x i16> <i16 1, i16 2, i16 3>) |
| ret void |
| } |
| |
| ; Passes the constant vector <3 x half> <1.0, 2.0, 4.0> to external_void_func_v3f16. |
| ; The checks expect the half bit patterns packed two per 32-bit register: |
| ; v0 = 0x40003c00 (2.0 in the high half, 1.0 in the low half) and v1 = 0x4400 (4.0). |
| define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v3f16_imm: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003c00 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v3f16_imm: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0x40003c00 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0x4400 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v3f16(<3 x half> <half 1.0, half 2.0, half 4.0>) |
| ret void |
| } |
| |
| ; Loads a <4 x i16> through an undef addrspace(1) pointer and forwards it to |
| ; external_void_func_v4i16. The checks expect the 64-bit vector to be fetched |
| ; with one global_load_dwordx2 into v[0:1] ahead of the call sequence. |
| define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v4i16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v4i16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %val = load <4 x i16>, <4 x i16> addrspace(1)* undef |
| call amdgpu_gfx void @external_void_func_v4i16(<4 x i16> %val) |
| ret void |
| } |
| |
| ; Passes the constant vector <4 x i16> <1, 2, 3, 4> to external_void_func_v4i16. |
| ; The checks expect the elements packed two per 32-bit register: |
| ; v0 = 0x20001 (elements 1:0) and v1 = 0x40003 (elements 3:2). |
| define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v4i16_imm: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v4i16_imm: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0x40003 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>) |
| ret void |
| } |
| |
| ; Loads a <2 x half> through an undef addrspace(1) pointer and forwards it to |
| ; external_void_func_v2f16. The checks expect both halves to arrive in v0 via |
| ; a single global_load_dword, matching the <2 x i16> load case. |
| define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v2f16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: global_load_dword v0, v[0:1], off |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v2f16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: global_load_dword v0, v[0:1], off |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %val = load <2 x half>, <2 x half> addrspace(1)* undef |
| call amdgpu_gfx void @external_void_func_v2f16(<2 x half> %val) |
| ret void |
| } |
| |
| ; Loads a <2 x i32> from an undef addrspace(1) pointer and passes it by value to @external_void_func_v2i32. |
| ; The checked output for both targets has a single global_load_dwordx2 placing the argument in v[0:1] before the call. |
| define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v2i32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v2i32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %val = load <2 x i32>, <2 x i32> addrspace(1)* undef |
| call amdgpu_gfx void @external_void_func_v2i32(<2 x i32> %val) |
| ret void |
| } |
| |
| ; Passes the constant vector <i32 1, i32 2> by value to @external_void_func_v2i32. |
| ; The checked output for both targets materializes the elements with v_mov_b32 into v0 and v1 before the call. |
| define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v2i32_imm: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 2 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v2i32_imm: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v2i32(<2 x i32> <i32 1, i32 2>) |
| ret void |
| } |
| |
| ; Passes the constant vector <i32 3, i32 4, i32 5> by value to @external_void_func_v3i32. |
| ; The checked output for both targets materializes the elements into v0, v1 and v2 before the call. |
| ; The unnamed i32 parameter of this function is not referenced in the body. |
| define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v3i32_imm: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 3 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 4 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, 5 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v3i32_imm: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 3 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 4 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 5 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v3i32(<3 x i32> <i32 3, i32 4, i32 5>) |
| ret void |
| } |
| |
| ; Passes the constant vector <i32 3, i32 4, i32 5> followed by the scalar i32 6 to @external_void_func_v3i32_i32. |
| ; The checked output for both targets places the vector in v0..v2 and the trailing scalar in v3. |
| ; The unnamed i32 parameter of this function is not referenced in the body. |
| define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v3i32_i32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 3 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 4 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, 5 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, 6 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v3i32_i32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 3 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 4 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 5 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 6 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v3i32_i32(<3 x i32> <i32 3, i32 4, i32 5>, i32 6) |
| ret void |
| } |
| |
| ; Loads a <4 x i32> from an undef addrspace(1) pointer and passes it by value to @external_void_func_v4i32. |
| ; The checked output for both targets has a single global_load_dwordx4 placing the argument in v[0:3] before the call. |
| define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v4i32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v4i32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %val = load <4 x i32>, <4 x i32> addrspace(1)* undef |
| call amdgpu_gfx void @external_void_func_v4i32(<4 x i32> %val) |
| ret void |
| } |
| |
| ; Passes the constant vector <i32 1, i32 2, i32 3, i32 4> by value to @external_void_func_v4i32. |
| ; The checked output for both targets materializes the elements with v_mov_b32 into v0..v3 before the call. |
| define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v4i32_imm: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 2 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, 3 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, 4 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v4i32_imm: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 3 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>) |
| ret void |
| } |
| |
| ; Passes the constant vector <i32 1, i32 2, i32 3, i32 4, i32 5> by value to @external_void_func_v5i32. |
| ; The checked output for both targets materializes the elements with v_mov_b32 into v0..v4 before the call. |
| define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v5i32_imm: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 2 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, 3 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, 4 |
| ; GFX9-NEXT: v_mov_b32_e32 v4, 5 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v5i32_imm: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 3 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v4, 5 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v5i32(<5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5>) |
| ret void |
| } |
| |
| ; Loads a pointer from an undef addrspace(4) location, loads a <8 x i32> through it, and passes the vector to @external_void_func_v8i32. |
| ; The checked output fetches the pointer with s_load_dwordx2 and fills v[0:7] with two global_load_dwordx4 (offsets 0 and 16). |
| ; On gfx1010 the two vector loads are expected under an s_clause 0x1. |
| define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v8i32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX9-NEXT: v_mov_b32_e32 v8, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[4:5] |
| ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[4:5] offset:16 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v8i32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX10-NEXT: v_mov_b32_e32 v8, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[4:5] |
| ; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[4:5] offset:16 |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %ptr = load <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(4)* undef |
| %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr |
| call amdgpu_gfx void @external_void_func_v8i32(<8 x i32> %val) |
| ret void |
| } |
| |
| ; Passes the constant vector <i32 1, ..., i32 8> by value to @external_void_func_v8i32. |
| ; The checked output for both targets materializes the elements with v_mov_b32 into v0..v7 before the call. |
| define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v8i32_imm: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 2 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, 3 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, 4 |
| ; GFX9-NEXT: v_mov_b32_e32 v4, 5 |
| ; GFX9-NEXT: v_mov_b32_e32 v5, 6 |
| ; GFX9-NEXT: v_mov_b32_e32 v6, 7 |
| ; GFX9-NEXT: v_mov_b32_e32 v7, 8 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v8i32_imm: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 3 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v4, 5 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, 6 |
| ; GFX10-NEXT: v_mov_b32_e32 v6, 7 |
| ; GFX10-NEXT: v_mov_b32_e32 v7, 8 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v8i32(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>) |
| ret void |
| } |
| |
| ; Loads a pointer from an undef addrspace(4) location, loads a <16 x i32> through it, and passes the vector to @external_void_func_v16i32. |
| ; The checked output fetches the pointer with s_load_dwordx2 and fills v[0:15] with four global_load_dwordx4 (offsets 0, 16, 32, 48). |
| ; On gfx1010 the four vector loads are expected under an s_clause 0x3. |
| define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v16i32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX9-NEXT: v_mov_b32_e32 v16, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] |
| ; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:16 |
| ; GFX9-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:32 |
| ; GFX9-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:48 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v16i32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX10-NEXT: v_mov_b32_e32 v16, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_clause 0x3 |
| ; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] |
| ; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:16 |
| ; GFX10-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:32 |
| ; GFX10-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:48 |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %ptr = load <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(4)* undef |
| %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr |
| call amdgpu_gfx void @external_void_func_v16i32(<16 x i32> %val) |
| ret void |
| } |
| |
| ; Loads a pointer from an undef addrspace(4) location, loads a <32 x i32> through it, and passes the vector to @external_void_func_v32i32. |
| ; The checked output fetches the pointer with s_load_dwordx2 and fills v[0:31] with eight global_load_dwordx4 (offsets 0 through 112). |
| ; The gfx900 output keeps the zero offset in v28, so the last load reads v28 and overwrites v[28:31], preceded by an s_nop 0. |
| ; The gfx1010 output keeps the zero offset in v32 instead and issues all eight loads under an s_clause 0x7. |
| define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v32i32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX9-NEXT: v_mov_b32_e32 v28, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[4:5] |
| ; GFX9-NEXT: global_load_dwordx4 v[4:7], v28, s[4:5] offset:16 |
| ; GFX9-NEXT: global_load_dwordx4 v[8:11], v28, s[4:5] offset:32 |
| ; GFX9-NEXT: global_load_dwordx4 v[12:15], v28, s[4:5] offset:48 |
| ; GFX9-NEXT: global_load_dwordx4 v[16:19], v28, s[4:5] offset:64 |
| ; GFX9-NEXT: global_load_dwordx4 v[20:23], v28, s[4:5] offset:80 |
| ; GFX9-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96 |
| ; GFX9-NEXT: s_nop 0 |
| ; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v32i32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX10-NEXT: v_mov_b32_e32 v32, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_clause 0x7 |
| ; GFX10-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] |
| ; GFX10-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:16 |
| ; GFX10-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:32 |
| ; GFX10-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 |
| ; GFX10-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:64 |
| ; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:80 |
| ; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:96 |
| ; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:112 |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef |
| %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr |
| call amdgpu_gfx void @external_void_func_v32i32(<32 x i32> %val) |
| ret void |
| } |
| |
; Calls @external_void_func_v32i32_i32 with a <32 x i32> loaded from global
; memory plus one extra i32 loaded from an undef global pointer.
; The expected output for both run lines loads the vector into v[0:31] with
; eight dwordx4 loads and stores the extra i32 to the stack at s32 right
; before the s_swappc_b64 call.
| define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v32i32_i32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX9-NEXT: v_mov_b32_e32 v28, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[4:5] |
| ; GFX9-NEXT: global_load_dwordx4 v[4:7], v28, s[4:5] offset:16 |
| ; GFX9-NEXT: global_load_dwordx4 v[8:11], v28, s[4:5] offset:32 |
| ; GFX9-NEXT: global_load_dwordx4 v[12:15], v28, s[4:5] offset:48 |
| ; GFX9-NEXT: global_load_dwordx4 v[16:19], v28, s[4:5] offset:64 |
| ; GFX9-NEXT: global_load_dwordx4 v[20:23], v28, s[4:5] offset:80 |
| ; GFX9-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96 |
| ; GFX9-NEXT: s_nop 0 |
| ; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(7) |
| ; GFX9-NEXT: global_load_dword v32, v[0:1], off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v32i32_i32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX10-NEXT: v_mov_b32_e32 v32, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: global_load_dword v33, v[0:1], off |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_clause 0x7 |
| ; GFX10-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] |
| ; GFX10-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:16 |
| ; GFX10-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:32 |
| ; GFX10-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 |
| ; GFX10-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:64 |
| ; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:80 |
| ; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:96 |
| ; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:112 |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 |
| ; GFX10-NEXT: s_waitcnt vmcnt(8) |
| ; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef |
| %val0 = load <32 x i32>, <32 x i32> addrspace(1)* %ptr0 |
| %val1 = load i32, i32 addrspace(1)* undef |
| call amdgpu_gfx void @external_void_func_v32i32_i32(<32 x i32> %val0, i32 %val1) |
| ret void |
| } |
| |
; Calls @external_i32_func_i32 with the constant 42 and volatile-stores the
; returned i32 to %out.
; In the expected output the two dwords of %out are copied from v0/v1 into
; v40/v41 before the call and used as the store address afterwards, so v40
; and v41 are spilled at s33 offsets 4 and 0 and reloaded before returning.
| define amdgpu_gfx void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %out) #0 { |
| ; GFX9-LABEL: test_call_external_i32_func_i32_imm: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v42, s33, 2 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: v_writelane_b32 v42, s30, 0 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v40, v0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 42 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v42, s31, 1 |
| ; GFX9-NEXT: v_mov_b32_e32 v41, v1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: global_store_dword v[40:41], v0, off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| ; GFX9-NEXT: v_readlane_b32 s4, v42, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v42, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v42, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_i32_func_i32_imm: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v42, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX10-NEXT: v_mov_b32_e32 v40, v0 |
| ; GFX10-NEXT: v_writelane_b32 v42, s30, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 42 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 |
| ; GFX10-NEXT: v_mov_b32_e32 v41, v1 |
| ; GFX10-NEXT: v_writelane_b32 v42, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: global_store_dword v[40:41], v0, off |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 |
| ; GFX10-NEXT: v_readlane_b32 s4, v42, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v42, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v42, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %val = call amdgpu_gfx i32 @external_i32_func_i32(i32 42) |
| store volatile i32 %val, i32 addrspace(1)* %out |
| ret void |
| } |
| |
; Passes a { i8, i32 } aggregate, loaded from global memory, by value to
; @external_void_func_struct_i8_i32.
; The expected output loads the i8 field into v0 with global_load_ubyte and
; the i32 field from offset 4 into v1 before the call.
| define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_struct_i8_i32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:4 |
| ; GFX9-NEXT: global_load_ubyte v0, v2, s[4:5] |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_struct_i8_i32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: global_load_ubyte v0, v2, s[4:5] |
| ; GFX10-NEXT: global_load_dword v1, v2, s[4:5] offset:4 |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef |
| %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0 |
| call amdgpu_gfx void @external_void_func_struct_i8_i32({ i8, i32 } %val) |
| ret void |
| } |
| |
; Builds a { i8, i32 } with fields 3 and 8 in an addrspace(5) alloca and
; passes the alloca pointer to @external_void_func_byval_struct_i8_i32.
; The expected output stores the two fields at s33 offsets 0 and 4 and
; derives the argument in v0 from s33 with a right shift (by 6 for the
; gfx900 run, by 5 for the gfx1010 run).
; NOTE(review): the call site carries no byval attribute although the callee
; name suggests one; confirm against the callee declaration that passing a
; plain pointer is intended here.
| define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_byval_struct_i8_i32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 3 |
| ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 8 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 |
| ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_byval_struct_i8_i32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 3 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 8 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 |
| ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 |
| ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %val = alloca { i8, i32 }, align 4, addrspace(5) |
| %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %val, i32 0, i32 0 |
| %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %val, i32 0, i32 1 |
| store i8 3, i8 addrspace(5)* %gep0 |
| store i32 8, i32 addrspace(5)* %gep1 |
| call amdgpu_gfx void @external_void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* %val) |
| ret void |
| } |
| |
; Passes two addrspace(5) allocas to the callee: %out.val as the first
; argument and %in.val (initialised to fields 3 and 8) as the second. After
; the call both fields of %out.val are loaded and volatile-stored to undef
; global pointers.
; In the expected output v1 is s33 shifted right and v0 is the same value
; plus 8; the results are read back from s33 offsets 8 and 12.
; NOTE(review): the call site carries no sret/byval attributes although the
; callee name suggests them; confirm against the callee declaration.
| define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(i32) #0 { |
| ; GFX9-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 3 |
| ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 8 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 |
| ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x800 |
| ; GFX9-NEXT: v_add_u32_e32 v0, 8, v0 |
| ; GFX9-NEXT: v_lshrrev_b32_e64 v1, 6, s33 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:8 |
| ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x800 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: global_store_byte v[0:1], v0, off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: global_store_dword v[0:1], v1, off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 3 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 8 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 |
| ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 |
| ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_lshrrev_b32_e64 v1, 5, s33 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 |
| ; GFX10-NEXT: v_add_nc_u32_e32 v0, 8, v0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:8 |
| ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: global_store_byte v[0:1], v0, off |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: global_store_dword v[0:1], v1, off |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %in.val = alloca { i8, i32 }, align 4, addrspace(5) |
| %out.val = alloca { i8, i32 }, align 4, addrspace(5) |
| %in.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %in.val, i32 0, i32 0 |
| %in.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %in.val, i32 0, i32 1 |
| store i8 3, i8 addrspace(5)* %in.gep0 |
| store i32 8, i32 addrspace(5)* %in.gep1 |
| call amdgpu_gfx void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 } addrspace(5)* %out.val, { i8, i32 } addrspace(5)* %in.val) |
| %out.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %out.val, i32 0, i32 0 |
| %out.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %out.val, i32 0, i32 1 |
| %out.val0 = load i8, i8 addrspace(5)* %out.gep0 |
| %out.val1 = load i32, i32 addrspace(5)* %out.gep1 |
| |
| store volatile i8 %out.val0, i8 addrspace(1)* undef |
| store volatile i32 %out.val1, i32 addrspace(1)* undef |
| ret void |
| } |
| |
; Passes a <16 x i8> loaded from global memory to @external_void_func_v16i8.
; The expected output loads the vector as a single dwordx4 into v[0:3] and
; then spreads it across v0-v15 using right shifts by 8, 16 and 24 plus
; register moves; v16-v18 serve as temporaries for the pieces taken from v0.
| define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v16i8: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[4:5] |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v0 |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v0 |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 24, v0 |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 |
| ; GFX9-NEXT: v_mov_b32_e32 v4, v1 |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v2 |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v2 |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 |
| ; GFX9-NEXT: v_mov_b32_e32 v8, v2 |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v3 |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v3 |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v3 |
| ; GFX9-NEXT: v_mov_b32_e32 v12, v3 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, v16 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, v17 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, v18 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v16i8: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[4:5] |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v16, 8, v0 |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v0 |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v18, 24, v0 |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v1 |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 |
| ; GFX10-NEXT: v_mov_b32_e32 v4, v1 |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v2 |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v2 |
| ; GFX10-NEXT: v_mov_b32_e32 v8, v2 |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v13, 8, v3 |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v3 |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v15, 24, v3 |
| ; GFX10-NEXT: v_mov_b32_e32 v12, v3 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, v16 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, v17 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, v18 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %ptr = load <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(4)* undef |
| %val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr |
| call amdgpu_gfx void @external_void_func_v16i8(<16 x i8> %val) |
| ret void |
| } |
| |
; Tail-calls @byval_align16_f64_arg, forwarding %val unchanged and passing a
; fresh double alloca as a byval(double) align 16 argument; %tmp is unused.
; Unlike the neighbouring tests, this caller is defined without amdgpu_gfx
; while the call itself uses amdgpu_gfx.
; The expected output sets up no frame: it copies two dwords from s32
; offsets 8 and 12 to s32 offsets 0 and 4 and then jumps with s_setpc_b64
; instead of calling with s_swappc_b64.
| define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { |
| ; GFX9-LABEL: tail_call_byval_align16: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 |
| ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 |
| ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: tail_call_byval_align16: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 |
| ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 |
| ; GFX10-NEXT: s_waitcnt vmcnt(1) |
| ; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| entry: |
| %alloca = alloca double, align 8, addrspace(5) |
| tail call amdgpu_gfx void @byval_align16_f64_arg(<32 x i32> %val, double addrspace(5)* byval(double) align 16 %alloca) |
| ret void |
| } |
| |
| ; inreg arguments are put in sgprs |
; Calls @external_void_func_i1_inreg with the constant i1 true.
; The expected output materialises 1 in v0 and writes it as a byte to the
; stack at s32 immediately before the call.
; NOTE(review): the call site has no inreg attribute and the declaration of
; the callee is not visible from here; confirm that the stack-passed i1 is
; the intended lowering for this case.
| define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_i1_imm_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_i1_imm_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_i1_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_i1_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_i1_inreg(i1 true) |
| ret void |
| } |
| |
| ; Calls @external_void_func_i8_inreg with the constant i8 123. |
| ; The checks for both targets expect the value in s4 (s_movk_i32 s4, 0x7b) |
| ; and the callee address built in s[6:7] before s_swappc_b64. |
| ; The unnamed i32 parameter is never referenced in the body. |
| ; NOTE(review): the call site carries no inreg attribute; presumably the |
| ; callee declaration (outside this view) supplies it -- confirm. |
| define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { |
| ; GFX9-LABEL: test_call_external_void_func_i8_imm_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_movk_i32 s4, 0x7b |
| ; GFX9-NEXT: s_getpc_b64 s[6:7] |
| ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_i8_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_i8_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_i8_imm_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_movk_i32 s4, 0x7b |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[6:7] |
| ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_i8_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_i8_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| ; 123 == 0x7b, the immediate the checks above expect in s4. |
| call amdgpu_gfx void @external_void_func_i8_inreg(i8 123) |
| ret void |
| } |
| |
| ; Calls @external_void_func_i16_inreg with the constant i16 123. |
| ; The checks for both targets expect the value in s4 (s_movk_i32 s4, 0x7b) |
| ; and the callee address built in s[6:7] before s_swappc_b64. |
| ; NOTE(review): the call site carries no inreg attribute; presumably the |
| ; callee declaration (outside this view) supplies it -- confirm. |
| define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_i16_imm_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_movk_i32 s4, 0x7b |
| ; GFX9-NEXT: s_getpc_b64 s[6:7] |
| ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_i16_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_i16_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_i16_imm_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_movk_i32 s4, 0x7b |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[6:7] |
| ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_i16_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_i16_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| ; 123 == 0x7b, the immediate the checks above expect in s4. |
| call amdgpu_gfx void @external_void_func_i16_inreg(i16 123) |
| ret void |
| } |
| |
| ; Calls @external_void_func_i32_inreg with the constant i32 42. |
| ; The checks for both targets expect the value in s4 (s_mov_b32 s4, 42) |
| ; and the callee address built in s[6:7] before s_swappc_b64. |
| ; The unnamed i32 parameter is never referenced in the body. |
| ; NOTE(review): the call site carries no inreg attribute; presumably the |
| ; callee declaration (outside this view) supplies it -- confirm. |
| define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { |
| ; GFX9-LABEL: test_call_external_void_func_i32_imm_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_mov_b32 s4, 42 |
| ; GFX9-NEXT: s_getpc_b64 s[6:7] |
| ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_i32_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_i32_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_i32_imm_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s4, 42 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[6:7] |
| ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_i32_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_i32_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| ; 42 is the immediate the checks above expect in s4. |
| call amdgpu_gfx void @external_void_func_i32_inreg(i32 42) |
| ret void |
| } |
| |
| ; Calls @external_void_func_i64_inreg with the constant i64 123. |
| ; The checks for both targets expect the 64-bit value split across s4/s5 |
| ; (s4 = 0x7b, s5 = 0) and the callee address built in s[6:7]. |
| ; NOTE(review): the call site carries no inreg attribute; presumably the |
| ; callee declaration (outside this view) supplies it -- confirm. |
| define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_i64_imm_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_movk_i32 s4, 0x7b |
| ; GFX9-NEXT: s_mov_b32 s5, 0 |
| ; GFX9-NEXT: s_getpc_b64 s[6:7] |
| ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_i64_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_i64_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_i64_imm_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_movk_i32 s4, 0x7b |
| ; GFX10-NEXT: s_mov_b32 s5, 0 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[6:7] |
| ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_i64_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_i64_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| ; 123 == 0x7b: low dword expected in s4, high dword (0) in s5. |
| call amdgpu_gfx void @external_void_func_i64_inreg(i64 123) |
| ret void |
| } |
| |
| ; Loads a <2 x i64> from the null addrspace(4) pointer and passes it to |
| ; @external_void_func_v2i64_inreg. The checks for both targets expect a |
| ; single s_load_dwordx4 into s[4:7] from a zeroed s[4:5] base, with the |
| ; callee address built in s[8:9]. |
| ; NOTE(review): the call site carries no inreg attribute; presumably the |
| ; callee declaration (outside this view) supplies it -- confirm. |
| define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v2i64_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_mov_b64 s[4:5], 0 |
| ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[8:9] |
| ; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v2i64_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v2i64_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_mov_b64 s[4:5], 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[8:9] |
| ; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v2i64_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| ; The 16-byte load below corresponds to the s_load_dwordx4 in the checks. |
| %val = load <2 x i64>, <2 x i64> addrspace(4)* null |
| call amdgpu_gfx void @external_void_func_v2i64_inreg(<2 x i64> %val) |
| ret void |
| } |
| |
| ; Calls @external_void_func_v2i64_inreg with a constant <2 x i64>. |
| ; The checks for both targets expect the four dwords of the vector in |
| ; s4..s7 (values 1, 2, 3, 4) and the callee address built in s[8:9]. |
| ; NOTE(review): the call site carries no inreg attribute; presumably the |
| ; callee declaration (outside this view) supplies it -- confirm. |
| define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v2i64_imm_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_mov_b32 s4, 1 |
| ; GFX9-NEXT: s_mov_b32 s5, 2 |
| ; GFX9-NEXT: s_mov_b32 s6, 3 |
| ; GFX9-NEXT: s_mov_b32 s7, 4 |
| ; GFX9-NEXT: s_getpc_b64 s[8:9] |
| ; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v2i64_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v2i64_imm_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s4, 1 |
| ; GFX10-NEXT: s_mov_b32 s5, 2 |
| ; GFX10-NEXT: s_mov_b32 s6, 3 |
| ; GFX10-NEXT: s_mov_b32 s7, 4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[8:9] |
| ; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v2i64_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| ; 8589934593 == (2 << 32) | 1 and 17179869187 == (4 << 32) | 3, hence the |
| ; dword values 1, 2, 3, 4 expected in s4..s7 above. |
| call amdgpu_gfx void @external_void_func_v2i64_inreg(<2 x i64> <i64 8589934593, i64 17179869187>) |
| ret void |
| } |
| |
| ; Builds a <3 x i64> from a loaded <2 x i64> plus one constant element and |
| ; passes it to @external_void_func_v3i64_inreg. The checks for both targets |
| ; expect the loaded part in s[4:7] (s_load_dwordx4), the constant element in |
| ; s8/s9 (1, 2), and the callee address built in s[10:11]. |
| ; NOTE(review): the call site carries no inreg attribute; presumably the |
| ; callee declaration (outside this view) supplies it -- confirm. |
| define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v3i64_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_mov_b64 s[4:5], 0 |
| ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_mov_b32 s8, 1 |
| ; GFX9-NEXT: s_mov_b32 s9, 2 |
| ; GFX9-NEXT: s_getpc_b64 s[10:11] |
| ; GFX9-NEXT: s_add_u32 s10, s10, external_void_func_v3i64_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s11, s11, external_void_func_v3i64_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[10:11] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v3i64_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_mov_b64 s[4:5], 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 |
| ; GFX10-NEXT: s_mov_b32 s8, 1 |
| ; GFX10-NEXT: s_mov_b32 s9, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[10:11] |
| ; GFX10-NEXT: s_add_u32 s10, s10, external_void_func_v3i64_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s11, s11, external_void_func_v3i64_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[10:11] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| ; Mask <0, 1, 2> keeps both loaded elements and appends element 0 of the |
| ; constant operand: 8589934593 == (2 << 32) | 1, i.e. dwords 1 and 2. The |
| ; undef element of the constant operand is not selected by the mask. |
| %load = load <2 x i64>, <2 x i64> addrspace(4)* null |
| %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 undef>, <3 x i32> <i32 0, i32 1, i32 2> |
| |
| call amdgpu_gfx void @external_void_func_v3i64_inreg(<3 x i64> %val) |
| ret void |
| } |
| |
| ; Builds a <4 x i64> from a loaded <2 x i64> concatenated with a constant |
| ; <2 x i64> and passes it to @external_void_func_v4i64_inreg. The checks for |
| ; both targets expect the loaded half in s[4:7] (s_load_dwordx4), the |
| ; constant half in s8..s11 (1, 2, 3, 4), and the callee address in s[12:13]. |
| ; NOTE(review): the call site carries no inreg attribute; presumably the |
| ; callee declaration (outside this view) supplies it -- confirm. |
| define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v4i64_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_mov_b64 s[4:5], 0 |
| ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_mov_b32 s8, 1 |
| ; GFX9-NEXT: s_mov_b32 s9, 2 |
| ; GFX9-NEXT: s_mov_b32 s10, 3 |
| ; GFX9-NEXT: s_mov_b32 s11, 4 |
| ; GFX9-NEXT: s_getpc_b64 s[12:13] |
| ; GFX9-NEXT: s_add_u32 s12, s12, external_void_func_v4i64_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s13, s13, external_void_func_v4i64_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[12:13] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v4i64_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_mov_b64 s[4:5], 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 |
| ; GFX10-NEXT: s_mov_b32 s8, 1 |
| ; GFX10-NEXT: s_mov_b32 s9, 2 |
| ; GFX10-NEXT: s_mov_b32 s10, 3 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_mov_b32 s11, 4 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[12:13] |
| ; GFX10-NEXT: s_add_u32 s12, s12, external_void_func_v4i64_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s13, s13, external_void_func_v4i64_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[12:13] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| ; Mask <0, 1, 2, 3> concatenates the load with the constant operand: |
| ; 8589934593 == (2 << 32) | 1 and 17179869187 == (4 << 32) | 3, i.e. the |
| ; dword values 1, 2, 3, 4 expected in s8..s11 above. |
| %load = load <2 x i64>, <2 x i64> addrspace(4)* null |
| %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 17179869187>, <4 x i32> <i32 0, i32 1, i32 2, i32 3> |
| call amdgpu_gfx void @external_void_func_v4i64_inreg(<4 x i64> %val) |
| ret void |
| } |
| |
| ; Calls @external_void_func_f16_inreg with the constant half 4.0. |
| ; The checks for both targets expect the half bit pattern in s4 |
| ; (s_movk_i32 s4, 0x4400) and the callee address built in s[6:7]. |
| ; NOTE(review): the call site carries no inreg attribute; presumably the |
| ; callee declaration (outside this view) supplies it -- confirm. |
| define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_f16_imm_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_movk_i32 s4, 0x4400 |
| ; GFX9-NEXT: s_getpc_b64 s[6:7] |
| ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_f16_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_f16_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_f16_imm_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_movk_i32 s4, 0x4400 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[6:7] |
| ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_f16_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_f16_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| ; 0x4400 is the IEEE half-precision encoding of 4.0. |
| call amdgpu_gfx void @external_void_func_f16_inreg(half 4.0) |
| ret void |
| } |
| |
| ; Calls @external_void_func_f32_inreg with the constant float 4.0. |
| ; The checks for both targets expect the value in s4 (s_mov_b32 s4, 4.0) |
| ; and the callee address built in s[6:7] before s_swappc_b64. |
| ; NOTE(review): the call site carries no inreg attribute; presumably the |
| ; callee declaration (outside this view) supplies it -- confirm. |
| define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_f32_imm_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_mov_b32 s4, 4.0 |
| ; GFX9-NEXT: s_getpc_b64 s[6:7] |
| ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_f32_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_f32_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_f32_imm_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s4, 4.0 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[6:7] |
| ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_f32_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_f32_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| ; 4.0 appears directly as the s_mov_b32 operand in the checks above. |
| call amdgpu_gfx void @external_void_func_f32_inreg(float 4.0) |
| ret void |
| } |
| |
| ; Calls @external_void_func_v2f32_inreg with the constant <1.0, 2.0>. |
| ; The checks for both targets expect the two elements in s4 and s5 |
| ; (1.0 and 2.0) and the callee address built in s[6:7]. |
| ; NOTE(review): the call site carries no inreg attribute; presumably the |
| ; callee declaration (outside this view) supplies it -- confirm. |
| define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v2f32_imm_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_mov_b32 s4, 1.0 |
| ; GFX9-NEXT: s_mov_b32 s5, 2.0 |
| ; GFX9-NEXT: s_getpc_b64 s[6:7] |
| ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v2f32_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v2f32_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v2f32_imm_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s4, 1.0 |
| ; GFX10-NEXT: s_mov_b32 s5, 2.0 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[6:7] |
| ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v2f32_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v2f32_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| ; Element 0 (1.0) is expected in s4 and element 1 (2.0) in s5. |
| call amdgpu_gfx void @external_void_func_v2f32_inreg(<2 x float> <float 1.0, float 2.0>) |
| ret void |
| } |
| |
| ; Passes the constant <3 x float> <1.0, 2.0, 4.0> to external_void_func_v3f32_inreg. |
| ; The expected code for both targets sets s4, s5 and s6 with s_mov_b32 and calls |
| ; through s[8:9]; s30, s31 and s33 are kept in lanes 0-2 of v40 across the call. |
| ; The stack pointer s32 is bumped by 0x400 for gfx900 and by 0x200 for gfx1010. |
| ; NOTE(review): the callee declaration is outside this view - presumably its |
| ; parameter is marked inreg; confirm. |
| define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v3f32_imm_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_mov_b32 s4, 1.0 |
| ; GFX9-NEXT: s_mov_b32 s5, 2.0 |
| ; GFX9-NEXT: s_mov_b32 s6, 4.0 |
| ; GFX9-NEXT: s_getpc_b64 s[8:9] |
| ; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v3f32_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v3f32_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v3f32_imm_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s4, 1.0 |
| ; GFX10-NEXT: s_mov_b32 s5, 2.0 |
| ; GFX10-NEXT: s_mov_b32 s6, 4.0 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[8:9] |
| ; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v3f32_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v3f32_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v3f32_inreg(<3 x float> <float 1.0, float 2.0, float 4.0>) |
| ret void |
| } |
| |
| ; Passes the constant <5 x float> <1.0, 2.0, 4.0, -1.0, 0.5> to |
| ; external_void_func_v5f32_inreg. The expected code for both targets sets the five |
| ; elements in s4 through s8 with s_mov_b32 and calls through s[10:11]; s30, s31 |
| ; and s33 are kept in lanes 0-2 of v40 across the call. |
| ; NOTE(review): the callee declaration is outside this view - presumably its |
| ; parameter is marked inreg; confirm. |
| define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v5f32_imm_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_mov_b32 s4, 1.0 |
| ; GFX9-NEXT: s_mov_b32 s5, 2.0 |
| ; GFX9-NEXT: s_mov_b32 s6, 4.0 |
| ; GFX9-NEXT: s_mov_b32 s7, -1.0 |
| ; GFX9-NEXT: s_mov_b32 s8, 0.5 |
| ; GFX9-NEXT: s_getpc_b64 s[10:11] |
| ; GFX9-NEXT: s_add_u32 s10, s10, external_void_func_v5f32_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s11, s11, external_void_func_v5f32_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[10:11] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v5f32_imm_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s4, 1.0 |
| ; GFX10-NEXT: s_mov_b32 s5, 2.0 |
| ; GFX10-NEXT: s_mov_b32 s6, 4.0 |
| ; GFX10-NEXT: s_mov_b32 s7, -1.0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_mov_b32 s8, 0.5 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[10:11] |
| ; GFX10-NEXT: s_add_u32 s10, s10, external_void_func_v5f32_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s11, s11, external_void_func_v5f32_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[10:11] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v5f32_inreg(<5 x float> <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>) |
| ret void |
| } |
| |
| ; Passes the constant double 4.0 to external_void_func_f64_inreg. The expected |
| ; code for both targets sets s4 to 0 and s5 to 0x40100000 (the low and high 32-bit |
| ; halves of the IEEE-754 encoding of 4.0) and calls through s[6:7]; s30, s31 and |
| ; s33 are kept in lanes 0-2 of v40 across the call. |
| ; NOTE(review): the callee declaration is outside this view - presumably its |
| ; parameter is marked inreg; confirm. |
| define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_f64_imm_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_mov_b32 s4, 0 |
| ; GFX9-NEXT: s_mov_b32 s5, 0x40100000 |
| ; GFX9-NEXT: s_getpc_b64 s[6:7] |
| ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_f64_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_f64_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_f64_imm_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s4, 0 |
| ; GFX10-NEXT: s_mov_b32 s5, 0x40100000 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[6:7] |
| ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_f64_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_f64_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_f64_inreg(double 4.0) |
| ret void |
| } |
| |
| ; Passes the constant <2 x double> <2.0, 4.0> to external_void_func_v2f64_inreg. |
| ; The expected code for both targets fills s4 through s7 with the 32-bit halves of |
| ; the two doubles (low half first) and calls through s[8:9]. The high half of 2.0 |
| ; is printed as the inline constant 2.0, whose 32-bit pattern is 0x40000000. |
| ; NOTE(review): the callee declaration is outside this view - presumably its |
| ; parameter is marked inreg; confirm. |
| define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v2f64_imm_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_mov_b32 s4, 0 |
| ; GFX9-NEXT: s_mov_b32 s5, 2.0 |
| ; GFX9-NEXT: s_mov_b32 s6, 0 |
| ; GFX9-NEXT: s_mov_b32 s7, 0x40100000 |
| ; GFX9-NEXT: s_getpc_b64 s[8:9] |
| ; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v2f64_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v2f64_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v2f64_imm_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s4, 0 |
| ; GFX10-NEXT: s_mov_b32 s5, 2.0 |
| ; GFX10-NEXT: s_mov_b32 s6, 0 |
| ; GFX10-NEXT: s_mov_b32 s7, 0x40100000 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[8:9] |
| ; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v2f64_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v2f64_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v2f64_inreg(<2 x double> <double 2.0, double 4.0>) |
| ret void |
| } |
| |
| ; Passes the constant <3 x double> <2.0, 4.0, 8.0> to external_void_func_v3f64_inreg. |
| ; The expected code for both targets fills s4 through s9 with the 32-bit halves of |
| ; the three doubles (low half first; 8.0 has high half 0x40200000) and calls |
| ; through s[10:11]. The high half of 2.0 is printed as the inline constant 2.0, |
| ; whose 32-bit pattern is 0x40000000. |
| ; NOTE(review): the callee declaration is outside this view - presumably its |
| ; parameter is marked inreg; confirm. |
| define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v3f64_imm_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_mov_b32 s4, 0 |
| ; GFX9-NEXT: s_mov_b32 s5, 2.0 |
| ; GFX9-NEXT: s_mov_b32 s6, 0 |
| ; GFX9-NEXT: s_mov_b32 s7, 0x40100000 |
| ; GFX9-NEXT: s_mov_b32 s8, 0 |
| ; GFX9-NEXT: s_mov_b32 s9, 0x40200000 |
| ; GFX9-NEXT: s_getpc_b64 s[10:11] |
| ; GFX9-NEXT: s_add_u32 s10, s10, external_void_func_v3f64_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s11, s11, external_void_func_v3f64_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[10:11] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v3f64_imm_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s4, 0 |
| ; GFX10-NEXT: s_mov_b32 s5, 2.0 |
| ; GFX10-NEXT: s_mov_b32 s6, 0 |
| ; GFX10-NEXT: s_mov_b32 s7, 0x40100000 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_mov_b32 s8, 0 |
| ; GFX10-NEXT: s_mov_b32 s9, 0x40200000 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[10:11] |
| ; GFX10-NEXT: s_add_u32 s10, s10, external_void_func_v3f64_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s11, s11, external_void_func_v3f64_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[10:11] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v3f64_inreg(<3 x double> <double 2.0, double 4.0, double 8.0>) |
| ret void |
| } |
| |
| ; Loads a <2 x i16> through an undef addrspace(4) pointer and passes the loaded |
| ; value to external_void_func_v2i16_inreg. The expected code for both targets |
| ; reads the value with a single s_load_dword into s4 and calls through s[6:7]; |
| ; no s_waitcnt for that load appears before the s_swappc_b64. |
| ; NOTE(review): presumably the wait is left to the s_waitcnt at the callee's |
| ; entry, and the callee (declared outside this view) takes its argument inreg; |
| ; confirm. |
| define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v2i16_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[6:7] |
| ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v2i16_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v2i16_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v2i16_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[6:7] |
| ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v2i16_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v2i16_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %val = load <2 x i16>, <2 x i16> addrspace(4)* undef |
| call amdgpu_gfx void @external_void_func_v2i16_inreg(<2 x i16> %val) |
| ret void |
| } |
| |
| ; Loads a <3 x i16> through an undef addrspace(4) pointer and passes the loaded |
| ; value to external_void_func_v3i16_inreg. The expected code for both targets |
| ; reads the value with one s_load_dwordx2 into s[4:5] and calls through s[6:7]; |
| ; no s_waitcnt for that load appears before the s_swappc_b64. |
| ; NOTE(review): presumably the wait is left to the s_waitcnt at the callee's |
| ; entry, and the callee (declared outside this view) takes its argument inreg; |
| ; confirm. |
| define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v3i16_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[6:7] |
| ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v3i16_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v3i16_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v3i16_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[6:7] |
| ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v3i16_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v3i16_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %val = load <3 x i16>, <3 x i16> addrspace(4)* undef |
| call amdgpu_gfx void @external_void_func_v3i16_inreg(<3 x i16> %val) |
| ret void |
| } |
| |
| ; Loads a <3 x half> through an undef addrspace(4) pointer and passes the loaded |
| ; value to external_void_func_v3f16_inreg. The expected code for both targets |
| ; reads the value with one s_load_dwordx2 into s[4:5] and calls through s[6:7]; |
| ; no s_waitcnt for that load appears before the s_swappc_b64. |
| ; NOTE(review): presumably the wait is left to the s_waitcnt at the callee's |
| ; entry, and the callee (declared outside this view) takes its argument inreg; |
| ; confirm. |
| define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v3f16_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[6:7] |
| ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v3f16_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v3f16_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v3f16_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[6:7] |
| ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v3f16_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v3f16_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %val = load <3 x half>, <3 x half> addrspace(4)* undef |
| call amdgpu_gfx void @external_void_func_v3f16_inreg(<3 x half> %val) |
| ret void |
| } |
| |
| ; Passes the constant <3 x i16> <1, 2, 3> to external_void_func_v3i16_inreg. |
| ; The expected code for both targets packs the first two elements into s4 as |
| ; 0x20001 (element 0 in the low 16 bits, element 1 in the high 16 bits), sets |
| ; s5 to 3, and calls through s[6:7]. |
| ; NOTE(review): the callee declaration is outside this view - presumably its |
| ; parameter is marked inreg; confirm. |
| define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v3i16_imm_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_mov_b32 s4, 0x20001 |
| ; GFX9-NEXT: s_mov_b32 s5, 3 |
| ; GFX9-NEXT: s_getpc_b64 s[6:7] |
| ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v3i16_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v3i16_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v3i16_imm_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s4, 0x20001 |
| ; GFX10-NEXT: s_mov_b32 s5, 3 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[6:7] |
| ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v3i16_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v3i16_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v3i16_inreg(<3 x i16> <i16 1, i16 2, i16 3>) |
| ret void |
| } |
| |
| ; Passes the constant <3 x half> <1.0, 2.0, 4.0> to external_void_func_v3f16_inreg. |
| ; The expected code for both targets packs the first two elements into s4 as |
| ; 0x40003c00 (half 1.0 = 0x3c00 in the low 16 bits, half 2.0 = 0x4000 in the |
| ; high 16 bits), sets s5 to 0x4400 (half 4.0) with s_movk_i32, and calls through |
| ; s[6:7]. |
| ; NOTE(review): the callee declaration is outside this view - presumably its |
| ; parameter is marked inreg; confirm. |
| define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v3f16_imm_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_mov_b32 s4, 0x40003c00 |
| ; GFX9-NEXT: s_movk_i32 s5, 0x4400 |
| ; GFX9-NEXT: s_getpc_b64 s[6:7] |
| ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v3f16_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v3f16_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v3f16_imm_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s4, 0x40003c00 |
| ; GFX10-NEXT: s_movk_i32 s5, 0x4400 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[6:7] |
| ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v3f16_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v3f16_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v3f16_inreg(<3 x half> <half 1.0, half 2.0, half 4.0>) |
| ret void |
| } |
| |
| ; Loads a <4 x i16> through an undef addrspace(4) pointer and passes the loaded |
| ; value to external_void_func_v4i16_inreg. The expected code for both targets |
| ; reads the value with one s_load_dwordx2 into s[4:5] and calls through s[6:7]; |
| ; no s_waitcnt for that load appears before the s_swappc_b64. |
| ; NOTE(review): presumably the wait is left to the s_waitcnt at the callee's |
| ; entry, and the callee (declared outside this view) takes its argument inreg; |
| ; confirm. |
| define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v4i16_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[6:7] |
| ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v4i16_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v4i16_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v4i16_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[6:7] |
| ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v4i16_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v4i16_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %val = load <4 x i16>, <4 x i16> addrspace(4)* undef |
| call amdgpu_gfx void @external_void_func_v4i16_inreg(<4 x i16> %val) |
| ret void |
| } |
| |
| ; Calls external_void_func_v4i16_inreg with the constant <4 x i16> <1, 2, 3, 4>. |
| ; Expected on both targets: the vector is materialized as two packed dwords, |
| ; s4 = 0x20001 (elements 1 and 2) and s5 = 0x40003 (elements 3 and 4), and the |
| ; callee address is formed in s[6:7]. s30, s31 and s33 are kept in lanes 0-2 of |
| ; v40 across the call. The stack bump is 0x400 for gfx900 and 0x200 for gfx1010. |
| ; TODO confirm: the callee declaration is outside this excerpt; its parameter is |
| ; presumably marked inreg, which would explain the SGPR argument registers. |
| define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v4i16_imm_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_mov_b32 s4, 0x20001 |
| ; GFX9-NEXT: s_mov_b32 s5, 0x40003 |
| ; GFX9-NEXT: s_getpc_b64 s[6:7] |
| ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v4i16_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v4i16_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v4i16_imm_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s4, 0x20001 |
| ; GFX10-NEXT: s_mov_b32 s5, 0x40003 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[6:7] |
| ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v4i16_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v4i16_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v4i16_inreg(<4 x i16> <i16 1, i16 2, i16 3, i16 4>) |
| ret void |
| } |
| |
| ; Loads a <2 x half> through an undef addrspace(4) pointer and passes it to |
| ; external_void_func_v2f16_inreg. Expected on both targets: one s_load_dword |
| ; into s4 (base s[4:5]), the callee address formed in s[6:7], and s30, s31 and |
| ; s33 kept in lanes 0-2 of v40 across the call. The stack bump is 0x400 for |
| ; gfx900 and 0x200 for gfx1010. |
| ; TODO confirm: the callee declaration is outside this excerpt; its parameter is |
| ; presumably marked inreg, which would explain the SGPR argument register. |
| define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v2f16_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[6:7] |
| ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v2f16_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v2f16_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v2f16_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[6:7] |
| ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v2f16_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v2f16_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %val = load <2 x half>, <2 x half> addrspace(4)* undef |
| call amdgpu_gfx void @external_void_func_v2f16_inreg(<2 x half> %val) |
| ret void |
| } |
| |
| ; Loads a <2 x i32> through an undef addrspace(4) pointer and passes it to |
| ; external_void_func_v2i32_inreg. Expected on both targets: one s_load_dwordx2 |
| ; into s[4:5], the callee address formed in s[6:7], and s30, s31 and s33 kept |
| ; in lanes 0-2 of v40 across the call. The stack bump is 0x400 for gfx900 and |
| ; 0x200 for gfx1010. |
| ; TODO confirm: the callee declaration is outside this excerpt; its parameter is |
| ; presumably marked inreg, which would explain the SGPR argument registers. |
| define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v2i32_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[6:7] |
| ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v2i32_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v2i32_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v2i32_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[6:7] |
| ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v2i32_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v2i32_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %val = load <2 x i32>, <2 x i32> addrspace(4)* undef |
| call amdgpu_gfx void @external_void_func_v2i32_inreg(<2 x i32> %val) |
| ret void |
| } |
| |
| ; Calls external_void_func_v2i32_inreg with the constant <2 x i32> <1, 2>. |
| ; Expected on both targets: the elements are moved into s4 = 1 and s5 = 2, the |
| ; callee address is formed in s[6:7], and s30, s31 and s33 are kept in lanes 0-2 |
| ; of v40 across the call. The stack bump is 0x400 for gfx900 and 0x200 for |
| ; gfx1010. |
| ; TODO confirm: the callee declaration is outside this excerpt; its parameter is |
| ; presumably marked inreg, which would explain the SGPR argument registers. |
| define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v2i32_imm_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_mov_b32 s4, 1 |
| ; GFX9-NEXT: s_mov_b32 s5, 2 |
| ; GFX9-NEXT: s_getpc_b64 s[6:7] |
| ; GFX9-NEXT: s_add_u32 s6, s6, external_void_func_v2i32_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s7, s7, external_void_func_v2i32_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v2i32_imm_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s4, 1 |
| ; GFX10-NEXT: s_mov_b32 s5, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[6:7] |
| ; GFX10-NEXT: s_add_u32 s6, s6, external_void_func_v2i32_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s7, s7, external_void_func_v2i32_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v2i32_inreg(<2 x i32> <i32 1, i32 2>) |
| ret void |
| } |
| |
| ; Calls external_void_func_v3i32_inreg with the constant <3 x i32> <3, 4, 5>. |
| ; This function's own unnamed i32 parameter is not used by the body. |
| ; Expected on both targets: the elements are moved into s4 = 3, s5 = 4 and |
| ; s6 = 5, the callee address is formed in s[8:9], and s30, s31 and s33 are kept |
| ; in lanes 0-2 of v40 across the call. The stack bump is 0x400 for gfx900 and |
| ; 0x200 for gfx1010. |
| ; TODO confirm: the callee declaration is outside this excerpt; its parameter is |
| ; presumably marked inreg, which would explain the SGPR argument registers. |
| define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v3i32_imm_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_mov_b32 s4, 3 |
| ; GFX9-NEXT: s_mov_b32 s5, 4 |
| ; GFX9-NEXT: s_mov_b32 s6, 5 |
| ; GFX9-NEXT: s_getpc_b64 s[8:9] |
| ; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v3i32_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v3i32_imm_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s4, 3 |
| ; GFX10-NEXT: s_mov_b32 s5, 4 |
| ; GFX10-NEXT: s_mov_b32 s6, 5 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[8:9] |
| ; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v3i32_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v3i32_inreg(<3 x i32> <i32 3, i32 4, i32 5>) |
| ret void |
| } |
| |
| ; Calls external_void_func_v3i32_i32_inreg with the constant <3 x i32> |
| ; <3, 4, 5> followed by the scalar i32 6. This function's own unnamed i32 |
| ; parameter is not used by the body. |
| ; Expected on both targets: the vector occupies s4 = 3, s5 = 4, s6 = 5 and the |
| ; scalar lands in the next register, s7 = 6; the callee address is formed in |
| ; s[8:9]; s30, s31 and s33 are kept in lanes 0-2 of v40 across the call. The |
| ; stack bump is 0x400 for gfx900 and 0x200 for gfx1010. |
| ; TODO confirm: the callee declaration is outside this excerpt; its parameters |
| ; are presumably marked inreg, which would explain the SGPR argument registers. |
| define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v3i32_i32_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_mov_b32 s4, 3 |
| ; GFX9-NEXT: s_mov_b32 s5, 4 |
| ; GFX9-NEXT: s_mov_b32 s6, 5 |
| ; GFX9-NEXT: s_mov_b32 s7, 6 |
| ; GFX9-NEXT: s_getpc_b64 s[8:9] |
| ; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v3i32_i32_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32_i32_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v3i32_i32_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s4, 3 |
| ; GFX10-NEXT: s_mov_b32 s5, 4 |
| ; GFX10-NEXT: s_mov_b32 s6, 5 |
| ; GFX10-NEXT: s_mov_b32 s7, 6 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[8:9] |
| ; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v3i32_i32_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32_i32_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v3i32_i32_inreg(<3 x i32> <i32 3, i32 4, i32 5>, i32 6) |
| ret void |
| } |
| |
| ; Loads a <4 x i32> through an undef addrspace(4) pointer and passes it to |
| ; external_void_func_v4i32_inreg. Expected on both targets: one s_load_dwordx4 |
| ; into s[4:7], the callee address formed in s[8:9], and s30, s31 and s33 kept |
| ; in lanes 0-2 of v40 across the call. The stack bump is 0x400 for gfx900 and |
| ; 0x200 for gfx1010. |
| ; TODO confirm: the callee declaration is outside this excerpt; its parameter is |
| ; presumably marked inreg, which would explain the SGPR argument registers. |
| define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v4i32_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[8:9] |
| ; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v4i32_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v4i32_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[8:9] |
| ; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v4i32_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %val = load <4 x i32>, <4 x i32> addrspace(4)* undef |
| call amdgpu_gfx void @external_void_func_v4i32_inreg(<4 x i32> %val) |
| ret void |
| } |
| |
| ; Calls external_void_func_v4i32_inreg with the constant <4 x i32> <1, 2, 3, 4>. |
| ; Expected on both targets: the elements are moved into s4..s7 in order (1..4), |
| ; the callee address is formed in s[8:9], and s30, s31 and s33 are kept in lanes |
| ; 0-2 of v40 across the call. The stack bump is 0x400 for gfx900 and 0x200 for |
| ; gfx1010. |
| ; TODO confirm: the callee declaration is outside this excerpt; its parameter is |
| ; presumably marked inreg, which would explain the SGPR argument registers. |
| define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v4i32_imm_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_mov_b32 s4, 1 |
| ; GFX9-NEXT: s_mov_b32 s5, 2 |
| ; GFX9-NEXT: s_mov_b32 s6, 3 |
| ; GFX9-NEXT: s_mov_b32 s7, 4 |
| ; GFX9-NEXT: s_getpc_b64 s[8:9] |
| ; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v4i32_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v4i32_imm_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s4, 1 |
| ; GFX10-NEXT: s_mov_b32 s5, 2 |
| ; GFX10-NEXT: s_mov_b32 s6, 3 |
| ; GFX10-NEXT: s_mov_b32 s7, 4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[8:9] |
| ; GFX10-NEXT: s_add_u32 s8, s8, external_void_func_v4i32_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[8:9] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v4i32_inreg(<4 x i32> <i32 1, i32 2, i32 3, i32 4>) |
| ret void |
| } |
| |
| ; Calls external_void_func_v5i32_inreg with the constant <5 x i32> |
| ; <1, 2, 3, 4, 5>. Expected on both targets: the elements are moved into s4..s8 |
| ; in order (1..5), the callee address is formed in s[10:11], and s30, s31 and |
| ; s33 are kept in lanes 0-2 of v40 across the call. The stack bump is 0x400 for |
| ; gfx900 and 0x200 for gfx1010. |
| ; TODO confirm: the callee declaration is outside this excerpt; its parameter is |
| ; presumably marked inreg, which would explain the SGPR argument registers. |
| define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v5i32_imm_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_mov_b32 s4, 1 |
| ; GFX9-NEXT: s_mov_b32 s5, 2 |
| ; GFX9-NEXT: s_mov_b32 s6, 3 |
| ; GFX9-NEXT: s_mov_b32 s7, 4 |
| ; GFX9-NEXT: s_mov_b32 s8, 5 |
| ; GFX9-NEXT: s_getpc_b64 s[10:11] |
| ; GFX9-NEXT: s_add_u32 s10, s10, external_void_func_v5i32_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s11, s11, external_void_func_v5i32_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[10:11] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v5i32_imm_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s4, 1 |
| ; GFX10-NEXT: s_mov_b32 s5, 2 |
| ; GFX10-NEXT: s_mov_b32 s6, 3 |
| ; GFX10-NEXT: s_mov_b32 s7, 4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_mov_b32 s8, 5 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[10:11] |
| ; GFX10-NEXT: s_add_u32 s10, s10, external_void_func_v5i32_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s11, s11, external_void_func_v5i32_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[10:11] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v5i32_inreg(<5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5>) |
| ret void |
| } |
| |
| ; Loads a pointer from an undef addrspace(4) address, loads a <8 x i32> through |
| ; that pointer, and passes the vector to external_void_func_v8i32_inreg. |
| ; Expected on both targets: s_load_dwordx2 into s[4:5] for the pointer, an |
| ; s_waitcnt lgkmcnt(0), then s_load_dwordx8 into s[4:11] for the value; the |
| ; callee address is formed in s[12:13]; s30, s31 and s33 are kept in lanes 0-2 |
| ; of v40 across the call. For gfx900 the dwordx8 load precedes the s_getpc |
| ; sequence; for gfx1010 it sits directly before the call. The stack bump is |
| ; 0x400 for gfx900 and 0x200 for gfx1010. |
| ; TODO confirm: the callee declaration is outside this excerpt; its parameter is |
| ; presumably marked inreg, which would explain the SGPR argument registers. |
| define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v8i32_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 |
| ; GFX9-NEXT: s_getpc_b64 s[12:13] |
| ; GFX9-NEXT: s_add_u32 s12, s12, external_void_func_v8i32_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s13, s13, external_void_func_v8i32_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[12:13] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v8i32_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[12:13] |
| ; GFX10-NEXT: s_add_u32 s12, s12, external_void_func_v8i32_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s13, s13, external_void_func_v8i32_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[12:13] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %ptr = load <8 x i32> addrspace(4)*, <8 x i32> addrspace(4)* addrspace(4)* undef |
| %val = load <8 x i32>, <8 x i32> addrspace(4)* %ptr |
| call amdgpu_gfx void @external_void_func_v8i32_inreg(<8 x i32> %val) |
| ret void |
| } |
| |
| ; Calls external_void_func_v8i32_inreg with the constant <8 x i32> |
| ; <1, 2, 3, 4, 5, 6, 7, 8>. Expected on both targets: the elements are moved |
| ; into s4..s11 in order (1..8), the callee address is formed in s[12:13], and |
| ; s30, s31 and s33 are kept in lanes 0-2 of v40 across the call. The stack bump |
| ; is 0x400 for gfx900 and 0x200 for gfx1010. |
| ; TODO confirm: the callee declaration is outside this excerpt; its parameter is |
| ; presumably marked inreg, which would explain the SGPR argument registers. |
| define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v8i32_imm_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_mov_b32 s4, 1 |
| ; GFX9-NEXT: s_mov_b32 s5, 2 |
| ; GFX9-NEXT: s_mov_b32 s6, 3 |
| ; GFX9-NEXT: s_mov_b32 s7, 4 |
| ; GFX9-NEXT: s_mov_b32 s8, 5 |
| ; GFX9-NEXT: s_mov_b32 s9, 6 |
| ; GFX9-NEXT: s_mov_b32 s10, 7 |
| ; GFX9-NEXT: s_mov_b32 s11, 8 |
| ; GFX9-NEXT: s_getpc_b64 s[12:13] |
| ; GFX9-NEXT: s_add_u32 s12, s12, external_void_func_v8i32_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s13, s13, external_void_func_v8i32_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[12:13] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v8i32_imm_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s4, 1 |
| ; GFX10-NEXT: s_mov_b32 s5, 2 |
| ; GFX10-NEXT: s_mov_b32 s6, 3 |
| ; GFX10-NEXT: s_mov_b32 s7, 4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_mov_b32 s8, 5 |
| ; GFX10-NEXT: s_mov_b32 s9, 6 |
| ; GFX10-NEXT: s_mov_b32 s10, 7 |
| ; GFX10-NEXT: s_mov_b32 s11, 8 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[12:13] |
| ; GFX10-NEXT: s_add_u32 s12, s12, external_void_func_v8i32_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s13, s13, external_void_func_v8i32_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[12:13] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| call amdgpu_gfx void @external_void_func_v8i32_inreg(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>) |
| ret void |
| } |
| |
| ; Calls @external_void_func_v16i32_inreg with a <16 x i32> that is loaded |
| ; through a pointer which is itself loaded from an undef addrspace(4) pointer. |
| ; On both targets the checks expect one s_load_dwordx16 to place all 16 dwords |
| ; in s[4:19], and no stack stores other than the v40 spill. |
| ; NOTE(review): the call site below carries no `inreg`; the SGPR placement in |
| ; the checks presumably comes from the callee declaration - confirm. |
| define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v16i32_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 |
| ; GFX9-NEXT: s_getpc_b64 s[20:21] |
| ; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v16i32_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v16i32_inreg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v16i32_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_getpc_b64 s[20:21] |
| ; GFX10-NEXT: s_add_u32 s20, s20, external_void_func_v16i32_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s21, s21, external_void_func_v16i32_inreg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[20:21] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %ptr = load <16 x i32> addrspace(4)*, <16 x i32> addrspace(4)* addrspace(4)* undef |
| %val = load <16 x i32>, <16 x i32> addrspace(4)* %ptr |
| call amdgpu_gfx void @external_void_func_v16i32_inreg(<16 x i32> %val) |
| ret void |
| } |
| |
| ; Calls @external_void_func_v32i32_inreg with a <32 x i32> loaded through a |
| ; pointer that is itself loaded from an undef addrspace(4) pointer. |
| ; What the checks expect on both targets: |
| ;   - dwords 0-15 are loaded straight into s[4:19]; |
| ;   - dwords 16-31 are loaded into s[36:51]; s36-s45 are then copied to |
| ;     s20-s29, and s46-s51 are moved through VGPRs and stored to the stack at |
| ;     s32 offsets 0 through 20; |
| ;   - s36-s51 are saved in v40 lanes 0-15 before being overwritten and read |
| ;     back after the call; lanes 16/17 hold s30/s31 and lane 18 holds s33. |
| ; NOTE(review): the call site below carries no `inreg`; the SGPR placement in |
| ; the checks presumably comes from the callee declaration - confirm. |
| define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v32i32_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 18 |
| ; GFX9-NEXT: v_writelane_b32 v40, s36, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s37, 1 |
| ; GFX9-NEXT: v_writelane_b32 v40, s38, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s39, 3 |
| ; GFX9-NEXT: v_writelane_b32 v40, s40, 4 |
| ; GFX9-NEXT: v_writelane_b32 v40, s41, 5 |
| ; GFX9-NEXT: v_writelane_b32 v40, s42, 6 |
| ; GFX9-NEXT: v_writelane_b32 v40, s43, 7 |
| ; GFX9-NEXT: v_writelane_b32 v40, s44, 8 |
| ; GFX9-NEXT: v_writelane_b32 v40, s45, 9 |
| ; GFX9-NEXT: v_writelane_b32 v40, s46, 10 |
| ; GFX9-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s47, 11 |
| ; GFX9-NEXT: v_writelane_b32 v40, s48, 12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s49, 13 |
| ; GFX9-NEXT: v_writelane_b32 v40, s50, 14 |
| ; GFX9-NEXT: v_writelane_b32 v40, s51, 15 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 |
| ; GFX9-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 16 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 17 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s46 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s47 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, s48 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, s49 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 |
| ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 |
| ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 |
| ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s50 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s51 |
| ; GFX9-NEXT: s_mov_b32 s20, s36 |
| ; GFX9-NEXT: s_mov_b32 s21, s37 |
| ; GFX9-NEXT: s_mov_b32 s22, s38 |
| ; GFX9-NEXT: s_mov_b32 s23, s39 |
| ; GFX9-NEXT: s_mov_b32 s24, s40 |
| ; GFX9-NEXT: s_mov_b32 s25, s41 |
| ; GFX9-NEXT: s_mov_b32 s26, s42 |
| ; GFX9-NEXT: s_mov_b32 s27, s43 |
| ; GFX9-NEXT: s_mov_b32 s28, s44 |
| ; GFX9-NEXT: s_mov_b32 s29, s45 |
| ; GFX9-NEXT: s_getpc_b64 s[30:31] |
| ; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_inreg@rel32@hi+12 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 16 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 17 |
| ; GFX9-NEXT: v_readlane_b32 s51, v40, 15 |
| ; GFX9-NEXT: v_readlane_b32 s50, v40, 14 |
| ; GFX9-NEXT: v_readlane_b32 s49, v40, 13 |
| ; GFX9-NEXT: v_readlane_b32 s48, v40, 12 |
| ; GFX9-NEXT: v_readlane_b32 s47, v40, 11 |
| ; GFX9-NEXT: v_readlane_b32 s46, v40, 10 |
| ; GFX9-NEXT: v_readlane_b32 s45, v40, 9 |
| ; GFX9-NEXT: v_readlane_b32 s44, v40, 8 |
| ; GFX9-NEXT: v_readlane_b32 s43, v40, 7 |
| ; GFX9-NEXT: v_readlane_b32 s42, v40, 6 |
| ; GFX9-NEXT: v_readlane_b32 s41, v40, 5 |
| ; GFX9-NEXT: v_readlane_b32 s40, v40, 4 |
| ; GFX9-NEXT: v_readlane_b32 s39, v40, 3 |
| ; GFX9-NEXT: v_readlane_b32 s38, v40, 2 |
| ; GFX9-NEXT: v_readlane_b32 s37, v40, 1 |
| ; GFX9-NEXT: v_readlane_b32 s36, v40, 0 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 18 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v32i32_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 18 |
| ; GFX10-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x0 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s36, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s37, 1 |
| ; GFX10-NEXT: v_writelane_b32 v40, s38, 2 |
| ; GFX10-NEXT: v_writelane_b32 v40, s39, 3 |
| ; GFX10-NEXT: v_writelane_b32 v40, s40, 4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s41, 5 |
| ; GFX10-NEXT: v_writelane_b32 v40, s42, 6 |
| ; GFX10-NEXT: v_writelane_b32 v40, s43, 7 |
| ; GFX10-NEXT: v_writelane_b32 v40, s44, 8 |
| ; GFX10-NEXT: v_writelane_b32 v40, s45, 9 |
| ; GFX10-NEXT: v_writelane_b32 v40, s46, 10 |
| ; GFX10-NEXT: v_writelane_b32 v40, s47, 11 |
| ; GFX10-NEXT: v_writelane_b32 v40, s48, 12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s49, 13 |
| ; GFX10-NEXT: v_writelane_b32 v40, s50, 14 |
| ; GFX10-NEXT: v_writelane_b32 v40, s51, 15 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 |
| ; GFX10-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 16 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 17 |
| ; GFX10-NEXT: s_getpc_b64 s[30:31] |
| ; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_inreg@rel32@hi+12 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s46 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s47 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, s48 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, s49 |
| ; GFX10-NEXT: s_mov_b32 s20, s36 |
| ; GFX10-NEXT: s_mov_b32 s21, s37 |
| ; GFX10-NEXT: s_mov_b32 s22, s38 |
| ; GFX10-NEXT: s_mov_b32 s23, s39 |
| ; GFX10-NEXT: s_mov_b32 s24, s40 |
| ; GFX10-NEXT: s_mov_b32 s25, s41 |
| ; GFX10-NEXT: s_mov_b32 s26, s42 |
| ; GFX10-NEXT: s_mov_b32 s27, s43 |
| ; GFX10-NEXT: s_mov_b32 s28, s44 |
| ; GFX10-NEXT: s_mov_b32 s29, s45 |
| ; GFX10-NEXT: v_mov_b32_e32 v4, s50 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, s51 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 |
| ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 |
| ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 |
| ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 |
| ; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 |
| ; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 16 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 17 |
| ; GFX10-NEXT: v_readlane_b32 s51, v40, 15 |
| ; GFX10-NEXT: v_readlane_b32 s50, v40, 14 |
| ; GFX10-NEXT: v_readlane_b32 s49, v40, 13 |
| ; GFX10-NEXT: v_readlane_b32 s48, v40, 12 |
| ; GFX10-NEXT: v_readlane_b32 s47, v40, 11 |
| ; GFX10-NEXT: v_readlane_b32 s46, v40, 10 |
| ; GFX10-NEXT: v_readlane_b32 s45, v40, 9 |
| ; GFX10-NEXT: v_readlane_b32 s44, v40, 8 |
| ; GFX10-NEXT: v_readlane_b32 s43, v40, 7 |
| ; GFX10-NEXT: v_readlane_b32 s42, v40, 6 |
| ; GFX10-NEXT: v_readlane_b32 s41, v40, 5 |
| ; GFX10-NEXT: v_readlane_b32 s40, v40, 4 |
| ; GFX10-NEXT: v_readlane_b32 s39, v40, 3 |
| ; GFX10-NEXT: v_readlane_b32 s38, v40, 2 |
| ; GFX10-NEXT: v_readlane_b32 s37, v40, 1 |
| ; GFX10-NEXT: v_readlane_b32 s36, v40, 0 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 18 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %ptr = load <32 x i32> addrspace(4)*, <32 x i32> addrspace(4)* addrspace(4)* undef |
| %val = load <32 x i32>, <32 x i32> addrspace(4)* %ptr |
| call amdgpu_gfx void @external_void_func_v32i32_inreg(<32 x i32> %val) |
| ret void |
| } |
| |
| ; Calls @external_void_func_v32i32_i32_inreg with a <32 x i32> (loaded through |
| ; a pointer read from an undef addrspace(4) pointer) followed by an i32 loaded |
| ; from an undef addrspace(4) pointer. The function's own unnamed i32 parameter |
| ; is not referenced in the body. |
| ; What the checks expect on both targets: |
| ;   - the vector is split as in the plain <32 x i32> case: dwords 0-15 in |
| ;     s[4:19], s36-s45 copied to s20-s29, and s46-s51 stored to the stack at |
| ;     s32 offsets 0 through 20; |
| ;   - the trailing i32 (loaded into s22) is stored at s32 offset:24, directly |
| ;     after the six spilled vector dwords; |
| ;   - s36-s51, s30/s31 and s33 are saved in v40 lanes 0-18 and read back after |
| ;     the call. |
| ; NOTE(review): the call site below carries no `inreg`; the SGPR placement in |
| ; the checks presumably comes from the callee declaration - confirm. |
| define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { |
| ; GFX9-LABEL: test_call_external_void_func_v32i32_i32_inreg: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 18 |
| ; GFX9-NEXT: v_writelane_b32 v40, s36, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s37, 1 |
| ; GFX9-NEXT: v_writelane_b32 v40, s38, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s39, 3 |
| ; GFX9-NEXT: v_writelane_b32 v40, s40, 4 |
| ; GFX9-NEXT: v_writelane_b32 v40, s41, 5 |
| ; GFX9-NEXT: v_writelane_b32 v40, s42, 6 |
| ; GFX9-NEXT: v_writelane_b32 v40, s43, 7 |
| ; GFX9-NEXT: v_writelane_b32 v40, s44, 8 |
| ; GFX9-NEXT: v_writelane_b32 v40, s45, 9 |
| ; GFX9-NEXT: v_writelane_b32 v40, s46, 10 |
| ; GFX9-NEXT: v_writelane_b32 v40, s47, 11 |
| ; GFX9-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x0 |
| ; GFX9-NEXT: s_load_dword s22, s[4:5], 0x0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s48, 12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s49, 13 |
| ; GFX9-NEXT: v_writelane_b32 v40, s50, 14 |
| ; GFX9-NEXT: v_writelane_b32 v40, s51, 15 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 |
| ; GFX9-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s22 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s46 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s47 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 |
| ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s48 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s49 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s50 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 16 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s51 |
| ; GFX9-NEXT: s_mov_b32 s20, s36 |
| ; GFX9-NEXT: s_mov_b32 s21, s37 |
| ; GFX9-NEXT: s_mov_b32 s22, s38 |
| ; GFX9-NEXT: s_mov_b32 s23, s39 |
| ; GFX9-NEXT: s_mov_b32 s24, s40 |
| ; GFX9-NEXT: s_mov_b32 s25, s41 |
| ; GFX9-NEXT: s_mov_b32 s26, s42 |
| ; GFX9-NEXT: s_mov_b32 s27, s43 |
| ; GFX9-NEXT: s_mov_b32 s28, s44 |
| ; GFX9-NEXT: s_mov_b32 s29, s45 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 17 |
| ; GFX9-NEXT: s_getpc_b64 s[30:31] |
| ; GFX9-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_i32_inreg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_i32_inreg@rel32@hi+12 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[30:31] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 16 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 17 |
| ; GFX9-NEXT: v_readlane_b32 s51, v40, 15 |
| ; GFX9-NEXT: v_readlane_b32 s50, v40, 14 |
| ; GFX9-NEXT: v_readlane_b32 s49, v40, 13 |
| ; GFX9-NEXT: v_readlane_b32 s48, v40, 12 |
| ; GFX9-NEXT: v_readlane_b32 s47, v40, 11 |
| ; GFX9-NEXT: v_readlane_b32 s46, v40, 10 |
| ; GFX9-NEXT: v_readlane_b32 s45, v40, 9 |
| ; GFX9-NEXT: v_readlane_b32 s44, v40, 8 |
| ; GFX9-NEXT: v_readlane_b32 s43, v40, 7 |
| ; GFX9-NEXT: v_readlane_b32 s42, v40, 6 |
| ; GFX9-NEXT: v_readlane_b32 s41, v40, 5 |
| ; GFX9-NEXT: v_readlane_b32 s40, v40, 4 |
| ; GFX9-NEXT: v_readlane_b32 s39, v40, 3 |
| ; GFX9-NEXT: v_readlane_b32 s38, v40, 2 |
| ; GFX9-NEXT: v_readlane_b32 s37, v40, 1 |
| ; GFX9-NEXT: v_readlane_b32 s36, v40, 0 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 18 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: test_call_external_void_func_v32i32_i32_inreg: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 18 |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x0 |
| ; GFX10-NEXT: s_load_dword s22, s[4:5], 0x0 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s36, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s37, 1 |
| ; GFX10-NEXT: v_writelane_b32 v40, s38, 2 |
| ; GFX10-NEXT: v_writelane_b32 v40, s39, 3 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s22 |
| ; GFX10-NEXT: v_writelane_b32 v40, s40, 4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s41, 5 |
| ; GFX10-NEXT: v_writelane_b32 v40, s42, 6 |
| ; GFX10-NEXT: v_writelane_b32 v40, s43, 7 |
| ; GFX10-NEXT: v_writelane_b32 v40, s44, 8 |
| ; GFX10-NEXT: v_writelane_b32 v40, s45, 9 |
| ; GFX10-NEXT: v_writelane_b32 v40, s46, 10 |
| ; GFX10-NEXT: v_writelane_b32 v40, s47, 11 |
| ; GFX10-NEXT: v_writelane_b32 v40, s48, 12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s49, 13 |
| ; GFX10-NEXT: v_writelane_b32 v40, s50, 14 |
| ; GFX10-NEXT: v_writelane_b32 v40, s51, 15 |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: s_load_dwordx16 s[36:51], s[20:21], 0x40 |
| ; GFX10-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 16 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 17 |
| ; GFX10-NEXT: s_getpc_b64 s[30:31] |
| ; GFX10-NEXT: s_add_u32 s30, s30, external_void_func_v32i32_i32_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s31, s31, external_void_func_v32i32_i32_inreg@rel32@hi+12 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s46 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s47 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, s48 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, s49 |
| ; GFX10-NEXT: s_mov_b32 s20, s36 |
| ; GFX10-NEXT: s_mov_b32 s21, s37 |
| ; GFX10-NEXT: s_mov_b32 s22, s38 |
| ; GFX10-NEXT: s_mov_b32 s23, s39 |
| ; GFX10-NEXT: s_mov_b32 s24, s40 |
| ; GFX10-NEXT: s_mov_b32 s25, s41 |
| ; GFX10-NEXT: s_mov_b32 s26, s42 |
| ; GFX10-NEXT: s_mov_b32 s27, s43 |
| ; GFX10-NEXT: s_mov_b32 s28, s44 |
| ; GFX10-NEXT: s_mov_b32 s29, s45 |
| ; GFX10-NEXT: v_mov_b32_e32 v4, s50 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, s51 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 |
| ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 |
| ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 |
| ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 |
| ; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 |
| ; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[30:31] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 16 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 17 |
| ; GFX10-NEXT: v_readlane_b32 s51, v40, 15 |
| ; GFX10-NEXT: v_readlane_b32 s50, v40, 14 |
| ; GFX10-NEXT: v_readlane_b32 s49, v40, 13 |
| ; GFX10-NEXT: v_readlane_b32 s48, v40, 12 |
| ; GFX10-NEXT: v_readlane_b32 s47, v40, 11 |
| ; GFX10-NEXT: v_readlane_b32 s46, v40, 10 |
| ; GFX10-NEXT: v_readlane_b32 s45, v40, 9 |
| ; GFX10-NEXT: v_readlane_b32 s44, v40, 8 |
| ; GFX10-NEXT: v_readlane_b32 s43, v40, 7 |
| ; GFX10-NEXT: v_readlane_b32 s42, v40, 6 |
| ; GFX10-NEXT: v_readlane_b32 s41, v40, 5 |
| ; GFX10-NEXT: v_readlane_b32 s40, v40, 4 |
| ; GFX10-NEXT: v_readlane_b32 s39, v40, 3 |
| ; GFX10-NEXT: v_readlane_b32 s38, v40, 2 |
| ; GFX10-NEXT: v_readlane_b32 s37, v40, 1 |
| ; GFX10-NEXT: v_readlane_b32 s36, v40, 0 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 18 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| %ptr0 = load <32 x i32> addrspace(4)*, <32 x i32> addrspace(4)* addrspace(4)* undef |
| %val0 = load <32 x i32>, <32 x i32> addrspace(4)* %ptr0 |
| %val1 = load i32, i32 addrspace(4)* undef |
| call amdgpu_gfx void @external_void_func_v32i32_i32_inreg(<32 x i32> %val0, i32 %val1) |
| ret void |
| } |
| |
| ; Forwards its own arguments (<32 x i32> %val, double %tmp) unchanged to |
| ; @stack_passed_f64_arg. |
| ; What the checks expect on both targets: |
| ;   - the two dwords of %tmp are read from the incoming stack (s33 offsets 0 |
| ;     and 4) into v32/v33 and written to the outgoing area at s32 offsets 0 |
| ;     and 4; |
| ;   - the v40 spill slot is at s32 offset:8, i.e. after those two dwords; |
| ;   - no VGPR moves are emitted for %val. |
| ; The wait before the second outgoing store is vmcnt(1) on gfx900 and vmcnt(0) |
| ; on gfx1010. |
| define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 { |
| ; GFX9-LABEL: stack_passed_arg_alignment_v32i32_f64: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 |
| ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(1) |
| ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 |
| ; GFX9-NEXT: s_waitcnt vmcnt(1) |
| ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: stack_passed_arg_alignment_v32i32_f64: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 |
| ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_waitcnt vmcnt(1) |
| ; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| entry: |
| call amdgpu_gfx void @stack_passed_f64_arg(<32 x i32> %val, double %tmp) |
| ret void |
| } |
| |
| ; Calls @external_void_func_12xv3i32 with twelve constant <3 x i32> arguments |
| ; (36 dwords in total). |
| ; What the checks expect on both targets: |
| ;   - the first 32 dwords are materialised in v0-v31; the eleventh vector is |
| ;     split, with its first two elements (10, 11) in v30/v31; |
| ;   - the remaining four dwords (12, 13, 14, 15) are stored to the stack at |
| ;     s32 offsets 0, 4, 8 and 12; |
| ;   - v0-v3 are used as scratch for those stack stores before being reloaded |
| ;     with their argument values. |
| define amdgpu_gfx void @stack_12xv3i32() #0 { |
| ; GFX9-LABEL: stack_12xv3i32: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 12 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 13 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 14 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 15 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, 1 |
| ; GFX9-NEXT: v_mov_b32_e32 v4, 1 |
| ; GFX9-NEXT: v_mov_b32_e32 v5, 1 |
| ; GFX9-NEXT: v_mov_b32_e32 v6, 2 |
| ; GFX9-NEXT: v_mov_b32_e32 v7, 2 |
| ; GFX9-NEXT: v_mov_b32_e32 v8, 2 |
| ; GFX9-NEXT: v_mov_b32_e32 v9, 3 |
| ; GFX9-NEXT: v_mov_b32_e32 v10, 3 |
| ; GFX9-NEXT: v_mov_b32_e32 v11, 3 |
| ; GFX9-NEXT: v_mov_b32_e32 v12, 4 |
| ; GFX9-NEXT: v_mov_b32_e32 v13, 4 |
| ; GFX9-NEXT: v_mov_b32_e32 v14, 4 |
| ; GFX9-NEXT: v_mov_b32_e32 v15, 5 |
| ; GFX9-NEXT: v_mov_b32_e32 v16, 5 |
| ; GFX9-NEXT: v_mov_b32_e32 v17, 5 |
| ; GFX9-NEXT: v_mov_b32_e32 v18, 6 |
| ; GFX9-NEXT: v_mov_b32_e32 v19, 6 |
| ; GFX9-NEXT: v_mov_b32_e32 v20, 6 |
| ; GFX9-NEXT: v_mov_b32_e32 v21, 7 |
| ; GFX9-NEXT: v_mov_b32_e32 v22, 7 |
| ; GFX9-NEXT: v_mov_b32_e32 v23, 7 |
| ; GFX9-NEXT: v_mov_b32_e32 v24, 8 |
| ; GFX9-NEXT: v_mov_b32_e32 v25, 8 |
| ; GFX9-NEXT: v_mov_b32_e32 v26, 8 |
| ; GFX9-NEXT: v_mov_b32_e32 v27, 9 |
| ; GFX9-NEXT: v_mov_b32_e32 v28, 9 |
| ; GFX9-NEXT: v_mov_b32_e32 v29, 9 |
| ; GFX9-NEXT: v_mov_b32_e32 v30, 10 |
| ; GFX9-NEXT: v_mov_b32_e32 v31, 11 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: stack_12xv3i32: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 12 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 13 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 14 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 15 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 |
| ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 |
| ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 |
| ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 1 |
| ; GFX10-NEXT: v_mov_b32_e32 v4, 1 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, 1 |
| ; GFX10-NEXT: v_mov_b32_e32 v6, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v7, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v8, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v9, 3 |
| ; GFX10-NEXT: v_mov_b32_e32 v10, 3 |
| ; GFX10-NEXT: v_mov_b32_e32 v11, 3 |
| ; GFX10-NEXT: v_mov_b32_e32 v12, 4 |
| ; GFX10-NEXT: v_mov_b32_e32 v13, 4 |
| ; GFX10-NEXT: v_mov_b32_e32 v14, 4 |
| ; GFX10-NEXT: v_mov_b32_e32 v15, 5 |
| ; GFX10-NEXT: v_mov_b32_e32 v16, 5 |
| ; GFX10-NEXT: v_mov_b32_e32 v17, 5 |
| ; GFX10-NEXT: v_mov_b32_e32 v18, 6 |
| ; GFX10-NEXT: v_mov_b32_e32 v19, 6 |
| ; GFX10-NEXT: v_mov_b32_e32 v20, 6 |
| ; GFX10-NEXT: v_mov_b32_e32 v21, 7 |
| ; GFX10-NEXT: v_mov_b32_e32 v22, 7 |
| ; GFX10-NEXT: v_mov_b32_e32 v23, 7 |
| ; GFX10-NEXT: v_mov_b32_e32 v24, 8 |
| ; GFX10-NEXT: v_mov_b32_e32 v25, 8 |
| ; GFX10-NEXT: v_mov_b32_e32 v26, 8 |
| ; GFX10-NEXT: v_mov_b32_e32 v27, 9 |
| ; GFX10-NEXT: v_mov_b32_e32 v28, 9 |
| ; GFX10-NEXT: v_mov_b32_e32 v29, 9 |
| ; GFX10-NEXT: v_mov_b32_e32 v30, 10 |
| ; GFX10-NEXT: v_mov_b32_e32 v31, 11 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| entry: |
| call amdgpu_gfx void @external_void_func_12xv3i32( |
| <3 x i32><i32 0, i32 0, i32 0>, |
| <3 x i32><i32 1, i32 1, i32 1>, |
| <3 x i32><i32 2, i32 2, i32 2>, |
| <3 x i32><i32 3, i32 3, i32 3>, |
| <3 x i32><i32 4, i32 4, i32 4>, |
| <3 x i32><i32 5, i32 5, i32 5>, |
| <3 x i32><i32 6, i32 6, i32 6>, |
| <3 x i32><i32 7, i32 7, i32 7>, |
| <3 x i32><i32 8, i32 8, i32 8>, |
| <3 x i32><i32 9, i32 9, i32 9>, |
| <3 x i32><i32 10, i32 11, i32 12>, |
| <3 x i32><i32 13, i32 14, i32 15>) |
| ret void |
| } |
| |
| define amdgpu_gfx void @stack_8xv5i32() #0 { |
| ; GFX9-LABEL: stack_8xv5i32: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 8 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 9 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 10 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 11 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 12 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 13 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 14 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 15 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v5, 1 |
| ; GFX9-NEXT: v_mov_b32_e32 v6, 1 |
| ; GFX9-NEXT: v_mov_b32_e32 v7, 1 |
| ; GFX9-NEXT: v_mov_b32_e32 v8, 1 |
| ; GFX9-NEXT: v_mov_b32_e32 v9, 1 |
| ; GFX9-NEXT: v_mov_b32_e32 v10, 2 |
| ; GFX9-NEXT: v_mov_b32_e32 v11, 2 |
| ; GFX9-NEXT: v_mov_b32_e32 v12, 2 |
| ; GFX9-NEXT: v_mov_b32_e32 v13, 2 |
| ; GFX9-NEXT: v_mov_b32_e32 v14, 2 |
| ; GFX9-NEXT: v_mov_b32_e32 v15, 3 |
| ; GFX9-NEXT: v_mov_b32_e32 v16, 3 |
| ; GFX9-NEXT: v_mov_b32_e32 v17, 3 |
| ; GFX9-NEXT: v_mov_b32_e32 v18, 3 |
| ; GFX9-NEXT: v_mov_b32_e32 v19, 3 |
| ; GFX9-NEXT: v_mov_b32_e32 v20, 4 |
| ; GFX9-NEXT: v_mov_b32_e32 v21, 4 |
| ; GFX9-NEXT: v_mov_b32_e32 v22, 4 |
| ; GFX9-NEXT: v_mov_b32_e32 v23, 4 |
| ; GFX9-NEXT: v_mov_b32_e32 v24, 4 |
| ; GFX9-NEXT: v_mov_b32_e32 v25, 5 |
| ; GFX9-NEXT: v_mov_b32_e32 v26, 5 |
| ; GFX9-NEXT: v_mov_b32_e32 v27, 5 |
| ; GFX9-NEXT: v_mov_b32_e32 v28, 5 |
| ; GFX9-NEXT: v_mov_b32_e32 v29, 5 |
| ; GFX9-NEXT: v_mov_b32_e32 v30, 6 |
| ; GFX9-NEXT: v_mov_b32_e32 v31, 7 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: stack_8xv5i32: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 8 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 9 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 10 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 14 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 |
| ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 |
| ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 11 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 12 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 13 |
| ; GFX10-NEXT: v_mov_b32_e32 v4, 15 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 |
| ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 |
| ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 |
| ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 |
| ; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, 1 |
| ; GFX10-NEXT: v_mov_b32_e32 v6, 1 |
| ; GFX10-NEXT: v_mov_b32_e32 v7, 1 |
| ; GFX10-NEXT: v_mov_b32_e32 v8, 1 |
| ; GFX10-NEXT: v_mov_b32_e32 v9, 1 |
| ; GFX10-NEXT: v_mov_b32_e32 v10, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v11, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v12, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v13, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v14, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v15, 3 |
| ; GFX10-NEXT: v_mov_b32_e32 v16, 3 |
| ; GFX10-NEXT: v_mov_b32_e32 v17, 3 |
| ; GFX10-NEXT: v_mov_b32_e32 v18, 3 |
| ; GFX10-NEXT: v_mov_b32_e32 v19, 3 |
| ; GFX10-NEXT: v_mov_b32_e32 v20, 4 |
| ; GFX10-NEXT: v_mov_b32_e32 v21, 4 |
| ; GFX10-NEXT: v_mov_b32_e32 v22, 4 |
| ; GFX10-NEXT: v_mov_b32_e32 v23, 4 |
| ; GFX10-NEXT: v_mov_b32_e32 v24, 4 |
| ; GFX10-NEXT: v_mov_b32_e32 v25, 5 |
| ; GFX10-NEXT: v_mov_b32_e32 v26, 5 |
| ; GFX10-NEXT: v_mov_b32_e32 v27, 5 |
| ; GFX10-NEXT: v_mov_b32_e32 v28, 5 |
| ; GFX10-NEXT: v_mov_b32_e32 v29, 5 |
| ; GFX10-NEXT: v_mov_b32_e32 v30, 6 |
| ; GFX10-NEXT: v_mov_b32_e32 v31, 7 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| entry: |
| call amdgpu_gfx void @external_void_func_8xv5i32( |
| <5 x i32><i32 0, i32 0, i32 0, i32 0, i32 0>, |
| <5 x i32><i32 1, i32 1, i32 1, i32 1, i32 1>, |
| <5 x i32><i32 2, i32 2, i32 2, i32 2, i32 2>, |
| <5 x i32><i32 3, i32 3, i32 3, i32 3, i32 3>, |
| <5 x i32><i32 4, i32 4, i32 4, i32 4, i32 4>, |
| <5 x i32><i32 5, i32 5, i32 5, i32 5, i32 5>, |
| <5 x i32><i32 6, i32 7, i32 8, i32 9, i32 10>, |
| <5 x i32><i32 11, i32 12, i32 13, i32 14, i32 15>) |
| ret void |
| } |
| |
| ; Call-lowering test: passes eight <5 x float> constants (40 dwords in total) |
| ; to external_void_func_8xv5f32. Per the assertions below, the first 32 dwords |
| ; (0.0 through 7.0) are materialized in v0-v31, and the remaining eight dwords |
| ; (8.0 through 15.0, i.e. 0x41000000 through 0x41700000) are stored to the |
| ; stack at s32 offsets 0 through 28 before the call. |
| ; The assertions are autogenerated; refresh them with |
| ; utils/update_llc_test_checks.py rather than editing them by hand. |
| define amdgpu_gfx void @stack_8xv5f32() #0 { |
| ; GFX9-LABEL: stack_8xv5f32: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_add_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41000000 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41100000 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41200000 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41300000 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41400000 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41500000 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41600000 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41700000 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v5, 1.0 |
| ; GFX9-NEXT: v_mov_b32_e32 v6, 1.0 |
| ; GFX9-NEXT: v_mov_b32_e32 v7, 1.0 |
| ; GFX9-NEXT: v_mov_b32_e32 v8, 1.0 |
| ; GFX9-NEXT: v_mov_b32_e32 v9, 1.0 |
| ; GFX9-NEXT: v_mov_b32_e32 v10, 2.0 |
| ; GFX9-NEXT: v_mov_b32_e32 v11, 2.0 |
| ; GFX9-NEXT: v_mov_b32_e32 v12, 2.0 |
| ; GFX9-NEXT: v_mov_b32_e32 v13, 2.0 |
| ; GFX9-NEXT: v_mov_b32_e32 v14, 2.0 |
| ; GFX9-NEXT: v_mov_b32_e32 v15, 0x40400000 |
| ; GFX9-NEXT: v_mov_b32_e32 v16, 0x40400000 |
| ; GFX9-NEXT: v_mov_b32_e32 v17, 0x40400000 |
| ; GFX9-NEXT: v_mov_b32_e32 v18, 0x40400000 |
| ; GFX9-NEXT: v_mov_b32_e32 v19, 0x40400000 |
| ; GFX9-NEXT: v_mov_b32_e32 v20, 4.0 |
| ; GFX9-NEXT: v_mov_b32_e32 v21, 4.0 |
| ; GFX9-NEXT: v_mov_b32_e32 v22, 4.0 |
| ; GFX9-NEXT: v_mov_b32_e32 v23, 4.0 |
| ; GFX9-NEXT: v_mov_b32_e32 v24, 4.0 |
| ; GFX9-NEXT: v_mov_b32_e32 v25, 0x40a00000 |
| ; GFX9-NEXT: v_mov_b32_e32 v26, 0x40a00000 |
| ; GFX9-NEXT: v_mov_b32_e32 v27, 0x40a00000 |
| ; GFX9-NEXT: v_mov_b32_e32 v28, 0x40a00000 |
| ; GFX9-NEXT: v_mov_b32_e32 v29, 0x40a00000 |
| ; GFX9-NEXT: v_mov_b32_e32 v30, 0x40c00000 |
| ; GFX9-NEXT: v_mov_b32_e32 v31, 0x40e00000 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 |
| ; GFX9-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[4:5] |
| ; |
| ; GFX10-LABEL: stack_8xv5f32: |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0x41000000 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0x41100000 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 0x41200000 |
| ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_add_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 0x41600000 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 |
| ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 |
| ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0x41300000 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0x41400000 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 0x41500000 |
| ; GFX10-NEXT: v_mov_b32_e32 v4, 0x41700000 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 |
| ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 |
| ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 |
| ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 |
| ; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v4, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, 1.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v6, 1.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v7, 1.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v8, 1.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v9, 1.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v10, 2.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v11, 2.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v12, 2.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v13, 2.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v14, 2.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v15, 0x40400000 |
| ; GFX10-NEXT: v_mov_b32_e32 v16, 0x40400000 |
| ; GFX10-NEXT: v_mov_b32_e32 v17, 0x40400000 |
| ; GFX10-NEXT: v_mov_b32_e32 v18, 0x40400000 |
| ; GFX10-NEXT: v_mov_b32_e32 v19, 0x40400000 |
| ; GFX10-NEXT: v_mov_b32_e32 v20, 4.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v21, 4.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v22, 4.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v23, 4.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v24, 4.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v25, 0x40a00000 |
| ; GFX10-NEXT: v_mov_b32_e32 v26, 0x40a00000 |
| ; GFX10-NEXT: v_mov_b32_e32 v27, 0x40a00000 |
| ; GFX10-NEXT: v_mov_b32_e32 v28, 0x40a00000 |
| ; GFX10-NEXT: v_mov_b32_e32 v29, 0x40a00000 |
| ; GFX10-NEXT: v_mov_b32_e32 v30, 0x40c00000 |
| ; GFX10-NEXT: v_mov_b32_e32 v31, 0x40e00000 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 |
| ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 |
| ; GFX10-NEXT: v_readlane_b32 s33, v40, 2 |
| ; GFX10-NEXT: s_or_saveexec_b32 s6, -1 |
| ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[4:5] |
| ; The first six vectors are splats of 0.0 through 5.0; the last two hold the |
| ; distinct values 6.0 through 15.0, so each of those elements can be matched |
| ; to its own register or stack slot in the assertions above. |
| entry: |
| call amdgpu_gfx void @external_void_func_8xv5f32( |
| <5 x float><float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, |
| <5 x float><float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, |
| <5 x float><float 2.0, float 2.0, float 2.0, float 2.0, float 2.0>, |
| <5 x float><float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, |
| <5 x float><float 4.0, float 4.0, float 4.0, float 4.0, float 4.0>, |
| <5 x float><float 5.0, float 5.0, float 5.0, float 5.0, float 5.0>, |
| <5 x float><float 6.0, float 7.0, float 8.0, float 9.0, float 10.0>, |
| <5 x float><float 11.0, float 12.0, float 13.0, float 14.0, float 15.0>) |
| ret void |
| } |
| |
| ; External amdgpu_gfx callees. Bodies are not provided in this file, so calls |
| ; to them cannot be inlined. |
| declare hidden amdgpu_gfx void @byval_align16_f64_arg( |
|     <32 x i32>, |
|     double addrspace(5)* byval(double) align 16) #0 |
| declare hidden amdgpu_gfx void @stack_passed_f64_arg( |
|     <32 x i32>, |
|     double) #0 |
| |
| ; Twelve <3 x i32> parameters. |
| declare hidden amdgpu_gfx void @external_void_func_12xv3i32( |
|     <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, |
|     <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, |
|     <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>) #0 |
| |
| ; Eight <5 x i32> parameters. |
| declare hidden amdgpu_gfx void @external_void_func_8xv5i32( |
|     <5 x i32>, <5 x i32>, <5 x i32>, <5 x i32>, |
|     <5 x i32>, <5 x i32>, <5 x i32>, <5 x i32>) #0 |
| |
| ; Twelve <3 x float> parameters. |
| declare hidden amdgpu_gfx void @external_void_func_12xv3f32( |
|     <3 x float>, <3 x float>, <3 x float>, <3 x float>, |
|     <3 x float>, <3 x float>, <3 x float>, <3 x float>, |
|     <3 x float>, <3 x float>, <3 x float>, <3 x float>) #0 |
| |
| ; Eight <5 x float> parameters. |
| declare hidden amdgpu_gfx void @external_void_func_8xv5f32( |
|     <5 x float>, <5 x float>, <5 x float>, <5 x float>, |
|     <5 x float>, <5 x float>, <5 x float>, <5 x float>) #0 |
| |
| ; Attribute groups referenced as #0 / #1 by the definitions and declarations. |
| attributes #0 = { nounwind } |
| attributes #1 = { nounwind noinline } |